[med-svn] [spades] 01/09: Imported Upstream version 3.8.0+dfsg
Sascha Steinbiss
sascha at steinbiss.name
Mon Jun 6 22:54:13 UTC 2016
This is an automated email from the git hooks/post-receive script.
sascha-guest pushed a commit to branch master
in repository spades.
commit 5dac0a3bad20223dd32dba20645c1090b68eadd5
Author: Sascha Steinbiss <sascha at steinbiss.name>
Date: Tue May 31 21:50:22 2016 +0000
Imported Upstream version 3.8.0+dfsg
---
VERSION | 3 +-
changelog.html | 8 +
.../cclean/{config.info.template => config.info} | 0
configs/corrector/corrector.info.template | 7 -
configs/debruijn/careful_mode.info | 37 +
configs/debruijn/config.info | 42 +-
configs/debruijn/config.info.template | 180 --
configs/debruijn/construction.info.template | 23 -
configs/debruijn/detail_info_printer.info | 1 +
configs/debruijn/detail_info_printer.info.template | 43 -
configs/debruijn/diploid_mode.info | 16 +
configs/debruijn/distance_estimation.info | 59 +-
configs/debruijn/distance_estimation.info.template | 91 -
configs/debruijn/log.properties.template | 52 -
configs/debruijn/mda_mode.info | 120 +
configs/debruijn/meta_mode.info | 171 ++
configs/debruijn/moleculo_mode.info | 127 +
configs/debruijn/path_extend/pe_params.info | 186 --
.../debruijn/path_extend/pe_params.info.template | 186 --
configs/debruijn/pe_params.info | 138 ++
configs/debruijn/plasmid_mode.info | 12 +
configs/debruijn/rna_mode.info | 75 +
configs/debruijn/simplification.info | 374 +--
configs/debruijn/simplification.info.template | 561 -----
configs/debruijn/tsa.info.template | 5 -
configs/dipspades/config.info.template | 64 -
configs/dipspades/log.properties.template | 36 -
configs/hammer/config.info.template | 56 -
configs/ionhammer/ionhammer.cfg.template | 12 -
dipspades.py | 10 +-
ext/include/htrie/ahtable.h | 115 +
ext/include/htrie/common.h | 22 +
ext/include/htrie/hat-trie.h | 74 +
ext/include/llvm/ADT/ArrayRef.h | 384 +++
ext/include/llvm/ADT/DenseMap.h | 1074 +++++++++
ext/include/llvm/ADT/DenseMapInfo.h | 221 ++
ext/include/llvm/ADT/EpochTracker.h | 78 +
ext/include/llvm/ADT/FoldingSet.h | 750 ++++++
ext/include/llvm/ADT/Hashing.h | 661 ++++++
ext/include/llvm/ADT/IntrusiveRefCntPtr.h | 288 +++
ext/include/llvm/ADT/None.h | 26 +
ext/include/llvm/ADT/Optional.h | 228 ++
ext/include/llvm/ADT/PointerEmbeddedInt.h | 103 +
ext/include/llvm/ADT/PointerIntPair.h | 223 ++
ext/include/llvm/ADT/PointerSumType.h | 205 ++
ext/include/llvm/ADT/PointerUnion.h | 474 ++++
ext/include/llvm/ADT/STLExtras.h | 472 ++++
ext/include/llvm/ADT/SmallString.h | 297 +++
ext/include/llvm/ADT/SmallVector.h | 954 ++++++++
ext/include/llvm/ADT/StringExtras.h | 212 ++
ext/include/llvm/ADT/StringMap.h | 463 ++++
ext/include/llvm/ADT/StringRef.h | 593 +++++
ext/include/llvm/ADT/StringSwitch.h | 166 ++
ext/include/llvm/ADT/Twine.h | 540 +++++
ext/include/llvm/ADT/edit_distance.h | 103 +
ext/include/llvm/ADT/ilist.h | 800 +++++++
ext/include/llvm/ADT/ilist_node.h | 123 +
ext/include/llvm/ADT/iterator.h | 246 ++
ext/include/llvm/ADT/iterator_range.h | 68 +
ext/include/llvm/Config.h.in | 31 +
ext/include/llvm/Support/AlignOf.h | 173 ++
ext/include/llvm/Support/Allocator.h | 435 ++++
ext/include/llvm/Support/Atomic.h | 35 +
ext/include/llvm/Support/Casting.h | 326 +++
ext/include/llvm/Support/Compiler.h | 435 ++++
ext/include/llvm/Support/DataTypes.h | 56 +
ext/include/llvm/Support/Debug.h | 96 +
ext/include/llvm/Support/Errc.h | 86 +
ext/include/llvm/Support/Errno.h | 34 +
ext/include/llvm/Support/ErrorHandling.h | 106 +
ext/include/llvm/Support/ErrorOr.h | 298 +++
ext/include/llvm/Support/FileOutputBuffer.h | 89 +
ext/include/llvm/Support/FileSystem.h | 850 +++++++
ext/include/llvm/Support/FileUtilities.h | 78 +
ext/include/llvm/Support/Format.h | 191 ++
ext/include/llvm/Support/Host.h | 38 +
ext/include/llvm/Support/LineIterator.h | 88 +
ext/include/llvm/Support/MathExtras.h | 715 ++++++
ext/include/llvm/Support/Memory.h | 186 ++
ext/include/llvm/Support/MemoryBuffer.h | 169 ++
ext/include/llvm/Support/Mutex.h | 155 ++
ext/include/llvm/Support/MutexGuard.h | 41 +
ext/include/llvm/Support/Path.h | 437 ++++
ext/include/llvm/Support/PointerLikeTypeTraits.h | 92 +
ext/include/llvm/Support/Regex.h | 105 +
ext/include/llvm/Support/SMLoc.h | 63 +
ext/include/llvm/Support/Signals.h | 71 +
ext/include/llvm/Support/SourceMgr.h | 285 +++
ext/include/llvm/Support/StringSaver.h | 32 +
ext/include/llvm/Support/SwapByteOrder.h | 115 +
ext/include/llvm/Support/UniqueLock.h | 67 +
ext/include/llvm/Support/YAMLParser.h | 601 +++++
ext/include/llvm/Support/YAMLTraits.h | 1446 ++++++++++++
ext/include/llvm/Support/raw_ostream.h | 530 +++++
ext/include/llvm/Support/type_traits.h | 109 +
{src => ext}/include/ssw/ssw.h | 0
ext/include/ssw/ssw_cpp.h | 215 ++
ext/include/yaml-cpp/anchor.h | 16 -
ext/include/yaml-cpp/binary.h | 62 -
ext/include/yaml-cpp/contrib/anchordict.h | 42 -
ext/include/yaml-cpp/contrib/graphbuilder.h | 133 --
ext/include/yaml-cpp/dll.h | 28 -
ext/include/yaml-cpp/emitfromevents.h | 45 -
ext/include/yaml-cpp/emitter.h | 209 --
ext/include/yaml-cpp/emitterdef.h | 13 -
ext/include/yaml-cpp/emittermanip.h | 149 --
ext/include/yaml-cpp/eventhandler.h | 36 -
ext/include/yaml-cpp/exceptions.h | 201 --
ext/include/yaml-cpp/mark.h | 26 -
ext/include/yaml-cpp/node/convert.h | 224 --
ext/include/yaml-cpp/node/detail/bool_type.h | 26 -
ext/include/yaml-cpp/node/detail/impl.h | 163 --
ext/include/yaml-cpp/node/detail/iterator.h | 64 -
ext/include/yaml-cpp/node/detail/iterator_fwd.h | 27 -
ext/include/yaml-cpp/node/detail/memory.h | 39 -
ext/include/yaml-cpp/node/detail/node.h | 130 --
ext/include/yaml-cpp/node/detail/node_data.h | 110 -
ext/include/yaml-cpp/node/detail/node_iterator.h | 143 --
ext/include/yaml-cpp/node/detail/node_ref.h | 69 -
ext/include/yaml-cpp/node/emit.h | 23 -
ext/include/yaml-cpp/node/impl.h | 382 ---
ext/include/yaml-cpp/node/iterator.h | 28 -
ext/include/yaml-cpp/node/node.h | 112 -
ext/include/yaml-cpp/node/parse.h | 28 -
ext/include/yaml-cpp/node/ptr.h | 29 -
ext/include/yaml-cpp/node/type.h | 14 -
ext/include/yaml-cpp/noncopyable.h | 25 -
ext/include/yaml-cpp/null.h | 25 -
ext/include/yaml-cpp/ostream_wrapper.h | 69 -
ext/include/yaml-cpp/parser.h | 47 -
ext/include/yaml-cpp/stlemitter.h | 51 -
ext/include/yaml-cpp/traits.h | 57 -
ext/include/yaml-cpp/yaml.h | 21 -
ext/src/CMakeLists.txt | 5 +-
.../bamtools/api/internal/bam/BamMultiReader_p.cpp | 4 +-
ext/src/htrie/CMakeLists.txt | 11 +
ext/src/htrie/ahtable.c | 564 +++++
ext/src/htrie/hat-trie.c | 711 ++++++
ext/src/htrie/misc.c | 46 +
ext/src/htrie/misc.h | 22 +
ext/src/htrie/murmurhash3.c | 77 +
ext/src/htrie/murmurhash3.h | 12 +
ext/src/llvm/Atomic.cpp | 58 +
ext/src/llvm/CMakeLists.txt | 37 +
ext/src/llvm/ErrorHandling.cpp | 112 +
ext/src/llvm/Hashing.cpp | 29 +
ext/src/llvm/LineIterator.cpp | 94 +
ext/src/llvm/MemoryBuffer.cpp | 401 ++++
ext/src/llvm/Mutex.cpp | 93 +
ext/src/llvm/Path.cpp | 911 ++++++++
ext/src/llvm/Path.inc | 620 +++++
ext/src/llvm/Regex.cpp | 193 ++
ext/src/llvm/Signals.cpp | 62 +
ext/src/llvm/Signals.inc | 435 ++++
ext/src/llvm/SmallVector.cpp | 41 +
ext/src/llvm/SourceMgr.cpp | 476 ++++
ext/src/llvm/StringMap.cpp | 245 ++
ext/src/llvm/StringRef.cpp | 445 ++++
ext/src/llvm/Twine.cpp | 162 ++
ext/src/llvm/Unix.h | 57 +
ext/src/llvm/YAMLParser.cpp | 2445 ++++++++++++++++++++
ext/src/llvm/YAMLTraits.cpp | 1021 ++++++++
ext/src/llvm/raw_ostream.cpp | 752 ++++++
ext/src/llvm/regcclass.h | 75 +
ext/src/llvm/regcname.h | 144 ++
ext/src/llvm/regcomp.c | 1568 +++++++++++++
ext/src/llvm/regengine.inc | 1034 +++++++++
ext/src/llvm/regerror.c | 131 ++
ext/src/llvm/regex2.h | 162 ++
ext/src/llvm/regex_impl.h | 108 +
ext/src/llvm/regexec.c | 162 ++
ext/src/llvm/regfree.c | 72 +
ext/src/llvm/regstrlcpy.c | 52 +
ext/src/llvm/regutils.h | 58 +
ext/src/samtools/examples/ex1.bam | Bin 126580 -> 0 bytes
ext/src/samtools/examples/ex1.sam.gz | Bin 114565 -> 0 bytes
ext/src/yaml-cpp/CMakeLists.txt | 14 -
ext/src/yaml-cpp/binary.cpp | 93 -
ext/src/yaml-cpp/collectionstack.h | 35 -
ext/src/yaml-cpp/contrib/graphbuilder.cpp | 16 -
ext/src/yaml-cpp/contrib/graphbuilderadapter.cpp | 96 -
ext/src/yaml-cpp/contrib/graphbuilderadapter.h | 73 -
ext/src/yaml-cpp/convert.cpp | 83 -
ext/src/yaml-cpp/directives.cpp | 24 -
ext/src/yaml-cpp/directives.h | 29 -
ext/src/yaml-cpp/emit.cpp | 29 -
ext/src/yaml-cpp/emitfromevents.cpp | 105 -
ext/src/yaml-cpp/emitter.cpp | 951 --------
ext/src/yaml-cpp/emitterstate.cpp | 384 ---
ext/src/yaml-cpp/emitterstate.h | 190 --
ext/src/yaml-cpp/emitterutils.cpp | 424 ----
ext/src/yaml-cpp/emitterutils.h | 36 -
ext/src/yaml-cpp/exp.cpp | 113 -
ext/src/yaml-cpp/exp.h | 196 --
ext/src/yaml-cpp/indentation.h | 38 -
ext/src/yaml-cpp/memory.cpp | 29 -
ext/src/yaml-cpp/node.cpp | 14 -
ext/src/yaml-cpp/node_data.cpp | 302 ---
ext/src/yaml-cpp/nodebuilder.cpp | 138 --
ext/src/yaml-cpp/nodebuilder.h | 58 -
ext/src/yaml-cpp/nodeevents.cpp | 99 -
ext/src/yaml-cpp/nodeevents.h | 57 -
ext/src/yaml-cpp/null.cpp | 6 -
ext/src/yaml-cpp/ostream_wrapper.cpp | 56 -
ext/src/yaml-cpp/parse.cpp | 68 -
ext/src/yaml-cpp/parser.cpp | 141 --
ext/src/yaml-cpp/ptr_stack.h | 49 -
ext/src/yaml-cpp/ptr_vector.h | 47 -
ext/src/yaml-cpp/regex.cpp | 60 -
ext/src/yaml-cpp/regex.h | 67 -
ext/src/yaml-cpp/regeximpl.h | 186 --
ext/src/yaml-cpp/scanner.cpp | 394 ----
ext/src/yaml-cpp/scanner.h | 133 --
ext/src/yaml-cpp/scanscalar.cpp | 214 --
ext/src/yaml-cpp/scanscalar.h | 45 -
ext/src/yaml-cpp/scantag.cpp | 84 -
ext/src/yaml-cpp/scantag.h | 20 -
ext/src/yaml-cpp/scantoken.cpp | 439 ----
ext/src/yaml-cpp/setting.h | 105 -
ext/src/yaml-cpp/simplekey.cpp | 139 --
ext/src/yaml-cpp/singledocparser.cpp | 387 ----
ext/src/yaml-cpp/singledocparser.h | 65 -
ext/src/yaml-cpp/stream.cpp | 447 ----
ext/src/yaml-cpp/stream.h | 79 -
ext/src/yaml-cpp/streamcharsource.h | 48 -
ext/src/yaml-cpp/stringsource.h | 47 -
ext/src/yaml-cpp/tag.cpp | 52 -
ext/src/yaml-cpp/tag.h | 28 -
ext/src/yaml-cpp/token.h | 85 -
manual.html | 69 +-
metaspades.py | 951 ++++++++
plasmidspades.py | 951 ++++++++
spades.py | 122 +-
spades_compile.sh | 8 +-
src/CMakeLists.txt | 25 +-
src/cmake/includes.cmake | 2 +-
src/cmake/pack.cmake | 6 +-
src/corrector/CMakeLists.txt | 34 -
src/corrector/config_struct.cpp | 51 -
src/corrector/config_struct.hpp | 33 -
src/corrector/contig_processor.cpp | 302 ---
src/corrector/contig_processor.hpp | 65 -
src/corrector/dataset_processor.cpp | 278 ---
src/corrector/dataset_processor.hpp | 71 -
src/corrector/interesting_pos_processor.cpp | 125 -
src/corrector/main.cpp | 62 -
src/debruijn/CMakeLists.txt | 53 -
src/debruijn/bwa_pair_info_filler.cpp | 407 ----
src/debruijn/bwa_pair_info_filler.hpp | 254 --
src/debruijn/config_struct.cpp | 799 -------
src/debruijn/config_struct.hpp | 641 -----
src/debruijn/construction.cpp | 73 -
src/debruijn/construction.hpp | 23 -
src/debruijn/contig_output.hpp | 418 ----
src/debruijn/dataset_readers.hpp | 122 -
src/debruijn/debruijn debug.launch.template | 32 -
src/debruijn/debruijn release.launch.template | 17 -
src/debruijn/debruijn_data.hpp | 169 --
src/debruijn/debruijn_graph.hpp | 110 -
src/debruijn/debruijn_graph_constructor.hpp | 556 -----
src/debruijn/debruijn_stats.cpp | 525 -----
src/debruijn/detail_coverage.hpp | 257 --
src/debruijn/distance_estimation.cpp | 242 --
src/debruijn/distance_estimation.hpp | 24 -
src/debruijn/early_simplification.hpp | 269 ---
src/debruijn/edge_index.hpp | 113 -
src/debruijn/gap_closer.cpp | 505 ----
src/debruijn/gap_closer.hpp | 33 -
src/debruijn/genome_consistance_checker.cpp | 236 --
src/debruijn/genome_consistance_checker.hpp | 78 -
src/debruijn/genome_storage.cpp | 45 -
src/debruijn/genome_storage.hpp | 33 -
src/debruijn/genomic_info.hpp | 44 -
src/debruijn/genomic_info_filler.cpp | 121 -
src/debruijn/genomic_info_filler.hpp | 23 -
src/debruijn/genomic_quality.hpp | 553 -----
src/debruijn/graph_construction.hpp | 190 --
src/debruijn/graph_pack.hpp | 154 --
src/debruijn/graph_read_correction.hpp | 183 --
src/debruijn/graphio.hpp | 1017 --------
src/debruijn/indices/edge_index_builders.hpp | 179 --
src/debruijn/indices/edge_info_updater.hpp | 96 -
src/debruijn/indices/edge_multi_index.hpp | 152 --
src/debruijn/indices/edge_position_index.hpp | 191 --
src/debruijn/indices/kmer_extension_index.hpp | 413 ----
src/debruijn/indices/kmer_splitters.hpp | 444 ----
src/debruijn/indices/perfect_hash_map.hpp | 397 ----
src/debruijn/indices/storing_traits.hpp | 61 -
src/debruijn/is_counter.hpp | 173 --
src/debruijn/kmer_coverage_model.cpp | 379 ---
src/debruijn/kmer_coverage_model.hpp | 43 -
src/debruijn/kmer_mapper.hpp | 224 --
src/debruijn/kmer_mapper_logger.hpp | 44 -
src/debruijn/launch.hpp | 117 -
src/debruijn/long_read_mapper.hpp | 100 -
src/debruijn/long_read_storage.hpp | 376 ---
src/debruijn/main.cpp | 173 --
src/debruijn/mismatch_correction.cpp | 27 -
src/debruijn/mismatch_correction.hpp | 23 -
src/debruijn/mismatch_shall_not_pass.hpp | 339 ---
src/debruijn/moleculo.hpp | 36 -
src/debruijn/overlap_analysis.hpp | 113 -
src/debruijn/pacbio/pac_index.hpp | 833 -------
src/debruijn/pacbio/pacbio_gap_closer.hpp | 394 ----
src/debruijn/pacbio/pacbio_read_structures.hpp | 326 ---
src/debruijn/pacbio_aligning.cpp | 186 --
src/debruijn/pacbio_aligning.hpp | 23 -
src/debruijn/pair_info_count.cpp | 249 --
src/debruijn/pair_info_count.hpp | 24 -
src/debruijn/pair_info_filler.hpp | 119 -
src/debruijn/pair_info_improver.hpp | 235 --
src/debruijn/paired_statistics.hpp | 1058 ---------
src/debruijn/path_extend/bidirectional_path.cpp | 21 -
src/debruijn/path_extend/bidirectional_path.hpp | 1065 ---------
src/debruijn/path_extend/extension_chooser.hpp | 1443 ------------
src/debruijn/path_extend/ideal_pair_info.hpp | 129 --
src/debruijn/path_extend/loop_traverser.hpp | 213 --
src/debruijn/path_extend/next_path_searcher.hpp | 1031 ---------
src/debruijn/path_extend/paired_library.hpp | 180 --
src/debruijn/path_extend/path_extend_launch.hpp | 851 -------
src/debruijn/path_extend/path_extender.hpp | 1390 -----------
src/debruijn/path_extend/path_filter.hpp | 134 --
src/debruijn/path_extend/path_visualizer.hpp | 172 --
src/debruijn/path_extend/pe_config_struct.cpp | 164 --
src/debruijn/path_extend/pe_config_struct.hpp | 243 --
src/debruijn/path_extend/pe_io.hpp | 279 ---
src/debruijn/path_extend/pe_resolver.hpp | 518 -----
src/debruijn/path_extend/pe_utils.hpp | 461 ----
.../scaffolder2015/connection_condition2015.cpp | 111 -
.../scaffolder2015/connection_condition2015.hpp | 69 -
.../scaffolder2015/extension_chooser2015.cpp | 81 -
.../scaffolder2015/extension_chooser2015.hpp | 49 -
.../scaffolder2015/scaff_supplementary.hpp | 75 -
.../path_extend/scaffolder2015/scaffold_graph.cpp | 275 ---
.../path_extend/scaffolder2015/scaffold_graph.hpp | 233 --
.../scaffolder2015/scaffold_graph_constructor.cpp | 73 -
.../scaffolder2015/scaffold_graph_visualizer.cpp | 71 -
.../scaffolder2015/scaffold_graph_visualizer.hpp | 73 -
src/debruijn/path_extend/split_graph_pair_info.hpp | 449 ----
.../path_extend/utils/paired_info_checker.cpp | 204 --
src/debruijn/path_extend/weight_counter.hpp | 543 -----
src/debruijn/path_utils.hpp | 105 -
src/debruijn/positions.hpp | 112 -
src/debruijn/read_converter.hpp | 360 ---
src/debruijn/repeat_resolving.cpp | 99 -
src/debruijn/repeat_resolving.hpp | 40 -
src/debruijn/second_phase_setup.cpp | 52 -
src/debruijn/second_phase_setup.hpp | 22 -
src/debruijn/sequence_mapper.hpp | 431 ----
src/debruijn/sequence_mapper_notifier.hpp | 181 --
src/debruijn/short_read_mapper.hpp | 98 -
src/debruijn/simplification.cpp | 477 ----
src/debruijn/simplification.hpp | 34 -
.../simplification/graph_simplification.hpp | 825 -------
.../parallel_simplification_algorithms.hpp | 924 --------
.../simplification/simplification_settings.hpp | 105 -
.../simplification/single_cell_simplification.hpp | 110 -
src/debruijn/split_path_constructor.hpp | 134 --
src/debruijn/stage.cpp | 133 --
src/debruijn/stage.hpp | 155 --
src/debruijn/standard.hpp | 22 -
src/debruijn/stats/chimera_stats.hpp | 265 ---
src/debruijn/stats/debruijn_stats.hpp | 417 ----
src/debruijn/stats/statistics.hpp | 272 ---
src/debruijn/utils.hpp | 138 --
src/dipspades/CMakeLists.txt | 27 -
.../consensus_contigs_constructor.hpp | 332 ---
.../abstract_contig_corrector.hpp | 43 -
.../contig_correctors/close_gaps_corrector.hpp | 154 --
.../equal_path_deletion_correction.hpp | 82 -
.../contig_correctors/incorrect_contig_remover.hpp | 43 -
.../iterative_redundant_contigs_remover.hpp | 94 -
.../contig_correctors/overlap_searcher.hpp | 541 -----
.../contig_correctors/redundant_contig_remover.hpp | 891 -------
.../same_edge_deletion_corrector.hpp | 71 -
.../mapping_contig.hpp | 381 ---
.../mapping_contigs_storage.hpp | 114 -
.../overlap_graph.hpp | 1119 ---------
src/dipspades/dipspades.hpp | 265 ---
src/dipspades/dipspades_config.cpp | 143 --
src/dipspades/dipspades_config.hpp | 82 -
.../conservative_regions_searcher.hpp | 174 --
.../conservative_regions_storage.hpp | 44 -
.../haplotype_assembly/contig_separation_utils.hpp | 515 -----
.../haplotype_assembly/haplotype_assembler.hpp | 59 -
src/dipspades/kmer_gluing/equal_sequence_gluer.hpp | 146 --
src/dipspades/main.cpp | 120 -
.../bulge_correction_condition.hpp | 128 -
.../polymorphic_bulge_remover/bulge_gluer.hpp | 88 -
.../bulge_paths_searcher.hpp | 97 -
.../polymorphic_bulge_remover/bulge_splitter.hpp | 497 ----
.../complex_bulge_remover.hpp | 145 --
.../diploid_bulge_finder.hpp | 102 -
.../glue_direction_definer.hpp | 76 -
.../gluing_vertices_definer.hpp | 170 --
.../iterative_tails_gluing.hpp | 132 --
.../polymorphic_bulge_remover.hpp | 108 -
.../simple_bulge_remover.hpp | 51 -
src/dipspades/utils/bulge_utils.hpp | 267 ---
src/dipspades/utils/dijkstra_utils.hpp | 163 --
src/dipspades/utils/edge_gluer.hpp | 102 -
src/dipspades/utils/element_printers.hpp | 108 -
src/dipspades/utils/files_utils.cpp | 48 -
src/dipspades/utils/histogram.hpp | 104 -
src/dipspades/utils/lcs_utils.hpp | 146 --
src/dipspades/utils/path_index.hpp | 68 -
src/dipspades/utils/path_routines.hpp | 286 ---
src/dipspades/utils/range_utils.hpp | 57 -
src/dipspades/utils/redundancy_map.hpp | 235 --
src/dipspades/utils/sequence_utils.hpp | 36 -
src/hammer/CMakeLists.txt | 36 -
src/hammer/config_struct_hammer.cpp | 86 -
src/hammer/config_struct_hammer.hpp | 89 -
src/hammer/expander.cpp | 70 -
src/hammer/hamcluster.cpp | 288 ---
src/hammer/hamcluster.hpp | 161 --
src/hammer/hammer_tools.cpp | 274 ---
src/hammer/hammer_tools.hpp | 57 -
src/hammer/kmer_cluster.cpp | 656 ------
src/hammer/kmer_data.cpp | 569 -----
src/hammer/kmer_data.hpp | 141 --
src/hammer/kmer_stat.hpp | 291 ---
src/hammer/main.cpp | 291 ---
src/hammer/parallel_radix_sort.hpp | 592 -----
src/hammer/quake_correct/Read.cpp | 824 -------
src/hammer/quake_correct/bithash.cpp | 388 ----
src/hammer/quake_correct/correct.cpp | 897 -------
src/hammer/quake_correct/edit.cpp | 665 ------
src/hammer/quake_count/quake_count.cpp | 241 --
src/hammer/quake_count/quake_count_17.cpp | 238 --
src/hammer/quake_count/quake_count_19.cpp | 238 --
src/hammer/quake_count/quake_count_21.cpp | 238 --
src/hammer/quake_count/quake_count_25.cpp | 238 --
src/hammer/quake_count/quake_count_29.cpp | 238 --
src/hammer/quake_count/quake_count_33.cpp | 239 --
src/hammer/quake_count/quake_count_37.cpp | 238 --
src/hammer/quake_count/quake_count_45.cpp | 238 --
src/hammer/quake_count/quake_count_55.cpp | 240 --
src/hammer/quake_count/quake_count_65.cpp | 238 --
src/hammer/quake_count/quake_count_75.cpp | 238 --
src/hammer/quake_count/valid_kmer_generator.hpp | 194 --
src/hammer/quake_enhanced/count.cpp | 131 --
src/hammer/quake_enhanced/count/count.cpp | 226 --
.../quake_enhanced/filter_trusted_enh/main.cpp | 106 -
src/hammer/quake_enhanced/options.cpp | 206 --
.../test_correction_quality/main.cpp | 108 -
src/hammer/valid_kmer_generator.hpp | 200 --
src/include/adt/array_vector.hpp | 625 -----
src/include/adt/bag.hpp | 87 -
src/include/adt/chained_iterator.hpp | 76 -
src/include/adt/concurrent_dsu.hpp | 296 ---
src/include/adt/filter_iterator.hpp | 49 -
src/include/adt/function_traits.hpp | 70 -
src/include/adt/iterator_range.hpp | 46 -
src/include/adt/kmer_hash_vector.hpp | 370 ---
src/include/adt/kmer_map.hpp | 942 --------
src/include/adt/kmer_set.hpp | 364 ---
src/include/adt/kmer_vector.hpp | 165 --
src/include/adt/parallel_seq_vector.hpp | 110 -
src/include/adt/pointer_iterator.hpp | 172 --
src/include/adt/queue_iterator.hpp | 143 --
src/include/adt/small_pod_vector.hpp | 379 ---
src/include/config_common.hpp | 199 --
src/include/config_singl.hpp | 55 -
src/include/copy_file.hpp | 18 -
src/include/cpp_utils.hpp | 41 -
src/include/de/conj_iterator.hpp | 140 --
src/include/de/data_divider.hpp | 140 --
src/include/de/distance_estimation.hpp | 311 ---
src/include/de/extensive_distance_estimation.hpp | 211 --
src/include/de/index_point.hpp | 455 ----
src/include/de/insert_size_refiner.hpp | 166 --
src/include/de/pair_info_filters.hpp | 271 ---
src/include/de/paired_info.hpp | 863 -------
src/include/de/paired_info_helpers.hpp | 149 --
src/include/de/peak_finder.hpp | 386 ---
src/include/de/smoothing_distance_estimation.hpp | 221 --
src/include/de/weighted_distance_estimation.hpp | 115 -
src/include/file_limit.hpp | 33 -
src/include/func.hpp | 69 -
src/include/graph_print_utils.hpp | 328 ---
src/include/io/bam_parser.hpp | 67 -
src/include/io/bam_reader.hpp | 105 -
src/include/io/binary_converter.hpp | 295 ---
src/include/io/binary_streams.hpp | 357 ---
.../io/careful_filtering_reader_wrapper.hpp | 183 --
src/include/io/converting_reader_wrapper.hpp | 120 -
src/include/io/delegating_reader_wrapper.hpp | 64 -
src/include/io/easy_reader.hpp | 122 -
src/include/io/fasta_fastq_gz_parser.hpp | 165 --
src/include/io/file_reader.hpp | 129 --
src/include/io/filtering_reader_wrapper.hpp | 148 --
src/include/io/io_helper.hpp | 118 -
src/include/io/ireader.hpp | 116 -
src/include/io/ireadstream.hpp | 168 --
src/include/io/is_corrupting_wrapper.hpp | 33 -
src/include/io/kmer_iterator.hpp | 54 -
src/include/io/library.hpp | 392 ----
src/include/io/mmapped_reader.hpp | 360 ---
src/include/io/mmapped_writer.hpp | 171 --
src/include/io/modifying_reader_wrapper.hpp | 113 -
src/include/io/mpmc_bounded.hpp | 149 --
src/include/io/multifile_reader.hpp | 99 -
src/include/io/orientation.hpp | 93 -
src/include/io/osequencestream.hpp | 367 ---
src/include/io/paired_read.hpp | 186 --
src/include/io/paired_readers.hpp | 251 --
src/include/io/parser.hpp | 145 --
src/include/io/rc_reader_wrapper.hpp | 137 --
src/include/io/read.hpp | 231 --
src/include/io/read_processor.hpp | 200 --
src/include/io/read_stream_vector.hpp | 182 --
src/include/io/sam/sam_reader.hpp | 49 -
src/include/io/sequence_reader.hpp | 77 -
src/include/io/single_read.hpp | 331 ---
src/include/io/splitting_wrapper.hpp | 75 -
src/include/io/vector_reader.hpp | 60 -
src/include/io/wrapper_collection.hpp | 115 -
src/include/levenshtein.hpp | 238 --
src/include/log.hpp | 33 -
src/include/logger/log_writers.hpp | 38 -
src/include/logger/logger.hpp | 149 --
src/include/memory_limit.hpp | 91 -
src/include/mph_index/bitpair_vector.hpp | 103 -
src/include/mph_index/common.hpp | 66 -
src/include/mph_index/hypergraph_sorter_seq.hpp | 130 --
src/include/mph_index/kmer_index.hpp | 530 -----
src/include/mph_index/mphf.hpp | 136 --
src/include/omni/action_handlers.hpp | 345 ---
src/include/omni/basic_edge_conditions.hpp | 268 ---
src/include/omni/bulge_remover.hpp | 781 -------
src/include/omni/complex_bulge_remover.hpp | 1162 ----------
src/include/omni/complex_tip_clipper.hpp | 116 -
.../omni/concurrent_algo/bulge_remover_factory.hpp | 100 -
.../concurrent_algo/component_algorithm_runner.hpp | 130 --
.../concurrent_conjugate_graph_component.hpp | 121 -
.../concurrent_algo/concurrent_edge_algorithm.hpp | 193 --
.../concurrent_algo/concurrent_graph_component.hpp | 472 ----
.../conjugate_vertex_glued_graph.hpp | 124 -
.../omni/concurrent_algo/devisible_tree.hpp | 320 ---
.../sequential_algorihtm_factory.hpp | 38 -
.../omni/concurrent_algo/sequential_algorithm.hpp | 37 -
src/include/omni/coverage.hpp | 342 ---
.../omni/dijkstra_tools/dijkstra_algorithm.hpp | 288 ---
.../omni/dijkstra_tools/dijkstra_helper.hpp | 163 --
.../omni/dijkstra_tools/dijkstra_settings.hpp | 117 -
.../omni/dijkstra_tools/length_calculator.hpp | 112 -
.../omni/dijkstra_tools/neighbours_iterator.hpp | 164 --
.../omni/dijkstra_tools/vertex_process_checker.hpp | 72 -
.../omni/dijkstra_tools/vertex_put_checker.hpp | 63 -
src/include/omni/edge_labels_handler.hpp | 222 --
src/include/omni/edges_position_handler.hpp | 208 --
src/include/omni/erroneous_connection_remover.hpp | 381 ---
src/include/omni/graph_component.hpp | 198 --
src/include/omni/graph_core.hpp | 620 -----
src/include/omni/graph_iterators.hpp | 446 ----
src/include/omni/graph_processing_algorithm.hpp | 259 ---
src/include/omni/id_track_handler.hpp | 110 -
src/include/omni/loop_killer.hpp | 218 --
src/include/omni/loop_resolver.hpp | 75 -
src/include/omni/mapping_path.hpp | 227 --
src/include/omni/mf_ec_remover.hpp | 508 ----
src/include/omni/observable_graph.hpp | 497 ----
src/include/omni/omni_tools.hpp | 411 ----
src/include/omni/omni_utils.hpp | 586 -----
src/include/omni/order_and_law.hpp | 645 ------
src/include/omni/parallel_processing.hpp | 289 ---
src/include/omni/path_processor.hpp | 441 ----
src/include/omni/range.hpp | 92 -
src/include/omni/relative_coverage_remover.hpp | 674 ------
src/include/omni/splitters.hpp | 921 --------
src/include/omni/tip_clipper.hpp | 177 --
src/include/omni/visualization/graph_colorer.hpp | 340 ---
src/include/omni/visualization/graph_labeler.hpp | 304 ---
src/include/omni/visualization/graph_printer.hpp | 176 --
.../visualization/printing_parameter_storage.hpp | 81 -
src/include/omni/visualization/vertex_linker.hpp | 41 -
.../omni/visualization/visualization_utils.hpp | 210 --
src/include/omni/visualization/visualizers.hpp | 171 --
src/include/path_helper.hpp | 74 -
src/include/perfcounter.hpp | 123 -
src/include/pred.hpp | 165 --
src/include/runtime_k.hpp | 87 -
src/include/segfault_handler.hpp | 56 -
src/include/sequence/nucl.hpp | 123 -
src/include/sequence/quality.hpp | 39 -
src/include/sequence/rtseq.hpp | 724 ------
src/include/sequence/seq.hpp | 525 -----
src/include/sequence/sequence.hpp | 532 -----
src/include/sequence/sequence_tools.hpp | 159 --
src/include/sequence/simple_seq.hpp | 154 --
src/include/simple_tools.hpp | 184 --
src/include/smooth.hpp | 193 --
src/include/ssw/ssw_cpp.h | 219 --
src/include/standard_base.hpp | 142 --
src/include/verify.hpp | 34 -
src/include/xmath.h | 346 ---
src/io/CMakeLists.txt | 20 -
src/io/copy_file.cpp | 158 --
src/io/library.cpp | 179 --
src/io/logger_impl.cpp | 148 --
src/io/parser.cpp | 90 -
src/io/path_helper.cpp | 201 --
src/io/sam/read.cpp | 42 -
src/io/sam/sam_reader.cpp | 75 -
src/ionhammer/CMakeLists.txt | 33 -
src/ionhammer/HSeq.hpp | 289 ---
src/ionhammer/config_struct.cpp | 70 -
src/ionhammer/config_struct.hpp | 49 -
src/ionhammer/err_helper_table.cpp | 39 -
src/ionhammer/err_helper_table.hpp | 117 -
src/ionhammer/expander.cpp | 60 -
src/ionhammer/flow_space_read.hpp | 77 -
src/ionhammer/hamcluster.cpp | 219 --
src/ionhammer/hamcluster.hpp | 192 --
src/ionhammer/kmer_data.cpp | 245 --
src/ionhammer/kmer_data.hpp | 124 -
src/ionhammer/main.cpp | 336 ---
src/ionhammer/read_corrector.hpp | 1220 ----------
src/ionhammer/seqeval/BaseHypothesisEvaluator.cpp | 302 ---
src/ionhammer/seqeval/TreephaserLite.cpp | 593 -----
src/ionhammer/subcluster.cpp | 135 --
src/ionhammer/valid_hkmer_generator.hpp | 250 --
src/modules/CMakeLists.txt | 24 +
src/modules/algorithms/CMakeLists.txt | 11 +
.../algorithms/dijkstra/dijkstra_algorithm.hpp | 288 +++
.../algorithms/dijkstra/dijkstra_helper.hpp | 163 ++
.../algorithms/dijkstra/dijkstra_settings.hpp | 117 +
.../algorithms/dijkstra/length_calculator.hpp | 112 +
.../algorithms/dijkstra/neighbours_iterator.hpp | 164 ++
.../algorithms/dijkstra/vertex_process_checker.hpp | 72 +
.../algorithms/dijkstra/vertex_put_checker.hpp | 63 +
.../algorithms/genome_consistance_checker.cpp | 238 ++
.../algorithms/genome_consistance_checker.hpp | 77 +
src/modules/algorithms/graph_construction.hpp | 179 ++
src/modules/algorithms/graph_read_correction.hpp | 187 ++
src/modules/algorithms/mismatch_shall_not_pass.hpp | 344 +++
src/modules/algorithms/path_extend/CMakeLists.txt | 18 +
.../algorithms/path_extend/extension_chooser.hpp | 1511 ++++++++++++
.../algorithms/path_extend/ideal_pair_info.hpp | 129 ++
.../algorithms/path_extend/loop_traverser.hpp | 213 ++
.../algorithms/path_extend/next_path_searcher.hpp | 1031 +++++++++
.../algorithms/path_extend/overlap_analysis.hpp | 113 +
.../algorithms/path_extend/paired_library.hpp | 179 ++
.../algorithms/path_extend/path_extend_launch.hpp | 975 ++++++++
.../algorithms/path_extend/path_extender.hpp | 1458 ++++++++++++
src/modules/algorithms/path_extend/path_filter.hpp | 134 ++
.../algorithms/path_extend/path_visualizer.hpp | 172 ++
.../algorithms/path_extend/pe_config_struct.cpp | 172 ++
.../algorithms/path_extend/pe_config_struct.hpp | 252 ++
src/modules/algorithms/path_extend/pe_io.hpp | 263 +++
src/modules/algorithms/path_extend/pe_resolver.hpp | 520 +++++
src/modules/algorithms/path_extend/pe_utils.hpp | 462 ++++
.../scaffolder2015/connection_condition2015.cpp | 144 ++
.../scaffolder2015/connection_condition2015.hpp | 90 +
.../scaffolder2015/extension_chooser2015.cpp | 82 +
.../scaffolder2015/extension_chooser2015.hpp | 49 +
.../path_extend/scaffolder2015/scaffold_graph.cpp | 275 +++
.../path_extend/scaffolder2015/scaffold_graph.hpp | 234 ++
.../scaffolder2015/scaffold_graph_constructor.cpp | 77 +
.../scaffolder2015/scaffold_graph_constructor.hpp | 0
.../scaffolder2015/scaffold_graph_visualizer.cpp | 72 +
.../scaffolder2015/scaffold_graph_visualizer.hpp | 73 +
.../path_extend/split_graph_pair_info.hpp | 449 ++++
.../algorithms}/path_extend/utils/CMakeLists.txt | 0
.../algorithms}/path_extend/utils/find_aligns.py | 0
.../path_extend/utils/find_single_threshold.py | 0
.../path_extend/utils/paired_info_checker.cpp | 204 ++
.../path_extend/utils/run_all_parametrs.py | 0
.../algorithms/path_extend/weight_counter.hpp | 544 +++++
.../algorithms/simplification/bulge_remover.hpp | 783 +++++++
src/modules/algorithms/simplification/cleaner.hpp | 43 +
.../simplification/complex_bulge_remover.hpp | 1162 ++++++++++
.../simplification/complex_tip_clipper.hpp | 153 ++
.../algorithms/simplification/compressor.hpp | 141 ++
.../simplification/dominated_set_finder.hpp | 137 ++
.../simplification/ec_threshold_finder.hpp | 152 ++
.../erroneous_connection_remover.hpp | 567 +++++
.../algorithms/simplification/mf_ec_remover.hpp | 514 ++++
.../parallel_simplification_algorithms.hpp | 820 +++++++
.../simplification/relative_coverage_remover.hpp | 674 ++++++
.../algorithms/simplification/tip_clipper.hpp | 269 +++
src/modules/assembly_graph/CMakeLists.txt | 12 +
.../components}/component_filters.hpp | 0
.../components/connected_component.cpp | 76 +
.../components/connected_component.hpp | 26 +
.../assembly_graph/components/graph_component.hpp | 198 ++
.../assembly_graph/components/splitters.hpp | 921 ++++++++
.../assembly_graph/graph_alignment/edge_index.hpp | 112 +
.../assembly_graph/graph_alignment/kmer_map.hpp | 151 ++
.../assembly_graph/graph_alignment/kmer_mapper.hpp | 234 ++
.../graph_alignment/kmer_mapper_logger.hpp | 45 +
.../graph_alignment/long_read_mapper.hpp | 190 ++
.../graph_alignment/long_read_storage.hpp | 376 +++
.../graph_alignment/pacbio/pac_index.hpp | 834 +++++++
.../graph_alignment/pacbio/pacbio_gap_closer.hpp | 394 ++++
.../pacbio/pacbio_read_structures.hpp | 326 +++
.../graph_alignment/sequence_mapper.hpp | 408 ++++
.../graph_alignment/sequence_mapper_notifier.hpp | 175 ++
.../graph_alignment/short_read_mapper.hpp | 98 +
.../assembly_graph/graph_core/action_handlers.hpp | 347 +++
.../graph_core/basic_graph_stats.hpp | 53 +
.../graph_core}/construction_helper.hpp | 0
src/modules/assembly_graph/graph_core/coverage.hpp | 343 +++
.../assembly_graph/graph_core/debruijn_data.hpp | 170 ++
.../assembly_graph/graph_core/directions.hpp | 132 ++
src/modules/assembly_graph/graph_core/graph.hpp | 110 +
.../assembly_graph/graph_core/graph_core.hpp | 620 +++++
.../assembly_graph/graph_core/graph_iterators.hpp | 408 ++++
.../assembly_graph/graph_core/observable_graph.hpp | 499 ++++
.../assembly_graph/graph_core/order_and_law.hpp | 644 ++++++
.../graph_support/basic_edge_conditions.hpp | 272 +++
.../graph_support/basic_vertex_conditions.hpp | 52 +
.../assembly_graph/graph_support/chimera_stats.hpp | 266 +++
.../assembly_graph/graph_support/comparators.hpp | 62 +
.../assembly_graph/graph_support/contig_output.hpp | 421 ++++
.../graph_support/detail_coverage.hpp | 258 +++
.../graph_support/genomic_quality.hpp | 554 +++++
.../graph_support/graph_processing_algorithm.hpp | 262 +++
.../graph_support}/marks_and_locks.hpp | 0
.../graph_support/parallel_processing.hpp | 290 +++
.../graph_support}/scaff_supplementary.cpp | 0
.../graph_support/scaff_supplementary.hpp | 77 +
.../handlers/edge_labels_handler.hpp | 226 ++
.../handlers/edges_position_handler.hpp | 207 ++
.../assembly_graph/handlers/id_track_handler.hpp | 110 +
.../assembly_graph/paths/bidirectional_path.cpp | 21 +
.../assembly_graph/paths/bidirectional_path.hpp | 1087 +++++++++
src/modules/assembly_graph/paths/mapping_path.hpp | 227 ++
src/modules/assembly_graph/paths/path_finders.hpp | 124 +
.../assembly_graph/paths/path_processor.hpp | 441 ++++
src/modules/assembly_graph/paths/path_utils.hpp | 128 +
src/modules/assembly_graph/stats/picture_dump.hpp | 426 ++++
src/modules/assembly_graph/stats/statistics.hpp | 273 +++
.../debruijn_graph/debruijn_graph_constructor.hpp | 555 +++++
.../debruijn_graph/early_simplification.hpp | 269 +++
.../indices/edge_index_builders.hpp | 179 ++
.../data_structures/indices/edge_info_updater.hpp | 107 +
.../data_structures/indices/edge_multi_index.hpp | 161 ++
.../indices/edge_position_index.hpp | 191 ++
.../data_structures}/indices/editable_index.hpp | 0
.../data_structures}/indices/key_with_hash.hpp | 0
.../indices/kmer_extension_index.hpp | 413 ++++
.../data_structures/indices/kmer_splitters.hpp | 445 ++++
.../data_structures/indices/perfect_hash_map.hpp | 396 ++++
.../data_structures/indices/storing_traits.hpp | 61 +
.../data_structures}/indices/values.hpp | 0
.../data_structures/mph_index/CMakeLists.txt | 13 +
.../data_structures}/mph_index/base_hash.hpp | 0
.../data_structures/mph_index/bitpair_vector.cpp | 77 +
.../data_structures/mph_index/bitpair_vector.hpp | 27 +
src/modules/data_structures/mph_index/common.hpp | 66 +
.../data_structures}/mph_index/emphf_config.hpp | 0
.../data_structures}/mph_index/hypergraph.hpp | 0
.../mph_index/hypergraph_sorter_seq.hpp | 130 ++
.../data_structures/mph_index/kmer_index.hpp | 530 +++++
src/modules/data_structures/mph_index/mphf.hpp | 136 ++
.../mph_index/ranked_bitpair_vector.hpp | 0
.../data_structures/sequence/CMakeLists.txt | 10 +
.../data_structures/sequence/genome_storage.cpp | 45 +
.../data_structures/sequence/genome_storage.hpp | 33 +
src/modules/data_structures/sequence/nucl.hpp | 123 +
src/modules/data_structures/sequence/quality.hpp | 39 +
src/modules/data_structures/sequence/rtseq.hpp | 736 ++++++
src/modules/data_structures/sequence/runtime_k.hpp | 47 +
src/modules/data_structures/sequence/seq.hpp | 529 +++++
.../data_structures}/sequence/seq_common.hpp | 0
src/modules/data_structures/sequence/sequence.hpp | 542 +++++
.../data_structures/sequence/sequence_tools.hpp | 159 ++
.../data_structures/sequence/simple_seq.hpp | 154 ++
src/modules/dev_support/CMakeLists.txt | 13 +
src/modules/dev_support/autocompletion.cpp | 51 +
src/modules/dev_support/autocompletion.hpp | 16 +
src/modules/dev_support/copy_file.cpp | 158 ++
src/modules/dev_support/copy_file.hpp | 18 +
src/modules/dev_support/cpp_utils.hpp | 40 +
src/modules/dev_support/file_limit.hpp | 33 +
src/modules/dev_support/func.hpp | 69 +
src/modules/dev_support/log.hpp | 33 +
src/modules/dev_support/logger/log_writers.hpp | 43 +
src/modules/dev_support/logger/logger.hpp | 149 ++
src/modules/dev_support/logger/logger_impl.cpp | 148 ++
src/modules/dev_support/md5.h | 393 ++++
src/{include => modules/dev_support}/memory.hpp | 0
src/modules/dev_support/memory_limit.hpp | 97 +
.../dev_support}/openmp_wrapper.h | 0
.../dev_support}/parallel_wrapper.hpp | 0
src/modules/dev_support/path_helper.cpp | 249 ++
src/modules/dev_support/path_helper.hpp | 74 +
src/modules/dev_support/perfcounter.hpp | 123 +
src/modules/dev_support/range.hpp | 92 +
src/modules/dev_support/segfault_handler.hpp | 58 +
src/modules/dev_support/simple_tools.hpp | 184 ++
.../dev_support}/stacktrace.hpp | 0
src/modules/dev_support/standard_base.hpp | 140 ++
src/modules/dev_support/verify.hpp | 33 +
src/modules/empty.cpp | 0
src/modules/io/CMakeLists.txt | 16 +
src/modules/io/dataset_support/dataset_readers.hpp | 122 +
src/modules/io/dataset_support/read_converter.hpp | 360 +++
src/modules/io/graph_io/graph_print_utils.hpp | 328 +++
src/modules/io/kmers_io/kmer_iterator.hpp | 54 +
src/modules/io/kmers_io/mmapped_reader.hpp | 396 ++++
src/modules/io/kmers_io/mmapped_writer.hpp | 191 ++
src/modules/io/reads/paired_read.hpp | 186 ++
src/modules/io/reads/read.hpp | 244 ++
src/modules/io/reads/single_read.hpp | 334 +++
src/modules/io/reads_io/binary_converter.hpp | 295 +++
src/modules/io/reads_io/binary_streams.hpp | 357 +++
.../reads_io/careful_filtering_reader_wrapper.hpp | 183 ++
.../io/reads_io/converting_reader_wrapper.hpp | 121 +
.../io/reads_io}/cutting_reader_wrapper.hpp | 0
.../io/reads_io/delegating_reader_wrapper.hpp | 64 +
src/modules/io/reads_io/easy_reader.hpp | 122 +
src/modules/io/reads_io/fasta_fastq_gz_parser.hpp | 165 ++
src/modules/io/reads_io/file_reader.hpp | 129 ++
.../io/reads_io/filtering_reader_wrapper.hpp | 148 ++
src/modules/io/reads_io/io_helper.hpp | 118 +
src/modules/io/reads_io/ireader.hpp | 117 +
src/modules/io/reads_io/ireadstream.hpp | 170 ++
src/modules/io/reads_io/is_corrupting_wrapper.hpp | 33 +
.../io/reads_io/modifying_reader_wrapper.hpp | 114 +
src/modules/io/reads_io/mpmc_bounded.hpp | 153 ++
src/modules/io/reads_io/multifile_reader.hpp | 99 +
src/modules/io/reads_io/orientation.hpp | 93 +
src/modules/io/reads_io/osequencestream.hpp | 374 +++
src/modules/io/reads_io/paired_readers.hpp | 251 ++
src/modules/io/reads_io/parser.cpp | 90 +
src/modules/io/reads_io/parser.hpp | 145 ++
src/modules/io/reads_io/rc_reader_wrapper.hpp | 137 ++
src/modules/io/reads_io/read_processor.hpp | 201 ++
src/modules/io/reads_io/read_stream_vector.hpp | 183 ++
src/modules/io/reads_io/sequence_reader.hpp | 77 +
src/modules/io/reads_io/splitting_wrapper.hpp | 75 +
src/modules/io/reads_io/vector_reader.hpp | 61 +
src/modules/io/reads_io/wrapper_collection.hpp | 115 +
src/modules/io/sam_io/bam_parser.hpp | 67 +
src/modules/io/sam_io/bam_reader.hpp | 107 +
src/modules/io/sam_io/read.cpp | 42 +
src/{include/io/sam => modules/io/sam_io}/read.hpp | 0
src/modules/io/sam_io/sam_reader.cpp | 75 +
src/modules/io/sam_io/sam_reader.hpp | 49 +
src/modules/math/CMakeLists.txt | 14 +
src/modules/math/kmer_coverage_model.cpp | 394 ++++
src/modules/math/kmer_coverage_model.hpp | 50 +
src/modules/math/pred.hpp | 169 ++
src/modules/math/smooth.hpp | 195 ++
src/modules/math/xmath.h | 357 +++
src/modules/paired_info/CMakeLists.txt | 14 +
src/modules/paired_info/bwa_pair_info_filler.cpp | 408 ++++
src/modules/paired_info/bwa_pair_info_filler.hpp | 253 ++
src/modules/paired_info/data_divider.hpp | 137 ++
src/modules/paired_info/distance_estimation.hpp | 309 +++
src/modules/paired_info/histogram.hpp | 190 ++
src/modules/paired_info/index_point.hpp | 370 +++
src/modules/paired_info/insert_size_refiner.hpp | 165 ++
src/modules/paired_info/is_counter.hpp | 167 ++
src/modules/paired_info/pair_info_bounds.hpp | 30 +
src/modules/paired_info/pair_info_filler.hpp | 119 +
src/modules/paired_info/pair_info_filters.hpp | 271 +++
src/modules/paired_info/pair_info_improver.hpp | 279 +++
src/modules/paired_info/paired_info.hpp | 712 ++++++
src/modules/paired_info/paired_info_helpers.hpp | 142 ++
src/modules/paired_info/peak_finder.hpp | 385 +++
.../paired_info/smoothing_distance_estimation.hpp | 283 +++
src/modules/paired_info/split_path_constructor.hpp | 140 ++
.../paired_info/weighted_distance_estimation.hpp | 112 +
src/modules/paired_info/weights.hpp | 82 +
src/modules/pipeline/CMakeLists.txt | 14 +
src/modules/pipeline/config_common.hpp | 140 ++
src/modules/pipeline/config_singl.hpp | 57 +
src/modules/pipeline/config_struct.cpp | 786 +++++++
src/modules/pipeline/config_struct.hpp | 561 +++++
src/modules/pipeline/genomic_info.hpp | 48 +
src/modules/pipeline/genomic_info_filler.cpp | 149 ++
src/modules/pipeline/genomic_info_filler.hpp | 23 +
src/modules/pipeline/graph_pack.hpp | 163 ++
src/modules/pipeline/graphio.hpp | 1040 +++++++++
src/modules/pipeline/library.cpp | 137 ++
src/modules/pipeline/library.hpp | 365 +++
src/modules/pipeline/library.inl | 64 +
src/modules/pipeline/stage.cpp | 133 ++
src/modules/pipeline/stage.hpp | 165 ++
src/modules/stages/CMakeLists.txt | 12 +
src/modules/stages/construction.cpp | 69 +
src/modules/stages/construction.hpp | 23 +
src/modules/stages/simplification.cpp | 509 ++++
src/modules/stages/simplification.hpp | 34 +
.../graph_simplification.hpp | 978 ++++++++
.../simplification_settings.hpp | 105 +
.../single_cell_simplification.hpp | 110 +
src/modules/visualization/graph_colorer.hpp | 340 +++
src/modules/visualization/graph_labeler.hpp | 304 +++
src/modules/visualization/graph_printer.hpp | 176 ++
src/modules/visualization/position_filler.hpp | 91 +
.../visualization/printing_parameter_storage.hpp | 81 +
src/modules/visualization/vertex_linker.hpp | 41 +
.../visualization/visualization.hpp | 0
src/modules/visualization/visualization_utils.hpp | 210 ++
src/modules/visualization/visualizers.hpp | 173 ++
src/projects/CMakeLists.txt | 13 +
src/projects/cap/CMakeLists.txt | 47 +
src/projects/cap/assembly_compare.hpp | 520 +++++
src/projects/cap/assembly_problem_detection.hpp | 453 ++++
src/projects/cap/cap_commands.hpp | 731 ++++++
src/projects/cap/cap_config_struct.hpp | 40 +
src/projects/cap/cap_environment.hpp | 265 +++
src/projects/cap/cap_environment_manager.hpp | 493 ++++
src/projects/cap/cap_graph_pack.hpp | 33 +
src/projects/cap/cap_kmer_index.hpp | 535 +++++
src/projects/cap/cap_logger.hpp | 30 +
src/projects/cap/cap_online_visualizer.hpp | 41 +
src/projects/cap/colored_graph_construction.hpp | 397 ++++
src/projects/cap/coloring.hpp | 461 ++++
src/projects/cap/compare_standard.hpp | 47 +
src/projects/cap/comparison_utils.hpp | 208 ++
src/projects/cap/coordinates_handler.hpp | 1262 ++++++++++
src/projects/cap/deprecated/kmer_jumper.hpp | 73 +
src/projects/cap/deprecated/longseq_storage.hpp | 64 +
src/projects/cap/deprecated/tools_deprecated.cpp | 468 ++++
src/projects/cap/diff_masking.hpp | 335 +++
src/projects/cap/gene_analysis.hpp | 353 +++
src/projects/cap/genome_correction.hpp | 496 ++++
src/projects/cap/graph_traversal_constraints.hpp | 75 +
src/projects/cap/junk_cropping_reader.hpp | 54 +
src/projects/cap/longseq.hpp | 480 ++++
src/projects/cap/main.cpp | 73 +
src/projects/cap/mosaic.hpp | 1101 +++++++++
src/projects/cap/path_projector.hpp | 445 ++++
src/projects/cap/polynomial_hash.hpp | 404 ++++
src/projects/cap/repeat_masking.hpp | 544 +++++
src/projects/cap/serialization.hpp | 151 ++
src/projects/cap/simple_indel_finder.hpp | 382 +++
src/projects/cap/simple_inversion_finder.hpp | 433 ++++
src/projects/cap/stats.hpp | 1502 ++++++++++++
src/projects/cap/test_utils.hpp | 143 ++
src/projects/cap/tools.cpp | 183 ++
src/projects/cap/untangling.hpp | 345 +++
src/projects/cap/visualization.hpp | 171 ++
src/projects/corrector/CMakeLists.txt | 34 +
src/projects/corrector/config_struct.cpp | 78 +
src/projects/corrector/config_struct.hpp | 33 +
src/projects/corrector/contig_processor.cpp | 306 +++
src/projects/corrector/contig_processor.hpp | 65 +
src/projects/corrector/dataset_processor.cpp | 273 +++
src/projects/corrector/dataset_processor.hpp | 71 +
.../corrector/interesting_pos_processor.cpp | 127 +
.../corrector/interesting_pos_processor.hpp | 0
src/projects/corrector/main.cpp | 67 +
src/{ => projects}/corrector/positional_read.cpp | 0
src/{ => projects}/corrector/positional_read.hpp | 0
src/{ => projects}/corrector/variants_table.hpp | 0
src/projects/dipspades/CMakeLists.txt | 26 +
.../consensus_contigs_constructor.hpp | 332 +++
.../abstract_contig_corrector.hpp | 43 +
.../contig_correctors/close_gaps_corrector.hpp | 154 ++
.../equal_path_deletion_correction.hpp | 82 +
.../contig_correctors/incorrect_contig_remover.hpp | 43 +
.../iterative_redundant_contigs_remover.hpp | 94 +
.../contig_correctors/overlap_searcher.hpp | 541 +++++
.../contig_correctors/redundant_contig_remover.hpp | 891 +++++++
.../same_edge_deletion_corrector.hpp | 71 +
.../mapping_contig.hpp | 380 +++
.../mapping_contigs_storage.hpp | 114 +
.../overlap_graph.hpp | 1119 +++++++++
src/projects/dipspades/dipspades.hpp | 265 +++
src/projects/dipspades/dipspades_config.cpp | 132 ++
src/projects/dipspades/dipspades_config.hpp | 82 +
.../conservative_regions_searcher.hpp | 174 ++
.../conservative_regions_storage.hpp | 44 +
.../haplotype_assembly/contig_separation_utils.hpp | 515 +++++
.../haplotype_assembly/haplotype_assembler.hpp | 59 +
.../dipspades/kmer_gluing/equal_sequence_gluer.hpp | 146 ++
src/projects/dipspades/main.cpp | 110 +
.../bulge_correction_condition.hpp | 128 +
.../polymorphic_bulge_remover/bulge_gluer.hpp | 88 +
.../bulge_paths_searcher.hpp | 97 +
.../polymorphic_bulge_remover/bulge_splitter.hpp | 497 ++++
.../complex_bulge_remover.hpp | 145 ++
.../diploid_bulge_finder.hpp | 102 +
.../glue_direction_definer.hpp | 76 +
.../gluing_vertices_definer.hpp | 170 ++
.../iterative_tails_gluing.hpp | 132 ++
.../polymorphic_bulge_remover.hpp | 109 +
.../simple_bulge_remover.hpp | 51 +
src/projects/dipspades/utils/bulge_utils.hpp | 267 +++
src/projects/dipspades/utils/dijkstra_utils.hpp | 163 ++
src/projects/dipspades/utils/edge_gluer.hpp | 102 +
src/projects/dipspades/utils/element_printers.hpp | 108 +
src/projects/dipspades/utils/files_utils.cpp | 48 +
src/{ => projects}/dipspades/utils/files_utils.hpp | 0
src/projects/dipspades/utils/histogram.hpp | 104 +
src/projects/dipspades/utils/lcs_utils.hpp | 146 ++
src/projects/dipspades/utils/path_index.hpp | 68 +
src/projects/dipspades/utils/path_routines.hpp | 285 +++
src/projects/dipspades/utils/range_utils.hpp | 57 +
src/projects/dipspades/utils/redundancy_map.hpp | 235 ++
src/projects/dipspades/utils/sequence_utils.hpp | 36 +
src/projects/hammer/CMakeLists.txt | 36 +
src/projects/hammer/config_struct_hammer.cpp | 86 +
src/projects/hammer/config_struct_hammer.hpp | 89 +
src/projects/hammer/expander.cpp | 70 +
src/{ => projects}/hammer/expander.hpp | 0
.../hammer/gen_test_data/CMakeLists.txt | 0
src/{ => projects}/hammer/gen_test_data/main.cpp | 0
src/{ => projects}/hammer/globals.hpp | 0
src/projects/hammer/hamcluster.cpp | 288 +++
src/projects/hammer/hamcluster.hpp | 161 ++
.../hammer/hammer debug.launch.template | 0
.../hammer/hammer release.launch.template | 0
src/projects/hammer/hammer_tools.cpp | 274 +++
src/projects/hammer/hammer_tools.hpp | 57 +
src/projects/hammer/kmer_cluster.cpp | 656 ++++++
src/{ => projects}/hammer/kmer_cluster.hpp | 0
src/projects/hammer/kmer_data.cpp | 568 +++++
src/projects/hammer/kmer_data.hpp | 141 ++
src/projects/hammer/kmer_stat.hpp | 291 +++
src/projects/hammer/main.cpp | 286 +++
src/{ => projects}/hammer/misc/config.inp | 0
src/{ => projects}/hammer/misc/confignohdd.inp | 0
src/{ => projects}/hammer/misc/getresults.pl | 0
src/{ => projects}/hammer/misc/memusg | 0
src/{ => projects}/hammer/misc/pretty_latex.pl | 0
src/projects/hammer/parallel_radix_sort.hpp | 592 +++++
.../hammer/quake_correct/CMakeLists.txt | 0
src/projects/hammer/quake_correct/Read.cpp | 824 +++++++
src/{ => projects}/hammer/quake_correct/Read.h | 0
src/projects/hammer/quake_correct/bithash.cpp | 388 ++++
src/{ => projects}/hammer/quake_correct/bithash.h | 0
src/projects/hammer/quake_correct/correct.cpp | 897 +++++++
src/projects/hammer/quake_correct/edit.cpp | 665 ++++++
src/{ => projects}/hammer/quake_correct/edit.h | 0
src/{ => projects}/hammer/quake_correct/gzstream.C | 0
src/{ => projects}/hammer/quake_correct/gzstream.h | 0
.../hammer/quake_count/CMakeLists.txt | 0
.../hammer/quake_count/kmer_freq_info.hpp | 0
src/projects/hammer/quake_count/quake_count.cpp | 241 ++
src/projects/hammer/quake_count/quake_count_17.cpp | 238 ++
src/projects/hammer/quake_count/quake_count_19.cpp | 238 ++
src/projects/hammer/quake_count/quake_count_21.cpp | 238 ++
src/projects/hammer/quake_count/quake_count_25.cpp | 238 ++
src/projects/hammer/quake_count/quake_count_29.cpp | 238 ++
src/projects/hammer/quake_count/quake_count_33.cpp | 239 ++
src/projects/hammer/quake_count/quake_count_37.cpp | 238 ++
src/projects/hammer/quake_count/quake_count_45.cpp | 238 ++
src/projects/hammer/quake_count/quake_count_55.cpp | 240 ++
src/projects/hammer/quake_count/quake_count_65.cpp | 238 ++
src/projects/hammer/quake_count/quake_count_75.cpp | 238 ++
.../hammer/quake_count/valid_kmer_generator.hpp | 194 ++
.../hammer/quake_enhanced/CMakeLists.txt | 0
.../quake_enhanced/correct_hist/CMakeLists.txt | 0
.../hammer/quake_enhanced/correct_hist/main.cpp | 0
src/projects/hammer/quake_enhanced/count.cpp | 131 ++
.../hammer/quake_enhanced/count/CMakeLists.txt | 0
src/projects/hammer/quake_enhanced/count/count.cpp | 226 ++
.../hammer/quake_enhanced/filter_trusted.cpp | 0
.../quake_enhanced/filter_trusted/CMakeLists.txt | 0
.../hammer/quake_enhanced/filter_trusted/main.cpp | 0
.../filter_trusted_enh/CMakeLists.txt | 0
.../quake_enhanced/filter_trusted_enh/main.cpp | 106 +
.../quake_enhanced/generate_limits/CMakeLists.txt | 0
.../hammer/quake_enhanced/generate_limits/main.cpp | 0
src/{ => projects}/hammer/quake_enhanced/main.cpp | 0
src/projects/hammer/quake_enhanced/options.cpp | 206 ++
.../hammer/quake_enhanced/options.hpp | 0
.../quake_enhanced/prepare_graph/CMakeLists.txt | 0
.../quake_enhanced/prepare_graph/prepare_graph.cpp | 0
.../hammer/quake_enhanced/prepare_hist.cpp | 0
.../hammer/quake_enhanced/prepare_limits.cpp | 0
src/{ => projects}/hammer/quake_enhanced/quake.hpp | 0
.../test_correction_quality/CMakeLists.txt | 0
.../test_correction_quality/main.cpp | 108 +
src/{ => projects}/hammer/read_corrector.cpp | 0
src/{ => projects}/hammer/read_corrector.hpp | 0
src/projects/hammer/valid_kmer_generator.hpp | 200 ++
src/projects/ionhammer/CMakeLists.txt | 33 +
src/projects/ionhammer/HSeq.hpp | 289 +++
src/projects/ionhammer/config_struct.cpp | 84 +
src/projects/ionhammer/config_struct.hpp | 49 +
src/{ => projects}/ionhammer/consensus.hpp | 0
src/projects/ionhammer/err_helper_table.cpp | 39 +
src/projects/ionhammer/err_helper_table.hpp | 117 +
src/{ => projects}/ionhammer/err_helper_table.inc | 0
src/projects/ionhammer/expander.cpp | 60 +
src/{ => projects}/ionhammer/expander.hpp | 0
src/projects/ionhammer/flow_space_read.hpp | 77 +
src/projects/ionhammer/hamcluster.cpp | 219 ++
src/projects/ionhammer/hamcluster.hpp | 192 ++
src/{ => projects}/ionhammer/hkmer.hpp | 0
src/{ => projects}/ionhammer/hkmer_distance.hpp | 0
src/projects/ionhammer/kmer_data.cpp | 245 ++
src/projects/ionhammer/kmer_data.hpp | 124 +
src/projects/ionhammer/main.cpp | 332 +++
src/projects/ionhammer/read_corrector.hpp | 1220 ++++++++++
.../ionhammer/seqeval/BaseCallerUtils.h | 0
.../ionhammer/seqeval/BaseHypothesisEvaluator.cpp | 302 +++
.../ionhammer/seqeval/BaseHypothesisEvaluator.h | 0
src/projects/ionhammer/seqeval/TreephaserLite.cpp | 593 +++++
.../ionhammer/seqeval/TreephaserLite.h | 0
src/projects/ionhammer/subcluster.cpp | 135 ++
src/{ => projects}/ionhammer/subcluster.hpp | 0
src/projects/ionhammer/valid_hkmer_generator.hpp | 250 ++
src/projects/online_vis/CMakeLists.txt | 37 +
src/projects/online_vis/argument_list.hpp | 217 ++
src/projects/online_vis/base_commands.hpp | 503 ++++
src/projects/online_vis/command.hpp | 173 ++
src/projects/online_vis/command_mapping.hpp | 54 +
src/projects/online_vis/debruijn_commands.hpp | 14 +
src/projects/online_vis/debruijn_environment.hpp | 206 ++
.../online_vis/debruijn_online_visualizer.hpp | 53 +
src/projects/online_vis/drawing_commands.hpp | 113 +
.../drawing_commands/draw_connected_command.hpp | 62 +
.../drawing_commands/draw_contig_command.hpp | 126 +
.../drawing_commands/draw_missasemblies.hpp | 211 ++
.../draw_part_of_genome_command.hpp | 115 +
.../drawing_commands/draw_polymorphic_regions.hpp | 139 ++
.../drawing_commands/draw_poorly_assembled.hpp | 617 +++++
.../drawing_commands/draw_position_command.hpp | 80 +
.../drawing_commands/drawing_command.hpp | 100 +
.../drawing_commands/show_position_command.hpp | 80 +
src/projects/online_vis/environment.hpp | 58 +
src/projects/online_vis/errors.hpp | 122 +
src/projects/online_vis/history.hpp | 64 +
src/projects/online_vis/loaded_environments.hpp | 17 +
src/projects/online_vis/main.cpp | 71 +
src/projects/online_vis/online_visualizer.hpp | 167 ++
src/projects/online_vis/position_commands.hpp | 11 +
.../position_commands/clear_position_command.hpp | 35 +
.../position_commands/fill_position_command.hpp | 65 +
src/projects/online_vis/processing_commands.hpp | 66 +
src/projects/online_vis/setting_commands.hpp | 161 ++
src/projects/online_vis/standard_vis.hpp | 23 +
src/projects/online_vis/statistics_commands.hpp | 13 +
.../junction_sequence_command.hpp | 97 +
.../statistics_commands/print_contigs_stats.hpp | 220 ++
.../statistics_commands/print_edge_command.hpp | 60 +
.../statistics_commands/print_paths_command.hpp | 103 +
src/projects/online_vis/vis_logger.hpp | 28 +
src/projects/online_vis/vis_utils.hpp | 40 +
src/projects/scaffold_correction/CMakeLists.txt | 23 +
src/projects/scaffold_correction/main.cpp | 112 +
.../scaffold_correction/scaffold_correction.hpp | 333 +++
src/projects/spades/CMakeLists.txt | 32 +
src/projects/spades/chromosome_removal.cpp | 244 ++
src/projects/spades/chromosome_removal.hpp | 35 +
src/projects/spades/distance_estimation.cpp | 231 ++
src/projects/spades/distance_estimation.hpp | 24 +
src/projects/spades/gap_closer.cpp | 502 ++++
src/projects/spades/gap_closer.hpp | 33 +
src/projects/spades/launch.hpp | 120 +
src/projects/spades/main.cpp | 110 +
src/projects/spades/mismatch_correction.cpp | 27 +
src/projects/spades/mismatch_correction.hpp | 23 +
src/projects/spades/pacbio_aligning.cpp | 185 ++
src/projects/spades/pacbio_aligning.hpp | 23 +
src/projects/spades/pair_info_count.cpp | 259 +++
src/projects/spades/pair_info_count.hpp | 24 +
src/projects/spades/repeat_resolving.cpp | 96 +
src/projects/spades/repeat_resolving.hpp | 42 +
src/projects/spades/second_phase_setup.cpp | 42 +
src/projects/spades/second_phase_setup.hpp | 22 +
.../truseq_analysis/AlignmentAnalyserNew.cpp | 102 +
.../truseq_analysis/AlignmentAnalyserNew.hpp | 34 +
src/projects/truseq_analysis/CMakeLists.txt | 15 +
.../truseq_analysis/alignment_analyser.cpp | 116 +
.../truseq_analysis/alignment_analyser.hpp | 41 +
src/projects/truseq_analysis/analysis_pipeline.cpp | 140 ++
src/projects/truseq_analysis/analysis_pipeline.hpp | 39 +
.../truseq_analysis/consistent_mapping.cpp | 245 ++
src/projects/truseq_analysis/consistent_mapping.h | 90 +
src/projects/truseq_analysis/main.cpp | 95 +
src/scaffold_correction/CMakeLists.txt | 24 -
src/scaffold_correction/main.cpp | 171 --
src/scaffold_correction/scaffold_correction.hpp | 332 ---
src/spades_pipeline/corrector_logic.py | 11 +-
src/spades_pipeline/dipspades_logic.py | 3 -
src/spades_pipeline/hammer_logic.py | 85 +-
src/spades_pipeline/options_storage.py | 52 +-
src/spades_pipeline/spades_logic.py | 104 +-
src/spades_pipeline/support.py | 83 +-
src/spades_pipeline/truspades/launch_options.py | 10 +-
src/utils/adt/array_vector.hpp | 677 ++++++
src/utils/adt/bag.hpp | 87 +
src/utils/adt/chained_iterator.hpp | 76 +
src/utils/adt/concurrent_dsu.hpp | 297 +++
src/utils/adt/filter_iterator.hpp | 49 +
src/{include => utils}/adt/flat_map.hpp | 0
src/{include => utils}/adt/flat_set.hpp | 0
src/utils/adt/function_traits.hpp | 76 +
src/utils/adt/iterator_range.hpp | 50 +
src/utils/adt/kmer_hash_vector.hpp | 370 +++
src/utils/adt/kmer_vector.hpp | 179 ++
src/utils/adt/parallel_seq_vector.hpp | 110 +
.../adt/parallel_unordered_map.hpp | 0
src/utils/adt/pointer_iterator.hpp | 174 ++
src/utils/adt/queue_iterator.hpp | 143 ++
src/utils/adt/small_pod_vector.hpp | 399 ++++
src/utils/levenshtein.hpp | 241 ++
truspades.py | 21 +-
1199 files changed, 143574 insertions(+), 104584 deletions(-)
diff --git a/VERSION b/VERSION
index a76ccff..6641052 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1,2 @@
-3.7.1
+3.8.0
+
diff --git a/changelog.html b/changelog.html
index 7f16acb..99fd062 100644
--- a/changelog.html
+++ b/changelog.html
@@ -3,6 +3,14 @@
<h2>SPAdes Genome Assembler changelog</h2>
+<h3>SPAdes 3.8.0, 1 June 2016</h3>
+
+<p>NEW: Added plasmidSPAdes – a pipeline designed for extracting and assembling plasmids from WGS data sets.</p>
+
+<p>CHANGE: Significant improvements in metaSPAdes performance.</p>
+
+<p>CHANGE: Improved running time and RAM consumption.</p>
+
<h3>SPAdes 3.7.1, 8 March 2016</h3>
<p>FIX: MismatchCorrector fixed for MacOS.</p>
diff --git a/configs/cclean/config.info.template b/configs/cclean/config.info
similarity index 100%
rename from configs/cclean/config.info.template
rename to configs/cclean/config.info
diff --git a/configs/corrector/corrector.info.template b/configs/corrector/corrector.info.template
deleted file mode 100644
index 22740a3..0000000
--- a/configs/corrector/corrector.info.template
+++ /dev/null
@@ -1,7 +0,0 @@
-{
-dataset: ./configs/debruijn/datasets/ECOLI_IS220_QUAKE.yaml,
-work_dir: ./test_dataset/input/corrected/tmp,
-output_dir: ./test_dataset/input/corrected,
-max_nthreads: 16,
-strategy: mapped_squared
-}
diff --git a/configs/debruijn/careful_mode.info b/configs/debruijn/careful_mode.info
new file mode 100644
index 0000000..5cbb786
--- /dev/null
+++ b/configs/debruijn/careful_mode.info
@@ -0,0 +1,37 @@
+
+simp
+{
+ ; bulge remover:
+ br
+ {
+ max_coverage 1000000.0
+ max_relative_coverage 1.5 ; bulge_cov < this * not_bulge_cov
+ parallel false
+ }
+
+ ; complex bulge remover
+ cbr
+ {
+ enabled false
+ }
+
+ ; relative coverage erroneous component remover:
+ rcc
+ {
+ enabled false
+ }
+
+ init_clean
+ {
+ early_it_only true
+
+ activation_cov -1.
+ ier
+ {
+ enabled false
+ }
+
+ tip_condition ""
+ ec_condition ""
+ }
+}
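
The careful-mode overrides above mainly tighten the bulge remover: a bulge is only dropped when its coverage is below 1.5 times the coverage of the alternative path (the "bulge_cov < this * not_bulge_cov" comment). A minimal Python sketch of that predicate follows, with hypothetical coverage values; the function name and the max_coverage comparison are assumptions made for illustration.

# Illustration only: the br condition from careful_mode.info above.
def bulge_is_removable(bulge_cov, alt_path_cov,
                       max_coverage=1000000.0, max_relative_coverage=1.5):
    # bulge_cov < max_relative_coverage * not_bulge_cov (see comment above)
    return bulge_cov <= max_coverage and bulge_cov < max_relative_coverage * alt_path_cov

print(bulge_is_removable(12.0, 10.0))   # True:  12 < 1.5 * 10
print(bulge_is_removable(20.0, 10.0))   # False: 20 >= 1.5 * 10
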
diff --git a/configs/debruijn/config.info b/configs/debruijn/config.info
old mode 100755
new mode 100644
index 98b8803..f747c29
--- a/configs/debruijn/config.info
+++ b/configs/debruijn/config.info
@@ -5,10 +5,13 @@
#include "distance_estimation.info"
#include "detail_info_printer.info"
#include "tsa.info"
-#include "path_extend/pe_params.info"
+#include "pe_params.info"
K 55
+;FIXME introduce isolate mode
+mode base
+;FIXME remove!
run_mode false
project_name TOY_DATASET
dataset ./configs/debruijn/datasets_archive/toy.info
@@ -40,17 +43,14 @@ scaffold_correction_mode false
; enabled (1) or disabled (0) repeat resolution (former "paired_mode")
rr_enable true
-; two-step pipeline (currently in meta mode only)
-two_step_rr true
+; two-step pipeline
+two_step_rr false
; enables/disables usage of intermediate contigs in two-step pipeline
use_intermediate_contigs true
;use single reads for rr (all | only_single_libs | none )
single_reads_rr only_single_libs
-; diploid mode
-diploid_mode false
-
; The following parameters are used ONLY if developer_mode is true
; whether to output dot-files with pictures of graphs - ONLY in developer mode
@@ -70,9 +70,6 @@ use_additional_contigs false
; use unipaths as additional contigs instead of just graph edges
use_unipaths false
-;enables mismatch careful mode (primary changes some simplification settings)
-mismatch_careful false
-
;if true simple mismatches are corrected
correct_mismatches true
@@ -82,15 +79,12 @@ paired_info_statistics false
; set it true to get statistics for pair information (over gaps), such as false positive/negative, perfect match, etc.
paired_info_scaffolder false
+;FIXME is it always simple?
estimation_mode simple
; simple, weighted, extensive, smoothing
-;set it true to detach connections that are not supported by paired info before repeat resolution
-cut_bad_connections false
-
-;the only option left from repeat resolving -- WHY DO THEY DIFFER?
+;the only option left from repeat resolving
max_repeat_length 8000
-max_repeat_length_sc 8000
; repeat resolving mode (none path_extend)
resolving_mode path_extend
@@ -140,18 +134,14 @@ pacbio_processor
path_limit_stretching 1.3
path_limit_pressing 0.7
ignore_middle_alignment true
-;gap_closer
+ ;gap_closer
long_seq_limit 400
pacbio_min_gap_quantity 2
contigs_min_gap_quantity 1
max_contigs_gap_length 10000
}
-; consensus
-need_consensus false ; output is VERY large(gigabytes).
-uncorrected_reads none
-
-mismatch_ratio 2.0;
+;TODO move out!
graph_read_corr
{
enable false
@@ -159,18 +149,10 @@ graph_read_corr
binary true
}
-sc_cor
-{
- scaffolds_file scaffolds.fasta
- output_unfilled true
- max_insert 100
- max_cut_length 50
-}
-
-
bwa_aligner
{
- enabled false
+ ;stupid naming since spades.py cannot change config normally
+ bwa_enable false
debug false
path_to_bwa ./bin/bwa-spades
min_contig_len 0
diff --git a/configs/debruijn/config.info.template b/configs/debruijn/config.info.template
deleted file mode 100755
index 98b8803..0000000
--- a/configs/debruijn/config.info.template
+++ /dev/null
@@ -1,180 +0,0 @@
-; input options:
-
-#include "simplification.info"
-#include "construction.info"
-#include "distance_estimation.info"
-#include "detail_info_printer.info"
-#include "tsa.info"
-#include "path_extend/pe_params.info"
-
-K 55
-
-run_mode false
-project_name TOY_DATASET
-dataset ./configs/debruijn/datasets_archive/toy.info
-log_filename log.properties
-
-output_base ./data/debruijn/
-tmp_dir spades_tmp/
-
-main_iteration true
-additional_contigs tmp_contigs.fasta
-load_from latest/saves/ ; tmp or latest
-
-; Multithreading options
-temp_bin_reads_dir .bin_reads/
-max_threads 8
-max_memory 120; in Gigabytes
-buffer_size 512; in Megabytes
-
-entry_point construction
-;entry_point simplification
-;entry_point pacbio_aligning
-;entry_point late_pair_info_count
-;entry_point distance_estimation
-;entry_point repeat_resolving
-
-developer_mode true
-scaffold_correction_mode false
-
-; enabled (1) or disabled (0) repeat resolution (former "paired_mode")
-rr_enable true
-
-; two-step pipeline (currently in meta mode only)
-two_step_rr true
-; enables/disables usage of intermediate contigs in two-step pipeline
-use_intermediate_contigs true
-
-;use single reads for rr (all | only_single_libs | none )
-single_reads_rr only_single_libs
-
-; diploid mode
-diploid_mode false
-
-; The following parameters are used ONLY if developer_mode is true
-
-; whether to output dot-files with pictures of graphs - ONLY in developer mode
-output_pictures true
-
-; whether to output resulting contigs after intermediate stages - ONLY in developer mode
-output_nonfinal_contigs true
-
-; whether to compute number of paths statistics - ONLY in developer mode
-compute_paths_number false
-
-; End of developer_mode parameters
-
-; iterative mode switcher, activates additional contigs usage
-use_additional_contigs false
-
-; use unipaths as additional contigs instead of just graph edges
-use_unipaths false
-
-;enables mismatch careful mode (primary changes some simplification settings)
-mismatch_careful false
-
-;if true simple mismatches are corrected
-correct_mismatches true
-
-; set it true to get statistics, such as false positive/negative, perfect match, etc.
-paired_info_statistics false
-
-; set it true to get statistics for pair information (over gaps), such as false positive/negative, perfect match, etc.
-paired_info_scaffolder false
-
-estimation_mode simple
-; simple, weighted, extensive, smoothing
-
-;set it true to detach connections that are not supported by paired info before repeat resolution
-cut_bad_connections false
-
-;the only option left from repeat resolving -- WHY DO THEY DIFFER?
-max_repeat_length 8000
-max_repeat_length_sc 8000
-
-; repeat resolving mode (none path_extend)
-resolving_mode path_extend
-
-use_scaffolder true
-
-avoid_rc_connections true
-
-;position handling
-
-pos
-{
- max_mapping_gap 0 ; in terms of K+1 mers value will be K + max_mapping_gap
- max_gap_diff 0
- contigs_for_threading ./data/debruijn/contigs.fasta
- contigs_to_analyze ./data/debruijn/contigs.fasta
- late_threading true
- careful_labeling true
-
-}
-
-gap_closer_enable true
-
-gap_closer
-{
- minimal_intersection 10
- before_simplify true
- in_simplify false
- after_simplify true
- weight_threshold 2.0
-}
-
-kmer_coverage_model {
- probability_threshold 0.05
- strong_probability_threshold 0.999
- use_coverage_threshold false
- coverage_threshold 10.0
-}
-
-pacbio_processor
-{
-;align and traverse.
- pacbio_k 13
- additional_debug_info false
- compression_cutoff 0.6
- domination_cutoff 1.5
- path_limit_stretching 1.3
- path_limit_pressing 0.7
- ignore_middle_alignment true
-;gap_closer
- long_seq_limit 400
- pacbio_min_gap_quantity 2
- contigs_min_gap_quantity 1
- max_contigs_gap_length 10000
-}
-; consensus
-need_consensus false ; output is VERY large(gigabytes).
-uncorrected_reads none
-
-mismatch_ratio 2.0;
-
-graph_read_corr
-{
- enable false
- output_dir corrected_contigs/
- binary true
-}
-
-sc_cor
-{
- scaffolds_file scaffolds.fasta
- output_unfilled true
- max_insert 100
- max_cut_length 50
-}
-
-
-bwa_aligner
-{
- enabled false
- debug false
- path_to_bwa ./bin/bwa-spades
- min_contig_len 0
-}
-
-;flanking coverage range
-flanking_range 55
diff --git a/configs/debruijn/construction.info.template b/configs/debruijn/construction.info.template
deleted file mode 100644
index f3d1b2c..0000000
--- a/configs/debruijn/construction.info.template
+++ /dev/null
@@ -1,23 +0,0 @@
-; construction
-
-construction
-{
- ; mode of construction: extension (construct hash map of kmers to extentions), old (construct set of k+1-mers)
- mode extension
-
- ; enable keeping in graph perfect cycles. This slows down condensing but some plasmids can be lost if this is turned off.
- keep_perfect_loops true
-
- ; size of buffer for each thread in MB, 0 for autodetection
- read_buffer_size 0
-
- early_tip_clipper
- {
- ; tip clipper can be enabled only in extension mode
- enable true
-
- ; optional parameter. By default tips of length rl-k are removed
-; length_bound 10
- }
-}
-
diff --git a/configs/debruijn/detail_info_printer.info b/configs/debruijn/detail_info_printer.info
index e19cf36..c055798 100644
--- a/configs/debruijn/detail_info_printer.info
+++ b/configs/debruijn/detail_info_printer.info
@@ -3,6 +3,7 @@ info_printers
default
{
basic_stats false
+ lib_info false
save_full_graph false
extended_stats false
detailed_dot_write false
diff --git a/configs/debruijn/detail_info_printer.info.template b/configs/debruijn/detail_info_printer.info.template
deleted file mode 100644
index e19cf36..0000000
--- a/configs/debruijn/detail_info_printer.info.template
+++ /dev/null
@@ -1,43 +0,0 @@
-info_printers
-{
- default
- {
- basic_stats false
- save_full_graph false
- extended_stats false
- detailed_dot_write false
- write_components false
- components_for_genome_pos "" ; (k+1)-mers starting on this positions will be investigated
- components_for_kmer ""
- write_components_along_genome false
- write_components_along_contigs false
- write_error_loc false
- write_full_graph false
- write_full_nc_graph false
- }
-
- before_first_gap_closer
- {
- }
-
- before_simplification
- {
- }
-
- before_post_simplification
- {
- }
-
- final_simplified
- {
- }
-
- final_gap_closed
- {
- }
-
- before_repeat_resolution
- {
- }
-
-}
diff --git a/configs/debruijn/diploid_mode.info b/configs/debruijn/diploid_mode.info
new file mode 100644
index 0000000..1044227
--- /dev/null
+++ b/configs/debruijn/diploid_mode.info
@@ -0,0 +1,16 @@
+mode diploid
+
+simp
+{
+ post_simplif_enabled false
+
+ ; bulge remover:
+ br
+ {
+ enabled false
+ }
+}
+
+amb_de {
+ enabled true
+}
diff --git a/configs/debruijn/distance_estimation.info b/configs/debruijn/distance_estimation.info
index 949216b..3761b05 100644
--- a/configs/debruijn/distance_estimation.info
+++ b/configs/debruijn/distance_estimation.info
@@ -1,14 +1,6 @@
; distance estimator:
-sc_de
-{
- linkage_distance_coeff 0.0
- max_distance_coeff 2.0
- max_distance_coeff_scaff 2000.0
- filter_threshold 2.0
-}
-
-usual_de
+de
{
linkage_distance_coeff 0.0
max_distance_coeff 2.0
@@ -16,47 +8,8 @@ usual_de
filter_threshold 2.0
}
-old_sc_de
-{
- linkage_distance_coeff 0.3
- max_distance_coeff 2.0
- max_distance_coeff_scaff 2000.0
- filter_threshold 10.0 ;bigger than in non-single cell because normalization is disabled
-}
-
-old_usual_de
-{
- linkage_distance_coeff 0.3
- max_distance_coeff 2.0
- max_distance_coeff_scaff 2000.0
- filter_threshold 0.2
-}
-
-; advanced distance estimator:
-
-sc_ade
-{
- ;data dividing
- threshold 80 ;maximal distance between two points in cluster
-
- ;local maximum seeking
- range_coeff 0.2 ;data_length*range_coeff := width of the averaging window
- delta_coeff 0.4 ;data_length*delta_coeff := maximal difference between possible distance and real peak on the graph
-
- ;fft smoothing
- percentage 0.01 ;percent of data for baseline subraction
- cutoff 3 ;the number of the lowest freqs in fourier decomp being taken
-
- ;other
- min_peak_points 3 ;the minimal number of points in cluster to be considered
- inv_density 5.0 ;maximal inverse density of points in cluster to be considered
-
- ;hard_mode arguments
- derivative_threshold 0.2 ;threshold for derivative in hard mode
-
-}
-usual_ade
+ade
{
;data dividing
threshold 80 ;maximal distance between two points in cluster
@@ -80,10 +33,10 @@ usual_ade
; ambiguous pair info checker parameters
amb_de {
- enabled false; true
- haplom_threshold 500
- relative_length_threshold 0.8
- relative_seq_threshold 0.5
+ enabled false
+ haplom_threshold 500
+ relative_length_threshold 0.8
+ relative_seq_threshold 0.5
}
sensitive_mapper {
diff --git a/configs/debruijn/distance_estimation.info.template b/configs/debruijn/distance_estimation.info.template
deleted file mode 100644
index 949216b..0000000
--- a/configs/debruijn/distance_estimation.info.template
+++ /dev/null
@@ -1,91 +0,0 @@
-; distance estimator:
-
-sc_de
-{
- linkage_distance_coeff 0.0
- max_distance_coeff 2.0
- max_distance_coeff_scaff 2000.0
- filter_threshold 2.0
-}
-
-usual_de
-{
- linkage_distance_coeff 0.0
- max_distance_coeff 2.0
- max_distance_coeff_scaff 2000.0
- filter_threshold 2.0
-}
-
-old_sc_de
-{
- linkage_distance_coeff 0.3
- max_distance_coeff 2.0
- max_distance_coeff_scaff 2000.0
- filter_threshold 10.0 ;bigger than in non-single cell because normalization is disabled
-}
-
-old_usual_de
-{
- linkage_distance_coeff 0.3
- max_distance_coeff 2.0
- max_distance_coeff_scaff 2000.0
- filter_threshold 0.2
-}
-
-; advanced distance estimator:
-
-sc_ade
-{
- ;data dividing
- threshold 80 ;maximal distance between two points in cluster
-
- ;local maximum seeking
- range_coeff 0.2 ;data_length*range_coeff := width of the averaging window
- delta_coeff 0.4 ;data_length*delta_coeff := maximal difference between possible distance and real peak on the graph
-
- ;fft smoothing
- percentage 0.01 ;percent of data for baseline subraction
- cutoff 3 ;the number of the lowest freqs in fourier decomp being taken
-
- ;other
- min_peak_points 3 ;the minimal number of points in cluster to be considered
- inv_density 5.0 ;maximal inverse density of points in cluster to be considered
-
- ;hard_mode arguments
- derivative_threshold 0.2 ;threshold for derivative in hard mode
-
-}
-
-usual_ade
-{
- ;data dividing
- threshold 80 ;maximal distance between two points in cluster
-
- ;local maximum seeking
- range_coeff 0.2 ;data_length*range_coeff := width of the averaging window
- delta_coeff 0.4 ;data_length*delta_coeff := maximal difference between possible distance and real peak on the graph
-
- ;fft smoothing
- percentage 0.01 ;percent of data for baseline subraction
- cutoff 3 ;the number of the lowest freqs in fourier decomp being taken
-
- ;other
- min_peak_points 3 ;the minimal number of points in cluster to be considered
- inv_density 5.0 ;maximal inverse density of points in cluster to be considered
-
- ;hard_mode arguments
- derivative_threshold 0.2 ;threshold for derivative in hard mode
-
-}
-
-; ambiguous pair info checker parameters
-amb_de {
- enabled false; true
- haplom_threshold 500
- relative_length_threshold 0.8
- relative_seq_threshold 0.5
-}
-
-sensitive_mapper {
- k 19
-}
diff --git a/configs/debruijn/log.properties.template b/configs/debruijn/log.properties.template
deleted file mode 100644
index a4052e3..0000000
--- a/configs/debruijn/log.properties.template
+++ /dev/null
@@ -1,52 +0,0 @@
-default=INFO
-
-#RelativeCoverageHelper=TRACE
-#RelativelyLowCoveredComponentSearcher=TRACE
-#RelativelyLowCoveredComponentChecker=TRACE
-#RelativeCoverageComponentRemover=TRACE
-#FlankingCoverage=TRACE
-#PolymorphicBulgeRemover=TRACE
-#BulgeSplitter=TRACE
-#SubpathSplitter=TRACE
-#ComplexBulgeGluer=TRACE
-#GluingVericesDefiner=TRACE
-#GluingVericesDefinerResults=TRACE
-
-#TwoStepAlgorithmRunner=TRACE
-#AlgorithmRunner=TRACE
-#DeBruijnGraphConstructor=TRACE
-#PairedHandlerApplier=TRACE
-#QualityEdgeLocalityPrintingRH=TRACE
-#PairInfoAwareErroneousEdgeRemover=TRACE
-#QualityLoggingRemovalHandler=TRACE
-#MatePairTransformStat=TRACE
-#EditDistanceTrackingCallback=TRACE
-#RepeatResolver=TRACE
-#PairInfoImprover=TRACE
-#BulgeRemover=TRACE
-#AbstractConjugateGraph=TRACE
-#PathProcessor=TRACE
-#DistanceEstimationQualityStat=TRACE
-#Dijkstra=TRACE
-#AbstractGraph=TRACE
-#PathSetGraphConstructor=TRACE
-#NewExtendedSequenceMapper=TRACE
-#JumpingPairInfoChecker=TRACE
-
-#PathExtender=DEBUG
-#BidirectionalPath=DEBUG
-#NextPathSearcher=DEBUG
-#ExtensionChooser=DEBUG
-#WeightCounter=DEBUG
-#PathExtendIO=DEBUG
-#PathExtendPI=DEBUG
-#LoopTraverser=DEBUG
-#PEResolver=DEBUG
-#ExtensionChooser2015=DEBUG
-#ScaffoldingUniqueEdgeStorage=DEBUG
-#ScaffoldingUniqueEdgeAnalyzer=DEBUG
-#LoopDetectingPathExtender=DEBUG
-#SimpleExtender=DEBUG
-#ScaffoldingPathExtender=DEBUG
-
-#BWAPairInfo=TRACE
diff --git a/configs/debruijn/mda_mode.info b/configs/debruijn/mda_mode.info
new file mode 100644
index 0000000..c98df33
--- /dev/null
+++ b/configs/debruijn/mda_mode.info
@@ -0,0 +1,120 @@
+mode mda
+
+simp
+{
+ ; enable advanced ec removal algo
+ topology_simplif_enabled true
+
+ ; tip clipper:
+ tc
+ {
+ ; rctc: tip_cov < rctc * not_tip_cov
+ ; tc_lb: max_tip_length = max((min(k, read_length / 2) * tc_lb), read_length);
+ condition "{ tc_lb 3.5, cb 1000000, rctc 2.0 }"
+ }
+
+ ; erroneous connections remover:
+ ec
+ {
+ ; ec_lb: max_ec_length = k + ec_lb
+ ; icb: iterative coverage bound
+ ; condition "{ ec_lb 30, icb 20.0 }"
+ condition "{ ec_lb 30, icb auto }"
+ }
+
+ final_tc
+ {
+ condition "{ tc_lb 3.5, cb 100000, rctc 10000 }"
+ }
+
+ ; bulge remover:
+ final_br
+ {
+ enabled true
+ max_coverage 1000000.0
+ max_relative_coverage 100000. ; bulge_cov < this * not_bulge_cov
+ }
+
+ ; relative coverage erroneous component remover:
+ rcc
+ {
+ enabled true
+ coverage_gap 20.
+ max_length_coeff 2.0
+ max_length_with_tips_coeff 3.0
+ max_vertex_cnt 30
+ max_ec_length_coefficient 30
+ max_coverage_coeff 5.0
+ }
+
+ tec
+ {
+ max_ec_length_coefficient 55 ; max_ec_length = k + max_ec_length_coefficient
+ uniqueness_length 1500
+ plausibility_length 200
+ }
+
+ ; topology and reliability based erroneous connection remover
+ trec
+ {
+ max_ec_length_coefficient 100 ; max_ec_length = k + max_ec_length_coefficient
+ uniqueness_length 1500
+ unreliable_coverage 2.5
+ }
+
+ ; topology tip clipper:
+ ttc
+ {
+ length_coeff 3.5
+ plausibility_length 250
+ uniqueness_length 1500
+ }
+
+ ; complex bulge remover
+ cbr
+ {
+ enabled true
+ }
+
+ ; hidden ec remover
+ her
+ {
+ enabled true
+ uniqueness_length 1500
+ unreliability_threshold 0.2
+ relative_threshold 5
+ }
+
+ init_clean
+ {
+ activation_cov -1.
+ ier
+ {
+ enabled false
+ }
+
+ tip_condition ""
+ ec_condition ""
+ }
+}
+
+pe {
+params {
+ normalize_weight true
+
+ ; extension selection
+ extension_options
+ {
+ use_default_single_threshold false
+ single_threshold 0.001
+ weight_threshold 0.6
+ max_repeat_length 8000
+ }
+}
+
+long_reads {
+ pacbio_reads {
+ unique_edge_priority 10000.0
+ }
+}
+}
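
The tc and ec comments above state the length bounds explicitly; a quick worked example using K 55 from config.info and an assumed read length of 100:

# Worked example of the bounds quoted in the tc and ec comments above.
K, read_length = 55, 100          # read_length 100 is a hypothetical value
tc_lb, ec_lb = 3.5, 30
max_tip_length = max(min(K, read_length / 2) * tc_lb, read_length)   # max(50 * 3.5, 100) = 175
max_ec_length = K + ec_lb                                            # 55 + 30 = 85
print(max_tip_length, max_ec_length)
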
diff --git a/configs/debruijn/meta_mode.info b/configs/debruijn/meta_mode.info
new file mode 100644
index 0000000..5462e69
--- /dev/null
+++ b/configs/debruijn/meta_mode.info
@@ -0,0 +1,171 @@
+mode meta
+
+; two-step pipeline
+two_step_rr true
+; enables/disables usage of intermediate contigs in two-step pipeline
+use_intermediate_contigs true
+
+;if true simple mismatches are corrected
+correct_mismatches false
+
+;flanking coverage range
+flanking_range 30
+
+simp
+{
+ cycle_iter_count 3
+
+ ; enable advanced ec removal algo
+ topology_simplif_enabled false
+
+ ; erroneous connections remover:
+ ec
+ {
+ ; ec_lb: max_ec_length = k + ec_lb
+ ; icb: iterative coverage bound
+ ; condition "{ ec_lb 30, icb 20.0 }"
+ condition "{ ec_lb 30, icb 2.5 }"
+ }
+
+ ; tip clipper:
+ tc
+ {
+ ; rctc: tip_cov < rctc * not_tip_cov
+ ; tc_lb: max_tip_length = max((min(k, read_length / 2) * tc_lb), read_length);
+       condition               "{ tc_lb 3.5, cb 1000000, rctc 2.0 } { tc_lb 6., cb 2.5, rctc 1.0 }"
+ }
+
+ ; relative coverage erroneous component remover:
+ rcc
+ {
+ enabled true
+ coverage_gap 5.
+ max_length_coeff 3.0
+ max_length_with_tips_coeff 5.0
+ max_vertex_cnt 100
+ max_ec_length_coefficient 300
+ max_coverage_coeff -1.0
+ }
+
+ ; complex tip clipper
+ complex_tc
+ {
+ enabled true
+ }
+
+
+
+ ; relative edge disconnector:
+ relative_ed
+ {
+ enabled true
+ diff_mult 10.
+ }
+
+ ; bulge remover:
+ br
+ {
+ max_coverage 1000000.0
+ max_relative_coverage 100000. ; bulge_cov < this * not_bulge_cov
+ max_delta 10
+ max_relative_delta 0.1
+ parallel true
+ }
+
+ ; final tip clipper:
+ final_tc
+ {
+ ; rctc: tip_cov < rctc * not_tip_cov
+ ; tc_lb: max_tip_length = max((min(k, read_length / 2) * tc_lb), read_length);
+ condition "{ lb 500, cb 3., rctc 0.7 } { lb 1500, cb 20., rctc 0.2 }"
+ }
+
+ ; final bulge remover:
+ final_br
+ {
+ enabled true
+ main_iteration_only true
+ max_bulge_length_coefficient 50. ; max_bulge_length = max_bulge_length_coefficient * k
+ max_coverage 1000000.0
+ max_relative_coverage 0.5 ; bulge_cov < this * not_bulge_cov
+ max_delta 50
+ max_relative_delta 0.1
+ }
+
+ ; second final bulge remover:
+ ; only in meta mode, inherits settings of final_br
+ second_final_br
+ {
+ max_delta 1500
+ max_number_edges 3
+ }
+
+ init_clean
+ {
+ early_it_only true
+ ier {
+ enabled true
+ }
+ tip_condition "{ tc_lb 3.5, cb 2.0 }"
+ ec_condition "{ ec_lb 10, cb 0.5 }"
+ }
+
+}
+
+;FIXME rename
+preliminary_simp
+{
+ init_clean
+ {
+ self_conj_condition "{ ec_lb 100, cb 20.0 }"
+ early_it_only false
+ ier
+ {
+ enabled true
+ }
+ tip_condition "{ rlmk, cb 1.2, mmm 2 }"
+ ec_condition "{ ec_lb 0, cb 0.9 }"
+ disconnect_flank_cov 0.8
+ }
+
+ post_simplif_enabled false
+
+ ; bulge remover:
+ br
+ {
+ max_coverage 1000000.0
+ max_relative_coverage 0.5 ; bulge_cov < this * not_bulge_cov
+ max_delta 10
+ max_relative_delta 0.1
+ }
+
+}
+
+;NB descends from sc_pe
+pe {
+params {
+ remove_overlaps true
+ cut_all_overlaps true
+
+ ;TODO proper configuration of different extenders is not supported
+    ;TODO most settings are hardcoded for now
+
+ ;normalize_weight NA
+ extension_options
+ {
+ ;use_default_single_threshold NA
+ ;single_threshold NA
+ weight_threshold 0.6
+ max_repeat_length 1000000
+ }
+
+ use_coordinated_coverage true
+}
+}
+
+prelim_pe {
+params {
+ use_coordinated_coverage false
+ remove_overlaps false
+}
+}
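
Throughout these mode files the tip- and ec-removal conditions are strings of one or more brace-delimited clauses of "key value" pairs (tc_lb, cb, rctc, ...). Treating each clause as an alternative rule set is an assumption; the sketch below only shows how such a string could be parsed, not how SPAdes itself applies it.

import re

def parse_condition(cond):
    # Split "{ ... } { ... }" into a list of {key: value} dicts.
    clauses = []
    for group in re.findall(r"\{([^}]*)\}", cond):
        tokens = group.replace(",", " ").split()
        clauses.append(dict(zip(tokens[0::2], tokens[1::2])))
    return clauses

print(parse_condition("{ lb 500, cb 3., rctc 0.7 } { lb 1500, cb 20., rctc 0.2 }"))
# [{'lb': '500', 'cb': '3.', 'rctc': '0.7'}, {'lb': '1500', 'cb': '20.', 'rctc': '0.2'}]
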
diff --git a/configs/debruijn/moleculo_mode.info b/configs/debruijn/moleculo_mode.info
new file mode 100644
index 0000000..40c2a54
--- /dev/null
+++ b/configs/debruijn/moleculo_mode.info
@@ -0,0 +1,127 @@
+mode moleculo
+
+simp
+{
+ ; enable advanced ec removal algo
+ topology_simplif_enabled false
+
+ ; tip clipper:
+ tc
+ {
+ ; rctc: tip_cov < rctc * not_tip_cov
+ ; tc_lb: max_tip_length = max((min(k, read_length / 2) * tc_lb), read_length);
+ condition "{ tc_lb 2.5, cb 3, rctc 10000 } { tc_lb 4.5, mmm 2 }"
+ }
+
+ ; bulge remover:
+ br
+ {
+ max_coverage 3
+ max_relative_coverage 100000. ; bulge_cov < this * not_bulge_cov
+ }
+
+ ; erroneous connections remover:
+ ec
+ {
+ ; ec_lb: max_ec_length = k + ec_lb
+ ; icb: iterative coverage bound
+ ; condition "{ ec_lb 30, icb 20.0 }"
+ condition "{ ec_lb 30, icb 3.1 }"
+ }
+
+ ; relative coverage erroneous component remover:
+ rcc
+ {
+ enabled true
+ coverage_gap 20.
+ max_length_coeff 2.0
+ max_length_with_tips_coeff 3.0
+ max_vertex_cnt 30
+ max_ec_length_coefficient 30
+ max_coverage_coeff 5.0
+ }
+
+ tec
+ {
+ max_ec_length_coefficient 55 ; max_ec_length = k + max_ec_length_coefficient
+ uniqueness_length 1500
+ plausibility_length 200
+ }
+
+ ; topology and reliability based erroneous connection remover
+ trec
+ {
+ max_ec_length_coefficient 100 ; max_ec_length = k + max_ec_length_coefficient
+ uniqueness_length 1500
+ unreliable_coverage 2.5
+ }
+
+ ; topology tip clipper:
+ ttc
+ {
+ length_coeff 3.5
+ plausibility_length 250
+ uniqueness_length 1500
+ }
+
+ ; complex bulge remover
+ cbr
+ {
+ enabled true
+ pics_enabled 0
+ folder complex_br_components
+ max_relative_length 5.
+ max_length_difference 5
+ }
+
+ ; hidden ec remover
+ her
+ {
+ enabled true
+ uniqueness_length 1500
+ unreliability_threshold 0.2
+ relative_threshold 5
+ }
+
+ init_clean
+ {
+ early_it_only true
+
+ activation_cov -1.
+ ier
+ {
+ enabled false
+ }
+
+ tip_condition ""
+ ec_condition ""
+ }
+}
+
+pe {
+params {
+ normalize_weight true
+ cut_all_overlaps true
+
+ ; extension selection
+ extension_options
+ {
+ use_default_single_threshold false
+ single_threshold 0.001
+ weight_threshold 0.6
+ }
+
+ scaffolder {
+ short_overlap 10
+ use_la_gap_joiner false
+ }
+}
+}
+
+sc_cor
+{
+ scaffolds_file scaffolds.fasta
+ output_unfilled true
+ max_insert 100
+ max_cut_length 50
+}
diff --git a/configs/debruijn/path_extend/pe_params.info b/configs/debruijn/path_extend/pe_params.info
deleted file mode 100644
index 279c8e4..0000000
--- a/configs/debruijn/path_extend/pe_params.info
+++ /dev/null
@@ -1,186 +0,0 @@
-default_pe {
-
-; output options
-
-debug_output false
-
-output {
- write_overlaped_paths true
- write_paths true
-}
-
-visualize {
- print_overlaped_paths true
- print_paths true
-}
-
-; none | break_gaps | break_all
-output_broken_scaffolds break_gaps
-
-params {
- ; old | 2015 | combined | old_pe_2015
- scaffolding_mode old
-
- split_edge_length 99
- normalize_weight false
- cut_all_overlaps false
-
- ; extension selection
- extension_options
- {
- use_default_single_threshold false
- single_threshold 1.75676
-; A.mirum threshold 0.076
-; E.coli RL36 threshold 0.717949
-; E.coli IS220 threshold 1.75676
- weight_threshold 0.5
- priority_coeff 1.5
- }
-
- mate_pair_options
- {
- use_default_single_threshold true
- single_threshold 30
- weight_threshold 0.5
- priority_coeff 1.5
- }
-
- scaffolder {
- on true
- cutoff 2
- rel_cutoff 0.1
- sum_threshold 3
-
- cluster_info true
- cl_threshold 0
-
- fix_gaps true
- use_la_gap_joiner true
- ;next param should be 0.51 - 1.0 if use_old_score = true and 3.0 otherwise
- min_gap_score 0.7
-
- max_must_overlap -2
- max_can_overlap 0.5
- short_overlap 6
- artificial_gap 10
- use_old_score true
-
- min_overlap_length 10
- flank_addition_coefficient -5.9
- flank_multiplication_coefficient 0.97
- }
-
- loop_removal
- {
- max_loops 10
- mp_max_loops 10
- }
-
- remove_overlaps true
- use_coordinated_coverage false
- coordinated_coverage
- {
- max_edge_length_repeat 300
- delta 0.4
- }
-
- scaffolding2015 {
- autodetect true
- min_unique_length 10000
- unique_coverage_variation 0.5
- ; (median * (1+variation) > unique > median * (1 - variation))
- }
-
- scaffold_graph {
- construct false
- output false
- min_read_count 20
- graph_connectivity false
- max_path_length 10000
- }
-}
-
-
-long_reads {
- pacbio_reads {
- filtering 2.5
- weight_priority 1.2
- unique_edge_priority 5.0
- }
-
- single_reads {
- filtering 1.25
- weight_priority 5.0
- unique_edge_priority 1000.0
- }
-
- coverage_base_rr {
- filtering 0.0
- weight_priority 1.5
- unique_edge_priority 2.0
- }
-}
-}
-
-sc_pe {
-params {
- normalize_weight true
-
- ; extension selection
- extension_options
- {
- use_default_single_threshold false
- single_threshold 0.001
- weight_threshold 0.6
- }
-
-}
-}
-
-moleculo_pe {
-params {
- normalize_weight true
- cut_all_overlaps true
-
- ; extension selection
- extension_options
- {
- use_default_single_threshold false
- single_threshold 0.001
- weight_threshold 0.6
- }
-
- scaffolder {
- short_overlap 10
- use_la_gap_joiner false
- }
-}
-}
-
-;NB decsends from sc_pe
-meta_pe {
-params {
- remove_overlaps true
- cut_all_overlaps true
-
- ;TODO proper configuration of different extenders is not supported
- ;TODO most settings ard hardcoded for now
-
- ;normalize_weight NA
- extension_options
- {
- ;use_default_single_threshold NA
- ;single_threshold NA
- weight_threshold 0.6
- }
-
- use_coordinated_coverage true
-}
-}
-
-prelim_pe {
-params {
- use_coordinated_coverage false
- remove_overlaps false
-}
-}
diff --git a/configs/debruijn/path_extend/pe_params.info.template b/configs/debruijn/path_extend/pe_params.info.template
deleted file mode 100644
index 279c8e4..0000000
--- a/configs/debruijn/path_extend/pe_params.info.template
+++ /dev/null
@@ -1,186 +0,0 @@
-default_pe {
-
-; output options
-
-debug_output false
-
-output {
- write_overlaped_paths true
- write_paths true
-}
-
-visualize {
- print_overlaped_paths true
- print_paths true
-}
-
-; none | break_gaps | break_all
-output_broken_scaffolds break_gaps
-
-params {
- ; old | 2015 | combined | old_pe_2015
- scaffolding_mode old
-
- split_edge_length 99
- normalize_weight false
- cut_all_overlaps false
-
- ; extension selection
- extension_options
- {
- use_default_single_threshold false
- single_threshold 1.75676
-; A.mirum threshold 0.076
-; E.coli RL36 threshold 0.717949
-; E.coli IS220 threshold 1.75676
- weight_threshold 0.5
- priority_coeff 1.5
- }
-
- mate_pair_options
- {
- use_default_single_threshold true
- single_threshold 30
- weight_threshold 0.5
- priority_coeff 1.5
- }
-
- scaffolder {
- on true
- cutoff 2
- rel_cutoff 0.1
- sum_threshold 3
-
- cluster_info true
- cl_threshold 0
-
- fix_gaps true
- use_la_gap_joiner true
- ;next param should be 0.51 - 1.0 if use_old_score = true and 3.0 otherwise
- min_gap_score 0.7
-
- max_must_overlap -2
- max_can_overlap 0.5
- short_overlap 6
- artificial_gap 10
- use_old_score true
-
- min_overlap_length 10
- flank_addition_coefficient -5.9
- flank_multiplication_coefficient 0.97
- }
-
- loop_removal
- {
- max_loops 10
- mp_max_loops 10
- }
-
- remove_overlaps true
- use_coordinated_coverage false
- coordinated_coverage
- {
- max_edge_length_repeat 300
- delta 0.4
- }
-
- scaffolding2015 {
- autodetect true
- min_unique_length 10000
- unique_coverage_variation 0.5
- ; (median * (1+variation) > unique > median * (1 - variation))
- }
-
- scaffold_graph {
- construct false
- output false
- min_read_count 20
- graph_connectivity false
- max_path_length 10000
- }
-}
-
-
-long_reads {
- pacbio_reads {
- filtering 2.5
- weight_priority 1.2
- unique_edge_priority 5.0
- }
-
- single_reads {
- filtering 1.25
- weight_priority 5.0
- unique_edge_priority 1000.0
- }
-
- coverage_base_rr {
- filtering 0.0
- weight_priority 1.5
- unique_edge_priority 2.0
- }
-}
-}
-
-sc_pe {
-params {
- normalize_weight true
-
- ; extension selection
- extension_options
- {
- use_default_single_threshold false
- single_threshold 0.001
- weight_threshold 0.6
- }
-
-}
-}
-
-moleculo_pe {
-params {
- normalize_weight true
- cut_all_overlaps true
-
- ; extension selection
- extension_options
- {
- use_default_single_threshold false
- single_threshold 0.001
- weight_threshold 0.6
- }
-
- scaffolder {
- short_overlap 10
- use_la_gap_joiner false
- }
-}
-}
-
-;NB decsends from sc_pe
-meta_pe {
-params {
- remove_overlaps true
- cut_all_overlaps true
-
- ;TODO proper configuration of different extenders is not supported
- ;TODO most settings ard hardcoded for now
-
- ;normalize_weight NA
- extension_options
- {
- ;use_default_single_threshold NA
- ;single_threshold NA
- weight_threshold 0.6
- }
-
- use_coordinated_coverage true
-}
-}
-
-prelim_pe {
-params {
- use_coordinated_coverage false
- remove_overlaps false
-}
-}
diff --git a/configs/debruijn/pe_params.info b/configs/debruijn/pe_params.info
new file mode 100644
index 0000000..405ea42
--- /dev/null
+++ b/configs/debruijn/pe_params.info
@@ -0,0 +1,138 @@
+pe {
+
+; output options
+
+debug_output false
+
+output {
+ write_overlaped_paths true
+ write_paths true
+}
+
+visualize {
+ print_overlaped_paths true
+ print_paths true
+}
+
+; none | break_gaps | break_all
+output_broken_scaffolds break_gaps
+
+params {
+ multi_path_extend false
+ ; old | 2015 | combined | old_pe_2015
+ scaffolding_mode old
+
+ remove_overlaps true
+ cut_all_overlaps false
+
+ split_edge_length 99
+ normalize_weight false
+
+ ; extension selection
+ extension_options
+ {
+ use_default_single_threshold false
+ single_threshold 1.75676
+ weight_threshold 0.5
+ priority_coeff 1.5
+ max_repeat_length 8000
+ }
+
+ mate_pair_options
+ {
+ use_default_single_threshold true
+ single_threshold 30
+ weight_threshold 0.5
+ priority_coeff 1.5
+ max_repeat_length 8000
+ }
+
+ scaffolder {
+ on true
+ cutoff 2
+ rel_cutoff 0.1
+ sum_threshold 3
+
+ cluster_info true
+ cl_threshold 0
+
+ fix_gaps true
+ use_la_gap_joiner true
+ ;next param should be 0.51 - 1.0 if use_old_score = true and 3.0 otherwise
+ min_gap_score 0.7
+
+ max_must_overlap -2
+ max_can_overlap 0.5
+ short_overlap 6
+ artificial_gap 10
+ use_old_score true
+
+ min_overlap_length 10
+ flank_addition_coefficient -5.9
+ flank_multiplication_coefficient 0.97
+ }
+
+ loop_removal
+ {
+ max_loops 10
+ mp_max_loops 10
+ }
+
+ use_coordinated_coverage false
+ coordinated_coverage
+ {
+ max_edge_length_repeat 300
+ delta 0.5
+ min_path_len 1000
+ }
+
+ scaffolding2015 {
+ autodetect true
+ min_unique_length 10000
+ unique_coverage_variation 0.5
+ ; (median * (1+variation) > unique > median * (1 - variation))
+ }
+
+ scaffold_graph {
+ construct false
+ output false
+        always_add           40    ; connections with read count >= always_add are always added to the graph
+        never_add            5     ; connections with read count < never_add are never added to the graph
+        relative_threshold   0.25  ; connections with read count >= max_read_count * relative_threshold are added to the graph if they satisfy the conditions above, max_read_count is calculated among all alternatives
+ graph_connectivity false
+ max_path_length 10000
+ }
+}
+
+
+long_reads {
+ pacbio_reads {
+ filtering 2.5
+ weight_priority 1.2
+ unique_edge_priority 5.0
+ min_significant_overlap 0
+ }
+
+ single_reads {
+ filtering 1.25
+ weight_priority 5.0
+ unique_edge_priority 10000.0
+ min_significant_overlap 0
+ }
+
+ contigs {
+ filtering 0.0
+ weight_priority 1.5
+ unique_edge_priority 2.0
+ min_significant_overlap 0
+ }
+
+ meta_untrusted_contigs {
+ filtering 0.0
+ weight_priority 10000.0
+ unique_edge_priority 10000.0
+ min_significant_overlap 200
+ }
+
+}
+}
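
The scaffold_graph comments above spell out a three-way rule for adding connections; the sketch below implements exactly those documented thresholds (always_add 40, never_add 5, relative_threshold 0.25). The function name and the strictness of the comparisons are assumptions.

def keep_connection(read_count, max_read_count,
                    always_add=40, never_add=5, relative_threshold=0.25):
    if read_count >= always_add:
        return True                    # always added to the graph
    if read_count < never_add:
        return False                   # never added to the graph
    return read_count >= max_read_count * relative_threshold

print(keep_connection(42, 200))   # True:  42 >= always_add
print(keep_connection(3, 200))    # False: 3 < never_add
print(keep_connection(30, 100))   # True:  30 >= 100 * 0.25
print(keep_connection(10, 100))   # False: 10 < 100 * 0.25
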
diff --git a/configs/debruijn/plasmid_mode.info b/configs/debruijn/plasmid_mode.info
new file mode 100644
index 0000000..2cd9a84
--- /dev/null
+++ b/configs/debruijn/plasmid_mode.info
@@ -0,0 +1,12 @@
+mode plasmid
+
+plasmid
+{
+ long_edge_length 1000
+ edge_length_for_median 10000
+ relative_coverage 0.3
+ small_component_size 10000
+ small_component_relative_coverage 2
+ min_component_length 10000
+ min_isolated_length 1000
+}
diff --git a/configs/debruijn/rna_mode.info b/configs/debruijn/rna_mode.info
new file mode 100644
index 0000000..727e22c
--- /dev/null
+++ b/configs/debruijn/rna_mode.info
@@ -0,0 +1,75 @@
+mode rna
+
+simp
+{
+ ; enable advanced ec removal algo
+ topology_simplif_enabled false
+ tc
+ {
+ ; rctc: tip_cov < rctc * not_tip_cov
+ ; tc_lb: max_tip_length = max((min(k, read_length / 2) * tc_lb), read_length);
+ condition "{ mmm 3 tc_lb 3.5, cb 100000, rctc 0.1 } { tc_lb 3.5, cb 4, rctc 10000 } { tc_lb 0.1, cb 20, rctc 10000 }"
+ }
+ ; bulge remover:
+ br
+ {
+ max_additive_length_coefficient 100
+ max_coverage 1000000.0
+ max_relative_coverage 100000.0 ; bulge_cov < this * not_bulge_cov
+ }
+ ; erroneous connections remover:
+ ec
+ {
+ ; ec_lb: max_ec_length = k + ec_lb
+ ; icb: iterative coverage bound
+ ; to_ec_lb: max_ec_length = 2*tip_length(to_ec_lb) - 1
+ ; condition "{ ec_lb 9, icb 40.0 }"
+ condition "{ ec_lb 30, icb auto }"
+ }
+ rcc
+ {
+ enabled true
+ coverage_gap 20.
+ max_length_coeff 2.0
+ max_length_with_tips_coeff 3.0
+ max_vertex_cnt 30
+ max_ec_length_coefficient 30
+ max_coverage_coeff 5.0
+ }
+ ;all topology based erroneous connection removers are off
+ ier
+ {
+ enabled true
+ max_length 200
+ max_coverage 4
+ max_length_any_cov 0 ; will be taken max with read_length
+ }
+ ; hidden ec remover
+ her
+ {
+ enabled true
+ uniqueness_length 1500
+ unreliability_threshold 0.2
+ relative_threshold 5
+ }
+
+ init_clean
+ {
+ activation_cov -1.
+ ier
+ {
+ enabled false
+ }
+
+ tip_condition ""
+ ec_condition ""
+ }
+
+}
+
+pe {
+params {
+ multi_path_extend true
+ remove_overlaps false
+}
+}
diff --git a/configs/debruijn/simplification.info b/configs/debruijn/simplification.info
index 7f6768d..6e05f34 100644
--- a/configs/debruijn/simplification.info
+++ b/configs/debruijn/simplification.info
@@ -1,6 +1,6 @@
; simplification
-default
+simp
{
; number of iterations in basic simplification cycle
cycle_iter_count 10
@@ -150,8 +150,11 @@ default
; complex tip clipper
complex_tc
{
- enabled 0
- }
+ enabled false
+ max_relative_coverage -1
+ max_edge_len 100
+ condition "{ tc_lb 3.5 }"
+ }
; complex bulge remover
cbr
@@ -194,368 +197,3 @@ default
}
}
-
-sc
-{
- ; enable advanced ec removal algo
- topology_simplif_enabled true
-
- ; tip clipper:
- tc
- {
- ; rctc: tip_cov < rctc * not_tip_cov
- ; tc_lb: max_tip_length = max((min(k, read_length / 2) * tc_lb), read_length);
- condition "{ tc_lb 3.5, cb 1000000, rctc 2.0 }"
- }
-
- ; erroneous connections remover:
- ec
- {
- ; ec_lb: max_ec_length = k + ec_lb
- ; icb: iterative coverage bound
- ; condition "{ ec_lb 30, icb 20.0 }"
- condition "{ ec_lb 30, icb auto }"
- }
-
- final_tc
- {
- condition "{ tc_lb 3.5, cb 100000, rctc 10000 }"
- }
-
- ; bulge remover:
- final_br
- {
- enabled true
- max_coverage 1000000.0
- max_relative_coverage 100000. ; bulge_cov < this * not_bulge_cov
- }
-
- ; relative coverage erroneous component remover:
- rcc
- {
- enabled true
- coverage_gap 20.
- max_length_coeff 2.0
- max_length_with_tips_coeff 3.0
- max_vertex_cnt 30
- max_ec_length_coefficient 30
- max_coverage_coeff 5.0
- }
-
- tec
- {
- max_ec_length_coefficient 55 ; max_ec_length = k + max_ec_length_coefficient
- uniqueness_length 1500
- plausibility_length 200
- }
-
- ; topology and reliability based erroneous connection remover
- trec
- {
- max_ec_length_coefficient 100 ; max_ec_length = k + max_ec_length_coefficient
- uniqueness_length 1500
- unreliable_coverage 2.5
- }
-
- ; topology tip clipper:
- ttc
- {
- length_coeff 3.5
- plausibility_length 250
- uniqueness_length 1500
- }
-
- ; complex bulge remover
- cbr
- {
- enabled true
- }
-
- ; hidden ec remover
- her
- {
- enabled true
- uniqueness_length 1500
- unreliability_threshold 0.2
- relative_threshold 5
- }
-
- init_clean
- {
- activation_cov -1.
- ier
- {
- enabled false
- }
-
- tip_condition ""
- ec_condition ""
- }
-}
-
-moleculo
-{
- ; enable advanced ec removal algo
- topology_simplif_enabled false
-
- ; tip clipper:
- tc
- {
- ; rctc: tip_cov < rctc * not_tip_cov
- ; tc_lb: max_tip_length = max((min(k, read_length / 2) * tc_lb), read_length);
- condition "{ tc_lb 2.5, cb 3, rctc 10000 } { tc_lb 4.5, mmm 2 }"
- }
-
- ; bulge remover:
- br
- {
- max_coverage 3
- max_relative_coverage 100000. ; bulge_cov < this * not_bulge_cov
- }
-
- ; erroneous connections remover:
- ec
- {
- ; ec_lb: max_ec_length = k + ec_lb
- ; icb: iterative coverage bound
- ; condition "{ ec_lb 30, icb 20.0 }"
- condition "{ ec_lb 30, icb 3.1 }"
- }
-
- ; relative coverage erroneous component remover:
- rcc
- {
- enabled true
- coverage_gap 20.
- max_length_coeff 2.0
- max_length_with_tips_coeff 3.0
- max_vertex_cnt 30
- max_ec_length_coefficient 30
- max_coverage_coeff 5.0
- }
-
- tec
- {
- max_ec_length_coefficient 55 ; max_ec_length = k + max_ec_length_coefficient
- uniqueness_length 1500
- plausibility_length 200
- }
-
- ; topology and reliability based erroneous connection remover
- trec
- {
- max_ec_length_coefficient 100 ; max_ec_length = k + max_ec_length_coefficient
- uniqueness_length 1500
- unreliable_coverage 2.5
- }
-
- ; topology tip clipper:
- ttc
- {
- length_coeff 3.5
- plausibility_length 250
- uniqueness_length 1500
- }
-
- ; complex bulge remover
- cbr
- {
- enabled true
- pics_enabled 0
- folder complex_br_components
- max_relative_length 5.
- max_length_difference 5
- }
-
- ; hidden ec remover
- her
- {
- enabled true
- uniqueness_length 1500
- unreliability_threshold 0.2
- relative_threshold 5
- }
-
- init_clean
- {
- early_it_only true
-
- activation_cov -1.
- ier
- {
- enabled false
- }
-
- tip_condition ""
- ec_condition ""
- }
-}
-
-careful
-{
- ; bulge remover:
- br
- {
- max_coverage 1000000.0
- max_relative_coverage 1.5 ; bulge_cov < this * not_bulge_cov
- parallel false
- }
-
- ; complex bulge remover
- cbr
- {
- enabled false
- }
-
- ; relative coverage erroneous component remover:
- rcc
- {
- enabled false
- }
-
- init_clean
- {
- early_it_only true
-
- activation_cov -1.
- ier
- {
- enabled false
- }
-
- tip_condition ""
- ec_condition ""
- }
-}
-
-diploid_simp
-{
- post_simplif_enabled false
-
- ; bulge remover:
- br
- {
- enabled false
- }
-}
-
-meta
-{
- cycle_iter_count 3
-
- ; enable advanced ec removal algo
- topology_simplif_enabled false
-
- ; erroneous connections remover:
- ec
- {
- ; ec_lb: max_ec_length = k + ec_lb
- ; icb: iterative coverage bound
- ; condition "{ ec_lb 30, icb 20.0 }"
- condition "{ ec_lb 30, icb 2.5 }"
- }
-
- ; tip clipper:
- tc
- {
- ; rctc: tip_cov < rctc * not_tip_cov
- ; tc_lb: max_tip_length = max((min(k, read_length / 2) * tc_lb), read_length);
- condition "{ tc_lb 3.5, cb 10000 } { tc_lb 6., cb 2.5 }"
- }
-
- ; relative coverage erroneous component remover:
- rcc
- {
- enabled true
- coverage_gap 5.
- max_length_coeff 3.0
- max_length_with_tips_coeff 5.0
- max_vertex_cnt 100
- max_ec_length_coefficient 300
- max_coverage_coeff -1.0
- }
-
- ; relative edge disconnector:
- relative_ed
- {
- enabled true
- diff_mult 10.
- }
-
- ; bulge remover:
- br
- {
- max_coverage 1000000.0
- max_relative_coverage 100000. ; bulge_cov < this * not_bulge_cov
- max_delta 10
- max_relative_delta 0.1
- parallel true
- }
-
- ; final tip clipper:
- final_tc
- {
- ; rctc: tip_cov < rctc * not_tip_cov
- ; tc_lb: max_tip_length = max((min(k, read_length / 2) * tc_lb), read_length);
- condition "{ lb 500, cb 3., rctc 1.0 } { lb 1500, cb 20., rctc 0.2 }"
- }
-
- ; final bulge remover:
- final_br
- {
- enabled true
- main_iteration_only true
- max_bulge_length_coefficient 50. ; max_bulge_length = max_bulge_length_coefficient * k
- max_coverage 1000000.0
- max_relative_coverage 0.5 ; bulge_cov < this * not_bulge_cov
- max_delta 50
- max_relative_delta 0.1
- }
-
- ; second final bulge remover:
- ; only in meta mode, inherits settings of final_br
- second_final_br
- {
- max_delta 1500
- max_number_edges 3
- }
-
- init_clean
- {
- early_it_only true
- ier {
- enabled true
- }
- tip_condition "{ tc_lb 3.5, cb 2.0 }"
- ec_condition "{ ec_lb 10, cb 0.5 }"
- }
-
-}
-
-preliminary
-{
- init_clean
- {
- self_conj_condition "{ ec_lb 100, cb 20.0 }"
- early_it_only false
- ier
- {
- enabled true
- }
- tip_condition "{ lb 20, cb 1.1, mmm 2 }"
- ec_condition "{ ec_lb 0, cb 0.9 }"
- disconnect_flank_cov 0.9
- }
-
- post_simplif_enabled false
-
- ; bulge remover:
- br
- {
- max_coverage 1000000.0
- max_relative_coverage 0.5 ; bulge_cov < this * not_bulge_cov
- max_delta 10
- max_relative_delta 0.1
- }
-
-}
diff --git a/configs/debruijn/simplification.info.template b/configs/debruijn/simplification.info.template
deleted file mode 100644
index 7f6768d..0000000
--- a/configs/debruijn/simplification.info.template
+++ /dev/null
@@ -1,561 +0,0 @@
-; simplification
-
-default
-{
- ; number of iterations in basic simplification cycle
- cycle_iter_count 10
-
- ; enable advanced simplification algo
- post_simplif_enabled true
-
- ; enable advanced ec removal algo
- topology_simplif_enabled false
-
- ; tip clipper:
- tc
- {
- ; rctc: tip_cov < rctc * not_tip_cov
- ; tc_lb: max_tip_length = max((min(k, read_length / 2) * tc_lb), read_length);
- ; todo think about params one more time
- condition "{ tc_lb 3.5, cb 1000000, rctc 2.0 } { tc_lb 10., cb auto }"
- }
-
- ; bulge remover:
- br
- {
- enabled true
- main_iteration_only false
- max_bulge_length_coefficient 3. ; max_bulge_length = max_bulge_length_coefficient * k
- max_additive_length_coefficient 100
- max_coverage 1000.0
- max_relative_coverage 1.1 ; bulge_cov < this * not_bulge_cov
- max_delta 3
- max_relative_delta 0.1
- max_number_edges 1000
- parallel true
- buff_size 10000
- buff_cov_diff 2.
- buff_cov_rel_diff 0.2
- }
-
- ; erroneous connections remover:
- ec
- {
- ; ec_lb: max_ec_length = k + ec_lb
- ; icb: iterative coverage bound
- ; to_ec_lb: max_ec_length = 2*tip_length(to_ec_lb) - 1
- condition "{ to_ec_lb 5, icb auto }"
- ; condition "{ ec_lb 9, icb 40.0 }"
- }
-
- ; relative coverage erroneous component remover:
- rcc
- {
- enabled false
- coverage_gap 5.
- max_length_coeff 2.0
- max_length_with_tips_coeff 3.0
- max_vertex_cnt 30
- max_ec_length_coefficient 30
- max_coverage_coeff 2.0
- }
-
- ; relative edge disconnector:
- relative_ed
- {
- enabled false
- diff_mult 20.
- }
-
- ; final tip clipper:
- final_tc
- {
- condition ""
- }
-
- ; final bulge remover:
- final_br
- {
- enabled false
- main_iteration_only false
- max_bulge_length_coefficient 3. ; max_bulge_length = max_bulge_length_coefficient * k
- max_additive_length_coefficient 100
- max_coverage 1000.0
- max_relative_coverage 1.1 ; bulge_cov < this * not_bulge_cov
- max_delta 3
- max_relative_delta 0.1
- max_number_edges 1000
- parallel true
- buff_size 10000
- buff_cov_diff 2.
- buff_cov_rel_diff 0.2
- }
-
- ; topology based erroneous connection remover
- tec
- {
- max_ec_length_coefficient 55 ; max_ec_length = k + max_ec_length_coefficient
- uniqueness_length 5000
- plausibility_length 200
- }
-
- ; topology and reliability based erroneous connection remover
- trec
- {
- max_ec_length_coefficient 100 ; max_ec_length = k + max_ec_length_coefficient
- uniqueness_length 1500
- unreliable_coverage 2.5
- }
-
- ; interstrand erroneous connection remover (thorn remover)
- isec
- {
- max_ec_length_coefficient 100 ; max_ec_length = k + max_ec_length_coefficient
- uniqueness_length 1500
- span_distance 15000
- }
-
- ; max flow erroneous connection remover
- mfec
- {
- enabled false
- max_ec_length_coefficient 30 ; max_ec_length = k + max_ec_length_coefficient
- uniqueness_length 1500
- plausibility_length 200
- }
-
- piec
- {
- max_ec_length_coefficient 30 ; max_ec_length = k + max_ec_length_coefficient
- min_neighbour_length 100
- }
-
- ; isolated edges remover
- ier
- {
- enabled true
- max_length 0
- max_coverage 2
- max_length_any_cov 150 ; will be taken max with read_length
- }
-
- ; topology tip clipper:
- ttc
- {
- length_coeff 3.5
- plausibility_length 250
- uniqueness_length 1500
- }
-
- ; complex tip clipper
- complex_tc
- {
- enabled 0
- }
-
- ; complex bulge remover
- cbr
- {
- enabled false
- max_relative_length 5.
- max_length_difference 5
- }
-
- ; hidden ec remover
- her
- {
- enabled false
- uniqueness_length 1500
- unreliability_threshold 4
- relative_threshold 5
- }
-
- init_clean
- {
- self_conj_condition "{ ec_lb 100, cb 1.0 }"
- early_it_only false
- ; will be enabled only if average coverage \leq activate_cov
- activation_cov 10.
-
- ; isolated edges remover
- ier
- {
- enabled true
- max_length 0
- max_coverage 0
- max_length_any_cov 0 ; will be taken max with read_length
- }
-
- tip_condition "{ tc_lb 3.5, cb auto }"
- ec_condition "{ ec_lb 10, cb 2.0 }"
- ; edges with flank cov around alternative less than value will be disconnected
- ; negative value to disable
- disconnect_flank_cov -1.0
- }
-
-}
-
-sc
-{
- ; enable advanced ec removal algo
- topology_simplif_enabled true
-
- ; tip clipper:
- tc
- {
- ; rctc: tip_cov < rctc * not_tip_cov
- ; tc_lb: max_tip_length = max((min(k, read_length / 2) * tc_lb), read_length);
- condition "{ tc_lb 3.5, cb 1000000, rctc 2.0 }"
- }
-
- ; erroneous connections remover:
- ec
- {
- ; ec_lb: max_ec_length = k + ec_lb
- ; icb: iterative coverage bound
- ; condition "{ ec_lb 30, icb 20.0 }"
- condition "{ ec_lb 30, icb auto }"
- }
-
- final_tc
- {
- condition "{ tc_lb 3.5, cb 100000, rctc 10000 }"
- }
-
- ; bulge remover:
- final_br
- {
- enabled true
- max_coverage 1000000.0
- max_relative_coverage 100000. ; bulge_cov < this * not_bulge_cov
- }
-
- ; relative coverage erroneous component remover:
- rcc
- {
- enabled true
- coverage_gap 20.
- max_length_coeff 2.0
- max_length_with_tips_coeff 3.0
- max_vertex_cnt 30
- max_ec_length_coefficient 30
- max_coverage_coeff 5.0
- }
-
- tec
- {
- max_ec_length_coefficient 55 ; max_ec_length = k + max_ec_length_coefficient
- uniqueness_length 1500
- plausibility_length 200
- }
-
- ; topology and reliability based erroneous connection remover
- trec
- {
- max_ec_length_coefficient 100 ; max_ec_length = k + max_ec_length_coefficient
- uniqueness_length 1500
- unreliable_coverage 2.5
- }
-
- ; topology tip clipper:
- ttc
- {
- length_coeff 3.5
- plausibility_length 250
- uniqueness_length 1500
- }
-
- ; complex bulge remover
- cbr
- {
- enabled true
- }
-
- ; hidden ec remover
- her
- {
- enabled true
- uniqueness_length 1500
- unreliability_threshold 0.2
- relative_threshold 5
- }
-
- init_clean
- {
- activation_cov -1.
- ier
- {
- enabled false
- }
-
- tip_condition ""
- ec_condition ""
- }
-}
-
-moleculo
-{
- ; enable advanced ec removal algo
- topology_simplif_enabled false
-
- ; tip clipper:
- tc
- {
- ; rctc: tip_cov < rctc * not_tip_cov
- ; tc_lb: max_tip_length = max((min(k, read_length / 2) * tc_lb), read_length);
- condition "{ tc_lb 2.5, cb 3, rctc 10000 } { tc_lb 4.5, mmm 2 }"
- }
-
- ; bulge remover:
- br
- {
- max_coverage 3
- max_relative_coverage 100000. ; bulge_cov < this * not_bulge_cov
- }
-
- ; erroneous connections remover:
- ec
- {
- ; ec_lb: max_ec_length = k + ec_lb
- ; icb: iterative coverage bound
- ; condition "{ ec_lb 30, icb 20.0 }"
- condition "{ ec_lb 30, icb 3.1 }"
- }
-
- ; relative coverage erroneous component remover:
- rcc
- {
- enabled true
- coverage_gap 20.
- max_length_coeff 2.0
- max_length_with_tips_coeff 3.0
- max_vertex_cnt 30
- max_ec_length_coefficient 30
- max_coverage_coeff 5.0
- }
-
- tec
- {
- max_ec_length_coefficient 55 ; max_ec_length = k + max_ec_length_coefficient
- uniqueness_length 1500
- plausibility_length 200
- }
-
- ; topology and reliability based erroneous connection remover
- trec
- {
- max_ec_length_coefficient 100 ; max_ec_length = k + max_ec_length_coefficient
- uniqueness_length 1500
- unreliable_coverage 2.5
- }
-
- ; topology tip clipper:
- ttc
- {
- length_coeff 3.5
- plausibility_length 250
- uniqueness_length 1500
- }
-
- ; complex bulge remover
- cbr
- {
- enabled true
- pics_enabled 0
- folder complex_br_components
- max_relative_length 5.
- max_length_difference 5
- }
-
- ; hidden ec remover
- her
- {
- enabled true
- uniqueness_length 1500
- unreliability_threshold 0.2
- relative_threshold 5
- }
-
- init_clean
- {
- early_it_only true
-
- activation_cov -1.
- ier
- {
- enabled false
- }
-
- tip_condition ""
- ec_condition ""
- }
-}
-
-careful
-{
- ; bulge remover:
- br
- {
- max_coverage 1000000.0
- max_relative_coverage 1.5 ; bulge_cov < this * not_bulge_cov
- parallel false
- }
-
- ; complex bulge remover
- cbr
- {
- enabled false
- }
-
- ; relative coverage erroneous component remover:
- rcc
- {
- enabled false
- }
-
- init_clean
- {
- early_it_only true
-
- activation_cov -1.
- ier
- {
- enabled false
- }
-
- tip_condition ""
- ec_condition ""
- }
-}
-
-diploid_simp
-{
- post_simplif_enabled false
-
- ; bulge remover:
- br
- {
- enabled false
- }
-}
-
-meta
-{
- cycle_iter_count 3
-
- ; enable advanced ec removal algo
- topology_simplif_enabled false
-
- ; erroneous connections remover:
- ec
- {
- ; ec_lb: max_ec_length = k + ec_lb
- ; icb: iterative coverage bound
- ; condition "{ ec_lb 30, icb 20.0 }"
- condition "{ ec_lb 30, icb 2.5 }"
- }
-
- ; tip clipper:
- tc
- {
- ; rctc: tip_cov < rctc * not_tip_cov
- ; tc_lb: max_tip_length = max((min(k, read_length / 2) * tc_lb), read_length);
- condition "{ tc_lb 3.5, cb 10000 } { tc_lb 6., cb 2.5 }"
- }
-
- ; relative coverage erroneous component remover:
- rcc
- {
- enabled true
- coverage_gap 5.
- max_length_coeff 3.0
- max_length_with_tips_coeff 5.0
- max_vertex_cnt 100
- max_ec_length_coefficient 300
- max_coverage_coeff -1.0
- }
-
- ; relative edge disconnector:
- relative_ed
- {
- enabled true
- diff_mult 10.
- }
-
- ; bulge remover:
- br
- {
- max_coverage 1000000.0
- max_relative_coverage 100000. ; bulge_cov < this * not_bulge_cov
- max_delta 10
- max_relative_delta 0.1
- parallel true
- }
-
- ; final tip clipper:
- final_tc
- {
- ; rctc: tip_cov < rctc * not_tip_cov
- ; tc_lb: max_tip_length = max((min(k, read_length / 2) * tc_lb), read_length);
- condition "{ lb 500, cb 3., rctc 1.0 } { lb 1500, cb 20., rctc 0.2 }"
- }
-
- ; final bulge remover:
- final_br
- {
- enabled true
- main_iteration_only true
- max_bulge_length_coefficient 50. ; max_bulge_length = max_bulge_length_coefficient * k
- max_coverage 1000000.0
- max_relative_coverage 0.5 ; bulge_cov < this * not_bulge_cov
- max_delta 50
- max_relative_delta 0.1
- }
-
- ; second final bulge remover:
- ; only in meta mode, inherits settings of final_br
- second_final_br
- {
- max_delta 1500
- max_number_edges 3
- }
-
- init_clean
- {
- early_it_only true
- ier {
- enabled true
- }
- tip_condition "{ tc_lb 3.5, cb 2.0 }"
- ec_condition "{ ec_lb 10, cb 0.5 }"
- }
-
-}
-
-preliminary
-{
- init_clean
- {
- self_conj_condition "{ ec_lb 100, cb 20.0 }"
- early_it_only false
- ier
- {
- enabled true
- }
- tip_condition "{ lb 20, cb 1.1, mmm 2 }"
- ec_condition "{ ec_lb 0, cb 0.9 }"
- disconnect_flank_cov 0.9
- }
-
- post_simplif_enabled false
-
- ; bulge remover:
- br
- {
- max_coverage 1000000.0
- max_relative_coverage 0.5 ; bulge_cov < this * not_bulge_cov
- max_delta 10
- max_relative_delta 0.1
- }
-
-}
diff --git a/configs/debruijn/tsa.info.template b/configs/debruijn/tsa.info.template
deleted file mode 100644
index c948068..0000000
--- a/configs/debruijn/tsa.info.template
+++ /dev/null
@@ -1,5 +0,0 @@
-tsa
-{
- scaffolds_file /home/anton/gitrep/algorithmic-biology/assembler/BC087/K55/scaffolds.fasta
- genome_file genome.fasta
-}
diff --git a/configs/dipspades/config.info.template b/configs/dipspades/config.info.template
deleted file mode 100644
index 773fdaa..0000000
--- a/configs/dipspades/config.info.template
+++ /dev/null
@@ -1,64 +0,0 @@
-; base parameters ;
-bp {
- K 55;
- use_multithreading true;
- max_threads 16;
- max_memory 512;
- ; size of buffer for each thread in MB, 0 for autodetection
- read_buffer_size 0
-}
-
-; input-output params ;
-io {
- haplocontigs haplocontigs
- log_filename log.properties
- output_base data/debruijn
- output_dir simulated_e.coli_100k
- tmp_dir dipspades_tmp/
- saves data/debruijn
-}
-
-; run params ;
-rp {
- entry_point dipspades
- : entry_point dipspades:heterozygosity_estimation
- ; entry_point dipspades:contig_graph_construction
- ; entry_point dipspades:polymorphic_br
- ; entry_point dipspades:dipspades
- ; entry_point dipspades:consensus_construction
- ; entry_point dipspades:haplotype_assembly
- developer_mode true
-}
-
-; polymorphic bulge remover config
-pbr {
- enabled true
- rel_bulge_length .8 ; min(len1, len2) / max(len1, len2) >= rel_bulge_length
- ; where len1, len2 - lengths of bulge sides
- rel_bulge_align .5 ; editdist(seq1, seq2) / min(|seq1|, |seq2|) <= rel_bulge_align
- ; where seq1, seq2 - sequences of bulge sides
- paired_vert_abs_threshold 50 ;
- paired_vert_rel_threshold .15 ;
- max_bulge_nucls_len 25000 ; maximal length (in nt number) of bulge sides
- max_neigh_number 100 ; maximal number of neighs for bulge search
- num_iters_lbr 15 ; number of light bulge remover iterations
-}
-
-; consensus constructor config
-cc {
- enabled true
- bulge_len_quantile .95 ; value of this quantile of bulge length histogram
- ; is upper bound of bulge length in contigs
- tails_lie_on_bulges true ; tail has to lie on bulge side
- estimate_tails true
- align_bulge_sides true ; check bulge into paired haplocontigs for alignment
- min_overlap_size 1500 ; minimal allowable length of overlap (in nt)
- min_lcs_size 1500 ; minimal allowable length of shared subsequence of
- ; paired contigs (in nt)
- max_loop_length 500 ; maximal length of loop that can ignored in remover red contigs
-}
-
-; haplotype_assembly
-ha {
- ha_enabled true
-}
diff --git a/configs/dipspades/log.properties.template b/configs/dipspades/log.properties.template
deleted file mode 100644
index 68df20a..0000000
--- a/configs/dipspades/log.properties.template
+++ /dev/null
@@ -1,36 +0,0 @@
-default=INFO
-
-#PolymorphicBulgeRemover=TRACE
-#BulgeSplitter=TRACE
-#SubpathSplitter=TRACE
-#ComplexBulgeGluer=TRACE
-#GluingVericesDefiner=TRACE
-#GluingVericesDefinerResults=TRACE
-#ConsensusContigsConstructor=TRACE
-#CloseGapsCorrector=TRACE
-#LoopBulgeDeletionCorrector=TRACE
-#CompositeMappingContig=TRACE
-#DiploidContigSeparator=TRACE
-#ContigLabelAllocator=TRACE
-#OverlappedContigsMap=TRACE
-#OverlapCorrector=TRACE
-#EqualSequencesGluer=TRACE
-
-#DeBruijnGraphConstructor=TRACE
-#PairedHandlerApplier=TRACE
-#QualityEdgeLocalityPrintingRH=TRACE
-#PairInfoAwareErroneousEdgeRemover=TRACE
-#QualityLoggingRemovalHandler=TRACE
-#MatePairTransformStat=TRACE
-#EditDistanceTrackingCallback=TRACE
-#RepeatResolver=TRACE
-#PairInfoImprover=TRACE
-#BulgeRemover=TRACE
-#AbstractConjugateGraph=TRACE
-#PathProcessor=TRACE
-#DistanceEstimationQualityStat=TRACE
-#Dijkstra=TRACE
-#AbstractGraph=TRACE
-#PathSetGraphConstructor=TRACE
-#NewExtendedSequenceMapper=TRACE
-#JumpingPairInfoChecker=TRACE
diff --git a/configs/hammer/config.info.template b/configs/hammer/config.info.template
deleted file mode 100644
index a7d3ffa..0000000
--- a/configs/hammer/config.info.template
+++ /dev/null
@@ -1,56 +0,0 @@
-; = HAMMER =
-; input options: working dir, input files, offset, and possibly kmers
-dataset dataset.yaml
-input_working_dir ./test_dataset/input/corrected/tmp
-input_trim_quality 4
-input_qvoffset
-output_dir ./test_dataset/input/corrected
-
-; == HAMMER GENERAL ==
-; general options
-general_do_everything_after_first_iteration 1
-general_hard_memory_limit 150
-general_max_nthreads 16
-general_tau 1
-general_max_iterations 1
-general_debug 0
-
-; count k-mers
-count_do 1
-count_numfiles 16
-count_merge_nthreads 16
-count_split_buffer 0
-count_filter_singletons 0
-
-; hamming graph clustering
-hamming_do 1
-hamming_blocksize_quadratic_threshold 50
-
-; bayesian subclustering
-bayes_do 1
-bayes_nthreads 16
-bayes_singleton_threshold 0.995
-bayes_nonsingleton_threshold 0.9
-bayes_use_hamming_dist 0
-bayes_discard_only_singletons 0
-bayes_debug_output 0
-bayes_hammer_mode 0
-bayes_write_solid_kmers 0
-bayes_write_bad_kmers 0
-bayes_initial_refine 1
-
-; iterative expansion step
-expand_do 1
-expand_max_iterations 25
-expand_nthreads 6
-expand_write_each_iteration 0
-expand_write_kmers_result 0
-
-; read correction
-correct_do 1
-correct_discard_bad 0
-correct_use_threshold 1
-correct_threshold 0.98
-correct_nthreads 4
-correct_readbuffer 100000
-correct_stats 1
diff --git a/configs/ionhammer/ionhammer.cfg.template b/configs/ionhammer/ionhammer.cfg.template
deleted file mode 100644
index 6daf8ef..0000000
--- a/configs/ionhammer/ionhammer.cfg.template
+++ /dev/null
@@ -1,12 +0,0 @@
-dataset : dataset.cfg
-working_dir : ./test_dataset/input/corrected/tmp
-output_dir : ./test_dataset/input/corrected
-hard_memory_limit : 250
-max_nthreads : 16
-kmer_qual_threshold : 1e-24
-center_qual_threshold : 1e-24
-delta_score_threshold : 10.0
-keep_uncorrected_ends : true
-tau : 1
-debug_mode : false
-start_stage : count
diff --git a/dipspades.py b/dipspades.py
index f12fa7f..8fa2047 100755
--- a/dipspades.py
+++ b/dipspades.py
@@ -33,11 +33,11 @@ def main():
except getopt.GetoptError:
_, exc, _ = sys.exc_info()
sys.stderr.write(str(exc) + "\n")
- options_storage.usage(spades_version, dipspades=True)
+ options_storage.usage(spades_version, mode="dip")
sys.stderr.flush()
sys.exit(1)
if not options:
- options_storage.usage(spades_version, dipspades=True)
+ options_storage.usage(spades_version, mode="dip")
sys.stderr.flush()
sys.exit(1)
@@ -57,13 +57,13 @@ def main():
elif opt == '--careful' or opt == '--mismatch-correction':
continue
if opt == '-v' or opt == '--version':
- options_storage.version(spades_version, mode="dipSPAdes")
+ options_storage.version(spades_version, mode="dip")
sys.exit(0)
if opt == '-h' or opt == '--help':
- options_storage.usage(spades_version, dipspades=True)
+ options_storage.usage(spades_version, mode="dip")
sys.exit(0)
elif opt == "--help-hidden":
- options_storage.usage(spades_version, show_hidden=True, dipspades=True)
+ options_storage.usage(spades_version, show_hidden=True, mode="dip")
sys.exit(0)
# for all other options
cur_opt_arg = [opt]
diff --git a/ext/include/htrie/ahtable.h b/ext/include/htrie/ahtable.h
new file mode 100644
index 0000000..bdfd9f9
--- /dev/null
+++ b/ext/include/htrie/ahtable.h
@@ -0,0 +1,115 @@
+/*
+ * This file is part of hat-trie.
+ *
+ * Copyright (c) 2011 by Daniel C. Jones <dcjones at cs.washington.edu>
+ *
+ *
+ * This is an implementation of the 'cache-conscious' hash tables described in,
+ *
+ * Askitis, N., & Zobel, J. (2005). Cache-conscious collision resolution in
+ * string hash tables. String Processing and Information Retrieval (pp.
+ * 91–102). Springer.
+ *
+ * http://naskitis.com/naskitis-spire05.pdf
+ *
+ * Briefly, the idea behind an Array Hash Table is, as opposed to separate
+ * chaining with linked lists, to store keys contiguously in one big array,
+ * thereby improving the caching behavior, and reducing space requirements.
+ *
+ * ahtable keeps a fixed number (array) of slots, each of which contains a
+ * variable number of key/value pairs. Each key is preceded by its length--
+ * one byte for lengths < 128 bytes, and TWO bytes for longer keys. The least
+ * significant bit of the first byte indicates, if set, that the size is two
+ * bytes. The slot number where a key/value pair goes is determined by finding
+ * the murmurhashed integer value of its key, modulus the number of slots.
+ * The number of slots expands in a stepwise fashion when the number of
+ * key/value pairs reaches an arbitrarily large number.
+ *
+ * +-------+-------+-------+-------+-------+-------+
+ * | 0 | 1 | 2 | 3 | ... | N |
+ * +-------+-------+-------+-------+-------+-------+
+ * | | | | |
+ * v | | v v
+ * NULL | | 4html[VALUE] etc.
+ * | v
+ * | 5space[VALUE]4jury[VALUE]
+ * v
+ * 6justice[VALUE]3car[VALUE]4star[VALUE]
+ *
+ */
+
+#ifndef HATTRIE_AHTABLE_H
+#define HATTRIE_AHTABLE_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdlib.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include "htrie/common.h"
+
+typedef unsigned char* slot_t;
+
+typedef struct ahtable_t_
+{
+ /* these fields are reserved for hattrie to fiddle with */
+ uint8_t flag;
+ unsigned char c0;
+ unsigned char c1;
+
+ size_t n; // number of slots
+ size_t m; // number of key/value pairs stored
+ size_t max_m; // number of stored keys before we resize
+
+ size_t* slot_sizes;
+ slot_t* slots;
+} ahtable_t;
+
+extern const double ahtable_max_load_factor;
+extern const size_t ahtable_initial_size;
+
+ahtable_t* ahtable_create (void); // Create an empty hash table.
+ahtable_t* ahtable_create_n (size_t n); // Create an empty hash table, with
+ // n slots reserved.
+
+void ahtable_free (ahtable_t*); // Free all memory used by a table.
+void ahtable_clear (ahtable_t*); // Remove all entries.
+size_t ahtable_size (const ahtable_t*); // Number of stored keys.
+size_t ahtable_sizeof (const ahtable_t*); // Memory used by the table in bytes.
+
+
+/** Find the given key in the table, inserting it if it does not exist, and
+ * returning a pointer to its value.
+ *
+ * This pointer is not guaranteed to be valid after additional calls to
+ * ahtable_get, ahtable_del, ahtable_clear, or other functions that modify the
+ * table.
+ */
+value_t* ahtable_get (ahtable_t*, const char* key, size_t len);
+
+
+/* Find a given key in the table, return a NULL pointer if it does not exist. */
+value_t* ahtable_tryget (ahtable_t*, const char* key, size_t len);
+
+
+int ahtable_del(ahtable_t*, const char* key, size_t len);
+
+
+typedef struct ahtable_iter_t_ ahtable_iter_t;
+
+ahtable_iter_t* ahtable_iter_begin (const ahtable_t*, bool sorted);
+void ahtable_iter_next (ahtable_iter_t*);
+bool ahtable_iter_finished (ahtable_iter_t*);
+void ahtable_iter_free (ahtable_iter_t*);
+const char* ahtable_iter_key (ahtable_iter_t*, size_t* len);
+value_t* ahtable_iter_val (ahtable_iter_t*);
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
+
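A minimal usage sketch of the ahtable API declared above (illustrative only, not part of the patch); it assumes nothing beyond the functions in this header and the pointer-sized value_t typedef from htrie/common.h.

#include <cstdio>
#include "htrie/ahtable.h"

int main() {
    ahtable_t* t = ahtable_create();             // empty table, default slot count

    value_t* v = ahtable_get(t, "spades", 6);    // inserts the key if absent
    *v = 42;                                     // store a pointer-sized payload

    if (value_t* hit = ahtable_tryget(t, "spades", 6))
        std::printf("spades -> %zu\n", (size_t)*hit);

    // Walk every key/value pair (unsorted here; pass true for sorted order).
    ahtable_iter_t* it = ahtable_iter_begin(t, false);
    for (; !ahtable_iter_finished(it); ahtable_iter_next(it)) {
        size_t len = 0;
        const char* k = ahtable_iter_key(it, &len);
        std::printf("%.*s = %zu\n", (int)len, k, (size_t)*ahtable_iter_val(it));
    }
    ahtable_iter_free(it);
    ahtable_free(t);
}

As the comment on ahtable_get warns, the returned value pointer must not be kept across later insertions or deletions, since the table may be rebuilt.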
diff --git a/ext/include/htrie/common.h b/ext/include/htrie/common.h
new file mode 100644
index 0000000..7f0b034
--- /dev/null
+++ b/ext/include/htrie/common.h
@@ -0,0 +1,22 @@
+/*
+ * This file is part of hat-trie.
+ *
+ * Copyright (c) 2011 by Daniel C. Jones <dcjones at cs.washington.edu>
+ *
+ *
+ * Common typedefs, etc.
+ *
+ */
+
+
+#ifndef HATTRIE_COMMON_H
+#define HATTRIE_COMMON_H
+
+#include <stdint.h>
+
+// an unsigned int that is guaranteed to be the same size as a pointer
+typedef uintptr_t value_t;
+
+#endif
+
+
diff --git a/ext/include/htrie/hat-trie.h b/ext/include/htrie/hat-trie.h
new file mode 100644
index 0000000..754d4f8
--- /dev/null
+++ b/ext/include/htrie/hat-trie.h
@@ -0,0 +1,74 @@
+/*
+ * This file is part of hat-trie
+ *
+ * Copyright (c) 2011 by Daniel C. Jones <dcjones at cs.washington.edu>
+ *
+ *
+ * This is an implementation of the HAT-trie data structure described in,
+ *
+ * Askitis, N., & Sinha, R. (2007). HAT-trie: a cache-conscious trie-based data
+ * structure for strings. Proceedings of the thirtieth Australasian conference on
+ * Computer science-Volume 62 (pp. 97–105). Australian Computer Society, Inc.
+ *
+ * The HAT-trie is in essence a hybrid data structure, combining tries and hash
+ * tables in a clever way to try to get the best of both worlds.
+ *
+ */
+
+#ifndef HATTRIE_HATTRIE_H
+#define HATTRIE_HATTRIE_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "htrie/common.h"
+#include <stdlib.h>
+#include <stdbool.h>
+
+typedef struct hattrie_t_ hattrie_t;
+
+hattrie_t* hattrie_create (void); // Create an empty hat-trie.
+void hattrie_free (hattrie_t*); // Free all memory used by a trie.
+hattrie_t* hattrie_dup (const hattrie_t*); // Duplicate an existing trie.
+void hattrie_clear (hattrie_t*); // Remove all entries.
+size_t hattrie_size (const hattrie_t*); // Number of stored keys.
+size_t hattrie_sizeof (const hattrie_t*); // Memory used in structure in bytes.
+
+
+/** Find the given key in the trie, inserting it if it does not exist, and
+ * returning a pointer to its value.
+ *
+ * This pointer is not guaranteed to be valid after additional calls to
+ * hattrie_get, hattrie_del, hattrie_clear, or other functions that modify the
+ * trie.
+ */
+value_t* hattrie_get (hattrie_t*, const char* key, size_t len);
+
+
+/** Find a given key in the table, returning a NULL pointer if it does not
+ * exist. */
+value_t* hattrie_tryget (hattrie_t*, const char* key, size_t len);
+
+/** Delete a given key from trie. Returns 0 if successful or -1 if not found.
+ */
+int hattrie_del(hattrie_t* T, const char* key, size_t len);
+
+typedef struct hattrie_iter_t_ hattrie_iter_t;
+
+hattrie_iter_t* hattrie_iter_begin (const hattrie_t*, bool sorted);
+void hattrie_iter_next (hattrie_iter_t*);
+bool hattrie_iter_finished (hattrie_iter_t*);
+void hattrie_iter_free (hattrie_iter_t*);
+const char* hattrie_iter_key (hattrie_iter_t*, size_t* len);
+value_t* hattrie_iter_val (hattrie_iter_t*);
+
+/* Return true if two iterators are equal. */
+bool hattrie_iter_equal (const hattrie_iter_t* a,
+ const hattrie_iter_t* b);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
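As with ahtable above, a minimal illustrative sketch (not part of the patch) of the hattrie_* calls declared in this header; sorted iteration is the main addition over the plain hash table.

#include <cstdio>
#include "htrie/hat-trie.h"

int main() {
    hattrie_t* trie = hattrie_create();

    const char* keys[] = {"acgt", "acga", "ttga"};
    for (value_t i = 0; i < 3; ++i)
        *hattrie_get(trie, keys[i], 4) = i;      // insert-or-find, then store

    if (hattrie_tryget(trie, "acga", 4))
        std::printf("acga is present\n");

    hattrie_del(trie, "ttga", 4);                // returns -1 if the key is absent

    // Sorted iteration over the remaining keys.
    hattrie_iter_t* it = hattrie_iter_begin(trie, true);
    for (; !hattrie_iter_finished(it); hattrie_iter_next(it)) {
        size_t len = 0;
        const char* k = hattrie_iter_key(it, &len);
        std::printf("%.*s = %zu\n", (int)len, k, (size_t)*hattrie_iter_val(it));
    }
    hattrie_iter_free(it);
    hattrie_free(trie);
}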
diff --git a/ext/include/llvm/ADT/ArrayRef.h b/ext/include/llvm/ADT/ArrayRef.h
new file mode 100644
index 0000000..517ba39
--- /dev/null
+++ b/ext/include/llvm/ADT/ArrayRef.h
@@ -0,0 +1,384 @@
+//===--- ArrayRef.h - Array Reference Wrapper -------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_ADT_ARRAYREF_H
+#define LLVM_ADT_ARRAYREF_H
+
+#include "llvm/ADT/Hashing.h"
+#include "llvm/ADT/None.h"
+#include "llvm/ADT/SmallVector.h"
+#include <vector>
+
+namespace llvm {
+
+ /// ArrayRef - Represent a constant reference to an array (0 or more elements
+ /// consecutively in memory), i.e. a start pointer and a length. It allows
+ /// various APIs to take consecutive elements easily and conveniently.
+ ///
+ /// This class does not own the underlying data, it is expected to be used in
+ /// situations where the data resides in some other buffer, whose lifetime
+ /// extends past that of the ArrayRef. For this reason, it is not in general
+ /// safe to store an ArrayRef.
+ ///
+ /// This is intended to be trivially copyable, so it should be passed by
+ /// value.
+ template<typename T>
+ class ArrayRef {
+ public:
+ typedef const T *iterator;
+ typedef const T *const_iterator;
+ typedef size_t size_type;
+
+ typedef std::reverse_iterator<iterator> reverse_iterator;
+
+ private:
+ /// The start of the array, in an external buffer.
+ const T *Data;
+
+ /// The number of elements.
+ size_type Length;
+
+ public:
+ /// @name Constructors
+ /// @{
+
+ /// Construct an empty ArrayRef.
+ /*implicit*/ ArrayRef() : Data(nullptr), Length(0) {}
+
+ /// Construct an empty ArrayRef from None.
+ /*implicit*/ ArrayRef(NoneType) : Data(nullptr), Length(0) {}
+
+ /// Construct an ArrayRef from a single element.
+ /*implicit*/ ArrayRef(const T &OneElt)
+ : Data(&OneElt), Length(1) {}
+
+ /// Construct an ArrayRef from a pointer and length.
+ /*implicit*/ ArrayRef(const T *data, size_t length)
+ : Data(data), Length(length) {}
+
+ /// Construct an ArrayRef from a range.
+ ArrayRef(const T *begin, const T *end)
+ : Data(begin), Length(end - begin) {}
+
+ /// Construct an ArrayRef from a SmallVector. This is templated in order to
+ /// avoid instantiating SmallVectorTemplateCommon<T> whenever we
+ /// copy-construct an ArrayRef.
+ template<typename U>
+ /*implicit*/ ArrayRef(const SmallVectorTemplateCommon<T, U> &Vec)
+ : Data(Vec.data()), Length(Vec.size()) {
+ }
+
+ /// Construct an ArrayRef from a std::vector.
+ template<typename A>
+ /*implicit*/ ArrayRef(const std::vector<T, A> &Vec)
+ : Data(Vec.data()), Length(Vec.size()) {}
+
+ /// Construct an ArrayRef from a C array.
+ template <size_t N>
+ /*implicit*/ LLVM_CONSTEXPR ArrayRef(const T (&Arr)[N])
+ : Data(Arr), Length(N) {}
+
+ /// Construct an ArrayRef from a std::initializer_list.
+ /*implicit*/ ArrayRef(const std::initializer_list<T> &Vec)
+ : Data(Vec.begin() == Vec.end() ? (T*)nullptr : Vec.begin()),
+ Length(Vec.size()) {}
+
+ /// Construct an ArrayRef<const T*> from ArrayRef<T*>. This uses SFINAE to
+ /// ensure that only ArrayRefs of pointers can be converted.
+ template <typename U>
+ ArrayRef(const ArrayRef<U *> &A,
+ typename std::enable_if<
+ std::is_convertible<U *const *, T const *>::value>::type* = 0)
+ : Data(A.data()), Length(A.size()) {}
+
+ /// Construct an ArrayRef<const T*> from a SmallVector<T*>. This is
+ /// templated in order to avoid instantiating SmallVectorTemplateCommon<T>
+ /// whenever we copy-construct an ArrayRef.
+ template<typename U, typename DummyT>
+ /*implicit*/ ArrayRef(const SmallVectorTemplateCommon<U*, DummyT> &Vec,
+ typename std::enable_if<
+ std::is_convertible<U *const *,
+ T const *>::value>::type* = 0)
+ : Data(Vec.data()), Length(Vec.size()) {
+ }
+
+ /// Construct an ArrayRef<const T*> from std::vector<T*>. This uses SFINAE
+ /// to ensure that only vectors of pointers can be converted.
+ template<typename U, typename A>
+ ArrayRef(const std::vector<U *, A> &Vec,
+ typename std::enable_if<
+ std::is_convertible<U *const *, T const *>::value>::type* = 0)
+ : Data(Vec.data()), Length(Vec.size()) {}
+
+ /// @}
+ /// @name Simple Operations
+ /// @{
+
+ iterator begin() const { return Data; }
+ iterator end() const { return Data + Length; }
+
+ reverse_iterator rbegin() const { return reverse_iterator(end()); }
+ reverse_iterator rend() const { return reverse_iterator(begin()); }
+
+ /// empty - Check if the array is empty.
+ bool empty() const { return Length == 0; }
+
+ const T *data() const { return Data; }
+
+ /// size - Get the array size.
+ size_t size() const { return Length; }
+
+ /// front - Get the first element.
+ const T &front() const {
+ assert(!empty());
+ return Data[0];
+ }
+
+ /// back - Get the last element.
+ const T &back() const {
+ assert(!empty());
+ return Data[Length-1];
+ }
+
+ // copy - Allocate copy in Allocator and return ArrayRef<T> to it.
+ template <typename Allocator> ArrayRef<T> copy(Allocator &A) {
+ T *Buff = A.template Allocate<T>(Length);
+ std::uninitialized_copy(begin(), end(), Buff);
+ return ArrayRef<T>(Buff, Length);
+ }
+
+ /// equals - Check for element-wise equality.
+ bool equals(ArrayRef RHS) const {
+ if (Length != RHS.Length)
+ return false;
+ return std::equal(begin(), end(), RHS.begin());
+ }
+
+ /// slice(n) - Chop off the first N elements of the array.
+ ArrayRef<T> slice(unsigned N) const {
+ assert(N <= size() && "Invalid specifier");
+ return ArrayRef<T>(data()+N, size()-N);
+ }
+
+ /// slice(n, m) - Chop off the first N elements of the array, and keep M
+ /// elements in the array.
+ ArrayRef<T> slice(unsigned N, unsigned M) const {
+ assert(N+M <= size() && "Invalid specifier");
+ return ArrayRef<T>(data()+N, M);
+ }
+
+ // \brief Drop the last \p N elements of the array.
+ ArrayRef<T> drop_back(unsigned N = 1) const {
+ assert(size() >= N && "Dropping more elements than exist");
+ return slice(0, size() - N);
+ }
+
+ /// @}
+ /// @name Operator Overloads
+ /// @{
+ const T &operator[](size_t Index) const {
+ assert(Index < Length && "Invalid index!");
+ return Data[Index];
+ }
+
+ /// @}
+ /// @name Expensive Operations
+ /// @{
+ std::vector<T> vec() const {
+ return std::vector<T>(Data, Data+Length);
+ }
+
+ /// @}
+ /// @name Conversion operators
+ /// @{
+ operator std::vector<T>() const {
+ return std::vector<T>(Data, Data+Length);
+ }
+
+ /// @}
+ };
+
+ /// MutableArrayRef - Represent a mutable reference to an array (0 or more
+ /// elements consecutively in memory), i.e. a start pointer and a length. It
+ /// allows various APIs to take and modify consecutive elements easily and
+ /// conveniently.
+ ///
+ /// This class does not own the underlying data, it is expected to be used in
+ /// situations where the data resides in some other buffer, whose lifetime
+ /// extends past that of the MutableArrayRef. For this reason, it is not in
+ /// general safe to store a MutableArrayRef.
+ ///
+ /// This is intended to be trivially copyable, so it should be passed by
+ /// value.
+ template<typename T>
+ class MutableArrayRef : public ArrayRef<T> {
+ public:
+ typedef T *iterator;
+
+ typedef std::reverse_iterator<iterator> reverse_iterator;
+
+ /// Construct an empty MutableArrayRef.
+ /*implicit*/ MutableArrayRef() : ArrayRef<T>() {}
+
+ /// Construct an empty MutableArrayRef from None.
+ /*implicit*/ MutableArrayRef(NoneType) : ArrayRef<T>() {}
+
+ /// Construct an MutableArrayRef from a single element.
+ /*implicit*/ MutableArrayRef(T &OneElt) : ArrayRef<T>(OneElt) {}
+
+ /// Construct an MutableArrayRef from a pointer and length.
+ /*implicit*/ MutableArrayRef(T *data, size_t length)
+ : ArrayRef<T>(data, length) {}
+
+ /// Construct an MutableArrayRef from a range.
+ MutableArrayRef(T *begin, T *end) : ArrayRef<T>(begin, end) {}
+
+ /// Construct an MutableArrayRef from a SmallVector.
+ /*implicit*/ MutableArrayRef(SmallVectorImpl<T> &Vec)
+ : ArrayRef<T>(Vec) {}
+
+ /// Construct a MutableArrayRef from a std::vector.
+ /*implicit*/ MutableArrayRef(std::vector<T> &Vec)
+ : ArrayRef<T>(Vec) {}
+
+ /// Construct an MutableArrayRef from a C array.
+ template <size_t N>
+ /*implicit*/ LLVM_CONSTEXPR MutableArrayRef(T (&Arr)[N])
+ : ArrayRef<T>(Arr) {}
+
+ T *data() const { return const_cast<T*>(ArrayRef<T>::data()); }
+
+ iterator begin() const { return data(); }
+ iterator end() const { return data() + this->size(); }
+
+ reverse_iterator rbegin() const { return reverse_iterator(end()); }
+ reverse_iterator rend() const { return reverse_iterator(begin()); }
+
+ /// front - Get the first element.
+ T &front() const {
+ assert(!this->empty());
+ return data()[0];
+ }
+
+ /// back - Get the last element.
+ T &back() const {
+ assert(!this->empty());
+ return data()[this->size()-1];
+ }
+
+ /// slice(n) - Chop off the first N elements of the array.
+ MutableArrayRef<T> slice(unsigned N) const {
+ assert(N <= this->size() && "Invalid specifier");
+ return MutableArrayRef<T>(data()+N, this->size()-N);
+ }
+
+ /// slice(n, m) - Chop off the first N elements of the array, and keep M
+ /// elements in the array.
+ MutableArrayRef<T> slice(unsigned N, unsigned M) const {
+ assert(N+M <= this->size() && "Invalid specifier");
+ return MutableArrayRef<T>(data()+N, M);
+ }
+
+ MutableArrayRef<T> drop_back(unsigned N) const {
+ assert(this->size() >= N && "Dropping more elements than exist");
+ return slice(0, this->size() - N);
+ }
+
+ /// @}
+ /// @name Operator Overloads
+ /// @{
+ T &operator[](size_t Index) const {
+ assert(Index < this->size() && "Invalid index!");
+ return data()[Index];
+ }
+ };
+
+ /// @name ArrayRef Convenience constructors
+ /// @{
+
+ /// Construct an ArrayRef from a single element.
+ template<typename T>
+ ArrayRef<T> makeArrayRef(const T &OneElt) {
+ return OneElt;
+ }
+
+ /// Construct an ArrayRef from a pointer and length.
+ template<typename T>
+ ArrayRef<T> makeArrayRef(const T *data, size_t length) {
+ return ArrayRef<T>(data, length);
+ }
+
+ /// Construct an ArrayRef from a range.
+ template<typename T>
+ ArrayRef<T> makeArrayRef(const T *begin, const T *end) {
+ return ArrayRef<T>(begin, end);
+ }
+
+ /// Construct an ArrayRef from a SmallVector.
+ template <typename T>
+ ArrayRef<T> makeArrayRef(const SmallVectorImpl<T> &Vec) {
+ return Vec;
+ }
+
+ /// Construct an ArrayRef from a SmallVector.
+ template <typename T, unsigned N>
+ ArrayRef<T> makeArrayRef(const SmallVector<T, N> &Vec) {
+ return Vec;
+ }
+
+ /// Construct an ArrayRef from a std::vector.
+ template<typename T>
+ ArrayRef<T> makeArrayRef(const std::vector<T> &Vec) {
+ return Vec;
+ }
+
+ /// Construct an ArrayRef from an ArrayRef (no-op) (const)
+ template <typename T> ArrayRef<T> makeArrayRef(const ArrayRef<T> &Vec) {
+ return Vec;
+ }
+
+ /// Construct an ArrayRef from an ArrayRef (no-op)
+ template <typename T> ArrayRef<T> &makeArrayRef(ArrayRef<T> &Vec) {
+ return Vec;
+ }
+
+ /// Construct an ArrayRef from a C array.
+ template<typename T, size_t N>
+ ArrayRef<T> makeArrayRef(const T (&Arr)[N]) {
+ return ArrayRef<T>(Arr);
+ }
+
+ /// @}
+ /// @name ArrayRef Comparison Operators
+ /// @{
+
+ template<typename T>
+ inline bool operator==(ArrayRef<T> LHS, ArrayRef<T> RHS) {
+ return LHS.equals(RHS);
+ }
+
+ template<typename T>
+ inline bool operator!=(ArrayRef<T> LHS, ArrayRef<T> RHS) {
+ return !(LHS == RHS);
+ }
+
+ /// @}
+
+ // ArrayRefs can be treated like a POD type.
+ template <typename T> struct isPodLike;
+ template <typename T> struct isPodLike<ArrayRef<T> > {
+ static const bool value = true;
+ };
+
+ template <typename T> hash_code hash_value(ArrayRef<T> S) {
+ return hash_combine_range(S.begin(), S.end());
+ }
+}
+
+#endif
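A short illustrative sketch (not part of the patch) of how ArrayRef and MutableArrayRef are meant to be used: cheap, non-owning views passed by value, built implicitly from the containers supported above.

#include <cstdio>
#include <vector>
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallVector.h"

// ArrayRef is trivially copyable, so take it by value.
static int sum(llvm::ArrayRef<int> xs) {
    int s = 0;
    for (int x : xs) s += x;
    return s;
}

int main() {
    int arr[] = {1, 2, 3, 4};
    std::vector<int> vec = {5, 6, 7};
    llvm::SmallVector<int, 8> sv;
    sv.push_back(8);
    sv.push_back(9);

    std::printf("%d\n", sum(arr));                                 // from a C array
    std::printf("%d\n", sum(vec));                                 // from std::vector
    std::printf("%d\n", sum(sv));                                  // from SmallVector
    std::printf("%d\n", sum(llvm::makeArrayRef(arr).slice(1, 2))); // view of {2, 3}

    llvm::MutableArrayRef<int> m(arr, 4);                          // writable view
    m[0] = 10;                                                     // writes through to arr
    std::printf("%d\n", arr[0]);
}

The callee never copies or owns the elements, which is why the class comment stresses that an ArrayRef must not outlive the buffer it points into.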
diff --git a/ext/include/llvm/ADT/DenseMap.h b/ext/include/llvm/ADT/DenseMap.h
new file mode 100644
index 0000000..6ee1960
--- /dev/null
+++ b/ext/include/llvm/ADT/DenseMap.h
@@ -0,0 +1,1074 @@
+//===- llvm/ADT/DenseMap.h - Dense probed hash table ------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the DenseMap class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_ADT_DENSEMAP_H
+#define LLVM_ADT_DENSEMAP_H
+
+#include "llvm/ADT/DenseMapInfo.h"
+#include "llvm/ADT/EpochTracker.h"
+#include "llvm/Support/AlignOf.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/PointerLikeTypeTraits.h"
+#include "llvm/Support/type_traits.h"
+#include <algorithm>
+#include <cassert>
+#include <climits>
+#include <cstddef>
+#include <cstring>
+#include <iterator>
+#include <new>
+#include <utility>
+
+namespace llvm {
+
+namespace detail {
+// We extend a pair to allow users to override the bucket type with their own
+// implementation without requiring two members.
+template <typename KeyT, typename ValueT>
+struct DenseMapPair : public std::pair<KeyT, ValueT> {
+ KeyT &getFirst() { return std::pair<KeyT, ValueT>::first; }
+ const KeyT &getFirst() const { return std::pair<KeyT, ValueT>::first; }
+ ValueT &getSecond() { return std::pair<KeyT, ValueT>::second; }
+ const ValueT &getSecond() const { return std::pair<KeyT, ValueT>::second; }
+};
+}
+
+template <
+ typename KeyT, typename ValueT, typename KeyInfoT = DenseMapInfo<KeyT>,
+ typename Bucket = detail::DenseMapPair<KeyT, ValueT>, bool IsConst = false>
+class DenseMapIterator;
+
+template <typename DerivedT, typename KeyT, typename ValueT, typename KeyInfoT,
+ typename BucketT>
+class DenseMapBase : public DebugEpochBase {
+public:
+ typedef unsigned size_type;
+ typedef KeyT key_type;
+ typedef ValueT mapped_type;
+ typedef BucketT value_type;
+
+ typedef DenseMapIterator<KeyT, ValueT, KeyInfoT, BucketT> iterator;
+ typedef DenseMapIterator<KeyT, ValueT, KeyInfoT, BucketT, true>
+ const_iterator;
+ inline iterator begin() {
+ // When the map is empty, avoid the overhead of AdvancePastEmptyBuckets().
+ return empty() ? end() : iterator(getBuckets(), getBucketsEnd(), *this);
+ }
+ inline iterator end() {
+ return iterator(getBucketsEnd(), getBucketsEnd(), *this, true);
+ }
+ inline const_iterator begin() const {
+ return empty() ? end()
+ : const_iterator(getBuckets(), getBucketsEnd(), *this);
+ }
+ inline const_iterator end() const {
+ return const_iterator(getBucketsEnd(), getBucketsEnd(), *this, true);
+ }
+
+ bool LLVM_ATTRIBUTE_UNUSED_RESULT empty() const {
+ return getNumEntries() == 0;
+ }
+ unsigned size() const { return getNumEntries(); }
+
+ /// Grow the densemap so that it has at least Size buckets. Does not shrink
+ void resize(size_type Size) {
+ incrementEpoch();
+ if (Size > getNumBuckets())
+ grow(Size);
+ }
+
+ void clear() {
+ incrementEpoch();
+ if (getNumEntries() == 0 && getNumTombstones() == 0) return;
+
+ // If the capacity of the array is huge, and the # elements used is small,
+ // shrink the array.
+ if (getNumEntries() * 4 < getNumBuckets() && getNumBuckets() > 64) {
+ shrink_and_clear();
+ return;
+ }
+
+ const KeyT EmptyKey = getEmptyKey(), TombstoneKey = getTombstoneKey();
+ unsigned NumEntries = getNumEntries();
+ for (BucketT *P = getBuckets(), *E = getBucketsEnd(); P != E; ++P) {
+ if (!KeyInfoT::isEqual(P->getFirst(), EmptyKey)) {
+ if (!KeyInfoT::isEqual(P->getFirst(), TombstoneKey)) {
+ P->getSecond().~ValueT();
+ --NumEntries;
+ }
+ P->getFirst() = EmptyKey;
+ }
+ }
+ assert(NumEntries == 0 && "Node count imbalance!");
+ setNumEntries(0);
+ setNumTombstones(0);
+ }
+
+ /// Return 1 if the specified key is in the map, 0 otherwise.
+ size_type count(const KeyT &Val) const {
+ const BucketT *TheBucket;
+ return LookupBucketFor(Val, TheBucket) ? 1 : 0;
+ }
+
+ iterator find(const KeyT &Val) {
+ BucketT *TheBucket;
+ if (LookupBucketFor(Val, TheBucket))
+ return iterator(TheBucket, getBucketsEnd(), *this, true);
+ return end();
+ }
+ const_iterator find(const KeyT &Val) const {
+ const BucketT *TheBucket;
+ if (LookupBucketFor(Val, TheBucket))
+ return const_iterator(TheBucket, getBucketsEnd(), *this, true);
+ return end();
+ }
+
+ /// Alternate version of find() which allows a different, and possibly
+ /// less expensive, key type.
+ /// The DenseMapInfo is responsible for supplying methods
+ /// getHashValue(LookupKeyT) and isEqual(LookupKeyT, KeyT) for each key
+ /// type used.
+ template<class LookupKeyT>
+ iterator find_as(const LookupKeyT &Val) {
+ BucketT *TheBucket;
+ if (LookupBucketFor(Val, TheBucket))
+ return iterator(TheBucket, getBucketsEnd(), *this, true);
+ return end();
+ }
+ template<class LookupKeyT>
+ const_iterator find_as(const LookupKeyT &Val) const {
+ const BucketT *TheBucket;
+ if (LookupBucketFor(Val, TheBucket))
+ return const_iterator(TheBucket, getBucketsEnd(), *this, true);
+ return end();
+ }
+
+ /// lookup - Return the entry for the specified key, or a default
+ /// constructed value if no such entry exists.
+ ValueT lookup(const KeyT &Val) const {
+ const BucketT *TheBucket;
+ if (LookupBucketFor(Val, TheBucket))
+ return TheBucket->getSecond();
+ return ValueT();
+ }
+
+ // Inserts key,value pair into the map if the key isn't already in the map.
+ // If the key is already in the map, it returns false and doesn't update the
+ // value.
+ std::pair<iterator, bool> insert(const std::pair<KeyT, ValueT> &KV) {
+ BucketT *TheBucket;
+ if (LookupBucketFor(KV.first, TheBucket))
+ return std::make_pair(iterator(TheBucket, getBucketsEnd(), *this, true),
+ false); // Already in map.
+
+ // Otherwise, insert the new element.
+ TheBucket = InsertIntoBucket(KV.first, KV.second, TheBucket);
+ return std::make_pair(iterator(TheBucket, getBucketsEnd(), *this, true),
+ true);
+ }
+
+ // Inserts key,value pair into the map if the key isn't already in the map.
+ // If the key is already in the map, it returns false and doesn't update the
+ // value.
+ std::pair<iterator, bool> insert(std::pair<KeyT, ValueT> &&KV) {
+ BucketT *TheBucket;
+ if (LookupBucketFor(KV.first, TheBucket))
+ return std::make_pair(iterator(TheBucket, getBucketsEnd(), *this, true),
+ false); // Already in map.
+
+ // Otherwise, insert the new element.
+ TheBucket = InsertIntoBucket(std::move(KV.first),
+ std::move(KV.second),
+ TheBucket);
+ return std::make_pair(iterator(TheBucket, getBucketsEnd(), *this, true),
+ true);
+ }
+
+ /// insert - Range insertion of pairs.
+ template<typename InputIt>
+ void insert(InputIt I, InputIt E) {
+ for (; I != E; ++I)
+ insert(*I);
+ }
+
+
+ bool erase(const KeyT &Val) {
+ BucketT *TheBucket;
+ if (!LookupBucketFor(Val, TheBucket))
+ return false; // not in map.
+
+ TheBucket->getSecond().~ValueT();
+ TheBucket->getFirst() = getTombstoneKey();
+ decrementNumEntries();
+ incrementNumTombstones();
+ return true;
+ }
+ void erase(iterator I) {
+ BucketT *TheBucket = &*I;
+ TheBucket->getSecond().~ValueT();
+ TheBucket->getFirst() = getTombstoneKey();
+ decrementNumEntries();
+ incrementNumTombstones();
+ }
+
+ value_type& FindAndConstruct(const KeyT &Key) {
+ BucketT *TheBucket;
+ if (LookupBucketFor(Key, TheBucket))
+ return *TheBucket;
+
+ return *InsertIntoBucket(Key, ValueT(), TheBucket);
+ }
+
+ ValueT &operator[](const KeyT &Key) {
+ return FindAndConstruct(Key).second;
+ }
+
+ value_type& FindAndConstruct(KeyT &&Key) {
+ BucketT *TheBucket;
+ if (LookupBucketFor(Key, TheBucket))
+ return *TheBucket;
+
+ return *InsertIntoBucket(std::move(Key), ValueT(), TheBucket);
+ }
+
+ ValueT &operator[](KeyT &&Key) {
+ return FindAndConstruct(std::move(Key)).second;
+ }
+
+ /// isPointerIntoBucketsArray - Return true if the specified pointer points
+ /// somewhere into the DenseMap's array of buckets (i.e. either to a key or
+ /// value in the DenseMap).
+ bool isPointerIntoBucketsArray(const void *Ptr) const {
+ return Ptr >= getBuckets() && Ptr < getBucketsEnd();
+ }
+
+ /// getPointerIntoBucketsArray() - Return an opaque pointer into the buckets
+ /// array. In conjunction with the previous method, this can be used to
+ /// determine whether an insertion caused the DenseMap to reallocate.
+ const void *getPointerIntoBucketsArray() const { return getBuckets(); }
+
+protected:
+ DenseMapBase() = default;
+
+ void destroyAll() {
+ if (getNumBuckets() == 0) // Nothing to do.
+ return;
+
+ const KeyT EmptyKey = getEmptyKey(), TombstoneKey = getTombstoneKey();
+ for (BucketT *P = getBuckets(), *E = getBucketsEnd(); P != E; ++P) {
+ if (!KeyInfoT::isEqual(P->getFirst(), EmptyKey) &&
+ !KeyInfoT::isEqual(P->getFirst(), TombstoneKey))
+ P->getSecond().~ValueT();
+ P->getFirst().~KeyT();
+ }
+ }
+
+ void initEmpty() {
+ setNumEntries(0);
+ setNumTombstones(0);
+
+ assert((getNumBuckets() & (getNumBuckets()-1)) == 0 &&
+ "# initial buckets must be a power of two!");
+ const KeyT EmptyKey = getEmptyKey();
+ for (BucketT *B = getBuckets(), *E = getBucketsEnd(); B != E; ++B)
+ ::new (&B->getFirst()) KeyT(EmptyKey);
+ }
+
+ void moveFromOldBuckets(BucketT *OldBucketsBegin, BucketT *OldBucketsEnd) {
+ initEmpty();
+
+ // Insert all the old elements.
+ const KeyT EmptyKey = getEmptyKey();
+ const KeyT TombstoneKey = getTombstoneKey();
+ for (BucketT *B = OldBucketsBegin, *E = OldBucketsEnd; B != E; ++B) {
+ if (!KeyInfoT::isEqual(B->getFirst(), EmptyKey) &&
+ !KeyInfoT::isEqual(B->getFirst(), TombstoneKey)) {
+ // Insert the key/value into the new table.
+ BucketT *DestBucket;
+ bool FoundVal = LookupBucketFor(B->getFirst(), DestBucket);
+ (void)FoundVal; // silence warning.
+ assert(!FoundVal && "Key already in new map?");
+ DestBucket->getFirst() = std::move(B->getFirst());
+ ::new (&DestBucket->getSecond()) ValueT(std::move(B->getSecond()));
+ incrementNumEntries();
+
+ // Free the value.
+ B->getSecond().~ValueT();
+ }
+ B->getFirst().~KeyT();
+ }
+ }
+
+ template <typename OtherBaseT>
+ void copyFrom(
+ const DenseMapBase<OtherBaseT, KeyT, ValueT, KeyInfoT, BucketT> &other) {
+ assert(&other != this);
+ assert(getNumBuckets() == other.getNumBuckets());
+
+ setNumEntries(other.getNumEntries());
+ setNumTombstones(other.getNumTombstones());
+
+ if (isPodLike<KeyT>::value && isPodLike<ValueT>::value)
+ memcpy(getBuckets(), other.getBuckets(),
+ getNumBuckets() * sizeof(BucketT));
+ else
+ for (size_t i = 0; i < getNumBuckets(); ++i) {
+ ::new (&getBuckets()[i].getFirst())
+ KeyT(other.getBuckets()[i].getFirst());
+ if (!KeyInfoT::isEqual(getBuckets()[i].getFirst(), getEmptyKey()) &&
+ !KeyInfoT::isEqual(getBuckets()[i].getFirst(), getTombstoneKey()))
+ ::new (&getBuckets()[i].getSecond())
+ ValueT(other.getBuckets()[i].getSecond());
+ }
+ }
+
+ static unsigned getHashValue(const KeyT &Val) {
+ return KeyInfoT::getHashValue(Val);
+ }
+ template<typename LookupKeyT>
+ static unsigned getHashValue(const LookupKeyT &Val) {
+ return KeyInfoT::getHashValue(Val);
+ }
+ static const KeyT getEmptyKey() {
+ return KeyInfoT::getEmptyKey();
+ }
+ static const KeyT getTombstoneKey() {
+ return KeyInfoT::getTombstoneKey();
+ }
+
+private:
+ unsigned getNumEntries() const {
+ return static_cast<const DerivedT *>(this)->getNumEntries();
+ }
+ void setNumEntries(unsigned Num) {
+ static_cast<DerivedT *>(this)->setNumEntries(Num);
+ }
+ void incrementNumEntries() {
+ setNumEntries(getNumEntries() + 1);
+ }
+ void decrementNumEntries() {
+ setNumEntries(getNumEntries() - 1);
+ }
+ unsigned getNumTombstones() const {
+ return static_cast<const DerivedT *>(this)->getNumTombstones();
+ }
+ void setNumTombstones(unsigned Num) {
+ static_cast<DerivedT *>(this)->setNumTombstones(Num);
+ }
+ void incrementNumTombstones() {
+ setNumTombstones(getNumTombstones() + 1);
+ }
+ void decrementNumTombstones() {
+ setNumTombstones(getNumTombstones() - 1);
+ }
+ const BucketT *getBuckets() const {
+ return static_cast<const DerivedT *>(this)->getBuckets();
+ }
+ BucketT *getBuckets() {
+ return static_cast<DerivedT *>(this)->getBuckets();
+ }
+ unsigned getNumBuckets() const {
+ return static_cast<const DerivedT *>(this)->getNumBuckets();
+ }
+ BucketT *getBucketsEnd() {
+ return getBuckets() + getNumBuckets();
+ }
+ const BucketT *getBucketsEnd() const {
+ return getBuckets() + getNumBuckets();
+ }
+
+ void grow(unsigned AtLeast) {
+ static_cast<DerivedT *>(this)->grow(AtLeast);
+ }
+
+ void shrink_and_clear() {
+ static_cast<DerivedT *>(this)->shrink_and_clear();
+ }
+
+
+ BucketT *InsertIntoBucket(const KeyT &Key, const ValueT &Value,
+ BucketT *TheBucket) {
+ TheBucket = InsertIntoBucketImpl(Key, TheBucket);
+
+ TheBucket->getFirst() = Key;
+ ::new (&TheBucket->getSecond()) ValueT(Value);
+ return TheBucket;
+ }
+
+ BucketT *InsertIntoBucket(const KeyT &Key, ValueT &&Value,
+ BucketT *TheBucket) {
+ TheBucket = InsertIntoBucketImpl(Key, TheBucket);
+
+ TheBucket->getFirst() = Key;
+ ::new (&TheBucket->getSecond()) ValueT(std::move(Value));
+ return TheBucket;
+ }
+
+ BucketT *InsertIntoBucket(KeyT &&Key, ValueT &&Value, BucketT *TheBucket) {
+ TheBucket = InsertIntoBucketImpl(Key, TheBucket);
+
+ TheBucket->getFirst() = std::move(Key);
+ ::new (&TheBucket->getSecond()) ValueT(std::move(Value));
+ return TheBucket;
+ }
+
+ BucketT *InsertIntoBucketImpl(const KeyT &Key, BucketT *TheBucket) {
+ incrementEpoch();
+
+ // If the load of the hash table is more than 3/4, or if fewer than 1/8 of
+ // the buckets are empty (meaning that many are filled with tombstones),
+ // grow the table.
+ //
+ // The latter case is tricky. For example, if we had one empty bucket with
+ // tons of tombstones, failing lookups (e.g. for insertion) would have to
+ // probe almost the entire table until it found the empty bucket. If the
+ // table were completely filled with tombstones, no lookup would ever succeed,
+ // causing infinite loops in lookup.
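    // Worked example (illustrative, not in the original source): with
    // NumBuckets == 64 the map doubles once NewNumEntries reaches 48,
    // since 48 * 4 >= 64 * 3, and it rehashes at the same size once no
    // more than 64 / 8 == 8 buckets remain genuinely empty.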
+ unsigned NewNumEntries = getNumEntries() + 1;
+ unsigned NumBuckets = getNumBuckets();
+ if (LLVM_UNLIKELY(NewNumEntries * 4 >= NumBuckets * 3)) {
+ this->grow(NumBuckets * 2);
+ LookupBucketFor(Key, TheBucket);
+ NumBuckets = getNumBuckets();
+ } else if (LLVM_UNLIKELY(NumBuckets-(NewNumEntries+getNumTombstones()) <=
+ NumBuckets/8)) {
+ this->grow(NumBuckets);
+ LookupBucketFor(Key, TheBucket);
+ }
+ assert(TheBucket);
+
+ // Only update the state after we've grown our bucket space appropriately
+ // so that when growing buckets we have self-consistent entry count.
+ incrementNumEntries();
+
+ // If we are writing over a tombstone, remember this.
+ const KeyT EmptyKey = getEmptyKey();
+ if (!KeyInfoT::isEqual(TheBucket->getFirst(), EmptyKey))
+ decrementNumTombstones();
+
+ return TheBucket;
+ }
+
+ /// LookupBucketFor - Lookup the appropriate bucket for Val, returning it in
+ /// FoundBucket. If the bucket contains the key and a value, this returns
+ /// true, otherwise it returns a bucket with an empty marker or tombstone and
+ /// returns false.
+ template<typename LookupKeyT>
+ bool LookupBucketFor(const LookupKeyT &Val,
+ const BucketT *&FoundBucket) const {
+ const BucketT *BucketsPtr = getBuckets();
+ const unsigned NumBuckets = getNumBuckets();
+
+ if (NumBuckets == 0) {
+ FoundBucket = nullptr;
+ return false;
+ }
+
+ // FoundTombstone - Keep track of whether we find a tombstone while probing.
+ const BucketT *FoundTombstone = nullptr;
+ const KeyT EmptyKey = getEmptyKey();
+ const KeyT TombstoneKey = getTombstoneKey();
+ assert(!KeyInfoT::isEqual(Val, EmptyKey) &&
+ !KeyInfoT::isEqual(Val, TombstoneKey) &&
+ "Empty/Tombstone value shouldn't be inserted into map!");
+
+ unsigned BucketNo = getHashValue(Val) & (NumBuckets-1);
+ unsigned ProbeAmt = 1;
+ while (1) {
+ const BucketT *ThisBucket = BucketsPtr + BucketNo;
+ // Found Val's bucket? If so, return it.
+ if (LLVM_LIKELY(KeyInfoT::isEqual(Val, ThisBucket->getFirst()))) {
+ FoundBucket = ThisBucket;
+ return true;
+ }
+
+ // If we found an empty bucket, the key doesn't exist in the set.
+ // Insert it and return the default value.
+ if (LLVM_LIKELY(KeyInfoT::isEqual(ThisBucket->getFirst(), EmptyKey))) {
+ // If we've already seen a tombstone while probing, fill it in instead
+ // of the empty bucket we eventually probed to.
+ FoundBucket = FoundTombstone ? FoundTombstone : ThisBucket;
+ return false;
+ }
+
+ // If this is a tombstone, remember it. If Val ends up not in the map, we
+ // prefer to return it rather than something that would require more probing.
+ if (KeyInfoT::isEqual(ThisBucket->getFirst(), TombstoneKey) &&
+ !FoundTombstone)
+ FoundTombstone = ThisBucket; // Remember the first tombstone found.
+
+ // Otherwise, it's a hash collision or a tombstone, continue quadratic
+ // probing.
+ BucketNo += ProbeAmt++;
+ BucketNo &= (NumBuckets-1);
+ }
+ }
+
+ template <typename LookupKeyT>
+ bool LookupBucketFor(const LookupKeyT &Val, BucketT *&FoundBucket) {
+ const BucketT *ConstFoundBucket;
+ bool Result = const_cast<const DenseMapBase *>(this)
+ ->LookupBucketFor(Val, ConstFoundBucket);
+ FoundBucket = const_cast<BucketT *>(ConstFoundBucket);
+ return Result;
+ }
+
+public:
+ /// Return the approximate size (in bytes) of the actual map.
+ /// This is just the raw memory used by DenseMap.
+ /// If entries are pointers to objects, the size of the referenced objects
+ /// are not included.
+ size_t getMemorySize() const {
+ return getNumBuckets() * sizeof(BucketT);
+ }
+};
+
+template <typename KeyT, typename ValueT,
+ typename KeyInfoT = DenseMapInfo<KeyT>,
+ typename BucketT = detail::DenseMapPair<KeyT, ValueT>>
+class DenseMap : public DenseMapBase<DenseMap<KeyT, ValueT, KeyInfoT, BucketT>,
+ KeyT, ValueT, KeyInfoT, BucketT> {
+ // Lift some types from the dependent base class into this class for
+ // simplicity of referring to them.
+ typedef DenseMapBase<DenseMap, KeyT, ValueT, KeyInfoT, BucketT> BaseT;
+ friend class DenseMapBase<DenseMap, KeyT, ValueT, KeyInfoT, BucketT>;
+
+ BucketT *Buckets;
+ unsigned NumEntries;
+ unsigned NumTombstones;
+ unsigned NumBuckets;
+
+public:
+ explicit DenseMap(unsigned NumInitBuckets = 0) {
+ init(NumInitBuckets);
+ }
+
+ DenseMap(const DenseMap &other) : BaseT() {
+ init(0);
+ copyFrom(other);
+ }
+
+ DenseMap(DenseMap &&other) : BaseT() {
+ init(0);
+ swap(other);
+ }
+
+ template<typename InputIt>
+ DenseMap(const InputIt &I, const InputIt &E) {
+ init(NextPowerOf2(std::distance(I, E)));
+ this->insert(I, E);
+ }
+
+ ~DenseMap() {
+ this->destroyAll();
+ operator delete(Buckets);
+ }
+
+ void swap(DenseMap& RHS) {
+ this->incrementEpoch();
+ RHS.incrementEpoch();
+ std::swap(Buckets, RHS.Buckets);
+ std::swap(NumEntries, RHS.NumEntries);
+ std::swap(NumTombstones, RHS.NumTombstones);
+ std::swap(NumBuckets, RHS.NumBuckets);
+ }
+
+ DenseMap& operator=(const DenseMap& other) {
+ if (&other != this)
+ copyFrom(other);
+ return *this;
+ }
+
+ DenseMap& operator=(DenseMap &&other) {
+ this->destroyAll();
+ operator delete(Buckets);
+ init(0);
+ swap(other);
+ return *this;
+ }
+
+ void copyFrom(const DenseMap& other) {
+ this->destroyAll();
+ operator delete(Buckets);
+ if (allocateBuckets(other.NumBuckets)) {
+ this->BaseT::copyFrom(other);
+ } else {
+ NumEntries = 0;
+ NumTombstones = 0;
+ }
+ }
+
+ void init(unsigned InitBuckets) {
+ if (allocateBuckets(InitBuckets)) {
+ this->BaseT::initEmpty();
+ } else {
+ NumEntries = 0;
+ NumTombstones = 0;
+ }
+ }
+
+ void grow(unsigned AtLeast) {
+ unsigned OldNumBuckets = NumBuckets;
+ BucketT *OldBuckets = Buckets;
+
+ allocateBuckets(std::max<unsigned>(64, static_cast<unsigned>(NextPowerOf2(AtLeast-1))));
+ assert(Buckets);
+ if (!OldBuckets) {
+ this->BaseT::initEmpty();
+ return;
+ }
+
+ this->moveFromOldBuckets(OldBuckets, OldBuckets+OldNumBuckets);
+
+ // Free the old table.
+ operator delete(OldBuckets);
+ }
+
+ void shrink_and_clear() {
+ unsigned OldNumEntries = NumEntries;
+ this->destroyAll();
+
+ // Reduce the number of buckets.
+ unsigned NewNumBuckets = 0;
+ if (OldNumEntries)
+ NewNumBuckets = std::max(64, 1 << (Log2_32_Ceil(OldNumEntries) + 1));
+ if (NewNumBuckets == NumBuckets) {
+ this->BaseT::initEmpty();
+ return;
+ }
+
+ operator delete(Buckets);
+ init(NewNumBuckets);
+ }
+
+private:
+ unsigned getNumEntries() const {
+ return NumEntries;
+ }
+ void setNumEntries(unsigned Num) {
+ NumEntries = Num;
+ }
+
+ unsigned getNumTombstones() const {
+ return NumTombstones;
+ }
+ void setNumTombstones(unsigned Num) {
+ NumTombstones = Num;
+ }
+
+ BucketT *getBuckets() const {
+ return Buckets;
+ }
+
+ unsigned getNumBuckets() const {
+ return NumBuckets;
+ }
+
+ bool allocateBuckets(unsigned Num) {
+ NumBuckets = Num;
+ if (NumBuckets == 0) {
+ Buckets = nullptr;
+ return false;
+ }
+
+ Buckets = static_cast<BucketT*>(operator new(sizeof(BucketT) * NumBuckets));
+ return true;
+ }
+};
+
+template <typename KeyT, typename ValueT, unsigned InlineBuckets = 4,
+ typename KeyInfoT = DenseMapInfo<KeyT>,
+ typename BucketT = detail::DenseMapPair<KeyT, ValueT>>
+class SmallDenseMap
+ : public DenseMapBase<
+ SmallDenseMap<KeyT, ValueT, InlineBuckets, KeyInfoT, BucketT>, KeyT,
+ ValueT, KeyInfoT, BucketT> {
+ // Lift some types from the dependent base class into this class for
+ // simplicity of referring to them.
+ typedef DenseMapBase<SmallDenseMap, KeyT, ValueT, KeyInfoT, BucketT> BaseT;
+ friend class DenseMapBase<SmallDenseMap, KeyT, ValueT, KeyInfoT, BucketT>;
+
+ unsigned Small : 1;
+ unsigned NumEntries : 31;
+ unsigned NumTombstones;
+
+ struct LargeRep {
+ BucketT *Buckets;
+ unsigned NumBuckets;
+ };
+
+ /// A "union" of an inline bucket array and the struct representing
+ /// a large bucket. This union will be discriminated by the 'Small' bit.
+ AlignedCharArrayUnion<BucketT[InlineBuckets], LargeRep> storage;
+
+public:
+ explicit SmallDenseMap(unsigned NumInitBuckets = 0) {
+ init(NumInitBuckets);
+ }
+
+ SmallDenseMap(const SmallDenseMap &other) : BaseT() {
+ init(0);
+ copyFrom(other);
+ }
+
+ SmallDenseMap(SmallDenseMap &&other) : BaseT() {
+ init(0);
+ swap(other);
+ }
+
+ template<typename InputIt>
+ SmallDenseMap(const InputIt &I, const InputIt &E) {
+ init(NextPowerOf2(std::distance(I, E)));
+ this->insert(I, E);
+ }
+
+ ~SmallDenseMap() {
+ this->destroyAll();
+ deallocateBuckets();
+ }
+
+ void swap(SmallDenseMap& RHS) {
+ unsigned TmpNumEntries = RHS.NumEntries;
+ RHS.NumEntries = NumEntries;
+ NumEntries = TmpNumEntries;
+ std::swap(NumTombstones, RHS.NumTombstones);
+
+ const KeyT EmptyKey = this->getEmptyKey();
+ const KeyT TombstoneKey = this->getTombstoneKey();
+ if (Small && RHS.Small) {
+ // If we're swapping inline bucket arrays, we have to cope with some of
+ // the tricky bits of DenseMap's storage system: the buckets are not
+ // fully initialized. Thus we swap every key, but we may have
+ // a one-directional move of the value.
+ for (unsigned i = 0, e = InlineBuckets; i != e; ++i) {
+ BucketT *LHSB = &getInlineBuckets()[i],
+ *RHSB = &RHS.getInlineBuckets()[i];
+ bool hasLHSValue = (!KeyInfoT::isEqual(LHSB->getFirst(), EmptyKey) &&
+ !KeyInfoT::isEqual(LHSB->getFirst(), TombstoneKey));
+ bool hasRHSValue = (!KeyInfoT::isEqual(RHSB->getFirst(), EmptyKey) &&
+ !KeyInfoT::isEqual(RHSB->getFirst(), TombstoneKey));
+ if (hasLHSValue && hasRHSValue) {
+ // Swap together if we can...
+ std::swap(*LHSB, *RHSB);
+ continue;
+ }
+ // Swap separately and handle any asymmetry.
+ std::swap(LHSB->getFirst(), RHSB->getFirst());
+ if (hasLHSValue) {
+ ::new (&RHSB->getSecond()) ValueT(std::move(LHSB->getSecond()));
+ LHSB->getSecond().~ValueT();
+ } else if (hasRHSValue) {
+ ::new (&LHSB->getSecond()) ValueT(std::move(RHSB->getSecond()));
+ RHSB->getSecond().~ValueT();
+ }
+ }
+ return;
+ }
+ if (!Small && !RHS.Small) {
+ std::swap(getLargeRep()->Buckets, RHS.getLargeRep()->Buckets);
+ std::swap(getLargeRep()->NumBuckets, RHS.getLargeRep()->NumBuckets);
+ return;
+ }
+
+ SmallDenseMap &SmallSide = Small ? *this : RHS;
+ SmallDenseMap &LargeSide = Small ? RHS : *this;
+
+ // First stash the large side's rep and move the small side across.
+ LargeRep TmpRep = std::move(*LargeSide.getLargeRep());
+ LargeSide.getLargeRep()->~LargeRep();
+ LargeSide.Small = true;
+ // This is similar to the standard move-from-old-buckets, but the bucket
+ // count hasn't actually rotated in this case. So we have to carefully
+ // move construct the keys and values into their new locations, but there
+ // is no need to re-hash things.
+ for (unsigned i = 0, e = InlineBuckets; i != e; ++i) {
+ BucketT *NewB = &LargeSide.getInlineBuckets()[i],
+ *OldB = &SmallSide.getInlineBuckets()[i];
+ ::new (&NewB->getFirst()) KeyT(std::move(OldB->getFirst()));
+ OldB->getFirst().~KeyT();
+ if (!KeyInfoT::isEqual(NewB->getFirst(), EmptyKey) &&
+ !KeyInfoT::isEqual(NewB->getFirst(), TombstoneKey)) {
+ ::new (&NewB->getSecond()) ValueT(std::move(OldB->getSecond()));
+ OldB->getSecond().~ValueT();
+ }
+ }
+
+ // The hard part of moving the small buckets across is done, just move
+ // the TmpRep into its new home.
+ SmallSide.Small = false;
+ new (SmallSide.getLargeRep()) LargeRep(std::move(TmpRep));
+ }
+
+ SmallDenseMap& operator=(const SmallDenseMap& other) {
+ if (&other != this)
+ copyFrom(other);
+ return *this;
+ }
+
+ SmallDenseMap& operator=(SmallDenseMap &&other) {
+ this->destroyAll();
+ deallocateBuckets();
+ init(0);
+ swap(other);
+ return *this;
+ }
+
+ void copyFrom(const SmallDenseMap& other) {
+ this->destroyAll();
+ deallocateBuckets();
+ Small = true;
+ if (other.getNumBuckets() > InlineBuckets) {
+ Small = false;
+ new (getLargeRep()) LargeRep(allocateBuckets(other.getNumBuckets()));
+ }
+ this->BaseT::copyFrom(other);
+ }
+
+ void init(unsigned InitBuckets) {
+ Small = true;
+ if (InitBuckets > InlineBuckets) {
+ Small = false;
+ new (getLargeRep()) LargeRep(allocateBuckets(InitBuckets));
+ }
+ this->BaseT::initEmpty();
+ }
+
+ void grow(unsigned AtLeast) {
+ if (AtLeast >= InlineBuckets)
+ AtLeast = std::max<unsigned>(64, NextPowerOf2(AtLeast-1));
+
+ if (Small) {
+ if (AtLeast < InlineBuckets)
+ return; // Nothing to do.
+
+ // First move the inline buckets into a temporary storage.
+ AlignedCharArrayUnion<BucketT[InlineBuckets]> TmpStorage;
+ BucketT *TmpBegin = reinterpret_cast<BucketT *>(TmpStorage.buffer);
+ BucketT *TmpEnd = TmpBegin;
+
+ // Loop over the buckets, moving non-empty, non-tombstones into the
+ // temporary storage. Have the loop move the TmpEnd forward as it goes.
+ const KeyT EmptyKey = this->getEmptyKey();
+ const KeyT TombstoneKey = this->getTombstoneKey();
+ for (BucketT *P = getBuckets(), *E = P + InlineBuckets; P != E; ++P) {
+ if (!KeyInfoT::isEqual(P->getFirst(), EmptyKey) &&
+ !KeyInfoT::isEqual(P->getFirst(), TombstoneKey)) {
+ assert(size_t(TmpEnd - TmpBegin) < InlineBuckets &&
+ "Too many inline buckets!");
+ ::new (&TmpEnd->getFirst()) KeyT(std::move(P->getFirst()));
+ ::new (&TmpEnd->getSecond()) ValueT(std::move(P->getSecond()));
+ ++TmpEnd;
+ P->getSecond().~ValueT();
+ }
+ P->getFirst().~KeyT();
+ }
+
+ // Now make this map use the large rep, and move all the entries back
+ // into it.
+ Small = false;
+ new (getLargeRep()) LargeRep(allocateBuckets(AtLeast));
+ this->moveFromOldBuckets(TmpBegin, TmpEnd);
+ return;
+ }
+
+ LargeRep OldRep = std::move(*getLargeRep());
+ getLargeRep()->~LargeRep();
+ if (AtLeast <= InlineBuckets) {
+ Small = true;
+ } else {
+ new (getLargeRep()) LargeRep(allocateBuckets(AtLeast));
+ }
+
+ this->moveFromOldBuckets(OldRep.Buckets, OldRep.Buckets+OldRep.NumBuckets);
+
+ // Free the old table.
+ operator delete(OldRep.Buckets);
+ }
+
+ void shrink_and_clear() {
+ unsigned OldSize = this->size();
+ this->destroyAll();
+
+ // Reduce the number of buckets.
+ unsigned NewNumBuckets = 0;
+ if (OldSize) {
+ NewNumBuckets = 1 << (Log2_32_Ceil(OldSize) + 1);
+ if (NewNumBuckets > InlineBuckets && NewNumBuckets < 64u)
+ NewNumBuckets = 64;
+ }
+ if ((Small && NewNumBuckets <= InlineBuckets) ||
+ (!Small && NewNumBuckets == getLargeRep()->NumBuckets)) {
+ this->BaseT::initEmpty();
+ return;
+ }
+
+ deallocateBuckets();
+ init(NewNumBuckets);
+ }
+
+private:
+ unsigned getNumEntries() const {
+ return NumEntries;
+ }
+ void setNumEntries(unsigned Num) {
+ assert(Num < INT_MAX && "Cannot support more than INT_MAX entries");
+ NumEntries = Num;
+ }
+
+ unsigned getNumTombstones() const {
+ return NumTombstones;
+ }
+ void setNumTombstones(unsigned Num) {
+ NumTombstones = Num;
+ }
+
+ const BucketT *getInlineBuckets() const {
+ assert(Small);
+ // Note that this cast does not violate aliasing rules as we assert that
+ // the memory's dynamic type is the small, inline bucket buffer, and the
+ // 'storage.buffer' static type is 'char *'.
+ return reinterpret_cast<const BucketT *>(storage.buffer);
+ }
+ BucketT *getInlineBuckets() {
+ return const_cast<BucketT *>(
+ const_cast<const SmallDenseMap *>(this)->getInlineBuckets());
+ }
+ const LargeRep *getLargeRep() const {
+ assert(!Small);
+ // Note, same rule about aliasing as with getInlineBuckets.
+ return reinterpret_cast<const LargeRep *>(storage.buffer);
+ }
+ LargeRep *getLargeRep() {
+ return const_cast<LargeRep *>(
+ const_cast<const SmallDenseMap *>(this)->getLargeRep());
+ }
+
+ const BucketT *getBuckets() const {
+ return Small ? getInlineBuckets() : getLargeRep()->Buckets;
+ }
+ BucketT *getBuckets() {
+ return const_cast<BucketT *>(
+ const_cast<const SmallDenseMap *>(this)->getBuckets());
+ }
+ unsigned getNumBuckets() const {
+ return Small ? InlineBuckets : getLargeRep()->NumBuckets;
+ }
+
+ void deallocateBuckets() {
+ if (Small)
+ return;
+
+ operator delete(getLargeRep()->Buckets);
+ getLargeRep()->~LargeRep();
+ }
+
+ LargeRep allocateBuckets(unsigned Num) {
+ assert(Num > InlineBuckets && "Must allocate more buckets than are inline");
+ LargeRep Rep = {
+ static_cast<BucketT*>(operator new(sizeof(BucketT) * Num)), Num
+ };
+ return Rep;
+ }
+};
+
+template <typename KeyT, typename ValueT, typename KeyInfoT, typename Bucket,
+ bool IsConst>
+class DenseMapIterator : DebugEpochBase::HandleBase {
+ typedef DenseMapIterator<KeyT, ValueT, KeyInfoT, Bucket, true> ConstIterator;
+ friend class DenseMapIterator<KeyT, ValueT, KeyInfoT, Bucket, true>;
+ friend class DenseMapIterator<KeyT, ValueT, KeyInfoT, Bucket, false>;
+
+public:
+ typedef ptrdiff_t difference_type;
+ typedef typename std::conditional<IsConst, const Bucket, Bucket>::type
+ value_type;
+ typedef value_type *pointer;
+ typedef value_type &reference;
+ typedef std::forward_iterator_tag iterator_category;
+private:
+ pointer Ptr, End;
+public:
+ DenseMapIterator() : Ptr(nullptr), End(nullptr) {}
+
+ DenseMapIterator(pointer Pos, pointer E, const DebugEpochBase &Epoch,
+ bool NoAdvance = false)
+ : DebugEpochBase::HandleBase(&Epoch), Ptr(Pos), End(E) {
+ assert(isHandleInSync() && "invalid construction!");
+ if (!NoAdvance) AdvancePastEmptyBuckets();
+ }
+
+ // Converting ctor from non-const iterators to const iterators. SFINAE'd out
+ // for const iterator destinations so it doesn't end up as a user defined copy
+ // constructor.
+ template <bool IsConstSrc,
+ typename = typename std::enable_if<!IsConstSrc && IsConst>::type>
+ DenseMapIterator(
+ const DenseMapIterator<KeyT, ValueT, KeyInfoT, Bucket, IsConstSrc> &I)
+ : DebugEpochBase::HandleBase(I), Ptr(I.Ptr), End(I.End) {}
+
+ reference operator*() const {
+ assert(isHandleInSync() && "invalid iterator access!");
+ return *Ptr;
+ }
+ pointer operator->() const {
+ assert(isHandleInSync() && "invalid iterator access!");
+ return Ptr;
+ }
+
+ bool operator==(const ConstIterator &RHS) const {
+ assert((!Ptr || isHandleInSync()) && "handle not in sync!");
+ assert((!RHS.Ptr || RHS.isHandleInSync()) && "handle not in sync!");
+ assert(getEpochAddress() == RHS.getEpochAddress() &&
+ "comparing incomparable iterators!");
+ return Ptr == RHS.Ptr;
+ }
+ bool operator!=(const ConstIterator &RHS) const {
+ assert((!Ptr || isHandleInSync()) && "handle not in sync!");
+ assert((!RHS.Ptr || RHS.isHandleInSync()) && "handle not in sync!");
+ assert(getEpochAddress() == RHS.getEpochAddress() &&
+ "comparing incomparable iterators!");
+ return Ptr != RHS.Ptr;
+ }
+
+ inline DenseMapIterator& operator++() { // Preincrement
+ assert(isHandleInSync() && "invalid iterator access!");
+ ++Ptr;
+ AdvancePastEmptyBuckets();
+ return *this;
+ }
+ DenseMapIterator operator++(int) { // Postincrement
+ assert(isHandleInSync() && "invalid iterator access!");
+ DenseMapIterator tmp = *this; ++*this; return tmp;
+ }
+
+private:
+ void AdvancePastEmptyBuckets() {
+ const KeyT Empty = KeyInfoT::getEmptyKey();
+ const KeyT Tombstone = KeyInfoT::getTombstoneKey();
+
+ while (Ptr != End && (KeyInfoT::isEqual(Ptr->getFirst(), Empty) ||
+ KeyInfoT::isEqual(Ptr->getFirst(), Tombstone)))
+ ++Ptr;
+ }
+};
+
+template<typename KeyT, typename ValueT, typename KeyInfoT>
+static inline size_t
+capacity_in_bytes(const DenseMap<KeyT, ValueT, KeyInfoT> &X) {
+ return X.getMemorySize();
+}
+
+} // end namespace llvm
+
+#endif
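An illustrative sketch (not part of the patch) of basic DenseMap and SmallDenseMap usage; it relies only on members declared above plus the DenseMapInfo<unsigned> specialization from llvm/ADT/DenseMapInfo.h, the next file in this patch.

#include <cstdio>
#include "llvm/ADT/DenseMap.h"

int main() {
    llvm::DenseMap<unsigned, const char*> names;
    names.insert({1u, "tip_clipper"});          // pair insert; no update if the key exists
    names[2u] = "bulge_remover";                // operator[] inserts a default value first

    if (names.count(2u))
        std::printf("2 -> %s\n", names.lookup(2u));

    for (auto &kv : names)                      // buckets derive from std::pair
        std::printf("%u -> %s\n", kv.first, kv.second);

    names.erase(1u);                            // leaves a tombstone behind

    // SmallDenseMap keeps its first buckets inline (4 by default here) and
    // only heap-allocates once it grows past them.
    llvm::SmallDenseMap<unsigned, unsigned> hist;
    unsigned ks[] = {3u, 5u, 3u};
    for (unsigned k : ks)
        ++hist[k];                              // unsigned values start at 0
    std::printf("hist[3] = %u\n", hist.lookup(3u));
}

Note that lookup() returns a default-constructed value for missing keys, while operator[] inserts one, so prefer lookup()/find() when the map should not grow.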
diff --git a/ext/include/llvm/ADT/DenseMapInfo.h b/ext/include/llvm/ADT/DenseMapInfo.h
new file mode 100644
index 0000000..a844ebc
--- /dev/null
+++ b/ext/include/llvm/ADT/DenseMapInfo.h
@@ -0,0 +1,221 @@
+//===- llvm/ADT/DenseMapInfo.h - Type traits for DenseMap -------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines DenseMapInfo traits for DenseMap.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_ADT_DENSEMAPINFO_H
+#define LLVM_ADT_DENSEMAPINFO_H
+
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/Hashing.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/PointerLikeTypeTraits.h"
+#include "llvm/Support/type_traits.h"
+
+namespace llvm {
+
+template<typename T>
+struct DenseMapInfo {
+ //static inline T getEmptyKey();
+ //static inline T getTombstoneKey();
+ //static unsigned getHashValue(const T &Val);
+ //static bool isEqual(const T &LHS, const T &RHS);
+};
+
+// Provide DenseMapInfo for all pointers.
+template<typename T>
+struct DenseMapInfo<T*> {
+ static inline T* getEmptyKey() {
+ uintptr_t Val = static_cast<uintptr_t>(-1);
+ Val <<= PointerLikeTypeTraits<T*>::NumLowBitsAvailable;
+ return reinterpret_cast<T*>(Val);
+ }
+ static inline T* getTombstoneKey() {
+ uintptr_t Val = static_cast<uintptr_t>(-2);
+ Val <<= PointerLikeTypeTraits<T*>::NumLowBitsAvailable;
+ return reinterpret_cast<T*>(Val);
+ }
+ static unsigned getHashValue(const T *PtrVal) {
+ return (unsigned((uintptr_t)PtrVal) >> 4) ^
+ (unsigned((uintptr_t)PtrVal) >> 9);
+ }
+ static bool isEqual(const T *LHS, const T *RHS) { return LHS == RHS; }
+};
+
+// Provide DenseMapInfo for chars.
+template<> struct DenseMapInfo<char> {
+ static inline char getEmptyKey() { return ~0; }
+ static inline char getTombstoneKey() { return ~0 - 1; }
+ static unsigned getHashValue(const char& Val) { return Val * 37U; }
+ static bool isEqual(const char &LHS, const char &RHS) {
+ return LHS == RHS;
+ }
+};
+
+// Provide DenseMapInfo for unsigned ints.
+template<> struct DenseMapInfo<unsigned> {
+ static inline unsigned getEmptyKey() { return ~0U; }
+ static inline unsigned getTombstoneKey() { return ~0U - 1; }
+ static unsigned getHashValue(const unsigned& Val) { return Val * 37U; }
+ static bool isEqual(const unsigned& LHS, const unsigned& RHS) {
+ return LHS == RHS;
+ }
+};
+
+// Provide DenseMapInfo for unsigned longs.
+template<> struct DenseMapInfo<unsigned long> {
+ static inline unsigned long getEmptyKey() { return ~0UL; }
+ static inline unsigned long getTombstoneKey() { return ~0UL - 1L; }
+ static unsigned getHashValue(const unsigned long& Val) {
+ return (unsigned)(Val * 37UL);
+ }
+ static bool isEqual(const unsigned long& LHS, const unsigned long& RHS) {
+ return LHS == RHS;
+ }
+};
+
+// Provide DenseMapInfo for unsigned long longs.
+template<> struct DenseMapInfo<unsigned long long> {
+ static inline unsigned long long getEmptyKey() { return ~0ULL; }
+ static inline unsigned long long getTombstoneKey() { return ~0ULL - 1ULL; }
+ static unsigned getHashValue(const unsigned long long& Val) {
+ return (unsigned)(Val * 37ULL);
+ }
+ static bool isEqual(const unsigned long long& LHS,
+ const unsigned long long& RHS) {
+ return LHS == RHS;
+ }
+};
+
+// Provide DenseMapInfo for ints.
+template<> struct DenseMapInfo<int> {
+ static inline int getEmptyKey() { return 0x7fffffff; }
+ static inline int getTombstoneKey() { return -0x7fffffff - 1; }
+ static unsigned getHashValue(const int& Val) { return (unsigned)(Val * 37U); }
+ static bool isEqual(const int& LHS, const int& RHS) {
+ return LHS == RHS;
+ }
+};
+
+// Provide DenseMapInfo for longs.
+template<> struct DenseMapInfo<long> {
+ static inline long getEmptyKey() {
+ return (1UL << (sizeof(long) * 8 - 1)) - 1UL;
+ }
+ static inline long getTombstoneKey() { return getEmptyKey() - 1L; }
+ static unsigned getHashValue(const long& Val) {
+ return (unsigned)(Val * 37UL);
+ }
+ static bool isEqual(const long& LHS, const long& RHS) {
+ return LHS == RHS;
+ }
+};
+
+// Provide DenseMapInfo for long longs.
+template<> struct DenseMapInfo<long long> {
+ static inline long long getEmptyKey() { return 0x7fffffffffffffffLL; }
+ static inline long long getTombstoneKey() { return -0x7fffffffffffffffLL-1; }
+ static unsigned getHashValue(const long long& Val) {
+ return (unsigned)(Val * 37ULL);
+ }
+ static bool isEqual(const long long& LHS,
+ const long long& RHS) {
+ return LHS == RHS;
+ }
+};
+
+// Provide DenseMapInfo for all pairs whose members have info.
+template<typename T, typename U>
+struct DenseMapInfo<std::pair<T, U> > {
+ typedef std::pair<T, U> Pair;
+ typedef DenseMapInfo<T> FirstInfo;
+ typedef DenseMapInfo<U> SecondInfo;
+
+ static inline Pair getEmptyKey() {
+ return std::make_pair(FirstInfo::getEmptyKey(),
+ SecondInfo::getEmptyKey());
+ }
+ static inline Pair getTombstoneKey() {
+ return std::make_pair(FirstInfo::getTombstoneKey(),
+ SecondInfo::getTombstoneKey());
+ }
+ static unsigned getHashValue(const Pair& PairVal) {
+ uint64_t key = (uint64_t)FirstInfo::getHashValue(PairVal.first) << 32
+ | (uint64_t)SecondInfo::getHashValue(PairVal.second);
+ key += ~(key << 32);
+ key ^= (key >> 22);
+ key += ~(key << 13);
+ key ^= (key >> 8);
+ key += (key << 3);
+ key ^= (key >> 15);
+ key += ~(key << 27);
+ key ^= (key >> 31);
+ return (unsigned)key;
+ }
+ static bool isEqual(const Pair &LHS, const Pair &RHS) {
+ return FirstInfo::isEqual(LHS.first, RHS.first) &&
+ SecondInfo::isEqual(LHS.second, RHS.second);
+ }
+};
+
+// Provide DenseMapInfo for StringRefs.
+template <> struct DenseMapInfo<StringRef> {
+ static inline StringRef getEmptyKey() {
+ return StringRef(reinterpret_cast<const char *>(~static_cast<uintptr_t>(0)),
+ 0);
+ }
+ static inline StringRef getTombstoneKey() {
+ return StringRef(reinterpret_cast<const char *>(~static_cast<uintptr_t>(1)),
+ 0);
+ }
+ static unsigned getHashValue(StringRef Val) {
+ assert(Val.data() != getEmptyKey().data() && "Cannot hash the empty key!");
+ assert(Val.data() != getTombstoneKey().data() &&
+ "Cannot hash the tombstone key!");
+ return (unsigned)(hash_value(Val));
+ }
+ static bool isEqual(StringRef LHS, StringRef RHS) {
+ if (RHS.data() == getEmptyKey().data())
+ return LHS.data() == getEmptyKey().data();
+ if (RHS.data() == getTombstoneKey().data())
+ return LHS.data() == getTombstoneKey().data();
+ return LHS == RHS;
+ }
+};
+
+// Provide DenseMapInfo for ArrayRefs.
+template <typename T> struct DenseMapInfo<ArrayRef<T>> {
+ static inline ArrayRef<T> getEmptyKey() {
+ return ArrayRef<T>(reinterpret_cast<const T *>(~static_cast<uintptr_t>(0)),
+ size_t(0));
+ }
+ static inline ArrayRef<T> getTombstoneKey() {
+ return ArrayRef<T>(reinterpret_cast<const T *>(~static_cast<uintptr_t>(1)),
+ size_t(0));
+ }
+ static unsigned getHashValue(ArrayRef<T> Val) {
+ assert(Val.data() != getEmptyKey().data() && "Cannot hash the empty key!");
+ assert(Val.data() != getTombstoneKey().data() &&
+ "Cannot hash the tombstone key!");
+ return (unsigned)(hash_value(Val));
+ }
+ static bool isEqual(ArrayRef<T> LHS, ArrayRef<T> RHS) {
+ if (RHS.data() == getEmptyKey().data())
+ return LHS.data() == getEmptyKey().data();
+ if (RHS.data() == getTombstoneKey().data())
+ return LHS.data() == getTombstoneKey().data();
+ return LHS == RHS;
+ }
+};
+
+} // end namespace llvm
+
+#endif
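The commented-out members of the primary template above spell out the contract a key type must meet. Below is a hedged sketch (GeneId is an invented example type, not from the patch) of a specialization for a small user-defined key; the empty and tombstone keys only need to be two distinct values that never occur as real keys.

#include "llvm/ADT/DenseMapInfo.h"

struct GeneId { unsigned Value; };   // hypothetical key type for illustration

namespace llvm {
template <> struct DenseMapInfo<GeneId> {
  // Two reserved values mark empty and tombstone buckets.
  static inline GeneId getEmptyKey() { return GeneId{~0U}; }
  static inline GeneId getTombstoneKey() { return GeneId{~0U - 1}; }
  static unsigned getHashValue(const GeneId &K) { return K.Value * 37U; }
  static bool isEqual(const GeneId &L, const GeneId &R) {
    return L.Value == R.Value;
  }
};
} // namespace llvm

With such a specialization in scope, llvm::DenseMap<GeneId, unsigned> behaves like any other DenseMap.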
diff --git a/ext/include/llvm/ADT/EpochTracker.h b/ext/include/llvm/ADT/EpochTracker.h
new file mode 100644
index 0000000..97f1f36
--- /dev/null
+++ b/ext/include/llvm/ADT/EpochTracker.h
@@ -0,0 +1,78 @@
+//===- llvm/ADT/EpochTracker.h - ADT epoch tracking --------------*- C++ -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the DebugEpochBase and DebugEpochBase::HandleBase classes.
+// These can be used to write iterators that are fail-fast when LLVM is built
+// with asserts enabled.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_ADT_EPOCH_TRACKER_H
+#define LLVM_ADT_EPOCH_TRACKER_H
+
+#include <cstdint>
+
+namespace llvm {
+
+/// \brief A base class for data structure classes wishing to make iterators
+/// ("handles") pointing into themselves fail-fast. When building without
+/// asserts, this class is empty and does nothing.
+///
+/// DebugEpochBase does not by itself track handles pointing into itself. The
+/// expectation is that routines touching the handles will poll on
+/// isHandleInSync at appropriate points to assert that the handle they're using
+/// is still valid.
+///
+class DebugEpochBase {
+ uint64_t Epoch;
+
+public:
+ DebugEpochBase() : Epoch(0) {}
+
+ /// \brief Calling incrementEpoch invalidates all handles pointing into the
+ /// calling instance.
+ void incrementEpoch() { ++Epoch; }
+
+ /// \brief The destructor calls incrementEpoch to make use-after-free bugs
+ /// more likely to crash deterministically.
+ ~DebugEpochBase() { incrementEpoch(); }
+
+ /// \brief A base class for iterator classes ("handles") that wish to poll for
+ /// iterator invalidating modifications in the underlying data structure.
+ /// When LLVM is built without asserts, this class is empty and does nothing.
+ ///
+ /// HandleBase does not track the parent data structure by itself. It expects
+ /// the routines modifying the data structure to call incrementEpoch when they
+ /// make an iterator-invalidating modification.
+ ///
+ class HandleBase {
+ const uint64_t *EpochAddress;
+ uint64_t EpochAtCreation;
+
+ public:
+ HandleBase() : EpochAddress(nullptr), EpochAtCreation(UINT64_MAX) {}
+
+ explicit HandleBase(const DebugEpochBase *Parent)
+ : EpochAddress(&Parent->Epoch), EpochAtCreation(Parent->Epoch) {}
+
+ /// \brief Returns true if the DebugEpochBase this Handle is linked to has
+ /// not called incrementEpoch on itself since the creation of this
+ /// HandleBase instance.
+ bool isHandleInSync() const { return *EpochAddress == EpochAtCreation; }
+
+ /// \brief Returns a pointer to the epoch word stored in the data structure
+ /// this handle points into. Can be used to check if two iterators point
+ /// into the same data structure.
+ const void *getEpochAddress() const { return EpochAddress; }
+ };
+};
+
+} // namespace llvm
+
+#endif
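To make the epoch mechanism concrete, here is a small sketch (invented names, not part of the patch) of how a container and its handle cooperate: the container derives from DebugEpochBase and bumps the epoch on every invalidating mutation, while the handle derives from HandleBase and asserts isHandleInSync() before each access, which is the same pattern DenseMap and DenseMapIterator follow above.

#include "llvm/ADT/EpochTracker.h"
#include <cassert>
#include <vector>

class TinyBag : public llvm::DebugEpochBase {
  std::vector<int> Data;

public:
  void push(int V) {
    incrementEpoch();                // invalidates all outstanding handles
    Data.push_back(V);
  }

  class Handle : public llvm::DebugEpochBase::HandleBase {
    const int *Ptr;

  public:
    Handle(const TinyBag &Bag, const int *P) : HandleBase(&Bag), Ptr(P) {}
    int get() const {
      assert(isHandleInSync() && "container modified after handle creation");
      return *Ptr;
    }
  };

  // For illustration only; assumes the bag is non-empty.
  Handle front_handle() const { return Handle(*this, &Data.front()); }
};

With asserts enabled, reading through a handle after a later push() trips the isHandleInSync() check instead of silently reading stale memory.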
diff --git a/ext/include/llvm/ADT/FoldingSet.h b/ext/include/llvm/ADT/FoldingSet.h
new file mode 100644
index 0000000..c920539
--- /dev/null
+++ b/ext/include/llvm/ADT/FoldingSet.h
@@ -0,0 +1,750 @@
+//===-- llvm/ADT/FoldingSet.h - Uniquing Hash Set ---------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines a hash set that can be used to remove duplication of nodes
+// in a graph. This code was originally created by Chris Lattner for use with
+// SelectionDAGCSEMap, but was isolated to provide use across the llvm code set.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_ADT_FOLDINGSET_H
+#define LLVM_ADT_FOLDINGSET_H
+
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/iterator.h"
+#include "llvm/Support/Allocator.h"
+#include "llvm/Support/DataTypes.h"
+
+namespace llvm {
+/// This folding set is used for two purposes:
+/// 1. Given information about a node we want to create, look up the unique
+/// instance of the node in the set. If the node already exists, return
+/// it, otherwise return the bucket it should be inserted into.
+/// 2. Given a node that has already been created, remove it from the set.
+///
+/// This class is implemented as a single-link chained hash table, where the
+/// "buckets" are actually the nodes themselves (the next pointer is in the
+/// node). The last node points back to the bucket to simplify node removal.
+///
+/// Any node that is to be included in the folding set must be a subclass of
+/// FoldingSetNode. The node class must also define a Profile method used to
+/// establish the unique bits of data for the node. The Profile method is
+/// passed a FoldingSetNodeID object which is used to gather the bits. Just
+/// call one of the Add* functions defined in the FoldingSetNodeID class.
+/// NOTE: The folding set does not own the nodes and it is the
+/// responsibility of the user to dispose of the nodes.
+///
+/// Eg.
+/// class MyNode : public FoldingSetNode {
+/// private:
+/// std::string Name;
+/// unsigned Value;
+/// public:
+/// MyNode(const char *N, unsigned V) : Name(N), Value(V) {}
+/// ...
+/// void Profile(FoldingSetNodeID &ID) const {
+/// ID.AddString(Name);
+/// ID.AddInteger(Value);
+/// }
+/// ...
+/// };
+///
+/// To define the folding set itself use the FoldingSet template;
+///
+/// Eg.
+/// FoldingSet<MyNode> MyFoldingSet;
+///
+/// Four public methods are available to manipulate the folding set;
+///
+/// 1) If you have an existing node that you want to add to the set but are
+/// unsure whether the node already exists, then call;
+///
+/// MyNode *M = MyFoldingSet.GetOrInsertNode(N);
+///
+/// If the result is equal to the input then the node has been inserted.
+/// Otherwise, the result is the node existing in the folding set, and the
+/// input can be discarded (use the result instead.)
+///
+/// 2) If you are ready to construct a node but want to check if it already
+/// exists, then call FindNodeOrInsertPos with a FoldingSetNodeID of the bits to
+/// check;
+///
+/// FoldingSetNodeID ID;
+/// ID.AddString(Name);
+/// ID.AddInteger(Value);
+/// void *InsertPoint;
+///
+/// MyNode *M = MyFoldingSet.FindNodeOrInsertPos(ID, InsertPoint);
+///
+/// If found then M will be non-NULL, else InsertPoint will point to where it
+/// should be inserted using InsertNode.
+///
+/// 3) If you get a NULL result from FindNodeOrInsertPos then you can insert
+/// the new node by calling InsertNode;
+///
+/// InsertNode(N, InsertPoint);
+///
+/// 4) Finally, if you want to remove a node from the folding set call;
+///
+/// bool WasRemoved = RemoveNode(N);
+///
+/// The result indicates whether the node existed in the folding set.
+
+class FoldingSetNodeID;
+
+//===----------------------------------------------------------------------===//
+/// FoldingSetImpl - Implements the folding set functionality. The main
+/// structure is an array of buckets. Each bucket is indexed by the hash of
+/// the nodes it contains. The bucket itself points to the nodes contained
+/// in the bucket via a singly linked list. The last node in the list points
+/// back to the bucket to facilitate node removal.
+///
+class FoldingSetImpl {
+ virtual void anchor(); // Out of line virtual method.
+
+protected:
+ /// Buckets - Array of bucket chains.
+ ///
+ void **Buckets;
+
+ /// NumBuckets - Length of the Buckets array. Always a power of 2.
+ ///
+ unsigned NumBuckets;
+
+ /// NumNodes - Number of nodes in the folding set. Growth occurs when NumNodes
+ /// is greater than twice the number of buckets.
+ unsigned NumNodes;
+
+ explicit FoldingSetImpl(unsigned Log2InitSize = 6);
+ FoldingSetImpl(FoldingSetImpl &&Arg);
+ FoldingSetImpl &operator=(FoldingSetImpl &&RHS);
+ ~FoldingSetImpl();
+
+public:
+ //===--------------------------------------------------------------------===//
+ /// Node - This class is used to maintain the singly linked bucket list in
+ /// a folding set.
+ ///
+ class Node {
+ private:
+ // NextInFoldingSetBucket - next link in the bucket list.
+ void *NextInFoldingSetBucket;
+
+ public:
+ Node() : NextInFoldingSetBucket(nullptr) {}
+
+ // Accessors
+ void *getNextInBucket() const { return NextInFoldingSetBucket; }
+ void SetNextInBucket(void *N) { NextInFoldingSetBucket = N; }
+ };
+
+ /// clear - Remove all nodes from the folding set.
+ void clear();
+
+ /// RemoveNode - Remove a node from the folding set, returning true if one
+ /// was removed or false if the node was not in the folding set.
+ bool RemoveNode(Node *N);
+
+ /// GetOrInsertNode - If there is an existing simple Node exactly
+ /// equal to the specified node, return it. Otherwise, insert 'N' and return
+ /// it instead.
+ Node *GetOrInsertNode(Node *N);
+
+ /// FindNodeOrInsertPos - Look up the node specified by ID. If it exists,
+ /// return it. If not, return the insertion token that will make insertion
+ /// faster.
+ Node *FindNodeOrInsertPos(const FoldingSetNodeID &ID, void *&InsertPos);
+
+ /// InsertNode - Insert the specified node into the folding set, knowing that
+ /// it is not already in the folding set. InsertPos must be obtained from
+ /// FindNodeOrInsertPos.
+ void InsertNode(Node *N, void *InsertPos);
+
+ /// InsertNode - Insert the specified node into the folding set, knowing that
+ /// it is not already in the folding set.
+ void InsertNode(Node *N) {
+ Node *Inserted = GetOrInsertNode(N);
+ (void)Inserted;
+ assert(Inserted == N && "Node already inserted!");
+ }
+
+ /// size - Returns the number of nodes in the folding set.
+ unsigned size() const { return NumNodes; }
+
+ /// empty - Returns true if there are no nodes in the folding set.
+ bool empty() const { return NumNodes == 0; }
+
+private:
+ /// GrowHashTable - Double the size of the hash table and rehash everything.
+ ///
+ void GrowHashTable();
+
+protected:
+ /// GetNodeProfile - Instantiations of the FoldingSet template implement
+ /// this function to gather data bits for the given node.
+ virtual void GetNodeProfile(Node *N, FoldingSetNodeID &ID) const = 0;
+ /// NodeEquals - Instantiations of the FoldingSet template implement
+ /// this function to compare the given node with the given ID.
+ virtual bool NodeEquals(Node *N, const FoldingSetNodeID &ID, unsigned IDHash,
+ FoldingSetNodeID &TempID) const=0;
+ /// ComputeNodeHash - Instantiations of the FoldingSet template implement
+ /// this function to compute a hash value for the given node.
+ virtual unsigned ComputeNodeHash(Node *N, FoldingSetNodeID &TempID) const = 0;
+};
+
+//===----------------------------------------------------------------------===//
+
+template<typename T> struct FoldingSetTrait;
+
+/// DefaultFoldingSetTrait - This class provides default implementations
+/// for FoldingSetTrait implementations.
+///
+template<typename T> struct DefaultFoldingSetTrait {
+ static void Profile(const T &X, FoldingSetNodeID &ID) {
+ X.Profile(ID);
+ }
+ static void Profile(T &X, FoldingSetNodeID &ID) {
+ X.Profile(ID);
+ }
+
+ // Equals - Test if the profile for X would match ID, using TempID
+ // to compute a temporary ID if necessary. The default implementation
+ // just calls Profile and does a regular comparison. Implementations
+ // can override this to provide more efficient implementations.
+ static inline bool Equals(T &X, const FoldingSetNodeID &ID, unsigned IDHash,
+ FoldingSetNodeID &TempID);
+
+ // ComputeHash - Compute a hash value for X, using TempID to
+ // compute a temporary ID if necessary. The default implementation
+ // just calls Profile and does a regular hash computation.
+ // Implementations can override this to provide more efficient
+ // implementations.
+ static inline unsigned ComputeHash(T &X, FoldingSetNodeID &TempID);
+};
+
+/// FoldingSetTrait - This trait class is used to define behavior of how
+/// to "profile" (in the FoldingSet parlance) an object of a given type.
+/// The default behavior is to invoke a 'Profile' method on an object, but
+/// through template specialization the behavior can be tailored for specific
+/// types. Combined with the FoldingSetNodeWrapper class, one can add objects
+/// to FoldingSets that were not originally designed to have that behavior.
+template<typename T> struct FoldingSetTrait
+ : public DefaultFoldingSetTrait<T> {};
+
+template<typename T, typename Ctx> struct ContextualFoldingSetTrait;
+
+/// DefaultContextualFoldingSetTrait - Like DefaultFoldingSetTrait, but
+/// for ContextualFoldingSets.
+template<typename T, typename Ctx>
+struct DefaultContextualFoldingSetTrait {
+ static void Profile(T &X, FoldingSetNodeID &ID, Ctx Context) {
+ X.Profile(ID, Context);
+ }
+ static inline bool Equals(T &X, const FoldingSetNodeID &ID, unsigned IDHash,
+ FoldingSetNodeID &TempID, Ctx Context);
+ static inline unsigned ComputeHash(T &X, FoldingSetNodeID &TempID,
+ Ctx Context);
+};
+
+/// ContextualFoldingSetTrait - Like FoldingSetTrait, but for
+/// ContextualFoldingSets.
+template<typename T, typename Ctx> struct ContextualFoldingSetTrait
+ : public DefaultContextualFoldingSetTrait<T, Ctx> {};
+
+//===--------------------------------------------------------------------===//
+/// FoldingSetNodeIDRef - This class describes a reference to an interned
+/// FoldingSetNodeID, which can be useful for storing node id data rather
+/// than using plain FoldingSetNodeIDs, since the 32-element SmallVector
+/// is often much larger than necessary, and the possibility of heap
+/// allocation means it requires a non-trivial destructor call.
+class FoldingSetNodeIDRef {
+ const unsigned *Data;
+ size_t Size;
+
+public:
+ FoldingSetNodeIDRef() : Data(nullptr), Size(0) {}
+ FoldingSetNodeIDRef(const unsigned *D, size_t S) : Data(D), Size(S) {}
+
+ /// ComputeHash - Compute a strong hash value for this FoldingSetNodeIDRef,
+ /// used to lookup the node in the FoldingSetImpl.
+ unsigned ComputeHash() const;
+
+ bool operator==(FoldingSetNodeIDRef) const;
+
+ bool operator!=(FoldingSetNodeIDRef RHS) const { return !(*this == RHS); }
+
+ /// Used to compare the "ordering" of two nodes as defined by the
+ /// profiled bits and their ordering defined by memcmp().
+ bool operator<(FoldingSetNodeIDRef) const;
+
+ const unsigned *getData() const { return Data; }
+ size_t getSize() const { return Size; }
+};
+
+//===--------------------------------------------------------------------===//
+/// FoldingSetNodeID - This class is used to gather all the unique data bits of
+/// a node. When all the bits are gathered this class is used to produce a
+/// hash value for the node.
+///
+class FoldingSetNodeID {
+ /// Bits - Vector of all the data bits that make the node unique.
+ /// Use a SmallVector to avoid a heap allocation in the common case.
+ SmallVector<unsigned, 32> Bits;
+
+public:
+ FoldingSetNodeID() {}
+
+ FoldingSetNodeID(FoldingSetNodeIDRef Ref)
+ : Bits(Ref.getData(), Ref.getData() + Ref.getSize()) {}
+
+ /// Add* - Add various data types to Bit data.
+ ///
+ void AddPointer(const void *Ptr);
+ void AddInteger(signed I);
+ void AddInteger(unsigned I);
+ void AddInteger(long I);
+ void AddInteger(unsigned long I);
+ void AddInteger(long long I);
+ void AddInteger(unsigned long long I);
+ void AddBoolean(bool B) { AddInteger(B ? 1U : 0U); }
+ void AddString(StringRef String);
+ void AddNodeID(const FoldingSetNodeID &ID);
+
+ template <typename T>
+ inline void Add(const T &x) { FoldingSetTrait<T>::Profile(x, *this); }
+
+ /// clear - Clear the accumulated profile, allowing this FoldingSetNodeID
+ /// object to be used to compute a new profile.
+ inline void clear() { Bits.clear(); }
+
+ /// ComputeHash - Compute a strong hash value for this FoldingSetNodeID, used
+ /// to lookup the node in the FoldingSetImpl.
+ unsigned ComputeHash() const;
+
+ /// operator== - Used to compare two nodes to each other.
+ ///
+ bool operator==(const FoldingSetNodeID &RHS) const;
+ bool operator==(const FoldingSetNodeIDRef RHS) const;
+
+ bool operator!=(const FoldingSetNodeID &RHS) const { return !(*this == RHS); }
+ bool operator!=(const FoldingSetNodeIDRef RHS) const { return !(*this ==RHS);}
+
+ /// Used to compare the "ordering" of two nodes as defined by the
+ /// profiled bits and their ordering defined by memcmp().
+ bool operator<(const FoldingSetNodeID &RHS) const;
+ bool operator<(const FoldingSetNodeIDRef RHS) const;
+
+ /// Intern - Copy this node's data to a memory region allocated from the
+ /// given allocator and return a FoldingSetNodeIDRef describing the
+ /// interned data.
+ FoldingSetNodeIDRef Intern(BumpPtrAllocator &Allocator) const;
+};
+
+// Convenience type to hide the implementation of the folding set.
+typedef FoldingSetImpl::Node FoldingSetNode;
+template<class T> class FoldingSetIterator;
+template<class T> class FoldingSetBucketIterator;
+
+// Definitions of FoldingSetTrait and ContextualFoldingSetTrait functions, which
+// require the definition of FoldingSetNodeID.
+template<typename T>
+inline bool
+DefaultFoldingSetTrait<T>::Equals(T &X, const FoldingSetNodeID &ID,
+ unsigned /*IDHash*/,
+ FoldingSetNodeID &TempID) {
+ FoldingSetTrait<T>::Profile(X, TempID);
+ return TempID == ID;
+}
+template<typename T>
+inline unsigned
+DefaultFoldingSetTrait<T>::ComputeHash(T &X, FoldingSetNodeID &TempID) {
+ FoldingSetTrait<T>::Profile(X, TempID);
+ return TempID.ComputeHash();
+}
+template<typename T, typename Ctx>
+inline bool
+DefaultContextualFoldingSetTrait<T, Ctx>::Equals(T &X,
+ const FoldingSetNodeID &ID,
+ unsigned /*IDHash*/,
+ FoldingSetNodeID &TempID,
+ Ctx Context) {
+ ContextualFoldingSetTrait<T, Ctx>::Profile(X, TempID, Context);
+ return TempID == ID;
+}
+template<typename T, typename Ctx>
+inline unsigned
+DefaultContextualFoldingSetTrait<T, Ctx>::ComputeHash(T &X,
+ FoldingSetNodeID &TempID,
+ Ctx Context) {
+ ContextualFoldingSetTrait<T, Ctx>::Profile(X, TempID, Context);
+ return TempID.ComputeHash();
+}
+
+//===----------------------------------------------------------------------===//
+/// FoldingSet - This template class is used to instantiate a specialized
+/// implementation of the folding set to the node class T. T must be a
+/// subclass of FoldingSetNode and implement a Profile function.
+///
+/// Note that this set type is movable and move-assignable. However, its
+/// moved-from state is not a valid state for anything other than
+/// move-assigning and destroying. This is primarily to enable movable APIs
+/// that incorporate these objects.
+template <class T> class FoldingSet final : public FoldingSetImpl {
+private:
+ /// GetNodeProfile - Each instantiation of the FoldingSet needs to provide a
+ /// way to convert nodes into a unique specifier.
+ void GetNodeProfile(Node *N, FoldingSetNodeID &ID) const override {
+ T *TN = static_cast<T *>(N);
+ FoldingSetTrait<T>::Profile(*TN, ID);
+ }
+ /// NodeEquals - Instantiations may optionally provide a way to compare a
+ /// node with a specified ID.
+ bool NodeEquals(Node *N, const FoldingSetNodeID &ID, unsigned IDHash,
+ FoldingSetNodeID &TempID) const override {
+ T *TN = static_cast<T *>(N);
+ return FoldingSetTrait<T>::Equals(*TN, ID, IDHash, TempID);
+ }
+ /// ComputeNodeHash - Instantiations may optionally provide a way to compute a
+ /// hash value directly from a node.
+ unsigned ComputeNodeHash(Node *N, FoldingSetNodeID &TempID) const override {
+ T *TN = static_cast<T *>(N);
+ return FoldingSetTrait<T>::ComputeHash(*TN, TempID);
+ }
+
+public:
+ explicit FoldingSet(unsigned Log2InitSize = 6)
+ : FoldingSetImpl(Log2InitSize) {}
+
+ FoldingSet(FoldingSet &&Arg) : FoldingSetImpl(std::move(Arg)) {}
+ FoldingSet &operator=(FoldingSet &&RHS) {
+ (void)FoldingSetImpl::operator=(std::move(RHS));
+ return *this;
+ }
+
+ typedef FoldingSetIterator<T> iterator;
+ iterator begin() { return iterator(Buckets); }
+ iterator end() { return iterator(Buckets+NumBuckets); }
+
+ typedef FoldingSetIterator<const T> const_iterator;
+ const_iterator begin() const { return const_iterator(Buckets); }
+ const_iterator end() const { return const_iterator(Buckets+NumBuckets); }
+
+ typedef FoldingSetBucketIterator<T> bucket_iterator;
+
+ bucket_iterator bucket_begin(unsigned hash) {
+ return bucket_iterator(Buckets + (hash & (NumBuckets-1)));
+ }
+
+ bucket_iterator bucket_end(unsigned hash) {
+ return bucket_iterator(Buckets + (hash & (NumBuckets-1)), true);
+ }
+
+ /// GetOrInsertNode - If there is an existing simple Node exactly
+ /// equal to the specified node, return it. Otherwise, insert 'N' and
+ /// return it instead.
+ T *GetOrInsertNode(Node *N) {
+ return static_cast<T *>(FoldingSetImpl::GetOrInsertNode(N));
+ }
+
+ /// FindNodeOrInsertPos - Look up the node specified by ID. If it exists,
+ /// return it. If not, return the insertion token that will make insertion
+ /// faster.
+ T *FindNodeOrInsertPos(const FoldingSetNodeID &ID, void *&InsertPos) {
+ return static_cast<T *>(FoldingSetImpl::FindNodeOrInsertPos(ID, InsertPos));
+ }
+};
+
+//===----------------------------------------------------------------------===//
+/// ContextualFoldingSet - This template class is a further refinement
+/// of FoldingSet which provides a context argument when calling
+/// Profile on its nodes. Currently, that argument is fixed at
+/// initialization time.
+///
+/// T must be a subclass of FoldingSetNode and implement a Profile
+/// function with signature
+/// void Profile(llvm::FoldingSetNodeID &, Ctx);
+template <class T, class Ctx>
+class ContextualFoldingSet final : public FoldingSetImpl {
+ // Unfortunately, this can't derive from FoldingSet<T> because the
+ // construction vtable for FoldingSet<T> requires
+ // FoldingSet<T>::GetNodeProfile to be instantiated, which in turn
+ // requires a single-argument T::Profile().
+
+private:
+ Ctx Context;
+
+ /// GetNodeProfile - Each instantiation of the FoldingSet needs to provide a
+ /// way to convert nodes into a unique specifier.
+ void GetNodeProfile(FoldingSetImpl::Node *N,
+ FoldingSetNodeID &ID) const override {
+ T *TN = static_cast<T *>(N);
+ ContextualFoldingSetTrait<T, Ctx>::Profile(*TN, ID, Context);
+ }
+ bool NodeEquals(FoldingSetImpl::Node *N, const FoldingSetNodeID &ID,
+ unsigned IDHash, FoldingSetNodeID &TempID) const override {
+ T *TN = static_cast<T *>(N);
+ return ContextualFoldingSetTrait<T, Ctx>::Equals(*TN, ID, IDHash, TempID,
+ Context);
+ }
+ unsigned ComputeNodeHash(FoldingSetImpl::Node *N,
+ FoldingSetNodeID &TempID) const override {
+ T *TN = static_cast<T *>(N);
+ return ContextualFoldingSetTrait<T, Ctx>::ComputeHash(*TN, TempID, Context);
+ }
+
+public:
+ explicit ContextualFoldingSet(Ctx Context, unsigned Log2InitSize = 6)
+ : FoldingSetImpl(Log2InitSize), Context(Context)
+ {}
+
+ Ctx getContext() const { return Context; }
+
+ typedef FoldingSetIterator<T> iterator;
+ iterator begin() { return iterator(Buckets); }
+ iterator end() { return iterator(Buckets+NumBuckets); }
+
+ typedef FoldingSetIterator<const T> const_iterator;
+ const_iterator begin() const { return const_iterator(Buckets); }
+ const_iterator end() const { return const_iterator(Buckets+NumBuckets); }
+
+ typedef FoldingSetBucketIterator<T> bucket_iterator;
+
+ bucket_iterator bucket_begin(unsigned hash) {
+ return bucket_iterator(Buckets + (hash & (NumBuckets-1)));
+ }
+
+ bucket_iterator bucket_end(unsigned hash) {
+ return bucket_iterator(Buckets + (hash & (NumBuckets-1)), true);
+ }
+
+ /// GetOrInsertNode - If there is an existing simple Node exactly
+ /// equal to the specified node, return it. Otherwise, insert 'N'
+ /// and return it instead.
+ T *GetOrInsertNode(Node *N) {
+ return static_cast<T *>(FoldingSetImpl::GetOrInsertNode(N));
+ }
+
+ /// FindNodeOrInsertPos - Look up the node specified by ID. If it
+ /// exists, return it. If not, return the insertion token that will
+ /// make insertion faster.
+ T *FindNodeOrInsertPos(const FoldingSetNodeID &ID, void *&InsertPos) {
+ return static_cast<T *>(FoldingSetImpl::FindNodeOrInsertPos(ID, InsertPos));
+ }
+};
+
+//===----------------------------------------------------------------------===//
+/// FoldingSetVector - This template class combines a FoldingSet and a vector
+/// to provide the interface of FoldingSet but with deterministic iteration
+/// order based on the insertion order. T must be a subclass of FoldingSetNode
+/// and implement a Profile function.
+template <class T, class VectorT = SmallVector<T*, 8> >
+class FoldingSetVector {
+ FoldingSet<T> Set;
+ VectorT Vector;
+
+public:
+ explicit FoldingSetVector(unsigned Log2InitSize = 6)
+ : Set(Log2InitSize) {
+ }
+
+ typedef pointee_iterator<typename VectorT::iterator> iterator;
+ iterator begin() { return Vector.begin(); }
+ iterator end() { return Vector.end(); }
+
+ typedef pointee_iterator<typename VectorT::const_iterator> const_iterator;
+ const_iterator begin() const { return Vector.begin(); }
+ const_iterator end() const { return Vector.end(); }
+
+ /// clear - Remove all nodes from the folding set.
+ void clear() { Set.clear(); Vector.clear(); }
+
+ /// FindNodeOrInsertPos - Look up the node specified by ID. If it exists,
+ /// return it. If not, return the insertion token that will make insertion
+ /// faster.
+ T *FindNodeOrInsertPos(const FoldingSetNodeID &ID, void *&InsertPos) {
+ return Set.FindNodeOrInsertPos(ID, InsertPos);
+ }
+
+ /// GetOrInsertNode - If there is an existing simple Node exactly
+ /// equal to the specified node, return it. Otherwise, insert 'N' and
+ /// return it instead.
+ T *GetOrInsertNode(T *N) {
+ T *Result = Set.GetOrInsertNode(N);
+ if (Result == N) Vector.push_back(N);
+ return Result;
+ }
+
+ /// InsertNode - Insert the specified node into the folding set, knowing that
+ /// it is not already in the folding set. InsertPos must be obtained from
+ /// FindNodeOrInsertPos.
+ void InsertNode(T *N, void *InsertPos) {
+ Set.InsertNode(N, InsertPos);
+ Vector.push_back(N);
+ }
+
+ /// InsertNode - Insert the specified node into the folding set, knowing that
+ /// it is not already in the folding set.
+ void InsertNode(T *N) {
+ Set.InsertNode(N);
+ Vector.push_back(N);
+ }
+
+ /// size - Returns the number of nodes in the folding set.
+ unsigned size() const { return Set.size(); }
+
+ /// empty - Returns true if there are no nodes in the folding set.
+ bool empty() const { return Set.empty(); }
+};
+
+//===----------------------------------------------------------------------===//
+/// FoldingSetIteratorImpl - This is the common iterator support shared by all
+/// folding sets, which knows how to walk the folding set hash table.
+class FoldingSetIteratorImpl {
+protected:
+ FoldingSetNode *NodePtr;
+ FoldingSetIteratorImpl(void **Bucket);
+ void advance();
+
+public:
+ bool operator==(const FoldingSetIteratorImpl &RHS) const {
+ return NodePtr == RHS.NodePtr;
+ }
+ bool operator!=(const FoldingSetIteratorImpl &RHS) const {
+ return NodePtr != RHS.NodePtr;
+ }
+};
+
+template <class T> class FoldingSetIterator : public FoldingSetIteratorImpl {
+public:
+ explicit FoldingSetIterator(void **Bucket) : FoldingSetIteratorImpl(Bucket) {}
+
+ T &operator*() const {
+ return *static_cast<T*>(NodePtr);
+ }
+
+ T *operator->() const {
+ return static_cast<T*>(NodePtr);
+ }
+
+ inline FoldingSetIterator &operator++() { // Preincrement
+ advance();
+ return *this;
+ }
+ FoldingSetIterator operator++(int) { // Postincrement
+ FoldingSetIterator tmp = *this; ++*this; return tmp;
+ }
+};
+
+//===----------------------------------------------------------------------===//
+/// FoldingSetBucketIteratorImpl - This is the common bucket iterator support
+/// shared by all folding sets, which knows how to walk a particular bucket
+/// of a folding set hash table.
+
+class FoldingSetBucketIteratorImpl {
+protected:
+ void *Ptr;
+
+ explicit FoldingSetBucketIteratorImpl(void **Bucket);
+
+ FoldingSetBucketIteratorImpl(void **Bucket, bool)
+ : Ptr(Bucket) {}
+
+ void advance() {
+ void *Probe = static_cast<FoldingSetNode*>(Ptr)->getNextInBucket();
+ uintptr_t x = reinterpret_cast<uintptr_t>(Probe) & ~0x1;
+ Ptr = reinterpret_cast<void*>(x);
+ }
+
+public:
+ bool operator==(const FoldingSetBucketIteratorImpl &RHS) const {
+ return Ptr == RHS.Ptr;
+ }
+ bool operator!=(const FoldingSetBucketIteratorImpl &RHS) const {
+ return Ptr != RHS.Ptr;
+ }
+};
+
+template <class T>
+class FoldingSetBucketIterator : public FoldingSetBucketIteratorImpl {
+public:
+ explicit FoldingSetBucketIterator(void **Bucket) :
+ FoldingSetBucketIteratorImpl(Bucket) {}
+
+ FoldingSetBucketIterator(void **Bucket, bool) :
+ FoldingSetBucketIteratorImpl(Bucket, true) {}
+
+ T &operator*() const { return *static_cast<T*>(Ptr); }
+ T *operator->() const { return static_cast<T*>(Ptr); }
+
+ inline FoldingSetBucketIterator &operator++() { // Preincrement
+ advance();
+ return *this;
+ }
+ FoldingSetBucketIterator operator++(int) { // Postincrement
+ FoldingSetBucketIterator tmp = *this; ++*this; return tmp;
+ }
+};
+
+//===----------------------------------------------------------------------===//
+/// FoldingSetNodeWrapper - This template class is used to "wrap" arbitrary
+/// types in an enclosing object so that they can be inserted into FoldingSets.
+template <typename T>
+class FoldingSetNodeWrapper : public FoldingSetNode {
+ T data;
+
+public:
+ template <typename... Ts>
+ explicit FoldingSetNodeWrapper(Ts &&... Args)
+ : data(std::forward<Ts>(Args)...) {}
+
+ void Profile(FoldingSetNodeID &ID) { FoldingSetTrait<T>::Profile(data, ID); }
+
+ T &getValue() { return data; }
+ const T &getValue() const { return data; }
+
+ operator T&() { return data; }
+ operator const T&() const { return data; }
+};
+
+//===----------------------------------------------------------------------===//
+/// FastFoldingSetNode - This is a subclass of FoldingSetNode which stores
+/// a FoldingSetNodeID value rather than requiring the node to recompute it
+/// each time it is needed. This trades space for speed (which can be
+/// significant if the ID is long), and it also permits nodes to drop
+/// information that would otherwise only be required for recomputing an ID.
+class FastFoldingSetNode : public FoldingSetNode {
+ FoldingSetNodeID FastID;
+
+protected:
+ explicit FastFoldingSetNode(const FoldingSetNodeID &ID) : FastID(ID) {}
+
+public:
+ void Profile(FoldingSetNodeID &ID) const { ID.AddNodeID(FastID); }
+};
+
+//===----------------------------------------------------------------------===//
+// Partial specializations of FoldingSetTrait.
+
+template<typename T> struct FoldingSetTrait<T*> {
+ static inline void Profile(T *X, FoldingSetNodeID &ID) {
+ ID.AddPointer(X);
+ }
+};
+template <typename T1, typename T2>
+struct FoldingSetTrait<std::pair<T1, T2>> {
+ static inline void Profile(const std::pair<T1, T2> &P,
+ llvm::FoldingSetNodeID &ID) {
+ ID.Add(P.first);
+ ID.Add(P.second);
+ }
+};
+} // End of namespace llvm.
+
+#endif
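The long comment at the top of this header describes the uniquing workflow in prose; the following sketch (MyNode and getUnique are invented names, not part of the patch) shows the FindNodeOrInsertPos / InsertNode pattern end to end. As the header warns, the set does not own its nodes, so their lifetime is the caller's responsibility.

#include "llvm/ADT/FoldingSet.h"
#include <string>

class MyNode : public llvm::FoldingSetNode {
  std::string Name;
  unsigned Value;

public:
  MyNode(const char *N, unsigned V) : Name(N), Value(V) {}
  void Profile(llvm::FoldingSetNodeID &ID) const {
    ID.AddString(Name);
    ID.AddInteger(Value);
  }
};

// Return the unique node for (N, V), creating it only if it is not yet interned.
static MyNode *getUnique(llvm::FoldingSet<MyNode> &Set, const char *N,
                         unsigned V) {
  llvm::FoldingSetNodeID ID;
  ID.AddString(N);
  ID.AddInteger(V);
  void *InsertPos;
  if (MyNode *Existing = Set.FindNodeOrInsertPos(ID, InsertPos))
    return Existing;                 // already present; reuse it
  MyNode *New = new MyNode(N, V);    // caller-owned; the set will not free it
  Set.InsertNode(New, InsertPos);
  return New;
}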
diff --git a/ext/include/llvm/ADT/Hashing.h b/ext/include/llvm/ADT/Hashing.h
new file mode 100644
index 0000000..de56f91
--- /dev/null
+++ b/ext/include/llvm/ADT/Hashing.h
@@ -0,0 +1,661 @@
+//===-- llvm/ADT/Hashing.h - Utilities for hashing --------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the newly proposed standard C++ interfaces for hashing
+// arbitrary data and building hash functions for user-defined types. This
+// interface was originally proposed in N3333[1] and is currently under review
+// for inclusion in a future TR and/or standard.
+//
+// The primary interfaces provided comprise one type and three functions:
+//
+// -- 'hash_code' class is an opaque type representing the hash code for some
+// data. It is the intended product of hashing, and can be used to implement
+// hash tables, checksumming, and other common uses of hashes. It is not an
+// integer type (although it can be converted to one) because it is risky
+// to assume much about the internals of a hash_code. In particular, each
+// execution of the program has a high probability of producing a different
+// hash_code for a given input. Thus their values are not stable to save or
+// persist, and should only be used during the execution for the
+// construction of hashing data structures.
+//
+// -- 'hash_value' is a function designed to be overloaded for each
+// user-defined type which wishes to be used within a hashing context. It
+// should be overloaded within the user-defined type's namespace and found
+// via ADL. Overloads for primitive types are provided by this library.
+//
+// -- 'hash_combine' and 'hash_combine_range' are functions designed to aid
+// programmers in easily and intuitively combining a set of data into
+// a single hash_code for their object. They should only logically be used
+// within the implementation of a 'hash_value' routine or similar context.
+//
+// Note that 'hash_combine_range' contains very special logic for hashing
+// a contiguous array of integers or pointers. This logic is *extremely* fast;
+// on a modern Intel "Gainestown" Xeon (Nehalem uarch) @2.2 GHz it was
+// benchmarked at over 6.5 GiB/s for large keys, and <20 cycles/hash for keys
+// under 32-bytes.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_ADT_HASHING_H
+#define LLVM_ADT_HASHING_H
+
+#include "llvm/Support/DataTypes.h"
+#include "llvm/Support/Host.h"
+#include "llvm/Support/SwapByteOrder.h"
+#include "llvm/Support/type_traits.h"
+#include <algorithm>
+#include <cassert>
+#include <cstring>
+#include <iterator>
+#include <string>
+#include <utility>
+
+namespace llvm {
+
+/// \brief An opaque object representing a hash code.
+///
+/// This object represents the result of hashing some entity. It is intended to
+/// be used to implement hashtables or other hashing-based data structures.
+/// While it wraps and exposes a numeric value, this value should not be
+/// trusted to be stable or predictable across processes or executions.
+///
+/// In order to obtain the hash_code for an object 'x':
+/// \code
+/// using llvm::hash_value;
+/// llvm::hash_code code = hash_value(x);
+/// \endcode
+class hash_code {
+ size_t value;
+
+public:
+ /// \brief Default construct a hash_code.
+ /// Note that this leaves the value uninitialized.
+ hash_code() = default;
+
+ /// \brief Form a hash code directly from a numerical value.
+ hash_code(size_t value) : value(value) {}
+
+ /// \brief Convert the hash code to its numerical value for use.
+ /*explicit*/ operator size_t() const { return value; }
+
+ friend bool operator==(const hash_code &lhs, const hash_code &rhs) {
+ return lhs.value == rhs.value;
+ }
+ friend bool operator!=(const hash_code &lhs, const hash_code &rhs) {
+ return lhs.value != rhs.value;
+ }
+
+ /// \brief Allow a hash_code to be directly run through hash_value.
+ friend size_t hash_value(const hash_code &code) { return code.value; }
+};
+
+/// \brief Compute a hash_code for any integer value.
+///
+/// Note that this function is intended to compute the same hash_code for
+/// a particular value without regard to the pre-promotion type. This is in
+/// contrast to hash_combine which may produce different hash_codes for
+/// differing argument types even if they would implicitly promote to a common
+/// type without changing the value.
+template <typename T>
+typename std::enable_if<is_integral_or_enum<T>::value, hash_code>::type
+hash_value(T value);
+
+/// \brief Compute a hash_code for a pointer's address.
+///
+/// N.B.: This hashes the *address*. Not the value and not the type.
+template <typename T> hash_code hash_value(const T *ptr);
+
+/// \brief Compute a hash_code for a pair of objects.
+template <typename T, typename U>
+hash_code hash_value(const std::pair<T, U> &arg);
+
+/// \brief Compute a hash_code for a standard string.
+template <typename T>
+hash_code hash_value(const std::basic_string<T> &arg);
+
+
+/// \brief Override the execution seed with a fixed value.
+///
+/// This hashing library uses a per-execution seed designed to change on each
+/// run with high probability in order to ensure that the hash codes are not
+/// attackable and to ensure that output which is intended to be stable does
+/// not rely on the particulars of the hash codes produced.
+///
+/// That said, there are use cases where it is important to be able to
+/// reproduce *exactly* a specific behavior. To that end, we provide a function
+/// which will forcibly set the seed to a fixed value. This must be done at the
+/// start of the program, before any hashes are computed. Also, it cannot be
+/// undone. This makes it thread-hostile and very hard to use outside of
+/// immediately on start of a simple program designed for reproducible
+/// behavior.
+void set_fixed_execution_hash_seed(size_t fixed_value);
+
+
+// All of the implementation details of actually computing the various hash
+// code values are held within this namespace. These routines are included in
+// the header file mainly to allow inlining and constant propagation.
+namespace hashing {
+namespace detail {
+
+inline uint64_t fetch64(const char *p) {
+ uint64_t result;
+ memcpy(&result, p, sizeof(result));
+ if (sys::IsBigEndianHost)
+ sys::swapByteOrder(result);
+ return result;
+}
+
+inline uint32_t fetch32(const char *p) {
+ uint32_t result;
+ memcpy(&result, p, sizeof(result));
+ if (sys::IsBigEndianHost)
+ sys::swapByteOrder(result);
+ return result;
+}
+
+/// Some primes between 2^63 and 2^64 for various uses.
+static const uint64_t k0 = 0xc3a5c85c97cb3127ULL;
+static const uint64_t k1 = 0xb492b66fbe98f273ULL;
+static const uint64_t k2 = 0x9ae16a3b2f90404fULL;
+static const uint64_t k3 = 0xc949d7c7509e6557ULL;
+
+/// \brief Bitwise right rotate.
+/// Normally this will compile to a single instruction, especially if the
+/// shift is a manifest constant.
+inline uint64_t rotate(uint64_t val, size_t shift) {
+ // Avoid shifting by 64: doing so yields an undefined result.
+ return shift == 0 ? val : ((val >> shift) | (val << (64 - shift)));
+}
+
+inline uint64_t shift_mix(uint64_t val) {
+ return val ^ (val >> 47);
+}
+
+inline uint64_t hash_16_bytes(uint64_t low, uint64_t high) {
+ // Murmur-inspired hashing.
+ const uint64_t kMul = 0x9ddfea08eb382d69ULL;
+ uint64_t a = (low ^ high) * kMul;
+ a ^= (a >> 47);
+ uint64_t b = (high ^ a) * kMul;
+ b ^= (b >> 47);
+ b *= kMul;
+ return b;
+}
+
+inline uint64_t hash_1to3_bytes(const char *s, size_t len, uint64_t seed) {
+ uint8_t a = s[0];
+ uint8_t b = s[len >> 1];
+ uint8_t c = s[len - 1];
+ uint32_t y = static_cast<uint32_t>(a) + (static_cast<uint32_t>(b) << 8);
+ uint32_t z = len + (static_cast<uint32_t>(c) << 2);
+ return shift_mix(y * k2 ^ z * k3 ^ seed) * k2;
+}
+
+inline uint64_t hash_4to8_bytes(const char *s, size_t len, uint64_t seed) {
+ uint64_t a = fetch32(s);
+ return hash_16_bytes(len + (a << 3), seed ^ fetch32(s + len - 4));
+}
+
+inline uint64_t hash_9to16_bytes(const char *s, size_t len, uint64_t seed) {
+ uint64_t a = fetch64(s);
+ uint64_t b = fetch64(s + len - 8);
+ return hash_16_bytes(seed ^ a, rotate(b + len, len)) ^ b;
+}
+
+inline uint64_t hash_17to32_bytes(const char *s, size_t len, uint64_t seed) {
+ uint64_t a = fetch64(s) * k1;
+ uint64_t b = fetch64(s + 8);
+ uint64_t c = fetch64(s + len - 8) * k2;
+ uint64_t d = fetch64(s + len - 16) * k0;
+ return hash_16_bytes(rotate(a - b, 43) + rotate(c ^ seed, 30) + d,
+ a + rotate(b ^ k3, 20) - c + len + seed);
+}
+
+inline uint64_t hash_33to64_bytes(const char *s, size_t len, uint64_t seed) {
+ uint64_t z = fetch64(s + 24);
+ uint64_t a = fetch64(s) + (len + fetch64(s + len - 16)) * k0;
+ uint64_t b = rotate(a + z, 52);
+ uint64_t c = rotate(a, 37);
+ a += fetch64(s + 8);
+ c += rotate(a, 7);
+ a += fetch64(s + 16);
+ uint64_t vf = a + z;
+ uint64_t vs = b + rotate(a, 31) + c;
+ a = fetch64(s + 16) + fetch64(s + len - 32);
+ z = fetch64(s + len - 8);
+ b = rotate(a + z, 52);
+ c = rotate(a, 37);
+ a += fetch64(s + len - 24);
+ c += rotate(a, 7);
+ a += fetch64(s + len - 16);
+ uint64_t wf = a + z;
+ uint64_t ws = b + rotate(a, 31) + c;
+ uint64_t r = shift_mix((vf + ws) * k2 + (wf + vs) * k0);
+ return shift_mix((seed ^ (r * k0)) + vs) * k2;
+}
+
+inline uint64_t hash_short(const char *s, size_t length, uint64_t seed) {
+ if (length >= 4 && length <= 8)
+ return hash_4to8_bytes(s, length, seed);
+ if (length > 8 && length <= 16)
+ return hash_9to16_bytes(s, length, seed);
+ if (length > 16 && length <= 32)
+ return hash_17to32_bytes(s, length, seed);
+ if (length > 32)
+ return hash_33to64_bytes(s, length, seed);
+ if (length != 0)
+ return hash_1to3_bytes(s, length, seed);
+
+ return k2 ^ seed;
+}
+
+/// \brief The intermediate state used during hashing.
+/// Currently, the algorithm for computing hash codes is based on CityHash and
+/// keeps 56 bytes of arbitrary state.
+struct hash_state {
+ uint64_t h0, h1, h2, h3, h4, h5, h6;
+
+ /// \brief Create a new hash_state structure and initialize it based on the
+ /// seed and the first 64-byte chunk.
+ /// This effectively performs the initial mix.
+ static hash_state create(const char *s, uint64_t seed) {
+ hash_state state = {
+ 0, seed, hash_16_bytes(seed, k1), rotate(seed ^ k1, 49),
+ seed * k1, shift_mix(seed), 0 };
+ state.h6 = hash_16_bytes(state.h4, state.h5);
+ state.mix(s);
+ return state;
+ }
+
+ /// \brief Mix 32-bytes from the input sequence into the 16-bytes of 'a'
+ /// and 'b', including whatever is already in 'a' and 'b'.
+ static void mix_32_bytes(const char *s, uint64_t &a, uint64_t &b) {
+ a += fetch64(s);
+ uint64_t c = fetch64(s + 24);
+ b = rotate(b + a + c, 21);
+ uint64_t d = a;
+ a += fetch64(s + 8) + fetch64(s + 16);
+ b += rotate(a, 44) + d;
+ a += c;
+ }
+
+ /// \brief Mix in a 64-byte buffer of data.
+ /// We mix all 64 bytes even when the chunk length is smaller, but we
+ /// record the actual length.
+ void mix(const char *s) {
+ h0 = rotate(h0 + h1 + h3 + fetch64(s + 8), 37) * k1;
+ h1 = rotate(h1 + h4 + fetch64(s + 48), 42) * k1;
+ h0 ^= h6;
+ h1 += h3 + fetch64(s + 40);
+ h2 = rotate(h2 + h5, 33) * k1;
+ h3 = h4 * k1;
+ h4 = h0 + h5;
+ mix_32_bytes(s, h3, h4);
+ h5 = h2 + h6;
+ h6 = h1 + fetch64(s + 16);
+ mix_32_bytes(s + 32, h5, h6);
+ std::swap(h2, h0);
+ }
+
+ /// \brief Compute the final 64-bit hash code value based on the current
+ /// state and the length of bytes hashed.
+ uint64_t finalize(size_t length) {
+ return hash_16_bytes(hash_16_bytes(h3, h5) + shift_mix(h1) * k1 + h2,
+ hash_16_bytes(h4, h6) + shift_mix(length) * k1 + h0);
+ }
+};
+
+
+/// \brief A global, fixed seed-override variable.
+///
+/// This variable can be set using the \see llvm::set_fixed_execution_hash_seed
+/// function. See that function for details. Do not, under any circumstances,
+/// set or read this variable.
+extern size_t fixed_seed_override;
+
+inline size_t get_execution_seed() {
+ // FIXME: This needs to be a per-execution seed. This is just a placeholder
+ // implementation. Switching to a per-execution seed is likely to flush out
+ // instability bugs and so will happen as its own commit.
+ //
+ // However, if there is a fixed seed override set the first time this is
+ // called, return that instead of the per-execution seed.
+ const uint64_t seed_prime = 0xff51afd7ed558ccdULL;
+ static size_t seed = fixed_seed_override ? fixed_seed_override
+ : (size_t)seed_prime;
+ return seed;
+}
+
+
+/// \brief Trait to indicate whether a type's bits can be hashed directly.
+///
+/// A type trait which is true if we want to combine values for hashing by
+/// reading the underlying data. It is false if values of this type must
+/// first be passed to hash_value, and the resulting hash_codes combined.
+//
+// FIXME: We want to replace is_integral_or_enum and is_pointer here with
+// a predicate which asserts that comparing the underlying storage of two
+// values of the type for equality is equivalent to comparing the two values
+// for equality. For all the platforms we care about, this holds for integers
+// and pointers, but there are platforms where it doesn't and we would like to
+// support user-defined types which happen to satisfy this property.
+template <typename T> struct is_hashable_data
+ : std::integral_constant<bool, ((is_integral_or_enum<T>::value ||
+ std::is_pointer<T>::value) &&
+ 64 % sizeof(T) == 0)> {};
+
+// Special case std::pair to detect when both types are viable and when there
+// is no alignment-derived padding in the pair. This is a bit of a lie because
+// std::pair isn't truly POD, but it's close enough in all reasonable
+// implementations for our use case of hashing the underlying data.
+template <typename T, typename U> struct is_hashable_data<std::pair<T, U> >
+ : std::integral_constant<bool, (is_hashable_data<T>::value &&
+ is_hashable_data<U>::value &&
+ (sizeof(T) + sizeof(U)) ==
+ sizeof(std::pair<T, U>))> {};
+
+/// \brief Helper to get the hashable data representation for a type.
+/// This variant is enabled when the type itself can be used.
+template <typename T>
+typename std::enable_if<is_hashable_data<T>::value, T>::type
+get_hashable_data(const T &value) {
+ return value;
+}
+/// \brief Helper to get the hashable data representation for a type.
+/// This variant is enabled when we must first call hash_value and use the
+/// result as our data.
+template <typename T>
+typename std::enable_if<!is_hashable_data<T>::value, size_t>::type
+get_hashable_data(const T &value) {
+ using ::llvm::hash_value;
+ return hash_value(value);
+}
+
+/// \brief Helper to store data from a value into a buffer and advance the
+/// pointer into that buffer.
+///
+/// This routine first checks whether there is enough space in the provided
+/// buffer, and if not immediately returns false. If there is space, it
+/// copies the underlying bytes of value into the buffer, advances the
+/// buffer_ptr past the copied bytes, and returns true.
+template <typename T>
+bool store_and_advance(char *&buffer_ptr, char *buffer_end, const T& value,
+ size_t offset = 0) {
+ size_t store_size = sizeof(value) - offset;
+ if (buffer_ptr + store_size > buffer_end)
+ return false;
+ const char *value_data = reinterpret_cast<const char *>(&value);
+ memcpy(buffer_ptr, value_data + offset, store_size);
+ buffer_ptr += store_size;
+ return true;
+}
+
+/// \brief Implement the combining of integral values into a hash_code.
+///
+/// This overload is selected when the value type of the iterator is
+/// integral. Rather than computing a hash_code for each object and then
+/// combining them, this (as an optimization) directly combines the integers.
+template <typename InputIteratorT>
+hash_code hash_combine_range_impl(InputIteratorT first, InputIteratorT last) {
+ const size_t seed = get_execution_seed();
+ char buffer[64], *buffer_ptr = buffer;
+ char *const buffer_end = std::end(buffer);
+ while (first != last && store_and_advance(buffer_ptr, buffer_end,
+ get_hashable_data(*first)))
+ ++first;
+ if (first == last)
+ return hash_short(buffer, buffer_ptr - buffer, seed);
+ assert(buffer_ptr == buffer_end);
+
+ hash_state state = state.create(buffer, seed);
+ size_t length = 64;
+ while (first != last) {
+ // Fill up the buffer. We don't clear it, which re-mixes the last round
+ // when only a partial 64-byte chunk is left.
+ buffer_ptr = buffer;
+ while (first != last && store_and_advance(buffer_ptr, buffer_end,
+ get_hashable_data(*first)))
+ ++first;
+
+ // Rotate the buffer if we did a partial fill in order to simulate doing
+ // a mix of the last 64-bytes. That is how the algorithm works when we
+ // have a contiguous byte sequence, and we want to emulate that here.
+ std::rotate(buffer, buffer_ptr, buffer_end);
+
+ // Mix this chunk into the current state.
+ state.mix(buffer);
+ length += buffer_ptr - buffer;
+ };
+
+ return state.finalize(length);
+}
+
+/// \brief Implement the combining of integral values into a hash_code.
+///
+/// This overload is selected when the value type of the iterator is integral
+/// and when the input iterator is actually a pointer. Rather than computing
+/// a hash_code for each object and then combining them, this (as an
+/// optimization) directly combines the integers. Also, because the integers
+/// are stored in contiguous memory, this routine avoids copying each value
+/// and directly reads from the underlying memory.
+template <typename ValueT>
+typename std::enable_if<is_hashable_data<ValueT>::value, hash_code>::type
+hash_combine_range_impl(ValueT *first, ValueT *last) {
+ const size_t seed = get_execution_seed();
+ const char *s_begin = reinterpret_cast<const char *>(first);
+ const char *s_end = reinterpret_cast<const char *>(last);
+ const size_t length = std::distance(s_begin, s_end);
+ if (length <= 64)
+ return hash_short(s_begin, length, seed);
+
+ const char *s_aligned_end = s_begin + (length & ~63);
+ hash_state state = state.create(s_begin, seed);
+ s_begin += 64;
+ while (s_begin != s_aligned_end) {
+ state.mix(s_begin);
+ s_begin += 64;
+ }
+ if (length & 63)
+ state.mix(s_end - 64);
+
+ return state.finalize(length);
+}
+
+} // namespace detail
+} // namespace hashing
+
+
+/// \brief Compute a hash_code for a sequence of values.
+///
+/// This hashes a sequence of values. It produces the same hash_code as
+/// 'hash_combine(a, b, c, ...)', but can run over arbitrary sized sequences
+/// and is significantly faster given pointers and types which can be hashed as
+/// a sequence of bytes.
+template <typename InputIteratorT>
+hash_code hash_combine_range(InputIteratorT first, InputIteratorT last) {
+ return ::llvm::hashing::detail::hash_combine_range_impl(first, last);
+}
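+
+// Illustrative usage sketch (hypothetical client code, not part of this
+// header): hashing a contiguous range of integers.
+//
+//   std::vector<int> Values = {1, 2, 3, 4};
+//   llvm::hash_code H = llvm::hash_combine_range(Values.begin(), Values.end());
+//
+// For integral element types this avoids building a separate hash_code per
+// element and instead hashes the underlying bytes in 64-byte chunks.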
+
+
+// Implementation details for hash_combine.
+namespace hashing {
+namespace detail {
+
+/// \brief Helper class to manage the recursive combining of hash_combine
+/// arguments.
+///
+/// This class exists to manage the state and various calls involved in the
+/// recursive combining of arguments used in hash_combine. It is particularly
+/// useful for minimizing the code in the recursive calls to ease the pain
+/// caused by a lack of variadic functions.
+struct hash_combine_recursive_helper {
+ char buffer[64];
+ hash_state state;
+ const size_t seed;
+
+public:
+ /// \brief Construct a recursive hash combining helper.
+ ///
+ /// This sets up the state for a recursive hash combine, including getting
+ /// the seed and buffer setup.
+ hash_combine_recursive_helper()
+ : seed(get_execution_seed()) {}
+
+ /// \brief Combine one chunk of data into the current in-flight hash.
+ ///
+ /// This merges one chunk of data into the hash. First it tries to buffer
+ /// the data. If the buffer is full, it hashes the buffer into its
+ /// hash_state, empties it, and then merges the new chunk in. This also
+ /// handles cases where the data straddles the end of the buffer.
+ template <typename T>
+ char *combine_data(size_t &length, char *buffer_ptr, char *buffer_end, T data) {
+ if (!store_and_advance(buffer_ptr, buffer_end, data)) {
+ // Check for skew which prevents the buffer from being packed, and do
+ // a partial store into the buffer to fill it. This is only a concern
+ // with the variadic combine because that formation can have varying
+ // argument types.
+ size_t partial_store_size = buffer_end - buffer_ptr;
+ memcpy(buffer_ptr, &data, partial_store_size);
+
+ // If the store fails, our buffer is full and ready to hash. We have to
+ // either initialize the hash state (on the first full buffer) or mix
+ // this buffer into the existing hash state. Length tracks the *hashed*
+ // length, not the buffered length.
+ if (length == 0) {
+ state = state.create(buffer, seed);
+ length = 64;
+ } else {
+ // Mix this chunk into the current state and bump length up by 64.
+ state.mix(buffer);
+ length += 64;
+ }
+ // Reset the buffer_ptr to the head of the buffer for the next chunk of
+ // data.
+ buffer_ptr = buffer;
+
+ // Try again to store into the buffer -- this cannot fail as we only
+ // store types smaller than the buffer.
+ if (!store_and_advance(buffer_ptr, buffer_end, data,
+ partial_store_size))
+ abort();
+ }
+ return buffer_ptr;
+ }
+
+ /// \brief Recursive, variadic combining method.
+ ///
+ /// This function recurses through each argument, combining that argument
+ /// into a single hash.
+ template <typename T, typename ...Ts>
+ hash_code combine(size_t length, char *buffer_ptr, char *buffer_end,
+ const T &arg, const Ts &...args) {
+ buffer_ptr = combine_data(length, buffer_ptr, buffer_end, get_hashable_data(arg));
+
+ // Recurse to the next argument.
+ return combine(length, buffer_ptr, buffer_end, args...);
+ }
+
+ /// \brief Base case for recursive, variadic combining.
+ ///
+ /// The base case when combining arguments recursively is reached when all
+ /// arguments have been handled. It flushes the remaining buffer and
+ /// constructs a hash_code.
+ hash_code combine(size_t length, char *buffer_ptr, char *buffer_end) {
+ // Check whether the entire set of values fit in the buffer. If so, we'll
+ // use the optimized short hashing routine and skip state entirely.
+ if (length == 0)
+ return hash_short(buffer, buffer_ptr - buffer, seed);
+
+ // Mix the final buffer, rotating it if we did a partial fill in order to
+ // simulate doing a mix of the last 64-bytes. That is how the algorithm
+ // works when we have a contiguous byte sequence, and we want to emulate
+ // that here.
+ std::rotate(buffer, buffer_ptr, buffer_end);
+
+ // Mix this chunk into the current state.
+ state.mix(buffer);
+ length += buffer_ptr - buffer;
+
+ return state.finalize(length);
+ }
+};
+
+} // namespace detail
+} // namespace hashing
+
+/// \brief Combine values into a single hash_code.
+///
+/// This routine accepts a varying number of arguments of any type. It will
+/// attempt to combine them into a single hash_code. For user-defined types it
+/// attempts to call a \see hash_value overload (via ADL) for the type. For
+/// integer and pointer types it directly combines their data into the
+/// resulting hash_code.
+///
+/// The result is suitable for returning from a user's hash_value
+/// *implementation* for their user-defined type. Consumers of a type should
+/// *not* call this routine, they should instead call 'hash_value'.
+template <typename ...Ts> hash_code hash_combine(const Ts &...args) {
+ // Recursively hash each argument using a helper class.
+ ::llvm::hashing::detail::hash_combine_recursive_helper helper;
+ return helper.combine(0, helper.buffer, helper.buffer + 64, args...);
+}
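+
+// Illustrative usage sketch (hypothetical client type 'Point', not part of
+// this header): the intended pattern is to implement hash_value in terms of
+// hash_combine and let ADL find it.
+//
+//   namespace client {
+//   struct Point { int X, Y; };
+//   inline llvm::hash_code hash_value(const Point &P) {
+//     return llvm::hash_combine(P.X, P.Y);
+//   }
+//   } // namespace client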
+
+// Implementation details for implementations of hash_value overloads provided
+// here.
+namespace hashing {
+namespace detail {
+
+/// \brief Helper to hash the value of a single integer.
+///
+/// Overloads for smaller integer types are not provided to ensure consistent
+/// behavior in the presence of integral promotions. Essentially,
+/// "hash_value('4')" and "hash_value('0' + 4)" should be the same.
+inline hash_code hash_integer_value(uint64_t value) {
+ // Similar to hash_4to8_bytes but using a seed instead of length.
+ const uint64_t seed = get_execution_seed();
+ const char *s = reinterpret_cast<const char *>(&value);
+ const uint64_t a = fetch32(s);
+ return hash_16_bytes(seed + (a << 3), fetch32(s + 4));
+}
+
+} // namespace detail
+} // namespace hashing
+
+// Declared and documented above, but defined here so that any of the hashing
+// infrastructure is available.
+template <typename T>
+typename std::enable_if<is_integral_or_enum<T>::value, hash_code>::type
+hash_value(T value) {
+ return ::llvm::hashing::detail::hash_integer_value(value);
+}
+
+// Declared and documented above, but defined here so that any of the hashing
+// infrastructure is available.
+template <typename T> hash_code hash_value(const T *ptr) {
+ return ::llvm::hashing::detail::hash_integer_value(
+ reinterpret_cast<uintptr_t>(ptr));
+}
+
+// Declared and documented above, but defined here so that any of the hashing
+// infrastructure is available.
+template <typename T, typename U>
+hash_code hash_value(const std::pair<T, U> &arg) {
+ return hash_combine(arg.first, arg.second);
+}
+
+// Declared and documented above, but defined here so that any of the hashing
+// infrastructure is available.
+template <typename T>
+hash_code hash_value(const std::basic_string<T> &arg) {
+ return hash_combine_range(arg.begin(), arg.end());
+}
+
+} // namespace llvm
+
+#endif
diff --git a/ext/include/llvm/ADT/IntrusiveRefCntPtr.h b/ext/include/llvm/ADT/IntrusiveRefCntPtr.h
new file mode 100644
index 0000000..8057ec1
--- /dev/null
+++ b/ext/include/llvm/ADT/IntrusiveRefCntPtr.h
@@ -0,0 +1,288 @@
+//== llvm/ADT/IntrusiveRefCntPtr.h - Smart Refcounting Pointer ---*- C++ -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines IntrusiveRefCntPtr, a template class that
+// implements a "smart" pointer for objects that maintain their own
+// internal reference count, and RefCountedBase/RefCountedBaseVPTR, two
+// generic base classes for objects that wish to have their lifetimes
+// managed using reference counting.
+//
+// IntrusiveRefCntPtr is similar to Boost's intrusive_ptr with added
+// LLVM-style casting.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_ADT_INTRUSIVEREFCNTPTR_H
+#define LLVM_ADT_INTRUSIVEREFCNTPTR_H
+
+#include <atomic>
+#include <cassert>
+#include <cstddef>
+
+namespace llvm {
+
+ template <class T>
+ class IntrusiveRefCntPtr;
+
+//===----------------------------------------------------------------------===//
+/// RefCountedBase - A generic base class for objects that wish to
+/// have their lifetimes managed using reference counts. Classes
+/// subclass RefCountedBase to obtain such functionality, and are
+/// typically handled with IntrusiveRefCntPtr "smart pointers" (see below)
+/// which automatically handle the management of reference counts.
+/// Objects that subclass RefCountedBase should not be allocated on
+/// the stack, as invoking "delete" (which is called when the
+/// reference count hits 0) on such objects is an error.
+//===----------------------------------------------------------------------===//
+ template <class Derived>
+ class RefCountedBase {
+ mutable unsigned ref_cnt;
+
+ public:
+ RefCountedBase() : ref_cnt(0) {}
+ RefCountedBase(const RefCountedBase &) : ref_cnt(0) {}
+
+ void Retain() const { ++ref_cnt; }
+ void Release() const {
+ assert (ref_cnt > 0 && "Reference count is already zero.");
+ if (--ref_cnt == 0) delete static_cast<const Derived*>(this);
+ }
+ };
+
+//===----------------------------------------------------------------------===//
+/// RefCountedBaseVPTR - A class that has the same function as
+/// RefCountedBase, but with a virtual destructor. Should be used
+/// instead of RefCountedBase for classes that already have virtual
+/// methods to enforce dynamic allocation via 'new'. Classes that
+/// inherit from RefCountedBaseVPTR can't be allocated on the stack;
+/// attempting to do so will produce a compile error.
+//===----------------------------------------------------------------------===//
+ class RefCountedBaseVPTR {
+ mutable unsigned ref_cnt;
+ virtual void anchor();
+
+ protected:
+ RefCountedBaseVPTR() : ref_cnt(0) {}
+ RefCountedBaseVPTR(const RefCountedBaseVPTR &) : ref_cnt(0) {}
+
+ virtual ~RefCountedBaseVPTR() {}
+
+ void Retain() const { ++ref_cnt; }
+ void Release() const {
+ assert (ref_cnt > 0 && "Reference count is already zero.");
+ if (--ref_cnt == 0) delete this;
+ }
+
+ template <typename T>
+ friend struct IntrusiveRefCntPtrInfo;
+ };
+
+
+ template <typename T> struct IntrusiveRefCntPtrInfo {
+ static void retain(T *obj) { obj->Retain(); }
+ static void release(T *obj) { obj->Release(); }
+ };
+
+/// \brief A thread-safe version of \c llvm::RefCountedBase.
+///
+/// A generic base class for objects that wish to have their lifetimes managed
+/// using reference counts. Classes subclass \c ThreadSafeRefCountedBase to
+/// obtain such functionality, and are typically handled with
+/// \c IntrusiveRefCntPtr "smart pointers" which automatically handle the
+/// management of reference counts.
+template <class Derived>
+class ThreadSafeRefCountedBase {
+ mutable std::atomic<int> RefCount;
+
+protected:
+ ThreadSafeRefCountedBase() : RefCount(0) {}
+
+public:
+ void Retain() const { ++RefCount; }
+
+ void Release() const {
+ int NewRefCount = --RefCount;
+ assert(NewRefCount >= 0 && "Reference count was already zero.");
+ if (NewRefCount == 0)
+ delete static_cast<const Derived*>(this);
+ }
+};
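+
+// Illustrative usage sketch (hypothetical client class 'SharedState', not
+// part of this header):
+//
+//   class SharedState : public ThreadSafeRefCountedBase<SharedState> {};
+//   IntrusiveRefCntPtr<SharedState> S(new SharedState()); // atomic ref count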
+
+//===----------------------------------------------------------------------===//
+/// IntrusiveRefCntPtr - A template class that implements a "smart pointer"
+/// that assumes the wrapped object has a reference count associated
+/// with it that can be managed via calls to
+/// Retain()/Release(). The smart pointers
+/// manage reference counts via the RAII idiom: upon creation of
+/// smart pointer the reference count of the wrapped object is
+/// incremented and upon destruction of the smart pointer the
+/// reference count is decremented. This class also safely handles
+/// wrapping NULL pointers.
+///
+/// Reference counting is implemented via calls to
+/// Obj->Retain()/Obj->Release(). Release() is required to destroy
+/// the object when the reference count reaches zero. Inheriting from
+/// RefCountedBase/RefCountedBaseVPTR takes care of this
+/// automatically.
+//===----------------------------------------------------------------------===//
+ template <typename T>
+ class IntrusiveRefCntPtr {
+ T* Obj;
+
+ public:
+ typedef T element_type;
+
+ explicit IntrusiveRefCntPtr() : Obj(nullptr) {}
+
+ IntrusiveRefCntPtr(T* obj) : Obj(obj) {
+ retain();
+ }
+
+ IntrusiveRefCntPtr(const IntrusiveRefCntPtr& S) : Obj(S.Obj) {
+ retain();
+ }
+
+ IntrusiveRefCntPtr(IntrusiveRefCntPtr&& S) : Obj(S.Obj) {
+ S.Obj = nullptr;
+ }
+
+ template <class X>
+ IntrusiveRefCntPtr(IntrusiveRefCntPtr<X>&& S) : Obj(S.get()) {
+ S.Obj = nullptr;
+ }
+
+ template <class X>
+ IntrusiveRefCntPtr(const IntrusiveRefCntPtr<X>& S)
+ : Obj(S.get()) {
+ retain();
+ }
+
+ IntrusiveRefCntPtr& operator=(IntrusiveRefCntPtr S) {
+ swap(S);
+ return *this;
+ }
+
+ ~IntrusiveRefCntPtr() { release(); }
+
+ T& operator*() const { return *Obj; }
+
+ T* operator->() const { return Obj; }
+
+ T* get() const { return Obj; }
+
+ explicit operator bool() const { return Obj; }
+
+ void swap(IntrusiveRefCntPtr& other) {
+ T* tmp = other.Obj;
+ other.Obj = Obj;
+ Obj = tmp;
+ }
+
+ void reset() {
+ release();
+ Obj = nullptr;
+ }
+
+ void resetWithoutRelease() {
+ Obj = nullptr;
+ }
+
+ private:
+ void retain() { if (Obj) IntrusiveRefCntPtrInfo<T>::retain(Obj); }
+ void release() { if (Obj) IntrusiveRefCntPtrInfo<T>::release(Obj); }
+
+ template <typename X>
+ friend class IntrusiveRefCntPtr;
+ };
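+
+ // Illustrative usage sketch (hypothetical client class 'Node', not part of
+ // this header):
+ //
+ //   class Node : public RefCountedBase<Node> {};
+ //
+ //   void example() {
+ //     IntrusiveRefCntPtr<Node> P(new Node()); // reference count becomes 1
+ //     IntrusiveRefCntPtr<Node> Q = P;         // reference count becomes 2
+ //   }                                         // both released, Node deleted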
+
+ template<class T, class U>
+ inline bool operator==(const IntrusiveRefCntPtr<T>& A,
+ const IntrusiveRefCntPtr<U>& B)
+ {
+ return A.get() == B.get();
+ }
+
+ template<class T, class U>
+ inline bool operator!=(const IntrusiveRefCntPtr<T>& A,
+ const IntrusiveRefCntPtr<U>& B)
+ {
+ return A.get() != B.get();
+ }
+
+ template<class T, class U>
+ inline bool operator==(const IntrusiveRefCntPtr<T>& A,
+ U* B)
+ {
+ return A.get() == B;
+ }
+
+ template<class T, class U>
+ inline bool operator!=(const IntrusiveRefCntPtr<T>& A,
+ U* B)
+ {
+ return A.get() != B;
+ }
+
+ template<class T, class U>
+ inline bool operator==(T* A,
+ const IntrusiveRefCntPtr<U>& B)
+ {
+ return A == B.get();
+ }
+
+ template<class T, class U>
+ inline bool operator!=(T* A,
+ const IntrusiveRefCntPtr<U>& B)
+ {
+ return A != B.get();
+ }
+
+ template <class T>
+ bool operator==(std::nullptr_t A, const IntrusiveRefCntPtr<T> &B) {
+ return !B;
+ }
+
+ template <class T>
+ bool operator==(const IntrusiveRefCntPtr<T> &A, std::nullptr_t B) {
+ return B == A;
+ }
+
+ template <class T>
+ bool operator!=(std::nullptr_t A, const IntrusiveRefCntPtr<T> &B) {
+ return !(A == B);
+ }
+
+ template <class T>
+ bool operator!=(const IntrusiveRefCntPtr<T> &A, std::nullptr_t B) {
+ return !(A == B);
+ }
+
+//===----------------------------------------------------------------------===//
+// LLVM-style downcasting support for IntrusiveRefCntPtr objects
+//===----------------------------------------------------------------------===//
+
+ template <typename From> struct simplify_type;
+
+ template<class T> struct simplify_type<IntrusiveRefCntPtr<T> > {
+ typedef T* SimpleType;
+ static SimpleType getSimplifiedValue(IntrusiveRefCntPtr<T>& Val) {
+ return Val.get();
+ }
+ };
+
+ template<class T> struct simplify_type<const IntrusiveRefCntPtr<T> > {
+ typedef /*const*/ T* SimpleType;
+ static SimpleType getSimplifiedValue(const IntrusiveRefCntPtr<T>& Val) {
+ return Val.get();
+ }
+ };
+
+} // end namespace llvm
+
+#endif // LLVM_ADT_INTRUSIVEREFCNTPTR_H
diff --git a/ext/include/llvm/ADT/None.h b/ext/include/llvm/ADT/None.h
new file mode 100644
index 0000000..d69ec17
--- /dev/null
+++ b/ext/include/llvm/ADT/None.h
@@ -0,0 +1,26 @@
+//===-- None.h - Simple null value for implicit construction ------*- C++ -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides None, an enumerator for use in implicit constructors
+// of various (usually templated) types to make such construction more
+// terse.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_ADT_NONE_H
+#define LLVM_ADT_NONE_H
+
+namespace llvm {
+/// \brief A simple null object to allow implicit construction of Optional<T>
+/// and similar types without having to spell out the specialization's name.
+enum class NoneType { None };
+const NoneType None = NoneType::None;
+}
+
+#endif
diff --git a/ext/include/llvm/ADT/Optional.h b/ext/include/llvm/ADT/Optional.h
new file mode 100644
index 0000000..d9acaf6
--- /dev/null
+++ b/ext/include/llvm/ADT/Optional.h
@@ -0,0 +1,228 @@
+//===-- Optional.h - Simple variant for passing optional values ---*- C++ -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides Optional, a template class modeled in the spirit of
+// OCaml's 'opt' variant. The idea is to strongly type whether or not
+// a value can be optional.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_ADT_OPTIONAL_H
+#define LLVM_ADT_OPTIONAL_H
+
+#include "llvm/ADT/None.h"
+#include "llvm/Support/AlignOf.h"
+#include "llvm/Support/Compiler.h"
+#include <cassert>
+#include <new>
+#include <utility>
+
+namespace llvm {
+
+template<typename T>
+class Optional {
+ AlignedCharArrayUnion<T> storage;
+ bool hasVal;
+public:
+ typedef T value_type;
+
+ Optional(NoneType) : hasVal(false) {}
+ explicit Optional() : hasVal(false) {}
+ Optional(const T &y) : hasVal(true) {
+ new (storage.buffer) T(y);
+ }
+ Optional(const Optional &O) : hasVal(O.hasVal) {
+ if (hasVal)
+ new (storage.buffer) T(*O);
+ }
+
+ Optional(T &&y) : hasVal(true) {
+ new (storage.buffer) T(std::forward<T>(y));
+ }
+ Optional(Optional<T> &&O) : hasVal(O) {
+ if (O) {
+ new (storage.buffer) T(std::move(*O));
+ O.reset();
+ }
+ }
+ Optional &operator=(T &&y) {
+ if (hasVal)
+ **this = std::move(y);
+ else {
+ new (storage.buffer) T(std::move(y));
+ hasVal = true;
+ }
+ return *this;
+ }
+ Optional &operator=(Optional &&O) {
+ if (!O)
+ reset();
+ else {
+ *this = std::move(*O);
+ O.reset();
+ }
+ return *this;
+ }
+
+ /// Create a new object by constructing it in place with the given arguments.
+ template<typename ...ArgTypes>
+ void emplace(ArgTypes &&...Args) {
+ reset();
+ hasVal = true;
+ new (storage.buffer) T(std::forward<ArgTypes>(Args)...);
+ }
+
+ static inline Optional create(const T* y) {
+ return y ? Optional(*y) : Optional();
+ }
+
+ // FIXME: these assignments (& the equivalent const T&/const Optional& ctors)
+ // could be made more efficient by passing by value, possibly unifying them
+ // with the rvalue versions above - but this could place a different set of
+ // requirements (notably: the existence of a default ctor) when implemented
+ // in that way. Careful SFINAE to avoid such pitfalls would be required.
+ Optional &operator=(const T &y) {
+ if (hasVal)
+ **this = y;
+ else {
+ new (storage.buffer) T(y);
+ hasVal = true;
+ }
+ return *this;
+ }
+
+ Optional &operator=(const Optional &O) {
+ if (!O)
+ reset();
+ else
+ *this = *O;
+ return *this;
+ }
+
+ void reset() {
+ if (hasVal) {
+ (**this).~T();
+ hasVal = false;
+ }
+ }
+
+ ~Optional() {
+ reset();
+ }
+
+ const T* getPointer() const { assert(hasVal); return reinterpret_cast<const T*>(storage.buffer); }
+ T* getPointer() { assert(hasVal); return reinterpret_cast<T*>(storage.buffer); }
+ const T& getValue() const LLVM_LVALUE_FUNCTION { assert(hasVal); return *getPointer(); }
+ T& getValue() LLVM_LVALUE_FUNCTION { assert(hasVal); return *getPointer(); }
+
+ explicit operator bool() const { return hasVal; }
+ bool hasValue() const { return hasVal; }
+ const T* operator->() const { return getPointer(); }
+ T* operator->() { return getPointer(); }
+ const T& operator*() const LLVM_LVALUE_FUNCTION { assert(hasVal); return *getPointer(); }
+ T& operator*() LLVM_LVALUE_FUNCTION { assert(hasVal); return *getPointer(); }
+
+ template <typename U>
+ LLVM_CONSTEXPR T getValueOr(U &&value) const LLVM_LVALUE_FUNCTION {
+ return hasValue() ? getValue() : std::forward<U>(value);
+ }
+
+#if LLVM_HAS_RVALUE_REFERENCE_THIS
+ T&& getValue() && { assert(hasVal); return std::move(*getPointer()); }
+ T&& operator*() && { assert(hasVal); return std::move(*getPointer()); }
+
+ template <typename U>
+ T getValueOr(U &&value) && {
+ return hasValue() ? std::move(getValue()) : std::forward<U>(value);
+ }
+#endif
+};
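+
+// Illustrative usage sketch (hypothetical client code, not part of this
+// header):
+//
+//   Optional<int> MaybeWidth;                 // starts empty
+//   if (!MaybeWidth)
+//     MaybeWidth = 80;                        // now holds a value
+//   int Width = MaybeWidth.getValueOr(100);   // 80 here; 100 if still empty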
+
+template <typename T> struct isPodLike;
+template <typename T> struct isPodLike<Optional<T> > {
+ // An Optional<T> is pod-like if T is.
+ static const bool value = isPodLike<T>::value;
+};
+
+/// \brief Poison comparison between two \c Optional objects. Clients need to
+/// explicitly compare the underlying values and account for empty \c Optional
+/// objects.
+///
+/// This routine will never be defined. It returns \c void to help diagnose
+/// errors at compile time.
+template<typename T, typename U>
+void operator==(const Optional<T> &X, const Optional<U> &Y);
+
+template<typename T>
+bool operator==(const Optional<T> &X, NoneType) {
+ return !X.hasValue();
+}
+
+template<typename T>
+bool operator==(NoneType, const Optional<T> &X) {
+ return X == None;
+}
+
+template<typename T>
+bool operator!=(const Optional<T> &X, NoneType) {
+ return !(X == None);
+}
+
+template<typename T>
+bool operator!=(NoneType, const Optional<T> &X) {
+ return X != None;
+}
+/// \brief Poison comparison between two \c Optional objects. Clients need to
+/// explicitly compare the underlying values and account for empty \c Optional
+/// objects.
+///
+/// This routine will never be defined. It returns \c void to help diagnose
+/// errors at compile time.
+template<typename T, typename U>
+void operator!=(const Optional<T> &X, const Optional<U> &Y);
+
+/// \brief Poison comparison between two \c Optional objects. Clients need to
+/// explicitly compare the underlying values and account for empty \c Optional
+/// objects.
+///
+/// This routine will never be defined. It returns \c void to help diagnose
+/// errors at compile time.
+template<typename T, typename U>
+void operator<(const Optional<T> &X, const Optional<U> &Y);
+
+/// \brief Poison comparison between two \c Optional objects. Clients need to
+/// explicitly compare the underlying values and account for empty \c Optional
+/// objects.
+///
+/// This routine will never be defined. It returns \c void to help diagnose
+/// errors at compile time.
+template<typename T, typename U>
+void operator<=(const Optional<T> &X, const Optional<U> &Y);
+
+/// \brief Poison comparison between two \c Optional objects. Clients need to
+/// explicitly compare the underlying values and account for empty \c Optional
+/// objects.
+///
+/// This routine will never be defined. It returns \c void to help diagnose
+/// errors at compile time.
+template<typename T, typename U>
+void operator>=(const Optional<T> &X, const Optional<U> &Y);
+
+/// \brief Poison comparison between two \c Optional objects. Clients need to
+/// explicitly compare the underlying values and account for empty \c Optional
+/// objects.
+///
+/// This routine will never be defined. It returns \c void to help diagnose
+/// errors at compile time.
+template<typename T, typename U>
+void operator>(const Optional<T> &X, const Optional<U> &Y);
+
+} // end llvm namespace
+
+#endif
diff --git a/ext/include/llvm/ADT/PointerEmbeddedInt.h b/ext/include/llvm/ADT/PointerEmbeddedInt.h
new file mode 100644
index 0000000..8781d18
--- /dev/null
+++ b/ext/include/llvm/ADT/PointerEmbeddedInt.h
@@ -0,0 +1,103 @@
+//===- llvm/ADT/PointerEmbeddedInt.h ----------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_ADT_POINTEREMBEDDEDINT_H
+#define LLVM_ADT_POINTEREMBEDDEDINT_H
+
+#include "llvm/ADT/DenseMapInfo.h"
+#include "llvm/Support/PointerLikeTypeTraits.h"
+#include <climits>
+
+namespace llvm {
+
+/// Utility to embed an integer into a pointer-like type. This is specifically
+/// intended to allow embedding integers where fewer bits are required than
+/// exist in a pointer, and the integer can participate in abstractions
+/// alongside other pointer-like types. For example, it can be placed into a \c
+/// PointerSumType or \c PointerUnion.
+///
+/// Note that much like pointers, an integer value of zero has special utility
+/// due to boolean conversions. For example, a non-null value can be tested for
+/// in the above abstractions without testing the particular active member.
+/// Also, the default constructed value zero initializes the integer.
+template <typename IntT, int Bits = sizeof(IntT) * CHAR_BIT>
+class PointerEmbeddedInt {
+ uintptr_t Value;
+
+ static_assert(Bits < sizeof(uintptr_t) * CHAR_BIT,
+ "Cannot embed more bits than we have in a pointer!");
+
+ enum : uintptr_t {
+ // We shift as many zeros into the value as we can while preserving the
+ // number of bits desired for the integer.
+ Shift = sizeof(uintptr_t) * CHAR_BIT - Bits,
+
+ // We also want to be able to mask out the preserved bits for asserts.
+ Mask = static_cast<uintptr_t>(-1) << Bits
+ };
+
+ friend class PointerLikeTypeTraits<PointerEmbeddedInt>;
+
+ explicit PointerEmbeddedInt(uintptr_t Value) : Value(Value) {}
+
+public:
+ PointerEmbeddedInt() : Value(0) {}
+
+ PointerEmbeddedInt(IntT I) : Value(static_cast<uintptr_t>(I) << Shift) {
+ assert((I & Mask) == 0 && "Integer has bits outside those preserved!");
+ }
+
+ PointerEmbeddedInt &operator=(IntT I) {
+ assert((I & Mask) == 0 && "Integer has bits outside those preserved!");
+ Value = static_cast<uintptr_t>(I) << Shift;
+ return *this;
+ }
+
+ // Note that this implicit conversion additionally allows all of the basic
+ // comparison operators to work transparently, etc.
+ operator IntT() const { return static_cast<IntT>(Value >> Shift); }
+};
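+
+// Illustrative usage sketch (hypothetical client code, not part of this
+// header): embedding a small counter where a pointer-like type is expected.
+//
+//   PointerEmbeddedInt<uint16_t, 16> Count(42);
+//   uint16_t Raw = Count;   // implicit conversion back to the integer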
+
+// Provide pointer like traits to support use with pointer unions and sum
+// types.
+template <typename IntT, int Bits>
+class PointerLikeTypeTraits<PointerEmbeddedInt<IntT, Bits>> {
+ typedef PointerEmbeddedInt<IntT, Bits> T;
+
+public:
+ static inline void *getAsVoidPointer(const T &P) {
+ return reinterpret_cast<void *>(P.Value);
+ }
+ static inline T getFromVoidPointer(void *P) {
+ return T(reinterpret_cast<uintptr_t>(P));
+ }
+ static inline T getFromVoidPointer(const void *P) {
+ return T(reinterpret_cast<uintptr_t>(P));
+ }
+
+ enum { NumLowBitsAvailable = T::Shift };
+};
+
+// Teach DenseMap how to use PointerEmbeddedInt objects as keys if the Int type
+// itself can be a key.
+template <typename IntT, int Bits>
+struct DenseMapInfo<PointerEmbeddedInt<IntT, Bits>> {
+ typedef PointerEmbeddedInt<IntT, Bits> T;
+
+ typedef DenseMapInfo<IntT> IntInfo;
+
+ static inline T getEmptyKey() { return IntInfo::getEmptyKey(); }
+ static inline T getTombstoneKey() { return IntInfo::getTombstoneKey(); }
+ static unsigned getHashValue(const T &Arg) {
+ return IntInfo::getHashValue(Arg);
+ }
+ static bool isEqual(const T &LHS, const T &RHS) { return LHS == RHS; }
+};
+}
+
+#endif
diff --git a/ext/include/llvm/ADT/PointerIntPair.h b/ext/include/llvm/ADT/PointerIntPair.h
new file mode 100644
index 0000000..83fbf12
--- /dev/null
+++ b/ext/include/llvm/ADT/PointerIntPair.h
@@ -0,0 +1,223 @@
+//===- llvm/ADT/PointerIntPair.h - Pair for pointer and int -----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the PointerIntPair class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_ADT_POINTERINTPAIR_H
+#define LLVM_ADT_POINTERINTPAIR_H
+
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/PointerLikeTypeTraits.h"
+#include <cassert>
+#include <limits>
+
+namespace llvm {
+
+template <typename T> struct DenseMapInfo;
+
+template <typename PointerT, unsigned IntBits, typename PtrTraits>
+struct PointerIntPairInfo;
+
+/// PointerIntPair - This class implements a pair of a pointer and small
+/// integer. It is designed to represent this in the space required by one
+/// pointer by bitmangling the integer into the low part of the pointer. This
+/// can only be done for small integers: typically up to 3 bits, but it depends
+/// on the number of bits available according to PointerLikeTypeTraits for the
+/// type.
+///
+/// Note that PointerIntPair always puts the IntVal part in the highest bits
+/// possible. For example, PointerIntPair<void*, 1, bool> will put the bit for
+/// the bool into bit #2, not bit #0, which allows the low two bits to be used
+/// for something else. For example, this allows:
+/// PointerIntPair<PointerIntPair<void*, 1, bool>, 1, bool>
+/// ... and the two bools will land in different bits.
+///
+template <typename PointerTy, unsigned IntBits, typename IntType = unsigned,
+ typename PtrTraits = PointerLikeTypeTraits<PointerTy>,
+ typename Info = PointerIntPairInfo<PointerTy, IntBits, PtrTraits>>
+class PointerIntPair {
+ intptr_t Value;
+
+public:
+ PointerIntPair() : Value(0) {}
+ PointerIntPair(PointerTy PtrVal, IntType IntVal) {
+ setPointerAndInt(PtrVal, IntVal);
+ }
+ explicit PointerIntPair(PointerTy PtrVal) { initWithPointer(PtrVal); }
+
+ PointerTy getPointer() const { return Info::getPointer(Value); }
+
+ IntType getInt() const {
+ return (IntType)Info::getInt(Value);
+ }
+
+ void setPointer(PointerTy PtrVal) {
+ Value = Info::updatePointer(Value, PtrVal);
+ }
+
+ void setInt(IntType IntVal) {
+ Value = Info::updateInt(Value, static_cast<intptr_t>(IntVal));
+ }
+
+ void initWithPointer(PointerTy PtrVal) {
+ Value = Info::updatePointer(0, PtrVal);
+ }
+
+ void setPointerAndInt(PointerTy PtrVal, IntType IntVal) {
+ Value = Info::updateInt(Info::updatePointer(0, PtrVal),
+ static_cast<intptr_t>(IntVal));
+ }
+
+ PointerTy const *getAddrOfPointer() const {
+ return const_cast<PointerIntPair *>(this)->getAddrOfPointer();
+ }
+
+ PointerTy *getAddrOfPointer() {
+ assert(Value == reinterpret_cast<intptr_t>(getPointer()) &&
+ "Can only return the address if IntBits is cleared and "
+ "PtrTraits doesn't change the pointer");
+ return reinterpret_cast<PointerTy *>(&Value);
+ }
+
+ void *getOpaqueValue() const { return reinterpret_cast<void *>(Value); }
+ void setFromOpaqueValue(void *Val) {
+ Value = reinterpret_cast<intptr_t>(Val);
+ }
+
+ static PointerIntPair getFromOpaqueValue(void *V) {
+ PointerIntPair P;
+ P.setFromOpaqueValue(V);
+ return P;
+ }
+
+ // Allow PointerIntPairs to be created from const void * if and only if the
+ // pointer type could be created from a const void *.
+ static PointerIntPair getFromOpaqueValue(const void *V) {
+ (void)PtrTraits::getFromVoidPointer(V);
+ return getFromOpaqueValue(const_cast<void *>(V));
+ }
+
+ bool operator==(const PointerIntPair &RHS) const {
+ return Value == RHS.Value;
+ }
+ bool operator!=(const PointerIntPair &RHS) const {
+ return Value != RHS.Value;
+ }
+ bool operator<(const PointerIntPair &RHS) const { return Value < RHS.Value; }
+ bool operator>(const PointerIntPair &RHS) const { return Value > RHS.Value; }
+ bool operator<=(const PointerIntPair &RHS) const {
+ return Value <= RHS.Value;
+ }
+ bool operator>=(const PointerIntPair &RHS) const {
+ return Value >= RHS.Value;
+ }
+};
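+
+// Illustrative usage sketch (hypothetical client code, not part of this
+// header); it assumes the default PointerLikeTypeTraits for int* leave at
+// least one low bit free, which holds for typical alignments.
+//
+//   int Data = 7;
+//   PointerIntPair<int *, 1, bool> P(&Data, true);
+//   int *Ptr  = P.getPointer();   // &Data
+//   bool Flag = P.getInt();       // true
+//   P.setInt(false);              // the pointer value is preserved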
+
+template <typename PointerT, unsigned IntBits, typename PtrTraits>
+struct PointerIntPairInfo {
+ static_assert(PtrTraits::NumLowBitsAvailable <
+ std::numeric_limits<uintptr_t>::digits,
+ "cannot use a pointer type that has all bits free");
+ static_assert(IntBits <= PtrTraits::NumLowBitsAvailable,
+ "PointerIntPair with integer size too large for pointer");
+ enum : uintptr_t {
+ /// PointerBitMask - The bits that come from the pointer.
+ PointerBitMask =
+ ~(uintptr_t)(((intptr_t)1 << PtrTraits::NumLowBitsAvailable) - 1),
+
+ /// IntShift - The number of low bits that we reserve for other uses, and
+ /// keep zero.
+ IntShift = (uintptr_t)PtrTraits::NumLowBitsAvailable - IntBits,
+
+ /// IntMask - This is the unshifted mask for valid bits of the int type.
+ IntMask = (uintptr_t)(((intptr_t)1 << IntBits) - 1),
+
+ // ShiftedIntMask - This is the bits for the integer shifted in place.
+ ShiftedIntMask = (uintptr_t)(IntMask << IntShift)
+ };
+
+ static PointerT getPointer(intptr_t Value) {
+ return PtrTraits::getFromVoidPointer(
+ reinterpret_cast<void *>(Value & PointerBitMask));
+ }
+
+ static intptr_t getInt(intptr_t Value) {
+ return (Value >> IntShift) & IntMask;
+ }
+
+ static intptr_t updatePointer(intptr_t OrigValue, PointerT Ptr) {
+ intptr_t PtrWord =
+ reinterpret_cast<intptr_t>(PtrTraits::getAsVoidPointer(Ptr));
+ assert((PtrWord & ~PointerBitMask) == 0 &&
+ "Pointer is not sufficiently aligned");
+ // Preserve all low bits, just update the pointer.
+ return PtrWord | (OrigValue & ~PointerBitMask);
+ }
+
+ static intptr_t updateInt(intptr_t OrigValue, intptr_t Int) {
+ intptr_t IntWord = static_cast<intptr_t>(Int);
+ assert((IntWord & ~IntMask) == 0 && "Integer too large for field");
+
+ // Preserve all bits other than the ones we are updating.
+ return (OrigValue & ~ShiftedIntMask) | IntWord << IntShift;
+ }
+};
+
+template <typename T> struct isPodLike;
+template <typename PointerTy, unsigned IntBits, typename IntType>
+struct isPodLike<PointerIntPair<PointerTy, IntBits, IntType>> {
+ static const bool value = true;
+};
+
+// Provide specialization of DenseMapInfo for PointerIntPair.
+template <typename PointerTy, unsigned IntBits, typename IntType>
+struct DenseMapInfo<PointerIntPair<PointerTy, IntBits, IntType>> {
+ typedef PointerIntPair<PointerTy, IntBits, IntType> Ty;
+ static Ty getEmptyKey() {
+ uintptr_t Val = static_cast<uintptr_t>(-1);
+ Val <<= PointerLikeTypeTraits<Ty>::NumLowBitsAvailable;
+ return Ty::getFromOpaqueValue(reinterpret_cast<void *>(Val));
+ }
+ static Ty getTombstoneKey() {
+ uintptr_t Val = static_cast<uintptr_t>(-2);
+ Val <<= PointerLikeTypeTraits<PointerTy>::NumLowBitsAvailable;
+ return Ty::getFromOpaqueValue(reinterpret_cast<void *>(Val));
+ }
+ static unsigned getHashValue(Ty V) {
+ uintptr_t IV = reinterpret_cast<uintptr_t>(V.getOpaqueValue());
+ return unsigned(IV) ^ unsigned(IV >> 9);
+ }
+ static bool isEqual(const Ty &LHS, const Ty &RHS) { return LHS == RHS; }
+};
+
+// Teach SmallPtrSet that PointerIntPair is "basically a pointer".
+template <typename PointerTy, unsigned IntBits, typename IntType,
+ typename PtrTraits>
+class PointerLikeTypeTraits<
+ PointerIntPair<PointerTy, IntBits, IntType, PtrTraits>> {
+public:
+ static inline void *
+ getAsVoidPointer(const PointerIntPair<PointerTy, IntBits, IntType> &P) {
+ return P.getOpaqueValue();
+ }
+ static inline PointerIntPair<PointerTy, IntBits, IntType>
+ getFromVoidPointer(void *P) {
+ return PointerIntPair<PointerTy, IntBits, IntType>::getFromOpaqueValue(P);
+ }
+ static inline PointerIntPair<PointerTy, IntBits, IntType>
+ getFromVoidPointer(const void *P) {
+ return PointerIntPair<PointerTy, IntBits, IntType>::getFromOpaqueValue(P);
+ }
+ enum { NumLowBitsAvailable = PtrTraits::NumLowBitsAvailable - IntBits };
+};
+
+} // end namespace llvm
+#endif
diff --git a/ext/include/llvm/ADT/PointerSumType.h b/ext/include/llvm/ADT/PointerSumType.h
new file mode 100644
index 0000000..6b8618f
--- /dev/null
+++ b/ext/include/llvm/ADT/PointerSumType.h
@@ -0,0 +1,205 @@
+//===- llvm/ADT/PointerSumType.h --------------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_ADT_POINTERSUMTYPE_H
+#define LLVM_ADT_POINTERSUMTYPE_H
+
+#include "llvm/ADT/DenseMapInfo.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/PointerLikeTypeTraits.h"
+
+namespace llvm {
+
+/// A compile time pair of an integer tag and the pointer-like type which it
+/// indexes within a sum type. Also allows the user to specify a particular
+/// traits class for pointer types with custom behavior such as over-aligned
+/// allocation.
+template <uintptr_t N, typename PointerArgT,
+ typename TraitsArgT = PointerLikeTypeTraits<PointerArgT>>
+struct PointerSumTypeMember {
+ enum { Tag = N };
+ typedef PointerArgT PointerT;
+ typedef TraitsArgT TraitsT;
+};
+
+namespace detail {
+
+template <typename TagT, typename... MemberTs>
+struct PointerSumTypeHelper;
+
+}
+
+/// A sum type over pointer-like types.
+///
+/// This is a normal tagged union across pointer-like types that uses the low
+/// bits of the pointers to store the tag.
+///
+/// Each member of the sum type is specified by passing a \c
+/// PointerSumTypeMember specialization in the variadic member argument list.
+/// This allows the user to control the particular tag value associated with
+/// a particular type, use the same type for multiple different tags, and
+/// customize the pointer-like traits used for a particular member. Note that
+/// these *must* be specializations of \c PointerSumTypeMember, no other type
+/// will suffice, even if it provides a compatible interface.
+///
+/// This type implements all of the comparison operators and even hash table
+/// support by comparing the underlying storage of the pointer values. It
+/// doesn't support delegating to particular members for comparisons.
+///
+/// It also default constructs to a zero tag with a null pointer, whatever that
+/// would be. This means that the zero value of the tag type is significant,
+/// and it is worth choosing it to correspond to a state that makes sense as
+/// the default-constructed value.
+///
+/// There is no support for constructing or accessing with a dynamic tag as
+/// that would fundamentally violate the type safety provided by the sum type.
+template <typename TagT, typename... MemberTs> class PointerSumType {
+ uintptr_t Value;
+
+ typedef detail::PointerSumTypeHelper<TagT, MemberTs...> HelperT;
+
+public:
+ PointerSumType() : Value(0) {}
+
+ /// A typed constructor for a specific tagged member of the sum type.
+ template <TagT N>
+ static PointerSumType
+ create(typename HelperT::template Lookup<N>::PointerT Pointer) {
+ PointerSumType Result;
+ void *V = HelperT::template Lookup<N>::TraitsT::getAsVoidPointer(Pointer);
+ assert((reinterpret_cast<uintptr_t>(V) & HelperT::TagMask) == 0 &&
+ "Pointer is insufficiently aligned to store the discriminant!");
+ Result.Value = reinterpret_cast<uintptr_t>(V) | N;
+ return Result;
+ }
+
+ TagT getTag() const { return static_cast<TagT>(Value & HelperT::TagMask); }
+
+ template <TagT N> bool is() const { return N == getTag(); }
+
+ template <TagT N> typename HelperT::template Lookup<N>::PointerT get() const {
+ void *P = is<N>() ? getImpl() : nullptr;
+ return HelperT::template Lookup<N>::TraitsT::getFromVoidPointer(P);
+ }
+
+ template <TagT N>
+ typename HelperT::template Lookup<N>::PointerT cast() const {
+ assert(is<N>() && "This instance has a different active member.");
+ return HelperT::template Lookup<N>::TraitsT::getFromVoidPointer(getImpl());
+ }
+
+ operator bool() const { return Value & HelperT::PointerMask; }
+ bool operator==(const PointerSumType &R) const { return Value == R.Value; }
+ bool operator!=(const PointerSumType &R) const { return Value != R.Value; }
+ bool operator<(const PointerSumType &R) const { return Value < R.Value; }
+ bool operator>(const PointerSumType &R) const { return Value > R.Value; }
+ bool operator<=(const PointerSumType &R) const { return Value <= R.Value; }
+ bool operator>=(const PointerSumType &R) const { return Value >= R.Value; }
+
+ uintptr_t getOpaqueValue() const { return Value; }
+
+protected:
+ void *getImpl() const {
+ return reinterpret_cast<void *>(Value & HelperT::PointerMask);
+ }
+};
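+
+// Illustrative usage sketch (hypothetical tag enum and member types, not part
+// of this header):
+//
+//   enum ExampleTag { AKind, BKind };
+//   struct A { int X; };
+//   struct B { int Y; };
+//   typedef PointerSumType<ExampleTag,
+//                          PointerSumTypeMember<AKind, A *>,
+//                          PointerSumTypeMember<BKind, B *>> ExampleSum;
+//
+//   A AnA = {1};
+//   ExampleSum S = ExampleSum::create<AKind>(&AnA);
+//   int ValueOfX = S.is<AKind>() ? S.get<AKind>()->X : 0;   // 1 here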
+
+namespace detail {
+
+/// A helper template for implementing \c PointerSumType. It provides fast
+/// compile-time lookup of the member from a particular tag value, along with
+/// useful constants and compile-time checking infrastructure.
+template <typename TagT, typename... MemberTs>
+struct PointerSumTypeHelper : MemberTs... {
+ // First we use a trick to allow quickly looking up information about
+ // a particular member of the sum type. This works because we arranged to
+ // have this type derive from all of the member type templates. We can select
+ // the matching member for a tag using type deduction during overload
+ // resolution.
+ template <TagT N, typename PointerT, typename TraitsT>
+ static PointerSumTypeMember<N, PointerT, TraitsT>
+ LookupOverload(PointerSumTypeMember<N, PointerT, TraitsT> *);
+ template <TagT N> static void LookupOverload(...);
+ template <TagT N> struct Lookup {
+ // Compute a particular member type by resolving the lookup helper overload.
+ typedef decltype(LookupOverload<N>(
+ static_cast<PointerSumTypeHelper *>(nullptr))) MemberT;
+
+ /// The Nth member's pointer type.
+ typedef typename MemberT::PointerT PointerT;
+
+ /// The Nth member's traits type.
+ typedef typename MemberT::TraitsT TraitsT;
+ };
+
+ // Next we need to compute the number of bits available for the discriminant
+ // by taking the min of the bits available for each member. Much of this
+ // would be amazingly easier with good constexpr support.
+ template <uintptr_t V, uintptr_t... Vs>
+ struct Min : std::integral_constant<
+ uintptr_t, (V < Min<Vs...>::value ? V : Min<Vs...>::value)> {
+ };
+ template <uintptr_t V>
+ struct Min<V> : std::integral_constant<uintptr_t, V> {};
+ enum { NumTagBits = Min<MemberTs::TraitsT::NumLowBitsAvailable...>::value };
+
+ // Also compute the smallest discriminant and various masks for convenience.
+ enum : uint64_t {
+ MinTag = Min<MemberTs::Tag...>::value,
+ PointerMask = static_cast<uint64_t>(-1) << NumTagBits,
+ TagMask = ~PointerMask
+ };
+
+ // Finally we need a recursive template to do static checks of each
+ // member.
+ template <typename MemberT, typename... InnerMemberTs>
+ struct Checker : Checker<InnerMemberTs...> {
+ static_assert(MemberT::Tag < (1 << NumTagBits),
+ "This discriminant value requires too many bits!");
+ };
+ template <typename MemberT> struct Checker<MemberT> : std::true_type {
+ static_assert(MemberT::Tag < (1 << NumTagBits),
+ "This discriminant value requires too many bits!");
+ };
+ static_assert(Checker<MemberTs...>::value,
+ "Each member must pass the checker.");
+};
+
+}
+
+// Teach DenseMap how to use PointerSumTypes as keys.
+template <typename TagT, typename... MemberTs>
+struct DenseMapInfo<PointerSumType<TagT, MemberTs...>> {
+ typedef PointerSumType<TagT, MemberTs...> SumType;
+
+ typedef detail::PointerSumTypeHelper<TagT, MemberTs...> HelperT;
+ enum { SomeTag = HelperT::MinTag };
+ typedef typename HelperT::template Lookup<HelperT::MinTag>::PointerT
+ SomePointerT;
+ typedef DenseMapInfo<SomePointerT> SomePointerInfo;
+
+ static inline SumType getEmptyKey() {
+ return SumType::create<SomeTag>(SomePointerInfo::getEmptyKey());
+ }
+ static inline SumType getTombstoneKey() {
+ return SumType::create<SomeTag>(
+ SomePointerInfo::getTombstoneKey());
+ }
+ static unsigned getHashValue(const SumType &Arg) {
+ uintptr_t OpaqueValue = Arg.getOpaqueValue();
+ return DenseMapInfo<uintptr_t>::getHashValue(OpaqueValue);
+ }
+ static bool isEqual(const SumType &LHS, const SumType &RHS) {
+ return LHS == RHS;
+ }
+};
+
+}
+
+#endif
diff --git a/ext/include/llvm/ADT/PointerUnion.h b/ext/include/llvm/ADT/PointerUnion.h
new file mode 100644
index 0000000..6b3fe57
--- /dev/null
+++ b/ext/include/llvm/ADT/PointerUnion.h
@@ -0,0 +1,474 @@
+//===- llvm/ADT/PointerUnion.h - Discriminated Union of 2 Ptrs --*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the PointerUnion class, which is a discriminated union of
+// pointer types.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_ADT_POINTERUNION_H
+#define LLVM_ADT_POINTERUNION_H
+
+#include "llvm/ADT/DenseMapInfo.h"
+#include "llvm/ADT/PointerIntPair.h"
+#include "llvm/Support/Compiler.h"
+
+namespace llvm {
+
+template <typename T> struct PointerUnionTypeSelectorReturn {
+ typedef T Return;
+};
+
+/// Get a type based on whether two types are the same or not.
+///
+/// For:
+///
+/// \code
+/// typedef typename PointerUnionTypeSelector<T1, T2, EQ, NE>::Return Ret;
+/// \endcode
+///
+/// Ret will be EQ type if T1 is same as T2 or NE type otherwise.
+template <typename T1, typename T2, typename RET_EQ, typename RET_NE>
+struct PointerUnionTypeSelector {
+ typedef typename PointerUnionTypeSelectorReturn<RET_NE>::Return Return;
+};
+
+template <typename T, typename RET_EQ, typename RET_NE>
+struct PointerUnionTypeSelector<T, T, RET_EQ, RET_NE> {
+ typedef typename PointerUnionTypeSelectorReturn<RET_EQ>::Return Return;
+};
+
+template <typename T1, typename T2, typename RET_EQ, typename RET_NE>
+struct PointerUnionTypeSelectorReturn<
+ PointerUnionTypeSelector<T1, T2, RET_EQ, RET_NE>> {
+ typedef
+ typename PointerUnionTypeSelector<T1, T2, RET_EQ, RET_NE>::Return Return;
+};
+
+/// Provide PointerLikeTypeTraits for void* that is used by PointerUnion
+/// for the two template arguments.
+template <typename PT1, typename PT2> class PointerUnionUIntTraits {
+public:
+ static inline void *getAsVoidPointer(void *P) { return P; }
+ static inline void *getFromVoidPointer(void *P) { return P; }
+ enum {
+ PT1BitsAv = (int)(PointerLikeTypeTraits<PT1>::NumLowBitsAvailable),
+ PT2BitsAv = (int)(PointerLikeTypeTraits<PT2>::NumLowBitsAvailable),
+ NumLowBitsAvailable = PT1BitsAv < PT2BitsAv ? PT1BitsAv : PT2BitsAv
+ };
+};
+
+/// A discriminated union of two pointer types, with the discriminator in the
+/// low bit of the pointer.
+///
+/// This implementation is extremely efficient in space due to leveraging the
+/// low bits of the pointer, while exposing a natural and type-safe API.
+///
+/// Common use patterns would be something like this:
+/// PointerUnion<int*, float*> P;
+/// P = (int*)0;
+/// printf("%d %d", P.is<int*>(), P.is<float*>()); // prints "1 0"
+/// X = P.get<int*>(); // ok.
+/// Y = P.get<float*>(); // runtime assertion failure.
+/// Z = P.get<double*>(); // compile time failure.
+/// P = (float*)0;
+/// Y = P.get<float*>(); // ok.
+/// X = P.get<int*>(); // runtime assertion failure.
+template <typename PT1, typename PT2> class PointerUnion {
+public:
+ typedef PointerIntPair<void *, 1, bool, PointerUnionUIntTraits<PT1, PT2>>
+ ValTy;
+
+private:
+ ValTy Val;
+
+ struct IsPT1 {
+ static const int Num = 0;
+ };
+ struct IsPT2 {
+ static const int Num = 1;
+ };
+ template <typename T> struct UNION_DOESNT_CONTAIN_TYPE {};
+
+public:
+ PointerUnion() {}
+
+ PointerUnion(PT1 V)
+ : Val(const_cast<void *>(
+ PointerLikeTypeTraits<PT1>::getAsVoidPointer(V))) {}
+ PointerUnion(PT2 V)
+ : Val(const_cast<void *>(PointerLikeTypeTraits<PT2>::getAsVoidPointer(V)),
+ 1) {}
+
+ /// Test if the pointer held in the union is null, regardless of
+ /// which type it is.
+ bool isNull() const {
+ // Convert from the void* to one of the pointer types, to make sure that
+ // we recursively strip off low bits if we have a nested PointerUnion.
+ return !PointerLikeTypeTraits<PT1>::getFromVoidPointer(Val.getPointer());
+ }
+ explicit operator bool() const { return !isNull(); }
+
+ /// Test if the Union currently holds the type matching T.
+ template <typename T> int is() const {
+ typedef typename ::llvm::PointerUnionTypeSelector<
+ PT1, T, IsPT1, ::llvm::PointerUnionTypeSelector<
+ PT2, T, IsPT2, UNION_DOESNT_CONTAIN_TYPE<T>>>::Return
+ Ty;
+ int TyNo = Ty::Num;
+ return static_cast<int>(Val.getInt()) == TyNo;
+ }
+
+ /// Returns the value of the specified pointer type.
+ ///
+ /// If the specified pointer type is incorrect, assert.
+ template <typename T> T get() const {
+ assert(is<T>() && "Invalid accessor called");
+ return PointerLikeTypeTraits<T>::getFromVoidPointer(Val.getPointer());
+ }
+
+ /// Returns the current pointer if it is of the specified pointer type,
+ /// otherwise returns null.
+ template <typename T> T dyn_cast() const {
+ if (is<T>())
+ return get<T>();
+ return T();
+ }
+
+ /// If the union is set to the first pointer type get an address pointing to
+ /// it.
+ PT1 const *getAddrOfPtr1() const {
+ return const_cast<PointerUnion *>(this)->getAddrOfPtr1();
+ }
+
+ /// If the union is set to the first pointer type get an address pointing to
+ /// it.
+ PT1 *getAddrOfPtr1() {
+ assert(is<PT1>() && "Val is not the first pointer");
+ assert(
+ get<PT1>() == Val.getPointer() &&
+ "Can't get the address because PointerLikeTypeTraits changes the ptr");
+ return (PT1 *)Val.getAddrOfPointer();
+ }
+
+ /// Assignment from nullptr which just clears the union.
+ const PointerUnion &operator=(std::nullptr_t) {
+ Val.initWithPointer(nullptr);
+ return *this;
+ }
+
+ /// Assignment operators - Allow assigning into this union from either
+ /// pointer type, setting the discriminator to remember what it came from.
+ const PointerUnion &operator=(const PT1 &RHS) {
+ Val.initWithPointer(
+ const_cast<void *>(PointerLikeTypeTraits<PT1>::getAsVoidPointer(RHS)));
+ return *this;
+ }
+ const PointerUnion &operator=(const PT2 &RHS) {
+ Val.setPointerAndInt(
+ const_cast<void *>(PointerLikeTypeTraits<PT2>::getAsVoidPointer(RHS)),
+ 1);
+ return *this;
+ }
+
+ void *getOpaqueValue() const { return Val.getOpaqueValue(); }
+ static inline PointerUnion getFromOpaqueValue(void *VP) {
+ PointerUnion V;
+ V.Val = ValTy::getFromOpaqueValue(VP);
+ return V;
+ }
+};
+
+template <typename PT1, typename PT2>
+static bool operator==(PointerUnion<PT1, PT2> lhs, PointerUnion<PT1, PT2> rhs) {
+ return lhs.getOpaqueValue() == rhs.getOpaqueValue();
+}
+
+template <typename PT1, typename PT2>
+static bool operator!=(PointerUnion<PT1, PT2> lhs, PointerUnion<PT1, PT2> rhs) {
+ return lhs.getOpaqueValue() != rhs.getOpaqueValue();
+}
+
+template <typename PT1, typename PT2>
+static bool operator<(PointerUnion<PT1, PT2> lhs, PointerUnion<PT1, PT2> rhs) {
+ return lhs.getOpaqueValue() < rhs.getOpaqueValue();
+}
+
+// Teach SmallPtrSet that PointerUnion is "basically a pointer", that has
+// # low bits available = min(PT1bits,PT2bits)-1.
+template <typename PT1, typename PT2>
+class PointerLikeTypeTraits<PointerUnion<PT1, PT2>> {
+public:
+ static inline void *getAsVoidPointer(const PointerUnion<PT1, PT2> &P) {
+ return P.getOpaqueValue();
+ }
+ static inline PointerUnion<PT1, PT2> getFromVoidPointer(void *P) {
+ return PointerUnion<PT1, PT2>::getFromOpaqueValue(P);
+ }
+
+ // The number of bits available is the min of the two pointer types.
+ enum {
+ NumLowBitsAvailable = PointerLikeTypeTraits<
+ typename PointerUnion<PT1, PT2>::ValTy>::NumLowBitsAvailable
+ };
+};
+
+/// A pointer union of three pointer types. See documentation for PointerUnion
+/// for usage.
+template <typename PT1, typename PT2, typename PT3> class PointerUnion3 {
+public:
+ typedef PointerUnion<PT1, PT2> InnerUnion;
+ typedef PointerUnion<InnerUnion, PT3> ValTy;
+
+private:
+ ValTy Val;
+
+ struct IsInnerUnion {
+ ValTy Val;
+ IsInnerUnion(ValTy val) : Val(val) {}
+ template <typename T> int is() const {
+ return Val.template is<InnerUnion>() &&
+ Val.template get<InnerUnion>().template is<T>();
+ }
+ template <typename T> T get() const {
+ return Val.template get<InnerUnion>().template get<T>();
+ }
+ };
+
+ struct IsPT3 {
+ ValTy Val;
+ IsPT3(ValTy val) : Val(val) {}
+ template <typename T> int is() const { return Val.template is<T>(); }
+ template <typename T> T get() const { return Val.template get<T>(); }
+ };
+
+public:
+ PointerUnion3() {}
+
+ PointerUnion3(PT1 V) { Val = InnerUnion(V); }
+ PointerUnion3(PT2 V) { Val = InnerUnion(V); }
+ PointerUnion3(PT3 V) { Val = V; }
+
+ /// Test if the pointer held in the union is null, regardless of
+ /// which type it is.
+ bool isNull() const { return Val.isNull(); }
+ explicit operator bool() const { return !isNull(); }
+
+ /// Test if the Union currently holds the type matching T.
+ template <typename T> int is() const {
+ // If T is PT1/PT2 choose IsInnerUnion otherwise choose IsPT3.
+ typedef typename ::llvm::PointerUnionTypeSelector<
+ PT1, T, IsInnerUnion,
+ ::llvm::PointerUnionTypeSelector<PT2, T, IsInnerUnion, IsPT3>>::Return
+ Ty;
+ return Ty(Val).template is<T>();
+ }
+
+ /// Returns the value of the specified pointer type.
+ ///
+ /// If the specified pointer type is incorrect, assert.
+ template <typename T> T get() const {
+ assert(is<T>() && "Invalid accessor called");
+ // If T is PT1/PT2 choose IsInnerUnion otherwise choose IsPT3.
+ typedef typename ::llvm::PointerUnionTypeSelector<
+ PT1, T, IsInnerUnion,
+ ::llvm::PointerUnionTypeSelector<PT2, T, IsInnerUnion, IsPT3>>::Return
+ Ty;
+ return Ty(Val).template get<T>();
+ }
+
+ /// Returns the current pointer if it is of the specified pointer type,
+ /// otherwise returns null.
+ template <typename T> T dyn_cast() const {
+ if (is<T>())
+ return get<T>();
+ return T();
+ }
+
+ /// Assignment from nullptr which just clears the union.
+ const PointerUnion3 &operator=(std::nullptr_t) {
+ Val = nullptr;
+ return *this;
+ }
+
+ /// Assignment operators - Allow assigning into this union from either
+ /// pointer type, setting the discriminator to remember what it came from.
+ const PointerUnion3 &operator=(const PT1 &RHS) {
+ Val = InnerUnion(RHS);
+ return *this;
+ }
+ const PointerUnion3 &operator=(const PT2 &RHS) {
+ Val = InnerUnion(RHS);
+ return *this;
+ }
+ const PointerUnion3 &operator=(const PT3 &RHS) {
+ Val = RHS;
+ return *this;
+ }
+
+ void *getOpaqueValue() const { return Val.getOpaqueValue(); }
+ static inline PointerUnion3 getFromOpaqueValue(void *VP) {
+ PointerUnion3 V;
+ V.Val = ValTy::getFromOpaqueValue(VP);
+ return V;
+ }
+};
+
+// Teach SmallPtrSet that PointerUnion3 is "basically a pointer", that has
+// # low bits available = min(PT1bits,PT2bits,PT3bits)-2.
+template <typename PT1, typename PT2, typename PT3>
+class PointerLikeTypeTraits<PointerUnion3<PT1, PT2, PT3>> {
+public:
+ static inline void *getAsVoidPointer(const PointerUnion3<PT1, PT2, PT3> &P) {
+ return P.getOpaqueValue();
+ }
+ static inline PointerUnion3<PT1, PT2, PT3> getFromVoidPointer(void *P) {
+ return PointerUnion3<PT1, PT2, PT3>::getFromOpaqueValue(P);
+ }
+
+ // The number of bits available is the min of the underlying pointer types.
+ enum {
+ NumLowBitsAvailable = PointerLikeTypeTraits<
+ typename PointerUnion3<PT1, PT2, PT3>::ValTy>::NumLowBitsAvailable
+ };
+};
+
+/// A pointer union of four pointer types. See documentation for PointerUnion
+/// for usage.
+template <typename PT1, typename PT2, typename PT3, typename PT4>
+class PointerUnion4 {
+public:
+ typedef PointerUnion<PT1, PT2> InnerUnion1;
+ typedef PointerUnion<PT3, PT4> InnerUnion2;
+ typedef PointerUnion<InnerUnion1, InnerUnion2> ValTy;
+
+private:
+ ValTy Val;
+
+public:
+ PointerUnion4() {}
+
+ PointerUnion4(PT1 V) { Val = InnerUnion1(V); }
+ PointerUnion4(PT2 V) { Val = InnerUnion1(V); }
+ PointerUnion4(PT3 V) { Val = InnerUnion2(V); }
+ PointerUnion4(PT4 V) { Val = InnerUnion2(V); }
+
+ /// Test if the pointer held in the union is null, regardless of
+ /// which type it is.
+ bool isNull() const { return Val.isNull(); }
+ explicit operator bool() const { return !isNull(); }
+
+ /// Test if the Union currently holds the type matching T.
+ template <typename T> int is() const {
+ // If T is PT1/PT2 choose InnerUnion1 otherwise choose InnerUnion2.
+ typedef typename ::llvm::PointerUnionTypeSelector<
+ PT1, T, InnerUnion1, ::llvm::PointerUnionTypeSelector<
+ PT2, T, InnerUnion1, InnerUnion2>>::Return Ty;
+ return Val.template is<Ty>() && Val.template get<Ty>().template is<T>();
+ }
+
+ /// Returns the value of the specified pointer type.
+ ///
+ /// If the specified pointer type is incorrect, assert.
+ template <typename T> T get() const {
+ assert(is<T>() && "Invalid accessor called");
+ // If T is PT1/PT2 choose InnerUnion1 otherwise choose InnerUnion2.
+ typedef typename ::llvm::PointerUnionTypeSelector<
+ PT1, T, InnerUnion1, ::llvm::PointerUnionTypeSelector<
+ PT2, T, InnerUnion1, InnerUnion2>>::Return Ty;
+ return Val.template get<Ty>().template get<T>();
+ }
+
+ /// Returns the current pointer if it is of the specified pointer type,
+ /// otherwise returns null.
+ template <typename T> T dyn_cast() const {
+ if (is<T>())
+ return get<T>();
+ return T();
+ }
+
+ /// Assignment from nullptr which just clears the union.
+ const PointerUnion4 &operator=(std::nullptr_t) {
+ Val = nullptr;
+ return *this;
+ }
+
+ /// Assignment operators - Allow assigning into this union from any of the
+ /// four pointer types, setting the discriminator to remember what it came from.
+ const PointerUnion4 &operator=(const PT1 &RHS) {
+ Val = InnerUnion1(RHS);
+ return *this;
+ }
+ const PointerUnion4 &operator=(const PT2 &RHS) {
+ Val = InnerUnion1(RHS);
+ return *this;
+ }
+ const PointerUnion4 &operator=(const PT3 &RHS) {
+ Val = InnerUnion2(RHS);
+ return *this;
+ }
+ const PointerUnion4 &operator=(const PT4 &RHS) {
+ Val = InnerUnion2(RHS);
+ return *this;
+ }
+
+ void *getOpaqueValue() const { return Val.getOpaqueValue(); }
+ static inline PointerUnion4 getFromOpaqueValue(void *VP) {
+ PointerUnion4 V;
+ V.Val = ValTy::getFromOpaqueValue(VP);
+ return V;
+ }
+};
+
+// Teach SmallPtrSet that PointerUnion4 is "basically a pointer", that has
+// # low bits available = min(PT1bits,PT2bits,PT3bits,PT4bits)-2.
+template <typename PT1, typename PT2, typename PT3, typename PT4>
+class PointerLikeTypeTraits<PointerUnion4<PT1, PT2, PT3, PT4>> {
+public:
+ static inline void *
+ getAsVoidPointer(const PointerUnion4<PT1, PT2, PT3, PT4> &P) {
+ return P.getOpaqueValue();
+ }
+ static inline PointerUnion4<PT1, PT2, PT3, PT4> getFromVoidPointer(void *P) {
+ return PointerUnion4<PT1, PT2, PT3, PT4>::getFromOpaqueValue(P);
+ }
+
+ // The number of bits available is the minimum across the four pointer types.
+ enum {
+ NumLowBitsAvailable = PointerLikeTypeTraits<
+ typename PointerUnion4<PT1, PT2, PT3, PT4>::ValTy>::NumLowBitsAvailable
+ };
+};
+
+// Teach DenseMap how to use PointerUnions as keys.
+template <typename T, typename U> struct DenseMapInfo<PointerUnion<T, U>> {
+ typedef PointerUnion<T, U> Pair;
+ typedef DenseMapInfo<T> FirstInfo;
+ typedef DenseMapInfo<U> SecondInfo;
+
+ static inline Pair getEmptyKey() { return Pair(FirstInfo::getEmptyKey()); }
+ static inline Pair getTombstoneKey() {
+ return Pair(FirstInfo::getTombstoneKey());
+ }
+ static unsigned getHashValue(const Pair &PairVal) {
+ intptr_t key = (intptr_t)PairVal.getOpaqueValue();
+ return DenseMapInfo<intptr_t>::getHashValue(key);
+ }
+ static bool isEqual(const Pair &LHS, const Pair &RHS) {
+ return LHS.template is<T>() == RHS.template is<T>() &&
+ (LHS.template is<T>() ? FirstInfo::isEqual(LHS.template get<T>(),
+ RHS.template get<T>())
+ : SecondInfo::isEqual(LHS.template get<U>(),
+ RHS.template get<U>()));
+ }
+};
+
+}
+
+#endif
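
A minimal usage sketch for the PointerUnion family added above. This is illustrative only, not part of the imported file; the int/float/double pointee types and the include path (based on this import's ext/include layout) are assumptions.

    // Sketch only: exercising the PointerUnion3 defined in the header above.
    #include "llvm/ADT/PointerUnion.h"
    #include <cassert>

    int main() {
      int I = 1;
      float F = 2.0f;
      double D = 3.0;

      llvm::PointerUnion3<int *, float *, double *> U = &I;
      assert(U.is<int *>() && !U.is<double *>());
      assert(U.get<int *>() == &I);                // get<> asserts on the wrong type
      assert(U.dyn_cast<float *>() == nullptr);    // dyn_cast<> returns null instead

      U = &F;                                      // the discriminator follows assignments
      assert(U.is<float *>());

      U = &D;
      assert(U.dyn_cast<double *>() == &D);

      U = nullptr;                                 // clears the union
      assert(U.isNull());
      return 0;
    }

The pointee types only need enough alignment to spare the low pointer bits used for the discriminators, which is why plain int/float/double objects suffice here.
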
diff --git a/ext/include/llvm/ADT/STLExtras.h b/ext/include/llvm/ADT/STLExtras.h
new file mode 100644
index 0000000..d4360fa
--- /dev/null
+++ b/ext/include/llvm/ADT/STLExtras.h
@@ -0,0 +1,472 @@
+//===- llvm/ADT/STLExtras.h - Useful STL related functions ------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains some templates that are useful if you are working with the
+// STL at all.
+//
+// No library is required when using these functions.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_ADT_STLEXTRAS_H
+#define LLVM_ADT_STLEXTRAS_H
+
+#include "llvm/Support/Compiler.h"
+#include <algorithm> // for std::all_of
+#include <cassert>
+#include <cstddef> // for std::size_t
+#include <cstdlib> // for qsort
+#include <functional>
+#include <iterator>
+#include <memory>
+#include <utility> // for std::pair
+
+namespace llvm {
+
+//===----------------------------------------------------------------------===//
+// Extra additions to <functional>
+//===----------------------------------------------------------------------===//
+
+template<class Ty>
+struct identity : public std::unary_function<Ty, Ty> {
+ Ty &operator()(Ty &self) const {
+ return self;
+ }
+ const Ty &operator()(const Ty &self) const {
+ return self;
+ }
+};
+
+template<class Ty>
+struct less_ptr : public std::binary_function<Ty, Ty, bool> {
+ bool operator()(const Ty* left, const Ty* right) const {
+ return *left < *right;
+ }
+};
+
+template<class Ty>
+struct greater_ptr : public std::binary_function<Ty, Ty, bool> {
+ bool operator()(const Ty* left, const Ty* right) const {
+ return *right < *left;
+ }
+};
+
+/// An efficient, type-erasing, non-owning reference to a callable. This is
+/// intended for use as the type of a function parameter that is not used
+/// after the function in question returns.
+///
+/// This class does not own the callable, so it is not in general safe to store
+/// a function_ref.
+template<typename Fn> class function_ref;
+
+template<typename Ret, typename ...Params>
+class function_ref<Ret(Params...)> {
+ Ret (*callback)(intptr_t callable, Params ...params);
+ intptr_t callable;
+
+ template<typename Callable>
+ static Ret callback_fn(intptr_t callable, Params ...params) {
+ return (*reinterpret_cast<Callable*>(callable))(
+ std::forward<Params>(params)...);
+ }
+
+public:
+ template <typename Callable>
+ function_ref(Callable &&callable,
+ typename std::enable_if<
+ !std::is_same<typename std::remove_reference<Callable>::type,
+ function_ref>::value>::type * = nullptr)
+ : callback(callback_fn<typename std::remove_reference<Callable>::type>),
+ callable(reinterpret_cast<intptr_t>(&callable)) {}
+ Ret operator()(Params ...params) const {
+ return callback(callable, std::forward<Params>(params)...);
+ }
+};
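
The function_ref above is a non-owning reference to a callable; a short sketch of the intended call pattern follows (illustrative only; the sumIf helper is invented for this example and is not part of the patch):

    // Sketch only: passing a lambda through llvm::function_ref.
    #include "llvm/ADT/STLExtras.h"
    #include <vector>

    // The callee never stores the reference, which is the documented contract.
    static int sumIf(const std::vector<int> &V, llvm::function_ref<bool(int)> Pred) {
      int S = 0;
      for (int X : V)
        if (Pred(X))
          S += X;
      return S;
    }

    int main() {
      std::vector<int> V = {1, 2, 3, 4};
      int Threshold = 2;
      // The lambda outlives the call, which is all function_ref requires.
      return sumIf(V, [&](int X) { return X > Threshold; }) == 7 ? 0 : 1;
    }
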
+
+// deleter - Very very very simple method that is used to invoke operator
+// delete on something. It is used like this:
+//
+// for_each(V.begin(), V.end(), deleter<Interval>);
+//
+template <class T>
+inline void deleter(T *Ptr) {
+ delete Ptr;
+}
+
+
+
+//===----------------------------------------------------------------------===//
+// Extra additions to <iterator>
+//===----------------------------------------------------------------------===//
+
+// mapped_iterator - This is a simple iterator adapter that causes a function to
+// be dereferenced whenever operator* is invoked on the iterator.
+//
+template <class RootIt, class UnaryFunc>
+class mapped_iterator {
+ RootIt current;
+ UnaryFunc Fn;
+public:
+ typedef typename std::iterator_traits<RootIt>::iterator_category
+ iterator_category;
+ typedef typename std::iterator_traits<RootIt>::difference_type
+ difference_type;
+ typedef typename UnaryFunc::result_type value_type;
+
+ typedef void pointer;
+ //typedef typename UnaryFunc::result_type *pointer;
+ typedef void reference; // Can't modify value returned by fn
+
+ typedef RootIt iterator_type;
+
+ inline const RootIt &getCurrent() const { return current; }
+ inline const UnaryFunc &getFunc() const { return Fn; }
+
+ inline explicit mapped_iterator(const RootIt &I, UnaryFunc F)
+ : current(I), Fn(F) {}
+
+ inline value_type operator*() const { // All this work to do this
+ return Fn(*current); // little change
+ }
+
+ mapped_iterator &operator++() {
+ ++current;
+ return *this;
+ }
+ mapped_iterator &operator--() {
+ --current;
+ return *this;
+ }
+ mapped_iterator operator++(int) {
+ mapped_iterator __tmp = *this;
+ ++current;
+ return __tmp;
+ }
+ mapped_iterator operator--(int) {
+ mapped_iterator __tmp = *this;
+ --current;
+ return __tmp;
+ }
+ mapped_iterator operator+(difference_type n) const {
+ return mapped_iterator(current + n, Fn);
+ }
+ mapped_iterator &operator+=(difference_type n) {
+ current += n;
+ return *this;
+ }
+ mapped_iterator operator-(difference_type n) const {
+ return mapped_iterator(current - n, Fn);
+ }
+ mapped_iterator &operator-=(difference_type n) {
+ current -= n;
+ return *this;
+ }
+ reference operator[](difference_type n) const { return *(*this + n); }
+
+ bool operator!=(const mapped_iterator &X) const { return !operator==(X); }
+ bool operator==(const mapped_iterator &X) const {
+ return current == X.current;
+ }
+ bool operator<(const mapped_iterator &X) const { return current < X.current; }
+
+ difference_type operator-(const mapped_iterator &X) const {
+ return current - X.current;
+ }
+};
+
+template <class Iterator, class Func>
+inline mapped_iterator<Iterator, Func>
+operator+(typename mapped_iterator<Iterator, Func>::difference_type N,
+ const mapped_iterator<Iterator, Func> &X) {
+ return mapped_iterator<Iterator, Func>(X.getCurrent() + N, X.getFunc());
+}
+
+
+// map_iterator - Provide a convenient way to create mapped_iterators, just like
+// make_pair is useful for creating pairs...
+//
+template <class ItTy, class FuncTy>
+inline mapped_iterator<ItTy, FuncTy> map_iterator(const ItTy &I, FuncTy F) {
+ return mapped_iterator<ItTy, FuncTy>(I, F);
+}
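
mapped_iterator applies a functor on every dereference. A sketch follows, under the assumption that the functor exposes a result_type typedef (which the adapter reads for its value_type); the Doubler functor is invented for illustration:

    // Sketch only: a functor with result_type, as mapped_iterator expects.
    #include "llvm/ADT/STLExtras.h"
    #include <vector>

    struct Doubler {
      typedef int result_type;              // read by mapped_iterator::value_type
      int operator()(int X) const { return X * 2; }
    };

    int main() {
      std::vector<int> V = {1, 2, 3};
      auto B = llvm::map_iterator(V.begin(), Doubler());
      auto E = llvm::map_iterator(V.end(), Doubler());
      int Sum = 0;
      for (auto I = B; I != E; ++I)
        Sum += *I;                          // Doubler runs on each dereference
      return Sum == 12 ? 0 : 1;             // 2 + 4 + 6
    }
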
+
+/// \brief Metafunction to determine if type T has a member called rbegin().
+template <typename T> struct has_rbegin {
+ template <typename U> static char(&f(const U &, decltype(&U::rbegin)))[1];
+ static char(&f(...))[2];
+ const static bool value = sizeof(f(std::declval<T>(), nullptr)) == 1;
+};
+
+// Returns an iterator_range over the given container which iterates in reverse.
+// Note that the container must have rbegin()/rend() methods for this to work.
+template <typename ContainerTy>
+auto reverse(ContainerTy &&C,
+ typename std::enable_if<has_rbegin<ContainerTy>::value>::type * =
+ nullptr) -> decltype(make_range(C.rbegin(), C.rend())) {
+ return make_range(C.rbegin(), C.rend());
+}
+
+// Returns a std::reverse_iterator wrapped around the given iterator.
+template <typename IteratorTy>
+std::reverse_iterator<IteratorTy> make_reverse_iterator(IteratorTy It) {
+ return std::reverse_iterator<IteratorTy>(It);
+}
+
+// Returns an iterator_range over the given container which iterates in reverse.
+// Note that the container must have begin()/end() methods which return
+// bidirectional iterators for this to work.
+template <typename ContainerTy>
+auto reverse(
+ ContainerTy &&C,
+ typename std::enable_if<!has_rbegin<ContainerTy>::value>::type * = nullptr)
+ -> decltype(make_range(llvm::make_reverse_iterator(std::end(C)),
+ llvm::make_reverse_iterator(std::begin(C)))) {
+ return make_range(llvm::make_reverse_iterator(std::end(C)),
+ llvm::make_reverse_iterator(std::begin(C)));
+}
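
The two reverse() overloads above rely on make_range, which lives in iterator_range.h (added elsewhere in this patch). A sketch, with iterator_range.h included first so make_range is visible when STLExtras.h is parsed; the include order is an assumption about how these headers are meant to be combined:

    // Sketch only: range-based reverse iteration via llvm::reverse.
    #include "llvm/ADT/iterator_range.h"   // declares make_range, used by reverse()
    #include "llvm/ADT/STLExtras.h"
    #include <cstdio>
    #include <vector>

    int main() {
      std::vector<int> V = {1, 2, 3};
      // Works whether or not the container exposes rbegin()/rend();
      // either overload yields the same reverse traversal.
      for (int X : llvm::reverse(V))
        std::printf("%d ", X);             // prints: 3 2 1
      std::printf("\n");
      return 0;
    }
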
+
+//===----------------------------------------------------------------------===//
+// Extra additions to <utility>
+//===----------------------------------------------------------------------===//
+
+/// \brief Function object to check whether the first component of a std::pair
+/// compares less than the first component of another std::pair.
+struct less_first {
+ template <typename T> bool operator()(const T &lhs, const T &rhs) const {
+ return lhs.first < rhs.first;
+ }
+};
+
+/// \brief Function object to check whether the second component of a std::pair
+/// compares less than the second component of another std::pair.
+struct less_second {
+ template <typename T> bool operator()(const T &lhs, const T &rhs) const {
+ return lhs.second < rhs.second;
+ }
+};
+
+// A subset of N3658. More stuff can be added as-needed.
+
+/// \brief Represents a compile-time sequence of integers.
+template <class T, T... I> struct integer_sequence {
+ typedef T value_type;
+
+ static LLVM_CONSTEXPR size_t size() { return sizeof...(I); }
+};
+
+/// \brief Alias for the common case of a sequence of size_ts.
+template <size_t... I>
+struct index_sequence : integer_sequence<std::size_t, I...> {};
+
+template <std::size_t N, std::size_t... I>
+struct build_index_impl : build_index_impl<N - 1, N - 1, I...> {};
+template <std::size_t... I>
+struct build_index_impl<0, I...> : index_sequence<I...> {};
+
+/// \brief Creates a compile-time integer sequence for a parameter pack.
+template <class... Ts>
+struct index_sequence_for : build_index_impl<sizeof...(Ts)> {};
+
+//===----------------------------------------------------------------------===//
+// Extra additions for arrays
+//===----------------------------------------------------------------------===//
+
+/// Find the length of an array.
+template <class T, std::size_t N>
+LLVM_CONSTEXPR inline size_t array_lengthof(T (&)[N]) {
+ return N;
+}
+
+/// Adapt std::less<T> for array_pod_sort.
+template<typename T>
+inline int array_pod_sort_comparator(const void *P1, const void *P2) {
+ if (std::less<T>()(*reinterpret_cast<const T*>(P1),
+ *reinterpret_cast<const T*>(P2)))
+ return -1;
+ if (std::less<T>()(*reinterpret_cast<const T*>(P2),
+ *reinterpret_cast<const T*>(P1)))
+ return 1;
+ return 0;
+}
+
+/// get_array_pod_sort_comparator - This is an internal helper function used to
+/// get type deduction of T right.
+template<typename T>
+inline int (*get_array_pod_sort_comparator(const T &))
+ (const void*, const void*) {
+ return array_pod_sort_comparator<T>;
+}
+
+
+/// array_pod_sort - This sorts an array with the specified start and end
+/// extent. This is just like std::sort, except that it calls qsort instead of
+/// using an inlined template. qsort is slightly slower than std::sort, but
+/// most sorts are not performance critical in LLVM and std::sort has to be
+/// template instantiated for each type, leading to significant measured code
+/// bloat. This function should generally be used instead of std::sort where
+/// possible.
+///
+/// This function assumes that you have simple POD-like types that can be
+/// compared with std::less and can be moved with memcpy. If this isn't true,
+/// you should use std::sort.
+///
+/// NOTE: If qsort_r were portable, we could allow a custom comparator and
+/// default to std::less.
+template<class IteratorTy>
+inline void array_pod_sort(IteratorTy Start, IteratorTy End) {
+ // Don't inefficiently call qsort with one element or trigger undefined
+ // behavior with an empty sequence.
+ auto NElts = End - Start;
+ if (NElts <= 1) return;
+ qsort(&*Start, NElts, sizeof(*Start), get_array_pod_sort_comparator(*Start));
+}
+
+template <class IteratorTy>
+inline void array_pod_sort(
+ IteratorTy Start, IteratorTy End,
+ int (*Compare)(
+ const typename std::iterator_traits<IteratorTy>::value_type *,
+ const typename std::iterator_traits<IteratorTy>::value_type *)) {
+ // Don't inefficiently call qsort with one element or trigger undefined
+ // behavior with an empty sequence.
+ auto NElts = End - Start;
+ if (NElts <= 1) return;
+ qsort(&*Start, NElts, sizeof(*Start),
+ reinterpret_cast<int (*)(const void *, const void *)>(Compare));
+}
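
array_pod_sort trades std::sort's per-type instantiation for a single qsort call; a sketch on a plain POD array (the values are illustrative):

    // Sketch only: qsort-backed sorting of POD elements.
    #include "llvm/ADT/STLExtras.h"

    int main() {
      int A[] = {3, 1, 2};
      // Compares with std::less<int>; fine for PODs that memcpy can move.
      llvm::array_pod_sort(A, A + 3);      // A becomes {1, 2, 3}
      return (A[0] == 1 && A[2] == 3) ? 0 : 1;
    }
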
+
+//===----------------------------------------------------------------------===//
+// Extra additions to <algorithm>
+//===----------------------------------------------------------------------===//
+
+/// For a container of pointers, deletes the pointers and then clears the
+/// container.
+template<typename Container>
+void DeleteContainerPointers(Container &C) {
+ for (typename Container::iterator I = C.begin(), E = C.end(); I != E; ++I)
+ delete *I;
+ C.clear();
+}
+
+/// In a container of pairs (usually a map) whose second element is a pointer,
+/// deletes the second elements and then clears the container.
+template<typename Container>
+void DeleteContainerSeconds(Container &C) {
+ for (typename Container::iterator I = C.begin(), E = C.end(); I != E; ++I)
+ delete I->second;
+ C.clear();
+}
+
+/// Provide wrappers to std::all_of which take ranges instead of having to pass
+/// begin/end explicitly.
+template<typename R, class UnaryPredicate>
+bool all_of(R &&Range, UnaryPredicate &&P) {
+ return std::all_of(Range.begin(), Range.end(),
+ std::forward<UnaryPredicate>(P));
+}
+
+/// Provide wrappers to std::any_of which take ranges instead of having to pass
+/// begin/end explicitly.
+template <typename R, class UnaryPredicate>
+bool any_of(R &&Range, UnaryPredicate &&P) {
+ return std::any_of(Range.begin(), Range.end(),
+ std::forward<UnaryPredicate>(P));
+}
+
+/// Provide wrappers to std::find which take ranges instead of having to pass
+/// begin/end explicitly.
+template<typename R, class T>
+auto find(R &&Range, const T &val) -> decltype(Range.begin()) {
+ return std::find(Range.begin(), Range.end(), val);
+}
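
The range wrappers above simply forward to the begin()/end() forms from <algorithm>; a short sketch (values illustrative):

    // Sketch only: range-taking wrappers over <algorithm>.
    #include "llvm/ADT/STLExtras.h"
    #include <vector>

    int main() {
      std::vector<int> V = {2, 4, 6};
      bool AllEven = llvm::all_of(V, [](int X) { return X % 2 == 0; });  // true
      bool AnyBig  = llvm::any_of(V, [](int X) { return X > 5; });       // true
      auto It      = llvm::find(V, 4);           // iterator to the 4
      return (AllEven && AnyBig && It != V.end()) ? 0 : 1;
    }
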
+
+//===----------------------------------------------------------------------===//
+// Extra additions to <memory>
+//===----------------------------------------------------------------------===//
+
+// Implement make_unique according to N3656.
+
+/// \brief Constructs a `new T()` with the given args and returns a
+/// `unique_ptr<T>` which owns the object.
+///
+/// Example:
+///
+/// auto p = make_unique<int>();
+/// auto p = make_unique<std::tuple<int, int>>(0, 1);
+template <class T, class... Args>
+typename std::enable_if<!std::is_array<T>::value, std::unique_ptr<T>>::type
+make_unique(Args &&... args) {
+ return std::unique_ptr<T>(new T(std::forward<Args>(args)...));
+}
+
+/// \brief Constructs a `new T[n]` (value-initialized) and returns a
+/// `unique_ptr<T[]>` which owns the array.
+///
+/// \param n size of the new array.
+///
+/// Example:
+///
+/// auto p = make_unique<int[]>(2); // value-initializes the array with 0's.
+template <class T>
+typename std::enable_if<std::is_array<T>::value && std::extent<T>::value == 0,
+ std::unique_ptr<T>>::type
+make_unique(size_t n) {
+ return std::unique_ptr<T>(new typename std::remove_extent<T>::type[n]());
+}
+
+/// This function isn't used and is only here to provide better compile errors.
+template <class T, class... Args>
+typename std::enable_if<std::extent<T>::value != 0>::type
+make_unique(Args &&...) = delete;
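
llvm::make_unique mirrors C++14's std::make_unique for C++11 builds; a sketch of both forms (the Widget type is invented for illustration):

    // Sketch only: single-object and array forms of llvm::make_unique.
    #include "llvm/ADT/STLExtras.h"

    struct Widget {
      int A, B;
      Widget(int A, int B) : A(A), B(B) {}
    };

    int main() {
      auto W = llvm::make_unique<Widget>(1, 2);   // forwards args to the ctor
      auto Buf = llvm::make_unique<int[]>(8);     // value-initialized int[8]
      return (W->A + W->B + Buf[0]) == 3 ? 0 : 1;
    }
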
+
+struct FreeDeleter {
+ void operator()(void* v) {
+ ::free(v);
+ }
+};
+
+template<typename First, typename Second>
+struct pair_hash {
+ size_t operator()(const std::pair<First, Second> &P) const {
+ return std::hash<First>()(P.first) * 31 + std::hash<Second>()(P.second);
+ }
+};
+
+/// A functor like C++14's std::less<void> in its absence.
+struct less {
+ template <typename A, typename B> bool operator()(A &&a, B &&b) const {
+ return std::forward<A>(a) < std::forward<B>(b);
+ }
+};
+
+/// A functor like C++14's std::equal_to<void> in its absence.
+struct equal {
+ template <typename A, typename B> bool operator()(A &&a, B &&b) const {
+ return std::forward<A>(a) == std::forward<B>(b);
+ }
+};
+
+/// Binary functor that adapts to any other binary functor after dereferencing
+/// operands.
+template <typename T> struct deref {
+ T func;
+ // Could be further improved to cope with non-derivable functors and
+ // non-binary functors (should be a variadic template member function
+ // operator()).
+ template <typename A, typename B>
+ auto operator()(A &lhs, B &rhs) const -> decltype(func(*lhs, *rhs)) {
+ assert(lhs);
+ assert(rhs);
+ return func(*lhs, *rhs);
+ }
+};
+
+} // End llvm namespace
+
+#endif
diff --git a/ext/include/llvm/ADT/SmallString.h b/ext/include/llvm/ADT/SmallString.h
new file mode 100644
index 0000000..e569f54
--- /dev/null
+++ b/ext/include/llvm/ADT/SmallString.h
@@ -0,0 +1,297 @@
+//===- llvm/ADT/SmallString.h - 'Normally small' strings --------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the SmallString class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_ADT_SMALLSTRING_H
+#define LLVM_ADT_SMALLSTRING_H
+
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
+
+namespace llvm {
+
+/// SmallString - A SmallString is just a SmallVector with methods and accessors
+/// that make it work better as a string (e.g. operator+ etc).
+template<unsigned InternalLen>
+class SmallString : public SmallVector<char, InternalLen> {
+public:
+ /// Default ctor - Initialize to empty.
+ SmallString() {}
+
+ /// Initialize from a StringRef.
+ SmallString(StringRef S) : SmallVector<char, InternalLen>(S.begin(), S.end()) {}
+
+ /// Initialize with a range.
+ template<typename ItTy>
+ SmallString(ItTy S, ItTy E) : SmallVector<char, InternalLen>(S, E) {}
+
+ // Note that in order to add new overloads for append & assign, we have to
+ // duplicate the inherited versions so as not to inadvertently hide them.
+
+ /// @}
+ /// @name String Assignment
+ /// @{
+
+ /// Assign from a repeated element.
+ void assign(size_t NumElts, char Elt) {
+ this->SmallVectorImpl<char>::assign(NumElts, Elt);
+ }
+
+ /// Assign from an iterator pair.
+ template<typename in_iter>
+ void assign(in_iter S, in_iter E) {
+ this->clear();
+ SmallVectorImpl<char>::append(S, E);
+ }
+
+ /// Assign from a StringRef.
+ void assign(StringRef RHS) {
+ this->clear();
+ SmallVectorImpl<char>::append(RHS.begin(), RHS.end());
+ }
+
+ /// Assign from a SmallVector.
+ void assign(const SmallVectorImpl<char> &RHS) {
+ this->clear();
+ SmallVectorImpl<char>::append(RHS.begin(), RHS.end());
+ }
+
+ /// @}
+ /// @name String Concatenation
+ /// @{
+
+ /// Append from an iterator pair.
+ template<typename in_iter>
+ void append(in_iter S, in_iter E) {
+ SmallVectorImpl<char>::append(S, E);
+ }
+
+ void append(size_t NumInputs, char Elt) {
+ SmallVectorImpl<char>::append(NumInputs, Elt);
+ }
+
+
+ /// Append from a StringRef.
+ void append(StringRef RHS) {
+ SmallVectorImpl<char>::append(RHS.begin(), RHS.end());
+ }
+
+ /// Append from a SmallVector.
+ void append(const SmallVectorImpl<char> &RHS) {
+ SmallVectorImpl<char>::append(RHS.begin(), RHS.end());
+ }
+
+ /// @}
+ /// @name String Comparison
+ /// @{
+
+ /// Check for string equality. This is more efficient than compare() when
+ /// the relative ordering of unequal strings isn't needed.
+ bool equals(StringRef RHS) const {
+ return str().equals(RHS);
+ }
+
+ /// Check for string equality, ignoring case.
+ bool equals_lower(StringRef RHS) const {
+ return str().equals_lower(RHS);
+ }
+
+ /// Compare two strings; the result is -1, 0, or 1 if this string is
+ /// lexicographically less than, equal to, or greater than the \p RHS.
+ int compare(StringRef RHS) const {
+ return str().compare(RHS);
+ }
+
+ /// compare_lower - Compare two strings, ignoring case.
+ int compare_lower(StringRef RHS) const {
+ return str().compare_lower(RHS);
+ }
+
+ /// compare_numeric - Compare two strings, treating sequences of digits as
+ /// numbers.
+ int compare_numeric(StringRef RHS) const {
+ return str().compare_numeric(RHS);
+ }
+
+ /// @}
+ /// @name String Predicates
+ /// @{
+
+ /// startswith - Check if this string starts with the given \p Prefix.
+ bool startswith(StringRef Prefix) const {
+ return str().startswith(Prefix);
+ }
+
+ /// endswith - Check if this string ends with the given \p Suffix.
+ bool endswith(StringRef Suffix) const {
+ return str().endswith(Suffix);
+ }
+
+ /// @}
+ /// @name String Searching
+ /// @{
+
+ /// find - Search for the first character \p C in the string.
+ ///
+ /// \return - The index of the first occurrence of \p C, or npos if not
+ /// found.
+ size_t find(char C, size_t From = 0) const {
+ return str().find(C, From);
+ }
+
+ /// Search for the first string \p Str in the string.
+ ///
+ /// \returns The index of the first occurrence of \p Str, or npos if not
+ /// found.
+ size_t find(StringRef Str, size_t From = 0) const {
+ return str().find(Str, From);
+ }
+
+ /// Search for the last character \p C in the string.
+ ///
+ /// \returns The index of the last occurrence of \p C, or npos if not
+ /// found.
+ size_t rfind(char C, size_t From = StringRef::npos) const {
+ return str().rfind(C, From);
+ }
+
+ /// Search for the last string \p Str in the string.
+ ///
+ /// \returns The index of the last occurrence of \p Str, or npos if not
+ /// found.
+ size_t rfind(StringRef Str) const {
+ return str().rfind(Str);
+ }
+
+ /// Find the first character in the string that is \p C, or npos if not
+ /// found. Same as find.
+ size_t find_first_of(char C, size_t From = 0) const {
+ return str().find_first_of(C, From);
+ }
+
+ /// Find the first character in the string that is in \p Chars, or npos if
+ /// not found.
+ ///
+ /// Complexity: O(size() + Chars.size())
+ size_t find_first_of(StringRef Chars, size_t From = 0) const {
+ return str().find_first_of(Chars, From);
+ }
+
+ /// Find the first character in the string that is not \p C or npos if not
+ /// found.
+ size_t find_first_not_of(char C, size_t From = 0) const {
+ return str().find_first_not_of(C, From);
+ }
+
+ /// Find the first character in the string that is not in the string
+ /// \p Chars, or npos if not found.
+ ///
+ /// Complexity: O(size() + Chars.size())
+ size_t find_first_not_of(StringRef Chars, size_t From = 0) const {
+ return str().find_first_not_of(Chars, From);
+ }
+
+ /// Find the last character in the string that is \p C, or npos if not
+ /// found.
+ size_t find_last_of(char C, size_t From = StringRef::npos) const {
+ return str().find_last_of(C, From);
+ }
+
+ /// Find the last character in the string that is in \p Chars, or npos if not
+ /// found.
+ ///
+ /// Complexity: O(size() + Chars.size())
+ size_t find_last_of(
+ StringRef Chars, size_t From = StringRef::npos) const {
+ return str().find_last_of(Chars, From);
+ }
+
+ /// @}
+ /// @name Helpful Algorithms
+ /// @{
+
+ /// Return the number of occurrences of \p C in the string.
+ size_t count(char C) const {
+ return str().count(C);
+ }
+
+ /// Return the number of non-overlapped occurrences of \p Str in the
+ /// string.
+ size_t count(StringRef Str) const {
+ return str().count(Str);
+ }
+
+ /// @}
+ /// @name Substring Operations
+ /// @{
+
+ /// Return a reference to the substring from [Start, Start + N).
+ ///
+ /// \param Start The index of the starting character in the substring; if
+ /// the index is npos or greater than the length of the string then the
+ /// empty substring will be returned.
+ ///
+ /// \param N The number of characters to include in the substring. If \p N
+ /// exceeds the number of characters remaining in the string, the string
+ /// suffix (starting with \p Start) will be returned.
+ StringRef substr(size_t Start, size_t N = StringRef::npos) const {
+ return str().substr(Start, N);
+ }
+
+ /// Return a reference to the substring from [Start, End).
+ ///
+ /// \param Start The index of the starting character in the substring; if
+ /// the index is npos or greater than the length of the string then the
+ /// empty substring will be returned.
+ ///
+ /// \param End The index following the last character to include in the
+ /// substring. If this is npos, or less than \p Start, or exceeds the
+ /// number of characters remaining in the string, the string suffix
+ /// (starting with \p Start) will be returned.
+ StringRef slice(size_t Start, size_t End) const {
+ return str().slice(Start, End);
+ }
+
+ // Extra methods.
+
+ /// Explicit conversion to StringRef.
+ StringRef str() const { return StringRef(this->begin(), this->size()); }
+
+ // TODO: Make this const, if it's safe...
+ const char* c_str() {
+ this->push_back(0);
+ this->pop_back();
+ return this->data();
+ }
+
+ /// Implicit conversion to StringRef.
+ operator StringRef() const { return str(); }
+
+ // Extra operators.
+ const SmallString &operator=(StringRef RHS) {
+ this->clear();
+ return *this += RHS;
+ }
+
+ SmallString &operator+=(StringRef RHS) {
+ this->append(RHS.begin(), RHS.end());
+ return *this;
+ }
+ SmallString &operator+=(char C) {
+ this->push_back(C);
+ return *this;
+ }
+};
+
+}
+
+#endif
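
A usage sketch for SmallString, illustrative only; the inline size of 64 and the contents are arbitrary, and linking assumes the out-of-line SmallVector support code imported alongside these headers.

    // Sketch only: SmallString as a stack-friendly string builder.
    #include "llvm/ADT/SmallString.h"
    #include "llvm/ADT/StringRef.h"
    #include <cstdio>

    int main() {
      llvm::SmallString<64> Path("foo");    // 64 bytes of inline storage
      Path += '/';                          // append a single char
      Path.append("bar");                   // append a StringRef
      llvm::StringRef View = Path.str();    // non-owning view of the buffer
      std::printf("%s (%zu chars)\n", Path.c_str(), View.size()); // foo/bar (7 chars)
      return Path.startswith("foo") && Path.endswith("bar") ? 0 : 1;
    }
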
diff --git a/ext/include/llvm/ADT/SmallVector.h b/ext/include/llvm/ADT/SmallVector.h
new file mode 100644
index 0000000..d1062ac
--- /dev/null
+++ b/ext/include/llvm/ADT/SmallVector.h
@@ -0,0 +1,954 @@
+//===- llvm/ADT/SmallVector.h - 'Normally small' vectors --------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the SmallVector class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_ADT_SMALLVECTOR_H
+#define LLVM_ADT_SMALLVECTOR_H
+
+#include "llvm/ADT/iterator_range.h"
+#include "llvm/Support/AlignOf.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/type_traits.h"
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdlib>
+#include <cstring>
+#include <initializer_list>
+#include <iterator>
+#include <memory>
+
+namespace llvm {
+
+/// This is all the non-templated stuff common to all SmallVectors.
+class SmallVectorBase {
+protected:
+ void *BeginX, *EndX, *CapacityX;
+
+protected:
+ SmallVectorBase(void *FirstEl, size_t Size)
+ : BeginX(FirstEl), EndX(FirstEl), CapacityX((char*)FirstEl+Size) {}
+
+ /// This is an implementation of the grow() method which only works
+ /// on POD-like data types and is out of line to reduce code duplication.
+ void grow_pod(void *FirstEl, size_t MinSizeInBytes, size_t TSize);
+
+public:
+ /// This returns size()*sizeof(T).
+ size_t size_in_bytes() const {
+ return size_t((char*)EndX - (char*)BeginX);
+ }
+
+ /// capacity_in_bytes - This returns capacity()*sizeof(T).
+ size_t capacity_in_bytes() const {
+ return size_t((char*)CapacityX - (char*)BeginX);
+ }
+
+ bool LLVM_ATTRIBUTE_UNUSED_RESULT empty() const { return BeginX == EndX; }
+};
+
+template <typename T, unsigned N> struct SmallVectorStorage;
+
+/// This is the part of SmallVectorTemplateBase which does not depend on whether
+/// the type T is a POD. The extra dummy template argument is used by ArrayRef
+/// to avoid unnecessarily requiring T to be complete.
+template <typename T, typename = void>
+class SmallVectorTemplateCommon : public SmallVectorBase {
+private:
+ template <typename, unsigned> friend struct SmallVectorStorage;
+
+ // Allocate raw space for N elements of type T. If T has a ctor or dtor, we
+ // don't want it to be automatically run, so we need to represent the space as
+ // something else. Use an array of char of sufficient alignment.
+ typedef llvm::AlignedCharArrayUnion<T> U;
+ U FirstEl;
+ // Space after 'FirstEl' is clobbered, do not add any instance vars after it.
+
+protected:
+ SmallVectorTemplateCommon(size_t Size) : SmallVectorBase(&FirstEl, Size) {}
+
+ void grow_pod(size_t MinSizeInBytes, size_t TSize) {
+ SmallVectorBase::grow_pod(&FirstEl, MinSizeInBytes, TSize);
+ }
+
+ /// Return true if this is a smallvector which has not had dynamic
+ /// memory allocated for it.
+ bool isSmall() const {
+ return BeginX == static_cast<const void*>(&FirstEl);
+ }
+
+ /// Put this vector in a state of being small.
+ void resetToSmall() {
+ BeginX = EndX = CapacityX = &FirstEl;
+ }
+
+ void setEnd(T *P) { this->EndX = P; }
+public:
+ typedef size_t size_type;
+ typedef ptrdiff_t difference_type;
+ typedef T value_type;
+ typedef T *iterator;
+ typedef const T *const_iterator;
+
+ typedef std::reverse_iterator<const_iterator> const_reverse_iterator;
+ typedef std::reverse_iterator<iterator> reverse_iterator;
+
+ typedef T &reference;
+ typedef const T &const_reference;
+ typedef T *pointer;
+ typedef const T *const_pointer;
+
+ // forward iterator creation methods.
+ LLVM_ATTRIBUTE_ALWAYS_INLINE
+ iterator begin() { return (iterator)this->BeginX; }
+ LLVM_ATTRIBUTE_ALWAYS_INLINE
+ const_iterator begin() const { return (const_iterator)this->BeginX; }
+ LLVM_ATTRIBUTE_ALWAYS_INLINE
+ iterator end() { return (iterator)this->EndX; }
+ LLVM_ATTRIBUTE_ALWAYS_INLINE
+ const_iterator end() const { return (const_iterator)this->EndX; }
+protected:
+ iterator capacity_ptr() { return (iterator)this->CapacityX; }
+ const_iterator capacity_ptr() const { return (const_iterator)this->CapacityX;}
+public:
+
+ // reverse iterator creation methods.
+ reverse_iterator rbegin() { return reverse_iterator(end()); }
+ const_reverse_iterator rbegin() const{ return const_reverse_iterator(end()); }
+ reverse_iterator rend() { return reverse_iterator(begin()); }
+ const_reverse_iterator rend() const { return const_reverse_iterator(begin());}
+
+ LLVM_ATTRIBUTE_ALWAYS_INLINE
+ size_type size() const { return end()-begin(); }
+ size_type max_size() const { return size_type(-1) / sizeof(T); }
+
+ /// Return the total number of elements in the currently allocated buffer.
+ size_t capacity() const { return capacity_ptr() - begin(); }
+
+ /// Return a pointer to the vector's buffer, even if empty().
+ pointer data() { return pointer(begin()); }
+ /// Return a pointer to the vector's buffer, even if empty().
+ const_pointer data() const { return const_pointer(begin()); }
+
+ LLVM_ATTRIBUTE_ALWAYS_INLINE
+ reference operator[](size_type idx) {
+ assert(idx < size());
+ return begin()[idx];
+ }
+ LLVM_ATTRIBUTE_ALWAYS_INLINE
+ const_reference operator[](size_type idx) const {
+ assert(idx < size());
+ return begin()[idx];
+ }
+
+ reference front() {
+ assert(!empty());
+ return begin()[0];
+ }
+ const_reference front() const {
+ assert(!empty());
+ return begin()[0];
+ }
+
+ reference back() {
+ assert(!empty());
+ return end()[-1];
+ }
+ const_reference back() const {
+ assert(!empty());
+ return end()[-1];
+ }
+};
+
+/// SmallVectorTemplateBase<isPodLike = false> - This is where we put method
+/// implementations that are designed to work with non-POD-like T's.
+template <typename T, bool isPodLike>
+class SmallVectorTemplateBase : public SmallVectorTemplateCommon<T> {
+protected:
+ SmallVectorTemplateBase(size_t Size) : SmallVectorTemplateCommon<T>(Size) {}
+
+ static void destroy_range(T *S, T *E) {
+ while (S != E) {
+ --E;
+ E->~T();
+ }
+ }
+
+ /// Use move-assignment to move the range [I, E) onto the
+ /// objects starting with "Dest". This is just <algorithm>'s
+ /// std::move, but not all stdlibs actually provide that.
+ template<typename It1, typename It2>
+ static It2 move(It1 I, It1 E, It2 Dest) {
+ for (; I != E; ++I, ++Dest)
+ *Dest = ::std::move(*I);
+ return Dest;
+ }
+
+ /// Use move-assignment to move the range
+ /// [I, E) onto the objects ending at "Dest", moving objects
+ /// in reverse order. This is just <algorithm>'s
+ /// std::move_backward, but not all stdlibs actually provide that.
+ template<typename It1, typename It2>
+ static It2 move_backward(It1 I, It1 E, It2 Dest) {
+ while (I != E)
+ *--Dest = ::std::move(*--E);
+ return Dest;
+ }
+
+ /// Move the range [I, E) into the uninitialized memory starting with "Dest",
+ /// constructing elements as needed.
+ template<typename It1, typename It2>
+ static void uninitialized_move(It1 I, It1 E, It2 Dest) {
+ for (; I != E; ++I, ++Dest)
+ ::new ((void*) &*Dest) T(::std::move(*I));
+ }
+
+ /// Copy the range [I, E) onto the uninitialized memory starting with "Dest",
+ /// constructing elements as needed.
+ template<typename It1, typename It2>
+ static void uninitialized_copy(It1 I, It1 E, It2 Dest) {
+ std::uninitialized_copy(I, E, Dest);
+ }
+
+ /// Grow the allocated memory (without initializing new elements), doubling
+ /// the size of the allocated memory. Guarantees space for at least one more
+ /// element, or MinSize more elements if specified.
+ void grow(size_t MinSize = 0);
+
+public:
+ void push_back(const T &Elt) {
+ if (LLVM_UNLIKELY(this->EndX >= this->CapacityX))
+ this->grow();
+ ::new ((void*) this->end()) T(Elt);
+ this->setEnd(this->end()+1);
+ }
+
+ void push_back(T &&Elt) {
+ if (LLVM_UNLIKELY(this->EndX >= this->CapacityX))
+ this->grow();
+ ::new ((void*) this->end()) T(::std::move(Elt));
+ this->setEnd(this->end()+1);
+ }
+
+ void pop_back() {
+ this->setEnd(this->end()-1);
+ this->end()->~T();
+ }
+};
+
+// Define this out-of-line to dissuade the C++ compiler from inlining it.
+template <typename T, bool isPodLike>
+void SmallVectorTemplateBase<T, isPodLike>::grow(size_t MinSize) {
+ size_t CurCapacity = this->capacity();
+ size_t CurSize = this->size();
+ // Always grow, even from zero.
+ size_t NewCapacity = size_t(NextPowerOf2(CurCapacity+2));
+ if (NewCapacity < MinSize)
+ NewCapacity = MinSize;
+ T *NewElts = static_cast<T*>(malloc(NewCapacity*sizeof(T)));
+
+ // Move the elements over.
+ this->uninitialized_move(this->begin(), this->end(), NewElts);
+
+ // Destroy the original elements.
+ destroy_range(this->begin(), this->end());
+
+ // If this wasn't grown from the inline copy, deallocate the old space.
+ if (!this->isSmall())
+ free(this->begin());
+
+ this->setEnd(NewElts+CurSize);
+ this->BeginX = NewElts;
+ this->CapacityX = this->begin()+NewCapacity;
+}
+
+
+/// SmallVectorTemplateBase<isPodLike = true> - This is where we put method
+/// implementations that are designed to work with POD-like T's.
+template <typename T>
+class SmallVectorTemplateBase<T, true> : public SmallVectorTemplateCommon<T> {
+protected:
+ SmallVectorTemplateBase(size_t Size) : SmallVectorTemplateCommon<T>(Size) {}
+
+ // No need to do a destroy loop for POD's.
+ static void destroy_range(T *, T *) {}
+
+ /// Use move-assignment to move the range [I, E) onto the
+ /// objects starting with "Dest". For PODs, this is just memcpy.
+ template<typename It1, typename It2>
+ static It2 move(It1 I, It1 E, It2 Dest) {
+ return ::std::copy(I, E, Dest);
+ }
+
+ /// Use move-assignment to move the range [I, E) onto the objects ending at
+ /// "Dest", moving objects in reverse order.
+ template<typename It1, typename It2>
+ static It2 move_backward(It1 I, It1 E, It2 Dest) {
+ return ::std::copy_backward(I, E, Dest);
+ }
+
+ /// Move the range [I, E) onto the uninitialized memory
+ /// starting with "Dest", constructing elements into it as needed.
+ template<typename It1, typename It2>
+ static void uninitialized_move(It1 I, It1 E, It2 Dest) {
+ // Just do a copy.
+ uninitialized_copy(I, E, Dest);
+ }
+
+ /// Copy the range [I, E) onto the uninitialized memory
+ /// starting with "Dest", constructing elements into it as needed.
+ template<typename It1, typename It2>
+ static void uninitialized_copy(It1 I, It1 E, It2 Dest) {
+ // Arbitrary iterator types; just use the basic implementation.
+ std::uninitialized_copy(I, E, Dest);
+ }
+
+ /// Copy the range [I, E) onto the uninitialized memory
+ /// starting with "Dest", constructing elements into it as needed.
+ template <typename T1, typename T2>
+ static void uninitialized_copy(
+ T1 *I, T1 *E, T2 *Dest,
+ typename std::enable_if<std::is_same<typename std::remove_const<T1>::type,
+ T2>::value>::type * = nullptr) {
+ // Use memcpy for PODs iterated by pointers (which includes SmallVector
+ // iterators): std::uninitialized_copy optimizes to memmove, but we can
+ // use memcpy here. Note that I and E are iterators and thus might be
+ // invalid for memcpy if they are equal.
+ if (I != E)
+ memcpy(Dest, I, (E - I) * sizeof(T));
+ }
+
+ /// Double the size of the allocated memory, guaranteeing space for at
+ /// least one more element or MinSize if specified.
+ void grow(size_t MinSize = 0) {
+ this->grow_pod(MinSize*sizeof(T), sizeof(T));
+ }
+public:
+ void push_back(const T &Elt) {
+ if (LLVM_UNLIKELY(this->EndX >= this->CapacityX))
+ this->grow();
+ memcpy(this->end(), &Elt, sizeof(T));
+ this->setEnd(this->end()+1);
+ }
+
+ void pop_back() {
+ this->setEnd(this->end()-1);
+ }
+};
+
+
+/// This class consists of common code factored out of the SmallVector class to
+/// reduce code duplication based on the SmallVector 'N' template parameter.
+template <typename T>
+class SmallVectorImpl : public SmallVectorTemplateBase<T, isPodLike<T>::value> {
+ typedef SmallVectorTemplateBase<T, isPodLike<T>::value > SuperClass;
+
+ SmallVectorImpl(const SmallVectorImpl&) = delete;
+public:
+ typedef typename SuperClass::iterator iterator;
+ typedef typename SuperClass::size_type size_type;
+
+protected:
+ // Default ctor - Initialize to empty.
+ explicit SmallVectorImpl(unsigned N)
+ : SmallVectorTemplateBase<T, isPodLike<T>::value>(N*sizeof(T)) {
+ }
+
+public:
+ ~SmallVectorImpl() {
+ // Destroy the constructed elements in the vector.
+ this->destroy_range(this->begin(), this->end());
+
+ // If this wasn't grown from the inline copy, deallocate the old space.
+ if (!this->isSmall())
+ free(this->begin());
+ }
+
+
+ void clear() {
+ this->destroy_range(this->begin(), this->end());
+ this->EndX = this->BeginX;
+ }
+
+ void resize(size_type N) {
+ if (N < this->size()) {
+ this->destroy_range(this->begin()+N, this->end());
+ this->setEnd(this->begin()+N);
+ } else if (N > this->size()) {
+ if (this->capacity() < N)
+ this->grow(N);
+ for (auto I = this->end(), E = this->begin() + N; I != E; ++I)
+ new (&*I) T();
+ this->setEnd(this->begin()+N);
+ }
+ }
+
+ void resize(size_type N, const T &NV) {
+ if (N < this->size()) {
+ this->destroy_range(this->begin()+N, this->end());
+ this->setEnd(this->begin()+N);
+ } else if (N > this->size()) {
+ if (this->capacity() < N)
+ this->grow(N);
+ std::uninitialized_fill(this->end(), this->begin()+N, NV);
+ this->setEnd(this->begin()+N);
+ }
+ }
+
+ void reserve(size_type N) {
+ if (this->capacity() < N)
+ this->grow(N);
+ }
+
+ T LLVM_ATTRIBUTE_UNUSED_RESULT pop_back_val() {
+ T Result = ::std::move(this->back());
+ this->pop_back();
+ return Result;
+ }
+
+ void swap(SmallVectorImpl &RHS);
+
+ /// Add the specified range to the end of the SmallVector.
+ template<typename in_iter>
+ void append(in_iter in_start, in_iter in_end) {
+ size_type NumInputs = std::distance(in_start, in_end);
+ // Grow allocated space if needed.
+ if (NumInputs > size_type(this->capacity_ptr()-this->end()))
+ this->grow(this->size()+NumInputs);
+
+ // Copy the new elements over.
+ this->uninitialized_copy(in_start, in_end, this->end());
+ this->setEnd(this->end() + NumInputs);
+ }
+
+ /// Add the specified range to the end of the SmallVector.
+ void append(size_type NumInputs, const T &Elt) {
+ // Grow allocated space if needed.
+ if (NumInputs > size_type(this->capacity_ptr()-this->end()))
+ this->grow(this->size()+NumInputs);
+
+ // Copy the new elements over.
+ std::uninitialized_fill_n(this->end(), NumInputs, Elt);
+ this->setEnd(this->end() + NumInputs);
+ }
+
+ void append(std::initializer_list<T> IL) {
+ append(IL.begin(), IL.end());
+ }
+
+ void assign(size_type NumElts, const T &Elt) {
+ clear();
+ if (this->capacity() < NumElts)
+ this->grow(NumElts);
+ this->setEnd(this->begin()+NumElts);
+ std::uninitialized_fill(this->begin(), this->end(), Elt);
+ }
+
+ void assign(std::initializer_list<T> IL) {
+ clear();
+ append(IL);
+ }
+
+ iterator erase(iterator I) {
+ assert(I >= this->begin() && "Iterator to erase is out of bounds.");
+ assert(I < this->end() && "Erasing at past-the-end iterator.");
+
+ iterator N = I;
+ // Shift all elts down one.
+ this->move(I+1, this->end(), I);
+ // Drop the last elt.
+ this->pop_back();
+ return(N);
+ }
+
+ iterator erase(iterator S, iterator E) {
+ assert(S >= this->begin() && "Range to erase is out of bounds.");
+ assert(S <= E && "Trying to erase invalid range.");
+ assert(E <= this->end() && "Trying to erase past the end.");
+
+ iterator N = S;
+ // Shift all elts down.
+ iterator I = this->move(E, this->end(), S);
+ // Drop the last elts.
+ this->destroy_range(I, this->end());
+ this->setEnd(I);
+ return(N);
+ }
+
+ iterator insert(iterator I, T &&Elt) {
+ if (I == this->end()) { // Important special case for empty vector.
+ this->push_back(::std::move(Elt));
+ return this->end()-1;
+ }
+
+ assert(I >= this->begin() && "Insertion iterator is out of bounds.");
+ assert(I <= this->end() && "Inserting past the end of the vector.");
+
+ if (this->EndX >= this->CapacityX) {
+ size_t EltNo = I-this->begin();
+ this->grow();
+ I = this->begin()+EltNo;
+ }
+
+ ::new ((void*) this->end()) T(::std::move(this->back()));
+ // Push everything else over.
+ this->move_backward(I, this->end()-1, this->end());
+ this->setEnd(this->end()+1);
+
+ // If we just moved the element we're inserting, be sure to update
+ // the reference.
+ T *EltPtr = &Elt;
+ if (I <= EltPtr && EltPtr < this->EndX)
+ ++EltPtr;
+
+ *I = ::std::move(*EltPtr);
+ return I;
+ }
+
+ iterator insert(iterator I, const T &Elt) {
+ if (I == this->end()) { // Important special case for empty vector.
+ this->push_back(Elt);
+ return this->end()-1;
+ }
+
+ assert(I >= this->begin() && "Insertion iterator is out of bounds.");
+ assert(I <= this->end() && "Inserting past the end of the vector.");
+
+ if (this->EndX >= this->CapacityX) {
+ size_t EltNo = I-this->begin();
+ this->grow();
+ I = this->begin()+EltNo;
+ }
+ ::new ((void*) this->end()) T(std::move(this->back()));
+ // Push everything else over.
+ this->move_backward(I, this->end()-1, this->end());
+ this->setEnd(this->end()+1);
+
+ // If we just moved the element we're inserting, be sure to update
+ // the reference.
+ const T *EltPtr = &Elt;
+ if (I <= EltPtr && EltPtr < this->EndX)
+ ++EltPtr;
+
+ *I = *EltPtr;
+ return I;
+ }
+
+ iterator insert(iterator I, size_type NumToInsert, const T &Elt) {
+ // Convert iterator to elt# to avoid invalidating iterator when we reserve()
+ size_t InsertElt = I - this->begin();
+
+ if (I == this->end()) { // Important special case for empty vector.
+ append(NumToInsert, Elt);
+ return this->begin()+InsertElt;
+ }
+
+ assert(I >= this->begin() && "Insertion iterator is out of bounds.");
+ assert(I <= this->end() && "Inserting past the end of the vector.");
+
+ // Ensure there is enough space.
+ reserve(this->size() + NumToInsert);
+
+ // Uninvalidate the iterator.
+ I = this->begin()+InsertElt;
+
+ // If there are more elements between the insertion point and the end of the
+ // range than there are being inserted, we can use a simple approach to
+ // insertion. Since we already reserved space, we know that this won't
+ // reallocate the vector.
+ if (size_t(this->end()-I) >= NumToInsert) {
+ T *OldEnd = this->end();
+ append(std::move_iterator<iterator>(this->end() - NumToInsert),
+ std::move_iterator<iterator>(this->end()));
+
+ // Copy the existing elements that get replaced.
+ this->move_backward(I, OldEnd-NumToInsert, OldEnd);
+
+ std::fill_n(I, NumToInsert, Elt);
+ return I;
+ }
+
+ // Otherwise, we're inserting more elements than exist already, and we're
+ // not inserting at the end.
+
+ // Move over the elements that we're about to overwrite.
+ T *OldEnd = this->end();
+ this->setEnd(this->end() + NumToInsert);
+ size_t NumOverwritten = OldEnd-I;
+ this->uninitialized_move(I, OldEnd, this->end()-NumOverwritten);
+
+ // Replace the overwritten part.
+ std::fill_n(I, NumOverwritten, Elt);
+
+ // Insert the non-overwritten middle part.
+ std::uninitialized_fill_n(OldEnd, NumToInsert-NumOverwritten, Elt);
+ return I;
+ }
+
+ template<typename ItTy>
+ iterator insert(iterator I, ItTy From, ItTy To) {
+ // Convert iterator to elt# to avoid invalidating iterator when we reserve()
+ size_t InsertElt = I - this->begin();
+
+ if (I == this->end()) { // Important special case for empty vector.
+ append(From, To);
+ return this->begin()+InsertElt;
+ }
+
+ assert(I >= this->begin() && "Insertion iterator is out of bounds.");
+ assert(I <= this->end() && "Inserting past the end of the vector.");
+
+ size_t NumToInsert = std::distance(From, To);
+
+ // Ensure there is enough space.
+ reserve(this->size() + NumToInsert);
+
+ // Uninvalidate the iterator.
+ I = this->begin()+InsertElt;
+
+ // If there are more elements between the insertion point and the end of the
+ // range than there are being inserted, we can use a simple approach to
+ // insertion. Since we already reserved space, we know that this won't
+ // reallocate the vector.
+ if (size_t(this->end()-I) >= NumToInsert) {
+ T *OldEnd = this->end();
+ append(std::move_iterator<iterator>(this->end() - NumToInsert),
+ std::move_iterator<iterator>(this->end()));
+
+ // Copy the existing elements that get replaced.
+ this->move_backward(I, OldEnd-NumToInsert, OldEnd);
+
+ std::copy(From, To, I);
+ return I;
+ }
+
+ // Otherwise, we're inserting more elements than exist already, and we're
+ // not inserting at the end.
+
+ // Move over the elements that we're about to overwrite.
+ T *OldEnd = this->end();
+ this->setEnd(this->end() + NumToInsert);
+ size_t NumOverwritten = OldEnd-I;
+ this->uninitialized_move(I, OldEnd, this->end()-NumOverwritten);
+
+ // Replace the overwritten part.
+ for (T *J = I; NumOverwritten > 0; --NumOverwritten) {
+ *J = *From;
+ ++J; ++From;
+ }
+
+ // Insert the non-overwritten middle part.
+ this->uninitialized_copy(From, To, OldEnd);
+ return I;
+ }
+
+ void insert(iterator I, std::initializer_list<T> IL) {
+ insert(I, IL.begin(), IL.end());
+ }
+
+ template <typename... ArgTypes> void emplace_back(ArgTypes &&... Args) {
+ if (LLVM_UNLIKELY(this->EndX >= this->CapacityX))
+ this->grow();
+ ::new ((void *)this->end()) T(std::forward<ArgTypes>(Args)...);
+ this->setEnd(this->end() + 1);
+ }
+
+ SmallVectorImpl &operator=(const SmallVectorImpl &RHS);
+
+ SmallVectorImpl &operator=(SmallVectorImpl &&RHS);
+
+ bool operator==(const SmallVectorImpl &RHS) const {
+ if (this->size() != RHS.size()) return false;
+ return std::equal(this->begin(), this->end(), RHS.begin());
+ }
+ bool operator!=(const SmallVectorImpl &RHS) const {
+ return !(*this == RHS);
+ }
+
+ bool operator<(const SmallVectorImpl &RHS) const {
+ return std::lexicographical_compare(this->begin(), this->end(),
+ RHS.begin(), RHS.end());
+ }
+
+ /// Set the array size to \p N, which the current array must have enough
+ /// capacity for.
+ ///
+ /// This does not construct or destroy any elements in the vector.
+ ///
+ /// Clients can use this in conjunction with capacity() to write past the end
+ /// of the buffer when they know that more elements are available, and only
+ /// update the size later. This avoids the cost of value initializing elements
+ /// which will only be overwritten.
+ void set_size(size_type N) {
+ assert(N <= this->capacity());
+ this->setEnd(this->begin() + N);
+ }
+};
+
+
+template <typename T>
+void SmallVectorImpl<T>::swap(SmallVectorImpl<T> &RHS) {
+ if (this == &RHS) return;
+
+ // We can only avoid copying elements if neither vector is small.
+ if (!this->isSmall() && !RHS.isSmall()) {
+ std::swap(this->BeginX, RHS.BeginX);
+ std::swap(this->EndX, RHS.EndX);
+ std::swap(this->CapacityX, RHS.CapacityX);
+ return;
+ }
+ if (RHS.size() > this->capacity())
+ this->grow(RHS.size());
+ if (this->size() > RHS.capacity())
+ RHS.grow(this->size());
+
+ // Swap the shared elements.
+ size_t NumShared = this->size();
+ if (NumShared > RHS.size()) NumShared = RHS.size();
+ for (size_type i = 0; i != NumShared; ++i)
+ std::swap((*this)[i], RHS[i]);
+
+ // Copy over the extra elts.
+ if (this->size() > RHS.size()) {
+ size_t EltDiff = this->size() - RHS.size();
+ this->uninitialized_copy(this->begin()+NumShared, this->end(), RHS.end());
+ RHS.setEnd(RHS.end()+EltDiff);
+ this->destroy_range(this->begin()+NumShared, this->end());
+ this->setEnd(this->begin()+NumShared);
+ } else if (RHS.size() > this->size()) {
+ size_t EltDiff = RHS.size() - this->size();
+ this->uninitialized_copy(RHS.begin()+NumShared, RHS.end(), this->end());
+ this->setEnd(this->end() + EltDiff);
+ this->destroy_range(RHS.begin()+NumShared, RHS.end());
+ RHS.setEnd(RHS.begin()+NumShared);
+ }
+}
+
+template <typename T>
+SmallVectorImpl<T> &SmallVectorImpl<T>::
+ operator=(const SmallVectorImpl<T> &RHS) {
+ // Avoid self-assignment.
+ if (this == &RHS) return *this;
+
+ // If we already have sufficient space, assign the common elements, then
+ // destroy any excess.
+ size_t RHSSize = RHS.size();
+ size_t CurSize = this->size();
+ if (CurSize >= RHSSize) {
+ // Assign common elements.
+ iterator NewEnd;
+ if (RHSSize)
+ NewEnd = std::copy(RHS.begin(), RHS.begin()+RHSSize, this->begin());
+ else
+ NewEnd = this->begin();
+
+ // Destroy excess elements.
+ this->destroy_range(NewEnd, this->end());
+
+ // Trim.
+ this->setEnd(NewEnd);
+ return *this;
+ }
+
+ // If we have to grow to have enough elements, destroy the current elements.
+ // This allows us to avoid copying them during the grow.
+ // FIXME: don't do this if they're efficiently moveable.
+ if (this->capacity() < RHSSize) {
+ // Destroy current elements.
+ this->destroy_range(this->begin(), this->end());
+ this->setEnd(this->begin());
+ CurSize = 0;
+ this->grow(RHSSize);
+ } else if (CurSize) {
+ // Otherwise, use assignment for the already-constructed elements.
+ std::copy(RHS.begin(), RHS.begin()+CurSize, this->begin());
+ }
+
+ // Copy construct the new elements in place.
+ this->uninitialized_copy(RHS.begin()+CurSize, RHS.end(),
+ this->begin()+CurSize);
+
+ // Set end.
+ this->setEnd(this->begin()+RHSSize);
+ return *this;
+}
+
+template <typename T>
+SmallVectorImpl<T> &SmallVectorImpl<T>::operator=(SmallVectorImpl<T> &&RHS) {
+ // Avoid self-assignment.
+ if (this == &RHS) return *this;
+
+ // If the RHS isn't small, clear this vector and then steal its buffer.
+ if (!RHS.isSmall()) {
+ this->destroy_range(this->begin(), this->end());
+ if (!this->isSmall()) free(this->begin());
+ this->BeginX = RHS.BeginX;
+ this->EndX = RHS.EndX;
+ this->CapacityX = RHS.CapacityX;
+ RHS.resetToSmall();
+ return *this;
+ }
+
+ // If we already have sufficient space, assign the common elements, then
+ // destroy any excess.
+ size_t RHSSize = RHS.size();
+ size_t CurSize = this->size();
+ if (CurSize >= RHSSize) {
+ // Assign common elements.
+ iterator NewEnd = this->begin();
+ if (RHSSize)
+ NewEnd = this->move(RHS.begin(), RHS.end(), NewEnd);
+
+ // Destroy excess elements and trim the bounds.
+ this->destroy_range(NewEnd, this->end());
+ this->setEnd(NewEnd);
+
+ // Clear the RHS.
+ RHS.clear();
+
+ return *this;
+ }
+
+ // If we have to grow to have enough elements, destroy the current elements.
+ // This allows us to avoid copying them during the grow.
+ // FIXME: this may not actually make any sense if we can efficiently move
+ // elements.
+ if (this->capacity() < RHSSize) {
+ // Destroy current elements.
+ this->destroy_range(this->begin(), this->end());
+ this->setEnd(this->begin());
+ CurSize = 0;
+ this->grow(RHSSize);
+ } else if (CurSize) {
+ // Otherwise, use assignment for the already-constructed elements.
+ this->move(RHS.begin(), RHS.begin()+CurSize, this->begin());
+ }
+
+ // Move-construct the new elements in place.
+ this->uninitialized_move(RHS.begin()+CurSize, RHS.end(),
+ this->begin()+CurSize);
+
+ // Set end.
+ this->setEnd(this->begin()+RHSSize);
+
+ RHS.clear();
+ return *this;
+}
+
+/// Storage for the SmallVector elements which aren't contained in
+/// SmallVectorTemplateCommon. There are 'N-1' elements here. The remaining '1'
+/// element is in the base class. This is specialized for the N=1 and N=0 cases
+/// to avoid allocating unnecessary storage.
+template <typename T, unsigned N>
+struct SmallVectorStorage {
+ typename SmallVectorTemplateCommon<T>::U InlineElts[N - 1];
+};
+template <typename T> struct SmallVectorStorage<T, 1> {};
+template <typename T> struct SmallVectorStorage<T, 0> {};
+
+/// This is a 'vector' (really, a variable-sized array), optimized
+/// for the case when the array is small. It contains some number of elements
+/// in-place, which allows it to avoid heap allocation when the actual number of
+/// elements is below that threshold. This allows normal "small" cases to be
+/// fast without losing generality for large inputs.
+///
+/// Note that this does not attempt to be exception safe.
+///
+template <typename T, unsigned N>
+class SmallVector : public SmallVectorImpl<T> {
+ /// Inline space for elements which aren't stored in the base class.
+ SmallVectorStorage<T, N> Storage;
+public:
+ SmallVector() : SmallVectorImpl<T>(N) {
+ }
+
+ explicit SmallVector(size_t Size, const T &Value = T())
+ : SmallVectorImpl<T>(N) {
+ this->assign(Size, Value);
+ }
+
+ template<typename ItTy>
+ SmallVector(ItTy S, ItTy E) : SmallVectorImpl<T>(N) {
+ this->append(S, E);
+ }
+
+ template <typename RangeTy>
+ explicit SmallVector(const llvm::iterator_range<RangeTy> R)
+ : SmallVectorImpl<T>(N) {
+ this->append(R.begin(), R.end());
+ }
+
+ SmallVector(std::initializer_list<T> IL) : SmallVectorImpl<T>(N) {
+ this->assign(IL);
+ }
+
+ SmallVector(const SmallVector &RHS) : SmallVectorImpl<T>(N) {
+ if (!RHS.empty())
+ SmallVectorImpl<T>::operator=(RHS);
+ }
+
+ const SmallVector &operator=(const SmallVector &RHS) {
+ SmallVectorImpl<T>::operator=(RHS);
+ return *this;
+ }
+
+ SmallVector(SmallVector &&RHS) : SmallVectorImpl<T>(N) {
+ if (!RHS.empty())
+ SmallVectorImpl<T>::operator=(::std::move(RHS));
+ }
+
+ const SmallVector &operator=(SmallVector &&RHS) {
+ SmallVectorImpl<T>::operator=(::std::move(RHS));
+ return *this;
+ }
+
+ SmallVector(SmallVectorImpl<T> &&RHS) : SmallVectorImpl<T>(N) {
+ if (!RHS.empty())
+ SmallVectorImpl<T>::operator=(::std::move(RHS));
+ }
+
+ const SmallVector &operator=(SmallVectorImpl<T> &&RHS) {
+ SmallVectorImpl<T>::operator=(::std::move(RHS));
+ return *this;
+ }
+
+ const SmallVector &operator=(std::initializer_list<T> IL) {
+ this->assign(IL);
+ return *this;
+ }
+};
+
+template<typename T, unsigned N>
+static inline size_t capacity_in_bytes(const SmallVector<T, N> &X) {
+ return X.capacity_in_bytes();
+}
+
+} // End llvm namespace
+
+namespace std {
+ /// Implement std::swap in terms of SmallVector swap.
+ template<typename T>
+ inline void
+ swap(llvm::SmallVectorImpl<T> &LHS, llvm::SmallVectorImpl<T> &RHS) {
+ LHS.swap(RHS);
+ }
+
+ /// Implement std::swap in terms of SmallVector swap.
+ template<typename T, unsigned N>
+ inline void
+ swap(llvm::SmallVector<T, N> &LHS, llvm::SmallVector<T, N> &RHS) {
+ LHS.swap(RHS);
+ }
+}
+
+#endif
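
For orientation, here is a minimal usage sketch of the SmallVector interface defined above. The inline size of 4 and the name demoSmallVector are illustrative choices only, not part of the patch; growth to the heap happens automatically once the inline capacity is exhausted.

    #include "llvm/ADT/SmallVector.h"
    #include <cstdio>

    // Keep a handful of ints in inline storage; spill to the heap only when
    // more than 4 elements are pushed.
    static void demoSmallVector() {
      llvm::SmallVector<int, 4> V;                       // inline space for 4 ints
      for (int i = 0; i < 6; ++i)
        V.push_back(i * i);                              // 5th push triggers grow()
      llvm::SmallVector<int, 4> W(V.begin(), V.end());   // iterator-range constructor
      std::printf("%zu elements, capacity %zu\n", V.size(), V.capacity());
      std::printf("copy holds %zu elements\n", W.size());
    }
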
diff --git a/ext/include/llvm/ADT/StringExtras.h b/ext/include/llvm/ADT/StringExtras.h
new file mode 100644
index 0000000..0992f5d
--- /dev/null
+++ b/ext/include/llvm/ADT/StringExtras.h
@@ -0,0 +1,212 @@
+//===-- llvm/ADT/StringExtras.h - Useful string functions -------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains some functions that are useful when dealing with strings.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_ADT_STRINGEXTRAS_H
+#define LLVM_ADT_STRINGEXTRAS_H
+
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/DataTypes.h"
+#include <iterator>
+
+namespace llvm {
+template<typename T> class SmallVectorImpl;
+
+/// hexdigit - Return the hexadecimal character for the
+/// given number \p X (which should be less than 16).
+static inline char hexdigit(unsigned X, bool LowerCase = false) {
+ const char HexChar = LowerCase ? 'a' : 'A';
+ return X < 10 ? '0' + X : HexChar + X - 10;
+}
+
+/// Construct a string ref from a boolean.
+static inline StringRef toStringRef(bool B) {
+ return StringRef(B ? "true" : "false");
+}
+
+/// Interpret the given character \p C as a hexadecimal digit and return its
+/// value.
+///
+/// If \p C is not a valid hex digit, -1U is returned.
+static inline unsigned hexDigitValue(char C) {
+ if (C >= '0' && C <= '9') return C-'0';
+ if (C >= 'a' && C <= 'f') return C-'a'+10U;
+ if (C >= 'A' && C <= 'F') return C-'A'+10U;
+ return -1U;
+}
+
+/// utohex_buffer - Emit the specified number into the buffer specified by
+/// BufferEnd, returning a pointer to the start of the string. This can be used
+/// like this (note that the buffer must be large enough to handle any number):
+/// char Buffer[40];
+/// printf("0x%s", utohex_buffer(X, Buffer+40));
+///
+/// This should only be used with unsigned types.
+///
+template<typename IntTy>
+static inline char *utohex_buffer(IntTy X, char *BufferEnd, bool LowerCase = false) {
+ char *BufPtr = BufferEnd;
+ *--BufPtr = 0; // Null terminate buffer.
+ if (X == 0) {
+ *--BufPtr = '0'; // Handle special case.
+ return BufPtr;
+ }
+
+ while (X) {
+ unsigned char Mod = static_cast<unsigned char>(X) & 15;
+ *--BufPtr = hexdigit(Mod, LowerCase);
+ X >>= 4;
+ }
+ return BufPtr;
+}
+
+static inline std::string utohexstr(uint64_t X, bool LowerCase = false) {
+ char Buffer[17];
+ return utohex_buffer(X, Buffer+17, LowerCase);
+}
+
+static inline std::string utostr_32(uint32_t X, bool isNeg = false) {
+ char Buffer[11];
+ char *BufPtr = Buffer+11;
+
+ if (X == 0) *--BufPtr = '0'; // Handle special case...
+
+ while (X) {
+ *--BufPtr = '0' + char(X % 10);
+ X /= 10;
+ }
+
+ if (isNeg) *--BufPtr = '-'; // Add negative sign...
+
+ return std::string(BufPtr, Buffer+11);
+}
+
+static inline std::string utostr(uint64_t X, bool isNeg = false) {
+ char Buffer[21];
+ char *BufPtr = Buffer+21;
+
+ if (X == 0) *--BufPtr = '0'; // Handle special case...
+
+ while (X) {
+ *--BufPtr = '0' + char(X % 10);
+ X /= 10;
+ }
+
+ if (isNeg) *--BufPtr = '-'; // Add negative sign...
+ return std::string(BufPtr, Buffer+21);
+}
+
+
+static inline std::string itostr(int64_t X) {
+ if (X < 0)
+ return utostr(static_cast<uint64_t>(-X), true);
+ else
+ return utostr(static_cast<uint64_t>(X));
+}
+
+/// StrInStrNoCase - Portable version of strcasestr. Locates the first
+/// occurrence of string 's2' in string 's1', ignoring case. Returns
+/// the offset of s2 in s1, or npos if s2 cannot be found.
+StringRef::size_type StrInStrNoCase(StringRef s1, StringRef s2);
+
+/// getToken - This function extracts one token from source, ignoring any
+/// leading characters that appear in the Delimiters string, and ending the
+/// token at any of the characters that appear in the Delimiters string. If
+/// there are no tokens in the source string, an empty string is returned.
+/// The function returns a pair containing the extracted token and the
+/// remaining tail string.
+std::pair<StringRef, StringRef> getToken(StringRef Source,
+ StringRef Delimiters = " \t\n\v\f\r");
+
+/// SplitString - Split up the specified string according to the specified
+/// delimiters, appending the result fragments to the output list.
+void SplitString(StringRef Source,
+ SmallVectorImpl<StringRef> &OutFragments,
+ StringRef Delimiters = " \t\n\v\f\r");
+
+/// HashString - Hash function for strings.
+///
+/// This is the Bernstein hash function.
+//
+// FIXME: Investigate whether a modified bernstein hash function performs
+// better: http://eternallyconfuzzled.com/tuts/algorithms/jsw_tut_hashing.aspx
+// X*33+c -> X*33^c
+static inline unsigned HashString(StringRef Str, unsigned Result = 0) {
+ for (StringRef::size_type i = 0, e = Str.size(); i != e; ++i)
+ Result = Result * 33 + (unsigned char)Str[i];
+ return Result;
+}
+
+/// Returns the English suffix for an ordinal integer (-st, -nd, -rd, -th).
+static inline StringRef getOrdinalSuffix(unsigned Val) {
+ // It is critically important that we do this perfectly for
+ // user-written sequences with over 100 elements.
+ switch (Val % 100) {
+ case 11:
+ case 12:
+ case 13:
+ return "th";
+ default:
+ switch (Val % 10) {
+ case 1: return "st";
+ case 2: return "nd";
+ case 3: return "rd";
+ default: return "th";
+ }
+ }
+}
+
+template <typename IteratorT>
+inline std::string join_impl(IteratorT Begin, IteratorT End,
+ StringRef Separator, std::input_iterator_tag) {
+ std::string S;
+ if (Begin == End)
+ return S;
+
+ S += (*Begin);
+ while (++Begin != End) {
+ S += Separator;
+ S += (*Begin);
+ }
+ return S;
+}
+
+template <typename IteratorT>
+inline std::string join_impl(IteratorT Begin, IteratorT End,
+ StringRef Separator, std::forward_iterator_tag) {
+ std::string S;
+ if (Begin == End)
+ return S;
+
+ size_t Len = (std::distance(Begin, End) - 1) * Separator.size();
+ for (IteratorT I = Begin; I != End; ++I)
+ Len += (*I).size(); // accumulate each element's length for the reserve() below
+ S.reserve(Len);
+ S += (*Begin);
+ while (++Begin != End) {
+ S += Separator;
+ S += (*Begin);
+ }
+ return S;
+}
+
+/// Joins the strings in the range [Begin, End), adding Separator between
+/// the elements.
+template <typename IteratorT>
+inline std::string join(IteratorT Begin, IteratorT End, StringRef Separator) {
+ typedef typename std::iterator_traits<IteratorT>::iterator_category tag;
+ return join_impl(Begin, End, Separator, tag());
+}
+
+} // End llvm namespace
+
+#endif
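
A short, illustrative sketch of how the helpers above compose; the literal values and the name demoStringExtras are not taken from the patch.

    #include "llvm/ADT/SmallVector.h"
    #include "llvm/ADT/StringExtras.h"
    #include <cstdio>

    static void demoStringExtras() {
      // Integer formatting helpers.
      std::string Hex = llvm::utohexstr(48879, /*LowerCase=*/true);  // "beef"
      std::string Dec = llvm::itostr(-42);                           // "-42"

      // English ordinal suffix: 1 -> "st", 22 -> "nd", 111 -> "th".
      std::printf("111%s\n", llvm::getOrdinalSuffix(111).str().c_str());

      // join() concatenates a range of strings with a separator.
      llvm::SmallVector<std::string, 3> Parts = {"a", "b", "c"};
      std::string Joined = llvm::join(Parts.begin(), Parts.end(), ", ");  // "a, b, c"
      std::printf("%s %s %s\n", Hex.c_str(), Dec.c_str(), Joined.c_str());
    }
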
diff --git a/ext/include/llvm/ADT/StringMap.h b/ext/include/llvm/ADT/StringMap.h
new file mode 100644
index 0000000..700bb9e
--- /dev/null
+++ b/ext/include/llvm/ADT/StringMap.h
@@ -0,0 +1,463 @@
+//===--- StringMap.h - String Hash table map interface ----------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the StringMap class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_ADT_STRINGMAP_H
+#define LLVM_ADT_STRINGMAP_H
+
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/Allocator.h"
+#include <cstring>
+#include <utility>
+
+namespace llvm {
+ template<typename ValueT>
+ class StringMapConstIterator;
+ template<typename ValueT>
+ class StringMapIterator;
+ template<typename ValueTy>
+ class StringMapEntry;
+
+/// StringMapEntryBase - Shared base class of StringMapEntry instances.
+class StringMapEntryBase {
+ unsigned StrLen;
+
+public:
+ explicit StringMapEntryBase(unsigned Len) : StrLen(Len) {}
+
+ unsigned getKeyLength() const { return StrLen; }
+};
+
+/// StringMapImpl - This is the base class of StringMap that is shared among
+/// all of its instantiations.
+class StringMapImpl {
+protected:
+ // Array of NumBuckets pointers to entries; null pointers are holes.
+ // TheTable[NumBuckets] contains a sentinel value for easy iteration. It is
+ // followed by an array of the actual hash values as unsigned integers.
+ StringMapEntryBase **TheTable;
+ unsigned NumBuckets;
+ unsigned NumItems;
+ unsigned NumTombstones;
+ unsigned ItemSize;
+
+protected:
+ explicit StringMapImpl(unsigned itemSize)
+ : TheTable(nullptr),
+ // Initialize the map with zero buckets to defer allocation.
+ NumBuckets(0), NumItems(0), NumTombstones(0), ItemSize(itemSize) {}
+ StringMapImpl(StringMapImpl &&RHS)
+ : TheTable(RHS.TheTable), NumBuckets(RHS.NumBuckets),
+ NumItems(RHS.NumItems), NumTombstones(RHS.NumTombstones),
+ ItemSize(RHS.ItemSize) {
+ RHS.TheTable = nullptr;
+ RHS.NumBuckets = 0;
+ RHS.NumItems = 0;
+ RHS.NumTombstones = 0;
+ }
+
+ StringMapImpl(unsigned InitSize, unsigned ItemSize);
+ unsigned RehashTable(unsigned BucketNo = 0);
+
+ /// LookupBucketFor - Look up the bucket that the specified string should end
+ /// up in. If it already exists as a key in the map, the Item pointer for the
+ /// specified bucket will be non-null. Otherwise, it will be null. In either
+ /// case, the FullHashValue field of the bucket will be set to the hash value
+ /// of the string.
+ unsigned LookupBucketFor(StringRef Key);
+
+ /// FindKey - Look up the bucket that contains the specified key. If it exists
+ /// in the map, return the bucket number of the key. Otherwise return -1.
+ /// This does not modify the map.
+ int FindKey(StringRef Key) const;
+
+ /// RemoveKey - Remove the specified StringMapEntry from the table, but do not
+ /// delete it. This aborts if the value isn't in the table.
+ void RemoveKey(StringMapEntryBase *V);
+
+ /// RemoveKey - Remove the StringMapEntry for the specified key from the
+ /// table, returning it. If the key is not in the table, this returns null.
+ StringMapEntryBase *RemoveKey(StringRef Key);
+
+private:
+ void init(unsigned Size);
+
+public:
+ static StringMapEntryBase *getTombstoneVal() {
+ return (StringMapEntryBase*)-1;
+ }
+
+ unsigned getNumBuckets() const { return NumBuckets; }
+ unsigned getNumItems() const { return NumItems; }
+
+ bool empty() const { return NumItems == 0; }
+ unsigned size() const { return NumItems; }
+
+ void swap(StringMapImpl &Other) {
+ std::swap(TheTable, Other.TheTable);
+ std::swap(NumBuckets, Other.NumBuckets);
+ std::swap(NumItems, Other.NumItems);
+ std::swap(NumTombstones, Other.NumTombstones);
+ }
+};
+
+/// StringMapEntry - This is used to represent one value that is inserted into
+/// a StringMap. It contains the Value itself and the key: the string length
+/// and data.
+template<typename ValueTy>
+class StringMapEntry : public StringMapEntryBase {
+ StringMapEntry(StringMapEntry &E) = delete;
+
+public:
+ ValueTy second;
+
+ explicit StringMapEntry(unsigned strLen)
+ : StringMapEntryBase(strLen), second() {}
+ template <class InitTy>
+ StringMapEntry(unsigned strLen, InitTy &&V)
+ : StringMapEntryBase(strLen), second(std::forward<InitTy>(V)) {}
+
+ StringRef getKey() const {
+ return StringRef(getKeyData(), getKeyLength());
+ }
+
+ const ValueTy &getValue() const { return second; }
+ ValueTy &getValue() { return second; }
+
+ void setValue(const ValueTy &V) { second = V; }
+
+ /// getKeyData - Return the start of the string data that is the key for this
+ /// value. The string data is always stored immediately after the
+ /// StringMapEntry object.
+ const char *getKeyData() const {return reinterpret_cast<const char*>(this+1);}
+
+ StringRef first() const { return StringRef(getKeyData(), getKeyLength()); }
+
+ /// Create - Create a StringMapEntry for the specified key and construct the
+ /// value from \p InitVal.
+ template <typename AllocatorTy, typename InitType>
+ static StringMapEntry *Create(StringRef Key, AllocatorTy &Allocator,
+ InitType &&InitVal) {
+ unsigned KeyLength = Key.size();
+
+ // Allocate a new item with space for the string at the end and a null
+ // terminator.
+ unsigned AllocSize = static_cast<unsigned>(sizeof(StringMapEntry))+
+ KeyLength+1;
+ unsigned Alignment = alignOf<StringMapEntry>();
+
+ StringMapEntry *NewItem =
+ static_cast<StringMapEntry*>(Allocator.Allocate(AllocSize,Alignment));
+
+ // Construct the value in place, forwarding InitVal.
+ new (NewItem) StringMapEntry(KeyLength, std::forward<InitType>(InitVal));
+
+ // Copy the string information.
+ char *StrBuffer = const_cast<char*>(NewItem->getKeyData());
+ if (KeyLength > 0)
+ memcpy(StrBuffer, Key.data(), KeyLength);
+ StrBuffer[KeyLength] = 0; // Null terminate for convenience of clients.
+ return NewItem;
+ }
+
+ template<typename AllocatorTy>
+ static StringMapEntry *Create(StringRef Key, AllocatorTy &Allocator) {
+ return Create(Key, Allocator, ValueTy());
+ }
+
+ /// Create - Create a StringMapEntry with normal malloc/free.
+ template<typename InitType>
+ static StringMapEntry *Create(StringRef Key, InitType &&InitVal) {
+ MallocAllocator A;
+ return Create(Key, A, std::forward<InitType>(InitVal));
+ }
+
+ static StringMapEntry *Create(StringRef Key) {
+ return Create(Key, ValueTy());
+ }
+
+ /// GetStringMapEntryFromKeyData - Given key data that is known to be embedded
+ /// into a StringMapEntry, return the StringMapEntry itself.
+ static StringMapEntry &GetStringMapEntryFromKeyData(const char *KeyData) {
+ char *Ptr = const_cast<char*>(KeyData) - sizeof(StringMapEntry<ValueTy>);
+ return *reinterpret_cast<StringMapEntry*>(Ptr);
+ }
+
+ /// Destroy - Destroy this StringMapEntry, releasing memory back to the
+ /// specified allocator.
+ template<typename AllocatorTy>
+ void Destroy(AllocatorTy &Allocator) {
+ // Free memory referenced by the item.
+ unsigned AllocSize =
+ static_cast<unsigned>(sizeof(StringMapEntry)) + getKeyLength() + 1;
+ this->~StringMapEntry();
+ Allocator.Deallocate(static_cast<void *>(this), AllocSize);
+ }
+
+ /// Destroy this object, releasing memory back to the malloc allocator.
+ void Destroy() {
+ MallocAllocator A;
+ Destroy(A);
+ }
+};
+
+/// StringMap - This is an unconventional map that is specialized for handling
+/// keys that are "strings", which are basically ranges of bytes. This does some
+/// funky memory allocation and hashing things to make it extremely efficient,
+/// storing the string data *after* the value in the map.
+template<typename ValueTy, typename AllocatorTy = MallocAllocator>
+class StringMap : public StringMapImpl {
+ AllocatorTy Allocator;
+
+public:
+ typedef StringMapEntry<ValueTy> MapEntryTy;
+
+ StringMap() : StringMapImpl(static_cast<unsigned>(sizeof(MapEntryTy))) {}
+ explicit StringMap(unsigned InitialSize)
+ : StringMapImpl(InitialSize, static_cast<unsigned>(sizeof(MapEntryTy))) {}
+
+ explicit StringMap(AllocatorTy A)
+ : StringMapImpl(static_cast<unsigned>(sizeof(MapEntryTy))), Allocator(A) {}
+
+ StringMap(unsigned InitialSize, AllocatorTy A)
+ : StringMapImpl(InitialSize, static_cast<unsigned>(sizeof(MapEntryTy))),
+ Allocator(A) {}
+
+ StringMap(std::initializer_list<std::pair<StringRef, ValueTy>> List)
+ : StringMapImpl(static_cast<unsigned>(sizeof(MapEntryTy))) {
+ for (const auto &P : List) {
+ insert(P);
+ }
+ }
+
+ StringMap(StringMap &&RHS)
+ : StringMapImpl(std::move(RHS)), Allocator(std::move(RHS.Allocator)) {}
+
+ StringMap &operator=(StringMap RHS) {
+ StringMapImpl::swap(RHS);
+ std::swap(Allocator, RHS.Allocator);
+ return *this;
+ }
+
+ // FIXME: Implement copy operations if/when they're needed.
+
+ AllocatorTy &getAllocator() { return Allocator; }
+ const AllocatorTy &getAllocator() const { return Allocator; }
+
+ typedef const char* key_type;
+ typedef ValueTy mapped_type;
+ typedef StringMapEntry<ValueTy> value_type;
+ typedef size_t size_type;
+
+ typedef StringMapConstIterator<ValueTy> const_iterator;
+ typedef StringMapIterator<ValueTy> iterator;
+
+ iterator begin() {
+ return iterator(TheTable, NumBuckets == 0);
+ }
+ iterator end() {
+ return iterator(TheTable+NumBuckets, true);
+ }
+ const_iterator begin() const {
+ return const_iterator(TheTable, NumBuckets == 0);
+ }
+ const_iterator end() const {
+ return const_iterator(TheTable+NumBuckets, true);
+ }
+
+ iterator find(StringRef Key) {
+ int Bucket = FindKey(Key);
+ if (Bucket == -1) return end();
+ return iterator(TheTable+Bucket, true);
+ }
+
+ const_iterator find(StringRef Key) const {
+ int Bucket = FindKey(Key);
+ if (Bucket == -1) return end();
+ return const_iterator(TheTable+Bucket, true);
+ }
+
+ /// lookup - Return the entry for the specified key, or a default
+ /// constructed value if no such entry exists.
+ ValueTy lookup(StringRef Key) const {
+ const_iterator it = find(Key);
+ if (it != end())
+ return it->second;
+ return ValueTy();
+ }
+
+ ValueTy &operator[](StringRef Key) {
+ return insert(std::make_pair(Key, ValueTy())).first->second;
+ }
+
+ /// count - Return 1 if the element is in the map, 0 otherwise.
+ size_type count(StringRef Key) const {
+ return find(Key) == end() ? 0 : 1;
+ }
+
+ /// insert - Insert the specified key/value pair into the map. If the key
+ /// already exists in the map, return false and ignore the request, otherwise
+ /// insert it and return true.
+ bool insert(MapEntryTy *KeyValue) {
+ unsigned BucketNo = LookupBucketFor(KeyValue->getKey());
+ StringMapEntryBase *&Bucket = TheTable[BucketNo];
+ if (Bucket && Bucket != getTombstoneVal())
+ return false; // Already exists in map.
+
+ if (Bucket == getTombstoneVal())
+ --NumTombstones;
+ Bucket = KeyValue;
+ ++NumItems;
+ assert(NumItems + NumTombstones <= NumBuckets);
+
+ RehashTable();
+ return true;
+ }
+
+ /// insert - Inserts the specified key/value pair into the map if the key
+ /// isn't already in the map. The bool component of the returned pair is true
+ /// if and only if the insertion takes place, and the iterator component of
+ /// the pair points to the element with key equivalent to the key of the pair.
+ std::pair<iterator, bool> insert(std::pair<StringRef, ValueTy> KV) {
+ unsigned BucketNo = LookupBucketFor(KV.first);
+ StringMapEntryBase *&Bucket = TheTable[BucketNo];
+ if (Bucket && Bucket != getTombstoneVal())
+ return std::make_pair(iterator(TheTable + BucketNo, false),
+ false); // Already exists in map.
+
+ if (Bucket == getTombstoneVal())
+ --NumTombstones;
+ Bucket =
+ MapEntryTy::Create(KV.first, Allocator, std::move(KV.second));
+ ++NumItems;
+ assert(NumItems + NumTombstones <= NumBuckets);
+
+ BucketNo = RehashTable(BucketNo);
+ return std::make_pair(iterator(TheTable + BucketNo, false), true);
+ }
+
+ // clear - Empties out the StringMap
+ void clear() {
+ if (empty()) return;
+
+ // Zap all values, resetting the keys back to non-present (not tombstone),
+ // which is safe because we're removing all elements.
+ for (unsigned I = 0, E = NumBuckets; I != E; ++I) {
+ StringMapEntryBase *&Bucket = TheTable[I];
+ if (Bucket && Bucket != getTombstoneVal()) {
+ static_cast<MapEntryTy*>(Bucket)->Destroy(Allocator);
+ }
+ Bucket = nullptr;
+ }
+
+ NumItems = 0;
+ NumTombstones = 0;
+ }
+
+ /// remove - Remove the specified key/value pair from the map, but do not
+ /// erase it. This aborts if the key is not in the map.
+ void remove(MapEntryTy *KeyValue) {
+ RemoveKey(KeyValue);
+ }
+
+ void erase(iterator I) {
+ MapEntryTy &V = *I;
+ remove(&V);
+ V.Destroy(Allocator);
+ }
+
+ bool erase(StringRef Key) {
+ iterator I = find(Key);
+ if (I == end()) return false;
+ erase(I);
+ return true;
+ }
+
+ ~StringMap() {
+ // Delete all the elements in the map, but don't reset the elements
+ // to default values. This is a copy of clear(), but avoids unnecessary
+ // work not required in the destructor.
+ if (!empty()) {
+ for (unsigned I = 0, E = NumBuckets; I != E; ++I) {
+ StringMapEntryBase *Bucket = TheTable[I];
+ if (Bucket && Bucket != getTombstoneVal()) {
+ static_cast<MapEntryTy*>(Bucket)->Destroy(Allocator);
+ }
+ }
+ }
+ free(TheTable);
+ }
+};
+
+template <typename ValueTy> class StringMapConstIterator {
+protected:
+ StringMapEntryBase **Ptr;
+
+public:
+ typedef StringMapEntry<ValueTy> value_type;
+
+ StringMapConstIterator() : Ptr(nullptr) { }
+
+ explicit StringMapConstIterator(StringMapEntryBase **Bucket,
+ bool NoAdvance = false)
+ : Ptr(Bucket) {
+ if (!NoAdvance) AdvancePastEmptyBuckets();
+ }
+
+ const value_type &operator*() const {
+ return *static_cast<StringMapEntry<ValueTy>*>(*Ptr);
+ }
+ const value_type *operator->() const {
+ return static_cast<StringMapEntry<ValueTy>*>(*Ptr);
+ }
+
+ bool operator==(const StringMapConstIterator &RHS) const {
+ return Ptr == RHS.Ptr;
+ }
+ bool operator!=(const StringMapConstIterator &RHS) const {
+ return Ptr != RHS.Ptr;
+ }
+
+ inline StringMapConstIterator& operator++() { // Preincrement
+ ++Ptr;
+ AdvancePastEmptyBuckets();
+ return *this;
+ }
+ StringMapConstIterator operator++(int) { // Postincrement
+ StringMapConstIterator tmp = *this; ++*this; return tmp;
+ }
+
+private:
+ void AdvancePastEmptyBuckets() {
+ while (*Ptr == nullptr || *Ptr == StringMapImpl::getTombstoneVal())
+ ++Ptr;
+ }
+};
+
+template<typename ValueTy>
+class StringMapIterator : public StringMapConstIterator<ValueTy> {
+public:
+ StringMapIterator() {}
+ explicit StringMapIterator(StringMapEntryBase **Bucket,
+ bool NoAdvance = false)
+ : StringMapConstIterator<ValueTy>(Bucket, NoAdvance) {
+ }
+ StringMapEntry<ValueTy> &operator*() const {
+ return *static_cast<StringMapEntry<ValueTy>*>(*this->Ptr);
+ }
+ StringMapEntry<ValueTy> *operator->() const {
+ return static_cast<StringMapEntry<ValueTy>*>(*this->Ptr);
+ }
+};
+}
+
+#endif
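
A hedged sketch of the StringMap interface above, assuming int values and made-up keys such as "contig_1"; it exercises operator[], insert(), lookup(), iteration, and erase().

    #include "llvm/ADT/StringMap.h"
    #include <cstdio>
    #include <utility>

    static void demoStringMap() {
      llvm::StringMap<int> Counts;

      // operator[] default-constructs the value on first access.
      ++Counts["contig_1"];
      ++Counts["contig_2"];
      ++Counts["contig_1"];

      // insert() reports whether the key was newly added.
      auto Res = Counts.insert(std::make_pair("contig_3", 7));
      std::printf("newly inserted: %s\n", Res.second ? "yes" : "no");

      // lookup() returns a default-constructed value for missing keys.
      std::printf("contig_1 = %d, missing = %d\n",
                  Counts.lookup("contig_1"), Counts.lookup("absent"));

      // Iteration yields StringMapEntry references (order is unspecified).
      for (const auto &Entry : Counts)
        std::printf("%s -> %d\n", Entry.getKey().str().c_str(), Entry.getValue());

      Counts.erase("contig_2");
    }
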
diff --git a/ext/include/llvm/ADT/StringRef.h b/ext/include/llvm/ADT/StringRef.h
new file mode 100644
index 0000000..c3abec8
--- /dev/null
+++ b/ext/include/llvm/ADT/StringRef.h
@@ -0,0 +1,593 @@
+//===--- StringRef.h - Constant String Reference Wrapper --------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_ADT_STRINGREF_H
+#define LLVM_ADT_STRINGREF_H
+
+#include "llvm/Support/Compiler.h"
+#include <algorithm>
+#include <cassert>
+#include <cstring>
+#include <limits>
+#include <string>
+#include <utility>
+
+namespace llvm {
+ template <typename T>
+ class SmallVectorImpl;
+ class hash_code;
+ class StringRef;
+
+ /// Helper functions for StringRef::getAsInteger.
+ bool getAsUnsignedInteger(StringRef Str, unsigned Radix,
+ unsigned long long &Result);
+
+ bool getAsSignedInteger(StringRef Str, unsigned Radix, long long &Result);
+
+ /// StringRef - Represent a constant reference to a string, i.e. a character
+ /// array and a length, which need not be null terminated.
+ ///
+ /// This class does not own the string data, it is expected to be used in
+ /// situations where the character data resides in some other buffer, whose
+ /// lifetime extends past that of the StringRef. For this reason, it is not in
+ /// general safe to store a StringRef.
+ class StringRef {
+ public:
+ typedef const char *iterator;
+ typedef const char *const_iterator;
+ static const size_t npos = ~size_t(0);
+ typedef size_t size_type;
+
+ private:
+ /// The start of the string, in an external buffer.
+ const char *Data;
+
+ /// The length of the string.
+ size_t Length;
+
+ // Workaround memcmp issue with null pointers (undefined behavior)
+ // by providing a specialized version
+ LLVM_ATTRIBUTE_ALWAYS_INLINE
+ static int compareMemory(const char *Lhs, const char *Rhs, size_t Length) {
+ if (Length == 0) { return 0; }
+ return ::memcmp(Lhs,Rhs,Length);
+ }
+
+ public:
+ /// @name Constructors
+ /// @{
+
+ /// Construct an empty string ref.
+ /*implicit*/ StringRef() : Data(nullptr), Length(0) {}
+
+ /// Construct a string ref from a cstring.
+ /*implicit*/ StringRef(const char *Str)
+ : Data(Str) {
+ assert(Str && "StringRef cannot be built from a NULL argument");
+ Length = ::strlen(Str); // invoking strlen(NULL) is undefined behavior
+ }
+
+ /// Construct a string ref from a pointer and length.
+ LLVM_ATTRIBUTE_ALWAYS_INLINE
+ /*implicit*/ StringRef(const char *data, size_t length)
+ : Data(data), Length(length) {
+ assert((data || length == 0) &&
+ "StringRef cannot be built from a NULL argument with non-null length");
+ }
+
+ /// Construct a string ref from an std::string.
+ LLVM_ATTRIBUTE_ALWAYS_INLINE
+ /*implicit*/ StringRef(const std::string &Str)
+ : Data(Str.data()), Length(Str.length()) {}
+
+ /// @}
+ /// @name Iterators
+ /// @{
+
+ iterator begin() const { return Data; }
+
+ iterator end() const { return Data + Length; }
+
+ const unsigned char *bytes_begin() const {
+ return reinterpret_cast<const unsigned char *>(begin());
+ }
+ const unsigned char *bytes_end() const {
+ return reinterpret_cast<const unsigned char *>(end());
+ }
+
+ /// @}
+ /// @name String Operations
+ /// @{
+
+ /// data - Get a pointer to the start of the string (which may not be null
+ /// terminated).
+ LLVM_ATTRIBUTE_ALWAYS_INLINE
+ const char *data() const { return Data; }
+
+ /// empty - Check if the string is empty.
+ LLVM_ATTRIBUTE_ALWAYS_INLINE
+ bool empty() const { return Length == 0; }
+
+ /// size - Get the string size.
+ LLVM_ATTRIBUTE_ALWAYS_INLINE
+ size_t size() const { return Length; }
+
+ /// front - Get the first character in the string.
+ char front() const {
+ assert(!empty());
+ return Data[0];
+ }
+
+ /// back - Get the last character in the string.
+ char back() const {
+ assert(!empty());
+ return Data[Length-1];
+ }
+
+ // copy - Allocate copy in Allocator and return StringRef to it.
+ template <typename Allocator> StringRef copy(Allocator &A) const {
+ char *S = A.template Allocate<char>(Length);
+ std::copy(begin(), end(), S);
+ return StringRef(S, Length);
+ }
+
+ /// equals - Check for string equality; this is more efficient than
+ /// compare() when the relative ordering of unequal strings isn't needed.
+ LLVM_ATTRIBUTE_ALWAYS_INLINE
+ bool equals(StringRef RHS) const {
+ return (Length == RHS.Length &&
+ compareMemory(Data, RHS.Data, RHS.Length) == 0);
+ }
+
+ /// equals_lower - Check for string equality, ignoring case.
+ bool equals_lower(StringRef RHS) const {
+ return Length == RHS.Length && compare_lower(RHS) == 0;
+ }
+
+ /// compare - Compare two strings; the result is -1, 0, or 1 if this string
+ /// is lexicographically less than, equal to, or greater than the \p RHS.
+ LLVM_ATTRIBUTE_ALWAYS_INLINE
+ int compare(StringRef RHS) const {
+ // Check the prefix for a mismatch.
+ if (int Res = compareMemory(Data, RHS.Data, std::min(Length, RHS.Length)))
+ return Res < 0 ? -1 : 1;
+
+ // Otherwise the prefixes match, so we only need to check the lengths.
+ if (Length == RHS.Length)
+ return 0;
+ return Length < RHS.Length ? -1 : 1;
+ }
+
+ /// compare_lower - Compare two strings, ignoring case.
+ int compare_lower(StringRef RHS) const;
+
+ /// compare_numeric - Compare two strings, treating sequences of digits as
+ /// numbers.
+ int compare_numeric(StringRef RHS) const;
+
+ /// \brief Determine the edit distance between this string and another
+ /// string.
+ ///
+ /// \param Other the string to compare this string against.
+ ///
+ /// \param AllowReplacements whether to allow character
+ /// replacements (change one character into another) as a single
+ /// operation, rather than as two operations (an insertion and a
+ /// removal).
+ ///
+ /// \param MaxEditDistance If non-zero, the maximum edit distance that
+ /// this routine is allowed to compute. If the edit distance will exceed
+ /// that maximum, returns \c MaxEditDistance+1.
+ ///
+ /// \returns the minimum number of character insertions, removals,
+ /// or (if \p AllowReplacements is \c true) replacements needed to
+ /// transform one of the given strings into the other. If zero,
+ /// the strings are identical.
+ unsigned edit_distance(StringRef Other, bool AllowReplacements = true,
+ unsigned MaxEditDistance = 0) const;
+
+ /// str - Get the contents as an std::string.
+ std::string str() const {
+ if (!Data) return std::string();
+ return std::string(Data, Length);
+ }
+
+ /// @}
+ /// @name Operator Overloads
+ /// @{
+
+ char operator[](size_t Index) const {
+ assert(Index < Length && "Invalid index!");
+ return Data[Index];
+ }
+
+ /// @}
+ /// @name Type Conversions
+ /// @{
+
+ operator std::string() const {
+ return str();
+ }
+
+ /// @}
+ /// @name String Predicates
+ /// @{
+
+ /// Check if this string starts with the given \p Prefix.
+ LLVM_ATTRIBUTE_ALWAYS_INLINE
+ bool startswith(StringRef Prefix) const {
+ return Length >= Prefix.Length &&
+ compareMemory(Data, Prefix.Data, Prefix.Length) == 0;
+ }
+
+ /// Check if this string starts with the given \p Prefix, ignoring case.
+ bool startswith_lower(StringRef Prefix) const;
+
+ /// Check if this string ends with the given \p Suffix.
+ LLVM_ATTRIBUTE_ALWAYS_INLINE
+ bool endswith(StringRef Suffix) const {
+ return Length >= Suffix.Length &&
+ compareMemory(end() - Suffix.Length, Suffix.Data, Suffix.Length) == 0;
+ }
+
+ /// Check if this string ends with the given \p Suffix, ignoring case.
+ bool endswith_lower(StringRef Suffix) const;
+
+ /// @}
+ /// @name String Searching
+ /// @{
+
+ /// Search for the first character \p C in the string.
+ ///
+ /// \returns The index of the first occurrence of \p C, or npos if not
+ /// found.
+ LLVM_ATTRIBUTE_ALWAYS_INLINE
+ size_t find(char C, size_t From = 0) const {
+ size_t FindBegin = std::min(From, Length);
+ if (FindBegin < Length) { // Avoid calling memchr with nullptr.
+ // Just forward to memchr, which is faster than a hand-rolled loop.
+ if (const void *P = ::memchr(Data + FindBegin, C, Length - FindBegin))
+ return static_cast<const char *>(P) - Data;
+ }
+ return npos;
+ }
+
+ /// Search for the first string \p Str in the string.
+ ///
+ /// \returns The index of the first occurrence of \p Str, or npos if not
+ /// found.
+ size_t find(StringRef Str, size_t From = 0) const;
+
+ /// Search for the last character \p C in the string.
+ ///
+ /// \returns The index of the last occurrence of \p C, or npos if not
+ /// found.
+ size_t rfind(char C, size_t From = npos) const {
+ From = std::min(From, Length);
+ size_t i = From;
+ while (i != 0) {
+ --i;
+ if (Data[i] == C)
+ return i;
+ }
+ return npos;
+ }
+
+ /// Search for the last string \p Str in the string.
+ ///
+ /// \returns The index of the last occurrence of \p Str, or npos if not
+ /// found.
+ size_t rfind(StringRef Str) const;
+
+ /// Find the first character in the string that is \p C, or npos if not
+ /// found. Same as find.
+ size_t find_first_of(char C, size_t From = 0) const {
+ return find(C, From);
+ }
+
+ /// Find the first character in the string that is in \p Chars, or npos if
+ /// not found.
+ ///
+ /// Complexity: O(size() + Chars.size())
+ size_t find_first_of(StringRef Chars, size_t From = 0) const;
+
+ /// Find the first character in the string that is not \p C, or npos if not
+ /// found.
+ size_t find_first_not_of(char C, size_t From = 0) const;
+
+ /// Find the first character in the string that is not in the string
+ /// \p Chars, or npos if not found.
+ ///
+ /// Complexity: O(size() + Chars.size())
+ size_t find_first_not_of(StringRef Chars, size_t From = 0) const;
+
+ /// Find the last character in the string that is \p C, or npos if not
+ /// found.
+ size_t find_last_of(char C, size_t From = npos) const {
+ return rfind(C, From);
+ }
+
+ /// Find the last character in the string that is in \p Chars, or npos if not
+ /// found.
+ ///
+ /// Complexity: O(size() + Chars.size())
+ size_t find_last_of(StringRef Chars, size_t From = npos) const;
+
+ /// Find the last character in the string that is not \p C, or npos if not
+ /// found.
+ size_t find_last_not_of(char C, size_t From = npos) const;
+
+ /// Find the last character in the string that is not in \p Chars, or
+ /// npos if not found.
+ ///
+ /// Complexity: O(size() + Chars.size())
+ size_t find_last_not_of(StringRef Chars, size_t From = npos) const;
+
+ /// @}
+ /// @name Helpful Algorithms
+ /// @{
+
+ /// Return the number of occurrences of \p C in the string.
+ size_t count(char C) const {
+ size_t Count = 0;
+ for (size_t i = 0, e = Length; i != e; ++i)
+ if (Data[i] == C)
+ ++Count;
+ return Count;
+ }
+
+ /// Return the number of non-overlapped occurrences of \p Str in
+ /// the string.
+ size_t count(StringRef Str) const;
+
+ /// Parse the current string as an integer of the specified radix. If
+ /// \p Radix is specified as zero, this does radix autosensing using
+ /// extended C rules: 0 is octal, 0x is hex, 0b is binary.
+ ///
+ /// If the string is invalid or if only a subset of the string is valid,
+ /// this returns true to signify the error. The string is considered
+ /// erroneous if empty or if it overflows T.
+ template <typename T>
+ typename std::enable_if<std::numeric_limits<T>::is_signed, bool>::type
+ getAsInteger(unsigned Radix, T &Result) const {
+ long long LLVal;
+ if (getAsSignedInteger(*this, Radix, LLVal) ||
+ static_cast<T>(LLVal) != LLVal)
+ return true;
+ Result = LLVal;
+ return false;
+ }
+
+ template <typename T>
+ typename std::enable_if<!std::numeric_limits<T>::is_signed, bool>::type
+ getAsInteger(unsigned Radix, T &Result) const {
+ unsigned long long ULLVal;
+ // The additional cast to unsigned long long is required to avoid the
+ // Visual C++ warning C4805: '!=' : unsafe mix of type 'bool' and type
+ // 'unsigned __int64' when instantiating getAsInteger with T = bool.
+ if (getAsUnsignedInteger(*this, Radix, ULLVal) ||
+ static_cast<unsigned long long>(static_cast<T>(ULLVal)) != ULLVal)
+ return true;
+ Result = ULLVal;
+ return false;
+ }
+
+ /// @}
+ /// @name String Operations
+ /// @{
+
+ // Convert the given ASCII string to lowercase.
+ std::string lower() const;
+
+ /// Convert the given ASCII string to uppercase.
+ std::string upper() const;
+
+ /// @}
+ /// @name Substring Operations
+ /// @{
+
+ /// Return a reference to the substring from [Start, Start + N).
+ ///
+ /// \param Start The index of the starting character in the substring; if
+ /// the index is npos or greater than the length of the string then the
+ /// empty substring will be returned.
+ ///
+ /// \param N The number of characters to include in the substring. If N
+ /// exceeds the number of characters remaining in the string, the string
+ /// suffix (starting with \p Start) will be returned.
+ LLVM_ATTRIBUTE_ALWAYS_INLINE
+ StringRef substr(size_t Start, size_t N = npos) const {
+ Start = std::min(Start, Length);
+ return StringRef(Data + Start, std::min(N, Length - Start));
+ }
+
+ /// Return a StringRef equal to 'this' but with the first \p N elements
+ /// dropped.
+ LLVM_ATTRIBUTE_ALWAYS_INLINE
+ StringRef drop_front(size_t N = 1) const {
+ assert(size() >= N && "Dropping more elements than exist");
+ return substr(N);
+ }
+
+ /// Return a StringRef equal to 'this' but with the last \p N elements
+ /// dropped.
+ LLVM_ATTRIBUTE_ALWAYS_INLINE
+ StringRef drop_back(size_t N = 1) const {
+ assert(size() >= N && "Dropping more elements than exist");
+ return substr(0, size()-N);
+ }
+
+ /// Return a reference to the substring from [Start, End).
+ ///
+ /// \param Start The index of the starting character in the substring; if
+ /// the index is npos or greater than the length of the string then the
+ /// empty substring will be returned.
+ ///
+ /// \param End The index following the last character to include in the
+ /// substring. If this is npos, or less than \p Start, or exceeds the
+ /// number of characters remaining in the string, the string suffix
+ /// (starting with \p Start) will be returned.
+ LLVM_ATTRIBUTE_ALWAYS_INLINE
+ StringRef slice(size_t Start, size_t End) const {
+ Start = std::min(Start, Length);
+ End = std::min(std::max(Start, End), Length);
+ return StringRef(Data + Start, End - Start);
+ }
+
+ /// Split into two substrings around the first occurrence of a separator
+ /// character.
+ ///
+ /// If \p Separator is in the string, then the result is a pair (LHS, RHS)
+ /// such that (*this == LHS + Separator + RHS) is true and RHS is
+ /// maximal. If \p Separator is not in the string, then the result is a
+ /// pair (LHS, RHS) where (*this == LHS) and (RHS == "").
+ ///
+ /// \param Separator The character to split on.
+ /// \returns The split substrings.
+ std::pair<StringRef, StringRef> split(char Separator) const {
+ size_t Idx = find(Separator);
+ if (Idx == npos)
+ return std::make_pair(*this, StringRef());
+ return std::make_pair(slice(0, Idx), slice(Idx+1, npos));
+ }
+
+ /// Split into two substrings around the first occurrence of a separator
+ /// string.
+ ///
+ /// If \p Separator is in the string, then the result is a pair (LHS, RHS)
+ /// such that (*this == LHS + Separator + RHS) is true and RHS is
+ /// maximal. If \p Separator is not in the string, then the result is a
+ /// pair (LHS, RHS) where (*this == LHS) and (RHS == "").
+ ///
+ /// \param Separator - The string to split on.
+ /// \return - The split substrings.
+ std::pair<StringRef, StringRef> split(StringRef Separator) const {
+ size_t Idx = find(Separator);
+ if (Idx == npos)
+ return std::make_pair(*this, StringRef());
+ return std::make_pair(slice(0, Idx), slice(Idx + Separator.size(), npos));
+ }
+
+ /// Split into substrings around the occurrences of a separator string.
+ ///
+ /// Each substring is stored in \p A. If \p MaxSplit is >= 0, at most
+ /// \p MaxSplit splits are done and consequently <= \p MaxSplit + 1
+ /// elements are added to A.
+ /// If \p KeepEmpty is false, empty strings are not added to \p A. They
+ /// still count when considering \p MaxSplit.
+ /// A useful invariant is that
+ /// Separator.join(A) == *this if MaxSplit == -1 and KeepEmpty == true
+ ///
+ /// \param A - Where to put the substrings.
+ /// \param Separator - The string to split on.
+ /// \param MaxSplit - The maximum number of times the string is split.
+ /// \param KeepEmpty - True if empty substring should be added.
+ void split(SmallVectorImpl<StringRef> &A,
+ StringRef Separator, int MaxSplit = -1,
+ bool KeepEmpty = true) const;
+
+ /// Split into substrings around the occurrences of a separator character.
+ ///
+ /// Each substring is stored in \p A. If \p MaxSplit is >= 0, at most
+ /// \p MaxSplit splits are done and consequently <= \p MaxSplit + 1
+ /// elements are added to A.
+ /// If \p KeepEmpty is false, empty strings are not added to \p A. They
+ /// still count when considering \p MaxSplit.
+ /// A useful invariant is that
+ /// Separator.join(A) == *this if MaxSplit == -1 and KeepEmpty == true
+ ///
+ /// \param A - Where to put the substrings.
+ /// \param Separator - The string to split on.
+ /// \param MaxSplit - The maximum number of times the string is split.
+ /// \param KeepEmpty - True if empty substring should be added.
+ void split(SmallVectorImpl<StringRef> &A, char Separator, int MaxSplit = -1,
+ bool KeepEmpty = true) const;
+
+ /// Split into two substrings around the last occurrence of a separator
+ /// character.
+ ///
+ /// If \p Separator is in the string, then the result is a pair (LHS, RHS)
+ /// such that (*this == LHS + Separator + RHS) is true and RHS is
+ /// minimal. If \p Separator is not in the string, then the result is a
+ /// pair (LHS, RHS) where (*this == LHS) and (RHS == "").
+ ///
+ /// \param Separator - The character to split on.
+ /// \return - The split substrings.
+ std::pair<StringRef, StringRef> rsplit(char Separator) const {
+ size_t Idx = rfind(Separator);
+ if (Idx == npos)
+ return std::make_pair(*this, StringRef());
+ return std::make_pair(slice(0, Idx), slice(Idx+1, npos));
+ }
+
+ /// Return string with consecutive characters in \p Chars starting from
+ /// the left removed.
+ StringRef ltrim(StringRef Chars = " \t\n\v\f\r") const {
+ return drop_front(std::min(Length, find_first_not_of(Chars)));
+ }
+
+ /// Return string with consecutive characters in \p Chars starting from
+ /// the right removed.
+ StringRef rtrim(StringRef Chars = " \t\n\v\f\r") const {
+ return drop_back(Length - std::min(Length, find_last_not_of(Chars) + 1));
+ }
+
+ /// Return string with consecutive characters in \p Chars starting from
+ /// the left and right removed.
+ StringRef trim(StringRef Chars = " \t\n\v\f\r") const {
+ return ltrim(Chars).rtrim(Chars);
+ }
+
+ /// @}
+ };
+
+ /// @name StringRef Comparison Operators
+ /// @{
+
+ LLVM_ATTRIBUTE_ALWAYS_INLINE
+ inline bool operator==(StringRef LHS, StringRef RHS) {
+ return LHS.equals(RHS);
+ }
+
+ LLVM_ATTRIBUTE_ALWAYS_INLINE
+ inline bool operator!=(StringRef LHS, StringRef RHS) {
+ return !(LHS == RHS);
+ }
+
+ inline bool operator<(StringRef LHS, StringRef RHS) {
+ return LHS.compare(RHS) == -1;
+ }
+
+ inline bool operator<=(StringRef LHS, StringRef RHS) {
+ return LHS.compare(RHS) != 1;
+ }
+
+ inline bool operator>(StringRef LHS, StringRef RHS) {
+ return LHS.compare(RHS) == 1;
+ }
+
+ inline bool operator>=(StringRef LHS, StringRef RHS) {
+ return LHS.compare(RHS) != -1;
+ }
+
+ inline std::string &operator+=(std::string &buffer, StringRef string) {
+ return buffer.append(string.data(), string.size());
+ }
+
+ /// @}
+
+ /// \brief Compute a hash_code for a StringRef.
+ hash_code hash_value(StringRef S);
+
+ // StringRefs can be treated like a POD type.
+ template <typename T> struct isPodLike;
+ template <> struct isPodLike<StringRef> { static const bool value = true; };
+}
+
+#endif
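
An illustrative sketch of typical StringRef use on a made-up record; none of the names below come from the patch. Every operation views the caller's buffer without copying, and getAsInteger() follows the header's convention of returning true on failure.

    #include "llvm/ADT/StringRef.h"
    #include <cstdio>
    #include <utility>

    static void demoStringRef() {
      llvm::StringRef Line = "  NODE_12\t3841\n";

      // Strip surrounding whitespace, then split the record at the first tab.
      llvm::StringRef Clean = Line.trim();                        // "NODE_12\t3841"
      std::pair<llvm::StringRef, llvm::StringRef> Cols = Clean.split('\t');

      unsigned Length = 0;
      if (!Cols.second.getAsInteger(10, Length))                  // false == success
        std::printf("%s has length %u\n", Cols.first.str().c_str(), Length);

      if (Clean.startswith("NODE_"))
        std::printf("prefix check needs no allocation\n");
    }
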
diff --git a/ext/include/llvm/ADT/StringSwitch.h b/ext/include/llvm/ADT/StringSwitch.h
new file mode 100644
index 0000000..42b0fc4
--- /dev/null
+++ b/ext/include/llvm/ADT/StringSwitch.h
@@ -0,0 +1,166 @@
+//===--- StringSwitch.h - Switch-on-literal-string Construct --------------===/
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//===----------------------------------------------------------------------===/
+//
+// This file implements the StringSwitch template, which mimics a switch()
+// statement whose cases are string literals.
+//
+//===----------------------------------------------------------------------===/
+#ifndef LLVM_ADT_STRINGSWITCH_H
+#define LLVM_ADT_STRINGSWITCH_H
+
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/Compiler.h"
+#include <cassert>
+#include <cstring>
+
+namespace llvm {
+
+/// \brief A switch()-like statement whose cases are string literals.
+///
+/// The StringSwitch class is a simple form of a switch() statement that
+/// determines whether the given string matches one of the given string
+/// literals. The template type parameter \p T is the type of the value that
+/// will be returned from the string-switch expression. For example,
+/// the following code switches on the name of a color in \c argv[i]:
+///
+/// \code
+/// Color color = StringSwitch<Color>(argv[i])
+/// .Case("red", Red)
+/// .Case("orange", Orange)
+/// .Case("yellow", Yellow)
+/// .Case("green", Green)
+/// .Case("blue", Blue)
+/// .Case("indigo", Indigo)
+/// .Cases("violet", "purple", Violet)
+/// .Default(UnknownColor);
+/// \endcode
+template<typename T, typename R = T>
+class StringSwitch {
+ /// \brief The string we are matching.
+ StringRef Str;
+
+ /// \brief The pointer to the result of this switch statement, once known,
+ /// null before that.
+ const T *Result;
+
+public:
+ LLVM_ATTRIBUTE_ALWAYS_INLINE
+ explicit StringSwitch(StringRef S)
+ : Str(S), Result(nullptr) { }
+
+ template<unsigned N>
+ LLVM_ATTRIBUTE_ALWAYS_INLINE
+ StringSwitch& Case(const char (&S)[N], const T& Value) {
+ if (!Result && N-1 == Str.size() &&
+ (std::memcmp(S, Str.data(), N-1) == 0)) {
+ Result = &Value;
+ }
+
+ return *this;
+ }
+
+ template<unsigned N>
+ LLVM_ATTRIBUTE_ALWAYS_INLINE
+ StringSwitch& EndsWith(const char (&S)[N], const T &Value) {
+ if (!Result && Str.size() >= N-1 &&
+ std::memcmp(S, Str.data() + Str.size() + 1 - N, N-1) == 0) {
+ Result = &Value;
+ }
+
+ return *this;
+ }
+
+ template<unsigned N>
+ LLVM_ATTRIBUTE_ALWAYS_INLINE
+ StringSwitch& StartsWith(const char (&S)[N], const T &Value) {
+ if (!Result && Str.size() >= N-1 &&
+ std::memcmp(S, Str.data(), N-1) == 0) {
+ Result = &Value;
+ }
+
+ return *this;
+ }
+
+ template<unsigned N0, unsigned N1>
+ LLVM_ATTRIBUTE_ALWAYS_INLINE
+ StringSwitch& Cases(const char (&S0)[N0], const char (&S1)[N1],
+ const T& Value) {
+ if (!Result && (
+ (N0-1 == Str.size() && std::memcmp(S0, Str.data(), N0-1) == 0) ||
+ (N1-1 == Str.size() && std::memcmp(S1, Str.data(), N1-1) == 0))) {
+ Result = &Value;
+ }
+
+ return *this;
+ }
+
+ template<unsigned N0, unsigned N1, unsigned N2>
+ LLVM_ATTRIBUTE_ALWAYS_INLINE
+ StringSwitch& Cases(const char (&S0)[N0], const char (&S1)[N1],
+ const char (&S2)[N2], const T& Value) {
+ if (!Result && (
+ (N0-1 == Str.size() && std::memcmp(S0, Str.data(), N0-1) == 0) ||
+ (N1-1 == Str.size() && std::memcmp(S1, Str.data(), N1-1) == 0) ||
+ (N2-1 == Str.size() && std::memcmp(S2, Str.data(), N2-1) == 0))) {
+ Result = &Value;
+ }
+
+ return *this;
+ }
+
+ template<unsigned N0, unsigned N1, unsigned N2, unsigned N3>
+ LLVM_ATTRIBUTE_ALWAYS_INLINE
+ StringSwitch& Cases(const char (&S0)[N0], const char (&S1)[N1],
+ const char (&S2)[N2], const char (&S3)[N3],
+ const T& Value) {
+ if (!Result && (
+ (N0-1 == Str.size() && std::memcmp(S0, Str.data(), N0-1) == 0) ||
+ (N1-1 == Str.size() && std::memcmp(S1, Str.data(), N1-1) == 0) ||
+ (N2-1 == Str.size() && std::memcmp(S2, Str.data(), N2-1) == 0) ||
+ (N3-1 == Str.size() && std::memcmp(S3, Str.data(), N3-1) == 0))) {
+ Result = &Value;
+ }
+
+ return *this;
+ }
+
+ template<unsigned N0, unsigned N1, unsigned N2, unsigned N3, unsigned N4>
+ LLVM_ATTRIBUTE_ALWAYS_INLINE
+ StringSwitch& Cases(const char (&S0)[N0], const char (&S1)[N1],
+ const char (&S2)[N2], const char (&S3)[N3],
+ const char (&S4)[N4], const T& Value) {
+ if (!Result && (
+ (N0-1 == Str.size() && std::memcmp(S0, Str.data(), N0-1) == 0) ||
+ (N1-1 == Str.size() && std::memcmp(S1, Str.data(), N1-1) == 0) ||
+ (N2-1 == Str.size() && std::memcmp(S2, Str.data(), N2-1) == 0) ||
+ (N3-1 == Str.size() && std::memcmp(S3, Str.data(), N3-1) == 0) ||
+ (N4-1 == Str.size() && std::memcmp(S4, Str.data(), N4-1) == 0))) {
+ Result = &Value;
+ }
+
+ return *this;
+ }
+
+ LLVM_ATTRIBUTE_ALWAYS_INLINE
+ R Default(const T& Value) const {
+ if (Result)
+ return *Result;
+
+ return Value;
+ }
+
+ LLVM_ATTRIBUTE_ALWAYS_INLINE
+ operator R() const {
+ assert(Result && "Fell off the end of a string-switch");
+ return *Result;
+ }
+};
+
+} // end namespace llvm
+
+#endif // LLVM_ADT_STRINGSWITCH_H
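
The class comment above already demonstrates Case()/Cases()/Default(); the sketch below adds StartsWith() and a Default() fallback. The flag strings and the ReadTech enum are invented for illustration only.

    #include "llvm/ADT/StringRef.h"
    #include "llvm/ADT/StringSwitch.h"

    enum ReadTech { Illumina, IonTorrent, PacBio, Unknown };

    // The first matching case wins; Default() supplies the fallback value.
    static ReadTech classifyFlag(llvm::StringRef Flag) {
      return llvm::StringSwitch<ReadTech>(Flag)
          .Cases("--pe1-1", "--pe1-2", Illumina)
          .Case("--iontorrent", IonTorrent)
          .StartsWith("--pacbio", PacBio)
          .Default(Unknown);
    }
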
diff --git a/ext/include/llvm/ADT/Twine.h b/ext/include/llvm/ADT/Twine.h
new file mode 100644
index 0000000..81b1a6d
--- /dev/null
+++ b/ext/include/llvm/ADT/Twine.h
@@ -0,0 +1,540 @@
+//===-- Twine.h - Fast Temporary String Concatenation -----------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_ADT_TWINE_H
+#define LLVM_ADT_TWINE_H
+
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/DataTypes.h"
+#include "llvm/Support/ErrorHandling.h"
+#include <cassert>
+#include <string>
+
+namespace llvm {
+ class raw_ostream;
+
+ /// Twine - A lightweight data structure for efficiently representing the
+ /// concatenation of temporary values as strings.
+ ///
+ /// A Twine is a kind of rope; it represents a concatenated string using a
+ /// binary tree, where the string is the preorder of the nodes. Since the
+ /// Twine can be efficiently rendered into a buffer when its result is used,
+ /// it avoids the cost of generating temporary values for intermediate string
+ /// results -- particularly in cases when the Twine result is never
+ /// required. By explicitly tracking the type of leaf nodes, we can also avoid
+ /// the creation of temporary strings for conversion operations (such as
+ /// appending an integer to a string).
+ ///
+ /// A Twine is not intended for use directly and should not be stored; its
+ /// implementation relies on the ability to store pointers to temporary stack
+ /// objects which may be deallocated at the end of a statement. Twines should
+ /// only be accepted as const references in arguments, when an API wishes
+ /// to accept possibly-concatenated strings.
+ ///
+ /// Twines support a special 'null' value, which always concatenates to form
+ /// itself, and renders as an empty string. This can be returned from APIs to
+ /// effectively nullify any concatenations performed on the result.
+ ///
+ /// \b Implementation
+ ///
+ /// Given the nature of a Twine, it is not possible for the Twine's
+ /// concatenation method to construct interior nodes; the result must be
+ /// represented inside the returned value. For this reason a Twine object
+ /// actually holds two values, the left- and right-hand sides of a
+ /// concatenation. We also have nullary Twine objects, which are effectively
+ /// sentinel values that represent empty strings.
+ ///
+ /// Thus, a Twine can effectively have zero, one, or two children. The \see
+ /// isNullary(), \see isUnary(), and \see isBinary() predicates exist for
+ /// testing the number of children.
+ ///
+ /// We maintain a number of invariants on Twine objects (FIXME: Why):
+ /// - Nullary twines are always represented with their Kind on the left-hand
+ /// side, and the Empty kind on the right-hand side.
+ /// - Unary twines are always represented with the value on the left-hand
+ /// side, and the Empty kind on the right-hand side.
+ /// - If a Twine has another Twine as a child, that child should always be
+ /// binary (otherwise it could have been folded into the parent).
+ ///
+ /// These invariants are checked by \see isValid().
+ ///
+ /// \b Efficiency Considerations
+ ///
+ /// The Twine is designed to yield efficient and small code for common
+ /// situations. For this reason, the concat() method is inlined so that
+ /// concatenations of leaf nodes can be optimized into stores directly into a
+ /// single stack allocated object.
+ ///
+ /// In practice, not all compilers can be trusted to optimize concat() fully,
+ /// so we provide two additional methods (and accompanying operator+
+ /// overloads) to guarantee that particularly important cases (cstring plus
+ /// StringRef) codegen as desired.
+ class Twine {
+ /// NodeKind - Represent the type of an argument.
+ enum NodeKind : unsigned char {
+ /// An empty string; the result of concatenating anything with it is also
+ /// empty.
+ NullKind,
+
+ /// The empty string.
+ EmptyKind,
+
+ /// A pointer to a Twine instance.
+ TwineKind,
+
+ /// A pointer to a C string instance.
+ CStringKind,
+
+ /// A pointer to an std::string instance.
+ StdStringKind,
+
+ /// A pointer to a StringRef instance.
+ StringRefKind,
+
+ /// A pointer to a SmallString instance.
+ SmallStringKind,
+
+ /// A char value, to render as a character.
+ CharKind,
+
+ /// An unsigned int value, to render as an unsigned decimal integer.
+ DecUIKind,
+
+ /// An int value, to render as a signed decimal integer.
+ DecIKind,
+
+ /// A pointer to an unsigned long value, to render as an unsigned decimal
+ /// integer.
+ DecULKind,
+
+ /// A pointer to a long value, to render as a signed decimal integer.
+ DecLKind,
+
+ /// A pointer to an unsigned long long value, to render as an unsigned
+ /// decimal integer.
+ DecULLKind,
+
+ /// A pointer to a long long value, to render as a signed decimal integer.
+ DecLLKind,
+
+ /// A pointer to a uint64_t value, to render as an unsigned hexadecimal
+ /// integer.
+ UHexKind
+ };
+
+ union Child
+ {
+ const Twine *twine;
+ const char *cString;
+ const std::string *stdString;
+ const StringRef *stringRef;
+ const SmallVectorImpl<char> *smallString;
+ char character;
+ unsigned int decUI;
+ int decI;
+ const unsigned long *decUL;
+ const long *decL;
+ const unsigned long long *decULL;
+ const long long *decLL;
+ const uint64_t *uHex;
+ };
+
+ private:
+ /// LHS - The prefix in the concatenation, which may be uninitialized for
+ /// Null or Empty kinds.
+ Child LHS;
+ /// RHS - The suffix in the concatenation, which may be uninitialized for
+ /// Null or Empty kinds.
+ Child RHS;
+ /// LHSKind - The NodeKind of the left hand side, \see getLHSKind().
+ NodeKind LHSKind;
+ /// RHSKind - The NodeKind of the right hand side, \see getRHSKind().
+ NodeKind RHSKind;
+
+ private:
+ /// Construct a nullary twine; the kind must be NullKind or EmptyKind.
+ explicit Twine(NodeKind Kind)
+ : LHSKind(Kind), RHSKind(EmptyKind) {
+ assert(isNullary() && "Invalid kind!");
+ }
+
+ /// Construct a binary twine.
+ explicit Twine(const Twine &LHS, const Twine &RHS)
+ : LHSKind(TwineKind), RHSKind(TwineKind) {
+ this->LHS.twine = &LHS;
+ this->RHS.twine = &RHS;
+ assert(isValid() && "Invalid twine!");
+ }
+
+ /// Construct a twine from explicit values.
+ explicit Twine(Child LHS, NodeKind LHSKind, Child RHS, NodeKind RHSKind)
+ : LHS(LHS), RHS(RHS), LHSKind(LHSKind), RHSKind(RHSKind) {
+ assert(isValid() && "Invalid twine!");
+ }
+
+ /// Since the intended use of twines is as temporary objects, assignments
+ /// when concatenating might cause undefined behavior or stack corruption.
+ Twine &operator=(const Twine &Other) = delete;
+
+ /// Check for the null twine.
+ bool isNull() const {
+ return getLHSKind() == NullKind;
+ }
+
+ /// Check for the empty twine.
+ bool isEmpty() const {
+ return getLHSKind() == EmptyKind;
+ }
+
+ /// Check if this is a nullary twine (null or empty).
+ bool isNullary() const {
+ return isNull() || isEmpty();
+ }
+
+ /// Check if this is a unary twine.
+ bool isUnary() const {
+ return getRHSKind() == EmptyKind && !isNullary();
+ }
+
+ /// Check if this is a binary twine.
+ bool isBinary() const {
+ return getLHSKind() != NullKind && getRHSKind() != EmptyKind;
+ }
+
+ /// Check if this is a valid twine (satisfying the invariants on
+ /// order and number of arguments).
+ bool isValid() const {
+ // Nullary twines always have Empty on the RHS.
+ if (isNullary() && getRHSKind() != EmptyKind)
+ return false;
+
+ // Null should never appear on the RHS.
+ if (getRHSKind() == NullKind)
+ return false;
+
+ // The RHS cannot be non-empty if the LHS is empty.
+ if (getRHSKind() != EmptyKind && getLHSKind() == EmptyKind)
+ return false;
+
+ // A twine child should always be binary.
+ if (getLHSKind() == TwineKind &&
+ !LHS.twine->isBinary())
+ return false;
+ if (getRHSKind() == TwineKind &&
+ !RHS.twine->isBinary())
+ return false;
+
+ return true;
+ }
+
+ /// Get the NodeKind of the left-hand side.
+ NodeKind getLHSKind() const { return LHSKind; }
+
+ /// Get the NodeKind of the right-hand side.
+ NodeKind getRHSKind() const { return RHSKind; }
+
+ /// Print one child from a twine.
+ void printOneChild(raw_ostream &OS, Child Ptr, NodeKind Kind) const;
+
+ /// Print the representation of one child from a twine.
+ void printOneChildRepr(raw_ostream &OS, Child Ptr,
+ NodeKind Kind) const;
+
+ public:
+ /// @name Constructors
+ /// @{
+
+ /// Construct from an empty string.
+ /*implicit*/ Twine() : LHSKind(EmptyKind), RHSKind(EmptyKind) {
+ assert(isValid() && "Invalid twine!");
+ }
+
+ Twine(const Twine &) = default;
+
+ /// Construct from a C string.
+ ///
+ /// We take care here to optimize "" into the empty twine -- this will be
+ /// optimized out for string constants. This allows Twine arguments to have
+ /// default "" values, without introducing unnecessary string constants.
+ /*implicit*/ Twine(const char *Str)
+ : RHSKind(EmptyKind) {
+ if (Str[0] != '\0') {
+ LHS.cString = Str;
+ LHSKind = CStringKind;
+ } else
+ LHSKind = EmptyKind;
+
+ assert(isValid() && "Invalid twine!");
+ }
+
+ /// Construct from an std::string.
+ /*implicit*/ Twine(const std::string &Str)
+ : LHSKind(StdStringKind), RHSKind(EmptyKind) {
+ LHS.stdString = &Str;
+ assert(isValid() && "Invalid twine!");
+ }
+
+ /// Construct from a StringRef.
+ /*implicit*/ Twine(const StringRef &Str)
+ : LHSKind(StringRefKind), RHSKind(EmptyKind) {
+ LHS.stringRef = &Str;
+ assert(isValid() && "Invalid twine!");
+ }
+
+ /// Construct from a SmallString.
+ /*implicit*/ Twine(const SmallVectorImpl<char> &Str)
+ : LHSKind(SmallStringKind), RHSKind(EmptyKind) {
+ LHS.smallString = &Str;
+ assert(isValid() && "Invalid twine!");
+ }
+
+ /// Construct from a char.
+ explicit Twine(char Val)
+ : LHSKind(CharKind), RHSKind(EmptyKind) {
+ LHS.character = Val;
+ }
+
+ /// Construct from a signed char.
+ explicit Twine(signed char Val)
+ : LHSKind(CharKind), RHSKind(EmptyKind) {
+ LHS.character = static_cast<char>(Val);
+ }
+
+ /// Construct from an unsigned char.
+ explicit Twine(unsigned char Val)
+ : LHSKind(CharKind), RHSKind(EmptyKind) {
+ LHS.character = static_cast<char>(Val);
+ }
+
+ /// Construct a twine to print \p Val as an unsigned decimal integer.
+ explicit Twine(unsigned Val)
+ : LHSKind(DecUIKind), RHSKind(EmptyKind) {
+ LHS.decUI = Val;
+ }
+
+ /// Construct a twine to print \p Val as a signed decimal integer.
+ explicit Twine(int Val)
+ : LHSKind(DecIKind), RHSKind(EmptyKind) {
+ LHS.decI = Val;
+ }
+
+ /// Construct a twine to print \p Val as an unsigned decimal integer.
+ explicit Twine(const unsigned long &Val)
+ : LHSKind(DecULKind), RHSKind(EmptyKind) {
+ LHS.decUL = &Val;
+ }
+
+ /// Construct a twine to print \p Val as a signed decimal integer.
+ explicit Twine(const long &Val)
+ : LHSKind(DecLKind), RHSKind(EmptyKind) {
+ LHS.decL = &Val;
+ }
+
+ /// Construct a twine to print \p Val as an unsigned decimal integer.
+ explicit Twine(const unsigned long long &Val)
+ : LHSKind(DecULLKind), RHSKind(EmptyKind) {
+ LHS.decULL = &Val;
+ }
+
+ /// Construct a twine to print \p Val as a signed decimal integer.
+ explicit Twine(const long long &Val)
+ : LHSKind(DecLLKind), RHSKind(EmptyKind) {
+ LHS.decLL = &Val;
+ }
+
+ // FIXME: Unfortunately, to make sure this is as efficient as possible we
+ // need extra binary constructors from particular types. We can't rely on
+ // the compiler to be smart enough to fold operator+()/concat() down to the
+ // right thing. Yet.
+
+ /// Construct as the concatenation of a C string and a StringRef.
+ /*implicit*/ Twine(const char *LHS, const StringRef &RHS)
+ : LHSKind(CStringKind), RHSKind(StringRefKind) {
+ this->LHS.cString = LHS;
+ this->RHS.stringRef = &RHS;
+ assert(isValid() && "Invalid twine!");
+ }
+
+ /// Construct as the concatenation of a StringRef and a C string.
+ /*implicit*/ Twine(const StringRef &LHS, const char *RHS)
+ : LHSKind(StringRefKind), RHSKind(CStringKind) {
+ this->LHS.stringRef = &LHS;
+ this->RHS.cString = RHS;
+ assert(isValid() && "Invalid twine!");
+ }
+
+ /// Create a 'null' string, which is an empty string that always
+ /// concatenates to form another empty string.
+ static Twine createNull() {
+ return Twine(NullKind);
+ }
+
+ /// @}
+ /// @name Numeric Conversions
+ /// @{
+
+ /// Construct a twine to print \p Val as an unsigned hexadecimal integer.
+ static Twine utohexstr(const uint64_t &Val) {
+ Child LHS, RHS;
+ LHS.uHex = &Val;
+ RHS.twine = nullptr;
+ return Twine(LHS, UHexKind, RHS, EmptyKind);
+ }
+
+ /// @}
+ /// @name Predicate Operations
+ /// @{
+
+ /// Check if this twine is trivially empty; a false return value does not
+ /// necessarily mean the twine is empty.
+ bool isTriviallyEmpty() const {
+ return isNullary();
+ }
+
+ /// Return true if this twine can be dynamically accessed as a single
+ /// StringRef value with getSingleStringRef().
+ bool isSingleStringRef() const {
+ if (getRHSKind() != EmptyKind) return false;
+
+ switch (getLHSKind()) {
+ case EmptyKind:
+ case CStringKind:
+ case StdStringKind:
+ case StringRefKind:
+ case SmallStringKind:
+ return true;
+ default:
+ return false;
+ }
+ }
+
+ /// @}
+ /// @name String Operations
+ /// @{
+
+ Twine concat(const Twine &Suffix) const;
+
+ /// @}
+ /// @name Output & Conversion.
+ /// @{
+
+ /// Return the twine contents as a std::string.
+ std::string str() const;
+
+ /// Append the concatenated string into the given SmallString or SmallVector.
+ void toVector(SmallVectorImpl<char> &Out) const;
+
+ /// This returns the twine as a single StringRef. This method is only valid
+ /// if isSingleStringRef() is true.
+ StringRef getSingleStringRef() const {
+ assert(isSingleStringRef() && "This cannot be had as a single stringref!");
+ switch (getLHSKind()) {
+ default: llvm_unreachable("Out of sync with isSingleStringRef");
+ case EmptyKind: return StringRef();
+ case CStringKind: return StringRef(LHS.cString);
+ case StdStringKind: return StringRef(*LHS.stdString);
+ case StringRefKind: return *LHS.stringRef;
+ case SmallStringKind:
+ return StringRef(LHS.smallString->data(), LHS.smallString->size());
+ }
+ }
+
+ /// This returns the twine as a single StringRef if it can be
+ /// represented as such. Otherwise the twine is written into the given
+ /// SmallVector and a StringRef to the SmallVector's data is returned.
+ StringRef toStringRef(SmallVectorImpl<char> &Out) const {
+ if (isSingleStringRef())
+ return getSingleStringRef();
+ toVector(Out);
+ return StringRef(Out.data(), Out.size());
+ }
+
+ /// This returns the twine as a single null terminated StringRef if it
+ /// can be represented as such. Otherwise the twine is written into the
+ /// given SmallVector and a StringRef to the SmallVector's data is returned.
+ ///
+ /// The returned StringRef's size does not include the null terminator.
+ StringRef toNullTerminatedStringRef(SmallVectorImpl<char> &Out) const;
+
+ /// Write the concatenated string represented by this twine to the
+ /// stream \p OS.
+ void print(raw_ostream &OS) const;
+
+ /// Dump the concatenated string represented by this twine to stderr.
+ void dump() const;
+
+ /// Write the representation of this twine to the stream \p OS.
+ void printRepr(raw_ostream &OS) const;
+
+ /// Dump the representation of this twine to stderr.
+ void dumpRepr() const;
+
+ /// @}
+ };
+
+ /// @name Twine Inline Implementations
+ /// @{
+
+ inline Twine Twine::concat(const Twine &Suffix) const {
+ // Concatenation with null is null.
+ if (isNull() || Suffix.isNull())
+ return Twine(NullKind);
+
+ // Concatenation with empty yields the other side.
+ if (isEmpty())
+ return Suffix;
+ if (Suffix.isEmpty())
+ return *this;
+
+ // Otherwise we need to create a new node, taking care to fold in unary
+ // twines.
+ Child NewLHS, NewRHS;
+ NewLHS.twine = this;
+ NewRHS.twine = &Suffix;
+ NodeKind NewLHSKind = TwineKind, NewRHSKind = TwineKind;
+ if (isUnary()) {
+ NewLHS = LHS;
+ NewLHSKind = getLHSKind();
+ }
+ if (Suffix.isUnary()) {
+ NewRHS = Suffix.LHS;
+ NewRHSKind = Suffix.getLHSKind();
+ }
+
+ return Twine(NewLHS, NewLHSKind, NewRHS, NewRHSKind);
+ }
+
+ inline Twine operator+(const Twine &LHS, const Twine &RHS) {
+ return LHS.concat(RHS);
+ }
+
+ /// Additional overload to guarantee simplified codegen; this is equivalent to
+ /// concat().
+
+ inline Twine operator+(const char *LHS, const StringRef &RHS) {
+ return Twine(LHS, RHS);
+ }
+
+ /// Additional overload to guarantee simplified codegen; this is equivalent to
+ /// concat().
+
+ inline Twine operator+(const StringRef &LHS, const char *RHS) {
+ return Twine(LHS, RHS);
+ }
+
+ inline raw_ostream &operator<<(raw_ostream &OS, const Twine &RHS) {
+ RHS.print(OS);
+ return OS;
+ }
+
+ /// @}
+}
+
+#endif
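
For orientation, here is a minimal usage sketch of the Twine class imported above (illustrative only, not part of the patch; the helper name makeLabel is invented). The key point is that the whole concatenation stays inside a single full-expression, which is the only safe way to use Twine temporaries:

    #include "llvm/ADT/Twine.h"
    #include <string>

    // Builds "Prefix_<Index>" without materializing intermediate strings;
    // rendering happens exactly once, inside str().
    std::string makeLabel(const char *Prefix, unsigned Index) {
      return (llvm::Twine(Prefix) + "_" + llvm::Twine(Index)).str();
    }
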
diff --git a/ext/include/llvm/ADT/edit_distance.h b/ext/include/llvm/ADT/edit_distance.h
new file mode 100644
index 0000000..06a01b1
--- /dev/null
+++ b/ext/include/llvm/ADT/edit_distance.h
@@ -0,0 +1,103 @@
+//===-- llvm/ADT/edit_distance.h - Array edit distance function --- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines a Levenshtein distance function that works for any two
+// sequences, with each element of each sequence being analogous to a character
+// in a string.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_ADT_EDIT_DISTANCE_H
+#define LLVM_ADT_EDIT_DISTANCE_H
+
+#include "llvm/ADT/ArrayRef.h"
+#include <algorithm>
+#include <memory>
+
+namespace llvm {
+
+/// \brief Determine the edit distance between two sequences.
+///
+/// \param FromArray the first sequence to compare.
+///
+/// \param ToArray the second sequence to compare.
+///
+/// \param AllowReplacements whether to allow element replacements (change one
+/// element into another) as a single operation, rather than as two operations
+/// (an insertion and a removal).
+///
+/// \param MaxEditDistance If non-zero, the maximum edit distance that this
+/// routine is allowed to compute. If the edit distance will exceed that
+/// maximum, returns \c MaxEditDistance+1.
+///
+/// \returns the minimum number of element insertions, removals, or (if
+/// \p AllowReplacements is \c true) replacements needed to transform one of
+/// the given sequences into the other. If zero, the sequences are identical.
+template<typename T>
+unsigned ComputeEditDistance(ArrayRef<T> FromArray, ArrayRef<T> ToArray,
+ bool AllowReplacements = true,
+ unsigned MaxEditDistance = 0) {
+ // The algorithm implemented below is the "classic"
+ // dynamic-programming algorithm for computing the Levenshtein
+ // distance, which is described here:
+ //
+ // http://en.wikipedia.org/wiki/Levenshtein_distance
+ //
+ // Although the algorithm is typically described using an m x n
+ // array, only one row plus one element are used at a time, so this
+ // implementation just keeps one vector for the row. To update one entry,
+ // only the entries to the left, top, and top-left are needed. The left
+ // entry is in Row[x-1], the top entry is what's in Row[x] from the last
+ // iteration, and the top-left entry is stored in Previous.
+ typename ArrayRef<T>::size_type m = FromArray.size();
+ typename ArrayRef<T>::size_type n = ToArray.size();
+
+ const unsigned SmallBufferSize = 64;
+ unsigned SmallBuffer[SmallBufferSize];
+ std::unique_ptr<unsigned[]> Allocated;
+ unsigned *Row = SmallBuffer;
+ if (n + 1 > SmallBufferSize) {
+ Row = new unsigned[n + 1];
+ Allocated.reset(Row);
+ }
+
+ for (unsigned i = 1; i <= n; ++i)
+ Row[i] = i;
+
+ for (typename ArrayRef<T>::size_type y = 1; y <= m; ++y) {
+ Row[0] = y;
+ unsigned BestThisRow = Row[0];
+
+ unsigned Previous = y - 1;
+ for (typename ArrayRef<T>::size_type x = 1; x <= n; ++x) {
+ int OldRow = Row[x];
+ if (AllowReplacements) {
+ Row[x] = std::min(
+ Previous + (FromArray[y-1] == ToArray[x-1] ? 0u : 1u),
+ std::min(Row[x-1], Row[x])+1);
+ }
+ else {
+ if (FromArray[y-1] == ToArray[x-1]) Row[x] = Previous;
+ else Row[x] = std::min(Row[x-1], Row[x]) + 1;
+ }
+ Previous = OldRow;
+ BestThisRow = std::min(BestThisRow, Row[x]);
+ }
+
+ if (MaxEditDistance && BestThisRow > MaxEditDistance)
+ return MaxEditDistance + 1;
+ }
+
+ unsigned Result = Row[n];
+ return Result;
+}
+
+} // End llvm namespace
+
+#endif
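
A short, hypothetical example of calling ComputeEditDistance on two std::strings (illustrative only; the wrapper name stringDistance is invented):

    #include "llvm/ADT/ArrayRef.h"
    #include "llvm/ADT/edit_distance.h"
    #include <string>

    // Levenshtein distance with single-operation replacements allowed and
    // no early-exit bound (MaxEditDistance = 0).
    unsigned stringDistance(const std::string &A, const std::string &B) {
      return llvm::ComputeEditDistance(llvm::ArrayRef<char>(A.data(), A.size()),
                                       llvm::ArrayRef<char>(B.data(), B.size()));
    }
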
diff --git a/ext/include/llvm/ADT/ilist.h b/ext/include/llvm/ADT/ilist.h
new file mode 100644
index 0000000..3044a6c
--- /dev/null
+++ b/ext/include/llvm/ADT/ilist.h
@@ -0,0 +1,800 @@
+//==-- llvm/ADT/ilist.h - Intrusive Linked List Template ---------*- C++ -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines classes to implement an intrusive doubly linked list class
+// (i.e. each node of the list must contain a next and previous field for the
+// list).
+//
+// The ilist_traits trait class is used to gain access to the next and previous
+// fields of the node type that the list is instantiated with. If it is not
+// specialized, the list defaults to using the getPrev(), getNext() method calls
+// to get the next and previous pointers.
+//
+// The ilist class itself should be a plug-in replacement for std::list, assuming
+// that the nodes contain next/prev pointers. This list replacement does not
+// provide a constant time size() method, so be careful to use empty() when you
+// really want to know if it's empty.
+//
+// The ilist class is implemented by allocating a 'tail' node when the list is
+// created (using ilist_traits<>::createSentinel()). This tail node is
+// absolutely required because the user must be able to compute end()-1. Because
+// of this, users of the direct next/prev links will see an extra link on the
+// end of the list, which should be ignored.
+//
+// Requirements for a user of this list:
+//
+// 1. The user must provide {g|s}et{Next|Prev} methods, or specialize
+// ilist_traits to provide an alternate way of getting and setting next and
+// prev links.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_ADT_ILIST_H
+#define LLVM_ADT_ILIST_H
+
+#include "llvm/Support/Compiler.h"
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <iterator>
+
+namespace llvm {
+
+template<typename NodeTy, typename Traits> class iplist;
+template<typename NodeTy> class ilist_iterator;
+
+/// ilist_nextprev_traits - A fragment for template traits for intrusive list
+/// that provides default next/prev implementations for common operations.
+///
+template<typename NodeTy>
+struct ilist_nextprev_traits {
+ static NodeTy *getPrev(NodeTy *N) { return N->getPrev(); }
+ static NodeTy *getNext(NodeTy *N) { return N->getNext(); }
+ static const NodeTy *getPrev(const NodeTy *N) { return N->getPrev(); }
+ static const NodeTy *getNext(const NodeTy *N) { return N->getNext(); }
+
+ static void setPrev(NodeTy *N, NodeTy *Prev) { N->setPrev(Prev); }
+ static void setNext(NodeTy *N, NodeTy *Next) { N->setNext(Next); }
+};
+
+template<typename NodeTy>
+struct ilist_traits;
+
+/// ilist_sentinel_traits - A fragment for template traits for intrusive list
+/// that provides default sentinel implementations for common operations.
+///
+/// ilist_sentinel_traits implements a lazy dynamic sentinel allocation
+/// strategy. The sentinel is stored in the prev field of ilist's Head.
+///
+template<typename NodeTy>
+struct ilist_sentinel_traits {
+ /// createSentinel - create the dynamic sentinel
+ static NodeTy *createSentinel() { return new NodeTy(); }
+
+ /// destroySentinel - deallocate the dynamic sentinel
+ static void destroySentinel(NodeTy *N) { delete N; }
+
+ /// provideInitialHead - when constructing an ilist, provide a starting
+ /// value for its Head
+ /// @return null node to indicate that it needs to be allocated later
+ static NodeTy *provideInitialHead() { return nullptr; }
+
+ /// ensureHead - make sure that Head is either already
+ /// initialized or assigned a fresh sentinel
+ /// @return the sentinel
+ static NodeTy *ensureHead(NodeTy *&Head) {
+ if (!Head) {
+ Head = ilist_traits<NodeTy>::createSentinel();
+ ilist_traits<NodeTy>::noteHead(Head, Head);
+ ilist_traits<NodeTy>::setNext(Head, nullptr);
+ return Head;
+ }
+ return ilist_traits<NodeTy>::getPrev(Head);
+ }
+
+ /// noteHead - stash the sentinel into its default location
+ static void noteHead(NodeTy *NewHead, NodeTy *Sentinel) {
+ ilist_traits<NodeTy>::setPrev(NewHead, Sentinel);
+ }
+};
+
+template <typename NodeTy> class ilist_half_node;
+template <typename NodeTy> class ilist_node;
+
+/// Traits with an embedded ilist_node as a sentinel.
+///
+/// FIXME: The downcast in createSentinel() is UB.
+template <typename NodeTy> struct ilist_embedded_sentinel_traits {
+ /// Get hold of the node that marks the end of the list.
+ NodeTy *createSentinel() const {
+ // Since i(p)lists always publicly derive from their corresponding traits,
+ // placing a data member in this class will augment the i(p)list. But since
+ // the NodeTy is expected to publicly derive from ilist_node<NodeTy>,
+ // there is a legal viable downcast from it to NodeTy. We use this trick to
+ // superimpose an i(p)list with a "ghostly" NodeTy, which becomes the
+ // sentinel. Dereferencing the sentinel is forbidden (save the
+ // ilist_node<NodeTy>), so no one will ever notice the superposition.
+ return static_cast<NodeTy *>(&Sentinel);
+ }
+ static void destroySentinel(NodeTy *) {}
+
+ NodeTy *provideInitialHead() const { return createSentinel(); }
+ NodeTy *ensureHead(NodeTy *) const { return createSentinel(); }
+ static void noteHead(NodeTy *, NodeTy *) {}
+
+private:
+ mutable ilist_node<NodeTy> Sentinel;
+};
+
+/// Trait with an embedded ilist_half_node as a sentinel.
+///
+/// FIXME: The downcast in createSentinel() is UB.
+template <typename NodeTy> struct ilist_half_embedded_sentinel_traits {
+ /// Get hold of the node that marks the end of the list.
+ NodeTy *createSentinel() const {
+ // See comment in ilist_embedded_sentinel_traits::createSentinel().
+ return static_cast<NodeTy *>(&Sentinel);
+ }
+ static void destroySentinel(NodeTy *) {}
+
+ NodeTy *provideInitialHead() const { return createSentinel(); }
+ NodeTy *ensureHead(NodeTy *) const { return createSentinel(); }
+ static void noteHead(NodeTy *, NodeTy *) {}
+
+private:
+ mutable ilist_half_node<NodeTy> Sentinel;
+};
+
+/// ilist_node_traits - A fragment for template traits for intrusive list
+/// that provides default node related operations.
+///
+template<typename NodeTy>
+struct ilist_node_traits {
+ static NodeTy *createNode(const NodeTy &V) { return new NodeTy(V); }
+ static void deleteNode(NodeTy *V) { delete V; }
+
+ void addNodeToList(NodeTy *) {}
+ void removeNodeFromList(NodeTy *) {}
+ void transferNodesFromList(ilist_node_traits & /*SrcTraits*/,
+ ilist_iterator<NodeTy> /*first*/,
+ ilist_iterator<NodeTy> /*last*/) {}
+};
+
+/// ilist_default_traits - Default template traits for intrusive list.
+/// By inheriting from this, you can easily use default implementations
+/// for all common operations.
+///
+template<typename NodeTy>
+struct ilist_default_traits : public ilist_nextprev_traits<NodeTy>,
+ public ilist_sentinel_traits<NodeTy>,
+ public ilist_node_traits<NodeTy> {
+};
+
+// Template traits for intrusive list. By specializing this template class, you
+// can change what next/prev fields are used to store the links...
+template<typename NodeTy>
+struct ilist_traits : public ilist_default_traits<NodeTy> {};
+
+// Const traits are the same as nonconst traits...
+template<typename Ty>
+struct ilist_traits<const Ty> : public ilist_traits<Ty> {};
+
+//===----------------------------------------------------------------------===//
+// ilist_iterator<Node> - Iterator for intrusive list.
+//
+template<typename NodeTy>
+class ilist_iterator
+ : public std::iterator<std::bidirectional_iterator_tag, NodeTy, ptrdiff_t> {
+
+public:
+ typedef ilist_traits<NodeTy> Traits;
+ typedef std::iterator<std::bidirectional_iterator_tag,
+ NodeTy, ptrdiff_t> super;
+
+ typedef typename super::value_type value_type;
+ typedef typename super::difference_type difference_type;
+ typedef typename super::pointer pointer;
+ typedef typename super::reference reference;
+private:
+ pointer NodePtr;
+
+ // ilist_iterator is not a random-access iterator, but it has an
+ // implicit conversion to pointer-type, which is. Declare (but
+ // don't define) these functions as private to help catch
+ // accidental misuse.
+ void operator[](difference_type) const;
+ void operator+(difference_type) const;
+ void operator-(difference_type) const;
+ void operator+=(difference_type) const;
+ void operator-=(difference_type) const;
+ template<class T> void operator<(T) const;
+ template<class T> void operator<=(T) const;
+ template<class T> void operator>(T) const;
+ template<class T> void operator>=(T) const;
+ template<class T> void operator-(T) const;
+public:
+
+ explicit ilist_iterator(pointer NP) : NodePtr(NP) {}
+ explicit ilist_iterator(reference NR) : NodePtr(&NR) {}
+ ilist_iterator() : NodePtr(nullptr) {}
+
+ // This is templated so that we can allow constructing a const iterator from
+ // a nonconst iterator...
+ template<class node_ty>
+ ilist_iterator(const ilist_iterator<node_ty> &RHS)
+ : NodePtr(RHS.getNodePtrUnchecked()) {}
+
+ // This is templated so that we can allow assigning to a const iterator from
+ // a nonconst iterator...
+ template<class node_ty>
+ const ilist_iterator &operator=(const ilist_iterator<node_ty> &RHS) {
+ NodePtr = RHS.getNodePtrUnchecked();
+ return *this;
+ }
+
+ void reset(pointer NP) { NodePtr = NP; }
+
+ // Accessors...
+ explicit operator pointer() const {
+ return NodePtr;
+ }
+
+ reference operator*() const {
+ return *NodePtr;
+ }
+ pointer operator->() const { return &operator*(); }
+
+ // Comparison operators
+ template <class Y> bool operator==(const ilist_iterator<Y> &RHS) const {
+ return NodePtr == RHS.getNodePtrUnchecked();
+ }
+ template <class Y> bool operator!=(const ilist_iterator<Y> &RHS) const {
+ return NodePtr != RHS.getNodePtrUnchecked();
+ }
+
+ // Increment and decrement operators...
+ ilist_iterator &operator--() { // predecrement - Back up
+ NodePtr = Traits::getPrev(NodePtr);
+ assert(NodePtr && "--'d off the beginning of an ilist!");
+ return *this;
+ }
+ ilist_iterator &operator++() { // preincrement - Advance
+ NodePtr = Traits::getNext(NodePtr);
+ return *this;
+ }
+ ilist_iterator operator--(int) { // postdecrement operators...
+ ilist_iterator tmp = *this;
+ --*this;
+ return tmp;
+ }
+ ilist_iterator operator++(int) { // postincrement operators...
+ ilist_iterator tmp = *this;
+ ++*this;
+ return tmp;
+ }
+
+ // Internal interface, do not use...
+ pointer getNodePtrUnchecked() const { return NodePtr; }
+};
+
+// These are to catch errors when people try to use them as random access
+// iterators.
+template<typename T>
+void operator-(int, ilist_iterator<T>) = delete;
+template<typename T>
+void operator-(ilist_iterator<T>,int) = delete;
+
+template<typename T>
+void operator+(int, ilist_iterator<T>) = delete;
+template<typename T>
+void operator+(ilist_iterator<T>,int) = delete;
+
+// operator!=/operator== - Allow mixed comparisons without dereferencing
+// the iterator, which could very likely be pointing to end().
+template<typename T>
+bool operator!=(const T* LHS, const ilist_iterator<const T> &RHS) {
+ return LHS != RHS.getNodePtrUnchecked();
+}
+template<typename T>
+bool operator==(const T* LHS, const ilist_iterator<const T> &RHS) {
+ return LHS == RHS.getNodePtrUnchecked();
+}
+template<typename T>
+bool operator!=(T* LHS, const ilist_iterator<T> &RHS) {
+ return LHS != RHS.getNodePtrUnchecked();
+}
+template<typename T>
+bool operator==(T* LHS, const ilist_iterator<T> &RHS) {
+ return LHS == RHS.getNodePtrUnchecked();
+}
+
+
+// Allow ilist_iterators to convert into pointers to a node automatically when
+// used by the dyn_cast, cast, isa mechanisms...
+
+template<typename From> struct simplify_type;
+
+template<typename NodeTy> struct simplify_type<ilist_iterator<NodeTy> > {
+ typedef NodeTy* SimpleType;
+
+ static SimpleType getSimplifiedValue(ilist_iterator<NodeTy> &Node) {
+ return &*Node;
+ }
+};
+template<typename NodeTy> struct simplify_type<const ilist_iterator<NodeTy> > {
+ typedef /*const*/ NodeTy* SimpleType;
+
+ static SimpleType getSimplifiedValue(const ilist_iterator<NodeTy> &Node) {
+ return &*Node;
+ }
+};
+
+
+//===----------------------------------------------------------------------===//
+//
+/// iplist - The subset of list functionality that can safely be used on nodes
+/// of polymorphic types, i.e. a heterogeneous list with a common base class that
+/// holds the next/prev pointers. The only state of the list itself is a single
+/// pointer to the head of the list.
+///
+/// This list can be in one of three interesting states:
+/// 1. The list may be completely unconstructed. In this case, the head
+/// pointer is null. When in this form, any query for an iterator (e.g.
+/// begin() or end()) causes the list to transparently change to state #2.
+/// 2. The list may be empty, but contain a sentinel for the end iterator. This
+/// sentinel is created by the Traits::createSentinel method and is a link
+/// in the list. When the list is empty, the pointer in the iplist points
+/// to the sentinel. Once the sentinel is constructed, it
+/// is not destroyed until the list is.
+/// 3. The list may contain actual objects in it, which are stored as a doubly
+/// linked list of nodes. One invariant of the list is that the predecessor
+/// of the first node in the list always points to the last node in the list,
+/// and the successor pointer for the sentinel (which always stays at the
+/// end of the list) is always null.
+///
+template<typename NodeTy, typename Traits=ilist_traits<NodeTy> >
+class iplist : public Traits {
+ mutable NodeTy *Head;
+
+ // Use the prev node pointer of 'head' as the tail pointer. This is really a
+ // circularly linked list where we snip the 'next' link from the sentinel node
+ // back to the first node in the list (to preserve assertions about going off
+ // the end of the list).
+ NodeTy *getTail() { return this->ensureHead(Head); }
+ const NodeTy *getTail() const { return this->ensureHead(Head); }
+ void setTail(NodeTy *N) const { this->noteHead(Head, N); }
+
+ /// CreateLazySentinel - This method verifies whether the sentinel for the
+ /// list has been created and lazily makes it if not.
+ void CreateLazySentinel() const {
+ this->ensureHead(Head);
+ }
+
+ static bool op_less(NodeTy &L, NodeTy &R) { return L < R; }
+ static bool op_equal(NodeTy &L, NodeTy &R) { return L == R; }
+
+ // No fundamental reason why iplist can't be copyable, but the default
+ // copy/copy-assign won't do.
+ iplist(const iplist &) = delete;
+ void operator=(const iplist &) = delete;
+
+public:
+ typedef NodeTy *pointer;
+ typedef const NodeTy *const_pointer;
+ typedef NodeTy &reference;
+ typedef const NodeTy &const_reference;
+ typedef NodeTy value_type;
+ typedef ilist_iterator<NodeTy> iterator;
+ typedef ilist_iterator<const NodeTy> const_iterator;
+ typedef size_t size_type;
+ typedef ptrdiff_t difference_type;
+ typedef std::reverse_iterator<const_iterator> const_reverse_iterator;
+ typedef std::reverse_iterator<iterator> reverse_iterator;
+
+ iplist() : Head(this->provideInitialHead()) {}
+ ~iplist() {
+ if (!Head) return;
+ clear();
+ Traits::destroySentinel(getTail());
+ }
+
+ // Iterator creation methods.
+ iterator begin() {
+ CreateLazySentinel();
+ return iterator(Head);
+ }
+ const_iterator begin() const {
+ CreateLazySentinel();
+ return const_iterator(Head);
+ }
+ iterator end() {
+ CreateLazySentinel();
+ return iterator(getTail());
+ }
+ const_iterator end() const {
+ CreateLazySentinel();
+ return const_iterator(getTail());
+ }
+
+ // reverse iterator creation methods.
+ reverse_iterator rbegin() { return reverse_iterator(end()); }
+ const_reverse_iterator rbegin() const{ return const_reverse_iterator(end()); }
+ reverse_iterator rend() { return reverse_iterator(begin()); }
+ const_reverse_iterator rend() const { return const_reverse_iterator(begin());}
+
+
+ // Miscellaneous inspection routines.
+ size_type max_size() const { return size_type(-1); }
+ bool LLVM_ATTRIBUTE_UNUSED_RESULT empty() const {
+ return !Head || Head == getTail();
+ }
+
+ // Front and back accessor functions...
+ reference front() {
+ assert(!empty() && "Called front() on empty list!");
+ return *Head;
+ }
+ const_reference front() const {
+ assert(!empty() && "Called front() on empty list!");
+ return *Head;
+ }
+ reference back() {
+ assert(!empty() && "Called back() on empty list!");
+ return *this->getPrev(getTail());
+ }
+ const_reference back() const {
+ assert(!empty() && "Called back() on empty list!");
+ return *this->getPrev(getTail());
+ }
+
+ void swap(iplist &RHS) {
+ assert(0 && "Swap does not use list traits callback correctly yet!");
+ std::swap(Head, RHS.Head);
+ }
+
+ iterator insert(iterator where, NodeTy *New) {
+ NodeTy *CurNode = where.getNodePtrUnchecked();
+ NodeTy *PrevNode = this->getPrev(CurNode);
+ this->setNext(New, CurNode);
+ this->setPrev(New, PrevNode);
+
+ if (CurNode != Head) // Is PrevNode off the beginning of the list?
+ this->setNext(PrevNode, New);
+ else
+ Head = New;
+ this->setPrev(CurNode, New);
+
+ this->addNodeToList(New); // Notify traits that we added a node...
+ return iterator(New);
+ }
+
+ iterator insertAfter(iterator where, NodeTy *New) {
+ if (empty())
+ return insert(begin(), New);
+ else
+ return insert(++where, New);
+ }
+
+ NodeTy *remove(iterator &IT) {
+ assert(IT != end() && "Cannot remove end of list!");
+ NodeTy *Node = &*IT;
+ NodeTy *NextNode = this->getNext(Node);
+ NodeTy *PrevNode = this->getPrev(Node);
+
+ if (Node != Head) // Is PrevNode off the beginning of the list?
+ this->setNext(PrevNode, NextNode);
+ else
+ Head = NextNode;
+ this->setPrev(NextNode, PrevNode);
+ IT.reset(NextNode);
+ this->removeNodeFromList(Node); // Notify traits that we removed a node...
+
+ // Set the next/prev pointers of the current node to null. This isn't
+ // strictly required, but this catches errors where a node is removed from
+ // an ilist (and potentially deleted) with iterators still pointing at it.
+ // When those iterators are incremented or decremented, they will assert on
+ // the null next/prev pointer instead of "usually working".
+ this->setNext(Node, nullptr);
+ this->setPrev(Node, nullptr);
+ return Node;
+ }
+
+ NodeTy *remove(const iterator &IT) {
+ iterator MutIt = IT;
+ return remove(MutIt);
+ }
+
+ NodeTy *remove(NodeTy *IT) { return remove(iterator(IT)); }
+ NodeTy *remove(NodeTy &IT) { return remove(iterator(IT)); }
+
+ // erase - remove a node from the controlled sequence... and delete it.
+ iterator erase(iterator where) {
+ this->deleteNode(remove(where));
+ return where;
+ }
+
+ iterator erase(NodeTy *IT) { return erase(iterator(IT)); }
+ iterator erase(NodeTy &IT) { return erase(iterator(IT)); }
+
+ /// Remove all nodes from the list like clear(), but do not call
+ /// removeNodeFromList() or deleteNode().
+ ///
+ /// This should only be used immediately before freeing nodes in bulk to
+ /// avoid traversing the list and bringing all the nodes into cache.
+ void clearAndLeakNodesUnsafely() {
+ if (Head) {
+ Head = getTail();
+ this->setPrev(Head, Head);
+ }
+ }
+
+private:
+ // transfer - The heart of the splice function. Move linked list nodes from
+ // [first, last) into position.
+ //
+ void transfer(iterator position, iplist &L2, iterator first, iterator last) {
+ assert(first != last && "Should be checked by callers");
+ // Position cannot be contained in the range to be transferred.
+ // Check for the most common mistake.
+ assert(position != first &&
+ "Insertion point can't be one of the transferred nodes");
+
+ if (position != last) {
+ // Note: we have to be careful about the case when we move the first node
+ // in the list. This node is the list sentinel node and we can't move it.
+ NodeTy *ThisSentinel = getTail();
+ setTail(nullptr);
+ NodeTy *L2Sentinel = L2.getTail();
+ L2.setTail(nullptr);
+
+ // Remove [first, last) from its old position.
+ NodeTy *First = &*first, *Prev = this->getPrev(First);
+ NodeTy *Next = last.getNodePtrUnchecked(), *Last = this->getPrev(Next);
+ if (Prev)
+ this->setNext(Prev, Next);
+ else
+ L2.Head = Next;
+ this->setPrev(Next, Prev);
+
+ // Splice [first, last) into its new position.
+ NodeTy *PosNext = position.getNodePtrUnchecked();
+ NodeTy *PosPrev = this->getPrev(PosNext);
+
+ // Fix head of list...
+ if (PosPrev)
+ this->setNext(PosPrev, First);
+ else
+ Head = First;
+ this->setPrev(First, PosPrev);
+
+ // Fix end of list...
+ this->setNext(Last, PosNext);
+ this->setPrev(PosNext, Last);
+
+ this->transferNodesFromList(L2, iterator(First), iterator(PosNext));
+
+ // Now that everything is set, restore the pointers to the list sentinels.
+ L2.setTail(L2Sentinel);
+ setTail(ThisSentinel);
+ }
+ }
+
+public:
+
+ //===----------------------------------------------------------------------===
+ // Functionality derived from other functions defined above...
+ //
+
+ size_type LLVM_ATTRIBUTE_UNUSED_RESULT size() const {
+ if (!Head) return 0; // Don't require construction of sentinel if empty.
+ return std::distance(begin(), end());
+ }
+
+ iterator erase(iterator first, iterator last) {
+ while (first != last)
+ first = erase(first);
+ return last;
+ }
+
+ void clear() { if (Head) erase(begin(), end()); }
+
+ // Front and back inserters...
+ void push_front(NodeTy *val) { insert(begin(), val); }
+ void push_back(NodeTy *val) { insert(end(), val); }
+ void pop_front() {
+ assert(!empty() && "pop_front() on empty list!");
+ erase(begin());
+ }
+ void pop_back() {
+ assert(!empty() && "pop_back() on empty list!");
+ iterator t = end(); erase(--t);
+ }
+
+ // Special forms of insert...
+ template<class InIt> void insert(iterator where, InIt first, InIt last) {
+ for (; first != last; ++first) insert(where, *first);
+ }
+
+ // Splice members - defined in terms of transfer...
+ void splice(iterator where, iplist &L2) {
+ if (!L2.empty())
+ transfer(where, L2, L2.begin(), L2.end());
+ }
+ void splice(iterator where, iplist &L2, iterator first) {
+ iterator last = first; ++last;
+ if (where == first || where == last) return; // No change
+ transfer(where, L2, first, last);
+ }
+ void splice(iterator where, iplist &L2, iterator first, iterator last) {
+ if (first != last) transfer(where, L2, first, last);
+ }
+ void splice(iterator where, iplist &L2, NodeTy &N) {
+ splice(where, L2, iterator(N));
+ }
+ void splice(iterator where, iplist &L2, NodeTy *N) {
+ splice(where, L2, iterator(N));
+ }
+
+ template <class Compare>
+ void merge(iplist &Right, Compare comp) {
+ if (this == &Right)
+ return;
+ iterator First1 = begin(), Last1 = end();
+ iterator First2 = Right.begin(), Last2 = Right.end();
+ while (First1 != Last1 && First2 != Last2) {
+ if (comp(*First2, *First1)) {
+ iterator Next = First2;
+ transfer(First1, Right, First2, ++Next);
+ First2 = Next;
+ } else {
+ ++First1;
+ }
+ }
+ if (First2 != Last2)
+ transfer(Last1, Right, First2, Last2);
+ }
+ void merge(iplist &Right) { return merge(Right, op_less); }
+
+ template <class Compare>
+ void sort(Compare comp) {
+ // The list is empty, vacuously sorted.
+ if (empty())
+ return;
+ // The list has a single element, vacuously sorted.
+ if (std::next(begin()) == end())
+ return;
+ // Find the split point for the list.
+ iterator Center = begin(), End = begin();
+ while (End != end() && std::next(End) != end()) {
+ Center = std::next(Center);
+ End = std::next(std::next(End));
+ }
+ // Split the list into two.
+ iplist RightHalf;
+ RightHalf.splice(RightHalf.begin(), *this, Center, end());
+
+ // Sort the two sublists.
+ sort(comp);
+ RightHalf.sort(comp);
+
+ // Merge the two sublists back together.
+ merge(RightHalf, comp);
+ }
+ void sort() { sort(op_less); }
+
+ /// \brief Get the previous node, or \c nullptr for the list head.
+ NodeTy *getPrevNode(NodeTy &N) const {
+ auto I = N.getIterator();
+ if (I == begin())
+ return nullptr;
+ return &*std::prev(I);
+ }
+ /// \brief Get the previous node, or \c nullptr for the list head.
+ const NodeTy *getPrevNode(const NodeTy &N) const {
+ return getPrevNode(const_cast<NodeTy &>(N));
+ }
+
+ /// \brief Get the next node, or \c nullptr for the list tail.
+ NodeTy *getNextNode(NodeTy &N) const {
+ auto Next = std::next(N.getIterator());
+ if (Next == end())
+ return nullptr;
+ return &*Next;
+ }
+ /// \brief Get the next node, or \c nullptr for the list tail.
+ const NodeTy *getNextNode(const NodeTy &N) const {
+ return getNextNode(const_cast<NodeTy &>(N));
+ }
+};
+
+
+template<typename NodeTy>
+struct ilist : public iplist<NodeTy> {
+ typedef typename iplist<NodeTy>::size_type size_type;
+ typedef typename iplist<NodeTy>::iterator iterator;
+
+ ilist() {}
+ ilist(const ilist &right) {
+ insert(this->begin(), right.begin(), right.end());
+ }
+ explicit ilist(size_type count) {
+ insert(this->begin(), count, NodeTy());
+ }
+ ilist(size_type count, const NodeTy &val) {
+ insert(this->begin(), count, val);
+ }
+ template<class InIt> ilist(InIt first, InIt last) {
+ insert(this->begin(), first, last);
+ }
+
+ // bring hidden functions into scope
+ using iplist<NodeTy>::insert;
+ using iplist<NodeTy>::push_front;
+ using iplist<NodeTy>::push_back;
+
+ // Main implementation here - Insert for a node passed by value...
+ iterator insert(iterator where, const NodeTy &val) {
+ return insert(where, this->createNode(val));
+ }
+
+
+ // Front and back inserters...
+ void push_front(const NodeTy &val) { insert(this->begin(), val); }
+ void push_back(const NodeTy &val) { insert(this->end(), val); }
+
+ void insert(iterator where, size_type count, const NodeTy &val) {
+ for (; count != 0; --count) insert(where, val);
+ }
+
+ // Assign special forms...
+ void assign(size_type count, const NodeTy &val) {
+ iterator I = this->begin();
+ for (; I != this->end() && count != 0; ++I, --count)
+ *I = val;
+ if (count != 0)
+ insert(this->end(), count, val);
+ else
+ erase(I, this->end());
+ }
+ template<class InIt> void assign(InIt first1, InIt last1) {
+ iterator first2 = this->begin(), last2 = this->end();
+ for ( ; first1 != last1 && first2 != last2; ++first1, ++first2)
+ *first1 = *first2;
+ if (first2 == last2)
+ erase(first1, last1);
+ else
+ insert(last1, first2, last2);
+ }
+
+
+ // Resize members...
+ void resize(size_type newsize, NodeTy val) {
+ iterator i = this->begin();
+ size_type len = 0;
+ for ( ; i != this->end() && len < newsize; ++i, ++len) /* empty*/ ;
+
+ if (len == newsize)
+ erase(i, this->end());
+ else // i == end()
+ insert(this->end(), newsize - len, val);
+ }
+ void resize(size_type newsize) { resize(newsize, NodeTy()); }
+};
+
+} // End llvm namespace
+
+namespace std {
+ // Ensure that swap uses the fast list swap...
+ template<class Ty>
+ void swap(llvm::iplist<Ty> &Left, llvm::iplist<Ty> &Right) {
+ Left.swap(Right);
+ }
+} // End 'std' extensions...
+
+#endif // LLVM_ADT_ILIST_H
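
A minimal sketch of requirement (1) above: a node type that supplies its own get/set next/prev accessors and is managed by an iplist with the default traits (the names Packet and demo are invented; the default sentinel traits require a default constructor):

    #include "llvm/ADT/ilist.h"

    struct Packet {
      Packet *Prev = nullptr, *Next = nullptr;
      int Payload = 0;

      Packet *getPrev() { return Prev; }
      Packet *getNext() { return Next; }
      const Packet *getPrev() const { return Prev; }
      const Packet *getNext() const { return Next; }
      void setPrev(Packet *P) { Prev = P; }
      void setNext(Packet *N) { Next = N; }
    };

    void demo() {
      llvm::iplist<Packet> Queue;      // owns the nodes it holds
      Queue.push_back(new Packet());   // caller allocates, list takes ownership
      Queue.front().Payload = 42;
      Queue.clear();                   // default node traits delete removed nodes
    }
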
diff --git a/ext/include/llvm/ADT/ilist_node.h b/ext/include/llvm/ADT/ilist_node.h
new file mode 100644
index 0000000..7e5a0e0
--- /dev/null
+++ b/ext/include/llvm/ADT/ilist_node.h
@@ -0,0 +1,123 @@
+//==-- llvm/ADT/ilist_node.h - Intrusive Linked List Helper ------*- C++ -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the ilist_node class template, which is a convenient
+// base class for creating classes that can be used with ilists.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_ADT_ILIST_NODE_H
+#define LLVM_ADT_ILIST_NODE_H
+
+namespace llvm {
+
+template<typename NodeTy>
+struct ilist_traits;
+template <typename NodeTy> struct ilist_embedded_sentinel_traits;
+template <typename NodeTy> struct ilist_half_embedded_sentinel_traits;
+
+/// ilist_half_node - Base class that provides prev services for sentinels.
+///
+template<typename NodeTy>
+class ilist_half_node {
+ friend struct ilist_traits<NodeTy>;
+ friend struct ilist_half_embedded_sentinel_traits<NodeTy>;
+ NodeTy *Prev;
+protected:
+ NodeTy *getPrev() { return Prev; }
+ const NodeTy *getPrev() const { return Prev; }
+ void setPrev(NodeTy *P) { Prev = P; }
+ ilist_half_node() : Prev(nullptr) {}
+};
+
+template<typename NodeTy>
+struct ilist_nextprev_traits;
+
+template <typename NodeTy> class ilist_iterator;
+
+/// ilist_node - Base class that provides next/prev services for nodes
+/// that use ilist_nextprev_traits or ilist_default_traits.
+///
+template<typename NodeTy>
+class ilist_node : private ilist_half_node<NodeTy> {
+ friend struct ilist_nextprev_traits<NodeTy>;
+ friend struct ilist_traits<NodeTy>;
+ friend struct ilist_half_embedded_sentinel_traits<NodeTy>;
+ friend struct ilist_embedded_sentinel_traits<NodeTy>;
+ NodeTy *Next;
+ NodeTy *getNext() { return Next; }
+ const NodeTy *getNext() const { return Next; }
+ void setNext(NodeTy *N) { Next = N; }
+protected:
+ ilist_node() : Next(nullptr) {}
+
+public:
+ ilist_iterator<NodeTy> getIterator() {
+ // FIXME: Stop downcasting to create the iterator (potential UB).
+ return ilist_iterator<NodeTy>(static_cast<NodeTy *>(this));
+ }
+ ilist_iterator<const NodeTy> getIterator() const {
+ // FIXME: Stop downcasting to create the iterator (potential UB).
+ return ilist_iterator<const NodeTy>(static_cast<const NodeTy *>(this));
+ }
+};
+
+/// An ilist node that can access its parent list.
+///
+/// Requires \c NodeTy to have \a getParent() to find the parent node, and the
+/// \c ParentTy to have \a getSublistAccess() to get a reference to the list.
+template <typename NodeTy, typename ParentTy>
+class ilist_node_with_parent : public ilist_node<NodeTy> {
+protected:
+ ilist_node_with_parent() = default;
+
+private:
+ /// Forward to NodeTy::getParent().
+ ///
+ /// Note: do not use the name "getParent()". We want a compile error
+ /// (instead of recursion) when the subclass fails to implement \a
+ /// getParent().
+ const ParentTy *getNodeParent() const {
+ return static_cast<const NodeTy *>(this)->getParent();
+ }
+
+public:
+ /// @name Adjacent Node Accessors
+ /// @{
+ /// \brief Get the previous node, or \c nullptr for the list head.
+ NodeTy *getPrevNode() {
+ // Should be separated to a reused function, but then we couldn't use auto
+ // (and would need the type of the list).
+ const auto &List =
+ getNodeParent()->*(ParentTy::getSublistAccess((NodeTy *)nullptr));
+ return List.getPrevNode(*static_cast<NodeTy *>(this));
+ }
+ /// \brief Get the previous node, or \c nullptr for the list head.
+ const NodeTy *getPrevNode() const {
+ return const_cast<ilist_node_with_parent *>(this)->getPrevNode();
+ }
+
+ /// \brief Get the next node, or \c nullptr for the list tail.
+ NodeTy *getNextNode() {
+ // Should be separated to a reused function, but then we couldn't use auto
+ // (and would need the type of the list).
+ const auto &List =
+ getNodeParent()->*(ParentTy::getSublistAccess((NodeTy *)nullptr));
+ return List.getNextNode(*static_cast<NodeTy *>(this));
+ }
+ /// \brief Get the next node, or \c nullptr for the list tail.
+ const NodeTy *getNextNode() const {
+ return const_cast<ilist_node_with_parent *>(this)->getNextNode();
+ }
+ /// @}
+};
+
+} // End llvm namespace
+
+#endif
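
Deriving from ilist_node<T> is the usual alternative to hand-written accessors; a hedged sketch under the default traits (the names Instruction, Block and demo are invented and unrelated to any real class):

    #include "llvm/ADT/ilist.h"
    #include "llvm/ADT/ilist_node.h"

    // ilist_node<T> supplies the next/prev bookkeeping; the default traits
    // can still allocate their dynamic sentinel because a default
    // constructor is available.
    struct Instruction : llvm::ilist_node<Instruction> {
      int Opcode = 0;
    };

    void demo() {
      llvm::iplist<Instruction> Block;
      Block.push_back(new Instruction());
      Block.push_back(new Instruction());
      for (Instruction &I : Block)   // iterates in insertion order
        (void)I.Opcode;
      Block.clear();                 // nodes are deleted by the default traits
    }
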
diff --git a/ext/include/llvm/ADT/iterator.h b/ext/include/llvm/ADT/iterator.h
new file mode 100644
index 0000000..c307928
--- /dev/null
+++ b/ext/include/llvm/ADT/iterator.h
@@ -0,0 +1,246 @@
+//===- iterator.h - Utilities for using and defining iterators --*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_ADT_ITERATOR_H
+#define LLVM_ADT_ITERATOR_H
+
+#include <cstddef>
+#include <iterator>
+
+namespace llvm {
+
+/// \brief CRTP base class which implements the entire standard iterator facade
+/// in terms of a minimal subset of the interface.
+///
+/// Use this when it is reasonable to implement most of the iterator
+/// functionality in terms of a core subset. If you need special behavior or
+/// there are performance implications for this, you may want to override the
+/// relevant members instead.
+///
+/// Note, one abstraction that this does *not* provide is implementing
+/// subtraction in terms of addition by negating the difference. Negation isn't
+/// always information preserving, and I can see very reasonable iterator
+/// designs where this doesn't work well. It doesn't really force much added
+/// boilerplate anyway.
+///
+/// Another abstraction that this doesn't provide is implementing increment in
+/// terms of addition of one. These aren't equivalent for all iterator
+/// categories, and respecting that adds a lot of complexity for little gain.
+template <typename DerivedT, typename IteratorCategoryT, typename T,
+ typename DifferenceTypeT = std::ptrdiff_t, typename PointerT = T *,
+ typename ReferenceT = T &>
+class iterator_facade_base
+ : public std::iterator<IteratorCategoryT, T, DifferenceTypeT, PointerT,
+ ReferenceT> {
+protected:
+ enum {
+ IsRandomAccess =
+ std::is_base_of<std::random_access_iterator_tag, IteratorCategoryT>::value,
+ IsBidirectional =
+ std::is_base_of<std::bidirectional_iterator_tag, IteratorCategoryT>::value,
+ };
+
+public:
+ DerivedT operator+(DifferenceTypeT n) const {
+ static_assert(
+ IsRandomAccess,
+ "The '+' operator is only defined for random access iterators.");
+ DerivedT tmp = *static_cast<const DerivedT *>(this);
+ tmp += n;
+ return tmp;
+ }
+ friend DerivedT operator+(DifferenceTypeT n, const DerivedT &i) {
+ static_assert(
+ IsRandomAccess,
+ "The '+' operator is only defined for random access iterators.");
+ return i + n;
+ }
+ DerivedT operator-(DifferenceTypeT n) const {
+ static_assert(
+ IsRandomAccess,
+ "The '-' operator is only defined for random access iterators.");
+ DerivedT tmp = *static_cast<const DerivedT *>(this);
+ tmp -= n;
+ return tmp;
+ }
+
+ DerivedT &operator++() {
+ return static_cast<DerivedT *>(this)->operator+=(1);
+ }
+ DerivedT operator++(int) {
+ DerivedT tmp = *static_cast<DerivedT *>(this);
+ ++*static_cast<DerivedT *>(this);
+ return tmp;
+ }
+ DerivedT &operator--() {
+ static_assert(
+ IsBidirectional,
+ "The decrement operator is only defined for bidirectional iterators.");
+ return static_cast<DerivedT *>(this)->operator-=(1);
+ }
+ DerivedT operator--(int) {
+ static_assert(
+ IsBidirectional,
+ "The decrement operator is only defined for bidirectional iterators.");
+ DerivedT tmp = *static_cast<DerivedT *>(this);
+ --*static_cast<DerivedT *>(this);
+ return tmp;
+ }
+
+ bool operator!=(const DerivedT &RHS) const {
+ return !static_cast<const DerivedT *>(this)->operator==(RHS);
+ }
+
+ bool operator>(const DerivedT &RHS) const {
+ static_assert(
+ IsRandomAccess,
+ "Relational operators are only defined for random access iterators.");
+ return !static_cast<const DerivedT *>(this)->operator<(RHS) &&
+ !static_cast<const DerivedT *>(this)->operator==(RHS);
+ }
+ bool operator<=(const DerivedT &RHS) const {
+ static_assert(
+ IsRandomAccess,
+ "Relational operators are only defined for random access iterators.");
+ return !static_cast<const DerivedT *>(this)->operator>(RHS);
+ }
+ bool operator>=(const DerivedT &RHS) const {
+ static_assert(
+ IsRandomAccess,
+ "Relational operators are only defined for random access iterators.");
+ return !static_cast<const DerivedT *>(this)->operator<(RHS);
+ }
+
+ PointerT operator->() const {
+ return &static_cast<const DerivedT *>(this)->operator*();
+ }
+ ReferenceT operator[](DifferenceTypeT n) const {
+ static_assert(IsRandomAccess,
+ "Subscripting is only defined for random access iterators.");
+ return *static_cast<const DerivedT *>(this)->operator+(n);
+ }
+};
+
+/// \brief CRTP base class for adapting an iterator to a different type.
+///
+/// This class can be used through CRTP to adapt one iterator into another.
+/// Typically this is done through providing in the derived class a custom \c
+/// operator* implementation. Other methods can be overridden as well.
+template <
+ typename DerivedT, typename WrappedIteratorT,
+ typename IteratorCategoryT =
+ typename std::iterator_traits<WrappedIteratorT>::iterator_category,
+ typename T = typename std::iterator_traits<WrappedIteratorT>::value_type,
+ typename DifferenceTypeT =
+ typename std::iterator_traits<WrappedIteratorT>::difference_type,
+ typename PointerT = T *, typename ReferenceT = T &,
+ // Don't provide these, they are mostly to act as aliases below.
+ typename WrappedTraitsT = std::iterator_traits<WrappedIteratorT>>
+class iterator_adaptor_base
+ : public iterator_facade_base<DerivedT, IteratorCategoryT, T,
+ DifferenceTypeT, PointerT, ReferenceT> {
+ typedef typename iterator_adaptor_base::iterator_facade_base BaseT;
+
+protected:
+ WrappedIteratorT I;
+
+ iterator_adaptor_base() = default;
+
+ template <typename U>
+ explicit iterator_adaptor_base(
+ U &&u,
+ typename std::enable_if<
+ !std::is_base_of<typename std::remove_cv<
+ typename std::remove_reference<U>::type>::type,
+ DerivedT>::value,
+ int>::type = 0)
+ : I(std::forward<U &&>(u)) {}
+
+ const WrappedIteratorT &wrapped() const { return I; }
+
+public:
+ typedef DifferenceTypeT difference_type;
+
+ DerivedT &operator+=(difference_type n) {
+ static_assert(
+ BaseT::IsRandomAccess,
+ "The '+=' operator is only defined for random access iterators.");
+ I += n;
+ return *static_cast<DerivedT *>(this);
+ }
+ DerivedT &operator-=(difference_type n) {
+ static_assert(
+ BaseT::IsRandomAccess,
+ "The '-=' operator is only defined for random access iterators.");
+ I -= n;
+ return *static_cast<DerivedT *>(this);
+ }
+ using BaseT::operator-;
+ difference_type operator-(const DerivedT &RHS) const {
+ static_assert(
+ BaseT::IsRandomAccess,
+ "The '-' operator is only defined for random access iterators.");
+ return I - RHS.I;
+ }
+
+ // We have to explicitly provide ++ and -- rather than letting the facade
+ // forward to += because WrappedIteratorT might not support +=.
+ using BaseT::operator++;
+ DerivedT &operator++() {
+ ++I;
+ return *static_cast<DerivedT *>(this);
+ }
+ using BaseT::operator--;
+ DerivedT &operator--() {
+ static_assert(
+ BaseT::IsBidirectional,
+ "The decrement operator is only defined for bidirectional iterators.");
+ --I;
+ return *static_cast<DerivedT *>(this);
+ }
+
+ bool operator==(const DerivedT &RHS) const { return I == RHS.I; }
+ bool operator<(const DerivedT &RHS) const {
+ static_assert(
+ BaseT::IsRandomAccess,
+ "Relational operators are only defined for random access iterators.");
+ return I < RHS.I;
+ }
+
+ ReferenceT operator*() const { return *I; }
+};
+
+/// \brief An iterator type that allows iterating over the pointees via some
+/// other iterator.
+///
+/// The typical usage of this is to expose a type that iterates over Ts, but
+/// which is implemented with some iterator over T*s:
+///
+/// \code
+/// typedef pointee_iterator<SmallVectorImpl<T *>::iterator> iterator;
+/// \endcode
+template <typename WrappedIteratorT,
+ typename T = typename std::remove_reference<
+ decltype(**std::declval<WrappedIteratorT>())>::type>
+struct pointee_iterator
+ : iterator_adaptor_base<
+ pointee_iterator<WrappedIteratorT>, WrappedIteratorT,
+ typename std::iterator_traits<WrappedIteratorT>::iterator_category,
+ T> {
+ pointee_iterator() = default;
+ template <typename U>
+ pointee_iterator(U &&u)
+ : pointee_iterator::iterator_adaptor_base(std::forward<U &&>(u)) {}
+
+ T &operator*() const { return **this->I; }
+};
+
+}
+
+#endif
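
The pointee_iterator described above can be exercised with a plain std::vector of pointers; a small sketch (the function name visit is invented):

    #include "llvm/ADT/iterator.h"
    #include <vector>

    // Walks a vector<int *> while exposing int& at each step instead of int*.
    void visit(const std::vector<int *> &Ptrs) {
      typedef llvm::pointee_iterator<std::vector<int *>::const_iterator> PtrIter;
      for (PtrIter I(Ptrs.begin()), E(Ptrs.end()); I != E; ++I)
        (void)*I;   // *I has type int&
    }
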
diff --git a/ext/include/llvm/ADT/iterator_range.h b/ext/include/llvm/ADT/iterator_range.h
new file mode 100644
index 0000000..3dd679b
--- /dev/null
+++ b/ext/include/llvm/ADT/iterator_range.h
@@ -0,0 +1,68 @@
+//===- iterator_range.h - A range adaptor for iterators ---------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This provides a very simple, boring adaptor for a begin and end iterator
+/// into a range type. This should be used to build range views that work well
+/// with range based for loops and range based constructors.
+///
+/// Note that code here follows more standards-based coding conventions as it
+/// is mirroring proposed interfaces for standardization.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_ADT_ITERATOR_RANGE_H
+#define LLVM_ADT_ITERATOR_RANGE_H
+
+#include <utility>
+#include <iterator>
+
+namespace llvm {
+
+/// \brief A range adaptor for a pair of iterators.
+///
+/// This just wraps two iterators into a range-compatible interface. Nothing
+/// fancy at all.
+template <typename IteratorT>
+class iterator_range {
+ IteratorT begin_iterator, end_iterator;
+
+public:
+ //TODO: Add SFINAE to test that the Container's iterators match the range's
+ // iterators.
+ template <typename Container>
+ iterator_range(Container &&c)
+ //TODO: Consider ADL/non-member begin/end calls.
+ : begin_iterator(c.begin()), end_iterator(c.end()) {}
+ iterator_range(IteratorT begin_iterator, IteratorT end_iterator)
+ : begin_iterator(std::move(begin_iterator)),
+ end_iterator(std::move(end_iterator)) {}
+
+ IteratorT begin() const { return begin_iterator; }
+ IteratorT end() const { return end_iterator; }
+};
+
+/// \brief Convenience function for iterating over sub-ranges.
+///
+/// This provides a bit of syntactic sugar to make using sub-ranges
+/// in for loops a bit easier. Analogous to std::make_pair().
+template <class T> iterator_range<T> make_range(T x, T y) {
+ return iterator_range<T>(std::move(x), std::move(y));
+}
+
+template <typename T> iterator_range<T> make_range(std::pair<T, T> p) {
+ return iterator_range<T>(std::move(p.first), std::move(p.second));
+}
+
+template<typename T>
+iterator_range<decltype(begin(std::declval<T>()))> drop_begin(T &&t, int n) {
+ return make_range(std::next(begin(t), n), end(t));
+}
+}
+
+#endif
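
A brief sketch of make_range() feeding a range-based for loop (the function name sum is invented):

    #include "llvm/ADT/iterator_range.h"
    #include <vector>

    // Wraps an iterator pair so it can be consumed by a range-based for loop.
    int sum(const std::vector<int> &V) {
      int Total = 0;
      for (int X : llvm::make_range(V.begin(), V.end()))
        Total += X;
      return Total;
    }
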
diff --git a/ext/include/llvm/Config.h.in b/ext/include/llvm/Config.h.in
new file mode 100644
index 0000000..fc16c22
--- /dev/null
+++ b/ext/include/llvm/Config.h.in
@@ -0,0 +1,31 @@
+#ifndef __LLVM_CONFIG_H__
+#define __LLVM_CONFIG_H__
+
+/* Define to 1 if you have the <dlfcn.h> header file. */
+#cmakedefine HAVE_DLFCN_H ${HAVE_DLFCN_H}
+
+/* Define to 1 if you have the <execinfo.h> header file. */
+#cmakedefine HAVE_EXECINFO_H ${HAVE_EXECINFO_H}
+
+/* Define to 1 if you have the <link.h> header file. */
+#cmakedefine HAVE_LINK_H ${HAVE_LINK_H}
+
+/* Define to 1 if you have the <mach/mach.h> header file. */
+#cmakedefine HAVE_MACH_MACH_H ${HAVE_MACH_MACH_H}
+
+/* Define to 1 if you have the <signal.h> header file. */
+#cmakedefine HAVE_SIGNAL_H ${HAVE_SIGNAL_H}
+
+/* Define to 1 if you have the <sys/stat.h> header file. */
+#cmakedefine HAVE_SYS_STAT_H ${HAVE_SYS_STAT_H}
+
+/* Define to 1 if you have the `backtrace' function. */
+#cmakedefine HAVE_BACKTRACE ${HAVE_BACKTRACE}
+
+/* Define to 1 if you have the <cxxabi.h> header file. */
+#cmakedefine HAVE_CXXABI_H ${HAVE_CXXABI_H}
+
+/* Define as the return type of signal handlers (`int' or `void'). */
+#cmakedefine RETSIGTYPE ${RETSIGTYPE}
+
+#endif
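
Assuming CMake materializes this template as llvm/Config.h (the exact output path is set by the build system and is not shown in this hunk), consumers follow the usual feature-test pattern:

    #include "llvm/Config.h"

    #ifdef HAVE_EXECINFO_H
    #include <execinfo.h>   // backtrace() is only referenced when available
    #endif
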
diff --git a/ext/include/llvm/Support/AlignOf.h b/ext/include/llvm/Support/AlignOf.h
new file mode 100644
index 0000000..e9c2966
--- /dev/null
+++ b/ext/include/llvm/Support/AlignOf.h
@@ -0,0 +1,173 @@
+//===--- AlignOf.h - Portable calculation of type alignment -----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the AlignOf function that computes alignments for
+// arbitrary types.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_SUPPORT_ALIGNOF_H
+#define LLVM_SUPPORT_ALIGNOF_H
+
+#include "llvm/Support/Compiler.h"
+#include <cstddef>
+#include <type_traits>
+
+namespace llvm {
+
+namespace detail {
+
+// For everything other than an abstract class we can calculate alignment by
+// building a class with a single character and a member of the given type.
+template <typename T, bool = std::is_abstract<T>::value>
+struct AlignmentCalcImpl {
+ char x;
+ T t;
+private:
+ AlignmentCalcImpl() {} // Never instantiate.
+};
+
+// Abstract base class helper, this will have the minimal alignment and size
+// for any abstract class. We don't even define its destructor because this
+// type should never be used in a way that requires it.
+struct AlignmentCalcImplBase {
+ virtual ~AlignmentCalcImplBase() = 0;
+};
+
+// When we have an abstract class type, specialize the alignment computation
+// engine to create another abstract class that derives from both an empty
+// abstract base class and the provided type. This has the same effect as the
+// above except that it handles the fact that we can't actually create a member
+// of type T.
+template <typename T>
+struct AlignmentCalcImpl<T, true> : AlignmentCalcImplBase, T {
+ virtual ~AlignmentCalcImpl() = 0;
+};
+
+} // End detail namespace.
+
+/// AlignOf - A templated class that contains an enum value representing
+/// the alignment of the template argument. For example,
+/// AlignOf<int>::Alignment represents the alignment of type "int". The
+/// alignment calculated is the minimum alignment, and not necessarily
+/// the "desired" alignment returned by GCC's __alignof__ (for example). Note
+/// that because the alignment is an enum value, it can be used as a
+/// compile-time constant (e.g., for template instantiation).
+template <typename T>
+struct AlignOf {
+ // Avoid warnings from GCC like:
+ // comparison between 'enum llvm::AlignOf<X>::<anonymous>' and 'enum
+ // llvm::AlignOf<Y>::<anonymous>' [-Wenum-compare]
+ // by using constexpr instead of enum.
+ // (except on MSVC, since it doesn't support constexpr yet).
+ static constexpr unsigned Alignment = static_cast<unsigned int>(
+ sizeof(detail::AlignmentCalcImpl<T>) - sizeof(T));
+ enum { Alignment_GreaterEqual_2Bytes = Alignment >= 2 ? 1 : 0 };
+ enum { Alignment_GreaterEqual_4Bytes = Alignment >= 4 ? 1 : 0 };
+ enum { Alignment_GreaterEqual_8Bytes = Alignment >= 8 ? 1 : 0 };
+ enum { Alignment_GreaterEqual_16Bytes = Alignment >= 16 ? 1 : 0 };
+
+ enum { Alignment_LessEqual_2Bytes = Alignment <= 2 ? 1 : 0 };
+ enum { Alignment_LessEqual_4Bytes = Alignment <= 4 ? 1 : 0 };
+ enum { Alignment_LessEqual_8Bytes = Alignment <= 8 ? 1 : 0 };
+ enum { Alignment_LessEqual_16Bytes = Alignment <= 16 ? 1 : 0 };
+};
+
+template <typename T> constexpr unsigned AlignOf<T>::Alignment;
+
+/// alignOf - A templated function that returns the minimum alignment
+/// of a type. This provides no extra functionality beyond the AlignOf
+/// class besides some cosmetic cleanliness. Example usage:
+/// alignOf<int>() returns the alignment of an int.
+template <typename T>
+inline unsigned alignOf() { return AlignOf<T>::Alignment; }
+
+/// \struct AlignedCharArray
+/// \brief Helper for building an aligned character array type.
+///
+/// This template is used to explicitly build up a collection of aligned
+/// character array types. We have to build these up using a macro and explicit
+/// specialization to cope with old versions of MSVC and GCC where only an
+/// integer literal can be used to specify an alignment constraint. Once built
+/// up here, we can then begin to indirect between these using normal C++
+/// template parameters.
+
+#if __has_feature(cxx_alignas)
+template<std::size_t Alignment, std::size_t Size>
+struct AlignedCharArray {
+ alignas(Alignment) char buffer[Size];
+};
+
+#elif defined(__GNUC__) || defined(__IBM_ATTRIBUTES)
+/// \brief Create a type with an aligned char buffer.
+template<std::size_t Alignment, std::size_t Size>
+struct AlignedCharArray;
+
+#define LLVM_ALIGNEDCHARARRAY_TEMPLATE_ALIGNMENT(x) \
+ template<std::size_t Size> \
+ struct AlignedCharArray<x, Size> { \
+ __attribute__((aligned(x))) char buffer[Size]; \
+ };
+
+LLVM_ALIGNEDCHARARRAY_TEMPLATE_ALIGNMENT(1)
+LLVM_ALIGNEDCHARARRAY_TEMPLATE_ALIGNMENT(2)
+LLVM_ALIGNEDCHARARRAY_TEMPLATE_ALIGNMENT(4)
+LLVM_ALIGNEDCHARARRAY_TEMPLATE_ALIGNMENT(8)
+LLVM_ALIGNEDCHARARRAY_TEMPLATE_ALIGNMENT(16)
+LLVM_ALIGNEDCHARARRAY_TEMPLATE_ALIGNMENT(32)
+LLVM_ALIGNEDCHARARRAY_TEMPLATE_ALIGNMENT(64)
+LLVM_ALIGNEDCHARARRAY_TEMPLATE_ALIGNMENT(128)
+
+#undef LLVM_ALIGNEDCHARARRAY_TEMPLATE_ALIGNMENT
+
+#else
+# error No supported align as directive.
+#endif
+
+namespace detail {
+template <typename T1,
+ typename T2 = char, typename T3 = char, typename T4 = char,
+ typename T5 = char, typename T6 = char, typename T7 = char,
+ typename T8 = char, typename T9 = char, typename T10 = char>
+class AlignerImpl {
+ T1 t1; T2 t2; T3 t3; T4 t4; T5 t5; T6 t6; T7 t7; T8 t8; T9 t9; T10 t10;
+
+ AlignerImpl(); // Never defined or instantiated.
+};
+
+template <typename T1,
+ typename T2 = char, typename T3 = char, typename T4 = char,
+ typename T5 = char, typename T6 = char, typename T7 = char,
+ typename T8 = char, typename T9 = char, typename T10 = char>
+union SizerImpl {
+ char arr1[sizeof(T1)], arr2[sizeof(T2)], arr3[sizeof(T3)], arr4[sizeof(T4)],
+ arr5[sizeof(T5)], arr6[sizeof(T6)], arr7[sizeof(T7)], arr8[sizeof(T8)],
+ arr9[sizeof(T9)], arr10[sizeof(T10)];
+};
+} // end namespace detail
+
+/// \brief This union template exposes a suitably aligned and sized character
+/// array member which can hold elements of any of up to ten types.
+///
+/// These types may be arrays, structs, or any other types. The goal is to
+/// expose a char array buffer member which can be used as suitable storage for
+/// a placement new of any of these types. Support for more than ten types can
+/// be added at the cost of more boilerplate.
+template <typename T1,
+ typename T2 = char, typename T3 = char, typename T4 = char,
+ typename T5 = char, typename T6 = char, typename T7 = char,
+ typename T8 = char, typename T9 = char, typename T10 = char>
+struct AlignedCharArrayUnion : llvm::AlignedCharArray<
+ AlignOf<detail::AlignerImpl<T1, T2, T3, T4, T5,
+ T6, T7, T8, T9, T10> >::Alignment,
+ sizeof(detail::SizerImpl<T1, T2, T3, T4, T5,
+ T6, T7, T8, T9, T10>)> {
+};
+} // end namespace llvm
+#endif
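
A minimal sketch of the AlignOf.h helpers in use (the Packet struct, the main() driver and the placement-new call are illustrative assumptions):

    #include "llvm/Support/AlignOf.h"
    #include <cstdio>
    #include <new>

    struct Packet { char Tag; double Payload; };

    int main() {
      // Compile-time minimum alignment of Packet (typically 8 on x86-64).
      std::printf("alignof(Packet) = %u\n", llvm::alignOf<Packet>());

      // Storage aligned and sized to hold either an int or a double.
      llvm::AlignedCharArrayUnion<int, double> Storage;
      double *D = new (Storage.buffer) double(3.14);  // placement new into the buffer
      *D = 2.71;
      return 0;
    }
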
diff --git a/ext/include/llvm/Support/Allocator.h b/ext/include/llvm/Support/Allocator.h
new file mode 100644
index 0000000..043d823
--- /dev/null
+++ b/ext/include/llvm/Support/Allocator.h
@@ -0,0 +1,435 @@
+//===--- Allocator.h - Simple memory allocation abstraction -----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file defines the MallocAllocator and BumpPtrAllocator interfaces. Both
+/// of these conform to an LLVM "Allocator" concept which consists of an
+/// Allocate method accepting a size and alignment, and a Deallocate accepting
+/// a pointer and size. Further, the LLVM "Allocator" concept has overloads of
+/// Allocate and Deallocate for setting size and alignment based on the final
+/// type. These overloads are typically provided by a base class template \c
+/// AllocatorBase.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_SUPPORT_ALLOCATOR_H
+#define LLVM_SUPPORT_ALLOCATOR_H
+
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Support/AlignOf.h"
+#include "llvm/Support/DataTypes.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/Memory.h"
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdlib>
+
+namespace llvm {
+
+/// \brief CRTP base class providing obvious overloads for the core \c
+/// Allocate() methods of LLVM-style allocators.
+///
+/// This base class both documents the full public interface exposed by all
+/// LLVM-style allocators, and redirects all of the overloads to a single core
+/// set of methods which the derived class must define.
+template <typename DerivedT> class AllocatorBase {
+public:
+ /// \brief Allocate \a Size bytes of \a Alignment aligned memory. This method
+ /// must be implemented by \c DerivedT.
+ void *Allocate(size_t Size, size_t Alignment) {
+#ifdef __clang__
+ static_assert(static_cast<void *(AllocatorBase::*)(size_t, size_t)>(
+ &AllocatorBase::Allocate) !=
+ static_cast<void *(DerivedT::*)(size_t, size_t)>(
+ &DerivedT::Allocate),
+ "Class derives from AllocatorBase without implementing the "
+ "core Allocate(size_t, size_t) overload!");
+#endif
+ return static_cast<DerivedT *>(this)->Allocate(Size, Alignment);
+ }
+
+ /// \brief Deallocate \a Ptr to \a Size bytes of memory allocated by this
+ /// allocator.
+ void Deallocate(const void *Ptr, size_t Size) {
+#ifdef __clang__
+ static_assert(static_cast<void (AllocatorBase::*)(const void *, size_t)>(
+ &AllocatorBase::Deallocate) !=
+ static_cast<void (DerivedT::*)(const void *, size_t)>(
+ &DerivedT::Deallocate),
+ "Class derives from AllocatorBase without implementing the "
+ "core Deallocate(void *) overload!");
+#endif
+ return static_cast<DerivedT *>(this)->Deallocate(Ptr, Size);
+ }
+
+ // The rest of these methods are helpers that redirect to one of the above
+ // core methods.
+
+ /// \brief Allocate space for a sequence of objects without constructing them.
+ template <typename T> T *Allocate(size_t Num = 1) {
+ return static_cast<T *>(Allocate(Num * sizeof(T), AlignOf<T>::Alignment));
+ }
+
+ /// \brief Deallocate space for a sequence of objects without constructing them.
+ template <typename T>
+ typename std::enable_if<
+ !std::is_same<typename std::remove_cv<T>::type, void>::value, void>::type
+ Deallocate(T *Ptr, size_t Num = 1) {
+ Deallocate(static_cast<const void *>(Ptr), Num * sizeof(T));
+ }
+};
+
+class MallocAllocator : public AllocatorBase<MallocAllocator> {
+public:
+ void Reset() {}
+
+ LLVM_ATTRIBUTE_RETURNS_NONNULL void *Allocate(size_t Size,
+ size_t /*Alignment*/) {
+ return malloc(Size);
+ }
+
+ // Pull in base class overloads.
+ using AllocatorBase<MallocAllocator>::Allocate;
+
+ void Deallocate(const void *Ptr, size_t /*Size*/) {
+ free(const_cast<void *>(Ptr));
+ }
+
+ // Pull in base class overloads.
+ using AllocatorBase<MallocAllocator>::Deallocate;
+
+ void PrintStats() const {}
+};
+
+namespace detail {
+
+// We call out to an external function to actually print the message as the
+// printing code uses Allocator.h in its implementation.
+void printBumpPtrAllocatorStats(unsigned NumSlabs, size_t BytesAllocated,
+ size_t TotalMemory);
+} // End namespace detail.
+
+/// \brief Allocate memory in an ever growing pool, as if by bump-pointer.
+///
+/// This isn't strictly a bump-pointer allocator as it uses backing slabs of
+/// memory rather than relying on a boundless contiguous heap. However, it has
+/// bump-pointer semantics in that it is a monotonically growing pool of memory
+/// where every allocation is found by merely allocating the next N bytes in
+/// the slab, or the next N bytes in the next slab.
+///
+/// Note that this also has a threshold for forcing allocations above a certain
+/// size into their own slab.
+///
+/// The BumpPtrAllocatorImpl template defaults to using a MallocAllocator
+/// object, which wraps malloc, to allocate memory, but it can be changed to
+/// use a custom allocator.
+template <typename AllocatorT = MallocAllocator, size_t SlabSize = 4096,
+ size_t SizeThreshold = SlabSize>
+class BumpPtrAllocatorImpl
+ : public AllocatorBase<
+ BumpPtrAllocatorImpl<AllocatorT, SlabSize, SizeThreshold>> {
+public:
+ static_assert(SizeThreshold <= SlabSize,
+ "The SizeThreshold must be at most the SlabSize to ensure "
+ "that objects larger than a slab go into their own memory "
+ "allocation.");
+
+ BumpPtrAllocatorImpl()
+ : CurPtr(nullptr), End(nullptr), BytesAllocated(0), Allocator() {}
+ template <typename T>
+ BumpPtrAllocatorImpl(T &&Allocator)
+ : CurPtr(nullptr), End(nullptr), BytesAllocated(0),
+ Allocator(std::forward<T &&>(Allocator)) {}
+
+ // Manually implement a move constructor as we must clear the old allocator's
+ // slabs as a matter of correctness.
+ BumpPtrAllocatorImpl(BumpPtrAllocatorImpl &&Old)
+ : CurPtr(Old.CurPtr), End(Old.End), Slabs(std::move(Old.Slabs)),
+ CustomSizedSlabs(std::move(Old.CustomSizedSlabs)),
+ BytesAllocated(Old.BytesAllocated),
+ Allocator(std::move(Old.Allocator)) {
+ Old.CurPtr = Old.End = nullptr;
+ Old.BytesAllocated = 0;
+ Old.Slabs.clear();
+ Old.CustomSizedSlabs.clear();
+ }
+
+ ~BumpPtrAllocatorImpl() {
+ DeallocateSlabs(Slabs.begin(), Slabs.end());
+ DeallocateCustomSizedSlabs();
+ }
+
+ BumpPtrAllocatorImpl &operator=(BumpPtrAllocatorImpl &&RHS) {
+ DeallocateSlabs(Slabs.begin(), Slabs.end());
+ DeallocateCustomSizedSlabs();
+
+ CurPtr = RHS.CurPtr;
+ End = RHS.End;
+ BytesAllocated = RHS.BytesAllocated;
+ Slabs = std::move(RHS.Slabs);
+ CustomSizedSlabs = std::move(RHS.CustomSizedSlabs);
+ Allocator = std::move(RHS.Allocator);
+
+ RHS.CurPtr = RHS.End = nullptr;
+ RHS.BytesAllocated = 0;
+ RHS.Slabs.clear();
+ RHS.CustomSizedSlabs.clear();
+ return *this;
+ }
+
+ /// \brief Deallocate all but the current slab and reset the current pointer
+ /// to the beginning of it, freeing all memory allocated so far.
+ void Reset() {
+ // Deallocate all but the first slab, and deallocate all custom-sized slabs.
+ DeallocateCustomSizedSlabs();
+ CustomSizedSlabs.clear();
+
+ if (Slabs.empty())
+ return;
+
+ // Reset the state.
+ BytesAllocated = 0;
+ CurPtr = (char *)Slabs.front();
+ End = CurPtr + SlabSize;
+
+ __asan_poison_memory_region(*Slabs.begin(), computeSlabSize(0));
+ DeallocateSlabs(std::next(Slabs.begin()), Slabs.end());
+ Slabs.erase(std::next(Slabs.begin()), Slabs.end());
+ }
+
+ /// \brief Allocate space at the specified alignment.
+ LLVM_ATTRIBUTE_RETURNS_NONNULL LLVM_ATTRIBUTE_RETURNS_NOALIAS void *
+ Allocate(size_t Size, size_t Alignment) {
+ assert(Alignment > 0 && "0-byte alignment is not allowed. Use 1 instead.");
+
+ // Keep track of how many bytes we've allocated.
+ BytesAllocated += Size;
+
+ size_t Adjustment = alignmentAdjustment(CurPtr, Alignment);
+ assert(Adjustment + Size >= Size && "Adjustment + Size must not overflow");
+
+ // Check if we have enough space.
+ if (Adjustment + Size <= size_t(End - CurPtr)) {
+ char *AlignedPtr = CurPtr + Adjustment;
+ CurPtr = AlignedPtr + Size;
+ // Update the allocation point of this memory block in MemorySanitizer.
+ // Without this, MemorySanitizer messages for values originated from here
+ // will point to the allocation of the entire slab.
+ __msan_allocated_memory(AlignedPtr, Size);
+ // Similarly, tell ASan about this space.
+ __asan_unpoison_memory_region(AlignedPtr, Size);
+ return AlignedPtr;
+ }
+
+ // If Size is really big, allocate a separate slab for it.
+ size_t PaddedSize = Size + Alignment - 1;
+ if (PaddedSize > SizeThreshold) {
+ void *NewSlab = Allocator.Allocate(PaddedSize, 0);
+ // We own the new slab and don't want anyone reading anything other than
+ // pieces returned from this method. So poison the whole slab.
+ __asan_poison_memory_region(NewSlab, PaddedSize);
+ CustomSizedSlabs.push_back(std::make_pair(NewSlab, PaddedSize));
+
+ uintptr_t AlignedAddr = alignAddr(NewSlab, Alignment);
+ assert(AlignedAddr + Size <= (uintptr_t)NewSlab + PaddedSize);
+ char *AlignedPtr = (char*)AlignedAddr;
+ __msan_allocated_memory(AlignedPtr, Size);
+ __asan_unpoison_memory_region(AlignedPtr, Size);
+ return AlignedPtr;
+ }
+
+ // Otherwise, start a new slab and try again.
+ StartNewSlab();
+ uintptr_t AlignedAddr = alignAddr(CurPtr, Alignment);
+ assert(AlignedAddr + Size <= (uintptr_t)End &&
+ "Unable to allocate memory!");
+ char *AlignedPtr = (char*)AlignedAddr;
+ CurPtr = AlignedPtr + Size;
+ __msan_allocated_memory(AlignedPtr, Size);
+ __asan_unpoison_memory_region(AlignedPtr, Size);
+ return AlignedPtr;
+ }
+
+ // Pull in base class overloads.
+ using AllocatorBase<BumpPtrAllocatorImpl>::Allocate;
+
+ void Deallocate(const void *Ptr, size_t Size) {
+ __asan_poison_memory_region(Ptr, Size);
+ }
+
+ // Pull in base class overloads.
+ using AllocatorBase<BumpPtrAllocatorImpl>::Deallocate;
+
+ size_t GetNumSlabs() const { return Slabs.size() + CustomSizedSlabs.size(); }
+
+ size_t getTotalMemory() const {
+ size_t TotalMemory = 0;
+ for (auto I = Slabs.begin(), E = Slabs.end(); I != E; ++I)
+ TotalMemory += computeSlabSize(std::distance(Slabs.begin(), I));
+ for (auto &PtrAndSize : CustomSizedSlabs)
+ TotalMemory += PtrAndSize.second;
+ return TotalMemory;
+ }
+
+ void PrintStats() const {
+ detail::printBumpPtrAllocatorStats(Slabs.size(), BytesAllocated,
+ getTotalMemory());
+ }
+
+private:
+ /// \brief The current pointer into the current slab.
+ ///
+ /// This points to the next free byte in the slab.
+ char *CurPtr;
+
+ /// \brief The end of the current slab.
+ char *End;
+
+ /// \brief The slabs allocated so far.
+ SmallVector<void *, 4> Slabs;
+
+ /// \brief Custom-sized slabs allocated for too-large allocation requests.
+ SmallVector<std::pair<void *, size_t>, 0> CustomSizedSlabs;
+
+ /// \brief How many bytes we've allocated.
+ ///
+ /// Used so that we can compute how much space was wasted.
+ size_t BytesAllocated;
+
+ /// \brief The allocator instance we use to get slabs of memory.
+ AllocatorT Allocator;
+
+ static size_t computeSlabSize(unsigned SlabIdx) {
+ // Scale the actual allocated slab size based on the number of slabs
+ // allocated. Every 128 slabs allocated, we double the allocated size to
+ // reduce allocation frequency, but saturate at multiplying the slab size by
+ // 2^30.
+ return SlabSize * ((size_t)1 << std::min<size_t>(30, SlabIdx / 128));
+ }
+
+ /// \brief Allocate a new slab and move the bump pointers over into the new
+ /// slab, modifying CurPtr and End.
+ void StartNewSlab() {
+ size_t AllocatedSlabSize = computeSlabSize(Slabs.size());
+
+ void *NewSlab = Allocator.Allocate(AllocatedSlabSize, 0);
+ // We own the new slab and don't want anyone reading anything other than
+ // pieces returned from this method. So poison the whole slab.
+ __asan_poison_memory_region(NewSlab, AllocatedSlabSize);
+
+ Slabs.push_back(NewSlab);
+ CurPtr = (char *)(NewSlab);
+ End = ((char *)NewSlab) + AllocatedSlabSize;
+ }
+
+ /// \brief Deallocate a sequence of slabs.
+ void DeallocateSlabs(SmallVectorImpl<void *>::iterator I,
+ SmallVectorImpl<void *>::iterator E) {
+ for (; I != E; ++I) {
+ size_t AllocatedSlabSize =
+ computeSlabSize(std::distance(Slabs.begin(), I));
+ Allocator.Deallocate(*I, AllocatedSlabSize);
+ }
+ }
+
+ /// \brief Deallocate all memory for custom sized slabs.
+ void DeallocateCustomSizedSlabs() {
+ for (auto &PtrAndSize : CustomSizedSlabs) {
+ void *Ptr = PtrAndSize.first;
+ size_t Size = PtrAndSize.second;
+ Allocator.Deallocate(Ptr, Size);
+ }
+ }
+
+ template <typename T> friend class SpecificBumpPtrAllocator;
+};
+
+/// \brief The standard BumpPtrAllocator which just uses the default template
+/// parameters.
+typedef BumpPtrAllocatorImpl<> BumpPtrAllocator;
+
+/// \brief A BumpPtrAllocator that allows only elements of a specific type to be
+/// allocated.
+///
+/// This allows calling the destructor in DestroyAll() and when the allocator is
+/// destroyed.
+template <typename T> class SpecificBumpPtrAllocator {
+ BumpPtrAllocator Allocator;
+
+public:
+ SpecificBumpPtrAllocator() : Allocator() {}
+ SpecificBumpPtrAllocator(SpecificBumpPtrAllocator &&Old)
+ : Allocator(std::move(Old.Allocator)) {}
+ ~SpecificBumpPtrAllocator() { DestroyAll(); }
+
+ SpecificBumpPtrAllocator &operator=(SpecificBumpPtrAllocator &&RHS) {
+ Allocator = std::move(RHS.Allocator);
+ return *this;
+ }
+
+ /// Call the destructor of each allocated object and deallocate all but the
+ /// current slab and reset the current pointer to the beginning of it, freeing
+ /// all memory allocated so far.
+ void DestroyAll() {
+ auto DestroyElements = [](char *Begin, char *End) {
+ assert(Begin == (char*)alignAddr(Begin, alignOf<T>()));
+ for (char *Ptr = Begin; Ptr + sizeof(T) <= End; Ptr += sizeof(T))
+ reinterpret_cast<T *>(Ptr)->~T();
+ };
+
+ for (auto I = Allocator.Slabs.begin(), E = Allocator.Slabs.end(); I != E;
+ ++I) {
+ size_t AllocatedSlabSize = BumpPtrAllocator::computeSlabSize(
+ std::distance(Allocator.Slabs.begin(), I));
+ char *Begin = (char*)alignAddr(*I, alignOf<T>());
+ char *End = *I == Allocator.Slabs.back() ? Allocator.CurPtr
+ : (char *)*I + AllocatedSlabSize;
+
+ DestroyElements(Begin, End);
+ }
+
+ for (auto &PtrAndSize : Allocator.CustomSizedSlabs) {
+ void *Ptr = PtrAndSize.first;
+ size_t Size = PtrAndSize.second;
+ DestroyElements((char*)alignAddr(Ptr, alignOf<T>()), (char *)Ptr + Size);
+ }
+
+ Allocator.Reset();
+ }
+
+ /// \brief Allocate space for an array of objects without constructing them.
+ T *Allocate(size_t num = 1) { return Allocator.Allocate<T>(num); }
+};
+
+} // end namespace llvm
+
+template <typename AllocatorT, size_t SlabSize, size_t SizeThreshold>
+void *operator new(size_t Size,
+ llvm::BumpPtrAllocatorImpl<AllocatorT, SlabSize,
+ SizeThreshold> &Allocator) {
+ struct S {
+ char c;
+ union {
+ double D;
+ long double LD;
+ long long L;
+ void *P;
+ } x;
+ };
+ return Allocator.Allocate(
+ Size, std::min((size_t)llvm::NextPowerOf2(Size), offsetof(S, x)));
+}
+
+template <typename AllocatorT, size_t SlabSize, size_t SizeThreshold>
+void operator delete(
+ void *, llvm::BumpPtrAllocatorImpl<AllocatorT, SlabSize, SizeThreshold> &) {
+}
+
+#endif // LLVM_SUPPORT_ALLOCATOR_H
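
A minimal usage sketch for the bump-pointer allocators above (assumes the header's own dependencies such as SmallVector.h and MathExtras.h build within this ext/ tree; the variable names and main() driver are illustrative):

    #include "llvm/Support/Allocator.h"
    #include <new>
    #include <string>

    int main() {
      llvm::BumpPtrAllocator Alloc;
      // Raw, uninitialized storage for 16 ints, carved from the current slab.
      int *Ints = Alloc.Allocate<int>(16);
      for (int I = 0; I < 16; ++I)
        Ints[I] = I;

      // Type-aware variant: ~std::string runs in DestroyAll() / on destruction.
      llvm::SpecificBumpPtrAllocator<std::string> StrAlloc;
      std::string *S = new (StrAlloc.Allocate()) std::string("spades");
      (void)S;

      Alloc.Reset();  // keeps only the first slab; the memory behind Ints may be reused
      return 0;
    }
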
diff --git a/ext/include/llvm/Support/Atomic.h b/ext/include/llvm/Support/Atomic.h
new file mode 100644
index 0000000..1ac7dba
--- /dev/null
+++ b/ext/include/llvm/Support/Atomic.h
@@ -0,0 +1,35 @@
+//===- llvm/Support/Atomic.h - Atomic Operations -----------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the llvm::sys atomic operations.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_SUPPORT_ATOMIC_H
+#define LLVM_SUPPORT_ATOMIC_H
+
+#include "llvm/Support/DataTypes.h"
+
+namespace llvm {
+ namespace sys {
+ void MemoryFence();
+
+ typedef uint32_t cas_flag;
+ cas_flag CompareAndSwap(volatile cas_flag* ptr,
+ cas_flag new_value,
+ cas_flag old_value);
+ cas_flag AtomicIncrement(volatile cas_flag* ptr);
+ cas_flag AtomicDecrement(volatile cas_flag* ptr);
+ cas_flag AtomicAdd(volatile cas_flag* ptr, cas_flag val);
+ cas_flag AtomicMul(volatile cas_flag* ptr, cas_flag val);
+ cas_flag AtomicDiv(volatile cas_flag* ptr, cas_flag val);
+ }
+}
+
+#endif
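
A small sketch of the declared operations (the counter and recordEvent() are illustrative; the definitions of these functions live in a separate source file that is not part of this hunk):

    #include "llvm/Support/Atomic.h"

    static volatile llvm::sys::cas_flag EventCount = 0;

    void recordEvent() {
      // Safe to call concurrently; atomically bumps the shared 32-bit counter.
      llvm::sys::AtomicIncrement(&EventCount);
    }
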
diff --git a/ext/include/llvm/Support/Casting.h b/ext/include/llvm/Support/Casting.h
new file mode 100644
index 0000000..6ba5efa
--- /dev/null
+++ b/ext/include/llvm/Support/Casting.h
@@ -0,0 +1,326 @@
+//===-- llvm/Support/Casting.h - Allow flexible, checked, casts -*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the isa<X>(), cast<X>(), dyn_cast<X>(), cast_or_null<X>(),
+// and dyn_cast_or_null<X>() templates.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_SUPPORT_CASTING_H
+#define LLVM_SUPPORT_CASTING_H
+
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/type_traits.h"
+#include <cassert>
+
+namespace llvm {
+
+//===----------------------------------------------------------------------===//
+// isa<x> Support Templates
+//===----------------------------------------------------------------------===//
+
+// Define a template that can be specialized by smart pointers to reflect the
+// fact that they are automatically dereferenced, and are not involved with the
+// template selection process... the default implementation is a noop.
+//
+template<typename From> struct simplify_type {
+ typedef From SimpleType; // The real type this represents...
+
+ // An accessor to get the real value...
+ static SimpleType &getSimplifiedValue(From &Val) { return Val; }
+};
+
+template<typename From> struct simplify_type<const From> {
+ typedef typename simplify_type<From>::SimpleType NonConstSimpleType;
+ typedef typename add_const_past_pointer<NonConstSimpleType>::type
+ SimpleType;
+ typedef typename add_lvalue_reference_if_not_pointer<SimpleType>::type
+ RetType;
+ static RetType getSimplifiedValue(const From& Val) {
+ return simplify_type<From>::getSimplifiedValue(const_cast<From&>(Val));
+ }
+};
+
+// The core of the implementation of isa<X> is here; To and From should be
+// the names of classes. This template can be specialized to customize the
+// implementation of isa<> without rewriting it from scratch.
+template <typename To, typename From, typename Enabler = void>
+struct isa_impl {
+ static inline bool doit(const From &Val) {
+ return To::classof(&Val);
+ }
+};
+
+/// \brief Always allow upcasts, and perform no dynamic check for them.
+template <typename To, typename From>
+struct isa_impl<
+ To, From, typename std::enable_if<std::is_base_of<To, From>::value>::type> {
+ static inline bool doit(const From &) { return true; }
+};
+
+template <typename To, typename From> struct isa_impl_cl {
+ static inline bool doit(const From &Val) {
+ return isa_impl<To, From>::doit(Val);
+ }
+};
+
+template <typename To, typename From> struct isa_impl_cl<To, const From> {
+ static inline bool doit(const From &Val) {
+ return isa_impl<To, From>::doit(Val);
+ }
+};
+
+template <typename To, typename From> struct isa_impl_cl<To, From*> {
+ static inline bool doit(const From *Val) {
+ assert(Val && "isa<> used on a null pointer");
+ return isa_impl<To, From>::doit(*Val);
+ }
+};
+
+template <typename To, typename From> struct isa_impl_cl<To, From*const> {
+ static inline bool doit(const From *Val) {
+ assert(Val && "isa<> used on a null pointer");
+ return isa_impl<To, From>::doit(*Val);
+ }
+};
+
+template <typename To, typename From> struct isa_impl_cl<To, const From*> {
+ static inline bool doit(const From *Val) {
+ assert(Val && "isa<> used on a null pointer");
+ return isa_impl<To, From>::doit(*Val);
+ }
+};
+
+template <typename To, typename From> struct isa_impl_cl<To, const From*const> {
+ static inline bool doit(const From *Val) {
+ assert(Val && "isa<> used on a null pointer");
+ return isa_impl<To, From>::doit(*Val);
+ }
+};
+
+template<typename To, typename From, typename SimpleFrom>
+struct isa_impl_wrap {
+ // When From != SimplifiedType, we can simplify the type some more by using
+ // the simplify_type template.
+ static bool doit(const From &Val) {
+ return isa_impl_wrap<To, SimpleFrom,
+ typename simplify_type<SimpleFrom>::SimpleType>::doit(
+ simplify_type<const From>::getSimplifiedValue(Val));
+ }
+};
+
+template<typename To, typename FromTy>
+struct isa_impl_wrap<To, FromTy, FromTy> {
+ // When From == SimpleType, we are as simple as we are going to get.
+ static bool doit(const FromTy &Val) {
+ return isa_impl_cl<To,FromTy>::doit(Val);
+ }
+};
+
+// isa<X> - Return true if the parameter to the template is an instance of the
+// template type argument. Used like this:
+//
+// if (isa<Type>(myVal)) { ... }
+//
+template <class X, class Y>
+LLVM_ATTRIBUTE_UNUSED_RESULT inline bool isa(const Y &Val) {
+ return isa_impl_wrap<X, const Y,
+ typename simplify_type<const Y>::SimpleType>::doit(Val);
+}
+
+//===----------------------------------------------------------------------===//
+// cast<x> Support Templates
+//===----------------------------------------------------------------------===//
+
+template<class To, class From> struct cast_retty;
+
+
+// Calculate what type the 'cast' function should return, based on a requested
+// type of To and a source type of From.
+template<class To, class From> struct cast_retty_impl {
+ typedef To& ret_type; // Normal case, return Ty&
+};
+template<class To, class From> struct cast_retty_impl<To, const From> {
+ typedef const To &ret_type; // Normal case, return Ty&
+};
+
+template<class To, class From> struct cast_retty_impl<To, From*> {
+ typedef To* ret_type; // Pointer arg case, return Ty*
+};
+
+template<class To, class From> struct cast_retty_impl<To, const From*> {
+ typedef const To* ret_type; // Constant pointer arg case, return const Ty*
+};
+
+template<class To, class From> struct cast_retty_impl<To, const From*const> {
+ typedef const To* ret_type; // Constant pointer arg case, return const Ty*
+};
+
+
+template<class To, class From, class SimpleFrom>
+struct cast_retty_wrap {
+ // When the simplified type and the from type are not the same, use the type
+ // simplifier to reduce the type, then reuse cast_retty_impl to get the
+ // resultant type.
+ typedef typename cast_retty<To, SimpleFrom>::ret_type ret_type;
+};
+
+template<class To, class FromTy>
+struct cast_retty_wrap<To, FromTy, FromTy> {
+ // When the simplified type is equal to the from type, use it directly.
+ typedef typename cast_retty_impl<To,FromTy>::ret_type ret_type;
+};
+
+template<class To, class From>
+struct cast_retty {
+ typedef typename cast_retty_wrap<To, From,
+ typename simplify_type<From>::SimpleType>::ret_type ret_type;
+};
+
+// Ensure the non-simple values are converted using the simplify_type template
+// that may be specialized by smart pointers...
+//
+template<class To, class From, class SimpleFrom> struct cast_convert_val {
+ // This is not a simple type, use the template to simplify it...
+ static typename cast_retty<To, From>::ret_type doit(From &Val) {
+ return cast_convert_val<To, SimpleFrom,
+ typename simplify_type<SimpleFrom>::SimpleType>::doit(
+ simplify_type<From>::getSimplifiedValue(Val));
+ }
+};
+
+template<class To, class FromTy> struct cast_convert_val<To,FromTy,FromTy> {
+ // This _is_ a simple type, just cast it.
+ static typename cast_retty<To, FromTy>::ret_type doit(const FromTy &Val) {
+ typename cast_retty<To, FromTy>::ret_type Res2
+ = (typename cast_retty<To, FromTy>::ret_type)const_cast<FromTy&>(Val);
+ return Res2;
+ }
+};
+
+template <class X> struct is_simple_type {
+ static const bool value =
+ std::is_same<X, typename simplify_type<X>::SimpleType>::value;
+};
+
+// cast<X> - Return the argument parameter cast to the specified type. This
+// casting operator asserts that the type is correct, so it does not return null
+// on failure. It does not allow a null argument (use cast_or_null for that).
+// It is typically used like this:
+//
+// cast<Instruction>(myVal)->getParent()
+//
+template <class X, class Y>
+inline typename std::enable_if<!is_simple_type<Y>::value,
+ typename cast_retty<X, const Y>::ret_type>::type
+cast(const Y &Val) {
+ assert(isa<X>(Val) && "cast<Ty>() argument of incompatible type!");
+ return cast_convert_val<
+ X, const Y, typename simplify_type<const Y>::SimpleType>::doit(Val);
+}
+
+template <class X, class Y>
+inline typename cast_retty<X, Y>::ret_type cast(Y &Val) {
+ assert(isa<X>(Val) && "cast<Ty>() argument of incompatible type!");
+ return cast_convert_val<X, Y,
+ typename simplify_type<Y>::SimpleType>::doit(Val);
+}
+
+template <class X, class Y>
+inline typename cast_retty<X, Y *>::ret_type cast(Y *Val) {
+ assert(isa<X>(Val) && "cast<Ty>() argument of incompatible type!");
+ return cast_convert_val<X, Y*,
+ typename simplify_type<Y*>::SimpleType>::doit(Val);
+}
+
+// cast_or_null<X> - Functionally identical to cast, except that a null value is
+// accepted.
+//
+template <class X, class Y>
+LLVM_ATTRIBUTE_UNUSED_RESULT inline typename std::enable_if<
+ !is_simple_type<Y>::value, typename cast_retty<X, const Y>::ret_type>::type
+cast_or_null(const Y &Val) {
+ if (!Val)
+ return nullptr;
+ assert(isa<X>(Val) && "cast_or_null<Ty>() argument of incompatible type!");
+ return cast<X>(Val);
+}
+
+template <class X, class Y>
+LLVM_ATTRIBUTE_UNUSED_RESULT inline typename std::enable_if<
+ !is_simple_type<Y>::value, typename cast_retty<X, Y>::ret_type>::type
+cast_or_null(Y &Val) {
+ if (!Val)
+ return nullptr;
+ assert(isa<X>(Val) && "cast_or_null<Ty>() argument of incompatible type!");
+ return cast<X>(Val);
+}
+
+template <class X, class Y>
+LLVM_ATTRIBUTE_UNUSED_RESULT inline typename cast_retty<X, Y *>::ret_type
+cast_or_null(Y *Val) {
+ if (!Val) return nullptr;
+ assert(isa<X>(Val) && "cast_or_null<Ty>() argument of incompatible type!");
+ return cast<X>(Val);
+}
+
+
+// dyn_cast<X> - Return the argument parameter cast to the specified type. This
+// casting operator returns null if the argument is of the wrong type, so it can
+// be used to test for a type as well as cast if successful. This should be
+// used in the context of an if statement like this:
+//
+// if (const Instruction *I = dyn_cast<Instruction>(myVal)) { ... }
+//
+
+template <class X, class Y>
+LLVM_ATTRIBUTE_UNUSED_RESULT inline typename std::enable_if<
+ !is_simple_type<Y>::value, typename cast_retty<X, const Y>::ret_type>::type
+dyn_cast(const Y &Val) {
+ return isa<X>(Val) ? cast<X>(Val) : nullptr;
+}
+
+template <class X, class Y>
+LLVM_ATTRIBUTE_UNUSED_RESULT inline typename cast_retty<X, Y>::ret_type
+dyn_cast(Y &Val) {
+ return isa<X>(Val) ? cast<X>(Val) : nullptr;
+}
+
+template <class X, class Y>
+LLVM_ATTRIBUTE_UNUSED_RESULT inline typename cast_retty<X, Y *>::ret_type
+dyn_cast(Y *Val) {
+ return isa<X>(Val) ? cast<X>(Val) : nullptr;
+}
+
+// dyn_cast_or_null<X> - Functionally identical to dyn_cast, except that a null
+// value is accepted.
+//
+template <class X, class Y>
+LLVM_ATTRIBUTE_UNUSED_RESULT inline typename std::enable_if<
+ !is_simple_type<Y>::value, typename cast_retty<X, const Y>::ret_type>::type
+dyn_cast_or_null(const Y &Val) {
+ return (Val && isa<X>(Val)) ? cast<X>(Val) : nullptr;
+}
+
+template <class X, class Y>
+LLVM_ATTRIBUTE_UNUSED_RESULT inline typename std::enable_if<
+ !is_simple_type<Y>::value, typename cast_retty<X, Y>::ret_type>::type
+dyn_cast_or_null(Y &Val) {
+ return (Val && isa<X>(Val)) ? cast<X>(Val) : nullptr;
+}
+
+template <class X, class Y>
+LLVM_ATTRIBUTE_UNUSED_RESULT inline typename cast_retty<X, Y *>::ret_type
+dyn_cast_or_null(Y *Val) {
+ return (Val && isa<X>(Val)) ? cast<X>(Val) : nullptr;
+}
+
+} // End llvm namespace
+
+#endif
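
A minimal sketch of wiring a class hierarchy into isa<>/cast<>/dyn_cast<> via classof(), as the templates above expect (the Shape/Circle types and radiusOrZero() are illustrative assumptions):

    #include "llvm/Support/Casting.h"

    struct Shape {
      enum Kind { K_Circle, K_Square };
      Kind TheKind;
      Shape(Kind K) : TheKind(K) {}
    };

    struct Circle : Shape {
      double Radius;
      Circle(double R) : Shape(K_Circle), Radius(R) {}
      // Hook consulted by the default isa_impl<Circle, Shape>::doit().
      static bool classof(const Shape *S) { return S->TheKind == K_Circle; }
    };

    double radiusOrZero(Shape *S) {
      // dyn_cast<> returns nullptr instead of asserting when the kind is wrong.
      if (Circle *C = llvm::dyn_cast<Circle>(S))
        return C->Radius;
      return 0.0;
    }
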
diff --git a/ext/include/llvm/Support/Compiler.h b/ext/include/llvm/Support/Compiler.h
new file mode 100644
index 0000000..128e680
--- /dev/null
+++ b/ext/include/llvm/Support/Compiler.h
@@ -0,0 +1,435 @@
+//===-- llvm/Support/Compiler.h - Compiler abstraction support --*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines several macros, based on the current compiler. This allows
+// use of compiler-specific features in a way that remains portable.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_SUPPORT_COMPILER_H
+#define LLVM_SUPPORT_COMPILER_H
+
+#ifndef __has_feature
+# define __has_feature(x) 0
+#endif
+
+#ifndef __has_extension
+# define __has_extension(x) 0
+#endif
+
+#ifndef __has_attribute
+# define __has_attribute(x) 0
+#endif
+
+#ifndef __has_builtin
+# define __has_builtin(x) 0
+#endif
+
+/// \macro LLVM_GNUC_PREREQ
+/// \brief Extend the default __GNUC_PREREQ even if glibc's features.h isn't
+/// available.
+#ifndef LLVM_GNUC_PREREQ
+# if defined(__GNUC__) && defined(__GNUC_MINOR__) && defined(__GNUC_PATCHLEVEL__)
+# define LLVM_GNUC_PREREQ(maj, min, patch) \
+ ((__GNUC__ << 20) + (__GNUC_MINOR__ << 10) + __GNUC_PATCHLEVEL__ >= \
+ ((maj) << 20) + ((min) << 10) + (patch))
+# elif defined(__GNUC__) && defined(__GNUC_MINOR__)
+# define LLVM_GNUC_PREREQ(maj, min, patch) \
+ ((__GNUC__ << 20) + (__GNUC_MINOR__ << 10) >= ((maj) << 20) + ((min) << 10))
+# else
+# define LLVM_GNUC_PREREQ(maj, min, patch) 0
+# endif
+#endif
+
+/// \macro LLVM_MSC_PREREQ
+/// \brief Is the compiler MSVC of at least the specified version?
+/// The common \param version values to check for are:
+/// * 1800: Microsoft Visual Studio 2013 / 12.0
+/// * 1900: Microsoft Visual Studio 2015 / 14.0
+#ifdef _MSC_VER
+#define LLVM_MSC_PREREQ(version) (_MSC_VER >= (version))
+
+// We require at least MSVC 2013.
+#if !LLVM_MSC_PREREQ(1800)
+#error LLVM requires at least MSVC 2013.
+#endif
+
+#else
+#define LLVM_MSC_PREREQ(version) 0
+#endif
+
+#if !defined(_MSC_VER) || defined(__clang__) || LLVM_MSC_PREREQ(1900)
+#define LLVM_NOEXCEPT noexcept
+#else
+#define LLVM_NOEXCEPT throw()
+#endif
+
+/// \brief Does the compiler support ref-qualifiers for *this?
+///
+/// Sadly, this is separate from just rvalue reference support because GCC
+/// and MSVC implemented this later than everything else.
+#if __has_feature(cxx_rvalue_references) || LLVM_GNUC_PREREQ(4, 8, 1)
+#define LLVM_HAS_RVALUE_REFERENCE_THIS 1
+#else
+#define LLVM_HAS_RVALUE_REFERENCE_THIS 0
+#endif
+
+/// Expands to '&' if ref-qualifiers for *this are supported.
+///
+/// This can be used to provide lvalue/rvalue overrides of member functions.
+/// The rvalue override should be guarded by LLVM_HAS_RVALUE_REFERENCE_THIS
+#if LLVM_HAS_RVALUE_REFERENCE_THIS
+#define LLVM_LVALUE_FUNCTION &
+#else
+#define LLVM_LVALUE_FUNCTION
+#endif
+
+#if __has_feature(cxx_constexpr) || defined(__GXX_EXPERIMENTAL_CXX0X__)
+# define LLVM_CONSTEXPR constexpr
+#else
+# define LLVM_CONSTEXPR
+#endif
+
+/// LLVM_LIBRARY_VISIBILITY - If a class marked with this attribute is linked
+/// into a shared library, then the class should be private to the library and
+/// not accessible from outside it. Can also be used to mark variables and
+/// functions, making them private to any shared library they are linked into.
+/// On PE/COFF targets, library visibility is the default, so this isn't needed.
+#if __has_attribute(visibility) || LLVM_GNUC_PREREQ(4, 0, 0)
+#define LLVM_LIBRARY_VISIBILITY __attribute__ ((visibility("hidden")))
+#else
+#define LLVM_LIBRARY_VISIBILITY
+#endif
+
+#if __has_attribute(sentinel) || LLVM_GNUC_PREREQ(3, 0, 0)
+#define LLVM_END_WITH_NULL __attribute__((sentinel))
+#else
+#define LLVM_END_WITH_NULL
+#endif
+
+#if __has_attribute(used) || LLVM_GNUC_PREREQ(3, 1, 0)
+#define LLVM_ATTRIBUTE_USED __attribute__((__used__))
+#else
+#define LLVM_ATTRIBUTE_USED
+#endif
+
+#if __has_attribute(warn_unused_result) || LLVM_GNUC_PREREQ(3, 4, 0)
+#define LLVM_ATTRIBUTE_UNUSED_RESULT __attribute__((__warn_unused_result__))
+#else
+#define LLVM_ATTRIBUTE_UNUSED_RESULT
+#endif
+
+// Some compilers warn about unused functions. When a function is sometimes
+// used or not depending on build settings (e.g. a function only called from
+// within "assert"), this attribute can be used to suppress such warnings.
+//
+// However, it shouldn't be used for unused *variables*, as those have a much
+// more portable solution:
+// (void)unused_var_name;
+// Prefer cast-to-void wherever it is sufficient.
+#if __has_attribute(unused) || LLVM_GNUC_PREREQ(3, 1, 0)
+#define LLVM_ATTRIBUTE_UNUSED __attribute__((__unused__))
+#else
+#define LLVM_ATTRIBUTE_UNUSED
+#endif
+
+// FIXME: Provide this for PE/COFF targets.
+#if __has_attribute(weak) || LLVM_GNUC_PREREQ(4, 0, 0)
+#define LLVM_ATTRIBUTE_WEAK __attribute__((__weak__))
+#else
+#define LLVM_ATTRIBUTE_WEAK
+#endif
+
+// Prior to clang 3.2, clang did not accept any spelling of
+// __has_attribute(const), so assume it is supported.
+#if defined(__clang__) || defined(__GNUC__)
+// aka 'CONST' but following LLVM Conventions.
+#define LLVM_READNONE __attribute__((__const__))
+#else
+#define LLVM_READNONE
+#endif
+
+#if __has_attribute(pure) || defined(__GNUC__)
+// aka 'PURE' but following LLVM Conventions.
+#define LLVM_READONLY __attribute__((__pure__))
+#else
+#define LLVM_READONLY
+#endif
+
+#if __has_builtin(__builtin_expect) || LLVM_GNUC_PREREQ(4, 0, 0)
+#define LLVM_LIKELY(EXPR) __builtin_expect((bool)(EXPR), true)
+#define LLVM_UNLIKELY(EXPR) __builtin_expect((bool)(EXPR), false)
+#else
+#define LLVM_LIKELY(EXPR) (EXPR)
+#define LLVM_UNLIKELY(EXPR) (EXPR)
+#endif
+
+/// LLVM_ATTRIBUTE_NOINLINE - On compilers where we have a directive to do so,
+/// mark a method "not for inlining".
+#if __has_attribute(noinline) || LLVM_GNUC_PREREQ(3, 4, 0)
+#define LLVM_ATTRIBUTE_NOINLINE __attribute__((noinline))
+#elif defined(_MSC_VER)
+#define LLVM_ATTRIBUTE_NOINLINE __declspec(noinline)
+#else
+#define LLVM_ATTRIBUTE_NOINLINE
+#endif
+
+/// LLVM_ATTRIBUTE_ALWAYS_INLINE - On compilers where we have a directive to do
+/// so, mark a method "always inline" because it is performance sensitive. GCC
+/// 3.4 supported this but is buggy in various cases and produces unimplemented
+/// errors, so only use it with GCC 4.0 and later.
+#if __has_attribute(always_inline) || LLVM_GNUC_PREREQ(4, 0, 0)
+#define LLVM_ATTRIBUTE_ALWAYS_INLINE __attribute__((always_inline))
+#elif defined(_MSC_VER)
+#define LLVM_ATTRIBUTE_ALWAYS_INLINE __forceinline
+#else
+#define LLVM_ATTRIBUTE_ALWAYS_INLINE
+#endif
+
+#ifdef __GNUC__
+#define LLVM_ATTRIBUTE_NORETURN __attribute__((noreturn))
+#elif defined(_MSC_VER)
+#define LLVM_ATTRIBUTE_NORETURN __declspec(noreturn)
+#else
+#define LLVM_ATTRIBUTE_NORETURN
+#endif
+
+#if __has_attribute(returns_nonnull) || LLVM_GNUC_PREREQ(4, 9, 0)
+#define LLVM_ATTRIBUTE_RETURNS_NONNULL __attribute__((returns_nonnull))
+#else
+#define LLVM_ATTRIBUTE_RETURNS_NONNULL
+#endif
+
+/// \macro LLVM_ATTRIBUTE_RETURNS_NOALIAS Used to mark a function as returning a
+/// pointer that does not alias any other valid pointer.
+#ifdef __GNUC__
+#define LLVM_ATTRIBUTE_RETURNS_NOALIAS __attribute__((__malloc__))
+#elif defined(_MSC_VER)
+#define LLVM_ATTRIBUTE_RETURNS_NOALIAS __declspec(restrict)
+#else
+#define LLVM_ATTRIBUTE_RETURNS_NOALIAS
+#endif
+
+/// LLVM_EXTENSION - Support compilers where we have a keyword to suppress
+/// pedantic diagnostics.
+#ifdef __GNUC__
+#define LLVM_EXTENSION __extension__
+#else
+#define LLVM_EXTENSION
+#endif
+
+// LLVM_ATTRIBUTE_DEPRECATED(decl, "message")
+#if __has_feature(attribute_deprecated_with_message)
+# define LLVM_ATTRIBUTE_DEPRECATED(decl, message) \
+ decl __attribute__((deprecated(message)))
+#elif defined(__GNUC__)
+# define LLVM_ATTRIBUTE_DEPRECATED(decl, message) \
+ decl __attribute__((deprecated))
+#elif defined(_MSC_VER)
+# define LLVM_ATTRIBUTE_DEPRECATED(decl, message) \
+ __declspec(deprecated(message)) decl
+#else
+# define LLVM_ATTRIBUTE_DEPRECATED(decl, message) \
+ decl
+#endif
+
+/// LLVM_BUILTIN_UNREACHABLE - On compilers which support it, expands
+/// to an expression which states that it is undefined behavior for the
+/// compiler to reach this point. Otherwise is not defined.
+#if __has_builtin(__builtin_unreachable) || LLVM_GNUC_PREREQ(4, 5, 0)
+# define LLVM_BUILTIN_UNREACHABLE __builtin_unreachable()
+#elif defined(_MSC_VER)
+# define LLVM_BUILTIN_UNREACHABLE __assume(false)
+#endif
+
+/// LLVM_BUILTIN_TRAP - On compilers which support it, expands to an expression
+/// which causes the program to exit abnormally.
+#if __has_builtin(__builtin_trap) || LLVM_GNUC_PREREQ(4, 3, 0)
+# define LLVM_BUILTIN_TRAP __builtin_trap()
+#elif defined(_MSC_VER)
+// The __debugbreak intrinsic is supported by MSVC, does not require forward
+// declarations involving platform-specific typedefs (unlike RaiseException),
+// results in a call to vectored exception handlers, and encodes to a short
+// instruction that still causes the trapping behavior we want.
+# define LLVM_BUILTIN_TRAP __debugbreak()
+#else
+# define LLVM_BUILTIN_TRAP *(volatile int*)0x11 = 0
+#endif
+
+/// \macro LLVM_ASSUME_ALIGNED
+/// \brief Returns a pointer with an assumed alignment.
+#if __has_builtin(__builtin_assume_aligned) || LLVM_GNUC_PREREQ(4, 7, 0)
+# define LLVM_ASSUME_ALIGNED(p, a) __builtin_assume_aligned(p, a)
+#elif defined(LLVM_BUILTIN_UNREACHABLE)
+// As of today, clang does not support __builtin_assume_aligned.
+# define LLVM_ASSUME_ALIGNED(p, a) \
+ (((uintptr_t(p) % (a)) == 0) ? (p) : (LLVM_BUILTIN_UNREACHABLE, (p)))
+#else
+# define LLVM_ASSUME_ALIGNED(p, a) (p)
+#endif
+
+/// \macro LLVM_ALIGNAS
+/// \brief Used to specify a minimum alignment for a structure or variable. The
+/// alignment must be a constant integer. Use LLVM_PTR_SIZE to compute
+/// alignments in terms of the size of a pointer.
+///
+/// Note that __declspec(align) has special quirks, it's not legal to pass a
+/// structure with __declspec(align) as a formal parameter.
+#ifdef _MSC_VER
+# define LLVM_ALIGNAS(x) __declspec(align(x))
+#elif __GNUC__ && !__has_feature(cxx_alignas) && !LLVM_GNUC_PREREQ(4, 8, 0)
+# define LLVM_ALIGNAS(x) __attribute__((aligned(x)))
+#else
+# define LLVM_ALIGNAS(x) alignas(x)
+#endif
+
+/// \macro LLVM_PACKED
+/// \brief Used to specify a packed structure.
+/// LLVM_PACKED(
+/// struct A {
+/// int i;
+/// int j;
+/// int k;
+/// long long l;
+/// });
+///
+/// LLVM_PACKED_START
+/// struct B {
+/// int i;
+/// int j;
+/// int k;
+/// long long l;
+/// };
+/// LLVM_PACKED_END
+#ifdef _MSC_VER
+# define LLVM_PACKED(d) __pragma(pack(push, 1)) d __pragma(pack(pop))
+# define LLVM_PACKED_START __pragma(pack(push, 1))
+# define LLVM_PACKED_END __pragma(pack(pop))
+#else
+# define LLVM_PACKED(d) d __attribute__((packed))
+# define LLVM_PACKED_START _Pragma("pack(push, 1)")
+# define LLVM_PACKED_END _Pragma("pack(pop)")
+#endif
+
+/// \macro LLVM_PTR_SIZE
+/// \brief A constant integer equivalent to the value of sizeof(void*).
+/// Generally used in combination with LLVM_ALIGNAS or when doing computation in
+/// the preprocessor.
+#ifdef __SIZEOF_POINTER__
+# define LLVM_PTR_SIZE __SIZEOF_POINTER__
+#elif defined(_WIN64)
+# define LLVM_PTR_SIZE 8
+#elif defined(_WIN32)
+# define LLVM_PTR_SIZE 4
+#elif defined(_MSC_VER)
+# error "could not determine LLVM_PTR_SIZE as a constant int for MSVC"
+#else
+# define LLVM_PTR_SIZE sizeof(void *)
+#endif
+
+/// \macro LLVM_FUNCTION_NAME
+/// \brief Expands to __func__ on compilers which support it. Otherwise,
+/// expands to a compiler-dependent replacement.
+#if defined(_MSC_VER)
+# define LLVM_FUNCTION_NAME __FUNCTION__
+#else
+# define LLVM_FUNCTION_NAME __func__
+#endif
+
+/// \macro LLVM_MEMORY_SANITIZER_BUILD
+/// \brief Whether LLVM itself is built with MemorySanitizer instrumentation.
+#if __has_feature(memory_sanitizer)
+# define LLVM_MEMORY_SANITIZER_BUILD 1
+# include <sanitizer/msan_interface.h>
+#else
+# define LLVM_MEMORY_SANITIZER_BUILD 0
+# define __msan_allocated_memory(p, size)
+# define __msan_unpoison(p, size)
+#endif
+
+/// \macro LLVM_ADDRESS_SANITIZER_BUILD
+/// \brief Whether LLVM itself is built with AddressSanitizer instrumentation.
+#if __has_feature(address_sanitizer) || defined(__SANITIZE_ADDRESS__)
+# define LLVM_ADDRESS_SANITIZER_BUILD 1
+# include <sanitizer/asan_interface.h>
+#else
+# define LLVM_ADDRESS_SANITIZER_BUILD 0
+# define __asan_poison_memory_region(p, size)
+# define __asan_unpoison_memory_region(p, size)
+#endif
+
+/// \macro LLVM_THREAD_SANITIZER_BUILD
+/// \brief Whether LLVM itself is built with ThreadSanitizer instrumentation.
+#if __has_feature(thread_sanitizer) || defined(__SANITIZE_THREAD__)
+# define LLVM_THREAD_SANITIZER_BUILD 1
+#else
+# define LLVM_THREAD_SANITIZER_BUILD 0
+#endif
+
+#if LLVM_THREAD_SANITIZER_BUILD
+// Thread Sanitizer is a tool that finds races in code.
+// See http://code.google.com/p/data-race-test/wiki/DynamicAnnotations .
+// tsan detects these exact functions by name.
+extern "C" {
+void AnnotateHappensAfter(const char *file, int line, const volatile void *cv);
+void AnnotateHappensBefore(const char *file, int line, const volatile void *cv);
+void AnnotateIgnoreWritesBegin(const char *file, int line);
+void AnnotateIgnoreWritesEnd(const char *file, int line);
+}
+
+// This marker is used to define a happens-before arc. The race detector will
+// infer an arc from the begin to the end when they share the same pointer
+// argument.
+# define TsanHappensBefore(cv) AnnotateHappensBefore(__FILE__, __LINE__, cv)
+
+// This marker defines the destination of a happens-before arc.
+# define TsanHappensAfter(cv) AnnotateHappensAfter(__FILE__, __LINE__, cv)
+
+// Ignore any races on writes between here and the next TsanIgnoreWritesEnd.
+# define TsanIgnoreWritesBegin() AnnotateIgnoreWritesBegin(__FILE__, __LINE__)
+
+// Resume checking for racy writes.
+# define TsanIgnoreWritesEnd() AnnotateIgnoreWritesEnd(__FILE__, __LINE__)
+#else
+# define TsanHappensBefore(cv)
+# define TsanHappensAfter(cv)
+# define TsanIgnoreWritesBegin()
+# define TsanIgnoreWritesEnd()
+#endif
+
+/// \brief Mark debug helper function definitions like dump() that should not be
+/// stripped from debug builds.
+// FIXME: Move this to a private config.h as it's not usable in public headers.
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+#define LLVM_DUMP_METHOD LLVM_ATTRIBUTE_NOINLINE LLVM_ATTRIBUTE_USED
+#else
+#define LLVM_DUMP_METHOD LLVM_ATTRIBUTE_NOINLINE
+#endif
+
+/// \macro LLVM_THREAD_LOCAL
+/// \brief A thread-local storage specifier which can be used with globals,
+/// extern globals, and static globals.
+///
+/// This is essentially an extremely restricted analog to C++11's thread_local
+/// support, and uses that when available. However, it falls back on
+/// platform-specific or vendor-provided extensions when necessary. These
+/// extensions don't support many of the C++11 thread_local's features. You
+/// should only use this for PODs that you can statically initialize to
+/// some constant value. In almost all circumstances this is most appropriate
+/// for use with a pointer, integer, or small aggregation of pointers and
+/// integers.
+#if __has_feature(cxx_thread_local)
+#define LLVM_THREAD_LOCAL thread_local
+#else
+// Clang, GCC, and other compatible compilers used __thread prior to C++11 and
+// we only need the restricted functionality that provides.
+#define LLVM_THREAD_LOCAL __thread
+#endif
+
+#endif
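
A short sketch showing a couple of these portability macros in use (the parseLine() function and main() driver are illustrative assumptions):

    #include "llvm/Support/Compiler.h"
    #include <cstdio>

    // Callers should not discard the result where the attribute is supported;
    // on other compilers the macro expands to nothing.
    LLVM_ATTRIBUTE_UNUSED_RESULT static bool parseLine(const char *Line) {
      return Line && Line[0] != '#';
    }

    int main(int argc, char **argv) {
      if (LLVM_UNLIKELY(argc < 2)) {  // hint to the compiler: the error path is cold
        std::fprintf(stderr, "usage: %s <line>\n", argv[0]);
        return 1;
      }
      return parseLine(argv[1]) ? 0 : 1;
    }
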
diff --git a/ext/include/llvm/Support/DataTypes.h b/ext/include/llvm/Support/DataTypes.h
new file mode 100644
index 0000000..dd90896
--- /dev/null
+++ b/ext/include/llvm/Support/DataTypes.h
@@ -0,0 +1,56 @@
+/*===-- include/Support/DataTypes.h - Define fixed size types -----*- C -*-===*\
+|* *|
+|* The LLVM Compiler Infrastructure *|
+|* *|
+|* This file is distributed under the University of Illinois Open Source *|
+|* License. See LICENSE.TXT for details. *|
+|* *|
+|*===----------------------------------------------------------------------===*|
+|* *|
+|* This file contains definitions to figure out the size of _HOST_ data types.*|
+|* This file is important because different host OS's define different macros,*|
+|* which makes portability tough. This file exports the following *|
+|* definitions: *|
+|* *|
+|* [u]int(32|64)_t : typedefs for signed and unsigned 32/64 bit system types*|
+|* [U]INT(8|16|32|64)_(MIN|MAX) : Constants for the min and max values. *|
+|* *|
+|* No library is required when using these functions. *|
+|* *|
+|*===----------------------------------------------------------------------===*/
+
+/* Please leave this file C-compatible. */
+
+/* Please keep this file in sync with DataTypes.h.in */
+
+#ifndef SUPPORT_DATATYPES_H
+#define SUPPORT_DATATYPES_H
+
+#define HAVE_INTTYPES_H 1
+#define HAVE_STDINT_H 1
+#define HAVE_UINT64_T 1
+#define HAVE_U_INT64_T 1
+
+#ifdef __cplusplus
+#include <cmath>
+#else
+#include <math.h>
+#endif
+
+#include <inttypes.h>
+
+/* Note that <inttypes.h> includes <stdint.h>, if this is a C99 system. */
+#include <sys/types.h>
+
+/* Set defaults for constants which we cannot find. */
+#if !defined(INT64_MAX)
+# define INT64_MAX 9223372036854775807LL
+#endif
+#if !defined(INT64_MIN)
+# define INT64_MIN ((-INT64_MAX)-1)
+#endif
+#if !defined(UINT64_MAX)
+# define UINT64_MAX 0xffffffffffffffffULL
+#endif
+
+#endif /* SUPPORT_DATATYPES_H */
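
A tiny sketch relying on the constants the header guarantees (illustrative only):

    #include "llvm/Support/DataTypes.h"
    #include <cstdio>

    int main() {
      uint64_t Big = UINT64_MAX;  // defined by the header even if the libc omits it
      std::printf("max uint64_t: %llu\n", (unsigned long long)Big);
      return 0;
    }
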
diff --git a/ext/include/llvm/Support/Debug.h b/ext/include/llvm/Support/Debug.h
new file mode 100644
index 0000000..6e21347
--- /dev/null
+++ b/ext/include/llvm/Support/Debug.h
@@ -0,0 +1,96 @@
+//===- llvm/Support/Debug.h - Easy way to add debug output ------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a handy way of adding debugging information to your
+// code, without it being enabled all of the time, and without having to add
+// command line options to enable it.
+//
+// In particular, just wrap your code with the DEBUG() macro, and it will be
+// enabled automatically if you specify '-debug' on the command-line.
+// DEBUG() requires the DEBUG_TYPE macro to be defined. Set it to "foo" to specify
+// that your debug code belongs to class "foo". Be careful that you only do
+// this after including Debug.h and not around any #include of headers. Headers
+// should define and undef the macro around the code that needs to use the
+// DEBUG() macro. Then, on the command line, you can specify '-debug-only=foo'
+// to enable JUST the debug information for the foo class.
+//
+// When compiling without assertions, the -debug-* options and all code in
+// DEBUG() statements disappears, so it does not affect the runtime of the code.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_SUPPORT_DEBUG_H
+#define LLVM_SUPPORT_DEBUG_H
+
+namespace llvm {
+class raw_ostream;
+
+#ifndef NDEBUG
+/// DebugFlag - This boolean is set to true if the '-debug' command line option
+/// is specified. This should probably not be referenced directly, instead, use
+/// the DEBUG macro below.
+///
+extern bool DebugFlag;
+
+/// isCurrentDebugType - Return true if the specified string is the debug type
+/// specified on the command line, or if none was specified on the command line
+/// with the -debug-only=X option.
+///
+bool isCurrentDebugType(const char *Type);
+
+/// setCurrentDebugType - Set the current debug type, as if the -debug-only=X
+/// option were specified. Note that DebugFlag also needs to be set to true for
+/// debug output to be produced.
+///
+void setCurrentDebugType(const char *Type);
+
+/// DEBUG_WITH_TYPE macro - This macro should be used by passes to emit debug
+/// information. If the '-debug' option is specified on the command line, and if
+/// this is a debug build, then the code specified as the option to the macro
+/// will be executed. Otherwise it will not be. Example:
+///
+/// DEBUG_WITH_TYPE("bitset", dbgs() << "Bitset contains: " << Bitset << "\n");
+///
+/// This will emit the debug information if -debug is present, and -debug-only
+/// is not specified, or is specified as "bitset".
+#define DEBUG_WITH_TYPE(TYPE, X) \
+ do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType(TYPE)) { X; } \
+ } while (0)
+
+#else
+#define isCurrentDebugType(X) (false)
+#define setCurrentDebugType(X)
+#define DEBUG_WITH_TYPE(TYPE, X) do { } while (0)
+#endif
+
+/// EnableDebugBuffering - This defaults to false. If true, the debug
+/// stream will install signal handlers to dump any buffered debug
+/// output. It allows clients to selectively allow the debug stream
+/// to install signal handlers if they are certain there will be no
+/// conflict.
+///
+extern bool EnableDebugBuffering;
+
+/// dbgs() - This returns a reference to a raw_ostream for debugging
+/// messages. If debugging is disabled it returns errs(). Use it
+/// like: dbgs() << "foo" << "bar";
+raw_ostream &dbgs();
+
+// DEBUG macro - This macro should be used by passes to emit debug information.
+// If the '-debug' option is specified on the command line, and if this is a
+// debug build, then the code specified as the option to the macro will be
+// executed. Otherwise it will not be. Example:
+//
+// DEBUG(dbgs() << "Bitset contains: " << Bitset << "\n");
+//
+#define DEBUG(X) DEBUG_WITH_TYPE(DEBUG_TYPE, X)
+
+} // End llvm namespace
+
+#endif
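
A minimal sketch of the DEBUG()/DEBUG_TYPE pattern described above (the processKmer() function is illustrative; DebugFlag and isCurrentDebugType() are defined in a separate Debug.cpp that is not part of this hunk, and dbgs() would be the usual sink but needs a raw_ostream definition that is also not shown here, so plain fprintf is used for the sketch):

    #define DEBUG_TYPE "kmer"
    #include "llvm/Support/Debug.h"
    #include <cstdio>

    void processKmer(unsigned K) {
      // Compiled out entirely in NDEBUG builds; otherwise gated at run time
      // by -debug and -debug-only=kmer.
      DEBUG(std::fprintf(stderr, "processing k-mer of length %u\n", K));
    }
    #undef DEBUG_TYPE
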
diff --git a/ext/include/llvm/Support/Errc.h b/ext/include/llvm/Support/Errc.h
new file mode 100644
index 0000000..80bfe2a
--- /dev/null
+++ b/ext/include/llvm/Support/Errc.h
@@ -0,0 +1,86 @@
+//===- llvm/Support/Errc.h - Defines the llvm::errc enum --------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// While std::error_code works OK on all platforms we use, there are some
+// problems with std::errc that can be avoided by using our own
+// enumeration:
+//
+// * std::errc is a namespace in some implementations. That means that ADL
+// doesn't work and it is sometimes necessary to write std::make_error_code
+// or in templates:
+// using std::make_error_code;
+// make_error_code(...);
+//
+// with this enum it is safe to always just use make_error_code.
+//
+// * Some implementations define fewer names than others. This header has
+// the intersection of all the ones we support.
+//
+// * std::errc is just marked with is_error_condition_enum. This means that
+// common patterns like AnErrorCode == errc::no_such_file_or_directory take
+// 4 virtual calls instead of two comparisons.
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_SUPPORT_ERRC_H
+#define LLVM_SUPPORT_ERRC_H
+
+#include <system_error>
+
+namespace llvm {
+enum class errc {
+ argument_list_too_long = int(std::errc::argument_list_too_long),
+ argument_out_of_domain = int(std::errc::argument_out_of_domain),
+ bad_address = int(std::errc::bad_address),
+ bad_file_descriptor = int(std::errc::bad_file_descriptor),
+ broken_pipe = int(std::errc::broken_pipe),
+ device_or_resource_busy = int(std::errc::device_or_resource_busy),
+ directory_not_empty = int(std::errc::directory_not_empty),
+ executable_format_error = int(std::errc::executable_format_error),
+ file_exists = int(std::errc::file_exists),
+ file_too_large = int(std::errc::file_too_large),
+ filename_too_long = int(std::errc::filename_too_long),
+ function_not_supported = int(std::errc::function_not_supported),
+ illegal_byte_sequence = int(std::errc::illegal_byte_sequence),
+ inappropriate_io_control_operation =
+ int(std::errc::inappropriate_io_control_operation),
+ interrupted = int(std::errc::interrupted),
+ invalid_argument = int(std::errc::invalid_argument),
+ invalid_seek = int(std::errc::invalid_seek),
+ io_error = int(std::errc::io_error),
+ is_a_directory = int(std::errc::is_a_directory),
+ no_child_process = int(std::errc::no_child_process),
+ no_lock_available = int(std::errc::no_lock_available),
+ no_space_on_device = int(std::errc::no_space_on_device),
+ no_such_device_or_address = int(std::errc::no_such_device_or_address),
+ no_such_device = int(std::errc::no_such_device),
+ no_such_file_or_directory = int(std::errc::no_such_file_or_directory),
+ no_such_process = int(std::errc::no_such_process),
+ not_a_directory = int(std::errc::not_a_directory),
+ not_enough_memory = int(std::errc::not_enough_memory),
+ operation_not_permitted = int(std::errc::operation_not_permitted),
+ permission_denied = int(std::errc::permission_denied),
+ read_only_file_system = int(std::errc::read_only_file_system),
+ resource_deadlock_would_occur = int(std::errc::resource_deadlock_would_occur),
+ resource_unavailable_try_again =
+ int(std::errc::resource_unavailable_try_again),
+ result_out_of_range = int(std::errc::result_out_of_range),
+ too_many_files_open_in_system = int(std::errc::too_many_files_open_in_system),
+ too_many_files_open = int(std::errc::too_many_files_open),
+ too_many_links = int(std::errc::too_many_links)
+};
+
+inline std::error_code make_error_code(errc E) {
+ return std::error_code(static_cast<int>(E), std::generic_category());
+}
+}
+
+namespace std {
+template <> struct is_error_code_enum<llvm::errc> : std::true_type {};
+}
+#endif
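
A short sketch of the intended use (illustrative, not part of the header): make_error_code resolves without the std::errc ADL workaround, and comparisons against std::error_code are plain value checks because llvm::errc is registered as an error code enum.

    #include "llvm/Support/Errc.h"

    static std::error_code openConfig(bool Found) {
      if (!Found)
        return llvm::make_error_code(llvm::errc::no_such_file_or_directory);
      return std::error_code();            // success
    }

    static bool isMissing(std::error_code EC) {
      // Enabled by the std::is_error_code_enum<llvm::errc> specialization above.
      return EC == llvm::errc::no_such_file_or_directory;
    }
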
diff --git a/ext/include/llvm/Support/Errno.h b/ext/include/llvm/Support/Errno.h
new file mode 100644
index 0000000..8e145c7
--- /dev/null
+++ b/ext/include/llvm/Support/Errno.h
@@ -0,0 +1,34 @@
+//===- llvm/Support/Errno.h - Portable+convenient errno handling -*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares some portable and convenient functions to deal with errno.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_SUPPORT_ERRNO_H
+#define LLVM_SUPPORT_ERRNO_H
+
+#include <string>
+
+namespace llvm {
+namespace sys {
+
+/// Returns a string representation of the errno value, using whatever
+/// thread-safe variant of strerror() is available. Be sure to call this
+/// immediately after the function that set errno, or errno may have been
+/// overwritten by an intervening call.
+std::string StrError();
+
+/// Like the no-argument version above, but uses \p errnum instead of errno.
+std::string StrError(int errnum);
+
+} // namespace sys
+} // namespace llvm
+
+#endif // LLVM_SUPPORT_ERRNO_H
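
An illustrative use of StrError() right after a failing libc call (the file name is made up):

    #include "llvm/Support/Errno.h"
    #include <cstdio>

    static void tryOpen() {
      if (std::FILE *F = std::fopen("missing.cfg", "r")) {
        std::fclose(F);
        return;
      }
      // Call StrError() immediately, before anything else can overwrite errno.
      std::fprintf(stderr, "fopen failed: %s\n", llvm::sys::StrError().c_str());
    }
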
diff --git a/ext/include/llvm/Support/ErrorHandling.h b/ext/include/llvm/Support/ErrorHandling.h
new file mode 100644
index 0000000..32f05e0
--- /dev/null
+++ b/ext/include/llvm/Support/ErrorHandling.h
@@ -0,0 +1,106 @@
+//===- llvm/Support/ErrorHandling.h - Fatal error handling ------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines an API used to indicate fatal error conditions. Non-fatal
+// errors (most of them) should be handled through LLVMContext.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_SUPPORT_ERRORHANDLING_H
+#define LLVM_SUPPORT_ERRORHANDLING_H
+
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/Compiler.h"
+#include <string>
+
+namespace llvm {
+ class Twine;
+
+ /// An error handler callback.
+ typedef void (*fatal_error_handler_t)(void *user_data,
+ const std::string& reason,
+ bool gen_crash_diag);
+
+ /// install_fatal_error_handler - Installs a new error handler to be used
+ /// whenever a serious (non-recoverable) error is encountered by LLVM.
+ ///
+ /// If no error handler is installed the default is to print the error message
+ /// to stderr, and call exit(1). If an error handler is installed then it is
+ /// the handler's responsibility to log the message; it will no longer be
+ /// printed to stderr. If the error handler returns, then exit(1) will be
+ /// called.
+ ///
+ /// It is dangerous to naively use an error handler which throws an exception.
+ /// Even though some applications desire to gracefully recover from arbitrary
+ /// faults, blindly throwing exceptions through unfamiliar code isn't a way to
+ /// achieve this.
+ ///
+ /// \param user_data - An argument which will be passed to the install error
+ /// handler.
+ void install_fatal_error_handler(fatal_error_handler_t handler,
+ void *user_data = nullptr);
+
+ /// Restores default error handling behaviour.
+ void remove_fatal_error_handler();
+
+ /// ScopedFatalErrorHandler - This is a simple helper class which just
+ /// calls install_fatal_error_handler in its constructor and
+ /// remove_fatal_error_handler in its destructor.
+ struct ScopedFatalErrorHandler {
+ explicit ScopedFatalErrorHandler(fatal_error_handler_t handler,
+ void *user_data = nullptr) {
+ install_fatal_error_handler(handler, user_data);
+ }
+
+ ~ScopedFatalErrorHandler() { remove_fatal_error_handler(); }
+ };
+
+/// Reports a serious error, calling any installed error handler. These
+/// functions are intended to be used for error conditions which are outside
+/// the control of the compiler (I/O errors, invalid user input, etc.)
+///
+/// If no error handler is installed the default is to print the message to
+/// standard error, followed by a newline.
+/// After the error handler is called, this function will call exit(1); it
+/// does not return.
+LLVM_ATTRIBUTE_NORETURN void report_fatal_error(const char *reason,
+ bool gen_crash_diag = true);
+LLVM_ATTRIBUTE_NORETURN void report_fatal_error(const std::string &reason,
+ bool gen_crash_diag = true);
+LLVM_ATTRIBUTE_NORETURN void report_fatal_error(StringRef reason,
+ bool gen_crash_diag = true);
+LLVM_ATTRIBUTE_NORETURN void report_fatal_error(const Twine &reason,
+ bool gen_crash_diag = true);
+
+ /// This function calls abort(), and prints the optional message to stderr.
+ /// Use the llvm_unreachable macro (that adds location info), instead of
+ /// calling this function directly.
+ LLVM_ATTRIBUTE_NORETURN void
+ llvm_unreachable_internal(const char *msg=nullptr, const char *file=nullptr,
+ unsigned line=0);
+}
+
+/// Marks that the current location is not supposed to be reachable.
+/// In !NDEBUG builds, prints the message and location info to stderr.
+/// In NDEBUG builds, becomes an optimizer hint that the current location
+/// is not supposed to be reachable. On compilers that don't support
+/// such hints, prints a reduced message instead.
+///
+/// Use this instead of assert(0). It conveys intent more clearly and
+/// allows compilers to omit some unnecessary code.
+#ifndef NDEBUG
+#define llvm_unreachable(msg) \
+ ::llvm::llvm_unreachable_internal(msg, __FILE__, __LINE__)
+#elif defined(LLVM_BUILTIN_UNREACHABLE)
+#define llvm_unreachable(msg) LLVM_BUILTIN_UNREACHABLE
+#else
+#define llvm_unreachable(msg) ::llvm::llvm_unreachable_internal()
+#endif
+
+#endif
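
A sketch of the two intended uses: installing a process-wide handler for fatal errors, and marking impossible switch cases (the handler and helper names are illustrative):

    #include "llvm/Support/ErrorHandling.h"
    #include <string>

    static void logFatal(void *, const std::string &Reason, bool) {
      // A real handler would record Reason somewhere persistent;
      // exit(1) still runs after this returns.
      (void)Reason;
    }

    static void initErrorHandling() {
      llvm::install_fatal_error_handler(logFatal);
    }

    static unsigned widthOf(int Kind) {
      switch (Kind) {
      case 0: return 8;
      case 1: return 16;
      }
      llvm_unreachable("unhandled Kind"); // assert-style in !NDEBUG, optimizer hint otherwise
    }
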
diff --git a/ext/include/llvm/Support/ErrorOr.h b/ext/include/llvm/Support/ErrorOr.h
new file mode 100644
index 0000000..ca6ede7
--- /dev/null
+++ b/ext/include/llvm/Support/ErrorOr.h
@@ -0,0 +1,298 @@
+//===- llvm/Support/ErrorOr.h - Error Smart Pointer -------------*- C++ -*-===//
+//
+// The LLVM Linker
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+///
+/// Provides ErrorOr<T> smart pointer.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_SUPPORT_ERROROR_H
+#define LLVM_SUPPORT_ERROROR_H
+
+#include "llvm/ADT/PointerIntPair.h"
+#include "llvm/Support/AlignOf.h"
+#include <cassert>
+#include <system_error>
+#include <type_traits>
+
+namespace llvm {
+template<class T, class V>
+typename std::enable_if< std::is_constructible<T, V>::value
+ , typename std::remove_reference<V>::type>::type &&
+ moveIfMoveConstructible(V &Val) {
+ return std::move(Val);
+}
+
+template<class T, class V>
+typename std::enable_if< !std::is_constructible<T, V>::value
+ , typename std::remove_reference<V>::type>::type &
+moveIfMoveConstructible(V &Val) {
+ return Val;
+}
+
+/// \brief Stores a reference that can be changed.
+template <typename T>
+class ReferenceStorage {
+ T *Storage;
+
+public:
+ ReferenceStorage(T &Ref) : Storage(&Ref) {}
+
+ operator T &() const { return *Storage; }
+ T &get() const { return *Storage; }
+};
+
+/// \brief Represents either an error or a value T.
+///
+/// ErrorOr<T> is a pointer-like class that represents the result of an
+/// operation. The result is either an error, or a value of type T. This is
+/// designed to emulate the usage of returning a pointer where nullptr indicates
+/// failure. However instead of just knowing that the operation failed, we also
+/// have an error_code and optional user data that describes why it failed.
+///
+/// It is used like the following.
+/// \code
+/// ErrorOr<Buffer> getBuffer();
+///
+/// auto buffer = getBuffer();
+/// if (std::error_code ec = buffer.getError())
+/// return ec;
+/// buffer->write("adena");
+/// \endcode
+///
+///
+/// Implicit conversion to bool returns true if there is a usable value. The
+/// unary * and -> operators provide pointer like access to the value. Accessing
+/// the value when there is an error has undefined behavior.
+///
+/// When T is a reference type the behavior is slightly different. The reference
+/// is held in a std::reference_wrapper<std::remove_reference<T>::type>, and
+/// there is special handling to make operator -> work as if T was not a
+/// reference.
+///
+/// T cannot be an rvalue reference.
+template<class T>
+class ErrorOr {
+ template <class OtherT> friend class ErrorOr;
+ static const bool isRef = std::is_reference<T>::value;
+ typedef ReferenceStorage<typename std::remove_reference<T>::type> wrap;
+
+public:
+ typedef typename std::conditional<isRef, wrap, T>::type storage_type;
+
+private:
+ typedef typename std::remove_reference<T>::type &reference;
+ typedef const typename std::remove_reference<T>::type &const_reference;
+ typedef typename std::remove_reference<T>::type *pointer;
+ typedef const typename std::remove_reference<T>::type *const_pointer;
+
+public:
+ template <class E>
+ ErrorOr(E ErrorCode,
+ typename std::enable_if<std::is_error_code_enum<E>::value ||
+ std::is_error_condition_enum<E>::value,
+ void *>::type = 0)
+ : HasError(true) {
+ new (getErrorStorage()) std::error_code(make_error_code(ErrorCode));
+ }
+
+ ErrorOr(std::error_code EC) : HasError(true) {
+ new (getErrorStorage()) std::error_code(EC);
+ }
+
+ ErrorOr(T Val) : HasError(false) {
+ new (getStorage()) storage_type(moveIfMoveConstructible<storage_type>(Val));
+ }
+
+ ErrorOr(const ErrorOr &Other) {
+ copyConstruct(Other);
+ }
+
+ template <class OtherT>
+ ErrorOr(
+ const ErrorOr<OtherT> &Other,
+ typename std::enable_if<std::is_convertible<OtherT, T>::value>::type * =
+ nullptr) {
+ copyConstruct(Other);
+ }
+
+ template <class OtherT>
+ explicit ErrorOr(
+ const ErrorOr<OtherT> &Other,
+ typename std::enable_if<
+ !std::is_convertible<OtherT, const T &>::value>::type * = nullptr) {
+ copyConstruct(Other);
+ }
+
+ ErrorOr(ErrorOr &&Other) {
+ moveConstruct(std::move(Other));
+ }
+
+ template <class OtherT>
+ ErrorOr(
+ ErrorOr<OtherT> &&Other,
+ typename std::enable_if<std::is_convertible<OtherT, T>::value>::type * =
+ nullptr) {
+ moveConstruct(std::move(Other));
+ }
+
+ // This might eventually need SFINAE but it's more complex than is_convertible
+ // & I'm too lazy to write it right now.
+ template <class OtherT>
+ explicit ErrorOr(
+ ErrorOr<OtherT> &&Other,
+ typename std::enable_if<!std::is_convertible<OtherT, T>::value>::type * =
+ nullptr) {
+ moveConstruct(std::move(Other));
+ }
+
+ ErrorOr &operator=(const ErrorOr &Other) {
+ copyAssign(Other);
+ return *this;
+ }
+
+ ErrorOr &operator=(ErrorOr &&Other) {
+ moveAssign(std::move(Other));
+ return *this;
+ }
+
+ ~ErrorOr() {
+ if (!HasError)
+ getStorage()->~storage_type();
+ }
+
+ /// \brief Return false if there is an error.
+ explicit operator bool() const {
+ return !HasError;
+ }
+
+ reference get() { return *getStorage(); }
+ const_reference get() const { return const_cast<ErrorOr<T> *>(this)->get(); }
+
+ std::error_code getError() const {
+ return HasError ? *getErrorStorage() : std::error_code();
+ }
+
+ pointer operator ->() {
+ return toPointer(getStorage());
+ }
+
+ const_pointer operator->() const { return toPointer(getStorage()); }
+
+ reference operator *() {
+ return *getStorage();
+ }
+
+ const_reference operator*() const { return *getStorage(); }
+
+private:
+ template <class OtherT>
+ void copyConstruct(const ErrorOr<OtherT> &Other) {
+ if (!Other.HasError) {
+ // Get the other value.
+ HasError = false;
+ new (getStorage()) storage_type(*Other.getStorage());
+ } else {
+ // Get other's error.
+ HasError = true;
+ new (getErrorStorage()) std::error_code(Other.getError());
+ }
+ }
+
+ template <class T1>
+ static bool compareThisIfSameType(const T1 &a, const T1 &b) {
+ return &a == &b;
+ }
+
+ template <class T1, class T2>
+ static bool compareThisIfSameType(const T1 &a, const T2 &b) {
+ return false;
+ }
+
+ template <class OtherT>
+ void copyAssign(const ErrorOr<OtherT> &Other) {
+ if (compareThisIfSameType(*this, Other))
+ return;
+
+ this->~ErrorOr();
+ new (this) ErrorOr(Other);
+ }
+
+ template <class OtherT>
+ void moveConstruct(ErrorOr<OtherT> &&Other) {
+ if (!Other.HasError) {
+ // Get the other value.
+ HasError = false;
+ new (getStorage()) storage_type(std::move(*Other.getStorage()));
+ } else {
+ // Get other's error.
+ HasError = true;
+ new (getErrorStorage()) std::error_code(Other.getError());
+ }
+ }
+
+ template <class OtherT>
+ void moveAssign(ErrorOr<OtherT> &&Other) {
+ if (compareThisIfSameType(*this, Other))
+ return;
+
+ this->~ErrorOr();
+ new (this) ErrorOr(std::move(Other));
+ }
+
+ pointer toPointer(pointer Val) {
+ return Val;
+ }
+
+ const_pointer toPointer(const_pointer Val) const { return Val; }
+
+ pointer toPointer(wrap *Val) {
+ return &Val->get();
+ }
+
+ const_pointer toPointer(const wrap *Val) const { return &Val->get(); }
+
+ storage_type *getStorage() {
+ assert(!HasError && "Cannot get value when an error exists!");
+ return reinterpret_cast<storage_type*>(TStorage.buffer);
+ }
+
+ const storage_type *getStorage() const {
+ assert(!HasError && "Cannot get value when an error exists!");
+ return reinterpret_cast<const storage_type*>(TStorage.buffer);
+ }
+
+ std::error_code *getErrorStorage() {
+ assert(HasError && "Cannot get error when a value exists!");
+ return reinterpret_cast<std::error_code *>(ErrorStorage.buffer);
+ }
+
+ const std::error_code *getErrorStorage() const {
+ return const_cast<ErrorOr<T> *>(this)->getErrorStorage();
+ }
+
+
+ union {
+ AlignedCharArrayUnion<storage_type> TStorage;
+ AlignedCharArrayUnion<std::error_code> ErrorStorage;
+ };
+ bool HasError : 1;
+};
+
+template <class T, class E>
+typename std::enable_if<std::is_error_code_enum<E>::value ||
+ std::is_error_condition_enum<E>::value,
+ bool>::type
+operator==(const ErrorOr<T> &Err, E Code) {
+ return Err.getError() == Code;
+}
+} // end namespace llvm
+
+#endif
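
A sketch of the pointer-like usage described in the class comment (function and value names are illustrative):

    #include "llvm/Support/Errc.h"
    #include "llvm/Support/ErrorOr.h"
    #include <string>

    static llvm::ErrorOr<std::string> readSetting(bool Present) {
      if (!Present)
        return llvm::errc::no_such_file_or_directory; // error constructor
      return std::string("42");                       // value constructor
    }

    static std::error_code useSetting() {
      auto Setting = readSetting(true);
      if (std::error_code EC = Setting.getError())
        return EC;                                    // propagate the failure
      // operator-> and operator* reach the contained value.
      return Setting->empty() ? llvm::make_error_code(llvm::errc::invalid_argument)
                              : std::error_code();
    }
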
diff --git a/ext/include/llvm/Support/FileOutputBuffer.h b/ext/include/llvm/Support/FileOutputBuffer.h
new file mode 100644
index 0000000..3bcf64a
--- /dev/null
+++ b/ext/include/llvm/Support/FileOutputBuffer.h
@@ -0,0 +1,89 @@
+//=== FileOutputBuffer.h - File Output Buffer -------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Utility for creating an in-memory buffer that will be written to a file.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_SUPPORT_FILEOUTPUTBUFFER_H
+#define LLVM_SUPPORT_FILEOUTPUTBUFFER_H
+
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/DataTypes.h"
+#include "llvm/Support/ErrorOr.h"
+#include "llvm/Support/FileSystem.h"
+
+namespace llvm {
+/// FileOutputBuffer - This interface provides a simple way to create an in-memory
+/// buffer which will be written to a file. During the lifetime of these
+/// objects, the content or existence of the specified file is undefined. That
+/// is, creating an OutputBuffer for a file may immediately remove the file.
+/// If the FileOutputBuffer is committed, the target file's content will become
+/// the buffer content at the time of the commit. If the FileOutputBuffer is
+/// not committed, the file will be deleted in the FileOutputBuffer destructor.
+class FileOutputBuffer {
+public:
+
+ enum {
+ F_executable = 1 /// set the 'x' bit on the resulting file
+ };
+
+ /// Factory method to create an OutputBuffer object which manages a read/write
+ /// buffer of the specified size. When committed, the buffer will be written
+ /// to the file at the specified path.
+ static ErrorOr<std::unique_ptr<FileOutputBuffer>>
+ create(StringRef FilePath, size_t Size, unsigned Flags = 0);
+
+ /// Returns a pointer to the start of the buffer.
+ uint8_t *getBufferStart() {
+ return (uint8_t*)Region->data();
+ }
+
+ /// Returns a pointer to the end of the buffer.
+ uint8_t *getBufferEnd() {
+ return (uint8_t*)Region->data() + Region->size();
+ }
+
+ /// Returns size of the buffer.
+ size_t getBufferSize() const {
+ return Region->size();
+ }
+
+ /// Returns path where file will show up if buffer is committed.
+ StringRef getPath() const {
+ return FinalPath;
+ }
+
+ /// Flushes the content of the buffer to its file and deallocates the
+ /// buffer. If commit() is not called before this object's destructor
+ /// is called, the file is deleted in the destructor.
+ std::error_code commit();
+
+ /// If this object was previously committed, the destructor just deletes
+ /// this object. If this object was not committed, the destructor
+ /// deallocates the buffer and the target file is never written.
+ ~FileOutputBuffer();
+
+private:
+ FileOutputBuffer(const FileOutputBuffer &) = delete;
+ FileOutputBuffer &operator=(const FileOutputBuffer &) = delete;
+
+ FileOutputBuffer(std::unique_ptr<llvm::sys::fs::mapped_file_region> R,
+ StringRef Path, StringRef TempPath);
+
+ std::unique_ptr<llvm::sys::fs::mapped_file_region> Region;
+ SmallString<128> FinalPath;
+ SmallString<128> TempPath;
+};
+} // end namespace llvm
+
+#endif
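
A sketch of the create/commit cycle (path and payload are illustrative); the target file only appears on disk once commit() succeeds:

    #include "llvm/Support/FileOutputBuffer.h"
    #include <cstring>
    #include <memory>

    static std::error_code writeBlob(llvm::StringRef Path, llvm::StringRef Data) {
      auto BufOrErr = llvm::FileOutputBuffer::create(Path, Data.size());
      if (std::error_code EC = BufOrErr.getError())
        return EC;                                  // e.g. errc::permission_denied
      std::unique_ptr<llvm::FileOutputBuffer> &Buf = *BufOrErr;
      std::memcpy(Buf->getBufferStart(), Data.data(), Data.size());
      return Buf->commit();                         // skipping commit() discards the buffer
    }
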
diff --git a/ext/include/llvm/Support/FileSystem.h b/ext/include/llvm/Support/FileSystem.h
new file mode 100644
index 0000000..f764a2c
--- /dev/null
+++ b/ext/include/llvm/Support/FileSystem.h
@@ -0,0 +1,850 @@
+//===- llvm/Support/FileSystem.h - File System OS Concept -------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the llvm::sys::fs namespace. It is designed after
+// TR2/boost filesystem (v3), but modified to remove exception handling and the
+// path class.
+//
+// All functions return an error_code; their actual results are delivered via the
+// last out argument. The out argument is defined if and only if errc::success is
+// returned. A function may return any error code in the generic or system
+// category. However, they shall be equivalent to any error conditions listed
+// in each function's respective documentation if the condition applies. [ note:
+// this does not guarantee that error_code will be in the set of explicitly
+// listed codes, but it does guarantee that if any of the explicitly listed
+// errors occur, the correct error_code will be used ]. All functions may
+// return errc::not_enough_memory if there is not enough memory to complete the
+// operation.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_SUPPORT_FILESYSTEM_H
+#define LLVM_SUPPORT_FILESYSTEM_H
+
+#include "llvm/ADT/IntrusiveRefCntPtr.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/Support/DataTypes.h"
+#include "llvm/Support/ErrorHandling.h"
+#include <ctime>
+#include <iterator>
+#include <stack>
+#include <string>
+#include <system_error>
+#include <tuple>
+#include <vector>
+
+#ifdef HAVE_SYS_STAT_H
+#include <sys/stat.h>
+#endif
+
+namespace llvm {
+namespace sys {
+namespace fs {
+
+/// An enumeration for the file system's view of the type.
+enum class file_type {
+ status_error,
+ file_not_found,
+ regular_file,
+ directory_file,
+ symlink_file,
+ block_file,
+ character_file,
+ fifo_file,
+ socket_file,
+ type_unknown
+};
+
+/// space_info - Disk space information (capacity, free, and available), in bytes.
+struct space_info {
+ uint64_t capacity;
+ uint64_t free;
+ uint64_t available;
+};
+
+enum perms {
+ no_perms = 0,
+ owner_read = 0400,
+ owner_write = 0200,
+ owner_exe = 0100,
+ owner_all = owner_read | owner_write | owner_exe,
+ group_read = 040,
+ group_write = 020,
+ group_exe = 010,
+ group_all = group_read | group_write | group_exe,
+ others_read = 04,
+ others_write = 02,
+ others_exe = 01,
+ others_all = others_read | others_write | others_exe,
+ all_read = owner_read | group_read | others_read,
+ all_write = owner_write | group_write | others_write,
+ all_exe = owner_exe | group_exe | others_exe,
+ all_all = owner_all | group_all | others_all,
+ set_uid_on_exe = 04000,
+ set_gid_on_exe = 02000,
+ sticky_bit = 01000,
+ perms_not_known = 0xFFFF
+};
+
+// Helper functions so that you can use & and | to manipulate perms bits:
+inline perms operator|(perms l, perms r) {
+ return static_cast<perms>(static_cast<unsigned short>(l) |
+ static_cast<unsigned short>(r));
+}
+inline perms operator&(perms l, perms r) {
+ return static_cast<perms>(static_cast<unsigned short>(l) &
+ static_cast<unsigned short>(r));
+}
+inline perms &operator|=(perms &l, perms r) {
+ l = l | r;
+ return l;
+}
+inline perms &operator&=(perms &l, perms r) {
+ l = l & r;
+ return l;
+}
+inline perms operator~(perms x) {
+ return static_cast<perms>(~static_cast<unsigned short>(x));
+}
+
+class UniqueID {
+ uint64_t Device;
+ uint64_t File;
+
+public:
+ UniqueID() = default;
+ UniqueID(uint64_t Device, uint64_t File) : Device(Device), File(File) {}
+ bool operator==(const UniqueID &Other) const {
+ return Device == Other.Device && File == Other.File;
+ }
+ bool operator!=(const UniqueID &Other) const { return !(*this == Other); }
+ bool operator<(const UniqueID &Other) const {
+ return std::tie(Device, File) < std::tie(Other.Device, Other.File);
+ }
+ uint64_t getDevice() const { return Device; }
+ uint64_t getFile() const { return File; }
+};
+
+/// file_status - Represents the result of a call to stat and friends. It has
+/// a platform-specific member to store the result.
+class file_status
+{
+ dev_t fs_st_dev;
+ ino_t fs_st_ino;
+ time_t fs_st_mtime;
+ uid_t fs_st_uid;
+ gid_t fs_st_gid;
+ off_t fs_st_size;
+ friend bool equivalent(file_status A, file_status B);
+ file_type Type;
+ perms Perms;
+
+public:
+ file_status() : fs_st_dev(0), fs_st_ino(0), fs_st_mtime(0),
+ fs_st_uid(0), fs_st_gid(0), fs_st_size(0),
+ Type(file_type::status_error), Perms(perms_not_known) {}
+
+ file_status(file_type Type) : fs_st_dev(0), fs_st_ino(0), fs_st_mtime(0),
+ fs_st_uid(0), fs_st_gid(0), fs_st_size(0), Type(Type),
+ Perms(perms_not_known) {}
+
+ file_status(file_type Type, perms Perms, dev_t Dev, ino_t Ino, time_t MTime,
+ uid_t UID, gid_t GID, off_t Size)
+ : fs_st_dev(Dev), fs_st_ino(Ino), fs_st_mtime(MTime), fs_st_uid(UID),
+ fs_st_gid(GID), fs_st_size(Size), Type(Type), Perms(Perms) {}
+
+ // getters
+ file_type type() const { return Type; }
+ perms permissions() const { return Perms; }
+ UniqueID getUniqueID() const;
+
+ uint32_t getUser() const { return fs_st_uid; }
+ uint32_t getGroup() const { return fs_st_gid; }
+ uint64_t getSize() const { return fs_st_size; }
+
+ // setters
+ void type(file_type v) { Type = v; }
+ void permissions(perms p) { Perms = p; }
+};
+
+/// file_magic - An "enum class" enumeration of file types based on magic (the first
+/// N bytes of the file).
+struct file_magic {
+ enum Impl {
+ unknown = 0, ///< Unrecognized file
+ bitcode, ///< Bitcode file
+ archive, ///< ar style archive file
+ elf, ///< ELF Unknown type
+ elf_relocatable, ///< ELF Relocatable object file
+ elf_executable, ///< ELF Executable image
+ elf_shared_object, ///< ELF dynamically linked shared lib
+ elf_core, ///< ELF core image
+ macho_object, ///< Mach-O Object file
+ macho_executable, ///< Mach-O Executable
+ macho_fixed_virtual_memory_shared_lib, ///< Mach-O Shared Lib, FVM
+ macho_core, ///< Mach-O Core File
+ macho_preload_executable, ///< Mach-O Preloaded Executable
+ macho_dynamically_linked_shared_lib, ///< Mach-O dynlinked shared lib
+ macho_dynamic_linker, ///< The Mach-O dynamic linker
+ macho_bundle, ///< Mach-O Bundle file
+ macho_dynamically_linked_shared_lib_stub, ///< Mach-O Shared lib stub
+ macho_dsym_companion, ///< Mach-O dSYM companion file
+ macho_kext_bundle, ///< Mach-O kext bundle file
+ macho_universal_binary, ///< Mach-O universal binary
+ coff_object, ///< COFF object file
+ coff_import_library, ///< COFF import library
+ pecoff_executable, ///< PECOFF executable file
+ windows_resource ///< Windows compiled resource file (.rc)
+ };
+
+ bool is_object() const {
+ return V != unknown;
+ }
+
+ file_magic() : V(unknown) {}
+ file_magic(Impl V) : V(V) {}
+ operator Impl() const { return V; }
+
+private:
+ Impl V;
+};
+
+/// @}
+/// @name Physical Operators
+/// @{
+
+/// @brief Make \a path an absolute path.
+///
+/// Makes \a path absolute using the \a current_directory if it is not already.
+/// An empty \a path will result in the \a current_directory.
+///
+/// /absolute/path => /absolute/path
+/// relative/../path => <current-directory>/relative/../path
+///
+/// @param path A path that is modified to be an absolute path.
+/// @returns errc::success if \a path has been made absolute, otherwise a
+/// platform-specific error_code.
+std::error_code make_absolute(const Twine &current_directory,
+ SmallVectorImpl<char> &path);
+
+/// @brief Make \a path an absolute path.
+///
+/// Makes \a path absolute using the current directory if it is not already. An
+/// empty \a path will result in the current directory.
+///
+/// /absolute/path => /absolute/path
+/// relative/../path => <current-directory>/relative/../path
+///
+/// @param path A path that is modified to be an absolute path.
+/// @returns errc::success if \a path has been made absolute, otherwise a
+/// platform-specific error_code.
+std::error_code make_absolute(SmallVectorImpl<char> &path);
+
+/// @brief Create all the non-existent directories in path.
+///
+/// @param path Directories to create.
+/// @returns errc::success if is_directory(path), otherwise a platform
+/// specific error_code. If IgnoreExisting is false, also returns
+/// error if the directory already existed.
+std::error_code create_directories(const Twine &path,
+ bool IgnoreExisting = true,
+ perms Perms = owner_all | group_all);
+
+/// @brief Create the directory in path.
+///
+/// @param path Directory to create.
+/// @returns errc::success if is_directory(path), otherwise a platform
+/// specific error_code. If IgnoreExisting is false, also returns
+/// error if the directory already existed.
+std::error_code create_directory(const Twine &path, bool IgnoreExisting = true,
+ perms Perms = owner_all | group_all);
+
+/// @brief Create a link from \a from to \a to.
+///
+/// The link may be a soft or a hard link, depending on the platform. The caller
+/// may not assume which one. Currently on Windows it creates a hard link since
+/// soft links require extra privileges. On Unix, it creates a soft link since
+/// hard links don't work on SMB file systems.
+///
+/// @param to The path to hard link to.
+/// @param from The path to hard link from. This is created.
+/// @returns errc::success if the link was created, otherwise a platform
+/// specific error_code.
+std::error_code create_link(const Twine &to, const Twine &from);
+
+/// @brief Get the current path.
+///
+/// @param result Holds the current path on return.
+/// @returns errc::success if the current path has been stored in result,
+/// otherwise a platform-specific error_code.
+std::error_code current_path(SmallVectorImpl<char> &result);
+
+/// @brief Remove path. Equivalent to POSIX remove().
+///
+/// @param path Input path.
+/// @returns errc::success if path has been removed or didn't exist, otherwise a
+/// platform-specific error code. If IgnoreNonExisting is false, also
+/// returns error if the file didn't exist.
+std::error_code remove(const Twine &path, bool IgnoreNonExisting = true);
+
+/// @brief Rename \a from to \a to. Files are renamed as if by POSIX rename().
+///
+/// @param from The path to rename from.
+/// @param to The path to rename to. This is created.
+std::error_code rename(const Twine &from, const Twine &to);
+
+/// @brief Copy the contents of \a From to \a To.
+///
+/// @param From The path to copy from.
+/// @param To The path to copy to. This is created.
+std::error_code copy_file(const Twine &From, const Twine &To);
+
+/// @brief Resize path to size. File is resized as if by POSIX truncate().
+///
+/// @param FD Input file descriptor.
+/// @param Size Size to resize to.
+/// @returns errc::success if \a path has been resized to \a size, otherwise a
+/// platform-specific error_code.
+std::error_code resize_file(int FD, uint64_t Size);
+
+/// @}
+/// @name Physical Observers
+/// @{
+
+/// @brief Does file exist?
+///
+/// @param status A file_status previously returned from stat.
+/// @returns True if the file represented by status exists, false if it does
+/// not.
+bool exists(file_status status);
+
+enum class AccessMode { Exist, Write, Execute };
+
+/// @brief Can the file be accessed?
+///
+/// @param Path Input path.
+/// @returns errc::success if the path can be accessed, otherwise a
+/// platform-specific error_code.
+std::error_code access(const Twine &Path, AccessMode Mode);
+
+/// @brief Does file exist?
+///
+/// @param Path Input path.
+/// @returns True if it exists, false otherwise.
+inline bool exists(const Twine &Path) {
+ return !access(Path, AccessMode::Exist);
+}
+
+/// @brief Can we execute this file?
+///
+/// @param Path Input path.
+/// @returns True if we can execute it, false otherwise.
+bool can_execute(const Twine &Path);
+
+/// @brief Can we write this file?
+///
+/// @param Path Input path.
+/// @returns True if we can write to it, false otherwise.
+inline bool can_write(const Twine &Path) {
+ return !access(Path, AccessMode::Write);
+}
+
+/// @brief Do the two file_status values represent the same thing?
+///
+/// @param A Input file_status.
+/// @param B Input file_status.
+///
+/// assert(status_known(A) || status_known(B));
+///
+/// @returns True if A and B both represent the same file system entity, false
+/// otherwise.
+bool equivalent(file_status A, file_status B);
+
+/// @brief Do paths represent the same thing?
+///
+/// assert(status_known(A) || status_known(B));
+///
+/// @param A Input path A.
+/// @param B Input path B.
+/// @param result Set to true if stat(A) and stat(B) have the same device and
+/// inode (or equivalent).
+/// @returns errc::success if result has been successfully set, otherwise a
+/// platform-specific error_code.
+std::error_code equivalent(const Twine &A, const Twine &B, bool &result);
+
+/// @brief Simpler version of equivalent for clients that don't need to
+/// differentiate between an error and false.
+inline bool equivalent(const Twine &A, const Twine &B) {
+ bool result;
+ return !equivalent(A, B, result) && result;
+}
+
+/// @brief Does status represent a directory?
+///
+/// @param status A file_status previously returned from status.
+/// @returns status.type() == file_type::directory_file.
+bool is_directory(file_status status);
+
+/// @brief Is path a directory?
+///
+/// @param path Input path.
+/// @param result Set to true if \a path is a directory, false if it is not.
+/// Undefined otherwise.
+/// @returns errc::success if result has been successfully set, otherwise a
+/// platform-specific error_code.
+std::error_code is_directory(const Twine &path, bool &result);
+
+/// @brief Simpler version of is_directory for clients that don't need to
+/// differentiate between an error and false.
+inline bool is_directory(const Twine &Path) {
+ bool Result;
+ return !is_directory(Path, Result) && Result;
+}
+
+/// @brief Does status represent a regular file?
+///
+/// @param status A file_status previously returned from status.
+/// @returns status_known(status) && status.type() == file_type::regular_file.
+bool is_regular_file(file_status status);
+
+/// @brief Is path a regular file?
+///
+/// @param path Input path.
+/// @param result Set to true if \a path is a regular file, false if it is not.
+/// Undefined otherwise.
+/// @returns errc::success if result has been successfully set, otherwise a
+/// platform-specific error_code.
+std::error_code is_regular_file(const Twine &path, bool &result);
+
+/// @brief Simpler version of is_regular_file for clients that don't need to
+/// differentiate between an error and false.
+inline bool is_regular_file(const Twine &Path) {
+ bool Result;
+ if (is_regular_file(Path, Result))
+ return false;
+ return Result;
+}
+
+/// @brief Does this status represent something that exists but is not a
+/// directory, regular file, or symlink?
+///
+/// @param status A file_status previously returned from status.
+/// @returns exists(s) && !is_regular_file(s) && !is_directory(s)
+bool is_other(file_status status);
+
+/// @brief Is path something that exists but is not a directory,
+/// regular file, or symlink?
+///
+/// @param path Input path.
+/// @param result Set to true if \a path exists, but is not a directory, regular
+/// file, or a symlink, false if it does not. Undefined otherwise.
+/// @returns errc::success if result has been successfully set, otherwise a
+/// platform-specific error_code.
+std::error_code is_other(const Twine &path, bool &result);
+
+/// @brief Get file status as if by POSIX stat().
+///
+/// @param path Input path.
+/// @param result Set to the file status.
+/// @returns errc::success if result has been successfully set, otherwise a
+/// platform-specific error_code.
+std::error_code status(const Twine &path, file_status &result);
+
+/// @brief A version for when a file descriptor is already available.
+std::error_code status(int FD, file_status &Result);
+
+/// @brief Get file size.
+///
+/// @param Path Input path.
+/// @param Result Set to the size of the file in \a Path.
+/// @returns errc::success if result has been successfully set, otherwise a
+/// platform-specific error_code.
+inline std::error_code file_size(const Twine &Path, uint64_t &Result) {
+ file_status Status;
+ std::error_code EC = status(Path, Status);
+ if (EC)
+ return EC;
+ Result = Status.getSize();
+ return std::error_code();
+}
+
+/// @brief Is status available?
+///
+/// @param s Input file status.
+/// @returns True if status() != status_error.
+bool status_known(file_status s);
+
+/// @brief Is status available?
+///
+/// @param path Input path.
+/// @param result Set to true if status() != status_error.
+/// @returns errc::success if result has been successfully set, otherwise a
+/// platform-specific error_code.
+std::error_code status_known(const Twine &path, bool &result);
+
+/// @brief Create a uniquely named file.
+///
+/// Generates a unique path suitable for a temporary file and then opens it as a
+/// file. The name is based on \a model with '%' replaced by a random char in
+/// [0-9a-f]. If \a model is not an absolute path, the temporary file will be
+/// created in the current directory.
+///
+/// Example: clang-%%-%%-%%-%%-%%.s => clang-a0-b1-c2-d3-e4.s
+///
+/// This is an atomic operation. Either the file is created and opened, or the
+/// file system is left untouched.
+///
+/// The intended use is for files that are to be kept, possibly after
+/// renaming them. For example, when running 'clang -c foo.o', the file can
+/// be first created as foo-abc123.o and then renamed.
+///
+/// @param Model Name to base unique path off of.
+/// @param ResultFD Set to the opened file's file descriptor.
+/// @param ResultPath Set to the opened file's absolute path.
+/// @returns errc::success if Result{FD,Path} have been successfully set,
+/// otherwise a platform-specific error_code.
+std::error_code createUniqueFile(const Twine &Model, int &ResultFD,
+ SmallVectorImpl<char> &ResultPath,
+ unsigned Mode = all_read | all_write);
+
+/// @brief Simpler version for clients that don't want an open file.
+std::error_code createUniqueFile(const Twine &Model,
+ SmallVectorImpl<char> &ResultPath);
+
+/// @brief Create a file in the system temporary directory.
+///
+/// The filename is of the form prefix-random_chars.suffix. Since the directory
+/// is not known to the caller, Prefix and Suffix cannot have path separators.
+/// The files are created with mode 0600.
+///
+/// This should be used for things like a temporary .s that is removed after
+/// running the assembler.
+std::error_code createTemporaryFile(const Twine &Prefix, StringRef Suffix,
+ int &ResultFD,
+ SmallVectorImpl<char> &ResultPath);
+
+/// @brief Simpler version for clients that don't want an open file.
+std::error_code createTemporaryFile(const Twine &Prefix, StringRef Suffix,
+ SmallVectorImpl<char> &ResultPath);
+
+std::error_code createUniqueDirectory(const Twine &Prefix,
+ SmallVectorImpl<char> &ResultPath);
+
+enum OpenFlags : unsigned {
+ F_None = 0,
+
+ /// F_Excl - When opening a file, this flag makes raw_fd_ostream
+ /// report an error if the file already exists.
+ F_Excl = 1,
+
+ /// F_Append - When opening a file, if it already exists append to the
+ /// existing file instead of returning an error. This may not be specified
+ /// with F_Excl.
+ F_Append = 2,
+
+ /// The file should be opened in text mode on platforms that make this
+ /// distinction.
+ F_Text = 4,
+
+ /// Open the file for read and write.
+ F_RW = 8
+};
+
+inline OpenFlags operator|(OpenFlags A, OpenFlags B) {
+ return OpenFlags(unsigned(A) | unsigned(B));
+}
+
+inline OpenFlags &operator|=(OpenFlags &A, OpenFlags B) {
+ A = A | B;
+ return A;
+}
+
+std::error_code openFileForWrite(const Twine &Name, int &ResultFD,
+ OpenFlags Flags, unsigned Mode = 0666);
+
+std::error_code openFileForRead(const Twine &Name, int &ResultFD);
+
+/// @brief Identify the type of a binary file based on how magical it is.
+file_magic identify_magic(StringRef magic);
+
+/// @brief Get and identify \a path's type based on its content.
+///
+/// @param path Input path.
+/// @param result Set to the type of file, or file_magic::unknown.
+/// @returns errc::success if result has been successfully set, otherwise a
+/// platform-specific error_code.
+std::error_code identify_magic(const Twine &path, file_magic &result);
+
+std::error_code getUniqueID(const Twine Path, UniqueID &Result);
+
+/// This class represents a memory mapped file. It is based on
+/// boost::iostreams::mapped_file.
+class mapped_file_region {
+ mapped_file_region() = delete;
+ mapped_file_region(mapped_file_region&) = delete;
+ mapped_file_region &operator =(mapped_file_region&) = delete;
+
+public:
+ enum mapmode {
+ readonly, ///< May only access map via const_data as read only.
+ readwrite, ///< May access map via data and modify it. Written to path.
+ priv ///< May modify via data, but changes are lost on destruction.
+ };
+
+private:
+ /// Platform-specific mapping state.
+ uint64_t Size;
+ void *Mapping;
+
+ std::error_code init(int FD, uint64_t Offset, mapmode Mode);
+
+public:
+ /// \param fd An open file descriptor to map. It must have been opened in
+ /// the correct mode.
+ mapped_file_region(int fd, mapmode mode, uint64_t length, uint64_t offset,
+ std::error_code &ec);
+
+ ~mapped_file_region();
+
+ uint64_t size() const;
+ char *data() const;
+
+ /// Get a const view of the data. Modifying this memory has undefined
+ /// behavior.
+ const char *const_data() const;
+
+ /// \returns The minimum alignment that the offset must have.
+ static int alignment();
+};
+
+/// Return the path to the main executable, given the value of argv[0] from
+/// program startup and the address of main itself. In extremis, this function
+/// may fail and return an empty path.
+std::string getMainExecutable(const char *argv0, void *MainExecAddr);
+
+/// @}
+/// @name Iterators
+/// @{
+
+/// directory_entry - A single entry in a directory. Caches the status either
+/// from the result of the iteration syscall, or the first time status is
+/// called.
+class directory_entry {
+ std::string Path;
+ mutable file_status Status;
+
+public:
+ explicit directory_entry(const Twine &path, file_status st = file_status())
+ : Path(path.str())
+ , Status(st) {}
+
+ directory_entry() {}
+
+ void assign(const Twine &path, file_status st = file_status()) {
+ Path = path.str();
+ Status = st;
+ }
+
+ void replace_filename(const Twine &filename, file_status st = file_status());
+
+ const std::string &path() const { return Path; }
+ std::error_code status(file_status &result) const;
+
+ bool operator==(const directory_entry& rhs) const { return Path == rhs.Path; }
+ bool operator!=(const directory_entry& rhs) const { return !(*this == rhs); }
+ bool operator< (const directory_entry& rhs) const;
+ bool operator<=(const directory_entry& rhs) const;
+ bool operator> (const directory_entry& rhs) const;
+ bool operator>=(const directory_entry& rhs) const;
+};
+
+namespace detail {
+ struct DirIterState;
+
+ std::error_code directory_iterator_construct(DirIterState &, StringRef);
+ std::error_code directory_iterator_increment(DirIterState &);
+ std::error_code directory_iterator_destruct(DirIterState &);
+
+ /// DirIterState - Keeps state for the directory_iterator. It is reference
+ /// counted in order to preserve InputIterator semantics on copy.
+ struct DirIterState : public RefCountedBase<DirIterState> {
+ DirIterState()
+ : IterationHandle(0) {}
+
+ ~DirIterState() {
+ directory_iterator_destruct(*this);
+ }
+
+ intptr_t IterationHandle;
+ directory_entry CurrentEntry;
+ };
+}
+
+/// directory_iterator - Iterates through the entries in path. There is no
+/// operator++ because we need an error_code. If it's really needed we can make
+/// it call report_fatal_error on error.
+class directory_iterator {
+ IntrusiveRefCntPtr<detail::DirIterState> State;
+
+public:
+ explicit directory_iterator(const Twine &path, std::error_code &ec) {
+ State = new detail::DirIterState;
+ SmallString<128> path_storage;
+ ec = detail::directory_iterator_construct(*State,
+ path.toStringRef(path_storage));
+ }
+
+ explicit directory_iterator(const directory_entry &de, std::error_code &ec) {
+ State = new detail::DirIterState;
+ ec = detail::directory_iterator_construct(*State, de.path());
+ }
+
+ /// Construct end iterator.
+ directory_iterator() : State(nullptr) {}
+
+ // No operator++ because we need error_code.
+ directory_iterator &increment(std::error_code &ec) {
+ ec = directory_iterator_increment(*State);
+ return *this;
+ }
+
+ const directory_entry &operator*() const { return State->CurrentEntry; }
+ const directory_entry *operator->() const { return &State->CurrentEntry; }
+
+ bool operator==(const directory_iterator &RHS) const {
+ if (State == RHS.State)
+ return true;
+ if (!RHS.State)
+ return State->CurrentEntry == directory_entry();
+ if (!State)
+ return RHS.State->CurrentEntry == directory_entry();
+ return State->CurrentEntry == RHS.State->CurrentEntry;
+ }
+
+ bool operator!=(const directory_iterator &RHS) const {
+ return !(*this == RHS);
+ }
+ // Other members as required by
+ // C++ Std, 24.1.1 Input iterators [input.iterators]
+};
+
+namespace detail {
+ /// RecDirIterState - Keeps state for the recursive_directory_iterator. It is
+ /// reference counted in order to preserve InputIterator semantics on copy.
+ struct RecDirIterState : public RefCountedBase<RecDirIterState> {
+ RecDirIterState()
+ : Level(0)
+ , HasNoPushRequest(false) {}
+
+ std::stack<directory_iterator, std::vector<directory_iterator> > Stack;
+ uint16_t Level;
+ bool HasNoPushRequest;
+ };
+}
+
+/// recursive_directory_iterator - Same as directory_iterator except that it
+/// recurses down into child directories.
+class recursive_directory_iterator {
+ IntrusiveRefCntPtr<detail::RecDirIterState> State;
+
+public:
+ recursive_directory_iterator() {}
+ explicit recursive_directory_iterator(const Twine &path, std::error_code &ec)
+ : State(new detail::RecDirIterState) {
+ State->Stack.push(directory_iterator(path, ec));
+ if (State->Stack.top() == directory_iterator())
+ State.reset();
+ }
+ // No operator++ because we need error_code.
+ recursive_directory_iterator &increment(std::error_code &ec) {
+ const directory_iterator end_itr;
+
+ if (State->HasNoPushRequest)
+ State->HasNoPushRequest = false;
+ else {
+ file_status st;
+ if ((ec = State->Stack.top()->status(st))) return *this;
+ if (is_directory(st)) {
+ State->Stack.push(directory_iterator(*State->Stack.top(), ec));
+ if (ec) return *this;
+ if (State->Stack.top() != end_itr) {
+ ++State->Level;
+ return *this;
+ }
+ State->Stack.pop();
+ }
+ }
+
+ while (!State->Stack.empty()
+ && State->Stack.top().increment(ec) == end_itr) {
+ State->Stack.pop();
+ --State->Level;
+ }
+
+ // Check if we are done. If so, create an end iterator.
+ if (State->Stack.empty())
+ State.reset();
+
+ return *this;
+ }
+
+ const directory_entry &operator*() const { return *State->Stack.top(); }
+ const directory_entry *operator->() const { return &*State->Stack.top(); }
+
+ // observers
+ /// Gets the current level. Starting path is at level 0.
+ int level() const { return State->Level; }
+
+ /// Returns true if no_push has been called for this directory_entry.
+ bool no_push_request() const { return State->HasNoPushRequest; }
+
+ // modifiers
+ /// Goes up one level if Level > 0.
+ void pop() {
+ assert(State && "Cannot pop an end iterator!");
+ assert(State->Level > 0 && "Cannot pop an iterator with level < 1");
+
+ const directory_iterator end_itr;
+ std::error_code ec;
+ do {
+ if (ec)
+ report_fatal_error("Error incrementing directory iterator.");
+ State->Stack.pop();
+ --State->Level;
+ } while (!State->Stack.empty()
+ && State->Stack.top().increment(ec) == end_itr);
+
+ // Check if we are done. If so, create an end iterator.
+ if (State->Stack.empty())
+ State.reset();
+ }
+
+ /// Does not go down into the current directory_entry.
+ void no_push() { State->HasNoPushRequest = true; }
+
+ bool operator==(const recursive_directory_iterator &RHS) const {
+ return State == RHS.State;
+ }
+
+ bool operator!=(const recursive_directory_iterator &RHS) const {
+ return !(*this == RHS);
+ }
+ // Other members as required by
+ // C++ Std, 24.1.1 Input iterators [input.iterators]
+};
+
+/// @}
+
+} // end namespace fs
+} // end namespace sys
+} // end namespace llvm
+
+#endif
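
Two short sketches against the API above: walking a directory with explicit error codes, and creating a uniquely named temporary file (the prefix and suffix are illustrative):

    #include "llvm/ADT/SmallString.h"
    #include "llvm/Support/FileSystem.h"

    static std::error_code listDir(const llvm::Twine &Dir) {
      std::error_code EC;
      for (llvm::sys::fs::directory_iterator I(Dir, EC), E; I != E && !EC;
           I.increment(EC)) {
        llvm::sys::fs::file_status St;
        if ((EC = I->status(St)))
          break;
        // I->path(), St.getSize(), is_directory(St), ... are usable here.
      }
      return EC;
    }

    static std::error_code makeScratchFile(llvm::SmallString<128> &Path) {
      int FD;
      // Creates something like <tmpdir>/spades-xxxxxx.bin with mode 0600 and opens it.
      return llvm::sys::fs::createTemporaryFile("spades", "bin", FD, Path);
    }
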
diff --git a/ext/include/llvm/Support/FileUtilities.h b/ext/include/llvm/Support/FileUtilities.h
new file mode 100644
index 0000000..2ee2c60
--- /dev/null
+++ b/ext/include/llvm/Support/FileUtilities.h
@@ -0,0 +1,78 @@
+//===- llvm/Support/FileUtilities.h - File System Utilities -----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines a family of utility functions which are useful for doing
+// various things with files.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_SUPPORT_FILEUTILITIES_H
+#define LLVM_SUPPORT_FILEUTILITIES_H
+
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/Path.h"
+
+namespace llvm {
+
+ /// DiffFilesWithTolerance - Compare the two files specified, returning 0 if
+ /// the files match, 1 if they are different, and 2 if there is a file error.
+ /// This function allows you to specify an absolute and relative FP error that
+ /// is allowed to exist. If you specify a string to fill in for the error
+ /// option, it will set the string to an error message if an error occurs, or
+ /// if the files are different.
+ ///
+ int DiffFilesWithTolerance(StringRef FileA,
+ StringRef FileB,
+ double AbsTol, double RelTol,
+ std::string *Error = nullptr);
+
+
+ /// FileRemover - This class is a simple object meant to be stack allocated.
+ /// When it goes out of scope (normally or via an exception), it removes the
+ /// file specified (if deleteIt is true).
+ ///
+ class FileRemover {
+ SmallString<128> Filename;
+ bool DeleteIt;
+ public:
+ FileRemover() : DeleteIt(false) {}
+
+ explicit FileRemover(const Twine& filename, bool deleteIt = true)
+ : DeleteIt(deleteIt) {
+ filename.toVector(Filename);
+ }
+
+ ~FileRemover() {
+ if (DeleteIt) {
+ // Ignore problems deleting the file.
+ sys::fs::remove(Filename);
+ }
+ }
+
+ /// setFile - Give ownership of the file to the FileRemover so it will
+ /// be removed when the object is destroyed. If the FileRemover already
+ /// had ownership of a file, remove it first.
+ void setFile(const Twine& filename, bool deleteIt = true) {
+ if (DeleteIt) {
+ // Ignore problems deleting the file.
+ sys::fs::remove(Filename);
+ }
+
+ Filename.clear();
+ filename.toVector(Filename);
+ DeleteIt = deleteIt;
+ }
+
+ /// releaseFile - Take ownership of the file away from the FileRemover so it
+ /// will not be removed when the object is destroyed.
+ void releaseFile() { DeleteIt = false; }
+ };
+} // End llvm namespace
+
+#endif
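
A sketch of FileRemover guarding a partially written output file (names are illustrative): the file is removed on every exit path unless releaseFile() is reached.

    #include "llvm/Support/Errc.h"
    #include "llvm/Support/FileUtilities.h"

    static std::error_code buildIndex(llvm::StringRef OutPath, bool Ok) {
      llvm::FileRemover Cleanup(OutPath);  // deletes OutPath when it goes out of scope
      // ... write OutPath incrementally ...
      if (!Ok)
        return llvm::make_error_code(llvm::errc::io_error); // partial file is removed
      Cleanup.releaseFile();               // success: keep the file
      return std::error_code();
    }
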
diff --git a/ext/include/llvm/Support/Format.h b/ext/include/llvm/Support/Format.h
new file mode 100644
index 0000000..7dea34f
--- /dev/null
+++ b/ext/include/llvm/Support/Format.h
@@ -0,0 +1,191 @@
+//===- Format.h - Efficient printf-style formatting for streams -*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the format() function, which can be used with other
+// LLVM subsystems to provide printf-style formatting. This gives all the power
+// and risk of printf. This can be used like this (with raw_ostreams as an
+// example):
+//
+// OS << "mynumber: " << format("%4.5f", 1234.412) << '\n';
+//
+// Or if you prefer:
+//
+// OS << format("mynumber: %4.5f\n", 1234.412);
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_SUPPORT_FORMAT_H
+#define LLVM_SUPPORT_FORMAT_H
+
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/DataTypes.h"
+#include <cassert>
+#include <cstdio>
+#include <tuple>
+
+namespace llvm {
+
+/// This is a helper class used for handling formatted output. It is the
+/// abstract base class of a templated derived class.
+class format_object_base {
+protected:
+ const char *Fmt;
+ ~format_object_base() = default; // Disallow polymorphic deletion.
+ format_object_base(const format_object_base &) = default;
+ virtual void home(); // Out of line virtual method.
+
+ /// Call snprintf() for this object, on the given buffer and size.
+ virtual int snprint(char *Buffer, unsigned BufferSize) const = 0;
+
+public:
+ format_object_base(const char *fmt) : Fmt(fmt) {}
+
+ /// Format the object into the specified buffer. On success, this returns
+ /// the length of the formatted string. If the buffer is too small, this
+ /// returns a length to retry with, which will be larger than BufferSize.
+ unsigned print(char *Buffer, unsigned BufferSize) const {
+ assert(BufferSize && "Invalid buffer size!");
+
+ // Print the string, leaving room for the terminating null.
+ int N = snprint(Buffer, BufferSize);
+
+ // VC++ and old GlibC return negative on overflow; just double the size.
+ if (N < 0)
+ return BufferSize * 2;
+
+ // Other implementations yield number of bytes needed, not including the
+ // final '\0'.
+ if (unsigned(N) >= BufferSize)
+ return N + 1;
+
+ // Otherwise N is the length of output (not including the final '\0').
+ return N;
+ }
+};
+
+/// These are templated helper classes used by the format function that
+/// capture the object to be formatted and the format string. When actually
+/// printed, this synthesizes the string into a temporary buffer provided and
+/// returns whether or not it is big enough.
+
+template <typename... Ts>
+class format_object final : public format_object_base {
+ std::tuple<Ts...> Vals;
+
+ template <std::size_t... Is>
+ int snprint_tuple(char *Buffer, unsigned BufferSize,
+ index_sequence<Is...>) const {
+ return snprintf(Buffer, BufferSize, Fmt, std::get<Is>(Vals)...);
+ }
+
+public:
+ format_object(const char *fmt, const Ts &... vals)
+ : format_object_base(fmt), Vals(vals...) {}
+
+ int snprint(char *Buffer, unsigned BufferSize) const override {
+ return snprint_tuple(Buffer, BufferSize, index_sequence_for<Ts...>());
+ }
+};
+
+/// These are helper functions used to produce formatted output. They use
+/// template type deduction to construct the appropriate instance of the
+/// format_object class to simplify their construction.
+///
+/// This is typically used like:
+/// \code
+/// OS << format("%0.4f", myfloat) << '\n';
+/// \endcode
+
+template <typename... Ts>
+inline format_object<Ts...> format(const char *Fmt, const Ts &... Vals) {
+ return format_object<Ts...>(Fmt, Vals...);
+}
+
+/// This is a helper class used for left_justify() and right_justify().
+class FormattedString {
+ StringRef Str;
+ unsigned Width;
+ bool RightJustify;
+ friend class raw_ostream;
+
+public:
+ FormattedString(StringRef S, unsigned W, bool R)
+ : Str(S), Width(W), RightJustify(R) { }
+};
+
+/// left_justify - append spaces after string so total output is
+/// \p Width characters. If \p Str is larger than \p Width, the full string
+/// is written with no padding.
+inline FormattedString left_justify(StringRef Str, unsigned Width) {
+ return FormattedString(Str, Width, false);
+}
+
+/// right_justify - add spaces before string so total output is
+/// \p Width characters. If \p Str is larger than \p Width, the full string
+/// is written with no padding.
+inline FormattedString right_justify(StringRef Str, unsigned Width) {
+ return FormattedString(Str, Width, true);
+}
+
+/// This is a helper class used for format_hex() and format_decimal().
+class FormattedNumber {
+ uint64_t HexValue;
+ int64_t DecValue;
+ unsigned Width;
+ bool Hex;
+ bool Upper;
+ bool HexPrefix;
+ friend class raw_ostream;
+
+public:
+ FormattedNumber(uint64_t HV, int64_t DV, unsigned W, bool H, bool U,
+ bool Prefix)
+ : HexValue(HV), DecValue(DV), Width(W), Hex(H), Upper(U),
+ HexPrefix(Prefix) {}
+};
+
+/// format_hex - Output \p N as a fixed width hexadecimal. If number will not
+/// fit in width, full number is still printed. Examples:
+/// OS << format_hex(255, 4) => 0xff
+/// OS << format_hex(255, 4, true) => 0xFF
+/// OS << format_hex(255, 6) => 0x00ff
+/// OS << format_hex(255, 2) => 0xff
+inline FormattedNumber format_hex(uint64_t N, unsigned Width,
+ bool Upper = false) {
+ assert(Width <= 18 && "hex width must be <= 18");
+ return FormattedNumber(N, 0, Width, true, Upper, true);
+}
+
+/// format_hex_no_prefix - Output \p N as a fixed width hexadecimal. Does not
+/// prepend '0x' to the outputted string. If number will not fit in width,
+/// full number is still printed. Examples:
+/// OS << format_hex_no_prefix(255, 4) => ff
+/// OS << format_hex_no_prefix(255, 4, true) => FF
+/// OS << format_hex_no_prefix(255, 6) => 00ff
+/// OS << format_hex_no_prefix(255, 2) => ff
+inline FormattedNumber format_hex_no_prefix(uint64_t N, unsigned Width,
+ bool Upper = false) {
+ assert(Width <= 18 && "hex width must be <= 18");
+ return FormattedNumber(N, 0, Width, true, Upper, false);
+}
+
+/// format_decimal - Output \p N as a right justified, fixed-width decimal. If
+/// number will not fit in width, full number is still printed. Examples:
+/// OS << format_decimal(0, 5) => " 0"
+/// OS << format_decimal(255, 5) => " 255"
+/// OS << format_decimal(-1, 3) => " -1"
+/// OS << format_decimal(12345, 3) => "12345"
+inline FormattedNumber format_decimal(int64_t N, unsigned Width) {
+ return FormattedNumber(0, N, Width, false, false, false);
+}
+
+} // end namespace llvm
+
+#endif
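A minimal usage sketch for the formatting helpers declared above, assuming llvm::outs() from raw_ostream.h (imported elsewhere in this commit); the function name is illustrative only:

#include "llvm/Support/Format.h"
#include "llvm/Support/raw_ostream.h"

// Illustrative sketch; expected output is shown in the trailing comments.
void format_demo() {
  llvm::outs() << llvm::format("%0.4f", 3.14159) << '\n';       // 3.1416
  llvm::outs() << llvm::format_hex(255, 6) << '\n';             // 0x00ff
  llvm::outs() << llvm::format_decimal(42, 5) << '\n';          // "   42"
  llvm::outs() << llvm::left_justify("id", 6) << "|" << '\n';   // "id    |"
}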
diff --git a/ext/include/llvm/Support/Host.h b/ext/include/llvm/Support/Host.h
new file mode 100644
index 0000000..d49cc11
--- /dev/null
+++ b/ext/include/llvm/Support/Host.h
@@ -0,0 +1,38 @@
+//===- llvm/Support/Host.h - Host machine characteristics --------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Methods for querying the nature of the host machine.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_SUPPORT_HOST_H
+#define LLVM_SUPPORT_HOST_H
+
+#if defined(__linux__) || defined(__GNU__)
+#include <endian.h>
+#else
+#if !defined(BYTE_ORDER)
+#include <machine/endian.h>
+#endif
+#endif
+
+namespace llvm {
+namespace sys {
+
+#if defined(BYTE_ORDER) && defined(BIG_ENDIAN) && BYTE_ORDER == BIG_ENDIAN
+ static const bool IsBigEndianHost = true;
+#else
+ static const bool IsBigEndianHost = false;
+#endif
+
+ static const bool IsLittleEndianHost = !IsBigEndianHost;
+}
+}
+
+#endif
diff --git a/ext/include/llvm/Support/LineIterator.h b/ext/include/llvm/Support/LineIterator.h
new file mode 100644
index 0000000..9d4cd3b
--- /dev/null
+++ b/ext/include/llvm/Support/LineIterator.h
@@ -0,0 +1,88 @@
+//===- LineIterator.h - Iterator to read a text buffer's lines --*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_SUPPORT_LINEITERATOR_H
+#define LLVM_SUPPORT_LINEITERATOR_H
+
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/DataTypes.h"
+#include <iterator>
+
+namespace llvm {
+
+class MemoryBuffer;
+
+/// \brief A forward iterator which reads text lines from a buffer.
+///
+/// This class provides a forward iterator interface for reading one line at
+/// a time from a buffer. When default constructed the iterator will be the
+/// "end" iterator.
+///
+/// The iterator is aware of what line number it is currently processing. It
+/// strips blank lines by default, and comment lines given a comment-starting
+/// character.
+///
+/// Note that this iterator requires the buffer to be nul terminated.
+class line_iterator
+ : public std::iterator<std::forward_iterator_tag, StringRef> {
+ const MemoryBuffer *Buffer;
+ char CommentMarker;
+ bool SkipBlanks;
+
+ unsigned LineNumber;
+ StringRef CurrentLine;
+
+public:
+ /// \brief Default construct an "end" iterator.
+ line_iterator() : Buffer(nullptr) {}
+
+ /// \brief Construct a new iterator around some memory buffer.
+ explicit line_iterator(const MemoryBuffer &Buffer, bool SkipBlanks = true,
+ char CommentMarker = '\0');
+
+ /// \brief Return true if we've reached EOF or are an "end" iterator.
+ bool is_at_eof() const { return !Buffer; }
+
+ /// \brief Return true if we're an "end" iterator or have reached EOF.
+ bool is_at_end() const { return is_at_eof(); }
+
+ /// \brief Return the current line number. May return any number at EOF.
+ int64_t line_number() const { return LineNumber; }
+
+ /// \brief Advance to the next (non-empty, non-comment) line.
+ line_iterator &operator++() {
+ advance();
+ return *this;
+ }
+ line_iterator operator++(int) {
+ line_iterator tmp(*this);
+ advance();
+ return tmp;
+ }
+
+ /// \brief Get the current line as a \c StringRef.
+ StringRef operator*() const { return CurrentLine; }
+ const StringRef *operator->() const { return &CurrentLine; }
+
+ friend bool operator==(const line_iterator &LHS, const line_iterator &RHS) {
+ return LHS.Buffer == RHS.Buffer &&
+ LHS.CurrentLine.begin() == RHS.CurrentLine.begin();
+ }
+
+ friend bool operator!=(const line_iterator &LHS, const line_iterator &RHS) {
+ return !(LHS == RHS);
+ }
+
+private:
+ /// \brief Advance the iterator to the next line.
+ void advance();
+};
+}
+
+#endif
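A short sketch of intended use, assuming MemoryBuffer (declared later in this commit) and llvm::outs() from raw_ostream.h; names are illustrative:

#include "llvm/Support/LineIterator.h"
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/raw_ostream.h"

// Illustrative sketch: print non-blank, non-'#' lines with their line numbers.
void print_config_lines(llvm::StringRef Text) {
  std::unique_ptr<llvm::MemoryBuffer> Buf =
      llvm::MemoryBuffer::getMemBuffer(Text, "config");
  for (llvm::line_iterator I(*Buf, /*SkipBlanks=*/true, /*CommentMarker=*/'#');
       !I.is_at_end(); ++I)
    llvm::outs() << I.line_number() << ": " << *I << "\n";
}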
diff --git a/ext/include/llvm/Support/MathExtras.h b/ext/include/llvm/Support/MathExtras.h
new file mode 100644
index 0000000..8c0b110
--- /dev/null
+++ b/ext/include/llvm/Support/MathExtras.h
@@ -0,0 +1,715 @@
+//===-- llvm/Support/MathExtras.h - Useful math functions -------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains some functions that are useful for math stuff.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_SUPPORT_MATHEXTRAS_H
+#define LLVM_SUPPORT_MATHEXTRAS_H
+
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/SwapByteOrder.h"
+#include <cassert>
+#include <cstring>
+#include <type_traits>
+#include <cstdint>
+
+namespace llvm {
+/// \brief The behavior an operation has on an input of 0.
+enum ZeroBehavior {
+ /// \brief The returned value is undefined.
+ ZB_Undefined,
+ /// \brief The returned value is numeric_limits<T>::max()
+ ZB_Max,
+ /// \brief The returned value is numeric_limits<T>::digits
+ ZB_Width
+};
+
+namespace detail {
+template <typename T, std::size_t SizeOfT> struct TrailingZerosCounter {
+ static std::size_t count(T Val, ZeroBehavior) {
+ if (!Val)
+ return std::numeric_limits<T>::digits;
+ if (Val & 0x1)
+ return 0;
+
+ // Bisection method.
+ std::size_t ZeroBits = 0;
+ T Shift = std::numeric_limits<T>::digits >> 1;
+ T Mask = std::numeric_limits<T>::max() >> Shift;
+ while (Shift) {
+ if ((Val & Mask) == 0) {
+ Val >>= Shift;
+ ZeroBits |= Shift;
+ }
+ Shift >>= 1;
+ Mask >>= Shift;
+ }
+ return ZeroBits;
+ }
+};
+
+#if __GNUC__ >= 4
+template <typename T> struct TrailingZerosCounter<T, 4> {
+ static std::size_t count(T Val, ZeroBehavior ZB) {
+ if (ZB != ZB_Undefined && Val == 0)
+ return 32;
+
+#if __has_builtin(__builtin_ctz) || LLVM_GNUC_PREREQ(4, 0, 0)
+ return __builtin_ctz(Val);
+#endif
+ }
+};
+
+template <typename T> struct TrailingZerosCounter<T, 8> {
+ static std::size_t count(T Val, ZeroBehavior ZB) {
+ if (ZB != ZB_Undefined && Val == 0)
+ return 64;
+
+#if __has_builtin(__builtin_ctzll) || LLVM_GNUC_PREREQ(4, 0, 0)
+ return __builtin_ctzll(Val);
+#endif
+ }
+};
+#endif
+} // namespace detail
+
+/// \brief Count number of 0's from the least significant bit to the most
+/// stopping at the first 1.
+///
+/// Only unsigned integral types are allowed.
+///
+/// \param ZB the behavior on an input of 0. Only ZB_Width and ZB_Undefined are
+/// valid arguments.
+template <typename T>
+std::size_t countTrailingZeros(T Val, ZeroBehavior ZB = ZB_Width) {
+ static_assert(std::numeric_limits<T>::is_integer &&
+ !std::numeric_limits<T>::is_signed,
+ "Only unsigned integral types are allowed.");
+ return detail::TrailingZerosCounter<T, sizeof(T)>::count(Val, ZB);
+}
+
+namespace detail {
+template <typename T, std::size_t SizeOfT> struct LeadingZerosCounter {
+ static std::size_t count(T Val, ZeroBehavior) {
+ if (!Val)
+ return std::numeric_limits<T>::digits;
+
+ // Bisection method.
+ std::size_t ZeroBits = 0;
+ for (T Shift = std::numeric_limits<T>::digits >> 1; Shift; Shift >>= 1) {
+ T Tmp = Val >> Shift;
+ if (Tmp)
+ Val = Tmp;
+ else
+ ZeroBits |= Shift;
+ }
+ return ZeroBits;
+ }
+};
+
+#if __GNUC__ >= 4
+template <typename T> struct LeadingZerosCounter<T, 4> {
+ static std::size_t count(T Val, ZeroBehavior ZB) {
+ if (ZB != ZB_Undefined && Val == 0)
+ return 32;
+
+#if __has_builtin(__builtin_clz) || LLVM_GNUC_PREREQ(4, 0, 0)
+ return __builtin_clz(Val);
+#endif
+ }
+};
+
+template <typename T> struct LeadingZerosCounter<T, 8> {
+ static std::size_t count(T Val, ZeroBehavior ZB) {
+ if (ZB != ZB_Undefined && Val == 0)
+ return 64;
+
+#if __has_builtin(__builtin_clzll) || LLVM_GNUC_PREREQ(4, 0, 0)
+ return __builtin_clzll(Val);
+#endif
+ }
+};
+#endif
+} // namespace detail
+
+/// \brief Count number of 0's from the most significant bit to the least
+/// stopping at the first 1.
+///
+/// Only unsigned integral types are allowed.
+///
+/// \param ZB the behavior on an input of 0. Only ZB_Width and ZB_Undefined are
+/// valid arguments.
+template <typename T>
+std::size_t countLeadingZeros(T Val, ZeroBehavior ZB = ZB_Width) {
+ static_assert(std::numeric_limits<T>::is_integer &&
+ !std::numeric_limits<T>::is_signed,
+ "Only unsigned integral types are allowed.");
+ return detail::LeadingZerosCounter<T, sizeof(T)>::count(Val, ZB);
+}
+
+/// \brief Get the index of the first set bit starting from the least
+/// significant bit.
+///
+/// Only unsigned integral types are allowed.
+///
+/// \param ZB the behavior on an input of 0. Only ZB_Max and ZB_Undefined are
+/// valid arguments.
+template <typename T> T findFirstSet(T Val, ZeroBehavior ZB = ZB_Max) {
+ if (ZB == ZB_Max && Val == 0)
+ return std::numeric_limits<T>::max();
+
+ return countTrailingZeros(Val, ZB_Undefined);
+}
+
+/// \brief Get the index of the last set bit starting from the least
+/// significant bit.
+///
+/// Only unsigned integral types are allowed.
+///
+/// \param ZB the behavior on an input of 0. Only ZB_Max and ZB_Undefined are
+/// valid arguments.
+template <typename T> T findLastSet(T Val, ZeroBehavior ZB = ZB_Max) {
+ if (ZB == ZB_Max && Val == 0)
+ return std::numeric_limits<T>::max();
+
+ // Use ^ instead of - because both gcc and llvm can remove the associated ^
+ // in the __builtin_clz intrinsic on x86.
+ return countLeadingZeros(Val, ZB_Undefined) ^
+ (std::numeric_limits<T>::digits - 1);
+}
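For concreteness, a few values implied by the definitions above (illustrative sketch; the helper name is hypothetical):

#include "llvm/Support/MathExtras.h"

void bit_scan_examples() {
  unsigned V = 0x10;                        // bit 4 set
  (void)llvm::countTrailingZeros(V);        // 4
  (void)llvm::countLeadingZeros(V);         // 27 for a 32-bit unsigned
  (void)llvm::findFirstSet(V);              // 4
  (void)llvm::findLastSet(V);               // 4
  (void)llvm::countTrailingZeros(0u);       // 32 under the default ZB_Width
}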
+
+/// \brief Macro compressed bit reversal table for 256 bits.
+///
+/// http://graphics.stanford.edu/~seander/bithacks.html#BitReverseTable
+static const unsigned char BitReverseTable256[256] = {
+#define R2(n) n, n + 2 * 64, n + 1 * 64, n + 3 * 64
+#define R4(n) R2(n), R2(n + 2 * 16), R2(n + 1 * 16), R2(n + 3 * 16)
+#define R6(n) R4(n), R4(n + 2 * 4), R4(n + 1 * 4), R4(n + 3 * 4)
+ R6(0), R6(2), R6(1), R6(3)
+#undef R2
+#undef R4
+#undef R6
+};
+
+/// \brief Reverse the bits in \p Val.
+template <typename T>
+T reverseBits(T Val) {
+ unsigned char in[sizeof(Val)];
+ unsigned char out[sizeof(Val)];
+ std::memcpy(in, &Val, sizeof(Val));
+ for (unsigned i = 0; i < sizeof(Val); ++i)
+ out[(sizeof(Val) - i) - 1] = BitReverseTable256[in[i]];
+ std::memcpy(&Val, out, sizeof(Val));
+ return Val;
+}
+
+// NOTE: The following support functions use the _32/_64 extensions instead of
+// type overloading so that signed and unsigned integers can be used without
+// ambiguity.
+
+/// Hi_32 - This function returns the high 32 bits of a 64 bit value.
+inline uint32_t Hi_32(uint64_t Value) {
+ return static_cast<uint32_t>(Value >> 32);
+}
+
+/// Lo_32 - This function returns the low 32 bits of a 64 bit value.
+inline uint32_t Lo_32(uint64_t Value) {
+ return static_cast<uint32_t>(Value);
+}
+
+/// Make_64 - This functions makes a 64-bit integer from a high / low pair of
+/// 32-bit integers.
+inline uint64_t Make_64(uint32_t High, uint32_t Low) {
+ return ((uint64_t)High << 32) | (uint64_t)Low;
+}
+
+/// isInt - Checks if an integer fits into the given bit width.
+template<unsigned N>
+inline bool isInt(int64_t x) {
+ return N >= 64 || (-(INT64_C(1)<<(N-1)) <= x && x < (INT64_C(1)<<(N-1)));
+}
+// Template specializations to get better code for common cases.
+template<>
+inline bool isInt<8>(int64_t x) {
+ return static_cast<int8_t>(x) == x;
+}
+template<>
+inline bool isInt<16>(int64_t x) {
+ return static_cast<int16_t>(x) == x;
+}
+template<>
+inline bool isInt<32>(int64_t x) {
+ return static_cast<int32_t>(x) == x;
+}
+
+/// isShiftedInt<N,S> - Checks if a signed integer is an N bit number shifted
+/// left by S.
+template<unsigned N, unsigned S>
+inline bool isShiftedInt(int64_t x) {
+ return isInt<N+S>(x) && (x % (1<<S) == 0);
+}
+
+/// isUInt - Checks if an unsigned integer fits into the given bit width.
+template<unsigned N>
+inline bool isUInt(uint64_t x) {
+ return N >= 64 || x < (UINT64_C(1)<<(N));
+}
+// Template specializations to get better code for common cases.
+template<>
+inline bool isUInt<8>(uint64_t x) {
+ return static_cast<uint8_t>(x) == x;
+}
+template<>
+inline bool isUInt<16>(uint64_t x) {
+ return static_cast<uint16_t>(x) == x;
+}
+template<>
+inline bool isUInt<32>(uint64_t x) {
+ return static_cast<uint32_t>(x) == x;
+}
+
+/// isShiftedUInt<N,S> - Checks if an unsigned integer is an N bit number shifted
+/// left by S.
+template<unsigned N, unsigned S>
+inline bool isShiftedUInt(uint64_t x) {
+ return isUInt<N+S>(x) && (x % (1<<S) == 0);
+}
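A few concrete checks implied by the definitions above (illustrative; the helper name is hypothetical):

#include "llvm/Support/MathExtras.h"

void width_check_examples() {
  (void)llvm::isInt<8>(127);            // true
  (void)llvm::isInt<8>(128);            // false: int8_t tops out at 127
  (void)llvm::isUInt<8>(255);           // true
  (void)llvm::isUInt<8>(256);           // false
  (void)llvm::isShiftedUInt<4, 2>(60);  // true: 60 == 15 << 2
}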
+
+/// isUIntN - Checks if an unsigned integer fits into the given (dynamic)
+/// bit width.
+inline bool isUIntN(unsigned N, uint64_t x) {
+ return N >= 64 || x < (UINT64_C(1)<<(N));
+}
+
+/// isIntN - Checks if a signed integer fits into the given (dynamic)
+/// bit width.
+inline bool isIntN(unsigned N, int64_t x) {
+ return N >= 64 || (-(INT64_C(1)<<(N-1)) <= x && x < (INT64_C(1)<<(N-1)));
+}
+
+/// isMask_32 - This function returns true if the argument is a non-empty
+/// sequence of ones starting at the least significant bit with the remainder
+/// zero (32 bit version). Ex. isMask_32(0x0000FFFFU) == true.
+inline bool isMask_32(uint32_t Value) {
+ return Value && ((Value + 1) & Value) == 0;
+}
+
+/// isMask_64 - This function returns true if the argument is a non-empty
+/// sequence of ones starting at the least significant bit with the remainder
+/// zero (64 bit version).
+inline bool isMask_64(uint64_t Value) {
+ return Value && ((Value + 1) & Value) == 0;
+}
+
+/// isShiftedMask_32 - This function returns true if the argument contains a
+/// non-empty sequence of ones with the remainder zero (32 bit version.)
+/// Ex. isShiftedMask_32(0x0000FF00U) == true.
+inline bool isShiftedMask_32(uint32_t Value) {
+ return Value && isMask_32((Value - 1) | Value);
+}
+
+/// isShiftedMask_64 - This function returns true if the argument contains a
+/// non-empty sequence of ones with the remainder zero (64 bit version.)
+inline bool isShiftedMask_64(uint64_t Value) {
+ return Value && isMask_64((Value - 1) | Value);
+}
+
+/// isPowerOf2_32 - This function returns true if the argument is a power of
+/// two > 0. Ex. isPowerOf2_32(0x00100000U) == true (32 bit edition.)
+inline bool isPowerOf2_32(uint32_t Value) {
+ return Value && !(Value & (Value - 1));
+}
+
+/// isPowerOf2_64 - This function returns true if the argument is a power of two
+/// > 0 (64 bit edition.)
+inline bool isPowerOf2_64(uint64_t Value) {
+ return Value && !(Value & (Value - int64_t(1L)));
+}
+
+/// ByteSwap_16 - This function returns a byte-swapped representation of the
+/// 16-bit argument, Value.
+inline uint16_t ByteSwap_16(uint16_t Value) {
+ return sys::SwapByteOrder_16(Value);
+}
+
+/// ByteSwap_32 - This function returns a byte-swapped representation of the
+/// 32-bit argument, Value.
+inline uint32_t ByteSwap_32(uint32_t Value) {
+ return sys::SwapByteOrder_32(Value);
+}
+
+/// ByteSwap_64 - This function returns a byte-swapped representation of the
+/// 64-bit argument, Value.
+inline uint64_t ByteSwap_64(uint64_t Value) {
+ return sys::SwapByteOrder_64(Value);
+}
+
+/// \brief Count the number of ones from the most significant bit to the first
+/// zero bit.
+///
+/// Ex. countLeadingOnes(0xFF0FFF00) == 8.
+/// Only unsigned integral types are allowed.
+///
+/// \param ZB the behavior on an input of all ones. Only ZB_Width and
+/// ZB_Undefined are valid arguments.
+template <typename T>
+std::size_t countLeadingOnes(T Value, ZeroBehavior ZB = ZB_Width) {
+ static_assert(std::numeric_limits<T>::is_integer &&
+ !std::numeric_limits<T>::is_signed,
+ "Only unsigned integral types are allowed.");
+ return countLeadingZeros(~Value, ZB);
+}
+
+/// \brief Count the number of ones from the least significant bit to the first
+/// zero bit.
+///
+/// Ex. countTrailingOnes(0x00FF00FF) == 8.
+/// Only unsigned integral types are allowed.
+///
+/// \param ZB the behavior on an input of all ones. Only ZB_Width and
+/// ZB_Undefined are valid arguments.
+template <typename T>
+std::size_t countTrailingOnes(T Value, ZeroBehavior ZB = ZB_Width) {
+ static_assert(std::numeric_limits<T>::is_integer &&
+ !std::numeric_limits<T>::is_signed,
+ "Only unsigned integral types are allowed.");
+ return countTrailingZeros(~Value, ZB);
+}
+
+namespace detail {
+template <typename T, std::size_t SizeOfT> struct PopulationCounter {
+ static unsigned count(T Value) {
+ // Generic version, forward to 32 bits.
+ static_assert(SizeOfT <= 4, "Not implemented!");
+#if __GNUC__ >= 4
+ return __builtin_popcount(Value);
+#else
+ uint32_t v = Value;
+ v = v - ((v >> 1) & 0x55555555);
+ v = (v & 0x33333333) + ((v >> 2) & 0x33333333);
+ return ((v + (v >> 4) & 0xF0F0F0F) * 0x1010101) >> 24;
+#endif
+ }
+};
+
+template <typename T> struct PopulationCounter<T, 8> {
+ static unsigned count(T Value) {
+#if __GNUC__ >= 4
+ return __builtin_popcountll(Value);
+#else
+ uint64_t v = Value;
+ v = v - ((v >> 1) & 0x5555555555555555ULL);
+ v = (v & 0x3333333333333333ULL) + ((v >> 2) & 0x3333333333333333ULL);
+ v = (v + (v >> 4)) & 0x0F0F0F0F0F0F0F0FULL;
+ return unsigned((uint64_t)(v * 0x0101010101010101ULL) >> 56);
+#endif
+ }
+};
+} // namespace detail
+
+/// \brief Count the number of set bits in a value.
+/// Ex. countPopulation(0xF000F000) = 8
+/// Returns 0 if the word is zero.
+template <typename T>
+inline unsigned countPopulation(T Value) {
+ static_assert(std::numeric_limits<T>::is_integer &&
+ !std::numeric_limits<T>::is_signed,
+ "Only unsigned integral types are allowed.");
+ return detail::PopulationCounter<T, sizeof(T)>::count(Value);
+}
+
+/// Log2 - This function returns the log base 2 of the specified value
+inline double Log2(double Value) {
+#if defined(__ANDROID_API__) && __ANDROID_API__ < 18
+ return __builtin_log(Value) / __builtin_log(2.0);
+#else
+ return log2(Value);
+#endif
+}
+
+/// Log2_32 - This function returns the floor log base 2 of the specified value,
+/// -1 if the value is zero. (32 bit edition.)
+/// Ex. Log2_32(32) == 5, Log2_32(1) == 0, Log2_32(0) == -1, Log2_32(6) == 2
+inline unsigned Log2_32(uint32_t Value) {
+ return 31 - countLeadingZeros(Value);
+}
+
+/// Log2_64 - This function returns the floor log base 2 of the specified value,
+/// -1 if the value is zero. (64 bit edition.)
+inline unsigned Log2_64(uint64_t Value) {
+ return 63 - countLeadingZeros(Value);
+}
+
+/// Log2_32_Ceil - This function returns the ceil log base 2 of the specified
+/// value, 32 if the value is zero. (32 bit edition).
+/// Ex. Log2_32_Ceil(32) == 5, Log2_32_Ceil(1) == 0, Log2_32_Ceil(6) == 3
+inline unsigned Log2_32_Ceil(uint32_t Value) {
+ return 32 - countLeadingZeros(Value - 1);
+}
+
+/// Log2_64_Ceil - This function returns the ceil log base 2 of the specified
+/// value, 64 if the value is zero. (64 bit edition.)
+inline unsigned Log2_64_Ceil(uint64_t Value) {
+ return 64 - countLeadingZeros(Value - 1);
+}
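Worked values for the logarithm and population-count helpers above (illustrative sketch):

#include "llvm/Support/MathExtras.h"

void log2_examples() {
  (void)llvm::Log2_32(32);                   // 5
  (void)llvm::Log2_32(6);                    // 2 (floor)
  (void)llvm::Log2_32_Ceil(6);               // 3 (ceil)
  (void)llvm::isPowerOf2_32(64);             // true
  (void)llvm::countPopulation(0xF000F000u);  // 8
}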
+
+/// GreatestCommonDivisor64 - Return the greatest common divisor of the two
+/// values using Euclid's algorithm.
+inline uint64_t GreatestCommonDivisor64(uint64_t A, uint64_t B) {
+ while (B) {
+ uint64_t T = B;
+ B = A % B;
+ A = T;
+ }
+ return A;
+}
+
+/// BitsToDouble - This function takes a 64-bit integer and returns the bit
+/// equivalent double.
+inline double BitsToDouble(uint64_t Bits) {
+ union {
+ uint64_t L;
+ double D;
+ } T;
+ T.L = Bits;
+ return T.D;
+}
+
+/// BitsToFloat - This function takes a 32-bit integer and returns the bit
+/// equivalent float.
+inline float BitsToFloat(uint32_t Bits) {
+ union {
+ uint32_t I;
+ float F;
+ } T;
+ T.I = Bits;
+ return T.F;
+}
+
+/// DoubleToBits - This function takes a double and returns the bit
+/// equivalent 64-bit integer. Note that copying doubles around
+/// changes the bits of NaNs on some hosts, notably x86, so this
+/// routine cannot be used if these bits are needed.
+inline uint64_t DoubleToBits(double Double) {
+ union {
+ uint64_t L;
+ double D;
+ } T;
+ T.D = Double;
+ return T.L;
+}
+
+/// FloatToBits - This function takes a float and returns the bit
+/// equivalent 32-bit integer. Note that copying floats around
+/// changes the bits of NaNs on some hosts, notably x86, so this
+/// routine cannot be used if these bits are needed.
+inline uint32_t FloatToBits(float Float) {
+ union {
+ uint32_t I;
+ float F;
+ } T;
+ T.F = Float;
+ return T.I;
+}
+
+/// MinAlign - A and B are either alignments or offsets. Return the minimum
+/// alignment that may be assumed after adding the two together.
+inline uint64_t MinAlign(uint64_t A, uint64_t B) {
+ // The largest power of 2 that divides both A and B.
+ //
+ // Replace "-Value" by "1+~Value" in the following commented code to avoid
+ // MSVC warning C4146
+ // return (A | B) & -(A | B);
+ return (A | B) & (1 + ~(A | B));
+}
+
+/// \brief Aligns \c Addr to \c Alignment bytes, rounding up.
+///
+/// Alignment should be a power of two. This method rounds up, so
+/// alignAddr(7, 4) == 8 and alignAddr(8, 4) == 8.
+inline uintptr_t alignAddr(const void *Addr, size_t Alignment) {
+ assert(Alignment && isPowerOf2_64((uint64_t)Alignment) &&
+ "Alignment is not a power of two!");
+
+ assert((uintptr_t)Addr + Alignment - 1 >= (uintptr_t)Addr);
+
+ return (((uintptr_t)Addr + Alignment - 1) & ~(uintptr_t)(Alignment - 1));
+}
+
+/// \brief Returns the necessary adjustment for aligning \c Ptr to \c Alignment
+/// bytes, rounding up.
+inline size_t alignmentAdjustment(const void *Ptr, size_t Alignment) {
+ return alignAddr(Ptr, Alignment) - (uintptr_t)Ptr;
+}
+
+/// NextPowerOf2 - Returns the next power of two (in 64-bits)
+/// that is strictly greater than A. Returns zero on overflow.
+inline uint64_t NextPowerOf2(uint64_t A) {
+ A |= (A >> 1);
+ A |= (A >> 2);
+ A |= (A >> 4);
+ A |= (A >> 8);
+ A |= (A >> 16);
+ A |= (A >> 32);
+ return A + 1;
+}
+
+/// Returns the power of two which is less than or equal to the given value.
+/// Essentially, it is a floor operation across the domain of powers of two.
+inline uint64_t PowerOf2Floor(uint64_t A) {
+ if (!A) return 0;
+ return 1ull << (63 - countLeadingZeros(A, ZB_Undefined));
+}
+
+/// Returns the next integer (mod 2**64) that is greater than or equal to
+/// \p Value and is a multiple of \p Align. \p Align must be non-zero.
+///
+/// If non-zero \p Skew is specified, the return value will be a minimal
+/// integer that is greater than or equal to \p Value and equal to
+/// \p Align * N + \p Skew for some integer N. If \p Skew is larger than
+/// \p Align, its value is adjusted to '\p Skew mod \p Align'.
+///
+/// Examples:
+/// \code
+/// RoundUpToAlignment(5, 8) = 8
+/// RoundUpToAlignment(17, 8) = 24
+/// RoundUpToAlignment(~0LL, 8) = 0
+/// RoundUpToAlignment(321, 255) = 510
+///
+/// RoundUpToAlignment(5, 8, 7) = 7
+/// RoundUpToAlignment(17, 8, 1) = 17
+/// RoundUpToAlignment(~0LL, 8, 3) = 3
+/// RoundUpToAlignment(321, 255, 42) = 552
+/// \endcode
+inline uint64_t RoundUpToAlignment(uint64_t Value, uint64_t Align,
+ uint64_t Skew = 0) {
+ Skew %= Align;
+ return (Value + Align - 1 - Skew) / Align * Align + Skew;
+}
+
+/// Returns the offset to the next integer (mod 2**64) that is greater than
+/// or equal to \p Value and is a multiple of \p Align. \p Align must be
+/// non-zero.
+inline uint64_t OffsetToAlignment(uint64_t Value, uint64_t Align) {
+ return RoundUpToAlignment(Value, Align) - Value;
+}
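Worked values for the alignment helpers above (illustrative sketch):

#include "llvm/Support/MathExtras.h"

void alignment_examples() {
  (void)llvm::MinAlign(8, 12);                            // 4
  (void)llvm::RoundUpToAlignment(17, 8);                  // 24
  (void)llvm::OffsetToAlignment(17, 8);                   // 7
  (void)llvm::alignAddr(reinterpret_cast<void *>(7), 4);  // 8
}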
+
+/// SignExtend32 - Sign extend B-bit number x to 32-bit int.
+/// Usage int32_t r = SignExtend32<5>(x);
+template <unsigned B> inline int32_t SignExtend32(uint32_t x) {
+ return int32_t(x << (32 - B)) >> (32 - B);
+}
+
+/// \brief Sign extend number in the bottom B bits of X to a 32-bit int.
+/// Requires 0 < B <= 32.
+inline int32_t SignExtend32(uint32_t X, unsigned B) {
+ return int32_t(X << (32 - B)) >> (32 - B);
+}
+
+/// SignExtend64 - Sign extend B-bit number x to 64-bit int.
+/// Usage int64_t r = SignExtend64<5>(x);
+template <unsigned B> inline int64_t SignExtend64(uint64_t x) {
+ return int64_t(x << (64 - B)) >> (64 - B);
+}
+
+/// \brief Sign extend number in the bottom B bits of X to a 64-bit int.
+/// Requires 0 < B <= 64.
+inline int64_t SignExtend64(uint64_t X, unsigned B) {
+ return int64_t(X << (64 - B)) >> (64 - B);
+}
+
+/// \brief Add two unsigned integers, X and Y, of type T.
+/// Clamp the result to the maximum representable value of T on overflow.
+/// ResultOverflowed indicates if the result is larger than the maximum
+/// representable value of type T.
+template <typename T>
+typename std::enable_if<std::is_unsigned<T>::value, T>::type
+SaturatingAdd(T X, T Y, bool *ResultOverflowed = nullptr) {
+ bool Dummy;
+ bool &Overflowed = ResultOverflowed ? *ResultOverflowed : Dummy;
+ // Hacker's Delight, p. 29
+ T Z = X + Y;
+ Overflowed = (Z < X || Z < Y);
+ if (Overflowed)
+ return std::numeric_limits<T>::max();
+ else
+ return Z;
+}
+
+/// \brief Multiply two unsigned integers, X and Y, of type T.
+/// Clamp the result to the maximum representable value of T on overflow.
+/// ResultOverflowed indicates if the result is larger than the maximum
+/// representable value of type T.
+template <typename T>
+typename std::enable_if<std::is_unsigned<T>::value, T>::type
+SaturatingMultiply(T X, T Y, bool *ResultOverflowed = nullptr) {
+ bool Dummy;
+ bool &Overflowed = ResultOverflowed ? *ResultOverflowed : Dummy;
+
+ // Hacker's Delight, p. 30 has a different algorithm, but we don't use that
+ // because it fails for uint16_t (where multiplication can have undefined
+ // behavior due to promotion to int), and requires a division in addition
+ // to the multiplication.
+
+ Overflowed = false;
+
+ // Log2(Z) would be either Log2Z or Log2Z + 1.
+ // Special case: if X or Y is 0, Log2_64 gives -1, and Log2Z
+ // will necessarily be less than Log2Max as desired.
+ int Log2Z = Log2_64(X) + Log2_64(Y);
+ const T Max = std::numeric_limits<T>::max();
+ int Log2Max = Log2_64(Max);
+ if (Log2Z < Log2Max) {
+ return X * Y;
+ }
+ if (Log2Z > Log2Max) {
+ Overflowed = true;
+ return Max;
+ }
+
+ // We're going to use the top bit, and maybe overflow one
+ // bit past it. Multiply all but the bottom bit then add
+ // that on at the end.
+ T Z = (X >> 1) * Y;
+ if (Z & ~(Max >> 1)) {
+ Overflowed = true;
+ return Max;
+ }
+ Z <<= 1;
+ if (X & 1)
+ return SaturatingAdd(Z, Y, ResultOverflowed);
+
+ return Z;
+}
+
+/// \brief Multiply two unsigned integers, X and Y, and add the unsigned
+/// integer, A to the product. Clamp the result to the maximum representable
+/// value of T on overflow. ResultOverflowed indicates if the result is larger
+/// than the maximum representable value of type T.
+/// Note that this is purely a convenience function as there is no distinction
+/// where overflow occurred in a 'fused' multiply-add for unsigned numbers.
+template <typename T>
+typename std::enable_if<std::is_unsigned<T>::value, T>::type
+SaturatingMultiplyAdd(T X, T Y, T A, bool *ResultOverflowed = nullptr) {
+ bool Dummy;
+ bool &Overflowed = ResultOverflowed ? *ResultOverflowed : Dummy;
+
+ T Product = SaturatingMultiply(X, Y, &Overflowed);
+ if (Overflowed)
+ return Product;
+
+ return SaturatingAdd(A, Product, &Overflowed);
+}
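A brief sketch of the saturating helpers above with 8-bit operands (values follow from the definitions; the helper name is hypothetical):

#include "llvm/Support/MathExtras.h"

void saturating_examples() {
  bool Ovf = false;
  uint8_t A = llvm::SaturatingAdd<uint8_t>(200, 100, &Ovf);            // 255, Ovf == true
  uint8_t B = llvm::SaturatingMultiply<uint8_t>(20, 10, &Ovf);         // 200, Ovf == false
  uint8_t C = llvm::SaturatingMultiplyAdd<uint8_t>(20, 10, 100, &Ovf); // 255, Ovf == true
  (void)A; (void)B; (void)C;
}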
+
+extern const float huge_valf;
+} // End llvm namespace
+
+#endif
diff --git a/ext/include/llvm/Support/Memory.h b/ext/include/llvm/Support/Memory.h
new file mode 100644
index 0000000..8103aea
--- /dev/null
+++ b/ext/include/llvm/Support/Memory.h
@@ -0,0 +1,186 @@
+//===- llvm/Support/Memory.h - Memory Support -------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the llvm::sys::Memory class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_SUPPORT_MEMORY_H
+#define LLVM_SUPPORT_MEMORY_H
+
+#include "llvm/Support/DataTypes.h"
+#include <string>
+#include <system_error>
+
+namespace llvm {
+namespace sys {
+
+ /// This class encapsulates the notion of a memory block which has an address
+ /// and a size. It is used by the Memory class (a friend) as the result of
+ /// various memory allocation operations.
+ /// @see Memory
+ /// @brief Memory block abstraction.
+ class MemoryBlock {
+ public:
+ MemoryBlock() : Address(nullptr), Size(0) { }
+ MemoryBlock(void *addr, size_t size) : Address(addr), Size(size) { }
+ void *base() const { return Address; }
+ size_t size() const { return Size; }
+
+ private:
+ void *Address; ///< Address of first byte of memory area
+ size_t Size; ///< Size, in bytes of the memory area
+ friend class Memory;
+ };
+
+ /// This class provides various memory handling functions that manipulate
+ /// MemoryBlock instances.
+ /// @since 1.4
+ /// @brief An abstraction for memory operations.
+ class Memory {
+ public:
+ enum ProtectionFlags {
+ MF_READ = 0x1000000,
+ MF_WRITE = 0x2000000,
+ MF_EXEC = 0x4000000
+ };
+
+ /// This method allocates a block of memory that is suitable for loading
+ /// dynamically generated code (e.g. JIT). An attempt to allocate
+ /// \p NumBytes bytes of virtual memory is made.
+ /// \p NearBlock may point to an existing allocation in which case
+ /// an attempt is made to allocate more memory near the existing block.
+ /// The actual allocated address is not guaranteed to be near the requested
+ /// address.
+ /// \p Flags is used to set the initial protection flags for the block
+ /// of the memory.
+ /// \p EC [out] returns an object describing any error that occurs.
+ ///
+ /// This method may allocate more than the number of bytes requested. The
+ /// actual number of bytes allocated is indicated in the returned
+ /// MemoryBlock.
+ ///
+ /// The start of the allocated block must be aligned with the
+ /// system allocation granularity (64K on Windows, page size on Linux).
+ /// If the address following \p NearBlock is not so aligned, it will be
+ /// rounded up to the next allocation granularity boundary.
+ ///
+ /// \r a non-null MemoryBlock if the function was successful,
+ /// otherwise a null MemoryBlock, with \p EC describing the error.
+ ///
+ /// @brief Allocate mapped memory.
+ static MemoryBlock allocateMappedMemory(size_t NumBytes,
+ const MemoryBlock *const NearBlock,
+ unsigned Flags,
+ std::error_code &EC);
+
+ /// This method releases a block of memory that was allocated with the
+ /// allocateMappedMemory method. It should not be used to release any
+ /// memory block allocated any other way.
+ /// \p Block describes the memory to be released.
+ ///
+ /// \r error_success if the function was successful, or an error_code
+ /// describing the failure if an error occurred.
+ ///
+ /// @brief Release mapped memory.
+ static std::error_code releaseMappedMemory(MemoryBlock &Block);
+
+ /// This method sets the protection flags for a block of memory to the
+ /// state specified by /p Flags. The behavior is not specified if the
+ /// memory was not allocated using the allocateMappedMemory method.
+ /// \p Block describes the memory block to be protected.
+ /// \p Flags specifies the new protection state to be assigned to the block.
+ /// \p ErrMsg [out] returns a string describing any error that occurred.
+ ///
+ /// If \p Flags is MF_WRITE, the actual behavior varies
+ /// with the operating system (i.e. MF_READ | MF_WRITE on Windows) and the
+ /// target architecture (i.e. MF_WRITE -> MF_READ | MF_WRITE on i386).
+ ///
+ /// \r error_success if the function was successful, or an error_code
+ /// describing the failure if an error occurred.
+ ///
+ /// @brief Set memory protection state.
+ static std::error_code protectMappedMemory(const MemoryBlock &Block,
+ unsigned Flags);
+
+ /// This method allocates a block of Read/Write/Execute memory that is
+ /// suitable for executing dynamically generated code (e.g. JIT). An
+ /// attempt to allocate \p NumBytes bytes of virtual memory is made.
+ /// \p NearBlock may point to an existing allocation in which case
+ /// an attempt is made to allocate more memory near the existing block.
+ ///
+ /// On success, this returns a non-null memory block, otherwise it returns
+ /// a null memory block and fills in *ErrMsg.
+ ///
+ /// @brief Allocate Read/Write/Execute memory.
+ static MemoryBlock AllocateRWX(size_t NumBytes,
+ const MemoryBlock *NearBlock,
+ std::string *ErrMsg = nullptr);
+
+ /// This method releases a block of Read/Write/Execute memory that was
+ /// allocated with the AllocateRWX method. It should not be used to
+ /// release any memory block allocated any other way.
+ ///
+ /// On success, this returns false, otherwise it returns true and fills
+ /// in *ErrMsg.
+ /// @brief Release Read/Write/Execute memory.
+ static bool ReleaseRWX(MemoryBlock &block, std::string *ErrMsg = nullptr);
+
+ /// InvalidateInstructionCache - Before the JIT can run a block of code
+ /// that has been emitted it must invalidate the instruction cache on some
+ /// platforms.
+ static void InvalidateInstructionCache(const void *Addr, size_t Len);
+
+ /// setExecutable - Before the JIT can run a block of code, it has to be
+ /// given read and executable privilege. Return true if it is already r-x
+ /// or the system is able to change its privilege.
+ static bool setExecutable(MemoryBlock &M, std::string *ErrMsg = nullptr);
+
+ /// setWritable - When adding to a block of code, the JIT may need
+ /// to mark a block of code as RW since the protections are on page
+ /// boundaries, and the JIT internal allocations are not page aligned.
+ static bool setWritable(MemoryBlock &M, std::string *ErrMsg = nullptr);
+
+ /// setRangeExecutable - Mark the page containing a range of addresses
+ /// as executable.
+ static bool setRangeExecutable(const void *Addr, size_t Size);
+
+ /// setRangeWritable - Mark the page containing a range of addresses
+ /// as writable.
+ static bool setRangeWritable(const void *Addr, size_t Size);
+ };
+
+ /// Owning version of MemoryBlock.
+ class OwningMemoryBlock {
+ public:
+ OwningMemoryBlock() = default;
+ explicit OwningMemoryBlock(MemoryBlock M) : M(M) {}
+ OwningMemoryBlock(OwningMemoryBlock &&Other) {
+ M = Other.M;
+ Other.M = MemoryBlock();
+ }
+ OwningMemoryBlock& operator=(OwningMemoryBlock &&Other) {
+ M = Other.M;
+ Other.M = MemoryBlock();
+ return *this;
+ }
+ ~OwningMemoryBlock() {
+ Memory::releaseMappedMemory(M);
+ }
+ void *base() const { return M.base(); }
+ size_t size() const { return M.size(); }
+ MemoryBlock getMemoryBlock() const { return M; }
+ private:
+ MemoryBlock M;
+ };
+
+}
+}
+
+#endif
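A rough sketch of the allocate/protect/release flow described above; the implementation lives in the platform-specific source, error handling here is minimal, and the function name is illustrative:

#include "llvm/Support/Memory.h"

// Illustrative only: reserve a RW block, later flip it to R/X, then release it.
bool mapped_memory_demo() {
  std::error_code EC;
  llvm::sys::MemoryBlock MB = llvm::sys::Memory::allocateMappedMemory(
      4096, /*NearBlock=*/nullptr,
      llvm::sys::Memory::MF_READ | llvm::sys::Memory::MF_WRITE, EC);
  if (EC)
    return false;
  // ... emit code into MB.base(), up to MB.size() bytes ...
  llvm::sys::Memory::protectMappedMemory(
      MB, llvm::sys::Memory::MF_READ | llvm::sys::Memory::MF_EXEC);
  llvm::sys::Memory::releaseMappedMemory(MB);
  return true;
}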
diff --git a/ext/include/llvm/Support/MemoryBuffer.h b/ext/include/llvm/Support/MemoryBuffer.h
new file mode 100644
index 0000000..ba6b5fe
--- /dev/null
+++ b/ext/include/llvm/Support/MemoryBuffer.h
@@ -0,0 +1,169 @@
+//===--- MemoryBuffer.h - Memory Buffer Interface ---------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the MemoryBuffer interface.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_SUPPORT_MEMORYBUFFER_H
+#define LLVM_SUPPORT_MEMORYBUFFER_H
+
+#include "llvm/ADT/Twine.h"
+#include "llvm/Support/DataTypes.h"
+#include "llvm/Support/ErrorOr.h"
+#include <memory>
+
+namespace llvm {
+class MemoryBufferRef;
+
+/// This interface provides simple read-only access to a block of memory, and
+/// provides simple methods for reading files and standard input into a memory
+/// buffer. In addition to basic access to the characters in the file, this
+/// interface guarantees you can read one character past the end of the file,
+/// and that this character will read as '\0'.
+///
+/// The '\0' guarantee is needed to support an optimization -- it's intended to
+/// be more efficient for clients which are reading all the data to stop
+/// reading when they encounter a '\0' than to continually check the file
+/// position to see if it has reached the end of the file.
+class MemoryBuffer {
+ const char *BufferStart; // Start of the buffer.
+ const char *BufferEnd; // End of the buffer.
+
+ MemoryBuffer(const MemoryBuffer &) = delete;
+ MemoryBuffer &operator=(const MemoryBuffer &) = delete;
+protected:
+ MemoryBuffer() {}
+ void init(const char *BufStart, const char *BufEnd,
+ bool RequiresNullTerminator);
+public:
+ virtual ~MemoryBuffer();
+
+ const char *getBufferStart() const { return BufferStart; }
+ const char *getBufferEnd() const { return BufferEnd; }
+ size_t getBufferSize() const { return BufferEnd-BufferStart; }
+
+ StringRef getBuffer() const {
+ return StringRef(BufferStart, getBufferSize());
+ }
+
+ /// Return an identifier for this buffer, typically the filename it was read
+ /// from.
+ virtual const char *getBufferIdentifier() const {
+ return "Unknown buffer";
+ }
+
+ /// Open the specified file as a MemoryBuffer, returning a new MemoryBuffer
+ /// if successful, otherwise returning null. If FileSize is specified, this
+ /// means that the client knows that the file exists and that it has the
+ /// specified size.
+ ///
+ /// \param IsVolatileSize Set to true to indicate that the file size may be
+ /// changing, e.g. when libclang tries to parse while the user is
+ /// editing/updating the file.
+ static ErrorOr<std::unique_ptr<MemoryBuffer>>
+ getFile(const Twine &Filename, int64_t FileSize = -1,
+ bool RequiresNullTerminator = true, bool IsVolatileSize = false);
+
+ /// Given an already-open file descriptor, map some slice of it into a
+ /// MemoryBuffer. The slice is specified by an \p Offset and \p MapSize.
+ /// Since this is in the middle of a file, the buffer is not null terminated.
+ static ErrorOr<std::unique_ptr<MemoryBuffer>>
+ getOpenFileSlice(int FD, const Twine &Filename, uint64_t MapSize,
+ int64_t Offset);
+
+ /// Given an already-open file descriptor, read the file and return a
+ /// MemoryBuffer.
+ ///
+ /// \param IsVolatileSize Set to true to indicate that the file size may be
+ /// changing, e.g. when libclang tries to parse while the user is
+ /// editing/updating the file.
+ static ErrorOr<std::unique_ptr<MemoryBuffer>>
+ getOpenFile(int FD, const Twine &Filename, uint64_t FileSize,
+ bool RequiresNullTerminator = true, bool IsVolatileSize = false);
+
+ /// Open the specified memory range as a MemoryBuffer. Note that InputData
+ /// must be null terminated if RequiresNullTerminator is true.
+ static std::unique_ptr<MemoryBuffer>
+ getMemBuffer(StringRef InputData, StringRef BufferName = "",
+ bool RequiresNullTerminator = true);
+
+ static std::unique_ptr<MemoryBuffer>
+ getMemBuffer(MemoryBufferRef Ref, bool RequiresNullTerminator = true);
+
+ /// Open the specified memory range as a MemoryBuffer, copying the contents
+ /// and taking ownership of it. InputData does not have to be null terminated.
+ static std::unique_ptr<MemoryBuffer>
+ getMemBufferCopy(StringRef InputData, const Twine &BufferName = "");
+
+ /// Allocate a new zero-initialized MemoryBuffer of the specified size. Note
+ /// that the caller need not initialize the memory allocated by this method.
+ /// The memory is owned by the MemoryBuffer object.
+ static std::unique_ptr<MemoryBuffer>
+ getNewMemBuffer(size_t Size, StringRef BufferName = "");
+
+ /// Allocate a new MemoryBuffer of the specified size that is not initialized.
+ /// Note that the caller should initialize the memory allocated by this
+ /// method. The memory is owned by the MemoryBuffer object.
+ static std::unique_ptr<MemoryBuffer>
+ getNewUninitMemBuffer(size_t Size, const Twine &BufferName = "");
+
+ /// Read all of stdin into a file buffer, and return it.
+ static ErrorOr<std::unique_ptr<MemoryBuffer>> getSTDIN();
+
+ /// Open the specified file as a MemoryBuffer, or open stdin if the Filename
+ /// is "-".
+ static ErrorOr<std::unique_ptr<MemoryBuffer>>
+ getFileOrSTDIN(const Twine &Filename, int64_t FileSize = -1,
+ bool RequiresNullTerminator = true);
+
+ /// Map a subrange of the specified file as a MemoryBuffer.
+ static ErrorOr<std::unique_ptr<MemoryBuffer>>
+ getFileSlice(const Twine &Filename, uint64_t MapSize, uint64_t Offset);
+
+ //===--------------------------------------------------------------------===//
+ // Provided for performance analysis.
+ //===--------------------------------------------------------------------===//
+
+ /// The kind of memory backing used to support the MemoryBuffer.
+ enum BufferKind {
+ MemoryBuffer_Malloc,
+ MemoryBuffer_MMap
+ };
+
+ /// Return information on the memory mechanism used to support the
+ /// MemoryBuffer.
+ virtual BufferKind getBufferKind() const = 0;
+
+ MemoryBufferRef getMemBufferRef() const;
+};
+
+class MemoryBufferRef {
+ StringRef Buffer;
+ StringRef Identifier;
+
+public:
+ MemoryBufferRef() {}
+ MemoryBufferRef(MemoryBuffer& Buffer)
+ : Buffer(Buffer.getBuffer()), Identifier(Buffer.getBufferIdentifier()) {}
+ MemoryBufferRef(StringRef Buffer, StringRef Identifier)
+ : Buffer(Buffer), Identifier(Identifier) {}
+
+ StringRef getBuffer() const { return Buffer; }
+
+ StringRef getBufferIdentifier() const { return Identifier; }
+
+ const char *getBufferStart() const { return Buffer.begin(); }
+ const char *getBufferEnd() const { return Buffer.end(); }
+ size_t getBufferSize() const { return Buffer.size(); }
+};
+
+} // end namespace llvm
+
+#endif
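A small sketch of the typical getFile() pattern, relying on ErrorOr (imported earlier in this commit); the helper is hypothetical:

#include "llvm/Support/MemoryBuffer.h"

// Returns the size of the file's contents, or 0 if the file could not be read.
size_t file_size_or_zero(const llvm::Twine &Path) {
  llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> BufOrErr =
      llvm::MemoryBuffer::getFile(Path);
  if (BufOrErr.getError())
    return 0;                            // e.g. file not found
  return (*BufOrErr)->getBufferSize();
}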
diff --git a/ext/include/llvm/Support/Mutex.h b/ext/include/llvm/Support/Mutex.h
new file mode 100644
index 0000000..85237f2
--- /dev/null
+++ b/ext/include/llvm/Support/Mutex.h
@@ -0,0 +1,155 @@
+//===- llvm/Support/Mutex.h - Mutex Operating System Concept -----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the llvm::sys::Mutex class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_SUPPORT_MUTEX_H
+#define LLVM_SUPPORT_MUTEX_H
+
+#include "llvm/Support/Compiler.h"
+#include <cassert>
+
+namespace llvm
+{
+ namespace sys
+ {
+ /// @brief Platform agnostic Mutex class.
+ class MutexImpl
+ {
+ /// @name Constructors
+ /// @{
+ public:
+
+ /// Initializes the lock but doesn't acquire it. If \p recursive is set
+ /// to false, the lock will not be recursive which makes it cheaper but
+ /// also more likely to deadlock (same thread can't acquire more than
+ /// once).
+ /// @brief Default Constructor.
+ explicit MutexImpl(bool recursive = true);
+
+ /// Releases and removes the lock
+ /// @brief Destructor
+ ~MutexImpl();
+
+ /// @}
+ /// @name Methods
+ /// @{
+ public:
+
+ /// Attempts to unconditionally acquire the lock. If the lock is held by
+ /// another thread, this method will wait until it can acquire the lock.
+ /// @returns false if any kind of error occurs, true otherwise.
+ /// @brief Unconditionally acquire the lock.
+ bool acquire();
+
+ /// Attempts to release the lock. If the lock is held by the current
+ /// thread, the lock is released allowing other threads to acquire the
+ /// lock.
+ /// @returns false if any kind of error occurs, true otherwise.
+ /// @brief Unconditionally release the lock.
+ bool release();
+
+ /// Attempts to acquire the lock without blocking. If the lock is not
+ /// available, this function returns false quickly (without blocking). If
+ /// the lock is available, it is acquired.
+ /// @returns false if any kind of error occurs or the lock is not
+ /// available, true otherwise.
+ /// @brief Try to acquire the lock.
+ bool tryacquire();
+
+ //@}
+ /// @name Platform Dependent Data
+ /// @{
+ private:
+ void* data_; ///< We don't know what the data will be
+
+ /// @}
+ /// @name Do Not Implement
+ /// @{
+ private:
+ MutexImpl(const MutexImpl &) = delete;
+ void operator=(const MutexImpl &) = delete;
+ /// @}
+ };
+
+
+ /// SmartMutex - A mutex with a compile time constant parameter that
+ /// indicates whether this mutex should become a no-op when we're not
+ /// running in multithreaded mode.
+ template<bool mt_only>
+ class SmartMutex {
+ MutexImpl impl;
+ unsigned acquired;
+ bool recursive;
+ public:
+ explicit SmartMutex(bool rec = true) :
+ impl(rec), acquired(0), recursive(rec) { }
+
+ bool lock() {
+ if (!mt_only) {
+ return impl.acquire();
+ } else {
+ // Single-threaded debugging code. This would be racy in
+ // multithreaded mode, but it does provide sanity checks in
+ // single-threaded mode.
+ assert((recursive || acquired == 0) && "Lock already acquired!!");
+ ++acquired;
+ return true;
+ }
+ }
+
+ bool unlock() {
+ if (!mt_only) {
+ return impl.release();
+ } else {
+ // Single-threaded debugging code. This would be racy in
+ // multithreaded mode, but it does provide sanity checks in
+ // single-threaded mode.
+ assert(((recursive && acquired) || (acquired == 1)) &&
+ "Lock not acquired before release!");
+ --acquired;
+ return true;
+ }
+ }
+
+ bool try_lock() {
+ if (!mt_only)
+ return impl.tryacquire();
+ else return true;
+ }
+
+ private:
+ SmartMutex(const SmartMutex<mt_only> & original);
+ void operator=(const SmartMutex<mt_only> &);
+ };
+
+ /// Mutex - A standard, always enforced mutex.
+ typedef SmartMutex<false> Mutex;
+
+ template<bool mt_only>
+ class SmartScopedLock {
+ SmartMutex<mt_only>& mtx;
+
+ public:
+ SmartScopedLock(SmartMutex<mt_only>& m) : mtx(m) {
+ mtx.lock();
+ }
+
+ ~SmartScopedLock() {
+ mtx.unlock();
+ }
+ };
+
+ typedef SmartScopedLock<false> ScopedLock;
+ }
+}
+
+#endif
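A minimal sketch of the typical lock/unlock pattern using the aliases above (names are illustrative):

#include "llvm/Support/Mutex.h"

static llvm::sys::Mutex CounterLock;   // SmartMutex<false>: always a real lock
static unsigned Counter = 0;

unsigned bump_counter() {
  llvm::sys::ScopedLock Guard(CounterLock);  // acquires here, releases at scope exit
  return ++Counter;
}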
diff --git a/ext/include/llvm/Support/MutexGuard.h b/ext/include/llvm/Support/MutexGuard.h
new file mode 100644
index 0000000..07b64b6
--- /dev/null
+++ b/ext/include/llvm/Support/MutexGuard.h
@@ -0,0 +1,41 @@
+//===-- Support/MutexGuard.h - Acquire/Release Mutex In Scope ---*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines a guard for a block of code that ensures a Mutex is locked
+// upon construction and released upon destruction.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_SUPPORT_MUTEXGUARD_H
+#define LLVM_SUPPORT_MUTEXGUARD_H
+
+#include "llvm/Support/Mutex.h"
+
+namespace llvm {
+ /// Instances of this class acquire a given Mutex Lock when constructed and
+ /// hold that lock until destruction. The intention is to instantiate one of
+ /// these on the stack at the top of some scope to be assured that C++
+ /// destruction of the object will always release the Mutex and thus avoid
+ /// a host of nasty multi-threading problems in the face of exceptions, etc.
+ /// @brief Guard a section of code with a Mutex.
+ class MutexGuard {
+ sys::Mutex &M;
+ MutexGuard(const MutexGuard &) = delete;
+ void operator=(const MutexGuard &) = delete;
+ public:
+ MutexGuard(sys::Mutex &m) : M(m) { M.lock(); }
+ ~MutexGuard() { M.unlock(); }
+ /// holds - Returns true if this locker instance holds the specified lock.
+ /// This is mostly used in assertions to validate that the correct mutex
+ /// is held.
+ bool holds(const sys::Mutex& lock) const { return &M == &lock; }
+ };
+}
+
+#endif // LLVM_SUPPORT_MUTEXGUARD_H
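A sketch of the holds() idiom described above, useful for asserting that a caller already owns the lock (names are illustrative):

#include "llvm/Support/MutexGuard.h"

static llvm::sys::Mutex StateLock;

void update_state_locked(const llvm::MutexGuard &Locked) {
  assert(Locked.holds(StateLock) && "caller must hold StateLock");
  // ... mutate state shared across threads ...
}

void update_state() {
  llvm::MutexGuard Locked(StateLock);
  update_state_locked(Locked);
}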
diff --git a/ext/include/llvm/Support/Path.h b/ext/include/llvm/Support/Path.h
new file mode 100644
index 0000000..955cc99
--- /dev/null
+++ b/ext/include/llvm/Support/Path.h
@@ -0,0 +1,437 @@
+//===- llvm/Support/Path.h - Path Operating System Concept ------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the llvm::sys::path namespace. It is designed after
+// TR2/boost filesystem (v3), but modified to remove exception handling and the
+// path class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_SUPPORT_PATH_H
+#define LLVM_SUPPORT_PATH_H
+
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/Support/DataTypes.h"
+#include <iterator>
+
+namespace llvm {
+namespace sys {
+namespace path {
+
+/// @name Lexical Component Iterator
+/// @{
+
+/// @brief Path iterator.
+///
+/// This is an input iterator that iterates over the individual components in
+/// \a path. The traversal order is as follows:
+/// * The root-name element, if present.
+/// * The root-directory element, if present.
+/// * Each successive filename element, if present.
+/// * Dot, if one or more trailing non-root slash characters are present.
+/// Traversing backwards is possible with \a reverse_iterator
+///
+/// Iteration examples. Each component is separated by ',':
+/// @code
+/// / => /
+/// /foo => /,foo
+/// foo/ => foo,.
+/// /foo/bar => /,foo,bar
+/// ../ => ..,.
+/// C:\foo\bar => C:,/,foo,bar
+/// @endcode
+class const_iterator
+ : public std::iterator<std::input_iterator_tag, const StringRef> {
+ StringRef Path; ///< The entire path.
+ StringRef Component; ///< The current component. Not necessarily in Path.
+ size_t Position; ///< The iterator's current position within Path.
+
+ // An end iterator has Position = Path.size() + 1.
+ friend const_iterator begin(StringRef path);
+ friend const_iterator end(StringRef path);
+
+public:
+ reference operator*() const { return Component; }
+ pointer operator->() const { return &Component; }
+ const_iterator &operator++(); // preincrement
+ bool operator==(const const_iterator &RHS) const;
+ bool operator!=(const const_iterator &RHS) const { return !(*this == RHS); }
+
+ /// @brief Difference in bytes between this and RHS.
+ ptrdiff_t operator-(const const_iterator &RHS) const;
+};
+
+/// @brief Reverse path iterator.
+///
+/// This is an input iterator that iterates over the individual components in
+/// \a path in reverse order. The traversal order is exactly reversed from that
+/// of \a const_iterator
+class reverse_iterator
+ : public std::iterator<std::input_iterator_tag, const StringRef> {
+ StringRef Path; ///< The entire path.
+ StringRef Component; ///< The current component. Not necessarily in Path.
+ size_t Position; ///< The iterator's current position within Path.
+
+ friend reverse_iterator rbegin(StringRef path);
+ friend reverse_iterator rend(StringRef path);
+
+public:
+ reference operator*() const { return Component; }
+ pointer operator->() const { return &Component; }
+ reverse_iterator &operator++(); // preincrement
+ bool operator==(const reverse_iterator &RHS) const;
+ bool operator!=(const reverse_iterator &RHS) const { return !(*this == RHS); }
+};
+
+/// @brief Get begin iterator over \a path.
+/// @param path Input path.
+/// @returns Iterator initialized with the first component of \a path.
+const_iterator begin(StringRef path);
+
+/// @brief Get end iterator over \a path.
+/// @param path Input path.
+/// @returns Iterator initialized to the end of \a path.
+const_iterator end(StringRef path);
+
+/// @brief Get reverse begin iterator over \a path.
+/// @param path Input path.
+/// @returns Iterator initialized with the first reverse component of \a path.
+reverse_iterator rbegin(StringRef path);
+
+/// @brief Get reverse end iterator over \a path.
+/// @param path Input path.
+/// @returns Iterator initialized to the reverse end of \a path.
+reverse_iterator rend(StringRef path);
+
+/// @}
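A short sketch of component iteration using the declarations above; output matches the documented traversal order (llvm::outs() is assumed from raw_ostream.h, and the function name is illustrative):

#include "llvm/Support/Path.h"
#include "llvm/Support/raw_ostream.h"

// For "/foo/bar" this prints "/", "foo", "bar", one component per line.
void print_components(llvm::StringRef P) {
  for (llvm::sys::path::const_iterator I = llvm::sys::path::begin(P),
                                       E = llvm::sys::path::end(P);
       I != E; ++I)
    llvm::outs() << *I << "\n";
}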
+/// @name Lexical Modifiers
+/// @{
+
+/// @brief Remove the last component from \a path unless it is the root dir.
+///
+/// @code
+/// directory/filename.cpp => directory/
+/// directory/ => directory
+/// filename.cpp => <empty>
+/// / => /
+/// @endcode
+///
+/// @param path A path that is modified to not have a file component.
+void remove_filename(SmallVectorImpl<char> &path);
+
+/// @brief Replace the file extension of \a path with \a extension.
+///
+/// @code
+/// ./filename.cpp => ./filename.extension
+/// ./filename => ./filename.extension
+/// ./ => ./.extension
+/// @endcode
+///
+/// @param path A path that has its extension replaced with \a extension.
+/// @param extension The extension to be added. It may be empty. It may also
+/// optionally start with a '.', if it does not, one will be
+/// prepended.
+void replace_extension(SmallVectorImpl<char> &path, const Twine &extension);
+
+/// @brief Append to path.
+///
+/// @code
+/// /foo + bar/f => /foo/bar/f
+/// /foo/ + bar/f => /foo/bar/f
+/// foo + bar/f => foo/bar/f
+/// @endcode
+///
+/// @param path Set to \a path + \a component.
+/// @param a The component to be appended to \a path.
+void append(SmallVectorImpl<char> &path, const Twine &a,
+ const Twine &b = "",
+ const Twine &c = "",
+ const Twine &d = "");
+
+/// @brief Append to path.
+///
+/// @code
+/// /foo + [bar,f] => /foo/bar/f
+/// /foo/ + [bar,f] => /foo/bar/f
+/// foo + [bar,f] => foo/bar/f
+/// @endcode
+///
+/// @param path Set to \a path + [\a begin, \a end).
+/// @param begin Start of components to append.
+/// @param end One past the end of components to append.
+void append(SmallVectorImpl<char> &path,
+ const_iterator begin, const_iterator end);
+
+/// @}
+/// @name Transforms (or some other better name)
+/// @{
+
+/// Convert path to the native form. This is used to give paths to users and
+/// operating system calls in the platform's normal way. For example, on Windows
+/// all '/' are converted to '\'.
+///
+/// @param path A path that is transformed to native format.
+/// @param result Holds the result of the transformation.
+void native(const Twine &path, SmallVectorImpl<char> &result);
+
+/// Convert path to the native form in place. This is used to give paths to
+/// users and operating system calls in the platform's normal way. For example,
+/// on Windows all '/' are converted to '\'.
+///
+/// @param path A path that is transformed to native format.
+void native(SmallVectorImpl<char> &path);
+
+/// @}
+/// @name Lexical Observers
+/// @{
+
+/// @brief Get root name.
+///
+/// @code
+/// //net/hello => //net
+/// c:/hello => c: (on Windows, on other platforms nothing)
+/// /hello => <empty>
+/// @endcode
+///
+/// @param path Input path.
+/// @result The root name of \a path if it has one, otherwise "".
+StringRef root_name(StringRef path);
+
+/// @brief Get root directory.
+///
+/// @code
+/// /goo/hello => /
+/// c:/hello => /
+/// d/file.txt => <empty>
+/// @endcode
+///
+/// @param path Input path.
+/// @result The root directory of \a path if it has one, otherwise
+/// "".
+StringRef root_directory(StringRef path);
+
+/// @brief Get root path.
+///
+/// Equivalent to root_name + root_directory.
+///
+/// @param path Input path.
+/// @result The root path of \a path if it has one, otherwise "".
+StringRef root_path(StringRef path);
+
+/// @brief Get relative path.
+///
+/// @code
+/// C:\hello\world => hello\world
+/// foo/bar => foo/bar
+/// /foo/bar => foo/bar
+/// @endcode
+///
+/// @param path Input path.
+/// @result The path starting after root_path if one exists, otherwise "".
+StringRef relative_path(StringRef path);
+
+/// @brief Get parent path.
+///
+/// @code
+/// / => <empty>
+/// /foo => /
+/// foo/../bar => foo/..
+/// @endcode
+///
+/// @param path Input path.
+/// @result The parent path of \a path if one exists, otherwise "".
+StringRef parent_path(StringRef path);
+
+/// @brief Get filename.
+///
+/// @code
+/// /foo.txt => foo.txt
+/// . => .
+/// .. => ..
+/// / => /
+/// @endcode
+///
+/// @param path Input path.
+/// @result The filename part of \a path. This is defined as the last component
+/// of \a path.
+StringRef filename(StringRef path);
+
+/// @brief Get stem.
+///
+/// If filename contains a dot but not solely one or two dots, result is the
+/// substring of filename ending at (but not including) the last dot. Otherwise
+/// it is filename.
+///
+/// @code
+/// /foo/bar.txt => bar
+/// /foo/bar => bar
+/// /foo/.txt => <empty>
+/// /foo/. => .
+/// /foo/.. => ..
+/// @endcode
+///
+/// @param path Input path.
+/// @result The stem of \a path.
+StringRef stem(StringRef path);
+
+/// @brief Get extension.
+///
+/// If filename contains a dot but not solely one or two dots, result is the
+/// substring of filename starting at (and including) the last dot, and ending
+/// at the end of \a path. Otherwise "".
+///
+/// @code
+/// /foo/bar.txt => .txt
+/// /foo/bar => <empty>
+/// /foo/.txt => .txt
+/// @endcode
+///
+/// @param path Input path.
+/// @result The extension of \a path.
+StringRef extension(StringRef path);
+
+/// @brief Check whether the given char is a path separator on the host OS.
+///
+/// @param value a character
+/// @result true if \a value is a path separator character on the host OS
+bool is_separator(char value);
+
+/// @brief Return the preferred separator for this platform.
+///
+/// @result StringRef of the preferred separator, null-terminated.
+StringRef get_separator();
+
+/// @brief Get the typical temporary directory for the system, e.g.,
+/// "/var/tmp" or "C:/TEMP"
+///
+/// @param erasedOnReboot Whether to favor a path that is erased on reboot
+/// rather than one that potentially persists longer. This parameter will be
+/// ignored if the user or system has set the typical environment variable
+/// (e.g., TEMP on Windows, TMPDIR on *nix) to specify a temporary directory.
+///
+/// @param result Holds the resulting path name.
+void system_temp_directory(bool erasedOnReboot, SmallVectorImpl<char> &result);
+
+/// @brief Get the user's home directory.
+///
+/// @param result Holds the resulting path name.
+/// @result True if a home directory is set, false otherwise.
+bool home_directory(SmallVectorImpl<char> &result);
+
+/// @brief Get the user's cache directory.
+///
+/// Expect the resulting path to be a directory shared with other
+/// applications/services used by the user. Params \p Path1 to \p Path3 can be
+/// used to append additional directory names to the resulting path. Recommended
+/// pattern is <user_cache_directory>/<vendor>/<application>.
+///
+/// @param Result Holds the resulting path.
+/// @param Path1 Additional path to be appended to the user's cache directory
+/// path. "" can be used to append nothing.
+/// @param Path2 Second additional path to be appended.
+/// @param Path3 Third additional path to be appended.
+/// @result True if a cache directory path is set, false otherwise.
+bool user_cache_directory(SmallVectorImpl<char> &Result, const Twine &Path1,
+ const Twine &Path2 = "", const Twine &Path3 = "");
+
+/// @brief Has root name?
+///
+/// root_name != ""
+///
+/// @param path Input path.
+/// @result True if the path has a root name, false otherwise.
+bool has_root_name(const Twine &path);
+
+/// @brief Has root directory?
+///
+/// root_directory != ""
+///
+/// @param path Input path.
+/// @result True if the path has a root directory, false otherwise.
+bool has_root_directory(const Twine &path);
+
+/// @brief Has root path?
+///
+/// root_path != ""
+///
+/// @param path Input path.
+/// @result True if the path has a root path, false otherwise.
+bool has_root_path(const Twine &path);
+
+/// @brief Has relative path?
+///
+/// relative_path != ""
+///
+/// @param path Input path.
+/// @result True if the path has a relative path, false otherwise.
+bool has_relative_path(const Twine &path);
+
+/// @brief Has parent path?
+///
+/// parent_path != ""
+///
+/// @param path Input path.
+/// @result True if the path has a parent path, false otherwise.
+bool has_parent_path(const Twine &path);
+
+/// @brief Has filename?
+///
+/// filename != ""
+///
+/// @param path Input path.
+/// @result True if the path has a filename, false otherwise.
+bool has_filename(const Twine &path);
+
+/// @brief Has stem?
+///
+/// stem != ""
+///
+/// @param path Input path.
+/// @result True if the path has a stem, false otherwise.
+bool has_stem(const Twine &path);
+
+/// @brief Has extension?
+///
+/// extension != ""
+///
+/// @param path Input path.
+/// @result True if the path has an extension, false otherwise.
+bool has_extension(const Twine &path);
+
+/// @brief Is path absolute?
+///
+/// @param path Input path.
+/// @result True if the path is absolute, false if it is not.
+bool is_absolute(const Twine &path);
+
+/// @brief Is path relative?
+///
+/// @param path Input path.
+/// @result True if the path is relative, false if it is not.
+bool is_relative(const Twine &path);
+
+/// @brief Remove redundant leading "./" pieces and consecutive separators.
+///
+/// @param path Input path.
+/// @result The cleaned-up \a path.
+StringRef remove_leading_dotslash(StringRef path);
+
+/// @brief In-place remove any './' and optionally '../' components from a path.
+///
+/// @param path processed path
+/// @param remove_dot_dot specify if '../' should be removed
+/// @result True if path was changed
+bool remove_dots(SmallVectorImpl<char> &path, bool remove_dot_dot = false);
+
+} // end namespace path
+} // end namespace sys
+} // end namespace llvm
+
+#endif
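
A minimal usage sketch of the lexical helpers declared above, not part of the imported diff; it assumes the vendored header is reachable as "llvm/Support/Path.h" and the file names are made up. The queries are purely lexical and never touch the filesystem.

#include "llvm/ADT/SmallString.h"
#include "llvm/Support/Path.h"
#include <iostream>

int main() {
  using namespace llvm;
  SmallString<128> P("output/./contigs.tmp");

  // Lexical queries on the string only; no filesystem access happens here.
  std::cout << "filename:  " << sys::path::filename(P.str()).str() << "\n";
  std::cout << "stem:      " << sys::path::stem(P.str()).str() << "\n";
  std::cout << "extension: " << sys::path::extension(P.str()).str() << "\n";

  // In-place transforms on the SmallVectorImpl<char> buffer.
  sys::path::replace_extension(P, "fasta");  // a '.' is prepended when missing
  sys::path::remove_dots(P);                 // "output/./x" -> "output/x"
  std::cout << "result:    " << P.c_str() << "\n";
  return 0;
}
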
diff --git a/ext/include/llvm/Support/PointerLikeTypeTraits.h b/ext/include/llvm/Support/PointerLikeTypeTraits.h
new file mode 100644
index 0000000..c12d237
--- /dev/null
+++ b/ext/include/llvm/Support/PointerLikeTypeTraits.h
@@ -0,0 +1,92 @@
+//===- llvm/Support/PointerLikeTypeTraits.h - Pointer Traits ----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the PointerLikeTypeTraits class. This allows data
+// structures to reason about pointers and other things that are pointer sized.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_SUPPORT_POINTERLIKETYPETRAITS_H
+#define LLVM_SUPPORT_POINTERLIKETYPETRAITS_H
+
+#include "llvm/Support/AlignOf.h"
+#include "llvm/Support/DataTypes.h"
+
+namespace llvm {
+
+/// A traits type that is used to handle pointer types and things that are just
+/// wrappers for pointers as a uniform entity.
+template <typename T> class PointerLikeTypeTraits {
+ // getAsVoidPointer
+ // getFromVoidPointer
+ // getNumLowBitsAvailable
+};
+
+namespace detail {
+/// A tiny meta function to compute the log2 of a compile time constant.
+template <size_t N>
+struct ConstantLog2
+ : std::integral_constant<size_t, ConstantLog2<N / 2>::value + 1> {};
+template <> struct ConstantLog2<1> : std::integral_constant<size_t, 0> {};
+}
+
+// Provide PointerLikeTypeTraits for non-cvr pointers.
+template <typename T> struct PointerLikeTypeTraits<T *> {
+ static inline void *getAsVoidPointer(T *P) { return P; }
+ static inline T *getFromVoidPointer(void *P) { return static_cast<T *>(P); }
+
+ enum {
+ NumLowBitsAvailable = detail::ConstantLog2<AlignOf<T>::Alignment>::value
+ };
+};
+
+template <> struct PointerLikeTypeTraits<void *> {
+ static inline void *getAsVoidPointer(void *P) { return P; }
+ static inline void *getFromVoidPointer(void *P) { return P; }
+
+ /// Note, we assume here that void* is related to raw malloc'ed memory and
+ /// that malloc returns objects at least 4-byte aligned. However, this may be
+ /// wrong, or pointers may be from something other than malloc. In this case,
+ /// you should specify a real typed pointer or avoid this template.
+ ///
+ /// All clients should use assertions to do a run-time check to ensure that
+ /// this is actually true.
+ enum { NumLowBitsAvailable = 2 };
+};
+
+// Provide PointerLikeTypeTraits for const pointers.
+template <typename T> class PointerLikeTypeTraits<const T *> {
+ typedef PointerLikeTypeTraits<T *> NonConst;
+
+public:
+ static inline const void *getAsVoidPointer(const T *P) {
+ return NonConst::getAsVoidPointer(const_cast<T *>(P));
+ }
+ static inline const T *getFromVoidPointer(const void *P) {
+ return NonConst::getFromVoidPointer(const_cast<void *>(P));
+ }
+ enum { NumLowBitsAvailable = NonConst::NumLowBitsAvailable };
+};
+
+// Provide PointerLikeTypeTraits for uintptr_t.
+template <> class PointerLikeTypeTraits<uintptr_t> {
+public:
+ static inline void *getAsVoidPointer(uintptr_t P) {
+ return reinterpret_cast<void *>(P);
+ }
+ static inline uintptr_t getFromVoidPointer(void *P) {
+ return reinterpret_cast<uintptr_t>(P);
+ }
+ // No bits are available!
+ enum { NumLowBitsAvailable = 0 };
+};
+
+} // end namespace llvm
+
+#endif
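
An illustrative sketch of how these traits expose spare low bits; the Node type and the 3-bit expectation (log2 of an 8-byte alignment) are assumptions made for the example, not part of the patch.

#include "llvm/Support/PointerLikeTypeTraits.h"
#include <cassert>

struct alignas(8) Node { int payload; };

int main() {
  typedef llvm::PointerLikeTypeTraits<Node *> Traits;

  // ConstantLog2<8> == 3: the three low bits of any Node* are zero, so
  // containers such as PointerIntPair can reuse them as tag bits.
  static_assert(Traits::NumLowBitsAvailable == 3, "expected 3 spare bits");

  Node N = {42};
  void *Raw = Traits::getAsVoidPointer(&N);      // erase the type
  Node *Back = Traits::getFromVoidPointer(Raw);  // and recover it
  assert(Back == &N && Back->payload == 42);
  return 0;
}
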
diff --git a/ext/include/llvm/Support/Regex.h b/ext/include/llvm/Support/Regex.h
new file mode 100644
index 0000000..31b35ed
--- /dev/null
+++ b/ext/include/llvm/Support/Regex.h
@@ -0,0 +1,105 @@
+//===-- Regex.h - Regular Expression matcher implementation -*- C++ -*-----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a POSIX regular expression matcher. Both Basic and
+// Extended POSIX regular expressions (ERE) are supported. EREs were extended
+// to support backreferences in matches.
+// This implementation also supports matching strings with embedded NUL chars.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_SUPPORT_REGEX_H
+#define LLVM_SUPPORT_REGEX_H
+
+#include <string>
+
+struct llvm_regex;
+
+namespace llvm {
+ class StringRef;
+ template<typename T> class SmallVectorImpl;
+
+ class Regex {
+ public:
+ enum {
+ NoFlags=0,
+ /// Compile for matching that ignores upper/lower case distinctions.
+ IgnoreCase=1,
+ /// Compile for newline-sensitive matching. With this flag '[^' bracket
+ /// expressions and '.' never match newline. A ^ anchor matches the
+ /// null string after any newline in the string in addition to its normal
+ /// function, and the $ anchor matches the null string before any
+ /// newline in the string in addition to its normal function.
+ Newline=2,
+ /// By default, the POSIX extended regular expression (ERE) syntax is
+ /// assumed. Pass this flag to turn on basic regular expressions (BRE)
+ /// instead.
+ BasicRegex=4
+ };
+
+ /// Compiles the given regular expression \p Regex.
+ Regex(StringRef Regex, unsigned Flags = NoFlags);
+ Regex(const Regex &) = delete;
+ Regex &operator=(Regex regex) {
+ std::swap(preg, regex.preg);
+ std::swap(error, regex.error);
+ return *this;
+ }
+ Regex(Regex &®ex) {
+ preg = regex.preg;
+ error = regex.error;
+ regex.preg = nullptr;
+ }
+ ~Regex();
+
+  /// isValid - returns false and fills in \p Error if an error was encountered
+  /// during regex compilation or matching; returns true otherwise.
+ bool isValid(std::string &Error);
+
+ /// getNumMatches - In a valid regex, return the number of parenthesized
+ /// matches it contains. The number filled in by match will include this
+ /// many entries plus one for the whole regex (as element 0).
+ unsigned getNumMatches() const;
+
+ /// matches - Match the regex against a given \p String.
+ ///
+ /// \param Matches - If given, on a successful match this will be filled in
+ /// with references to the matched group expressions (inside \p String),
+ /// the first group is always the entire pattern.
+ ///
+ /// This returns true on a successful match.
+ bool match(StringRef String, SmallVectorImpl<StringRef> *Matches = nullptr);
+
+ /// sub - Return the result of replacing the first match of the regex in
+ /// \p String with the \p Repl string. Backreferences like "\0" in the
+ /// replacement string are replaced with the appropriate match substring.
+ ///
+ /// Note that the replacement string has backslash escaping performed on
+ /// it. Invalid backreferences are ignored (replaced by empty strings).
+ ///
+ /// \param Error If non-null, any errors in the substitution (invalid
+ /// backreferences, trailing backslashes) will be recorded as a non-empty
+ /// string.
+ std::string sub(StringRef Repl, StringRef String,
+ std::string *Error = nullptr);
+
+ /// \brief If this function returns true, ^Str$ is an extended regular
+ /// expression that matches Str and only Str.
+ static bool isLiteralERE(StringRef Str);
+
+ /// \brief Turn String into a regex by escaping its special characters.
+ static std::string escape(StringRef String);
+
+ private:
+ struct llvm_regex *preg;
+ int error;
+ };
+}
+
+#endif // LLVM_SUPPORT_REGEX_H
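
A small sketch of the API above, assuming the matching Regex implementation from this import is compiled and linked; the pattern and input strings are invented for illustration.

#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/Regex.h"
#include <iostream>
#include <string>

int main() {
  // ERE with one parenthesized group; flags such as IgnoreCase could be OR'ed in.
  llvm::Regex KmerLine("^k=([0-9]+)$");

  std::string Err;
  if (!KmerLine.isValid(Err)) {
    std::cerr << "bad pattern: " << Err << "\n";
    return 1;
  }

  llvm::SmallVector<llvm::StringRef, 2> Matches;
  if (KmerLine.match("k=55", &Matches))
    // Matches[0] is the whole match, Matches[1] the first group.
    std::cout << "k-mer size: " << Matches[1].str() << "\n";

  // sub() replaces the first match; "\1" backreferences the first group.
  std::cout << KmerLine.sub("k-mer size \\1", "k=21") << "\n";
  return 0;
}
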
diff --git a/ext/include/llvm/Support/SMLoc.h b/ext/include/llvm/Support/SMLoc.h
new file mode 100644
index 0000000..c6e9a14
--- /dev/null
+++ b/ext/include/llvm/Support/SMLoc.h
@@ -0,0 +1,63 @@
+//===- SMLoc.h - Source location for use with diagnostics -------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the SMLoc class. This class encapsulates a location in
+// source code for use in diagnostics.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_SUPPORT_SMLOC_H
+#define LLVM_SUPPORT_SMLOC_H
+
+#include <cassert>
+
+namespace llvm {
+
+/// Represents a location in source code.
+class SMLoc {
+ const char *Ptr;
+
+public:
+ SMLoc() : Ptr(nullptr) {}
+
+ bool isValid() const { return Ptr != nullptr; }
+
+ bool operator==(const SMLoc &RHS) const { return RHS.Ptr == Ptr; }
+ bool operator!=(const SMLoc &RHS) const { return RHS.Ptr != Ptr; }
+
+ const char *getPointer() const { return Ptr; }
+
+ static SMLoc getFromPointer(const char *Ptr) {
+ SMLoc L;
+ L.Ptr = Ptr;
+ return L;
+ }
+};
+
+/// Represents a range in source code.
+///
+/// SMRange is implemented using a half-open range, as is the convention in C++.
+/// In the string "abc", the range (1,3] represents the substring "bc", and the
+/// range (2,2] represents an empty range between the characters "b" and "c".
+class SMRange {
+public:
+ SMLoc Start, End;
+
+ SMRange() {}
+ SMRange(SMLoc St, SMLoc En) : Start(St), End(En) {
+ assert(Start.isValid() == End.isValid() &&
+ "Start and end should either both be valid or both be invalid!");
+ }
+
+ bool isValid() const { return Start.isValid(); }
+};
+
+} // end namespace llvm
+
+#endif
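
A header-only sketch of the two classes above; the buffer contents are arbitrary.

#include "llvm/ADT/StringRef.h"
#include "llvm/Support/SMLoc.h"
#include <cassert>

int main() {
  llvm::StringRef Buffer = "abc";

  // SMLoc is only a tagged pointer into some source buffer.
  llvm::SMLoc B = llvm::SMLoc::getFromPointer(Buffer.data() + 1);
  llvm::SMLoc E = llvm::SMLoc::getFromPointer(Buffer.data() + 3);
  assert(B.isValid() && E.isValid() && B != E);

  // Half-open range (1,3] covering the substring "bc" of "abc".
  llvm::SMRange R(B, E);
  assert(R.isValid());
  assert(R.End.getPointer() - R.Start.getPointer() == 2);
  return 0;
}
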
diff --git a/ext/include/llvm/Support/Signals.h b/ext/include/llvm/Support/Signals.h
new file mode 100644
index 0000000..2a4d84b
--- /dev/null
+++ b/ext/include/llvm/Support/Signals.h
@@ -0,0 +1,71 @@
+//===- llvm/Support/Signals.h - Signal Handling support ----------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines some helpful functions for dealing with the possibility of
+// unix signals occurring while your program is running.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_SUPPORT_SIGNALS_H
+#define LLVM_SUPPORT_SIGNALS_H
+
+#include <string>
+
+namespace llvm {
+class StringRef;
+class raw_ostream;
+
+namespace sys {
+
+ /// This function runs all the registered interrupt handlers, including the
+ /// removal of files registered by RemoveFileOnSignal.
+ void RunInterruptHandlers();
+
+ /// This function registers signal handlers to ensure that if a signal gets
+ /// delivered that the named file is removed.
+ /// @brief Remove a file if a fatal signal occurs.
+ bool RemoveFileOnSignal(StringRef Filename, std::string* ErrMsg = nullptr);
+
+ /// This function removes a file from the list of files to be removed on
+ /// signal delivery.
+ void DontRemoveFileOnSignal(StringRef Filename);
+
+  /// When an error signal (such as SIGABRT or SIGSEGV) is delivered to the
+ /// process, print a stack trace and then exit.
+ /// @brief Print a stack trace if a fatal signal occurs.
+ void PrintStackTraceOnErrorSignal(bool DisableCrashReporting = false);
+
+ /// Disable all system dialog boxes that appear when the process crashes.
+ void DisableSystemDialogsOnCrash();
+
+ /// \brief Print the stack trace using the given \c raw_ostream object.
+ void PrintStackTrace(raw_ostream &OS);
+
+ // Run all registered signal handlers.
+ void RunSignalHandlers();
+
+ /// AddSignalHandler - Add a function to be called when an abort/kill signal
+ /// is delivered to the process. The handler can have a cookie passed to it
+ /// to identify what instance of the handler it is.
+ void AddSignalHandler(void (*FnPtr)(void *), void *Cookie);
+
+ /// This function registers a function to be called when the user "interrupts"
+ /// the program (typically by pressing ctrl-c). When the user interrupts the
+ /// program, the specified interrupt function is called instead of the program
+  /// being killed, and the interrupt function is automatically disabled. Note
+  /// that interrupt functions are not allowed to call any non-reentrant
+  /// functions. A null interrupt function pointer disables the currently
+  /// installed function. Note also that the handler may be executed on a
+ /// different thread on some platforms.
+ /// @brief Register a function to be called when ctrl-c is pressed.
+ void SetInterruptFunction(void (*IF)());
+} // End sys namespace
+} // End llvm namespace
+
+#endif
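
A sketch of typical process setup with these hooks, assuming the corresponding Signals implementation is linked; the scratch-file name is hypothetical.

#include "llvm/ADT/StringRef.h"
#include "llvm/Support/Signals.h"
#include <cstdlib>
#include <string>

static void onInterrupt() {
  // Called instead of killing the process on ctrl-c; must stay reentrant-safe.
  std::_Exit(130);
}

int main() {
  // Print a stack trace instead of dying silently on SIGSEGV, SIGABRT, etc.
  llvm::sys::PrintStackTraceOnErrorSignal();

  // Delete a scratch file if a fatal signal arrives (hypothetical file name).
  std::string Err;
  llvm::sys::RemoveFileOnSignal("spades_scratch.lock", &Err);
  llvm::sys::SetInterruptFunction(onInterrupt);

  // ... do work ...

  llvm::sys::DontRemoveFileOnSignal("spades_scratch.lock");
  return 0;
}
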
diff --git a/ext/include/llvm/Support/SourceMgr.h b/ext/include/llvm/Support/SourceMgr.h
new file mode 100644
index 0000000..1f8b1a0
--- /dev/null
+++ b/ext/include/llvm/Support/SourceMgr.h
@@ -0,0 +1,285 @@
+//===- SourceMgr.h - Manager for Source Buffers & Diagnostics ---*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the SMDiagnostic and SourceMgr classes. This
+// provides a simple substrate for diagnostics, #include handling, and other low
+// level things for simple parsers.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_SUPPORT_SOURCEMGR_H
+#define LLVM_SUPPORT_SOURCEMGR_H
+
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/SMLoc.h"
+#include <string>
+
+namespace llvm {
+ class SourceMgr;
+ class SMDiagnostic;
+ class SMFixIt;
+ class Twine;
+ class raw_ostream;
+
+/// This owns the files read by a parser, handles include stacks,
+/// and handles diagnostic wrangling.
+class SourceMgr {
+public:
+ enum DiagKind {
+ DK_Error,
+ DK_Warning,
+ DK_Note
+ };
+
+ /// Clients that want to handle their own diagnostics in a custom way can
+ /// register a function pointer+context as a diagnostic handler.
+ /// It gets called each time PrintMessage is invoked.
+ typedef void (*DiagHandlerTy)(const SMDiagnostic &, void *Context);
+private:
+ struct SrcBuffer {
+ /// The memory buffer for the file.
+ std::unique_ptr<MemoryBuffer> Buffer;
+
+ /// This is the location of the parent include, or null if at the top level.
+ SMLoc IncludeLoc;
+
+ SrcBuffer() {}
+
+ SrcBuffer(SrcBuffer &&O)
+ : Buffer(std::move(O.Buffer)), IncludeLoc(O.IncludeLoc) {}
+ };
+
+ /// This is all of the buffers that we are reading from.
+ std::vector<SrcBuffer> Buffers;
+
+ // This is the list of directories we should search for include files in.
+ std::vector<std::string> IncludeDirectories;
+
+  /// This is a cache for line number queries; its implementation is really
+ /// private to SourceMgr.cpp.
+ mutable void *LineNoCache;
+
+ DiagHandlerTy DiagHandler;
+ void *DiagContext;
+
+ bool isValidBufferID(unsigned i) const { return i && i <= Buffers.size(); }
+
+ SourceMgr(const SourceMgr&) = delete;
+ void operator=(const SourceMgr&) = delete;
+public:
+ SourceMgr()
+ : LineNoCache(nullptr), DiagHandler(nullptr), DiagContext(nullptr) {}
+ ~SourceMgr();
+
+ void setIncludeDirs(const std::vector<std::string> &Dirs) {
+ IncludeDirectories = Dirs;
+ }
+
+ /// Specify a diagnostic handler to be invoked every time PrintMessage is
+ /// called. \p Ctx is passed into the handler when it is invoked.
+ void setDiagHandler(DiagHandlerTy DH, void *Ctx = nullptr) {
+ DiagHandler = DH;
+ DiagContext = Ctx;
+ }
+
+ DiagHandlerTy getDiagHandler() const { return DiagHandler; }
+ void *getDiagContext() const { return DiagContext; }
+
+ const SrcBuffer &getBufferInfo(unsigned i) const {
+ assert(isValidBufferID(i));
+ return Buffers[i - 1];
+ }
+
+ const MemoryBuffer *getMemoryBuffer(unsigned i) const {
+ assert(isValidBufferID(i));
+ return Buffers[i - 1].Buffer.get();
+ }
+
+ unsigned getNumBuffers() const {
+ return Buffers.size();
+ }
+
+ unsigned getMainFileID() const {
+ assert(getNumBuffers());
+ return 1;
+ }
+
+ SMLoc getParentIncludeLoc(unsigned i) const {
+ assert(isValidBufferID(i));
+ return Buffers[i - 1].IncludeLoc;
+ }
+
+ /// Add a new source buffer to this source manager. This takes ownership of
+ /// the memory buffer.
+ unsigned AddNewSourceBuffer(std::unique_ptr<MemoryBuffer> F,
+ SMLoc IncludeLoc) {
+ SrcBuffer NB;
+ NB.Buffer = std::move(F);
+ NB.IncludeLoc = IncludeLoc;
+ Buffers.push_back(std::move(NB));
+ return Buffers.size();
+ }
+
+ /// Search for a file with the specified name in the current directory or in
+ /// one of the IncludeDirs.
+ ///
+ /// If no file is found, this returns 0, otherwise it returns the buffer ID
+ /// of the stacked file. The full path to the included file can be found in
+ /// \p IncludedFile.
+ unsigned AddIncludeFile(const std::string &Filename, SMLoc IncludeLoc,
+ std::string &IncludedFile);
+
+ /// Return the ID of the buffer containing the specified location.
+ ///
+ /// 0 is returned if the buffer is not found.
+ unsigned FindBufferContainingLoc(SMLoc Loc) const;
+
+ /// Find the line number for the specified location in the specified file.
+ /// This is not a fast method.
+ unsigned FindLineNumber(SMLoc Loc, unsigned BufferID = 0) const {
+ return getLineAndColumn(Loc, BufferID).first;
+ }
+
+ /// Find the line and column number for the specified location in the
+ /// specified file. This is not a fast method.
+ std::pair<unsigned, unsigned> getLineAndColumn(SMLoc Loc,
+ unsigned BufferID = 0) const;
+
+ /// Emit a message about the specified location with the specified string.
+ ///
+ /// \param ShowColors Display colored messages if output is a terminal and
+ /// the default error handler is used.
+ void PrintMessage(raw_ostream &OS, SMLoc Loc, DiagKind Kind,
+ const Twine &Msg,
+ ArrayRef<SMRange> Ranges = None,
+ ArrayRef<SMFixIt> FixIts = None,
+ bool ShowColors = true) const;
+
+ /// Emits a diagnostic to llvm::errs().
+ void PrintMessage(SMLoc Loc, DiagKind Kind, const Twine &Msg,
+ ArrayRef<SMRange> Ranges = None,
+ ArrayRef<SMFixIt> FixIts = None,
+ bool ShowColors = true) const;
+
+ /// Emits a manually-constructed diagnostic to the given output stream.
+ ///
+ /// \param ShowColors Display colored messages if output is a terminal and
+ /// the default error handler is used.
+ void PrintMessage(raw_ostream &OS, const SMDiagnostic &Diagnostic,
+ bool ShowColors = true) const;
+
+ /// Return an SMDiagnostic at the specified location with the specified
+ /// string.
+ ///
+ /// \param Msg If non-null, the kind of message (e.g., "error") which is
+ /// prefixed to the message.
+ SMDiagnostic GetMessage(SMLoc Loc, DiagKind Kind, const Twine &Msg,
+ ArrayRef<SMRange> Ranges = None,
+ ArrayRef<SMFixIt> FixIts = None) const;
+
+ /// Prints the names of included files and the line of the file they were
+ /// included from. A diagnostic handler can use this before printing its
+ /// custom formatted message.
+ ///
+ /// \param IncludeLoc The location of the include.
+ /// \param OS the raw_ostream to print on.
+ void PrintIncludeStack(SMLoc IncludeLoc, raw_ostream &OS) const;
+};
+
+
+/// Represents a single fixit, a replacement of one range of text with another.
+class SMFixIt {
+ SMRange Range;
+
+ std::string Text;
+
+public:
+ // FIXME: Twine.str() is not very efficient.
+ SMFixIt(SMLoc Loc, const Twine &Insertion)
+ : Range(Loc, Loc), Text(Insertion.str()) {
+ assert(Loc.isValid());
+ }
+
+ // FIXME: Twine.str() is not very efficient.
+ SMFixIt(SMRange R, const Twine &Replacement)
+ : Range(R), Text(Replacement.str()) {
+ assert(R.isValid());
+ }
+
+ StringRef getText() const { return Text; }
+ SMRange getRange() const { return Range; }
+
+ bool operator<(const SMFixIt &Other) const {
+ if (Range.Start.getPointer() != Other.Range.Start.getPointer())
+ return Range.Start.getPointer() < Other.Range.Start.getPointer();
+ if (Range.End.getPointer() != Other.Range.End.getPointer())
+ return Range.End.getPointer() < Other.Range.End.getPointer();
+ return Text < Other.Text;
+ }
+};
+
+
+/// Instances of this class encapsulate one diagnostic report, allowing
+/// printing to a raw_ostream as a caret diagnostic.
+class SMDiagnostic {
+ const SourceMgr *SM;
+ SMLoc Loc;
+ std::string Filename;
+ int LineNo, ColumnNo;
+ SourceMgr::DiagKind Kind;
+ std::string Message, LineContents;
+ std::vector<std::pair<unsigned, unsigned> > Ranges;
+ SmallVector<SMFixIt, 4> FixIts;
+
+public:
+ // Null diagnostic.
+ SMDiagnostic()
+ : SM(nullptr), LineNo(0), ColumnNo(0), Kind(SourceMgr::DK_Error) {}
+ // Diagnostic with no location (e.g. file not found, command line arg error).
+ SMDiagnostic(StringRef filename, SourceMgr::DiagKind Knd, StringRef Msg)
+ : SM(nullptr), Filename(filename), LineNo(-1), ColumnNo(-1), Kind(Knd),
+ Message(Msg) {}
+
+ // Diagnostic with a location.
+ SMDiagnostic(const SourceMgr &sm, SMLoc L, StringRef FN,
+ int Line, int Col, SourceMgr::DiagKind Kind,
+ StringRef Msg, StringRef LineStr,
+ ArrayRef<std::pair<unsigned,unsigned> > Ranges,
+ ArrayRef<SMFixIt> FixIts = None);
+
+ const SourceMgr *getSourceMgr() const { return SM; }
+ SMLoc getLoc() const { return Loc; }
+ StringRef getFilename() const { return Filename; }
+ int getLineNo() const { return LineNo; }
+ int getColumnNo() const { return ColumnNo; }
+ SourceMgr::DiagKind getKind() const { return Kind; }
+ StringRef getMessage() const { return Message; }
+ StringRef getLineContents() const { return LineContents; }
+ ArrayRef<std::pair<unsigned, unsigned> > getRanges() const {
+ return Ranges;
+ }
+
+ void addFixIt(const SMFixIt &Hint) {
+ FixIts.push_back(Hint);
+ }
+
+ ArrayRef<SMFixIt> getFixIts() const {
+ return FixIts;
+ }
+
+ void print(const char *ProgName, raw_ostream &S, bool ShowColors = true,
+ bool ShowKindLabel = true) const;
+};
+
+} // end llvm namespace
+
+#endif
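
A sketch of a caret diagnostic driven by the class above; it assumes the vendored MemoryBuffer.h (included by this header) provides the usual MemoryBuffer::getMemBuffer factory and that the support sources are linked. The buffer text and name are invented.

#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/SourceMgr.h"

int main() {
  llvm::SourceMgr SM;

  // Wrap an in-memory string as the main file; SourceMgr takes ownership.
  unsigned BufID = SM.AddNewSourceBuffer(
      llvm::MemoryBuffer::getMemBuffer("k 55\nmode careful\n", "config"),
      llvm::SMLoc());

  // Point at the start of the second line and emit a warning at line 2, col 1.
  const llvm::MemoryBuffer *Buf = SM.getMemoryBuffer(BufID);
  llvm::SMLoc Loc = llvm::SMLoc::getFromPointer(Buf->getBufferStart() + 5);
  SM.PrintMessage(Loc, llvm::SourceMgr::DK_Warning, "unknown option 'mode'");
  return 0;
}
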
diff --git a/ext/include/llvm/Support/StringSaver.h b/ext/include/llvm/Support/StringSaver.h
new file mode 100644
index 0000000..38fb7bb
--- /dev/null
+++ b/ext/include/llvm/Support/StringSaver.h
@@ -0,0 +1,32 @@
+//===- llvm/Support/StringSaver.h -------------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_SUPPORT_STRINGSAVER_H
+#define LLVM_SUPPORT_STRINGSAVER_H
+
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/Support/Allocator.h"
+
+namespace llvm {
+
+/// \brief Saves strings in the inheritor's stable storage and returns a stable
+/// raw character pointer.
+class StringSaver final {
+ BumpPtrAllocator &Alloc;
+
+public:
+ StringSaver(BumpPtrAllocator &Alloc) : Alloc(Alloc) {}
+ const char *save(const char *S) { return save(StringRef(S)); }
+ const char *save(StringRef S);
+ const char *save(const Twine &S) { return save(StringRef(S.str())); }
+ const char *save(std::string &S) { return save(StringRef(S)); }
+};
+}
+#endif
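
A short sketch of the saver in use, assuming the out-of-line save(StringRef) definition from this import is linked and null-terminates the stored copy; the saved string is arbitrary.

#include "llvm/ADT/StringRef.h"
#include "llvm/Support/Allocator.h"
#include "llvm/Support/StringSaver.h"
#include <cassert>
#include <string>

int main() {
  llvm::BumpPtrAllocator Alloc;
  llvm::StringSaver Saver(Alloc);

  // The bytes are copied into the allocator, so the returned pointer stays
  // valid after the temporary std::string goes away.
  const char *Kept;
  {
    std::string Temp = "scaffolds.fasta";
    Kept = Saver.save(Temp);
  }
  assert(llvm::StringRef(Kept) == "scaffolds.fasta");
  return 0;
}
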
diff --git a/ext/include/llvm/Support/SwapByteOrder.h b/ext/include/llvm/Support/SwapByteOrder.h
new file mode 100644
index 0000000..c685ddb
--- /dev/null
+++ b/ext/include/llvm/Support/SwapByteOrder.h
@@ -0,0 +1,115 @@
+//===- SwapByteOrder.h - Generic and optimized byte swaps -------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares generic and optimized functions to swap the byte order of
+// an integral type.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_SUPPORT_SWAPBYTEORDER_H
+#define LLVM_SUPPORT_SWAPBYTEORDER_H
+
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/DataTypes.h"
+#include <cstddef>
+#include <limits>
+
+namespace llvm {
+namespace sys {
+
+/// SwapByteOrder_16 - This function returns a byte-swapped representation of
+/// the 16-bit argument.
+inline uint16_t SwapByteOrder_16(uint16_t value) {
+ uint16_t Hi = value << 8;
+ uint16_t Lo = value >> 8;
+ return Hi | Lo;
+}
+
+/// SwapByteOrder_32 - This function returns a byte-swapped representation of
+/// the 32-bit argument.
+inline uint32_t SwapByteOrder_32(uint32_t value) {
+#if defined(__llvm__) || (LLVM_GNUC_PREREQ(4, 3, 0) && !defined(__ICC))
+ return __builtin_bswap32(value);
+#else
+ uint32_t Byte0 = value & 0x000000FF;
+ uint32_t Byte1 = value & 0x0000FF00;
+ uint32_t Byte2 = value & 0x00FF0000;
+ uint32_t Byte3 = value & 0xFF000000;
+ return (Byte0 << 24) | (Byte1 << 8) | (Byte2 >> 8) | (Byte3 >> 24);
+#endif
+}
+
+/// SwapByteOrder_64 - This function returns a byte-swapped representation of
+/// the 64-bit argument.
+inline uint64_t SwapByteOrder_64(uint64_t value) {
+#if defined(__llvm__) || (LLVM_GNUC_PREREQ(4, 3, 0) && !defined(__ICC))
+ return __builtin_bswap64(value);
+#else
+ uint64_t Hi = SwapByteOrder_32(uint32_t(value));
+ uint32_t Lo = SwapByteOrder_32(uint32_t(value >> 32));
+ return (Hi << 32) | Lo;
+#endif
+}
+
+inline unsigned char getSwappedBytes(unsigned char C) { return C; }
+inline signed char getSwappedBytes(signed char C) { return C; }
+inline char getSwappedBytes(char C) { return C; }
+
+inline unsigned short getSwappedBytes(unsigned short C) { return SwapByteOrder_16(C); }
+inline signed short getSwappedBytes( signed short C) { return SwapByteOrder_16(C); }
+
+inline unsigned int getSwappedBytes(unsigned int C) { return SwapByteOrder_32(C); }
+inline signed int getSwappedBytes( signed int C) { return SwapByteOrder_32(C); }
+
+#if __LONG_MAX__ == __INT_MAX__
+inline unsigned long getSwappedBytes(unsigned long C) { return SwapByteOrder_32(C); }
+inline signed long getSwappedBytes( signed long C) { return SwapByteOrder_32(C); }
+#elif __LONG_MAX__ == __LONG_LONG_MAX__
+inline unsigned long getSwappedBytes(unsigned long C) { return SwapByteOrder_64(C); }
+inline signed long getSwappedBytes( signed long C) { return SwapByteOrder_64(C); }
+#else
+#error "Unknown long size!"
+#endif
+
+inline unsigned long long getSwappedBytes(unsigned long long C) {
+ return SwapByteOrder_64(C);
+}
+inline signed long long getSwappedBytes(signed long long C) {
+ return SwapByteOrder_64(C);
+}
+
+inline float getSwappedBytes(float C) {
+ union {
+ uint32_t i;
+ float f;
+ } in, out;
+ in.f = C;
+ out.i = SwapByteOrder_32(in.i);
+ return out.f;
+}
+
+inline double getSwappedBytes(double C) {
+ union {
+ uint64_t i;
+ double d;
+ } in, out;
+ in.d = C;
+ out.i = SwapByteOrder_64(in.i);
+ return out.d;
+}
+
+template<typename T>
+inline void swapByteOrder(T &Value) {
+ Value = getSwappedBytes(Value);
+}
+
+} // end namespace sys
+} // end namespace llvm
+
+#endif
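
These helpers are header-only, so a quick self-contained check is possible; the constants are arbitrary.

#include "llvm/Support/SwapByteOrder.h"
#include <cassert>
#include <cstdint>

int main() {
  // The fixed-width helpers return a swapped copy...
  assert(llvm::sys::SwapByteOrder_16(0x1234u) == 0x3412u);
  assert(llvm::sys::SwapByteOrder_32(0x11223344u) == 0x44332211u);
  assert(llvm::sys::getSwappedBytes(std::uint64_t(0x0102030405060708ull)) ==
         0x0807060504030201ull);

  // ...while swapByteOrder() flips a value in place.
  std::uint32_t Checksum = 0xDEADBEEFu;
  llvm::sys::swapByteOrder(Checksum);
  assert(Checksum == 0xEFBEADDEu);
  return 0;
}
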
diff --git a/ext/include/llvm/Support/UniqueLock.h b/ext/include/llvm/Support/UniqueLock.h
new file mode 100644
index 0000000..529284d
--- /dev/null
+++ b/ext/include/llvm/Support/UniqueLock.h
@@ -0,0 +1,67 @@
+//===-- Support/UniqueLock.h - Acquire/Release Mutex In Scope ---*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines a guard for a block of code that ensures a Mutex is locked
+// upon construction and released upon destruction.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_SUPPORT_UNIQUE_LOCK_H
+#define LLVM_SUPPORT_UNIQUE_LOCK_H
+
+#include "llvm/Support/Mutex.h"
+
+namespace llvm {
+ /// A pared-down imitation of std::unique_lock from C++11. Contrary to the
+ /// name, it's really more of a wrapper for a lock. It may or may not have
+ /// an associated mutex, which is guaranteed to be locked upon creation
+ /// and unlocked after destruction. unique_lock can also unlock the mutex
+ /// and re-lock it freely during its lifetime.
+ /// @brief Guard a section of code with a mutex.
+ template<typename MutexT>
+ class unique_lock {
+ MutexT *M;
+ bool locked;
+
+ unique_lock(const unique_lock &) = delete;
+ void operator=(const unique_lock &) = delete;
+ public:
+ unique_lock() : M(nullptr), locked(false) {}
+ explicit unique_lock(MutexT &m) : M(&m), locked(true) { M->lock(); }
+
+ void operator=(unique_lock &&o) {
+ if (owns_lock())
+ M->unlock();
+ M = o.M;
+ locked = o.locked;
+ o.M = nullptr;
+ o.locked = false;
+ }
+
+ ~unique_lock() { if (owns_lock()) M->unlock(); }
+
+ void lock() {
+ assert(!locked && "mutex already locked!");
+ assert(M && "no associated mutex!");
+ M->lock();
+ locked = true;
+ }
+
+ void unlock() {
+ assert(locked && "unlocking a mutex that isn't locked!");
+ assert(M && "no associated mutex!");
+ M->unlock();
+ locked = false;
+ }
+
+ bool owns_lock() { return locked; }
+ };
+}
+
+#endif // LLVM_SUPPORT_UNIQUE_LOCK_H
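
A sketch of the guard's lifetime rules; std::mutex stands in for MutexT here purely because any type with lock()/unlock() works, and llvm::sys::Mutex from the included Mutex.h would do equally well.

#include "llvm/Support/UniqueLock.h"
#include <cassert>
#include <mutex>

int main() {
  std::mutex M;
  {
    llvm::unique_lock<std::mutex> Guard(M);  // locked on construction
    assert(Guard.owns_lock());
    Guard.unlock();                          // may be released early...
    Guard.lock();                            // ...and re-acquired
  }                                          // released again at scope exit

  M.lock();   // succeeds immediately because the guard let go of the mutex
  M.unlock();
  return 0;
}
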
diff --git a/ext/include/llvm/Support/YAMLParser.h b/ext/include/llvm/Support/YAMLParser.h
new file mode 100644
index 0000000..a5addfa
--- /dev/null
+++ b/ext/include/llvm/Support/YAMLParser.h
@@ -0,0 +1,601 @@
+//===--- YAMLParser.h - Simple YAML parser --------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This is a YAML 1.2 parser.
+//
+// See http://www.yaml.org/spec/1.2/spec.html for the full standard.
+//
+// This currently does not implement the following:
+// * Multi-line literal folding.
+// * Tag resolution.
+// * UTF-16.
+// * BOMs anywhere other than the first Unicode scalar value in the file.
+//
+// The most important class here is Stream. This represents a YAML stream with
+// 0, 1, or many documents.
+//
+// SourceMgr sm;
+// StringRef input = getInput();
+// yaml::Stream stream(input, sm);
+//
+// for (yaml::document_iterator di = stream.begin(), de = stream.end();
+// di != de; ++di) {
+// yaml::Node *n = di->getRoot();
+// if (n) {
+// // Do something with n...
+// } else
+// break;
+// }
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_SUPPORT_YAMLPARSER_H
+#define LLVM_SUPPORT_YAMLPARSER_H
+
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/Allocator.h"
+#include "llvm/Support/SMLoc.h"
+#include <limits>
+#include <map>
+#include <utility>
+
+namespace llvm {
+class MemoryBufferRef;
+class SourceMgr;
+class Twine;
+class raw_ostream;
+
+namespace yaml {
+
+class document_iterator;
+class Document;
+class Node;
+class Scanner;
+struct Token;
+
+/// \brief Dump all the tokens in this stream to OS.
+/// \returns true if there was an error, false otherwise.
+bool dumpTokens(StringRef Input, raw_ostream &);
+
+/// \brief Scans all tokens in input without outputting anything. This is used
+/// for benchmarking the tokenizer.
+/// \returns true if there was an error, false otherwise.
+bool scanTokens(StringRef Input);
+
+/// \brief Escape \a Input for a double quoted scalar.
+std::string escape(StringRef Input);
+
+/// \brief This class represents a YAML stream potentially containing multiple
+/// documents.
+class Stream {
+public:
+ /// \brief This keeps a reference to the string referenced by \p Input.
+ Stream(StringRef Input, SourceMgr &, bool ShowColors = true);
+
+ Stream(MemoryBufferRef InputBuffer, SourceMgr &, bool ShowColors = true);
+ ~Stream();
+
+ document_iterator begin();
+ document_iterator end();
+ void skip();
+ bool failed();
+ bool validate() {
+ skip();
+ return !failed();
+ }
+
+ void printError(Node *N, const Twine &Msg);
+
+private:
+ std::unique_ptr<Scanner> scanner;
+ std::unique_ptr<Document> CurrentDoc;
+
+ friend class Document;
+};
+
+/// \brief Abstract base class for all Nodes.
+class Node {
+ virtual void anchor();
+
+public:
+ enum NodeKind {
+ NK_Null,
+ NK_Scalar,
+ NK_BlockScalar,
+ NK_KeyValue,
+ NK_Mapping,
+ NK_Sequence,
+ NK_Alias
+ };
+
+ Node(unsigned int Type, std::unique_ptr<Document> &, StringRef Anchor,
+ StringRef Tag);
+
+ /// \brief Get the value of the anchor attached to this node. If it does not
+ /// have one, getAnchor().size() will be 0.
+ StringRef getAnchor() const { return Anchor; }
+
+ /// \brief Get the tag as it was written in the document. This does not
+ /// perform tag resolution.
+ StringRef getRawTag() const { return Tag; }
+
+  /// \brief Get the verbatim tag for a given Node. This performs tag resolution
+ /// and substitution.
+ std::string getVerbatimTag() const;
+
+ SMRange getSourceRange() const { return SourceRange; }
+ void setSourceRange(SMRange SR) { SourceRange = SR; }
+
+ // These functions forward to Document and Scanner.
+ Token &peekNext();
+ Token getNext();
+ Node *parseBlockNode();
+ BumpPtrAllocator &getAllocator();
+ void setError(const Twine &Message, Token &Location) const;
+ bool failed() const;
+
+ virtual void skip() {}
+
+ unsigned int getType() const { return TypeID; }
+
+ void *operator new(size_t Size, BumpPtrAllocator &Alloc,
+ size_t Alignment = 16) LLVM_NOEXCEPT {
+ return Alloc.Allocate(Size, Alignment);
+ }
+
+ void operator delete(void *Ptr, BumpPtrAllocator &Alloc,
+ size_t Size) LLVM_NOEXCEPT {
+ Alloc.Deallocate(Ptr, Size);
+ }
+
+protected:
+ std::unique_ptr<Document> &Doc;
+ SMRange SourceRange;
+
+ void operator delete(void *) LLVM_NOEXCEPT = delete;
+
+ ~Node() = default;
+
+private:
+ unsigned int TypeID;
+ StringRef Anchor;
+ /// \brief The tag as typed in the document.
+ StringRef Tag;
+};
+
+/// \brief A null value.
+///
+/// Example:
+/// !!null null
+class NullNode final : public Node {
+ void anchor() override;
+
+public:
+ NullNode(std::unique_ptr<Document> &D)
+ : Node(NK_Null, D, StringRef(), StringRef()) {}
+
+ static inline bool classof(const Node *N) { return N->getType() == NK_Null; }
+};
+
+/// \brief A scalar node is an opaque datum that can be presented as a
+/// series of zero or more Unicode scalar values.
+///
+/// Example:
+/// Adena
+class ScalarNode final : public Node {
+ void anchor() override;
+
+public:
+ ScalarNode(std::unique_ptr<Document> &D, StringRef Anchor, StringRef Tag,
+ StringRef Val)
+ : Node(NK_Scalar, D, Anchor, Tag), Value(Val) {
+ SMLoc Start = SMLoc::getFromPointer(Val.begin());
+ SMLoc End = SMLoc::getFromPointer(Val.end());
+ SourceRange = SMRange(Start, End);
+ }
+
+  // Return Value without any escaping, folding or other YAML processing. These
+  // are the exact bytes that are contained in the file (after conversion to
+  // UTF-8).
+ StringRef getRawValue() const { return Value; }
+
+ /// \brief Gets the value of this node as a StringRef.
+ ///
+ /// \param Storage is used to store the content of the returned StringRef iff
+ /// it requires any modification from how it appeared in the source.
+ /// This happens with escaped characters and multi-line literals.
+ StringRef getValue(SmallVectorImpl<char> &Storage) const;
+
+ static inline bool classof(const Node *N) {
+ return N->getType() == NK_Scalar;
+ }
+
+private:
+ StringRef Value;
+
+ StringRef unescapeDoubleQuoted(StringRef UnquotedValue,
+ StringRef::size_type Start,
+ SmallVectorImpl<char> &Storage) const;
+};
+
+/// \brief A block scalar node is an opaque datum that can be presented as a
+/// series of zero or more Unicode scalar values.
+///
+/// Example:
+/// |
+/// Hello
+/// World
+class BlockScalarNode final : public Node {
+ void anchor() override;
+
+public:
+ BlockScalarNode(std::unique_ptr<Document> &D, StringRef Anchor, StringRef Tag,
+ StringRef Value, StringRef RawVal)
+ : Node(NK_BlockScalar, D, Anchor, Tag), Value(Value) {
+ SMLoc Start = SMLoc::getFromPointer(RawVal.begin());
+ SMLoc End = SMLoc::getFromPointer(RawVal.end());
+ SourceRange = SMRange(Start, End);
+ }
+
+ /// \brief Gets the value of this node as a StringRef.
+ StringRef getValue() const { return Value; }
+
+ static inline bool classof(const Node *N) {
+ return N->getType() == NK_BlockScalar;
+ }
+
+private:
+ StringRef Value;
+};
+
+/// \brief A key and value pair. While not technically a Node under the YAML
+/// representation graph, it is easier to treat them this way.
+///
+/// TODO: Consider making this not a child of Node.
+///
+/// Example:
+/// Section: .text
+class KeyValueNode final : public Node {
+ void anchor() override;
+
+public:
+ KeyValueNode(std::unique_ptr<Document> &D)
+ : Node(NK_KeyValue, D, StringRef(), StringRef()), Key(nullptr),
+ Value(nullptr) {}
+
+ /// \brief Parse and return the key.
+ ///
+ /// This may be called multiple times.
+ ///
+ /// \returns The key, or nullptr if failed() == true.
+ Node *getKey();
+
+ /// \brief Parse and return the value.
+ ///
+ /// This may be called multiple times.
+ ///
+ /// \returns The value, or nullptr if failed() == true.
+ Node *getValue();
+
+ void skip() override {
+ getKey()->skip();
+ if (Node *Val = getValue())
+ Val->skip();
+ }
+
+ static inline bool classof(const Node *N) {
+ return N->getType() == NK_KeyValue;
+ }
+
+private:
+ Node *Key;
+ Node *Value;
+};
+
+/// \brief This is an iterator abstraction over YAML collections shared by both
+/// sequences and maps.
+///
+/// BaseT must have a ValueT* member named CurrentEntry and a member function
+/// increment() which must set CurrentEntry to 0 to create an end iterator.
+template <class BaseT, class ValueT>
+class basic_collection_iterator
+ : public std::iterator<std::input_iterator_tag, ValueT> {
+public:
+ basic_collection_iterator() : Base(nullptr) {}
+ basic_collection_iterator(BaseT *B) : Base(B) {}
+
+ ValueT *operator->() const {
+ assert(Base && Base->CurrentEntry && "Attempted to access end iterator!");
+ return Base->CurrentEntry;
+ }
+
+ ValueT &operator*() const {
+ assert(Base && Base->CurrentEntry &&
+ "Attempted to dereference end iterator!");
+ return *Base->CurrentEntry;
+ }
+
+ operator ValueT *() const {
+ assert(Base && Base->CurrentEntry && "Attempted to access end iterator!");
+ return Base->CurrentEntry;
+ }
+
+ /// Note on EqualityComparable:
+ ///
+  /// The iterator is not re-entrant; it is meant to be used for parsing
+  /// YAML on demand.
+  /// Once iteration has started, it can point to only one entry at a time;
+  /// hence Base.CurrentEntry and Other.Base.CurrentEntry are equal
+  /// iff Base and Other.Base are equal.
+ bool operator==(const basic_collection_iterator &Other) const {
+ if (Base && (Base == Other.Base)) {
+ assert((Base->CurrentEntry == Other.Base->CurrentEntry)
+ && "Equal Bases expected to point to equal Entries");
+ }
+
+ return Base == Other.Base;
+ }
+
+ bool operator!=(const basic_collection_iterator &Other) const {
+ return !(Base == Other.Base);
+ }
+
+ basic_collection_iterator &operator++() {
+ assert(Base && "Attempted to advance iterator past end!");
+ Base->increment();
+ // Create an end iterator.
+ if (!Base->CurrentEntry)
+ Base = nullptr;
+ return *this;
+ }
+
+private:
+ BaseT *Base;
+};
+
+// The following two templates are used for both MappingNode and Sequence Node.
+template <class CollectionType>
+typename CollectionType::iterator begin(CollectionType &C) {
+ assert(C.IsAtBeginning && "You may only iterate over a collection once!");
+ C.IsAtBeginning = false;
+ typename CollectionType::iterator ret(&C);
+ ++ret;
+ return ret;
+}
+
+template <class CollectionType> void skip(CollectionType &C) {
+ // TODO: support skipping from the middle of a parsed collection ;/
+ assert((C.IsAtBeginning || C.IsAtEnd) && "Cannot skip mid parse!");
+ if (C.IsAtBeginning)
+ for (typename CollectionType::iterator i = begin(C), e = C.end(); i != e;
+ ++i)
+ i->skip();
+}
+
+/// \brief Represents a YAML map created from either a block map or a flow map.
+///
+/// This parses the YAML stream as increment() is called.
+///
+/// Example:
+/// Name: _main
+/// Scope: Global
+class MappingNode final : public Node {
+ void anchor() override;
+
+public:
+ enum MappingType {
+ MT_Block,
+ MT_Flow,
+ MT_Inline ///< An inline mapping node is used for "[key: value]".
+ };
+
+ MappingNode(std::unique_ptr<Document> &D, StringRef Anchor, StringRef Tag,
+ MappingType MT)
+ : Node(NK_Mapping, D, Anchor, Tag), Type(MT), IsAtBeginning(true),
+ IsAtEnd(false), CurrentEntry(nullptr) {}
+
+ friend class basic_collection_iterator<MappingNode, KeyValueNode>;
+ typedef basic_collection_iterator<MappingNode, KeyValueNode> iterator;
+ template <class T> friend typename T::iterator yaml::begin(T &);
+ template <class T> friend void yaml::skip(T &);
+
+ iterator begin() { return yaml::begin(*this); }
+
+ iterator end() { return iterator(); }
+
+ void skip() override { yaml::skip(*this); }
+
+ static inline bool classof(const Node *N) {
+ return N->getType() == NK_Mapping;
+ }
+
+private:
+ MappingType Type;
+ bool IsAtBeginning;
+ bool IsAtEnd;
+ KeyValueNode *CurrentEntry;
+
+ void increment();
+};
+
+/// \brief Represents a YAML sequence created from either a block sequence or a
+/// flow sequence.
+///
+/// This parses the YAML stream as increment() is called.
+///
+/// Example:
+/// - Hello
+/// - World
+class SequenceNode final : public Node {
+ void anchor() override;
+
+public:
+ enum SequenceType {
+ ST_Block,
+ ST_Flow,
+ // Use for:
+ //
+ // key:
+ // - val1
+ // - val2
+ //
+ // As a BlockMappingEntry and BlockEnd are not created in this case.
+ ST_Indentless
+ };
+
+ SequenceNode(std::unique_ptr<Document> &D, StringRef Anchor, StringRef Tag,
+ SequenceType ST)
+ : Node(NK_Sequence, D, Anchor, Tag), SeqType(ST), IsAtBeginning(true),
+ IsAtEnd(false),
+ WasPreviousTokenFlowEntry(true), // Start with an imaginary ','.
+ CurrentEntry(nullptr) {}
+
+ friend class basic_collection_iterator<SequenceNode, Node>;
+ typedef basic_collection_iterator<SequenceNode, Node> iterator;
+ template <class T> friend typename T::iterator yaml::begin(T &);
+ template <class T> friend void yaml::skip(T &);
+
+ void increment();
+
+ iterator begin() { return yaml::begin(*this); }
+
+ iterator end() { return iterator(); }
+
+ void skip() override { yaml::skip(*this); }
+
+ static inline bool classof(const Node *N) {
+ return N->getType() == NK_Sequence;
+ }
+
+private:
+ SequenceType SeqType;
+ bool IsAtBeginning;
+ bool IsAtEnd;
+ bool WasPreviousTokenFlowEntry;
+ Node *CurrentEntry;
+};
+
+/// \brief Represents an alias to a Node with an anchor.
+///
+/// Example:
+/// *AnchorName
+class AliasNode final : public Node {
+ void anchor() override;
+
+public:
+ AliasNode(std::unique_ptr<Document> &D, StringRef Val)
+ : Node(NK_Alias, D, StringRef(), StringRef()), Name(Val) {}
+
+ StringRef getName() const { return Name; }
+ Node *getTarget();
+
+ static inline bool classof(const Node *N) { return N->getType() == NK_Alias; }
+
+private:
+ StringRef Name;
+};
+
+/// \brief A YAML Stream is a sequence of Documents. A document contains a root
+/// node.
+class Document {
+public:
+ /// \brief Root for parsing a node. Returns a single node.
+ Node *parseBlockNode();
+
+ Document(Stream &ParentStream);
+
+ /// \brief Finish parsing the current document and return true if there are
+ /// more. Return false otherwise.
+ bool skip();
+
+ /// \brief Parse and return the root level node.
+ Node *getRoot() {
+ if (Root)
+ return Root;
+ return Root = parseBlockNode();
+ }
+
+ const std::map<StringRef, StringRef> &getTagMap() const { return TagMap; }
+
+private:
+ friend class Node;
+ friend class document_iterator;
+
+ /// \brief Stream to read tokens from.
+ Stream &stream;
+
+  /// \brief Allocator that nodes are allocated from. All nodes are destroyed
+  /// without calling their destructor when the document is destroyed.
+ BumpPtrAllocator NodeAllocator;
+
+ /// \brief The root node. Used to support skipping a partially parsed
+ /// document.
+ Node *Root;
+
+ /// \brief Maps tag prefixes to their expansion.
+ std::map<StringRef, StringRef> TagMap;
+
+ Token &peekNext();
+ Token getNext();
+ void setError(const Twine &Message, Token &Location) const;
+ bool failed() const;
+
+ /// \brief Parse %BLAH directives and return true if any were encountered.
+ bool parseDirectives();
+
+ /// \brief Parse %YAML
+ void parseYAMLDirective();
+
+ /// \brief Parse %TAG
+ void parseTAGDirective();
+
+ /// \brief Consume the next token and error if it is not \a TK.
+ bool expectToken(int TK);
+};
+
+/// \brief Iterator abstraction for Documents over a Stream.
+class document_iterator {
+public:
+ document_iterator() : Doc(nullptr) {}
+ document_iterator(std::unique_ptr<Document> &D) : Doc(&D) {}
+
+ bool operator==(const document_iterator &Other) {
+ if (isAtEnd() || Other.isAtEnd())
+ return isAtEnd() && Other.isAtEnd();
+
+ return Doc == Other.Doc;
+ }
+ bool operator!=(const document_iterator &Other) { return !(*this == Other); }
+
+ document_iterator operator++() {
+ assert(Doc && "incrementing iterator past the end.");
+ if (!(*Doc)->skip()) {
+ Doc->reset(nullptr);
+ } else {
+ Stream &S = (*Doc)->stream;
+ Doc->reset(new Document(S));
+ }
+ return *this;
+ }
+
+ Document &operator*() { return *Doc->get(); }
+
+ std::unique_ptr<Document> &operator->() { return *Doc; }
+
+private:
+ bool isAtEnd() const { return !Doc || !*Doc; }
+
+ std::unique_ptr<Document> *Doc;
+};
+
+} // End namespace yaml.
+
+} // End namespace llvm.
+
+#endif
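
A sketch that walks a top-level mapping with the classes above, assuming the matching YAMLParser and SourceMgr implementations from this import are compiled and linked; the two-key document is invented.

#include "llvm/ADT/SmallString.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/SourceMgr.h"
#include "llvm/Support/YAMLParser.h"
#include <iostream>

int main() {
  llvm::SourceMgr SM;
  llvm::yaml::Stream YAMLStream("k: 55\nmode: careful\n", SM);

  for (llvm::yaml::Document &Doc : YAMLStream) {
    auto *Map = llvm::dyn_cast_or_null<llvm::yaml::MappingNode>(Doc.getRoot());
    if (!Map)
      break;
    // A MappingNode may be iterated exactly once; each entry is a KeyValueNode.
    for (llvm::yaml::KeyValueNode &KV : *Map) {
      llvm::SmallString<16> KeyStorage, ValueStorage;
      auto *Key = llvm::dyn_cast_or_null<llvm::yaml::ScalarNode>(KV.getKey());
      auto *Value = llvm::dyn_cast_or_null<llvm::yaml::ScalarNode>(KV.getValue());
      if (Key && Value)
        std::cout << Key->getValue(KeyStorage).str() << " = "
                  << Value->getValue(ValueStorage).str() << "\n";
    }
  }
  return YAMLStream.failed() ? 1 : 0;
}
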
diff --git a/ext/include/llvm/Support/YAMLTraits.h b/ext/include/llvm/Support/YAMLTraits.h
new file mode 100644
index 0000000..9c1a0db
--- /dev/null
+++ b/ext/include/llvm/Support/YAMLTraits.h
@@ -0,0 +1,1446 @@
+//===- llvm/Support/YAMLTraits.h --------------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_SUPPORT_YAMLTRAITS_H
+#define LLVM_SUPPORT_YAMLTRAITS_H
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseMapInfo.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Regex.h"
+#include "llvm/Support/SourceMgr.h"
+#include "llvm/Support/YAMLParser.h"
+#include "llvm/Support/raw_ostream.h"
+#include <system_error>
+
+namespace llvm {
+namespace yaml {
+
+/// This class should be specialized by any type that needs to be converted
+/// to/from a YAML mapping. For example:
+///
+/// struct MappingTraits<MyStruct> {
+/// static void mapping(IO &io, MyStruct &s) {
+/// io.mapRequired("name", s.name);
+/// io.mapRequired("size", s.size);
+/// io.mapOptional("age", s.age);
+/// }
+/// };
+template<class T>
+struct MappingTraits {
+ // Must provide:
+ // static void mapping(IO &io, T &fields);
+ // Optionally may provide:
+ // static StringRef validate(IO &io, T &fields);
+ //
+ // The optional flow flag will cause generated YAML to use a flow mapping
+ // (e.g. { a: 0, b: 1 }):
+ // static const bool flow = true;
+};
+
+/// This class should be specialized by any integral type that converts
+/// to/from a YAML scalar where there is a one-to-one mapping between
+/// in-memory values and a string in YAML. For example:
+///
+/// struct ScalarEnumerationTraits<Colors> {
+/// static void enumeration(IO &io, Colors &value) {
+/// io.enumCase(value, "red", cRed);
+/// io.enumCase(value, "blue", cBlue);
+/// io.enumCase(value, "green", cGreen);
+/// }
+/// };
+template<typename T>
+struct ScalarEnumerationTraits {
+ // Must provide:
+ // static void enumeration(IO &io, T &value);
+};
+
+/// This class should be specialized by any integer type that is a union
+/// of bit values and the YAML representation is a flow sequence of
+/// strings. For example:
+///
+/// struct ScalarBitSetTraits<MyFlags> {
+/// static void bitset(IO &io, MyFlags &value) {
+/// io.bitSetCase(value, "big", flagBig);
+/// io.bitSetCase(value, "flat", flagFlat);
+/// io.bitSetCase(value, "round", flagRound);
+/// }
+/// };
+template<typename T>
+struct ScalarBitSetTraits {
+ // Must provide:
+ // static void bitset(IO &io, T &value);
+};
+
+/// This class should be specialized by any type that requires custom conversion
+/// to/from a YAML scalar. For example:
+///
+/// template<>
+/// struct ScalarTraits<MyType> {
+/// static void output(const MyType &val, void*, llvm::raw_ostream &out) {
+/// // stream out custom formatting
+/// out << llvm::format("%x", val);
+/// }
+/// static StringRef input(StringRef scalar, void*, MyType &value) {
+/// // parse scalar and set `value`
+/// // return empty string on success, or error string
+/// return StringRef();
+/// }
+/// static bool mustQuote(StringRef) { return true; }
+/// };
+template<typename T>
+struct ScalarTraits {
+ // Must provide:
+ //
+ // Function to write the value as a string:
+ //static void output(const T &value, void *ctxt, llvm::raw_ostream &out);
+ //
+ // Function to convert a string to a value. Returns the empty
+ // StringRef on success or an error string if string is malformed:
+ //static StringRef input(StringRef scalar, void *ctxt, T &value);
+ //
+ // Function to determine if the value should be quoted.
+ //static bool mustQuote(StringRef);
+};
+
+
+/// This class should be specialized by any type that requires custom conversion
+/// to/from a YAML literal block scalar. For example:
+///
+/// template <>
+/// struct BlockScalarTraits<MyType> {
+/// static void output(const MyType &Value, void*, llvm::raw_ostream &Out)
+/// {
+/// // stream out custom formatting
+///     Out << Value;
+/// }
+/// static StringRef input(StringRef Scalar, void*, MyType &Value) {
+/// // parse scalar and set `value`
+/// // return empty string on success, or error string
+/// return StringRef();
+/// }
+/// };
+template <typename T>
+struct BlockScalarTraits {
+ // Must provide:
+ //
+ // Function to write the value as a string:
+ // static void output(const T &Value, void *ctx, llvm::raw_ostream &Out);
+ //
+ // Function to convert a string to a value. Returns the empty
+ // StringRef on success or an error string if string is malformed:
+ // static StringRef input(StringRef Scalar, void *ctxt, T &Value);
+};
+
+/// This class should be specialized by any type that needs to be converted
+/// to/from a YAML sequence. For example:
+///
+/// template<>
+/// struct SequenceTraits< std::vector<MyType> > {
+/// static size_t size(IO &io, std::vector<MyType> &seq) {
+/// return seq.size();
+/// }
+/// static MyType& element(IO &, std::vector<MyType> &seq, size_t index) {
+/// if ( index >= seq.size() )
+/// seq.resize(index+1);
+/// return seq[index];
+/// }
+/// };
+template<typename T>
+struct SequenceTraits {
+ // Must provide:
+ // static size_t size(IO &io, T &seq);
+ // static T::value_type& element(IO &io, T &seq, size_t index);
+ //
+  // The following is optional and will cause generated YAML to use
+ // a flow sequence (e.g. [a,b,c]).
+ // static const bool flow = true;
+};
+
+/// This class should be specialized by any type that needs to be converted
+/// to/from a list of YAML documents.
+template<typename T>
+struct DocumentListTraits {
+ // Must provide:
+ // static size_t size(IO &io, T &seq);
+ // static T::value_type& element(IO &io, T &seq, size_t index);
+};
+
+// Only used by the compiler if both template types are the same
+template <typename T, T>
+struct SameType;
+
+// Only used for better diagnostics of missing traits
+template <typename T>
+struct MissingTrait;
+
+// Test if ScalarEnumerationTraits<T> is defined on type T.
+template <class T>
+struct has_ScalarEnumerationTraits
+{
+ typedef void (*Signature_enumeration)(class IO&, T&);
+
+ template <typename U>
+ static char test(SameType<Signature_enumeration, &U::enumeration>*);
+
+ template <typename U>
+ static double test(...);
+
+public:
+ static bool const value =
+ (sizeof(test<ScalarEnumerationTraits<T> >(nullptr)) == 1);
+};
+
+// Test if ScalarBitSetTraits<T> is defined on type T.
+template <class T>
+struct has_ScalarBitSetTraits
+{
+ typedef void (*Signature_bitset)(class IO&, T&);
+
+ template <typename U>
+ static char test(SameType<Signature_bitset, &U::bitset>*);
+
+ template <typename U>
+ static double test(...);
+
+public:
+ static bool const value = (sizeof(test<ScalarBitSetTraits<T> >(nullptr)) == 1);
+};
+
+// Test if ScalarTraits<T> is defined on type T.
+template <class T>
+struct has_ScalarTraits
+{
+ typedef StringRef (*Signature_input)(StringRef, void*, T&);
+ typedef void (*Signature_output)(const T&, void*, llvm::raw_ostream&);
+ typedef bool (*Signature_mustQuote)(StringRef);
+
+ template <typename U>
+ static char test(SameType<Signature_input, &U::input> *,
+ SameType<Signature_output, &U::output> *,
+ SameType<Signature_mustQuote, &U::mustQuote> *);
+
+ template <typename U>
+ static double test(...);
+
+public:
+ static bool const value =
+ (sizeof(test<ScalarTraits<T>>(nullptr, nullptr, nullptr)) == 1);
+};
+
+// Test if BlockScalarTraits<T> is defined on type T.
+template <class T>
+struct has_BlockScalarTraits
+{
+ typedef StringRef (*Signature_input)(StringRef, void *, T &);
+ typedef void (*Signature_output)(const T &, void *, llvm::raw_ostream &);
+
+ template <typename U>
+ static char test(SameType<Signature_input, &U::input> *,
+ SameType<Signature_output, &U::output> *);
+
+ template <typename U>
+ static double test(...);
+
+public:
+ static bool const value =
+ (sizeof(test<BlockScalarTraits<T>>(nullptr, nullptr)) == 1);
+};
+
+// Test if MappingTraits<T> is defined on type T.
+template <class T>
+struct has_MappingTraits
+{
+ typedef void (*Signature_mapping)(class IO&, T&);
+
+ template <typename U>
+ static char test(SameType<Signature_mapping, &U::mapping>*);
+
+ template <typename U>
+ static double test(...);
+
+public:
+ static bool const value = (sizeof(test<MappingTraits<T> >(nullptr)) == 1);
+};
+
+// Test if MappingTraits<T>::validate() is defined on type T.
+template <class T>
+struct has_MappingValidateTraits
+{
+ typedef StringRef (*Signature_validate)(class IO&, T&);
+
+ template <typename U>
+ static char test(SameType<Signature_validate, &U::validate>*);
+
+ template <typename U>
+ static double test(...);
+
+public:
+ static bool const value = (sizeof(test<MappingTraits<T> >(nullptr)) == 1);
+};
+
+// Test if SequenceTraits<T> is defined on type T.
+template <class T>
+struct has_SequenceMethodTraits
+{
+ typedef size_t (*Signature_size)(class IO&, T&);
+
+ template <typename U>
+ static char test(SameType<Signature_size, &U::size>*);
+
+ template <typename U>
+ static double test(...);
+
+public:
+ static bool const value = (sizeof(test<SequenceTraits<T> >(nullptr)) == 1);
+};
+
+// has_FlowTraits<int> will cause an error with some compilers because
+// it subclasses int. Using this wrapper instantiates the
+// real has_FlowTraits only if the template type is a class.
+template <typename T, bool Enabled = std::is_class<T>::value>
+class has_FlowTraits
+{
+public:
+ static const bool value = false;
+};
+
+// Some older gcc compilers don't support straightforward tests
+// for members, so test for ambiguity caused by the base and derived
+// classes both defining the member.
+template <class T>
+struct has_FlowTraits<T, true>
+{
+ struct Fallback { bool flow; };
+ struct Derived : T, Fallback { };
+
+ template<typename C>
+ static char (&f(SameType<bool Fallback::*, &C::flow>*))[1];
+
+ template<typename C>
+ static char (&f(...))[2];
+
+public:
+ static bool const value = sizeof(f<Derived>(nullptr)) == 2;
+};
+
+// Test if SequenceTraits<T> is defined on type T
+template<typename T>
+struct has_SequenceTraits : public std::integral_constant<bool,
+ has_SequenceMethodTraits<T>::value > { };
+
+// Test if DocumentListTraits<T> is defined on type T
+template <class T>
+struct has_DocumentListTraits
+{
+ typedef size_t (*Signature_size)(class IO&, T&);
+
+ template <typename U>
+ static char test(SameType<Signature_size, &U::size>*);
+
+ template <typename U>
+ static double test(...);
+
+public:
+ static bool const value = (sizeof(test<DocumentListTraits<T> >(nullptr))==1);
+};
+
+inline bool isNumber(StringRef S) {
+ static const char OctalChars[] = "01234567";
+ if (S.startswith("0") &&
+ S.drop_front().find_first_not_of(OctalChars) == StringRef::npos)
+ return true;
+
+ if (S.startswith("0o") &&
+ S.drop_front(2).find_first_not_of(OctalChars) == StringRef::npos)
+ return true;
+
+ static const char HexChars[] = "0123456789abcdefABCDEF";
+ if (S.startswith("0x") &&
+ S.drop_front(2).find_first_not_of(HexChars) == StringRef::npos)
+ return true;
+
+ static const char DecChars[] = "0123456789";
+ if (S.find_first_not_of(DecChars) == StringRef::npos)
+ return true;
+
+ if (S.equals(".inf") || S.equals(".Inf") || S.equals(".INF"))
+ return true;
+
+ Regex FloatMatcher("^(\\.[0-9]+|[0-9]+(\\.[0-9]*)?)([eE][-+]?[0-9]+)?$");
+ if (FloatMatcher.match(S))
+ return true;
+
+ return false;
+}
+
+inline bool isNumeric(StringRef S) {
+ if ((S.front() == '-' || S.front() == '+') && isNumber(S.drop_front()))
+ return true;
+
+ if (isNumber(S))
+ return true;
+
+ if (S.equals(".nan") || S.equals(".NaN") || S.equals(".NAN"))
+ return true;
+
+ return false;
+}
+
+inline bool isNull(StringRef S) {
+ return S.equals("null") || S.equals("Null") || S.equals("NULL") ||
+ S.equals("~");
+}
+
+inline bool isBool(StringRef S) {
+ return S.equals("true") || S.equals("True") || S.equals("TRUE") ||
+ S.equals("false") || S.equals("False") || S.equals("FALSE");
+}
+
+inline bool needsQuotes(StringRef S) {
+ if (S.empty())
+ return true;
+ if (isspace(S.front()) || isspace(S.back()))
+ return true;
+ if (S.front() == ',')
+ return true;
+
+ static const char ScalarSafeChars[] =
+ "abcdefghijklmnopqrstuvwxyz"
+ "ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_-/^., \t";
+ if (S.find_first_not_of(ScalarSafeChars) != StringRef::npos)
+ return true;
+
+ if (isNull(S))
+ return true;
+ if (isBool(S))
+ return true;
+ if (isNumeric(S))
+ return true;
+
+ return false;
+}
+
+template<typename T>
+struct missingTraits : public std::integral_constant<bool,
+ !has_ScalarEnumerationTraits<T>::value
+ && !has_ScalarBitSetTraits<T>::value
+ && !has_ScalarTraits<T>::value
+ && !has_BlockScalarTraits<T>::value
+ && !has_MappingTraits<T>::value
+ && !has_SequenceTraits<T>::value
+ && !has_DocumentListTraits<T>::value > {};
+
+template<typename T>
+struct validatedMappingTraits : public std::integral_constant<bool,
+ has_MappingTraits<T>::value
+ && has_MappingValidateTraits<T>::value> {};
+
+template<typename T>
+struct unvalidatedMappingTraits : public std::integral_constant<bool,
+ has_MappingTraits<T>::value
+ && !has_MappingValidateTraits<T>::value> {};
+
+// Base class for Input and Output.
+class IO {
+public:
+
+ IO(void *Ctxt=nullptr);
+ virtual ~IO();
+
+ virtual bool outputting() = 0;
+
+ virtual unsigned beginSequence() = 0;
+ virtual bool preflightElement(unsigned, void *&) = 0;
+ virtual void postflightElement(void*) = 0;
+ virtual void endSequence() = 0;
+ virtual bool canElideEmptySequence() = 0;
+
+ virtual unsigned beginFlowSequence() = 0;
+ virtual bool preflightFlowElement(unsigned, void *&) = 0;
+ virtual void postflightFlowElement(void*) = 0;
+ virtual void endFlowSequence() = 0;
+
+ virtual bool mapTag(StringRef Tag, bool Default=false) = 0;
+ virtual void beginMapping() = 0;
+ virtual void endMapping() = 0;
+ virtual bool preflightKey(const char*, bool, bool, bool &, void *&) = 0;
+ virtual void postflightKey(void*) = 0;
+ virtual std::vector<StringRef> getKeys() const = 0;
+
+ virtual void beginFlowMapping() = 0;
+ virtual void endFlowMapping() = 0;
+
+ virtual void beginEnumScalar() = 0;
+ virtual bool matchEnumScalar(const char*, bool) = 0;
+ virtual bool matchEnumFallback() = 0;
+ virtual void endEnumScalar() = 0;
+
+ virtual bool beginBitSetScalar(bool &) = 0;
+ virtual bool bitSetMatch(const char*, bool) = 0;
+ virtual void endBitSetScalar() = 0;
+
+ virtual void scalarString(StringRef &, bool) = 0;
+ virtual void blockScalarString(StringRef &) = 0;
+
+ virtual void setError(const Twine &) = 0;
+
+ template <typename T>
+ void enumCase(T &Val, const char* Str, const T ConstVal) {
+ if ( matchEnumScalar(Str, outputting() && Val == ConstVal) ) {
+ Val = ConstVal;
+ }
+ }
+
+ // allow anonymous enum values to be used with LLVM_YAML_STRONG_TYPEDEF
+ template <typename T>
+ void enumCase(T &Val, const char* Str, const uint32_t ConstVal) {
+ if ( matchEnumScalar(Str, outputting() && Val == static_cast<T>(ConstVal)) ) {
+ Val = ConstVal;
+ }
+ }
+
+ template <typename FBT, typename T>
+ void enumFallback(T &Val) {
+ if ( matchEnumFallback() ) {
+ // FIXME: Force integral conversion to allow strong typedefs to convert.
+ FBT Res = (uint64_t)Val;
+ yamlize(*this, Res, true);
+ Val = (uint64_t)Res;
+ }
+ }
+
+ template <typename T>
+ void bitSetCase(T &Val, const char* Str, const T ConstVal) {
+ if ( bitSetMatch(Str, outputting() && (Val & ConstVal) == ConstVal) ) {
+ Val = Val | ConstVal;
+ }
+ }
+
+ // allow anonymous enum values to be used with LLVM_YAML_STRONG_TYPEDEF
+ template <typename T>
+ void bitSetCase(T &Val, const char* Str, const uint32_t ConstVal) {
+ if ( bitSetMatch(Str, outputting() && (Val & ConstVal) == ConstVal) ) {
+ Val = Val | ConstVal;
+ }
+ }
+
+ template <typename T>
+ void maskedBitSetCase(T &Val, const char *Str, T ConstVal, T Mask) {
+ if (bitSetMatch(Str, outputting() && (Val & Mask) == ConstVal))
+ Val = Val | ConstVal;
+ }
+
+ template <typename T>
+ void maskedBitSetCase(T &Val, const char *Str, uint32_t ConstVal,
+ uint32_t Mask) {
+ if (bitSetMatch(Str, outputting() && (Val & Mask) == ConstVal))
+ Val = Val | ConstVal;
+ }
+
+ void *getContext();
+ void setContext(void *);
+
+ template <typename T>
+ void mapRequired(const char* Key, T& Val) {
+ this->processKey(Key, Val, true);
+ }
+
+ template <typename T>
+ typename std::enable_if<has_SequenceTraits<T>::value,void>::type
+ mapOptional(const char* Key, T& Val) {
+ // omit key/value instead of outputting empty sequence
+ if ( this->canElideEmptySequence() && !(Val.begin() != Val.end()) )
+ return;
+ this->processKey(Key, Val, false);
+ }
+
+ template <typename T>
+ void mapOptional(const char* Key, Optional<T> &Val) {
+ processKeyWithDefault(Key, Val, Optional<T>(), /*Required=*/false);
+ }
+
+ template <typename T>
+ typename std::enable_if<!has_SequenceTraits<T>::value,void>::type
+ mapOptional(const char* Key, T& Val) {
+ this->processKey(Key, Val, false);
+ }
+
+ template <typename T>
+ void mapOptional(const char* Key, T& Val, const T& Default) {
+ this->processKeyWithDefault(Key, Val, Default, false);
+ }
+
+private:
+ template <typename T>
+ void processKeyWithDefault(const char *Key, Optional<T> &Val,
+ const Optional<T> &DefaultValue, bool Required) {
+ assert(DefaultValue.hasValue() == false &&
+ "Optional<T> shouldn't have a value!");
+ void *SaveInfo;
+ bool UseDefault;
+ const bool sameAsDefault = outputting() && !Val.hasValue();
+ if (!outputting() && !Val.hasValue())
+ Val = T();
+ if (this->preflightKey(Key, Required, sameAsDefault, UseDefault,
+ SaveInfo)) {
+ yamlize(*this, Val.getValue(), Required);
+ this->postflightKey(SaveInfo);
+ } else {
+ if (UseDefault)
+ Val = DefaultValue;
+ }
+ }
+
+ template <typename T>
+ void processKeyWithDefault(const char *Key, T &Val, const T& DefaultValue,
+ bool Required) {
+ void *SaveInfo;
+ bool UseDefault;
+ const bool sameAsDefault = outputting() && Val == DefaultValue;
+ if ( this->preflightKey(Key, Required, sameAsDefault, UseDefault,
+ SaveInfo) ) {
+ yamlize(*this, Val, Required);
+ this->postflightKey(SaveInfo);
+ }
+ else {
+ if ( UseDefault )
+ Val = DefaultValue;
+ }
+ }
+
+ template <typename T>
+ void processKey(const char *Key, T &Val, bool Required) {
+ void *SaveInfo;
+ bool UseDefault;
+ if ( this->preflightKey(Key, Required, false, UseDefault, SaveInfo) ) {
+ yamlize(*this, Val, Required);
+ this->postflightKey(SaveInfo);
+ }
+ }
+
+ template <typename MK, typename MV>
+ void processKey(const char *Key, std::map<MK, MV> &Val, bool Required) {
+ void *SaveInfo;
+ bool UseDefault;
+ if ( this->preflightKey(Key, Required, false, UseDefault, SaveInfo) ) {
+ this->beginFlowMapping();
+
+ if (this->outputting()) {
+ for (auto &entry : Val) {
+ std::string Storage;
+ llvm::raw_string_ostream Buffer(Storage);
+ ScalarTraits<MK>::output(entry.first, this->getContext(), Buffer);
+ StringRef Str = Buffer.str();
+ this->processKey(Str.data(), entry.second, Required);
+ }
+ } else {
+ for (StringRef &key : this->getKeys()) {
+ MK entry;
+ StringRef Result = ScalarTraits<MK>::input(key, this->getContext(), entry);
+ if (!Result.empty())
+ this->setError(llvm::Twine(Result));
+ else
+ this->processKey(key.data(), Val[entry], Required);
+ }
+ }
+
+ this->endFlowMapping();
+
+ this->postflightKey(SaveInfo);
+ }
+ }
+
+private:
+ void *Ctxt;
+};
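A minimal sketch (using a hypothetical Polygon type, not part of this patch) of
how a MappingTraits specialization normally drives the mapRequired() and
mapOptional() methods declared on IO above:

    struct Polygon {            // hypothetical user type
      std::string Name;
      uint32_t    Sides;
      bool        Filled;
    };

    namespace llvm {
    namespace yaml {
    template <>
    struct MappingTraits<Polygon> {
      static void mapping(IO &io, Polygon &P) {
        io.mapRequired("name",   P.Name);          // key must be present on input
        io.mapRequired("sides",  P.Sides);
        io.mapOptional("filled", P.Filled, false); // defaulted when the key is absent
      }
    };
    } // namespace yaml
    } // namespace llvm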
+
+template<typename T>
+typename std::enable_if<has_ScalarEnumerationTraits<T>::value,void>::type
+yamlize(IO &io, T &Val, bool) {
+ io.beginEnumScalar();
+ ScalarEnumerationTraits<T>::enumeration(io, Val);
+ io.endEnumScalar();
+}
+
+template<typename T>
+typename std::enable_if<has_ScalarBitSetTraits<T>::value,void>::type
+yamlize(IO &io, T &Val, bool) {
+ bool DoClear;
+ if ( io.beginBitSetScalar(DoClear) ) {
+ if ( DoClear )
+ Val = static_cast<T>(0);
+ ScalarBitSetTraits<T>::bitset(io, Val);
+ io.endBitSetScalar();
+ }
+}
+
+template<typename T>
+typename std::enable_if<has_ScalarTraits<T>::value,void>::type
+yamlize(IO &io, T &Val, bool) {
+ if ( io.outputting() ) {
+ std::string Storage;
+ llvm::raw_string_ostream Buffer(Storage);
+ ScalarTraits<T>::output(Val, io.getContext(), Buffer);
+ StringRef Str = Buffer.str();
+ io.scalarString(Str, ScalarTraits<T>::mustQuote(Str));
+ }
+ else {
+ StringRef Str;
+ io.scalarString(Str, ScalarTraits<T>::mustQuote(Str));
+ StringRef Result = ScalarTraits<T>::input(Str, io.getContext(), Val);
+ if ( !Result.empty() ) {
+ io.setError(llvm::Twine(Result));
+ }
+ }
+}
+
+template <typename T>
+typename std::enable_if<has_BlockScalarTraits<T>::value, void>::type
+yamlize(IO &YamlIO, T &Val, bool) {
+ if (YamlIO.outputting()) {
+ std::string Storage;
+ llvm::raw_string_ostream Buffer(Storage);
+ BlockScalarTraits<T>::output(Val, YamlIO.getContext(), Buffer);
+ StringRef Str = Buffer.str();
+ YamlIO.blockScalarString(Str);
+ } else {
+ StringRef Str;
+ YamlIO.blockScalarString(Str);
+ StringRef Result =
+ BlockScalarTraits<T>::input(Str, YamlIO.getContext(), Val);
+ if (!Result.empty())
+ YamlIO.setError(llvm::Twine(Result));
+ }
+}
+
+template<typename T>
+typename std::enable_if<validatedMappingTraits<T>::value, void>::type
+yamlize(IO &io, T &Val, bool) {
+ if (has_FlowTraits<MappingTraits<T>>::value)
+ io.beginFlowMapping();
+ else
+ io.beginMapping();
+ if (io.outputting()) {
+ StringRef Err = MappingTraits<T>::validate(io, Val);
+ if (!Err.empty()) {
+ llvm::errs() << Err << "\n";
+ assert(Err.empty() && "invalid struct trying to be written as yaml");
+ }
+ }
+ MappingTraits<T>::mapping(io, Val);
+ if (!io.outputting()) {
+ StringRef Err = MappingTraits<T>::validate(io, Val);
+ if (!Err.empty())
+ io.setError(Err);
+ }
+ if (has_FlowTraits<MappingTraits<T>>::value)
+ io.endFlowMapping();
+ else
+ io.endMapping();
+}
+
+template<typename T>
+typename std::enable_if<unvalidatedMappingTraits<T>::value, void>::type
+yamlize(IO &io, T &Val, bool) {
+ if (has_FlowTraits<MappingTraits<T>>::value) {
+ io.beginFlowMapping();
+ MappingTraits<T>::mapping(io, Val);
+ io.endFlowMapping();
+ } else {
+ io.beginMapping();
+ MappingTraits<T>::mapping(io, Val);
+ io.endMapping();
+ }
+}
+
+template<typename T>
+typename std::enable_if<missingTraits<T>::value, void>::type
+yamlize(IO &io, T &Val, bool) {
+ char missing_yaml_trait_for_type[sizeof(MissingTrait<T>)];
+}
+
+template<typename T>
+typename std::enable_if<has_SequenceTraits<T>::value,void>::type
+yamlize(IO &io, T &Seq, bool) {
+ if ( has_FlowTraits< SequenceTraits<T> >::value ) {
+ unsigned incnt = io.beginFlowSequence();
+ unsigned count = io.outputting() ? SequenceTraits<T>::size(io, Seq) : incnt;
+ for(unsigned i=0; i < count; ++i) {
+ void *SaveInfo;
+ if ( io.preflightFlowElement(i, SaveInfo) ) {
+ yamlize(io, SequenceTraits<T>::element(io, Seq, i), true);
+ io.postflightFlowElement(SaveInfo);
+ }
+ }
+ io.endFlowSequence();
+ }
+ else {
+ unsigned incnt = io.beginSequence();
+ unsigned count = io.outputting() ? SequenceTraits<T>::size(io, Seq) : incnt;
+ for(unsigned i=0; i < count; ++i) {
+ void *SaveInfo;
+ if ( io.preflightElement(i, SaveInfo) ) {
+ yamlize(io, SequenceTraits<T>::element(io, Seq, i), true);
+ io.postflightElement(SaveInfo);
+ }
+ }
+ io.endSequence();
+ }
+}
+
+template<>
+struct ScalarTraits<bool> {
+ static void output(const bool &, void*, llvm::raw_ostream &);
+ static StringRef input(StringRef, void*, bool &);
+ static bool mustQuote(StringRef) { return false; }
+};
+
+template<>
+struct ScalarTraits<StringRef> {
+ static void output(const StringRef &, void*, llvm::raw_ostream &);
+ static StringRef input(StringRef, void*, StringRef &);
+ static bool mustQuote(StringRef S) { return needsQuotes(S); }
+};
+
+template<>
+struct ScalarTraits<std::string> {
+ static void output(const std::string &, void*, llvm::raw_ostream &);
+ static StringRef input(StringRef, void*, std::string &);
+ static bool mustQuote(StringRef S) { return needsQuotes(S); }
+};
+
+template<>
+struct ScalarTraits<uint8_t> {
+ static void output(const uint8_t &, void*, llvm::raw_ostream &);
+ static StringRef input(StringRef, void*, uint8_t &);
+ static bool mustQuote(StringRef) { return false; }
+};
+
+template<>
+struct ScalarTraits<uint16_t> {
+ static void output(const uint16_t &, void*, llvm::raw_ostream &);
+ static StringRef input(StringRef, void*, uint16_t &);
+ static bool mustQuote(StringRef) { return false; }
+};
+
+template<>
+struct ScalarTraits<uint32_t> {
+ static void output(const uint32_t &, void*, llvm::raw_ostream &);
+ static StringRef input(StringRef, void*, uint32_t &);
+ static bool mustQuote(StringRef) { return false; }
+};
+
+template<>
+struct ScalarTraits<unsigned long> {
+ static void output(const unsigned long &, void*, llvm::raw_ostream &);
+ static StringRef input(StringRef, void*, unsigned long &);
+ static bool mustQuote(StringRef) { return false; }
+};
+
+template<>
+struct ScalarTraits<unsigned long long> {
+ static void output(const unsigned long long &, void*, llvm::raw_ostream &);
+ static StringRef input(StringRef, void*, unsigned long long &);
+ static bool mustQuote(StringRef) { return false; }
+};
+
+template<>
+struct ScalarTraits<int8_t> {
+ static void output(const int8_t &, void*, llvm::raw_ostream &);
+ static StringRef input(StringRef, void*, int8_t &);
+ static bool mustQuote(StringRef) { return false; }
+};
+
+template<>
+struct ScalarTraits<int16_t> {
+ static void output(const int16_t &, void*, llvm::raw_ostream &);
+ static StringRef input(StringRef, void*, int16_t &);
+ static bool mustQuote(StringRef) { return false; }
+};
+
+template<>
+struct ScalarTraits<int32_t> {
+ static void output(const int32_t &, void*, llvm::raw_ostream &);
+ static StringRef input(StringRef, void*, int32_t &);
+ static bool mustQuote(StringRef) { return false; }
+};
+
+template<>
+struct ScalarTraits<int64_t> {
+ static void output(const int64_t &, void*, llvm::raw_ostream &);
+ static StringRef input(StringRef, void*, int64_t &);
+ static bool mustQuote(StringRef) { return false; }
+};
+
+
+template<>
+struct ScalarTraits<float> {
+ static void output(const float &, void*, llvm::raw_ostream &);
+ static StringRef input(StringRef, void*, float &);
+ static bool mustQuote(StringRef) { return false; }
+};
+
+template<>
+struct ScalarTraits<double> {
+ static void output(const double &, void*, llvm::raw_ostream &);
+ static StringRef input(StringRef, void*, double &);
+ static bool mustQuote(StringRef) { return false; }
+};
+
+// Utility for use within MappingTraits<>::mapping() method
+// to [de]normalize an object for use with YAML conversion.
+template <typename TNorm, typename TFinal>
+struct MappingNormalization {
+ MappingNormalization(IO &i_o, TFinal &Obj)
+ : io(i_o), BufPtr(nullptr), Result(Obj) {
+ if ( io.outputting() ) {
+ BufPtr = new (&Buffer) TNorm(io, Obj);
+ }
+ else {
+ BufPtr = new (&Buffer) TNorm(io);
+ }
+ }
+
+ ~MappingNormalization() {
+ if ( ! io.outputting() ) {
+ Result = BufPtr->denormalize(io);
+ }
+ BufPtr->~TNorm();
+ }
+
+ TNorm* operator->() { return BufPtr; }
+
+private:
+ typedef llvm::AlignedCharArrayUnion<TNorm> Storage;
+
+ Storage Buffer;
+ IO &io;
+ TNorm *BufPtr;
+ TFinal &Result;
+};
+
+// Utility for use within MappingTraits<>::mapping() method
+// to [de]normalize an object for use with YAML conversion.
+template <typename TNorm, typename TFinal>
+struct MappingNormalizationHeap {
+ MappingNormalizationHeap(IO &i_o, TFinal &Obj)
+ : io(i_o), BufPtr(nullptr), Result(Obj) {
+ if ( io.outputting() ) {
+ BufPtr = new (&Buffer) TNorm(io, Obj);
+ }
+ else {
+ BufPtr = new TNorm(io);
+ }
+ }
+
+ ~MappingNormalizationHeap() {
+ if ( io.outputting() ) {
+ BufPtr->~TNorm();
+ }
+ else {
+ Result = BufPtr->denormalize(io);
+ }
+ }
+
+ TNorm* operator->() { return BufPtr; }
+
+private:
+ typedef llvm::AlignedCharArrayUnion<TNorm> Storage;
+
+ Storage Buffer;
+ IO &io;
+ TNorm *BufPtr;
+ TFinal &Result;
+};
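The intended use of MappingNormalization, sketched with a hypothetical
Duration type: the normalized object lives only for the duration of mapping(),
and denormalize() converts it back when reading.

    struct Duration { uint64_t Millis; };   // hypothetical in-memory type

    namespace llvm {
    namespace yaml {
    template <> struct MappingTraits<Duration> {
      struct NormalizedSeconds {            // YAML-facing representation
        NormalizedSeconds(IO &) : Seconds(0) {}
        NormalizedSeconds(IO &, Duration &D) : Seconds(D.Millis / 1000) {}
        Duration denormalize(IO &) { return Duration{Seconds * 1000}; }
        uint64_t Seconds;
      };
      static void mapping(IO &io, Duration &D) {
        // Buffers a NormalizedSeconds in place; converts back on input.
        MappingNormalization<NormalizedSeconds, Duration> Keys(io, D);
        io.mapRequired("seconds", Keys->Seconds);
      }
    };
    } // namespace yaml
    } // namespace llvm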
+
+///
+/// The Input class is used to parse a yaml document into in-memory structs
+/// and vectors.
+///
+/// It works by using YAMLParser to do a syntax parse of the entire yaml
+/// document, then the Input class builds a graph of HNodes which wrap
+/// each yaml Node. The extra layer provides buffering: the low-level yaml
+/// parser only lets you look at each node once, while the buffering layer lets
+/// you search and iterate multiple times. This is necessary because
+/// the mapRequired() method calls may not be in the same order
+/// as the keys in the document.
+///
+class Input : public IO {
+public:
+ // Construct a yaml Input object from a StringRef and optional
+ // user-data. The DiagHandler can be specified to provide
+ // alternative error reporting.
+ Input(StringRef InputContent,
+ void *Ctxt = nullptr,
+ SourceMgr::DiagHandlerTy DiagHandler = nullptr,
+ void *DiagHandlerCtxt = nullptr);
+
+ // Construct a yaml Input object from a MemoryBuffer and optional
+ // user-data. The DiagHandler can be specified to provide
+ // alternative error reporting.
+ Input(MemoryBufferRef InputContent,
+ void *Ctxt = nullptr,
+ SourceMgr::DiagHandlerTy DiagHandler = nullptr,
+ void *DiagHandlerCtxt = nullptr);
+
+ ~Input() override;
+
+ // Check if there was a syntax or semantic error during parsing.
+ std::error_code error();
+
+private:
+ bool outputting() override;
+ bool mapTag(StringRef, bool) override;
+ void beginMapping() override;
+ void endMapping() override;
+ bool preflightKey(const char *, bool, bool, bool &, void *&) override;
+ void postflightKey(void *) override;
+ std::vector<StringRef> getKeys() const override;
+ void beginFlowMapping() override;
+ void endFlowMapping() override;
+ unsigned beginSequence() override;
+ void endSequence() override;
+ bool preflightElement(unsigned index, void *&) override;
+ void postflightElement(void *) override;
+ unsigned beginFlowSequence() override;
+ bool preflightFlowElement(unsigned , void *&) override;
+ void postflightFlowElement(void *) override;
+ void endFlowSequence() override;
+ void beginEnumScalar() override;
+ bool matchEnumScalar(const char*, bool) override;
+ bool matchEnumFallback() override;
+ void endEnumScalar() override;
+ bool beginBitSetScalar(bool &) override;
+ bool bitSetMatch(const char *, bool ) override;
+ void endBitSetScalar() override;
+ void scalarString(StringRef &, bool) override;
+ void blockScalarString(StringRef &) override;
+ void setError(const Twine &message) override;
+ bool canElideEmptySequence() override;
+
+ class HNode {
+ virtual void anchor();
+ public:
+ HNode(Node *n) : _node(n) { }
+ virtual ~HNode() { }
+ static inline bool classof(const HNode *) { return true; }
+
+ Node *_node;
+ };
+
+ class EmptyHNode : public HNode {
+ void anchor() override;
+ public:
+ EmptyHNode(Node *n) : HNode(n) { }
+ static inline bool classof(const HNode *n) {
+ return NullNode::classof(n->_node);
+ }
+ static inline bool classof(const EmptyHNode *) { return true; }
+ };
+
+ class ScalarHNode : public HNode {
+ void anchor() override;
+ public:
+ ScalarHNode(Node *n, StringRef s) : HNode(n), _value(s) { }
+
+ StringRef value() const { return _value; }
+
+ static inline bool classof(const HNode *n) {
+ return ScalarNode::classof(n->_node) ||
+ BlockScalarNode::classof(n->_node);
+ }
+ static inline bool classof(const ScalarHNode *) { return true; }
+ protected:
+ StringRef _value;
+ };
+
+ class MapHNode : public HNode {
+ void anchor() override;
+
+ public:
+ MapHNode(Node *n) : HNode(n) { }
+
+ static inline bool classof(const HNode *n) {
+ return MappingNode::classof(n->_node);
+ }
+ static inline bool classof(const MapHNode *) { return true; }
+
+ typedef llvm::StringMap<std::unique_ptr<HNode>> NameToNode;
+
+ bool isValidKey(StringRef key);
+
+ NameToNode Mapping;
+ llvm::SmallVector<const char*, 6> ValidKeys;
+ };
+
+ class SequenceHNode : public HNode {
+ void anchor() override;
+
+ public:
+ SequenceHNode(Node *n) : HNode(n) { }
+
+ static inline bool classof(const HNode *n) {
+ return SequenceNode::classof(n->_node);
+ }
+ static inline bool classof(const SequenceHNode *) { return true; }
+
+ std::vector<std::unique_ptr<HNode>> Entries;
+ };
+
+ std::unique_ptr<Input::HNode> createHNodes(Node *node);
+ void setError(HNode *hnode, const Twine &message);
+ void setError(Node *node, const Twine &message);
+
+public:
+ // These are only used by operator>>. They could be private
+ // if those templated things could be made friends.
+ bool setCurrentDocument();
+ bool nextDocument();
+
+ /// Returns the current node that's being parsed by the YAML Parser.
+ const Node *getCurrentNode() const;
+
+private:
+ llvm::SourceMgr SrcMgr; // must be before Strm
+ std::unique_ptr<llvm::yaml::Stream> Strm;
+ std::unique_ptr<HNode> TopNode;
+ std::error_code EC;
+ llvm::BumpPtrAllocator StringAllocator;
+ llvm::yaml::document_iterator DocIterator;
+ std::vector<bool> BitValuesUsed;
+ HNode *CurrentNode;
+ bool ScalarMatchFound;
+};
+
+///
+/// The Output class is used to generate a yaml document from in-memory structs
+/// and vectors.
+///
+class Output : public IO {
+public:
+ Output(llvm::raw_ostream &, void *Ctxt = nullptr, int WrapColumn = 70);
+ ~Output() override;
+
+ bool outputting() override;
+ bool mapTag(StringRef, bool) override;
+ void beginMapping() override;
+ void endMapping() override;
+ bool preflightKey(const char *key, bool, bool, bool &, void *&) override;
+ void postflightKey(void *) override;
+ std::vector<StringRef> getKeys() const override;
+ void beginFlowMapping() override;
+ void endFlowMapping() override;
+ unsigned beginSequence() override;
+ void endSequence() override;
+ bool preflightElement(unsigned, void *&) override;
+ void postflightElement(void *) override;
+ unsigned beginFlowSequence() override;
+ bool preflightFlowElement(unsigned, void *&) override;
+ void postflightFlowElement(void *) override;
+ void endFlowSequence() override;
+ void beginEnumScalar() override;
+ bool matchEnumScalar(const char*, bool) override;
+ bool matchEnumFallback() override;
+ void endEnumScalar() override;
+ bool beginBitSetScalar(bool &) override;
+ bool bitSetMatch(const char *, bool ) override;
+ void endBitSetScalar() override;
+ void scalarString(StringRef &, bool) override;
+ void blockScalarString(StringRef &) override;
+ void setError(const Twine &message) override;
+ bool canElideEmptySequence() override;
+public:
+ // These are only used by operator<<. They could be private
+ // if that templated operator could be made a friend.
+ void beginDocuments();
+ bool preflightDocument(unsigned);
+ void postflightDocument();
+ void endDocuments();
+
+private:
+ void output(StringRef s);
+ void outputUpToEndOfLine(StringRef s);
+ void newLineCheck();
+ void outputNewLine();
+ void paddedKey(StringRef key);
+ void flowKey(StringRef Key);
+
+ enum InState {
+ inSeq,
+ inFlowSeq,
+ inMapFirstKey,
+ inMapOtherKey,
+ inFlowMapFirstKey,
+ inFlowMapOtherKey
+ };
+
+ llvm::raw_ostream &Out;
+ int WrapColumn;
+ SmallVector<InState, 8> StateStack;
+ int Column;
+ int ColumnAtFlowStart;
+ int ColumnAtMapFlowStart;
+ bool NeedBitValueComma;
+ bool NeedFlowSequenceComma;
+ bool EnumerationMatchFound;
+ bool NeedsNewLine;
+};
+
+/// YAML I/O does conversion based on types. Often, however, native data types
+/// are just typedefs of built-in integral types (e.g. int). The C++
+/// type matching system sees through the typedef, so all such typedefed types
+/// look like a built-in type and the generic YAML I/O conversion
+/// is used. To provide better control over the YAML conversion, you can
+/// use this macro instead of a typedef. It creates a class with one field
+/// and automatic conversion operators to and from the base type.
+/// Based on BOOST_STRONG_TYPEDEF
+#define LLVM_YAML_STRONG_TYPEDEF(_base, _type) \
+ struct _type { \
+ _type() { } \
+ _type(const _base v) : value(v) { } \
+ _type(const _type &v) : value(v.value) {} \
+ _type &operator=(const _type &rhs) { value = rhs.value; return *this; }\
+ _type &operator=(const _base &rhs) { value = rhs; return *this; } \
+ operator const _base & () const { return value; } \
+ bool operator==(const _type &rhs) const { return value == rhs.value; } \
+ bool operator==(const _base &rhs) const { return value == rhs; } \
+ bool operator<(const _type &rhs) const { return value < rhs.value; } \
+ _base value; \
+ };
+
+///
+/// Use these types instead of uintXX_t in any mapping to have
+/// its yaml output formatted as hexadecimal.
+///
+LLVM_YAML_STRONG_TYPEDEF(uint8_t, Hex8)
+LLVM_YAML_STRONG_TYPEDEF(uint16_t, Hex16)
+LLVM_YAML_STRONG_TYPEDEF(uint32_t, Hex32)
+LLVM_YAML_STRONG_TYPEDEF(uint64_t, Hex64)
+
+template<>
+struct ScalarTraits<Hex8> {
+ static void output(const Hex8 &, void*, llvm::raw_ostream &);
+ static StringRef input(StringRef, void*, Hex8 &);
+ static bool mustQuote(StringRef) { return false; }
+};
+
+template<>
+struct ScalarTraits<Hex16> {
+ static void output(const Hex16 &, void*, llvm::raw_ostream &);
+ static StringRef input(StringRef, void*, Hex16 &);
+ static bool mustQuote(StringRef) { return false; }
+};
+
+template<>
+struct ScalarTraits<Hex32> {
+ static void output(const Hex32 &, void*, llvm::raw_ostream &);
+ static StringRef input(StringRef, void*, Hex32 &);
+ static bool mustQuote(StringRef) { return false; }
+};
+
+template<>
+struct ScalarTraits<Hex64> {
+ static void output(const Hex64 &, void*, llvm::raw_ostream &);
+ static StringRef input(StringRef, void*, Hex64 &);
+ static bool mustQuote(StringRef) { return false; }
+};
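A short sketch (hypothetical Register struct) of how these strong typedefs are
meant to be used: fields typed as Hex32 instead of uint32_t are emitted in
hexadecimal, while plain integers stay decimal.

    struct Register {                      // hypothetical user type
      llvm::yaml::Hex32 Address;
      uint32_t          Count;
    };

    namespace llvm {
    namespace yaml {
    template <> struct MappingTraits<Register> {
      static void mapping(IO &io, Register &R) {
        io.mapRequired("address", R.Address);   // written in hexadecimal on output
        io.mapRequired("count",   R.Count);     // written in decimal
      }
    };
    } // namespace yaml
    } // namespace llvm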
+
+// Define non-member operator>> so that Input can stream in a document list.
+template <typename T>
+inline
+typename std::enable_if<has_DocumentListTraits<T>::value, Input &>::type
+operator>>(Input &yin, T &docList) {
+ int i = 0;
+ while ( yin.setCurrentDocument() ) {
+ yamlize(yin, DocumentListTraits<T>::element(yin, docList, i), true);
+ if ( yin.error() )
+ return yin;
+ yin.nextDocument();
+ ++i;
+ }
+ return yin;
+}
+
+// Define non-member operator>> so that Input can stream in a map as a document.
+template <typename T>
+inline
+typename std::enable_if<has_MappingTraits<T>::value, Input &>::type
+operator>>(Input &yin, T &docMap) {
+ yin.setCurrentDocument();
+ yamlize(yin, docMap, true);
+ return yin;
+}
+
+// Define non-member operator>> so that Input can stream in a sequence as
+// a document.
+template <typename T>
+inline
+typename std::enable_if<has_SequenceTraits<T>::value, Input &>::type
+operator>>(Input &yin, T &docSeq) {
+ if (yin.setCurrentDocument())
+ yamlize(yin, docSeq, true);
+ return yin;
+}
+
+// Define non-member operator>> so that Input can stream in a block scalar.
+template <typename T>
+inline
+typename std::enable_if<has_BlockScalarTraits<T>::value, Input &>::type
+operator>>(Input &In, T &Val) {
+ if (In.setCurrentDocument())
+ yamlize(In, Val, true);
+ return In;
+}
+
+// Provide better error message about types missing a trait specialization
+template <typename T>
+inline
+typename std::enable_if<missingTraits<T>::value, Input &>::type
+operator>>(Input &yin, T &docSeq) {
+ char missing_yaml_trait_for_type[sizeof(MissingTrait<T>)];
+ return yin;
+}
+
+// Define non-member operator<< so that Output can stream out document list.
+template <typename T>
+inline
+typename std::enable_if<has_DocumentListTraits<T>::value, Output &>::type
+operator<<(Output &yout, T &docList) {
+ yout.beginDocuments();
+ const size_t count = DocumentListTraits<T>::size(yout, docList);
+ for(size_t i=0; i < count; ++i) {
+ if ( yout.preflightDocument(i) ) {
+ yamlize(yout, DocumentListTraits<T>::element(yout, docList, i), true);
+ yout.postflightDocument();
+ }
+ }
+ yout.endDocuments();
+ return yout;
+}
+
+// Define non-member operator<< so that Output can stream out a map.
+template <typename T>
+inline
+typename std::enable_if<has_MappingTraits<T>::value, Output &>::type
+operator<<(Output &yout, T &map) {
+ yout.beginDocuments();
+ if ( yout.preflightDocument(0) ) {
+ yamlize(yout, map, true);
+ yout.postflightDocument();
+ }
+ yout.endDocuments();
+ return yout;
+}
+
+// Define non-member operator<< so that Output can stream out a sequence.
+template <typename T>
+inline
+typename std::enable_if<has_SequenceTraits<T>::value, Output &>::type
+operator<<(Output &yout, T &seq) {
+ yout.beginDocuments();
+ if ( yout.preflightDocument(0) ) {
+ yamlize(yout, seq, true);
+ yout.postflightDocument();
+ }
+ yout.endDocuments();
+ return yout;
+}
+
+// Define non-member operator<< so that Output can stream out a block scalar.
+template <typename T>
+inline
+typename std::enable_if<has_BlockScalarTraits<T>::value, Output &>::type
+operator<<(Output &Out, T &Val) {
+ Out.beginDocuments();
+ if (Out.preflightDocument(0)) {
+ yamlize(Out, Val, true);
+ Out.postflightDocument();
+ }
+ Out.endDocuments();
+ return Out;
+}
+
+// Provide better error message about types missing a trait specialization
+template <typename T>
+inline
+typename std::enable_if<missingTraits<T>::value, Output &>::type
+operator<<(Output &yout, T &seq) {
+ char missing_yaml_trait_for_type[sizeof(MissingTrait<T>)];
+ return yout;
+}
+
+} // namespace yaml
+} // namespace llvm
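Putting the pieces together, a sketch of a round trip through the operators
declared above, reusing the hypothetical Polygon mapping from the earlier
sketch (the function and variable names are illustrative only):

    #include "llvm/Support/YAMLTraits.h"
    #include "llvm/Support/raw_ostream.h"

    bool roundTrip(llvm::StringRef Text, std::string &Rendered) {
      Polygon P;
      llvm::yaml::Input Yin(Text);
      Yin >> P;                            // parse one document into P
      if (Yin.error())
        return false;                      // syntax or semantic error
      llvm::raw_string_ostream OS(Rendered);
      llvm::yaml::Output Yout(OS);
      Yout << P;                           // re-emit P as a YAML document
      OS.flush();
      return true;
    }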
+
+/// Utility for declaring that a std::vector of a particular type
+/// should be considered a YAML sequence.
+#define LLVM_YAML_IS_SEQUENCE_VECTOR(_type) \
+ namespace llvm { \
+ namespace yaml { \
+ template<> \
+ struct SequenceTraits< std::vector<_type> > { \
+ static size_t size(IO &io, std::vector<_type> &seq) { \
+ return seq.size(); \
+ } \
+ static _type& element(IO &io, std::vector<_type> &seq, size_t index) {\
+ if ( index >= seq.size() ) \
+ seq.resize(index+1); \
+ return seq[index]; \
+ } \
+ }; \
+ } \
+ }
+
+/// Utility for declaring that a std::vector of a particular type
+/// should be considered a YAML flow sequence.
+#define LLVM_YAML_IS_FLOW_SEQUENCE_VECTOR(_type) \
+ namespace llvm { \
+ namespace yaml { \
+ template<> \
+ struct SequenceTraits< std::vector<_type> > { \
+ static size_t size(IO &io, std::vector<_type> &seq) { \
+ return seq.size(); \
+ } \
+ static _type& element(IO &io, std::vector<_type> &seq, size_t index) {\
+ (void)flow; /* Remove this workaround after PR17897 is fixed */ \
+ if ( index >= seq.size() ) \
+ seq.resize(index+1); \
+ return seq[index]; \
+ } \
+ static const bool flow = true; \
+ }; \
+ } \
+ }
+
+/// Utility for declaring that a std::vector of a particular type
+/// should be considered a YAML document list.
+#define LLVM_YAML_IS_DOCUMENT_LIST_VECTOR(_type) \
+ namespace llvm { \
+ namespace yaml { \
+ template<> \
+ struct DocumentListTraits< std::vector<_type> > { \
+ static size_t size(IO &io, std::vector<_type> &seq) { \
+ return seq.size(); \
+ } \
+ static _type& element(IO &io, std::vector<_type> &seq, size_t index) {\
+ if ( index >= seq.size() ) \
+ seq.resize(index+1); \
+ return seq[index]; \
+ } \
+ }; \
+ } \
+ }
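Declaring the vector traits via these macros is a one-liner at global scope; a
sketch using the hypothetical Polygon type from the earlier sketch:

    LLVM_YAML_IS_SEQUENCE_VECTOR(Polygon)

    void dumpPolygons(std::vector<Polygon> &Shapes) {
      llvm::yaml::Output Yout(llvm::outs());
      Yout << Shapes;                      // one block entry per element
    }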
+
+#endif // LLVM_SUPPORT_YAMLTRAITS_H
diff --git a/ext/include/llvm/Support/raw_ostream.h b/ext/include/llvm/Support/raw_ostream.h
new file mode 100644
index 0000000..d1e96f8
--- /dev/null
+++ b/ext/include/llvm/Support/raw_ostream.h
@@ -0,0 +1,530 @@
+//===--- raw_ostream.h - Raw output stream ----------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the raw_ostream class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_SUPPORT_RAW_OSTREAM_H
+#define LLVM_SUPPORT_RAW_OSTREAM_H
+
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/DataTypes.h"
+#include <system_error>
+
+namespace llvm {
+class format_object_base;
+class FormattedString;
+class FormattedNumber;
+template <typename T> class SmallVectorImpl;
+
+namespace sys {
+namespace fs {
+enum OpenFlags : unsigned;
+}
+}
+
+/// This class implements an extremely fast bulk output stream that can *only*
+/// output to a stream. It does not support seeking, reopening, rewinding, line
+/// buffered disciplines etc. It is a simple buffer that outputs
+/// a chunk at a time.
+class raw_ostream {
+private:
+ void operator=(const raw_ostream &) = delete;
+ raw_ostream(const raw_ostream &) = delete;
+
+ /// The buffer is handled in such a way that the buffer is
+ /// uninitialized, unbuffered, or out of space when OutBufCur >=
+ /// OutBufEnd. Thus a single comparison suffices to determine if we
+ /// need to take the slow path to write a single character.
+ ///
+ /// The buffer is in one of three states:
+ /// 1. Unbuffered (BufferMode == Unbuffered)
+ /// 2. Uninitialized (BufferMode != Unbuffered && OutBufStart == 0).
+ /// 3. Buffered (BufferMode != Unbuffered && OutBufStart != 0 &&
+ /// OutBufEnd - OutBufStart >= 1).
+ ///
+ /// If buffered, then the raw_ostream owns the buffer if (BufferMode ==
+ /// InternalBuffer); otherwise the buffer has been set via SetBuffer and is
+ /// managed by the subclass.
+ ///
+ /// If a subclass installs an external buffer using SetBuffer then it can wait
+ /// for a \see write_impl() call to handle the data which has been put into
+ /// this buffer.
+ char *OutBufStart, *OutBufEnd, *OutBufCur;
+
+ enum BufferKind {
+ Unbuffered = 0,
+ InternalBuffer,
+ ExternalBuffer
+ } BufferMode;
+
+public:
+ // color order matches ANSI escape sequence, don't change
+ enum Colors {
+ BLACK=0,
+ RED,
+ GREEN,
+ YELLOW,
+ BLUE,
+ MAGENTA,
+ CYAN,
+ WHITE,
+ SAVEDCOLOR
+ };
+
+ explicit raw_ostream(bool unbuffered = false)
+ : BufferMode(unbuffered ? Unbuffered : InternalBuffer) {
+ // Start out ready to flush.
+ OutBufStart = OutBufEnd = OutBufCur = nullptr;
+ }
+
+ virtual ~raw_ostream();
+
+ /// tell - Return the current offset within the file.
+ uint64_t tell() const { return current_pos() + GetNumBytesInBuffer(); }
+
+ //===--------------------------------------------------------------------===//
+ // Configuration Interface
+ //===--------------------------------------------------------------------===//
+
+ /// Set the stream to be buffered, with an automatically determined buffer
+ /// size.
+ void SetBuffered();
+
+ /// Set the stream to be buffered, using the specified buffer size.
+ void SetBufferSize(size_t Size) {
+ flush();
+ SetBufferAndMode(new char[Size], Size, InternalBuffer);
+ }
+
+ size_t GetBufferSize() const {
+ // If we're supposed to be buffered but haven't actually gotten around
+ // to allocating the buffer yet, return the value that would be used.
+ if (BufferMode != Unbuffered && OutBufStart == nullptr)
+ return preferred_buffer_size();
+
+ // Otherwise just return the size of the allocated buffer.
+ return OutBufEnd - OutBufStart;
+ }
+
+ /// Set the stream to be unbuffered. When unbuffered, the stream will flush
+ /// after every write. This routine will also flush the buffer immediately
+ /// when the stream is being set to unbuffered.
+ void SetUnbuffered() {
+ flush();
+ SetBufferAndMode(nullptr, 0, Unbuffered);
+ }
+
+ size_t GetNumBytesInBuffer() const {
+ return OutBufCur - OutBufStart;
+ }
+
+ //===--------------------------------------------------------------------===//
+ // Data Output Interface
+ //===--------------------------------------------------------------------===//
+
+ void flush() {
+ if (OutBufCur != OutBufStart)
+ flush_nonempty();
+ }
+
+ raw_ostream &operator<<(char C) {
+ if (OutBufCur >= OutBufEnd)
+ return write(C);
+ *OutBufCur++ = C;
+ return *this;
+ }
+
+ raw_ostream &operator<<(unsigned char C) {
+ if (OutBufCur >= OutBufEnd)
+ return write(C);
+ *OutBufCur++ = C;
+ return *this;
+ }
+
+ raw_ostream &operator<<(signed char C) {
+ if (OutBufCur >= OutBufEnd)
+ return write(C);
+ *OutBufCur++ = C;
+ return *this;
+ }
+
+ raw_ostream &operator<<(StringRef Str) {
+ // Inline fast path, particularly for strings with a known length.
+ size_t Size = Str.size();
+
+ // Make sure we can use the fast path.
+ if (Size > (size_t)(OutBufEnd - OutBufCur))
+ return write(Str.data(), Size);
+
+ if (Size) {
+ memcpy(OutBufCur, Str.data(), Size);
+ OutBufCur += Size;
+ }
+ return *this;
+ }
+
+ raw_ostream &operator<<(const char *Str) {
+ // Inline fast path, particularly for constant strings where a sufficiently
+ // smart compiler will simplify strlen.
+
+ return this->operator<<(StringRef(Str));
+ }
+
+ raw_ostream &operator<<(const std::string &Str) {
+ // Avoid the fast path, it would only increase code size for a marginal win.
+ return write(Str.data(), Str.length());
+ }
+
+ raw_ostream &operator<<(const llvm::SmallVectorImpl<char> &Str) {
+ return write(Str.data(), Str.size());
+ }
+
+ raw_ostream &operator<<(unsigned long N);
+ raw_ostream &operator<<(long N);
+ raw_ostream &operator<<(unsigned long long N);
+ raw_ostream &operator<<(long long N);
+ raw_ostream &operator<<(const void *P);
+ raw_ostream &operator<<(unsigned int N) {
+ return this->operator<<(static_cast<unsigned long>(N));
+ }
+
+ raw_ostream &operator<<(int N) {
+ return this->operator<<(static_cast<long>(N));
+ }
+
+ raw_ostream &operator<<(double N);
+
+ /// Output \p N in hexadecimal, without any prefix or padding.
+ raw_ostream &write_hex(unsigned long long N);
+
+ /// Output \p Str, turning '\\', '\t', '\n', '"', and anything that doesn't
+ /// satisfy std::isprint into an escape sequence.
+ raw_ostream &write_escaped(StringRef Str, bool UseHexEscapes = false);
+
+ raw_ostream &write(unsigned char C);
+ raw_ostream &write(const char *Ptr, size_t Size);
+
+ // Formatted output, see the format() function in Support/Format.h.
+ raw_ostream &operator<<(const format_object_base &Fmt);
+
+ // Formatted output, see the leftJustify() function in Support/Format.h.
+ raw_ostream &operator<<(const FormattedString &);
+
+ // Formatted output, see the formatHex() function in Support/Format.h.
+ raw_ostream &operator<<(const FormattedNumber &);
+
+ /// indent - Insert 'NumSpaces' spaces.
+ raw_ostream &indent(unsigned NumSpaces);
+
+ /// Changes the foreground color of text that will be output from this point
+ /// forward.
+ /// @param Color ANSI color to use, the special SAVEDCOLOR can be used to
+ /// change only the bold attribute, and keep colors untouched
+ /// @param Bold bold/brighter text, default false
+ /// @param BG if true change the background, default: change foreground
+ /// @returns itself so it can be used within << invocations
+ virtual raw_ostream &changeColor(enum Colors Color,
+ bool Bold = false,
+ bool BG = false) {
+ (void)Color;
+ (void)Bold;
+ (void)BG;
+ return *this;
+ }
+
+ /// Resets the colors to terminal defaults. Call this when you are done
+ /// outputting colored text, or before program exit.
+ virtual raw_ostream &resetColor() { return *this; }
+
+ /// Reverses the foreground and background colors.
+ virtual raw_ostream &reverseColor() { return *this; }
+
+ /// This function determines if this stream is connected to a "tty" or
+ /// "console" window. That is, the output would be displayed to the user
+ /// rather than being put on a pipe or stored in a file.
+ virtual bool is_displayed() const { return false; }
+
+ /// This function determines if this stream is displayed and supports colors.
+ virtual bool has_colors() const { return is_displayed(); }
+
+ //===--------------------------------------------------------------------===//
+ // Subclass Interface
+ //===--------------------------------------------------------------------===//
+
+private:
+ /// This is the piece of the class that is implemented by subclasses. This
+ /// writes the \p Size bytes starting at
+ /// \p Ptr to the underlying stream.
+ ///
+ /// This function is guaranteed to only be called at a point at which it is
+ /// safe for the subclass to install a new buffer via SetBuffer.
+ ///
+ /// \param Ptr The start of the data to be written. For buffered streams this
+ /// is guaranteed to be the start of the buffer.
+ ///
+ /// \param Size The number of bytes to be written.
+ ///
+ /// \invariant { Size > 0 }
+ virtual void write_impl(const char *Ptr, size_t Size) = 0;
+
+ // An out of line virtual method to provide a home for the class vtable.
+ virtual void handle();
+
+ /// Return the current position within the stream, not counting the bytes
+ /// currently in the buffer.
+ virtual uint64_t current_pos() const = 0;
+
+protected:
+ /// Use the provided buffer as the raw_ostream buffer. This is intended for
+ /// use only by subclasses which can arrange for the output to go directly
+ /// into the desired output buffer, instead of being copied on each flush.
+ void SetBuffer(char *BufferStart, size_t Size) {
+ SetBufferAndMode(BufferStart, Size, ExternalBuffer);
+ }
+
+ /// Return an efficient buffer size for the underlying output mechanism.
+ virtual size_t preferred_buffer_size() const;
+
+ /// Return the beginning of the current stream buffer, or 0 if the stream is
+ /// unbuffered.
+ const char *getBufferStart() const { return OutBufStart; }
+
+ //===--------------------------------------------------------------------===//
+ // Private Interface
+ //===--------------------------------------------------------------------===//
+private:
+ /// Install the given buffer and mode.
+ void SetBufferAndMode(char *BufferStart, size_t Size, BufferKind Mode);
+
+ /// Flush the current buffer, which is known to be non-empty. This outputs the
+ /// currently buffered data and resets the buffer to empty.
+ void flush_nonempty();
+
+ /// Copy data into the buffer. Size must not be greater than the number of
+ /// unused bytes in the buffer.
+ void copy_to_buffer(const char *Ptr, size_t Size);
+};
+
+/// An abstract base class for stream implementations that also support a
+/// pwrite operation. This is useful for code that can mostly stream out data,
+/// but needs to patch in a header that needs to know the output size.
+class raw_pwrite_stream : public raw_ostream {
+ virtual void pwrite_impl(const char *Ptr, size_t Size, uint64_t Offset) = 0;
+
+public:
+ explicit raw_pwrite_stream(bool Unbuffered = false)
+ : raw_ostream(Unbuffered) {}
+ void pwrite(const char *Ptr, size_t Size, uint64_t Offset) {
+#ifndef NDEBUG
+ uint64_t Pos = tell();
+ // /dev/null always reports a pos of 0, so we cannot perform this check
+ // in that case.
+ if (Pos)
+ assert(Size + Offset <= Pos && "We don't support extending the stream");
+#endif
+ pwrite_impl(Ptr, Size, Offset);
+ }
+};
+
+//===----------------------------------------------------------------------===//
+// File Output Streams
+//===----------------------------------------------------------------------===//
+
+/// A raw_ostream that writes to a file descriptor.
+///
+class raw_fd_ostream : public raw_pwrite_stream {
+ int FD;
+ bool ShouldClose;
+
+ /// This flag is true if an error of any kind has been detected.
+ ///
+ bool Error;
+
+ uint64_t pos;
+
+ bool SupportsSeeking;
+
+ /// See raw_ostream::write_impl.
+ void write_impl(const char *Ptr, size_t Size) override;
+
+ void pwrite_impl(const char *Ptr, size_t Size, uint64_t Offset) override;
+
+ /// Return the current position within the stream, not counting the bytes
+ /// currently in the buffer.
+ uint64_t current_pos() const override { return pos; }
+
+ /// Determine an efficient buffer size.
+ size_t preferred_buffer_size() const override;
+
+ /// Set the flag indicating that an output error has been encountered.
+ void error_detected() { Error = true; }
+
+public:
+ /// Open the specified file for writing. If an error occurs, information
+ /// about the error is put into EC, and the stream should be immediately
+ /// destroyed.
+ /// \p Flags allows optional flags to control how the file will be opened.
+ ///
+ /// As a special case, if Filename is "-", then the stream will use
+ /// STDOUT_FILENO instead of opening a file. Note that it will still consider
+ /// itself to own the file descriptor. In particular, it will close the
+ /// file descriptor when it is done (this is necessary to detect
+ /// output errors).
+ raw_fd_ostream(StringRef Filename, std::error_code &EC,
+ sys::fs::OpenFlags Flags);
+
+ /// FD is the file descriptor that this writes to. If ShouldClose is true,
+ /// this closes the file when the stream is destroyed.
+ raw_fd_ostream(int fd, bool shouldClose, bool unbuffered=false);
+
+ ~raw_fd_ostream() override;
+
+ /// Manually flush the stream and close the file. Note that this does not call
+ /// fsync.
+ void close();
+
+ bool supportsSeeking() { return SupportsSeeking; }
+
+ /// Flushes the stream and repositions the underlying file descriptor position
+ /// to the offset specified from the beginning of the file.
+ uint64_t seek(uint64_t off);
+
+ raw_ostream &changeColor(enum Colors colors, bool bold=false,
+ bool bg=false) override;
+ raw_ostream &resetColor() override;
+
+ raw_ostream &reverseColor() override;
+
+ bool is_displayed() const override;
+
+ bool has_colors() const override;
+
+ /// Return the value of the flag in this raw_fd_ostream indicating whether an
+ /// output error has been encountered.
+ /// This doesn't implicitly flush any pending output. Also, it doesn't
+ /// guarantee to detect all errors unless the stream has been closed.
+ bool has_error() const {
+ return Error;
+ }
+
+ /// Set the flag read by has_error() to false. If the error flag is set at the
+ /// time when this raw_ostream's destructor is called, report_fatal_error is
+ /// called to report the error. Use clear_error() after handling the error to
+ /// avoid this behavior.
+ ///
+ /// "Errors should never pass silently.
+ /// Unless explicitly silenced."
+ /// - from The Zen of Python, by Tim Peters
+ ///
+ void clear_error() {
+ Error = false;
+ }
+};
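A small usage sketch for raw_fd_ostream, wrapping an already-open file
descriptor (fd 1, i.e. stdout) so no open flags are needed; the function name
is illustrative only:

    #include "llvm/Support/raw_ostream.h"

    void printSummary(unsigned Contigs) {
      llvm::raw_fd_ostream Out(/*fd=*/1, /*shouldClose=*/false);
      Out << "contigs: " << Contigs << '\n';
      Out.flush();
      if (Out.has_error())
        Out.clear_error();                 // acknowledge the error explicitly
    }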
+
+/// This returns a reference to a raw_ostream for standard output. Use it like:
+/// outs() << "foo" << "bar";
+raw_ostream &outs();
+
+/// This returns a reference to a raw_ostream for standard error. Use it like:
+/// errs() << "foo" << "bar";
+raw_ostream &errs();
+
+/// This returns a reference to a raw_ostream which simply discards output.
+raw_ostream &nulls();
+
+//===----------------------------------------------------------------------===//
+// Output Stream Adaptors
+//===----------------------------------------------------------------------===//
+
+/// A raw_ostream that writes to an std::string. This is a simple adaptor
+/// class. This class does not encounter output errors.
+class raw_string_ostream : public raw_ostream {
+ std::string &OS;
+
+ /// See raw_ostream::write_impl.
+ void write_impl(const char *Ptr, size_t Size) override;
+
+ /// Return the current position within the stream, not counting the bytes
+ /// currently in the buffer.
+ uint64_t current_pos() const override { return OS.size(); }
+
+public:
+ explicit raw_string_ostream(std::string &O) : OS(O) {}
+ ~raw_string_ostream() override;
+
+ /// Flushes the stream contents to the target string and returns the string's
+ /// reference.
+ std::string& str() {
+ flush();
+ return OS;
+ }
+};
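Typical raw_string_ostream usage, as seen in the YAMLTraits code above: build
up output in a local std::string and retrieve it with str(), which flushes.

    #include <string>
    #include "llvm/Support/raw_ostream.h"

    std::string formatCoverage(double Cov) {   // illustrative helper
      std::string Storage;
      llvm::raw_string_ostream OS(Storage);
      OS << "coverage=" << Cov;
      return OS.str();                         // flush, then copy out Storage
    }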
+
+/// A raw_ostream that writes to a SmallVector or SmallString. This is a
+/// simple adaptor class. This class does not encounter output errors.
+/// raw_svector_ostream operates without a buffer, delegating all memory
+/// management to the SmallString. Thus the SmallString is always up-to-date,
+/// may be used directly and there is no need to call flush().
+class raw_svector_ostream : public raw_pwrite_stream {
+ SmallVectorImpl<char> &OS;
+
+ /// See raw_ostream::write_impl.
+ void write_impl(const char *Ptr, size_t Size) override;
+
+ void pwrite_impl(const char *Ptr, size_t Size, uint64_t Offset) override;
+
+ /// Return the current position within the stream.
+ uint64_t current_pos() const override;
+
+public:
+ /// Construct a new raw_svector_ostream.
+ ///
+ /// \param O The vector to write to; this should generally have at least 128
+ /// bytes free to avoid any extraneous memory overhead.
+ explicit raw_svector_ostream(SmallVectorImpl<char> &O) : OS(O) {
+ SetUnbuffered();
+ }
+ ~raw_svector_ostream() override {}
+
+ void flush() = delete;
+
+ /// Return a StringRef for the vector contents.
+ StringRef str() { return StringRef(OS.data(), OS.size()); }
+};
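By contrast, raw_svector_ostream writes directly into the SmallString, so no
flush is needed before reading the result; a sketch with an illustrative
helper:

    #include "llvm/ADT/SmallString.h"
    #include "llvm/Support/raw_ostream.h"

    llvm::StringRef makeLabel(unsigned Idx, llvm::SmallString<32> &Buf) {
      llvm::raw_svector_ostream OS(Buf);
      OS << "NODE_" << Idx;
      return OS.str();                     // Buf is always up to date
    }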
+
+/// A raw_ostream that discards all output.
+class raw_null_ostream : public raw_pwrite_stream {
+ /// See raw_ostream::write_impl.
+ void write_impl(const char *Ptr, size_t size) override;
+ void pwrite_impl(const char *Ptr, size_t Size, uint64_t Offset) override;
+
+ /// Return the current position within the stream, not counting the bytes
+ /// currently in the buffer.
+ uint64_t current_pos() const override;
+
+public:
+ explicit raw_null_ostream() {}
+ ~raw_null_ostream() override;
+};
+
+class buffer_ostream : public raw_svector_ostream {
+ raw_ostream &OS;
+ SmallVector<char, 0> Buffer;
+
+public:
+ buffer_ostream(raw_ostream &OS) : raw_svector_ostream(Buffer), OS(OS) {}
+ ~buffer_ostream() override { OS << str(); }
+};
+
+} // end llvm namespace
+
+#endif // LLVM_SUPPORT_RAW_OSTREAM_H
diff --git a/ext/include/llvm/Support/type_traits.h b/ext/include/llvm/Support/type_traits.h
new file mode 100644
index 0000000..88385c3
--- /dev/null
+++ b/ext/include/llvm/Support/type_traits.h
@@ -0,0 +1,109 @@
+//===- llvm/Support/type_traits.h - Simplified type traits ------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides useful additions to the standard type_traits library.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_SUPPORT_TYPE_TRAITS_H
+#define LLVM_SUPPORT_TYPE_TRAITS_H
+
+#include <type_traits>
+#include <utility>
+
+#ifndef __has_feature
+#define LLVM_DEFINED_HAS_FEATURE
+#define __has_feature(x) 0
+#endif
+
+namespace llvm {
+
+/// isPodLike - This is a type trait that is used to determine whether a given
+/// type can be copied around with memcpy instead of running ctors etc.
+template <typename T>
+struct isPodLike {
+ // std::is_trivially_copyable is available in libc++ with clang, and in the
+ // libstdc++ that comes with GCC 5.
+#if (__has_feature(is_trivially_copyable) && defined(_LIBCPP_VERSION)) || \
+ (defined(__GNUC__) && __GNUC__ >= 5)
+ // If the compiler supports the is_trivially_copyable trait use it, as it
+ // matches the definition of isPodLike closely.
+ static const bool value = std::is_trivially_copyable<T>::value;
+#elif __has_feature(is_trivially_copyable)
+ // Use the internal name if the compiler supports is_trivially_copyable but we
+ // don't know if the standard library does. This is the case for clang in
+ // conjunction with libstdc++ from GCC 4.x.
+ static const bool value = __is_trivially_copyable(T);
+#else
+ // If we don't know anything else, we can (at least) assume that all non-class
+ // types are PODs.
+ static const bool value = !std::is_class<T>::value;
+#endif
+};
+
+// std::pairs are pod-like if their elements are.
+template<typename T, typename U>
+struct isPodLike<std::pair<T, U> > {
+ static const bool value = isPodLike<T>::value && isPodLike<U>::value;
+};
+
+/// \brief Metafunction that determines whether the given type is either an
+/// integral type or an enumeration type.
+///
+/// Note that this accepts potentially more integral types than is_integral
+/// because it is based on merely being convertible implicitly to an integral
+/// type.
+template <typename T> class is_integral_or_enum {
+ typedef typename std::remove_reference<T>::type UnderlyingT;
+
+public:
+ static const bool value =
+ !std::is_class<UnderlyingT>::value && // Filter conversion operators.
+ !std::is_pointer<UnderlyingT>::value &&
+ !std::is_floating_point<UnderlyingT>::value &&
+ std::is_convertible<UnderlyingT, unsigned long long>::value;
+};
+
+/// \brief If T is a pointer, just return it. If it is not, return T&.
+template<typename T, typename Enable = void>
+struct add_lvalue_reference_if_not_pointer { typedef T &type; };
+
+template <typename T>
+struct add_lvalue_reference_if_not_pointer<
+ T, typename std::enable_if<std::is_pointer<T>::value>::type> {
+ typedef T type;
+};
+
+/// \brief If T is a pointer to X, return a pointer to const X. If it is not,
+/// return const T.
+template<typename T, typename Enable = void>
+struct add_const_past_pointer { typedef const T type; };
+
+template <typename T>
+struct add_const_past_pointer<
+ T, typename std::enable_if<std::is_pointer<T>::value>::type> {
+ typedef const typename std::remove_pointer<T>::type *type;
+};
+
+}
+
+// If the compiler supports detecting whether a class is final, define
+// an LLVM_IS_FINAL macro. If it cannot be defined properly, this
+// macro will be left undefined.
+#if __cplusplus >= 201402L
+#define LLVM_IS_FINAL(Ty) std::is_final<Ty>()
+#elif __has_feature(is_final) || LLVM_GNUC_PREREQ(4, 7, 0)
+#define LLVM_IS_FINAL(Ty) __is_final(Ty)
+#endif
+
+#ifdef LLVM_DEFINED_HAS_FEATURE
+#undef __has_feature
+#endif
+
+#endif
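
A compile-time sketch of how the traits above behave; the Strand enum is a made-up example:

    #include "llvm/Support/type_traits.h"
    #include <type_traits>
    #include <utility>

    enum Strand { Forward, Reverse };   // unscoped enum: implicitly convertible to an integer

    static_assert(llvm::isPodLike<std::pair<int, long> >::value,
                  "pairs of pod-like elements are pod-like");
    static_assert(llvm::is_integral_or_enum<Strand>::value,
                  "enums pass the integral-or-enum check");
    static_assert(!llvm::is_integral_or_enum<double>::value,
                  "floating-point types are rejected");
    static_assert(std::is_same<llvm::add_const_past_pointer<int *>::type,
                               const int *>::value,
                  "a pointer becomes a pointer-to-const");
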
diff --git a/src/include/ssw/ssw.h b/ext/include/ssw/ssw.h
similarity index 100%
rename from src/include/ssw/ssw.h
rename to ext/include/ssw/ssw.h
diff --git a/ext/include/ssw/ssw_cpp.h b/ext/include/ssw/ssw_cpp.h
new file mode 100644
index 0000000..0af9cf4
--- /dev/null
+++ b/ext/include/ssw/ssw_cpp.h
@@ -0,0 +1,215 @@
+#ifndef COMPLETE_STRIPED_SMITH_WATERMAN_CPP_H_
+#define COMPLETE_STRIPED_SMITH_WATERMAN_CPP_H_
+
+#include <stdint.h>
+#include <string>
+#include <vector>
+
+namespace StripedSmithWaterman {
+
+struct Alignment {
+ uint16_t sw_score; // The best alignment score
+ uint16_t sw_score_next_best; // The next best alignment score
+ int32_t ref_begin; // Reference begin position of the best alignment
+ int32_t ref_end; // Reference end position of the best alignment
+ int32_t query_begin; // Query begin position of the best alignment
+ int32_t query_end; // Query end position of the best alignment
+ int32_t ref_end_next_best; // Reference end position of the next best alignment
+ int32_t mismatches; // Number of mismatches of the alignment
+ std::string cigar_string; // Cigar string of the best alignment
+ std::vector<uint32_t> cigar; // Cigar stored in the BAM format
+ // high 28 bits: length
+ // low 4 bits: M/I/D/S/X (0/1/2/4/8);
+ void Clear() {
+ sw_score = 0;
+ sw_score_next_best = 0;
+ ref_begin = 0;
+ ref_end = 0;
+ query_begin = 0;
+ query_end = 0;
+ ref_end_next_best = 0;
+ mismatches = 0;
+ cigar_string.clear();
+ cigar.clear();
+ };
+};
+
+struct Filter {
+ // NOTE: Whatever the filter settings, these five fields of Alignment are always given:
+ // sw_score; sw_score_next_best; ref_end; query_end; ref_end_next_best.
+ // NOTE: If you only need the scores of alignments, please set 'report_begin_position'
+ // and 'report_cigar' to false.
+
+ bool report_begin_position; // Give ref_begin and query_begin.
+ // If it is not set, ref_begin and query_begin are -1.
+ bool report_cigar; // Give cigar_string and cigar.
+ // report_begin_position is automatically TRUE.
+
+ // When *report_cigar* is true and alignment passes these two filters,
+ // cigar_string and cigar will be given.
+ uint16_t score_filter; // score >= score_filter
+ uint16_t distance_filter; // ((ref_end - ref_begin) < distance_filter) &&
+ // ((query_end - query_begin) < distance_filter)
+
+ Filter()
+ : report_begin_position(true), report_cigar(true), score_filter(0), distance_filter(32767) { };
+
+ Filter(const bool &pos, const bool &cigar, const uint16_t &score, const uint16_t &dis)
+ : report_begin_position(pos), report_cigar(cigar), score_filter(score), distance_filter(dis) { };
+};
+
+class Aligner {
+public:
+ // =========
+ // @function Construct an Aligner with default values.
+ // The function builds the {A,C,G,T,N} aligner.
+ // If you need an aligner for other characters, please
+ // use the other constructor and pass the corresponding matrix in.
+ // =========
+ Aligner(void);
+
+ // =========
+ // @function Construct an Aligner by assigning scores.
+ // The function builds the {A,C,G,T,N} aligner.
+ // If you need an aligner for other characters, please
+ // use the other constructor and pass the corresponding matrix in.
+ // =========
+ Aligner(const uint8_t &match_score,
+ const uint8_t &mismatch_penalty,
+ const uint8_t &gap_opening_penalty,
+ const uint8_t &gap_extending_penalty);
+
+ // =========
+ // @function Construct an Aligner from the given matrices.
+ // =========
+ Aligner(const int8_t *score_matrix,
+ const int &score_matrix_size,
+ const int8_t *translation_matrix,
+ const int &translation_matrix_size);
+
+ ~Aligner(void);
+
+ // =========
+ // @function Build the reference sequence so that
+ // Align(const char* query, s_align* alignment) can be used;
+ // otherwise the reference must be given when aligning.
+ // [NOTICE] If a reference sequence already exists, it will be deleted
+ // and replaced.
+ // @param seq The reference bases;
+ // [NOTICE] it need not be null-terminated.
+ // @param length The number of bases to build.
+ // @return The length of the built reference.
+ // =========
+ int SetReferenceSequence(const char *seq, const int &length);
+
+ void CleanReferenceSequence(void);
+
+ // =========
+ // @function Set penalties for opening and extending gaps
+ // [NOTICE] The defaults are 3 and 1 respectively.
+ // =========
+ void SetGapPenalty(const uint8_t &opening, const uint8_t &extending) {
+ gap_opening_penalty_ = opening;
+ gap_extending_penalty_ = extending;
+ };
+
+ // =========
+ // @function Align the query against the reference that was set by
+ // SetReferenceSequence.
+ // @param query The query sequence.
+ // @param filter The filter for the alignment.
+ // @param alignment The container that holds the result.
+ // @return True on success; false on failure.
+ // =========
+ bool Align(const char *query, const Filter &filter, Alignment *alignment) const;
+
+ // =========
+ // @function Align the query against the given reference.
+ // [NOTICE] This reference does not replace the reference
+ // set by SetReferenceSequence.
+ // @param query The query sequence.
+ // @param ref The reference sequence;
+ // [NOTICE] it need not be null-terminated.
+ // @param ref_len The length of the reference sequence.
+ // @param filter The filter for the alignment.
+ // @param alignment The container that holds the result.
+ // @return True on success; false on failure.
+ // =========
+ bool Align(const char *query, const char *ref, const int &ref_len,
+ const Filter &filter, Alignment *alignment) const;
+
+ // @function Clear all containers; the aligner is disabled afterwards.
+ // To rebuild the aligner, please use the ReBuild functions.
+ void Clear(void);
+
+ // =========
+ // @function Rebuild the aligner with default values.
+ // [NOTICE] If the aligner is not cleaned, rebuilding will fail.
+ // @return True: succeed; false: fail.
+ // =========
+ bool ReBuild(void);
+
+ // =========
+ // @function Rebuild the aligner by assigning scores.
+ // [NOTICE] If the aligner is not cleaned, rebuilding will fail.
+ // @return True: succeed; false: fail.
+ // =========
+ bool ReBuild(
+ const uint8_t &match_score,
+ const uint8_t &mismatch_penalty,
+ const uint8_t &gap_opening_penalty,
+ const uint8_t &gap_extending_penalty);
+
+ // =========
+ // @function Rebuild the aligner from the given matrices.
+ // [NOTICE] If the aligner is not cleaned, rebuilding will fail.
+ // @return True: succeed; false: fail.
+ // =========
+ bool ReBuild(
+ const int8_t *score_matrix,
+ const int &score_matrix_size,
+ const int8_t *translation_matrix,
+ const int &translation_matrix_size);
+
+private:
+ int8_t *score_matrix_;
+ int score_matrix_size_;
+ int8_t *translation_matrix_;
+
+ uint8_t match_score_; // default: 2
+ uint8_t mismatch_penalty_; // default: 2
+ uint8_t gap_opening_penalty_; // default: 3
+ uint8_t gap_extending_penalty_; // default: 1
+
+ int8_t *translated_reference_;
+ int32_t reference_length_;
+
+ int TranslateBase(const char *bases, const int &length, int8_t *translated) const;
+
+ void SetAllDefault(void);
+
+ void BuildDefaultMatrix(void);
+
+ void ClearMatrices(void);
+
+ Aligner &operator=(const Aligner &);
+
+ Aligner(const Aligner &);
+}; // class Aligner
+
+
+// ================
+// inline functions
+// ================
+inline void Aligner::CleanReferenceSequence(void) {
+ if (reference_length_ == 0) return;
+
+ // delete the current buffer
+ if (reference_length_ > 1) delete[] translated_reference_;
+ else delete translated_reference_;
+
+ reference_length_ = 0;
+}
+} // namespace StripedSmithWaterman
+
+#endif // COMPLETE_STRIPED_SMITH_WATERMAN_CPP_H_
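
A minimal sketch of driving the Aligner declared above; the sequences are made up:

    #include "ssw/ssw_cpp.h"
    #include <iostream>

    int main() {
      StripedSmithWaterman::Aligner aligner;     // default {A,C,G,T,N} scoring
      StripedSmithWaterman::Filter filter;       // report positions and CIGAR by default
      StripedSmithWaterman::Alignment alignment;

      const char *ref   = "ACGTACGTACGTACGT";    // 16 bases
      const char *query = "CGTACGTAC";

      if (aligner.Align(query, ref, 16, filter, &alignment))
        std::cout << "score=" << alignment.sw_score
                  << " ref_begin=" << alignment.ref_begin
                  << " cigar=" << alignment.cigar_string << std::endl;
      return 0;
    }
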
diff --git a/ext/include/yaml-cpp/anchor.h b/ext/include/yaml-cpp/anchor.h
deleted file mode 100644
index 433f2fa..0000000
--- a/ext/include/yaml-cpp/anchor.h
+++ /dev/null
@@ -1,16 +0,0 @@
-#ifndef ANCHOR_H_62B23520_7C8E_11DE_8A39_0800200C9A66
-#define ANCHOR_H_62B23520_7C8E_11DE_8A39_0800200C9A66
-
-#if defined(_MSC_VER) || (defined(__GNUC__) && (__GNUC__ == 3 && __GNUC_MINOR__ >= 4) || (__GNUC__ >= 4)) // GCC supports "pragma once" correctly since 3.4
-#pragma once
-#endif
-
-#include <cstddef>
-
-namespace YAML
-{
- typedef std::size_t anchor_t;
- const anchor_t NullAnchor = 0;
-}
-
-#endif // ANCHOR_H_62B23520_7C8E_11DE_8A39_0800200C9A66
diff --git a/ext/include/yaml-cpp/binary.h b/ext/include/yaml-cpp/binary.h
deleted file mode 100644
index edc0b2c..0000000
--- a/ext/include/yaml-cpp/binary.h
+++ /dev/null
@@ -1,62 +0,0 @@
-#ifndef BASE64_H_62B23520_7C8E_11DE_8A39_0800200C9A66
-#define BASE64_H_62B23520_7C8E_11DE_8A39_0800200C9A66
-
-#if defined(_MSC_VER) || (defined(__GNUC__) && (__GNUC__ == 3 && __GNUC_MINOR__ >= 4) || (__GNUC__ >= 4)) // GCC supports "pragma once" correctly since 3.4
-#pragma once
-#endif
-
-#include <string>
-#include <vector>
-
-namespace YAML
-{
- std::string EncodeBase64(const unsigned char *data, std::size_t size);
- std::vector<unsigned char> DecodeBase64(const std::string& input);
-
- class Binary {
- public:
- Binary(): m_unownedData(0), m_unownedSize(0) {}
- Binary(const unsigned char *data_, std::size_t size_): m_unownedData(data_), m_unownedSize(size_) {}
-
- bool owned() const { return !m_unownedData; }
- std::size_t size() const { return owned() ? m_data.size() : m_unownedSize; }
- const unsigned char *data() const { return owned() ? &m_data[0] : m_unownedData; }
-
- void swap(std::vector<unsigned char>& rhs) {
- if(m_unownedData) {
- m_data.swap(rhs);
- rhs.clear();
- rhs.resize(m_unownedSize);
- std::copy(m_unownedData, m_unownedData + m_unownedSize, &rhs[0]);
- m_unownedData = 0;
- m_unownedSize = 0;
- } else {
- m_data.swap(rhs);
- }
- }
-
- bool operator == (const Binary& rhs) const {
- const std::size_t s = size();
- if(s != rhs.size())
- return false;
- const unsigned char *d1 = data();
- const unsigned char *d2 = rhs.data();
- for(std::size_t i=0;i<s;i++) {
- if(*d1++ != *d2++)
- return false;
- }
- return true;
- }
-
- bool operator != (const Binary& rhs) const {
- return !(*this == rhs);
- }
-
- private:
- std::vector<unsigned char> m_data;
- const unsigned char *m_unownedData;
- std::size_t m_unownedSize;
- };
-}
-
-#endif // BASE64_H_62B23520_7C8E_11DE_8A39_0800200C9A66
diff --git a/ext/include/yaml-cpp/contrib/anchordict.h b/ext/include/yaml-cpp/contrib/anchordict.h
deleted file mode 100644
index e483dc4..0000000
--- a/ext/include/yaml-cpp/contrib/anchordict.h
+++ /dev/null
@@ -1,42 +0,0 @@
-#ifndef ANCHORDICT_H_62B23520_7C8E_11DE_8A39_0800200C9A66
-#define ANCHORDICT_H_62B23520_7C8E_11DE_8A39_0800200C9A66
-
-#if defined(_MSC_VER) || (defined(__GNUC__) && (__GNUC__ == 3 && __GNUC_MINOR__ >= 4) || (__GNUC__ >= 4)) // GCC supports "pragma once" correctly since 3.4
-#pragma once
-#endif
-
-#include <vector>
-
-#include "../anchor.h"
-
-namespace YAML
-{
- /// AnchorDict
- /// . An object that stores and retrieves values correlating to anchor_t
- /// values.
- /// . Efficient implementation that can make assumptions about how anchor_t
- /// values are assigned by the Parser class.
- template <class T>
- class AnchorDict
- {
- public:
- void Register(anchor_t anchor, T value)
- {
- if (anchor > m_data.size())
- {
- m_data.resize(anchor);
- }
- m_data[anchor - 1] = value;
- }
-
- T Get(anchor_t anchor) const
- {
- return m_data[anchor - 1];
- }
-
- private:
- std::vector<T> m_data;
- };
-}
-
-#endif // ANCHORDICT_H_62B23520_7C8E_11DE_8A39_0800200C9A66
diff --git a/ext/include/yaml-cpp/contrib/graphbuilder.h b/ext/include/yaml-cpp/contrib/graphbuilder.h
deleted file mode 100644
index 6739a12..0000000
--- a/ext/include/yaml-cpp/contrib/graphbuilder.h
+++ /dev/null
@@ -1,133 +0,0 @@
-#ifndef GRAPHBUILDER_H_62B23520_7C8E_11DE_8A39_0800200C9A66
-#define GRAPHBUILDER_H_62B23520_7C8E_11DE_8A39_0800200C9A66
-
-#if defined(_MSC_VER) || (defined(__GNUC__) && (__GNUC__ == 3 && __GNUC_MINOR__ >= 4) || (__GNUC__ >= 4)) // GCC supports "pragma once" correctly since 3.4
-#pragma once
-#endif
-
-#include "yaml-cpp/mark.h"
-#include <string>
-
-namespace YAML
-{
- class Parser;
-
- // GraphBuilderInterface
- // . Abstraction of node creation
- // . pParentNode is always NULL or the return value of one of the NewXXX()
- // functions.
- class GraphBuilderInterface
- {
- public:
- // Create and return a new node with a null value.
- virtual void *NewNull(const Mark& mark, void *pParentNode) = 0;
-
- // Create and return a new node with the given tag and value.
- virtual void *NewScalar(const Mark& mark, const std::string& tag, void *pParentNode, const std::string& value) = 0;
-
- // Create and return a new sequence node
- virtual void *NewSequence(const Mark& mark, const std::string& tag, void *pParentNode) = 0;
- // Add pNode to pSequence. pNode was created with one of the NewXxx()
- // functions and pSequence with NewSequence().
- virtual void AppendToSequence(void *pSequence, void *pNode) = 0;
- // Note that no more entries will be added to pSequence
- virtual void SequenceComplete(void *pSequence) {(void)pSequence;}
-
- // Create and return a new map node
- virtual void *NewMap(const Mark& mark, const std::string& tag, void *pParentNode) = 0;
- // Add the pKeyNode => pValueNode mapping to pMap. pKeyNode and pValueNode
- // were created with one of the NewXxx() methods and pMap with NewMap().
- virtual void AssignInMap(void *pMap, void *pKeyNode, void *pValueNode) = 0;
- // Note that no more assignments will be made in pMap
- virtual void MapComplete(void *pMap) {(void)pMap;}
-
- // Return the node that should be used in place of an alias referencing
- // pNode (pNode by default)
- virtual void *AnchorReference(const Mark& mark, void *pNode) {(void)mark; return pNode;}
- };
-
- // Typesafe wrapper for GraphBuilderInterface. Assumes that Impl defines
- // Node, Sequence, and Map types. Sequence and Map must derive from Node
- // (unless Node is defined as void). Impl must also implement function with
- // all of the same names as the virtual functions in GraphBuilderInterface
- // -- including the ones with default implementations -- but with the
- // prototypes changed to accept an explicit Node*, Sequence*, or Map* where
- // appropriate.
- template <class Impl>
- class GraphBuilder : public GraphBuilderInterface
- {
- public:
- typedef typename Impl::Node Node;
- typedef typename Impl::Sequence Sequence;
- typedef typename Impl::Map Map;
-
- GraphBuilder(Impl& impl) : m_impl(impl)
- {
- Map* pMap = NULL;
- Sequence* pSeq = NULL;
- Node* pNode = NULL;
-
- // Type consistency checks
- pNode = pMap;
- pNode = pSeq;
- }
-
- GraphBuilderInterface& AsBuilderInterface() {return *this;}
-
- virtual void *NewNull(const Mark& mark, void* pParentNode) {
- return CheckType<Node>(m_impl.NewNull(mark, AsNode(pParentNode)));
- }
-
- virtual void *NewScalar(const Mark& mark, const std::string& tag, void *pParentNode, const std::string& value) {
- return CheckType<Node>(m_impl.NewScalar(mark, tag, AsNode(pParentNode), value));
- }
-
- virtual void *NewSequence(const Mark& mark, const std::string& tag, void *pParentNode) {
- return CheckType<Sequence>(m_impl.NewSequence(mark, tag, AsNode(pParentNode)));
- }
- virtual void AppendToSequence(void *pSequence, void *pNode) {
- m_impl.AppendToSequence(AsSequence(pSequence), AsNode(pNode));
- }
- virtual void SequenceComplete(void *pSequence) {
- m_impl.SequenceComplete(AsSequence(pSequence));
- }
-
- virtual void *NewMap(const Mark& mark, const std::string& tag, void *pParentNode) {
- return CheckType<Map>(m_impl.NewMap(mark, tag, AsNode(pParentNode)));
- }
- virtual void AssignInMap(void *pMap, void *pKeyNode, void *pValueNode) {
- m_impl.AssignInMap(AsMap(pMap), AsNode(pKeyNode), AsNode(pValueNode));
- }
- virtual void MapComplete(void *pMap) {
- m_impl.MapComplete(AsMap(pMap));
- }
-
- virtual void *AnchorReference(const Mark& mark, void *pNode) {
- return CheckType<Node>(m_impl.AnchorReference(mark, AsNode(pNode)));
- }
-
- private:
- Impl& m_impl;
-
- // Static check for pointer to T
- template <class T, class U>
- static T* CheckType(U* p) {return p;}
-
- static Node *AsNode(void *pNode) {return static_cast<Node*>(pNode);}
- static Sequence *AsSequence(void *pSeq) {return static_cast<Sequence*>(pSeq);}
- static Map *AsMap(void *pMap) {return static_cast<Map*>(pMap);}
- };
-
- void *BuildGraphOfNextDocument(Parser& parser, GraphBuilderInterface& graphBuilder);
-
- template <class Impl>
- typename Impl::Node *BuildGraphOfNextDocument(Parser& parser, Impl& impl)
- {
- GraphBuilder<Impl> graphBuilder(impl);
- return static_cast<typename Impl::Node *>(BuildGraphOfNextDocument(
- parser, graphBuilder
- ));
- }
-}
-
-#endif // GRAPHBUILDER_H_62B23520_7C8E_11DE_8A39_0800200C9A66
diff --git a/ext/include/yaml-cpp/dll.h b/ext/include/yaml-cpp/dll.h
deleted file mode 100644
index ea13840..0000000
--- a/ext/include/yaml-cpp/dll.h
+++ /dev/null
@@ -1,28 +0,0 @@
-#ifndef DLL_H_62B23520_7C8E_11DE_8A39_0800200C9A66
-#define DLL_H_62B23520_7C8E_11DE_8A39_0800200C9A66
-
-#if defined(_MSC_VER) || (defined(__GNUC__) && (__GNUC__ == 3 && __GNUC_MINOR__ >= 4) || (__GNUC__ >= 4)) // GCC supports "pragma once" correctly since 3.4
-#pragma once
-#endif
-
-// The following ifdef block is the standard way of creating macros which make exporting
-// from a DLL simpler. All files within this DLL are compiled with the yaml_cpp_EXPORTS
-// symbol defined on the command line. this symbol should not be defined on any project
-// that uses this DLL. This way any other project whose source files include this file see
-// YAML_CPP_API functions as being imported from a DLL, whereas this DLL sees symbols
-// defined with this macro as being exported.
-#undef YAML_CPP_API
-
-#ifdef YAML_CPP_DLL // Using or Building YAML-CPP DLL (definition defined manually)
- #ifdef yaml_cpp_EXPORTS // Building YAML-CPP DLL (definition created by CMake or defined manually)
- // #pragma message( "Defining YAML_CPP_API for DLL export" )
- #define YAML_CPP_API __declspec(dllexport)
- #else // yaml_cpp_EXPORTS
- // #pragma message( "Defining YAML_CPP_API for DLL import" )
- #define YAML_CPP_API __declspec(dllimport)
- #endif // yaml_cpp_EXPORTS
-#else //YAML_CPP_DLL
-#define YAML_CPP_API
-#endif // YAML_CPP_DLL
-
-#endif // DLL_H_62B23520_7C8E_11DE_8A39_0800200C9A66
diff --git a/ext/include/yaml-cpp/emitfromevents.h b/ext/include/yaml-cpp/emitfromevents.h
deleted file mode 100644
index e11ae64..0000000
--- a/ext/include/yaml-cpp/emitfromevents.h
+++ /dev/null
@@ -1,45 +0,0 @@
-#ifndef EMITFROMEVENTS_H_62B23520_7C8E_11DE_8A39_0800200C9A66
-#define EMITFROMEVENTS_H_62B23520_7C8E_11DE_8A39_0800200C9A66
-
-#if defined(_MSC_VER) || (defined(__GNUC__) && (__GNUC__ == 3 && __GNUC_MINOR__ >= 4) || (__GNUC__ >= 4)) // GCC supports "pragma once" correctly since 3.4
-#pragma once
-#endif
-
-#include "yaml-cpp/eventhandler.h"
-#include <stack>
-
-namespace YAML
-{
- class Emitter;
-
- class EmitFromEvents: public EventHandler
- {
- public:
- EmitFromEvents(Emitter& emitter);
-
- virtual void OnDocumentStart(const Mark& mark);
- virtual void OnDocumentEnd();
-
- virtual void OnNull(const Mark& mark, anchor_t anchor);
- virtual void OnAlias(const Mark& mark, anchor_t anchor);
- virtual void OnScalar(const Mark& mark, const std::string& tag, anchor_t anchor, const std::string& value);
-
- virtual void OnSequenceStart(const Mark& mark, const std::string& tag, anchor_t anchor);
- virtual void OnSequenceEnd();
-
- virtual void OnMapStart(const Mark& mark, const std::string& tag, anchor_t anchor);
- virtual void OnMapEnd();
-
- private:
- void BeginNode();
- void EmitProps(const std::string& tag, anchor_t anchor);
-
- private:
- Emitter& m_emitter;
-
- struct State { enum value { WaitingForSequenceEntry, WaitingForKey, WaitingForValue }; };
- std::stack<State::value> m_stateStack;
- };
-}
-
-#endif // EMITFROMEVENTS_H_62B23520_7C8E_11DE_8A39_0800200C9A66
diff --git a/ext/include/yaml-cpp/emitter.h b/ext/include/yaml-cpp/emitter.h
deleted file mode 100644
index 927fe6c..0000000
--- a/ext/include/yaml-cpp/emitter.h
+++ /dev/null
@@ -1,209 +0,0 @@
-#ifndef EMITTER_H_62B23520_7C8E_11DE_8A39_0800200C9A66
-#define EMITTER_H_62B23520_7C8E_11DE_8A39_0800200C9A66
-
-#if defined(_MSC_VER) || (defined(__GNUC__) && (__GNUC__ == 3 && __GNUC_MINOR__ >= 4) || (__GNUC__ >= 4)) // GCC supports "pragma once" correctly since 3.4
-#pragma once
-#endif
-
-
-#include "yaml-cpp/dll.h"
-#include "yaml-cpp/binary.h"
-#include "yaml-cpp/emitterdef.h"
-#include "yaml-cpp/emittermanip.h"
-#include "yaml-cpp/noncopyable.h"
-#include "yaml-cpp/null.h"
-#include "yaml-cpp/ostream_wrapper.h"
-#include <memory>
-#include <string>
-#include <sstream>
-
-namespace YAML
-{
- class EmitterState;
-
- class YAML_CPP_API Emitter: private noncopyable
- {
- public:
- Emitter();
- explicit Emitter(std::ostream& stream);
- ~Emitter();
-
- // output
- const char *c_str() const;
- std::size_t size() const;
-
- // state checking
- bool good() const;
- const std::string GetLastError() const;
-
- // global setters
- bool SetOutputCharset(EMITTER_MANIP value);
- bool SetStringFormat(EMITTER_MANIP value);
- bool SetBoolFormat(EMITTER_MANIP value);
- bool SetIntBase(EMITTER_MANIP value);
- bool SetSeqFormat(EMITTER_MANIP value);
- bool SetMapFormat(EMITTER_MANIP value);
- bool SetIndent(unsigned n);
- bool SetPreCommentIndent(unsigned n);
- bool SetPostCommentIndent(unsigned n);
- bool SetFloatPrecision(unsigned n);
- bool SetDoublePrecision(unsigned n);
-
- // local setters
- Emitter& SetLocalValue(EMITTER_MANIP value);
- Emitter& SetLocalIndent(const _Indent& indent);
- Emitter& SetLocalPrecision(const _Precision& precision);
-
- // overloads of write
- Emitter& Write(const std::string& str);
- Emitter& Write(bool b);
- Emitter& Write(char ch);
- Emitter& Write(const _Alias& alias);
- Emitter& Write(const _Anchor& anchor);
- Emitter& Write(const _Tag& tag);
- Emitter& Write(const _Comment& comment);
- Emitter& Write(const _Null& n);
- Emitter& Write(const Binary& binary);
-
- template <typename T>
- Emitter& WriteIntegralType(T value);
-
- template <typename T>
- Emitter& WriteStreamable(T value);
-
- private:
- template<typename T> void SetStreamablePrecision(std::stringstream&) {}
- unsigned GetFloatPrecision() const;
- unsigned GetDoublePrecision() const;
-
- void PrepareIntegralStream(std::stringstream& stream) const;
- void StartedScalar();
-
- private:
- void EmitBeginDoc();
- void EmitEndDoc();
- void EmitBeginSeq();
- void EmitEndSeq();
- void EmitBeginMap();
- void EmitEndMap();
- void EmitNewline();
- void EmitKindTag();
- void EmitTag(bool verbatim, const _Tag& tag);
-
- void PrepareNode(EmitterNodeType::value child);
- void PrepareTopNode(EmitterNodeType::value child);
- void FlowSeqPrepareNode(EmitterNodeType::value child);
- void BlockSeqPrepareNode(EmitterNodeType::value child);
-
- void FlowMapPrepareNode(EmitterNodeType::value child);
-
- void FlowMapPrepareLongKey(EmitterNodeType::value child);
- void FlowMapPrepareLongKeyValue(EmitterNodeType::value child);
- void FlowMapPrepareSimpleKey(EmitterNodeType::value child);
- void FlowMapPrepareSimpleKeyValue(EmitterNodeType::value child);
-
- void BlockMapPrepareNode(EmitterNodeType::value child);
-
- void BlockMapPrepareLongKey(EmitterNodeType::value child);
- void BlockMapPrepareLongKeyValue(EmitterNodeType::value child);
- void BlockMapPrepareSimpleKey(EmitterNodeType::value child);
- void BlockMapPrepareSimpleKeyValue(EmitterNodeType::value child);
-
- void SpaceOrIndentTo(bool requireSpace, unsigned indent);
-
- const char *ComputeFullBoolName(bool b) const;
- bool CanEmitNewline() const;
-
- private:
- std::auto_ptr<EmitterState> m_pState;
- ostream_wrapper m_stream;
- };
-
- template <typename T>
- inline Emitter& Emitter::WriteIntegralType(T value)
- {
- if(!good())
- return *this;
-
- PrepareNode(EmitterNodeType::Scalar);
-
- std::stringstream stream;
- PrepareIntegralStream(stream);
- stream << value;
- m_stream << stream.str();
-
- StartedScalar();
-
- return *this;
- }
-
- template <typename T>
- inline Emitter& Emitter::WriteStreamable(T value)
- {
- if(!good())
- return *this;
-
- PrepareNode(EmitterNodeType::Scalar);
-
- std::stringstream stream;
- SetStreamablePrecision<T>(stream);
- stream << value;
- m_stream << stream.str();
-
- StartedScalar();
-
- return *this;
- }
-
- template<>
- inline void Emitter::SetStreamablePrecision<float>(std::stringstream& stream)
- {
- stream.precision(GetFloatPrecision());
- }
-
- template<>
- inline void Emitter::SetStreamablePrecision<double>(std::stringstream& stream)
- {
- stream.precision(GetDoublePrecision());
- }
-
- // overloads of insertion
- inline Emitter& operator << (Emitter& emitter, const std::string& v) { return emitter.Write(v); }
- inline Emitter& operator << (Emitter& emitter, bool v) { return emitter.Write(v); }
- inline Emitter& operator << (Emitter& emitter, char v) { return emitter.Write(v); }
- inline Emitter& operator << (Emitter& emitter, unsigned char v) { return emitter.Write(static_cast<char>(v)); }
- inline Emitter& operator << (Emitter& emitter, const _Alias& v) { return emitter.Write(v); }
- inline Emitter& operator << (Emitter& emitter, const _Anchor& v) { return emitter.Write(v); }
- inline Emitter& operator << (Emitter& emitter, const _Tag& v) { return emitter.Write(v); }
- inline Emitter& operator << (Emitter& emitter, const _Comment& v) { return emitter.Write(v); }
- inline Emitter& operator << (Emitter& emitter, const _Null& v) { return emitter.Write(v); }
- inline Emitter& operator << (Emitter& emitter, const Binary& b) { return emitter.Write(b); }
-
- inline Emitter& operator << (Emitter& emitter, const char *v) { return emitter.Write(std::string(v)); }
-
- inline Emitter& operator << (Emitter& emitter, int v) { return emitter.WriteIntegralType(v); }
- inline Emitter& operator << (Emitter& emitter, unsigned int v) { return emitter.WriteIntegralType(v); }
- inline Emitter& operator << (Emitter& emitter, short v) { return emitter.WriteIntegralType(v); }
- inline Emitter& operator << (Emitter& emitter, unsigned short v) { return emitter.WriteIntegralType(v); }
- inline Emitter& operator << (Emitter& emitter, long v) { return emitter.WriteIntegralType(v); }
- inline Emitter& operator << (Emitter& emitter, unsigned long v) { return emitter.WriteIntegralType(v); }
- inline Emitter& operator << (Emitter& emitter, long long v) { return emitter.WriteIntegralType(v); }
- inline Emitter& operator << (Emitter& emitter, unsigned long long v) { return emitter.WriteIntegralType(v); }
-
- inline Emitter& operator << (Emitter& emitter, float v) { return emitter.WriteStreamable(v); }
- inline Emitter& operator << (Emitter& emitter, double v) { return emitter.WriteStreamable(v); }
-
- inline Emitter& operator << (Emitter& emitter, EMITTER_MANIP value) {
- return emitter.SetLocalValue(value);
- }
-
- inline Emitter& operator << (Emitter& emitter, _Indent indent) {
- return emitter.SetLocalIndent(indent);
- }
-
- inline Emitter& operator << (Emitter& emitter, _Precision precision) {
- return emitter.SetLocalPrecision(precision);
- }
-}
-
-#endif // EMITTER_H_62B23520_7C8E_11DE_8A39_0800200C9A66
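
The Emitter removed above is driven with the manipulators from emittermanip.h; a short sketch of the stream-style interface (keys and values are made up):

    #include "yaml-cpp/emitter.h"
    #include <iostream>

    int main() {
      YAML::Emitter out;
      out << YAML::BeginMap;
      out << YAML::Key << "reads" << YAML::Value << "reads_1.fastq";
      out << YAML::Key << "k" << YAML::Value << 55;
      out << YAML::EndMap;
      std::cout << out.c_str() << std::endl;   // the buffered YAML document
      return 0;
    }
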
diff --git a/ext/include/yaml-cpp/emitterdef.h b/ext/include/yaml-cpp/emitterdef.h
deleted file mode 100644
index c5ca00b..0000000
--- a/ext/include/yaml-cpp/emitterdef.h
+++ /dev/null
@@ -1,13 +0,0 @@
-#ifndef EMITTERDEF_H_62B23520_7C8E_11DE_8A39_0800200C9A66
-#define EMITTERDEF_H_62B23520_7C8E_11DE_8A39_0800200C9A66
-
-#if defined(_MSC_VER) || (defined(__GNUC__) && (__GNUC__ == 3 && __GNUC_MINOR__ >= 4) || (__GNUC__ >= 4)) // GCC supports "pragma once" correctly since 3.4
-#pragma once
-#endif
-
-namespace YAML
-{
- struct EmitterNodeType { enum value { None, Property, Scalar, FlowSeq, BlockSeq, FlowMap, BlockMap }; };
-}
-
-#endif // EMITTERDEF_H_62B23520_7C8E_11DE_8A39_0800200C9A66
diff --git a/ext/include/yaml-cpp/emittermanip.h b/ext/include/yaml-cpp/emittermanip.h
deleted file mode 100644
index a8ec64a..0000000
--- a/ext/include/yaml-cpp/emittermanip.h
+++ /dev/null
@@ -1,149 +0,0 @@
-#ifndef EMITTERMANIP_H_62B23520_7C8E_11DE_8A39_0800200C9A66
-#define EMITTERMANIP_H_62B23520_7C8E_11DE_8A39_0800200C9A66
-
-#if defined(_MSC_VER) || (defined(__GNUC__) && (__GNUC__ == 3 && __GNUC_MINOR__ >= 4) || (__GNUC__ >= 4)) // GCC supports "pragma once" correctly since 3.4
-#pragma once
-#endif
-
-
-#include <string>
-
-namespace YAML
-{
- enum EMITTER_MANIP {
- // general manipulators
- Auto,
- TagByKind,
- Newline,
-
- // output character set
- EmitNonAscii,
- EscapeNonAscii,
-
- // string manipulators
- // Auto, // duplicate
- SingleQuoted,
- DoubleQuoted,
- Literal,
-
- // bool manipulators
- YesNoBool, // yes, no
- TrueFalseBool, // true, false
- OnOffBool, // on, off
- UpperCase, // TRUE, N
- LowerCase, // f, yes
- CamelCase, // No, Off
- LongBool, // yes, On
- ShortBool, // y, t
-
- // int manipulators
- Dec,
- Hex,
- Oct,
-
- // document manipulators
- BeginDoc,
- EndDoc,
-
- // sequence manipulators
- BeginSeq,
- EndSeq,
- Flow,
- Block,
-
- // map manipulators
- BeginMap,
- EndMap,
- Key,
- Value,
- // Flow, // duplicate
- // Block, // duplicate
- // Auto, // duplicate
- LongKey
- };
-
- struct _Indent {
- _Indent(int value_): value(value_) {}
- int value;
- };
-
- inline _Indent Indent(int value) {
- return _Indent(value);
- }
-
- struct _Alias {
- _Alias(const std::string& content_): content(content_) {}
- std::string content;
- };
-
- inline _Alias Alias(const std::string content) {
- return _Alias(content);
- }
-
- struct _Anchor {
- _Anchor(const std::string& content_): content(content_) {}
- std::string content;
- };
-
- inline _Anchor Anchor(const std::string content) {
- return _Anchor(content);
- }
-
- struct _Tag {
- struct Type { enum value { Verbatim, PrimaryHandle, NamedHandle }; };
-
- explicit _Tag(const std::string& prefix_, const std::string& content_, Type::value type_)
- : prefix(prefix_), content(content_), type(type_)
- {
- }
- std::string prefix;
- std::string content;
- Type::value type;
- };
-
- inline _Tag VerbatimTag(const std::string content) {
- return _Tag("", content, _Tag::Type::Verbatim);
- }
-
- inline _Tag LocalTag(const std::string content) {
- return _Tag("", content, _Tag::Type::PrimaryHandle);
- }
-
- inline _Tag LocalTag(const std::string& prefix, const std::string content) {
- return _Tag(prefix, content, _Tag::Type::NamedHandle);
- }
-
- inline _Tag SecondaryTag(const std::string content) {
- return _Tag("", content, _Tag::Type::NamedHandle);
- }
-
- struct _Comment {
- _Comment(const std::string& content_): content(content_) {}
- std::string content;
- };
-
- inline _Comment Comment(const std::string content) {
- return _Comment(content);
- }
-
- struct _Precision {
- _Precision(int floatPrecision_, int doublePrecision_): floatPrecision(floatPrecision_), doublePrecision(doublePrecision_) {}
-
- int floatPrecision;
- int doublePrecision;
- };
-
- inline _Precision FloatPrecision(int n) {
- return _Precision(n, -1);
- }
-
- inline _Precision DoublePrecision(int n) {
- return _Precision(-1, n);
- }
-
- inline _Precision Precision(int n) {
- return _Precision(n, n);
- }
-}
-
-#endif // EMITTERMANIP_H_62B23520_7C8E_11DE_8A39_0800200C9A66
diff --git a/ext/include/yaml-cpp/eventhandler.h b/ext/include/yaml-cpp/eventhandler.h
deleted file mode 100644
index 3173a1f..0000000
--- a/ext/include/yaml-cpp/eventhandler.h
+++ /dev/null
@@ -1,36 +0,0 @@
-#ifndef EVENTHANDLER_H_62B23520_7C8E_11DE_8A39_0800200C9A66
-#define EVENTHANDLER_H_62B23520_7C8E_11DE_8A39_0800200C9A66
-
-#if defined(_MSC_VER) || (defined(__GNUC__) && (__GNUC__ == 3 && __GNUC_MINOR__ >= 4) || (__GNUC__ >= 4)) // GCC supports "pragma once" correctly since 3.4
-#pragma once
-#endif
-
-#include "yaml-cpp/anchor.h"
-#include <string>
-
-namespace YAML
-{
- struct Mark;
-
- class EventHandler
- {
- public:
- virtual ~EventHandler() {}
-
- virtual void OnDocumentStart(const Mark& mark) = 0;
- virtual void OnDocumentEnd() = 0;
-
- virtual void OnNull(const Mark& mark, anchor_t anchor) = 0;
- virtual void OnAlias(const Mark& mark, anchor_t anchor) = 0;
- virtual void OnScalar(const Mark& mark, const std::string& tag, anchor_t anchor, const std::string& value) = 0;
-
- virtual void OnSequenceStart(const Mark& mark, const std::string& tag, anchor_t anchor) = 0;
- virtual void OnSequenceEnd() = 0;
-
- virtual void OnMapStart(const Mark& mark, const std::string& tag, anchor_t anchor) = 0;
- virtual void OnMapEnd() = 0;
- };
-}
-
-#endif // EVENTHANDLER_H_62B23520_7C8E_11DE_8A39_0800200C9A66
-
diff --git a/ext/include/yaml-cpp/exceptions.h b/ext/include/yaml-cpp/exceptions.h
deleted file mode 100644
index 3a2bd22..0000000
--- a/ext/include/yaml-cpp/exceptions.h
+++ /dev/null
@@ -1,201 +0,0 @@
-#ifndef EXCEPTIONS_H_62B23520_7C8E_11DE_8A39_0800200C9A66
-#define EXCEPTIONS_H_62B23520_7C8E_11DE_8A39_0800200C9A66
-
-#if defined(_MSC_VER) || (defined(__GNUC__) && (__GNUC__ == 3 && __GNUC_MINOR__ >= 4) || (__GNUC__ >= 4)) // GCC supports "pragma once" correctly since 3.4
-#pragma once
-#endif
-
-
-#include "yaml-cpp/mark.h"
-#include "yaml-cpp/traits.h"
-#include <stdexcept>
-#include <string>
-#include <sstream>
-
-namespace YAML
-{
- // error messages
- namespace ErrorMsg
- {
- const char * const YAML_DIRECTIVE_ARGS = "YAML directives must have exactly one argument";
- const char * const YAML_VERSION = "bad YAML version: ";
- const char * const YAML_MAJOR_VERSION = "YAML major version too large";
- const char * const REPEATED_YAML_DIRECTIVE= "repeated YAML directive";
- const char * const TAG_DIRECTIVE_ARGS = "TAG directives must have exactly two arguments";
- const char * const REPEATED_TAG_DIRECTIVE = "repeated TAG directive";
- const char * const CHAR_IN_TAG_HANDLE = "illegal character found while scanning tag handle";
- const char * const TAG_WITH_NO_SUFFIX = "tag handle with no suffix";
- const char * const END_OF_VERBATIM_TAG = "end of verbatim tag not found";
- const char * const END_OF_MAP = "end of map not found";
- const char * const END_OF_MAP_FLOW = "end of map flow not found";
- const char * const END_OF_SEQ = "end of sequence not found";
- const char * const END_OF_SEQ_FLOW = "end of sequence flow not found";
- const char * const MULTIPLE_TAGS = "cannot assign multiple tags to the same node";
- const char * const MULTIPLE_ANCHORS = "cannot assign multiple anchors to the same node";
- const char * const MULTIPLE_ALIASES = "cannot assign multiple aliases to the same node";
- const char * const ALIAS_CONTENT = "aliases can't have any content, *including* tags";
- const char * const INVALID_HEX = "bad character found while scanning hex number";
- const char * const INVALID_UNICODE = "invalid unicode: ";
- const char * const INVALID_ESCAPE = "unknown escape character: ";
- const char * const UNKNOWN_TOKEN = "unknown token";
- const char * const DOC_IN_SCALAR = "illegal document indicator in scalar";
- const char * const EOF_IN_SCALAR = "illegal EOF in scalar";
- const char * const CHAR_IN_SCALAR = "illegal character in scalar";
- const char * const TAB_IN_INDENTATION = "illegal tab when looking for indentation";
- const char * const FLOW_END = "illegal flow end";
- const char * const BLOCK_ENTRY = "illegal block entry";
- const char * const MAP_KEY = "illegal map key";
- const char * const MAP_VALUE = "illegal map value";
- const char * const ALIAS_NOT_FOUND = "alias not found after *";
- const char * const ANCHOR_NOT_FOUND = "anchor not found after &";
- const char * const CHAR_IN_ALIAS = "illegal character found while scanning alias";
- const char * const CHAR_IN_ANCHOR = "illegal character found while scanning anchor";
- const char * const ZERO_INDENT_IN_BLOCK = "cannot set zero indentation for a block scalar";
- const char * const CHAR_IN_BLOCK = "unexpected character in block scalar";
- const char * const AMBIGUOUS_ANCHOR = "cannot assign the same alias to multiple nodes";
- const char * const UNKNOWN_ANCHOR = "the referenced anchor is not defined";
-
- const char * const INVALID_SCALAR = "invalid scalar";
- const char * const KEY_NOT_FOUND = "key not found";
- const char * const BAD_CONVERSION = "bad conversion";
- const char * const BAD_DEREFERENCE = "bad dereference";
- const char * const BAD_SUBSCRIPT = "operator[] call on a scalar";
- const char * const BAD_PUSHBACK = "appending to a non-sequence";
- const char * const BAD_INSERT = "inserting in a non-convertible-to-map";
-
- const char * const UNMATCHED_GROUP_TAG = "unmatched group tag";
- const char * const UNEXPECTED_END_SEQ = "unexpected end sequence token";
- const char * const UNEXPECTED_END_MAP = "unexpected end map token";
- const char * const SINGLE_QUOTED_CHAR = "invalid character in single-quoted string";
- const char * const INVALID_ANCHOR = "invalid anchor";
- const char * const INVALID_ALIAS = "invalid alias";
- const char * const INVALID_TAG = "invalid tag";
- const char * const BAD_FILE = "bad file";
-
- template <typename T>
- inline const std::string KEY_NOT_FOUND_WITH_KEY(const T&, typename disable_if<is_numeric<T> >::type * = 0) {
- return KEY_NOT_FOUND;
- }
-
- inline const std::string KEY_NOT_FOUND_WITH_KEY(const std::string& key) {
- std::stringstream stream;
- stream << KEY_NOT_FOUND << ": " << key;
- return stream.str();
- }
-
- template <typename T>
- inline const std::string KEY_NOT_FOUND_WITH_KEY(const T& key, typename enable_if<is_numeric<T> >::type * = 0) {
- std::stringstream stream;
- stream << KEY_NOT_FOUND << ": " << key;
- return stream.str();
- }
- }
-
- class Exception: public std::runtime_error {
- public:
- Exception(const Mark& mark_, const std::string& msg_)
- : std::runtime_error(build_what(mark_, msg_)), mark(mark_), msg(msg_) {}
- virtual ~Exception() throw() {}
-
- Mark mark;
- std::string msg;
-
- private:
- static const std::string build_what(const Mark& mark, const std::string& msg) {
- std::stringstream output;
- output << "yaml-cpp: error at line " << mark.line+1 << ", column " << mark.column+1 << ": " << msg;
- return output.str();
- }
- };
-
- class ParserException: public Exception {
- public:
- ParserException(const Mark& mark_, const std::string& msg_)
- : Exception(mark_, msg_) {}
- };
-
- class RepresentationException: public Exception {
- public:
- RepresentationException(const Mark& mark_, const std::string& msg_)
- : Exception(mark_, msg_) {}
- };
-
- // representation exceptions
- class InvalidScalar: public RepresentationException {
- public:
- InvalidScalar(const Mark& mark_)
- : RepresentationException(mark_, ErrorMsg::INVALID_SCALAR) {}
- };
-
- class KeyNotFound: public RepresentationException {
- public:
- template <typename T>
- KeyNotFound(const Mark& mark_, const T& key_)
- : RepresentationException(mark_, ErrorMsg::KEY_NOT_FOUND_WITH_KEY(key_)) {}
- };
-
- template <typename T>
- class TypedKeyNotFound: public KeyNotFound {
- public:
- TypedKeyNotFound(const Mark& mark_, const T& key_)
- : KeyNotFound(mark_, key_), key(key_) {}
- virtual ~TypedKeyNotFound() throw() {}
-
- T key;
- };
-
- template <typename T>
- inline TypedKeyNotFound <T> MakeTypedKeyNotFound(const Mark& mark, const T& key) {
- return TypedKeyNotFound <T> (mark, key);
- }
-
- class BadConversion: public RepresentationException {
- public:
- BadConversion()
- : RepresentationException(Mark::null_mark(), ErrorMsg::BAD_CONVERSION) {}
- };
-
- template<typename T>
- class TypedBadConversion: public BadConversion {
- public:
- TypedBadConversion()
- : BadConversion() {}
- };
-
- class BadDereference: public RepresentationException {
- public:
- BadDereference()
- : RepresentationException(Mark::null_mark(), ErrorMsg::BAD_DEREFERENCE) {}
- };
-
- class BadSubscript: public RepresentationException {
- public:
- BadSubscript()
- : RepresentationException(Mark::null_mark(), ErrorMsg::BAD_SUBSCRIPT) {}
- };
-
- class BadPushback: public RepresentationException {
- public:
- BadPushback()
- : RepresentationException(Mark::null_mark(), ErrorMsg::BAD_PUSHBACK) {}
- };
-
- class BadInsert: public RepresentationException {
- public:
- BadInsert()
- : RepresentationException(Mark::null_mark(), ErrorMsg::BAD_INSERT) {}
- };
-
- class EmitterException: public Exception {
- public:
- EmitterException(const std::string& msg_)
- : Exception(Mark::null_mark(), msg_) {}
- };
-
- class BadFile: public Exception {
- public:
- BadFile(): Exception(Mark::null_mark(), ErrorMsg::BAD_FILE) {}
- };
-}
-
-#endif // EXCEPTIONS_H_62B23520_7C8E_11DE_8A39_0800200C9A66
diff --git a/ext/include/yaml-cpp/mark.h b/ext/include/yaml-cpp/mark.h
deleted file mode 100644
index e07b317..0000000
--- a/ext/include/yaml-cpp/mark.h
+++ /dev/null
@@ -1,26 +0,0 @@
-#ifndef MARK_H_62B23520_7C8E_11DE_8A39_0800200C9A66
-#define MARK_H_62B23520_7C8E_11DE_8A39_0800200C9A66
-
-#if defined(_MSC_VER) || (defined(__GNUC__) && (__GNUC__ == 3 && __GNUC_MINOR__ >= 4) || (__GNUC__ >= 4)) // GCC supports "pragma once" correctly since 3.4
-#pragma once
-#endif
-
-
-#include "yaml-cpp/dll.h"
-
-namespace YAML
-{
- struct YAML_CPP_API Mark {
- Mark(): pos(0), line(0), column(0) {}
-
- static const Mark null_mark() { return Mark(-1, -1, -1); }
-
- int pos;
- int line, column;
-
- private:
- Mark(int pos_, int line_, int column_): pos(pos_), line(line_), column(column_) {}
- };
-}
-
-#endif // MARK_H_62B23520_7C8E_11DE_8A39_0800200C9A66
diff --git a/ext/include/yaml-cpp/node/convert.h b/ext/include/yaml-cpp/node/convert.h
deleted file mode 100644
index 2e2f3f9..0000000
--- a/ext/include/yaml-cpp/node/convert.h
+++ /dev/null
@@ -1,224 +0,0 @@
-#ifndef NODE_CONVERT_H_62B23520_7C8E_11DE_8A39_0800200C9A66
-#define NODE_CONVERT_H_62B23520_7C8E_11DE_8A39_0800200C9A66
-
-#if defined(_MSC_VER) || (defined(__GNUC__) && (__GNUC__ == 3 && __GNUC_MINOR__ >= 4) || (__GNUC__ >= 4)) // GCC supports "pragma once" correctly since 3.4
-#pragma once
-#endif
-
-
-#include "yaml-cpp/binary.h"
-#include "yaml-cpp/node/node.h"
-#include "yaml-cpp/node/iterator.h"
-#include "yaml-cpp/null.h"
-#include <limits>
-#include <list>
-#include <map>
-#include <sstream>
-#include <vector>
-
-namespace YAML
-{
- namespace conversion {
- inline bool IsInfinity(const std::string& input) {
- return input == ".inf" || input == ".Inf" || input == ".INF" || input == "+.inf" || input == "+.Inf" || input == "+.INF";
- }
-
- inline bool IsNegativeInfinity(const std::string& input) {
- return input == "-.inf" || input == "-.Inf" || input == "-.INF";
- }
-
- inline bool IsNaN(const std::string& input) {
- return input == ".nan" || input == ".NaN" || input == ".NAN";
- }
- }
-
- // std::string
- template<>
- struct convert<std::string> {
- static Node encode(const std::string& rhs) {
- return Node(rhs);
- }
-
- static bool decode(const Node& node, std::string& rhs) {
- if(!node.IsScalar())
- return false;
- rhs = node.Scalar();
- return true;
- }
- };
-
- template<>
- struct convert<_Null> {
- static Node encode(const _Null& /* rhs */) {
- return Node();
- }
-
- static bool decode(const Node& node, _Null& /* rhs */) {
- return node.IsNull();
- }
- };
-
-#define YAML_DEFINE_CONVERT_STREAMABLE(type)\
- template<>\
- struct convert<type> {\
- static Node encode(const type& rhs) {\
- std::stringstream stream;\
- stream << rhs;\
- return Node(stream.str());\
- }\
- \
- static bool decode(const Node& node, type& rhs) {\
- if(node.Type() != NodeType::Scalar)\
- return false;\
- const std::string& input = node.Scalar();\
- std::stringstream stream(input);\
- stream.unsetf(std::ios::dec);\
- if((stream >> rhs) && (stream >> std::ws).eof())\
- return true;\
- if(std::numeric_limits<type>::has_infinity) {\
- if(conversion::IsInfinity(input)) {\
- rhs = std::numeric_limits<type>::infinity();\
- return true;\
- } else if(conversion::IsNegativeInfinity(input)) {\
- rhs = -std::numeric_limits<type>::infinity();\
- return true;\
- }\
- }\
- \
- if(std::numeric_limits<type>::has_quiet_NaN && conversion::IsNaN(input)) {\
- rhs = std::numeric_limits<type>::quiet_NaN();\
- return true;\
- }\
- \
- return false;\
- }\
- }
-
- YAML_DEFINE_CONVERT_STREAMABLE(int);
- YAML_DEFINE_CONVERT_STREAMABLE(unsigned);
- YAML_DEFINE_CONVERT_STREAMABLE(short);
- YAML_DEFINE_CONVERT_STREAMABLE(unsigned short);
- YAML_DEFINE_CONVERT_STREAMABLE(long);
- YAML_DEFINE_CONVERT_STREAMABLE(unsigned long);
- YAML_DEFINE_CONVERT_STREAMABLE(long long);
- YAML_DEFINE_CONVERT_STREAMABLE(unsigned long long);
-
- YAML_DEFINE_CONVERT_STREAMABLE(char);
- YAML_DEFINE_CONVERT_STREAMABLE(unsigned char);
-
- YAML_DEFINE_CONVERT_STREAMABLE(float);
- YAML_DEFINE_CONVERT_STREAMABLE(double);
- YAML_DEFINE_CONVERT_STREAMABLE(long double);
-
-#undef YAML_DEFINE_CONVERT_STREAMABLE
-
- // bool
- template<>
- struct convert<bool> {
- static Node encode(bool rhs) {
- return rhs ? Node("true") : Node("false");
- }
-
- static bool decode(const Node& node, bool& rhs);
- };
-
- // std::map
- template<typename K, typename V>
- struct convert<std::map<K, V> > {
- static Node encode(const std::map<K, V>& rhs) {
- Node node(NodeType::Map);
- for(typename std::map<K, V>::const_iterator it=rhs.begin();it!=rhs.end();++it)
- node.force_insert(it->first, it->second);
- return node;
- }
-
- static bool decode(const Node& node, std::map<K, V>& rhs) {
- if(!node.IsMap())
- return false;
-
- rhs.clear();
- for(const_iterator it=node.begin();it!=node.end();++it)
-#if defined(__GNUC__) && __GNUC__ < 4
-//workaround for GCC 3:
- rhs[it->first.template as<K>()] = it->second.template as<V>();
-#else
- rhs[it->first.as<K>()] = it->second.as<V>();
-#endif
- return true;
- }
- };
-
- // std::vector
- template<typename T>
- struct convert<std::vector<T> > {
- static Node encode(const std::vector<T>& rhs) {
- Node node(NodeType::Sequence);
- for(typename std::vector<T>::const_iterator it=rhs.begin();it!=rhs.end();++it)
- node.push_back(*it);
- return node;
- }
-
- static bool decode(const Node& node, std::vector<T>& rhs) {
- if(!node.IsSequence())
- return false;
-
- rhs.clear();
- for(const_iterator it=node.begin();it!=node.end();++it)
-#if defined(__GNUC__) && __GNUC__ < 4
-//workaround for GCC 3:
- rhs.push_back(it->template as<T>());
-#else
- rhs.push_back(it->as<T>());
-#endif
- return true;
- }
- };
-
- // std::list
- template<typename T>
- struct convert<std::list<T> > {
- static Node encode(const std::list<T>& rhs) {
- Node node(NodeType::Sequence);
- for(typename std::list<T>::const_iterator it=rhs.begin();it!=rhs.end();++it)
- node.push_back(*it);
- return node;
- }
-
- static bool decode(const Node& node, std::list<T>& rhs) {
- if(!node.IsSequence())
- return false;
-
- rhs.clear();
- for(const_iterator it=node.begin();it!=node.end();++it)
-#if defined(__GNUC__) && __GNUC__ < 4
-//workaround for GCC 3:
- rhs.push_back(it->template as<T>());
-#else
- rhs.push_back(it->as<T>());
-#endif
- return true;
- }
- };
-
- // binary
- template<>
- struct convert<Binary> {
- static Node encode(const Binary& rhs) {
- return Node(EncodeBase64(rhs.data(), rhs.size()));
- }
-
- static bool decode(const Node& node, Binary& rhs) {
- if(!node.IsScalar())
- return false;
-
- std::vector<unsigned char> data = DecodeBase64(node.Scalar());
- if(data.empty() && !node.Scalar().empty())
- return false;
-
- rhs.swap(data);
- return true;
- }
- };
-}
-
-#endif // NODE_CONVERT_H_62B23520_7C8E_11DE_8A39_0800200C9A66
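
The convert<> specializations removed above supply encode/decode for scalars and standard containers; a sketch of the round trip they provide (values are made up, and the usual yaml-cpp umbrella header is assumed):

    #include "yaml-cpp/yaml.h"
    #include <cassert>
    #include <vector>

    int main() {
      std::vector<int> kmers;
      kmers.push_back(21);
      kmers.push_back(33);
      kmers.push_back(55);

      YAML::Node node = YAML::convert<std::vector<int> >::encode(kmers);   // sequence node
      std::vector<int> round_trip;
      bool ok = YAML::convert<std::vector<int> >::decode(node, round_trip);
      assert(ok && round_trip.size() == 3 && round_trip[1] == 33);
      return 0;
    }
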
diff --git a/ext/include/yaml-cpp/node/detail/bool_type.h b/ext/include/yaml-cpp/node/detail/bool_type.h
deleted file mode 100644
index 80ed9a4..0000000
--- a/ext/include/yaml-cpp/node/detail/bool_type.h
+++ /dev/null
@@ -1,26 +0,0 @@
-#ifndef NODE_DETAIL_BOOL_TYPE_H_62B23520_7C8E_11DE_8A39_0800200C9A66
-#define NODE_DETAIL_BOOL_TYPE_H_62B23520_7C8E_11DE_8A39_0800200C9A66
-
-#if defined(_MSC_VER) || (defined(__GNUC__) && (__GNUC__ == 3 && __GNUC_MINOR__ >= 4) || (__GNUC__ >= 4)) // GCC supports "pragma once" correctly since 3.4
-#pragma once
-#endif
-
-namespace YAML
-{
- namespace detail
- {
- struct unspecified_bool {
- struct NOT_ALLOWED;
- static void true_value(NOT_ALLOWED*) {}
- };
- typedef void (*unspecified_bool_type)(unspecified_bool::NOT_ALLOWED*);
- }
-}
-
-#define YAML_CPP_OPERATOR_BOOL()\
-operator YAML::detail::unspecified_bool_type() const\
-{\
- return this->operator!() ? 0 : &YAML::detail::unspecified_bool::true_value;\
-}
-
-#endif // NODE_DETAIL_BOOL_TYPE_H_62B23520_7C8E_11DE_8A39_0800200C9A66
diff --git a/ext/include/yaml-cpp/node/detail/impl.h b/ext/include/yaml-cpp/node/detail/impl.h
deleted file mode 100644
index b04fc05..0000000
--- a/ext/include/yaml-cpp/node/detail/impl.h
+++ /dev/null
@@ -1,163 +0,0 @@
-#ifndef NODE_DETAIL_IMPL_H_62B23520_7C8E_11DE_8A39_0800200C9A66
-#define NODE_DETAIL_IMPL_H_62B23520_7C8E_11DE_8A39_0800200C9A66
-
-#if defined(_MSC_VER) || (defined(__GNUC__) && (__GNUC__ == 3 && __GNUC_MINOR__ >= 4) || (__GNUC__ >= 4)) // GCC supports "pragma once" correctly since 3.4
-#pragma once
-#endif
-
-
-#include "yaml-cpp/node/detail/node.h"
-#include "yaml-cpp/node/detail/node_data.h"
-#include <boost/type_traits.hpp>
-
-namespace YAML
-{
- namespace detail
- {
- template<typename Key, typename Enable = void>
- struct get_idx {
- static node *get(const std::vector<node *>& /* sequence */, const Key& /* key */, shared_memory_holder /* pMemory */) {
- return 0;
- }
- };
-
- template<typename Key>
- struct get_idx<Key, typename boost::enable_if<boost::is_unsigned<Key> >::type> {
- static node *get(const std::vector<node *>& sequence, const Key& key, shared_memory_holder /* pMemory */) {
- return key < sequence.size() ? sequence[key] : 0;
- }
-
- static node *get(std::vector<node *>& sequence, const Key& key, shared_memory_holder pMemory) {
- if(key > sequence.size())
- return 0;
- if(key == sequence.size())
- sequence.push_back(&pMemory->create_node());
- return sequence[key];
- }
- };
-
- template<typename Key>
- struct get_idx<Key, typename boost::enable_if<boost::is_signed<Key> >::type> {
- static node *get(const std::vector<node *>& sequence, const Key& key, shared_memory_holder pMemory) {
- return key >= 0 ? get_idx<std::size_t>::get(sequence, static_cast<std::size_t>(key), pMemory) : 0;
- }
- static node *get(std::vector<node *>& sequence, const Key& key, shared_memory_holder pMemory) {
- return key >= 0 ? get_idx<std::size_t>::get(sequence, static_cast<std::size_t>(key), pMemory) : 0;
- }
- };
-
- // indexing
- template<typename Key>
- inline node& node_data::get(const Key& key, shared_memory_holder pMemory) const
- {
- switch(m_type) {
- case NodeType::Map:
- break;
- case NodeType::Undefined:
- case NodeType::Null:
- return pMemory->create_node();
- case NodeType::Sequence:
- if(node *pNode = get_idx<Key>::get(m_sequence, key, pMemory))
- return *pNode;
- return pMemory->create_node();
- case NodeType::Scalar:
- throw BadSubscript();
- }
-
- for(node_map::const_iterator it=m_map.begin();it!=m_map.end();++it) {
- if(equals(*it->first, key, pMemory))
- return *it->second;
- }
-
- return pMemory->create_node();
- }
-
- template<typename Key>
- inline node& node_data::get(const Key& key, shared_memory_holder pMemory)
- {
- switch(m_type) {
- case NodeType::Map:
- break;
- case NodeType::Undefined:
- case NodeType::Null:
- case NodeType::Sequence:
- if(node *pNode = get_idx<Key>::get(m_sequence, key, pMemory)) {
- m_type = NodeType::Sequence;
- return *pNode;
- }
-
- convert_to_map(pMemory);
- break;
- case NodeType::Scalar:
- throw BadSubscript();
- }
-
- for(node_map::const_iterator it=m_map.begin();it!=m_map.end();++it) {
- if(equals(*it->first, key, pMemory))
- return *it->second;
- }
-
- node& k = convert_to_node(key, pMemory);
- node& v = pMemory->create_node();
- insert_map_pair(k, v);
- return v;
- }
-
- template<typename Key>
- inline bool node_data::remove(const Key& key, shared_memory_holder pMemory)
- {
- if(m_type != NodeType::Map)
- return false;
-
- for(node_map::iterator it=m_map.begin();it!=m_map.end();++it) {
- if(equals(*it->first, key, pMemory)) {
- m_map.erase(it);
- return true;
- }
- }
-
- return false;
- }
-
- // map
- template<typename Key, typename Value>
- inline void node_data::force_insert(const Key& key, const Value& value, shared_memory_holder pMemory)
- {
- switch(m_type) {
- case NodeType::Map:
- break;
- case NodeType::Undefined:
- case NodeType::Null:
- case NodeType::Sequence:
- convert_to_map(pMemory);
- break;
- case NodeType::Scalar:
- throw BadInsert();
- }
-
- node& k = convert_to_node(key, pMemory);
- node& v = convert_to_node(value, pMemory);
- insert_map_pair(k, v);
- }
-
- template<typename T>
- inline bool node_data::equals(node& node, const T& rhs, shared_memory_holder pMemory)
- {
- T lhs;
- if(convert<T>::decode(Node(node, pMemory), lhs))
- return lhs == rhs;
- return false;
- }
-
- template<typename T>
- inline node& node_data::convert_to_node(const T& rhs, shared_memory_holder pMemory)
- {
- Node value = convert<T>::encode(rhs);
- value.EnsureNodeExists();
- pMemory->merge(*value.m_pMemory);
- return *value.m_pNode;
- }
- }
-}
-
-#endif // NODE_DETAIL_IMPL_H_62B23520_7C8E_11DE_8A39_0800200C9A66
diff --git a/ext/include/yaml-cpp/node/detail/iterator.h b/ext/include/yaml-cpp/node/detail/iterator.h
deleted file mode 100644
index dc699f4..0000000
--- a/ext/include/yaml-cpp/node/detail/iterator.h
+++ /dev/null
@@ -1,64 +0,0 @@
-#ifndef VALUE_DETAIL_ITERATOR_H_62B23520_7C8E_11DE_8A39_0800200C9A66
-#define VALUE_DETAIL_ITERATOR_H_62B23520_7C8E_11DE_8A39_0800200C9A66
-
-#if defined(_MSC_VER) || (defined(__GNUC__) && (__GNUC__ == 3 && __GNUC_MINOR__ >= 4) || (__GNUC__ >= 4)) // GCC supports "pragma once" correctly since 3.4
-#pragma once
-#endif
-
-
-#include "yaml-cpp/dll.h"
-#include "yaml-cpp/node/ptr.h"
-#include "yaml-cpp/node/detail/node_iterator.h"
-#include <boost/iterator/iterator_adaptor.hpp>
-#include <boost/utility.hpp>
-
-namespace YAML
-{
- namespace detail
- {
- struct iterator_value;
-
- template<typename V>
- class iterator_base: public boost::iterator_adaptor<
- iterator_base<V>,
- node_iterator,
- V,
- std::forward_iterator_tag,
- V>
- {
- private:
- template<typename> friend class iterator_base;
- struct enabler {};
- typedef typename iterator_base::base_type base_type;
-
- public:
- typedef typename iterator_base::value_type value_type;
-
- public:
- iterator_base() {}
- explicit iterator_base(base_type rhs, shared_memory_holder pMemory): iterator_base::iterator_adaptor_(rhs), m_pMemory(pMemory) {}
-
- template<class W>
- iterator_base(const iterator_base<W>& rhs, typename boost::enable_if<boost::is_convertible<W*, V*>, enabler>::type = enabler()): iterator_base::iterator_adaptor_(rhs.base()), m_pMemory(rhs.m_pMemory) {}
-
- private:
- friend class boost::iterator_core_access;
-
- void increment() { this->base_reference() = boost::next(this->base()); }
-
- value_type dereference() const {
- const typename base_type::value_type& v = *this->base();
- if(v.pNode)
- return value_type(Node(*v, m_pMemory));
- if(v.first && v.second)
- return value_type(Node(*v.first, m_pMemory), Node(*v.second, m_pMemory));
- return value_type();
- }
-
- private:
- shared_memory_holder m_pMemory;
- };
- }
-}
-
-#endif // VALUE_DETAIL_ITERATOR_H_62B23520_7C8E_11DE_8A39_0800200C9A66
diff --git a/ext/include/yaml-cpp/node/detail/iterator_fwd.h b/ext/include/yaml-cpp/node/detail/iterator_fwd.h
deleted file mode 100644
index c4efb2c..0000000
--- a/ext/include/yaml-cpp/node/detail/iterator_fwd.h
+++ /dev/null
@@ -1,27 +0,0 @@
-#ifndef VALUE_DETAIL_ITERATOR_FWD_H_62B23520_7C8E_11DE_8A39_0800200C9A66
-#define VALUE_DETAIL_ITERATOR_FWD_H_62B23520_7C8E_11DE_8A39_0800200C9A66
-
-#if defined(_MSC_VER) || (defined(__GNUC__) && (__GNUC__ == 3 && __GNUC_MINOR__ >= 4) || (__GNUC__ >= 4)) // GCC supports "pragma once" correctly since 3.4
-#pragma once
-#endif
-
-
-#include "yaml-cpp/dll.h"
-#include <list>
-#include <utility>
-#include <vector>
-
-namespace YAML
-{
- class node;
-
- namespace detail {
- struct iterator_value;
- template<typename V> class iterator_base;
- }
-
- typedef detail::iterator_base<detail::iterator_value> iterator;
- typedef detail::iterator_base<const detail::iterator_value> const_iterator;
-}
-
-#endif // VALUE_DETAIL_ITERATOR_FWD_H_62B23520_7C8E_11DE_8A39_0800200C9A66
diff --git a/ext/include/yaml-cpp/node/detail/memory.h b/ext/include/yaml-cpp/node/detail/memory.h
deleted file mode 100644
index 243a81a..0000000
--- a/ext/include/yaml-cpp/node/detail/memory.h
+++ /dev/null
@@ -1,39 +0,0 @@
-#ifndef VALUE_DETAIL_MEMORY_H_62B23520_7C8E_11DE_8A39_0800200C9A66
-#define VALUE_DETAIL_MEMORY_H_62B23520_7C8E_11DE_8A39_0800200C9A66
-
-#if defined(_MSC_VER) || (defined(__GNUC__) && (__GNUC__ == 3 && __GNUC_MINOR__ >= 4) || (__GNUC__ >= 4)) // GCC supports "pragma once" correctly since 3.4
-#pragma once
-#endif
-
-#include "yaml-cpp/node/ptr.h"
-#include <set>
-#include <boost/shared_ptr.hpp>
-
-namespace YAML
-{
- namespace detail
- {
- class memory {
- public:
- node& create_node();
- void merge(const memory& rhs);
-
- private:
- typedef std::set<shared_node> Nodes;
- Nodes m_nodes;
- };
-
- class memory_holder {
- public:
- memory_holder(): m_pMemory(new memory) {}
-
- node& create_node() { return m_pMemory->create_node(); }
- void merge(memory_holder& rhs);
-
- private:
- boost::shared_ptr<memory> m_pMemory;
- };
- }
-}
-
-#endif // VALUE_DETAIL_MEMORY_H_62B23520_7C8E_11DE_8A39_0800200C9A66
diff --git a/ext/include/yaml-cpp/node/detail/node.h b/ext/include/yaml-cpp/node/detail/node.h
deleted file mode 100644
index ce3a76d..0000000
--- a/ext/include/yaml-cpp/node/detail/node.h
+++ /dev/null
@@ -1,130 +0,0 @@
-#ifndef NODE_DETAIL_NODE_H_62B23520_7C8E_11DE_8A39_0800200C9A66
-#define NODE_DETAIL_NODE_H_62B23520_7C8E_11DE_8A39_0800200C9A66
-
-#if defined(_MSC_VER) || (defined(__GNUC__) && (__GNUC__ == 3 && __GNUC_MINOR__ >= 4) || (__GNUC__ >= 4)) // GCC supports "pragma once" correctly since 3.4
-#pragma once
-#endif
-
-
-#include "yaml-cpp/dll.h"
-#include "yaml-cpp/node/type.h"
-#include "yaml-cpp/node/ptr.h"
-#include "yaml-cpp/node/detail/node_ref.h"
-#include <set>
-#include <boost/utility.hpp>
-
-namespace YAML
-{
- namespace detail
- {
- class node: private boost::noncopyable
- {
- public:
- node(): m_pRef(new node_ref) {}
-
- bool is(const node& rhs) const { return m_pRef == rhs.m_pRef; }
- const node_ref *ref() const { return m_pRef.get(); }
-
- bool is_defined() const { return m_pRef->is_defined(); }
- NodeType::value type() const { return m_pRef->type(); }
-
- const std::string& scalar() const { return m_pRef->scalar(); }
- const std::string& tag() const { return m_pRef->tag(); }
-
- void mark_defined() {
- if(is_defined())
- return;
-
- m_pRef->mark_defined();
- for(nodes::iterator it=m_dependencies.begin();it!=m_dependencies.end();++it)
- (*it)->mark_defined();
- m_dependencies.clear();
- }
-
- void add_dependency(node& rhs) {
- if(is_defined())
- rhs.mark_defined();
- else
- m_dependencies.insert(&rhs);
- }
-
- void set_ref(const node& rhs) {
- if(rhs.is_defined())
- mark_defined();
- m_pRef = rhs.m_pRef;
- }
- void set_data(const node& rhs) {
- if(rhs.is_defined())
- mark_defined();
- m_pRef->set_data(*rhs.m_pRef);
- }
-
- void set_type(NodeType::value type) {
- if(type != NodeType::Undefined)
- mark_defined();
- m_pRef->set_type(type);
- }
- void set_null() {
- mark_defined();
- m_pRef->set_null();
- }
- void set_scalar(const std::string& scalar) {
- mark_defined();
- m_pRef->set_scalar(scalar);
- }
- void set_tag(const std::string& tag) {
- mark_defined();
- m_pRef->set_tag(tag);
- }
-
- // size/iterator
- std::size_t size() const { return m_pRef->size(); }
-
- const_node_iterator begin() const { return static_cast<const node_ref&>(*m_pRef).begin(); }
- node_iterator begin() { return m_pRef->begin(); }
-
- const_node_iterator end() const { return static_cast<const node_ref&>(*m_pRef).end(); }
- node_iterator end() { return m_pRef->end(); }
-
- // sequence
- void push_back(node& node, shared_memory_holder pMemory) {
- m_pRef->push_back(node, pMemory);
- node.add_dependency(*this);
- }
- void insert(node& key, node& value, shared_memory_holder pMemory) {
- m_pRef->insert(key, value, pMemory);
- key.add_dependency(*this);
- value.add_dependency(*this);
- }
-
- // indexing
- template<typename Key> node& get(const Key& key, shared_memory_holder pMemory) const { return static_cast<const node_ref&>(*m_pRef).get(key, pMemory); }
- template<typename Key> node& get(const Key& key, shared_memory_holder pMemory) {
- node& value = m_pRef->get(key, pMemory);
- value.add_dependency(*this);
- return value;
- }
- template<typename Key> bool remove(const Key& key, shared_memory_holder pMemory) { return m_pRef->remove(key, pMemory); }
-
- node& get(node& key, shared_memory_holder pMemory) const { return static_cast<const node_ref&>(*m_pRef).get(key, pMemory); }
- node& get(node& key, shared_memory_holder pMemory) {
- node& value = m_pRef->get(key, pMemory);
- key.add_dependency(*this);
- value.add_dependency(*this);
- return value;
- }
- bool remove(node& key, shared_memory_holder pMemory) { return m_pRef->remove(key, pMemory); }
-
- // map
- template<typename Key, typename Value>
- void force_insert(const Key& key, const Value& value, shared_memory_holder pMemory){ m_pRef->force_insert(key, value, pMemory); }
-
- private:
- shared_node_ref m_pRef;
- typedef std::set<node *> nodes;
- nodes m_dependencies;
- };
- }
-}
-
-#endif // NODE_DETAIL_NODE_H_62B23520_7C8E_11DE_8A39_0800200C9A66
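
The m_dependencies set above implements deferred definedness: a child handed out by get() records its parent, and mark_defined() cascades through that set once the child is actually assigned. In public-API terms (a small illustrative sketch, not part of this commit; exact Dump() formatting may differ):

    #include <iostream>
    #include "yaml-cpp/yaml.h"

    int main() {
        YAML::Node root;
        YAML::Node leaf = root["outer"]["inner"];  // chain of undefined placeholders
        leaf = "value";                            // mark_defined() cascades upward through
                                                   // the recorded dependencies
        std::cout << YAML::Dump(root) << "\n";     // roughly: outer:\n  inner: value
        return 0;
    }
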
diff --git a/ext/include/yaml-cpp/node/detail/node_data.h b/ext/include/yaml-cpp/node/detail/node_data.h
deleted file mode 100644
index 4c0a156..0000000
--- a/ext/include/yaml-cpp/node/detail/node_data.h
+++ /dev/null
@@ -1,110 +0,0 @@
-#ifndef VALUE_DETAIL_NODE_DATA_H_62B23520_7C8E_11DE_8A39_0800200C9A66
-#define VALUE_DETAIL_NODE_DATA_H_62B23520_7C8E_11DE_8A39_0800200C9A66
-
-#if defined(_MSC_VER) || (defined(__GNUC__) && (__GNUC__ == 3 && __GNUC_MINOR__ >= 4) || (__GNUC__ >= 4)) // GCC supports "pragma once" correctly since 3.4
-#pragma once
-#endif
-
-
-#include "yaml-cpp/dll.h"
-#include "yaml-cpp/node/iterator.h"
-#include "yaml-cpp/node/ptr.h"
-#include "yaml-cpp/node/type.h"
-#include <boost/utility.hpp>
-#include <list>
-#include <string>
-#include <utility>
-#include <vector>
-
-namespace YAML
-{
- namespace detail
- {
- class node_data: private boost::noncopyable
- {
- public:
- node_data();
-
- void mark_defined();
- void set_type(NodeType::value type);
- void set_tag(const std::string& tag);
- void set_null();
- void set_scalar(const std::string& scalar);
-
- bool is_defined() const { return m_isDefined; }
- NodeType::value type() const { return m_isDefined ? m_type : NodeType::Undefined; }
- const std::string& scalar() const { return m_scalar; }
- const std::string& tag() const { return m_tag; }
-
- // size/iterator
- std::size_t size() const;
-
- const_node_iterator begin() const;
- node_iterator begin();
-
- const_node_iterator end() const;
- node_iterator end();
-
- // sequence
- void push_back(node& node, shared_memory_holder pMemory);
- void insert(node& key, node& value, shared_memory_holder pMemory);
-
- // indexing
- template<typename Key> node& get(const Key& key, shared_memory_holder pMemory) const;
- template<typename Key> node& get(const Key& key, shared_memory_holder pMemory);
- template<typename Key> bool remove(const Key& key, shared_memory_holder pMemory);
-
- node& get(node& key, shared_memory_holder pMemory) const;
- node& get(node& key, shared_memory_holder pMemory);
- bool remove(node& key, shared_memory_holder pMemory);
-
- // map
- template<typename Key, typename Value>
- void force_insert(const Key& key, const Value& value, shared_memory_holder pMemory);
-
- public:
- static std::string empty_scalar;
-
- private:
- void compute_seq_size() const;
- void compute_map_size() const;
-
- void reset_sequence();
- void reset_map();
-
- void insert_map_pair(node& key, node& value);
- void convert_to_map(shared_memory_holder pMemory);
- void convert_sequence_to_map(shared_memory_holder pMemory);
-
- template<typename T>
- static bool equals(node& node, const T& rhs, shared_memory_holder pMemory);
-
- template<typename T>
- static node& convert_to_node(const T& rhs, shared_memory_holder pMemory);
-
- private:
- bool m_isDefined;
- NodeType::value m_type;
- std::string m_tag;
-
- // scalar
- std::string m_scalar;
-
- // sequence
- typedef std::vector<node *> node_seq;
- node_seq m_sequence;
-
- mutable std::size_t m_seqSize;
-
- // map
- typedef std::map<node *, node *, node_cmp> node_map;
- node_map m_map;
-
- typedef std::pair<node *, node *> kv_pair;
- typedef std::list<kv_pair> kv_pairs;
- mutable kv_pairs m_undefinedPairs;
- };
- }
-}
-
-#endif // VALUE_DETAIL_NODE_DATA_H_62B23520_7C8E_11DE_8A39_0800200C9A66
diff --git a/ext/include/yaml-cpp/node/detail/node_iterator.h b/ext/include/yaml-cpp/node/detail/node_iterator.h
deleted file mode 100644
index b95778b..0000000
--- a/ext/include/yaml-cpp/node/detail/node_iterator.h
+++ /dev/null
@@ -1,143 +0,0 @@
-#ifndef VALUE_DETAIL_NODE_ITERATOR_H_62B23520_7C8E_11DE_8A39_0800200C9A66
-#define VALUE_DETAIL_NODE_ITERATOR_H_62B23520_7C8E_11DE_8A39_0800200C9A66
-
-#if defined(_MSC_VER) || (defined(__GNUC__) && (__GNUC__ == 3 && __GNUC_MINOR__ >= 4) || (__GNUC__ >= 4)) // GCC supports "pragma once" correctly since 3.4
-#pragma once
-#endif
-
-
-#include "yaml-cpp/dll.h"
-#include "yaml-cpp/node/ptr.h"
-#include <boost/iterator/iterator_facade.hpp>
-#include <boost/utility/enable_if.hpp>
-#include <map>
-#include <utility>
-#include <vector>
-
-namespace YAML
-{
- namespace detail
- {
- struct node_cmp {
- bool operator()(const node *lhs, const node *rhs) const;
- };
-
- struct iterator_type { enum value { None, Sequence, Map }; };
-
- template<typename V>
- struct node_iterator_value: public std::pair<V*, V*> {
- typedef std::pair<V*, V*> kv;
-
- node_iterator_value(): kv(), pNode(0) {}
- explicit node_iterator_value(V& rhs): kv(), pNode(&rhs) {}
- explicit node_iterator_value(V& key, V& value): kv(&key, &value), pNode(0) {}
-
- V& operator *() const { return *pNode; }
- V& operator ->() const { return *pNode; }
-
- V *pNode;
- };
-
- typedef std::vector<node *> node_seq;
- typedef std::map<node *, node *, node_cmp> node_map;
-
- template<typename V>
- struct node_iterator_type {
- typedef node_seq::iterator seq;
- typedef node_map::iterator map;
- };
-
- template<typename V>
- struct node_iterator_type<const V> {
- typedef node_seq::const_iterator seq;
- typedef node_map::const_iterator map;
- };
-
-
- template<typename V>
- class node_iterator_base: public boost::iterator_facade<
- node_iterator_base<V>,
- node_iterator_value<V>,
- std::forward_iterator_tag,
- node_iterator_value<V> >
- {
- private:
- struct enabler {};
-
- public:
- typedef typename node_iterator_type<V>::seq SeqIter;
- typedef typename node_iterator_type<V>::map MapIter;
- typedef node_iterator_value<V> value_type;
-
- node_iterator_base(): m_type(iterator_type::None) {}
- explicit node_iterator_base(SeqIter seqIt): m_type(iterator_type::Sequence), m_seqIt(seqIt) {}
- explicit node_iterator_base(MapIter mapIt, MapIter mapEnd): m_type(iterator_type::Map), m_mapIt(mapIt), m_mapEnd(mapEnd) {
- m_mapIt = increment_until_defined(m_mapIt);
- }
-
- template<typename W>
- node_iterator_base(const node_iterator_base<W>& rhs, typename boost::enable_if<boost::is_convertible<W*, V*>, enabler>::type = enabler())
- : m_type(rhs.m_type), m_seqIt(rhs.m_seqIt), m_mapIt(rhs.m_mapIt), m_mapEnd(rhs.m_mapEnd) {}
-
- private:
- friend class boost::iterator_core_access;
- template<typename> friend class node_iterator_base;
-
- template<typename W>
- bool equal(const node_iterator_base<W>& rhs) const {
- if(m_type != rhs.m_type)
- return false;
-
- switch(m_type) {
- case iterator_type::None: return true;
- case iterator_type::Sequence: return m_seqIt == rhs.m_seqIt;
- case iterator_type::Map: return m_mapIt == rhs.m_mapIt;
- }
- return true;
- }
-
- void increment() {
- switch(m_type) {
- case iterator_type::None: break;
- case iterator_type::Sequence:
- ++m_seqIt;
- break;
- case iterator_type::Map:
- ++m_mapIt;
- m_mapIt = increment_until_defined(m_mapIt);
- break;
- }
- }
-
- value_type dereference() const {
- switch(m_type) {
- case iterator_type::None: return value_type();
- case iterator_type::Sequence: return value_type(**m_seqIt);
- case iterator_type::Map: return value_type(*m_mapIt->first, *m_mapIt->second);
- }
- return value_type();
- }
-
- MapIter increment_until_defined(MapIter it) {
- while(it != m_mapEnd && !is_defined(it))
- ++it;
- return it;
- }
-
- bool is_defined(MapIter it) const {
- return it->first->is_defined() && it->second->is_defined();
- }
-
- private:
- typename iterator_type::value m_type;
-
- SeqIter m_seqIt;
- MapIter m_mapIt, m_mapEnd;
- };
-
- typedef node_iterator_base<node> node_iterator;
- typedef node_iterator_base<const node> const_node_iterator;
- }
-}
-
-#endif // VALUE_DETAIL_NODE_ITERATOR_H_62B23520_7C8E_11DE_8A39_0800200C9A66
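
increment_until_defined() above is why map iteration never exposes placeholder entries: a pair is skipped unless both its key and its value are defined. A short illustrative sketch (not part of this commit) of the visible effect, using the public iterator typedefs from iterator_fwd.h:

    #include <iostream>
    #include <string>
    #include "yaml-cpp/yaml.h"

    int main() {
        YAML::Node m = YAML::Load("{a: 1}");
        (void)m["b"];   // creates an undefined placeholder value for key "b"
        for (YAML::const_iterator it = m.begin(); it != m.end(); ++it)
            std::cout << it->first.as<std::string>() << "\n";   // prints only "a"
        return 0;
    }
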
diff --git a/ext/include/yaml-cpp/node/detail/node_ref.h b/ext/include/yaml-cpp/node/detail/node_ref.h
deleted file mode 100644
index 64cdb98..0000000
--- a/ext/include/yaml-cpp/node/detail/node_ref.h
+++ /dev/null
@@ -1,69 +0,0 @@
-#ifndef VALUE_DETAIL_NODE_REF_H_62B23520_7C8E_11DE_8A39_0800200C9A66
-#define VALUE_DETAIL_NODE_REF_H_62B23520_7C8E_11DE_8A39_0800200C9A66
-
-#if defined(_MSC_VER) || (defined(__GNUC__) && (__GNUC__ == 3 && __GNUC_MINOR__ >= 4) || (__GNUC__ >= 4)) // GCC supports "pragma once" correctly since 3.4
-#pragma once
-#endif
-
-
-#include "yaml-cpp/dll.h"
-#include "yaml-cpp/node/type.h"
-#include "yaml-cpp/node/ptr.h"
-#include "yaml-cpp/node/detail/node_data.h"
-#include <boost/utility.hpp>
-
-namespace YAML
-{
- namespace detail
- {
- class node_ref: private boost::noncopyable
- {
- public:
- node_ref(): m_pData(new node_data) {}
-
- bool is_defined() const { return m_pData->is_defined(); }
- NodeType::value type() const { return m_pData->type(); }
- const std::string& scalar() const { return m_pData->scalar(); }
- const std::string& tag() const { return m_pData->tag(); }
-
- void mark_defined() { m_pData->mark_defined(); }
- void set_data(const node_ref& rhs) { m_pData = rhs.m_pData; }
-
- void set_type(NodeType::value type) { m_pData->set_type(type); }
- void set_tag(const std::string& tag) { m_pData->set_tag(tag); }
- void set_null() { m_pData->set_null(); }
- void set_scalar(const std::string& scalar) { m_pData->set_scalar(scalar); }
-
- // size/iterator
- std::size_t size() const { return m_pData->size(); }
-
- const_node_iterator begin() const { return static_cast<const node_data&>(*m_pData).begin(); }
- node_iterator begin() {return m_pData->begin(); }
-
- const_node_iterator end() const { return static_cast<const node_data&>(*m_pData).end(); }
- node_iterator end() {return m_pData->end(); }
-
- // sequence
- void push_back(node& node, shared_memory_holder pMemory) { m_pData->push_back(node, pMemory); }
- void insert(node& key, node& value, shared_memory_holder pMemory) { m_pData->insert(key, value, pMemory); }
-
- // indexing
- template<typename Key> node& get(const Key& key, shared_memory_holder pMemory) const { return static_cast<const node_data&>(*m_pData).get(key, pMemory); }
- template<typename Key> node& get(const Key& key, shared_memory_holder pMemory) { return m_pData->get(key, pMemory); }
- template<typename Key> bool remove(const Key& key, shared_memory_holder pMemory) { return m_pData->remove(key, pMemory); }
-
- node& get(node& key, shared_memory_holder pMemory) const { return static_cast<const node_data&>(*m_pData).get(key, pMemory); }
- node& get(node& key, shared_memory_holder pMemory) { return m_pData->get(key, pMemory); }
- bool remove(node& key, shared_memory_holder pMemory) { return m_pData->remove(key, pMemory); }
-
- // map
- template<typename Key, typename Value>
- void force_insert(const Key& key, const Value& value, shared_memory_holder pMemory) { m_pData->force_insert(key, value, pMemory); }
-
- private:
- shared_node_data m_pData;
- };
- }
-}
-
-#endif // VALUE_DETAIL_NODE_REF_H_62B23520_7C8E_11DE_8A39_0800200C9A66
diff --git a/ext/include/yaml-cpp/node/emit.h b/ext/include/yaml-cpp/node/emit.h
deleted file mode 100644
index 7abf80b..0000000
--- a/ext/include/yaml-cpp/node/emit.h
+++ /dev/null
@@ -1,23 +0,0 @@
-#ifndef NODE_EMIT_H_62B23520_7C8E_11DE_8A39_0800200C9A66
-#define NODE_EMIT_H_62B23520_7C8E_11DE_8A39_0800200C9A66
-
-#if defined(_MSC_VER) || (defined(__GNUC__) && (__GNUC__ == 3 && __GNUC_MINOR__ >= 4) || (__GNUC__ >= 4)) // GCC supports "pragma once" correctly since 3.4
-#pragma once
-#endif
-
-#include <string>
-#include <iosfwd>
-
-namespace YAML
-{
- class Emitter;
- class Node;
-
- Emitter& operator << (Emitter& out, const Node& node);
- std::ostream& operator << (std::ostream& out, const Node& node);
-
- std::string Dump(const Node& node);
-}
-
-#endif // NODE_EMIT_H_62B23520_7C8E_11DE_8A39_0800200C9A66
-
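
emit.h above is the whole public emitting surface for Node: stream insertion plus the Dump() convenience wrapper. A one-glance illustrative sketch (not part of this commit):

    #include <iostream>
    #include <string>
    #include "yaml-cpp/yaml.h"

    int main() {
        YAML::Node n = YAML::Load("[1, 2, 3]");
        std::cout << n << "\n";          // operator<<(std::ostream&, const Node&)
        std::string s = YAML::Dump(n);   // the same text, returned as a string
        return 0;
    }
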
diff --git a/ext/include/yaml-cpp/node/impl.h b/ext/include/yaml-cpp/node/impl.h
deleted file mode 100644
index d46f329..0000000
--- a/ext/include/yaml-cpp/node/impl.h
+++ /dev/null
@@ -1,382 +0,0 @@
-#ifndef NODE_IMPL_H_62B23520_7C8E_11DE_8A39_0800200C9A66
-#define NODE_IMPL_H_62B23520_7C8E_11DE_8A39_0800200C9A66
-
-#if defined(_MSC_VER) || (defined(__GNUC__) && (__GNUC__ == 3 && __GNUC_MINOR__ >= 4) || (__GNUC__ >= 4)) // GCC supports "pragma once" correctly since 3.4
-#pragma once
-#endif
-
-
-#include "yaml-cpp/node/node.h"
-#include "yaml-cpp/node/iterator.h"
-#include "yaml-cpp/node/detail/memory.h"
-#include "yaml-cpp/node/detail/node.h"
-#include "yaml-cpp/exceptions.h"
-#include <string>
-
-namespace YAML
-{
- inline Node::Node(): m_pNode(NULL)
- {
- }
-
- inline Node::Node(NodeType::value type): m_pMemory(new detail::memory_holder), m_pNode(&m_pMemory->create_node())
- {
- m_pNode->set_type(type);
- }
-
- template<typename T>
- inline Node::Node(const T& rhs): m_pMemory(new detail::memory_holder), m_pNode(&m_pMemory->create_node())
- {
- Assign(rhs);
- }
-
- inline Node::Node(const detail::iterator_value& rhs): m_pMemory(rhs.m_pMemory), m_pNode(rhs.m_pNode)
- {
- }
-
- inline Node::Node(const Node& rhs): m_pMemory(rhs.m_pMemory), m_pNode(rhs.m_pNode)
- {
- }
-
- inline Node::Node(detail::node& node, detail::shared_memory_holder pMemory): m_pMemory(pMemory), m_pNode(&node)
- {
- }
-
- inline Node::~Node()
- {
- }
-
- inline void Node::EnsureNodeExists() const
- {
- if(!m_pNode) {
- m_pMemory.reset(new detail::memory_holder);
- m_pNode = &m_pMemory->create_node();
- m_pNode->set_null();
- }
- }
-
- inline bool Node::IsDefined() const
- {
- return m_pNode ? m_pNode->is_defined() : true;
- }
-
- inline NodeType::value Node::Type() const
- {
- return m_pNode ? m_pNode->type() : NodeType::Null;
- }
-
- // access
-
- // template helpers
- template<typename T, typename S>
- struct as_if {
- explicit as_if(const Node& node_): node(node_) {}
- const Node& node;
-
- const T operator()(const S& fallback) const {
- if(!node.m_pNode)
- return fallback;
-
- T t;
- if(convert<T>::decode(node, t))
- return t;
- return fallback;
- }
- };
-
- template<typename S>
- struct as_if<std::string, S> {
- explicit as_if(const Node& node_): node(node_) {}
- const Node& node;
-
- const std::string operator()(const S& fallback) const {
- if(node.Type() != NodeType::Scalar)
- return fallback;
- return node.Scalar();
- }
- };
-
- template<typename T>
- struct as_if<T, void> {
- explicit as_if(const Node& node_): node(node_) {}
- const Node& node;
-
- const T operator()() const {
- if(!node.m_pNode)
- throw TypedBadConversion<T>();
-
- T t;
- if(convert<T>::decode(node, t))
- return t;
- throw TypedBadConversion<T>();
- }
- };
-
- template<>
- struct as_if<std::string, void> {
- explicit as_if(const Node& node_): node(node_) {}
- const Node& node;
-
- const std::string operator()() const {
- if(node.Type() != NodeType::Scalar)
- throw TypedBadConversion<std::string>();
- return node.Scalar();
- }
- };
-
- // access functions
- template<typename T>
- inline const T Node::as() const
- {
- return as_if<T, void>(*this)();
- }
-
- template<typename T, typename S>
- inline const T Node::as(const S& fallback) const
- {
- return as_if<T, S>(*this)(fallback);
- }
-
- inline const std::string& Node::Scalar() const
- {
- return m_pNode ? m_pNode->scalar() : detail::node_data::empty_scalar;
- }
-
- inline const std::string& Node::Tag() const
- {
- return m_pNode ? m_pNode->tag() : detail::node_data::empty_scalar;
- }
-
- inline void Node::SetTag(const std::string& tag)
- {
- EnsureNodeExists();
- m_pNode->set_tag(tag);
- }
-
- // assignment
- inline bool Node::is(const Node& rhs) const
- {
- if(!m_pNode || !rhs.m_pNode)
- return false;
- return m_pNode->is(*rhs.m_pNode);
- }
-
- template<typename T>
- inline Node& Node::operator=(const T& rhs)
- {
- Assign(rhs);
- return *this;
- }
-
- inline void Node::clear()
- {
- m_pNode = NULL;
- }
-
- template<typename T>
- inline void Node::Assign(const T& rhs)
- {
- AssignData(convert<T>::encode(rhs));
- }
-
- template<>
- inline void Node::Assign(const std::string& rhs)
- {
- EnsureNodeExists();
- m_pNode->set_scalar(rhs);
- }
-
- inline void Node::Assign(const char *rhs)
- {
- EnsureNodeExists();
- m_pNode->set_scalar(rhs);
- }
-
- inline void Node::Assign(char *rhs)
- {
- EnsureNodeExists();
- m_pNode->set_scalar(rhs);
- }
-
- inline Node& Node::operator=(const Node& rhs)
- {
- if(is(rhs))
- return *this;
- AssignNode(rhs);
- return *this;
- }
-
- inline void Node::AssignData(const Node& rhs)
- {
- EnsureNodeExists();
- rhs.EnsureNodeExists();
-
- m_pNode->set_data(*rhs.m_pNode);
- m_pMemory->merge(*rhs.m_pMemory);
- }
-
- inline void Node::AssignNode(const Node& rhs)
- {
- rhs.EnsureNodeExists();
-
- if(!m_pNode) {
- m_pNode = rhs.m_pNode;
- m_pMemory = rhs.m_pMemory;
- return;
- }
-
- m_pNode->set_ref(*rhs.m_pNode);
- m_pMemory->merge(*rhs.m_pMemory);
- m_pNode = rhs.m_pNode;
- }
-
- // size/iterator
- inline std::size_t Node::size() const
- {
- return m_pNode ? m_pNode->size() : 0;
- }
-
- inline const_iterator Node::begin() const
- {
- return m_pNode ? const_iterator(m_pNode->begin(), m_pMemory) : const_iterator();
- }
-
- inline iterator Node::begin()
- {
- return m_pNode ? iterator(m_pNode->begin(), m_pMemory) : iterator();
- }
-
- inline const_iterator Node::end() const
- {
- return m_pNode ? const_iterator(m_pNode->end(), m_pMemory) : const_iterator();
- }
-
- inline iterator Node::end()
- {
- return m_pNode ? iterator(m_pNode->end(), m_pMemory) : iterator();
- }
-
- // sequence
- template<typename T>
- inline void Node::push_back(const T& rhs)
- {
- push_back(Node(rhs));
- }
-
- inline void Node::push_back(const Node& rhs)
- {
- EnsureNodeExists();
- rhs.EnsureNodeExists();
-
- m_pNode->push_back(*rhs.m_pNode, m_pMemory);
- m_pMemory->merge(*rhs.m_pMemory);
- }
-
- // helpers for indexing
- namespace detail {
- template<typename T>
- struct to_value_t {
- explicit to_value_t(const T& t_): t(t_) {}
- const T& t;
- typedef const T& return_type;
-
- const T& operator()() const { return t; }
- };
-
- template<>
- struct to_value_t<const char*> {
- explicit to_value_t(const char *t_): t(t_) {}
- const char *t;
- typedef std::string return_type;
-
- const std::string operator()() const { return t; }
- };
-
- template<>
- struct to_value_t<char*> {
- explicit to_value_t(char *t_): t(t_) {}
- const char *t;
- typedef std::string return_type;
-
- const std::string operator()() const { return t; }
- };
-
- template<std::size_t N>
- struct to_value_t<char [N]> {
- explicit to_value_t(const char *t_): t(t_) {}
- const char *t;
- typedef std::string return_type;
-
- const std::string operator()() const { return t; }
- };
-
- // converts C-strings to std::strings so they can be copied
- template<typename T>
- inline typename to_value_t<T>::return_type to_value(const T& t) {
- return to_value_t<T>(t)();
- }
- }
-
- // indexing
- template<typename Key>
- inline const Node Node::operator[](const Key& key) const
- {
- EnsureNodeExists();
- detail::node& value = static_cast<const detail::node&>(*m_pNode).get(detail::to_value(key), m_pMemory);
- return Node(value, m_pMemory);
- }
-
- template<typename Key>
- inline Node Node::operator[](const Key& key)
- {
- EnsureNodeExists();
- detail::node& value = m_pNode->get(detail::to_value(key), m_pMemory);
- return Node(value, m_pMemory);
- }
-
- template<typename Key>
- inline bool Node::remove(const Key& key)
- {
- EnsureNodeExists();
- return m_pNode->remove(detail::to_value(key), m_pMemory);
- }
-
- inline const Node Node::operator[](const Node& key) const
- {
- EnsureNodeExists();
- key.EnsureNodeExists();
- detail::node& value = static_cast<const detail::node&>(*m_pNode).get(*key.m_pNode, m_pMemory);
- return Node(value, m_pMemory);
- }
-
- inline Node Node::operator[](const Node& key)
- {
- EnsureNodeExists();
- key.EnsureNodeExists();
- detail::node& value = m_pNode->get(*key.m_pNode, m_pMemory);
- return Node(value, m_pMemory);
- }
-
- inline bool Node::remove(const Node& key)
- {
- EnsureNodeExists();
- key.EnsureNodeExists();
- return m_pNode->remove(*key.m_pNode, m_pMemory);
- }
-
- // map
- template<typename Key, typename Value>
- inline void Node::force_insert(const Key& key, const Value& value)
- {
- EnsureNodeExists();
- m_pNode->force_insert(detail::to_value(key), detail::to_value(value), m_pMemory);
- }
-
- // free functions
- inline bool operator==(const Node& lhs, const Node& rhs)
- {
- return lhs.is(rhs);
- }
-}
-
-#endif // NODE_IMPL_H_62B23520_7C8E_11DE_8A39_0800200C9A66
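
The as_if specializations above define the two access paths: as<T>() throws TypedBadConversion<T> when the node is missing or the conversion fails, while as<T>(fallback) swallows both cases and returns the fallback. An illustrative sketch (not part of this commit):

    #include <iostream>
    #include "yaml-cpp/yaml.h"

    int main() {
        YAML::Node cfg = YAML::Load("{threads: four}");
        int t = cfg["threads"].as<int>(1);    // "four" does not convert, fallback 1 is returned
        int k = cfg["k"].as<int>(55);         // missing key: undefined node, fallback again
        std::cout << t << " " << k << "\n";   // 1 55
        // cfg["threads"].as<int>() with no fallback would throw YAML::TypedBadConversion<int>
        return 0;
    }
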
diff --git a/ext/include/yaml-cpp/node/iterator.h b/ext/include/yaml-cpp/node/iterator.h
deleted file mode 100644
index 4cc4719..0000000
--- a/ext/include/yaml-cpp/node/iterator.h
+++ /dev/null
@@ -1,28 +0,0 @@
-#ifndef VALUE_ITERATOR_H_62B23520_7C8E_11DE_8A39_0800200C9A66
-#define VALUE_ITERATOR_H_62B23520_7C8E_11DE_8A39_0800200C9A66
-
-#if defined(_MSC_VER) || (defined(__GNUC__) && (__GNUC__ == 3 && __GNUC_MINOR__ >= 4) || (__GNUC__ >= 4)) // GCC supports "pragma once" correctly since 3.4
-#pragma once
-#endif
-
-
-#include "yaml-cpp/dll.h"
-#include "yaml-cpp/node/node.h"
-#include "yaml-cpp/node/detail/iterator_fwd.h"
-#include "yaml-cpp/node/detail/iterator.h"
-#include <list>
-#include <utility>
-#include <vector>
-
-namespace YAML
-{
- namespace detail {
- struct iterator_value: public Node, std::pair<Node, Node> {
- iterator_value() {}
- explicit iterator_value(const Node& rhs): Node(rhs) {}
- explicit iterator_value(const Node& key, const Node& value): std::pair<Node, Node>(key, value) {}
- };
- }
-}
-
-#endif // VALUE_ITERATOR_H_62B23520_7C8E_11DE_8A39_0800200C9A66
diff --git a/ext/include/yaml-cpp/node/node.h b/ext/include/yaml-cpp/node/node.h
deleted file mode 100644
index d4ee442..0000000
--- a/ext/include/yaml-cpp/node/node.h
+++ /dev/null
@@ -1,112 +0,0 @@
-#ifndef NODE_NODE_H_62B23520_7C8E_11DE_8A39_0800200C9A66
-#define NODE_NODE_H_62B23520_7C8E_11DE_8A39_0800200C9A66
-
-#if defined(_MSC_VER) || (defined(__GNUC__) && (__GNUC__ == 3 && __GNUC_MINOR__ >= 4) || (__GNUC__ >= 4)) // GCC supports "pragma once" correctly since 3.4
-#pragma once
-#endif
-
-
-#include "yaml-cpp/dll.h"
-#include "yaml-cpp/node/ptr.h"
-#include "yaml-cpp/node/type.h"
-#include "yaml-cpp/node/detail/iterator_fwd.h"
-#include "yaml-cpp/node/detail/bool_type.h"
-#include <stdexcept>
-
-namespace YAML
-{
- class Node
- {
- public:
- friend class NodeBuilder;
- friend class NodeEvents;
- friend class detail::node_data;
- template<typename> friend class detail::iterator_base;
- template<typename T, typename S> friend struct as_if;
-
- typedef YAML::iterator iterator;
- typedef YAML::const_iterator const_iterator;
-
- Node();
- explicit Node(NodeType::value type);
- template<typename T> explicit Node(const T& rhs);
- explicit Node(const detail::iterator_value& rhs);
- Node(const Node& rhs);
- ~Node();
-
- NodeType::value Type() const;
- bool IsDefined() const;
- bool IsNull() const { return Type() == NodeType::Null; }
- bool IsScalar() const { return Type() == NodeType::Scalar; }
- bool IsSequence() const { return Type() == NodeType::Sequence; }
- bool IsMap() const { return Type() == NodeType::Map; }
-
- // bool conversions
- YAML_CPP_OPERATOR_BOOL();
- bool operator!() const { return !IsDefined(); }
-
- // access
- template<typename T> const T as() const;
- template<typename T, typename S> const T as(const S& fallback) const;
- const std::string& Scalar() const;
- const std::string& Tag() const;
- void SetTag(const std::string& tag);
-
- // assignment
- bool is(const Node& rhs) const;
- template<typename T> Node& operator=(const T& rhs);
- Node& operator=(const Node& rhs);
- void clear();
-
- // size/iterator
- std::size_t size() const;
-
- const_iterator begin() const;
- iterator begin();
-
- const_iterator end() const;
- iterator end();
-
- // sequence
- template<typename T> void push_back(const T& rhs);
- void push_back(const Node& rhs);
-
- // indexing
- template<typename Key> const Node operator[](const Key& key) const;
- template<typename Key> Node operator[](const Key& key);
- template<typename Key> bool remove(const Key& key);
-
- const Node operator[](const Node& key) const;
- Node operator[](const Node& key);
- bool remove(const Node& key);
-
- // map
- template<typename Key, typename Value>
- void force_insert(const Key& key, const Value& value);
-
- private:
- explicit Node(detail::node& node, detail::shared_memory_holder pMemory);
-
- void EnsureNodeExists() const;
-
- template<typename T> void Assign(const T& rhs);
- void Assign(const char *rhs);
- void Assign(char *rhs);
-
- void AssignData(const Node& rhs);
- void AssignNode(const Node& rhs);
-
- private:
- mutable detail::shared_memory_holder m_pMemory;
- mutable detail::node *m_pNode;
- };
-
- bool operator==(const Node& lhs, const Node& rhs);
-
- Node Clone(const Node& node);
-
- template<typename T>
- struct convert;
-}
-
-#endif // NODE_NODE_H_62B23520_7C8E_11DE_8A39_0800200C9A66
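
YAML_CPP_OPERATOR_BOOL() (pulled in from detail/bool_type.h, which this hunk does not show) provides the safe-bool conversion, and operator!() above pins the test to IsDefined() rather than IsNull(). A tiny illustrative sketch (not part of this commit):

    #include "yaml-cpp/yaml.h"

    int main() {
        YAML::Node cfg = YAML::Load("{a: 1}");
        bool has_a = cfg["a"] ? true : false;   // true: the entry is defined
        bool no_b  = !cfg["b"];                 // true: operator!() is !IsDefined()
        return (has_a && no_b) ? 0 : 1;
    }
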
diff --git a/ext/include/yaml-cpp/node/parse.h b/ext/include/yaml-cpp/node/parse.h
deleted file mode 100644
index 82dbdc1..0000000
--- a/ext/include/yaml-cpp/node/parse.h
+++ /dev/null
@@ -1,28 +0,0 @@
-#ifndef VALUE_PARSE_H_62B23520_7C8E_11DE_8A39_0800200C9A66
-#define VALUE_PARSE_H_62B23520_7C8E_11DE_8A39_0800200C9A66
-
-#if defined(_MSC_VER) || (defined(__GNUC__) && (__GNUC__ == 3 && __GNUC_MINOR__ >= 4) || (__GNUC__ >= 4)) // GCC supports "pragma once" correctly since 3.4
-#pragma once
-#endif
-
-#include <iosfwd>
-#include <string>
-#include <vector>
-
-namespace YAML
-{
- class Node;
-
- Node Load(const std::string& input);
- Node Load(const char *input);
- Node Load(std::istream& input);
- Node LoadFile(const std::string& filename);
-
- std::vector<Node> LoadAll(const std::string& input);
- std::vector<Node> LoadAll(const char *input);
- std::vector<Node> LoadAll(std::istream& input);
- std::vector<Node> LoadAllFromFile(const std::string& filename);
-}
-
-#endif // VALUE_PARSE_H_62B23520_7C8E_11DE_8A39_0800200C9A66
-
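
parse.h pairs each single-document loader with a LoadAll counterpart for multi-document streams. An illustrative sketch of the removed entry points (not part of this commit):

    #include <iostream>
    #include <string>
    #include <vector>
    #include "yaml-cpp/yaml.h"

    int main() {
        YAML::Node one = YAML::Load("name: spades");            // single document from a string
        std::vector<YAML::Node> docs =
            YAML::LoadAll("--- {a: 1}\n--- {a: 2}\n");          // one Node per document
        std::cout << one["name"].as<std::string>() << " "       // spades
                  << docs.size() << "\n";                       // 2
        return 0;
    }
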
diff --git a/ext/include/yaml-cpp/node/ptr.h b/ext/include/yaml-cpp/node/ptr.h
deleted file mode 100644
index 316dbd2..0000000
--- a/ext/include/yaml-cpp/node/ptr.h
+++ /dev/null
@@ -1,29 +0,0 @@
-#ifndef VALUE_PTR_H_62B23520_7C8E_11DE_8A39_0800200C9A66
-#define VALUE_PTR_H_62B23520_7C8E_11DE_8A39_0800200C9A66
-
-#if defined(_MSC_VER) || (defined(__GNUC__) && (__GNUC__ == 3 && __GNUC_MINOR__ >= 4) || (__GNUC__ >= 4)) // GCC supports "pragma once" correctly since 3.4
-#pragma once
-#endif
-
-
-#include "yaml-cpp/dll.h"
-#include <boost/shared_ptr.hpp>
-
-namespace YAML
-{
- namespace detail {
- class node;
- class node_ref;
- class node_data;
- class memory;
- class memory_holder;
-
- typedef boost::shared_ptr<node> shared_node;
- typedef boost::shared_ptr<node_ref> shared_node_ref;
- typedef boost::shared_ptr<node_data> shared_node_data;
- typedef boost::shared_ptr<memory_holder> shared_memory_holder;
- typedef boost::shared_ptr<memory> shared_memory;
- }
-}
-
-#endif // VALUE_PTR_H_62B23520_7C8E_11DE_8A39_0800200C9A66
diff --git a/ext/include/yaml-cpp/node/type.h b/ext/include/yaml-cpp/node/type.h
deleted file mode 100644
index 5ac8041..0000000
--- a/ext/include/yaml-cpp/node/type.h
+++ /dev/null
@@ -1,14 +0,0 @@
-#ifndef VALUE_TYPE_H_62B23520_7C8E_11DE_8A39_0800200C9A66
-#define VALUE_TYPE_H_62B23520_7C8E_11DE_8A39_0800200C9A66
-
-#if defined(_MSC_VER) || (defined(__GNUC__) && (__GNUC__ == 3 && __GNUC_MINOR__ >= 4) || (__GNUC__ >= 4)) // GCC supports "pragma once" correctly since 3.4
-#pragma once
-#endif
-
-
-namespace YAML
-{
- struct NodeType { enum value { Undefined, Null, Scalar, Sequence, Map }; };
-}
-
-#endif // VALUE_TYPE_H_62B23520_7C8E_11DE_8A39_0800200C9A66
diff --git a/ext/include/yaml-cpp/noncopyable.h b/ext/include/yaml-cpp/noncopyable.h
deleted file mode 100644
index 8e61e43..0000000
--- a/ext/include/yaml-cpp/noncopyable.h
+++ /dev/null
@@ -1,25 +0,0 @@
-#ifndef NONCOPYABLE_H_62B23520_7C8E_11DE_8A39_0800200C9A66
-#define NONCOPYABLE_H_62B23520_7C8E_11DE_8A39_0800200C9A66
-
-#if defined(_MSC_VER) || (defined(__GNUC__) && (__GNUC__ == 3 && __GNUC_MINOR__ >= 4) || (__GNUC__ >= 4)) // GCC supports "pragma once" correctly since 3.4
-#pragma once
-#endif
-
-#include "yaml-cpp/dll.h"
-
-namespace YAML
-{
- // this is basically boost::noncopyable
- class YAML_CPP_API noncopyable
- {
- protected:
- noncopyable() {}
- ~noncopyable() {}
-
- private:
- noncopyable(const noncopyable&);
- const noncopyable& operator = (const noncopyable&);
- };
-}
-
-#endif // NONCOPYABLE_H_62B23520_7C8E_11DE_8A39_0800200C9A66
diff --git a/ext/include/yaml-cpp/null.h b/ext/include/yaml-cpp/null.h
deleted file mode 100644
index 711f18c..0000000
--- a/ext/include/yaml-cpp/null.h
+++ /dev/null
@@ -1,25 +0,0 @@
-#ifndef NULL_H_62B23520_7C8E_11DE_8A39_0800200C9A66
-#define NULL_H_62B23520_7C8E_11DE_8A39_0800200C9A66
-
-#if defined(_MSC_VER) || (defined(__GNUC__) && (__GNUC__ == 3 && __GNUC_MINOR__ >= 4) || (__GNUC__ >= 4)) // GCC supports "pragma once" correctly since 3.4
-#pragma once
-#endif
-
-
-#include "yaml-cpp/dll.h"
-
-namespace YAML
-{
- class Node;
-
- struct YAML_CPP_API _Null {};
- inline bool operator == (const _Null&, const _Null&) { return true; }
- inline bool operator != (const _Null&, const _Null&) { return false; }
-
- YAML_CPP_API bool IsNull(const Node& node); // old API only
-
- extern YAML_CPP_API _Null Null;
-}
-
-#endif // NULL_H_62B23520_7C8E_11DE_8A39_0800200C9A66
-
diff --git a/ext/include/yaml-cpp/ostream_wrapper.h b/ext/include/yaml-cpp/ostream_wrapper.h
deleted file mode 100644
index a6d96c5..0000000
--- a/ext/include/yaml-cpp/ostream_wrapper.h
+++ /dev/null
@@ -1,69 +0,0 @@
-#ifndef OSTREAM_WRAPPER_H_62B23520_7C8E_11DE_8A39_0800200C9A66
-#define OSTREAM_WRAPPER_H_62B23520_7C8E_11DE_8A39_0800200C9A66
-
-#if defined(_MSC_VER) || (defined(__GNUC__) && (__GNUC__ == 3 && __GNUC_MINOR__ >= 4) || (__GNUC__ >= 4)) // GCC supports "pragma once" correctly since 3.4
-#pragma once
-#endif
-
-
-#include <string>
-#include <vector>
-
-namespace YAML
-{
- class ostream_wrapper
- {
- public:
- ostream_wrapper();
- explicit ostream_wrapper(std::ostream& stream);
- ~ostream_wrapper();
-
- void write(const std::string& str);
- void write(const char *str, std::size_t size);
-
- void set_comment() { m_comment = true; }
-
- const char *str() const {
- if(m_pStream) {
- return 0;
- } else {
- m_buffer[m_pos] = '\0';
- return &m_buffer[0];
- }
- }
-
- std::size_t row() const { return m_row; }
- std::size_t col() const { return m_col; }
- std::size_t pos() const { return m_pos; }
- bool comment() const { return m_comment; }
-
- private:
- void update_pos(char ch);
-
- private:
- mutable std::vector<char> m_buffer;
- std::ostream *m_pStream;
-
- std::size_t m_pos;
- std::size_t m_row, m_col;
- bool m_comment;
- };
-
- template<std::size_t N>
- inline ostream_wrapper& operator << (ostream_wrapper& stream, const char (&str)[N]) {
- stream.write(str, N-1);
- return stream;
- }
-
- inline ostream_wrapper& operator << (ostream_wrapper& stream, const std::string& str) {
- stream.write(str);
- return stream;
- }
-
- inline ostream_wrapper& operator << (ostream_wrapper& stream, char ch) {
- stream.write(&ch, 1);
- return stream;
- }
-}
-
-#endif // OSTREAM_WRAPPER_H_62B23520_7C8E_11DE_8A39_0800200C9A66
diff --git a/ext/include/yaml-cpp/parser.h b/ext/include/yaml-cpp/parser.h
deleted file mode 100644
index 8ec926b..0000000
--- a/ext/include/yaml-cpp/parser.h
+++ /dev/null
@@ -1,47 +0,0 @@
-#ifndef PARSER_H_62B23520_7C8E_11DE_8A39_0800200C9A66
-#define PARSER_H_62B23520_7C8E_11DE_8A39_0800200C9A66
-
-#if defined(_MSC_VER) || (defined(__GNUC__) && (__GNUC__ == 3 && __GNUC_MINOR__ >= 4) || (__GNUC__ >= 4)) // GCC supports "pragma once" correctly since 3.4
-#pragma once
-#endif
-
-
-#include "yaml-cpp/dll.h"
-#include "yaml-cpp/noncopyable.h"
-#include <ios>
-#include <memory>
-
-namespace YAML
-{
- struct Directives;
- struct Token;
- class EventHandler;
- class Scanner;
-
- class YAML_CPP_API Parser: private noncopyable
- {
- public:
- Parser();
- Parser(std::istream& in);
- ~Parser();
-
- operator bool() const;
-
- void Load(std::istream& in);
- bool HandleNextDocument(EventHandler& eventHandler);
-
- void PrintTokens(std::ostream& out);
-
- private:
- void ParseDirectives();
- void HandleDirective(const Token& token);
- void HandleYamlDirective(const Token& token);
- void HandleTagDirective(const Token& token);
-
- private:
- std::auto_ptr<Scanner> m_pScanner;
- std::auto_ptr<Directives> m_pDirectives;
- };
-}
-
-#endif // PARSER_H_62B23520_7C8E_11DE_8A39_0800200C9A66
diff --git a/ext/include/yaml-cpp/stlemitter.h b/ext/include/yaml-cpp/stlemitter.h
deleted file mode 100644
index f8ff20e..0000000
--- a/ext/include/yaml-cpp/stlemitter.h
+++ /dev/null
@@ -1,51 +0,0 @@
-#ifndef STLEMITTER_H_62B23520_7C8E_11DE_8A39_0800200C9A66
-#define STLEMITTER_H_62B23520_7C8E_11DE_8A39_0800200C9A66
-
-#if defined(_MSC_VER) || (defined(__GNUC__) && (__GNUC__ == 3 && __GNUC_MINOR__ >= 4) || (__GNUC__ >= 4)) // GCC supports "pragma once" correctly since 3.4
-#pragma once
-#endif
-
-
-#include <vector>
-#include <list>
-#include <set>
-#include <map>
-
-namespace YAML
-{
- template<typename Seq>
- inline Emitter& EmitSeq(Emitter& emitter, const Seq& seq) {
- emitter << BeginSeq;
- for(typename Seq::const_iterator it=seq.begin();it!=seq.end();++it)
- emitter << *it;
- emitter << EndSeq;
- return emitter;
- }
-
- template<typename T>
- inline Emitter& operator << (Emitter& emitter, const std::vector<T>& v) {
- return EmitSeq(emitter, v);
- }
-
- template<typename T>
- inline Emitter& operator << (Emitter& emitter, const std::list<T>& v) {
- return EmitSeq(emitter, v);
- }
-
- template<typename T>
- inline Emitter& operator << (Emitter& emitter, const std::set<T>& v) {
- return EmitSeq(emitter, v);
- }
-
- template <typename K, typename V>
- inline Emitter& operator << (Emitter& emitter, const std::map<K, V>& m) {
- typedef typename std::map <K, V> map;
- emitter << BeginMap;
- for(typename map::const_iterator it=m.begin();it!=m.end();++it)
- emitter << Key << it->first << Value << it->second;
- emitter << EndMap;
- return emitter;
- }
-}
-
-#endif // STLEMITTER_H_62B23520_7C8E_11DE_8A39_0800200C9A66
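
stlemitter.h only adds operator<< overloads that wrap BeginSeq/EndSeq and BeginMap/EndMap around the standard containers. A minimal illustrative sketch (not part of this commit; Emitter and its c_str() accessor come from emitter.h, which this hunk does not show, and the map contents are made up):

    #include <iostream>
    #include <map>
    #include <string>
    #include "yaml-cpp/yaml.h"

    int main() {
        std::map<std::string, int> m;
        m["kmers"] = 55;
        m["threads"] = 16;

        YAML::Emitter out;
        out << m;                          // expands to BeginMap, Key/Value pairs, EndMap
        std::cout << out.c_str() << "\n";
        return 0;
    }
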
diff --git a/ext/include/yaml-cpp/traits.h b/ext/include/yaml-cpp/traits.h
deleted file mode 100644
index 09eead4..0000000
--- a/ext/include/yaml-cpp/traits.h
+++ /dev/null
@@ -1,57 +0,0 @@
-#ifndef TRAITS_H_62B23520_7C8E_11DE_8A39_0800200C9A66
-#define TRAITS_H_62B23520_7C8E_11DE_8A39_0800200C9A66
-
-#if defined(_MSC_VER) || (defined(__GNUC__) && (__GNUC__ == 3 && __GNUC_MINOR__ >= 4) || (__GNUC__ >= 4)) // GCC supports "pragma once" correctly since 3.4
-#pragma once
-#endif
-
-
-namespace YAML
-{
- template <typename>
- struct is_numeric { enum { value = false }; };
-
- template <> struct is_numeric <char> { enum { value = true }; };
- template <> struct is_numeric <unsigned char> { enum { value = true }; };
- template <> struct is_numeric <int> { enum { value = true }; };
- template <> struct is_numeric <unsigned int> { enum { value = true }; };
- template <> struct is_numeric <long int> { enum { value = true }; };
- template <> struct is_numeric <unsigned long int> { enum { value = true }; };
- template <> struct is_numeric <short int> { enum { value = true }; };
- template <> struct is_numeric <unsigned short int> { enum { value = true }; };
-#if defined(_MSC_VER) && (_MSC_VER < 1310)
- template <> struct is_numeric <__int64> { enum { value = true }; };
- template <> struct is_numeric <unsigned __int64> { enum { value = true }; };
-#else
- template <> struct is_numeric <long long> { enum { value = true }; };
- template <> struct is_numeric <unsigned long long> { enum { value = true }; };
-#endif
- template <> struct is_numeric <float> { enum { value = true }; };
- template <> struct is_numeric <double> { enum { value = true }; };
- template <> struct is_numeric <long double> { enum { value = true }; };
-
- template <bool, class T = void>
- struct enable_if_c {
- typedef T type;
- };
-
- template <class T>
- struct enable_if_c<false, T> {};
-
- template <class Cond, class T = void>
- struct enable_if : public enable_if_c<Cond::value, T> {};
-
- template <bool, class T = void>
- struct disable_if_c {
- typedef T type;
- };
-
- template <class T>
- struct disable_if_c<true, T> {};
-
- template <class Cond, class T = void>
- struct disable_if : public disable_if_c<Cond::value, T> {};
-}
-
-#endif // TRAITS_H_62B23520_7C8E_11DE_8A39_0800200C9A66
-
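
traits.h is a small pre-C++11 substitute for enable_if/disable_if keyed on is_numeric, intended for SFINAE-style overload selection. A self-contained illustrative sketch of the pattern (the stringify() helper is hypothetical, not part of yaml-cpp):

    #include <iostream>
    #include <sstream>
    #include <string>
    #include "yaml-cpp/traits.h"

    // Selected only for the is_numeric specializations.
    template <typename T>
    typename YAML::enable_if<YAML::is_numeric<T> >::type
    stringify(const T& value, std::string& out) {
        std::stringstream ss;
        ss << value;
        out = ss.str();
    }

    // Selected for everything else.
    template <typename T>
    typename YAML::disable_if<YAML::is_numeric<T> >::type
    stringify(const T&, std::string& out) {
        out = "<non-numeric>";
    }

    int main() {
        std::string s;
        stringify(3.5, s);  std::cout << s << "\n";   // 3.5
        stringify("x", s);  std::cout << s << "\n";   // <non-numeric>
        return 0;
    }
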
diff --git a/ext/include/yaml-cpp/yaml.h b/ext/include/yaml-cpp/yaml.h
deleted file mode 100644
index 4e63408..0000000
--- a/ext/include/yaml-cpp/yaml.h
+++ /dev/null
@@ -1,21 +0,0 @@
-#ifndef YAML_H_62B23520_7C8E_11DE_8A39_0800200C9A66
-#define YAML_H_62B23520_7C8E_11DE_8A39_0800200C9A66
-
-#if defined(_MSC_VER) || (defined(__GNUC__) && (__GNUC__ == 3 && __GNUC_MINOR__ >= 4) || (__GNUC__ >= 4)) // GCC supports "pragma once" correctly since 3.4
-#pragma once
-#endif
-
-#include "yaml-cpp/parser.h"
-#include "yaml-cpp/emitter.h"
-#include "yaml-cpp/stlemitter.h"
-#include "yaml-cpp/exceptions.h"
-
-#include "yaml-cpp/node/node.h"
-#include "yaml-cpp/node/impl.h"
-#include "yaml-cpp/node/convert.h"
-#include "yaml-cpp/node/iterator.h"
-#include "yaml-cpp/node/detail/impl.h"
-#include "yaml-cpp/node/parse.h"
-#include "yaml-cpp/node/emit.h"
-
-#endif // YAML_H_62B23520_7C8E_11DE_8A39_0800200C9A66
diff --git a/ext/src/CMakeLists.txt b/ext/src/CMakeLists.txt
index 26d0f6b..b702eb9 100644
--- a/ext/src/CMakeLists.txt
+++ b/ext/src/CMakeLists.txt
@@ -6,10 +6,11 @@ add_subdirectory(python_libs/joblib2)
add_subdirectory(python_libs/joblib3)
add_subdirectory(python_libs/pyyaml2)
add_subdirectory(python_libs/pyyaml3)
-add_subdirectory(yaml-cpp)
add_subdirectory(ConsensusCore)
add_subdirectory(bamtools)
add_subdirectory(samtools)
add_subdirectory(cppformat)
add_subdirectory(ssw)
-add_subdirectory(cityhash)
\ No newline at end of file
+add_subdirectory(cityhash)
+add_subdirectory(llvm)
+add_subdirectory(htrie)
\ No newline at end of file
diff --git a/ext/src/bamtools/api/internal/bam/BamMultiReader_p.cpp b/ext/src/bamtools/api/internal/bam/BamMultiReader_p.cpp
index 2921e6b..8f313f5 100644
--- a/ext/src/bamtools/api/internal/bam/BamMultiReader_p.cpp
+++ b/ext/src/bamtools/api/internal/bam/BamMultiReader_p.cpp
@@ -461,7 +461,7 @@ bool BamMultiReaderPrivate::Open(const vector<string>& filenames) {
// put all current readers back at beginning (refreshes alignment cache)
if ( !Rewind() ) {
const string currentError = m_errorString;
- const string message = string("unable to rewind existing readers: \n\t") + currentError;
+ const string message = string("unable to rewind existing reads_io: \n\t") + currentError;
SetErrorString("BamMultiReader::Open", message);
return false;
}
@@ -614,7 +614,7 @@ bool BamMultiReaderPrivate::Rewind(void) {
// attempt to rewind files
if ( !RewindReaders() ) {
const string currentError = m_errorString;
- const string message = string("could not rewind readers: \n\t") + currentError;
+ const string message = string("could not rewind reads_io: \n\t") + currentError;
SetErrorString("BamMultiReader::Rewind", message);
return false;
}
diff --git a/ext/src/htrie/CMakeLists.txt b/ext/src/htrie/CMakeLists.txt
new file mode 100644
index 0000000..4345de8
--- /dev/null
+++ b/ext/src/htrie/CMakeLists.txt
@@ -0,0 +1,11 @@
+############################################################################
+# Copyright (c) 2015 Saint Petersburg State University
+# All Rights Reserved
+# See file LICENSE for details.
+############################################################################
+
+project(hattrie C)
+
+add_library(hattrie STATIC
+ ahtable.c hat-trie.c misc.c murmurhash3.c)
+
diff --git a/ext/src/htrie/ahtable.c b/ext/src/htrie/ahtable.c
new file mode 100644
index 0000000..c84c433
--- /dev/null
+++ b/ext/src/htrie/ahtable.c
@@ -0,0 +1,564 @@
+/*
+ * This file is part of hat-trie.
+ *
+ * Copyright (c) 2011 by Daniel C. Jones <dcjones at cs.washington.edu>
+ *
+ * See ahtable.h for a description of the Array Hash Table.
+ *
+ */
+
+#include "htrie/ahtable.h"
+#include "misc.h"
+#include "murmurhash3.h"
+#include <assert.h>
+#include <string.h>
+
+const double ahtable_max_load_factor = 100000.0; /* arbitrarily large number => don't resize */
+const size_t ahtable_initial_size = 4096;
+
+static size_t keylen(slot_t s) {
+ if (0x1 & *s) {
+ return (size_t) (*((uint16_t*) s) >> 1);
+ }
+ else {
+ return (size_t) (*s >> 1);
+ }
+}
+
+
+ahtable_t* ahtable_create()
+{
+ return ahtable_create_n(ahtable_initial_size);
+}
+
+
+ahtable_t* ahtable_create_n(size_t n)
+{
+ ahtable_t* table = malloc_or_die(sizeof(ahtable_t));
+ table->flag = 0;
+ table->c0 = table->c1 = '\0';
+
+ table->n = n;
+ table->m = 0;
+ table->max_m = (size_t) (ahtable_max_load_factor * (double) table->n);
+ table->slots = malloc_or_die(n * sizeof(slot_t));
+ memset(table->slots, 0, n * sizeof(slot_t));
+
+ table->slot_sizes = malloc_or_die(n * sizeof(size_t));
+ memset(table->slot_sizes, 0, n * sizeof(size_t));
+
+ return table;
+}
+
+
+void ahtable_free(ahtable_t* table)
+{
+ if (table == NULL) return;
+ size_t i;
+ for (i = 0; i < table->n; ++i) free(table->slots[i]);
+ free(table->slots);
+ free(table->slot_sizes);
+ free(table);
+}
+
+
+size_t ahtable_size(const ahtable_t* table)
+{
+ return table->m;
+}
+
+
+size_t ahtable_sizeof(const ahtable_t* table)
+{
+ size_t nbytes = sizeof(ahtable_t) +
+ table->n * (sizeof(size_t) + sizeof(slot_t));
+ size_t i;
+ for (i = 0; i < table->n; ++i) {
+ nbytes += table->slot_sizes[i];
+ }
+ return nbytes;
+}
+
+
+void ahtable_clear(ahtable_t* table)
+{
+ size_t i;
+ for (i = 0; i < table->n; ++i) free(table->slots[i]);
+ table->n = ahtable_initial_size;
+ table->slots = realloc_or_die(table->slots, table->n * sizeof(slot_t));
+ memset(table->slots, 0, table->n * sizeof(slot_t));
+
+ table->slot_sizes = realloc_or_die(table->slot_sizes, table->n * sizeof(size_t));
+ memset(table->slot_sizes, 0, table->n * sizeof(size_t));
+}
+
+/** Inserts a key with value into slot s, and returns a pointer to the
+ * space immediately after.
+ */
+static slot_t ins_key(slot_t s, const char* key, size_t len, value_t** val)
+{
+ // key length
+ if (len < 128) {
+ s[0] = (unsigned char) (len << 1);
+ s += 1;
+ }
+ else {
+ /* The least significant bit is set to indicate that two bytes are
+ * being used to store the key length. */
+ *((uint16_t*) s) = ((uint16_t) len << 1) | 0x1;
+ s += 2;
+ }
+
+ // key
+ memcpy(s, key, len * sizeof(unsigned char));
+ s += len;
+
+ // value
+ *val = (value_t*) s;
+ **val = 0;
+ s += sizeof(value_t);
+
+ return s;
+}
+
+
+static void ahtable_expand(ahtable_t* table)
+{
+ /* Resizing a table is essentially building a brand new one.
+ * One little shortcut we can take on the memory allocation front is to
+ * figure out how much memory each slot needs in advance.
+ */
+ assert(table->n > 0);
+ size_t new_n = 2 * table->n;
+ size_t* slot_sizes = malloc_or_die(new_n * sizeof(size_t));
+ memset(slot_sizes, 0, new_n * sizeof(size_t));
+
+ const char* key;
+ size_t len = 0;
+ size_t m = 0;
+ ahtable_iter_t* i = ahtable_iter_begin(table, false);
+ while (!ahtable_iter_finished(i)) {
+ key = ahtable_iter_key(i, &len);
+ slot_sizes[hash(key, len) % new_n] +=
+ len + sizeof(value_t) + (len >= 128 ? 2 : 1);
+
+ ++m;
+ ahtable_iter_next(i);
+ }
+ assert(m == table->m);
+ ahtable_iter_free(i);
+
+
+ /* allocate slots */
+ slot_t* slots = malloc_or_die(new_n * sizeof(slot_t));
+ size_t j;
+ for (j = 0; j < new_n; ++j) {
+ if (slot_sizes[j] > 0) {
+ slots[j] = malloc_or_die(slot_sizes[j]);
+ }
+ else slots[j] = NULL;
+ }
+
+ /* rehash values. A few shortcuts can be taken here as well, as we know
+ * there will be no collisions. Instead of the regular insertion routine,
+ * we keep track of the ends of every slot and simply insert keys.
+     */
+ slot_t* slots_next = malloc_or_die(new_n * sizeof(slot_t));
+ memcpy(slots_next, slots, new_n * sizeof(slot_t));
+ size_t h;
+ m = 0;
+ value_t* u;
+ value_t* v;
+ i = ahtable_iter_begin(table, false);
+ while (!ahtable_iter_finished(i)) {
+
+ key = ahtable_iter_key(i, &len);
+ h = hash(key, len) % new_n;
+
+ slots_next[h] = ins_key(slots_next[h], key, len, &u);
+ v = ahtable_iter_val(i);
+ *u = *v;
+
+ ++m;
+ ahtable_iter_next(i);
+ }
+ assert(m == table->m);
+ ahtable_iter_free(i);
+
+
+ free(slots_next);
+ for (j = 0; j < table->n; ++j) free(table->slots[j]);
+
+ free(table->slots);
+ table->slots = slots;
+
+ free(table->slot_sizes);
+ table->slot_sizes = slot_sizes;
+
+ table->n = new_n;
+ table->max_m = (size_t) (ahtable_max_load_factor * (double) table->n);
+}
+
+
+static value_t* get_key(ahtable_t* table, const char* key, size_t len, bool insert_missing)
+{
+ /* if we are at capacity, preemptively resize */
+ if (insert_missing && table->m >= table->max_m) {
+ ahtable_expand(table);
+ }
+
+
+ uint32_t i = hash(key, len) % table->n;
+ size_t k;
+ slot_t s;
+ value_t* val;
+
+ /* search the array for our key */
+ s = table->slots[i];
+ while ((size_t) (s - table->slots[i]) < table->slot_sizes[i]) {
+ /* get the key length */
+ k = keylen(s);
+ s += k < 128 ? 1 : 2;
+
+        /* skip keys whose length differs from ours */
+ if (k != len) {
+ s += k + sizeof(value_t);
+ continue;
+ }
+
+ /* key found. */
+ if (memcmp(s, key, len) == 0) {
+ return (value_t*) (s + len);
+ }
+ /* key not found. */
+ else {
+ s += k + sizeof(value_t);
+ continue;
+ }
+ }
+
+
+ if (insert_missing) {
+ /* the key was not found, so we must insert it. */
+ size_t new_size = table->slot_sizes[i];
+ new_size += 1 + (len >= 128 ? 1 : 0); // key length
+ new_size += len * sizeof(unsigned char); // key
+ new_size += sizeof(value_t); // value
+
+ table->slots[i] = realloc_or_die(table->slots[i], new_size);
+
+ ++table->m;
+ ins_key(table->slots[i] + table->slot_sizes[i], key, len, &val);
+ table->slot_sizes[i] = new_size;
+
+ return val;
+ }
+ else return NULL;
+}
+
+
+value_t* ahtable_get(ahtable_t* table, const char* key, size_t len)
+{
+ return get_key(table, key, len, true);
+}
+
+
+value_t* ahtable_tryget(ahtable_t* table, const char* key, size_t len )
+{
+ return get_key(table, key, len, false);
+}
+
+
+int ahtable_del(ahtable_t* table, const char* key, size_t len)
+{
+ uint32_t i = hash(key, len) % table->n;
+ size_t k;
+ slot_t s;
+
+ /* search the array for our key */
+ s = table->slots[i];
+ while ((size_t) (s - table->slots[i]) < table->slot_sizes[i]) {
+ /* get the key length */
+ k = keylen(s);
+ s += k < 128 ? 1 : 2;
+
+        /* skip keys whose length differs from ours */
+ if (k != len) {
+ s += k + sizeof(value_t);
+ continue;
+ }
+
+ /* key found. */
+ if (memcmp(s, key, len) == 0) {
+ /* move everything over, resize the array */
+ unsigned char* t = s + len + sizeof(value_t);
+ s -= k < 128 ? 1 : 2;
+ memmove(s, t, table->slot_sizes[i] - (size_t) (t - table->slots[i]));
+ table->slot_sizes[i] -= (size_t) (t - s);
+ --table->m;
+ return 0;
+ }
+ /* key not found. */
+ else {
+ s += k + sizeof(value_t);
+ continue;
+ }
+ }
+
+ // Key was not found. Do nothing.
+ return -1;
+}
+
+
+
+static int cmpkey(const void* a_, const void* b_)
+{
+ slot_t a = *(slot_t*) a_;
+ slot_t b = *(slot_t*) b_;
+
+ size_t ka = keylen(a), kb = keylen(b);
+
+ a += ka < 128 ? 1 : 2;
+ b += kb < 128 ? 1 : 2;
+
+ int c = memcmp(a, b, ka < kb ? ka : kb);
+ return c == 0 ? (int) ka - (int) kb : c;
+}
+
+
+/* Sorted/unsorted iterators are kept private and exposed by passing the
+sorted flag to ahtable_iter_begin. */
+
+typedef struct ahtable_sorted_iter_t_
+{
+ const ahtable_t* table; // parent
+ slot_t* xs; // pointers to keys
+ size_t i; // current key
+} ahtable_sorted_iter_t;
+
+
+static ahtable_sorted_iter_t* ahtable_sorted_iter_begin(const ahtable_t* table)
+{
+ ahtable_sorted_iter_t* i = malloc_or_die(sizeof(ahtable_sorted_iter_t));
+ i->table = table;
+ i->xs = malloc_or_die(table->m * sizeof(slot_t));
+ i->i = 0;
+
+ slot_t s;
+ size_t j, k, u;
+ for (j = 0, u = 0; j < table->n; ++j) {
+ s = table->slots[j];
+ while (s < table->slots[j] + table->slot_sizes[j]) {
+ i->xs[u++] = s;
+ k = keylen(s);
+ s += k < 128 ? 1 : 2;
+ s += k + sizeof(value_t);
+ }
+ }
+
+ qsort(i->xs, table->m, sizeof(slot_t), cmpkey);
+
+ return i;
+}
+
+
+static bool ahtable_sorted_iter_finished(ahtable_sorted_iter_t* i)
+{
+ return i->i >= i->table->m;
+}
+
+
+static void ahtable_sorted_iter_next(ahtable_sorted_iter_t* i)
+{
+ if (ahtable_sorted_iter_finished(i)) return;
+ ++i->i;
+}
+
+
+static void ahtable_sorted_iter_free(ahtable_sorted_iter_t* i)
+{
+ if (i == NULL) return;
+ free(i->xs);
+ free(i);
+}
+
+
+static const char* ahtable_sorted_iter_key(ahtable_sorted_iter_t* i, size_t* len)
+{
+    if (ahtable_sorted_iter_finished(i)) return NULL;
+
+    slot_t s = i->xs[i->i];
+    size_t k = keylen(s);
+    if (len) *len = k;
+
+    /* use the local length so a NULL len argument is not dereferenced */
+    return (const char*) (s + (k < 128 ? 1 : 2));
+}
+
+
+static value_t* ahtable_sorted_iter_val(ahtable_sorted_iter_t* i)
+{
+ if (ahtable_sorted_iter_finished(i)) return NULL;
+
+ slot_t s = i->xs[i->i];
+ size_t k = keylen(s);
+
+ s += k < 128 ? 1 : 2;
+ s += k;
+
+ return (value_t*) s;
+}
+
+
+typedef struct ahtable_unsorted_iter_t_
+{
+ const ahtable_t* table; // parent
+ size_t i; // slot index
+ slot_t s; // slot position
+} ahtable_unsorted_iter_t;
+
+
+static ahtable_unsorted_iter_t* ahtable_unsorted_iter_begin(const ahtable_t* table)
+{
+ ahtable_unsorted_iter_t* i = malloc_or_die(sizeof(ahtable_unsorted_iter_t));
+ i->table = table;
+
+ for (i->i = 0; i->i < i->table->n; ++i->i) {
+ i->s = table->slots[i->i];
+ if ((size_t) (i->s - table->slots[i->i]) >= table->slot_sizes[i->i]) continue;
+ break;
+ }
+
+ return i;
+}
+
+
+static bool ahtable_unsorted_iter_finished(ahtable_unsorted_iter_t* i)
+{
+ return i->i >= i->table->n;
+}
+
+
+static void ahtable_unsorted_iter_next(ahtable_unsorted_iter_t* i)
+{
+ if (ahtable_unsorted_iter_finished(i)) return;
+
+ /* get the key length */
+ size_t k = keylen(i->s);
+ i->s += k < 128 ? 1 : 2;
+
+ /* skip to the next key */
+ i->s += k + sizeof(value_t);
+
+ if ((size_t) (i->s - i->table->slots[i->i]) >= i->table->slot_sizes[i->i]) {
+ do {
+ ++i->i;
+ } while(i->i < i->table->n &&
+ i->table->slot_sizes[i->i] == 0);
+
+ if (i->i < i->table->n) i->s = i->table->slots[i->i];
+ else i->s = NULL;
+ }
+}
+
+
+static void ahtable_unsorted_iter_free(ahtable_unsorted_iter_t* i)
+{
+ free(i);
+}
+
+
+static const char* ahtable_unsorted_iter_key(ahtable_unsorted_iter_t* i, size_t* len)
+{
+ if (ahtable_unsorted_iter_finished(i)) return NULL;
+
+ slot_t s = i->s;
+ size_t k;
+ if (0x1 & *s) {
+ k = (size_t) (*((uint16_t*) s)) >> 1;
+ s += 2;
+ }
+ else {
+ k = (size_t) (*s >> 1);
+ s += 1;
+ }
+
+ if(len) *len = k;
+ return (const char*) s;
+}
+
+
+static value_t* ahtable_unsorted_iter_val(ahtable_unsorted_iter_t* i)
+{
+ if (ahtable_unsorted_iter_finished(i)) return NULL;
+
+ slot_t s = i->s;
+
+ size_t k;
+ if (0x1 & *s) {
+ k = (size_t) (*((uint16_t*) s)) >> 1;
+ s += 2;
+ }
+ else {
+ k = (size_t) (*s >> 1);
+ s += 1;
+ }
+
+ s += k;
+ return (value_t*) s;
+}
+
+
+struct ahtable_iter_t_
+{
+ bool sorted;
+ union {
+ ahtable_unsorted_iter_t* unsorted;
+ ahtable_sorted_iter_t* sorted;
+ } i;
+};
+
+
+ahtable_iter_t* ahtable_iter_begin(const ahtable_t* table, bool sorted) {
+ ahtable_iter_t* i = malloc_or_die(sizeof(ahtable_iter_t));
+ i->sorted = sorted;
+ if (sorted) i->i.sorted = ahtable_sorted_iter_begin(table);
+ else i->i.unsorted = ahtable_unsorted_iter_begin(table);
+ return i;
+}
+
+
+void ahtable_iter_next(ahtable_iter_t* i)
+{
+ if (i->sorted) ahtable_sorted_iter_next(i->i.sorted);
+ else ahtable_unsorted_iter_next(i->i.unsorted);
+}
+
+
+bool ahtable_iter_finished(ahtable_iter_t* i)
+{
+ if (i->sorted) return ahtable_sorted_iter_finished(i->i.sorted);
+ else return ahtable_unsorted_iter_finished(i->i.unsorted);
+}
+
+
+void ahtable_iter_free(ahtable_iter_t* i)
+{
+ if (i == NULL) return;
+ if (i->sorted) ahtable_sorted_iter_free(i->i.sorted);
+ else ahtable_unsorted_iter_free(i->i.unsorted);
+ free(i);
+}
+
+
+const char* ahtable_iter_key(ahtable_iter_t* i, size_t* len)
+{
+ if (i->sorted) return ahtable_sorted_iter_key(i->i.sorted, len);
+ else return ahtable_unsorted_iter_key(i->i.unsorted, len);
+}
+
+
+value_t* ahtable_iter_val(ahtable_iter_t* i)
+{
+ if (i->sorted) return ahtable_sorted_iter_val(i->i.sorted);
+ else return ahtable_unsorted_iter_val(i->i.unsorted);
+}
+
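For reference, a minimal sketch of how calling code can walk a table through the ahtable_iter_* wrapper defined above (not part of the upstream sources; the keys and the dump_table/main scaffolding are illustrative, and value_t is assumed to be an integer-like type as its use in hat-trie.c below suggests):

// Illustrative sketch, not part of the patch: sorted iteration over an
// ahtable via the public wrapper API. Assumes value_t is integer-like.
#include "htrie/ahtable.h"
#include <stdio.h>
#include <string.h>

static void dump_table(ahtable_t* table)
{
    /* sorted = true routes to the sorted iterator, false to the unsorted one;
     * the wrapper interface is identical either way. */
    ahtable_iter_t* it = ahtable_iter_begin(table, true);
    while (!ahtable_iter_finished(it)) {
        size_t len;
        const char* key = ahtable_iter_key(it, &len);   /* not NUL-terminated */
        value_t* val = ahtable_iter_val(it);
        printf("%.*s -> %lu\n", (int) len, key, (unsigned long) *val);
        ahtable_iter_next(it);
    }
    ahtable_iter_free(it);
}

int main(void)
{
    ahtable_t* table = ahtable_create();
    const char* words[] = { "spades", "assembly", "kmer" };
    size_t i;
    for (i = 0; i < 3; ++i)
        *ahtable_get(table, words[i], strlen(words[i])) = (value_t) (i + 1);
    dump_table(table);
    ahtable_free(table);
    return 0;
}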
diff --git a/ext/src/htrie/hat-trie.c b/ext/src/htrie/hat-trie.c
new file mode 100644
index 0000000..812c0d2
--- /dev/null
+++ b/ext/src/htrie/hat-trie.c
@@ -0,0 +1,711 @@
+/*
+ * This file is part of hat-trie.
+ *
+ * Copyright (c) 2011 by Daniel C. Jones <dcjones at cs.washington.edu>
+ *
+ */
+
+#include "htrie/hat-trie.h"
+#include "htrie/ahtable.h"
+#include "misc.h"
+#include <stdint.h>
+#include <assert.h>
+#include <string.h>
+
+#define HT_UNUSED(x) x=x
+
+/* maximum number of keys that may be stored in a bucket before it is burst */
+static const size_t MAX_BUCKET_SIZE = 16384;
+#define NODE_MAXCHAR 0xff // 0x7f for 7-bit ASCII
+#define NODE_CHILDS (NODE_MAXCHAR+1)
+
+static const uint8_t NODE_TYPE_TRIE = 0x1;
+static const uint8_t NODE_TYPE_PURE_BUCKET = 0x2;
+static const uint8_t NODE_TYPE_HYBRID_BUCKET = 0x4;
+static const uint8_t NODE_HAS_VAL = 0x8;
+
+
+struct trie_node_t_;
+
+/* Nodes may be trie nodes or buckets. This union allows us to keep a
+ * non-specific pointer. */
+typedef union node_ptr_
+{
+ ahtable_t* b;
+ struct trie_node_t_* t;
+ uint8_t* flag;
+} node_ptr;
+
+
+typedef struct trie_node_t_
+{
+ uint8_t flag;
+
+ /* the value for the key that is consumed on a trie node */
+ value_t val;
+
+ /* Map a character to either a trie_node_t or an ahtable_t. The first byte
+ * must be examined to determine which. */
+ node_ptr xs[NODE_CHILDS];
+
+} trie_node_t;
+
+struct hattrie_t_
+{
+ node_ptr root; // root node
+ size_t m; // number of stored keys
+};
+
+
+
+size_t hattrie_size(const hattrie_t* T)
+{
+ return T->m;
+}
+
+
+static size_t node_sizeof(node_ptr node)
+{
+ if (*node.flag & NODE_TYPE_TRIE) {
+ size_t nbytes = sizeof(trie_node_t);
+ size_t i;
+ nbytes += node_sizeof(node.t->xs[0]);
+ for (i = 1; i < NODE_CHILDS; ++i) {
+ if (node.t->xs[i].t != node.t->xs[i-1].t) nbytes += node_sizeof(node.t->xs[i]);
+ }
+ return nbytes;
+ }
+ else {
+ return ahtable_sizeof(node.b);
+ }
+}
+
+
+size_t hattrie_sizeof(const hattrie_t* T)
+{
+ return sizeof(hattrie_t) + node_sizeof(T->root);
+}
+
+
+/* Create a new trie node with all pointers pointing to the given child (which
+ * can be NULL). */
+static trie_node_t* alloc_trie_node(hattrie_t* T, node_ptr child)
+{
+ trie_node_t* node = malloc_or_die(sizeof(trie_node_t));
+ node->flag = NODE_TYPE_TRIE;
+ node->val = 0;
+
+ /* pass T to allow custom allocator for trie. */
+ HT_UNUSED(T); /* unused now */
+
+ size_t i;
+ for (i = 0; i < NODE_CHILDS; ++i) node->xs[i] = child;
+ return node;
+}
+
+/* iterate trie nodes until string is consumed or bucket is found */
+static node_ptr hattrie_consume(node_ptr *p, const char **k, size_t *l, unsigned brk)
+{
+ node_ptr node = p->t->xs[(unsigned char) **k];
+ while (*node.flag & NODE_TYPE_TRIE && *l > brk) {
+ ++*k;
+ --*l;
+ *p = node;
+ node = node.t->xs[(unsigned char) **k];
+ }
+
+ /* copy and writeback variables if it's faster */
+
+ assert(*p->flag & NODE_TYPE_TRIE);
+ return node;
+}
+
+/* use node value and return pointer to it */
+static inline value_t* hattrie_useval(hattrie_t *T, node_ptr n)
+{
+ if (!(n.t->flag & NODE_HAS_VAL)) {
+ n.t->flag |= NODE_HAS_VAL;
+ ++T->m;
+ }
+ return &n.t->val;
+}
+
+/* clear node value if exists */
+static inline int hattrie_clrval(hattrie_t *T, node_ptr n)
+{
+ if (n.t->flag & NODE_HAS_VAL) {
+ n.t->flag &= ~NODE_HAS_VAL;
+ n.t->val = 0;
+ --T->m;
+ return 0;
+ }
+ return -1;
+}
+
+/* find node in trie */
+static node_ptr hattrie_find(hattrie_t* T, const char **key, size_t *len)
+{
+ node_ptr parent = T->root;
+ assert(*parent.flag & NODE_TYPE_TRIE);
+
+ if (*len == 0) return parent;
+
+ node_ptr node = hattrie_consume(&parent, key, len, 1);
+
+ /* if the trie node consumes value, use it */
+ if (*node.flag & NODE_TYPE_TRIE) {
+ if (!(node.t->flag & NODE_HAS_VAL)) {
+ node.flag = NULL;
+ }
+ return node;
+ }
+
+ /* pure bucket holds only key suffixes, skip current char */
+ if (*node.flag & NODE_TYPE_PURE_BUCKET) {
+ *key += 1;
+ *len -= 1;
+ }
+
+ /* do not scan bucket, it's not needed for this operation */
+ return node;
+}
+
+hattrie_t* hattrie_create()
+{
+ hattrie_t* T = malloc_or_die(sizeof(hattrie_t));
+ T->m = 0;
+
+ node_ptr node;
+ node.b = ahtable_create();
+ node.b->flag = NODE_TYPE_HYBRID_BUCKET;
+ node.b->c0 = 0x00;
+ node.b->c1 = NODE_MAXCHAR;
+ T->root.t = alloc_trie_node(T, node);
+
+ return T;
+}
+
+
+static void hattrie_free_node(node_ptr node)
+{
+ if (*node.flag & NODE_TYPE_TRIE) {
+ size_t i;
+ for (i = 0; i < NODE_CHILDS; ++i) {
+ if (i > 0 && node.t->xs[i].t == node.t->xs[i - 1].t) continue;
+
+ /* XXX: recursion might not be the best choice here. It is possible
+ * to build a very deep trie. */
+ if (node.t->xs[i].t) hattrie_free_node(node.t->xs[i]);
+ }
+ free(node.t);
+ }
+ else {
+ ahtable_free(node.b);
+ }
+}
+
+
+void hattrie_free(hattrie_t* T)
+{
+ hattrie_free_node(T->root);
+ free(T);
+}
+
+
+void hattrie_clear(hattrie_t* T)
+{
+ hattrie_free_node(T->root);
+ node_ptr node;
+ node.b = ahtable_create();
+ node.b->flag = NODE_TYPE_HYBRID_BUCKET;
+ node.b->c0 = 0x00;
+ node.b->c1 = 0xff;
+ T->root.t = alloc_trie_node(T, node);
+}
+
+
+/* Perform one split operation on the given node with the given parent.
+ */
+static void hattrie_split(hattrie_t* T, node_ptr parent, node_ptr node)
+{
+ /* only buckets may be split */
+ assert(*node.flag & NODE_TYPE_PURE_BUCKET ||
+ *node.flag & NODE_TYPE_HYBRID_BUCKET);
+
+ assert(*parent.flag & NODE_TYPE_TRIE);
+
+ if (*node.flag & NODE_TYPE_PURE_BUCKET) {
+ /* turn the pure bucket into a hybrid bucket */
+ parent.t->xs[node.b->c0].t = alloc_trie_node(T, node);
+
+ /* if the bucket had an empty key, move it to the new trie node */
+ value_t* val = ahtable_tryget(node.b, NULL, 0);
+ if (val) {
+ parent.t->xs[node.b->c0].t->val = *val;
+ parent.t->xs[node.b->c0].t->flag |= NODE_HAS_VAL;
+ *val = 0;
+ ahtable_del(node.b, NULL, 0);
+ }
+
+ node.b->c0 = 0x00;
+ node.b->c1 = NODE_MAXCHAR;
+ node.b->flag = NODE_TYPE_HYBRID_BUCKET;
+
+ return;
+ }
+
+ /* This is a hybrid bucket. Perform a proper split. */
+
+ /* count the number of occurrences of every leading character */
+ unsigned int cs[NODE_CHILDS]; // occurrence count for leading chars
+ memset(cs, 0, NODE_CHILDS * sizeof(unsigned int));
+ size_t len;
+ const char* key;
+
+ ahtable_iter_t* i = ahtable_iter_begin(node.b, false);
+ while (!ahtable_iter_finished(i)) {
+ key = ahtable_iter_key(i, &len);
+ assert(len > 0);
+ cs[(unsigned char) key[0]] += 1;
+ ahtable_iter_next(i);
+ }
+ ahtable_iter_free(i);
+
+ /* choose a split point */
+ unsigned int left_m, right_m, all_m;
+ unsigned char j = node.b->c0;
+ all_m = ahtable_size(node.b);
+ left_m = cs[j];
+ right_m = all_m - left_m;
+ int d;
+
+ while (j + 1 < node.b->c1) {
+ d = abs((int) (left_m + cs[j + 1]) - (int) (right_m - cs[j + 1]));
+ if (d <= abs((int) left_m - (int) right_m) && left_m + cs[j + 1] < all_m) {
+ j += 1;
+ left_m += cs[j];
+ right_m -= cs[j];
+ }
+ else break;
+ }
+
+ /* now split into two nodes corresponding to ranges [0, j] and
+ * [j + 1, NODE_MAXCHAR], respectively. */
+
+
+ /* create new left and right nodes */
+
+ /* TODO: Add a special case if either node is a hybrid bucket containing all
+ * the keys. In such a case, do not build a new table, just use the old one.
+ * */
+ size_t num_slots;
+
+
+ for (num_slots = ahtable_initial_size;
+ (double) left_m > ahtable_max_load_factor * (double) num_slots;
+ num_slots *= 2);
+
+ node_ptr left, right;
+ left.b = ahtable_create_n(num_slots);
+ left.b->c0 = node.b->c0;
+ left.b->c1 = j;
+ left.b->flag = left.b->c0 == left.b->c1 ?
+ NODE_TYPE_PURE_BUCKET : NODE_TYPE_HYBRID_BUCKET;
+
+
+ for (num_slots = ahtable_initial_size;
+ (double) right_m > ahtable_max_load_factor * (double) num_slots;
+ num_slots *= 2);
+
+ right.b = ahtable_create_n(num_slots);
+ right.b->c0 = j + 1;
+ right.b->c1 = node.b->c1;
+ right.b->flag = right.b->c0 == right.b->c1 ?
+ NODE_TYPE_PURE_BUCKET : NODE_TYPE_HYBRID_BUCKET;
+
+
+ /* update the parent's pointer */
+
+ unsigned int c;
+ for (c = node.b->c0; c <= j; ++c) parent.t->xs[c] = left;
+ for (; c <= node.b->c1; ++c) parent.t->xs[c] = right;
+
+
+
+ /* distribute keys to the new left or right node */
+ value_t* u;
+ value_t* v;
+ i = ahtable_iter_begin(node.b, false);
+ while (!ahtable_iter_finished(i)) {
+ key = ahtable_iter_key(i, &len);
+ u = ahtable_iter_val(i);
+ assert(len > 0);
+
+ /* left */
+ if ((unsigned char) key[0] <= j) {
+ if (*left.flag & NODE_TYPE_PURE_BUCKET) {
+ v = ahtable_get(left.b, key + 1, len - 1);
+ }
+ else {
+ v = ahtable_get(left.b, key, len);
+ }
+ *v = *u;
+ }
+
+ /* right */
+ else {
+ if (*right.flag & NODE_TYPE_PURE_BUCKET) {
+ v = ahtable_get(right.b, key + 1, len - 1);
+ }
+ else {
+ v = ahtable_get(right.b, key, len);
+ }
+ *v = *u;
+ }
+
+ ahtable_iter_next(i);
+ }
+
+ ahtable_iter_free(i);
+ ahtable_free(node.b);
+}
+
+value_t* hattrie_get(hattrie_t* T, const char* key, size_t len)
+{
+ node_ptr parent = T->root;
+ assert(*parent.flag & NODE_TYPE_TRIE);
+
+ if (len == 0) return &parent.t->val;
+
+ /* consume all trie nodes, now parent must be trie and child anything */
+ node_ptr node = hattrie_consume(&parent, &key, &len, 0);
+ assert(*parent.flag & NODE_TYPE_TRIE);
+
+ /* if the key has been consumed on a trie node, use its value */
+ if (len == 0) {
+ if (*node.flag & NODE_TYPE_TRIE) {
+ return hattrie_useval(T, node);
+ }
+ else if (*node.flag & NODE_TYPE_HYBRID_BUCKET) {
+ return hattrie_useval(T, parent);
+ }
+ }
+
+
+ /* preemptively split the bucket if it is full */
+ while (ahtable_size(node.b) >= MAX_BUCKET_SIZE) {
+ hattrie_split(T, parent, node);
+
+ /* after the split, the node pointer is invalidated, so we search from
+ * the parent again. */
+ node = hattrie_consume(&parent, &key, &len, 0);
+
+ /* if the key has been consumed on a trie node, use its value */
+ if (len == 0) {
+ if (*node.flag & NODE_TYPE_TRIE) {
+ return hattrie_useval(T, node);
+ }
+ else if (*node.flag & NODE_TYPE_HYBRID_BUCKET) {
+ return hattrie_useval(T, parent);
+ }
+ }
+ }
+
+ assert(*node.flag & NODE_TYPE_PURE_BUCKET || *node.flag & NODE_TYPE_HYBRID_BUCKET);
+
+ assert(len > 0);
+ size_t m_old = node.b->m;
+ value_t* val;
+ if (*node.flag & NODE_TYPE_PURE_BUCKET) {
+ val = ahtable_get(node.b, key + 1, len - 1);
+ }
+ else {
+ val = ahtable_get(node.b, key, len);
+ }
+ T->m += (node.b->m - m_old);
+
+ return val;
+}
+
+
+value_t* hattrie_tryget(hattrie_t* T, const char* key, size_t len)
+{
+ /* find node for given key */
+ node_ptr node = hattrie_find(T, &key, &len);
+ if (node.flag == NULL) {
+ return NULL;
+ }
+
+ /* if the trie node consumes value, use it */
+ if (*node.flag & NODE_TYPE_TRIE) {
+ return &node.t->val;
+ }
+
+ return ahtable_tryget(node.b, key, len);
+}
+
+
+int hattrie_del(hattrie_t* T, const char* key, size_t len)
+{
+ node_ptr parent = T->root;
+ HT_UNUSED(parent);
+ assert(*parent.flag & NODE_TYPE_TRIE);
+
+ /* find node for deletion */
+ node_ptr node = hattrie_find(T, &key, &len);
+ if (node.flag == NULL) {
+ return -1;
+ }
+
+ /* if consumed on a trie node, clear the value */
+ if (*node.flag & NODE_TYPE_TRIE) {
+ return hattrie_clrval(T, node);
+ }
+
+ /* remove from bucket */
+ size_t m_old = ahtable_size(node.b);
+ int ret = ahtable_del(node.b, key, len);
+ T->m -= (m_old - ahtable_size(node.b));
+
+ /* merge empty buckets */
+ /*! \todo */
+
+ return ret;
+}
+
+
+/* plan for iteration:
+ * This is tricky, as we have no parent pointers currently, and I would like to
+ * avoid adding them. That means maintaining an explicit stack of nodes during
+ * iteration.
+ */
+
+typedef struct hattrie_node_stack_t_
+{
+ unsigned char c;
+ size_t level;
+
+ node_ptr node;
+ struct hattrie_node_stack_t_* next;
+
+} hattrie_node_stack_t;
+
+
+struct hattrie_iter_t_
+{
+ char* key;
+ size_t keysize; // space reserved for the key
+ size_t level;
+
+ /* keep track of keys stored in trie nodes */
+ bool has_nil_key;
+ value_t nil_val;
+
+ const hattrie_t* T;
+ bool sorted;
+ ahtable_iter_t* i;
+ hattrie_node_stack_t* stack;
+};
+
+
+static void hattrie_iter_pushchar(hattrie_iter_t* i, size_t level, char c)
+{
+ if (i->keysize < level) {
+ i->keysize *= 2;
+ i->key = realloc_or_die(i->key, i->keysize * sizeof(char));
+ }
+
+ if (level > 0) {
+ i->key[level - 1] = c;
+ }
+
+ i->level = level;
+}
+
+
+static void hattrie_iter_nextnode(hattrie_iter_t* i)
+{
+ if (i->stack == NULL) return;
+
+ /* pop the stack */
+ node_ptr node;
+ hattrie_node_stack_t* next;
+ unsigned char c;
+ size_t level;
+
+ node = i->stack->node;
+ next = i->stack->next;
+ c = i->stack->c;
+ level = i->stack->level;
+
+ free(i->stack);
+ i->stack = next;
+
+ if (*node.flag & NODE_TYPE_TRIE) {
+ hattrie_iter_pushchar(i, level, c);
+
+ if(node.t->flag & NODE_HAS_VAL) {
+ i->has_nil_key = true;
+ i->nil_val = node.t->val;
+ }
+
+ /* push all child nodes from right to left */
+ int j;
+ for (j = NODE_MAXCHAR; j >= 0; --j) {
+
+ /* skip repeated pointers to hybrid bucket */
+ if (j < NODE_MAXCHAR && node.t->xs[j].t == node.t->xs[j + 1].t) continue;
+
+ // push stack
+ next = i->stack;
+ i->stack = malloc_or_die(sizeof(hattrie_node_stack_t));
+ i->stack->node = node.t->xs[j];
+ i->stack->next = next;
+ i->stack->level = level + 1;
+ i->stack->c = (unsigned char) j;
+ }
+ }
+ else {
+ if (*node.flag & NODE_TYPE_PURE_BUCKET) {
+ hattrie_iter_pushchar(i, level, c);
+ }
+ else {
+ i->level = level - 1;
+ }
+
+ i->i = ahtable_iter_begin(node.b, i->sorted);
+ }
+}
+
+
+hattrie_iter_t* hattrie_iter_begin(const hattrie_t* T, bool sorted)
+{
+ hattrie_iter_t* i = malloc_or_die(sizeof(hattrie_iter_t));
+ i->T = T;
+ i->sorted = sorted;
+ i->i = NULL;
+ i->keysize = 16;
+ i->key = malloc_or_die(i->keysize * sizeof(char));
+ i->level = 0;
+ i->has_nil_key = false;
+ i->nil_val = 0;
+
+ i->stack = malloc_or_die(sizeof(hattrie_node_stack_t));
+ i->stack->next = NULL;
+ i->stack->node = T->root;
+ i->stack->c = '\0';
+ i->stack->level = 0;
+
+
+ while (((i->i == NULL || ahtable_iter_finished(i->i)) && !i->has_nil_key) &&
+ i->stack != NULL ) {
+
+ ahtable_iter_free(i->i);
+ i->i = NULL;
+ hattrie_iter_nextnode(i);
+ }
+
+ if (i->i != NULL && ahtable_iter_finished(i->i)) {
+ ahtable_iter_free(i->i);
+ i->i = NULL;
+ }
+
+ return i;
+}
+
+
+void hattrie_iter_next(hattrie_iter_t* i)
+{
+ if (hattrie_iter_finished(i)) return;
+
+ if (i->i != NULL && !ahtable_iter_finished(i->i)) {
+ ahtable_iter_next(i->i);
+ }
+ else if (i->has_nil_key) {
+ i->has_nil_key = false;
+ i->nil_val = 0;
+ hattrie_iter_nextnode(i);
+ }
+
+ while (((i->i == NULL || ahtable_iter_finished(i->i)) && !i->has_nil_key) &&
+ i->stack != NULL ) {
+
+ ahtable_iter_free(i->i);
+ i->i = NULL;
+ hattrie_iter_nextnode(i);
+ }
+
+ if (i->i != NULL && ahtable_iter_finished(i->i)) {
+ ahtable_iter_free(i->i);
+ i->i = NULL;
+ }
+}
+
+
+bool hattrie_iter_finished(hattrie_iter_t* i)
+{
+ return i->stack == NULL && i->i == NULL && !i->has_nil_key;
+}
+
+
+void hattrie_iter_free(hattrie_iter_t* i)
+{
+ if (i == NULL) return;
+ if (i->i) ahtable_iter_free(i->i);
+
+ hattrie_node_stack_t* next;
+ while (i->stack) {
+ next = i->stack->next;
+ free(i->stack);
+ i->stack = next;
+ }
+
+ free(i->key);
+ free(i);
+}
+
+
+const char* hattrie_iter_key(hattrie_iter_t* i, size_t* len)
+{
+ if (hattrie_iter_finished(i)) return NULL;
+
+ size_t sublen;
+ const char* subkey;
+
+ if (i->has_nil_key) {
+ subkey = NULL;
+ sublen = 0;
+ }
+ else subkey = ahtable_iter_key(i->i, &sublen);
+
+ if (i->keysize < i->level + sublen + 1) {
+ while (i->keysize < i->level + sublen + 1) i->keysize *= 2;
+ i->key = realloc_or_die(i->key, i->keysize * sizeof(char));
+ }
+
+ if (sublen > 0) memcpy(i->key + i->level, subkey, sublen);
+ i->key[i->level + sublen] = '\0';
+
+ if (len) *len = i->level + sublen;
+ return i->key;
+}
+
+
+value_t* hattrie_iter_val(hattrie_iter_t* i)
+{
+ if (i->has_nil_key) return &i->nil_val;
+
+ if (hattrie_iter_finished(i)) return NULL;
+
+ return ahtable_iter_val(i->i);
+}
+
+
+
+bool hattrie_iter_equal(const hattrie_iter_t* a,
+ const hattrie_iter_t* b)
+{
+ return a->T == b->T &&
+ a->sorted == b->sorted &&
+ a->i == b->i;
+}
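For reference, a minimal sketch of the external hattrie_* entry points implemented above (not part of the upstream sources; the keys and main scaffolding are illustrative, and value_t is again assumed to be an integer-like type):

// Illustrative sketch, not part of the patch: insert, look up, delete and
// iterate keys through the hattrie_* API. Key/length pairs are explicit
// because keys may contain embedded NULs; hattrie_iter_key returns a pointer
// into an internal buffer that is only valid until the next iterator call.
#include "htrie/hat-trie.h"
#include <stdio.h>

int main(void)
{
    hattrie_t* T = hattrie_create();

    *hattrie_get(T, "read", 4) = 1;     /* insert-or-find, returns value_t* */
    *hattrie_get(T, "reads", 5) = 2;
    *hattrie_get(T, "contig", 6) = 3;

    value_t* v = hattrie_tryget(T, "contig", 6);   /* lookup without insert */
    if (v) printf("contig -> %lu\n", (unsigned long) *v);

    hattrie_del(T, "reads", 5);
    printf("stored keys: %zu\n", hattrie_size(T));

    /* sorted iteration over the remaining keys */
    hattrie_iter_t* it = hattrie_iter_begin(T, true);
    for (; !hattrie_iter_finished(it); hattrie_iter_next(it)) {
        size_t len;
        const char* key = hattrie_iter_key(it, &len);
        printf("%.*s -> %lu\n", (int) len, key,
               (unsigned long) *hattrie_iter_val(it));
    }
    hattrie_iter_free(it);

    hattrie_free(T);
    return 0;
}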
diff --git a/ext/src/htrie/misc.c b/ext/src/htrie/misc.c
new file mode 100644
index 0000000..0530c34
--- /dev/null
+++ b/ext/src/htrie/misc.c
@@ -0,0 +1,46 @@
+/*
+ * This file is part of hat-trie.
+ *
+ * Copyright (c) 2011 by Daniel C. Jones <dcjones at cs.washington.edu>
+ *
+ */
+
+#include "misc.h"
+#include <stdlib.h>
+
+
+void* malloc_or_die(size_t n)
+{
+ void* p = malloc(n);
+ if (p == NULL && n != 0) {
+ fprintf(stderr, "Cannot allocate %zu bytes.\n", n);
+ exit(EXIT_FAILURE);
+ }
+ return p;
+}
+
+
+void* realloc_or_die(void* ptr, size_t n)
+{
+ void* p = realloc(ptr, n);
+ if (p == NULL && n != 0) {
+ fprintf(stderr, "Cannot allocate %zu bytes.\n", n);
+ exit(EXIT_FAILURE);
+ }
+ return p;
+}
+
+
+FILE* fopen_or_die(const char* path, const char* mode)
+{
+ FILE* f = fopen(path, mode);
+ if (f == NULL) {
+ fprintf(stderr, "Cannot open file %s with mode %s.\n", path, mode);
+ exit(EXIT_FAILURE);
+ }
+ return f;
+}
+
+
+
+
diff --git a/ext/src/htrie/misc.h b/ext/src/htrie/misc.h
new file mode 100644
index 0000000..7223b8b
--- /dev/null
+++ b/ext/src/htrie/misc.h
@@ -0,0 +1,22 @@
+/*
+ * This file is part of hat-trie.
+ *
+ * Copyright (c) 2011 by Daniel C. Jones <dcjones at cs.washington.edu>
+ *
+ * misc :
+ * miscellaneous functions.
+ *
+ */
+
+#ifndef LINESET_MISC_H
+#define LINESET_MISC_H
+
+#include <stdio.h>
+
+void* malloc_or_die(size_t);
+void* realloc_or_die(void*, size_t);
+FILE* fopen_or_die(const char*, const char*);
+
+#endif
+
+
diff --git a/ext/src/htrie/murmurhash3.c b/ext/src/htrie/murmurhash3.c
new file mode 100644
index 0000000..cb24c8f
--- /dev/null
+++ b/ext/src/htrie/murmurhash3.c
@@ -0,0 +1,77 @@
+/* This is MurmurHash3. The original C++ code was placed in the public domain
+ * by its author, Austin Appleby. */
+
+#include "murmurhash3.h"
+
+static inline uint32_t fmix(uint32_t h)
+{
+ h ^= h >> 16;
+ h *= 0x85ebca6b;
+ h ^= h >> 13;
+ h *= 0xc2b2ae35;
+ h ^= h >> 16;
+
+ return h;
+}
+
+
+static inline uint32_t rotl32(uint32_t x, int8_t r)
+{
+ return (x << r) | (x >> (32 - r));
+}
+
+
+uint32_t hash(const char* data, size_t len_)
+{
+ const int len = (int) len_;
+ const int nblocks = len / 4;
+
+ uint32_t h1 = 0xc062fb4a;
+
+ uint32_t c1 = 0xcc9e2d51;
+ uint32_t c2 = 0x1b873593;
+
+ //----------
+ // body
+
+ const uint32_t * blocks = (const uint32_t*) (data + nblocks * 4);
+
+ int i;
+ for(i = -nblocks; i; i++)
+ {
+ uint32_t k1 = blocks[i];
+
+ k1 *= c1;
+ k1 = rotl32(k1, 15);
+ k1 *= c2;
+
+ h1 ^= k1;
+ h1 = rotl32(h1, 13);
+ h1 = h1*5+0xe6546b64;
+ }
+
+ //----------
+ // tail
+
+ const uint8_t * tail = (const uint8_t*)(data + nblocks*4);
+
+ uint32_t k1 = 0;
+
+ switch(len & 3)
+ {
+ case 3: k1 ^= tail[2] << 16; /* fall through */
+ case 2: k1 ^= tail[1] << 8; /* fall through */
+ case 1: k1 ^= tail[0];
+ k1 *= c1; k1 = rotl32(k1,15); k1 *= c2; h1 ^= k1;
+ }
+
+ //----------
+ // finalization
+
+ h1 ^= len;
+
+ h1 = fmix(h1);
+
+ return h1;
+}
+
diff --git a/ext/src/htrie/murmurhash3.h b/ext/src/htrie/murmurhash3.h
new file mode 100644
index 0000000..9aa2dba
--- /dev/null
+++ b/ext/src/htrie/murmurhash3.h
@@ -0,0 +1,12 @@
+
+#ifndef MURMURHASH3_H
+#define MURMURHASH3_H
+
+#include <stdlib.h>
+
+#include <stdint.h>
+
+uint32_t hash(const char* data, size_t len);
+
+#endif
+
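The hash() declared above is the 32-bit MurmurHash3 variant with a fixed seed, presumably used by the ahtable code for bucketing keys. A trivial usage sketch (not part of the upstream sources; the key is illustrative):

// Illustrative sketch, not part of the patch: hashing a key/length pair.
#include "murmurhash3.h"
#include <stdio.h>
#include <string.h>

int main(void)
{
    const char* key = "ACGT";
    uint32_t h = hash(key, strlen(key));
    printf("%08x\n", (unsigned) h);
    return 0;
}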
diff --git a/ext/src/llvm/Atomic.cpp b/ext/src/llvm/Atomic.cpp
new file mode 100644
index 0000000..e5dd17e
--- /dev/null
+++ b/ext/src/llvm/Atomic.cpp
@@ -0,0 +1,58 @@
+//===-- Atomic.cpp - Atomic Operations --------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements atomic operations.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/Atomic.h"
+
+using namespace llvm;
+
+void sys::MemoryFence() {
+ __sync_synchronize();
+}
+
+sys::cas_flag sys::CompareAndSwap(volatile sys::cas_flag* ptr,
+ sys::cas_flag new_value,
+ sys::cas_flag old_value) {
+ return __sync_val_compare_and_swap(ptr, old_value, new_value);
+}
+
+sys::cas_flag sys::AtomicIncrement(volatile sys::cas_flag* ptr) {
+ return __sync_add_and_fetch(ptr, 1);
+}
+
+sys::cas_flag sys::AtomicDecrement(volatile sys::cas_flag* ptr) {
+ return __sync_sub_and_fetch(ptr, 1);
+}
+
+sys::cas_flag sys::AtomicAdd(volatile sys::cas_flag* ptr, sys::cas_flag val) {
+ return __sync_add_and_fetch(ptr, val);
+}
+
+sys::cas_flag sys::AtomicMul(volatile sys::cas_flag* ptr, sys::cas_flag val) {
+ sys::cas_flag original, result;
+ do {
+ original = *ptr;
+ result = original * val;
+ } while (sys::CompareAndSwap(ptr, result, original) != original);
+
+ return result;
+}
+
+sys::cas_flag sys::AtomicDiv(volatile sys::cas_flag* ptr, sys::cas_flag val) {
+ sys::cas_flag original, result;
+ do {
+ original = *ptr;
+ result = original / val;
+ } while (sys::CompareAndSwap(ptr, result, original) != original);
+
+ return result;
+}
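The primitives in Atomic.cpp above are thin wrappers over the GCC __sync builtins. A minimal usage sketch follows (not part of the upstream sources; sys::cas_flag is declared in llvm/Support/Atomic.h, shipped with this patch, and is assumed here to be an unsigned integer type):

// Illustrative sketch, not part of the patch.
#include "llvm/Support/Atomic.h"
#include <cassert>

int main() {
  volatile llvm::sys::cas_flag counter = 0;

  llvm::sys::AtomicIncrement(&counter);        // counter == 1
  llvm::sys::AtomicAdd(&counter, 4);           // counter == 5

  // CompareAndSwap(ptr, new_value, old_value) returns the value it saw at
  // ptr; the swap took place iff that equals old_value.
  llvm::sys::cas_flag seen = llvm::sys::CompareAndSwap(&counter, 10, 5);
  assert(seen == 5 && counter == 10);

  llvm::sys::MemoryFence();                    // full memory barrier
  return 0;
}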
diff --git a/ext/src/llvm/CMakeLists.txt b/ext/src/llvm/CMakeLists.txt
new file mode 100644
index 0000000..f34f99a
--- /dev/null
+++ b/ext/src/llvm/CMakeLists.txt
@@ -0,0 +1,37 @@
+project(llvm-support)
+
+include_directories(${CMAKE_CURRENT_SOURCE_DIR})
+include_directories(${EXT_DIR}/include)
+
+include(CheckIncludeFiles)
+include(CheckIncludeFileCXX)
+include(CheckSymbolExists)
+
+check_include_files(execinfo.h HAVE_EXECINFO_H)
+check_include_files(signal.h HAVE_SIGNAL_H)
+check_include_files(sys/stat.h HAVE_SYS_STAT_H)
+check_include_files(cxxabi.h HAVE_CXXABI_H)
+check_include_files(dlfcn.h HAVE_DLFCN_H)
+check_include_files(mach/mach.h HAVE_MACH_MACH_H)
+check_include_files(link.h HAVE_LINK_H)
+
+check_symbol_exists(backtrace "execinfo.h" HAVE_BACKTRACE)
+
+find_library(DL_LIB NAMES "dl")
+
+add_definitions(-D__STDC_CONSTANT_MACROS)
+add_definitions(-D__STDC_LIMIT_MACROS)
+
+# FIXME: Signal handler return type, currently hardcoded to 'void'
+set(RETSIGTYPE void)
+
+configure_file(${EXT_DIR}/include/llvm/Config.h.in
+ ${SPADES_BUILT_INCLUDE_DIR}/llvm/Config.h)
+
+file(GLOB sources "[a-zA-Z]*.cpp" "[a-zA-Z]*.c")
+
+add_library(llvm-support STATIC
+ ${sources})
+if (DL_LIB)
+ target_link_libraries(llvm-support ${DL_LIB})
+endif()
diff --git a/ext/src/llvm/ErrorHandling.cpp b/ext/src/llvm/ErrorHandling.cpp
new file mode 100644
index 0000000..0d5f494
--- /dev/null
+++ b/ext/src/llvm/ErrorHandling.cpp
@@ -0,0 +1,112 @@
+//===- lib/Support/ErrorHandling.cpp - Callbacks for errors ---------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines an API used to indicate fatal error conditions. Non-fatal
+// errors (most of them) should be handled through LLVMContext.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/Errc.h"
+#include "llvm/Support/Mutex.h"
+#include "llvm/Support/MutexGuard.h"
+#include "llvm/Support/Signals.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cassert>
+#include <cstdlib>
+
+#include <unistd.h>
+
+using namespace llvm;
+
+static fatal_error_handler_t ErrorHandler = nullptr;
+static void *ErrorHandlerUserData = nullptr;
+
+static sys::Mutex ErrorHandlerMutex;
+
+void llvm::install_fatal_error_handler(fatal_error_handler_t handler,
+ void *user_data) {
+ llvm::MutexGuard Lock(ErrorHandlerMutex);
+ assert(!ErrorHandler && "Error handler already registered!\n");
+ ErrorHandler = handler;
+ ErrorHandlerUserData = user_data;
+}
+
+void llvm::remove_fatal_error_handler() {
+ llvm::MutexGuard Lock(ErrorHandlerMutex);
+ ErrorHandler = nullptr;
+ ErrorHandlerUserData = nullptr;
+}
+
+void llvm::report_fatal_error(const char *Reason, bool GenCrashDiag) {
+ report_fatal_error(Twine(Reason), GenCrashDiag);
+}
+
+void llvm::report_fatal_error(const std::string &Reason, bool GenCrashDiag) {
+ report_fatal_error(Twine(Reason), GenCrashDiag);
+}
+
+void llvm::report_fatal_error(StringRef Reason, bool GenCrashDiag) {
+ report_fatal_error(Twine(Reason), GenCrashDiag);
+}
+
+void llvm::report_fatal_error(const Twine &Reason, bool GenCrashDiag) {
+ llvm::fatal_error_handler_t handler = nullptr;
+ void* handlerData = nullptr;
+ {
+ // Only acquire the mutex while reading the handler, so as not to invoke a
+ // user-supplied callback under a lock.
+ llvm::MutexGuard Lock(ErrorHandlerMutex);
+ handler = ErrorHandler;
+ handlerData = ErrorHandlerUserData;
+ }
+
+ if (handler) {
+ handler(handlerData, Reason.str(), GenCrashDiag);
+ } else {
+ // Blast the result out to stderr. We don't try hard to make sure this
+ // succeeds (e.g. handling EINTR) and we can't use errs() here because
+ // raw ostreams can call report_fatal_error.
+ SmallVector<char, 64> Buffer;
+ raw_svector_ostream OS(Buffer);
+ OS << "LLVM ERROR: " << Reason << "\n";
+ StringRef MessageStr = OS.str();
+ ssize_t written = ::write(2, MessageStr.data(), MessageStr.size());
+ (void)written; // If something went wrong, we deliberately just give up.
+ }
+
+ // If we reached here, we are failing ungracefully. Run the interrupt handlers
+ // to make sure any special cleanups get done, in particular that we remove
+ // files registered with RemoveFileOnSignal.
+ sys::RunInterruptHandlers();
+
+ exit(1);
+}
+
+void llvm::llvm_unreachable_internal(const char *msg, const char *file,
+ unsigned line) {
+ // This code intentionally doesn't call the ErrorHandler callback, because
+ // llvm_unreachable is intended to be used to indicate "impossible"
+ // situations, and not legitimate runtime errors.
+ if (msg)
+ errs() << msg << "\n";
+ errs() << "UNREACHABLE executed";
+ if (file)
+ errs() << " at " << file << ":" << line;
+ errs() << "!\n";
+ abort();
+#ifdef LLVM_BUILTIN_UNREACHABLE
+ // Windows systems and possibly others don't declare abort() to be noreturn,
+ // so use the unreachable builtin to avoid a Clang self-host warning.
+ LLVM_BUILTIN_UNREACHABLE;
+#endif
+}
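A sketch of the consumer side of the error-handling API above (not part of the upstream sources): the handler signature is inferred from the call handler(handlerData, Reason.str(), GenCrashDiag) made in report_fatal_error, and the handler name and exit code are illustrative.

// Illustrative sketch, not part of the patch: route report_fatal_error()
// through a custom handler instead of the default stderr + exit(1) path.
#include "llvm/Support/ErrorHandling.h"
#include <cstdio>
#include <cstdlib>

static void die_handler(void* user_data, const std::string& reason,
                        bool gen_crash_diag) {
  (void) user_data; (void) gen_crash_diag;
  std::fprintf(stderr, "fatal: %s\n", reason.c_str());
  std::exit(2);
}

int main() {
  llvm::install_fatal_error_handler(die_handler, /*user_data=*/nullptr);
  // ... later, on an unrecoverable condition:
  llvm::report_fatal_error("cannot continue");   // never returns
}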
diff --git a/ext/src/llvm/Hashing.cpp b/ext/src/llvm/Hashing.cpp
new file mode 100644
index 0000000..c69efb7
--- /dev/null
+++ b/ext/src/llvm/Hashing.cpp
@@ -0,0 +1,29 @@
+//===-------------- lib/Support/Hashing.cpp -------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides implementation bits for the LLVM common hashing
+// infrastructure. Documentation and most of the other information is in the
+// header file.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/Hashing.h"
+
+using namespace llvm;
+
+// Provide a definition and static initializer for the fixed seed. This
+// initializer should always be zero to ensure its value can never appear to be
+// non-zero, even during dynamic initialization.
+size_t llvm::hashing::detail::fixed_seed_override = 0;
+
+// Implement the function for forced setting of the fixed seed.
+// FIXME: Use atomic operations here so that there is no data race.
+void llvm::set_fixed_execution_hash_seed(size_t fixed_value) {
+ hashing::detail::fixed_seed_override = fixed_value;
+}
diff --git a/ext/src/llvm/LineIterator.cpp b/ext/src/llvm/LineIterator.cpp
new file mode 100644
index 0000000..5baa1a3
--- /dev/null
+++ b/ext/src/llvm/LineIterator.cpp
@@ -0,0 +1,94 @@
+//===- LineIterator.cpp - Implementation of line iteration ----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/LineIterator.h"
+#include "llvm/Support/MemoryBuffer.h"
+
+using namespace llvm;
+
+static bool isAtLineEnd(const char *P) {
+ if (*P == '\n')
+ return true;
+ if (*P == '\r' && *(P + 1) == '\n')
+ return true;
+ return false;
+}
+
+static bool skipIfAtLineEnd(const char *&P) {
+ if (*P == '\n') {
+ ++P;
+ return true;
+ }
+ if (*P == '\r' && *(P + 1) == '\n') {
+ P += 2;
+ return true;
+ }
+ return false;
+}
+
+line_iterator::line_iterator(const MemoryBuffer &Buffer, bool SkipBlanks,
+ char CommentMarker)
+ : Buffer(Buffer.getBufferSize() ? &Buffer : nullptr),
+ CommentMarker(CommentMarker), SkipBlanks(SkipBlanks), LineNumber(1),
+ CurrentLine(Buffer.getBufferSize() ? Buffer.getBufferStart() : nullptr,
+ 0) {
+ // Ensure that if we are constructed on a non-empty memory buffer that it is
+ // a null terminated buffer.
+ if (Buffer.getBufferSize()) {
+ assert(Buffer.getBufferEnd()[0] == '\0');
+ // Make sure we don't skip a leading newline if we're keeping blanks
+ if (SkipBlanks || !isAtLineEnd(Buffer.getBufferStart()))
+ advance();
+ }
+}
+
+void line_iterator::advance() {
+ assert(Buffer && "Cannot advance past the end!");
+
+ const char *Pos = CurrentLine.end();
+ assert(Pos == Buffer->getBufferStart() || isAtLineEnd(Pos) || *Pos == '\0');
+
+ if (skipIfAtLineEnd(Pos))
+ ++LineNumber;
+ if (!SkipBlanks && isAtLineEnd(Pos)) {
+ // Nothing to do for a blank line.
+ } else if (CommentMarker == '\0') {
+ // If we're not stripping comments, this is simpler.
+ while (skipIfAtLineEnd(Pos))
+ ++LineNumber;
+ } else {
+ // Skip comments and count line numbers, which is a bit more complex.
+ for (;;) {
+ if (isAtLineEnd(Pos) && !SkipBlanks)
+ break;
+ if (*Pos == CommentMarker)
+ do {
+ ++Pos;
+ } while (*Pos != '\0' && !isAtLineEnd(Pos));
+ if (!skipIfAtLineEnd(Pos))
+ break;
+ ++LineNumber;
+ }
+ }
+
+ if (*Pos == '\0') {
+ // We've hit the end of the buffer, reset ourselves to the end state.
+ Buffer = nullptr;
+ CurrentLine = StringRef();
+ return;
+ }
+
+ // Measure the line.
+ size_t Length = 0;
+ while (Pos[Length] != '\0' && !isAtLineEnd(&Pos[Length])) {
+ ++Length;
+ }
+
+ CurrentLine = StringRef(Pos, Length);
+}
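A sketch of how a buffer is read line by line through the iterator implemented above (not part of the upstream sources). The default-constructed "end" iterator, operator!=, operator* and line_number() are assumed to be declared in llvm/Support/LineIterator.h, as in upstream LLVM; the example input is illustrative.

// Illustrative sketch, not part of the patch: skip blank lines and lines
// starting with '#', printing the rest with their line numbers.
#include "llvm/Support/LineIterator.h"
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/raw_ostream.h"
#include <memory>

int main() {
  std::unique_ptr<llvm::MemoryBuffer> Buf = llvm::MemoryBuffer::getMemBuffer(
      "# comment line\nfirst\n\nsecond\r\n", "example", true);

  for (llvm::line_iterator I(*Buf, /*SkipBlanks=*/true, /*CommentMarker=*/'#'), E;
       I != E; ++I)
    llvm::errs() << I.line_number() << ": " << *I << "\n";
  return 0;
}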
diff --git a/ext/src/llvm/MemoryBuffer.cpp b/ext/src/llvm/MemoryBuffer.cpp
new file mode 100644
index 0000000..00b52c7
--- /dev/null
+++ b/ext/src/llvm/MemoryBuffer.cpp
@@ -0,0 +1,401 @@
+//===--- MemoryBuffer.cpp - Memory Buffer implementation ------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the MemoryBuffer interface.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/Support/Errc.h"
+#include "llvm/Support/Errno.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/Path.h"
+#include <cassert>
+#include <cerrno>
+#include <cstring>
+#include <new>
+#include <sys/types.h>
+#include <system_error>
+#include <unistd.h>
+
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+// MemoryBuffer implementation itself.
+//===----------------------------------------------------------------------===//
+
+MemoryBuffer::~MemoryBuffer() { }
+
+/// init - Initialize this MemoryBuffer as a reference to externally allocated
+/// memory, memory that we know is already null terminated.
+void MemoryBuffer::init(const char *BufStart, const char *BufEnd,
+ bool RequiresNullTerminator) {
+ assert((!RequiresNullTerminator || BufEnd[0] == 0) &&
+ "Buffer is not null terminated!");
+ BufferStart = BufStart;
+ BufferEnd = BufEnd;
+}
+
+//===----------------------------------------------------------------------===//
+// MemoryBufferMem implementation.
+//===----------------------------------------------------------------------===//
+
+/// CopyStringRef - Copies contents of a StringRef into a block of memory and
+/// null-terminates it.
+static void CopyStringRef(char *Memory, StringRef Data) {
+ if (!Data.empty())
+ memcpy(Memory, Data.data(), Data.size());
+ Memory[Data.size()] = 0; // Null terminate string.
+}
+
+namespace {
+struct NamedBufferAlloc {
+ const Twine &Name;
+ NamedBufferAlloc(const Twine &Name) : Name(Name) {}
+};
+}
+
+void *operator new(size_t N, const NamedBufferAlloc &Alloc) {
+ SmallString<256> NameBuf;
+ StringRef NameRef = Alloc.Name.toStringRef(NameBuf);
+
+ char *Mem = static_cast<char *>(operator new(N + NameRef.size() + 1));
+ CopyStringRef(Mem + N, NameRef);
+ return Mem;
+}
+
+namespace {
+/// MemoryBufferMem - Named MemoryBuffer pointing to a block of memory.
+class MemoryBufferMem : public MemoryBuffer {
+public:
+ MemoryBufferMem(StringRef InputData, bool RequiresNullTerminator) {
+ init(InputData.begin(), InputData.end(), RequiresNullTerminator);
+ }
+
+ const char *getBufferIdentifier() const override {
+ // The name is stored after the class itself.
+ return reinterpret_cast<const char*>(this + 1);
+ }
+
+ BufferKind getBufferKind() const override {
+ return MemoryBuffer_Malloc;
+ }
+};
+}
+
+static ErrorOr<std::unique_ptr<MemoryBuffer>>
+getFileAux(const Twine &Filename, int64_t FileSize, uint64_t MapSize,
+ uint64_t Offset, bool RequiresNullTerminator, bool IsVolatileSize);
+
+std::unique_ptr<MemoryBuffer>
+MemoryBuffer::getMemBuffer(StringRef InputData, StringRef BufferName,
+ bool RequiresNullTerminator) {
+ auto *Ret = new (NamedBufferAlloc(BufferName))
+ MemoryBufferMem(InputData, RequiresNullTerminator);
+ return std::unique_ptr<MemoryBuffer>(Ret);
+}
+
+std::unique_ptr<MemoryBuffer>
+MemoryBuffer::getMemBuffer(MemoryBufferRef Ref, bool RequiresNullTerminator) {
+ return std::unique_ptr<MemoryBuffer>(getMemBuffer(
+ Ref.getBuffer(), Ref.getBufferIdentifier(), RequiresNullTerminator));
+}
+
+std::unique_ptr<MemoryBuffer>
+MemoryBuffer::getMemBufferCopy(StringRef InputData, const Twine &BufferName) {
+ std::unique_ptr<MemoryBuffer> Buf =
+ getNewUninitMemBuffer(InputData.size(), BufferName);
+ if (!Buf)
+ return nullptr;
+ memcpy(const_cast<char*>(Buf->getBufferStart()), InputData.data(),
+ InputData.size());
+ return Buf;
+}
+
+std::unique_ptr<MemoryBuffer>
+MemoryBuffer::getNewUninitMemBuffer(size_t Size, const Twine &BufferName) {
+ // Allocate space for the MemoryBuffer, the data and the name. It is important
+ // that MemoryBuffer and data are aligned so PointerIntPair works with them.
+ // TODO: Is 16-byte alignment enough? We copy small object files with large
+ // alignment expectations into this buffer.
+ SmallString<256> NameBuf;
+ StringRef NameRef = BufferName.toStringRef(NameBuf);
+ size_t AlignedStringLen =
+ RoundUpToAlignment(sizeof(MemoryBufferMem) + NameRef.size() + 1, 16);
+ size_t RealLen = AlignedStringLen + Size + 1;
+ char *Mem = static_cast<char*>(operator new(RealLen, std::nothrow));
+ if (!Mem)
+ return nullptr;
+
+ // The name is stored after the class itself.
+ CopyStringRef(Mem + sizeof(MemoryBufferMem), NameRef);
+
+ // The buffer begins after the name and must be aligned.
+ char *Buf = Mem + AlignedStringLen;
+ Buf[Size] = 0; // Null terminate buffer.
+
+ auto *Ret = new (Mem) MemoryBufferMem(StringRef(Buf, Size), true);
+ return std::unique_ptr<MemoryBuffer>(Ret);
+}
+
+std::unique_ptr<MemoryBuffer>
+MemoryBuffer::getNewMemBuffer(size_t Size, StringRef BufferName) {
+ std::unique_ptr<MemoryBuffer> SB = getNewUninitMemBuffer(Size, BufferName);
+ if (!SB)
+ return nullptr;
+ memset(const_cast<char*>(SB->getBufferStart()), 0, Size);
+ return SB;
+}
+
+ErrorOr<std::unique_ptr<MemoryBuffer>>
+MemoryBuffer::getFileOrSTDIN(const Twine &Filename, int64_t FileSize,
+ bool RequiresNullTerminator) {
+ SmallString<256> NameBuf;
+ StringRef NameRef = Filename.toStringRef(NameBuf);
+ return getFile(Filename, FileSize, RequiresNullTerminator);
+}
+
+ErrorOr<std::unique_ptr<MemoryBuffer>>
+MemoryBuffer::getFileSlice(const Twine &FilePath, uint64_t MapSize,
+ uint64_t Offset) {
+ return getFileAux(FilePath, -1, MapSize, Offset, false, false);
+}
+
+
+//===----------------------------------------------------------------------===//
+// MemoryBuffer::getFile implementation.
+//===----------------------------------------------------------------------===//
+
+namespace {
+/// \brief Memory maps a file descriptor using sys::fs::mapped_file_region.
+///
+/// This handles converting the offset into a legal offset on the platform.
+class MemoryBufferMMapFile : public MemoryBuffer {
+ sys::fs::mapped_file_region MFR;
+
+ static uint64_t getLegalMapOffset(uint64_t Offset) {
+ return Offset & ~(sys::fs::mapped_file_region::alignment() - 1);
+ }
+
+ static uint64_t getLegalMapSize(uint64_t Len, uint64_t Offset) {
+ return Len + (Offset - getLegalMapOffset(Offset));
+ }
+
+ const char *getStart(uint64_t Len, uint64_t Offset) {
+ return MFR.const_data() + (Offset - getLegalMapOffset(Offset));
+ }
+
+public:
+ MemoryBufferMMapFile(bool RequiresNullTerminator, int FD, uint64_t Len,
+ uint64_t Offset, std::error_code &EC)
+ : MFR(FD, sys::fs::mapped_file_region::readonly,
+ getLegalMapSize(Len, Offset), getLegalMapOffset(Offset), EC) {
+ if (!EC) {
+ const char *Start = getStart(Len, Offset);
+ init(Start, Start + Len, RequiresNullTerminator);
+ }
+ }
+
+ const char *getBufferIdentifier() const override {
+ // The name is stored after the class itself.
+ return reinterpret_cast<const char *>(this + 1);
+ }
+
+ BufferKind getBufferKind() const override {
+ return MemoryBuffer_MMap;
+ }
+};
+}
+
+static ErrorOr<std::unique_ptr<MemoryBuffer>>
+getMemoryBufferForStream(int FD, const Twine &BufferName) {
+ const ssize_t ChunkSize = 4096*4;
+ SmallString<ChunkSize> Buffer;
+ ssize_t ReadBytes;
+ // Read into Buffer until we hit EOF.
+ do {
+ Buffer.reserve(Buffer.size() + ChunkSize);
+ ReadBytes = read(FD, Buffer.end(), ChunkSize);
+ if (ReadBytes == -1) {
+ if (errno == EINTR) continue;
+ return std::error_code(errno, std::generic_category());
+ }
+ Buffer.set_size(Buffer.size() + ReadBytes);
+ } while (ReadBytes != 0);
+
+ return MemoryBuffer::getMemBufferCopy(Buffer, BufferName);
+}
+
+
+ErrorOr<std::unique_ptr<MemoryBuffer>>
+MemoryBuffer::getFile(const Twine &Filename, int64_t FileSize,
+ bool RequiresNullTerminator, bool IsVolatileSize) {
+ return getFileAux(Filename, FileSize, FileSize, 0,
+ RequiresNullTerminator, IsVolatileSize);
+}
+
+static ErrorOr<std::unique_ptr<MemoryBuffer>>
+getOpenFileImpl(int FD, const Twine &Filename, uint64_t FileSize,
+ uint64_t MapSize, int64_t Offset, bool RequiresNullTerminator,
+ bool IsVolatileSize);
+
+static ErrorOr<std::unique_ptr<MemoryBuffer>>
+getFileAux(const Twine &Filename, int64_t FileSize, uint64_t MapSize,
+ uint64_t Offset, bool RequiresNullTerminator, bool IsVolatileSize) {
+ int FD;
+ std::error_code EC = sys::fs::openFileForRead(Filename, FD);
+ if (EC)
+ return EC;
+
+ ErrorOr<std::unique_ptr<MemoryBuffer>> Ret =
+ getOpenFileImpl(FD, Filename, FileSize, MapSize, Offset,
+ RequiresNullTerminator, IsVolatileSize);
+ close(FD);
+ return Ret;
+}
+
+static bool shouldUseMmap(int FD,
+ size_t FileSize,
+ size_t MapSize,
+ off_t Offset,
+ bool RequiresNullTerminator,
+ int PageSize,
+ bool IsVolatileSize) {
+ // mmap may leave the buffer without null terminator if the file size changed
+ // by the time the last page is mapped in, so avoid it if the file size is
+ // likely to change.
+ if (IsVolatileSize)
+ return false;
+
+ // We don't use mmap for small files because this can severely fragment our
+ // address space.
+ if (MapSize < 4 * 4096 || MapSize < (unsigned)PageSize)
+ return false;
+
+ if (!RequiresNullTerminator)
+ return true;
+
+
+ // If we don't know the file size, use fstat to find out. fstat on an open
+ // file descriptor is cheaper than stat on a random path.
+ // FIXME: this chunk of code is duplicated, but it avoids a fstat when
+ // RequiresNullTerminator = false and MapSize != -1.
+ if (FileSize == size_t(-1)) {
+ sys::fs::file_status Status;
+ if (sys::fs::status(FD, Status))
+ return false;
+ FileSize = Status.getSize();
+ }
+
+ // If we need a null terminator and the end of the map is inside the file,
+ // we cannot use mmap.
+ size_t End = Offset + MapSize;
+ assert(End <= FileSize);
+ if (End != FileSize)
+ return false;
+
+ // Don't try to map files that are exactly a multiple of the system page size
+ // if we need a null terminator.
+ if ((FileSize & (PageSize -1)) == 0)
+ return false;
+
+ return true;
+}
+
+static ErrorOr<std::unique_ptr<MemoryBuffer>>
+getOpenFileImpl(int FD, const Twine &Filename, uint64_t FileSize,
+ uint64_t MapSize, int64_t Offset, bool RequiresNullTerminator,
+ bool IsVolatileSize) {
+ static int PageSize = ::getpagesize();
+
+ // Default is to map the full file.
+ if (MapSize == uint64_t(-1)) {
+ // If we don't know the file size, use fstat to find out. fstat on an open
+ // file descriptor is cheaper than stat on a random path.
+ if (FileSize == uint64_t(-1)) {
+ sys::fs::file_status Status;
+ std::error_code EC = sys::fs::status(FD, Status);
+ if (EC)
+ return EC;
+
+ // If this is not a file or a block device (e.g. it's a named pipe
+ // or character device), we can't trust the size. Create the memory
+ // buffer by copying off the stream.
+ sys::fs::file_type Type = Status.type();
+ if (Type != sys::fs::file_type::regular_file &&
+ Type != sys::fs::file_type::block_file)
+ return getMemoryBufferForStream(FD, Filename);
+
+ FileSize = Status.getSize();
+ }
+ MapSize = FileSize;
+ }
+
+ if (shouldUseMmap(FD, FileSize, MapSize, Offset, RequiresNullTerminator,
+ PageSize, IsVolatileSize)) {
+ std::error_code EC;
+ std::unique_ptr<MemoryBuffer> Result(
+ new (NamedBufferAlloc(Filename))
+ MemoryBufferMMapFile(RequiresNullTerminator, FD, MapSize, Offset, EC));
+ if (!EC)
+ return std::move(Result);
+ }
+
+ std::unique_ptr<MemoryBuffer> Buf =
+ MemoryBuffer::getNewUninitMemBuffer(MapSize, Filename);
+ if (!Buf) {
+ // Failed to create a buffer. The only way it can fail is if
+ // new(std::nothrow) returns 0.
+ return make_error_code(errc::not_enough_memory);
+ }
+
+ char *BufPtr = const_cast<char *>(Buf->getBufferStart());
+
+ size_t BytesLeft = MapSize;
+ while (BytesLeft) {
+ ssize_t NumRead = ::pread(FD, BufPtr, BytesLeft, MapSize-BytesLeft+Offset);
+ if (NumRead == -1) {
+ if (errno == EINTR)
+ continue;
+ // Error while reading.
+ return std::error_code(errno, std::generic_category());
+ }
+ if (NumRead == 0) {
+ memset(BufPtr, 0, BytesLeft); // zero-initialize rest of the buffer.
+ break;
+ }
+ BytesLeft -= NumRead;
+ BufPtr += NumRead;
+ }
+
+ return std::move(Buf);
+}
+
+ErrorOr<std::unique_ptr<MemoryBuffer>>
+MemoryBuffer::getOpenFile(int FD, const Twine &Filename, uint64_t FileSize,
+ bool RequiresNullTerminator, bool IsVolatileSize) {
+ return getOpenFileImpl(FD, Filename, FileSize, FileSize, 0,
+ RequiresNullTerminator, IsVolatileSize);
+}
+
+ErrorOr<std::unique_ptr<MemoryBuffer>>
+MemoryBuffer::getOpenFileSlice(int FD, const Twine &Filename, uint64_t MapSize,
+ int64_t Offset) {
+ assert(MapSize != uint64_t(-1));
+ return getOpenFileImpl(FD, Filename, -1, MapSize, Offset, false,
+ /*IsVolatileSize*/ false);
+}
+
+MemoryBufferRef MemoryBuffer::getMemBufferRef() const {
+ StringRef Data = getBuffer();
+ StringRef Identifier = getBufferIdentifier();
+ return MemoryBufferRef(Data, Identifier);
+}
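A sketch of the typical caller-side pattern for MemoryBuffer::getFile() as implemented above (not part of the upstream sources; the file name is a placeholder, and the argument values mirror the parameters of the definition above):

// Illustrative sketch, not part of the patch: open a file into a MemoryBuffer
// and report any error through the returned ErrorOr.
#include "llvm/Support/ErrorOr.h"
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/raw_ostream.h"
#include <memory>
#include <system_error>
#include <utility>

int main() {
  llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> BufOrErr =
      llvm::MemoryBuffer::getFile("dataset.yaml", /*FileSize=*/-1,
                                  /*RequiresNullTerminator=*/true,
                                  /*IsVolatileSize=*/false);
  if (std::error_code EC = BufOrErr.getError()) {
    llvm::errs() << "cannot read file: " << EC.message() << "\n";
    return 1;
  }
  std::unique_ptr<llvm::MemoryBuffer> Buf = std::move(BufOrErr.get());
  llvm::errs() << Buf->getBufferIdentifier() << ": "
               << Buf->getBufferSize() << " bytes\n";
  return 0;
}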
diff --git a/ext/src/llvm/Mutex.cpp b/ext/src/llvm/Mutex.cpp
new file mode 100644
index 0000000..8b4b077
--- /dev/null
+++ b/ext/src/llvm/Mutex.cpp
@@ -0,0 +1,93 @@
+//===- Mutex.cpp - Mutual Exclusion Lock ------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the llvm::sys::Mutex class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/Mutex.h"
+
+#include <cassert>
+#include <pthread.h>
+#include <stdlib.h>
+
+namespace llvm {
+using namespace sys;
+
+// Construct a Mutex using pthread calls
+MutexImpl::MutexImpl( bool recursive)
+ : data_(nullptr)
+{
+ // Declare the pthread_mutex data structures
+ pthread_mutex_t* mutex =
+ static_cast<pthread_mutex_t*>(malloc(sizeof(pthread_mutex_t)));
+ pthread_mutexattr_t attr;
+
+ // Initialize the mutex attributes
+ int errorcode = pthread_mutexattr_init(&attr);
+ assert(errorcode == 0); (void)errorcode;
+
+ // Initialize the mutex as a recursive mutex, if requested, or normal
+ // otherwise.
+ int kind = ( recursive ? PTHREAD_MUTEX_RECURSIVE : PTHREAD_MUTEX_NORMAL );
+ errorcode = pthread_mutexattr_settype(&attr, kind);
+ assert(errorcode == 0);
+
+ // Initialize the mutex
+ errorcode = pthread_mutex_init(mutex, &attr);
+ assert(errorcode == 0);
+
+ // Destroy the attributes
+ errorcode = pthread_mutexattr_destroy(&attr);
+ assert(errorcode == 0);
+
+ // Assign the data member
+ data_ = mutex;
+}
+
+// Destruct a Mutex
+MutexImpl::~MutexImpl()
+{
+ pthread_mutex_t* mutex = static_cast<pthread_mutex_t*>(data_);
+ assert(mutex != nullptr);
+ pthread_mutex_destroy(mutex);
+ free(mutex);
+}
+
+bool
+MutexImpl::acquire()
+{
+ pthread_mutex_t* mutex = static_cast<pthread_mutex_t*>(data_);
+ assert(mutex != nullptr);
+
+ int errorcode = pthread_mutex_lock(mutex);
+ return errorcode == 0;
+}
+
+bool
+MutexImpl::release()
+{
+ pthread_mutex_t* mutex = static_cast<pthread_mutex_t*>(data_);
+ assert(mutex != nullptr);
+
+ int errorcode = pthread_mutex_unlock(mutex);
+ return errorcode == 0;
+}
+
+bool
+MutexImpl::tryacquire()
+{
+ pthread_mutex_t* mutex = static_cast<pthread_mutex_t*>(data_);
+ assert(mutex != nullptr);
+
+ int errorcode = pthread_mutex_trylock(mutex);
+ return errorcode == 0;
+}
+
+}
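MutexImpl above is the pthread backend behind llvm::sys::Mutex; the usual way to use it is through the RAII MutexGuard, the same pattern ErrorHandling.cpp in this patch already follows. A minimal sketch (not part of the upstream sources; the counter and function names are illustrative):

// Illustrative sketch, not part of the patch.
#include "llvm/Support/Mutex.h"
#include "llvm/Support/MutexGuard.h"

static llvm::sys::Mutex CounterMutex;   // same type as ErrorHandlerMutex above
static unsigned Counter = 0;

unsigned bump_counter() {
  llvm::MutexGuard Lock(CounterMutex);  // acquire() now, release() on scope exit
  return ++Counter;
}

int main() { return bump_counter() == 1 ? 0 : 1; }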
diff --git a/ext/src/llvm/Path.cpp b/ext/src/llvm/Path.cpp
new file mode 100644
index 0000000..397d552
--- /dev/null
+++ b/ext/src/llvm/Path.cpp
@@ -0,0 +1,911 @@
+//===-- Path.cpp - Implement OS Path Concept ------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the operating system Path API.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/Errc.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/Path.h"
+#include <cctype>
+#include <cstring>
+
+#include <unistd.h>
+
+using namespace llvm;
+
+namespace {
+ using llvm::StringRef;
+ using llvm::sys::path::is_separator;
+
+ const char separators = '/';
+ const char preferred_separator = '/';
+
+ StringRef find_first_component(StringRef path) {
+ // Look for this first component in the following order.
+ // * empty (in this case we return an empty string)
+ // * either C: or {//,\\}net.
+ // * {/,\}
+ // * {file,directory}name
+
+ if (path.empty())
+ return path;
+
+ // //net
+ if ((path.size() > 2) &&
+ is_separator(path[0]) &&
+ path[0] == path[1] &&
+ !is_separator(path[2])) {
+ // Find the next directory separator.
+ size_t end = path.find_first_of(separators, 2);
+ return path.substr(0, end);
+ }
+
+ // {/,\}
+ if (is_separator(path[0]))
+ return path.substr(0, 1);
+
+ // * {file,directory}name
+ size_t end = path.find_first_of(separators);
+ return path.substr(0, end);
+ }
+
+ size_t filename_pos(StringRef str) {
+ if (str.size() == 2 &&
+ is_separator(str[0]) &&
+ str[0] == str[1])
+ return 0;
+
+ if (str.size() > 0 && is_separator(str[str.size() - 1]))
+ return str.size() - 1;
+
+ size_t pos = str.find_last_of(separators, str.size() - 1);
+
+ if (pos == StringRef::npos ||
+ (pos == 1 && is_separator(str[0])))
+ return 0;
+
+ return pos + 1;
+ }
+
+ size_t root_dir_start(StringRef str) {
+ // case "//"
+ if (str.size() == 2 &&
+ is_separator(str[0]) &&
+ str[0] == str[1])
+ return StringRef::npos;
+
+ // case "//net"
+ if (str.size() > 3 &&
+ is_separator(str[0]) &&
+ str[0] == str[1] &&
+ !is_separator(str[2])) {
+ return str.find_first_of(separators, 2);
+ }
+
+ // case "/"
+ if (str.size() > 0 && is_separator(str[0]))
+ return 0;
+
+ return StringRef::npos;
+ }
+
+ size_t parent_path_end(StringRef path) {
+ size_t end_pos = filename_pos(path);
+
+ bool filename_was_sep = path.size() > 0 && is_separator(path[end_pos]);
+
+ // Skip separators except for root dir.
+ size_t root_dir_pos = root_dir_start(path.substr(0, end_pos));
+
+ while(end_pos > 0 &&
+ (end_pos - 1) != root_dir_pos &&
+ is_separator(path[end_pos - 1]))
+ --end_pos;
+
+ if (end_pos == 1 && root_dir_pos == 0 && filename_was_sep)
+ return StringRef::npos;
+
+ return end_pos;
+ }
+} // end unnamed namespace
+
+enum FSEntity {
+ FS_Dir,
+ FS_File,
+ FS_Name
+};
+
+static std::error_code createUniqueEntity(const Twine &Model, int &ResultFD,
+ SmallVectorImpl<char> &ResultPath,
+ bool MakeAbsolute, unsigned Mode,
+ FSEntity Type) {
+ SmallString<128> ModelStorage;
+ Model.toVector(ModelStorage);
+
+ if (MakeAbsolute) {
+ // Make model absolute by prepending a temp directory if it's not already.
+ if (!sys::path::is_absolute(Twine(ModelStorage))) {
+ SmallString<128> TDir;
+ sys::path::system_temp_directory(true, TDir);
+ sys::path::append(TDir, Twine(ModelStorage));
+ ModelStorage.swap(TDir);
+ }
+ }
+
+ // From here on, DO NOT modify model. It may be needed if the randomly chosen
+ // path already exists.
+ ResultPath = ModelStorage;
+ // Null terminate.
+ ResultPath.push_back(0);
+ ResultPath.pop_back();
+
+retry_random_path:
+ // Replace '%' with random chars.
+ for (unsigned i = 0, e = ModelStorage.size(); i != e; ++i) {
+ if (ModelStorage[i] == '%')
+ ResultPath[i] = "0123456789abcdef"[rand() & 15];
+ }
+
+ // Try to open + create the file.
+ switch (Type) {
+ case FS_File: {
+ if (std::error_code EC =
+ sys::fs::openFileForWrite(Twine(ResultPath.begin()), ResultFD,
+ sys::fs::F_RW | sys::fs::F_Excl, Mode)) {
+ if (EC == errc::file_exists)
+ goto retry_random_path;
+ return EC;
+ }
+
+ return std::error_code();
+ }
+
+ case FS_Name: {
+ std::error_code EC =
+ sys::fs::access(ResultPath.begin(), sys::fs::AccessMode::Exist);
+ if (EC == errc::no_such_file_or_directory)
+ return std::error_code();
+ if (EC)
+ return EC;
+ goto retry_random_path;
+ }
+
+ case FS_Dir: {
+ if (std::error_code EC =
+ sys::fs::create_directory(ResultPath.begin(), false)) {
+ if (EC == errc::file_exists)
+ goto retry_random_path;
+ return EC;
+ }
+ return std::error_code();
+ }
+ }
+ llvm_unreachable("Invalid Type");
+}
+
+namespace llvm {
+namespace sys {
+namespace path {
+
+const_iterator begin(StringRef path) {
+ const_iterator i;
+ i.Path = path;
+ i.Component = find_first_component(path);
+ i.Position = 0;
+ return i;
+}
+
+const_iterator end(StringRef path) {
+ const_iterator i;
+ i.Path = path;
+ i.Position = path.size();
+ return i;
+}
+
+const_iterator &const_iterator::operator++() {
+ assert(Position < Path.size() && "Tried to increment past end!");
+
+ // Increment Position to past the current component
+ Position += Component.size();
+
+ // Check for end.
+ if (Position == Path.size()) {
+ Component = StringRef();
+ return *this;
+ }
+
+ // Both POSIX and Windows treat paths that begin with exactly two separators
+ // specially.
+ bool was_net = Component.size() > 2 &&
+ is_separator(Component[0]) &&
+ Component[1] == Component[0] &&
+ !is_separator(Component[2]);
+
+ // Handle separators.
+ if (is_separator(Path[Position])) {
+ // Root dir.
+ if (was_net) {
+ Component = Path.substr(Position, 1);
+ return *this;
+ }
+
+ // Skip extra separators.
+ while (Position != Path.size() &&
+ is_separator(Path[Position])) {
+ ++Position;
+ }
+
+ // Treat trailing '/' as a '.'.
+ if (Position == Path.size()) {
+ --Position;
+ Component = ".";
+ return *this;
+ }
+ }
+
+ // Find next component.
+ size_t end_pos = Path.find_first_of(separators, Position);
+ Component = Path.slice(Position, end_pos);
+
+ return *this;
+}
+
+bool const_iterator::operator==(const const_iterator &RHS) const {
+ return Path.begin() == RHS.Path.begin() && Position == RHS.Position;
+}
+
+ptrdiff_t const_iterator::operator-(const const_iterator &RHS) const {
+ return Position - RHS.Position;
+}
+
+reverse_iterator rbegin(StringRef Path) {
+ reverse_iterator I;
+ I.Path = Path;
+ I.Position = Path.size();
+ return ++I;
+}
+
+reverse_iterator rend(StringRef Path) {
+ reverse_iterator I;
+ I.Path = Path;
+ I.Component = Path.substr(0, 0);
+ I.Position = 0;
+ return I;
+}
+
+reverse_iterator &reverse_iterator::operator++() {
+ // If we're at the end and the previous char was a '/', return '.' unless
+ // we are the root path.
+ size_t root_dir_pos = root_dir_start(Path);
+ if (Position == Path.size() &&
+ Path.size() > root_dir_pos + 1 &&
+ is_separator(Path[Position - 1])) {
+ --Position;
+ Component = ".";
+ return *this;
+ }
+
+ // Skip separators unless it's the root directory.
+ size_t end_pos = Position;
+
+ while(end_pos > 0 &&
+ (end_pos - 1) != root_dir_pos &&
+ is_separator(Path[end_pos - 1]))
+ --end_pos;
+
+ // Find next separator.
+ size_t start_pos = filename_pos(Path.substr(0, end_pos));
+ Component = Path.slice(start_pos, end_pos);
+ Position = start_pos;
+ return *this;
+}
+
+bool reverse_iterator::operator==(const reverse_iterator &RHS) const {
+ return Path.begin() == RHS.Path.begin() && Component == RHS.Component &&
+ Position == RHS.Position;
+}
+
+StringRef root_path(StringRef path) {
+ const_iterator b = begin(path),
+ pos = b,
+ e = end(path);
+ if (b != e) {
+ bool has_net = b->size() > 2 && is_separator((*b)[0]) && (*b)[1] == (*b)[0];
+ bool has_drive = false;
+
+ if (has_net || has_drive) {
+ if ((++pos != e) && is_separator((*pos)[0])) {
+ // {C:/,//net/}, so get the first two components.
+ return path.substr(0, b->size() + pos->size());
+ } else {
+ // just {C:,//net}, return the first component.
+ return *b;
+ }
+ }
+
+ // POSIX style root directory.
+ if (is_separator((*b)[0])) {
+ return *b;
+ }
+ }
+
+ return StringRef();
+}
+
+StringRef root_name(StringRef path) {
+ const_iterator b = begin(path),
+ e = end(path);
+ if (b != e) {
+ bool has_net = b->size() > 2 && is_separator((*b)[0]) && (*b)[1] == (*b)[0];
+ bool has_drive = false;
+
+ if (has_net || has_drive) {
+ // just {C:,//net}, return the first component.
+ return *b;
+ }
+ }
+
+ // No path or no name.
+ return StringRef();
+}
+
+StringRef root_directory(StringRef path) {
+ const_iterator b = begin(path),
+ pos = b,
+ e = end(path);
+ if (b != e) {
+ bool has_net = b->size() > 2 && is_separator((*b)[0]) && (*b)[1] == (*b)[0];
+ bool has_drive = false;
+
+ if ((has_net || has_drive) &&
+ // {C:,//net}, skip to the next component.
+ (++pos != e) && is_separator((*pos)[0])) {
+ return *pos;
+ }
+
+ // POSIX style root directory.
+ if (!has_net && is_separator((*b)[0])) {
+ return *b;
+ }
+ }
+
+ // No path or no root.
+ return StringRef();
+}
+
+StringRef relative_path(StringRef path) {
+ StringRef root = root_path(path);
+ return path.substr(root.size());
+}
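+
+// Illustrative usage sketch (not part of the upstream source): how the helpers
+// above decompose a POSIX path. The example path is arbitrary.
+//
+//   For "/usr/local/spades":
+//     root_path()      == "/"
+//     root_name()      == ""    (root names only exist on Windows)
+//     root_directory() == "/"
+//     relative_path()  == "usr/local/spades"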
+
+void append(SmallVectorImpl<char> &path, const Twine &a,
+ const Twine &b,
+ const Twine &c,
+ const Twine &d) {
+ SmallString<32> a_storage;
+ SmallString<32> b_storage;
+ SmallString<32> c_storage;
+ SmallString<32> d_storage;
+
+ SmallVector<StringRef, 4> components;
+ if (!a.isTriviallyEmpty()) components.push_back(a.toStringRef(a_storage));
+ if (!b.isTriviallyEmpty()) components.push_back(b.toStringRef(b_storage));
+ if (!c.isTriviallyEmpty()) components.push_back(c.toStringRef(c_storage));
+ if (!d.isTriviallyEmpty()) components.push_back(d.toStringRef(d_storage));
+
+ for (auto &component : components) {
+ bool path_has_sep = !path.empty() && is_separator(path[path.size() - 1]);
+ bool component_has_sep = !component.empty() && is_separator(component[0]);
+ bool is_root_name = has_root_name(component);
+
+ if (path_has_sep) {
+ // Strip separators from beginning of component.
+ size_t loc = component.find_first_not_of(separators);
+ StringRef c = component.substr(loc);
+
+ // Append it.
+ path.append(c.begin(), c.end());
+ continue;
+ }
+
+ if (!component_has_sep && !(path.empty() || is_root_name)) {
+ // Add a separator.
+ path.push_back(preferred_separator);
+ }
+
+ path.append(component.begin(), component.end());
+ }
+}
+
+void append(SmallVectorImpl<char> &path,
+ const_iterator begin, const_iterator end) {
+ for (; begin != end; ++begin)
+ path::append(path, *begin);
+}
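+
+// Illustrative usage sketch (not part of the upstream source): building a path
+// with the append() overloads above; "out" is a name used only in this example.
+//
+//   llvm::SmallString<128> out("/home/user");
+//   llvm::sys::path::append(out, "spades_output", "K55", "contigs.fasta");
+//   // out == "/home/user/spades_output/K55/contigs.fasta"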
+
+StringRef parent_path(StringRef path) {
+ size_t end_pos = parent_path_end(path);
+ if (end_pos == StringRef::npos)
+ return StringRef();
+ else
+ return path.substr(0, end_pos);
+}
+
+void remove_filename(SmallVectorImpl<char> &path) {
+ size_t end_pos = parent_path_end(StringRef(path.begin(), path.size()));
+ if (end_pos != StringRef::npos)
+ path.set_size(end_pos);
+}
+
+void replace_extension(SmallVectorImpl<char> &path, const Twine &extension) {
+ StringRef p(path.begin(), path.size());
+ SmallString<32> ext_storage;
+ StringRef ext = extension.toStringRef(ext_storage);
+
+ // Erase existing extension.
+ size_t pos = p.find_last_of('.');
+ if (pos != StringRef::npos && pos >= filename_pos(p))
+ path.set_size(pos);
+
+ // Append '.' if needed.
+ if (ext.size() > 0 && ext[0] != '.')
+ path.push_back('.');
+
+ // Append extension.
+ path.append(ext.begin(), ext.end());
+}
+
+void native(const Twine &path, SmallVectorImpl<char> &result) {
+ assert((!path.isSingleStringRef() ||
+ path.getSingleStringRef().data() != result.data()) &&
+ "path and result are not allowed to overlap!");
+ // Clear result.
+ result.clear();
+ path.toVector(result);
+ native(result);
+}
+
+void native(SmallVectorImpl<char> &Path) {
+ for (auto PI = Path.begin(), PE = Path.end(); PI < PE; ++PI) {
+ if (*PI == '\\') {
+ auto PN = PI + 1;
+ if (PN < PE && *PN == '\\')
+ ++PI; // increment once, the for loop will move over the escaped slash
+ else
+ *PI = '/';
+ }
+ }
+}
+
+StringRef filename(StringRef path) {
+ return *rbegin(path);
+}
+
+StringRef stem(StringRef path) {
+ StringRef fname = filename(path);
+ size_t pos = fname.find_last_of('.');
+ if (pos == StringRef::npos)
+ return fname;
+ else
+ if ((fname.size() == 1 && fname == ".") ||
+ (fname.size() == 2 && fname == ".."))
+ return fname;
+ else
+ return fname.substr(0, pos);
+}
+
+StringRef extension(StringRef path) {
+ StringRef fname = filename(path);
+ size_t pos = fname.find_last_of('.');
+ if (pos == StringRef::npos)
+ return StringRef();
+ else
+ if ((fname.size() == 1 && fname == ".") ||
+ (fname.size() == 2 && fname == ".."))
+ return StringRef();
+ else
+ return fname.substr(pos);
+}
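+
+// Illustrative usage sketch (not part of the upstream source): how filename(),
+// stem() and extension() split the last path component. The path is arbitrary.
+//
+//   llvm::StringRef p = "/data/reads/sample.fastq.gz";
+//   llvm::sys::path::filename(p);   // "sample.fastq.gz"
+//   llvm::sys::path::stem(p);       // "sample.fastq"
+//   llvm::sys::path::extension(p);  // ".gz"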
+
+bool is_separator(char value) {
+ switch(value) {
+ case '/': return true;
+ default: return false;
+ }
+}
+
+static const char preferred_separator_string[] = { preferred_separator, '\0' };
+
+StringRef get_separator() {
+ return preferred_separator_string;
+}
+
+bool has_root_name(const Twine &path) {
+ SmallString<128> path_storage;
+ StringRef p = path.toStringRef(path_storage);
+
+ return !root_name(p).empty();
+}
+
+bool has_root_directory(const Twine &path) {
+ SmallString<128> path_storage;
+ StringRef p = path.toStringRef(path_storage);
+
+ return !root_directory(p).empty();
+}
+
+bool has_root_path(const Twine &path) {
+ SmallString<128> path_storage;
+ StringRef p = path.toStringRef(path_storage);
+
+ return !root_path(p).empty();
+}
+
+bool has_relative_path(const Twine &path) {
+ SmallString<128> path_storage;
+ StringRef p = path.toStringRef(path_storage);
+
+ return !relative_path(p).empty();
+}
+
+bool has_filename(const Twine &path) {
+ SmallString<128> path_storage;
+ StringRef p = path.toStringRef(path_storage);
+
+ return !filename(p).empty();
+}
+
+bool has_parent_path(const Twine &path) {
+ SmallString<128> path_storage;
+ StringRef p = path.toStringRef(path_storage);
+
+ return !parent_path(p).empty();
+}
+
+bool has_stem(const Twine &path) {
+ SmallString<128> path_storage;
+ StringRef p = path.toStringRef(path_storage);
+
+ return !stem(p).empty();
+}
+
+bool has_extension(const Twine &path) {
+ SmallString<128> path_storage;
+ StringRef p = path.toStringRef(path_storage);
+
+ return !extension(p).empty();
+}
+
+bool is_absolute(const Twine &path) {
+ SmallString<128> path_storage;
+ StringRef p = path.toStringRef(path_storage);
+
+ bool rootDir = has_root_directory(p),
+ rootName = true; // root names only exist on Windows paths; trivially true on POSIX
+
+ return rootDir && rootName;
+}
+
+bool is_relative(const Twine &path) { return !is_absolute(path); }
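+
+// Illustrative usage sketch (not part of the upstream source): in this
+// POSIX-only build a path is absolute exactly when it has a root directory.
+//
+//   llvm::sys::path::is_absolute("/usr/bin/spades.py");   // true
+//   llvm::sys::path::is_relative("configs/debruijn");     // true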
+
+StringRef remove_leading_dotslash(StringRef Path) {
+ // Remove leading "./" (or ".//" or "././" etc.)
+ while (Path.size() > 2 && Path[0] == '.' && is_separator(Path[1])) {
+ Path = Path.substr(2);
+ while (Path.size() > 0 && is_separator(Path[0]))
+ Path = Path.substr(1);
+ }
+ return Path;
+}
+
+static SmallString<256> remove_dots(StringRef path, bool remove_dot_dot) {
+ SmallVector<StringRef, 16> components;
+
+ // Skip the root path, then look for traversal in the components.
+ StringRef rel = path::relative_path(path);
+ for (StringRef C : llvm::make_range(path::begin(rel), path::end(rel))) {
+ if (C == ".")
+ continue;
+ if (remove_dot_dot) {
+ if (C == "..") {
+ if (!components.empty())
+ components.pop_back();
+ continue;
+ }
+ }
+ components.push_back(C);
+ }
+
+ SmallString<256> buffer = path::root_path(path);
+ for (StringRef C : components)
+ path::append(buffer, C);
+ return buffer;
+}
+
+bool remove_dots(SmallVectorImpl<char> &path, bool remove_dot_dot) {
+ StringRef p(path.data(), path.size());
+
+ SmallString<256> result = remove_dots(p, remove_dot_dot);
+ if (result == path)
+ return false;
+
+ path.swap(result);
+ return true;
+}
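+
+// Illustrative usage sketch (not part of the upstream source): the dot-removal
+// helper above, applied in place; "p" is a name used only in this example.
+//
+//   llvm::SmallString<128> p("/a/./b/../c");
+//   llvm::sys::path::remove_dots(p, /*remove_dot_dot=*/true);  // p == "/a/c"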
+
+} // end namespace path
+
+namespace fs {
+
+std::error_code getUniqueID(const Twine Path, UniqueID &Result) {
+ file_status Status;
+ std::error_code EC = status(Path, Status);
+ if (EC)
+ return EC;
+ Result = Status.getUniqueID();
+ return std::error_code();
+}
+
+std::error_code createUniqueFile(const Twine &Model, int &ResultFd,
+ SmallVectorImpl<char> &ResultPath,
+ unsigned Mode) {
+ return createUniqueEntity(Model, ResultFd, ResultPath, false, Mode, FS_File);
+}
+
+std::error_code createUniqueFile(const Twine &Model,
+ SmallVectorImpl<char> &ResultPath) {
+ int Dummy;
+ return createUniqueEntity(Model, Dummy, ResultPath, false, 0, FS_Name);
+}
+
+static std::error_code
+createTemporaryFile(const Twine &Model, int &ResultFD,
+ llvm::SmallVectorImpl<char> &ResultPath, FSEntity Type) {
+ SmallString<128> Storage;
+ StringRef P = Model.toNullTerminatedStringRef(Storage);
+ assert(P.find_first_of(separators) == StringRef::npos &&
+ "Model must be a simple filename.");
+ // Use P.begin() so that createUniqueEntity doesn't need to recreate Storage.
+ return createUniqueEntity(P.begin(), ResultFD, ResultPath,
+ true, owner_read | owner_write, Type);
+}
+
+static std::error_code
+createTemporaryFile(const Twine &Prefix, StringRef Suffix, int &ResultFD,
+ llvm::SmallVectorImpl<char> &ResultPath, FSEntity Type) {
+ const char *Middle = Suffix.empty() ? "-%%%%%%" : "-%%%%%%.";
+ return createTemporaryFile(Prefix + Middle + Suffix, ResultFD, ResultPath,
+ Type);
+}
+
+std::error_code createTemporaryFile(const Twine &Prefix, StringRef Suffix,
+ int &ResultFD,
+ SmallVectorImpl<char> &ResultPath) {
+ return createTemporaryFile(Prefix, Suffix, ResultFD, ResultPath, FS_File);
+}
+
+std::error_code createTemporaryFile(const Twine &Prefix, StringRef Suffix,
+ SmallVectorImpl<char> &ResultPath) {
+ int Dummy;
+ return createTemporaryFile(Prefix, Suffix, Dummy, ResultPath, FS_Name);
+}
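+
+// Illustrative usage sketch (not part of the upstream source): creating a
+// temporary file from a prefix/suffix pair. The expanded model is
+// "spades-%%%%%%.fasta", and each '%' is filled in by createUniqueEntity().
+//
+//   int FD;
+//   llvm::SmallString<128> TmpPath;
+//   if (std::error_code EC =
+//           llvm::sys::fs::createTemporaryFile("spades", "fasta", FD, TmpPath))
+//     llvm::errs() << "cannot create temp file: " << EC.message() << "\n";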
+
+
+// This behaves like mkdtemp, but with a different name pattern. We use
+// createUniqueEntity mostly for consistency; switching to mkdtemp would be
+// worth trying.
+std::error_code createUniqueDirectory(const Twine &Prefix,
+ SmallVectorImpl<char> &ResultPath) {
+ int Dummy;
+ return createUniqueEntity(Prefix + "-%%%%%%", Dummy, ResultPath,
+ true, 0, FS_Dir);
+}
+
+static std::error_code make_absolute(const Twine ¤t_directory,
+ SmallVectorImpl<char> &path,
+ bool use_current_directory) {
+ StringRef p(path.data(), path.size());
+
+ bool rootDirectory = path::has_root_directory(p),
+ rootName = true;
+
+ // Already absolute.
+ if (rootName && rootDirectory)
+ return std::error_code();
+
+ // All of the following conditions will need the current directory.
+ SmallString<128> current_dir;
+ if (use_current_directory)
+ current_directory.toVector(current_dir);
+ else if (std::error_code ec = current_path(current_dir))
+ return ec;
+
+ // Relative path. Prepend the current directory.
+ if (!rootName && !rootDirectory) {
+ // Append path to the current directory.
+ path::append(current_dir, p);
+ // Set path to the result.
+ path.swap(current_dir);
+ return std::error_code();
+ }
+
+ if (!rootName && rootDirectory) {
+ StringRef cdrn = path::root_name(current_dir);
+ SmallString<128> curDirRootName(cdrn.begin(), cdrn.end());
+ path::append(curDirRootName, p);
+ // Set path to the result.
+ path.swap(curDirRootName);
+ return std::error_code();
+ }
+
+ if (rootName && !rootDirectory) {
+ StringRef pRootName = path::root_name(p);
+ StringRef bRootDirectory = path::root_directory(current_dir);
+ StringRef bRelativePath = path::relative_path(current_dir);
+ StringRef pRelativePath = path::relative_path(p);
+
+ SmallString<128> res;
+ path::append(res, pRootName, bRootDirectory, bRelativePath, pRelativePath);
+ path.swap(res);
+ return std::error_code();
+ }
+
+ llvm_unreachable("All rootName and rootDirectory combinations should have "
+ "occurred above!");
+}
+
+std::error_code make_absolute(const Twine ¤t_directory,
+ SmallVectorImpl<char> &path) {
+ return make_absolute(current_directory, path, true);
+}
+
+std::error_code make_absolute(SmallVectorImpl<char> &path) {
+ return make_absolute(Twine(), path, false);
+}
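+
+// Illustrative usage sketch (not part of the upstream source): making a
+// relative path absolute against the current working directory.
+//
+//   llvm::SmallString<128> P("configs/debruijn/config.info");
+//   llvm::sys::fs::make_absolute(P);  // P == "<cwd>/configs/debruijn/config.info"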
+
+std::error_code create_directories(const Twine &Path, bool IgnoreExisting,
+ perms Perms) {
+ SmallString<128> PathStorage;
+ StringRef P = Path.toStringRef(PathStorage);
+
+ // Be optimistic and try to create the directory
+ std::error_code EC = create_directory(P, IgnoreExisting, Perms);
+ // If we succeeded, or had any error other than the parent not existing, just
+ // return it.
+ if (EC != errc::no_such_file_or_directory)
+ return EC;
+
+ // We failed because of a no_such_file_or_directory, try to create the
+ // parent.
+ StringRef Parent = path::parent_path(P);
+ if (Parent.empty())
+ return EC;
+
+ if ((EC = create_directories(Parent, IgnoreExisting, Perms)))
+ return EC;
+
+ return create_directory(P, IgnoreExisting, Perms);
+}
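+
+// Illustrative usage sketch (not part of the upstream source):
+// create_directories() behaves like `mkdir -p`, creating missing parents first.
+// The directory name below is arbitrary.
+//
+//   if (std::error_code EC =
+//           llvm::sys::fs::create_directories("output/K55/simplified_contigs"))
+//     llvm::errs() << "cannot create directory: " << EC.message() << "\n";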
+
+std::error_code copy_file(const Twine &From, const Twine &To) {
+ int ReadFD, WriteFD;
+ if (std::error_code EC = openFileForRead(From, ReadFD))
+ return EC;
+ if (std::error_code EC = openFileForWrite(To, WriteFD, F_None)) {
+ close(ReadFD);
+ return EC;
+ }
+
+ const size_t BufSize = 4096;
+ char *Buf = new char[BufSize];
+ int BytesRead = 0, BytesWritten = 0;
+ for (;;) {
+ BytesRead = read(ReadFD, Buf, BufSize);
+ if (BytesRead <= 0)
+ break;
+ while (BytesRead) {
+ BytesWritten = write(WriteFD, Buf, BytesRead);
+ if (BytesWritten < 0)
+ break;
+ BytesRead -= BytesWritten;
+ }
+ if (BytesWritten < 0)
+ break;
+ }
+ close(ReadFD);
+ close(WriteFD);
+ delete[] Buf;
+
+ if (BytesRead < 0 || BytesWritten < 0)
+ return std::error_code(errno, std::generic_category());
+ return std::error_code();
+}
+
+bool exists(file_status status) {
+ return status_known(status) && status.type() != file_type::file_not_found;
+}
+
+bool status_known(file_status s) {
+ return s.type() != file_type::status_error;
+}
+
+bool is_directory(file_status status) {
+ return status.type() == file_type::directory_file;
+}
+
+std::error_code is_directory(const Twine &path, bool &result) {
+ file_status st;
+ if (std::error_code ec = status(path, st))
+ return ec;
+ result = is_directory(st);
+ return std::error_code();
+}
+
+bool is_regular_file(file_status status) {
+ return status.type() == file_type::regular_file;
+}
+
+std::error_code is_regular_file(const Twine &path, bool &result) {
+ file_status st;
+ if (std::error_code ec = status(path, st))
+ return ec;
+ result = is_regular_file(st);
+ return std::error_code();
+}
+
+bool is_other(file_status status) {
+ return exists(status) &&
+ !is_regular_file(status) &&
+ !is_directory(status);
+}
+
+std::error_code is_other(const Twine &Path, bool &Result) {
+ file_status FileStatus;
+ if (std::error_code EC = status(Path, FileStatus))
+ return EC;
+ Result = is_other(FileStatus);
+ return std::error_code();
+}
+
+void directory_entry::replace_filename(const Twine &filename, file_status st) {
+ SmallString<128> path = path::parent_path(Path);
+ path::append(path, filename);
+ Path = path.str();
+ Status = st;
+}
+
+std::error_code directory_entry::status(file_status &result) const {
+ return fs::status(Path, result);
+}
+
+} // end namespace fs
+} // end namespace sys
+} // end namespace llvm
+
+// Include the truly platform-specific parts.
+#include "Path.inc"
+
+namespace llvm {
+namespace sys {
+namespace path {
+
+bool user_cache_directory(SmallVectorImpl<char> &Result, const Twine &Path1,
+ const Twine &Path2, const Twine &Path3) {
+ if (getUserCacheDir(Result)) {
+ append(Result, Path1, Path2, Path3);
+ return true;
+ }
+ return false;
+}
+
+} // end namespace path
+} // end namespace sys
+} // end namespace llvm
diff --git a/ext/src/llvm/Path.inc b/ext/src/llvm/Path.inc
new file mode 100644
index 0000000..43e3d93
--- /dev/null
+++ b/ext/src/llvm/Path.inc
@@ -0,0 +1,620 @@
+//===- llvm/Support/Unix/Path.inc - Unix Path Implementation ----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the Unix specific implementation of the Path API.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+//=== WARNING: Implementation here must contain only generic UNIX code that
+//=== is guaranteed to work on *all* UNIX variants.
+//===----------------------------------------------------------------------===//
+
+#include "Unix.h"
+#include <limits.h>
+#include <stdio.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <sys/mman.h>
+#if 1 || HAVE_DIRENT_H
+# include <dirent.h>
+# define NAMLEN(dirent) strlen((dirent)->d_name)
+#else
+# define dirent direct
+# define NAMLEN(dirent) (dirent)->d_namlen
+# if HAVE_SYS_NDIR_H
+# include <sys/ndir.h>
+# endif
+# if HAVE_SYS_DIR_H
+# include <sys/dir.h>
+# endif
+# if HAVE_NDIR_H
+# include <ndir.h>
+# endif
+#endif
+
+#ifdef __APPLE__
+#include <mach-o/dyld.h>
+#endif
+
+// Both stdio.h and cstdio are included via different paths and
+// stdcxx's cstdio doesn't include stdio.h, so it doesn't #undef the macros
+// either.
+#undef ferror
+#undef feof
+
+// For GNU Hurd
+#if defined(__GNU__) && !defined(PATH_MAX)
+# define PATH_MAX 4096
+#endif
+
+using namespace llvm;
+
+namespace llvm {
+namespace sys {
+namespace fs {
+#if defined(__FreeBSD__) || defined (__NetBSD__) || defined(__Bitrig__) || \
+ defined(__OpenBSD__) || defined(__minix) || defined(__FreeBSD_kernel__) || \
+ defined(__linux__) || defined(__CYGWIN__) || defined(__DragonFly__)
+static int
+test_dir(char ret[PATH_MAX], const char *dir, const char *bin)
+{
+ struct stat sb;
+ char fullpath[PATH_MAX];
+
+ snprintf(fullpath, PATH_MAX, "%s/%s", dir, bin);
+ if (!realpath(fullpath, ret))
+ return 1;
+ if (stat(fullpath, &sb) != 0)
+ return 1;
+
+ return 0;
+}
+
+static char *
+getprogpath(char ret[PATH_MAX], const char *bin)
+{
+ char *pv, *s, *t;
+
+ /* First approach: absolute path. */
+ if (bin[0] == '/') {
+ if (test_dir(ret, "/", bin) == 0)
+ return ret;
+ return nullptr;
+ }
+
+ /* Second approach: relative path. */
+ if (strchr(bin, '/')) {
+ char cwd[PATH_MAX];
+ if (!getcwd(cwd, PATH_MAX))
+ return nullptr;
+ if (test_dir(ret, cwd, bin) == 0)
+ return ret;
+ return nullptr;
+ }
+
+ /* Third approach: $PATH */
+ if ((pv = getenv("PATH")) == nullptr)
+ return nullptr;
+ s = pv = strdup(pv);
+ if (!pv)
+ return nullptr;
+ while ((t = strsep(&s, ":")) != nullptr) {
+ if (test_dir(ret, t, bin) == 0) {
+ free(pv);
+ return ret;
+ }
+ }
+ free(pv);
+ return nullptr;
+}
+#endif // __FreeBSD__ || __NetBSD__ || __Bitrig__ || __OpenBSD__ || __minix || __FreeBSD_kernel__ || __linux__ || __CYGWIN__ || __DragonFly__
+
+/// getMainExecutable - Return the path to the main executable, given the
+/// value of argv[0] from program startup.
+std::string getMainExecutable(const char *argv0, void *MainAddr) {
+#if defined(__APPLE__)
+ // On OS X the executable path is saved to the stack by dyld. Reading it
+ // from there is much faster than calling dladdr, especially for large
+ // binaries with symbols.
+ char exe_path[MAXPATHLEN];
+ uint32_t size = sizeof(exe_path);
+ if (_NSGetExecutablePath(exe_path, &size) == 0) {
+ char link_path[MAXPATHLEN];
+ if (realpath(exe_path, link_path))
+ return link_path;
+ }
+#elif defined(__FreeBSD__) || defined (__NetBSD__) || defined(__Bitrig__) || \
+ defined(__OpenBSD__) || defined(__minix) || defined(__DragonFly__) || \
+ defined(__FreeBSD_kernel__)
+ char exe_path[PATH_MAX];
+
+ if (getprogpath(exe_path, argv0) != NULL)
+ return exe_path;
+#elif defined(__linux__) || defined(__CYGWIN__)
+ char exe_path[MAXPATHLEN];
+ StringRef aPath("/proc/self/exe");
+ if (sys::fs::exists(aPath)) {
+ // /proc is not always mounted under Linux (chroot for example).
+ ssize_t len = readlink(aPath.str().c_str(), exe_path, sizeof(exe_path));
+ if (len >= 0)
+ return std::string(exe_path, len);
+ } else {
+ // Fall back to the classical detection.
+ if (getprogpath(exe_path, argv0))
+ return exe_path;
+ }
+#elif defined(HAVE_DLFCN_H)
+ // Use dladdr to get executable path if available.
+ Dl_info DLInfo;
+ int err = dladdr(MainAddr, &DLInfo);
+ if (err == 0)
+ return "";
+
+ // If the filename is a symlink, we need to resolve and return the location of
+ // the actual executable.
+ char link_path[MAXPATHLEN];
+ if (realpath(DLInfo.dli_fname, link_path))
+ return link_path;
+#else
+#error GetMainExecutable is not implemented on this host yet.
+#endif
+ return "";
+}
+
+UniqueID file_status::getUniqueID() const {
+ return UniqueID(fs_st_dev, fs_st_ino);
+}
+
+std::error_code current_path(SmallVectorImpl<char> &result) {
+ result.clear();
+
+ const char *pwd = ::getenv("PWD");
+ llvm::sys::fs::file_status PWDStatus, DotStatus;
+ if (pwd && llvm::sys::path::is_absolute(pwd) &&
+ !llvm::sys::fs::status(pwd, PWDStatus) &&
+ !llvm::sys::fs::status(".", DotStatus) &&
+ PWDStatus.getUniqueID() == DotStatus.getUniqueID()) {
+ result.append(pwd, pwd + strlen(pwd));
+ return std::error_code();
+ }
+
+#ifdef MAXPATHLEN
+ result.reserve(MAXPATHLEN);
+#else
+// For GNU Hurd
+ result.reserve(1024);
+#endif
+
+ while (true) {
+ if (::getcwd(result.data(), result.capacity()) == nullptr) {
+ // See if there was a real error.
+ if (errno != ENOMEM)
+ return std::error_code(errno, std::generic_category());
+ // Otherwise there just wasn't enough space.
+ result.reserve(result.capacity() * 2);
+ } else
+ break;
+ }
+
+ result.set_size(strlen(result.data()));
+ return std::error_code();
+}
+
+std::error_code create_directory(const Twine &path, bool IgnoreExisting,
+ perms Perms) {
+ SmallString<128> path_storage;
+ StringRef p = path.toNullTerminatedStringRef(path_storage);
+
+ if (::mkdir(p.begin(), Perms) == -1) {
+ if (errno != EEXIST || !IgnoreExisting)
+ return std::error_code(errno, std::generic_category());
+ }
+
+ return std::error_code();
+}
+
+// Note that we are using symbolic link because hard links are not supported by
+// all filesystems (SMB doesn't).
+std::error_code create_link(const Twine &to, const Twine &from) {
+ // Get arguments.
+ SmallString<128> from_storage;
+ SmallString<128> to_storage;
+ StringRef f = from.toNullTerminatedStringRef(from_storage);
+ StringRef t = to.toNullTerminatedStringRef(to_storage);
+
+ if (::symlink(t.begin(), f.begin()) == -1)
+ return std::error_code(errno, std::generic_category());
+
+ return std::error_code();
+}
+
+std::error_code remove(const Twine &path, bool IgnoreNonExisting) {
+ SmallString<128> path_storage;
+ StringRef p = path.toNullTerminatedStringRef(path_storage);
+
+ struct stat buf;
+ if (lstat(p.begin(), &buf) != 0) {
+ if (errno != ENOENT || !IgnoreNonExisting)
+ return std::error_code(errno, std::generic_category());
+ return std::error_code();
+ }
+
+ // Note: this check catches strange situations. In all cases, LLVM should
+ // only be involved in the creation and deletion of regular files. This
+ // check ensures that what we're trying to erase is a regular file. It
+ // effectively prevents LLVM from erasing things like /dev/null, any block
+ // special file, or other things that aren't "regular" files.
+ if (!S_ISREG(buf.st_mode) && !S_ISDIR(buf.st_mode) && !S_ISLNK(buf.st_mode))
+ return make_error_code(errc::operation_not_permitted);
+
+ if (::remove(p.begin()) == -1) {
+ if (errno != ENOENT || !IgnoreNonExisting)
+ return std::error_code(errno, std::generic_category());
+ }
+
+ return std::error_code();
+}
+
+std::error_code rename(const Twine &from, const Twine &to) {
+ // Get arguments.
+ SmallString<128> from_storage;
+ SmallString<128> to_storage;
+ StringRef f = from.toNullTerminatedStringRef(from_storage);
+ StringRef t = to.toNullTerminatedStringRef(to_storage);
+
+ if (::rename(f.begin(), t.begin()) == -1)
+ return std::error_code(errno, std::generic_category());
+
+ return std::error_code();
+}
+
+std::error_code resize_file(int FD, uint64_t Size) {
+ if (::ftruncate(FD, Size) == -1)
+ return std::error_code(errno, std::generic_category());
+
+ return std::error_code();
+}
+
+static int convertAccessMode(AccessMode Mode) {
+ switch (Mode) {
+ case AccessMode::Exist:
+ return F_OK;
+ case AccessMode::Write:
+ return W_OK;
+ case AccessMode::Execute:
+ return R_OK | X_OK; // scripts also need R_OK.
+ }
+ llvm_unreachable("invalid enum");
+}
+
+std::error_code access(const Twine &Path, AccessMode Mode) {
+ SmallString<128> PathStorage;
+ StringRef P = Path.toNullTerminatedStringRef(PathStorage);
+
+ if (::access(P.begin(), convertAccessMode(Mode)) == -1)
+ return std::error_code(errno, std::generic_category());
+
+ if (Mode == AccessMode::Execute) {
+ // Don't say that directories are executable.
+ struct stat buf;
+ if (0 != stat(P.begin(), &buf))
+ return errc::permission_denied;
+ if (!S_ISREG(buf.st_mode))
+ return errc::permission_denied;
+ }
+
+ return std::error_code();
+}
+
+bool can_execute(const Twine &Path) {
+ return !access(Path, AccessMode::Execute);
+}
+
+bool equivalent(file_status A, file_status B) {
+ assert(status_known(A) && status_known(B));
+ return A.fs_st_dev == B.fs_st_dev &&
+ A.fs_st_ino == B.fs_st_ino;
+}
+
+std::error_code equivalent(const Twine &A, const Twine &B, bool &result) {
+ file_status fsA, fsB;
+ if (std::error_code ec = status(A, fsA))
+ return ec;
+ if (std::error_code ec = status(B, fsB))
+ return ec;
+ result = equivalent(fsA, fsB);
+ return std::error_code();
+}
+
+static std::error_code fillStatus(int StatRet, const struct stat &Status,
+ file_status &Result) {
+ if (StatRet != 0) {
+ std::error_code ec(errno, std::generic_category());
+ if (ec == errc::no_such_file_or_directory)
+ Result = file_status(file_type::file_not_found);
+ else
+ Result = file_status(file_type::status_error);
+ return ec;
+ }
+
+ file_type Type = file_type::type_unknown;
+
+ if (S_ISDIR(Status.st_mode))
+ Type = file_type::directory_file;
+ else if (S_ISREG(Status.st_mode))
+ Type = file_type::regular_file;
+ else if (S_ISBLK(Status.st_mode))
+ Type = file_type::block_file;
+ else if (S_ISCHR(Status.st_mode))
+ Type = file_type::character_file;
+ else if (S_ISFIFO(Status.st_mode))
+ Type = file_type::fifo_file;
+ else if (S_ISSOCK(Status.st_mode))
+ Type = file_type::socket_file;
+
+ perms Perms = static_cast<perms>(Status.st_mode);
+ Result =
+ file_status(Type, Perms, Status.st_dev, Status.st_ino, Status.st_mtime,
+ Status.st_uid, Status.st_gid, Status.st_size);
+
+ return std::error_code();
+}
+
+std::error_code status(const Twine &Path, file_status &Result) {
+ SmallString<128> PathStorage;
+ StringRef P = Path.toNullTerminatedStringRef(PathStorage);
+
+ struct stat Status;
+ int StatRet = ::stat(P.begin(), &Status);
+ return fillStatus(StatRet, Status, Result);
+}
+
+std::error_code status(int FD, file_status &Result) {
+ struct stat Status;
+ int StatRet = ::fstat(FD, &Status);
+ return fillStatus(StatRet, Status, Result);
+}
+
+std::error_code mapped_file_region::init(int FD, uint64_t Offset,
+ mapmode Mode) {
+ assert(Size != 0);
+
+ int flags = (Mode == readwrite) ? MAP_SHARED : MAP_PRIVATE;
+ int prot = (Mode == readonly) ? PROT_READ : (PROT_READ | PROT_WRITE);
+ Mapping = ::mmap(nullptr, Size, prot, flags, FD, Offset);
+ if (Mapping == MAP_FAILED)
+ return std::error_code(errno, std::generic_category());
+ return std::error_code();
+}
+
+mapped_file_region::mapped_file_region(int fd, mapmode mode, uint64_t length,
+ uint64_t offset, std::error_code &ec)
+ : Size(length), Mapping() {
+ // Make sure that the requested size fits within SIZE_T.
+ if (length > std::numeric_limits<size_t>::max()) {
+ ec = make_error_code(errc::invalid_argument);
+ return;
+ }
+
+ ec = init(fd, offset, mode);
+ if (ec)
+ Mapping = nullptr;
+}
+
+mapped_file_region::~mapped_file_region() {
+ if (Mapping)
+ ::munmap(Mapping, Size);
+}
+
+uint64_t mapped_file_region::size() const {
+ assert(Mapping && "Mapping failed but used anyway!");
+ return Size;
+}
+
+char *mapped_file_region::data() const {
+ assert(Mapping && "Mapping failed but used anyway!");
+ return reinterpret_cast<char*>(Mapping);
+}
+
+const char *mapped_file_region::const_data() const {
+ assert(Mapping && "Mapping failed but used anyway!");
+ return reinterpret_cast<const char*>(Mapping);
+}
+
+int mapped_file_region::alignment() {
+ return ::getpagesize();
+}
+
+std::error_code detail::directory_iterator_construct(detail::DirIterState &it,
+ StringRef path){
+ SmallString<128> path_null(path);
+ DIR *directory = ::opendir(path_null.c_str());
+ if (!directory)
+ return std::error_code(errno, std::generic_category());
+
+ it.IterationHandle = reinterpret_cast<intptr_t>(directory);
+ // Add something for replace_filename to replace.
+ path::append(path_null, ".");
+ it.CurrentEntry = directory_entry(path_null.str());
+ return directory_iterator_increment(it);
+}
+
+std::error_code detail::directory_iterator_destruct(detail::DirIterState &it) {
+ if (it.IterationHandle)
+ ::closedir(reinterpret_cast<DIR *>(it.IterationHandle));
+ it.IterationHandle = 0;
+ it.CurrentEntry = directory_entry();
+ return std::error_code();
+}
+
+std::error_code detail::directory_iterator_increment(detail::DirIterState &it) {
+ errno = 0;
+ dirent *cur_dir = ::readdir(reinterpret_cast<DIR *>(it.IterationHandle));
+ if (cur_dir == nullptr && errno != 0) {
+ return std::error_code(errno, std::generic_category());
+ } else if (cur_dir != nullptr) {
+ StringRef name(cur_dir->d_name, NAMLEN(cur_dir));
+ if ((name.size() == 1 && name[0] == '.') ||
+ (name.size() == 2 && name[0] == '.' && name[1] == '.'))
+ return directory_iterator_increment(it);
+ it.CurrentEntry.replace_filename(name);
+ } else
+ return directory_iterator_destruct(it);
+
+ return std::error_code();
+}
+
+std::error_code openFileForRead(const Twine &Name, int &ResultFD) {
+ SmallString<128> Storage;
+ StringRef P = Name.toNullTerminatedStringRef(Storage);
+ while ((ResultFD = open(P.begin(), O_RDONLY)) < 0) {
+ if (errno != EINTR)
+ return std::error_code(errno, std::generic_category());
+ }
+ return std::error_code();
+}
+
+std::error_code openFileForWrite(const Twine &Name, int &ResultFD,
+ sys::fs::OpenFlags Flags, unsigned Mode) {
+ // Verify that we don't have both "append" and "excl".
+ assert((!(Flags & sys::fs::F_Excl) || !(Flags & sys::fs::F_Append)) &&
+ "Cannot specify both 'excl' and 'append' file creation flags!");
+
+ int OpenFlags = O_CREAT;
+
+ if (Flags & F_RW)
+ OpenFlags |= O_RDWR;
+ else
+ OpenFlags |= O_WRONLY;
+
+ if (Flags & F_Append)
+ OpenFlags |= O_APPEND;
+ else
+ OpenFlags |= O_TRUNC;
+
+ if (Flags & F_Excl)
+ OpenFlags |= O_EXCL;
+
+ SmallString<128> Storage;
+ StringRef P = Name.toNullTerminatedStringRef(Storage);
+ while ((ResultFD = open(P.begin(), OpenFlags, Mode)) < 0) {
+ if (errno != EINTR)
+ return std::error_code(errno, std::generic_category());
+ }
+ return std::error_code();
+}
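+
+// Illustrative usage sketch (not part of the upstream source): opening a log
+// file in append mode through the flag translation above. The file name is
+// arbitrary.
+//
+//   int FD;
+//   if (std::error_code EC = llvm::sys::fs::openFileForWrite(
+//           "spades.log", FD, llvm::sys::fs::F_Append))
+//     llvm::errs() << "cannot open log: " << EC.message() << "\n";
+//   // ... write(FD, ...) ... then close(FD);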
+
+} // end namespace fs
+
+namespace path {
+
+bool home_directory(SmallVectorImpl<char> &result) {
+ if (char *RequestedDir = getenv("HOME")) {
+ result.clear();
+ result.append(RequestedDir, RequestedDir + strlen(RequestedDir));
+ return true;
+ }
+
+ return false;
+}
+
+static bool getDarwinConfDir(bool TempDir, SmallVectorImpl<char> &Result) {
+ #if defined(_CS_DARWIN_USER_TEMP_DIR) && defined(_CS_DARWIN_USER_CACHE_DIR)
+ // On Darwin, use DARWIN_USER_TEMP_DIR or DARWIN_USER_CACHE_DIR.
+ // The corresponding _CS_* names are defined in <unistd.h> on Darwin >= 9.
+ int ConfName = TempDir ? _CS_DARWIN_USER_TEMP_DIR
+ : _CS_DARWIN_USER_CACHE_DIR;
+ size_t ConfLen = confstr(ConfName, nullptr, 0);
+ if (ConfLen > 0) {
+ do {
+ Result.resize(ConfLen);
+ ConfLen = confstr(ConfName, Result.data(), Result.size());
+ } while (ConfLen > 0 && ConfLen != Result.size());
+
+ if (ConfLen > 0) {
+ assert(Result.back() == 0);
+ Result.pop_back();
+ return true;
+ }
+
+ Result.clear();
+ }
+ #endif
+ return false;
+}
+
+static bool getUserCacheDir(SmallVectorImpl<char> &Result) {
+ // First try the XDG_CACHE_HOME environment variable,
+ // as specified in the XDG Base Directory Specification at
+ // http://standards.freedesktop.org/basedir-spec/basedir-spec-latest.html
+ if (const char *XdgCacheDir = std::getenv("XDG_CACHE_HOME")) {
+ Result.clear();
+ Result.append(XdgCacheDir, XdgCacheDir + strlen(XdgCacheDir));
+ return true;
+ }
+
+ // Try Darwin configuration query
+ if (getDarwinConfDir(false, Result))
+ return true;
+
+ // Use "$HOME/.cache" if $HOME is available
+ if (home_directory(Result)) {
+ append(Result, ".cache");
+ return true;
+ }
+
+ return false;
+}
+
+static const char *getEnvTempDir() {
+ // Check whether the temporary directory is specified by an environment
+ // variable.
+ const char *EnvironmentVariables[] = {"TMPDIR", "TMP", "TEMP", "TEMPDIR"};
+ for (const char *Env : EnvironmentVariables) {
+ if (const char *Dir = std::getenv(Env))
+ return Dir;
+ }
+
+ return nullptr;
+}
+
+static const char *getDefaultTempDir(bool ErasedOnReboot) {
+#ifdef P_tmpdir
+ if ((bool)P_tmpdir)
+ return P_tmpdir;
+#endif
+
+ if (ErasedOnReboot)
+ return "/tmp";
+ return "/var/tmp";
+}
+
+void system_temp_directory(bool ErasedOnReboot, SmallVectorImpl<char> &Result) {
+ Result.clear();
+
+ if (ErasedOnReboot) {
+ // There is no env variable for the cache directory.
+ if (const char *RequestedDir = getEnvTempDir()) {
+ Result.append(RequestedDir, RequestedDir + strlen(RequestedDir));
+ return;
+ }
+ }
+
+ if (getDarwinConfDir(ErasedOnReboot, Result))
+ return;
+
+ const char *RequestedDir = getDefaultTempDir(ErasedOnReboot);
+ Result.append(RequestedDir, RequestedDir + strlen(RequestedDir));
+}
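+
+// Illustrative usage sketch (not part of the upstream source): querying the
+// system temp directory (TMPDIR/TMP/TEMP/TEMPDIR first, then the defaults).
+//
+//   llvm::SmallString<128> Tmp;
+//   llvm::sys::path::system_temp_directory(/*ErasedOnReboot=*/true, Tmp);
+//   // Tmp is e.g. "/tmp" unless one of the environment variables overrides it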
+
+} // end namespace path
+
+} // end namespace sys
+} // end namespace llvm
diff --git a/ext/src/llvm/Regex.cpp b/ext/src/llvm/Regex.cpp
new file mode 100644
index 0000000..e8344ef
--- /dev/null
+++ b/ext/src/llvm/Regex.cpp
@@ -0,0 +1,193 @@
+//===-- Regex.cpp - Regular Expression matcher implementation -------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a POSIX regular expression matcher.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/Regex.h"
+#include "regex_impl.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Twine.h"
+#include <string>
+using namespace llvm;
+
+Regex::Regex(StringRef regex, unsigned Flags) {
+ unsigned flags = 0;
+ preg = new llvm_regex();
+ preg->re_endp = regex.end();
+ if (Flags & IgnoreCase)
+ flags |= REG_ICASE;
+ if (Flags & Newline)
+ flags |= REG_NEWLINE;
+ if (!(Flags & BasicRegex))
+ flags |= REG_EXTENDED;
+ error = llvm_regcomp(preg, regex.data(), flags|REG_PEND);
+}
+
+Regex::~Regex() {
+ if (preg) {
+ llvm_regfree(preg);
+ delete preg;
+ }
+}
+
+bool Regex::isValid(std::string &Error) {
+ if (!error)
+ return true;
+
+ size_t len = llvm_regerror(error, preg, nullptr, 0);
+
+ Error.resize(len - 1);
+ llvm_regerror(error, preg, &Error[0], len);
+ return false;
+}
+
+/// getNumMatches - In a valid regex, return the number of parenthesized
+/// matches it contains.
+unsigned Regex::getNumMatches() const {
+ return preg->re_nsub;
+}
+
+bool Regex::match(StringRef String, SmallVectorImpl<StringRef> *Matches){
+ unsigned nmatch = Matches ? preg->re_nsub+1 : 0;
+
+ // pmatch needs to have at least one element.
+ SmallVector<llvm_regmatch_t, 8> pm;
+ pm.resize(nmatch > 0 ? nmatch : 1);
+ pm[0].rm_so = 0;
+ pm[0].rm_eo = String.size();
+
+ int rc = llvm_regexec(preg, String.data(), nmatch, pm.data(), REG_STARTEND);
+
+ if (rc == REG_NOMATCH)
+ return false;
+ if (rc != 0) {
+ // regexec can fail due to invalid pattern or running out of memory.
+ error = rc;
+ return false;
+ }
+
+ // There was a match.
+
+ if (Matches) { // match position requested
+ Matches->clear();
+
+ for (unsigned i = 0; i != nmatch; ++i) {
+ if (pm[i].rm_so == -1) {
+ // this group didn't match
+ Matches->push_back(StringRef());
+ continue;
+ }
+ assert(pm[i].rm_eo >= pm[i].rm_so);
+ Matches->push_back(StringRef(String.data()+pm[i].rm_so,
+ pm[i].rm_eo-pm[i].rm_so));
+ }
+ }
+
+ return true;
+}
+
+std::string Regex::sub(StringRef Repl, StringRef String,
+ std::string *Error) {
+ SmallVector<StringRef, 8> Matches;
+
+ // Reset error, if given.
+ if (Error && !Error->empty()) *Error = "";
+
+ // Return the input if there was no match.
+ if (!match(String, &Matches))
+ return String;
+
+ // Otherwise splice in the replacement string, starting with the prefix before
+ // the match.
+ std::string Res(String.begin(), Matches[0].begin());
+
+ // Then the replacement string, honoring possible substitutions.
+ while (!Repl.empty()) {
+ // Skip to the next escape.
+ std::pair<StringRef, StringRef> Split = Repl.split('\\');
+
+ // Add the skipped substring.
+ Res += Split.first;
+
+ // Check for termination and trailing backslash.
+ if (Split.second.empty()) {
+ if (Repl.size() != Split.first.size() &&
+ Error && Error->empty())
+ *Error = "replacement string contained trailing backslash";
+ break;
+ }
+
+ // Otherwise update the replacement string and interpret escapes.
+ Repl = Split.second;
+
+ // FIXME: We should have a StringExtras function for mapping C99 escapes.
+ switch (Repl[0]) {
+ // Treat all unrecognized characters as self-quoting.
+ default:
+ Res += Repl[0];
+ Repl = Repl.substr(1);
+ break;
+
+ // Single character escapes.
+ case 't':
+ Res += '\t';
+ Repl = Repl.substr(1);
+ break;
+ case 'n':
+ Res += '\n';
+ Repl = Repl.substr(1);
+ break;
+
+ // Decimal escapes are backreferences.
+ case '0': case '1': case '2': case '3': case '4':
+ case '5': case '6': case '7': case '8': case '9': {
+ // Extract the backreference number.
+ StringRef Ref = Repl.slice(0, Repl.find_first_not_of("0123456789"));
+ Repl = Repl.substr(Ref.size());
+
+ unsigned RefValue;
+ if (!Ref.getAsInteger(10, RefValue) &&
+ RefValue < Matches.size())
+ Res += Matches[RefValue];
+ else if (Error && Error->empty())
+ *Error = ("invalid backreference string '" + Twine(Ref) + "'").str();
+ break;
+ }
+ }
+ }
+
+ // And finally the suffix.
+ Res += StringRef(Matches[0].end(), String.end() - Matches[0].end());
+
+ return Res;
+}
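+
+// Illustrative usage sketch (not part of the upstream source): the Regex API
+// above; match() fills capture groups and sub() expands decimal backreferences
+// such as \1. The pattern and input are arbitrary.
+//
+//   llvm::Regex RE("K([0-9]+)");
+//   llvm::SmallVector<llvm::StringRef, 2> M;
+//   if (RE.match("K55", &M))
+//     llvm::errs() << M[1] << "\n";              // prints "55"
+//   std::string S = RE.sub("kmer=\\1", "K55");   // S == "kmer=55"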
+
+// These are the special characters matched in functions like "p_ere_exp".
+static const char RegexMetachars[] = "()^$|*+?.[]\\{}";
+
+bool Regex::isLiteralERE(StringRef Str) {
+ // Check for regex metacharacters. This list was derived from our regex
+ // implementation in regcomp.c and double checked against the POSIX extended
+ // regular expression specification.
+ return Str.find_first_of(RegexMetachars) == StringRef::npos;
+}
+
+std::string Regex::escape(StringRef String) {
+ std::string RegexStr;
+ for (unsigned i = 0, e = String.size(); i != e; ++i) {
+ if (strchr(RegexMetachars, String[i]))
+ RegexStr += '\\';
+ RegexStr += String[i];
+ }
+
+ return RegexStr;
+}
diff --git a/ext/src/llvm/Signals.cpp b/ext/src/llvm/Signals.cpp
new file mode 100644
index 0000000..4b456d7
--- /dev/null
+++ b/ext/src/llvm/Signals.cpp
@@ -0,0 +1,62 @@
+//===- Signals.cpp - Signal Handling support --------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines some helpful functions for dealing with the possibility of
+// Unix signals occurring while your program is running.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/ErrorOr.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/FileUtilities.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/Mutex.h"
+#include "llvm/Support/Signals.h"
+#include "llvm/Support/StringSaver.h"
+#include "llvm/Support/raw_ostream.h"
+#include <vector>
+
+namespace llvm {
+using namespace sys;
+
+//===----------------------------------------------------------------------===//
+//=== WARNING: Implementation here must contain only TRULY operating system
+//=== independent code.
+//===----------------------------------------------------------------------===//
+
+static std::vector<std::pair<void (*)(void *), void *>> CallBacksToRun;
+void sys::RunSignalHandlers() {
+ if (!CallBacksToRun.size())
+ return;
+ for (auto &I : CallBacksToRun)
+ I.first(I.second);
+ CallBacksToRun.clear();
+}
+}
+
+using namespace llvm;
+
+static bool findModulesAndOffsets(void **StackTrace, int Depth,
+ const char **Modules, intptr_t *Offsets,
+ const char *MainExecutableName,
+ StringSaver &StrPool);
+
+/// Format a pointer value as hexadecimal. Zero pad it out so it's always the
+/// same width.
+static FormattedNumber format_ptr(void *PC) {
+ // Each byte is two hex digits plus 2 for the 0x prefix.
+ unsigned PtrWidth = 2 + 2 * sizeof(void *);
+ return format_hex((uint64_t)PC, PtrWidth);
+}
+
+// Include the platform-specific parts of this class.
+#include "Signals.inc"
diff --git a/ext/src/llvm/Signals.inc b/ext/src/llvm/Signals.inc
new file mode 100644
index 0000000..198c783
--- /dev/null
+++ b/ext/src/llvm/Signals.inc
@@ -0,0 +1,435 @@
+//===- Signals.cpp - Generic Unix Signals Implementation -----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines some helpful functions for dealing with the possibility of
+// Unix signals occurring while your program is running.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Unix.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/FileUtilities.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/Mutex.h"
+#include "llvm/Support/UniqueLock.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Config.h"
+#include <algorithm>
+#include <string>
+#if HAVE_EXECINFO_H
+# include <execinfo.h> // For backtrace().
+#endif
+#if HAVE_SIGNAL_H
+#include <signal.h>
+#endif
+#if HAVE_SYS_STAT_H
+#include <sys/stat.h>
+#endif
+#if HAVE_CXXABI_H
+#include <cxxabi.h>
+#endif
+#if HAVE_DLFCN_H
+#include <dlfcn.h>
+#endif
+#if HAVE_MACH_MACH_H
+#include <mach/mach.h>
+#endif
+#if HAVE_LINK_H
+#include <link.h>
+#endif
+
+using namespace llvm;
+
+static RETSIGTYPE SignalHandler(int Sig); // defined below.
+
+static SmartMutex<true> SignalsMutex;
+
+/// InterruptFunction - The function to call if ctrl-c is pressed.
+static void (*InterruptFunction)() = nullptr;
+
+static std::vector<std::string> FilesToRemove;
+
+// IntSigs - Signals that represent requested termination. There's no bug
+// or failure, or if there is, it's not our direct responsibility. For whatever
+// reason, our continued execution is no longer desirable.
+static const int IntSigs[] = {
+ SIGHUP, SIGINT, SIGPIPE, SIGTERM, SIGUSR1, SIGUSR2
+};
+
+// KillSigs - Signals that represent that we have a bug, and our prompt
+// termination has been ordered.
+static const int KillSigs[] = {
+ SIGILL, SIGTRAP, SIGABRT, SIGFPE, SIGBUS, SIGSEGV, SIGQUIT
+#ifdef SIGSYS
+ , SIGSYS
+#endif
+#ifdef SIGXCPU
+ , SIGXCPU
+#endif
+#ifdef SIGXFSZ
+ , SIGXFSZ
+#endif
+#ifdef SIGEMT
+ , SIGEMT
+#endif
+};
+
+static unsigned NumRegisteredSignals = 0;
+static struct {
+ struct sigaction SA;
+ int SigNo;
+} RegisteredSignalInfo[array_lengthof(IntSigs) + array_lengthof(KillSigs)];
+
+
+static void RegisterHandler(int Signal) {
+ assert(NumRegisteredSignals < array_lengthof(RegisteredSignalInfo) &&
+ "Out of space for signal handlers!");
+
+ struct sigaction NewHandler;
+
+ NewHandler.sa_handler = SignalHandler;
+ NewHandler.sa_flags = SA_NODEFER|SA_RESETHAND;
+ sigemptyset(&NewHandler.sa_mask);
+
+ // Install the new handler, save the old one in RegisteredSignalInfo.
+ sigaction(Signal, &NewHandler,
+ &RegisteredSignalInfo[NumRegisteredSignals].SA);
+ RegisteredSignalInfo[NumRegisteredSignals].SigNo = Signal;
+ ++NumRegisteredSignals;
+}
+
+static void RegisterHandlers() {
+ // If the handlers are already registered, we're done.
+ if (NumRegisteredSignals != 0) return;
+
+ for (auto S : IntSigs) RegisterHandler(S);
+ for (auto S : KillSigs) RegisterHandler(S);
+}
+
+static void UnregisterHandlers() {
+ // Restore all of the signal handlers to how they were before we showed up.
+ for (unsigned i = 0, e = NumRegisteredSignals; i != e; ++i)
+ sigaction(RegisteredSignalInfo[i].SigNo,
+ &RegisteredSignalInfo[i].SA, nullptr);
+ NumRegisteredSignals = 0;
+}
+
+
+/// RemoveFilesToRemove - Process the FilesToRemove list. This function
+/// should be called with the SignalsMutex lock held.
+/// NB: This must be an async signal safe function. It cannot allocate or free
+/// memory, even in debug builds.
+static void RemoveFilesToRemove() {
+ // If the list is empty there is nothing to do; bail out before doing any
+ // other work in this async-signal-safe function.
+ if (!FilesToRemove.size())
+ return;
+
+ // We avoid iterators in case of debug iterators that allocate or release
+ // memory.
+ std::vector<std::string>& FilesToRemoveRef = FilesToRemove;
+ for (unsigned i = 0, e = FilesToRemoveRef.size(); i != e; ++i) {
+ const char *path = FilesToRemoveRef[i].c_str();
+
+ // Get the status so we can determine if it's a file or directory. If we
+ // can't stat the file, ignore it.
+ struct stat buf;
+ if (stat(path, &buf) != 0)
+ continue;
+
+ // If this is not a regular file, ignore it. We want to prevent removal of
+ // special files like /dev/null, even if the compiler is being run with
+ // super-user permissions.
+ if (!S_ISREG(buf.st_mode))
+ continue;
+
+ // Otherwise, remove the file. We ignore any errors here as there is nothing
+ // else we can do.
+ unlink(path);
+ }
+}
+
+// SignalHandler - The signal handler that runs.
+static RETSIGTYPE SignalHandler(int Sig) {
+ // Restore the signal behavior to default, so that the program actually
+ // crashes when we return and the signal reissues. This also ensures that if
+ // we crash in our signal handler, the program will terminate immediately
+ // instead of recursing in the signal handler.
+ UnregisterHandlers();
+
+ // Unmask all potentially blocked kill signals.
+ sigset_t SigMask;
+ sigfillset(&SigMask);
+ sigprocmask(SIG_UNBLOCK, &SigMask, nullptr);
+
+ {
+ unique_lock<SmartMutex<true>> Guard(SignalsMutex);
+ RemoveFilesToRemove();
+
+ if (std::find(std::begin(IntSigs), std::end(IntSigs), Sig)
+ != std::end(IntSigs)) {
+ if (InterruptFunction) {
+ void (*IF)() = InterruptFunction;
+ Guard.unlock();
+ InterruptFunction = nullptr;
+ IF(); // run the interrupt function.
+ return;
+ }
+
+ Guard.unlock();
+ raise(Sig); // Execute the default handler.
+ return;
+ }
+ }
+
+ // Otherwise if it is a fault (like SEGV) run any handler.
+ llvm::sys::RunSignalHandlers();
+
+#ifdef __s390__
+ // On S/390, certain signals are delivered with PSW Address pointing to
+ // *after* the faulting instruction. Simply returning from the signal
+ // handler would continue execution after that point, instead of
+ // re-raising the signal. Raise the signal manually in those cases.
+ if (Sig == SIGILL || Sig == SIGFPE || Sig == SIGTRAP)
+ raise(Sig);
+#endif
+}
+
+void llvm::sys::RunInterruptHandlers() {
+ sys::SmartScopedLock<true> Guard(SignalsMutex);
+ RemoveFilesToRemove();
+}
+
+void llvm::sys::SetInterruptFunction(void (*IF)()) {
+ {
+ sys::SmartScopedLock<true> Guard(SignalsMutex);
+ InterruptFunction = IF;
+ }
+ RegisterHandlers();
+}
+
+// RemoveFileOnSignal - The public API
+bool llvm::sys::RemoveFileOnSignal(StringRef Filename,
+ std::string* ErrMsg) {
+ {
+ sys::SmartScopedLock<true> Guard(SignalsMutex);
+ FilesToRemove.push_back(Filename);
+ }
+
+ RegisterHandlers();
+ return false;
+}
+
+// DontRemoveFileOnSignal - The public API
+void llvm::sys::DontRemoveFileOnSignal(StringRef Filename) {
+ sys::SmartScopedLock<true> Guard(SignalsMutex);
+ std::vector<std::string>::reverse_iterator RI =
+ std::find(FilesToRemove.rbegin(), FilesToRemove.rend(), Filename);
+ std::vector<std::string>::iterator I = FilesToRemove.end();
+ if (RI != FilesToRemove.rend())
+ I = FilesToRemove.erase(RI.base()-1);
+}
+
+/// AddSignalHandler - Add a function to be called when a signal is delivered
+/// to the process. The handler can have a cookie passed to it to identify
+/// what instance of the handler it is.
+void llvm::sys::AddSignalHandler(void (*FnPtr)(void *), void *Cookie) {
+ CallBacksToRun.push_back(std::make_pair(FnPtr, Cookie));
+ RegisterHandlers();
+}
+
+#if defined(HAVE_BACKTRACE) && HAVE_LINK_H && \
+ (defined(__linux__) || defined(__FreeBSD__) || \
+ defined(__FreeBSD_kernel__) || defined(__NetBSD__))
+struct DlIteratePhdrData {
+ void **StackTrace;
+ int depth;
+ bool first;
+ const char **modules;
+ intptr_t *offsets;
+ const char *main_exec_name;
+};
+
+static int dl_iterate_phdr_cb(dl_phdr_info *info, size_t size, void *arg) {
+ DlIteratePhdrData *data = (DlIteratePhdrData*)arg;
+ const char *name = data->first ? data->main_exec_name : info->dlpi_name;
+ data->first = false;
+ for (int i = 0; i < info->dlpi_phnum; i++) {
+ const auto *phdr = &info->dlpi_phdr[i];
+ if (phdr->p_type != PT_LOAD)
+ continue;
+ intptr_t beg = info->dlpi_addr + phdr->p_vaddr;
+ intptr_t end = beg + phdr->p_memsz;
+ for (int j = 0; j < data->depth; j++) {
+ if (data->modules[j])
+ continue;
+ intptr_t addr = (intptr_t)data->StackTrace[j];
+ if (beg <= addr && addr < end) {
+ data->modules[j] = name;
+ data->offsets[j] = addr - info->dlpi_addr;
+ }
+ }
+ }
+ return 0;
+}
+
+/// If this is an ELF platform, we can find all loaded modules and their virtual
+/// addresses with dl_iterate_phdr.
+static bool findModulesAndOffsets(void **StackTrace, int Depth,
+ const char **Modules, intptr_t *Offsets,
+ const char *MainExecutableName,
+ StringSaver &StrPool) {
+ DlIteratePhdrData data = {StackTrace, Depth, true,
+ Modules, Offsets, MainExecutableName};
+ dl_iterate_phdr(dl_iterate_phdr_cb, &data);
+ return true;
+}
+#else
+/// This platform does not have dl_iterate_phdr, so we do not yet know how to
+/// find all loaded DSOs.
+static bool findModulesAndOffsets(void **StackTrace, int Depth,
+ const char **Modules, intptr_t *Offsets,
+ const char *MainExecutableName,
+ StringSaver &StrPool) {
+ return false;
+}
+#endif // defined(HAVE_BACKTRACE) && HAVE_LINK_H && ...
+
+// PrintStackTrace - In the case of a program crash or fault, print out a stack
+// trace so that the user has an indication of why and where we died.
+//
+// On glibc systems we have the 'backtrace' function, which works nicely, but
+// doesn't demangle symbols.
+void llvm::sys::PrintStackTrace(raw_ostream &OS) {
+#if defined(HAVE_BACKTRACE)
+ static void* StackTrace[256];
+ // Use backtrace() to output a backtrace on Linux systems with glibc.
+ int depth = backtrace(StackTrace,
+ static_cast<int>(array_lengthof(StackTrace)));
+#if HAVE_DLFCN_H && __GNUG__
+ int width = 0;
+ for (int i = 0; i < depth; ++i) {
+ Dl_info dlinfo;
+ dladdr(StackTrace[i], &dlinfo);
+ const char* name = strrchr(dlinfo.dli_fname, '/');
+
+ int nwidth;
+ if (!name) nwidth = strlen(dlinfo.dli_fname);
+ else nwidth = strlen(name) - 1;
+
+ if (nwidth > width) width = nwidth;
+ }
+
+ for (int i = 0; i < depth; ++i) {
+ Dl_info dlinfo;
+ dladdr(StackTrace[i], &dlinfo);
+
+ OS << format("%-2d", i);
+
+ const char* name = strrchr(dlinfo.dli_fname, '/');
+ if (!name) OS << format(" %-*s", width, dlinfo.dli_fname);
+ else OS << format(" %-*s", width, name+1);
+
+ OS << format(" %#0*lx", (int)(sizeof(void*) * 2) + 2,
+ (unsigned long)StackTrace[i]);
+
+ if (dlinfo.dli_sname != nullptr) {
+ OS << ' ';
+# if HAVE_CXXABI_H
+ int res;
+ char* d = abi::__cxa_demangle(dlinfo.dli_sname, nullptr, nullptr, &res);
+# else
+ char* d = NULL;
+# endif
+ if (!d) OS << dlinfo.dli_sname;
+ else OS << d;
+ free(d);
+
+ // FIXME: When we move to C++11, use %t length modifier. It's not in
+ // C++03 and causes gcc to issue warnings. Losing the upper 32 bits of
+ // the stack offset for a stack dump isn't likely to cause any problems.
+ OS << format(" + %u",(unsigned)((char*)StackTrace[i]-
+ (char*)dlinfo.dli_saddr));
+ }
+ OS << '\n';
+ }
+#else
+ backtrace_symbols_fd(StackTrace, depth, STDERR_FILENO);
+#endif
+#endif
+}
+
+static void PrintStackTraceSignalHandler(void *) {
+ PrintStackTrace(llvm::errs());
+}
+
+void llvm::sys::DisableSystemDialogsOnCrash() {}
+
+/// PrintStackTraceOnErrorSignal - When an error signal (such as SIGABRT or
+/// SIGSEGV) is delivered to the process, print a stack trace and then exit.
+void llvm::sys::PrintStackTraceOnErrorSignal(bool DisableCrashReporting) {
+ AddSignalHandler(PrintStackTraceSignalHandler, nullptr);
+
+#if defined(__APPLE__) && defined(ENABLE_CRASH_OVERRIDES)
+ // Environment variable to disable any kind of crash dialog.
+ if (DisableCrashReporting || getenv("LLVM_DISABLE_CRASH_REPORT")) {
+ mach_port_t self = mach_task_self();
+
+ exception_mask_t mask = EXC_MASK_CRASH;
+
+ kern_return_t ret = task_set_exception_ports(self,
+ mask,
+ MACH_PORT_NULL,
+ EXCEPTION_STATE_IDENTITY | MACH_EXCEPTION_CODES,
+ THREAD_STATE_NONE);
+ (void)ret;
+ }
+#endif
+}
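+
+// Illustrative usage sketch (not part of the upstream source): typical use of
+// the handlers defined in this file; the file name below is hypothetical.
+//
+//   llvm::sys::PrintStackTraceOnErrorSignal(/*DisableCrashReporting=*/false);
+//   llvm::sys::RemoveFileOnSignal("spades_tmp/partial_contigs.fasta");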
+
+
+/***/
+
+// On Darwin, raise sends a signal to the main thread instead of the current
+// thread. This has the unfortunate effect that assert() and abort() will end up
+// bypassing our crash recovery attempts. We work around this for anything in
+// the same linkage unit by just defining our own versions of the assert handler
+// and abort.
+
+#if defined(__APPLE__) && defined(ENABLE_CRASH_OVERRIDES)
+
+#include <signal.h>
+#include <pthread.h>
+
+int raise(int sig) {
+ return pthread_kill(pthread_self(), sig);
+}
+
+void __assert_rtn(const char *func,
+ const char *file,
+ int line,
+ const char *expr) {
+ if (func)
+ fprintf(stderr, "Assertion failed: (%s), function %s, file %s, line %d.\n",
+ expr, func, file, line);
+ else
+ fprintf(stderr, "Assertion failed: (%s), file %s, line %d.\n",
+ expr, file, line);
+ abort();
+}
+
+void abort() {
+ raise(SIGABRT);
+ usleep(1000);
+ __builtin_trap();
+}
+
+#endif
diff --git a/ext/src/llvm/SmallVector.cpp b/ext/src/llvm/SmallVector.cpp
new file mode 100644
index 0000000..b931505
--- /dev/null
+++ b/ext/src/llvm/SmallVector.cpp
@@ -0,0 +1,41 @@
+//===- llvm/ADT/SmallVector.cpp - 'Normally small' vectors ----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the SmallVector class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/SmallVector.h"
+using namespace llvm;
+
+/// grow_pod - This is an implementation of the grow() method which only works
+/// on POD-like datatypes and is out of line to reduce code duplication.
+void SmallVectorBase::grow_pod(void *FirstEl, size_t MinSizeInBytes,
+ size_t TSize) {
+ size_t CurSizeBytes = size_in_bytes();
+ size_t NewCapacityInBytes = 2 * capacity_in_bytes() + TSize; // Always grow.
+ if (NewCapacityInBytes < MinSizeInBytes)
+ NewCapacityInBytes = MinSizeInBytes;
+
+ void *NewElts;
+ if (BeginX == FirstEl) {
+ NewElts = malloc(NewCapacityInBytes);
+
+ // Copy the elements over. No need to run dtors on PODs.
+ memcpy(NewElts, this->BeginX, CurSizeBytes);
+ } else {
+ // If this wasn't grown from the inline copy, grow the allocated space.
+ NewElts = realloc(this->BeginX, NewCapacityInBytes);
+ }
+ assert(NewElts && "Out of memory");
+
+ this->EndX = (char*)NewElts+CurSizeBytes;
+ this->BeginX = NewElts;
+ this->CapacityX = (char*)this->BeginX + NewCapacityInBytes;
+}
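+
+// Illustrative usage sketch (not part of the upstream source): a SmallVector
+// stores its first N elements inline; once that capacity is exceeded,
+// grow()/grow_pod() above moves the storage to the heap.
+//
+//   llvm::SmallVector<int, 4> V;
+//   for (int i = 0; i < 8; ++i)
+//     V.push_back(i);   // the fifth push_back triggers the heap allocation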
diff --git a/ext/src/llvm/SourceMgr.cpp b/ext/src/llvm/SourceMgr.cpp
new file mode 100644
index 0000000..699a61c
--- /dev/null
+++ b/ext/src/llvm/SourceMgr.cpp
@@ -0,0 +1,476 @@
+//===- SourceMgr.cpp - Manager for Simple Source Buffers & Diagnostics ----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the SourceMgr class. This class is used as a simple
+// substrate for diagnostics, #include handling, and other low level things for
+// simple parsers.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/SourceMgr.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/Path.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+static const size_t TabStop = 8;
+
+namespace {
+ struct LineNoCacheTy {
+ unsigned LastQueryBufferID;
+ const char *LastQuery;
+ unsigned LineNoOfQuery;
+ };
+}
+
+static LineNoCacheTy *getCache(void *Ptr) {
+ return (LineNoCacheTy*)Ptr;
+}
+
+
+SourceMgr::~SourceMgr() {
+ // Delete the line # cache if allocated.
+ if (LineNoCacheTy *Cache = getCache(LineNoCache))
+ delete Cache;
+}
+
+unsigned SourceMgr::AddIncludeFile(const std::string &Filename,
+ SMLoc IncludeLoc,
+ std::string &IncludedFile) {
+ IncludedFile = Filename;
+ ErrorOr<std::unique_ptr<MemoryBuffer>> NewBufOrErr =
+ MemoryBuffer::getFile(IncludedFile);
+
+ // If the file didn't exist directly, see if it's in an include path.
+ for (unsigned i = 0, e = IncludeDirectories.size(); i != e && !NewBufOrErr;
+ ++i) {
+ IncludedFile =
+ IncludeDirectories[i] + sys::path::get_separator().data() + Filename;
+ NewBufOrErr = MemoryBuffer::getFile(IncludedFile);
+ }
+
+ if (!NewBufOrErr)
+ return 0;
+
+ return AddNewSourceBuffer(std::move(*NewBufOrErr), IncludeLoc);
+}
+
+unsigned SourceMgr::FindBufferContainingLoc(SMLoc Loc) const {
+ for (unsigned i = 0, e = Buffers.size(); i != e; ++i)
+ if (Loc.getPointer() >= Buffers[i].Buffer->getBufferStart() &&
+ // Use <= here so that a pointer to the null at the end of the buffer
+ // is included as part of the buffer.
+ Loc.getPointer() <= Buffers[i].Buffer->getBufferEnd())
+ return i + 1;
+ return 0;
+}
+
+std::pair<unsigned, unsigned>
+SourceMgr::getLineAndColumn(SMLoc Loc, unsigned BufferID) const {
+ if (!BufferID)
+ BufferID = FindBufferContainingLoc(Loc);
+ assert(BufferID && "Invalid Location!");
+
+ const MemoryBuffer *Buff = getMemoryBuffer(BufferID);
+
+ // Count the number of \n's between the start of the file and the specified
+ // location.
+ unsigned LineNo = 1;
+
+ const char *BufStart = Buff->getBufferStart();
+ const char *Ptr = BufStart;
+
+ // If we have a line number cache, and if the query is to a later point in the
+ // same file, start searching from the last query location. This optimizes
+ // for the case when multiple diagnostics come out of one file in order.
+ if (LineNoCacheTy *Cache = getCache(LineNoCache))
+ if (Cache->LastQueryBufferID == BufferID &&
+ Cache->LastQuery <= Loc.getPointer()) {
+ Ptr = Cache->LastQuery;
+ LineNo = Cache->LineNoOfQuery;
+ }
+
+ // Scan for the location being queried, keeping track of the number of lines
+ // we see.
+ for (; SMLoc::getFromPointer(Ptr) != Loc; ++Ptr)
+ if (*Ptr == '\n') ++LineNo;
+
+ // Allocate the line number cache if it doesn't exist.
+ if (!LineNoCache)
+ LineNoCache = new LineNoCacheTy();
+
+ // Update the line # cache.
+ LineNoCacheTy &Cache = *getCache(LineNoCache);
+ Cache.LastQueryBufferID = BufferID;
+ Cache.LastQuery = Ptr;
+ Cache.LineNoOfQuery = LineNo;
+
+ size_t NewlineOffs = StringRef(BufStart, Ptr-BufStart).find_last_of("\n\r");
+ if (NewlineOffs == StringRef::npos) NewlineOffs = ~(size_t)0;
+ return std::make_pair(LineNo, Ptr-BufStart-NewlineOffs);
+}
+
+void SourceMgr::PrintIncludeStack(SMLoc IncludeLoc, raw_ostream &OS) const {
+ if (IncludeLoc == SMLoc()) return; // Top of stack.
+
+ unsigned CurBuf = FindBufferContainingLoc(IncludeLoc);
+ assert(CurBuf && "Invalid or unspecified location!");
+
+ PrintIncludeStack(getBufferInfo(CurBuf).IncludeLoc, OS);
+
+ OS << "Included from "
+ << getBufferInfo(CurBuf).Buffer->getBufferIdentifier()
+ << ":" << FindLineNumber(IncludeLoc, CurBuf) << ":\n";
+}
+
+
+SMDiagnostic SourceMgr::GetMessage(SMLoc Loc, SourceMgr::DiagKind Kind,
+ const Twine &Msg,
+ ArrayRef<SMRange> Ranges,
+ ArrayRef<SMFixIt> FixIts) const {
+
+ // First thing to do: find the current buffer containing the specified
+ // location to pull out the source line.
+ SmallVector<std::pair<unsigned, unsigned>, 4> ColRanges;
+ std::pair<unsigned, unsigned> LineAndCol;
+ const char *BufferID = "<unknown>";
+ std::string LineStr;
+
+ if (Loc.isValid()) {
+ unsigned CurBuf = FindBufferContainingLoc(Loc);
+ assert(CurBuf && "Invalid or unspecified location!");
+
+ const MemoryBuffer *CurMB = getMemoryBuffer(CurBuf);
+ BufferID = CurMB->getBufferIdentifier();
+
+ // Scan backward to find the start of the line.
+ const char *LineStart = Loc.getPointer();
+ const char *BufStart = CurMB->getBufferStart();
+ while (LineStart != BufStart && LineStart[-1] != '\n' &&
+ LineStart[-1] != '\r')
+ --LineStart;
+
+ // Get the end of the line.
+ const char *LineEnd = Loc.getPointer();
+ const char *BufEnd = CurMB->getBufferEnd();
+ while (LineEnd != BufEnd && LineEnd[0] != '\n' && LineEnd[0] != '\r')
+ ++LineEnd;
+ LineStr = std::string(LineStart, LineEnd);
+
+ // Convert any ranges to column ranges that only intersect the line of the
+ // location.
+ for (unsigned i = 0, e = Ranges.size(); i != e; ++i) {
+ SMRange R = Ranges[i];
+ if (!R.isValid()) continue;
+
+ // If the line doesn't contain any part of the range, then ignore it.
+ if (R.Start.getPointer() > LineEnd || R.End.getPointer() < LineStart)
+ continue;
+
+ // Ignore pieces of the range that go onto other lines.
+ if (R.Start.getPointer() < LineStart)
+ R.Start = SMLoc::getFromPointer(LineStart);
+ if (R.End.getPointer() > LineEnd)
+ R.End = SMLoc::getFromPointer(LineEnd);
+
+ // Translate from SMLoc ranges to column ranges.
+ // FIXME: Handle multibyte characters.
+ ColRanges.push_back(std::make_pair(R.Start.getPointer()-LineStart,
+ R.End.getPointer()-LineStart));
+ }
+
+ LineAndCol = getLineAndColumn(Loc, CurBuf);
+ }
+
+ return SMDiagnostic(*this, Loc, BufferID, LineAndCol.first,
+ LineAndCol.second-1, Kind, Msg.str(),
+ LineStr, ColRanges, FixIts);
+}
+
+void SourceMgr::PrintMessage(raw_ostream &OS, const SMDiagnostic &Diagnostic,
+ bool ShowColors) const {
+ // Report the message with the diagnostic handler if present.
+ if (DiagHandler) {
+ DiagHandler(Diagnostic, DiagContext);
+ return;
+ }
+
+ if (Diagnostic.getLoc().isValid()) {
+ unsigned CurBuf = FindBufferContainingLoc(Diagnostic.getLoc());
+ assert(CurBuf && "Invalid or unspecified location!");
+ PrintIncludeStack(getBufferInfo(CurBuf).IncludeLoc, OS);
+ }
+
+ Diagnostic.print(nullptr, OS, ShowColors);
+}
+
+void SourceMgr::PrintMessage(raw_ostream &OS, SMLoc Loc,
+ SourceMgr::DiagKind Kind,
+ const Twine &Msg, ArrayRef<SMRange> Ranges,
+ ArrayRef<SMFixIt> FixIts, bool ShowColors) const {
+ PrintMessage(OS, GetMessage(Loc, Kind, Msg, Ranges, FixIts), ShowColors);
+}
+
+void SourceMgr::PrintMessage(SMLoc Loc, SourceMgr::DiagKind Kind,
+ const Twine &Msg, ArrayRef<SMRange> Ranges,
+ ArrayRef<SMFixIt> FixIts, bool ShowColors) const {
+ PrintMessage(llvm::errs(), Loc, Kind, Msg, Ranges, FixIts, ShowColors);
+}
+
+//===----------------------------------------------------------------------===//
+// SMDiagnostic Implementation
+//===----------------------------------------------------------------------===//
+
+SMDiagnostic::SMDiagnostic(const SourceMgr &sm, SMLoc L, StringRef FN,
+ int Line, int Col, SourceMgr::DiagKind Kind,
+ StringRef Msg, StringRef LineStr,
+ ArrayRef<std::pair<unsigned,unsigned> > Ranges,
+ ArrayRef<SMFixIt> Hints)
+ : SM(&sm), Loc(L), Filename(FN), LineNo(Line), ColumnNo(Col), Kind(Kind),
+ Message(Msg), LineContents(LineStr), Ranges(Ranges.vec()),
+ FixIts(Hints.begin(), Hints.end()) {
+ std::sort(FixIts.begin(), FixIts.end());
+}
+
+static void buildFixItLine(std::string &CaretLine, std::string &FixItLine,
+ ArrayRef<SMFixIt> FixIts, ArrayRef<char> SourceLine){
+ if (FixIts.empty())
+ return;
+
+ const char *LineStart = SourceLine.begin();
+ const char *LineEnd = SourceLine.end();
+
+ size_t PrevHintEndCol = 0;
+
+ for (ArrayRef<SMFixIt>::iterator I = FixIts.begin(), E = FixIts.end();
+ I != E; ++I) {
+ // If the fixit contains a newline or tab, ignore it.
+ if (I->getText().find_first_of("\n\r\t") != StringRef::npos)
+ continue;
+
+ SMRange R = I->getRange();
+
+ // If the line doesn't contain any part of the range, then ignore it.
+ if (R.Start.getPointer() > LineEnd || R.End.getPointer() < LineStart)
+ continue;
+
+ // Translate from SMLoc to column.
+ // Ignore pieces of the range that go onto other lines.
+ // FIXME: Handle multibyte characters in the source line.
+ unsigned FirstCol;
+ if (R.Start.getPointer() < LineStart)
+ FirstCol = 0;
+ else
+ FirstCol = R.Start.getPointer() - LineStart;
+
+ // If we inserted a long previous hint, push this one forwards, and add
+ // an extra space to show that this is not part of the previous
+ // completion. This is sort of the best we can do when two hints appear
+ // to overlap.
+ //
+ // Note that if this hint is located immediately after the previous
+ // hint, no space will be added, since the location is more important.
+ unsigned HintCol = FirstCol;
+ if (HintCol < PrevHintEndCol)
+ HintCol = PrevHintEndCol + 1;
+
+ // This relies on one byte per column in our fixit hints.
+ unsigned LastColumnModified = HintCol + I->getText().size();
+ if (LastColumnModified > FixItLine.size())
+ FixItLine.resize(LastColumnModified, ' ');
+
+ std::copy(I->getText().begin(), I->getText().end(),
+ FixItLine.begin() + HintCol);
+
+ PrevHintEndCol = LastColumnModified;
+
+ // For replacements, mark the removal range with '~'.
+ // FIXME: Handle multibyte characters in the source line.
+ unsigned LastCol;
+ if (R.End.getPointer() >= LineEnd)
+ LastCol = LineEnd - LineStart;
+ else
+ LastCol = R.End.getPointer() - LineStart;
+
+ std::fill(&CaretLine[FirstCol], &CaretLine[LastCol], '~');
+ }
+}
+
+static void printSourceLine(raw_ostream &S, StringRef LineContents) {
+ // Print out the source line one character at a time, so we can expand tabs.
+ for (unsigned i = 0, e = LineContents.size(), OutCol = 0; i != e; ++i) {
+ if (LineContents[i] != '\t') {
+ S << LineContents[i];
+ ++OutCol;
+ continue;
+ }
+
+ // If we have a tab, emit at least one space, then round up to 8 columns.
+ do {
+ S << ' ';
+ ++OutCol;
+ } while ((OutCol % TabStop) != 0);
+ }
+ S << '\n';
+}
+
+static bool isNonASCII(char c) {
+ return c & 0x80;
+}
+
+void SMDiagnostic::print(const char *ProgName, raw_ostream &S, bool ShowColors,
+ bool ShowKindLabel) const {
+ // Display colors only if OS supports colors.
+ ShowColors &= S.has_colors();
+
+ if (ShowColors)
+ S.changeColor(raw_ostream::SAVEDCOLOR, true);
+
+ if (ProgName && ProgName[0])
+ S << ProgName << ": ";
+
+ if (!Filename.empty()) {
+ if (Filename == "-")
+ S << "<stdin>";
+ else
+ S << Filename;
+
+ if (LineNo != -1) {
+ S << ':' << LineNo;
+ if (ColumnNo != -1)
+ S << ':' << (ColumnNo+1);
+ }
+ S << ": ";
+ }
+
+ if (ShowKindLabel) {
+ switch (Kind) {
+ case SourceMgr::DK_Error:
+ if (ShowColors)
+ S.changeColor(raw_ostream::RED, true);
+ S << "error: ";
+ break;
+ case SourceMgr::DK_Warning:
+ if (ShowColors)
+ S.changeColor(raw_ostream::MAGENTA, true);
+ S << "warning: ";
+ break;
+ case SourceMgr::DK_Note:
+ if (ShowColors)
+ S.changeColor(raw_ostream::BLACK, true);
+ S << "note: ";
+ break;
+ }
+
+ if (ShowColors) {
+ S.resetColor();
+ S.changeColor(raw_ostream::SAVEDCOLOR, true);
+ }
+ }
+
+ S << Message << '\n';
+
+ if (ShowColors)
+ S.resetColor();
+
+ if (LineNo == -1 || ColumnNo == -1)
+ return;
+
+ // FIXME: If there are multibyte or multi-column characters in the source, all
+ // our ranges will be wrong. To do this properly, we'll need a byte-to-column
+ // map like Clang's TextDiagnostic. For now, we'll just handle tabs by
+ // expanding them later, and bail out rather than show incorrect ranges and
+ // misaligned fixits for any other odd characters.
+ if (std::find_if(LineContents.begin(), LineContents.end(), isNonASCII) !=
+ LineContents.end()) {
+ printSourceLine(S, LineContents);
+ return;
+ }
+ size_t NumColumns = LineContents.size();
+
+ // Build the line with the caret and ranges.
+ std::string CaretLine(NumColumns+1, ' ');
+
+ // Expand any ranges.
+ for (unsigned r = 0, e = Ranges.size(); r != e; ++r) {
+ std::pair<unsigned, unsigned> R = Ranges[r];
+ std::fill(&CaretLine[R.first],
+ &CaretLine[std::min((size_t)R.second, CaretLine.size())],
+ '~');
+ }
+
+ // Add any fix-its.
+ // FIXME: Find the beginning of the line properly for multibyte characters.
+ std::string FixItInsertionLine;
+ buildFixItLine(CaretLine, FixItInsertionLine, FixIts,
+ makeArrayRef(Loc.getPointer() - ColumnNo,
+ LineContents.size()));
+
+ // Finally, plop on the caret.
+ if (unsigned(ColumnNo) <= NumColumns)
+ CaretLine[ColumnNo] = '^';
+ else
+ CaretLine[NumColumns] = '^';
+
+ // ... and remove trailing whitespace so the output doesn't wrap for it. We
+ // know that the line isn't completely empty because it has the caret in it at
+ // least.
+ CaretLine.erase(CaretLine.find_last_not_of(' ')+1);
+
+ printSourceLine(S, LineContents);
+
+ if (ShowColors)
+ S.changeColor(raw_ostream::GREEN, true);
+
+ // Print out the caret line, matching tabs in the source line.
+ for (unsigned i = 0, e = CaretLine.size(), OutCol = 0; i != e; ++i) {
+ if (i >= LineContents.size() || LineContents[i] != '\t') {
+ S << CaretLine[i];
+ ++OutCol;
+ continue;
+ }
+
+ // Okay, we have a tab. Insert the appropriate number of characters.
+ do {
+ S << CaretLine[i];
+ ++OutCol;
+ } while ((OutCol % TabStop) != 0);
+ }
+ S << '\n';
+
+ if (ShowColors)
+ S.resetColor();
+
+ // Print out the replacement line, matching tabs in the source line.
+ if (FixItInsertionLine.empty())
+ return;
+
+ for (size_t i = 0, e = FixItInsertionLine.size(), OutCol = 0; i < e; ++i) {
+ if (i >= LineContents.size() || LineContents[i] != '\t') {
+ S << FixItInsertionLine[i];
+ ++OutCol;
+ continue;
+ }
+
+ // Okay, we have a tab. Insert the appropriate number of characters.
+ do {
+ S << FixItInsertionLine[i];
+ // FIXME: This is trying not to break up replacements, but then to re-sync
+ // with the tabs between replacements. This will fail, though, if two
+ // fix-it replacements are exactly adjacent, or if a fix-it contains a
+ // space. Really we should be precomputing column widths, which we'll
+ // need anyway for multibyte chars.
+ if (FixItInsertionLine[i] != ' ')
+ ++i;
+ ++OutCol;
+ } while (((OutCol % TabStop) != 0) && i != e);
+ }
+ S << '\n';
+}
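
Taken together, the pieces above give the familiar clang-style caret diagnostic: find the buffer containing the location, compute line and column (with the one-entry line cache), extract the source line, and print "file:line:col: error: message" followed by the line and a caret. A usage sketch, assuming the SourceMgr/MemoryBuffer declarations added elsewhere in this commit together with their usual default arguments; the buffer contents and identifier are made up:

    #include "llvm/Support/MemoryBuffer.h"
    #include "llvm/Support/SourceMgr.h"

    int main() {
      llvm::SourceMgr SM;
      // Register an in-memory buffer; the identifier acts as the file name.
      auto Buf = llvm::MemoryBuffer::getMemBuffer("k-mer = 127\n", "spades.cfg");
      unsigned ID = SM.AddNewSourceBuffer(std::move(Buf), llvm::SMLoc());

      // Point at column 9 of the only line and emit a caret diagnostic
      // ("spades.cfg:1:9: error: value out of range" plus the source line).
      const char *Start = SM.getMemoryBuffer(ID)->getBufferStart();
      llvm::SMLoc Loc = llvm::SMLoc::getFromPointer(Start + 8);
      SM.PrintMessage(Loc, llvm::SourceMgr::DK_Error, "value out of range");
      return 0;
    }
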
diff --git a/ext/src/llvm/StringMap.cpp b/ext/src/llvm/StringMap.cpp
new file mode 100644
index 0000000..7be9466
--- /dev/null
+++ b/ext/src/llvm/StringMap.cpp
@@ -0,0 +1,245 @@
+//===--- StringMap.cpp - String Hash table map implementation -------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the StringMap class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/StringMap.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Support/Compiler.h"
+#include <cassert>
+using namespace llvm;
+
+StringMapImpl::StringMapImpl(unsigned InitSize, unsigned itemSize) {
+ ItemSize = itemSize;
+
+ // If a size is specified, initialize the table with that many buckets.
+ if (InitSize) {
+ init(InitSize);
+ return;
+ }
+
+ // Otherwise, initialize it with zero buckets to avoid the allocation.
+ TheTable = nullptr;
+ NumBuckets = 0;
+ NumItems = 0;
+ NumTombstones = 0;
+}
+
+void StringMapImpl::init(unsigned InitSize) {
+ assert((InitSize & (InitSize-1)) == 0 &&
+ "Init Size must be a power of 2 or zero!");
+ NumBuckets = InitSize ? InitSize : 16;
+ NumItems = 0;
+ NumTombstones = 0;
+
+ TheTable = (StringMapEntryBase **)calloc(NumBuckets+1,
+ sizeof(StringMapEntryBase **) +
+ sizeof(unsigned));
+
+ // Allocate one extra bucket, set it to look filled so the iterators stop at
+ // end.
+ TheTable[NumBuckets] = (StringMapEntryBase*)2;
+}
+
+
+/// LookupBucketFor - Look up the bucket that the specified string should end
+/// up in. If it already exists as a key in the map, the Item pointer for the
+/// specified bucket will be non-null. Otherwise, it will be null. In either
+/// case, the FullHashValue field of the bucket will be set to the hash value
+/// of the string.
+unsigned StringMapImpl::LookupBucketFor(StringRef Name) {
+ unsigned HTSize = NumBuckets;
+ if (HTSize == 0) { // Hash table unallocated so far?
+ init(16);
+ HTSize = NumBuckets;
+ }
+ unsigned FullHashValue = HashString(Name);
+ unsigned BucketNo = FullHashValue & (HTSize-1);
+ unsigned *HashTable = (unsigned *)(TheTable + NumBuckets + 1);
+
+ unsigned ProbeAmt = 1;
+ int FirstTombstone = -1;
+ while (1) {
+ StringMapEntryBase *BucketItem = TheTable[BucketNo];
+ // If we found an empty bucket, this key isn't in the table yet, return it.
+ if (LLVM_LIKELY(!BucketItem)) {
+ // If we found a tombstone, we want to reuse the tombstone instead of an
+ // empty bucket. This reduces probing.
+ if (FirstTombstone != -1) {
+ HashTable[FirstTombstone] = FullHashValue;
+ return FirstTombstone;
+ }
+
+ HashTable[BucketNo] = FullHashValue;
+ return BucketNo;
+ }
+
+ if (BucketItem == getTombstoneVal()) {
+ // Skip over tombstones. However, remember the first one we see.
+ if (FirstTombstone == -1) FirstTombstone = BucketNo;
+ } else if (LLVM_LIKELY(HashTable[BucketNo] == FullHashValue)) {
+ // If the full hash value matches, check deeply for a match. The common
+ // case here is that we are only looking at the buckets (for item info
+ // being non-null and for the full hash value) not at the items. This
+ // is important for cache locality.
+
+ // Do the comparison like this because Name isn't necessarily
+ // null-terminated!
+ char *ItemStr = (char*)BucketItem+ItemSize;
+ if (Name == StringRef(ItemStr, BucketItem->getKeyLength())) {
+ // We found a match!
+ return BucketNo;
+ }
+ }
+
+ // Okay, we didn't find the item. Probe to the next bucket.
+ BucketNo = (BucketNo+ProbeAmt) & (HTSize-1);
+
+ // Use quadratic probing, it has fewer clumping artifacts than linear
+ // probing and has good cache behavior in the common case.
+ ++ProbeAmt;
+ }
+}
+
+
+/// FindKey - Look up the bucket that contains the specified key. If it exists
+/// in the map, return the bucket number of the key. Otherwise return -1.
+/// This does not modify the map.
+int StringMapImpl::FindKey(StringRef Key) const {
+ unsigned HTSize = NumBuckets;
+ if (HTSize == 0) return -1; // Really empty table?
+ unsigned FullHashValue = HashString(Key);
+ unsigned BucketNo = FullHashValue & (HTSize-1);
+ unsigned *HashTable = (unsigned *)(TheTable + NumBuckets + 1);
+
+ unsigned ProbeAmt = 1;
+ while (1) {
+ StringMapEntryBase *BucketItem = TheTable[BucketNo];
+ // If we found an empty bucket, this key isn't in the table yet, return.
+ if (LLVM_LIKELY(!BucketItem))
+ return -1;
+
+ if (BucketItem == getTombstoneVal()) {
+ // Ignore tombstones.
+ } else if (LLVM_LIKELY(HashTable[BucketNo] == FullHashValue)) {
+ // If the full hash value matches, check deeply for a match. The common
+ // case here is that we are only looking at the buckets (for item info
+ // being non-null and for the full hash value) not at the items. This
+ // is important for cache locality.
+
+ // Do the comparison like this because NameStart isn't necessarily
+ // null-terminated!
+ char *ItemStr = (char*)BucketItem+ItemSize;
+ if (Key == StringRef(ItemStr, BucketItem->getKeyLength())) {
+ // We found a match!
+ return BucketNo;
+ }
+ }
+
+ // Okay, we didn't find the item. Probe to the next bucket.
+ BucketNo = (BucketNo+ProbeAmt) & (HTSize-1);
+
+ // Use quadratic probing, it has fewer clumping artifacts than linear
+ // probing and has good cache behavior in the common case.
+ ++ProbeAmt;
+ }
+}
+
+/// RemoveKey - Remove the specified StringMapEntry from the table, but do not
+/// delete it. This aborts if the value isn't in the table.
+void StringMapImpl::RemoveKey(StringMapEntryBase *V) {
+ const char *VStr = (char*)V + ItemSize;
+ StringMapEntryBase *V2 = RemoveKey(StringRef(VStr, V->getKeyLength()));
+ (void)V2;
+ assert(V == V2 && "Didn't find key?");
+}
+
+/// RemoveKey - Remove the StringMapEntry for the specified key from the
+/// table, returning it. If the key is not in the table, this returns null.
+StringMapEntryBase *StringMapImpl::RemoveKey(StringRef Key) {
+ int Bucket = FindKey(Key);
+ if (Bucket == -1) return nullptr;
+
+ StringMapEntryBase *Result = TheTable[Bucket];
+ TheTable[Bucket] = getTombstoneVal();
+ --NumItems;
+ ++NumTombstones;
+ assert(NumItems + NumTombstones <= NumBuckets);
+
+ return Result;
+}
+
+
+
+/// RehashTable - Grow the table, redistributing values into the buckets with
+/// the appropriate mod-of-hashtable-size.
+unsigned StringMapImpl::RehashTable(unsigned BucketNo) {
+ unsigned NewSize;
+ unsigned *HashTable = (unsigned *)(TheTable + NumBuckets + 1);
+
+ // If the hash table is now more than 3/4 full, or if fewer than 1/8 of
+ // the buckets are empty (meaning that many are filled with tombstones),
+ // grow/rehash the table.
+ if (LLVM_UNLIKELY(NumItems * 4 > NumBuckets * 3)) {
+ NewSize = NumBuckets*2;
+ } else if (LLVM_UNLIKELY(NumBuckets - (NumItems + NumTombstones) <=
+ NumBuckets / 8)) {
+ NewSize = NumBuckets;
+ } else {
+ return BucketNo;
+ }
+
+ unsigned NewBucketNo = BucketNo;
+ // Allocate one extra bucket which will always be non-empty. This allows the
+ // iterators to stop at end.
+ StringMapEntryBase **NewTableArray =
+ (StringMapEntryBase **)calloc(NewSize+1, sizeof(StringMapEntryBase *) +
+ sizeof(unsigned));
+ unsigned *NewHashArray = (unsigned *)(NewTableArray + NewSize + 1);
+ NewTableArray[NewSize] = (StringMapEntryBase*)2;
+
+ // Rehash all the items into their new buckets. Luckily :) we already have
+ // the hash values available, so we don't have to rehash any strings.
+ for (unsigned I = 0, E = NumBuckets; I != E; ++I) {
+ StringMapEntryBase *Bucket = TheTable[I];
+ if (Bucket && Bucket != getTombstoneVal()) {
+ // Fast case, bucket available.
+ unsigned FullHash = HashTable[I];
+ unsigned NewBucket = FullHash & (NewSize-1);
+ if (!NewTableArray[NewBucket]) {
+ NewTableArray[FullHash & (NewSize-1)] = Bucket;
+ NewHashArray[FullHash & (NewSize-1)] = FullHash;
+ if (I == BucketNo)
+ NewBucketNo = NewBucket;
+ continue;
+ }
+
+ // Otherwise probe for a spot.
+ unsigned ProbeSize = 1;
+ do {
+ NewBucket = (NewBucket + ProbeSize++) & (NewSize-1);
+ } while (NewTableArray[NewBucket]);
+
+ // Finally found a slot. Fill it in.
+ NewTableArray[NewBucket] = Bucket;
+ NewHashArray[NewBucket] = FullHash;
+ if (I == BucketNo)
+ NewBucketNo = NewBucket;
+ }
+ }
+
+ free(TheTable);
+
+ TheTable = NewTableArray;
+ NumBuckets = NewSize;
+ NumTombstones = 0;
+ return NewBucketNo;
+}
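
StringMapImpl keeps the buckets and a parallel array of cached full hash values in one calloc'd block, probes quadratically, reuses tombstones left by RemoveKey, and rehashes once the table is more than 3/4 full or dominated by tombstones. Through the typed StringMap wrapper declared in the header added by this commit, that machinery is used roughly like this (a sketch with made-up keys, not code from the diff):

    #include "llvm/ADT/StringMap.h"
    #include <cstdio>

    int main() {
      llvm::StringMap<int> Coverage;
      Coverage["NODE_1"] = 29;              // insert: LookupBucketFor + fill
      Coverage["NODE_2"] = 113;

      auto It = Coverage.find("NODE_2");    // lookup: FindKey compares the
      if (It != Coverage.end())             // cached hash before the key bytes
        std::printf("%s -> %d\n", It->getKey().str().c_str(), It->getValue());

      Coverage.erase("NODE_1");             // leaves a tombstone bucket behind
      std::printf("entries: %u\n", (unsigned)Coverage.size());
      return 0;
    }
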
diff --git a/ext/src/llvm/StringRef.cpp b/ext/src/llvm/StringRef.cpp
new file mode 100644
index 0000000..643100c
--- /dev/null
+++ b/ext/src/llvm/StringRef.cpp
@@ -0,0 +1,445 @@
+//===-- StringRef.cpp - Lightweight String References ---------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Hashing.h"
+#include "llvm/ADT/edit_distance.h"
+#include <climits>
+#include <bitset>
+
+using namespace llvm;
+
+const size_t StringRef::npos;
+
+static char ascii_tolower(char x) {
+ if (x >= 'A' && x <= 'Z')
+ return x - 'A' + 'a';
+ return x;
+}
+
+static char ascii_toupper(char x) {
+ if (x >= 'a' && x <= 'z')
+ return x - 'a' + 'A';
+ return x;
+}
+
+static bool ascii_isdigit(char x) {
+ return x >= '0' && x <= '9';
+}
+
+// strncasecmp() is not available on non-POSIX systems, so define an
+// alternative function here.
+static int ascii_strncasecmp(const char *LHS, const char *RHS, size_t Length) {
+ for (size_t I = 0; I < Length; ++I) {
+ unsigned char LHC = ascii_tolower(LHS[I]);
+ unsigned char RHC = ascii_tolower(RHS[I]);
+ if (LHC != RHC)
+ return LHC < RHC ? -1 : 1;
+ }
+ return 0;
+}
+
+/// compare_lower - Compare strings, ignoring case.
+int StringRef::compare_lower(StringRef RHS) const {
+ if (int Res = ascii_strncasecmp(Data, RHS.Data, std::min(Length, RHS.Length)))
+ return Res;
+ if (Length == RHS.Length)
+ return 0;
+ return Length < RHS.Length ? -1 : 1;
+}
+
+/// Check if this string starts with the given \p Prefix, ignoring case.
+bool StringRef::startswith_lower(StringRef Prefix) const {
+ return Length >= Prefix.Length &&
+ ascii_strncasecmp(Data, Prefix.Data, Prefix.Length) == 0;
+}
+
+/// Check if this string ends with the given \p Suffix, ignoring case.
+bool StringRef::endswith_lower(StringRef Suffix) const {
+ return Length >= Suffix.Length &&
+ ascii_strncasecmp(end() - Suffix.Length, Suffix.Data, Suffix.Length) == 0;
+}
+
+/// compare_numeric - Compare strings, handle embedded numbers.
+int StringRef::compare_numeric(StringRef RHS) const {
+ for (size_t I = 0, E = std::min(Length, RHS.Length); I != E; ++I) {
+ // Check for sequences of digits.
+ if (ascii_isdigit(Data[I]) && ascii_isdigit(RHS.Data[I])) {
+ // The longer sequence of numbers is considered larger.
+ // This doesn't really handle prefixed zeros well.
+ size_t J;
+ for (J = I + 1; J != E + 1; ++J) {
+ bool ld = J < Length && ascii_isdigit(Data[J]);
+ bool rd = J < RHS.Length && ascii_isdigit(RHS.Data[J]);
+ if (ld != rd)
+ return rd ? -1 : 1;
+ if (!rd)
+ break;
+ }
+ // The two number sequences have the same length (J-I), just memcmp them.
+ if (int Res = compareMemory(Data + I, RHS.Data + I, J - I))
+ return Res < 0 ? -1 : 1;
+ // Identical number sequences, continue search after the numbers.
+ I = J - 1;
+ continue;
+ }
+ if (Data[I] != RHS.Data[I])
+ return (unsigned char)Data[I] < (unsigned char)RHS.Data[I] ? -1 : 1;
+ }
+ if (Length == RHS.Length)
+ return 0;
+ return Length < RHS.Length ? -1 : 1;
+}
+
+// Compute the edit distance between the two given strings.
+unsigned StringRef::edit_distance(llvm::StringRef Other,
+ bool AllowReplacements,
+ unsigned MaxEditDistance) const {
+ return llvm::ComputeEditDistance(
+ makeArrayRef(data(), size()),
+ makeArrayRef(Other.data(), Other.size()),
+ AllowReplacements, MaxEditDistance);
+}
+
+//===----------------------------------------------------------------------===//
+// String Operations
+//===----------------------------------------------------------------------===//
+
+std::string StringRef::lower() const {
+ std::string Result(size(), char());
+ for (size_type i = 0, e = size(); i != e; ++i) {
+ Result[i] = ascii_tolower(Data[i]);
+ }
+ return Result;
+}
+
+std::string StringRef::upper() const {
+ std::string Result(size(), char());
+ for (size_type i = 0, e = size(); i != e; ++i) {
+ Result[i] = ascii_toupper(Data[i]);
+ }
+ return Result;
+}
+
+//===----------------------------------------------------------------------===//
+// String Searching
+//===----------------------------------------------------------------------===//
+
+
+/// find - Search for the first string \arg Str in the string.
+///
+/// \return - The index of the first occurrence of \arg Str, or npos if not
+/// found.
+size_t StringRef::find(StringRef Str, size_t From) const {
+ if (From > Length)
+ return npos;
+
+ const char *Needle = Str.data();
+ size_t N = Str.size();
+ if (N == 0)
+ return From;
+
+ size_t Size = Length - From;
+ if (Size < N)
+ return npos;
+
+ const char *Start = Data + From;
+ const char *Stop = Start + (Size - N + 1);
+
+ // For short haystacks or unsupported needles, fall back to the naive algorithm
+ if (Size < 16 || N > 255) {
+ do {
+ if (std::memcmp(Start, Needle, N) == 0)
+ return Start - Data;
+ ++Start;
+ } while (Start < Stop);
+ return npos;
+ }
+
+ // Build the bad char heuristic table, with uint8_t to reduce cache thrashing.
+ uint8_t BadCharSkip[256];
+ std::memset(BadCharSkip, N, 256);
+ for (unsigned i = 0; i != N-1; ++i)
+ BadCharSkip[(uint8_t)Str[i]] = N-1-i;
+
+ do {
+ if (std::memcmp(Start, Needle, N) == 0)
+ return Start - Data;
+
+ // Otherwise skip the appropriate number of bytes.
+ Start += BadCharSkip[(uint8_t)Start[N-1]];
+ } while (Start < Stop);
+
+ return npos;
+}
+
+/// rfind - Search for the last string \arg Str in the string.
+///
+/// \return - The index of the last occurrence of \arg Str, or npos if not
+/// found.
+size_t StringRef::rfind(StringRef Str) const {
+ size_t N = Str.size();
+ if (N > Length)
+ return npos;
+ for (size_t i = Length - N + 1, e = 0; i != e;) {
+ --i;
+ if (substr(i, N).equals(Str))
+ return i;
+ }
+ return npos;
+}
+
+/// find_first_of - Find the first character in the string that is in \arg
+/// Chars, or npos if not found.
+///
+/// Note: O(size() + Chars.size())
+StringRef::size_type StringRef::find_first_of(StringRef Chars,
+ size_t From) const {
+ std::bitset<1 << CHAR_BIT> CharBits;
+ for (size_type i = 0; i != Chars.size(); ++i)
+ CharBits.set((unsigned char)Chars[i]);
+
+ for (size_type i = std::min(From, Length), e = Length; i != e; ++i)
+ if (CharBits.test((unsigned char)Data[i]))
+ return i;
+ return npos;
+}
+
+/// find_first_not_of - Find the first character in the string that is not
+/// \arg C or npos if not found.
+StringRef::size_type StringRef::find_first_not_of(char C, size_t From) const {
+ for (size_type i = std::min(From, Length), e = Length; i != e; ++i)
+ if (Data[i] != C)
+ return i;
+ return npos;
+}
+
+/// find_first_not_of - Find the first character in the string that is not
+/// in the string \arg Chars, or npos if not found.
+///
+/// Note: O(size() + Chars.size())
+StringRef::size_type StringRef::find_first_not_of(StringRef Chars,
+ size_t From) const {
+ std::bitset<1 << CHAR_BIT> CharBits;
+ for (size_type i = 0; i != Chars.size(); ++i)
+ CharBits.set((unsigned char)Chars[i]);
+
+ for (size_type i = std::min(From, Length), e = Length; i != e; ++i)
+ if (!CharBits.test((unsigned char)Data[i]))
+ return i;
+ return npos;
+}
+
+/// find_last_of - Find the last character in the string that is in \arg C,
+/// or npos if not found.
+///
+/// Note: O(size() + Chars.size())
+StringRef::size_type StringRef::find_last_of(StringRef Chars,
+ size_t From) const {
+ std::bitset<1 << CHAR_BIT> CharBits;
+ for (size_type i = 0; i != Chars.size(); ++i)
+ CharBits.set((unsigned char)Chars[i]);
+
+ for (size_type i = std::min(From, Length) - 1, e = -1; i != e; --i)
+ if (CharBits.test((unsigned char)Data[i]))
+ return i;
+ return npos;
+}
+
+/// find_last_not_of - Find the last character in the string that is not
+/// \arg C, or npos if not found.
+StringRef::size_type StringRef::find_last_not_of(char C, size_t From) const {
+ for (size_type i = std::min(From, Length) - 1, e = -1; i != e; --i)
+ if (Data[i] != C)
+ return i;
+ return npos;
+}
+
+/// find_last_not_of - Find the last character in the string that is not in
+/// \arg Chars, or npos if not found.
+///
+/// Note: O(size() + Chars.size())
+StringRef::size_type StringRef::find_last_not_of(StringRef Chars,
+ size_t From) const {
+ std::bitset<1 << CHAR_BIT> CharBits;
+ for (size_type i = 0, e = Chars.size(); i != e; ++i)
+ CharBits.set((unsigned char)Chars[i]);
+
+ for (size_type i = std::min(From, Length) - 1, e = -1; i != e; --i)
+ if (!CharBits.test((unsigned char)Data[i]))
+ return i;
+ return npos;
+}
+
+void StringRef::split(SmallVectorImpl<StringRef> &A,
+ StringRef Separator, int MaxSplit,
+ bool KeepEmpty) const {
+ StringRef S = *this;
+
+ // Count down from MaxSplit. When MaxSplit is -1, this will just split
+ // "forever". This doesn't support splitting more than 2^31 times
+ // intentionally; if we ever want that we can make MaxSplit a 64-bit integer
+ // but that seems unlikely to be useful.
+ while (MaxSplit-- != 0) {
+ size_t Idx = S.find(Separator);
+ if (Idx == npos)
+ break;
+
+ // Push this split.
+ if (KeepEmpty || Idx > 0)
+ A.push_back(S.slice(0, Idx));
+
+ // Jump forward.
+ S = S.slice(Idx + Separator.size(), npos);
+ }
+
+ // Push the tail.
+ if (KeepEmpty || !S.empty())
+ A.push_back(S);
+}
+
+void StringRef::split(SmallVectorImpl<StringRef> &A, char Separator,
+ int MaxSplit, bool KeepEmpty) const {
+ StringRef S = *this;
+
+ // Count down from MaxSplit. When MaxSplit is -1, this will just split
+ // "forever". This doesn't support splitting more than 2^31 times
+ // intentionally; if we ever want that we can make MaxSplit a 64-bit integer
+ // but that seems unlikely to be useful.
+ while (MaxSplit-- != 0) {
+ size_t Idx = S.find(Separator);
+ if (Idx == npos)
+ break;
+
+ // Push this split.
+ if (KeepEmpty || Idx > 0)
+ A.push_back(S.slice(0, Idx));
+
+ // Jump forward.
+ S = S.slice(Idx + 1, npos);
+ }
+
+ // Push the tail.
+ if (KeepEmpty || !S.empty())
+ A.push_back(S);
+}
+
+//===----------------------------------------------------------------------===//
+// Helpful Algorithms
+//===----------------------------------------------------------------------===//
+
+/// count - Return the number of non-overlapped occurrences of \arg Str in
+/// the string.
+size_t StringRef::count(StringRef Str) const {
+ size_t Count = 0;
+ size_t N = Str.size();
+ if (N > Length)
+ return 0;
+ for (size_t i = 0, e = Length - N + 1; i != e; ++i)
+ if (substr(i, N).equals(Str))
+ ++Count;
+ return Count;
+}
+
+static unsigned GetAutoSenseRadix(StringRef &Str) {
+ if (Str.startswith("0x")) {
+ Str = Str.substr(2);
+ return 16;
+ }
+
+ if (Str.startswith("0b")) {
+ Str = Str.substr(2);
+ return 2;
+ }
+
+ if (Str.startswith("0o")) {
+ Str = Str.substr(2);
+ return 8;
+ }
+
+ if (Str.startswith("0"))
+ return 8;
+
+ return 10;
+}
+
+
+/// GetAsUnsignedInteger - Workhorse method that converts an integer character
+/// sequence of radix up to 36 to an unsigned long long value.
+bool llvm::getAsUnsignedInteger(StringRef Str, unsigned Radix,
+ unsigned long long &Result) {
+ // Autosense radix if not specified.
+ if (Radix == 0)
+ Radix = GetAutoSenseRadix(Str);
+
+ // Empty strings (after the radix autosense) are invalid.
+ if (Str.empty()) return true;
+
+ // Parse all the bytes of the string given this radix. Watch for overflow.
+ Result = 0;
+ while (!Str.empty()) {
+ unsigned CharVal;
+ if (Str[0] >= '0' && Str[0] <= '9')
+ CharVal = Str[0]-'0';
+ else if (Str[0] >= 'a' && Str[0] <= 'z')
+ CharVal = Str[0]-'a'+10;
+ else if (Str[0] >= 'A' && Str[0] <= 'Z')
+ CharVal = Str[0]-'A'+10;
+ else
+ return true;
+
+ // If the parsed value is larger than the integer radix, the string is
+ // invalid.
+ if (CharVal >= Radix)
+ return true;
+
+ // Add in this character.
+ unsigned long long PrevResult = Result;
+ Result = Result*Radix+CharVal;
+
+ // Check for overflow by shifting back and seeing if bits were lost.
+ if (Result/Radix < PrevResult)
+ return true;
+
+ Str = Str.substr(1);
+ }
+
+ return false;
+}
+
+bool llvm::getAsSignedInteger(StringRef Str, unsigned Radix,
+ long long &Result) {
+ unsigned long long ULLVal;
+
+ // Handle positive strings first.
+ if (Str.empty() || Str.front() != '-') {
+ if (getAsUnsignedInteger(Str, Radix, ULLVal) ||
+ // Check for value so large it overflows a signed value.
+ (long long)ULLVal < 0)
+ return true;
+ Result = ULLVal;
+ return false;
+ }
+
+ // Get the positive part of the value.
+ if (getAsUnsignedInteger(Str.substr(1), Radix, ULLVal) ||
+ // Reject values so large they'd overflow as negative signed, but allow
+ // "-0". This negates the unsigned so that the negative isn't undefined
+ // on signed overflow.
+ (long long)-ULLVal > 0)
+ return true;
+
+ Result = -ULLVal;
+ return false;
+}
+
+// Implementation of StringRef hashing.
+hash_code llvm::hash_value(StringRef S) {
+ return hash_combine_range(S.begin(), S.end());
+}
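
Besides searching (find() switches from the naive scan to a bad-character skip table on longer haystacks), the file supplies splitting and number parsing on top of the same non-owning StringRef view. A small sketch of the calls a consumer would make, with made-up contig names:

    #include "llvm/ADT/SmallVector.h"
    #include "llvm/ADT/StringRef.h"
    #include <cstdio>

    int main() {
      llvm::StringRef Name("NODE_1_length_113_cov_29.5");

      llvm::SmallVector<llvm::StringRef, 8> Parts;
      Name.split(Parts, '_');                       // applies find() repeatedly

      unsigned long long Length = 0;
      // getAsInteger() forwards to getAsUnsignedInteger(); radix 0 would
      // autosense 0x/0b/0o prefixes via GetAutoSenseRadix().
      if (!Parts[3].getAsInteger(10, Length))
        std::printf("length = %llu\n", Length);

      // compare_numeric() treats embedded digit runs as numbers, so
      // "scaffold2" sorts before "scaffold10".
      std::printf("%d\n", llvm::StringRef("scaffold2").compare_numeric("scaffold10"));
      return 0;
    }
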
diff --git a/ext/src/llvm/Twine.cpp b/ext/src/llvm/Twine.cpp
new file mode 100644
index 0000000..9b199bd
--- /dev/null
+++ b/ext/src/llvm/Twine.cpp
@@ -0,0 +1,162 @@
+//===-- Twine.cpp - Fast Temporary String Concatenation -------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/Twine.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+std::string Twine::str() const {
+ // If we're storing only a std::string, just return it.
+ if (LHSKind == StdStringKind && RHSKind == EmptyKind)
+ return *LHS.stdString;
+
+ // Otherwise, flatten and copy the contents first.
+ SmallString<256> Vec;
+ return toStringRef(Vec).str();
+}
+
+void Twine::toVector(SmallVectorImpl<char> &Out) const {
+ raw_svector_ostream OS(Out);
+ print(OS);
+}
+
+StringRef Twine::toNullTerminatedStringRef(SmallVectorImpl<char> &Out) const {
+ if (isUnary()) {
+ switch (getLHSKind()) {
+ case CStringKind:
+ // Already null terminated, yay!
+ return StringRef(LHS.cString);
+ case StdStringKind: {
+ const std::string *str = LHS.stdString;
+ return StringRef(str->c_str(), str->size());
+ }
+ default:
+ break;
+ }
+ }
+ toVector(Out);
+ Out.push_back(0);
+ Out.pop_back();
+ return StringRef(Out.data(), Out.size());
+}
+
+void Twine::printOneChild(raw_ostream &OS, Child Ptr,
+ NodeKind Kind) const {
+ switch (Kind) {
+ case Twine::NullKind: break;
+ case Twine::EmptyKind: break;
+ case Twine::TwineKind:
+ Ptr.twine->print(OS);
+ break;
+ case Twine::CStringKind:
+ OS << Ptr.cString;
+ break;
+ case Twine::StdStringKind:
+ OS << *Ptr.stdString;
+ break;
+ case Twine::StringRefKind:
+ OS << *Ptr.stringRef;
+ break;
+ case Twine::SmallStringKind:
+ OS << *Ptr.smallString;
+ break;
+ case Twine::CharKind:
+ OS << Ptr.character;
+ break;
+ case Twine::DecUIKind:
+ OS << Ptr.decUI;
+ break;
+ case Twine::DecIKind:
+ OS << Ptr.decI;
+ break;
+ case Twine::DecULKind:
+ OS << *Ptr.decUL;
+ break;
+ case Twine::DecLKind:
+ OS << *Ptr.decL;
+ break;
+ case Twine::DecULLKind:
+ OS << *Ptr.decULL;
+ break;
+ case Twine::DecLLKind:
+ OS << *Ptr.decLL;
+ break;
+ case Twine::UHexKind:
+ OS.write_hex(*Ptr.uHex);
+ break;
+ }
+}
+
+void Twine::printOneChildRepr(raw_ostream &OS, Child Ptr,
+ NodeKind Kind) const {
+ switch (Kind) {
+ case Twine::NullKind:
+ OS << "null"; break;
+ case Twine::EmptyKind:
+ OS << "empty"; break;
+ case Twine::TwineKind:
+ OS << "rope:";
+ Ptr.twine->printRepr(OS);
+ break;
+ case Twine::CStringKind:
+ OS << "cstring:\""
+ << Ptr.cString << "\"";
+ break;
+ case Twine::StdStringKind:
+ OS << "std::string:\""
+ << Ptr.stdString << "\"";
+ break;
+ case Twine::StringRefKind:
+ OS << "stringref:\""
+ << Ptr.stringRef << "\"";
+ break;
+ case Twine::SmallStringKind:
+ OS << "smallstring:\"" << *Ptr.smallString << "\"";
+ break;
+ case Twine::CharKind:
+ OS << "char:\"" << Ptr.character << "\"";
+ break;
+ case Twine::DecUIKind:
+ OS << "decUI:\"" << Ptr.decUI << "\"";
+ break;
+ case Twine::DecIKind:
+ OS << "decI:\"" << Ptr.decI << "\"";
+ break;
+ case Twine::DecULKind:
+ OS << "decUL:\"" << *Ptr.decUL << "\"";
+ break;
+ case Twine::DecLKind:
+ OS << "decL:\"" << *Ptr.decL << "\"";
+ break;
+ case Twine::DecULLKind:
+ OS << "decULL:\"" << *Ptr.decULL << "\"";
+ break;
+ case Twine::DecLLKind:
+ OS << "decLL:\"" << *Ptr.decLL << "\"";
+ break;
+ case Twine::UHexKind:
+ OS << "uhex:\"" << Ptr.uHex << "\"";
+ break;
+ }
+}
+
+void Twine::print(raw_ostream &OS) const {
+ printOneChild(OS, LHS, getLHSKind());
+ printOneChild(OS, RHS, getRHSKind());
+}
+
+void Twine::printRepr(raw_ostream &OS) const {
+ OS << "(Twine ";
+ printOneChildRepr(OS, LHS, getLHSKind());
+ OS << " ";
+ printOneChildRepr(OS, RHS, getRHSKind());
+ OS << ")";
+}
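
A Twine is a lazy, two-child concatenation tree: operator+ only records pointers and kinds, and nothing is copied until print()/str()/toStringRef() walks the children as printOneChild() does above. Because the nodes reference their operands, a Twine should only live for the duration of a single call, which the sketch below follows (illustrative names, assuming the Twine/SmallString headers added by this commit):

    #include "llvm/ADT/SmallString.h"
    #include "llvm/ADT/Twine.h"
    #include <cstdio>

    // Accept the Twine by const reference and flatten it immediately.
    static void report(const llvm::Twine &Msg) {
      llvm::SmallString<64> Storage;                  // scratch for flattening
      llvm::StringRef Flat = Msg.toStringRef(Storage);
      std::printf("%s\n", Flat.str().c_str());
    }

    int main() {
      int K = 55;
      // Builds a small expression tree; no temporary std::string is created.
      report("k = " + llvm::Twine(K) + " selected for assembly");
      return 0;
    }
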
diff --git a/ext/src/llvm/Unix.h b/ext/src/llvm/Unix.h
new file mode 100644
index 0000000..e7dac6e
--- /dev/null
+++ b/ext/src/llvm/Unix.h
@@ -0,0 +1,57 @@
+//===- llvm/Support/Unix/Unix.h - Common Unix Include File -------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines things specific to Unix implementations.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_SUPPORT_UNIX_UNIX_H
+#define LLVM_LIB_SUPPORT_UNIX_UNIX_H
+
+//===----------------------------------------------------------------------===//
+//=== WARNING: Implementation here must contain only generic UNIX code that
+//=== is guaranteed to work on all UNIX variants.
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/Errno.h"
+#include <algorithm>
+#include <assert.h>
+#include <cerrno>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <string>
+#include <sys/types.h>
+#include <sys/wait.h>
+
+#include <unistd.h>
+#include <sys/param.h>
+#include <sys/time.h>
+#include <time.h>
+
+#include <dlfcn.h>
+
+/// This function builds an error message into \p ErrMsg using the \p prefix
+/// string and the Unix error number given by \p errnum. If errnum is -1 (the
+/// default), the value of errno is used.
+/// @brief Make an error message
+///
+/// If the error number can be converted to a string, it will be
+/// separated from prefix by ": ".
+static inline bool MakeErrMsg(
+ std::string* ErrMsg, const std::string& prefix, int errnum = -1) {
+ if (!ErrMsg)
+ return true;
+ if (errnum == -1)
+ errnum = errno;
+ *ErrMsg = prefix + ": " + llvm::sys::StrError(errnum);
+ return true;
+}
+
+#endif
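
MakeErrMsg is the shared way the Unix-specific sources turn an errno value into a "prefix: strerror text" string, defaulting to the current errno. A sketch of a typical call site; the include path, file names, and openDataset helper are illustrative, and this only builds inside the same tree as the support sources imported by this commit:

    #include <cerrno>
    #include <cstdio>
    #include <fcntl.h>
    #include <string>
    #include "Unix.h"               // the header added above (path assumed)

    // Returns true on error with *ErrMsg filled in -- the convention used
    // throughout these support sources.
    static bool openDataset(const char *Path, int &FD, std::string *ErrMsg) {
      FD = ::open(Path, O_RDONLY);
      if (FD < 0)                    // errno was set by open()
        return MakeErrMsg(ErrMsg, std::string("cannot open ") + Path);
      return false;
    }

    int main() {
      int FD = -1;
      std::string Err;
      if (openDataset("/no/such/reads.fastq", FD, &Err))
        std::fprintf(stderr, "%s\n", Err.c_str());
      return 0;
    }
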
diff --git a/ext/src/llvm/YAMLParser.cpp b/ext/src/llvm/YAMLParser.cpp
new file mode 100644
index 0000000..c4384ca
--- /dev/null
+++ b/ext/src/llvm/YAMLParser.cpp
@@ -0,0 +1,2445 @@
+//===--- YAMLParser.cpp - Simple YAML parser ------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a YAML parser.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/YAMLParser.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/ADT/ilist.h"
+#include "llvm/ADT/ilist_node.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/SourceMgr.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+using namespace yaml;
+
+enum UnicodeEncodingForm {
+ UEF_UTF32_LE, ///< UTF-32 Little Endian
+ UEF_UTF32_BE, ///< UTF-32 Big Endian
+ UEF_UTF16_LE, ///< UTF-16 Little Endian
+ UEF_UTF16_BE, ///< UTF-16 Big Endian
+ UEF_UTF8, ///< UTF-8 or ascii.
+ UEF_Unknown ///< Not a valid Unicode encoding.
+};
+
+/// EncodingInfo - Holds the encoding type and length of the byte order mark if
+/// it exists. Length is in {0, 2, 3, 4}.
+typedef std::pair<UnicodeEncodingForm, unsigned> EncodingInfo;
+
+/// getUnicodeEncoding - Reads up to the first 4 bytes to determine the Unicode
+/// encoding form of \a Input.
+///
+/// @param Input A string of length 0 or more.
+/// @returns An EncodingInfo indicating the Unicode encoding form of the input
+/// and how long the byte order mark is if one exists.
+static EncodingInfo getUnicodeEncoding(StringRef Input) {
+ if (Input.size() == 0)
+ return std::make_pair(UEF_Unknown, 0);
+
+ switch (uint8_t(Input[0])) {
+ case 0x00:
+ if (Input.size() >= 4) {
+ if ( Input[1] == 0
+ && uint8_t(Input[2]) == 0xFE
+ && uint8_t(Input[3]) == 0xFF)
+ return std::make_pair(UEF_UTF32_BE, 4);
+ if (Input[1] == 0 && Input[2] == 0 && Input[3] != 0)
+ return std::make_pair(UEF_UTF32_BE, 0);
+ }
+
+ if (Input.size() >= 2 && Input[1] != 0)
+ return std::make_pair(UEF_UTF16_BE, 0);
+ return std::make_pair(UEF_Unknown, 0);
+ case 0xFF:
+ if ( Input.size() >= 4
+ && uint8_t(Input[1]) == 0xFE
+ && Input[2] == 0
+ && Input[3] == 0)
+ return std::make_pair(UEF_UTF32_LE, 4);
+
+ if (Input.size() >= 2 && uint8_t(Input[1]) == 0xFE)
+ return std::make_pair(UEF_UTF16_LE, 2);
+ return std::make_pair(UEF_Unknown, 0);
+ case 0xFE:
+ if (Input.size() >= 2 && uint8_t(Input[1]) == 0xFF)
+ return std::make_pair(UEF_UTF16_BE, 2);
+ return std::make_pair(UEF_Unknown, 0);
+ case 0xEF:
+ if ( Input.size() >= 3
+ && uint8_t(Input[1]) == 0xBB
+ && uint8_t(Input[2]) == 0xBF)
+ return std::make_pair(UEF_UTF8, 3);
+ return std::make_pair(UEF_Unknown, 0);
+ }
+
+ // It could still be utf-32 or utf-16.
+ if (Input.size() >= 4 && Input[1] == 0 && Input[2] == 0 && Input[3] == 0)
+ return std::make_pair(UEF_UTF32_LE, 0);
+
+ if (Input.size() >= 2 && Input[1] == 0)
+ return std::make_pair(UEF_UTF16_LE, 0);
+
+ return std::make_pair(UEF_UTF8, 0);
+}
+
+namespace llvm {
+namespace yaml {
+/// Pin the vtables to this file.
+void Node::anchor() {}
+void NullNode::anchor() {}
+void ScalarNode::anchor() {}
+void BlockScalarNode::anchor() {}
+void KeyValueNode::anchor() {}
+void MappingNode::anchor() {}
+void SequenceNode::anchor() {}
+void AliasNode::anchor() {}
+
+/// Token - A single YAML token.
+struct Token : ilist_node<Token> {
+ enum TokenKind {
+ TK_Error, // Uninitialized token.
+ TK_StreamStart,
+ TK_StreamEnd,
+ TK_VersionDirective,
+ TK_TagDirective,
+ TK_DocumentStart,
+ TK_DocumentEnd,
+ TK_BlockEntry,
+ TK_BlockEnd,
+ TK_BlockSequenceStart,
+ TK_BlockMappingStart,
+ TK_FlowEntry,
+ TK_FlowSequenceStart,
+ TK_FlowSequenceEnd,
+ TK_FlowMappingStart,
+ TK_FlowMappingEnd,
+ TK_Key,
+ TK_Value,
+ TK_Scalar,
+ TK_BlockScalar,
+ TK_Alias,
+ TK_Anchor,
+ TK_Tag
+ } Kind;
+
+ /// A string of length 0 or more whose begin() points to the logical location
+ /// of the token in the input.
+ StringRef Range;
+
+ /// The value of a block scalar node.
+ std::string Value;
+
+ Token() : Kind(TK_Error) {}
+};
+}
+}
+
+namespace llvm {
+template<>
+struct ilist_sentinel_traits<Token> {
+ Token *createSentinel() const {
+ return &Sentinel;
+ }
+ static void destroySentinel(Token*) {}
+
+ Token *provideInitialHead() const { return createSentinel(); }
+ Token *ensureHead(Token*) const { return createSentinel(); }
+ static void noteHead(Token*, Token*) {}
+
+private:
+ mutable Token Sentinel;
+};
+
+template<>
+struct ilist_node_traits<Token> {
+ Token *createNode(const Token &V) {
+ return new (Alloc.Allocate<Token>()) Token(V);
+ }
+ static void deleteNode(Token *V) { V->~Token(); }
+
+ void addNodeToList(Token *) {}
+ void removeNodeFromList(Token *) {}
+ void transferNodesFromList(ilist_node_traits & /*SrcTraits*/,
+ ilist_iterator<Token> /*first*/,
+ ilist_iterator<Token> /*last*/) {}
+
+ BumpPtrAllocator Alloc;
+};
+}
+
+typedef ilist<Token> TokenQueueT;
+
+namespace {
+/// @brief This struct is used to track simple keys.
+///
+/// Simple keys are handled by creating an entry in SimpleKeys for each Token
+/// which could legally be the start of a simple key. When peekNext is called,
+/// if the Token to be returned is referenced by a SimpleKey, we continue
+/// tokenizing until that potential simple key has either been found not to be
+/// a simple key (we moved on to the next line or went further than 1024 chars),
+/// or until we run into a Value, at which point a Key token (and possibly
+/// others) is inserted before the SimpleKey's Tok.
+struct SimpleKey {
+ TokenQueueT::iterator Tok;
+ unsigned Column;
+ unsigned Line;
+ unsigned FlowLevel;
+ bool IsRequired;
+
+ bool operator ==(const SimpleKey &Other) {
+ return Tok == Other.Tok;
+ }
+};
+}
+
+/// @brief The Unicode scalar value of a UTF-8 minimal well-formed code unit
+/// subsequence and the subsequence's length in code units (uint8_t).
+/// A length of 0 represents an error.
+typedef std::pair<uint32_t, unsigned> UTF8Decoded;
+
+static UTF8Decoded decodeUTF8(StringRef Range) {
+ StringRef::iterator Position= Range.begin();
+ StringRef::iterator End = Range.end();
+ // 1 byte: [0x00, 0x7f]
+ // Bit pattern: 0xxxxxxx
+ if ((*Position & 0x80) == 0) {
+ return std::make_pair(*Position, 1);
+ }
+ // 2 bytes: [0x80, 0x7ff]
+ // Bit pattern: 110xxxxx 10xxxxxx
+ if (Position + 1 != End &&
+ ((*Position & 0xE0) == 0xC0) &&
+ ((*(Position + 1) & 0xC0) == 0x80)) {
+ uint32_t codepoint = ((*Position & 0x1F) << 6) |
+ (*(Position + 1) & 0x3F);
+ if (codepoint >= 0x80)
+ return std::make_pair(codepoint, 2);
+ }
+ // 3 bytes: [0x8000, 0xffff]
+ // Bit pattern: 1110xxxx 10xxxxxx 10xxxxxx
+ if (Position + 2 != End &&
+ ((*Position & 0xF0) == 0xE0) &&
+ ((*(Position + 1) & 0xC0) == 0x80) &&
+ ((*(Position + 2) & 0xC0) == 0x80)) {
+ uint32_t codepoint = ((*Position & 0x0F) << 12) |
+ ((*(Position + 1) & 0x3F) << 6) |
+ (*(Position + 2) & 0x3F);
+ // Codepoints between 0xD800 and 0xDFFF are invalid, as
+ // they are high / low surrogate halves used by UTF-16.
+ if (codepoint >= 0x800 &&
+ (codepoint < 0xD800 || codepoint > 0xDFFF))
+ return std::make_pair(codepoint, 3);
+ }
+ // 4 bytes: [0x10000, 0x10FFFF]
+ // Bit pattern: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+ if (Position + 3 != End &&
+ ((*Position & 0xF8) == 0xF0) &&
+ ((*(Position + 1) & 0xC0) == 0x80) &&
+ ((*(Position + 2) & 0xC0) == 0x80) &&
+ ((*(Position + 3) & 0xC0) == 0x80)) {
+ uint32_t codepoint = ((*Position & 0x07) << 18) |
+ ((*(Position + 1) & 0x3F) << 12) |
+ ((*(Position + 2) & 0x3F) << 6) |
+ (*(Position + 3) & 0x3F);
+ if (codepoint >= 0x10000 && codepoint <= 0x10FFFF)
+ return std::make_pair(codepoint, 4);
+ }
+ return std::make_pair(0, 0);
+}
+
+namespace llvm {
+namespace yaml {
+/// @brief Scans YAML tokens from a MemoryBuffer.
+class Scanner {
+public:
+ Scanner(StringRef Input, SourceMgr &SM, bool ShowColors = true);
+ Scanner(MemoryBufferRef Buffer, SourceMgr &SM_, bool ShowColors = true);
+
+ /// @brief Parse the next token and return it without popping it.
+ Token &peekNext();
+
+ /// @brief Parse the next token and pop it from the queue.
+ Token getNext();
+
+ void printError(SMLoc Loc, SourceMgr::DiagKind Kind, const Twine &Message,
+ ArrayRef<SMRange> Ranges = None) {
+ SM.PrintMessage(Loc, Kind, Message, Ranges, /* FixIts= */ None, ShowColors);
+ }
+
+ void setError(const Twine &Message, StringRef::iterator Position) {
+ if (Current >= End)
+ Current = End - 1;
+
+ // Don't print out more errors after the first one we encounter. The rest
+ // are just the result of the first, and have no meaning.
+ if (!Failed)
+ printError(SMLoc::getFromPointer(Current), SourceMgr::DK_Error, Message);
+ Failed = true;
+ }
+
+ void setError(const Twine &Message) {
+ setError(Message, Current);
+ }
+
+ /// @brief Returns true if an error occurred while parsing.
+ bool failed() {
+ return Failed;
+ }
+
+private:
+ void init(MemoryBufferRef Buffer);
+
+ StringRef currentInput() {
+ return StringRef(Current, End - Current);
+ }
+
+ /// @brief Decode a UTF-8 minimal well-formed code unit subsequence starting
+ /// at \a Position.
+ ///
+ /// If the UTF-8 code units starting at Position do not form a well-formed
+ /// code unit subsequence, then the Unicode scalar value is 0, and the length
+ /// is 0.
+ UTF8Decoded decodeUTF8(StringRef::iterator Position) {
+ return ::decodeUTF8(StringRef(Position, End - Position));
+ }
+
+ // The following functions are based on the grammar rules in the YAML spec. The
+ // style of the function names is meant to closely match how they are written
+ // in the spec. The number within the [] is the number of the grammar rule in
+ // the spec.
+ //
+ // See 4.2 [Production Naming Conventions] for the meaning of the prefixes.
+ //
+ // c-
+ // A production starting and ending with a special character.
+ // b-
+ // A production matching a single line break.
+ // nb-
+ // A production starting and ending with a non-break character.
+ // s-
+ // A production starting and ending with a white space character.
+ // ns-
+ // A production starting and ending with a non-space character.
+ // l-
+ // A production matching complete line(s).
+
+ /// @brief Skip a single nb-char[27] starting at Position.
+ ///
+ /// A nb-char is 0x9 | [0x20-0x7E] | 0x85 | [0xA0-0xD7FF] | [0xE000-0xFEFE]
+ /// | [0xFF00-0xFFFD] | [0x10000-0x10FFFF]
+ ///
+ /// @returns The code unit after the nb-char, or Position if it's not an
+ /// nb-char.
+ StringRef::iterator skip_nb_char(StringRef::iterator Position);
+
+ /// @brief Skip a single b-break[28] starting at Position.
+ ///
+ /// A b-break is 0xD 0xA | 0xD | 0xA
+ ///
+ /// @returns The code unit after the b-break, or Position if it's not a
+ /// b-break.
+ StringRef::iterator skip_b_break(StringRef::iterator Position);
+
+ /// Skip a single s-space[31] starting at Position.
+ ///
+ /// An s-space is 0x20
+ ///
+ /// @returns The code unit after the s-space, or Position if it's not a
+ /// s-space.
+ StringRef::iterator skip_s_space(StringRef::iterator Position);
+
+ /// @brief Skip a single s-white[33] starting at Position.
+ ///
+ /// A s-white is 0x20 | 0x9
+ ///
+ /// @returns The code unit after the s-white, or Position if it's not a
+ /// s-white.
+ StringRef::iterator skip_s_white(StringRef::iterator Position);
+
+ /// @brief Skip a single ns-char[34] starting at Position.
+ ///
+ /// A ns-char is nb-char - s-white
+ ///
+ /// @returns The code unit after the ns-char, or Position if it's not a
+ /// ns-char.
+ StringRef::iterator skip_ns_char(StringRef::iterator Position);
+
+ typedef StringRef::iterator (Scanner::*SkipWhileFunc)(StringRef::iterator);
+ /// @brief Skip minimal well-formed code unit subsequences until Func
+ /// returns its input.
+ ///
+ /// @returns The code unit after the last minimal well-formed code unit
+ /// subsequence that Func accepted.
+ StringRef::iterator skip_while( SkipWhileFunc Func
+ , StringRef::iterator Position);
+
+ /// Skip minimal well-formed code unit subsequences until Func returns its
+ /// input.
+ void advanceWhile(SkipWhileFunc Func);
+
+ /// @brief Scan ns-uri-char[39]s starting at Cur.
+ ///
+ /// This updates Cur and Column while scanning.
+ ///
+ /// @returns A StringRef starting at Cur which covers the longest contiguous
+ /// sequence of ns-uri-char.
+ StringRef scan_ns_uri_char();
+
+ /// @brief Consume a minimal well-formed code unit subsequence starting at
+ /// \a Cur. Return false if it is not the same Unicode scalar value as
+ /// \a Expected. This updates \a Column.
+ bool consume(uint32_t Expected);
+
+ /// @brief Skip \a Distance UTF-8 code units. Updates \a Cur and \a Column.
+ void skip(uint32_t Distance);
+
+ /// @brief Return true if the minimal well-formed code unit subsequence at
+ /// Pos is whitespace or a new line
+ bool isBlankOrBreak(StringRef::iterator Position);
+
+ /// Consume a single b-break[28] if it's present at the current position.
+ ///
+ /// Return false if the code unit at the current position isn't a line break.
+ bool consumeLineBreakIfPresent();
+
+ /// @brief If IsSimpleKeyAllowed, create and push_back a new SimpleKey.
+ void saveSimpleKeyCandidate( TokenQueueT::iterator Tok
+ , unsigned AtColumn
+ , bool IsRequired);
+
+ /// @brief Remove simple keys that can no longer be valid simple keys.
+ ///
+ /// Invalid simple keys are not on the current line or are further than 1024
+ /// columns back.
+ void removeStaleSimpleKeyCandidates();
+
+ /// @brief Remove all simple keys on FlowLevel \a Level.
+ void removeSimpleKeyCandidatesOnFlowLevel(unsigned Level);
+
+ /// @brief Unroll indentation in \a Indents back to \a Col. Creates BlockEnd
+ /// tokens if needed.
+ bool unrollIndent(int ToColumn);
+
+ /// @brief Increase indent to \a Col. Creates \a Kind token at \a InsertPoint
+ /// if needed.
+ bool rollIndent( int ToColumn
+ , Token::TokenKind Kind
+ , TokenQueueT::iterator InsertPoint);
+
+ /// @brief Skip a single-line comment when the comment starts at the current
+ /// position of the scanner.
+ void skipComment();
+
+ /// @brief Skip whitespace and comments until the start of the next token.
+ void scanToNextToken();
+
+ /// @brief Must be the first token generated.
+ bool scanStreamStart();
+
+ /// @brief Generate tokens needed to close out the stream.
+ bool scanStreamEnd();
+
+ /// @brief Scan a %BLAH directive.
+ bool scanDirective();
+
+ /// @brief Scan a ... or ---.
+ bool scanDocumentIndicator(bool IsStart);
+
+ /// @brief Scan a [ or { and generate the proper flow collection start token.
+ bool scanFlowCollectionStart(bool IsSequence);
+
+ /// @brief Scan a ] or } and generate the proper flow collection end token.
+ bool scanFlowCollectionEnd(bool IsSequence);
+
+ /// @brief Scan the , that separates entries in a flow collection.
+ bool scanFlowEntry();
+
+ /// @brief Scan the - that starts block sequence entries.
+ bool scanBlockEntry();
+
+ /// @brief Scan an explicit ? indicating a key.
+ bool scanKey();
+
+ /// @brief Scan an explicit : indicating a value.
+ bool scanValue();
+
+ /// @brief Scan a quoted scalar.
+ bool scanFlowScalar(bool IsDoubleQuoted);
+
+ /// @brief Scan an unquoted scalar.
+ bool scanPlainScalar();
+
+ /// @brief Scan an Alias or Anchor starting with * or &.
+ bool scanAliasOrAnchor(bool IsAlias);
+
+ /// @brief Scan a block scalar starting with | or >.
+ bool scanBlockScalar(bool IsLiteral);
+
+ /// Scan a chomping indicator in a block scalar header.
+ char scanBlockChompingIndicator();
+
+ /// Scan an indentation indicator in a block scalar header.
+ unsigned scanBlockIndentationIndicator();
+
+ /// Scan a block scalar header.
+ ///
+ /// Return false if an error occurred.
+ bool scanBlockScalarHeader(char &ChompingIndicator, unsigned &IndentIndicator,
+ bool &IsDone);
+
+ /// Look for the indentation level of a block scalar.
+ ///
+ /// Return false if an error occurred.
+ bool findBlockScalarIndent(unsigned &BlockIndent, unsigned BlockExitIndent,
+ unsigned &LineBreaks, bool &IsDone);
+
+ /// Scan the indentation of a text line in a block scalar.
+ ///
+ /// Return false if an error occurred.
+ bool scanBlockScalarIndent(unsigned BlockIndent, unsigned BlockExitIndent,
+ bool &IsDone);
+
+ /// @brief Scan a tag of the form !stuff.
+ bool scanTag();
+
+  /// @brief Dispatch to the next scanning function based on \a *Current.
+ bool fetchMoreTokens();
+
+ /// @brief The SourceMgr used for diagnostics and buffer management.
+ SourceMgr &SM;
+
+ /// @brief The original input.
+ MemoryBufferRef InputBuffer;
+
+ /// @brief The current position of the scanner.
+ StringRef::iterator Current;
+
+ /// @brief The end of the input (one past the last character).
+ StringRef::iterator End;
+
+ /// @brief Current YAML indentation level in spaces.
+ int Indent;
+
+ /// @brief Current column number in Unicode code points.
+ unsigned Column;
+
+ /// @brief Current line number.
+ unsigned Line;
+
+  /// @brief How deep we are in flow style containers. 0 means at block level.
+ unsigned FlowLevel;
+
+ /// @brief Are we at the start of the stream?
+ bool IsStartOfStream;
+
+ /// @brief Can the next token be the start of a simple key?
+ bool IsSimpleKeyAllowed;
+
+ /// @brief True if an error has occurred.
+ bool Failed;
+
+ /// @brief Should colors be used when printing out the diagnostic messages?
+ bool ShowColors;
+
+  /// @brief Queue of tokens. This is required to queue up tokens while
+  /// looking for the end of a simple key, and for cases where a single
+  /// character can produce multiple tokens (e.g. BlockEnd).
+ TokenQueueT TokenQueue;
+
+ /// @brief Indentation levels.
+ SmallVector<int, 4> Indents;
+
+ /// @brief Potential simple keys.
+ SmallVector<SimpleKey, 4> SimpleKeys;
+};
+
+} // end namespace yaml
+} // end namespace llvm
+
+/// encodeUTF8 - Encode \a UnicodeScalarValue in UTF-8 and append it to result.
+static void encodeUTF8( uint32_t UnicodeScalarValue
+ , SmallVectorImpl<char> &Result) {
+ if (UnicodeScalarValue <= 0x7F) {
+ Result.push_back(UnicodeScalarValue & 0x7F);
+ } else if (UnicodeScalarValue <= 0x7FF) {
+ uint8_t FirstByte = 0xC0 | ((UnicodeScalarValue & 0x7C0) >> 6);
+ uint8_t SecondByte = 0x80 | (UnicodeScalarValue & 0x3F);
+ Result.push_back(FirstByte);
+ Result.push_back(SecondByte);
+ } else if (UnicodeScalarValue <= 0xFFFF) {
+ uint8_t FirstByte = 0xE0 | ((UnicodeScalarValue & 0xF000) >> 12);
+ uint8_t SecondByte = 0x80 | ((UnicodeScalarValue & 0xFC0) >> 6);
+ uint8_t ThirdByte = 0x80 | (UnicodeScalarValue & 0x3F);
+ Result.push_back(FirstByte);
+ Result.push_back(SecondByte);
+ Result.push_back(ThirdByte);
+ } else if (UnicodeScalarValue <= 0x10FFFF) {
+ uint8_t FirstByte = 0xF0 | ((UnicodeScalarValue & 0x1F0000) >> 18);
+ uint8_t SecondByte = 0x80 | ((UnicodeScalarValue & 0x3F000) >> 12);
+ uint8_t ThirdByte = 0x80 | ((UnicodeScalarValue & 0xFC0) >> 6);
+ uint8_t FourthByte = 0x80 | (UnicodeScalarValue & 0x3F);
+ Result.push_back(FirstByte);
+ Result.push_back(SecondByte);
+ Result.push_back(ThirdByte);
+ Result.push_back(FourthByte);
+ }
+}
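+// A quick illustration of the byte patterns produced above, using well-known
+// code points (each call appends the bytes shown to Out):
+//
+//   SmallString<4> Out;
+//   encodeUTF8(0x24, Out);    // U+0024  -> 24           (1 byte)
+//   encodeUTF8(0xA2, Out);    // U+00A2  -> C2 A2        (2 bytes)
+//   encodeUTF8(0x20AC, Out);  // U+20AC  -> E2 82 AC     (3 bytes)
+//   encodeUTF8(0x10348, Out); // U+10348 -> F0 90 8D 88  (4 bytes)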
+
+bool yaml::dumpTokens(StringRef Input, raw_ostream &OS) {
+ SourceMgr SM;
+ Scanner scanner(Input, SM);
+ while (true) {
+ Token T = scanner.getNext();
+ switch (T.Kind) {
+ case Token::TK_StreamStart:
+ OS << "Stream-Start: ";
+ break;
+ case Token::TK_StreamEnd:
+ OS << "Stream-End: ";
+ break;
+ case Token::TK_VersionDirective:
+ OS << "Version-Directive: ";
+ break;
+ case Token::TK_TagDirective:
+ OS << "Tag-Directive: ";
+ break;
+ case Token::TK_DocumentStart:
+ OS << "Document-Start: ";
+ break;
+ case Token::TK_DocumentEnd:
+ OS << "Document-End: ";
+ break;
+ case Token::TK_BlockEntry:
+ OS << "Block-Entry: ";
+ break;
+ case Token::TK_BlockEnd:
+ OS << "Block-End: ";
+ break;
+ case Token::TK_BlockSequenceStart:
+ OS << "Block-Sequence-Start: ";
+ break;
+ case Token::TK_BlockMappingStart:
+ OS << "Block-Mapping-Start: ";
+ break;
+ case Token::TK_FlowEntry:
+ OS << "Flow-Entry: ";
+ break;
+ case Token::TK_FlowSequenceStart:
+ OS << "Flow-Sequence-Start: ";
+ break;
+ case Token::TK_FlowSequenceEnd:
+ OS << "Flow-Sequence-End: ";
+ break;
+ case Token::TK_FlowMappingStart:
+ OS << "Flow-Mapping-Start: ";
+ break;
+ case Token::TK_FlowMappingEnd:
+ OS << "Flow-Mapping-End: ";
+ break;
+ case Token::TK_Key:
+ OS << "Key: ";
+ break;
+ case Token::TK_Value:
+ OS << "Value: ";
+ break;
+ case Token::TK_Scalar:
+ OS << "Scalar: ";
+ break;
+ case Token::TK_BlockScalar:
+ OS << "Block Scalar: ";
+ break;
+ case Token::TK_Alias:
+ OS << "Alias: ";
+ break;
+ case Token::TK_Anchor:
+ OS << "Anchor: ";
+ break;
+ case Token::TK_Tag:
+ OS << "Tag: ";
+ break;
+ case Token::TK_Error:
+ break;
+ }
+ OS << T.Range << "\n";
+ if (T.Kind == Token::TK_StreamEnd)
+ break;
+ else if (T.Kind == Token::TK_Error)
+ return false;
+ }
+ return true;
+}
+
+bool yaml::scanTokens(StringRef Input) {
+ llvm::SourceMgr SM;
+ llvm::yaml::Scanner scanner(Input, SM);
+ for (;;) {
+ llvm::yaml::Token T = scanner.getNext();
+ if (T.Kind == Token::TK_StreamEnd)
+ break;
+ else if (T.Kind == Token::TK_Error)
+ return false;
+ }
+ return true;
+}
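+// Minimal usage sketch for the two entry points above; the input string is an
+// arbitrary example and both functions create their own SourceMgr internally:
+//
+//   StringRef Doc = "key: [1, 2, 3]\n";
+//   if (!yaml::scanTokens(Doc))
+//     ; // the input did not tokenize cleanly
+//   yaml::dumpTokens(Doc, llvm::errs()); // prints one line per token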
+
+std::string yaml::escape(StringRef Input) {
+ std::string EscapedInput;
+ for (StringRef::iterator i = Input.begin(), e = Input.end(); i != e; ++i) {
+ if (*i == '\\')
+ EscapedInput += "\\\\";
+ else if (*i == '"')
+ EscapedInput += "\\\"";
+ else if (*i == 0)
+ EscapedInput += "\\0";
+ else if (*i == 0x07)
+ EscapedInput += "\\a";
+ else if (*i == 0x08)
+ EscapedInput += "\\b";
+ else if (*i == 0x09)
+ EscapedInput += "\\t";
+ else if (*i == 0x0A)
+ EscapedInput += "\\n";
+ else if (*i == 0x0B)
+ EscapedInput += "\\v";
+ else if (*i == 0x0C)
+ EscapedInput += "\\f";
+ else if (*i == 0x0D)
+ EscapedInput += "\\r";
+ else if (*i == 0x1B)
+ EscapedInput += "\\e";
+ else if ((unsigned char)*i < 0x20) { // Control characters not handled above.
+ std::string HexStr = utohexstr(*i);
+ EscapedInput += "\\x" + std::string(2 - HexStr.size(), '0') + HexStr;
+ } else if (*i & 0x80) { // UTF-8 multiple code unit subsequence.
+ UTF8Decoded UnicodeScalarValue
+ = decodeUTF8(StringRef(i, Input.end() - i));
+ if (UnicodeScalarValue.second == 0) {
+ // Found invalid char.
+ SmallString<4> Val;
+ encodeUTF8(0xFFFD, Val);
+ EscapedInput.insert(EscapedInput.end(), Val.begin(), Val.end());
+ // FIXME: Error reporting.
+ return EscapedInput;
+ }
+ if (UnicodeScalarValue.first == 0x85)
+ EscapedInput += "\\N";
+ else if (UnicodeScalarValue.first == 0xA0)
+ EscapedInput += "\\_";
+ else if (UnicodeScalarValue.first == 0x2028)
+ EscapedInput += "\\L";
+ else if (UnicodeScalarValue.first == 0x2029)
+ EscapedInput += "\\P";
+ else {
+ std::string HexStr = utohexstr(UnicodeScalarValue.first);
+ if (HexStr.size() <= 2)
+ EscapedInput += "\\x" + std::string(2 - HexStr.size(), '0') + HexStr;
+ else if (HexStr.size() <= 4)
+ EscapedInput += "\\u" + std::string(4 - HexStr.size(), '0') + HexStr;
+ else if (HexStr.size() <= 8)
+ EscapedInput += "\\U" + std::string(8 - HexStr.size(), '0') + HexStr;
+ }
+ i += UnicodeScalarValue.second - 1;
+ } else
+ EscapedInput.push_back(*i);
+ }
+ return EscapedInput;
+}
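+// Summary of the mapping escape() implements above (not an exhaustive table):
+//
+//   '\' and '"'           -> \\ and \"
+//   common C0 controls    -> \0 \a \b \t \n \v \f \r \e
+//   other C0 controls     -> \xHH
+//   U+0085 and U+00A0     -> \N and \_
+//   U+2028 and U+2029     -> \L and \P
+//   other non-ASCII       -> \xHH, \uHHHH or \UHHHHHHHH by code point width
+//   invalid UTF-8         -> U+FFFD is emitted and escaping stops early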
+
+Scanner::Scanner(StringRef Input, SourceMgr &sm, bool ShowColors)
+ : SM(sm), ShowColors(ShowColors) {
+ init(MemoryBufferRef(Input, "YAML"));
+}
+
+Scanner::Scanner(MemoryBufferRef Buffer, SourceMgr &SM_, bool ShowColors)
+ : SM(SM_), ShowColors(ShowColors) {
+ init(Buffer);
+}
+
+void Scanner::init(MemoryBufferRef Buffer) {
+ InputBuffer = Buffer;
+ Current = InputBuffer.getBufferStart();
+ End = InputBuffer.getBufferEnd();
+ Indent = -1;
+ Column = 0;
+ Line = 0;
+ FlowLevel = 0;
+ IsStartOfStream = true;
+ IsSimpleKeyAllowed = true;
+ Failed = false;
+ std::unique_ptr<MemoryBuffer> InputBufferOwner =
+ MemoryBuffer::getMemBuffer(Buffer);
+ SM.AddNewSourceBuffer(std::move(InputBufferOwner), SMLoc());
+}
+
+Token &Scanner::peekNext() {
+ // If the current token is a possible simple key, keep parsing until we
+ // can confirm.
+ bool NeedMore = false;
+ while (true) {
+ if (TokenQueue.empty() || NeedMore) {
+ if (!fetchMoreTokens()) {
+ TokenQueue.clear();
+ TokenQueue.push_back(Token());
+ return TokenQueue.front();
+ }
+ }
+ assert(!TokenQueue.empty() &&
+ "fetchMoreTokens lied about getting tokens!");
+
+ removeStaleSimpleKeyCandidates();
+ SimpleKey SK;
+ SK.Tok = TokenQueue.begin();
+ if (std::find(SimpleKeys.begin(), SimpleKeys.end(), SK)
+ == SimpleKeys.end())
+ break;
+ else
+ NeedMore = true;
+ }
+ return TokenQueue.front();
+}
+
+Token Scanner::getNext() {
+ Token Ret = peekNext();
+ // TokenQueue can be empty if there was an error getting the next token.
+ if (!TokenQueue.empty())
+ TokenQueue.pop_front();
+
+  // There cannot be any referenced Tokens if the TokenQueue is empty, so do a
+  // quick deallocation of them all.
+ if (TokenQueue.empty()) {
+ TokenQueue.Alloc.Reset();
+ }
+
+ return Ret;
+}
+
+StringRef::iterator Scanner::skip_nb_char(StringRef::iterator Position) {
+ if (Position == End)
+ return Position;
+ // Check 7 bit c-printable - b-char.
+ if ( *Position == 0x09
+ || (*Position >= 0x20 && *Position <= 0x7E))
+ return Position + 1;
+
+ // Check for valid UTF-8.
+ if (uint8_t(*Position) & 0x80) {
+ UTF8Decoded u8d = decodeUTF8(Position);
+ if ( u8d.second != 0
+ && u8d.first != 0xFEFF
+ && ( u8d.first == 0x85
+ || ( u8d.first >= 0xA0
+ && u8d.first <= 0xD7FF)
+ || ( u8d.first >= 0xE000
+ && u8d.first <= 0xFFFD)
+ || ( u8d.first >= 0x10000
+ && u8d.first <= 0x10FFFF)))
+ return Position + u8d.second;
+ }
+ return Position;
+}
+
+StringRef::iterator Scanner::skip_b_break(StringRef::iterator Position) {
+ if (Position == End)
+ return Position;
+ if (*Position == 0x0D) {
+ if (Position + 1 != End && *(Position + 1) == 0x0A)
+ return Position + 2;
+ return Position + 1;
+ }
+
+ if (*Position == 0x0A)
+ return Position + 1;
+ return Position;
+}
+
+StringRef::iterator Scanner::skip_s_space(StringRef::iterator Position) {
+ if (Position == End)
+ return Position;
+ if (*Position == ' ')
+ return Position + 1;
+ return Position;
+}
+
+StringRef::iterator Scanner::skip_s_white(StringRef::iterator Position) {
+ if (Position == End)
+ return Position;
+ if (*Position == ' ' || *Position == '\t')
+ return Position + 1;
+ return Position;
+}
+
+StringRef::iterator Scanner::skip_ns_char(StringRef::iterator Position) {
+ if (Position == End)
+ return Position;
+ if (*Position == ' ' || *Position == '\t')
+ return Position;
+ return skip_nb_char(Position);
+}
+
+StringRef::iterator Scanner::skip_while( SkipWhileFunc Func
+ , StringRef::iterator Position) {
+ while (true) {
+ StringRef::iterator i = (this->*Func)(Position);
+ if (i == Position)
+ break;
+ Position = i;
+ }
+ return Position;
+}
+
+void Scanner::advanceWhile(SkipWhileFunc Func) {
+ auto Final = skip_while(Func, Current);
+ Column += Final - Current;
+ Current = Final;
+}
+
+static bool is_ns_hex_digit(const char C) {
+ return (C >= '0' && C <= '9')
+ || (C >= 'a' && C <= 'z')
+ || (C >= 'A' && C <= 'Z');
+}
+
+static bool is_ns_word_char(const char C) {
+ return C == '-'
+ || (C >= 'a' && C <= 'z')
+ || (C >= 'A' && C <= 'Z');
+}
+
+StringRef Scanner::scan_ns_uri_char() {
+ StringRef::iterator Start = Current;
+ while (true) {
+ if (Current == End)
+ break;
+ if (( *Current == '%'
+ && Current + 2 < End
+ && is_ns_hex_digit(*(Current + 1))
+ && is_ns_hex_digit(*(Current + 2)))
+ || is_ns_word_char(*Current)
+ || StringRef(Current, 1).find_first_of("#;/?:@&=+$,_.!~*'()[]")
+ != StringRef::npos) {
+ ++Current;
+ ++Column;
+ } else
+ break;
+ }
+ return StringRef(Start, Current - Start);
+}
+
+bool Scanner::consume(uint32_t Expected) {
+ if (Expected >= 0x80)
+ report_fatal_error("Not dealing with this yet");
+ if (Current == End)
+ return false;
+ if (uint8_t(*Current) >= 0x80)
+ report_fatal_error("Not dealing with this yet");
+ if (uint8_t(*Current) == Expected) {
+ ++Current;
+ ++Column;
+ return true;
+ }
+ return false;
+}
+
+void Scanner::skip(uint32_t Distance) {
+ Current += Distance;
+ Column += Distance;
+ assert(Current <= End && "Skipped past the end");
+}
+
+bool Scanner::isBlankOrBreak(StringRef::iterator Position) {
+ if (Position == End)
+ return false;
+ return *Position == ' ' || *Position == '\t' || *Position == '\r' ||
+ *Position == '\n';
+}
+
+bool Scanner::consumeLineBreakIfPresent() {
+ auto Next = skip_b_break(Current);
+ if (Next == Current)
+ return false;
+ Column = 0;
+ ++Line;
+ Current = Next;
+ return true;
+}
+
+void Scanner::saveSimpleKeyCandidate( TokenQueueT::iterator Tok
+ , unsigned AtColumn
+ , bool IsRequired) {
+ if (IsSimpleKeyAllowed) {
+ SimpleKey SK;
+ SK.Tok = Tok;
+ SK.Line = Line;
+ SK.Column = AtColumn;
+ SK.IsRequired = IsRequired;
+ SK.FlowLevel = FlowLevel;
+ SimpleKeys.push_back(SK);
+ }
+}
+
+void Scanner::removeStaleSimpleKeyCandidates() {
+ for (SmallVectorImpl<SimpleKey>::iterator i = SimpleKeys.begin();
+ i != SimpleKeys.end();) {
+ if (i->Line != Line || i->Column + 1024 < Column) {
+ if (i->IsRequired)
+ setError( "Could not find expected : for simple key"
+ , i->Tok->Range.begin());
+ i = SimpleKeys.erase(i);
+ } else
+ ++i;
+ }
+}
+
+void Scanner::removeSimpleKeyCandidatesOnFlowLevel(unsigned Level) {
+ if (!SimpleKeys.empty() && (SimpleKeys.end() - 1)->FlowLevel == Level)
+ SimpleKeys.pop_back();
+}
+
+bool Scanner::unrollIndent(int ToColumn) {
+ Token T;
+ // Indentation is ignored in flow.
+ if (FlowLevel != 0)
+ return true;
+
+ while (Indent > ToColumn) {
+ T.Kind = Token::TK_BlockEnd;
+ T.Range = StringRef(Current, 1);
+ TokenQueue.push_back(T);
+ Indent = Indents.pop_back_val();
+ }
+
+ return true;
+}
+
+bool Scanner::rollIndent( int ToColumn
+ , Token::TokenKind Kind
+ , TokenQueueT::iterator InsertPoint) {
+ if (FlowLevel)
+ return true;
+ if (Indent < ToColumn) {
+ Indents.push_back(Indent);
+ Indent = ToColumn;
+
+ Token T;
+ T.Kind = Kind;
+ T.Range = StringRef(Current, 0);
+ TokenQueue.insert(InsertPoint, T);
+ }
+ return true;
+}
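+// Example of how rollIndent() and unrollIndent() cooperate for the document
+//
+//   a:
+//     b: 1
+//   c: 2
+//
+// scanning "b" rolls the indent from 0 to 2 (emitting Block-Mapping-Start),
+// and reaching "c" back at column 0 unrolls to 0, emitting one Block-End.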
+
+void Scanner::skipComment() {
+ if (*Current != '#')
+ return;
+ while (true) {
+ // This may skip more than one byte, thus Column is only incremented
+ // for code points.
+ StringRef::iterator I = skip_nb_char(Current);
+ if (I == Current)
+ break;
+ Current = I;
+ ++Column;
+ }
+}
+
+void Scanner::scanToNextToken() {
+ while (true) {
+ while (*Current == ' ' || *Current == '\t') {
+ skip(1);
+ }
+
+ skipComment();
+
+ // Skip EOL.
+ StringRef::iterator i = skip_b_break(Current);
+ if (i == Current)
+ break;
+ Current = i;
+ ++Line;
+ Column = 0;
+ // New lines may start a simple key.
+ if (!FlowLevel)
+ IsSimpleKeyAllowed = true;
+ }
+}
+
+bool Scanner::scanStreamStart() {
+ IsStartOfStream = false;
+
+ EncodingInfo EI = getUnicodeEncoding(currentInput());
+
+ Token T;
+ T.Kind = Token::TK_StreamStart;
+ T.Range = StringRef(Current, EI.second);
+ TokenQueue.push_back(T);
+ Current += EI.second;
+ return true;
+}
+
+bool Scanner::scanStreamEnd() {
+ // Force an ending new line if one isn't present.
+ if (Column != 0) {
+ Column = 0;
+ ++Line;
+ }
+
+ unrollIndent(-1);
+ SimpleKeys.clear();
+ IsSimpleKeyAllowed = false;
+
+ Token T;
+ T.Kind = Token::TK_StreamEnd;
+ T.Range = StringRef(Current, 0);
+ TokenQueue.push_back(T);
+ return true;
+}
+
+bool Scanner::scanDirective() {
+ // Reset the indentation level.
+ unrollIndent(-1);
+ SimpleKeys.clear();
+ IsSimpleKeyAllowed = false;
+
+ StringRef::iterator Start = Current;
+ consume('%');
+ StringRef::iterator NameStart = Current;
+ Current = skip_while(&Scanner::skip_ns_char, Current);
+ StringRef Name(NameStart, Current - NameStart);
+ Current = skip_while(&Scanner::skip_s_white, Current);
+
+ Token T;
+ if (Name == "YAML") {
+ Current = skip_while(&Scanner::skip_ns_char, Current);
+ T.Kind = Token::TK_VersionDirective;
+ T.Range = StringRef(Start, Current - Start);
+ TokenQueue.push_back(T);
+ return true;
+ } else if(Name == "TAG") {
+ Current = skip_while(&Scanner::skip_ns_char, Current);
+ Current = skip_while(&Scanner::skip_s_white, Current);
+ Current = skip_while(&Scanner::skip_ns_char, Current);
+ T.Kind = Token::TK_TagDirective;
+ T.Range = StringRef(Start, Current - Start);
+ TokenQueue.push_back(T);
+ return true;
+ }
+ return false;
+}
+
+bool Scanner::scanDocumentIndicator(bool IsStart) {
+ unrollIndent(-1);
+ SimpleKeys.clear();
+ IsSimpleKeyAllowed = false;
+
+ Token T;
+ T.Kind = IsStart ? Token::TK_DocumentStart : Token::TK_DocumentEnd;
+ T.Range = StringRef(Current, 3);
+ skip(3);
+ TokenQueue.push_back(T);
+ return true;
+}
+
+bool Scanner::scanFlowCollectionStart(bool IsSequence) {
+ Token T;
+ T.Kind = IsSequence ? Token::TK_FlowSequenceStart
+ : Token::TK_FlowMappingStart;
+ T.Range = StringRef(Current, 1);
+ skip(1);
+ TokenQueue.push_back(T);
+
+ // [ and { may begin a simple key.
+ saveSimpleKeyCandidate(--TokenQueue.end(), Column - 1, false);
+
+  // They may also be followed by a simple key.
+ IsSimpleKeyAllowed = true;
+ ++FlowLevel;
+ return true;
+}
+
+bool Scanner::scanFlowCollectionEnd(bool IsSequence) {
+ removeSimpleKeyCandidatesOnFlowLevel(FlowLevel);
+ IsSimpleKeyAllowed = false;
+ Token T;
+ T.Kind = IsSequence ? Token::TK_FlowSequenceEnd
+ : Token::TK_FlowMappingEnd;
+ T.Range = StringRef(Current, 1);
+ skip(1);
+ TokenQueue.push_back(T);
+ if (FlowLevel)
+ --FlowLevel;
+ return true;
+}
+
+bool Scanner::scanFlowEntry() {
+ removeSimpleKeyCandidatesOnFlowLevel(FlowLevel);
+ IsSimpleKeyAllowed = true;
+ Token T;
+ T.Kind = Token::TK_FlowEntry;
+ T.Range = StringRef(Current, 1);
+ skip(1);
+ TokenQueue.push_back(T);
+ return true;
+}
+
+bool Scanner::scanBlockEntry() {
+ rollIndent(Column, Token::TK_BlockSequenceStart, TokenQueue.end());
+ removeSimpleKeyCandidatesOnFlowLevel(FlowLevel);
+ IsSimpleKeyAllowed = true;
+ Token T;
+ T.Kind = Token::TK_BlockEntry;
+ T.Range = StringRef(Current, 1);
+ skip(1);
+ TokenQueue.push_back(T);
+ return true;
+}
+
+bool Scanner::scanKey() {
+ if (!FlowLevel)
+ rollIndent(Column, Token::TK_BlockMappingStart, TokenQueue.end());
+
+ removeSimpleKeyCandidatesOnFlowLevel(FlowLevel);
+ IsSimpleKeyAllowed = !FlowLevel;
+
+ Token T;
+ T.Kind = Token::TK_Key;
+ T.Range = StringRef(Current, 1);
+ skip(1);
+ TokenQueue.push_back(T);
+ return true;
+}
+
+bool Scanner::scanValue() {
+ // If the previous token could have been a simple key, insert the key token
+ // into the token queue.
+ if (!SimpleKeys.empty()) {
+ SimpleKey SK = SimpleKeys.pop_back_val();
+ Token T;
+ T.Kind = Token::TK_Key;
+ T.Range = SK.Tok->Range;
+ TokenQueueT::iterator i, e;
+ for (i = TokenQueue.begin(), e = TokenQueue.end(); i != e; ++i) {
+ if (i == SK.Tok)
+ break;
+ }
+ assert(i != e && "SimpleKey not in token queue!");
+ i = TokenQueue.insert(i, T);
+
+ // We may also need to add a Block-Mapping-Start token.
+ rollIndent(SK.Column, Token::TK_BlockMappingStart, i);
+
+ IsSimpleKeyAllowed = false;
+ } else {
+ if (!FlowLevel)
+ rollIndent(Column, Token::TK_BlockMappingStart, TokenQueue.end());
+ IsSimpleKeyAllowed = !FlowLevel;
+ }
+
+ Token T;
+ T.Kind = Token::TK_Value;
+ T.Range = StringRef(Current, 1);
+ skip(1);
+ TokenQueue.push_back(T);
+ return true;
+}
+
+// Forbidding inlining improves performance by roughly 20%.
+// FIXME: Remove once llvm optimizes this to the faster version without hints.
+LLVM_ATTRIBUTE_NOINLINE static bool
+wasEscaped(StringRef::iterator First, StringRef::iterator Position);
+
+// Returns whether a character at 'Position' was escaped with a leading '\'.
+// 'First' specifies the position of the first character in the string.
+static bool wasEscaped(StringRef::iterator First,
+ StringRef::iterator Position) {
+ assert(Position - 1 >= First);
+ StringRef::iterator I = Position - 1;
+ // We calculate the number of consecutive '\'s before the current position
+ // by iterating backwards through our string.
+ while (I >= First && *I == '\\') --I;
+ // (Position - 1 - I) now contains the number of '\'s before the current
+ // position. If it is odd, the character at 'Position' was escaped.
+ return (Position - 1 - I) % 2 == 1;
+}
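+// Worked example of the parity rule above: if the closing quote is preceded
+// by three consecutive backslashes, (Position - 1 - I) == 3, which is odd, so
+// the quote counts as escaped; with two backslashes the count is even and the
+// quote terminates the scalar.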
+
+bool Scanner::scanFlowScalar(bool IsDoubleQuoted) {
+ StringRef::iterator Start = Current;
+ unsigned ColStart = Column;
+ if (IsDoubleQuoted) {
+ do {
+ ++Current;
+ while (Current != End && *Current != '"')
+ ++Current;
+ // Repeat until the previous character was not a '\' or was an escaped
+ // backslash.
+ } while ( Current != End
+ && *(Current - 1) == '\\'
+ && wasEscaped(Start + 1, Current));
+ } else {
+ skip(1);
+ while (true) {
+ // Skip a ' followed by another '.
+ if (Current + 1 < End && *Current == '\'' && *(Current + 1) == '\'') {
+ skip(2);
+ continue;
+ } else if (*Current == '\'')
+ break;
+ StringRef::iterator i = skip_nb_char(Current);
+ if (i == Current) {
+ i = skip_b_break(Current);
+ if (i == Current)
+ break;
+ Current = i;
+ Column = 0;
+ ++Line;
+ } else {
+ if (i == End)
+ break;
+ Current = i;
+ ++Column;
+ }
+ }
+ }
+
+ if (Current == End) {
+ setError("Expected quote at end of scalar", Current);
+ return false;
+ }
+
+ skip(1); // Skip ending quote.
+ Token T;
+ T.Kind = Token::TK_Scalar;
+ T.Range = StringRef(Start, Current - Start);
+ TokenQueue.push_back(T);
+
+ saveSimpleKeyCandidate(--TokenQueue.end(), ColStart, false);
+
+ IsSimpleKeyAllowed = false;
+
+ return true;
+}
+
+bool Scanner::scanPlainScalar() {
+ StringRef::iterator Start = Current;
+ unsigned ColStart = Column;
+ unsigned LeadingBlanks = 0;
+ assert(Indent >= -1 && "Indent must be >= -1 !");
+ unsigned indent = static_cast<unsigned>(Indent + 1);
+ while (true) {
+ if (*Current == '#')
+ break;
+
+ while (!isBlankOrBreak(Current)) {
+ if ( FlowLevel && *Current == ':'
+ && !(isBlankOrBreak(Current + 1) || *(Current + 1) == ',')) {
+ setError("Found unexpected ':' while scanning a plain scalar", Current);
+ return false;
+ }
+
+ // Check for the end of the plain scalar.
+ if ( (*Current == ':' && isBlankOrBreak(Current + 1))
+ || ( FlowLevel
+ && (StringRef(Current, 1).find_first_of(",:?[]{}")
+ != StringRef::npos)))
+ break;
+
+ StringRef::iterator i = skip_nb_char(Current);
+ if (i == Current)
+ break;
+ Current = i;
+ ++Column;
+ }
+
+ // Are we at the end?
+ if (!isBlankOrBreak(Current))
+ break;
+
+ // Eat blanks.
+ StringRef::iterator Tmp = Current;
+ while (isBlankOrBreak(Tmp)) {
+ StringRef::iterator i = skip_s_white(Tmp);
+ if (i != Tmp) {
+ if (LeadingBlanks && (Column < indent) && *Tmp == '\t') {
+ setError("Found invalid tab character in indentation", Tmp);
+ return false;
+ }
+ Tmp = i;
+ ++Column;
+ } else {
+ i = skip_b_break(Tmp);
+ if (!LeadingBlanks)
+ LeadingBlanks = 1;
+ Tmp = i;
+ Column = 0;
+ ++Line;
+ }
+ }
+
+ if (!FlowLevel && Column < indent)
+ break;
+
+ Current = Tmp;
+ }
+ if (Start == Current) {
+ setError("Got empty plain scalar", Start);
+ return false;
+ }
+ Token T;
+ T.Kind = Token::TK_Scalar;
+ T.Range = StringRef(Start, Current - Start);
+ TokenQueue.push_back(T);
+
+ // Plain scalars can be simple keys.
+ saveSimpleKeyCandidate(--TokenQueue.end(), ColStart, false);
+
+ IsSimpleKeyAllowed = false;
+
+ return true;
+}
+
+bool Scanner::scanAliasOrAnchor(bool IsAlias) {
+ StringRef::iterator Start = Current;
+ unsigned ColStart = Column;
+ skip(1);
+ while(true) {
+ if ( *Current == '[' || *Current == ']'
+ || *Current == '{' || *Current == '}'
+ || *Current == ','
+ || *Current == ':')
+ break;
+ StringRef::iterator i = skip_ns_char(Current);
+ if (i == Current)
+ break;
+ Current = i;
+ ++Column;
+ }
+
+ if (Start == Current) {
+ setError("Got empty alias or anchor", Start);
+ return false;
+ }
+
+ Token T;
+ T.Kind = IsAlias ? Token::TK_Alias : Token::TK_Anchor;
+ T.Range = StringRef(Start, Current - Start);
+ TokenQueue.push_back(T);
+
+ // Alias and anchors can be simple keys.
+ saveSimpleKeyCandidate(--TokenQueue.end(), ColStart, false);
+
+ IsSimpleKeyAllowed = false;
+
+ return true;
+}
+
+char Scanner::scanBlockChompingIndicator() {
+ char Indicator = ' ';
+ if (Current != End && (*Current == '+' || *Current == '-')) {
+ Indicator = *Current;
+ skip(1);
+ }
+ return Indicator;
+}
+
+/// Get the number of line breaks after chomping.
+///
+/// Return the number of trailing line breaks to emit, depending on
+/// \p ChompingIndicator.
+static unsigned getChompedLineBreaks(char ChompingIndicator,
+ unsigned LineBreaks, StringRef Str) {
+ if (ChompingIndicator == '-') // Strip all line breaks.
+ return 0;
+ if (ChompingIndicator == '+') // Keep all line breaks.
+ return LineBreaks;
+ // Clip trailing lines.
+ return Str.empty() ? 0 : 1;
+}
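+// Illustration of the three chomping modes handled above, for a block scalar
+// whose body ends with two line breaks:
+//
+//   '-' (strip) -> no trailing newline is emitted
+//   '+' (keep)  -> all of LineBreaks (here 2) are emitted
+//   ' ' (clip)  -> exactly one newline if the body is non-empty, otherwise none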
+
+unsigned Scanner::scanBlockIndentationIndicator() {
+ unsigned Indent = 0;
+ if (Current != End && (*Current >= '1' && *Current <= '9')) {
+ Indent = unsigned(*Current - '0');
+ skip(1);
+ }
+ return Indent;
+}
+
+bool Scanner::scanBlockScalarHeader(char &ChompingIndicator,
+ unsigned &IndentIndicator, bool &IsDone) {
+ auto Start = Current;
+
+ ChompingIndicator = scanBlockChompingIndicator();
+ IndentIndicator = scanBlockIndentationIndicator();
+ // Check for the chomping indicator once again.
+ if (ChompingIndicator == ' ')
+ ChompingIndicator = scanBlockChompingIndicator();
+ Current = skip_while(&Scanner::skip_s_white, Current);
+ skipComment();
+
+ if (Current == End) { // EOF, we have an empty scalar.
+ Token T;
+ T.Kind = Token::TK_BlockScalar;
+ T.Range = StringRef(Start, Current - Start);
+ TokenQueue.push_back(T);
+ IsDone = true;
+ return true;
+ }
+
+ if (!consumeLineBreakIfPresent()) {
+ setError("Expected a line break after block scalar header", Current);
+ return false;
+ }
+ return true;
+}
+
+bool Scanner::findBlockScalarIndent(unsigned &BlockIndent,
+ unsigned BlockExitIndent,
+ unsigned &LineBreaks, bool &IsDone) {
+ unsigned MaxAllSpaceLineCharacters = 0;
+ StringRef::iterator LongestAllSpaceLine;
+
+ while (true) {
+ advanceWhile(&Scanner::skip_s_space);
+ if (skip_nb_char(Current) != Current) {
+      // This line isn't empty, so try to find the indentation.
+ if (Column <= BlockExitIndent) { // End of the block literal.
+ IsDone = true;
+ return true;
+ }
+ // We found the block's indentation.
+ BlockIndent = Column;
+ if (MaxAllSpaceLineCharacters > BlockIndent) {
+ setError(
+ "Leading all-spaces line must be smaller than the block indent",
+ LongestAllSpaceLine);
+ return false;
+ }
+ return true;
+ }
+ if (skip_b_break(Current) != Current &&
+ Column > MaxAllSpaceLineCharacters) {
+ // Record the longest all-space line in case it's longer than the
+ // discovered block indent.
+ MaxAllSpaceLineCharacters = Column;
+ LongestAllSpaceLine = Current;
+ }
+
+ // Check for EOF.
+ if (Current == End) {
+ IsDone = true;
+ return true;
+ }
+
+ if (!consumeLineBreakIfPresent()) {
+ IsDone = true;
+ return true;
+ }
+ ++LineBreaks;
+ }
+ return true;
+}
+
+bool Scanner::scanBlockScalarIndent(unsigned BlockIndent,
+ unsigned BlockExitIndent, bool &IsDone) {
+ // Skip the indentation.
+ while (Column < BlockIndent) {
+ auto I = skip_s_space(Current);
+ if (I == Current)
+ break;
+ Current = I;
+ ++Column;
+ }
+
+ if (skip_nb_char(Current) == Current)
+ return true;
+
+ if (Column <= BlockExitIndent) { // End of the block literal.
+ IsDone = true;
+ return true;
+ }
+
+ if (Column < BlockIndent) {
+ if (Current != End && *Current == '#') { // Trailing comment.
+ IsDone = true;
+ return true;
+ }
+ setError("A text line is less indented than the block scalar", Current);
+ return false;
+ }
+ return true; // A normal text line.
+}
+
+bool Scanner::scanBlockScalar(bool IsLiteral) {
+ // Eat '|' or '>'
+ assert(*Current == '|' || *Current == '>');
+ skip(1);
+
+ char ChompingIndicator;
+ unsigned BlockIndent;
+ bool IsDone = false;
+ if (!scanBlockScalarHeader(ChompingIndicator, BlockIndent, IsDone))
+ return false;
+ if (IsDone)
+ return true;
+
+ auto Start = Current;
+ unsigned BlockExitIndent = Indent < 0 ? 0 : (unsigned)Indent;
+ unsigned LineBreaks = 0;
+ if (BlockIndent == 0) {
+ if (!findBlockScalarIndent(BlockIndent, BlockExitIndent, LineBreaks,
+ IsDone))
+ return false;
+ }
+
+  // Scan the block scalar's body.
+ SmallString<256> Str;
+ while (!IsDone) {
+ if (!scanBlockScalarIndent(BlockIndent, BlockExitIndent, IsDone))
+ return false;
+ if (IsDone)
+ break;
+
+ // Parse the current line.
+ auto LineStart = Current;
+ advanceWhile(&Scanner::skip_nb_char);
+ if (LineStart != Current) {
+ Str.append(LineBreaks, '\n');
+ Str.append(StringRef(LineStart, Current - LineStart));
+ LineBreaks = 0;
+ }
+
+ // Check for EOF.
+ if (Current == End)
+ break;
+
+ if (!consumeLineBreakIfPresent())
+ break;
+ ++LineBreaks;
+ }
+
+ if (Current == End && !LineBreaks)
+ // Ensure that there is at least one line break before the end of file.
+ LineBreaks = 1;
+ Str.append(getChompedLineBreaks(ChompingIndicator, LineBreaks, Str), '\n');
+
+ // New lines may start a simple key.
+ if (!FlowLevel)
+ IsSimpleKeyAllowed = true;
+
+ Token T;
+ T.Kind = Token::TK_BlockScalar;
+ T.Range = StringRef(Start, Current - Start);
+ T.Value = Str.str().str();
+ TokenQueue.push_back(T);
+ return true;
+}
+
+bool Scanner::scanTag() {
+ StringRef::iterator Start = Current;
+ unsigned ColStart = Column;
+ skip(1); // Eat !.
+ if (Current == End || isBlankOrBreak(Current)); // An empty tag.
+ else if (*Current == '<') {
+ skip(1);
+ scan_ns_uri_char();
+ if (!consume('>'))
+ return false;
+ } else {
+ // FIXME: Actually parse the c-ns-shorthand-tag rule.
+ Current = skip_while(&Scanner::skip_ns_char, Current);
+ }
+
+ Token T;
+ T.Kind = Token::TK_Tag;
+ T.Range = StringRef(Start, Current - Start);
+ TokenQueue.push_back(T);
+
+ // Tags can be simple keys.
+ saveSimpleKeyCandidate(--TokenQueue.end(), ColStart, false);
+
+ IsSimpleKeyAllowed = false;
+
+ return true;
+}
+
+bool Scanner::fetchMoreTokens() {
+ if (IsStartOfStream)
+ return scanStreamStart();
+
+ scanToNextToken();
+
+ if (Current == End)
+ return scanStreamEnd();
+
+ removeStaleSimpleKeyCandidates();
+
+ unrollIndent(Column);
+
+ if (Column == 0 && *Current == '%')
+ return scanDirective();
+
+ if (Column == 0 && Current + 4 <= End
+ && *Current == '-'
+ && *(Current + 1) == '-'
+ && *(Current + 2) == '-'
+ && (Current + 3 == End || isBlankOrBreak(Current + 3)))
+ return scanDocumentIndicator(true);
+
+ if (Column == 0 && Current + 4 <= End
+ && *Current == '.'
+ && *(Current + 1) == '.'
+ && *(Current + 2) == '.'
+ && (Current + 3 == End || isBlankOrBreak(Current + 3)))
+ return scanDocumentIndicator(false);
+
+ if (*Current == '[')
+ return scanFlowCollectionStart(true);
+
+ if (*Current == '{')
+ return scanFlowCollectionStart(false);
+
+ if (*Current == ']')
+ return scanFlowCollectionEnd(true);
+
+ if (*Current == '}')
+ return scanFlowCollectionEnd(false);
+
+ if (*Current == ',')
+ return scanFlowEntry();
+
+ if (*Current == '-' && isBlankOrBreak(Current + 1))
+ return scanBlockEntry();
+
+ if (*Current == '?' && (FlowLevel || isBlankOrBreak(Current + 1)))
+ return scanKey();
+
+ if (*Current == ':' && (FlowLevel || isBlankOrBreak(Current + 1)))
+ return scanValue();
+
+ if (*Current == '*')
+ return scanAliasOrAnchor(true);
+
+ if (*Current == '&')
+ return scanAliasOrAnchor(false);
+
+ if (*Current == '!')
+ return scanTag();
+
+ if (*Current == '|' && !FlowLevel)
+ return scanBlockScalar(true);
+
+ if (*Current == '>' && !FlowLevel)
+ return scanBlockScalar(false);
+
+ if (*Current == '\'')
+ return scanFlowScalar(false);
+
+ if (*Current == '"')
+ return scanFlowScalar(true);
+
+ // Get a plain scalar.
+ StringRef FirstChar(Current, 1);
+ if (!(isBlankOrBreak(Current)
+ || FirstChar.find_first_of("-?:,[]{}#&*!|>'\"%@`") != StringRef::npos)
+ || (*Current == '-' && !isBlankOrBreak(Current + 1))
+ || (!FlowLevel && (*Current == '?' || *Current == ':')
+ && isBlankOrBreak(Current + 1))
+ || (!FlowLevel && *Current == ':'
+ && Current + 2 < End
+ && *(Current + 1) == ':'
+ && !isBlankOrBreak(Current + 2)))
+ return scanPlainScalar();
+
+ setError("Unrecognized character while tokenizing.");
+ return false;
+}
+
+Stream::Stream(StringRef Input, SourceMgr &SM, bool ShowColors)
+ : scanner(new Scanner(Input, SM, ShowColors)), CurrentDoc() {}
+
+Stream::Stream(MemoryBufferRef InputBuffer, SourceMgr &SM, bool ShowColors)
+ : scanner(new Scanner(InputBuffer, SM, ShowColors)), CurrentDoc() {}
+
+Stream::~Stream() {}
+
+bool Stream::failed() { return scanner->failed(); }
+
+void Stream::printError(Node *N, const Twine &Msg) {
+ scanner->printError( N->getSourceRange().Start
+ , SourceMgr::DK_Error
+ , Msg
+ , N->getSourceRange());
+}
+
+document_iterator Stream::begin() {
+ if (CurrentDoc)
+ report_fatal_error("Can only iterate over the stream once");
+
+ // Skip Stream-Start.
+ scanner->getNext();
+
+ CurrentDoc.reset(new Document(*this));
+ return document_iterator(CurrentDoc);
+}
+
+document_iterator Stream::end() {
+ return document_iterator();
+}
+
+void Stream::skip() {
+ for (document_iterator i = begin(), e = end(); i != e; ++i)
+ i->skip();
+}
+
+Node::Node(unsigned int Type, std::unique_ptr<Document> &D, StringRef A,
+ StringRef T)
+ : Doc(D), TypeID(Type), Anchor(A), Tag(T) {
+ SMLoc Start = SMLoc::getFromPointer(peekNext().Range.begin());
+ SourceRange = SMRange(Start, Start);
+}
+
+std::string Node::getVerbatimTag() const {
+ StringRef Raw = getRawTag();
+ if (!Raw.empty() && Raw != "!") {
+ std::string Ret;
+ if (Raw.find_last_of('!') == 0) {
+ Ret = Doc->getTagMap().find("!")->second;
+ Ret += Raw.substr(1);
+ return Ret;
+ } else if (Raw.startswith("!!")) {
+ Ret = Doc->getTagMap().find("!!")->second;
+ Ret += Raw.substr(2);
+ return Ret;
+ } else {
+ StringRef TagHandle = Raw.substr(0, Raw.find_last_of('!') + 1);
+ std::map<StringRef, StringRef>::const_iterator It =
+ Doc->getTagMap().find(TagHandle);
+ if (It != Doc->getTagMap().end())
+ Ret = It->second;
+ else {
+ Token T;
+ T.Kind = Token::TK_Tag;
+ T.Range = TagHandle;
+ setError(Twine("Unknown tag handle ") + TagHandle, T);
+ }
+ Ret += Raw.substr(Raw.find_last_of('!') + 1);
+ return Ret;
+ }
+ }
+
+ switch (getType()) {
+ case NK_Null:
+ return "tag:yaml.org,2002:null";
+ case NK_Scalar:
+ case NK_BlockScalar:
+ // TODO: Tag resolution.
+ return "tag:yaml.org,2002:str";
+ case NK_Mapping:
+ return "tag:yaml.org,2002:map";
+ case NK_Sequence:
+ return "tag:yaml.org,2002:seq";
+ }
+
+ return "";
+}
+
+Token &Node::peekNext() {
+ return Doc->peekNext();
+}
+
+Token Node::getNext() {
+ return Doc->getNext();
+}
+
+Node *Node::parseBlockNode() {
+ return Doc->parseBlockNode();
+}
+
+BumpPtrAllocator &Node::getAllocator() {
+ return Doc->NodeAllocator;
+}
+
+void Node::setError(const Twine &Msg, Token &Tok) const {
+ Doc->setError(Msg, Tok);
+}
+
+bool Node::failed() const {
+ return Doc->failed();
+}
+
+
+
+StringRef ScalarNode::getValue(SmallVectorImpl<char> &Storage) const {
+ // TODO: Handle newlines properly. We need to remove leading whitespace.
+ if (Value[0] == '"') { // Double quoted.
+ // Pull off the leading and trailing "s.
+ StringRef UnquotedValue = Value.substr(1, Value.size() - 2);
+ // Search for characters that would require unescaping the value.
+ StringRef::size_type i = UnquotedValue.find_first_of("\\\r\n");
+ if (i != StringRef::npos)
+ return unescapeDoubleQuoted(UnquotedValue, i, Storage);
+ return UnquotedValue;
+ } else if (Value[0] == '\'') { // Single quoted.
+ // Pull off the leading and trailing 's.
+ StringRef UnquotedValue = Value.substr(1, Value.size() - 2);
+ StringRef::size_type i = UnquotedValue.find('\'');
+ if (i != StringRef::npos) {
+ // We're going to need Storage.
+ Storage.clear();
+ Storage.reserve(UnquotedValue.size());
+ for (; i != StringRef::npos; i = UnquotedValue.find('\'')) {
+ StringRef Valid(UnquotedValue.begin(), i);
+ Storage.insert(Storage.end(), Valid.begin(), Valid.end());
+ Storage.push_back('\'');
+ UnquotedValue = UnquotedValue.substr(i + 2);
+ }
+ Storage.insert(Storage.end(), UnquotedValue.begin(), UnquotedValue.end());
+ return StringRef(Storage.begin(), Storage.size());
+ }
+ return UnquotedValue;
+ }
+ // Plain or block.
+ return Value.rtrim(" ");
+}
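+// Examples of what getValue() returns for the three styles handled above
+// (Storage is only used when unescaping is actually required):
+//
+//   "a\tb"    (double quoted) -> a<TAB>b
+//   'it''s'   (single quoted) -> it's
+//   plain     (plain/block)   -> the value with trailing spaces trimmed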
+
+StringRef ScalarNode::unescapeDoubleQuoted( StringRef UnquotedValue
+ , StringRef::size_type i
+ , SmallVectorImpl<char> &Storage)
+ const {
+ // Use Storage to build proper value.
+ Storage.clear();
+ Storage.reserve(UnquotedValue.size());
+ for (; i != StringRef::npos; i = UnquotedValue.find_first_of("\\\r\n")) {
+ // Insert all previous chars into Storage.
+ StringRef Valid(UnquotedValue.begin(), i);
+ Storage.insert(Storage.end(), Valid.begin(), Valid.end());
+ // Chop off inserted chars.
+ UnquotedValue = UnquotedValue.substr(i);
+
+ assert(!UnquotedValue.empty() && "Can't be empty!");
+
+ // Parse escape or line break.
+ switch (UnquotedValue[0]) {
+ case '\r':
+ case '\n':
+ Storage.push_back('\n');
+ if ( UnquotedValue.size() > 1
+ && (UnquotedValue[1] == '\r' || UnquotedValue[1] == '\n'))
+ UnquotedValue = UnquotedValue.substr(1);
+ UnquotedValue = UnquotedValue.substr(1);
+ break;
+ default:
+ if (UnquotedValue.size() == 1)
+ // TODO: Report error.
+ break;
+ UnquotedValue = UnquotedValue.substr(1);
+ switch (UnquotedValue[0]) {
+ default: {
+ Token T;
+ T.Range = StringRef(UnquotedValue.begin(), 1);
+ setError("Unrecognized escape code!", T);
+ return "";
+ }
+ case '\r':
+ case '\n':
+ // Remove the new line.
+ if ( UnquotedValue.size() > 1
+ && (UnquotedValue[1] == '\r' || UnquotedValue[1] == '\n'))
+ UnquotedValue = UnquotedValue.substr(1);
+ // If this was just a single byte newline, it will get skipped
+ // below.
+ break;
+ case '0':
+ Storage.push_back(0x00);
+ break;
+ case 'a':
+ Storage.push_back(0x07);
+ break;
+ case 'b':
+ Storage.push_back(0x08);
+ break;
+ case 't':
+ case 0x09:
+ Storage.push_back(0x09);
+ break;
+ case 'n':
+ Storage.push_back(0x0A);
+ break;
+ case 'v':
+ Storage.push_back(0x0B);
+ break;
+ case 'f':
+ Storage.push_back(0x0C);
+ break;
+ case 'r':
+ Storage.push_back(0x0D);
+ break;
+ case 'e':
+ Storage.push_back(0x1B);
+ break;
+ case ' ':
+ Storage.push_back(0x20);
+ break;
+ case '"':
+ Storage.push_back(0x22);
+ break;
+ case '/':
+ Storage.push_back(0x2F);
+ break;
+ case '\\':
+ Storage.push_back(0x5C);
+ break;
+ case 'N':
+ encodeUTF8(0x85, Storage);
+ break;
+ case '_':
+ encodeUTF8(0xA0, Storage);
+ break;
+ case 'L':
+ encodeUTF8(0x2028, Storage);
+ break;
+ case 'P':
+ encodeUTF8(0x2029, Storage);
+ break;
+ case 'x': {
+ if (UnquotedValue.size() < 3)
+ // TODO: Report error.
+ break;
+ unsigned int UnicodeScalarValue;
+ if (UnquotedValue.substr(1, 2).getAsInteger(16, UnicodeScalarValue))
+ // TODO: Report error.
+ UnicodeScalarValue = 0xFFFD;
+ encodeUTF8(UnicodeScalarValue, Storage);
+ UnquotedValue = UnquotedValue.substr(2);
+ break;
+ }
+ case 'u': {
+ if (UnquotedValue.size() < 5)
+ // TODO: Report error.
+ break;
+ unsigned int UnicodeScalarValue;
+ if (UnquotedValue.substr(1, 4).getAsInteger(16, UnicodeScalarValue))
+ // TODO: Report error.
+ UnicodeScalarValue = 0xFFFD;
+ encodeUTF8(UnicodeScalarValue, Storage);
+ UnquotedValue = UnquotedValue.substr(4);
+ break;
+ }
+ case 'U': {
+ if (UnquotedValue.size() < 9)
+ // TODO: Report error.
+ break;
+ unsigned int UnicodeScalarValue;
+ if (UnquotedValue.substr(1, 8).getAsInteger(16, UnicodeScalarValue))
+ // TODO: Report error.
+ UnicodeScalarValue = 0xFFFD;
+ encodeUTF8(UnicodeScalarValue, Storage);
+ UnquotedValue = UnquotedValue.substr(8);
+ break;
+ }
+ }
+ UnquotedValue = UnquotedValue.substr(1);
+ }
+ }
+ Storage.insert(Storage.end(), UnquotedValue.begin(), UnquotedValue.end());
+ return StringRef(Storage.begin(), Storage.size());
+}
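+// A few concrete cases the unescaping loop above handles, assuming the
+// surrounding quotes were already stripped by getValue():
+//
+//   \n, \t, \0, \e       -> the corresponding control byte
+//   \x41                 -> 'A'
+//   \u20AC, \U0001F600   -> the UTF-8 bytes for that code point
+//   escaped line break   -> removed entirely
+//   bare CR, LF or CR LF -> normalized to a single '\n'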
+
+Node *KeyValueNode::getKey() {
+ if (Key)
+ return Key;
+ // Handle implicit null keys.
+ {
+ Token &t = peekNext();
+ if ( t.Kind == Token::TK_BlockEnd
+ || t.Kind == Token::TK_Value
+ || t.Kind == Token::TK_Error) {
+ return Key = new (getAllocator()) NullNode(Doc);
+ }
+ if (t.Kind == Token::TK_Key)
+ getNext(); // skip TK_Key.
+ }
+
+ // Handle explicit null keys.
+ Token &t = peekNext();
+ if (t.Kind == Token::TK_BlockEnd || t.Kind == Token::TK_Value) {
+ return Key = new (getAllocator()) NullNode(Doc);
+ }
+
+ // We've got a normal key.
+ return Key = parseBlockNode();
+}
+
+Node *KeyValueNode::getValue() {
+ if (Value)
+ return Value;
+ getKey()->skip();
+ if (failed())
+ return Value = new (getAllocator()) NullNode(Doc);
+
+ // Handle implicit null values.
+ {
+ Token &t = peekNext();
+ if ( t.Kind == Token::TK_BlockEnd
+ || t.Kind == Token::TK_FlowMappingEnd
+ || t.Kind == Token::TK_Key
+ || t.Kind == Token::TK_FlowEntry
+ || t.Kind == Token::TK_Error) {
+ return Value = new (getAllocator()) NullNode(Doc);
+ }
+
+ if (t.Kind != Token::TK_Value) {
+ setError("Unexpected token in Key Value.", t);
+ return Value = new (getAllocator()) NullNode(Doc);
+ }
+ getNext(); // skip TK_Value.
+ }
+
+ // Handle explicit null values.
+ Token &t = peekNext();
+ if (t.Kind == Token::TK_BlockEnd || t.Kind == Token::TK_Key) {
+ return Value = new (getAllocator()) NullNode(Doc);
+ }
+
+ // We got a normal value.
+ return Value = parseBlockNode();
+}
+
+void MappingNode::increment() {
+ if (failed()) {
+ IsAtEnd = true;
+ CurrentEntry = nullptr;
+ return;
+ }
+ if (CurrentEntry) {
+ CurrentEntry->skip();
+ if (Type == MT_Inline) {
+ IsAtEnd = true;
+ CurrentEntry = nullptr;
+ return;
+ }
+ }
+ Token T = peekNext();
+ if (T.Kind == Token::TK_Key || T.Kind == Token::TK_Scalar) {
+ // KeyValueNode eats the TK_Key. That way it can detect null keys.
+ CurrentEntry = new (getAllocator()) KeyValueNode(Doc);
+ } else if (Type == MT_Block) {
+ switch (T.Kind) {
+ case Token::TK_BlockEnd:
+ getNext();
+ IsAtEnd = true;
+ CurrentEntry = nullptr;
+ break;
+ default:
+ setError("Unexpected token. Expected Key or Block End", T);
+ case Token::TK_Error:
+ IsAtEnd = true;
+ CurrentEntry = nullptr;
+ }
+ } else {
+ switch (T.Kind) {
+ case Token::TK_FlowEntry:
+ // Eat the flow entry and recurse.
+ getNext();
+ return increment();
+ case Token::TK_FlowMappingEnd:
+ getNext();
+ case Token::TK_Error:
+ // Set this to end iterator.
+ IsAtEnd = true;
+ CurrentEntry = nullptr;
+ break;
+ default:
+ setError( "Unexpected token. Expected Key, Flow Entry, or Flow "
+ "Mapping End."
+ , T);
+ IsAtEnd = true;
+ CurrentEntry = nullptr;
+ }
+ }
+}
+
+void SequenceNode::increment() {
+ if (failed()) {
+ IsAtEnd = true;
+ CurrentEntry = nullptr;
+ return;
+ }
+ if (CurrentEntry)
+ CurrentEntry->skip();
+ Token T = peekNext();
+ if (SeqType == ST_Block) {
+ switch (T.Kind) {
+ case Token::TK_BlockEntry:
+ getNext();
+ CurrentEntry = parseBlockNode();
+ if (!CurrentEntry) { // An error occurred.
+ IsAtEnd = true;
+ CurrentEntry = nullptr;
+ }
+ break;
+ case Token::TK_BlockEnd:
+ getNext();
+ IsAtEnd = true;
+ CurrentEntry = nullptr;
+ break;
+ default:
+ setError( "Unexpected token. Expected Block Entry or Block End."
+ , T);
+ case Token::TK_Error:
+ IsAtEnd = true;
+ CurrentEntry = nullptr;
+ }
+ } else if (SeqType == ST_Indentless) {
+ switch (T.Kind) {
+ case Token::TK_BlockEntry:
+ getNext();
+ CurrentEntry = parseBlockNode();
+ if (!CurrentEntry) { // An error occurred.
+ IsAtEnd = true;
+ CurrentEntry = nullptr;
+ }
+ break;
+ default:
+ case Token::TK_Error:
+ IsAtEnd = true;
+ CurrentEntry = nullptr;
+ }
+ } else if (SeqType == ST_Flow) {
+ switch (T.Kind) {
+ case Token::TK_FlowEntry:
+ // Eat the flow entry and recurse.
+ getNext();
+ WasPreviousTokenFlowEntry = true;
+ return increment();
+ case Token::TK_FlowSequenceEnd:
+ getNext();
+ case Token::TK_Error:
+ // Set this to end iterator.
+ IsAtEnd = true;
+ CurrentEntry = nullptr;
+ break;
+ case Token::TK_StreamEnd:
+ case Token::TK_DocumentEnd:
+ case Token::TK_DocumentStart:
+ setError("Could not find closing ]!", T);
+ // Set this to end iterator.
+ IsAtEnd = true;
+ CurrentEntry = nullptr;
+ break;
+ default:
+ if (!WasPreviousTokenFlowEntry) {
+ setError("Expected , between entries!", T);
+ IsAtEnd = true;
+ CurrentEntry = nullptr;
+ break;
+ }
+ // Otherwise it must be a flow entry.
+ CurrentEntry = parseBlockNode();
+ if (!CurrentEntry) {
+ IsAtEnd = true;
+ }
+ WasPreviousTokenFlowEntry = false;
+ break;
+ }
+ }
+}
+
+Document::Document(Stream &S) : stream(S), Root(nullptr) {
+  // The tag map starts with two default mappings.
+ TagMap["!"] = "!";
+ TagMap["!!"] = "tag:yaml.org,2002:";
+
+ if (parseDirectives())
+ expectToken(Token::TK_DocumentStart);
+ Token &T = peekNext();
+ if (T.Kind == Token::TK_DocumentStart)
+ getNext();
+}
+
+bool Document::skip() {
+ if (stream.scanner->failed())
+ return false;
+ if (!Root)
+ getRoot();
+ Root->skip();
+ Token &T = peekNext();
+ if (T.Kind == Token::TK_StreamEnd)
+ return false;
+ if (T.Kind == Token::TK_DocumentEnd) {
+ getNext();
+ return skip();
+ }
+ return true;
+}
+
+Token &Document::peekNext() {
+ return stream.scanner->peekNext();
+}
+
+Token Document::getNext() {
+ return stream.scanner->getNext();
+}
+
+void Document::setError(const Twine &Message, Token &Location) const {
+ stream.scanner->setError(Message, Location.Range.begin());
+}
+
+bool Document::failed() const {
+ return stream.scanner->failed();
+}
+
+Node *Document::parseBlockNode() {
+ Token T = peekNext();
+ // Handle properties.
+ Token AnchorInfo;
+ Token TagInfo;
+parse_property:
+ switch (T.Kind) {
+ case Token::TK_Alias:
+ getNext();
+ return new (NodeAllocator) AliasNode(stream.CurrentDoc, T.Range.substr(1));
+ case Token::TK_Anchor:
+ if (AnchorInfo.Kind == Token::TK_Anchor) {
+ setError("Already encountered an anchor for this node!", T);
+ return nullptr;
+ }
+ AnchorInfo = getNext(); // Consume TK_Anchor.
+ T = peekNext();
+ goto parse_property;
+ case Token::TK_Tag:
+ if (TagInfo.Kind == Token::TK_Tag) {
+ setError("Already encountered a tag for this node!", T);
+ return nullptr;
+ }
+ TagInfo = getNext(); // Consume TK_Tag.
+ T = peekNext();
+ goto parse_property;
+ default:
+ break;
+ }
+
+ switch (T.Kind) {
+ case Token::TK_BlockEntry:
+ // We got an unindented BlockEntry sequence. This is not terminated with
+ // a BlockEnd.
+ // Don't eat the TK_BlockEntry, SequenceNode needs it.
+ return new (NodeAllocator) SequenceNode( stream.CurrentDoc
+ , AnchorInfo.Range.substr(1)
+ , TagInfo.Range
+ , SequenceNode::ST_Indentless);
+ case Token::TK_BlockSequenceStart:
+ getNext();
+ return new (NodeAllocator)
+ SequenceNode( stream.CurrentDoc
+ , AnchorInfo.Range.substr(1)
+ , TagInfo.Range
+ , SequenceNode::ST_Block);
+ case Token::TK_BlockMappingStart:
+ getNext();
+ return new (NodeAllocator)
+ MappingNode( stream.CurrentDoc
+ , AnchorInfo.Range.substr(1)
+ , TagInfo.Range
+ , MappingNode::MT_Block);
+ case Token::TK_FlowSequenceStart:
+ getNext();
+ return new (NodeAllocator)
+ SequenceNode( stream.CurrentDoc
+ , AnchorInfo.Range.substr(1)
+ , TagInfo.Range
+ , SequenceNode::ST_Flow);
+ case Token::TK_FlowMappingStart:
+ getNext();
+ return new (NodeAllocator)
+ MappingNode( stream.CurrentDoc
+ , AnchorInfo.Range.substr(1)
+ , TagInfo.Range
+ , MappingNode::MT_Flow);
+ case Token::TK_Scalar:
+ getNext();
+ return new (NodeAllocator)
+ ScalarNode( stream.CurrentDoc
+ , AnchorInfo.Range.substr(1)
+ , TagInfo.Range
+ , T.Range);
+ case Token::TK_BlockScalar: {
+ getNext();
+ StringRef NullTerminatedStr(T.Value.c_str(), T.Value.length() + 1);
+ StringRef StrCopy = NullTerminatedStr.copy(NodeAllocator).drop_back();
+ return new (NodeAllocator)
+ BlockScalarNode(stream.CurrentDoc, AnchorInfo.Range.substr(1),
+ TagInfo.Range, StrCopy, T.Range);
+ }
+ case Token::TK_Key:
+ // Don't eat the TK_Key, KeyValueNode expects it.
+ return new (NodeAllocator)
+ MappingNode( stream.CurrentDoc
+ , AnchorInfo.Range.substr(1)
+ , TagInfo.Range
+ , MappingNode::MT_Inline);
+ case Token::TK_DocumentStart:
+ case Token::TK_DocumentEnd:
+ case Token::TK_StreamEnd:
+ default:
+ // TODO: Properly handle tags. "[!!str ]" should resolve to !!str "", not
+ // !!null null.
+ return new (NodeAllocator) NullNode(stream.CurrentDoc);
+ case Token::TK_Error:
+ return nullptr;
+ }
+ llvm_unreachable("Control flow shouldn't reach here.");
+ return nullptr;
+}
+
+bool Document::parseDirectives() {
+ bool isDirective = false;
+ while (true) {
+ Token T = peekNext();
+ if (T.Kind == Token::TK_TagDirective) {
+ parseTAGDirective();
+ isDirective = true;
+ } else if (T.Kind == Token::TK_VersionDirective) {
+ parseYAMLDirective();
+ isDirective = true;
+ } else
+ break;
+ }
+ return isDirective;
+}
+
+void Document::parseYAMLDirective() {
+ getNext(); // Eat %YAML <version>
+}
+
+void Document::parseTAGDirective() {
+ Token Tag = getNext(); // %TAG <handle> <prefix>
+ StringRef T = Tag.Range;
+ // Strip %TAG
+ T = T.substr(T.find_first_of(" \t")).ltrim(" \t");
+ std::size_t HandleEnd = T.find_first_of(" \t");
+ StringRef TagHandle = T.substr(0, HandleEnd);
+ StringRef TagPrefix = T.substr(HandleEnd).ltrim(" \t");
+ TagMap[TagHandle] = TagPrefix;
+}
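+// Example of the directive handled above: the line
+//
+//   %TAG !e! tag:example.com,2014:
+//
+// adds TagMap["!e!"] = "tag:example.com,2014:", which getVerbatimTag() later
+// uses to expand a node tagged "!e!foo" into "tag:example.com,2014:foo".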
+
+bool Document::expectToken(int TK) {
+ Token T = getNext();
+ if (T.Kind != TK) {
+ setError("Unexpected token", T);
+ return false;
+ }
+ return true;
+}
diff --git a/ext/src/llvm/YAMLTraits.cpp b/ext/src/llvm/YAMLTraits.cpp
new file mode 100644
index 0000000..5af4efe
--- /dev/null
+++ b/ext/src/llvm/YAMLTraits.cpp
@@ -0,0 +1,1021 @@
+//===- lib/Support/YAMLTraits.cpp -----------------------------------------===//
+//
+// The LLVM Linker
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/YAMLTraits.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Errc.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/LineIterator.h"
+#include "llvm/Support/YAMLParser.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cctype>
+#include <cstring>
+using namespace llvm;
+using namespace yaml;
+
+//===----------------------------------------------------------------------===//
+// IO
+//===----------------------------------------------------------------------===//
+
+IO::IO(void *Context) : Ctxt(Context) {
+}
+
+IO::~IO() {
+}
+
+void *IO::getContext() {
+ return Ctxt;
+}
+
+void IO::setContext(void *Context) {
+ Ctxt = Context;
+}
+
+//===----------------------------------------------------------------------===//
+// Input
+//===----------------------------------------------------------------------===//
+
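+// Illustrative sketch of how this Input implementation is normally driven
+// through the traits layer declared in YAMLTraits.h; Point and its
+// MappingTraits specialization are hypothetical and shown only for context:
+//
+//   struct Point { int X, Y; };
+//   namespace llvm { namespace yaml {
+//   template <> struct MappingTraits<Point> {
+//     static void mapping(IO &Io, Point &P) {
+//       Io.mapRequired("x", P.X);
+//       Io.mapRequired("y", P.Y);
+//     }
+//   };
+//   } } // namespace llvm::yaml
+//
+//   Input Yin("{x: 1, y: 2}");
+//   Point P;
+//   Yin >> P;                // drives beginMapping()/preflightKey()/... below
+//   if (Yin.error()) { /* parse or schema error */ }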
+Input::Input(StringRef InputContent,
+ void *Ctxt,
+ SourceMgr::DiagHandlerTy DiagHandler,
+ void *DiagHandlerCtxt)
+ : IO(Ctxt),
+ Strm(new Stream(InputContent, SrcMgr)),
+ CurrentNode(nullptr) {
+ if (DiagHandler)
+ SrcMgr.setDiagHandler(DiagHandler, DiagHandlerCtxt);
+ DocIterator = Strm->begin();
+}
+
+Input::Input(MemoryBufferRef InputContent,
+ void *Ctxt,
+ SourceMgr::DiagHandlerTy DiagHandler,
+ void *DiagHandlerCtxt)
+ : IO(Ctxt),
+ Strm(new Stream(InputContent, SrcMgr)),
+ CurrentNode(nullptr) {
+ if (DiagHandler)
+ SrcMgr.setDiagHandler(DiagHandler, DiagHandlerCtxt);
+ DocIterator = Strm->begin();
+}
+
+Input::~Input() {
+}
+
+std::error_code Input::error() { return EC; }
+
+// Pin the vtables to this file.
+void Input::HNode::anchor() {}
+void Input::EmptyHNode::anchor() {}
+void Input::ScalarHNode::anchor() {}
+void Input::MapHNode::anchor() {}
+void Input::SequenceHNode::anchor() {}
+
+bool Input::outputting() {
+ return false;
+}
+
+bool Input::setCurrentDocument() {
+ if (DocIterator != Strm->end()) {
+ Node *N = DocIterator->getRoot();
+ if (!N) {
+ assert(Strm->failed() && "Root is NULL iff parsing failed");
+ EC = make_error_code(errc::invalid_argument);
+ return false;
+ }
+
+ if (isa<NullNode>(N)) {
+ // Empty files are allowed and ignored
+ ++DocIterator;
+ return setCurrentDocument();
+ }
+ TopNode = this->createHNodes(N);
+ CurrentNode = TopNode.get();
+ return true;
+ }
+ return false;
+}
+
+bool Input::nextDocument() {
+ return ++DocIterator != Strm->end();
+}
+
+const Node *Input::getCurrentNode() const {
+ return CurrentNode ? CurrentNode->_node : nullptr;
+}
+
+bool Input::mapTag(StringRef Tag, bool Default) {
+ std::string foundTag = CurrentNode->_node->getVerbatimTag();
+ if (foundTag.empty()) {
+    // If no tag was found and 'Tag' is the default, say it was found.
+ return Default;
+ }
+ // Return true iff found tag matches supplied tag.
+ return Tag.equals(foundTag);
+}
+
+void Input::beginMapping() {
+ if (EC)
+ return;
+ // CurrentNode can be null if the document is empty.
+ MapHNode *MN = dyn_cast_or_null<MapHNode>(CurrentNode);
+ if (MN) {
+ MN->ValidKeys.clear();
+ }
+}
+
+bool Input::preflightKey(const char *Key, bool Required, bool, bool &UseDefault,
+ void *&SaveInfo) {
+ UseDefault = false;
+ if (EC)
+ return false;
+
+  // CurrentNode is null for empty documents, which is an error if required
+  // nodes are present.
+ if (!CurrentNode) {
+ if (Required)
+ EC = make_error_code(errc::invalid_argument);
+ return false;
+ }
+
+ MapHNode *MN = dyn_cast<MapHNode>(CurrentNode);
+ if (!MN) {
+ setError(CurrentNode, "not a mapping");
+ return false;
+ }
+ MN->ValidKeys.push_back(Key);
+ HNode *Value = MN->Mapping[Key].get();
+ if (!Value) {
+ if (Required)
+ setError(CurrentNode, Twine("missing required key '") + Key + "'");
+ else
+ UseDefault = true;
+ return false;
+ }
+ SaveInfo = CurrentNode;
+ CurrentNode = Value;
+ return true;
+}
+
+void Input::postflightKey(void *saveInfo) {
+ CurrentNode = reinterpret_cast<HNode *>(saveInfo);
+}
+
+void Input::endMapping() {
+ if (EC)
+ return;
+ // CurrentNode can be null if the document is empty.
+ MapHNode *MN = dyn_cast_or_null<MapHNode>(CurrentNode);
+ if (!MN)
+ return;
+ for (const auto &NN : MN->Mapping) {
+ if (!MN->isValidKey(NN.first())) {
+ setError(NN.second.get(), Twine("unknown key '") + NN.first() + "'");
+ break;
+ }
+ }
+}
+
+std::vector<StringRef> Input::getKeys() const {
+ std::vector<StringRef> res;
+
+ if (EC)
+ return res;
+ // CurrentNode can be null if the document is empty.
+ MapHNode *MN = dyn_cast_or_null<MapHNode>(CurrentNode);
+ if (!MN)
+ return res;
+
+ for (const auto &NN : MN->Mapping) {
+ res.push_back(NN.first());
+ }
+
+ return res;
+}
+
+void Input::beginFlowMapping() { beginMapping(); }
+
+void Input::endFlowMapping() { endMapping(); }
+
+unsigned Input::beginSequence() {
+ if (SequenceHNode *SQ = dyn_cast<SequenceHNode>(CurrentNode))
+ return SQ->Entries.size();
+ if (isa<EmptyHNode>(CurrentNode))
+ return 0;
+  // Treat the case where there's a scalar "null" value as an empty sequence.
+ if (ScalarHNode *SN = dyn_cast<ScalarHNode>(CurrentNode)) {
+ if (isNull(SN->value()))
+ return 0;
+ }
+ // Any other type of HNode is an error.
+ setError(CurrentNode, "not a sequence");
+ return 0;
+}
+
+void Input::endSequence() {
+}
+
+bool Input::preflightElement(unsigned Index, void *&SaveInfo) {
+ if (EC)
+ return false;
+ if (SequenceHNode *SQ = dyn_cast<SequenceHNode>(CurrentNode)) {
+ SaveInfo = CurrentNode;
+ CurrentNode = SQ->Entries[Index].get();
+ return true;
+ }
+ return false;
+}
+
+void Input::postflightElement(void *SaveInfo) {
+ CurrentNode = reinterpret_cast<HNode *>(SaveInfo);
+}
+
+unsigned Input::beginFlowSequence() { return beginSequence(); }
+
+bool Input::preflightFlowElement(unsigned index, void *&SaveInfo) {
+ if (EC)
+ return false;
+ if (SequenceHNode *SQ = dyn_cast<SequenceHNode>(CurrentNode)) {
+ SaveInfo = CurrentNode;
+ CurrentNode = SQ->Entries[index].get();
+ return true;
+ }
+ return false;
+}
+
+void Input::postflightFlowElement(void *SaveInfo) {
+ CurrentNode = reinterpret_cast<HNode *>(SaveInfo);
+}
+
+void Input::endFlowSequence() {
+}
+
+void Input::beginEnumScalar() {
+ ScalarMatchFound = false;
+}
+
+bool Input::matchEnumScalar(const char *Str, bool) {
+ if (ScalarMatchFound)
+ return false;
+ if (ScalarHNode *SN = dyn_cast<ScalarHNode>(CurrentNode)) {
+ if (SN->value().equals(Str)) {
+ ScalarMatchFound = true;
+ return true;
+ }
+ }
+ return false;
+}
+
+bool Input::matchEnumFallback() {
+ if (ScalarMatchFound)
+ return false;
+ ScalarMatchFound = true;
+ return true;
+}
+
+void Input::endEnumScalar() {
+ if (!ScalarMatchFound) {
+ setError(CurrentNode, "unknown enumerated scalar");
+ }
+}
+
+bool Input::beginBitSetScalar(bool &DoClear) {
+ BitValuesUsed.clear();
+ if (SequenceHNode *SQ = dyn_cast<SequenceHNode>(CurrentNode)) {
+ BitValuesUsed.insert(BitValuesUsed.begin(), SQ->Entries.size(), false);
+ } else {
+ setError(CurrentNode, "expected sequence of bit values");
+ }
+ DoClear = true;
+ return true;
+}
+
+bool Input::bitSetMatch(const char *Str, bool) {
+ if (EC)
+ return false;
+ if (SequenceHNode *SQ = dyn_cast<SequenceHNode>(CurrentNode)) {
+ unsigned Index = 0;
+ for (auto &N : SQ->Entries) {
+ if (ScalarHNode *SN = dyn_cast<ScalarHNode>(N.get())) {
+ if (SN->value().equals(Str)) {
+ BitValuesUsed[Index] = true;
+ return true;
+ }
+ } else {
+ setError(CurrentNode, "unexpected scalar in sequence of bit values");
+ }
+ ++Index;
+ }
+ } else {
+ setError(CurrentNode, "expected sequence of bit values");
+ }
+ return false;
+}
+
+void Input::endBitSetScalar() {
+ if (EC)
+ return;
+ if (SequenceHNode *SQ = dyn_cast<SequenceHNode>(CurrentNode)) {
+ assert(BitValuesUsed.size() == SQ->Entries.size());
+ for (unsigned i = 0; i < SQ->Entries.size(); ++i) {
+ if (!BitValuesUsed[i]) {
+ setError(SQ->Entries[i].get(), "unknown bit value");
+ return;
+ }
+ }
+ }
+}
+
+void Input::scalarString(StringRef &S, bool) {
+ if (ScalarHNode *SN = dyn_cast<ScalarHNode>(CurrentNode)) {
+ S = SN->value();
+ } else {
+ setError(CurrentNode, "unexpected scalar");
+ }
+}
+
+void Input::blockScalarString(StringRef &S) { scalarString(S, false); }
+
+void Input::setError(HNode *hnode, const Twine &message) {
+ assert(hnode && "HNode must not be NULL");
+ this->setError(hnode->_node, message);
+}
+
+void Input::setError(Node *node, const Twine &message) {
+ Strm->printError(node, message);
+ EC = make_error_code(errc::invalid_argument);
+}
+
+std::unique_ptr<Input::HNode> Input::createHNodes(Node *N) {
+ SmallString<128> StringStorage;
+ if (ScalarNode *SN = dyn_cast<ScalarNode>(N)) {
+ StringRef KeyStr = SN->getValue(StringStorage);
+ if (!StringStorage.empty()) {
+ // Copy string to permanent storage
+ KeyStr = StringStorage.str().copy(StringAllocator);
+ }
+ return llvm::make_unique<ScalarHNode>(N, KeyStr);
+ } else if (BlockScalarNode *BSN = dyn_cast<BlockScalarNode>(N)) {
+ StringRef ValueCopy = BSN->getValue().copy(StringAllocator);
+ return llvm::make_unique<ScalarHNode>(N, ValueCopy);
+ } else if (SequenceNode *SQ = dyn_cast<SequenceNode>(N)) {
+ auto SQHNode = llvm::make_unique<SequenceHNode>(N);
+ for (Node &SN : *SQ) {
+ auto Entry = this->createHNodes(&SN);
+ if (EC)
+ break;
+ SQHNode->Entries.push_back(std::move(Entry));
+ }
+ return std::move(SQHNode);
+ } else if (MappingNode *Map = dyn_cast<MappingNode>(N)) {
+ auto mapHNode = llvm::make_unique<MapHNode>(N);
+ for (KeyValueNode &KVN : *Map) {
+ Node *KeyNode = KVN.getKey();
+ ScalarNode *KeyScalar = dyn_cast<ScalarNode>(KeyNode);
+ if (!KeyScalar) {
+ setError(KeyNode, "Map key must be a scalar");
+ break;
+ }
+ StringStorage.clear();
+ StringRef KeyStr = KeyScalar->getValue(StringStorage);
+ if (!StringStorage.empty()) {
+ // Copy string to permanent storage
+ KeyStr = StringStorage.str().copy(StringAllocator);
+ }
+ auto ValueHNode = this->createHNodes(KVN.getValue());
+ if (EC)
+ break;
+ mapHNode->Mapping[KeyStr] = std::move(ValueHNode);
+ }
+ return std::move(mapHNode);
+ } else if (isa<NullNode>(N)) {
+ return llvm::make_unique<EmptyHNode>(N);
+ } else {
+ setError(N, "unknown node kind");
+ return nullptr;
+ }
+}
+
+bool Input::MapHNode::isValidKey(StringRef Key) {
+ for (const char *K : ValidKeys) {
+ if (Key.equals(K))
+ return true;
+ }
+ return false;
+}
+
+void Input::setError(const Twine &Message) {
+ this->setError(CurrentNode, Message);
+}
+
+bool Input::canElideEmptySequence() {
+ return false;
+}
+
+//===----------------------------------------------------------------------===//
+// Output
+//===----------------------------------------------------------------------===//
+
+Output::Output(raw_ostream &yout, void *context, int WrapColumn)
+ : IO(context),
+ Out(yout),
+ WrapColumn(WrapColumn),
+ Column(0),
+ ColumnAtFlowStart(0),
+ ColumnAtMapFlowStart(0),
+ NeedBitValueComma(false),
+ NeedFlowSequenceComma(false),
+ EnumerationMatchFound(false),
+ NeedsNewLine(false) {
+}
+
+Output::~Output() {
+}
+
+bool Output::outputting() {
+ return true;
+}
+
+void Output::beginMapping() {
+ StateStack.push_back(inMapFirstKey);
+ NeedsNewLine = true;
+}
+
+std::vector<StringRef> Output::getKeys() const {
+ return std::vector<StringRef>();
+}
+
+bool Output::mapTag(StringRef Tag, bool Use) {
+ if (Use) {
+ this->output(" ");
+ this->output(Tag);
+ }
+ return Use;
+}
+
+void Output::endMapping() {
+ StateStack.pop_back();
+}
+
+bool Output::preflightKey(const char *Key, bool Required, bool SameAsDefault,
+ bool &UseDefault, void *&) {
+ UseDefault = false;
+ if (Required || !SameAsDefault) {
+ auto State = StateStack.back();
+ if (State == inFlowMapFirstKey || State == inFlowMapOtherKey) {
+ flowKey(Key);
+ } else {
+ this->newLineCheck();
+ this->paddedKey(Key);
+ }
+ return true;
+ }
+ return false;
+}
+
+void Output::postflightKey(void *) {
+ if (StateStack.back() == inMapFirstKey) {
+ StateStack.pop_back();
+ StateStack.push_back(inMapOtherKey);
+ } else if (StateStack.back() == inFlowMapFirstKey) {
+ StateStack.pop_back();
+ StateStack.push_back(inFlowMapOtherKey);
+ }
+}
+
+void Output::beginFlowMapping() {
+ StateStack.push_back(inFlowMapFirstKey);
+ this->newLineCheck();
+ ColumnAtMapFlowStart = Column;
+ output("{ ");
+}
+
+void Output::endFlowMapping() {
+ StateStack.pop_back();
+ this->outputUpToEndOfLine(" }");
+}
+
+void Output::beginDocuments() {
+ this->outputUpToEndOfLine("---");
+}
+
+bool Output::preflightDocument(unsigned index) {
+ if (index > 0)
+ this->outputUpToEndOfLine("\n---");
+ return true;
+}
+
+void Output::postflightDocument() {
+}
+
+void Output::endDocuments() {
+ output("\n...\n");
+}
+
+unsigned Output::beginSequence() {
+ StateStack.push_back(inSeq);
+ NeedsNewLine = true;
+ return 0;
+}
+
+void Output::endSequence() {
+ StateStack.pop_back();
+}
+
+bool Output::preflightElement(unsigned, void *&) {
+ return true;
+}
+
+void Output::postflightElement(void *) {
+}
+
+unsigned Output::beginFlowSequence() {
+ StateStack.push_back(inFlowSeq);
+ this->newLineCheck();
+ ColumnAtFlowStart = Column;
+ output("[ ");
+ NeedFlowSequenceComma = false;
+ return 0;
+}
+
+void Output::endFlowSequence() {
+ StateStack.pop_back();
+ this->outputUpToEndOfLine(" ]");
+}
+
+bool Output::preflightFlowElement(unsigned, void *&) {
+ if (NeedFlowSequenceComma)
+ output(", ");
+ if (WrapColumn && Column > WrapColumn) {
+ output("\n");
+ for (int i = 0; i < ColumnAtFlowStart; ++i)
+ output(" ");
+ Column = ColumnAtFlowStart;
+ output(" ");
+ }
+ return true;
+}
+
+void Output::postflightFlowElement(void *) {
+ NeedFlowSequenceComma = true;
+}
+
+void Output::beginEnumScalar() {
+ EnumerationMatchFound = false;
+}
+
+bool Output::matchEnumScalar(const char *Str, bool Match) {
+ if (Match && !EnumerationMatchFound) {
+ this->newLineCheck();
+ this->outputUpToEndOfLine(Str);
+ EnumerationMatchFound = true;
+ }
+ return false;
+}
+
+bool Output::matchEnumFallback() {
+ if (EnumerationMatchFound)
+ return false;
+ EnumerationMatchFound = true;
+ return true;
+}
+
+void Output::endEnumScalar() {
+ if (!EnumerationMatchFound)
+ llvm_unreachable("bad runtime enum value");
+}
+
+bool Output::beginBitSetScalar(bool &DoClear) {
+ this->newLineCheck();
+ output("[ ");
+ NeedBitValueComma = false;
+ DoClear = false;
+ return true;
+}
+
+bool Output::bitSetMatch(const char *Str, bool Matches) {
+ if (Matches) {
+ if (NeedBitValueComma)
+ output(", ");
+ this->output(Str);
+ NeedBitValueComma = true;
+ }
+ return false;
+}
+
+void Output::endBitSetScalar() {
+ this->outputUpToEndOfLine(" ]");
+}
+
+void Output::scalarString(StringRef &S, bool MustQuote) {
+ this->newLineCheck();
+ if (S.empty()) {
+ // Print '' for the empty string because leaving the field empty is not
+ // allowed.
+ this->outputUpToEndOfLine("''");
+ return;
+ }
+ if (!MustQuote) {
+ // Only quote if we must.
+ this->outputUpToEndOfLine(S);
+ return;
+ }
+ unsigned i = 0;
+ unsigned j = 0;
+ unsigned End = S.size();
+ output("'"); // Starting single quote.
+ const char *Base = S.data();
+ while (j < End) {
+ // Escape a single quote by doubling it.
+ if (S[j] == '\'') {
+ output(StringRef(&Base[i], j - i + 1));
+ output("'");
+ i = j + 1;
+ }
+ ++j;
+ }
+ output(StringRef(&Base[i], j - i));
+ this->outputUpToEndOfLine("'"); // Ending single quote.
+}
+
+void Output::blockScalarString(StringRef &S) {
+ if (!StateStack.empty())
+ newLineCheck();
+ output(" |");
+ outputNewLine();
+
+ unsigned Indent = StateStack.empty() ? 1 : StateStack.size();
+
+ auto Buffer = MemoryBuffer::getMemBuffer(S, "", false);
+ for (line_iterator Lines(*Buffer, false); !Lines.is_at_end(); ++Lines) {
+ for (unsigned I = 0; I < Indent; ++I) {
+ output(" ");
+ }
+ output(*Lines);
+ outputNewLine();
+ }
+}
+
+void Output::setError(const Twine &message) {
+}
+
+bool Output::canElideEmptySequence() {
+ // Normally, with an optional key/value where the value is an empty sequence,
+ // the whole key/value can be omitted. But that produces invalid YAML if the
+ // key/value is the only thing in the map and the map is used in a sequence.
+ // This detects whether this sequence is the first key/value in a map that is
+ // itself embedded in a sequence.
+ if (StateStack.size() < 2)
+ return true;
+ if (StateStack.back() != inMapFirstKey)
+ return true;
+ return (StateStack[StateStack.size()-2] != inSeq);
+}
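A minimal sketch of the corner case the comment above guards against, driven through LLVM's YAML I/O layer. The Job struct, its "deps" field, and the sequence-traits macro invocations are invented for illustration and assume no conflicting traits are declared elsewhere for these types:

    #include "llvm/Support/YAMLTraits.h"
    #include "llvm/Support/raw_ostream.h"
    #include <cstdint>
    #include <vector>

    struct Job {
      std::vector<uint32_t> Deps; // optional and frequently empty
    };

    LLVM_YAML_IS_SEQUENCE_VECTOR(Job)
    LLVM_YAML_IS_FLOW_SEQUENCE_VECTOR(uint32_t)

    namespace llvm {
    namespace yaml {
    template <> struct MappingTraits<Job> {
      static void mapping(IO &io, Job &j) {
        // mapOptional() drops the key for an empty sequence only when
        // canElideEmptySequence() permits it.
        io.mapOptional("deps", j.Deps);
      }
    };
    } // namespace yaml
    } // namespace llvm

    int main() {
      std::vector<Job> Jobs(2); // two jobs, both with empty deps
      llvm::yaml::Output yout(llvm::outs());
      yout << Jobs; // each job still emits a "- deps:" entry rather than
                    // collapsing to nothing, which is what the check above ensures
    }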
+
+void Output::output(StringRef s) {
+ Column += s.size();
+ Out << s;
+}
+
+void Output::outputUpToEndOfLine(StringRef s) {
+ this->output(s);
+ if (StateStack.empty() || (StateStack.back() != inFlowSeq &&
+ StateStack.back() != inFlowMapFirstKey &&
+ StateStack.back() != inFlowMapOtherKey))
+ NeedsNewLine = true;
+}
+
+void Output::outputNewLine() {
+ Out << "\n";
+ Column = 0;
+}
+
+// if seq at top, indent as if map, then add "- "
+// if seq in middle, use "- " if firstKey, else use " "
+//
+
+void Output::newLineCheck() {
+ if (!NeedsNewLine)
+ return;
+ NeedsNewLine = false;
+
+ this->outputNewLine();
+
+ assert(StateStack.size() > 0);
+ unsigned Indent = StateStack.size() - 1;
+ bool OutputDash = false;
+
+ if (StateStack.back() == inSeq) {
+ OutputDash = true;
+ } else if ((StateStack.size() > 1) && ((StateStack.back() == inMapFirstKey) ||
+ (StateStack.back() == inFlowSeq) ||
+ (StateStack.back() == inFlowMapFirstKey)) &&
+ (StateStack[StateStack.size() - 2] == inSeq)) {
+ --Indent;
+ OutputDash = true;
+ }
+
+ for (unsigned i = 0; i < Indent; ++i) {
+ output(" ");
+ }
+ if (OutputDash) {
+ output("- ");
+ }
+
+}
+
+void Output::paddedKey(StringRef key) {
+ output(key);
+ output(":");
+ const char *spaces = " ";
+ if (key.size() < strlen(spaces))
+ output(&spaces[key.size()]);
+ else
+ output(" ");
+}
+
+void Output::flowKey(StringRef Key) {
+ if (StateStack.back() == inFlowMapOtherKey)
+ output(", ");
+ if (WrapColumn && Column > WrapColumn) {
+ output("\n");
+ for (int I = 0; I < ColumnAtMapFlowStart; ++I)
+ output(" ");
+ Column = ColumnAtMapFlowStart;
+ output(" ");
+ }
+ output(Key);
+ output(": ");
+}
+
+//===----------------------------------------------------------------------===//
+// traits for built-in types
+//===----------------------------------------------------------------------===//
+
+void ScalarTraits<bool>::output(const bool &Val, void *, raw_ostream &Out) {
+ Out << (Val ? "true" : "false");
+}
+
+StringRef ScalarTraits<bool>::input(StringRef Scalar, void *, bool &Val) {
+ if (Scalar.equals("true")) {
+ Val = true;
+ return StringRef();
+ } else if (Scalar.equals("false")) {
+ Val = false;
+ return StringRef();
+ }
+ return "invalid boolean";
+}
+
+void ScalarTraits<StringRef>::output(const StringRef &Val, void *,
+ raw_ostream &Out) {
+ Out << Val;
+}
+
+StringRef ScalarTraits<StringRef>::input(StringRef Scalar, void *,
+ StringRef &Val) {
+ Val = Scalar;
+ return StringRef();
+}
+
+void ScalarTraits<std::string>::output(const std::string &Val, void *,
+ raw_ostream &Out) {
+ Out << Val;
+}
+
+StringRef ScalarTraits<std::string>::input(StringRef Scalar, void *,
+ std::string &Val) {
+ Val = Scalar.str();
+ return StringRef();
+}
+
+void ScalarTraits<uint8_t>::output(const uint8_t &Val, void *,
+ raw_ostream &Out) {
+ // use a temporary uint32_t because ostream thinks uint8_t is a character
+ uint32_t Num = Val;
+ Out << Num;
+}
+
+StringRef ScalarTraits<uint8_t>::input(StringRef Scalar, void *, uint8_t &Val) {
+ unsigned long long n;
+ if (getAsUnsignedInteger(Scalar, 0, n))
+ return "invalid number";
+ if (n > 0xFF)
+ return "out of range number";
+ Val = n;
+ return StringRef();
+}
+
+void ScalarTraits<uint16_t>::output(const uint16_t &Val, void *,
+ raw_ostream &Out) {
+ Out << Val;
+}
+
+StringRef ScalarTraits<uint16_t>::input(StringRef Scalar, void *,
+ uint16_t &Val) {
+ unsigned long long n;
+ if (getAsUnsignedInteger(Scalar, 0, n))
+ return "invalid number";
+ if (n > 0xFFFF)
+ return "out of range number";
+ Val = n;
+ return StringRef();
+}
+
+void ScalarTraits<uint32_t>::output(const uint32_t &Val, void *,
+ raw_ostream &Out) {
+ Out << Val;
+}
+
+StringRef ScalarTraits<uint32_t>::input(StringRef Scalar, void *,
+ uint32_t &Val) {
+ unsigned long long n;
+ if (getAsUnsignedInteger(Scalar, 0, n))
+ return "invalid number";
+ if (n > 0xFFFFFFFFUL)
+ return "out of range number";
+ Val = n;
+ return StringRef();
+}
+
+void ScalarTraits<unsigned long>::output(const unsigned long &Val, void *,
+ raw_ostream &Out) {
+ Out << Val;
+}
+
+StringRef ScalarTraits<unsigned long>::input(StringRef Scalar, void *,
+ unsigned long &Val) {
+ unsigned long long N;
+ if (getAsUnsignedInteger(Scalar, 0, N))
+ return "invalid number";
+ if (N > ULONG_MAX)
+ return "out of range number";
+ Val = N;
+ return StringRef();
+}
+
+void ScalarTraits<unsigned long long>::output(const unsigned long long &Val, void *,
+ raw_ostream &Out) {
+ Out << Val;
+}
+
+StringRef ScalarTraits<unsigned long long>::input(StringRef Scalar, void *,
+ unsigned long long &Val) {
+ unsigned long long N;
+ if (getAsUnsignedInteger(Scalar, 0, N))
+ return "invalid number";
+ Val = N;
+ return StringRef();
+}
+
+void ScalarTraits<int8_t>::output(const int8_t &Val, void *, raw_ostream &Out) {
+ // use a temporary int32_t because ostream thinks int8_t is a character
+ int32_t Num = Val;
+ Out << Num;
+}
+
+StringRef ScalarTraits<int8_t>::input(StringRef Scalar, void *, int8_t &Val) {
+ long long N;
+ if (getAsSignedInteger(Scalar, 0, N))
+ return "invalid number";
+ if ((N > 127) || (N < -128))
+ return "out of range number";
+ Val = N;
+ return StringRef();
+}
+
+void ScalarTraits<int16_t>::output(const int16_t &Val, void *,
+ raw_ostream &Out) {
+ Out << Val;
+}
+
+StringRef ScalarTraits<int16_t>::input(StringRef Scalar, void *, int16_t &Val) {
+ long long N;
+ if (getAsSignedInteger(Scalar, 0, N))
+ return "invalid number";
+ if ((N > INT16_MAX) || (N < INT16_MIN))
+ return "out of range number";
+ Val = N;
+ return StringRef();
+}
+
+void ScalarTraits<int32_t>::output(const int32_t &Val, void *,
+ raw_ostream &Out) {
+ Out << Val;
+}
+
+StringRef ScalarTraits<int32_t>::input(StringRef Scalar, void *, int32_t &Val) {
+ long long N;
+ if (getAsSignedInteger(Scalar, 0, N))
+ return "invalid number";
+ if ((N > INT32_MAX) || (N < INT32_MIN))
+ return "out of range number";
+ Val = N;
+ return StringRef();
+}
+
+void ScalarTraits<int64_t>::output(const int64_t &Val, void *,
+ raw_ostream &Out) {
+ Out << Val;
+}
+
+StringRef ScalarTraits<int64_t>::input(StringRef Scalar, void *, int64_t &Val) {
+ long long N;
+ if (getAsSignedInteger(Scalar, 0, N))
+ return "invalid number";
+ Val = N;
+ return StringRef();
+}
+
+void ScalarTraits<double>::output(const double &Val, void *, raw_ostream &Out) {
+ Out << format("%g", Val);
+}
+
+StringRef ScalarTraits<double>::input(StringRef Scalar, void *, double &Val) {
+ SmallString<32> buff(Scalar.begin(), Scalar.end());
+ char *end;
+ Val = strtod(buff.c_str(), &end);
+ if (*end != '\0')
+ return "invalid floating point number";
+ return StringRef();
+}
+
+void ScalarTraits<float>::output(const float &Val, void *, raw_ostream &Out) {
+ Out << format("%g", Val);
+}
+
+StringRef ScalarTraits<float>::input(StringRef Scalar, void *, float &Val) {
+ SmallString<32> buff(Scalar.begin(), Scalar.end());
+ char *end;
+ Val = strtod(buff.c_str(), &end);
+ if (*end != '\0')
+ return "invalid floating point number";
+ return StringRef();
+}
+
+void ScalarTraits<Hex8>::output(const Hex8 &Val, void *, raw_ostream &Out) {
+ uint8_t Num = Val;
+ Out << format("0x%02X", Num);
+}
+
+StringRef ScalarTraits<Hex8>::input(StringRef Scalar, void *, Hex8 &Val) {
+ unsigned long long n;
+ if (getAsUnsignedInteger(Scalar, 0, n))
+ return "invalid hex8 number";
+ if (n > 0xFF)
+ return "out of range hex8 number";
+ Val = n;
+ return StringRef();
+}
+
+void ScalarTraits<Hex16>::output(const Hex16 &Val, void *, raw_ostream &Out) {
+ uint16_t Num = Val;
+ Out << format("0x%04X", Num);
+}
+
+StringRef ScalarTraits<Hex16>::input(StringRef Scalar, void *, Hex16 &Val) {
+ unsigned long long n;
+ if (getAsUnsignedInteger(Scalar, 0, n))
+ return "invalid hex16 number";
+ if (n > 0xFFFF)
+ return "out of range hex16 number";
+ Val = n;
+ return StringRef();
+}
+
+void ScalarTraits<Hex32>::output(const Hex32 &Val, void *, raw_ostream &Out) {
+ uint32_t Num = Val;
+ Out << format("0x%08X", Num);
+}
+
+StringRef ScalarTraits<Hex32>::input(StringRef Scalar, void *, Hex32 &Val) {
+ unsigned long long n;
+ if (getAsUnsignedInteger(Scalar, 0, n))
+ return "invalid hex32 number";
+ if (n > 0xFFFFFFFFUL)
+ return "out of range hex32 number";
+ Val = n;
+ return StringRef();
+}
+
+void ScalarTraits<Hex64>::output(const Hex64 &Val, void *, raw_ostream &Out) {
+ uint64_t Num = Val;
+ Out << format("0x%016llX", Num);
+}
+
+StringRef ScalarTraits<Hex64>::input(StringRef Scalar, void *, Hex64 &Val) {
+ unsigned long long Num;
+ if (getAsUnsignedInteger(Scalar, 0, Num))
+ return "invalid hex64 number";
+ Val = Num;
+ return StringRef();
+}
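For orientation, a small usage sketch of the Input/Output drivers and the built-in ScalarTraits specializations implemented above. The Settings struct, its fields, and the YAML snippet are hypothetical:

    #include "llvm/Support/YAMLTraits.h"
    #include "llvm/Support/raw_ostream.h"
    #include <cstdint>
    #include <string>

    struct Settings {
      std::string Name;
      uint32_t Threads = 1;
      bool Careful = false;
    };

    namespace llvm {
    namespace yaml {
    template <> struct MappingTraits<Settings> {
      static void mapping(IO &io, Settings &s) {
        io.mapRequired("name", s.Name);       // ScalarTraits<std::string>
        io.mapOptional("threads", s.Threads); // ScalarTraits<uint32_t>
        io.mapOptional("careful", s.Careful); // ScalarTraits<bool>
      }
    };
    } // namespace yaml
    } // namespace llvm

    int main() {
      Settings S;
      llvm::yaml::Input yin("{ name: assembly, threads: 16 }");
      yin >> S;                 // drives Input::beginMapping()/preflightKey()/...
      if (yin.error())
        return 1;
      llvm::yaml::Output yout(llvm::outs());
      yout << S;                // drives Output::beginMapping()/scalarString()/...
      return 0;
    }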
diff --git a/ext/src/llvm/raw_ostream.cpp b/ext/src/llvm/raw_ostream.cpp
new file mode 100644
index 0000000..41957fd
--- /dev/null
+++ b/ext/src/llvm/raw_ostream.cpp
@@ -0,0 +1,752 @@
+//===--- raw_ostream.cpp - Implement the raw_ostream classes --------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This implements support for bulk buffered stream output.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/MathExtras.h"
+#include <cctype>
+#include <cerrno>
+#include <sys/stat.h>
+#include <system_error>
+#include <signal.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <sys/uio.h>
+
+#if defined(__CYGWIN__)
+#include <io.h>
+#endif
+
+using namespace llvm;
+
+raw_ostream::~raw_ostream() {
+ // raw_ostream's subclasses should take care to flush the buffer
+ // in their destructors.
+ assert(OutBufCur == OutBufStart &&
+ "raw_ostream destructor called with non-empty buffer!");
+
+ if (BufferMode == InternalBuffer)
+ delete [] OutBufStart;
+}
+
+// An out of line virtual method to provide a home for the class vtable.
+void raw_ostream::handle() {}
+
+size_t raw_ostream::preferred_buffer_size() const {
+ // BUFSIZ is intended to be a reasonable default.
+ return BUFSIZ;
+}
+
+void raw_ostream::SetBuffered() {
+ // Ask the subclass to determine an appropriate buffer size.
+ if (size_t Size = preferred_buffer_size())
+ SetBufferSize(Size);
+ else
+ // It may return 0, meaning this stream should be unbuffered.
+ SetUnbuffered();
+}
+
+void raw_ostream::SetBufferAndMode(char *BufferStart, size_t Size,
+ BufferKind Mode) {
+ assert(((Mode == Unbuffered && !BufferStart && Size == 0) ||
+ (Mode != Unbuffered && BufferStart && Size != 0)) &&
+ "stream must be unbuffered or have at least one byte");
+ // Make sure the current buffer is free of content (we can't flush here; the
+ // child buffer management logic will be in write_impl).
+ assert(GetNumBytesInBuffer() == 0 && "Current buffer is non-empty!");
+
+ if (BufferMode == InternalBuffer)
+ delete [] OutBufStart;
+ OutBufStart = BufferStart;
+ OutBufEnd = OutBufStart+Size;
+ OutBufCur = OutBufStart;
+ BufferMode = Mode;
+
+ assert(OutBufStart <= OutBufEnd && "Invalid size!");
+}
+
+raw_ostream &raw_ostream::operator<<(unsigned long N) {
+ // Zero is a special case.
+ if (N == 0)
+ return *this << '0';
+
+ char NumberBuffer[20];
+ char *EndPtr = NumberBuffer+sizeof(NumberBuffer);
+ char *CurPtr = EndPtr;
+
+ while (N) {
+ *--CurPtr = '0' + char(N % 10);
+ N /= 10;
+ }
+ return write(CurPtr, EndPtr-CurPtr);
+}
+
+raw_ostream &raw_ostream::operator<<(long N) {
+ if (N < 0) {
+ *this << '-';
+ // Avoid undefined behavior on LONG_MIN with a cast.
+ N = -(unsigned long)N;
+ }
+
+ return this->operator<<(static_cast<unsigned long>(N));
+}
+
+raw_ostream &raw_ostream::operator<<(unsigned long long N) {
+ // Output using 32-bit div/mod when possible.
+ if (N == static_cast<unsigned long>(N))
+ return this->operator<<(static_cast<unsigned long>(N));
+
+ char NumberBuffer[20];
+ char *EndPtr = NumberBuffer+sizeof(NumberBuffer);
+ char *CurPtr = EndPtr;
+
+ while (N) {
+ *--CurPtr = '0' + char(N % 10);
+ N /= 10;
+ }
+ return write(CurPtr, EndPtr-CurPtr);
+}
+
+raw_ostream &raw_ostream::operator<<(long long N) {
+ if (N < 0) {
+ *this << '-';
+ // Avoid undefined behavior on INT64_MIN with a cast.
+ N = -(unsigned long long)N;
+ }
+
+ return this->operator<<(static_cast<unsigned long long>(N));
+}
+
+raw_ostream &raw_ostream::write_hex(unsigned long long N) {
+ // Zero is a special case.
+ if (N == 0)
+ return *this << '0';
+
+ char NumberBuffer[20];
+ char *EndPtr = NumberBuffer+sizeof(NumberBuffer);
+ char *CurPtr = EndPtr;
+
+ while (N) {
+ uintptr_t x = N % 16;
+ *--CurPtr = (x < 10 ? '0' + x : 'a' + x - 10);
+ N /= 16;
+ }
+
+ return write(CurPtr, EndPtr-CurPtr);
+}
+
+raw_ostream &raw_ostream::write_escaped(StringRef Str,
+ bool UseHexEscapes) {
+ for (unsigned i = 0, e = Str.size(); i != e; ++i) {
+ unsigned char c = Str[i];
+
+ switch (c) {
+ case '\\':
+ *this << '\\' << '\\';
+ break;
+ case '\t':
+ *this << '\\' << 't';
+ break;
+ case '\n':
+ *this << '\\' << 'n';
+ break;
+ case '"':
+ *this << '\\' << '"';
+ break;
+ default:
+ if (std::isprint(c)) {
+ *this << c;
+ break;
+ }
+
+ // Write out the escaped representation.
+ if (UseHexEscapes) {
+ *this << '\\' << 'x';
+ *this << hexdigit((c >> 4 & 0xF));
+ *this << hexdigit((c >> 0) & 0xF);
+ } else {
+ // Always use a full 3-character octal escape.
+ *this << '\\';
+ *this << char('0' + ((c >> 6) & 7));
+ *this << char('0' + ((c >> 3) & 7));
+ *this << char('0' + ((c >> 0) & 7));
+ }
+ }
+ }
+
+ return *this;
+}
+
+raw_ostream &raw_ostream::operator<<(const void *P) {
+ *this << '0' << 'x';
+
+ return write_hex((uintptr_t) P);
+}
+
+raw_ostream &raw_ostream::operator<<(double N) {
+#ifdef _WIN32
+ // On MSVCRT and compatible runtimes, the output of %e is not POSIX-compatible
+ // by default: the exponent should have at least two digits ("%+03d").
+ // FIXME: Implement our own formatter here or in Support/Format.h!
+#if __cplusplus >= 201103L && defined(__MINGW32__)
+ // FIXME: It should be generic to C++11.
+ if (N == 0.0 && std::signbit(N))
+ return *this << "-0.000000e+00";
+#else
+ int fpcl = _fpclass(N);
+
+ // negative zero
+ if (fpcl == _FPCLASS_NZ)
+ return *this << "-0.000000e+00";
+#endif
+
+ char buf[16];
+ unsigned len;
+ len = format("%e", N).snprint(buf, sizeof(buf));
+ if (len <= sizeof(buf) - 2) {
+ if (len >= 5 && buf[len - 5] == 'e' && buf[len - 3] == '0') {
+ int cs = buf[len - 4];
+ if (cs == '+' || cs == '-') {
+ int c1 = buf[len - 2];
+ int c0 = buf[len - 1];
+ if (isdigit(static_cast<unsigned char>(c1)) &&
+ isdigit(static_cast<unsigned char>(c0))) {
+ // Trim leading '0': "...e+012" -> "...e+12\0"
+ buf[len - 3] = c1;
+ buf[len - 2] = c0;
+ buf[--len] = 0;
+ }
+ }
+ }
+ return this->operator<<(buf);
+ }
+#endif
+ return this->operator<<(format("%e", N));
+}
+
+
+
+void raw_ostream::flush_nonempty() {
+ assert(OutBufCur > OutBufStart && "Invalid call to flush_nonempty.");
+ size_t Length = OutBufCur - OutBufStart;
+ OutBufCur = OutBufStart;
+ write_impl(OutBufStart, Length);
+}
+
+raw_ostream &raw_ostream::write(unsigned char C) {
+ // Group exceptional cases into a single branch.
+ if (LLVM_UNLIKELY(OutBufCur >= OutBufEnd)) {
+ if (LLVM_UNLIKELY(!OutBufStart)) {
+ if (BufferMode == Unbuffered) {
+ write_impl(reinterpret_cast<char*>(&C), 1);
+ return *this;
+ }
+ // Set up a buffer and start over.
+ SetBuffered();
+ return write(C);
+ }
+
+ flush_nonempty();
+ }
+
+ *OutBufCur++ = C;
+ return *this;
+}
+
+raw_ostream &raw_ostream::write(const char *Ptr, size_t Size) {
+ // Group exceptional cases into a single branch.
+ if (LLVM_UNLIKELY(size_t(OutBufEnd - OutBufCur) < Size)) {
+ if (LLVM_UNLIKELY(!OutBufStart)) {
+ if (BufferMode == Unbuffered) {
+ write_impl(Ptr, Size);
+ return *this;
+ }
+ // Set up a buffer and start over.
+ SetBuffered();
+ return write(Ptr, Size);
+ }
+
+ size_t NumBytes = OutBufEnd - OutBufCur;
+
+ // If the buffer is empty at this point we have a string that is larger
+ // than the buffer. Directly write the chunk that is a multiple of the
+ // preferred buffer size and put the remainder in the buffer.
+ if (LLVM_UNLIKELY(OutBufCur == OutBufStart)) {
+ assert(NumBytes != 0 && "undefined behavior");
+ size_t BytesToWrite = Size - (Size % NumBytes);
+ write_impl(Ptr, BytesToWrite);
+ size_t BytesRemaining = Size - BytesToWrite;
+ if (BytesRemaining > size_t(OutBufEnd - OutBufCur)) {
+ // Too much left over to copy into our buffer.
+ return write(Ptr + BytesToWrite, BytesRemaining);
+ }
+ copy_to_buffer(Ptr + BytesToWrite, BytesRemaining);
+ return *this;
+ }
+
+ // We don't have enough space in the buffer to fit the string in. Insert as
+ // much as possible, flush and start over with the remainder.
+ copy_to_buffer(Ptr, NumBytes);
+ flush_nonempty();
+ return write(Ptr + NumBytes, Size - NumBytes);
+ }
+
+ copy_to_buffer(Ptr, Size);
+
+ return *this;
+}
+
+void raw_ostream::copy_to_buffer(const char *Ptr, size_t Size) {
+ assert(Size <= size_t(OutBufEnd - OutBufCur) && "Buffer overrun!");
+
+ // Handle short strings specially; memcpy isn't very good at very short
+ // strings.
+ switch (Size) {
+ case 4: OutBufCur[3] = Ptr[3]; // FALL THROUGH
+ case 3: OutBufCur[2] = Ptr[2]; // FALL THROUGH
+ case 2: OutBufCur[1] = Ptr[1]; // FALL THROUGH
+ case 1: OutBufCur[0] = Ptr[0]; // FALL THROUGH
+ case 0: break;
+ default:
+ memcpy(OutBufCur, Ptr, Size);
+ break;
+ }
+
+ OutBufCur += Size;
+}
+
+// Formatted output.
+raw_ostream &raw_ostream::operator<<(const format_object_base &Fmt) {
+ // If we have more than a few bytes left in our output buffer, try
+ // formatting directly onto its end.
+ size_t NextBufferSize = 127;
+ size_t BufferBytesLeft = OutBufEnd - OutBufCur;
+ if (BufferBytesLeft > 3) {
+ size_t BytesUsed = Fmt.print(OutBufCur, BufferBytesLeft);
+
+ // Common case is that we have plenty of space.
+ if (BytesUsed <= BufferBytesLeft) {
+ OutBufCur += BytesUsed;
+ return *this;
+ }
+
+ // Otherwise, we overflowed and the return value tells us the size to try
+ // again with.
+ NextBufferSize = BytesUsed;
+ }
+
+ // If we got here, we didn't have enough space in the output buffer for the
+ // string. Try printing into a SmallVector that is resized to have enough
+ // space. Iterate until we win.
+ SmallVector<char, 128> V;
+
+ while (1) {
+ V.resize(NextBufferSize);
+
+ // Try formatting into the SmallVector.
+ size_t BytesUsed = Fmt.print(V.data(), NextBufferSize);
+
+ // If BytesUsed fit into the vector, we win.
+ if (BytesUsed <= NextBufferSize)
+ return write(V.data(), BytesUsed);
+
+ // Otherwise, try again with a new size.
+ assert(BytesUsed > NextBufferSize && "Didn't grow buffer!?");
+ NextBufferSize = BytesUsed;
+ }
+}
+
+raw_ostream &raw_ostream::operator<<(const FormattedString &FS) {
+ unsigned Len = FS.Str.size();
+ int PadAmount = FS.Width - Len;
+ if (FS.RightJustify && (PadAmount > 0))
+ this->indent(PadAmount);
+ this->operator<<(FS.Str);
+ if (!FS.RightJustify && (PadAmount > 0))
+ this->indent(PadAmount);
+ return *this;
+}
+
+raw_ostream &raw_ostream::operator<<(const FormattedNumber &FN) {
+ if (FN.Hex) {
+ unsigned Nibbles = (64 - countLeadingZeros(FN.HexValue)+3)/4;
+ unsigned PrefixChars = FN.HexPrefix ? 2 : 0;
+ unsigned Width = std::max(FN.Width, Nibbles + PrefixChars);
+
+ char NumberBuffer[20] = "0x0000000000000000";
+ if (!FN.HexPrefix)
+ NumberBuffer[1] = '0';
+ char *EndPtr = NumberBuffer+Width;
+ char *CurPtr = EndPtr;
+ const char A = FN.Upper ? 'A' : 'a';
+ unsigned long long N = FN.HexValue;
+ while (N) {
+ uintptr_t x = N % 16;
+ *--CurPtr = (x < 10 ? '0' + x : A + x - 10);
+ N /= 16;
+ }
+
+ return write(NumberBuffer, Width);
+ } else {
+ // Zero is a special case.
+ if (FN.DecValue == 0) {
+ this->indent(FN.Width-1);
+ return *this << '0';
+ }
+ char NumberBuffer[32];
+ char *EndPtr = NumberBuffer+sizeof(NumberBuffer);
+ char *CurPtr = EndPtr;
+ bool Neg = (FN.DecValue < 0);
+ uint64_t N = Neg ? -static_cast<uint64_t>(FN.DecValue) : FN.DecValue;
+ while (N) {
+ *--CurPtr = '0' + char(N % 10);
+ N /= 10;
+ }
+ int Len = EndPtr - CurPtr;
+ int Pad = FN.Width - Len;
+ if (Neg)
+ --Pad;
+ if (Pad > 0)
+ this->indent(Pad);
+ if (Neg)
+ *this << '-';
+ return write(CurPtr, Len);
+ }
+}
+
+
+/// indent - Insert 'NumSpaces' spaces.
+raw_ostream &raw_ostream::indent(unsigned NumSpaces) {
+ static const char Spaces[] = " "
+ " "
+ " ";
+
+ // Usually the indentation is small, handle it with a fastpath.
+ if (NumSpaces < array_lengthof(Spaces))
+ return write(Spaces, NumSpaces);
+
+ while (NumSpaces) {
+ unsigned NumToWrite = std::min(NumSpaces,
+ (unsigned)array_lengthof(Spaces)-1);
+ write(Spaces, NumToWrite);
+ NumSpaces -= NumToWrite;
+ }
+ return *this;
+}
+
+
+//===----------------------------------------------------------------------===//
+// Formatted Output
+//===----------------------------------------------------------------------===//
+
+// Out of line virtual method.
+void format_object_base::home() {
+}
+
+//===----------------------------------------------------------------------===//
+// raw_fd_ostream
+//===----------------------------------------------------------------------===//
+
+static int getFD(StringRef Filename, std::error_code &EC,
+ sys::fs::OpenFlags Flags) {
+ // Handle "-" as stdout. Note that when we do this, we consider ourselves
+ // the owner of stdout. This means that we can do things like close the
+ // file descriptor when we're done and set the "binary" flag globally.
+ if (Filename == "-") {
+ EC = std::error_code();
+ return STDOUT_FILENO;
+ }
+
+ int FD;
+ EC = sys::fs::openFileForWrite(Filename, FD, Flags);
+ if (EC)
+ return -1;
+
+ return FD;
+}
+
+raw_fd_ostream::raw_fd_ostream(StringRef Filename, std::error_code &EC,
+ sys::fs::OpenFlags Flags)
+ : raw_fd_ostream(getFD(Filename, EC, Flags), true) {}
+
+/// FD is the file descriptor that this writes to. If ShouldClose is true, this
+/// closes the file when the stream is destroyed.
+raw_fd_ostream::raw_fd_ostream(int fd, bool shouldClose, bool unbuffered)
+ : raw_pwrite_stream(unbuffered), FD(fd), ShouldClose(shouldClose),
+ Error(false) {
+ if (FD < 0 ) {
+ ShouldClose = false;
+ return;
+ }
+
+ // Get the starting position.
+ off_t loc = ::lseek(FD, 0, SEEK_CUR);
+ SupportsSeeking = loc != (off_t)-1;
+ if (!SupportsSeeking)
+ pos = 0;
+ else
+ pos = static_cast<uint64_t>(loc);
+}
+
+static std::error_code SafelyCloseFileDescriptor(int FD);
+
+raw_fd_ostream::~raw_fd_ostream() {
+ if (FD >= 0) {
+ flush();
+ if (ShouldClose && SafelyCloseFileDescriptor(FD))
+ error_detected();
+ }
+
+#ifdef __MINGW32__
+ // On mingw, global dtors should not call exit().
+ // report_fatal_error() invokes exit(). We know report_fatal_error()
+ // might not write messages to stderr when any errors were detected
+ // on FD == 2.
+ if (FD == 2) return;
+#endif
+
+ // If there are any pending errors, report them now. Clients wishing
+ // to avoid report_fatal_error calls should check for errors with
+ // has_error() and clear the error flag with clear_error() before
+ // destructing raw_ostream objects which may have errors.
+ if (has_error())
+ report_fatal_error("IO failure on output stream.", /*GenCrashDiag=*/false);
+}
+
+
+void raw_fd_ostream::write_impl(const char *Ptr, size_t Size) {
+ assert(FD >= 0 && "File already closed.");
+ pos += Size;
+
+ bool ShouldWriteInChunks = false;
+
+ do {
+ size_t ChunkSize = Size;
+ if (ChunkSize > 32767 && ShouldWriteInChunks)
+ ChunkSize = 32767;
+
+ ssize_t ret = ::write(FD, Ptr, ChunkSize);
+
+ if (ret < 0) {
+ // If it's a recoverable error, swallow it and retry the write.
+ //
+ // Ideally we wouldn't ever see EAGAIN or EWOULDBLOCK here, since
+ // raw_ostream isn't designed to do non-blocking I/O. However, some
+ // programs, such as old versions of bjam, have mistakenly used
+ // O_NONBLOCK. For compatibility, emulate blocking semantics by
+ // spinning until the write succeeds. If you don't want spinning,
+ // don't use O_NONBLOCK file descriptors with raw_ostream.
+ if (errno == EINTR || errno == EAGAIN
+#ifdef EWOULDBLOCK
+ || errno == EWOULDBLOCK
+#endif
+ )
+ continue;
+
+ // Otherwise it's a non-recoverable error. Note it and quit.
+ error_detected();
+ break;
+ }
+
+ // The write may have written some or all of the data. Update the
+ // size and buffer pointer to reflect the remainder that needs
+ // to be written. If there are no bytes left, we're done.
+ Ptr += ret;
+ Size -= ret;
+ } while (Size > 0);
+}
+
+void raw_fd_ostream::close() {
+ assert(ShouldClose);
+ ShouldClose = false;
+ flush();
+ if (SafelyCloseFileDescriptor(FD))
+ error_detected();
+ FD = -1;
+}
+
+uint64_t raw_fd_ostream::seek(uint64_t off) {
+ flush();
+ pos = ::lseek(FD, off, SEEK_SET);
+ if (pos == (uint64_t)-1)
+ error_detected();
+ return pos;
+}
+
+void raw_fd_ostream::pwrite_impl(const char *Ptr, size_t Size,
+ uint64_t Offset) {
+ uint64_t Pos = tell();
+ seek(Offset);
+ write(Ptr, Size);
+ seek(Pos);
+}
+
+size_t raw_fd_ostream::preferred_buffer_size() const {
+ // Windows and Minix have no st_blksize.
+ assert(FD >= 0 && "File not yet open!");
+ struct stat statbuf;
+ if (fstat(FD, &statbuf) != 0)
+ return 0;
+
+ // If this is a terminal, don't use buffering. Line buffering
+ // would be a more traditional thing to do, but it's not worth
+ // the complexity.
+ if (S_ISCHR(statbuf.st_mode) && isatty(FD))
+ return 0;
+ // Return the preferred block size.
+ return statbuf.st_blksize;
+}
+
+raw_ostream &raw_fd_ostream::changeColor(enum Colors colors, bool bold,
+ bool bg) {
+ return *this;
+}
+
+raw_ostream &raw_fd_ostream::resetColor() {
+ return *this;
+}
+
+raw_ostream &raw_fd_ostream::reverseColor() {
+ return *this;
+}
+
+bool raw_fd_ostream::is_displayed() const {
+ return isatty(FD);
+}
+
+bool raw_fd_ostream::has_colors() const {
+ return false;
+}
+
+//===----------------------------------------------------------------------===//
+// outs(), errs(), nulls()
+//===----------------------------------------------------------------------===//
+
+/// outs() - This returns a reference to a raw_ostream for standard output.
+/// Use it like: outs() << "foo" << "bar";
+raw_ostream &llvm::outs() {
+ // Set buffer settings to model stdout behavior.
+ // Delete the file descriptor when the program exits, forcing error
+ // detection. If you don't want this behavior, don't use outs().
+ std::error_code EC;
+ static raw_fd_ostream S("-", EC, sys::fs::F_None);
+ assert(!EC);
+ return S;
+}
+
+/// errs() - This returns a reference to a raw_ostream for standard error.
+/// Use it like: errs() << "foo" << "bar";
+raw_ostream &llvm::errs() {
+ // Set standard error to be unbuffered by default.
+ static raw_fd_ostream S(STDERR_FILENO, false, true);
+ return S;
+}
+
+/// nulls() - This returns a reference to a raw_ostream which discards output.
+raw_ostream &llvm::nulls() {
+ static raw_null_ostream S;
+ return S;
+}
+
+
+//===----------------------------------------------------------------------===//
+// raw_string_ostream
+//===----------------------------------------------------------------------===//
+
+raw_string_ostream::~raw_string_ostream() {
+ flush();
+}
+
+void raw_string_ostream::write_impl(const char *Ptr, size_t Size) {
+ OS.append(Ptr, Size);
+}
+
+//===----------------------------------------------------------------------===//
+// raw_svector_ostream
+//===----------------------------------------------------------------------===//
+
+uint64_t raw_svector_ostream::current_pos() const { return OS.size(); }
+
+void raw_svector_ostream::write_impl(const char *Ptr, size_t Size) {
+ OS.append(Ptr, Ptr + Size);
+}
+
+void raw_svector_ostream::pwrite_impl(const char *Ptr, size_t Size,
+ uint64_t Offset) {
+ memcpy(OS.data() + Offset, Ptr, Size);
+}
+
+//===----------------------------------------------------------------------===//
+// raw_null_ostream
+//===----------------------------------------------------------------------===//
+
+raw_null_ostream::~raw_null_ostream() {
+#ifndef NDEBUG
+ // ~raw_ostream asserts that the buffer is empty. This isn't necessary
+ // with raw_null_ostream, but it's better to have raw_null_ostream follow
+ // the rules than to change the rules just for raw_null_ostream.
+ flush();
+#endif
+}
+
+void raw_null_ostream::write_impl(const char *Ptr, size_t Size) {
+}
+
+uint64_t raw_null_ostream::current_pos() const {
+ return 0;
+}
+
+void raw_null_ostream::pwrite_impl(const char *Ptr, size_t Size,
+ uint64_t Offset) {}
+
+
+static std::error_code SafelyCloseFileDescriptor(int FD) {
+ // Create a signal set filled with *all* signals.
+ sigset_t FullSet;
+ if (sigfillset(&FullSet) < 0)
+ return std::error_code(errno, std::generic_category());
+ // Atomically swap our current signal mask with a full mask.
+ sigset_t SavedSet;
+#if LLVM_ENABLE_THREADS
+ if (int EC = pthread_sigmask(SIG_SETMASK, &FullSet, &SavedSet))
+ return std::error_code(EC, std::generic_category());
+#else
+ if (sigprocmask(SIG_SETMASK, &FullSet, &SavedSet) < 0)
+ return std::error_code(errno, std::generic_category());
+#endif
+ // Attempt to close the file descriptor.
+ // We need to save the error, if one occurs, because our subsequent call to
+ // pthread_sigmask might tamper with errno.
+ int ErrnoFromClose = 0;
+ if (::close(FD) < 0)
+ ErrnoFromClose = errno;
+ // Restore the signal mask back to what we saved earlier.
+ int EC = 0;
+#if LLVM_ENABLE_THREADS
+ EC = pthread_sigmask(SIG_SETMASK, &SavedSet, nullptr);
+#else
+ if (sigprocmask(SIG_SETMASK, &SavedSet, nullptr) < 0)
+ EC = errno;
+#endif
+ // The error code from close takes precedence over the one from
+ // pthread_sigmask.
+ if (ErrnoFromClose)
+ return std::error_code(ErrnoFromClose, std::generic_category());
+ return std::error_code(EC, std::generic_category());
+}
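A brief usage sketch for the streams implemented above; the file name, open flag, and logged values are illustrative only:

    #include "llvm/Support/FileSystem.h"
    #include "llvm/Support/Format.h"
    #include "llvm/Support/raw_ostream.h"
    #include <string>
    #include <system_error>

    int main() {
      std::error_code EC;
      // Buffered file stream; passing "-" instead would write to stdout (see getFD()).
      llvm::raw_fd_ostream Log("run.log", EC, llvm::sys::fs::F_None);
      if (EC) {
        llvm::errs() << "cannot open log: " << EC.message() << "\n";
        return 1;
      }
      Log << "coverage: " << llvm::format("%6.2f", 47.25) << "\n";
      Log.indent(4) << "k-mer size: " << 55 << "\n";

      // In-memory stream backed by a std::string.
      std::string Buf;
      llvm::raw_string_ostream OS(Buf);
      OS.write_escaped("tab\there");
      llvm::outs() << OS.str() << "\n"; // prints: tab\there
      return 0;
    }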
diff --git a/ext/src/llvm/regcclass.h b/ext/src/llvm/regcclass.h
new file mode 100644
index 0000000..7fd6604
--- /dev/null
+++ b/ext/src/llvm/regcclass.h
@@ -0,0 +1,75 @@
+/*-
+ * This code is derived from OpenBSD's libc/regex, original license follows:
+ *
+ * Copyright (c) 1992, 1993, 1994 Henry Spencer.
+ * Copyright (c) 1992, 1993, 1994
+ * The Regents of the University of California. All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Henry Spencer.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)cclass.h 8.3 (Berkeley) 3/20/94
+ */
+
+#ifndef LLVM_SUPPORT_REGCCLASS_H
+#define LLVM_SUPPORT_REGCCLASS_H
+
+/* character-class table */
+static struct cclass {
+ const char *name;
+ const char *chars;
+ const char *multis;
+} cclasses[] = {
+ { "alnum", "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz\
+0123456789", ""} ,
+ { "alpha", "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz",
+ ""} ,
+ { "blank", " \t", ""} ,
+ { "cntrl", "\007\b\t\n\v\f\r\1\2\3\4\5\6\16\17\20\21\22\23\24\
+\25\26\27\30\31\32\33\34\35\36\37\177", ""} ,
+ { "digit", "0123456789", ""} ,
+ { "graph", "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz\
+0123456789!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~",
+ ""} ,
+ { "lower", "abcdefghijklmnopqrstuvwxyz",
+ ""} ,
+ { "print", "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz\
+0123456789!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~ ",
+ ""} ,
+ { "punct", "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~",
+ ""} ,
+ { "space", "\t\n\v\f\r ", ""} ,
+ { "upper", "ABCDEFGHIJKLMNOPQRSTUVWXYZ",
+ ""} ,
+ { "xdigit", "0123456789ABCDEFabcdef",
+ ""} ,
+ { NULL, 0, "" }
+};
+
+#endif
diff --git a/ext/src/llvm/regcname.h b/ext/src/llvm/regcname.h
new file mode 100644
index 0000000..891d255
--- /dev/null
+++ b/ext/src/llvm/regcname.h
@@ -0,0 +1,144 @@
+/*-
+ * This code is derived from OpenBSD's libc/regex, original license follows:
+ *
+ * Copyright (c) 1992, 1993, 1994 Henry Spencer.
+ * Copyright (c) 1992, 1993, 1994
+ * The Regents of the University of California. All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Henry Spencer.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)cname.h 8.3 (Berkeley) 3/20/94
+ */
+
+#ifndef LLVM_SUPPORT_REGCNAME_H
+#define LLVM_SUPPORT_REGCNAME_H
+
+/* character-name table */
+static struct cname {
+ const char *name;
+ char code;
+} cnames[] = {
+ { "NUL", '\0' },
+ { "SOH", '\001' },
+ { "STX", '\002' },
+ { "ETX", '\003' },
+ { "EOT", '\004' },
+ { "ENQ", '\005' },
+ { "ACK", '\006' },
+ { "BEL", '\007' },
+ { "alert", '\007' },
+ { "BS", '\010' },
+ { "backspace", '\b' },
+ { "HT", '\011' },
+ { "tab", '\t' },
+ { "LF", '\012' },
+ { "newline", '\n' },
+ { "VT", '\013' },
+ { "vertical-tab", '\v' },
+ { "FF", '\014' },
+ { "form-feed", '\f' },
+ { "CR", '\015' },
+ { "carriage-return", '\r' },
+ { "SO", '\016' },
+ { "SI", '\017' },
+ { "DLE", '\020' },
+ { "DC1", '\021' },
+ { "DC2", '\022' },
+ { "DC3", '\023' },
+ { "DC4", '\024' },
+ { "NAK", '\025' },
+ { "SYN", '\026' },
+ { "ETB", '\027' },
+ { "CAN", '\030' },
+ { "EM", '\031' },
+ { "SUB", '\032' },
+ { "ESC", '\033' },
+ { "IS4", '\034' },
+ { "FS", '\034' },
+ { "IS3", '\035' },
+ { "GS", '\035' },
+ { "IS2", '\036' },
+ { "RS", '\036' },
+ { "IS1", '\037' },
+ { "US", '\037' },
+ { "space", ' ' },
+ { "exclamation-mark", '!' },
+ { "quotation-mark", '"' },
+ { "number-sign", '#' },
+ { "dollar-sign", '$' },
+ { "percent-sign", '%' },
+ { "ampersand", '&' },
+ { "apostrophe", '\'' },
+ { "left-parenthesis", '(' },
+ { "right-parenthesis", ')' },
+ { "asterisk", '*' },
+ { "plus-sign", '+' },
+ { "comma", ',' },
+ { "hyphen", '-' },
+ { "hyphen-minus", '-' },
+ { "period", '.' },
+ { "full-stop", '.' },
+ { "slash", '/' },
+ { "solidus", '/' },
+ { "zero", '0' },
+ { "one", '1' },
+ { "two", '2' },
+ { "three", '3' },
+ { "four", '4' },
+ { "five", '5' },
+ { "six", '6' },
+ { "seven", '7' },
+ { "eight", '8' },
+ { "nine", '9' },
+ { "colon", ':' },
+ { "semicolon", ';' },
+ { "less-than-sign", '<' },
+ { "equals-sign", '=' },
+ { "greater-than-sign", '>' },
+ { "question-mark", '?' },
+ { "commercial-at", '@' },
+ { "left-square-bracket", '[' },
+ { "backslash", '\\' },
+ { "reverse-solidus", '\\' },
+ { "right-square-bracket", ']' },
+ { "circumflex", '^' },
+ { "circumflex-accent", '^' },
+ { "underscore", '_' },
+ { "low-line", '_' },
+ { "grave-accent", '`' },
+ { "left-brace", '{' },
+ { "left-curly-bracket", '{' },
+ { "vertical-line", '|' },
+ { "right-brace", '}' },
+ { "right-curly-bracket", '}' },
+ { "tilde", '~' },
+ { "DEL", '\177' },
+ { NULL, 0 }
+};
+
+#endif
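The cclasses and cnames tables above back the POSIX character classes and collating-element names accepted by the regex engine compiled in the hunks that follow. A hedged usage sketch through the llvm::Regex wrapper, assuming llvm/Support/Regex.h is part of this import (pattern and test string are made up):

    #include "llvm/ADT/SmallVector.h"
    #include "llvm/Support/Regex.h"
    #include "llvm/Support/raw_ostream.h"

    int main() {
      // "[[:alpha:]]" and "[[:alnum:]]" resolve against the cclasses table above.
      llvm::Regex Re("^[[:alpha:]][[:alnum:]_]*$");
      llvm::SmallVector<llvm::StringRef, 1> Matches;
      if (Re.match("contig_17", &Matches))
        llvm::outs() << "matched: " << Matches[0] << "\n";
      return 0;
    }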
diff --git a/ext/src/llvm/regcomp.c b/ext/src/llvm/regcomp.c
new file mode 100644
index 0000000..efdcdfb
--- /dev/null
+++ b/ext/src/llvm/regcomp.c
@@ -0,0 +1,1568 @@
+/*-
+ * This code is derived from OpenBSD's libc/regex, original license follows:
+ *
+ * Copyright (c) 1992, 1993, 1994 Henry Spencer.
+ * Copyright (c) 1992, 1993, 1994
+ * The Regents of the University of California. All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Henry Spencer.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)regcomp.c 8.5 (Berkeley) 3/20/94
+ */
+
+#include <sys/types.h>
+#include <stdio.h>
+#include <string.h>
+#include <ctype.h>
+#include <limits.h>
+#include <stdlib.h>
+#include "regex_impl.h"
+
+#include "regutils.h"
+#include "regex2.h"
+
+#include "regcclass.h"
+#include "regcname.h"
+
+#include <stdint.h>
+
+/*
+ * parse structure, passed up and down to avoid global variables and
+ * other clumsinesses
+ */
+struct parse {
+ char *next; /* next character in RE */
+ char *end; /* end of string (-> NUL normally) */
+ int error; /* has an error been seen? */
+ sop *strip; /* malloced strip */
+ sopno ssize; /* malloced strip size (allocated) */
+ sopno slen; /* malloced strip length (used) */
+ int ncsalloc; /* number of csets allocated */
+ struct re_guts *g;
+# define NPAREN 10 /* we need to remember () 1-9 for back refs */
+ sopno pbegin[NPAREN]; /* -> ( ([0] unused) */
+ sopno pend[NPAREN]; /* -> ) ([0] unused) */
+};
+
+static void p_ere(struct parse *, int);
+static void p_ere_exp(struct parse *);
+static void p_str(struct parse *);
+static void p_bre(struct parse *, int, int);
+static int p_simp_re(struct parse *, int);
+static int p_count(struct parse *);
+static void p_bracket(struct parse *);
+static void p_b_term(struct parse *, cset *);
+static void p_b_cclass(struct parse *, cset *);
+static void p_b_eclass(struct parse *, cset *);
+static char p_b_symbol(struct parse *);
+static char p_b_coll_elem(struct parse *, int);
+static char othercase(int);
+static void bothcases(struct parse *, int);
+static void ordinary(struct parse *, int);
+static void nonnewline(struct parse *);
+static void repeat(struct parse *, sopno, int, int);
+static int seterr(struct parse *, int);
+static cset *allocset(struct parse *);
+static void freeset(struct parse *, cset *);
+static int freezeset(struct parse *, cset *);
+static int firstch(struct parse *, cset *);
+static int nch(struct parse *, cset *);
+static void mcadd(struct parse *, cset *, const char *);
+static void mcinvert(struct parse *, cset *);
+static void mccase(struct parse *, cset *);
+static int isinsets(struct re_guts *, int);
+static int samesets(struct re_guts *, int, int);
+static void categorize(struct parse *, struct re_guts *);
+static sopno dupl(struct parse *, sopno, sopno);
+static void doemit(struct parse *, sop, size_t);
+static void doinsert(struct parse *, sop, size_t, sopno);
+static void dofwd(struct parse *, sopno, sop);
+static void enlarge(struct parse *, sopno);
+static void stripsnug(struct parse *, struct re_guts *);
+static void findmust(struct parse *, struct re_guts *);
+static sopno pluscount(struct parse *, struct re_guts *);
+
+static char nuls[10]; /* place to point scanner in event of error */
+
+/*
+ * macros for use with parse structure
+ * BEWARE: these know that the parse structure is named `p' !!!
+ */
+#define PEEK() (*p->next)
+#define PEEK2() (*(p->next+1))
+#define MORE() (p->next < p->end)
+#define MORE2() (p->next+1 < p->end)
+#define SEE(c) (MORE() && PEEK() == (c))
+#define SEETWO(a, b) (MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b))
+#define EAT(c) ((SEE(c)) ? (NEXT(), 1) : 0)
+#define EATTWO(a, b) ((SEETWO(a, b)) ? (NEXT2(), 1) : 0)
+#define NEXT() (p->next++)
+#define NEXT2() (p->next += 2)
+#define NEXTn(n) (p->next += (n))
+#define GETNEXT() (*p->next++)
+#define SETERROR(e) seterr(p, (e))
+#define REQUIRE(co, e) (void)((co) || SETERROR(e))
+#define MUSTSEE(c, e) (REQUIRE(MORE() && PEEK() == (c), e))
+#define MUSTEAT(c, e) (REQUIRE(MORE() && GETNEXT() == (c), e))
+#define MUSTNOTSEE(c, e) (REQUIRE(!MORE() || PEEK() != (c), e))
+#define EMIT(op, sopnd) doemit(p, (sop)(op), (size_t)(sopnd))
+#define INSERT(op, pos) doinsert(p, (sop)(op), HERE()-(pos)+1, pos)
+#define AHEAD(pos) dofwd(p, pos, HERE()-(pos))
+#define ASTERN(sop, pos) EMIT(sop, HERE()-pos)
+#define HERE() (p->slen)
+#define THERE() (p->slen - 1)
+#define THERETHERE() (p->slen - 2)
+#define DROP(n) (p->slen -= (n))
+
+#ifdef _POSIX2_RE_DUP_MAX
+#define DUPMAX _POSIX2_RE_DUP_MAX
+#else
+#define DUPMAX 255
+#endif
+#define INFINITY (DUPMAX + 1)
+
+#ifndef NDEBUG
+static int never = 0; /* for use in asserts; shuts lint up */
+#else
+#define never 0 /* some <assert.h>s have bugs too */
+#endif
+
+/*
+ - llvm_regcomp - interface for parser and compilation
+ */
+int /* 0 success, otherwise REG_something */
+llvm_regcomp(llvm_regex_t *preg, const char *pattern, int cflags)
+{
+ struct parse pa;
+ struct re_guts *g;
+ struct parse *p = &pa;
+ int i;
+ size_t len;
+#ifdef REDEBUG
+# define GOODFLAGS(f) (f)
+#else
+# define GOODFLAGS(f) ((f)&~REG_DUMP)
+#endif
+
+ cflags = GOODFLAGS(cflags);
+ if ((cflags&REG_EXTENDED) && (cflags&REG_NOSPEC))
+ return(REG_INVARG);
+
+ if (cflags&REG_PEND) {
+ if (preg->re_endp < pattern)
+ return(REG_INVARG);
+ len = preg->re_endp - pattern;
+ } else
+ len = strlen((const char *)pattern);
+
+ /* do the mallocs early so failure handling is easy */
+ g = (struct re_guts *)malloc(sizeof(struct re_guts) +
+ (NC-1)*sizeof(cat_t));
+ if (g == NULL)
+ return(REG_ESPACE);
+ p->ssize = len/(size_t)2*(size_t)3 + (size_t)1; /* ugh */
+ p->strip = (sop *)calloc(p->ssize, sizeof(sop));
+ p->slen = 0;
+ if (p->strip == NULL) {
+ free((char *)g);
+ return(REG_ESPACE);
+ }
+
+ /* set things up */
+ p->g = g;
+ p->next = (char *)pattern; /* convenience; we do not modify it */
+ p->end = p->next + len;
+ p->error = 0;
+ p->ncsalloc = 0;
+ for (i = 0; i < NPAREN; i++) {
+ p->pbegin[i] = 0;
+ p->pend[i] = 0;
+ }
+ g->csetsize = NC;
+ g->sets = NULL;
+ g->setbits = NULL;
+ g->ncsets = 0;
+ g->cflags = cflags;
+ g->iflags = 0;
+ g->nbol = 0;
+ g->neol = 0;
+ g->must = NULL;
+ g->mlen = 0;
+ g->nsub = 0;
+ g->ncategories = 1; /* category 0 is "everything else" */
+ g->categories = &g->catspace[-(CHAR_MIN)];
+ (void) memset((char *)g->catspace, 0, NC*sizeof(cat_t));
+ g->backrefs = 0;
+
+ /* do it */
+ EMIT(OEND, 0);
+ g->firststate = THERE();
+ if (cflags&REG_EXTENDED)
+ p_ere(p, OUT);
+ else if (cflags&REG_NOSPEC)
+ p_str(p);
+ else
+ p_bre(p, OUT, OUT);
+ EMIT(OEND, 0);
+ g->laststate = THERE();
+
+ /* tidy up loose ends and fill things in */
+ categorize(p, g);
+ stripsnug(p, g);
+ findmust(p, g);
+ g->nplus = pluscount(p, g);
+ g->magic = MAGIC2;
+ preg->re_nsub = g->nsub;
+ preg->re_g = g;
+ preg->re_magic = MAGIC1;
+#ifndef REDEBUG
+ /* not debugging, so can't rely on the assert() in llvm_regexec() */
+ if (g->iflags&REGEX_BAD)
+ SETERROR(REG_ASSERT);
+#endif
+
+ /* win or lose, we're done */
+ if (p->error != 0) /* lose */
+ llvm_regfree(preg);
+ return(p->error);
+}
+
+/*
+ - p_ere - ERE parser top level, concatenation and alternation
+ */
+static void
+p_ere(struct parse *p, int stop) /* character this ERE should end at */
+{
+ char c;
+ sopno prevback = 0;
+ sopno prevfwd = 0;
+ sopno conc;
+ int first = 1; /* is this the first alternative? */
+
+ for (;;) {
+ /* do a bunch of concatenated expressions */
+ conc = HERE();
+ while (MORE() && (c = PEEK()) != '|' && c != stop)
+ p_ere_exp(p);
+ REQUIRE(HERE() != conc, REG_EMPTY); /* require nonempty */
+
+ if (!EAT('|'))
+ break; /* NOTE BREAK OUT */
+
+ if (first) {
+ INSERT(OCH_, conc); /* offset is wrong */
+ prevfwd = conc;
+ prevback = conc;
+ first = 0;
+ }
+ ASTERN(OOR1, prevback);
+ prevback = THERE();
+ AHEAD(prevfwd); /* fix previous offset */
+ prevfwd = HERE();
+ EMIT(OOR2, 0); /* offset is very wrong */
+ }
+
+ if (!first) { /* tail-end fixups */
+ AHEAD(prevfwd);
+ ASTERN(O_CH, prevback);
+ }
+
+ assert(!MORE() || SEE(stop));
+}
+
+/*
+ - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op
+ */
+static void
+p_ere_exp(struct parse *p)
+{
+ char c;
+ sopno pos;
+ int count;
+ int count2;
+ int backrefnum;
+ sopno subno;
+ int wascaret = 0;
+
+ assert(MORE()); /* caller should have ensured this */
+ c = GETNEXT();
+
+ pos = HERE();
+ switch (c) {
+ case '(':
+ REQUIRE(MORE(), REG_EPAREN);
+ p->g->nsub++;
+ subno = p->g->nsub;
+ if (subno < NPAREN)
+ p->pbegin[subno] = HERE();
+ EMIT(OLPAREN, subno);
+ if (!SEE(')'))
+ p_ere(p, ')');
+ if (subno < NPAREN) {
+ p->pend[subno] = HERE();
+ assert(p->pend[subno] != 0);
+ }
+ EMIT(ORPAREN, subno);
+ MUSTEAT(')', REG_EPAREN);
+ break;
+#ifndef POSIX_MISTAKE
+ case ')': /* happens only if no current unmatched ( */
+ /*
+ * You may ask, why the ifndef? Because I didn't notice
+ * this until slightly too late for 1003.2, and none of the
+ * other 1003.2 regular-expression reviewers noticed it at
+ * all. So an unmatched ) is legal POSIX, at least until
+ * we can get it fixed.
+ */
+ SETERROR(REG_EPAREN);
+ break;
+#endif
+ case '^':
+ EMIT(OBOL, 0);
+ p->g->iflags |= USEBOL;
+ p->g->nbol++;
+ wascaret = 1;
+ break;
+ case '$':
+ EMIT(OEOL, 0);
+ p->g->iflags |= USEEOL;
+ p->g->neol++;
+ break;
+ case '|':
+ SETERROR(REG_EMPTY);
+ break;
+ case '*':
+ case '+':
+ case '?':
+ SETERROR(REG_BADRPT);
+ break;
+ case '.':
+ if (p->g->cflags&REG_NEWLINE)
+ nonnewline(p);
+ else
+ EMIT(OANY, 0);
+ break;
+ case '[':
+ p_bracket(p);
+ break;
+ case '\\':
+ REQUIRE(MORE(), REG_EESCAPE);
+ c = GETNEXT();
+ if (c >= '1' && c <= '9') {
+ /* \[0-9] is taken to be a back-reference to a previously specified
+ * matching group. backrefnum will hold the number. The matching
+ * group must exist (i.e. if \4 is found there must have been at
+ * least 4 matching groups specified in the pattern previously).
+ */
+ backrefnum = c - '0';
+ if (p->pend[backrefnum] == 0) {
+ SETERROR(REG_ESUBREG);
+ break;
+ }
+
+ /* Make sure everything checks out and emit the sequence
+ * that marks a back-reference to the parse structure.
+ */
+ assert(backrefnum <= p->g->nsub);
+ EMIT(OBACK_, backrefnum);
+ assert(p->pbegin[backrefnum] != 0);
+ assert(OP(p->strip[p->pbegin[backrefnum]]) != OLPAREN);
+ assert(OP(p->strip[p->pend[backrefnum]]) != ORPAREN);
+ (void) dupl(p, p->pbegin[backrefnum]+1, p->pend[backrefnum]);
+ EMIT(O_BACK, backrefnum);
+ p->g->backrefs = 1;
+ } else {
+ /* Other chars are simply themselves when escaped with a backslash.
+ */
+ ordinary(p, c);
+ }
+ break;
+ case '{': /* okay as ordinary except if digit follows */
+ REQUIRE(!MORE() || !isdigit((uch)PEEK()), REG_BADRPT);
+ /* FALLTHROUGH */
+ default:
+ ordinary(p, c);
+ break;
+ }
+
+ if (!MORE())
+ return;
+ c = PEEK();
+ /* we call { a repetition if followed by a digit */
+ if (!( c == '*' || c == '+' || c == '?' ||
+ (c == '{' && MORE2() && isdigit((uch)PEEK2())) ))
+ return; /* no repetition, we're done */
+ NEXT();
+
+ REQUIRE(!wascaret, REG_BADRPT);
+ switch (c) {
+ case '*': /* implemented as +? */
+ /* this case does not require the (y|) trick, noKLUDGE */
+ INSERT(OPLUS_, pos);
+ ASTERN(O_PLUS, pos);
+ INSERT(OQUEST_, pos);
+ ASTERN(O_QUEST, pos);
+ break;
+ case '+':
+ INSERT(OPLUS_, pos);
+ ASTERN(O_PLUS, pos);
+ break;
+ case '?':
+ /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
+ INSERT(OCH_, pos); /* offset slightly wrong */
+ ASTERN(OOR1, pos); /* this one's right */
+ AHEAD(pos); /* fix the OCH_ */
+ EMIT(OOR2, 0); /* offset very wrong... */
+ AHEAD(THERE()); /* ...so fix it */
+ ASTERN(O_CH, THERETHERE());
+ break;
+ case '{':
+ count = p_count(p);
+ if (EAT(',')) {
+ if (isdigit((uch)PEEK())) {
+ count2 = p_count(p);
+ REQUIRE(count <= count2, REG_BADBR);
+ } else /* single number with comma */
+ count2 = INFINITY;
+ } else /* just a single number */
+ count2 = count;
+ repeat(p, pos, count, count2);
+ if (!EAT('}')) { /* error heuristics */
+ while (MORE() && PEEK() != '}')
+ NEXT();
+ REQUIRE(MORE(), REG_EBRACE);
+ SETERROR(REG_BADBR);
+ }
+ break;
+ }
+
+ if (!MORE())
+ return;
+ c = PEEK();
+ if (!( c == '*' || c == '+' || c == '?' ||
+ (c == '{' && MORE2() && isdigit((uch)PEEK2())) ) )
+ return;
+ SETERROR(REG_BADRPT);
+}
+
+/*
+ - p_str - string (no metacharacters) "parser"
+ */
+static void
+p_str(struct parse *p)
+{
+ REQUIRE(MORE(), REG_EMPTY);
+ while (MORE())
+ ordinary(p, GETNEXT());
+}
+
+/*
+ - p_bre - BRE parser top level, anchoring and concatenation
+ * Giving end1 as OUT essentially eliminates the end1/end2 check.
+ *
+ * This implementation is a bit of a kludge, in that a trailing $ is first
+ * taken as an ordinary character and then revised to be an anchor. The
+ * only undesirable side effect is that '$' gets included as a character
+ * category in such cases. This is fairly harmless; not worth fixing.
+ * The amount of lookahead needed to avoid this kludge is excessive.
+ */
+static void
+p_bre(struct parse *p,
+ int end1, /* first terminating character */
+ int end2) /* second terminating character */
+{
+ sopno start = HERE();
+ int first = 1; /* first subexpression? */
+ int wasdollar = 0;
+
+ if (EAT('^')) {
+ EMIT(OBOL, 0);
+ p->g->iflags |= USEBOL;
+ p->g->nbol++;
+ }
+ while (MORE() && !SEETWO(end1, end2)) {
+ wasdollar = p_simp_re(p, first);
+ first = 0;
+ }
+ if (wasdollar) { /* oops, that was a trailing anchor */
+ DROP(1);
+ EMIT(OEOL, 0);
+ p->g->iflags |= USEEOL;
+ p->g->neol++;
+ }
+
+ REQUIRE(HERE() != start, REG_EMPTY); /* require nonempty */
+}
+
+/*
+ - p_simp_re - parse a simple RE, an atom possibly followed by a repetition
+ */
+static int /* was the simple RE an unbackslashed $? */
+p_simp_re(struct parse *p,
+ int starordinary) /* is a leading * an ordinary character? */
+{
+ int c;
+ int count;
+ int count2;
+ sopno pos;
+ int i;
+ sopno subno;
+# define BACKSL (1<<CHAR_BIT)
+
+ pos = HERE(); /* repetition op, if any, covers from here */
+
+ assert(MORE()); /* caller should have ensured this */
+ c = GETNEXT();
+ if (c == '\\') {
+ REQUIRE(MORE(), REG_EESCAPE);
+ c = BACKSL | GETNEXT();
+ }
+ switch (c) {
+ case '.':
+ if (p->g->cflags&REG_NEWLINE)
+ nonnewline(p);
+ else
+ EMIT(OANY, 0);
+ break;
+ case '[':
+ p_bracket(p);
+ break;
+ case BACKSL|'{':
+ SETERROR(REG_BADRPT);
+ break;
+ case BACKSL|'(':
+ p->g->nsub++;
+ subno = p->g->nsub;
+ if (subno < NPAREN)
+ p->pbegin[subno] = HERE();
+ EMIT(OLPAREN, subno);
+ /* the MORE here is an error heuristic */
+ if (MORE() && !SEETWO('\\', ')'))
+ p_bre(p, '\\', ')');
+ if (subno < NPAREN) {
+ p->pend[subno] = HERE();
+ assert(p->pend[subno] != 0);
+ }
+ EMIT(ORPAREN, subno);
+ REQUIRE(EATTWO('\\', ')'), REG_EPAREN);
+ break;
+ case BACKSL|')': /* should not get here -- must be user */
+ case BACKSL|'}':
+ SETERROR(REG_EPAREN);
+ break;
+ case BACKSL|'1':
+ case BACKSL|'2':
+ case BACKSL|'3':
+ case BACKSL|'4':
+ case BACKSL|'5':
+ case BACKSL|'6':
+ case BACKSL|'7':
+ case BACKSL|'8':
+ case BACKSL|'9':
+ i = (c&~BACKSL) - '0';
+ assert(i < NPAREN);
+ if (p->pend[i] != 0) {
+ assert(i <= p->g->nsub);
+ EMIT(OBACK_, i);
+ assert(p->pbegin[i] != 0);
+ assert(OP(p->strip[p->pbegin[i]]) == OLPAREN);
+ assert(OP(p->strip[p->pend[i]]) == ORPAREN);
+ (void) dupl(p, p->pbegin[i]+1, p->pend[i]);
+ EMIT(O_BACK, i);
+ } else
+ SETERROR(REG_ESUBREG);
+ p->g->backrefs = 1;
+ break;
+ case '*':
+ REQUIRE(starordinary, REG_BADRPT);
+ /* FALLTHROUGH */
+ default:
+ ordinary(p, (char)c);
+ break;
+ }
+
+ if (EAT('*')) { /* implemented as +? */
+ /* this case does not require the (y|) trick, noKLUDGE */
+ INSERT(OPLUS_, pos);
+ ASTERN(O_PLUS, pos);
+ INSERT(OQUEST_, pos);
+ ASTERN(O_QUEST, pos);
+ } else if (EATTWO('\\', '{')) {
+ count = p_count(p);
+ if (EAT(',')) {
+ if (MORE() && isdigit((uch)PEEK())) {
+ count2 = p_count(p);
+ REQUIRE(count <= count2, REG_BADBR);
+ } else /* single number with comma */
+ count2 = INFINITY;
+ } else /* just a single number */
+ count2 = count;
+ repeat(p, pos, count, count2);
+ if (!EATTWO('\\', '}')) { /* error heuristics */
+ while (MORE() && !SEETWO('\\', '}'))
+ NEXT();
+ REQUIRE(MORE(), REG_EBRACE);
+ SETERROR(REG_BADBR);
+ }
+ } else if (c == '$') /* $ (but not \$) ends it */
+ return(1);
+
+ return(0);
+}
+
+/*
+ - p_count - parse a repetition count
+ */
+static int /* the value */
+p_count(struct parse *p)
+{
+ int count = 0;
+ int ndigits = 0;
+
+ while (MORE() && isdigit((uch)PEEK()) && count <= DUPMAX) {
+ count = count*10 + (GETNEXT() - '0');
+ ndigits++;
+ }
+
+ REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR);
+ return(count);
+}
+
+/*
+ - p_bracket - parse a bracketed character list
+ *
+ * Note a significant property of this code: if the allocset() did SETERROR,
+ * no set operations are done.
+ */
+static void
+p_bracket(struct parse *p)
+{
+ cset *cs;
+ int invert = 0;
+
+ /* Dept of Truly Sickening Special-Case Kludges */
+ if (p->next + 5 < p->end && strncmp(p->next, "[:<:]]", 6) == 0) {
+ EMIT(OBOW, 0);
+ NEXTn(6);
+ return;
+ }
+ if (p->next + 5 < p->end && strncmp(p->next, "[:>:]]", 6) == 0) {
+ EMIT(OEOW, 0);
+ NEXTn(6);
+ return;
+ }
+
+ if ((cs = allocset(p)) == NULL) {
+ /* allocset did set error status in p */
+ return;
+ }
+
+ if (EAT('^'))
+ invert++; /* make note to invert set at end */
+ if (EAT(']'))
+ CHadd(cs, ']');
+ else if (EAT('-'))
+ CHadd(cs, '-');
+ while (MORE() && PEEK() != ']' && !SEETWO('-', ']'))
+ p_b_term(p, cs);
+ if (EAT('-'))
+ CHadd(cs, '-');
+ MUSTEAT(']', REG_EBRACK);
+
+ if (p->error != 0) { /* don't mess things up further */
+ freeset(p, cs);
+ return;
+ }
+
+ if (p->g->cflags&REG_ICASE) {
+ int i;
+ int ci;
+
+ for (i = p->g->csetsize - 1; i >= 0; i--)
+ if (CHIN(cs, i) && isalpha(i)) {
+ ci = othercase(i);
+ if (ci != i)
+ CHadd(cs, ci);
+ }
+ if (cs->multis != NULL)
+ mccase(p, cs);
+ }
+ if (invert) {
+ int i;
+
+ for (i = p->g->csetsize - 1; i >= 0; i--)
+ if (CHIN(cs, i))
+ CHsub(cs, i);
+ else
+ CHadd(cs, i);
+ if (p->g->cflags&REG_NEWLINE)
+ CHsub(cs, '\n');
+ if (cs->multis != NULL)
+ mcinvert(p, cs);
+ }
+
+ assert(cs->multis == NULL); /* xxx */
+
+ if (nch(p, cs) == 1) { /* optimize singleton sets */
+ ordinary(p, firstch(p, cs));
+ freeset(p, cs);
+ } else
+ EMIT(OANYOF, freezeset(p, cs));
+}
+
+/*
+ - p_b_term - parse one term of a bracketed character list
+ */
+static void
+p_b_term(struct parse *p, cset *cs)
+{
+ char c;
+ char start, finish;
+ int i;
+
+ /* classify what we've got */
+ switch ((MORE()) ? PEEK() : '\0') {
+ case '[':
+ c = (MORE2()) ? PEEK2() : '\0';
+ break;
+ case '-':
+ SETERROR(REG_ERANGE);
+ return; /* NOTE RETURN */
+ break;
+ default:
+ c = '\0';
+ break;
+ }
+
+ switch (c) {
+ case ':': /* character class */
+ NEXT2();
+ REQUIRE(MORE(), REG_EBRACK);
+ c = PEEK();
+ REQUIRE(c != '-' && c != ']', REG_ECTYPE);
+ p_b_cclass(p, cs);
+ REQUIRE(MORE(), REG_EBRACK);
+ REQUIRE(EATTWO(':', ']'), REG_ECTYPE);
+ break;
+ case '=': /* equivalence class */
+ NEXT2();
+ REQUIRE(MORE(), REG_EBRACK);
+ c = PEEK();
+ REQUIRE(c != '-' && c != ']', REG_ECOLLATE);
+ p_b_eclass(p, cs);
+ REQUIRE(MORE(), REG_EBRACK);
+ REQUIRE(EATTWO('=', ']'), REG_ECOLLATE);
+ break;
+ default: /* symbol, ordinary character, or range */
+/* xxx revision needed for multichar stuff */
+ start = p_b_symbol(p);
+ if (SEE('-') && MORE2() && PEEK2() != ']') {
+ /* range */
+ NEXT();
+ if (EAT('-'))
+ finish = '-';
+ else
+ finish = p_b_symbol(p);
+ } else
+ finish = start;
+/* xxx what about signed chars here... */
+ REQUIRE(start <= finish, REG_ERANGE);
+ for (i = start; i <= finish; i++)
+ CHadd(cs, i);
+ break;
+ }
+}
+
+/*
+ - p_b_cclass - parse a character-class name and deal with it
+ */
+static void
+p_b_cclass(struct parse *p, cset *cs)
+{
+ char *sp = p->next;
+ struct cclass *cp;
+ size_t len;
+ const char *u;
+ char c;
+
+ while (MORE() && isalpha((uch)PEEK()))
+ NEXT();
+ len = p->next - sp;
+ for (cp = cclasses; cp->name != NULL; cp++)
+ if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0')
+ break;
+ if (cp->name == NULL) {
+ /* oops, didn't find it */
+ SETERROR(REG_ECTYPE);
+ return;
+ }
+
+ u = cp->chars;
+ while ((c = *u++) != '\0')
+ CHadd(cs, c);
+ for (u = cp->multis; *u != '\0'; u += strlen(u) + 1)
+ MCadd(p, cs, u);
+}
+
+/*
+ - p_b_eclass - parse an equivalence-class name and deal with it
+ *
+ * This implementation is incomplete. xxx
+ */
+static void
+p_b_eclass(struct parse *p, cset *cs)
+{
+ char c;
+
+ c = p_b_coll_elem(p, '=');
+ CHadd(cs, c);
+}
+
+/*
+ - p_b_symbol - parse a character or [..]ed multicharacter collating symbol
+ */
+static char /* value of symbol */
+p_b_symbol(struct parse *p)
+{
+ char value;
+
+ REQUIRE(MORE(), REG_EBRACK);
+ if (!EATTWO('[', '.'))
+ return(GETNEXT());
+
+ /* collating symbol */
+ value = p_b_coll_elem(p, '.');
+ REQUIRE(EATTWO('.', ']'), REG_ECOLLATE);
+ return(value);
+}
+
+/*
+ - p_b_coll_elem - parse a collating-element name and look it up
+ */
+static char /* value of collating element */
+p_b_coll_elem(struct parse *p,
+ int endc) /* name ended by endc,']' */
+{
+ char *sp = p->next;
+ struct cname *cp;
+ int len;
+
+ while (MORE() && !SEETWO(endc, ']'))
+ NEXT();
+ if (!MORE()) {
+ SETERROR(REG_EBRACK);
+ return(0);
+ }
+ len = p->next - sp;
+ for (cp = cnames; cp->name != NULL; cp++)
+ if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0')
+ return(cp->code); /* known name */
+ if (len == 1)
+ return(*sp); /* single character */
+ SETERROR(REG_ECOLLATE); /* neither */
+ return(0);
+}
+
+/*
+ - othercase - return the case counterpart of an alphabetic
+ */
+static char /* if no counterpart, return ch */
+othercase(int ch)
+{
+ ch = (uch)ch;
+ assert(isalpha(ch));
+ if (isupper(ch))
+ return ((uch)tolower(ch));
+ else if (islower(ch))
+ return ((uch)toupper(ch));
+ else /* peculiar, but could happen */
+ return(ch);
+}
+
+/*
+ - bothcases - emit a dualcase version of a two-case character
+ *
+ * Boy, is this implementation ever a kludge...
+ */
+static void
+bothcases(struct parse *p, int ch)
+{
+ char *oldnext = p->next;
+ char *oldend = p->end;
+ char bracket[3];
+
+ ch = (uch)ch;
+ assert(othercase(ch) != ch); /* p_bracket() would recurse */
+ p->next = bracket;
+ p->end = bracket+2;
+ bracket[0] = ch;
+ bracket[1] = ']';
+ bracket[2] = '\0';
+ p_bracket(p);
+ assert(p->next == bracket+2);
+ p->next = oldnext;
+ p->end = oldend;
+}
+
+/*
+ - ordinary - emit an ordinary character
+ */
+static void
+ordinary(struct parse *p, int ch)
+{
+ cat_t *cap = p->g->categories;
+
+ if ((p->g->cflags&REG_ICASE) && isalpha((uch)ch) && othercase(ch) != ch)
+ bothcases(p, ch);
+ else {
+ EMIT(OCHAR, (uch)ch);
+ if (cap[ch] == 0)
+ cap[ch] = p->g->ncategories++;
+ }
+}
+
+/*
+ - nonnewline - emit REG_NEWLINE version of OANY
+ *
+ * Boy, is this implementation ever a kludge...
+ */
+static void
+nonnewline(struct parse *p)
+{
+ char *oldnext = p->next;
+ char *oldend = p->end;
+ char bracket[4];
+
+ p->next = bracket;
+ p->end = bracket+3;
+ bracket[0] = '^';
+ bracket[1] = '\n';
+ bracket[2] = ']';
+ bracket[3] = '\0';
+ p_bracket(p);
+ assert(p->next == bracket+3);
+ p->next = oldnext;
+ p->end = oldend;
+}
+
+/*
+ - repeat - generate code for a bounded repetition, recursively if needed
+ */
+static void
+repeat(struct parse *p,
+ sopno start, /* operand from here to end of strip */
+ int from, /* repeated from this number */
+ int to) /* to this number of times (maybe INFINITY) */
+{
+ sopno finish = HERE();
+# define N 2
+# define INF 3
+# define REP(f, t) ((f)*8 + (t))
+# define MAP(n) (((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N)
+ sopno copy;
+
+ if (p->error != 0) /* head off possible runaway recursion */
+ return;
+
+ assert(from <= to);
+
+ switch (REP(MAP(from), MAP(to))) {
+ case REP(0, 0): /* must be user doing this */
+ DROP(finish-start); /* drop the operand */
+ break;
+ case REP(0, 1): /* as x{1,1}? */
+ case REP(0, N): /* as x{1,n}? */
+ case REP(0, INF): /* as x{1,}? */
+ /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
+ INSERT(OCH_, start); /* offset is wrong... */
+ repeat(p, start+1, 1, to);
+ ASTERN(OOR1, start);
+ AHEAD(start); /* ... fix it */
+ EMIT(OOR2, 0);
+ AHEAD(THERE());
+ ASTERN(O_CH, THERETHERE());
+ break;
+ case REP(1, 1): /* trivial case */
+ /* done */
+ break;
+ case REP(1, N): /* as x?x{1,n-1} */
+ /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
+ INSERT(OCH_, start);
+ ASTERN(OOR1, start);
+ AHEAD(start);
+ EMIT(OOR2, 0); /* offset very wrong... */
+ AHEAD(THERE()); /* ...so fix it */
+ ASTERN(O_CH, THERETHERE());
+ copy = dupl(p, start+1, finish+1);
+ assert(copy == finish+4);
+ repeat(p, copy, 1, to-1);
+ break;
+ case REP(1, INF): /* as x+ */
+ INSERT(OPLUS_, start);
+ ASTERN(O_PLUS, start);
+ break;
+ case REP(N, N): /* as xx{m-1,n-1} */
+ copy = dupl(p, start, finish);
+ repeat(p, copy, from-1, to-1);
+ break;
+ case REP(N, INF): /* as xx{n-1,INF} */
+ copy = dupl(p, start, finish);
+ repeat(p, copy, from-1, to);
+ break;
+ default: /* "can't happen" */
+ SETERROR(REG_ASSERT); /* just in case */
+ break;
+ }
+}
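
[To make the REP()/MAP() dispatch used by repeat() above concrete, here is a minimal standalone sketch, not part of the imported sources, that only traces how a bounded repetition such as a{2,4} is decomposed; INF_COUNT stands in for the INFINITY sentinel defined elsewhere in this import.]

#include <stdio.h>

#define INF_COUNT 256                 /* stand-in for regcomp's INFINITY sentinel */
#define N   2
#define INF 3
#define REP(f, t) ((f)*8 + (t))
#define MAP(n) (((n) <= 1) ? (n) : ((n) == INF_COUNT) ? INF : N)

static void trace(int from, int to, int depth)
{
    printf("%*sx{%d,%d}\n", depth * 2, "", from, to);
    switch (REP(MAP(from), MAP(to))) {
    case REP(0, 1):                                             /* handled as (x|) then x{1,to} */
    case REP(0, N):
    case REP(0, INF): trace(1, to, depth + 1); break;
    case REP(1, N):   trace(1, to - 1, depth + 1); break;       /* x?x{1,n-1} */
    case REP(N, N):   trace(from - 1, to - 1, depth + 1); break; /* xx{m-1,n-1} */
    case REP(N, INF): trace(from - 1, to, depth + 1); break;     /* xx{n-1,} */
    default: break;   /* x{0,0}, x{1,1} and x{1,} need no further recursion */
    }
}

int main(void) { trace(2, 4, 0); return 0; }  /* prints x{2,4}, x{1,3}, x{1,2}, x{1,1} */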
+
+/*
+ - seterr - set an error condition
+ */
+static int /* useless but makes type checking happy */
+seterr(struct parse *p, int e)
+{
+ if (p->error == 0) /* keep earliest error condition */
+ p->error = e;
+ p->next = nuls; /* try to bring things to a halt */
+ p->end = nuls;
+ return(0); /* make the return value well-defined */
+}
+
+/*
+ - allocset - allocate a set of characters for []
+ */
+static cset *
+allocset(struct parse *p)
+{
+ int no = p->g->ncsets++;
+ size_t nc;
+ size_t nbytes;
+ cset *cs;
+ size_t css = (size_t)p->g->csetsize;
+ int i;
+
+ if (no >= p->ncsalloc) { /* need another column of space */
+ void *ptr;
+
+ p->ncsalloc += CHAR_BIT;
+ nc = p->ncsalloc;
+ if (nc > SIZE_MAX / sizeof(cset))
+ goto nomem;
+ assert(nc % CHAR_BIT == 0);
+ nbytes = nc / CHAR_BIT * css;
+
+ ptr = (cset *)realloc((char *)p->g->sets, nc * sizeof(cset));
+ if (ptr == NULL)
+ goto nomem;
+ p->g->sets = ptr;
+
+ ptr = (uch *)realloc((char *)p->g->setbits, nbytes);
+ if (ptr == NULL)
+ goto nomem;
+ p->g->setbits = ptr;
+
+ for (i = 0; i < no; i++)
+ p->g->sets[i].ptr = p->g->setbits + css*(i/CHAR_BIT);
+
+ (void) memset((char *)p->g->setbits + (nbytes - css), 0, css);
+ }
+ /* XXX should not happen */
+ if (p->g->sets == NULL || p->g->setbits == NULL)
+ goto nomem;
+
+ cs = &p->g->sets[no];
+ cs->ptr = p->g->setbits + css*((no)/CHAR_BIT);
+ cs->mask = 1 << ((no) % CHAR_BIT);
+ cs->hash = 0;
+ cs->smultis = 0;
+ cs->multis = NULL;
+
+ return(cs);
+nomem:
+ free(p->g->sets);
+ p->g->sets = NULL;
+ free(p->g->setbits);
+ p->g->setbits = NULL;
+
+ SETERROR(REG_ESPACE);
+ /* caller's responsibility not to do set ops */
+ return(NULL);
+}
+
+/*
+ - freeset - free a now-unused set
+ */
+static void
+freeset(struct parse *p, cset *cs)
+{
+ size_t i;
+ cset *top = &p->g->sets[p->g->ncsets];
+ size_t css = (size_t)p->g->csetsize;
+
+ for (i = 0; i < css; i++)
+ CHsub(cs, i);
+ if (cs == top-1) /* recover only the easy case */
+ p->g->ncsets--;
+}
+
+/*
+ - freezeset - final processing on a set of characters
+ *
+ * The main task here is merging identical sets. This is usually a waste
+ * of time (although the hash code minimizes the overhead), but can win
+ * big if REG_ICASE is being used. REG_ICASE, by the way, is why the hash
+ * is done using addition rather than xor -- all ASCII [aA] sets xor to
+ * the same value!
+ */
+static int /* set number */
+freezeset(struct parse *p, cset *cs)
+{
+ uch h = cs->hash;
+ size_t i;
+ cset *top = &p->g->sets[p->g->ncsets];
+ cset *cs2;
+ size_t css = (size_t)p->g->csetsize;
+
+ /* look for an earlier one which is the same */
+ for (cs2 = &p->g->sets[0]; cs2 < top; cs2++)
+ if (cs2->hash == h && cs2 != cs) {
+ /* maybe */
+ for (i = 0; i < css; i++)
+ if (!!CHIN(cs2, i) != !!CHIN(cs, i))
+ break; /* no */
+ if (i == css)
+ break; /* yes */
+ }
+
+ if (cs2 < top) { /* found one */
+ freeset(p, cs);
+ cs = cs2;
+ }
+
+ return((int)(cs - p->g->sets));
+}
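
[The comment above notes that the set hash is built by addition rather than xor. A standalone sketch of why, not part of the imported sources: under REG_ICASE each case pair lands in the same set, and every ASCII case pair xors to the same value (0x20), so an xor hash could not tell [aA], [bB], ... apart, while an additive hash can.]

#include <stdio.h>

int main(void)
{
    for (int c = 'a'; c <= 'c'; c++) {
        int u = c - ('a' - 'A');      /* the upper-case counterpart */
        printf("[%c%c]  xor: 0x%02x   add: 0x%02x\n",
               c, u, (unsigned)(c ^ u), (unsigned)(c + u));
    }
    return 0;                          /* xor is 0x20 for every pair; the sums all differ */
}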
+
+/*
+ - firstch - return first character in a set (which must have at least one)
+ */
+static int /* character; there is no "none" value */
+firstch(struct parse *p, cset *cs)
+{
+ size_t i;
+ size_t css = (size_t)p->g->csetsize;
+
+ for (i = 0; i < css; i++)
+ if (CHIN(cs, i))
+ return((char)i);
+ assert(never);
+ return(0); /* arbitrary */
+}
+
+/*
+ - nch - number of characters in a set
+ */
+static int
+nch(struct parse *p, cset *cs)
+{
+ size_t i;
+ size_t css = (size_t)p->g->csetsize;
+ int n = 0;
+
+ for (i = 0; i < css; i++)
+ if (CHIN(cs, i))
+ n++;
+ return(n);
+}
+
+/*
+ - mcadd - add a collating element to a cset
+ */
+static void
+mcadd( struct parse *p, cset *cs, const char *cp)
+{
+ size_t oldend = cs->smultis;
+ void *np;
+
+ cs->smultis += strlen(cp) + 1;
+ np = realloc(cs->multis, cs->smultis);
+ if (np == NULL) {
+ if (cs->multis)
+ free(cs->multis);
+ cs->multis = NULL;
+ SETERROR(REG_ESPACE);
+ return;
+ }
+ cs->multis = np;
+
+ llvm_strlcpy(cs->multis + oldend - 1, cp, cs->smultis - oldend + 1);
+}
+
+/*
+ - mcinvert - invert the list of collating elements in a cset
+ *
+ * This would have to know the set of possibilities. Implementation
+ * is deferred.
+ */
+/* ARGSUSED */
+static void
+mcinvert(struct parse *p, cset *cs)
+{
+ assert(cs->multis == NULL); /* xxx */
+}
+
+/*
+ - mccase - add case counterparts of the list of collating elements in a cset
+ *
+ * This would have to know the set of possibilities. Implementation
+ * is deferred.
+ */
+/* ARGSUSED */
+static void
+mccase(struct parse *p, cset *cs)
+{
+ assert(cs->multis == NULL); /* xxx */
+}
+
+/*
+ - isinsets - is this character in any sets?
+ */
+static int /* predicate */
+isinsets(struct re_guts *g, int c)
+{
+ uch *col;
+ int i;
+ int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT;
+ unsigned uc = (uch)c;
+
+ for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize)
+ if (col[uc] != 0)
+ return(1);
+ return(0);
+}
+
+/*
+ - samesets - are these two characters in exactly the same sets?
+ */
+static int /* predicate */
+samesets(struct re_guts *g, int c1, int c2)
+{
+ uch *col;
+ int i;
+ int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT;
+ unsigned uc1 = (uch)c1;
+ unsigned uc2 = (uch)c2;
+
+ for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize)
+ if (col[uc1] != col[uc2])
+ return(0);
+ return(1);
+}
+
+/*
+ - categorize - sort out character categories
+ */
+static void
+categorize(struct parse *p, struct re_guts *g)
+{
+ cat_t *cats = g->categories;
+ int c;
+ int c2;
+ cat_t cat;
+
+ /* avoid making error situations worse */
+ if (p->error != 0)
+ return;
+
+ for (c = CHAR_MIN; c <= CHAR_MAX; c++)
+ if (cats[c] == 0 && isinsets(g, c)) {
+ cat = g->ncategories++;
+ cats[c] = cat;
+ for (c2 = c+1; c2 <= CHAR_MAX; c2++)
+ if (cats[c2] == 0 && samesets(g, c, c2))
+ cats[c2] = cat;
+ }
+}
+
+/*
+ - dupl - emit a duplicate of a bunch of sops
+ */
+static sopno /* start of duplicate */
+dupl(struct parse *p,
+ sopno start, /* from here */
+ sopno finish) /* to this less one */
+{
+ sopno ret = HERE();
+ sopno len = finish - start;
+
+ assert(finish >= start);
+ if (len == 0)
+ return(ret);
+ enlarge(p, p->ssize + len); /* this many unexpected additions */
+ assert(p->ssize >= p->slen + len);
+ (void) memmove((char *)(p->strip + p->slen),
+ (char *)(p->strip + start), (size_t)len*sizeof(sop));
+ p->slen += len;
+ return(ret);
+}
+
+/*
+ - doemit - emit a strip operator
+ *
+ * It might seem better to implement this as a macro with a function as
+ * hard-case backup, but it's just too big and messy unless there are
+ * some changes to the data structures. Maybe later.
+ */
+static void
+doemit(struct parse *p, sop op, size_t opnd)
+{
+ /* avoid making error situations worse */
+ if (p->error != 0)
+ return;
+
+ /* deal with oversize operands ("can't happen", more or less) */
+ assert(opnd < 1<<OPSHIFT);
+
+ /* deal with undersized strip */
+ if (p->slen >= p->ssize)
+ enlarge(p, (p->ssize+1) / 2 * 3); /* +50% */
+ assert(p->slen < p->ssize);
+
+ /* finally, it's all reduced to the easy case */
+ p->strip[p->slen++] = SOP(op, opnd);
+}
+
+/*
+ - doinsert - insert a sop into the strip
+ */
+static void
+doinsert(struct parse *p, sop op, size_t opnd, sopno pos)
+{
+ sopno sn;
+ sop s;
+ int i;
+
+ /* avoid making error situations worse */
+ if (p->error != 0)
+ return;
+
+ sn = HERE();
+ EMIT(op, opnd); /* do checks, ensure space */
+ assert(HERE() == sn+1);
+ s = p->strip[sn];
+
+ /* adjust paren pointers */
+ assert(pos > 0);
+ for (i = 1; i < NPAREN; i++) {
+ if (p->pbegin[i] >= pos) {
+ p->pbegin[i]++;
+ }
+ if (p->pend[i] >= pos) {
+ p->pend[i]++;
+ }
+ }
+
+ memmove((char *)&p->strip[pos+1], (char *)&p->strip[pos],
+ (HERE()-pos-1)*sizeof(sop));
+ p->strip[pos] = s;
+}
+
+/*
+ - dofwd - complete a forward reference
+ */
+static void
+dofwd(struct parse *p, sopno pos, sop value)
+{
+ /* avoid making error situations worse */
+ if (p->error != 0)
+ return;
+
+ assert(value < 1<<OPSHIFT);
+ p->strip[pos] = OP(p->strip[pos]) | value;
+}
+
+/*
+ - enlarge - enlarge the strip
+ */
+static void
+enlarge(struct parse *p, sopno size)
+{
+ sop *sp;
+
+ if (p->ssize >= size)
+ return;
+
+ if ((uintptr_t)size > SIZE_MAX / sizeof(sop)) {
+ SETERROR(REG_ESPACE);
+ return;
+ }
+
+ sp = (sop *)realloc(p->strip, size*sizeof(sop));
+ if (sp == NULL) {
+ SETERROR(REG_ESPACE);
+ return;
+ }
+ p->strip = sp;
+ p->ssize = size;
+}
+
+/*
+ - stripsnug - compact the strip
+ */
+static void
+stripsnug(struct parse *p, struct re_guts *g)
+{
+ g->nstates = p->slen;
+ if ((uintptr_t)p->slen > SIZE_MAX / sizeof(sop)) {
+ g->strip = p->strip;
+ SETERROR(REG_ESPACE);
+ return;
+ }
+
+ g->strip = (sop *)realloc((char *)p->strip, p->slen * sizeof(sop));
+ if (g->strip == NULL) {
+ SETERROR(REG_ESPACE);
+ g->strip = p->strip;
+ }
+}
+
+/*
+ - findmust - fill in must and mlen with longest mandatory literal string
+ *
+ * This algorithm could do fancy things like analyzing the operands of |
+ * for common subsequences. Someday. This code is simple and finds most
+ * of the interesting cases.
+ *
+ * Note that must and mlen got initialized during setup.
+ */
+static void
+findmust(struct parse *p, struct re_guts *g)
+{
+ sop *scan;
+ sop *start = 0; /* start initialized in the default case, after that */
+ sop *newstart = 0; /* newstart was initialized in the OCHAR case */
+ sopno newlen;
+ sop s;
+ char *cp;
+ sopno i;
+
+ /* avoid making error situations worse */
+ if (p->error != 0)
+ return;
+
+ /* find the longest OCHAR sequence in strip */
+ newlen = 0;
+ scan = g->strip + 1;
+ do {
+ s = *scan++;
+ switch (OP(s)) {
+ case OCHAR: /* sequence member */
+ if (newlen == 0) /* new sequence */
+ newstart = scan - 1;
+ newlen++;
+ break;
+ case OPLUS_: /* things that don't break one */
+ case OLPAREN:
+ case ORPAREN:
+ break;
+ case OQUEST_: /* things that must be skipped */
+ case OCH_:
+ scan--;
+ do {
+ scan += OPND(s);
+ s = *scan;
+ /* assert() interferes w debug printouts */
+ if (OP(s) != O_QUEST && OP(s) != O_CH &&
+ OP(s) != OOR2) {
+ g->iflags |= REGEX_BAD;
+ return;
+ }
+ } while (OP(s) != O_QUEST && OP(s) != O_CH);
+ /* fallthrough */
+ default: /* things that break a sequence */
+ if (newlen > g->mlen) { /* ends one */
+ start = newstart;
+ g->mlen = newlen;
+ }
+ newlen = 0;
+ break;
+ }
+ } while (OP(s) != OEND);
+
+ if (g->mlen == 0) /* there isn't one */
+ return;
+
+ /* turn it into a character string */
+ g->must = malloc((size_t)g->mlen + 1);
+ if (g->must == NULL) { /* argh; just forget it */
+ g->mlen = 0;
+ return;
+ }
+ cp = g->must;
+ scan = start;
+ for (i = g->mlen; i > 0; i--) {
+ while (OP(s = *scan++) != OCHAR)
+ continue;
+ assert(cp < g->must + g->mlen);
+ *cp++ = (char)OPND(s);
+ }
+ assert(cp == g->must + g->mlen);
+ *cp++ = '\0'; /* just on general principles */
+}
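
[findmust() exists to feed the prescreening loop in the matcher further down in this import: for an ERE such as "(a|b)foobar" the longest mandatory literal is "foobar", and any subject string lacking it can be rejected without running the automaton. A standalone sketch of that idea, not part of the imported sources; the real matcher scans with memcmp() against g->must/g->mlen rather than calling strstr().]

#include <stdio.h>
#include <string.h>

/* cheap reject: text that lacks the mandatory literal cannot match */
static int worth_running_engine(const char *text, const char *must)
{
    return strstr(text, must) != NULL;
}

int main(void)
{
    printf("%d\n", worth_running_engine("xx-a-foobar-zz", "foobar"));  /* 1: run the engine */
    printf("%d\n", worth_running_engine("xx-a-foobaz-zz", "foobar"));  /* 0: rejected early */
    return 0;
}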
+
+/*
+ - pluscount - count + nesting
+ */
+static sopno /* nesting depth */
+pluscount(struct parse *p, struct re_guts *g)
+{
+ sop *scan;
+ sop s;
+ sopno plusnest = 0;
+ sopno maxnest = 0;
+
+ if (p->error != 0)
+ return(0); /* there may not be an OEND */
+
+ scan = g->strip + 1;
+ do {
+ s = *scan++;
+ switch (OP(s)) {
+ case OPLUS_:
+ plusnest++;
+ break;
+ case O_PLUS:
+ if (plusnest > maxnest)
+ maxnest = plusnest;
+ plusnest--;
+ break;
+ }
+ } while (OP(s) != OEND);
+ if (plusnest != 0)
+ g->iflags |= REGEX_BAD;
+ return(maxnest);
+}
diff --git a/ext/src/llvm/regengine.inc b/ext/src/llvm/regengine.inc
new file mode 100644
index 0000000..62d8c26
--- /dev/null
+++ b/ext/src/llvm/regengine.inc
@@ -0,0 +1,1034 @@
+/*-
+ * This code is derived from OpenBSD's libc/regex, original license follows:
+ *
+ * Copyright (c) 1992, 1993, 1994 Henry Spencer.
+ * Copyright (c) 1992, 1993, 1994
+ * The Regents of the University of California. All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Henry Spencer.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)engine.c 8.5 (Berkeley) 3/20/94
+ */
+
+/*
+ * The matching engine and friends. This file is #included by regexec.c
+ * after suitable #defines of a variety of macros used herein, so that
+ * different state representations can be used without duplicating masses
+ * of code.
+ */
+
+#ifdef SNAMES
+#define matcher smatcher
+#define fast sfast
+#define slow sslow
+#define dissect sdissect
+#define backref sbackref
+#define step sstep
+#define print sprint
+#define at sat
+#define match smat
+#define nope snope
+#endif
+#ifdef LNAMES
+#define matcher lmatcher
+#define fast lfast
+#define slow lslow
+#define dissect ldissect
+#define backref lbackref
+#define step lstep
+#define print lprint
+#define at lat
+#define match lmat
+#define nope lnope
+#endif
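
[The two blocks above only rename symbols; the point, as the file comment says, is that regexec.c defines one state representation, includes this file once under SNAMES, then redefines the state macros and includes it again under LNAMES. A rough sketch of that inclusion pattern, not part of the imported sources; the concrete macro definitions live in regexec.c and the names below are illustrative only.]

/* in regexec.c, roughly: */
#define states unsigned long          /* small sets: one machine word of state bits */
/* ... CLEAR/SET1/ISSET/FWD/... defined as bit operations on that word ... */
#define SNAMES
#include "regengine.inc"              /* emits smatcher(), sfast(), sslow(), sdissect(), ... */
#undef  SNAMES
/* ... then redefine states & friends as a wider bit-vector representation ... */
#define LNAMES
#include "regengine.inc"              /* emits lmatcher(), lfast(), lslow(), ldissect(), ... */
#undef  LNAMES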
+
+/* another structure passed up and down to avoid zillions of parameters */
+struct match {
+ struct re_guts *g;
+ int eflags;
+ llvm_regmatch_t *pmatch; /* [nsub+1] (0 element unused) */
+ const char *offp; /* offsets work from here */
+ const char *beginp; /* start of string -- virtual NUL precedes */
+ const char *endp; /* end of string -- virtual NUL here */
+ const char *coldp; /* can be no match starting before here */
+ const char **lastpos; /* [nplus+1] */
+ STATEVARS;
+ states st; /* current states */
+ states fresh; /* states for a fresh start */
+ states tmp; /* temporary */
+ states empty; /* empty set of states */
+};
+
+static int matcher(struct re_guts *, const char *, size_t,
+ llvm_regmatch_t[], int);
+static const char *dissect(struct match *, const char *, const char *, sopno,
+ sopno);
+static const char *backref(struct match *, const char *, const char *, sopno,
+ sopno, sopno, int);
+static const char *fast(struct match *, const char *, const char *, sopno, sopno);
+static const char *slow(struct match *, const char *, const char *, sopno, sopno);
+static states step(struct re_guts *, sopno, sopno, states, int, states);
+#define MAX_RECURSION 100
+#define BOL (OUT+1)
+#define EOL (BOL+1)
+#define BOLEOL (BOL+2)
+#define NOTHING (BOL+3)
+#define BOW (BOL+4)
+#define EOW (BOL+5)
+#define CODEMAX (BOL+5) /* highest code used */
+#define NONCHAR(c) ((c) > CHAR_MAX)
+#define NNONCHAR (CODEMAX-CHAR_MAX)
+#ifdef REDEBUG
+static void print(struct match *, char *, states, int, FILE *);
+#endif
+#ifdef REDEBUG
+static void at(struct match *, char *, char *, char *, sopno, sopno);
+#endif
+#ifdef REDEBUG
+static char *pchar(int);
+#endif
+
+#ifdef REDEBUG
+#define SP(t, s, c) print(m, t, s, c, stdout)
+#define AT(t, p1, p2, s1, s2) at(m, t, p1, p2, s1, s2)
+#define NOTE(str) { if (m->eflags&REG_TRACE) (void)printf("=%s\n", (str)); }
+static int nope = 0;
+#else
+#define SP(t, s, c) /* nothing */
+#define AT(t, p1, p2, s1, s2) /* nothing */
+#define NOTE(s) /* nothing */
+#endif
+
+/*
+ - matcher - the actual matching engine
+ */
+static int /* 0 success, REG_NOMATCH failure */
+matcher(struct re_guts *g, const char *string, size_t nmatch,
+ llvm_regmatch_t pmatch[],
+ int eflags)
+{
+ const char *endp;
+ size_t i;
+ struct match mv;
+ struct match *m = &mv;
+ const char *dp;
+ const sopno gf = g->firststate+1; /* +1 for OEND */
+ const sopno gl = g->laststate;
+ const char *start;
+ const char *stop;
+
+ /* simplify the situation where possible */
+ if (g->cflags&REG_NOSUB)
+ nmatch = 0;
+ if (eflags&REG_STARTEND) {
+ start = string + pmatch[0].rm_so;
+ stop = string + pmatch[0].rm_eo;
+ } else {
+ start = string;
+ stop = start + strlen(start);
+ }
+ if (stop < start)
+ return(REG_INVARG);
+
+ /* prescreening; this does wonders for this rather slow code */
+ if (g->must != NULL) {
+ for (dp = start; dp < stop; dp++)
+ if (*dp == g->must[0] && stop - dp >= g->mlen &&
+ memcmp(dp, g->must, (size_t)g->mlen) == 0)
+ break;
+ if (dp == stop) /* we didn't find g->must */
+ return(REG_NOMATCH);
+ }
+
+ /* match struct setup */
+ m->g = g;
+ m->eflags = eflags;
+ m->pmatch = NULL;
+ m->lastpos = NULL;
+ m->offp = string;
+ m->beginp = start;
+ m->endp = stop;
+ STATESETUP(m, 4);
+ SETUP(m->st);
+ SETUP(m->fresh);
+ SETUP(m->tmp);
+ SETUP(m->empty);
+ CLEAR(m->empty);
+
+ /* this loop does only one repetition except for backrefs */
+ for (;;) {
+ endp = fast(m, start, stop, gf, gl);
+ if (endp == NULL) { /* a miss */
+ free(m->pmatch);
+ free((void*)m->lastpos);
+ STATETEARDOWN(m);
+ return(REG_NOMATCH);
+ }
+ if (nmatch == 0 && !g->backrefs)
+ break; /* no further info needed */
+
+ /* where? */
+ assert(m->coldp != NULL);
+ for (;;) {
+ NOTE("finding start");
+ endp = slow(m, m->coldp, stop, gf, gl);
+ if (endp != NULL)
+ break;
+ assert(m->coldp < m->endp);
+ m->coldp++;
+ }
+ if (nmatch == 1 && !g->backrefs)
+ break; /* no further info needed */
+
+ /* oh my, they want the subexpressions... */
+ if (m->pmatch == NULL)
+ m->pmatch = (llvm_regmatch_t *)malloc((m->g->nsub + 1) *
+ sizeof(llvm_regmatch_t));
+ if (m->pmatch == NULL) {
+ STATETEARDOWN(m);
+ return(REG_ESPACE);
+ }
+ for (i = 1; i <= m->g->nsub; i++)
+ m->pmatch[i].rm_so = m->pmatch[i].rm_eo = -1;
+ if (!g->backrefs && !(m->eflags&REG_BACKR)) {
+ NOTE("dissecting");
+ dp = dissect(m, m->coldp, endp, gf, gl);
+ } else {
+ if (g->nplus > 0 && m->lastpos == NULL)
+ m->lastpos = (const char **)malloc((g->nplus+1) *
+ sizeof(char *));
+ if (g->nplus > 0 && m->lastpos == NULL) {
+ free(m->pmatch);
+ STATETEARDOWN(m);
+ return(REG_ESPACE);
+ }
+ NOTE("backref dissect");
+ dp = backref(m, m->coldp, endp, gf, gl, (sopno)0, 0);
+ }
+ if (dp != NULL)
+ break;
+
+ /* uh-oh... we couldn't find a subexpression-level match */
+ assert(g->backrefs); /* must be back references doing it */
+ assert(g->nplus == 0 || m->lastpos != NULL);
+ for (;;) {
+ if (dp != NULL || endp <= m->coldp)
+ break; /* defeat */
+ NOTE("backoff");
+ endp = slow(m, m->coldp, endp-1, gf, gl);
+ if (endp == NULL)
+ break; /* defeat */
+ /* try it on a shorter possibility */
+#ifndef NDEBUG
+ for (i = 1; i <= m->g->nsub; i++) {
+ assert(m->pmatch[i].rm_so == -1);
+ assert(m->pmatch[i].rm_eo == -1);
+ }
+#endif
+ NOTE("backoff dissect");
+ dp = backref(m, m->coldp, endp, gf, gl, (sopno)0, 0);
+ }
+ assert(dp == NULL || dp == endp);
+ if (dp != NULL) /* found a shorter one */
+ break;
+
+ /* despite initial appearances, there is no match here */
+ NOTE("false alarm");
+ if (m->coldp == stop)
+ break;
+ start = m->coldp + 1; /* recycle starting later */
+ }
+
+ /* fill in the details if requested */
+ if (nmatch > 0) {
+ pmatch[0].rm_so = m->coldp - m->offp;
+ pmatch[0].rm_eo = endp - m->offp;
+ }
+ if (nmatch > 1) {
+ assert(m->pmatch != NULL);
+ for (i = 1; i < nmatch; i++)
+ if (i <= m->g->nsub)
+ pmatch[i] = m->pmatch[i];
+ else {
+ pmatch[i].rm_so = -1;
+ pmatch[i].rm_eo = -1;
+ }
+ }
+
+ if (m->pmatch != NULL)
+ free((char *)m->pmatch);
+ if (m->lastpos != NULL)
+ free((char *)m->lastpos);
+ STATETEARDOWN(m);
+ return(0);
+}
+
+/*
+ - dissect - figure out what matched what, no back references
+ */
+static const char * /* == stop (success) always */
+dissect(struct match *m, const char *start, const char *stop, sopno startst,
+ sopno stopst)
+{
+ int i;
+ sopno ss; /* start sop of current subRE */
+ sopno es; /* end sop of current subRE */
+ const char *sp; /* start of string matched by it */
+ const char *stp; /* string matched by it cannot pass here */
+ const char *rest; /* start of rest of string */
+ const char *tail; /* string unmatched by rest of RE */
+ sopno ssub; /* start sop of subsubRE */
+ sopno esub; /* end sop of subsubRE */
+ const char *ssp; /* start of string matched by subsubRE */
+ const char *sep; /* end of string matched by subsubRE */
+ const char *oldssp; /* previous ssp */
+
+ AT("diss", start, stop, startst, stopst);
+ sp = start;
+ for (ss = startst; ss < stopst; ss = es) {
+ /* identify end of subRE */
+ es = ss;
+ switch (OP(m->g->strip[es])) {
+ case OPLUS_:
+ case OQUEST_:
+ es += OPND(m->g->strip[es]);
+ break;
+ case OCH_:
+ while (OP(m->g->strip[es]) != O_CH)
+ es += OPND(m->g->strip[es]);
+ break;
+ }
+ es++;
+
+ /* figure out what it matched */
+ switch (OP(m->g->strip[ss])) {
+ case OEND:
+ assert(nope);
+ break;
+ case OCHAR:
+ sp++;
+ break;
+ case OBOL:
+ case OEOL:
+ case OBOW:
+ case OEOW:
+ break;
+ case OANY:
+ case OANYOF:
+ sp++;
+ break;
+ case OBACK_:
+ case O_BACK:
+ assert(nope);
+ break;
+ /* cases where length of match is hard to find */
+ case OQUEST_:
+ stp = stop;
+ for (;;) {
+ /* how long could this one be? */
+ rest = slow(m, sp, stp, ss, es);
+ assert(rest != NULL); /* it did match */
+ /* could the rest match the rest? */
+ tail = slow(m, rest, stop, es, stopst);
+ if (tail == stop)
+ break; /* yes! */
+ /* no -- try a shorter match for this one */
+ stp = rest - 1;
+ assert(stp >= sp); /* it did work */
+ }
+ ssub = ss + 1;
+ esub = es - 1;
+ /* did innards match? */
+ if (slow(m, sp, rest, ssub, esub) != NULL) {
+ const char *dp = dissect(m, sp, rest, ssub, esub);
+ (void)dp; /* avoid warning if assertions off */
+ assert(dp == rest);
+ } else /* no */
+ assert(sp == rest);
+ sp = rest;
+ break;
+ case OPLUS_:
+ stp = stop;
+ for (;;) {
+ /* how long could this one be? */
+ rest = slow(m, sp, stp, ss, es);
+ assert(rest != NULL); /* it did match */
+ /* could the rest match the rest? */
+ tail = slow(m, rest, stop, es, stopst);
+ if (tail == stop)
+ break; /* yes! */
+ /* no -- try a shorter match for this one */
+ stp = rest - 1;
+ assert(stp >= sp); /* it did work */
+ }
+ ssub = ss + 1;
+ esub = es - 1;
+ ssp = sp;
+ oldssp = ssp;
+ for (;;) { /* find last match of innards */
+ sep = slow(m, ssp, rest, ssub, esub);
+ if (sep == NULL || sep == ssp)
+ break; /* failed or matched null */
+ oldssp = ssp; /* on to next try */
+ ssp = sep;
+ }
+ if (sep == NULL) {
+ /* last successful match */
+ sep = ssp;
+ ssp = oldssp;
+ }
+ assert(sep == rest); /* must exhaust substring */
+ assert(slow(m, ssp, sep, ssub, esub) == rest);
+ {
+ const char *dp = dissect(m, ssp, sep, ssub, esub);
+ (void)dp; /* avoid warning if assertions off */
+ assert(dp == sep);
+ }
+ sp = rest;
+ break;
+ case OCH_:
+ stp = stop;
+ for (;;) {
+ /* how long could this one be? */
+ rest = slow(m, sp, stp, ss, es);
+ assert(rest != NULL); /* it did match */
+ /* could the rest match the rest? */
+ tail = slow(m, rest, stop, es, stopst);
+ if (tail == stop)
+ break; /* yes! */
+ /* no -- try a shorter match for this one */
+ stp = rest - 1;
+ assert(stp >= sp); /* it did work */
+ }
+ ssub = ss + 1;
+ esub = ss + OPND(m->g->strip[ss]) - 1;
+ assert(OP(m->g->strip[esub]) == OOR1);
+ for (;;) { /* find first matching branch */
+ if (slow(m, sp, rest, ssub, esub) == rest)
+ break; /* it matched all of it */
+ /* that one missed, try next one */
+ assert(OP(m->g->strip[esub]) == OOR1);
+ esub++;
+ assert(OP(m->g->strip[esub]) == OOR2);
+ ssub = esub + 1;
+ esub += OPND(m->g->strip[esub]);
+ if (OP(m->g->strip[esub]) == OOR2)
+ esub--;
+ else
+ assert(OP(m->g->strip[esub]) == O_CH);
+ }
+ {
+ const char *dp = dissect(m, sp, rest, ssub, esub);
+ (void)dp; /* avoid warning if assertions off */
+ assert(dp == rest);
+ }
+ sp = rest;
+ break;
+ case O_PLUS:
+ case O_QUEST:
+ case OOR1:
+ case OOR2:
+ case O_CH:
+ assert(nope);
+ break;
+ case OLPAREN:
+ i = OPND(m->g->strip[ss]);
+ assert(0 < i && i <= m->g->nsub);
+ m->pmatch[i].rm_so = sp - m->offp;
+ break;
+ case ORPAREN:
+ i = OPND(m->g->strip[ss]);
+ assert(0 < i && i <= m->g->nsub);
+ m->pmatch[i].rm_eo = sp - m->offp;
+ break;
+ default: /* uh oh */
+ assert(nope);
+ break;
+ }
+ }
+
+ assert(sp == stop);
+ return(sp);
+}
+
+/*
+ - backref - figure out what matched what, figuring in back references
+ */
+static const char * /* == stop (success) or NULL (failure) */
+backref(struct match *m, const char *start, const char *stop, sopno startst,
+ sopno stopst, sopno lev, int rec) /* PLUS nesting level */
+{
+ int i;
+ sopno ss; /* start sop of current subRE */
+ const char *sp; /* start of string matched by it */
+ sopno ssub; /* start sop of subsubRE */
+ sopno esub; /* end sop of subsubRE */
+ const char *ssp; /* start of string matched by subsubRE */
+ const char *dp;
+ size_t len;
+ int hard;
+ sop s;
+ llvm_regoff_t offsave;
+ cset *cs;
+
+ AT("back", start, stop, startst, stopst);
+ sp = start;
+
+ /* get as far as we can with easy stuff */
+ hard = 0;
+ for (ss = startst; !hard && ss < stopst; ss++)
+ switch (OP(s = m->g->strip[ss])) {
+ case OCHAR:
+ if (sp == stop || *sp++ != (char)OPND(s))
+ return(NULL);
+ break;
+ case OANY:
+ if (sp == stop)
+ return(NULL);
+ sp++;
+ break;
+ case OANYOF:
+ cs = &m->g->sets[OPND(s)];
+ if (sp == stop || !CHIN(cs, *sp++))
+ return(NULL);
+ break;
+ case OBOL:
+ if ( (sp == m->beginp && !(m->eflags&REG_NOTBOL)) ||
+ (sp < m->endp && *(sp-1) == '\n' &&
+ (m->g->cflags&REG_NEWLINE)) )
+ { /* yes */ }
+ else
+ return(NULL);
+ break;
+ case OEOL:
+ if ( (sp == m->endp && !(m->eflags&REG_NOTEOL)) ||
+ (sp < m->endp && *sp == '\n' &&
+ (m->g->cflags&REG_NEWLINE)) )
+ { /* yes */ }
+ else
+ return(NULL);
+ break;
+ case OBOW:
+ if (( (sp == m->beginp && !(m->eflags&REG_NOTBOL)) ||
+ (sp < m->endp && *(sp-1) == '\n' &&
+ (m->g->cflags&REG_NEWLINE)) ||
+ (sp > m->beginp &&
+ !ISWORD(*(sp-1))) ) &&
+ (sp < m->endp && ISWORD(*sp)) )
+ { /* yes */ }
+ else
+ return(NULL);
+ break;
+ case OEOW:
+ if (( (sp == m->endp && !(m->eflags&REG_NOTEOL)) ||
+ (sp < m->endp && *sp == '\n' &&
+ (m->g->cflags&REG_NEWLINE)) ||
+ (sp < m->endp && !ISWORD(*sp)) ) &&
+ (sp > m->beginp && ISWORD(*(sp-1))) )
+ { /* yes */ }
+ else
+ return(NULL);
+ break;
+ case O_QUEST:
+ break;
+ case OOR1: /* matches null but needs to skip */
+ ss++;
+ s = m->g->strip[ss];
+ do {
+ assert(OP(s) == OOR2);
+ ss += OPND(s);
+ } while (OP(s = m->g->strip[ss]) != O_CH);
+ /* note that the ss++ gets us past the O_CH */
+ break;
+ default: /* have to make a choice */
+ hard = 1;
+ break;
+ }
+ if (!hard) { /* that was it! */
+ if (sp != stop)
+ return(NULL);
+ return(sp);
+ }
+ ss--; /* adjust for the for's final increment */
+
+ /* the hard stuff */
+ AT("hard", sp, stop, ss, stopst);
+ s = m->g->strip[ss];
+ switch (OP(s)) {
+ case OBACK_: /* the vilest depths */
+ i = OPND(s);
+ assert(0 < i && i <= m->g->nsub);
+ if (m->pmatch[i].rm_eo == -1)
+ return(NULL);
+ assert(m->pmatch[i].rm_so != -1);
+ len = m->pmatch[i].rm_eo - m->pmatch[i].rm_so;
+ if (len == 0 && rec++ > MAX_RECURSION)
+ return(NULL);
+ assert(stop - m->beginp >= len);
+ if (sp > stop - len)
+ return(NULL); /* not enough left to match */
+ ssp = m->offp + m->pmatch[i].rm_so;
+ if (memcmp(sp, ssp, len) != 0)
+ return(NULL);
+ while (m->g->strip[ss] != SOP(O_BACK, i))
+ ss++;
+ return(backref(m, sp+len, stop, ss+1, stopst, lev, rec));
+ break;
+ case OQUEST_: /* to null or not */
+ dp = backref(m, sp, stop, ss+1, stopst, lev, rec);
+ if (dp != NULL)
+ return(dp); /* not */
+ return(backref(m, sp, stop, ss+OPND(s)+1, stopst, lev, rec));
+ break;
+ case OPLUS_:
+ assert(m->lastpos != NULL);
+ assert(lev+1 <= m->g->nplus);
+ m->lastpos[lev+1] = sp;
+ return(backref(m, sp, stop, ss+1, stopst, lev+1, rec));
+ break;
+ case O_PLUS:
+ if (sp == m->lastpos[lev]) /* last pass matched null */
+ return(backref(m, sp, stop, ss+1, stopst, lev-1, rec));
+ /* try another pass */
+ m->lastpos[lev] = sp;
+ dp = backref(m, sp, stop, ss-OPND(s)+1, stopst, lev, rec);
+ if (dp == NULL)
+ return(backref(m, sp, stop, ss+1, stopst, lev-1, rec));
+ else
+ return(dp);
+ break;
+ case OCH_: /* find the right one, if any */
+ ssub = ss + 1;
+ esub = ss + OPND(s) - 1;
+ assert(OP(m->g->strip[esub]) == OOR1);
+ for (;;) { /* find first matching branch */
+ dp = backref(m, sp, stop, ssub, esub, lev, rec);
+ if (dp != NULL)
+ return(dp);
+ /* that one missed, try next one */
+ if (OP(m->g->strip[esub]) == O_CH)
+ return(NULL); /* there is none */
+ esub++;
+ assert(OP(m->g->strip[esub]) == OOR2);
+ ssub = esub + 1;
+ esub += OPND(m->g->strip[esub]);
+ if (OP(m->g->strip[esub]) == OOR2)
+ esub--;
+ else
+ assert(OP(m->g->strip[esub]) == O_CH);
+ }
+ break;
+ case OLPAREN: /* must undo assignment if rest fails */
+ i = OPND(s);
+ assert(0 < i && i <= m->g->nsub);
+ offsave = m->pmatch[i].rm_so;
+ m->pmatch[i].rm_so = sp - m->offp;
+ dp = backref(m, sp, stop, ss+1, stopst, lev, rec);
+ if (dp != NULL)
+ return(dp);
+ m->pmatch[i].rm_so = offsave;
+ return(NULL);
+ break;
+ case ORPAREN: /* must undo assignment if rest fails */
+ i = OPND(s);
+ assert(0 < i && i <= m->g->nsub);
+ offsave = m->pmatch[i].rm_eo;
+ m->pmatch[i].rm_eo = sp - m->offp;
+ dp = backref(m, sp, stop, ss+1, stopst, lev, rec);
+ if (dp != NULL)
+ return(dp);
+ m->pmatch[i].rm_eo = offsave;
+ return(NULL);
+ break;
+ default: /* uh oh */
+ assert(nope);
+ break;
+ }
+
+ /* "can't happen" */
+ assert(nope);
+ /* NOTREACHED */
+ return NULL;
+}
+
+/*
+ - fast - step through the string at top speed
+ */
+static const char * /* where tentative match ended, or NULL */
+fast(struct match *m, const char *start, const char *stop, sopno startst,
+ sopno stopst)
+{
+ states st = m->st;
+ states fresh = m->fresh;
+ states tmp = m->tmp;
+ const char *p = start;
+ int c = (start == m->beginp) ? OUT : *(start-1);
+ int lastc; /* previous c */
+ int flagch;
+ int i;
+ const char *coldp; /* last p after which no match was underway */
+
+ CLEAR(st);
+ SET1(st, startst);
+ st = step(m->g, startst, stopst, st, NOTHING, st);
+ ASSIGN(fresh, st);
+ SP("start", st, *p);
+ coldp = NULL;
+ for (;;) {
+ /* next character */
+ lastc = c;
+ c = (p == m->endp) ? OUT : *p;
+ if (EQ(st, fresh))
+ coldp = p;
+
+ /* is there an EOL and/or BOL between lastc and c? */
+ flagch = '\0';
+ i = 0;
+ if ( (lastc == '\n' && m->g->cflags&REG_NEWLINE) ||
+ (lastc == OUT && !(m->eflags&REG_NOTBOL)) ) {
+ flagch = BOL;
+ i = m->g->nbol;
+ }
+ if ( (c == '\n' && m->g->cflags&REG_NEWLINE) ||
+ (c == OUT && !(m->eflags&REG_NOTEOL)) ) {
+ flagch = (flagch == BOL) ? BOLEOL : EOL;
+ i += m->g->neol;
+ }
+ if (i != 0) {
+ for (; i > 0; i--)
+ st = step(m->g, startst, stopst, st, flagch, st);
+ SP("boleol", st, c);
+ }
+
+ /* how about a word boundary? */
+ if ( (flagch == BOL || (lastc != OUT && !ISWORD(lastc))) &&
+ (c != OUT && ISWORD(c)) ) {
+ flagch = BOW;
+ }
+ if ( (lastc != OUT && ISWORD(lastc)) &&
+ (flagch == EOL || (c != OUT && !ISWORD(c))) ) {
+ flagch = EOW;
+ }
+ if (flagch == BOW || flagch == EOW) {
+ st = step(m->g, startst, stopst, st, flagch, st);
+ SP("boweow", st, c);
+ }
+
+ /* are we done? */
+ if (ISSET(st, stopst) || p == stop)
+ break; /* NOTE BREAK OUT */
+
+ /* no, we must deal with this character */
+ ASSIGN(tmp, st);
+ ASSIGN(st, fresh);
+ assert(c != OUT);
+ st = step(m->g, startst, stopst, tmp, c, st);
+ SP("aft", st, c);
+ assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st));
+ p++;
+ }
+
+ assert(coldp != NULL);
+ m->coldp = coldp;
+ if (ISSET(st, stopst))
+ return(p+1);
+ else
+ return(NULL);
+}
+
+/*
+ - slow - step through the string more deliberately
+ */
+static const char * /* where it ended */
+slow(struct match *m, const char *start, const char *stop, sopno startst,
+ sopno stopst)
+{
+ states st = m->st;
+ states empty = m->empty;
+ states tmp = m->tmp;
+ const char *p = start;
+ int c = (start == m->beginp) ? OUT : *(start-1);
+ int lastc; /* previous c */
+ int flagch;
+ int i;
+ const char *matchp; /* last p at which a match ended */
+
+ AT("slow", start, stop, startst, stopst);
+ CLEAR(st);
+ SET1(st, startst);
+ SP("sstart", st, *p);
+ st = step(m->g, startst, stopst, st, NOTHING, st);
+ matchp = NULL;
+ for (;;) {
+ /* next character */
+ lastc = c;
+ c = (p == m->endp) ? OUT : *p;
+
+ /* is there an EOL and/or BOL between lastc and c? */
+ flagch = '\0';
+ i = 0;
+ if ( (lastc == '\n' && m->g->cflags&REG_NEWLINE) ||
+ (lastc == OUT && !(m->eflags&REG_NOTBOL)) ) {
+ flagch = BOL;
+ i = m->g->nbol;
+ }
+ if ( (c == '\n' && m->g->cflags&REG_NEWLINE) ||
+ (c == OUT && !(m->eflags&REG_NOTEOL)) ) {
+ flagch = (flagch == BOL) ? BOLEOL : EOL;
+ i += m->g->neol;
+ }
+ if (i != 0) {
+ for (; i > 0; i--)
+ st = step(m->g, startst, stopst, st, flagch, st);
+ SP("sboleol", st, c);
+ }
+
+ /* how about a word boundary? */
+ if ( (flagch == BOL || (lastc != OUT && !ISWORD(lastc))) &&
+ (c != OUT && ISWORD(c)) ) {
+ flagch = BOW;
+ }
+ if ( (lastc != OUT && ISWORD(lastc)) &&
+ (flagch == EOL || (c != OUT && !ISWORD(c))) ) {
+ flagch = EOW;
+ }
+ if (flagch == BOW || flagch == EOW) {
+ st = step(m->g, startst, stopst, st, flagch, st);
+ SP("sboweow", st, c);
+ }
+
+ /* are we done? */
+ if (ISSET(st, stopst))
+ matchp = p;
+ if (EQ(st, empty) || p == stop)
+ break; /* NOTE BREAK OUT */
+
+ /* no, we must deal with this character */
+ ASSIGN(tmp, st);
+ ASSIGN(st, empty);
+ assert(c != OUT);
+ st = step(m->g, startst, stopst, tmp, c, st);
+ SP("saft", st, c);
+ assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st));
+ p++;
+ }
+
+ return(matchp);
+}
+
+
+/*
+ - step - map set of states reachable before char to set reachable after
+ */
+static states
+step(struct re_guts *g,
+ sopno start, /* start state within strip */
+ sopno stop, /* state after stop state within strip */
+ states bef, /* states reachable before */
+ int ch, /* character or NONCHAR code */
+ states aft) /* states already known reachable after */
+{
+ cset *cs;
+ sop s;
+ sopno pc;
+ onestate here; /* note, macros know this name */
+ sopno look;
+ int i;
+
+ for (pc = start, INIT(here, pc); pc != stop; pc++, INC(here)) {
+ s = g->strip[pc];
+ switch (OP(s)) {
+ case OEND:
+ assert(pc == stop-1);
+ break;
+ case OCHAR:
+ /* only characters can match */
+ assert(!NONCHAR(ch) || ch != (char)OPND(s));
+ if (ch == (char)OPND(s))
+ FWD(aft, bef, 1);
+ break;
+ case OBOL:
+ if (ch == BOL || ch == BOLEOL)
+ FWD(aft, bef, 1);
+ break;
+ case OEOL:
+ if (ch == EOL || ch == BOLEOL)
+ FWD(aft, bef, 1);
+ break;
+ case OBOW:
+ if (ch == BOW)
+ FWD(aft, bef, 1);
+ break;
+ case OEOW:
+ if (ch == EOW)
+ FWD(aft, bef, 1);
+ break;
+ case OANY:
+ if (!NONCHAR(ch))
+ FWD(aft, bef, 1);
+ break;
+ case OANYOF:
+ cs = &g->sets[OPND(s)];
+ if (!NONCHAR(ch) && CHIN(cs, ch))
+ FWD(aft, bef, 1);
+ break;
+ case OBACK_: /* ignored here */
+ case O_BACK:
+ FWD(aft, aft, 1);
+ break;
+ case OPLUS_: /* forward, this is just an empty */
+ FWD(aft, aft, 1);
+ break;
+ case O_PLUS: /* both forward and back */
+ FWD(aft, aft, 1);
+ i = ISSETBACK(aft, OPND(s));
+ BACK(aft, aft, OPND(s));
+ if (!i && ISSETBACK(aft, OPND(s))) {
+ /* oho, must reconsider loop body */
+ pc -= OPND(s) + 1;
+ INIT(here, pc);
+ }
+ break;
+ case OQUEST_: /* two branches, both forward */
+ FWD(aft, aft, 1);
+ FWD(aft, aft, OPND(s));
+ break;
+ case O_QUEST: /* just an empty */
+ FWD(aft, aft, 1);
+ break;
+ case OLPAREN: /* not significant here */
+ case ORPAREN:
+ FWD(aft, aft, 1);
+ break;
+ case OCH_: /* mark the first two branches */
+ FWD(aft, aft, 1);
+ assert(OP(g->strip[pc+OPND(s)]) == OOR2);
+ FWD(aft, aft, OPND(s));
+ break;
+ case OOR1: /* done a branch, find the O_CH */
+ if (ISSTATEIN(aft, here)) {
+ for (look = 1;
+ OP(s = g->strip[pc+look]) != O_CH;
+ look += OPND(s))
+ assert(OP(s) == OOR2);
+ FWD(aft, aft, look);
+ }
+ break;
+ case OOR2: /* propagate OCH_'s marking */
+ FWD(aft, aft, 1);
+ if (OP(g->strip[pc+OPND(s)]) != O_CH) {
+ assert(OP(g->strip[pc+OPND(s)]) == OOR2);
+ FWD(aft, aft, OPND(s));
+ }
+ break;
+ case O_CH: /* just empty */
+ FWD(aft, aft, 1);
+ break;
+ default: /* ooooops... */
+ assert(nope);
+ break;
+ }
+ }
+
+ return(aft);
+}
+
+#ifdef REDEBUG
+/*
+ - print - print a set of states
+ */
+static void
+print(struct match *m, char *caption, states st, int ch, FILE *d)
+{
+ struct re_guts *g = m->g;
+ int i;
+ int first = 1;
+
+ if (!(m->eflags&REG_TRACE))
+ return;
+
+ (void)fprintf(d, "%s", caption);
+ if (ch != '\0')
+ (void)fprintf(d, " %s", pchar(ch));
+ for (i = 0; i < g->nstates; i++)
+ if (ISSET(st, i)) {
+ (void)fprintf(d, "%s%d", (first) ? "\t" : ", ", i);
+ first = 0;
+ }
+ (void)fprintf(d, "\n");
+}
+
+/*
+ - at - print current situation
+ */
+static void
+at(struct match *m, char *title, char *start, char *stop, sopno startst,
+ sopno stopst)
+{
+ if (!(m->eflags&REG_TRACE))
+ return;
+
+ (void)printf("%s %s-", title, pchar(*start));
+ (void)printf("%s ", pchar(*stop));
+ (void)printf("%ld-%ld\n", (long)startst, (long)stopst);
+}
+
+#ifndef PCHARDONE
+#define PCHARDONE /* never again */
+/*
+ - pchar - make a character printable
+ *
+ * Is this identical to regchar() over in debug.c? Well, yes. But a
+ * duplicate here avoids having a debugging-capable regexec.o tied to
+ * a matching debug.o, and this is convenient. It all disappears in
+ * the non-debug compilation anyway, so it doesn't matter much.
+ */
+static char * /* -> representation */
+pchar(int ch)
+{
+ static char pbuf[10];
+
+ if (isprint(ch) || ch == ' ')
+ (void)snprintf(pbuf, sizeof pbuf, "%c", ch);
+ else
+ (void)snprintf(pbuf, sizeof pbuf, "\\%o", ch);
+ return(pbuf);
+}
+#endif
+#endif
+
+#undef matcher
+#undef fast
+#undef slow
+#undef dissect
+#undef backref
+#undef step
+#undef print
+#undef at
+#undef match
+#undef nope
diff --git a/ext/src/llvm/regerror.c b/ext/src/llvm/regerror.c
new file mode 100644
index 0000000..88d33b8
--- /dev/null
+++ b/ext/src/llvm/regerror.c
@@ -0,0 +1,131 @@
+/*-
+ * This code is derived from OpenBSD's libc/regex, original license follows:
+ *
+ * Copyright (c) 1992, 1993, 1994 Henry Spencer.
+ * Copyright (c) 1992, 1993, 1994
+ * The Regents of the University of California. All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Henry Spencer.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)regerror.c 8.4 (Berkeley) 3/20/94
+ */
+
+#include <sys/types.h>
+#include <stdio.h>
+#include <string.h>
+#include <ctype.h>
+#include <limits.h>
+#include <stdlib.h>
+#include "regex_impl.h"
+
+#include "regutils.h"
+
+static const char *regatoi(const llvm_regex_t *, char *, int);
+
+static struct rerr {
+ int code;
+ const char *name;
+ const char *explain;
+} rerrs[] = {
+ { REG_NOMATCH, "REG_NOMATCH", "llvm_regexec() failed to match" },
+ { REG_BADPAT, "REG_BADPAT", "invalid regular expression" },
+ { REG_ECOLLATE, "REG_ECOLLATE", "invalid collating element" },
+ { REG_ECTYPE, "REG_ECTYPE", "invalid character class" },
+ { REG_EESCAPE, "REG_EESCAPE", "trailing backslash (\\)" },
+ { REG_ESUBREG, "REG_ESUBREG", "invalid backreference number" },
+ { REG_EBRACK, "REG_EBRACK", "brackets ([ ]) not balanced" },
+ { REG_EPAREN, "REG_EPAREN", "parentheses not balanced" },
+ { REG_EBRACE, "REG_EBRACE", "braces not balanced" },
+ { REG_BADBR, "REG_BADBR", "invalid repetition count(s)" },
+ { REG_ERANGE, "REG_ERANGE", "invalid character range" },
+ { REG_ESPACE, "REG_ESPACE", "out of memory" },
+ { REG_BADRPT, "REG_BADRPT", "repetition-operator operand invalid" },
+ { REG_EMPTY, "REG_EMPTY", "empty (sub)expression" },
+ { REG_ASSERT, "REG_ASSERT", "\"can't happen\" -- you found a bug" },
+ { REG_INVARG, "REG_INVARG", "invalid argument to regex routine" },
+ { 0, "", "*** unknown regexp error code ***" }
+};
+
+/*
+ - llvm_regerror - the interface to error numbers
+ = extern size_t llvm_regerror(int, const llvm_regex_t *, char *, size_t);
+ */
+/* ARGSUSED */
+size_t
+llvm_regerror(int errcode, const llvm_regex_t *preg, char *errbuf, size_t errbuf_size)
+{
+ struct rerr *r;
+ size_t len;
+ int target = errcode &~ REG_ITOA;
+ const char *s;
+ char convbuf[50];
+
+ if (errcode == REG_ATOI)
+ s = regatoi(preg, convbuf, sizeof convbuf);
+ else {
+ for (r = rerrs; r->code != 0; r++)
+ if (r->code == target)
+ break;
+
+ if (errcode&REG_ITOA) {
+ if (r->code != 0) {
+ assert(strlen(r->name) < sizeof(convbuf));
+ (void) llvm_strlcpy(convbuf, r->name, sizeof convbuf);
+ } else
+ (void)snprintf(convbuf, sizeof convbuf,
+ "REG_0x%x", target);
+ s = convbuf;
+ } else
+ s = r->explain;
+ }
+
+ len = strlen(s) + 1;
+ if (errbuf_size > 0) {
+ llvm_strlcpy(errbuf, s, errbuf_size);
+ }
+
+ return(len);
+}
+
+/*
+ - regatoi - internal routine to implement REG_ATOI
+ */
+static const char *
+regatoi(const llvm_regex_t *preg, char *localbuf, int localbufsize)
+{
+ struct rerr *r;
+
+ for (r = rerrs; r->code != 0; r++)
+ if (strcmp(r->name, preg->re_endp) == 0)
+ break;
+ if (r->code == 0)
+ return("0");
+
+ (void)snprintf(localbuf, localbufsize, "%d", r->code);
+ return(localbuf);
+}
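
The table-driven llvm_regerror() above follows the POSIX convention of returning the full message length (including the NUL), so callers typically size the buffer with one call and fill it with a second. A minimal usage sketch, not part of the patch, assuming only the llvm_* declarations from regex_impl.h further down; report_error() is a hypothetical helper name.

    /* Illustrative sketch (not part of the patch): two-call use of
     * llvm_regerror() -- first call sizes the message, second call fills it.
     * Assumes the llvm_* declarations from regex_impl.h in this patch. */
    #include <stdio.h>
    #include <stdlib.h>
    #include "regex_impl.h"

    void report_error(int code, const llvm_regex_t *preg)
    {
        size_t need = llvm_regerror(code, preg, NULL, 0); /* length incl. NUL */
        char *msg = malloc(need);
        if (msg != NULL) {
            llvm_regerror(code, preg, msg, need);
            fprintf(stderr, "regex error: %s\n", msg);
            free(msg);
        }
    }
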
diff --git a/ext/src/llvm/regex2.h b/ext/src/llvm/regex2.h
new file mode 100644
index 0000000..d81bfbc
--- /dev/null
+++ b/ext/src/llvm/regex2.h
@@ -0,0 +1,162 @@
+/*-
+ * This code is derived from OpenBSD's libc/regex, original license follows:
+ *
+ * Copyright (c) 1992, 1993, 1994 Henry Spencer.
+ * Copyright (c) 1992, 1993, 1994
+ * The Regents of the University of California. All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Henry Spencer.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)regex2.h 8.4 (Berkeley) 3/20/94
+ */
+
+#ifndef LLVM_SUPPORT_REGEX2_H
+#define LLVM_SUPPORT_REGEX2_H
+
+/*
+ * internals of regex_t
+ */
+#define MAGIC1 ((('r'^0200)<<8) | 'e')
+
+/*
+ * The internal representation is a *strip*, a sequence of
+ * operators ending with an endmarker. (Some terminology etc. is a
+ * historical relic of earlier versions which used multiple strips.)
+ * Certain oddities in the representation are there to permit running
+ * the machinery backwards; in particular, any deviation from sequential
+ * flow must be marked at both its source and its destination. Some
+ * fine points:
+ *
+ * - OPLUS_ and O_PLUS are *inside* the loop they create.
+ * - OQUEST_ and O_QUEST are *outside* the bypass they create.
+ * - OCH_ and O_CH are *outside* the multi-way branch they create, while
+ * OOR1 and OOR2 are respectively the end and the beginning of one of
+ * the branches. Note that there is an implicit OOR2 following OCH_
+ * and an implicit OOR1 preceding O_CH.
+ *
+ * In state representations, an operator's bit is on to signify a state
+ * immediately *preceding* "execution" of that operator.
+ */
+typedef unsigned long sop; /* strip operator */
+typedef long sopno;
+#define OPRMASK 0xf8000000LU
+#define OPDMASK 0x07ffffffLU
+#define OPSHIFT ((unsigned)27)
+#define OP(n) ((n)&OPRMASK)
+#define OPND(n) ((n)&OPDMASK)
+#define SOP(op, opnd) ((op)|(opnd))
+/* operators meaning operand */
+/* (back, fwd are offsets) */
+#define OEND (1LU<<OPSHIFT) /* endmarker - */
+#define OCHAR (2LU<<OPSHIFT) /* character unsigned char */
+#define OBOL (3LU<<OPSHIFT) /* left anchor - */
+#define OEOL (4LU<<OPSHIFT) /* right anchor - */
+#define OANY (5LU<<OPSHIFT) /* . - */
+#define OANYOF (6LU<<OPSHIFT) /* [...] set number */
+#define OBACK_ (7LU<<OPSHIFT) /* begin \d paren number */
+#define O_BACK (8LU<<OPSHIFT) /* end \d paren number */
+#define OPLUS_ (9LU<<OPSHIFT) /* + prefix fwd to suffix */
+#define O_PLUS (10LU<<OPSHIFT) /* + suffix back to prefix */
+#define OQUEST_ (11LU<<OPSHIFT) /* ? prefix fwd to suffix */
+#define O_QUEST (12LU<<OPSHIFT) /* ? suffix back to prefix */
+#define OLPAREN (13LU<<OPSHIFT) /* ( fwd to ) */
+#define ORPAREN (14LU<<OPSHIFT) /* ) back to ( */
+#define OCH_ (15LU<<OPSHIFT) /* begin choice fwd to OOR2 */
+#define OOR1 (16LU<<OPSHIFT) /* | pt. 1 back to OOR1 or OCH_ */
+#define OOR2 (17LU<<OPSHIFT) /* | pt. 2 fwd to OOR2 or O_CH */
+#define O_CH (18LU<<OPSHIFT) /* end choice back to OOR1 */
+#define OBOW (19LU<<OPSHIFT) /* begin word - */
+#define OEOW (20LU<<OPSHIFT) /* end word - */
+
+/*
+ * Structure for [] character-set representation. Character sets are
+ * done as bit vectors, grouped 8 to a byte vector for compactness.
+ * The individual set therefore has both a pointer to the byte vector
+ * and a mask to pick out the relevant bit of each byte. A hash code
+ * simplifies testing whether two sets could be identical.
+ *
+ * This will get trickier for multicharacter collating elements. As
+ * preliminary hooks for dealing with such things, we also carry along
+ * a string of multi-character elements, and decide the size of the
+ * vectors at run time.
+ */
+typedef struct {
+ uch *ptr; /* -> uch [csetsize] */
+ uch mask; /* bit within array */
+ uch hash; /* hash code */
+ size_t smultis;
+ char *multis; /* -> char[smulti] ab\0cd\0ef\0\0 */
+} cset;
+/* note that CHadd and CHsub are unsafe, and CHIN doesn't yield 0/1 */
+#define CHadd(cs, c) ((cs)->ptr[(uch)(c)] |= (cs)->mask, (cs)->hash += (c))
+#define CHsub(cs, c) ((cs)->ptr[(uch)(c)] &= ~(cs)->mask, (cs)->hash -= (c))
+#define CHIN(cs, c) ((cs)->ptr[(uch)(c)] & (cs)->mask)
+#define MCadd(p, cs, cp) mcadd(p, cs, cp) /* llvm_regcomp() internal fns */
+#define MCsub(p, cs, cp) mcsub(p, cs, cp)
+#define MCin(p, cs, cp) mcin(p, cs, cp)
+
+/* stuff for character categories */
+typedef unsigned char cat_t;
+
+/*
+ * main compiled-expression structure
+ */
+struct re_guts {
+ int magic;
+# define MAGIC2 ((('R'^0200)<<8)|'E')
+ sop *strip; /* malloced area for strip */
+ int csetsize; /* number of bits in a cset vector */
+ int ncsets; /* number of csets in use */
+ cset *sets; /* -> cset [ncsets] */
+ uch *setbits; /* -> uch[csetsize][ncsets/CHAR_BIT] */
+ int cflags; /* copy of llvm_regcomp() cflags argument */
+ sopno nstates; /* = number of sops */
+ sopno firststate; /* the initial OEND (normally 0) */
+ sopno laststate; /* the final OEND */
+ int iflags; /* internal flags */
+# define USEBOL 01 /* used ^ */
+# define USEEOL 02 /* used $ */
+# define REGEX_BAD 04 /* something wrong */
+ int nbol; /* number of ^ used */
+ int neol; /* number of $ used */
+ int ncategories; /* how many character categories */
+ cat_t *categories; /* ->catspace[-CHAR_MIN] */
+ char *must; /* match must contain this string */
+ int mlen; /* length of must */
+ size_t nsub; /* copy of re_nsub */
+ int backrefs; /* does it use back references? */
+ sopno nplus; /* how deep does it nest +s? */
+ /* catspace must be last */
+ cat_t catspace[1]; /* actually [NC] */
+};
+
+/* misc utilities */
+#define OUT (CHAR_MAX+1) /* a non-character value */
+#define ISWORD(c) (isalnum(c&0xff) || (c) == '_')
+
+#endif
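
The strip described in the header comment above packs each operator into a single sop word: a 5-bit opcode above OPSHIFT and a 27-bit operand below it, combined and split by SOP(), OP() and OPND(). A small standalone sketch, not part of the patch, that restates those macros (plus OCHAR) only so the example compiles on its own:

    /* Illustrative sketch (not part of the patch): how a strip operator packs
     * a 5-bit opcode and a 27-bit operand, mirroring OP()/OPND()/SOP() above.
     * The macro copies are restated only to keep the example self-contained. */
    #include <assert.h>
    #include <stdio.h>

    typedef unsigned long sop;
    #define OPRMASK 0xf8000000LU
    #define OPDMASK 0x07ffffffLU
    #define OPSHIFT ((unsigned)27)
    #define OP(n)         ((n)&OPRMASK)
    #define OPND(n)       ((n)&OPDMASK)
    #define SOP(op, opnd) ((op)|(opnd))
    #define OCHAR (2LU<<OPSHIFT)   /* character; operand is the unsigned char */

    int main(void)
    {
        sop s = SOP(OCHAR, (sop)'a');   /* "match the character 'a'"  */
        assert(OP(s) == OCHAR);         /* high 5 bits: the operator  */
        assert(OPND(s) == (sop)'a');    /* low 27 bits: the operand   */
        printf("op=%#lx opnd=%lu\n", OP(s), OPND(s));
        return 0;
    }
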
diff --git a/ext/src/llvm/regex_impl.h b/ext/src/llvm/regex_impl.h
new file mode 100644
index 0000000..f8296c9
--- /dev/null
+++ b/ext/src/llvm/regex_impl.h
@@ -0,0 +1,108 @@
+/*-
+ * This code is derived from OpenBSD's libc/regex, original license follows:
+ *
+ * Copyright (c) 1992 Henry Spencer.
+ * Copyright (c) 1992, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Henry Spencer of the University of Toronto.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)regex.h 8.1 (Berkeley) 6/2/93
+ */
+
+#ifndef _REGEX_H_
+#define _REGEX_H_
+
+#include <sys/types.h>
+typedef off_t llvm_regoff_t;
+typedef struct {
+ llvm_regoff_t rm_so; /* start of match */
+ llvm_regoff_t rm_eo; /* end of match */
+} llvm_regmatch_t;
+
+typedef struct llvm_regex {
+ int re_magic;
+ size_t re_nsub; /* number of parenthesized subexpressions */
+ const char *re_endp; /* end pointer for REG_PEND */
+ struct re_guts *re_g; /* none of your business :-) */
+} llvm_regex_t;
+
+/* llvm_regcomp() flags */
+#define REG_BASIC 0000
+#define REG_EXTENDED 0001
+#define REG_ICASE 0002
+#define REG_NOSUB 0004
+#define REG_NEWLINE 0010
+#define REG_NOSPEC 0020
+#define REG_PEND 0040
+#define REG_DUMP 0200
+
+/* llvm_regerror() flags */
+#define REG_NOMATCH 1
+#define REG_BADPAT 2
+#define REG_ECOLLATE 3
+#define REG_ECTYPE 4
+#define REG_EESCAPE 5
+#define REG_ESUBREG 6
+#define REG_EBRACK 7
+#define REG_EPAREN 8
+#define REG_EBRACE 9
+#define REG_BADBR 10
+#define REG_ERANGE 11
+#define REG_ESPACE 12
+#define REG_BADRPT 13
+#define REG_EMPTY 14
+#define REG_ASSERT 15
+#define REG_INVARG 16
+#define REG_ATOI 255 /* convert name to number (!) */
+#define REG_ITOA 0400 /* convert number to name (!) */
+
+/* llvm_regexec() flags */
+#define REG_NOTBOL 00001
+#define REG_NOTEOL 00002
+#define REG_STARTEND 00004
+#define REG_TRACE 00400 /* tracing of execution */
+#define REG_LARGE 01000 /* force large representation */
+#define REG_BACKR 02000 /* force use of backref code */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+int llvm_regcomp(llvm_regex_t *, const char *, int);
+size_t llvm_regerror(int, const llvm_regex_t *, char *, size_t);
+int llvm_regexec(const llvm_regex_t *, const char *, size_t,
+ llvm_regmatch_t [], int);
+void llvm_regfree(llvm_regex_t *);
+size_t llvm_strlcpy(char *dst, const char *src, size_t siz);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* !_REGEX_H_ */
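
regex_impl.h above exposes the llvm_-prefixed equivalents of the POSIX regex entry points. A minimal compile / execute / free sketch, not part of the patch; the pattern and subject string are arbitrary examples:

    /* Illustrative sketch (not part of the patch): the usual compile /
     * execute / free cycle through the llvm_-prefixed entry points above. */
    #include <stdio.h>
    #include "regex_impl.h"

    int main(void)
    {
        llvm_regex_t re;
        llvm_regmatch_t m[2];

        if (llvm_regcomp(&re, "ab+(c*)", REG_EXTENDED) != 0)
            return 1;

        if (llvm_regexec(&re, "xxabbbccyy", 2, m, 0) == 0)
            printf("match at [%ld, %ld), group 1 at [%ld, %ld)\n",
                   (long)m[0].rm_so, (long)m[0].rm_eo,
                   (long)m[1].rm_so, (long)m[1].rm_eo);

        llvm_regfree(&re);
        return 0;
    }
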
diff --git a/ext/src/llvm/regexec.c b/ext/src/llvm/regexec.c
new file mode 100644
index 0000000..bd5e72d
--- /dev/null
+++ b/ext/src/llvm/regexec.c
@@ -0,0 +1,162 @@
+/*-
+ * This code is derived from OpenBSD's libc/regex, original license follows:
+ *
+ * Copyright (c) 1992, 1993, 1994 Henry Spencer.
+ * Copyright (c) 1992, 1993, 1994
+ * The Regents of the University of California. All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Henry Spencer.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)regexec.c 8.3 (Berkeley) 3/20/94
+ */
+
+/*
+ * the outer shell of llvm_regexec()
+ *
+ * This file includes engine.inc *twice*, after muchos fiddling with the
+ * macros that code uses. This lets the same code operate on two different
+ * representations for state sets.
+ */
+#include <sys/types.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <limits.h>
+#include <ctype.h>
+#include "regex_impl.h"
+
+#include "regutils.h"
+#include "regex2.h"
+
+/* macros for manipulating states, small version */
+/* FIXME: 'states' is assumed as 'long' on small version. */
+#define states1 long /* for later use in llvm_regexec() decision */
+#define states states1
+#define CLEAR(v) ((v) = 0)
+#define SET0(v, n) ((v) &= ~((unsigned long)1 << (n)))
+#define SET1(v, n) ((v) |= (unsigned long)1 << (n))
+#define ISSET(v, n) (((v) & ((unsigned long)1 << (n))) != 0)
+#define ASSIGN(d, s) ((d) = (s))
+#define EQ(a, b) ((a) == (b))
+#define STATEVARS long dummy /* dummy version */
+#define STATESETUP(m, n) /* nothing */
+#define STATETEARDOWN(m) /* nothing */
+#define SETUP(v) ((v) = 0)
+#define onestate long
+#define INIT(o, n) ((o) = (unsigned long)1 << (n))
+#define INC(o) ((o) = (unsigned long)(o) << 1)
+#define ISSTATEIN(v, o) (((v) & (o)) != 0)
+/* some abbreviations; note that some of these know variable names! */
+/* do "if I'm here, I can also be there" etc without branches */
+#define FWD(dst, src, n) ((dst) |= ((unsigned long)(src)&(here)) << (n))
+#define BACK(dst, src, n) ((dst) |= ((unsigned long)(src)&(here)) >> (n))
+#define ISSETBACK(v, n) (((v) & ((unsigned long)here >> (n))) != 0)
+/* function names */
+#define SNAMES /* engine.inc looks after details */
+
+#include "regengine.inc"
+
+/* now undo things */
+#undef states
+#undef CLEAR
+#undef SET0
+#undef SET1
+#undef ISSET
+#undef ASSIGN
+#undef EQ
+#undef STATEVARS
+#undef STATESETUP
+#undef STATETEARDOWN
+#undef SETUP
+#undef onestate
+#undef INIT
+#undef INC
+#undef ISSTATEIN
+#undef FWD
+#undef BACK
+#undef ISSETBACK
+#undef SNAMES
+
+/* macros for manipulating states, large version */
+#define states char *
+#define CLEAR(v) memset(v, 0, m->g->nstates)
+#define SET0(v, n) ((v)[n] = 0)
+#define SET1(v, n) ((v)[n] = 1)
+#define ISSET(v, n) ((v)[n])
+#define ASSIGN(d, s) memmove(d, s, m->g->nstates)
+#define EQ(a, b) (memcmp(a, b, m->g->nstates) == 0)
+#define STATEVARS long vn; char *space
+#define STATESETUP(m, nv) { (m)->space = malloc((nv)*(m)->g->nstates); \
+ if ((m)->space == NULL) return(REG_ESPACE); \
+ (m)->vn = 0; }
+#define STATETEARDOWN(m) { free((m)->space); }
+#define SETUP(v) ((v) = &m->space[m->vn++ * m->g->nstates])
+#define onestate long
+#define INIT(o, n) ((o) = (n))
+#define INC(o) ((o)++)
+#define ISSTATEIN(v, o) ((v)[o])
+/* some abbreviations; note that some of these know variable names! */
+/* do "if I'm here, I can also be there" etc without branches */
+#define FWD(dst, src, n) ((dst)[here+(n)] |= (src)[here])
+#define BACK(dst, src, n) ((dst)[here-(n)] |= (src)[here])
+#define ISSETBACK(v, n) ((v)[here - (n)])
+/* function names */
+#define LNAMES /* flag */
+
+#include "regengine.inc"
+
+/*
+ - llvm_regexec - interface for matching
+ *
+ * We put this here so we can exploit knowledge of the state representation
+ * when choosing which matcher to call. Also, by this point the matchers
+ * have been prototyped.
+ */
+int /* 0 success, REG_NOMATCH failure */
+llvm_regexec(const llvm_regex_t *preg, const char *string, size_t nmatch,
+ llvm_regmatch_t pmatch[], int eflags)
+{
+ struct re_guts *g = preg->re_g;
+#ifdef REDEBUG
+# define GOODFLAGS(f) (f)
+#else
+# define GOODFLAGS(f) ((f)&(REG_NOTBOL|REG_NOTEOL|REG_STARTEND))
+#endif
+
+ if (preg->re_magic != MAGIC1 || g->magic != MAGIC2)
+ return(REG_BADPAT);
+ assert(!(g->iflags&REGEX_BAD));
+ if (g->iflags&REGEX_BAD) /* backstop for no-debug case */
+ return(REG_BADPAT);
+ eflags = GOODFLAGS(eflags);
+
+ if (g->nstates <= (long)(CHAR_BIT*sizeof(states1)) && !(eflags&REG_LARGE))
+ return(smatcher(g, string, nmatch, pmatch, eflags));
+ else
+ return(lmatcher(g, string, nmatch, pmatch, eflags));
+}
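
llvm_regexec() above picks the small matcher whenever the whole state set fits in one machine word; in that representation each NFA state is a single bit and the SET1/ISSET/FWD macros work purely by shifting. A standalone sketch, not part of the patch, restating simplified copies of those macros (the real FWD captures the variable 'here' implicitly, as the comment warns; it is made an explicit parameter here):

    /* Illustrative sketch (not part of the patch): the "small" state-set
     * representation keeps the whole state set in one unsigned long and
     * advances states with shifts, like the SET1/ISSET/FWD macros above. */
    #include <assert.h>

    #define SET1(v, n)   ((v) |= (unsigned long)1 << (n))
    #define ISSET(v, n)  (((v) & ((unsigned long)1 << (n))) != 0)
    #define FWD(dst, src, here, n) ((dst) |= ((unsigned long)(src) & (here)) << (n))

    int main(void)
    {
        unsigned long st = 0;
        SET1(st, 3);                        /* state 3 is active             */
        assert(ISSET(st, 3) && !ISSET(st, 4));

        unsigned long here = (unsigned long)1 << 3;
        unsigned long next = 0;
        FWD(next, st, here, 1);             /* "if I'm here, I can be there" */
        assert(ISSET(next, 4));
        return 0;
    }
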
diff --git a/ext/src/llvm/regfree.c b/ext/src/llvm/regfree.c
new file mode 100644
index 0000000..dc2b4af
--- /dev/null
+++ b/ext/src/llvm/regfree.c
@@ -0,0 +1,72 @@
+/*-
+ * This code is derived from OpenBSD's libc/regex, original license follows:
+ *
+ * Copyright (c) 1992, 1993, 1994 Henry Spencer.
+ * Copyright (c) 1992, 1993, 1994
+ * The Regents of the University of California. All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Henry Spencer.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)regfree.c 8.3 (Berkeley) 3/20/94
+ */
+
+#include <sys/types.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include "regex_impl.h"
+
+#include "regutils.h"
+#include "regex2.h"
+
+/*
+ - llvm_regfree - free everything
+ */
+void
+llvm_regfree(llvm_regex_t *preg)
+{
+ struct re_guts *g;
+
+ if (preg->re_magic != MAGIC1) /* oops */
+ return; /* nice to complain, but hard */
+
+ g = preg->re_g;
+ if (g == NULL || g->magic != MAGIC2) /* oops again */
+ return;
+ preg->re_magic = 0; /* mark it invalid */
+ g->magic = 0; /* mark it invalid */
+
+ if (g->strip != NULL)
+ free((char *)g->strip);
+ if (g->sets != NULL)
+ free((char *)g->sets);
+ if (g->setbits != NULL)
+ free((char *)g->setbits);
+ if (g->must != NULL)
+ free(g->must);
+ free((char *)g);
+}
diff --git a/ext/src/llvm/regstrlcpy.c b/ext/src/llvm/regstrlcpy.c
new file mode 100644
index 0000000..8b68afd
--- /dev/null
+++ b/ext/src/llvm/regstrlcpy.c
@@ -0,0 +1,52 @@
+/*
+ * This code is derived from OpenBSD's libc, original license follows:
+ *
+ * Copyright (c) 1998 Todd C. Miller <Todd.Miller at courtesan.com>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <sys/types.h>
+#include <string.h>
+
+#include "regex_impl.h"
+/*
+ * Copy src to string dst of size siz. At most siz-1 characters
+ * will be copied. Always NUL terminates (unless siz == 0).
+ * Returns strlen(src); if retval >= siz, truncation occurred.
+ */
+size_t
+llvm_strlcpy(char *dst, const char *src, size_t siz)
+{
+ char *d = dst;
+ const char *s = src;
+ size_t n = siz;
+
+ /* Copy as many bytes as will fit */
+ if (n != 0) {
+ while (--n != 0) {
+ if ((*d++ = *s++) == '\0')
+ break;
+ }
+ }
+
+ /* Not enough room in dst, add NUL and traverse rest of src */
+ if (n == 0) {
+ if (siz != 0)
+ *d = '\0'; /* NUL-terminate dst */
+ while (*s++)
+ ;
+ }
+
+ return(s - src - 1); /* count does not include NUL */
+}
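
As the comment above notes, llvm_strlcpy() always returns strlen(src), so truncation is detected by comparing the return value against the destination size. A short usage sketch, not part of the patch:

    /* Illustrative sketch (not part of the patch): detecting truncation from
     * llvm_strlcpy()'s return value, which is always strlen(src). */
    #include <stdio.h>
    #include "regex_impl.h"   /* declares llvm_strlcpy() */

    int main(void)
    {
        char buf[8];
        size_t n = llvm_strlcpy(buf, "truncate me", sizeof buf);
        if (n >= sizeof buf)
            printf("truncated: kept \"%s\" of %zu source bytes\n", buf, n);
        return 0;
    }
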
diff --git a/ext/src/llvm/regutils.h b/ext/src/llvm/regutils.h
new file mode 100644
index 0000000..49a975c
--- /dev/null
+++ b/ext/src/llvm/regutils.h
@@ -0,0 +1,58 @@
+/*-
+ * This code is derived from OpenBSD's libc/regex, original license follows:
+ *
+ * Copyright (c) 1992, 1993, 1994 Henry Spencer.
+ * Copyright (c) 1992, 1993, 1994
+ * The Regents of the University of California. All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Henry Spencer.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)utils.h 8.3 (Berkeley) 3/20/94
+ */
+
+#ifndef LLVM_SUPPORT_REGUTILS_H
+#define LLVM_SUPPORT_REGUTILS_H
+
+/* utility definitions */
+#define NC (CHAR_MAX - CHAR_MIN + 1)
+typedef unsigned char uch;
+
+/* switch off assertions (if not already off) if no REDEBUG */
+#ifndef REDEBUG
+#ifndef NDEBUG
+#define NDEBUG /* no assertions please */
+#endif
+#endif
+#include <assert.h>
+
+/* for old systems with bcopy() but no memmove() */
+#ifdef USEBCOPY
+#define memmove(d, s, c) bcopy(s, d, c)
+#endif
+
+#endif
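
regutils.h supplies the uch byte type that the cset bit vectors in regex2.h are built from; CHadd() and CHIN() then set and test one bit per byte of the vector. A simplified standalone sketch, not part of the patch (the real CHadd() also maintains a hash, and several sets share one byte vector via different masks; both details are omitted here):

    /* Illustrative sketch (not part of the patch): a cut-down cset with the
     * CHadd()/CHIN() bit-vector membership test from regex2.h. */
    #include <assert.h>
    #include <limits.h>
    #include <string.h>

    typedef unsigned char uch;

    typedef struct {
        uch ptr[UCHAR_MAX + 1];  /* one byte per character */
        uch mask;                /* which bit of each byte this set owns */
    } mini_cset;

    #define CHadd(cs, c) ((cs)->ptr[(uch)(c)] |= (cs)->mask)
    #define CHIN(cs, c)  ((cs)->ptr[(uch)(c)] & (cs)->mask)

    int main(void)
    {
        mini_cset cs;
        memset(cs.ptr, 0, sizeof cs.ptr);
        cs.mask = 1;             /* first set allocated in this byte vector */

        CHadd(&cs, 'a');
        CHadd(&cs, 'b');
        assert(CHIN(&cs, 'a') && CHIN(&cs, 'b') && !CHIN(&cs, 'z'));
        return 0;
    }
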
diff --git a/ext/src/samtools/examples/ex1.bam b/ext/src/samtools/examples/ex1.bam
deleted file mode 100644
index f88f1da..0000000
Binary files a/ext/src/samtools/examples/ex1.bam and /dev/null differ
diff --git a/ext/src/samtools/examples/ex1.sam.gz b/ext/src/samtools/examples/ex1.sam.gz
deleted file mode 100644
index 44c07ee..0000000
Binary files a/ext/src/samtools/examples/ex1.sam.gz and /dev/null differ
diff --git a/ext/src/yaml-cpp/CMakeLists.txt b/ext/src/yaml-cpp/CMakeLists.txt
deleted file mode 100644
index 23f7987..0000000
--- a/ext/src/yaml-cpp/CMakeLists.txt
+++ /dev/null
@@ -1,14 +0,0 @@
-project(yaml-cpp)
-
-include_directories(${CMAKE_CURRENT_SOURCE_DIR})
-include_directories(${EXT_DIR}/include)
-
-set(YAML_CPP_VERSION_MAJOR "0")
-set(YAML_CPP_VERSION_MINOR "5")
-set(YAML_CPP_VERSION_PATCH "0")
-set(YAML_CPP_VERSION "${YAML_CPP_VERSION_MAJOR}.${YAML_CPP_VERSION_MINOR}.${YAML_CPP_VERSION_PATCH}")
-
-file(GLOB sources "[a-zA-Z]*.cpp")
-
-add_library(yaml-cpp STATIC
- ${sources})
\ No newline at end of file
diff --git a/ext/src/yaml-cpp/binary.cpp b/ext/src/yaml-cpp/binary.cpp
deleted file mode 100644
index 62a6032..0000000
--- a/ext/src/yaml-cpp/binary.cpp
+++ /dev/null
@@ -1,93 +0,0 @@
-#include "yaml-cpp/binary.h"
-
-namespace YAML
-{
- static const char encoding[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
-
- std::string EncodeBase64(const unsigned char *data, std::size_t size)
- {
- const char PAD = '=';
-
- std::string ret;
- ret.resize(4 * size / 3 + 3);
- char *out = &ret[0];
-
- std::size_t chunks = size / 3;
- std::size_t remainder = size % 3;
-
- for(std::size_t i=0;i<chunks;i++, data += 3) {
- *out++ = encoding[data[0] >> 2];
- *out++ = encoding[((data[0] & 0x3) << 4) | (data[1] >> 4)];
- *out++ = encoding[((data[1] & 0xf) << 2) | (data[2] >> 6)];
- *out++ = encoding[data[2] & 0x3f];
- }
-
- switch(remainder) {
- case 0:
- break;
- case 1:
- *out++ = encoding[data[0] >> 2];
- *out++ = encoding[((data[0] & 0x3) << 4)];
- *out++ = PAD;
- *out++ = PAD;
- break;
- case 2:
- *out++ = encoding[data[0] >> 2];
- *out++ = encoding[((data[0] & 0x3) << 4) | (data[1] >> 4)];
- *out++ = encoding[((data[1] & 0xf) << 2)];
- *out++ = PAD;
- break;
- }
-
- ret.resize(out - &ret[0]);
- return ret;
- }
-
- static const unsigned char decoding[] = {
- 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
- 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
- 255,255,255,255,255,255,255,255,255,255,255, 62,255,255,255, 63,
- 52, 53, 54, 55, 56, 57, 58, 59, 60, 61,255,255,255, 0,255,255,
- 255, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
- 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,255,255,255,255,255,
- 255, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
- 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51,255,255,255,255,255,
- 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
- 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
- 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
- 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
- 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
- 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
- 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
- 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
- };
-
- std::vector<unsigned char> DecodeBase64(const std::string& input)
- {
- typedef std::vector<unsigned char> ret_type;
- if(input.empty())
- return ret_type();
-
- ret_type ret(3 * input.size() / 4 + 1);
- unsigned char *out = &ret[0];
-
- unsigned value = 0;
- for(std::size_t i=0;i<input.size();i++) {
- unsigned char d = decoding[static_cast<unsigned>(input[i])];
- if(d == 255)
- return ret_type();
-
- value = (value << 6) | d;
- if(i % 4 == 3) {
- *out++ = value >> 16;
- if(i > 0 && input[i - 1] != '=')
- *out++ = value >> 8;
- if(input[i] != '=')
- *out++ = value;
- }
- }
-
- ret.resize(out - &ret[0]);
- return ret;
- }
-}
diff --git a/ext/src/yaml-cpp/collectionstack.h b/ext/src/yaml-cpp/collectionstack.h
deleted file mode 100644
index 4a986bc..0000000
--- a/ext/src/yaml-cpp/collectionstack.h
+++ /dev/null
@@ -1,35 +0,0 @@
-#ifndef COLLECTIONSTACK_H_62B23520_7C8E_11DE_8A39_0800200C9A66
-#define COLLECTIONSTACK_H_62B23520_7C8E_11DE_8A39_0800200C9A66
-
-#if defined(_MSC_VER) || (defined(__GNUC__) && (__GNUC__ == 3 && __GNUC_MINOR__ >= 4) || (__GNUC__ >= 4)) // GCC supports "pragma once" correctly since 3.4
-#pragma once
-#endif
-
-
-#include <stack>
-#include <cassert>
-
-namespace YAML
-{
- struct CollectionType {
- enum value { None, BlockMap, BlockSeq, FlowMap, FlowSeq, CompactMap };
- };
-
- class CollectionStack
- {
- public:
- CollectionType::value GetCurCollectionType() const {
- if(collectionStack.empty())
- return CollectionType::None;
- return collectionStack.top();
- }
-
- void PushCollectionType(CollectionType::value type) { collectionStack.push(type); }
- void PopCollectionType(CollectionType::value type) { assert(type == GetCurCollectionType()); collectionStack.pop(); }
-
- private:
- std::stack<CollectionType::value> collectionStack;
- };
-}
-
-#endif // COLLECTIONSTACK_H_62B23520_7C8E_11DE_8A39_0800200C9A66
diff --git a/ext/src/yaml-cpp/contrib/graphbuilder.cpp b/ext/src/yaml-cpp/contrib/graphbuilder.cpp
deleted file mode 100644
index ab5159c..0000000
--- a/ext/src/yaml-cpp/contrib/graphbuilder.cpp
+++ /dev/null
@@ -1,16 +0,0 @@
-#include "yaml-cpp/parser.h"
-#include "yaml-cpp/contrib/graphbuilder.h"
-#include "graphbuilderadapter.h"
-
-namespace YAML
-{
- void *BuildGraphOfNextDocument(Parser& parser, GraphBuilderInterface& graphBuilder)
- {
- GraphBuilderAdapter eventHandler(graphBuilder);
- if (parser.HandleNextDocument(eventHandler)) {
- return eventHandler.RootNode();
- } else {
- return NULL;
- }
- }
-}
diff --git a/ext/src/yaml-cpp/contrib/graphbuilderadapter.cpp b/ext/src/yaml-cpp/contrib/graphbuilderadapter.cpp
deleted file mode 100644
index 557e97c..0000000
--- a/ext/src/yaml-cpp/contrib/graphbuilderadapter.cpp
+++ /dev/null
@@ -1,96 +0,0 @@
-#include "graphbuilderadapter.h"
-
-namespace YAML
-{
- int GraphBuilderAdapter::ContainerFrame::sequenceMarker;
-
- void GraphBuilderAdapter::OnNull(const Mark& mark, anchor_t anchor)
- {
- void *pParent = GetCurrentParent();
- void *pNode = m_builder.NewNull(mark, pParent);
- RegisterAnchor(anchor, pNode);
-
- DispositionNode(pNode);
- }
-
- void GraphBuilderAdapter::OnAlias(const Mark& mark, anchor_t anchor)
- {
- void *pReffedNode = m_anchors.Get(anchor);
- DispositionNode(m_builder.AnchorReference(mark, pReffedNode));
- }
-
- void GraphBuilderAdapter::OnScalar(const Mark& mark, const std::string& tag, anchor_t anchor, const std::string& value)
- {
- void *pParent = GetCurrentParent();
- void *pNode = m_builder.NewScalar(mark, tag, pParent, value);
- RegisterAnchor(anchor, pNode);
-
- DispositionNode(pNode);
- }
-
- void GraphBuilderAdapter::OnSequenceStart(const Mark& mark, const std::string& tag, anchor_t anchor)
- {
- void *pNode = m_builder.NewSequence(mark, tag, GetCurrentParent());
- m_containers.push(ContainerFrame(pNode));
- RegisterAnchor(anchor, pNode);
- }
-
- void GraphBuilderAdapter::OnSequenceEnd()
- {
- void *pSequence = m_containers.top().pContainer;
- m_containers.pop();
-
- DispositionNode(pSequence);
- }
-
- void GraphBuilderAdapter::OnMapStart(const Mark& mark, const std::string& tag, anchor_t anchor)
- {
- void *pNode = m_builder.NewMap(mark, tag, GetCurrentParent());
- m_containers.push(ContainerFrame(pNode, m_pKeyNode));
- m_pKeyNode = NULL;
- RegisterAnchor(anchor, pNode);
- }
-
- void GraphBuilderAdapter::OnMapEnd()
- {
- void *pMap = m_containers.top().pContainer;
- m_pKeyNode = m_containers.top().pPrevKeyNode;
- m_containers.pop();
- DispositionNode(pMap);
- }
-
- void *GraphBuilderAdapter::GetCurrentParent() const
- {
- if (m_containers.empty()) {
- return NULL;
- }
- return m_containers.top().pContainer;
- }
-
- void GraphBuilderAdapter::RegisterAnchor(anchor_t anchor, void *pNode)
- {
- if (anchor) {
- m_anchors.Register(anchor, pNode);
- }
- }
-
- void GraphBuilderAdapter::DispositionNode(void *pNode)
- {
- if (m_containers.empty()) {
- m_pRootNode = pNode;
- return;
- }
-
- void *pContainer = m_containers.top().pContainer;
- if (m_containers.top().isMap()) {
- if (m_pKeyNode) {
- m_builder.AssignInMap(pContainer, m_pKeyNode, pNode);
- m_pKeyNode = NULL;
- } else {
- m_pKeyNode = pNode;
- }
- } else {
- m_builder.AppendToSequence(pContainer, pNode);
- }
- }
-}
diff --git a/ext/src/yaml-cpp/contrib/graphbuilderadapter.h b/ext/src/yaml-cpp/contrib/graphbuilderadapter.h
deleted file mode 100644
index 3ef8ab6..0000000
--- a/ext/src/yaml-cpp/contrib/graphbuilderadapter.h
+++ /dev/null
@@ -1,73 +0,0 @@
-#ifndef GRAPHBUILDERADAPTER_H_62B23520_7C8E_11DE_8A39_0800200C9A66
-#define GRAPHBUILDERADAPTER_H_62B23520_7C8E_11DE_8A39_0800200C9A66
-
-#if defined(_MSC_VER) || (defined(__GNUC__) && (__GNUC__ == 3 && __GNUC_MINOR__ >= 4) || (__GNUC__ >= 4)) // GCC supports "pragma once" correctly since 3.4
-#pragma once
-#endif
-
-#include <cstdlib>
-#include <map>
-#include <stack>
-#include "yaml-cpp/eventhandler.h"
-#include "yaml-cpp/contrib/anchordict.h"
-#include "yaml-cpp/contrib/graphbuilder.h"
-
-namespace YAML
-{
- class GraphBuilderAdapter : public EventHandler
- {
- public:
- GraphBuilderAdapter(GraphBuilderInterface& builder)
- : m_builder(builder), m_pRootNode(NULL), m_pKeyNode(NULL)
- {
- }
-
- virtual void OnDocumentStart(const Mark& mark) {(void)mark;}
- virtual void OnDocumentEnd() {}
-
- virtual void OnNull(const Mark& mark, anchor_t anchor);
- virtual void OnAlias(const Mark& mark, anchor_t anchor);
- virtual void OnScalar(const Mark& mark, const std::string& tag, anchor_t anchor, const std::string& value);
-
- virtual void OnSequenceStart(const Mark& mark, const std::string& tag, anchor_t anchor);
- virtual void OnSequenceEnd();
-
- virtual void OnMapStart(const Mark& mark, const std::string& tag, anchor_t anchor);
- virtual void OnMapEnd();
-
- void *RootNode() const {return m_pRootNode;}
-
- private:
- struct ContainerFrame
- {
- ContainerFrame(void *pSequence)
- : pContainer(pSequence), pPrevKeyNode(&sequenceMarker)
- {}
- ContainerFrame(void *pMap, void* pPrevKeyNode)
- : pContainer(pMap), pPrevKeyNode(pPrevKeyNode)
- {}
-
- void *pContainer;
- void *pPrevKeyNode;
-
- bool isMap() const {return pPrevKeyNode != &sequenceMarker;}
-
- private:
- static int sequenceMarker;
- };
- typedef std::stack<ContainerFrame> ContainerStack;
- typedef AnchorDict<void*> AnchorMap;
-
- GraphBuilderInterface& m_builder;
- ContainerStack m_containers;
- AnchorMap m_anchors;
- void *m_pRootNode;
- void *m_pKeyNode;
-
- void *GetCurrentParent() const;
- void RegisterAnchor(anchor_t anchor, void *pNode);
- void DispositionNode(void *pNode);
- };
-}
-
-#endif // GRAPHBUILDERADAPTER_H_62B23520_7C8E_11DE_8A39_0800200C9A66
diff --git a/ext/src/yaml-cpp/convert.cpp b/ext/src/yaml-cpp/convert.cpp
deleted file mode 100644
index dc715f7..0000000
--- a/ext/src/yaml-cpp/convert.cpp
+++ /dev/null
@@ -1,83 +0,0 @@
-#include "yaml-cpp/node/convert.h"
-#include "yaml-cpp/node/impl.h"
-#include <algorithm>
-
-namespace
-{
- // we're not gonna mess with the mess that is all the isupper/etc. functions
- bool IsLower(char ch) { return 'a' <= ch && ch <= 'z'; }
- bool IsUpper(char ch) { return 'A' <= ch && ch <= 'Z'; }
- char ToLower(char ch) { return IsUpper(ch) ? ch + 'a' - 'A' : ch; }
-
- std::string tolower(const std::string& str)
- {
- std::string s(str);
- std::transform(s.begin(), s.end(), s.begin(), ToLower);
- return s;
- }
-
- template <typename T>
- bool IsEntirely(const std::string& str, T func)
- {
- for(std::size_t i=0;i<str.size();i++)
- if(!func(str[i]))
- return false;
-
- return true;
- }
-
- // IsFlexibleCase
- // . Returns true if 'str' is:
- // . UPPERCASE
- // . lowercase
- // . Capitalized
- bool IsFlexibleCase(const std::string& str)
- {
- if(str.empty())
- return true;
-
- if(IsEntirely(str, IsLower))
- return true;
-
- bool firstcaps = IsUpper(str[0]);
- std::string rest = str.substr(1);
- return firstcaps && (IsEntirely(rest, IsLower) || IsEntirely(rest, IsUpper));
- }
-}
-
-namespace YAML
-{
- bool convert<bool>::decode(const Node& node, bool& rhs) {
- if(!node.IsScalar())
- return false;
-
- // we can't use iostream bool extraction operators as they don't
- // recognize all possible values in the table below (taken from
- // http://yaml.org/type/bool.html)
- static const struct {
- std::string truename, falsename;
- } names[] = {
- { "y", "n" },
- { "yes", "no" },
- { "true", "false" },
- { "on", "off" },
- };
-
- if(!IsFlexibleCase(node.Scalar()))
- return false;
-
- for(unsigned i=0;i<sizeof(names)/sizeof(names[0]);i++) {
- if(names[i].truename == tolower(node.Scalar())) {
- rhs = true;
- return true;
- }
-
- if(names[i].falsename == tolower(node.Scalar())) {
- rhs = false;
- return true;
- }
- }
-
- return false;
- }
-}
diff --git a/ext/src/yaml-cpp/directives.cpp b/ext/src/yaml-cpp/directives.cpp
deleted file mode 100644
index faf1483..0000000
--- a/ext/src/yaml-cpp/directives.cpp
+++ /dev/null
@@ -1,24 +0,0 @@
-#include "directives.h"
-
-namespace YAML
-{
- Directives::Directives()
- {
- // version
- version.isDefault = true;
- version.major = 1;
- version.minor = 2;
- }
-
- const std::string Directives::TranslateTagHandle(const std::string& handle) const
- {
- std::map <std::string, std::string>::const_iterator it = tags.find(handle);
- if(it == tags.end()) {
- if(handle == "!!")
- return "tag:yaml.org,2002:";
- return handle;
- }
-
- return it->second;
- }
-}
diff --git a/ext/src/yaml-cpp/directives.h b/ext/src/yaml-cpp/directives.h
deleted file mode 100644
index a3308f7..0000000
--- a/ext/src/yaml-cpp/directives.h
+++ /dev/null
@@ -1,29 +0,0 @@
-#ifndef DIRECTIVES_H_62B23520_7C8E_11DE_8A39_0800200C9A66
-#define DIRECTIVES_H_62B23520_7C8E_11DE_8A39_0800200C9A66
-
-#if defined(_MSC_VER) || (defined(__GNUC__) && (__GNUC__ == 3 && __GNUC_MINOR__ >= 4) || (__GNUC__ >= 4)) // GCC supports "pragma once" correctly since 3.4
-#pragma once
-#endif
-
-
-#include <string>
-#include <map>
-
-namespace YAML
-{
- struct Version {
- bool isDefault;
- int major, minor;
- };
-
- struct Directives {
- Directives();
-
- const std::string TranslateTagHandle(const std::string& handle) const;
-
- Version version;
- std::map<std::string, std::string> tags;
- };
-}
-
-#endif // DIRECTIVES_H_62B23520_7C8E_11DE_8A39_0800200C9A66
diff --git a/ext/src/yaml-cpp/emit.cpp b/ext/src/yaml-cpp/emit.cpp
deleted file mode 100644
index 1f0a647..0000000
--- a/ext/src/yaml-cpp/emit.cpp
+++ /dev/null
@@ -1,29 +0,0 @@
-#include "yaml-cpp/node/emit.h"
-#include "yaml-cpp/emitfromevents.h"
-#include "yaml-cpp/emitter.h"
-#include "nodeevents.h"
-
-namespace YAML
-{
- Emitter& operator << (Emitter& out, const Node& node)
- {
- EmitFromEvents emitFromEvents(out);
- NodeEvents events(node);
- events.Emit(emitFromEvents);
- return out;
- }
-
- std::ostream& operator << (std::ostream& out, const Node& node)
- {
- Emitter emitter(out);
- emitter << node;
- return out;
- }
-
- std::string Dump(const Node& node)
- {
- Emitter emitter;
- emitter << node;
- return emitter.c_str();
- }
-}
diff --git a/ext/src/yaml-cpp/emitfromevents.cpp b/ext/src/yaml-cpp/emitfromevents.cpp
deleted file mode 100644
index 49fc10b..0000000
--- a/ext/src/yaml-cpp/emitfromevents.cpp
+++ /dev/null
@@ -1,105 +0,0 @@
-#include "yaml-cpp/emitfromevents.h"
-#include "yaml-cpp/emitter.h"
-#include "yaml-cpp/null.h"
-#include <cassert>
-#include <sstream>
-
-namespace {
- std::string ToString(YAML::anchor_t anchor) {
- std::stringstream stream;
- stream << anchor;
- return stream.str();
- }
-}
-
-namespace YAML
-{
- EmitFromEvents::EmitFromEvents(Emitter& emitter): m_emitter(emitter)
- {
- }
-
- void EmitFromEvents::OnDocumentStart(const Mark&)
- {
- }
-
- void EmitFromEvents::OnDocumentEnd()
- {
- }
-
- void EmitFromEvents::OnNull(const Mark&, anchor_t anchor)
- {
- BeginNode();
- EmitProps("", anchor);
- m_emitter << Null;
- }
-
- void EmitFromEvents::OnAlias(const Mark&, anchor_t anchor)
- {
- BeginNode();
- m_emitter << Alias(ToString(anchor));
- }
-
- void EmitFromEvents::OnScalar(const Mark&, const std::string& tag, anchor_t anchor, const std::string& value)
- {
- BeginNode();
- EmitProps(tag, anchor);
- m_emitter << value;
- }
-
- void EmitFromEvents::OnSequenceStart(const Mark&, const std::string& tag, anchor_t anchor)
- {
- BeginNode();
- EmitProps(tag, anchor);
- m_emitter << BeginSeq;
- m_stateStack.push(State::WaitingForSequenceEntry);
- }
-
- void EmitFromEvents::OnSequenceEnd()
- {
- m_emitter << EndSeq;
- assert(m_stateStack.top() == State::WaitingForSequenceEntry);
- m_stateStack.pop();
- }
-
- void EmitFromEvents::OnMapStart(const Mark&, const std::string& tag, anchor_t anchor)
- {
- BeginNode();
- EmitProps(tag, anchor);
- m_emitter << BeginMap;
- m_stateStack.push(State::WaitingForKey);
- }
-
- void EmitFromEvents::OnMapEnd()
- {
- m_emitter << EndMap;
- assert(m_stateStack.top() == State::WaitingForKey);
- m_stateStack.pop();
- }
-
- void EmitFromEvents::BeginNode()
- {
- if(m_stateStack.empty())
- return;
-
- switch(m_stateStack.top()) {
- case State::WaitingForKey:
- m_emitter << Key;
- m_stateStack.top() = State::WaitingForValue;
- break;
- case State::WaitingForValue:
- m_emitter << Value;
- m_stateStack.top() = State::WaitingForKey;
- break;
- default:
- break;
- }
- }
-
- void EmitFromEvents::EmitProps(const std::string& tag, anchor_t anchor)
- {
- if(!tag.empty() && tag != "?")
- m_emitter << VerbatimTag(tag);
- if(anchor)
- m_emitter << Anchor(ToString(anchor));
- }
-}
diff --git a/ext/src/yaml-cpp/emitter.cpp b/ext/src/yaml-cpp/emitter.cpp
deleted file mode 100644
index 8a5a706..0000000
--- a/ext/src/yaml-cpp/emitter.cpp
+++ /dev/null
@@ -1,951 +0,0 @@
-#include "yaml-cpp/emitter.h"
-#include "emitterstate.h"
-#include "emitterutils.h"
-#include "indentation.h"
-#include "yaml-cpp/exceptions.h"
-#include <sstream>
-
-namespace YAML
-{
- Emitter::Emitter(): m_pState(new EmitterState)
- {
- }
-
- Emitter::Emitter(std::ostream& stream): m_pState(new EmitterState), m_stream(stream)
- {
- }
-
- Emitter::~Emitter()
- {
- }
-
- const char *Emitter::c_str() const
- {
- return m_stream.str();
- }
-
- std::size_t Emitter::size() const
- {
- return m_stream.pos();
- }
-
- // state checking
- bool Emitter::good() const
- {
- return m_pState->good();
- }
-
- const std::string Emitter::GetLastError() const
- {
- return m_pState->GetLastError();
- }
-
- // global setters
- bool Emitter::SetOutputCharset(EMITTER_MANIP value)
- {
- return m_pState->SetOutputCharset(value, FmtScope::Global);
- }
-
- bool Emitter::SetStringFormat(EMITTER_MANIP value)
- {
- return m_pState->SetStringFormat(value, FmtScope::Global);
- }
-
- bool Emitter::SetBoolFormat(EMITTER_MANIP value)
- {
- bool ok = false;
- if(m_pState->SetBoolFormat(value, FmtScope::Global))
- ok = true;
- if(m_pState->SetBoolCaseFormat(value, FmtScope::Global))
- ok = true;
- if(m_pState->SetBoolLengthFormat(value, FmtScope::Global))
- ok = true;
- return ok;
- }
-
- bool Emitter::SetIntBase(EMITTER_MANIP value)
- {
- return m_pState->SetIntFormat(value, FmtScope::Global);
- }
-
- bool Emitter::SetSeqFormat(EMITTER_MANIP value)
- {
- return m_pState->SetFlowType(GroupType::Seq, value, FmtScope::Global);
- }
-
- bool Emitter::SetMapFormat(EMITTER_MANIP value)
- {
- bool ok = false;
- if(m_pState->SetFlowType(GroupType::Map, value, FmtScope::Global))
- ok = true;
- if(m_pState->SetMapKeyFormat(value, FmtScope::Global))
- ok = true;
- return ok;
- }
-
- bool Emitter::SetIndent(unsigned n)
- {
- return m_pState->SetIndent(n, FmtScope::Global);
- }
-
- bool Emitter::SetPreCommentIndent(unsigned n)
- {
- return m_pState->SetPreCommentIndent(n, FmtScope::Global);
- }
-
- bool Emitter::SetPostCommentIndent(unsigned n)
- {
- return m_pState->SetPostCommentIndent(n, FmtScope::Global);
- }
-
- bool Emitter::SetFloatPrecision(unsigned n)
- {
- return m_pState->SetFloatPrecision(n, FmtScope::Global);
- }
-
- bool Emitter::SetDoublePrecision(unsigned n)
- {
- return m_pState->SetDoublePrecision(n, FmtScope::Global);
- }
-
- // SetLocalValue
- // . Either start/end a group, or set a modifier locally
- Emitter& Emitter::SetLocalValue(EMITTER_MANIP value)
- {
- if(!good())
- return *this;
-
- switch(value) {
- case BeginDoc:
- EmitBeginDoc();
- break;
- case EndDoc:
- EmitEndDoc();
- break;
- case BeginSeq:
- EmitBeginSeq();
- break;
- case EndSeq:
- EmitEndSeq();
- break;
- case BeginMap:
- EmitBeginMap();
- break;
- case EndMap:
- EmitEndMap();
- break;
- case Key:
- case Value:
- // deprecated (these can be deduced by the parity of nodes in a map)
- break;
- case TagByKind:
- EmitKindTag();
- break;
- case Newline:
- EmitNewline();
- break;
- default:
- m_pState->SetLocalValue(value);
- break;
- }
- return *this;
- }
-
- Emitter& Emitter::SetLocalIndent(const _Indent& indent)
- {
- m_pState->SetIndent(indent.value, FmtScope::Local);
- return *this;
- }
-
- Emitter& Emitter::SetLocalPrecision(const _Precision& precision)
- {
- if(precision.floatPrecision >= 0)
- m_pState->SetFloatPrecision(precision.floatPrecision, FmtScope::Local);
- if(precision.doublePrecision >= 0)
- m_pState->SetDoublePrecision(precision.doublePrecision, FmtScope::Local);
- return *this;
- }
-
- // EmitBeginDoc
- void Emitter::EmitBeginDoc()
- {
- if(!good())
- return;
-
- if(m_pState->CurGroupType() != GroupType::None) {
- m_pState->SetError("Unexpected begin document");
- return;
- }
-
- if(m_pState->HasAnchor() || m_pState->HasTag()) {
- m_pState->SetError("Unexpected begin document");
- return;
- }
-
- if(m_stream.col() > 0)
- m_stream << "\n";
- m_stream << "---\n";
-
- m_pState->StartedDoc();
- }
-
- // EmitEndDoc
- void Emitter::EmitEndDoc()
- {
- if(!good())
- return;
-
- if(m_pState->CurGroupType() != GroupType::None) {
- m_pState->SetError("Unexpected begin document");
- return;
- }
-
- if(m_pState->HasAnchor() || m_pState->HasTag()) {
- m_pState->SetError("Unexpected begin document");
- return;
- }
-
- if(m_stream.col() > 0)
- m_stream << "\n";
- m_stream << "...\n";
- }
-
- // EmitBeginSeq
- void Emitter::EmitBeginSeq()
- {
- if(!good())
- return;
-
- PrepareNode(m_pState->NextGroupType(GroupType::Seq));
-
- m_pState->StartedGroup(GroupType::Seq);
- }
-
- // EmitEndSeq
- void Emitter::EmitEndSeq()
- {
- if(!good())
- return;
-
- if(m_pState->CurGroupChildCount() == 0)
- m_pState->ForceFlow();
-
- if(m_pState->CurGroupFlowType() == FlowType::Flow) {
- if(m_stream.comment())
- m_stream << "\n";
- m_stream << IndentTo(m_pState->CurIndent());
- if(m_pState->CurGroupChildCount() == 0)
- m_stream << "[";
- m_stream << "]";
- }
-
- m_pState->EndedGroup(GroupType::Seq);
- }
-
- // EmitBeginMap
- void Emitter::EmitBeginMap()
- {
- if(!good())
- return;
-
- PrepareNode(m_pState->NextGroupType(GroupType::Map));
-
- m_pState->StartedGroup(GroupType::Map);
- }
-
- // EmitEndMap
- void Emitter::EmitEndMap()
- {
- if(!good())
- return;
-
- if(m_pState->CurGroupChildCount() == 0)
- m_pState->ForceFlow();
-
- if(m_pState->CurGroupFlowType() == FlowType::Flow) {
- if(m_stream.comment())
- m_stream << "\n";
- m_stream << IndentTo(m_pState->CurIndent());
- if(m_pState->CurGroupChildCount() == 0)
- m_stream << "{";
- m_stream << "}";
- }
-
- m_pState->EndedGroup(GroupType::Map);
- }
-
- // EmitNewline
- void Emitter::EmitNewline()
- {
- if(!good())
- return;
-
- PrepareNode(EmitterNodeType::None);
- m_stream << "\n";
- m_pState->SetNonContent();
- }
-
- bool Emitter::CanEmitNewline() const
- {
- return true;
- }
-
- // Put the stream in a state so we can simply write the next node
- // E.g., if we're in a sequence, write the "- "
- void Emitter::PrepareNode(EmitterNodeType::value child)
- {
- switch(m_pState->CurGroupNodeType()) {
- case EmitterNodeType::None:
- PrepareTopNode(child);
- break;
- case EmitterNodeType::FlowSeq:
- FlowSeqPrepareNode(child);
- break;
- case EmitterNodeType::BlockSeq:
- BlockSeqPrepareNode(child);
- break;
- case EmitterNodeType::FlowMap:
- FlowMapPrepareNode(child);
- break;
- case EmitterNodeType::BlockMap:
- BlockMapPrepareNode(child);
- break;
- case EmitterNodeType::Property:
- case EmitterNodeType::Scalar:
- assert(false);
- break;
- }
- }
-
- void Emitter::PrepareTopNode(EmitterNodeType::value child)
- {
- if(child == EmitterNodeType::None)
- return;
-
- if(m_pState->CurGroupChildCount() > 0 && m_stream.col() > 0) {
- if(child != EmitterNodeType::None)
- EmitBeginDoc();
- }
-
- switch(child) {
- case EmitterNodeType::None:
- break;
- case EmitterNodeType::Property:
- case EmitterNodeType::Scalar:
- case EmitterNodeType::FlowSeq:
- case EmitterNodeType::FlowMap:
- // TODO: if we were writing null, and
- // we wanted it blank, we wouldn't want a space
- SpaceOrIndentTo(m_pState->HasBegunContent(), 0);
- break;
- case EmitterNodeType::BlockSeq:
- case EmitterNodeType::BlockMap:
- if(m_pState->HasBegunNode())
- m_stream << "\n";
- break;
- }
- }
-
- void Emitter::FlowSeqPrepareNode(EmitterNodeType::value child)
- {
- const unsigned lastIndent = m_pState->LastIndent();
-
- if(!m_pState->HasBegunNode()) {
- if(m_stream.comment())
- m_stream << "\n";
- m_stream << IndentTo(lastIndent);
- if(m_pState->CurGroupChildCount() == 0)
- m_stream << "[";
- else
- m_stream << ",";
- }
-
- switch(child) {
- case EmitterNodeType::None:
- break;
- case EmitterNodeType::Property:
- case EmitterNodeType::Scalar:
- case EmitterNodeType::FlowSeq:
- case EmitterNodeType::FlowMap:
- SpaceOrIndentTo(m_pState->HasBegunContent() || m_pState->CurGroupChildCount() > 0, lastIndent);
- break;
- case EmitterNodeType::BlockSeq:
- case EmitterNodeType::BlockMap:
- assert(false);
- break;
- }
- }
-
- void Emitter::BlockSeqPrepareNode(EmitterNodeType::value child)
- {
- const unsigned curIndent = m_pState->CurIndent();
- const unsigned nextIndent = curIndent + m_pState->CurGroupIndent();
-
- if(child == EmitterNodeType::None)
- return;
-
- if(!m_pState->HasBegunContent()) {
- if(m_pState->CurGroupChildCount() > 0 || m_stream.comment()) {
- m_stream << "\n";
- }
- m_stream << IndentTo(curIndent);
- m_stream << "-";
- }
-
- switch(child) {
- case EmitterNodeType::None:
- break;
- case EmitterNodeType::Property:
- case EmitterNodeType::Scalar:
- case EmitterNodeType::FlowSeq:
- case EmitterNodeType::FlowMap:
- SpaceOrIndentTo(m_pState->HasBegunContent(), nextIndent);
- break;
- case EmitterNodeType::BlockSeq:
- m_stream << "\n";
- break;
- case EmitterNodeType::BlockMap:
- if(m_pState->HasBegunContent() || m_stream.comment())
- m_stream << "\n";
- break;
- }
- }
-
- void Emitter::FlowMapPrepareNode(EmitterNodeType::value child)
- {
- if(m_pState->CurGroupChildCount() % 2 == 0) {
- if(m_pState->GetMapKeyFormat() == LongKey)
- m_pState->SetLongKey();
-
- if(m_pState->CurGroupLongKey())
- FlowMapPrepareLongKey(child);
- else
- FlowMapPrepareSimpleKey(child);
- } else {
- if(m_pState->CurGroupLongKey())
- FlowMapPrepareLongKeyValue(child);
- else
- FlowMapPrepareSimpleKeyValue(child);
- }
- }
-
- void Emitter::FlowMapPrepareLongKey(EmitterNodeType::value child)
- {
- const unsigned lastIndent = m_pState->LastIndent();
-
- if(!m_pState->HasBegunNode()) {
- if(m_stream.comment())
- m_stream << "\n";
- m_stream << IndentTo(lastIndent);
- if(m_pState->CurGroupChildCount() == 0)
- m_stream << "{ ?";
- else
- m_stream << ", ?";
- }
-
- switch(child) {
- case EmitterNodeType::None:
- break;
- case EmitterNodeType::Property:
- case EmitterNodeType::Scalar:
- case EmitterNodeType::FlowSeq:
- case EmitterNodeType::FlowMap:
- SpaceOrIndentTo(m_pState->HasBegunContent() || m_pState->CurGroupChildCount() > 0, lastIndent);
- break;
- case EmitterNodeType::BlockSeq:
- case EmitterNodeType::BlockMap:
- assert(false);
- break;
- }
- }
-
- void Emitter::FlowMapPrepareLongKeyValue(EmitterNodeType::value child)
- {
- const unsigned lastIndent = m_pState->LastIndent();
-
- if(!m_pState->HasBegunNode()) {
- if(m_stream.comment())
- m_stream << "\n";
- m_stream << IndentTo(lastIndent);
- m_stream << ":";
- }
-
- switch(child) {
- case EmitterNodeType::None:
- break;
- case EmitterNodeType::Property:
- case EmitterNodeType::Scalar:
- case EmitterNodeType::FlowSeq:
- case EmitterNodeType::FlowMap:
- SpaceOrIndentTo(m_pState->HasBegunContent() || m_pState->CurGroupChildCount() > 0, lastIndent);
- break;
- case EmitterNodeType::BlockSeq:
- case EmitterNodeType::BlockMap:
- assert(false);
- break;
- }
- }
-
- void Emitter::FlowMapPrepareSimpleKey(EmitterNodeType::value child)
- {
- const unsigned lastIndent = m_pState->LastIndent();
-
- if(!m_pState->HasBegunNode()) {
- if(m_stream.comment())
- m_stream << "\n";
- m_stream << IndentTo(lastIndent);
- if(m_pState->CurGroupChildCount() == 0)
- m_stream << "{";
- else
- m_stream << ",";
- }
-
- switch(child) {
- case EmitterNodeType::None:
- break;
- case EmitterNodeType::Property:
- case EmitterNodeType::Scalar:
- case EmitterNodeType::FlowSeq:
- case EmitterNodeType::FlowMap:
- SpaceOrIndentTo(m_pState->HasBegunContent() || m_pState->CurGroupChildCount() > 0, lastIndent);
- break;
- case EmitterNodeType::BlockSeq:
- case EmitterNodeType::BlockMap:
- assert(false);
- break;
- }
- }
-
- void Emitter::FlowMapPrepareSimpleKeyValue(EmitterNodeType::value child)
- {
- const unsigned lastIndent = m_pState->LastIndent();
-
- if(!m_pState->HasBegunNode()) {
- if(m_stream.comment())
- m_stream << "\n";
- m_stream << IndentTo(lastIndent);
- m_stream << ":";
- }
-
- switch(child) {
- case EmitterNodeType::None:
- break;
- case EmitterNodeType::Property:
- case EmitterNodeType::Scalar:
- case EmitterNodeType::FlowSeq:
- case EmitterNodeType::FlowMap:
- SpaceOrIndentTo(m_pState->HasBegunContent() || m_pState->CurGroupChildCount() > 0, lastIndent);
- break;
- case EmitterNodeType::BlockSeq:
- case EmitterNodeType::BlockMap:
- assert(false);
- break;
- }
- }
-
- void Emitter::BlockMapPrepareNode(EmitterNodeType::value child)
- {
- if(m_pState->CurGroupChildCount() % 2 == 0) {
- if(m_pState->GetMapKeyFormat() == LongKey)
- m_pState->SetLongKey();
- if(child == EmitterNodeType::BlockSeq || child == EmitterNodeType::BlockMap)
- m_pState->SetLongKey();
-
- if(m_pState->CurGroupLongKey())
- BlockMapPrepareLongKey(child);
- else
- BlockMapPrepareSimpleKey(child);
- } else {
- if(m_pState->CurGroupLongKey())
- BlockMapPrepareLongKeyValue(child);
- else
- BlockMapPrepareSimpleKeyValue(child);
- }
- }
-
- void Emitter::BlockMapPrepareLongKey(EmitterNodeType::value child)
- {
- const unsigned curIndent = m_pState->CurIndent();
- const std::size_t childCount = m_pState->CurGroupChildCount();
-
- if(child == EmitterNodeType::None)
- return;
-
- if(!m_pState->HasBegunContent()) {
- if(childCount > 0) {
- m_stream << "\n";
- }
- if(m_stream.comment()) {
- m_stream << "\n";
- }
- m_stream << IndentTo(curIndent);
- m_stream << "?";
- }
-
- switch(child) {
- case EmitterNodeType::None:
- break;
- case EmitterNodeType::Property:
- case EmitterNodeType::Scalar:
- case EmitterNodeType::FlowSeq:
- case EmitterNodeType::FlowMap:
- SpaceOrIndentTo(true, curIndent + 1);
- break;
- case EmitterNodeType::BlockSeq:
- case EmitterNodeType::BlockMap:
- break;
- }
- }
-
- void Emitter::BlockMapPrepareLongKeyValue(EmitterNodeType::value child)
- {
- const unsigned curIndent = m_pState->CurIndent();
-
- if(child == EmitterNodeType::None)
- return;
-
- if(!m_pState->HasBegunContent()) {
- m_stream << "\n";
- m_stream << IndentTo(curIndent);
- m_stream << ":";
- }
-
- switch(child) {
- case EmitterNodeType::None:
- break;
- case EmitterNodeType::Property:
- case EmitterNodeType::Scalar:
- case EmitterNodeType::FlowSeq:
- case EmitterNodeType::FlowMap:
- case EmitterNodeType::BlockSeq:
- case EmitterNodeType::BlockMap:
- SpaceOrIndentTo(true, curIndent + 1);
- break;
- }
- }
-
- void Emitter::BlockMapPrepareSimpleKey(EmitterNodeType::value child)
- {
- const unsigned curIndent = m_pState->CurIndent();
- const std::size_t childCount = m_pState->CurGroupChildCount();
-
- if(child == EmitterNodeType::None)
- return;
-
- if(!m_pState->HasBegunNode()) {
- if(childCount > 0) {
- m_stream << "\n";
- }
- }
-
- switch(child) {
- case EmitterNodeType::None:
- break;
- case EmitterNodeType::Property:
- case EmitterNodeType::Scalar:
- case EmitterNodeType::FlowSeq:
- case EmitterNodeType::FlowMap:
- SpaceOrIndentTo(m_pState->HasBegunContent(), curIndent);
- break;
- case EmitterNodeType::BlockSeq:
- case EmitterNodeType::BlockMap:
- break;
- }
- }
-
- void Emitter::BlockMapPrepareSimpleKeyValue(EmitterNodeType::value child)
- {
- const unsigned curIndent = m_pState->CurIndent();
- const unsigned nextIndent = curIndent + m_pState->CurGroupIndent();
-
- if(!m_pState->HasBegunNode()) {
- m_stream << ":";
- }
-
- switch(child) {
- case EmitterNodeType::None:
- break;
- case EmitterNodeType::Property:
- case EmitterNodeType::Scalar:
- case EmitterNodeType::FlowSeq:
- case EmitterNodeType::FlowMap:
- SpaceOrIndentTo(true, nextIndent);
- break;
- case EmitterNodeType::BlockSeq:
- case EmitterNodeType::BlockMap:
- m_stream << "\n";
- break;
- }
- }
-
- // SpaceOrIndentTo
-    // . Prepares for the next content by writing a separating space or indenting to the given column
- void Emitter::SpaceOrIndentTo(bool requireSpace, unsigned indent)
- {
- if(m_stream.comment())
- m_stream << "\n";
- if(m_stream.col() > 0 && requireSpace)
- m_stream << " ";
- m_stream << IndentTo(indent);
- }
-
- void Emitter::PrepareIntegralStream(std::stringstream& stream) const
- {
-
- switch(m_pState->GetIntFormat()) {
- case Dec:
- stream << std::dec;
- break;
- case Hex:
- stream << "0x";
- stream << std::hex;
- break;
- case Oct:
- stream << "0";
- stream << std::oct;
- break;
- default:
- assert(false);
- }
- }
-
- void Emitter::StartedScalar()
- {
- m_pState->StartedScalar();
- }
-
- // *******************************************************************************************
- // overloads of Write
-
- Emitter& Emitter::Write(const std::string& str)
- {
- if(!good())
- return *this;
-
- const bool escapeNonAscii = m_pState->GetOutputCharset() == EscapeNonAscii;
- const StringFormat::value strFormat = Utils::ComputeStringFormat(str, m_pState->GetStringFormat(), m_pState->CurGroupFlowType(), escapeNonAscii);
-
- if(strFormat == StringFormat::Literal)
- m_pState->SetMapKeyFormat(YAML::LongKey, FmtScope::Local);
-
- PrepareNode(EmitterNodeType::Scalar);
-
- switch(strFormat) {
- case StringFormat::Plain:
- m_stream << str;
- break;
- case StringFormat::SingleQuoted:
- Utils::WriteSingleQuotedString(m_stream, str);
- break;
- case StringFormat::DoubleQuoted:
- Utils::WriteDoubleQuotedString(m_stream, str, escapeNonAscii);
- break;
- case StringFormat::Literal:
- Utils::WriteLiteralString(m_stream, str, m_pState->CurIndent() + m_pState->GetIndent());
- break;
- }
-
- StartedScalar();
-
- return *this;
- }
-
- unsigned Emitter::GetFloatPrecision() const
- {
- return m_pState->GetFloatPrecision();
- }
-
- unsigned Emitter::GetDoublePrecision() const
- {
- return m_pState->GetDoublePrecision();
- }
-
- const char *Emitter::ComputeFullBoolName(bool b) const
- {
- const EMITTER_MANIP mainFmt = (m_pState->GetBoolLengthFormat() == ShortBool ? YesNoBool : m_pState->GetBoolFormat());
- const EMITTER_MANIP caseFmt = m_pState->GetBoolCaseFormat();
- switch(mainFmt) {
- case YesNoBool:
- switch(caseFmt) {
- case UpperCase: return b ? "YES" : "NO";
- case CamelCase: return b ? "Yes" : "No";
- case LowerCase: return b ? "yes" : "no";
- default: break;
- }
- break;
- case OnOffBool:
- switch(caseFmt) {
- case UpperCase: return b ? "ON" : "OFF";
- case CamelCase: return b ? "On" : "Off";
- case LowerCase: return b ? "on" : "off";
- default: break;
- }
- break;
- case TrueFalseBool:
- switch(caseFmt) {
- case UpperCase: return b ? "TRUE" : "FALSE";
- case CamelCase: return b ? "True" : "False";
- case LowerCase: return b ? "true" : "false";
- default: break;
- }
- break;
- default:
- break;
- }
- return b ? "y" : "n"; // should never get here, but it can't hurt to give these answers
- }
-
- Emitter& Emitter::Write(bool b)
- {
- if(!good())
- return *this;
-
- PrepareNode(EmitterNodeType::Scalar);
-
- const char *name = ComputeFullBoolName(b);
- if(m_pState->GetBoolLengthFormat() == ShortBool)
- m_stream << name[0];
- else
- m_stream << name;
-
- StartedScalar();
-
- return *this;
- }
-
- Emitter& Emitter::Write(char ch)
- {
- if(!good())
- return *this;
-
- PrepareNode(EmitterNodeType::Scalar);
- Utils::WriteChar(m_stream, ch);
- StartedScalar();
-
- return *this;
- }
-
- Emitter& Emitter::Write(const _Alias& alias)
- {
- if(!good())
- return *this;
-
- if(m_pState->HasAnchor() || m_pState->HasTag()) {
- m_pState->SetError(ErrorMsg::INVALID_ALIAS);
- return *this;
- }
-
- PrepareNode(EmitterNodeType::Scalar);
-
- if(!Utils::WriteAlias(m_stream, alias.content)) {
- m_pState->SetError(ErrorMsg::INVALID_ALIAS);
- return *this;
- }
-
- StartedScalar();
-
- return *this;
- }
-
- Emitter& Emitter::Write(const _Anchor& anchor)
- {
- if(!good())
- return *this;
-
- if(m_pState->HasAnchor()) {
- m_pState->SetError(ErrorMsg::INVALID_ANCHOR);
- return *this;
- }
-
- PrepareNode(EmitterNodeType::Property);
-
- if(!Utils::WriteAnchor(m_stream, anchor.content)) {
- m_pState->SetError(ErrorMsg::INVALID_ANCHOR);
- return *this;
- }
-
- m_pState->SetAnchor();
-
- return *this;
- }
-
- Emitter& Emitter::Write(const _Tag& tag)
- {
- if(!good())
- return *this;
-
- if(m_pState->HasTag()) {
- m_pState->SetError(ErrorMsg::INVALID_TAG);
- return *this;
- }
-
- PrepareNode(EmitterNodeType::Property);
-
- bool success = false;
- if(tag.type == _Tag::Type::Verbatim)
- success = Utils::WriteTag(m_stream, tag.content, true);
- else if(tag.type == _Tag::Type::PrimaryHandle)
- success = Utils::WriteTag(m_stream, tag.content, false);
- else
- success = Utils::WriteTagWithPrefix(m_stream, tag.prefix, tag.content);
-
- if(!success) {
- m_pState->SetError(ErrorMsg::INVALID_TAG);
- return *this;
- }
-
- m_pState->SetTag();
-
- return *this;
- }
-
- void Emitter::EmitKindTag()
- {
- Write(LocalTag(""));
- }
-
- Emitter& Emitter::Write(const _Comment& comment)
- {
- if(!good())
- return *this;
-
- PrepareNode(EmitterNodeType::None);
-
- if(m_stream.col() > 0)
- m_stream << Indentation(m_pState->GetPreCommentIndent());
- Utils::WriteComment(m_stream, comment.content, m_pState->GetPostCommentIndent());
-
- m_pState->SetNonContent();
-
- return *this;
- }
-
- Emitter& Emitter::Write(const _Null& /*null*/)
- {
- if(!good())
- return *this;
-
- PrepareNode(EmitterNodeType::Scalar);
-
- m_stream << "~";
-
- StartedScalar();
-
- return *this;
- }
-
- Emitter& Emitter::Write(const Binary& binary)
- {
- Write(SecondaryTag("binary"));
-
- if(!good())
- return *this;
-
- PrepareNode(EmitterNodeType::Scalar);
- Utils::WriteBinary(m_stream, binary);
- StartedScalar();
-
- return *this;
- }
-}
-
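For orientation, the Write() overloads and ComputeFullBoolName() deleted above are what the public streaming interface ends up calling. A minimal sketch, assuming the usual Emitter operators and manipulators of this yaml-cpp generation (BeginMap/Key/Value, plus the bool manipulators listed in the switch above):

    #include <yaml-cpp/yaml.h>
    #include <iostream>

    int main() {
        YAML::Emitter out;
        out << YAML::BeginMap;
        // OnOffBool/ShortBool go through SetLocalValue(), so each one only
        // affects the next scalar that is written.
        out << YAML::Key << "enabled" << YAML::Value << YAML::OnOffBool << true;   // emits "on"
        out << YAML::Key << "flag"    << YAML::Value << YAML::ShortBool << false;  // emits "n"
        out << YAML::Key << "note"    << YAML::Value << "contains: a colon";       // not a valid plain scalar -> double-quoted
        out << YAML::EndMap;
        std::cout << out.c_str() << "\n";
        return 0;
    }
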
diff --git a/ext/src/yaml-cpp/emitterstate.cpp b/ext/src/yaml-cpp/emitterstate.cpp
deleted file mode 100644
index 08d2c15..0000000
--- a/ext/src/yaml-cpp/emitterstate.cpp
+++ /dev/null
@@ -1,384 +0,0 @@
-#include "emitterstate.h"
-#include "yaml-cpp/exceptions.h"
-#include <limits>
-
-namespace YAML
-{
- EmitterState::EmitterState(): m_isGood(true), m_curIndent(0), m_hasAnchor(false), m_hasTag(false), m_hasNonContent(false), m_docCount(0)
- {
- // set default global manipulators
- m_charset.set(EmitNonAscii);
- m_strFmt.set(Auto);
- m_boolFmt.set(TrueFalseBool);
- m_boolLengthFmt.set(LongBool);
- m_boolCaseFmt.set(LowerCase);
- m_intFmt.set(Dec);
- m_indent.set(2);
- m_preCommentIndent.set(2);
- m_postCommentIndent.set(1);
- m_seqFmt.set(Block);
- m_mapFmt.set(Block);
- m_mapKeyFmt.set(Auto);
- m_floatPrecision.set(6);
- m_doublePrecision.set(15);
- }
-
- EmitterState::~EmitterState()
- {
- }
-
- // SetLocalValue
-    // . We blindly try to set all possible formatters to this value
- // . Only the ones that make sense will be accepted
- void EmitterState::SetLocalValue(EMITTER_MANIP value)
- {
- SetOutputCharset(value, FmtScope::Local);
- SetStringFormat(value, FmtScope::Local);
- SetBoolFormat(value, FmtScope::Local);
- SetBoolCaseFormat(value, FmtScope::Local);
- SetBoolLengthFormat(value, FmtScope::Local);
- SetIntFormat(value, FmtScope::Local);
- SetFlowType(GroupType::Seq, value, FmtScope::Local);
- SetFlowType(GroupType::Map, value, FmtScope::Local);
- SetMapKeyFormat(value, FmtScope::Local);
- }
-
- void EmitterState::SetAnchor()
- {
- m_hasAnchor = true;
- }
-
- void EmitterState::SetTag()
- {
- m_hasTag = true;
- }
-
- void EmitterState::SetNonContent()
- {
- m_hasNonContent = true;
- }
-
- void EmitterState::SetLongKey()
- {
- assert(!m_groups.empty());
- if(m_groups.empty())
- return;
-
- assert(m_groups.top().type == GroupType::Map);
- m_groups.top().longKey = true;
- }
-
- void EmitterState::ForceFlow()
- {
- assert(!m_groups.empty());
- if(m_groups.empty())
- return;
-
- m_groups.top().flowType = FlowType::Flow;
- }
-
- void EmitterState::StartedNode()
- {
- if(m_groups.empty()) {
- m_docCount++;
- } else {
- m_groups.top().childCount++;
- if(m_groups.top().childCount % 2 == 0)
- m_groups.top().longKey = false;
- }
-
- m_hasAnchor = false;
- m_hasTag = false;
- m_hasNonContent = false;
- }
-
- EmitterNodeType::value EmitterState::NextGroupType(GroupType::value type) const
- {
- if(type == GroupType::Seq) {
- if(GetFlowType(type) == Block)
- return EmitterNodeType::BlockSeq;
- else
- return EmitterNodeType::FlowSeq;
- } else {
- if(GetFlowType(type) == Block)
- return EmitterNodeType::BlockMap;
- else
- return EmitterNodeType::FlowMap;
- }
-
- // can't happen
- assert(false);
- return EmitterNodeType::None;
- }
-
- void EmitterState::StartedDoc()
- {
- m_hasAnchor = false;
- m_hasTag = false;
- m_hasNonContent = false;
- }
-
- void EmitterState::EndedDoc()
- {
- m_hasAnchor = false;
- m_hasTag = false;
- m_hasNonContent = false;
- }
-
- void EmitterState::StartedScalar()
- {
- StartedNode();
- ClearModifiedSettings();
- }
-
- void EmitterState::StartedGroup(GroupType::value type)
- {
- StartedNode();
-
- const int lastGroupIndent = (m_groups.empty() ? 0 : m_groups.top().indent);
- m_curIndent += lastGroupIndent;
-
- std::auto_ptr<Group> pGroup(new Group(type));
-
- // transfer settings (which last until this group is done)
- pGroup->modifiedSettings = m_modifiedSettings;
-
- // set up group
- if(GetFlowType(type) == Block)
- pGroup->flowType = FlowType::Block;
- else
- pGroup->flowType = FlowType::Flow;
- pGroup->indent = GetIndent();
-
- m_groups.push(pGroup);
- }
-
- void EmitterState::EndedGroup(GroupType::value type)
- {
- if(m_groups.empty()) {
- if(type == GroupType::Seq)
- return SetError(ErrorMsg::UNEXPECTED_END_SEQ);
- else
- return SetError(ErrorMsg::UNEXPECTED_END_MAP);
- }
-
- // get rid of the current group
- {
- std::auto_ptr<Group> pFinishedGroup = m_groups.pop();
- if(pFinishedGroup->type != type)
- return SetError(ErrorMsg::UNMATCHED_GROUP_TAG);
- }
-
- // reset old settings
- unsigned lastIndent = (m_groups.empty() ? 0 : m_groups.top().indent);
- assert(m_curIndent >= lastIndent);
- m_curIndent -= lastIndent;
-
- // some global settings that we changed may have been overridden
- // by a local setting we just popped, so we need to restore them
- m_globalModifiedSettings.restore();
-
- ClearModifiedSettings();
- }
-
- EmitterNodeType::value EmitterState::CurGroupNodeType() const
- {
- if(m_groups.empty())
- return EmitterNodeType::None;
-
- return m_groups.top().NodeType();
- }
-
- GroupType::value EmitterState::CurGroupType() const
- {
- return m_groups.empty() ? GroupType::None : m_groups.top().type;
- }
-
- FlowType::value EmitterState::CurGroupFlowType() const
- {
- return m_groups.empty() ? FlowType::None : m_groups.top().flowType;
- }
-
- int EmitterState::CurGroupIndent() const
- {
- return m_groups.empty() ? 0 : m_groups.top().indent;
- }
-
- std::size_t EmitterState::CurGroupChildCount() const
- {
- return m_groups.empty() ? m_docCount : m_groups.top().childCount;
- }
-
- bool EmitterState::CurGroupLongKey() const
- {
- return m_groups.empty() ? false : m_groups.top().longKey;
- }
-
- int EmitterState::LastIndent() const
- {
- if(m_groups.size() <= 1)
- return 0;
-
- return m_curIndent - m_groups.top(-1).indent;
- }
-
- void EmitterState::ClearModifiedSettings()
- {
- m_modifiedSettings.clear();
- }
-
- bool EmitterState::SetOutputCharset(EMITTER_MANIP value, FmtScope::value scope)
- {
- switch(value) {
- case EmitNonAscii:
- case EscapeNonAscii:
- _Set(m_charset, value, scope);
- return true;
- default:
- return false;
- }
- }
-
- bool EmitterState::SetStringFormat(EMITTER_MANIP value, FmtScope::value scope)
- {
- switch(value) {
- case Auto:
- case SingleQuoted:
- case DoubleQuoted:
- case Literal:
- _Set(m_strFmt, value, scope);
- return true;
- default:
- return false;
- }
- }
-
- bool EmitterState::SetBoolFormat(EMITTER_MANIP value, FmtScope::value scope)
- {
- switch(value) {
- case OnOffBool:
- case TrueFalseBool:
- case YesNoBool:
- _Set(m_boolFmt, value, scope);
- return true;
- default:
- return false;
- }
- }
-
- bool EmitterState::SetBoolLengthFormat(EMITTER_MANIP value, FmtScope::value scope)
- {
- switch(value) {
- case LongBool:
- case ShortBool:
- _Set(m_boolLengthFmt, value, scope);
- return true;
- default:
- return false;
- }
- }
-
- bool EmitterState::SetBoolCaseFormat(EMITTER_MANIP value, FmtScope::value scope)
- {
- switch(value) {
- case UpperCase:
- case LowerCase:
- case CamelCase:
- _Set(m_boolCaseFmt, value, scope);
- return true;
- default:
- return false;
- }
- }
-
- bool EmitterState::SetIntFormat(EMITTER_MANIP value, FmtScope::value scope)
- {
- switch(value) {
- case Dec:
- case Hex:
- case Oct:
- _Set(m_intFmt, value, scope);
- return true;
- default:
- return false;
- }
- }
-
- bool EmitterState::SetIndent(unsigned value, FmtScope::value scope)
- {
- if(value <= 1)
- return false;
-
- _Set(m_indent, value, scope);
- return true;
- }
-
- bool EmitterState::SetPreCommentIndent(unsigned value, FmtScope::value scope)
- {
- if(value == 0)
- return false;
-
- _Set(m_preCommentIndent, value, scope);
- return true;
- }
-
- bool EmitterState::SetPostCommentIndent(unsigned value, FmtScope::value scope)
- {
- if(value == 0)
- return false;
-
- _Set(m_postCommentIndent, value, scope);
- return true;
- }
-
- bool EmitterState::SetFlowType(GroupType::value groupType, EMITTER_MANIP value, FmtScope::value scope)
- {
- switch(value) {
- case Block:
- case Flow:
- _Set(groupType == GroupType::Seq ? m_seqFmt : m_mapFmt, value, scope);
- return true;
- default:
- return false;
- }
- }
-
- EMITTER_MANIP EmitterState::GetFlowType(GroupType::value groupType) const
- {
- // force flow style if we're currently in a flow
- if(CurGroupFlowType() == FlowType::Flow)
- return Flow;
-
- // otherwise, go with what's asked of us
- return (groupType == GroupType::Seq ? m_seqFmt.get() : m_mapFmt.get());
- }
-
- bool EmitterState::SetMapKeyFormat(EMITTER_MANIP value, FmtScope::value scope)
- {
- switch(value) {
- case Auto:
- case LongKey:
- _Set(m_mapKeyFmt, value, scope);
- return true;
- default:
- return false;
- }
- }
-
- bool EmitterState::SetFloatPrecision(int value, FmtScope::value scope)
- {
- if(value < 0 || value > std::numeric_limits<float>::digits10)
- return false;
- _Set(m_floatPrecision, value, scope);
- return true;
- }
-
- bool EmitterState::SetDoublePrecision(int value, FmtScope::value scope)
- {
- if(value < 0 || value > std::numeric_limits<double>::digits10)
- return false;
- _Set(m_doublePrecision, value, scope);
- return true;
- }
-}
-
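The FmtScope::Local/Global split above is what distinguishes per-node manipulators from emitter-wide setters. A sketch of the difference, assuming the public SetIndent() method and the Hex manipulator wired through SetLocalValue():

    #include <yaml-cpp/yaml.h>
    #include <iostream>

    int main() {
        YAML::Emitter out;
        out.SetIndent(4);           // global scope: stays in effect for the whole document
        out << YAML::BeginSeq;
        out << YAML::Hex << 255;    // local scope: only this scalar is written as 0xff
        out << 255;                 // cleared by StartedScalar(), back to decimal
        out << YAML::EndSeq;
        std::cout << out.c_str() << "\n";
        return 0;
    }
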
diff --git a/ext/src/yaml-cpp/emitterstate.h b/ext/src/yaml-cpp/emitterstate.h
deleted file mode 100644
index f43f471..0000000
--- a/ext/src/yaml-cpp/emitterstate.h
+++ /dev/null
@@ -1,190 +0,0 @@
-#ifndef EMITTERSTATE_H_62B23520_7C8E_11DE_8A39_0800200C9A66
-#define EMITTERSTATE_H_62B23520_7C8E_11DE_8A39_0800200C9A66
-
-#if defined(_MSC_VER) || (defined(__GNUC__) && (__GNUC__ == 3 && __GNUC_MINOR__ >= 4) || (__GNUC__ >= 4)) // GCC supports "pragma once" correctly since 3.4
-#pragma once
-#endif
-
-
-#include "ptr_stack.h"
-#include "setting.h"
-#include "yaml-cpp/emitterdef.h"
-#include "yaml-cpp/emittermanip.h"
-#include <cassert>
-#include <vector>
-#include <stack>
-#include <memory>
-#include <stdexcept>
-
-namespace YAML
-{
- struct FmtScope { enum value { Local, Global }; };
- struct GroupType { enum value { None, Seq, Map }; };
- struct FlowType { enum value { None, Flow, Block }; };
-
- class EmitterState
- {
- public:
- EmitterState();
- ~EmitterState();
-
- // basic state checking
- bool good() const { return m_isGood; }
- const std::string GetLastError() const { return m_lastError; }
- void SetError(const std::string& error) { m_isGood = false; m_lastError = error; }
-
- // node handling
- void SetAnchor();
- void SetTag();
- void SetNonContent();
- void SetLongKey();
- void ForceFlow();
- void StartedDoc();
- void EndedDoc();
- void StartedScalar();
- void StartedGroup(GroupType::value type);
- void EndedGroup(GroupType::value type);
-
- EmitterNodeType::value NextGroupType(GroupType::value type) const;
- EmitterNodeType::value CurGroupNodeType() const;
-
- GroupType::value CurGroupType() const;
- FlowType::value CurGroupFlowType() const;
- int CurGroupIndent() const;
- std::size_t CurGroupChildCount() const;
- bool CurGroupLongKey() const;
-
- int LastIndent() const;
- int CurIndent() const { return m_curIndent; }
- bool HasAnchor() const { return m_hasAnchor; }
- bool HasTag() const { return m_hasTag; }
- bool HasBegunNode() const { return m_hasAnchor || m_hasTag || m_hasNonContent; }
- bool HasBegunContent() const { return m_hasAnchor || m_hasTag; }
-
- void ClearModifiedSettings();
-
- // formatters
- void SetLocalValue(EMITTER_MANIP value);
-
- bool SetOutputCharset(EMITTER_MANIP value, FmtScope::value scope);
- EMITTER_MANIP GetOutputCharset() const { return m_charset.get(); }
-
- bool SetStringFormat(EMITTER_MANIP value, FmtScope::value scope);
- EMITTER_MANIP GetStringFormat() const { return m_strFmt.get(); }
-
- bool SetBoolFormat(EMITTER_MANIP value, FmtScope::value scope);
- EMITTER_MANIP GetBoolFormat() const { return m_boolFmt.get(); }
-
- bool SetBoolLengthFormat(EMITTER_MANIP value, FmtScope::value scope);
- EMITTER_MANIP GetBoolLengthFormat() const { return m_boolLengthFmt.get(); }
-
- bool SetBoolCaseFormat(EMITTER_MANIP value, FmtScope::value scope);
- EMITTER_MANIP GetBoolCaseFormat() const { return m_boolCaseFmt.get(); }
-
- bool SetIntFormat(EMITTER_MANIP value, FmtScope::value scope);
- EMITTER_MANIP GetIntFormat() const { return m_intFmt.get(); }
-
- bool SetIndent(unsigned value, FmtScope::value scope);
- int GetIndent() const { return m_indent.get(); }
-
- bool SetPreCommentIndent(unsigned value, FmtScope::value scope);
- int GetPreCommentIndent() const { return m_preCommentIndent.get(); }
- bool SetPostCommentIndent(unsigned value, FmtScope::value scope);
- int GetPostCommentIndent() const { return m_postCommentIndent.get(); }
-
- bool SetFlowType(GroupType::value groupType, EMITTER_MANIP value, FmtScope::value scope);
- EMITTER_MANIP GetFlowType(GroupType::value groupType) const;
-
- bool SetMapKeyFormat(EMITTER_MANIP value, FmtScope::value scope);
- EMITTER_MANIP GetMapKeyFormat() const { return m_mapKeyFmt.get(); }
-
- bool SetFloatPrecision(int value, FmtScope::value scope);
- unsigned GetFloatPrecision() const { return m_floatPrecision.get(); }
- bool SetDoublePrecision(int value, FmtScope::value scope);
- unsigned GetDoublePrecision() const { return m_doublePrecision.get(); }
-
- private:
- template <typename T>
- void _Set(Setting<T>& fmt, T value, FmtScope::value scope);
-
- void StartedNode();
-
- private:
- // basic state ok?
- bool m_isGood;
- std::string m_lastError;
-
- // other state
- Setting<EMITTER_MANIP> m_charset;
- Setting<EMITTER_MANIP> m_strFmt;
- Setting<EMITTER_MANIP> m_boolFmt;
- Setting<EMITTER_MANIP> m_boolLengthFmt;
- Setting<EMITTER_MANIP> m_boolCaseFmt;
- Setting<EMITTER_MANIP> m_intFmt;
- Setting<unsigned> m_indent;
- Setting<unsigned> m_preCommentIndent, m_postCommentIndent;
- Setting<EMITTER_MANIP> m_seqFmt;
- Setting<EMITTER_MANIP> m_mapFmt;
- Setting<EMITTER_MANIP> m_mapKeyFmt;
- Setting<int> m_floatPrecision;
- Setting<int> m_doublePrecision;
-
- SettingChanges m_modifiedSettings;
- SettingChanges m_globalModifiedSettings;
-
- struct Group {
- explicit Group(GroupType::value type_): type(type_), indent(0), childCount(0), longKey(false) {}
-
- GroupType::value type;
- FlowType::value flowType;
- int indent;
- std::size_t childCount;
- bool longKey;
-
- SettingChanges modifiedSettings;
-
- EmitterNodeType::value NodeType() const {
- if(type == GroupType::Seq) {
- if(flowType == FlowType::Flow)
- return EmitterNodeType::FlowSeq;
- else
- return EmitterNodeType::BlockSeq;
- } else {
- if(flowType == FlowType::Flow)
- return EmitterNodeType::FlowMap;
- else
- return EmitterNodeType::BlockMap;
- }
-
- // can't get here
- assert(false);
- return EmitterNodeType::None;
- }
- };
-
- ptr_stack<Group> m_groups;
- unsigned m_curIndent;
- bool m_hasAnchor;
- bool m_hasTag;
- bool m_hasNonContent;
- std::size_t m_docCount;
- };
-
- template <typename T>
- void EmitterState::_Set(Setting<T>& fmt, T value, FmtScope::value scope) {
- switch(scope) {
- case FmtScope::Local:
- m_modifiedSettings.push(fmt.set(value));
- break;
- case FmtScope::Global:
- fmt.set(value);
- m_globalModifiedSettings.push(fmt.set(value)); // this pushes an identity set, so when we restore,
- // it restores to the value here, and not the previous one
- break;
- default:
- assert(false);
- }
- }
-}
-
-#endif // EMITTERSTATE_H_62B23520_7C8E_11DE_8A39_0800200C9A66
diff --git a/ext/src/yaml-cpp/emitterutils.cpp b/ext/src/yaml-cpp/emitterutils.cpp
deleted file mode 100644
index 09780d0..0000000
--- a/ext/src/yaml-cpp/emitterutils.cpp
+++ /dev/null
@@ -1,424 +0,0 @@
-#include "emitterutils.h"
-#include "exp.h"
-#include "indentation.h"
-#include "yaml-cpp/binary.h"
-#include "yaml-cpp/exceptions.h"
-#include "stringsource.h"
-#include <sstream>
-#include <iomanip>
-
-namespace YAML
-{
- namespace Utils
- {
- namespace {
- enum {REPLACEMENT_CHARACTER = 0xFFFD};
-
- bool IsAnchorChar(int ch) { // test for ns-anchor-char
- switch (ch) {
- case ',': case '[': case ']': case '{': case '}': // c-flow-indicator
- case ' ': case '\t': // s-white
- case 0xFEFF: // c-byte-order-mark
- case 0xA: case 0xD: // b-char
- return false;
- case 0x85:
- return true;
- }
-
- if (ch < 0x20)
- return false;
-
- if (ch < 0x7E)
- return true;
-
- if (ch < 0xA0)
- return false;
- if (ch >= 0xD800 && ch <= 0xDFFF)
- return false;
- if ((ch & 0xFFFE) == 0xFFFE)
- return false;
- if ((ch >= 0xFDD0) && (ch <= 0xFDEF))
- return false;
- if (ch > 0x10FFFF)
- return false;
-
- return true;
- }
-
- int Utf8BytesIndicated(char ch) {
- int byteVal = static_cast<unsigned char>(ch);
- switch (byteVal >> 4) {
- case 0: case 1: case 2: case 3: case 4: case 5: case 6: case 7:
- return 1;
- case 12: case 13:
- return 2;
- case 14:
- return 3;
- case 15:
- return 4;
- default:
- return -1;
- }
- }
-
- bool IsTrailingByte(char ch) {
- return (ch & 0xC0) == 0x80;
- }
-
- bool GetNextCodePointAndAdvance(int& codePoint, std::string::const_iterator& first, std::string::const_iterator last) {
- if (first == last)
- return false;
-
- int nBytes = Utf8BytesIndicated(*first);
- if (nBytes < 1) {
- // Bad lead byte
- ++first;
- codePoint = REPLACEMENT_CHARACTER;
- return true;
- }
-
- if (nBytes == 1) {
- codePoint = *first++;
- return true;
- }
-
- // Gather bits from trailing bytes
- codePoint = static_cast<unsigned char>(*first) & ~(0xFF << (7 - nBytes));
- ++first;
- --nBytes;
- for (; nBytes > 0; ++first, --nBytes) {
- if ((first == last) || !IsTrailingByte(*first)) {
- codePoint = REPLACEMENT_CHARACTER;
- break;
- }
- codePoint <<= 6;
- codePoint |= *first & 0x3F;
- }
-
- // Check for illegal code points
- if (codePoint > 0x10FFFF)
- codePoint = REPLACEMENT_CHARACTER;
- else if (codePoint >= 0xD800 && codePoint <= 0xDFFF)
- codePoint = REPLACEMENT_CHARACTER;
- else if ((codePoint & 0xFFFE) == 0xFFFE)
- codePoint = REPLACEMENT_CHARACTER;
- else if (codePoint >= 0xFDD0 && codePoint <= 0xFDEF)
- codePoint = REPLACEMENT_CHARACTER;
- return true;
- }
-
- void WriteCodePoint(ostream_wrapper& out, int codePoint) {
- if (codePoint < 0 || codePoint > 0x10FFFF) {
- codePoint = REPLACEMENT_CHARACTER;
- }
-            if (codePoint <= 0x7F) {
-                out << static_cast<char>(codePoint);
-            } else if (codePoint <= 0x7FF) {
-                out << static_cast<char>(0xC0 | (codePoint >> 6))
-                    << static_cast<char>(0x80 | (codePoint & 0x3F));
-            } else if (codePoint <= 0xFFFF) {
- out << static_cast<char>(0xE0 | (codePoint >> 12))
- << static_cast<char>(0x80 | ((codePoint >> 6) & 0x3F))
- << static_cast<char>(0x80 | (codePoint & 0x3F));
- } else {
- out << static_cast<char>(0xF0 | (codePoint >> 18))
- << static_cast<char>(0x80 | ((codePoint >> 12) & 0x3F))
- << static_cast<char>(0x80 | ((codePoint >> 6) & 0x3F))
- << static_cast<char>(0x80 | (codePoint & 0x3F));
- }
- }
-
- bool IsValidPlainScalar(const std::string& str, FlowType::value flowType, bool allowOnlyAscii) {
- if(str.empty())
- return false;
-
- // first check the start
- const RegEx& start = (flowType == FlowType::Flow ? Exp::PlainScalarInFlow() : Exp::PlainScalar());
- if(!start.Matches(str))
- return false;
-
- // and check the end for plain whitespace (which can't be faithfully kept in a plain scalar)
- if(!str.empty() && *str.rbegin() == ' ')
- return false;
-
- // then check until something is disallowed
- const RegEx& disallowed = (flowType == FlowType::Flow ? Exp::EndScalarInFlow() : Exp::EndScalar())
- || (Exp::BlankOrBreak() + Exp::Comment())
- || Exp::NotPrintable()
- || Exp::Utf8_ByteOrderMark()
- || Exp::Break()
- || Exp::Tab();
- StringCharSource buffer(str.c_str(), str.size());
- while(buffer) {
- if(disallowed.Matches(buffer))
- return false;
- if(allowOnlyAscii && (0x80 <= static_cast<unsigned char>(buffer[0])))
- return false;
- ++buffer;
- }
-
- return true;
- }
-
- bool IsValidSingleQuotedScalar(const std::string& str, bool escapeNonAscii)
- {
- // TODO: check for non-printable characters?
- for(std::size_t i=0;i<str.size();i++) {
- if(escapeNonAscii && (0x80 <= static_cast<unsigned char>(str[i])))
- return false;
- if(str[i] == '\n')
- return false;
- }
- return true;
- }
-
- bool IsValidLiteralScalar(const std::string& str, FlowType::value flowType, bool escapeNonAscii)
- {
- if(flowType == FlowType::Flow)
- return false;
-
- // TODO: check for non-printable characters?
- for(std::size_t i=0;i<str.size();i++) {
- if(escapeNonAscii && (0x80 <= static_cast<unsigned char>(str[i])))
- return false;
- }
- return true;
- }
-
- void WriteDoubleQuoteEscapeSequence(ostream_wrapper& out, int codePoint) {
- static const char hexDigits[] = "0123456789abcdef";
-
- out << "\\";
- int digits = 8;
- if(codePoint < 0xFF) {
- out << "x";
- digits = 2;
- } else if(codePoint < 0xFFFF) {
- out << "u";
- digits = 4;
- } else {
- out << "U";
- digits = 8;
- }
-
- // Write digits into the escape sequence
- for (; digits > 0; --digits)
- out << hexDigits[(codePoint >> (4 * (digits - 1))) & 0xF];
- }
-
- bool WriteAliasName(ostream_wrapper& out, const std::string& str) {
- int codePoint;
- for(std::string::const_iterator i = str.begin();
- GetNextCodePointAndAdvance(codePoint, i, str.end());
- )
- {
- if (!IsAnchorChar(codePoint))
- return false;
-
- WriteCodePoint(out, codePoint);
- }
- return true;
- }
- }
-
- StringFormat::value ComputeStringFormat(const std::string& str, EMITTER_MANIP strFormat, FlowType::value flowType, bool escapeNonAscii)
- {
- switch(strFormat) {
- case Auto:
- if(IsValidPlainScalar(str, flowType, escapeNonAscii))
- return StringFormat::Plain;
- return StringFormat::DoubleQuoted;
- case SingleQuoted:
- if(IsValidSingleQuotedScalar(str, escapeNonAscii))
- return StringFormat::SingleQuoted;
- return StringFormat::DoubleQuoted;
- case DoubleQuoted:
- return StringFormat::DoubleQuoted;
- case Literal:
- if(IsValidLiteralScalar(str, flowType, escapeNonAscii))
- return StringFormat::Literal;
- return StringFormat::DoubleQuoted;
- default:
- break;
- }
-
- return StringFormat::DoubleQuoted;
- }
-
- bool WriteSingleQuotedString(ostream_wrapper& out, const std::string& str)
- {
- out << "'";
- int codePoint;
- for(std::string::const_iterator i = str.begin();
- GetNextCodePointAndAdvance(codePoint, i, str.end());
- )
- {
- if (codePoint == '\n')
- return false; // We can't handle a new line and the attendant indentation yet
-
- if (codePoint == '\'')
- out << "''";
- else
- WriteCodePoint(out, codePoint);
- }
- out << "'";
- return true;
- }
-
- bool WriteDoubleQuotedString(ostream_wrapper& out, const std::string& str, bool escapeNonAscii)
- {
- out << "\"";
- int codePoint;
- for(std::string::const_iterator i = str.begin();
- GetNextCodePointAndAdvance(codePoint, i, str.end());
- )
- {
- switch(codePoint) {
- case '\"': out << "\\\""; break;
- case '\\': out << "\\\\"; break;
- case '\n': out << "\\n"; break;
- case '\t': out << "\\t"; break;
- case '\r': out << "\\r"; break;
- case '\b': out << "\\b"; break;
- default:
- if(codePoint < 0x20 || (codePoint >= 0x80 && codePoint <= 0xA0)) // Control characters and non-breaking space
- WriteDoubleQuoteEscapeSequence(out, codePoint);
-                    else if (codePoint == 0xFEFF) // Byte order marks (ZWNBSP, U+FEFF) should be escaped (YAML 1.2, sec. 5.2)
- WriteDoubleQuoteEscapeSequence(out, codePoint);
- else if (escapeNonAscii && codePoint > 0x7E)
- WriteDoubleQuoteEscapeSequence(out, codePoint);
- else
- WriteCodePoint(out, codePoint);
- }
- }
- out << "\"";
- return true;
- }
-
- bool WriteLiteralString(ostream_wrapper& out, const std::string& str, int indent)
- {
- out << "|\n";
- out << IndentTo(indent);
- int codePoint;
- for(std::string::const_iterator i = str.begin();
- GetNextCodePointAndAdvance(codePoint, i, str.end());
- )
- {
- if (codePoint == '\n')
- out << "\n" << IndentTo(indent);
- else
- WriteCodePoint(out, codePoint);
- }
- return true;
- }
-
- bool WriteChar(ostream_wrapper& out, char ch)
- {
- if(('a' <= ch && ch <= 'z') || ('A' <= ch && ch <= 'Z'))
- out << ch;
- else if((0x20 <= ch && ch <= 0x7e) || ch == ' ')
- out << "\"" << ch << "\"";
- else if(ch == '\t')
- out << "\"\\t\"";
- else if(ch == '\n')
- out << "\"\\n\"";
- else if(ch == '\b')
- out << "\"\\b\"";
- else {
- out << "\"";
- WriteDoubleQuoteEscapeSequence(out, ch);
- out << "\"";
- }
- return true;
- }
-
- bool WriteComment(ostream_wrapper& out, const std::string& str, int postCommentIndent)
- {
- const unsigned curIndent = out.col();
- out << "#" << Indentation(postCommentIndent);
- out.set_comment();
- int codePoint;
- for(std::string::const_iterator i = str.begin();
- GetNextCodePointAndAdvance(codePoint, i, str.end());
- )
- {
- if(codePoint == '\n') {
- out << "\n" << IndentTo(curIndent) << "#" << Indentation(postCommentIndent);
- out.set_comment();
- } else {
- WriteCodePoint(out, codePoint);
- }
- }
- return true;
- }
-
- bool WriteAlias(ostream_wrapper& out, const std::string& str)
- {
- out << "*";
- return WriteAliasName(out, str);
- }
-
- bool WriteAnchor(ostream_wrapper& out, const std::string& str)
- {
- out << "&";
- return WriteAliasName(out, str);
- }
-
- bool WriteTag(ostream_wrapper& out, const std::string& str, bool verbatim)
- {
- out << (verbatim ? "!<" : "!");
- StringCharSource buffer(str.c_str(), str.size());
- const RegEx& reValid = verbatim ? Exp::URI() : Exp::Tag();
- while(buffer) {
- int n = reValid.Match(buffer);
- if(n <= 0)
- return false;
-
- while(--n >= 0) {
- out << buffer[0];
- ++buffer;
- }
- }
- if (verbatim)
- out << ">";
- return true;
- }
-
- bool WriteTagWithPrefix(ostream_wrapper& out, const std::string& prefix, const std::string& tag)
- {
- out << "!";
- StringCharSource prefixBuffer(prefix.c_str(), prefix.size());
- while(prefixBuffer) {
- int n = Exp::URI().Match(prefixBuffer);
- if(n <= 0)
- return false;
-
- while(--n >= 0) {
- out << prefixBuffer[0];
- ++prefixBuffer;
- }
- }
-
- out << "!";
- StringCharSource tagBuffer(tag.c_str(), tag.size());
- while(tagBuffer) {
- int n = Exp::Tag().Match(tagBuffer);
- if(n <= 0)
- return false;
-
- while(--n >= 0) {
- out << tagBuffer[0];
- ++tagBuffer;
- }
- }
- return true;
- }
-
- bool WriteBinary(ostream_wrapper& out, const Binary& binary)
- {
- WriteDoubleQuotedString(out, EncodeBase64(binary.data(), binary.size()), false);
- return true;
- }
- }
-}
-
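ComputeStringFormat() above is the decision point for quoting. A small sketch of how the four StringFormat values surface, assuming the stream operators and string-format manipulators of this yaml-cpp generation:

    #include <yaml-cpp/yaml.h>
    #include <iostream>

    int main() {
        YAML::Emitter out;
        out << YAML::BeginSeq;
        out << "plain";                            // passes IsValidPlainScalar(), emitted bare
        out << "needs: quoting";                   // ": " fails the plain-scalar rules -> double-quoted
        out << YAML::SingleQuoted << "it's fine";  // WriteSingleQuotedString() doubles the apostrophe
        out << YAML::Literal << "line 1\nline 2";  // WriteLiteralString() emits a | block scalar
        out << YAML::EndSeq;
        std::cout << out.c_str() << "\n";
        return 0;
    }
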
diff --git a/ext/src/yaml-cpp/emitterutils.h b/ext/src/yaml-cpp/emitterutils.h
deleted file mode 100644
index 50b37f0..0000000
--- a/ext/src/yaml-cpp/emitterutils.h
+++ /dev/null
@@ -1,36 +0,0 @@
-#ifndef EMITTERUTILS_H_62B23520_7C8E_11DE_8A39_0800200C9A66
-#define EMITTERUTILS_H_62B23520_7C8E_11DE_8A39_0800200C9A66
-
-#if defined(_MSC_VER) || (defined(__GNUC__) && (__GNUC__ == 3 && __GNUC_MINOR__ >= 4) || (__GNUC__ >= 4)) // GCC supports "pragma once" correctly since 3.4
-#pragma once
-#endif
-
-
-#include "emitterstate.h"
-#include "yaml-cpp/ostream_wrapper.h"
-#include <string>
-
-namespace YAML
-{
- class Binary;
-
- struct StringFormat { enum value { Plain, SingleQuoted, DoubleQuoted, Literal }; };
-
- namespace Utils
- {
- StringFormat::value ComputeStringFormat(const std::string& str, EMITTER_MANIP strFormat, FlowType::value flowType, bool escapeNonAscii);
-
- bool WriteSingleQuotedString(ostream_wrapper& out, const std::string& str);
- bool WriteDoubleQuotedString(ostream_wrapper& out, const std::string& str, bool escapeNonAscii);
- bool WriteLiteralString(ostream_wrapper& out, const std::string& str, int indent);
- bool WriteChar(ostream_wrapper& out, char ch);
- bool WriteComment(ostream_wrapper& out, const std::string& str, int postCommentIndent);
- bool WriteAlias(ostream_wrapper& out, const std::string& str);
- bool WriteAnchor(ostream_wrapper& out, const std::string& str);
- bool WriteTag(ostream_wrapper& out, const std::string& str, bool verbatim);
- bool WriteTagWithPrefix(ostream_wrapper& out, const std::string& prefix, const std::string& tag);
- bool WriteBinary(ostream_wrapper& out, const Binary& binary);
- }
-}
-
-#endif // EMITTERUTILS_H_62B23520_7C8E_11DE_8A39_0800200C9A66
diff --git a/ext/src/yaml-cpp/exp.cpp b/ext/src/yaml-cpp/exp.cpp
deleted file mode 100644
index 7bc5454..0000000
--- a/ext/src/yaml-cpp/exp.cpp
+++ /dev/null
@@ -1,113 +0,0 @@
-#include "exp.h"
-#include "yaml-cpp/exceptions.h"
-#include <sstream>
-
-namespace YAML
-{
- namespace Exp
- {
- unsigned ParseHex(const std::string& str, const Mark& mark)
- {
- unsigned value = 0;
- for(std::size_t i=0;i<str.size();i++) {
- char ch = str[i];
- int digit = 0;
- if('a' <= ch && ch <= 'f')
- digit = ch - 'a' + 10;
- else if('A' <= ch && ch <= 'F')
- digit = ch - 'A' + 10;
- else if('0' <= ch && ch <= '9')
- digit = ch - '0';
- else
- throw ParserException(mark, ErrorMsg::INVALID_HEX);
-
- value = (value << 4) + digit;
- }
-
- return value;
- }
-
- std::string Str(unsigned ch)
- {
- return std::string(1, static_cast<char>(ch));
- }
-
- // Escape
- // . Translates the next 'codeLength' characters into a hex number and returns the result.
- // . Throws if it's not actually hex.
- std::string Escape(Stream& in, int codeLength)
- {
- // grab string
- std::string str;
- for(int i=0;i<codeLength;i++)
- str += in.get();
-
- // get the value
- unsigned value = ParseHex(str, in.mark());
-
- // legal unicode?
- if((value >= 0xD800 && value <= 0xDFFF) || value > 0x10FFFF) {
- std::stringstream msg;
- msg << ErrorMsg::INVALID_UNICODE << value;
- throw ParserException(in.mark(), msg.str());
- }
-
- // now break it up into chars
- if(value <= 0x7F)
- return Str(value);
- else if(value <= 0x7FF)
- return Str(0xC0 + (value >> 6)) + Str(0x80 + (value & 0x3F));
- else if(value <= 0xFFFF)
- return Str(0xE0 + (value >> 12)) + Str(0x80 + ((value >> 6) & 0x3F)) + Str(0x80 + (value & 0x3F));
- else
- return Str(0xF0 + (value >> 18)) + Str(0x80 + ((value >> 12) & 0x3F)) +
- Str(0x80 + ((value >> 6) & 0x3F)) + Str(0x80 + (value & 0x3F));
- }
-
- // Escape
-        // . Escapes the sequence starting at 'in' (it must begin with a backslash or a single quote)
- // and returns the result.
- // . Throws if it's an unknown escape character.
- std::string Escape(Stream& in)
- {
-            // consume the escape character (a backslash or a single quote)
- char escape = in.get();
-
- // switch on escape character
- char ch = in.get();
-
- // first do single quote, since it's easier
- if(escape == '\'' && ch == '\'')
- return "\'";
-
- // now do the slash (we're not gonna check if it's a slash - you better pass one!)
- switch(ch) {
- case '0': return std::string(1, '\x00');
- case 'a': return "\x07";
- case 'b': return "\x08";
- case 't':
- case '\t': return "\x09";
- case 'n': return "\x0A";
- case 'v': return "\x0B";
- case 'f': return "\x0C";
- case 'r': return "\x0D";
- case 'e': return "\x1B";
- case ' ': return "\x20";
- case '\"': return "\"";
- case '\'': return "\'";
- case '\\': return "\\";
- case '/': return "/";
- case 'N': return "\x85";
- case '_': return "\xA0";
- case 'L': return "\xE2\x80\xA8"; // LS (#x2028)
- case 'P': return "\xE2\x80\xA9"; // PS (#x2029)
- case 'x': return Escape(in, 2);
- case 'u': return Escape(in, 4);
- case 'U': return Escape(in, 8);
- }
-
- std::stringstream msg;
- throw ParserException(in.mark(), std::string(ErrorMsg::INVALID_ESCAPE) + ch);
- }
- }
-}
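Escape() above is the parser-side counterpart of the emitter's quoting: the table covers both the single-character escapes and the \x/\u/\U forms. A sketch of the observable behaviour, assuming the YAML::Load() entry point:

    #include <yaml-cpp/yaml.h>
    #include <cassert>

    int main() {
        // \x41 and \u0042 go through Escape(in, 2) / Escape(in, 4); \t and \n
        // are handled by the switch above.
        YAML::Node n = YAML::Load("\"\\x41\\u0042\\tC\\n\"");
        assert(n.as<std::string>() == "AB\tC\n");
        return 0;
    }
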
diff --git a/ext/src/yaml-cpp/exp.h b/ext/src/yaml-cpp/exp.h
deleted file mode 100644
index 52bfd0a..0000000
--- a/ext/src/yaml-cpp/exp.h
+++ /dev/null
@@ -1,196 +0,0 @@
-#ifndef EXP_H_62B23520_7C8E_11DE_8A39_0800200C9A66
-#define EXP_H_62B23520_7C8E_11DE_8A39_0800200C9A66
-
-#if defined(_MSC_VER) || (defined(__GNUC__) && (__GNUC__ == 3 && __GNUC_MINOR__ >= 4) || (__GNUC__ >= 4)) // GCC supports "pragma once" correctly since 3.4
-#pragma once
-#endif
-
-
-#include "regex.h"
-#include <string>
-#include <ios>
-#include "stream.h"
-
-namespace YAML
-{
- ////////////////////////////////////////////////////////////////////////////////
- // Here we store a bunch of expressions for matching different parts of the file.
-
- namespace Exp
- {
- // misc
- inline const RegEx& Space() {
- static const RegEx e = RegEx(' ');
- return e;
- }
- inline const RegEx& Tab() {
- static const RegEx e = RegEx('\t');
- return e;
- }
- inline const RegEx& Blank() {
- static const RegEx e = Space() || Tab();
- return e;
- }
- inline const RegEx& Break() {
- static const RegEx e = RegEx('\n') || RegEx("\r\n");
- return e;
- }
- inline const RegEx& BlankOrBreak() {
- static const RegEx e = Blank() || Break();
- return e;
- }
- inline const RegEx& Digit() {
- static const RegEx e = RegEx('0', '9');
- return e;
- }
- inline const RegEx& Alpha() {
- static const RegEx e = RegEx('a', 'z') || RegEx('A', 'Z');
- return e;
- }
- inline const RegEx& AlphaNumeric() {
- static const RegEx e = Alpha() || Digit();
- return e;
- }
- inline const RegEx& Word() {
- static const RegEx e = AlphaNumeric() || RegEx('-');
- return e;
- }
- inline const RegEx& Hex() {
- static const RegEx e = Digit() || RegEx('A', 'F') || RegEx('a', 'f');
- return e;
- }
- // Valid Unicode code points that are not part of c-printable (YAML 1.2, sec. 5.1)
- inline const RegEx& NotPrintable() {
- static const RegEx e = RegEx(0) ||
- RegEx("\x01\x02\x03\x04\x05\x06\x07\x08\x0B\x0C\x7F", REGEX_OR) ||
- RegEx(0x0E, 0x1F) ||
- (RegEx('\xC2') + (RegEx('\x80', '\x84') || RegEx('\x86', '\x9F')));
- return e;
- }
- inline const RegEx& Utf8_ByteOrderMark() {
- static const RegEx e = RegEx("\xEF\xBB\xBF");
- return e;
- }
-
- // actual tags
-
- inline const RegEx& DocStart() {
- static const RegEx e = RegEx("---") + (BlankOrBreak() || RegEx());
- return e;
- }
- inline const RegEx& DocEnd() {
- static const RegEx e = RegEx("...") + (BlankOrBreak() || RegEx());
- return e;
- }
- inline const RegEx& DocIndicator() {
- static const RegEx e = DocStart() || DocEnd();
- return e;
- }
- inline const RegEx& BlockEntry() {
- static const RegEx e = RegEx('-') + (BlankOrBreak() || RegEx());
- return e;
- }
- inline const RegEx& Key() {
- static const RegEx e = RegEx('?') + BlankOrBreak();
- return e;
- }
- inline const RegEx& KeyInFlow() {
- static const RegEx e = RegEx('?') + BlankOrBreak();
- return e;
- }
- inline const RegEx& Value() {
- static const RegEx e = RegEx(':') + (BlankOrBreak() || RegEx());
- return e;
- }
- inline const RegEx& ValueInFlow() {
- static const RegEx e = RegEx(':') + (BlankOrBreak() || RegEx(",}", REGEX_OR));
- return e;
- }
- inline const RegEx& ValueInJSONFlow() {
- static const RegEx e = RegEx(':');
- return e;
- }
-        inline const RegEx& Comment() {
- static const RegEx e = RegEx('#');
- return e;
- }
- inline const RegEx& Anchor() {
- static const RegEx e = !(RegEx("[]{},", REGEX_OR) || BlankOrBreak());
- return e;
- }
- inline const RegEx& AnchorEnd() {
- static const RegEx e = RegEx("?:,]}%@`", REGEX_OR) || BlankOrBreak();
- return e;
- }
- inline const RegEx& URI() {
- static const RegEx e = Word() || RegEx("#;/?:@&=+$,_.!~*'()[]", REGEX_OR) || (RegEx('%') + Hex() + Hex());
- return e;
- }
- inline const RegEx& Tag() {
- static const RegEx e = Word() || RegEx("#;/?:@&=+$_.~*'", REGEX_OR) || (RegEx('%') + Hex() + Hex());
- return e;
- }
-
- // Plain scalar rules:
- // . Cannot start with a blank.
- // . Can never start with any of , [ ] { } # & * ! | > \' \" % @ `
-        // . In the block context, - ? : must not be followed by a space.
-        // . In the flow context, ? is illegal, and : and - must not be followed by a space.
- inline const RegEx& PlainScalar() {
- static const RegEx e = !(BlankOrBreak() || RegEx(",[]{}#&*!|>\'\"%@`", REGEX_OR) || (RegEx("-?:", REGEX_OR) + (BlankOrBreak() || RegEx())));
- return e;
- }
- inline const RegEx& PlainScalarInFlow() {
- static const RegEx e = !(BlankOrBreak() || RegEx("?,[]{}#&*!|>\'\"%@`", REGEX_OR) || (RegEx("-:", REGEX_OR) + Blank()));
- return e;
- }
- inline const RegEx& EndScalar() {
- static const RegEx e = RegEx(':') + (BlankOrBreak() || RegEx());
- return e;
- }
- inline const RegEx& EndScalarInFlow() {
- static const RegEx e = (RegEx(':') + (BlankOrBreak() || RegEx() || RegEx(",]}", REGEX_OR))) || RegEx(",?[]{}", REGEX_OR);
- return e;
- }
-
- inline const RegEx& EscSingleQuote() {
- static const RegEx e = RegEx("\'\'");
- return e;
- }
- inline const RegEx& EscBreak() {
- static const RegEx e = RegEx('\\') + Break();
- return e;
- }
-
- inline const RegEx& ChompIndicator() {
- static const RegEx e = RegEx("+-", REGEX_OR);
- return e;
- }
- inline const RegEx& Chomp() {
- static const RegEx e = (ChompIndicator() + Digit()) || (Digit() + ChompIndicator()) || ChompIndicator() || Digit();
- return e;
- }
-
- // and some functions
- std::string Escape(Stream& in);
- }
-
- namespace Keys
- {
- const char Directive = '%';
- const char FlowSeqStart = '[';
- const char FlowSeqEnd = ']';
- const char FlowMapStart = '{';
- const char FlowMapEnd = '}';
- const char FlowEntry = ',';
- const char Alias = '*';
- const char Anchor = '&';
- const char Tag = '!';
- const char LiteralScalar = '|';
- const char FoldedScalar = '>';
- const char VerbatimTagStart = '<';
- const char VerbatimTagEnd = '>';
- }
-}
-
-#endif // EXP_H_62B23520_7C8E_11DE_8A39_0800200C9A66
diff --git a/ext/src/yaml-cpp/indentation.h b/ext/src/yaml-cpp/indentation.h
deleted file mode 100644
index 426fcb5..0000000
--- a/ext/src/yaml-cpp/indentation.h
+++ /dev/null
@@ -1,38 +0,0 @@
-#ifndef INDENTATION_H_62B23520_7C8E_11DE_8A39_0800200C9A66
-#define INDENTATION_H_62B23520_7C8E_11DE_8A39_0800200C9A66
-
-#if defined(_MSC_VER) || (defined(__GNUC__) && (__GNUC__ == 3 && __GNUC_MINOR__ >= 4) || (__GNUC__ >= 4)) // GCC supports "pragma once" correctly since 3.4
-#pragma once
-#endif
-
-
-#include "yaml-cpp/ostream_wrapper.h"
-#include <iostream>
-
-namespace YAML
-{
- struct Indentation {
- Indentation(unsigned n_): n(n_) {}
- unsigned n;
- };
-
- inline ostream_wrapper& operator << (ostream_wrapper& out, const Indentation& indent) {
- for(unsigned i=0;i<indent.n;i++)
- out << ' ';
- return out;
- }
-
- struct IndentTo {
- IndentTo(unsigned n_): n(n_) {}
- unsigned n;
- };
-
- inline ostream_wrapper& operator << (ostream_wrapper& out, const IndentTo& indent) {
- while(out.col() < indent.n)
- out << ' ';
- return out;
- }
-}
-
-
-#endif // INDENTATION_H_62B23520_7C8E_11DE_8A39_0800200C9A66
diff --git a/ext/src/yaml-cpp/memory.cpp b/ext/src/yaml-cpp/memory.cpp
deleted file mode 100644
index 98d0dfb..0000000
--- a/ext/src/yaml-cpp/memory.cpp
+++ /dev/null
@@ -1,29 +0,0 @@
-#include "yaml-cpp/node/detail/memory.h"
-#include "yaml-cpp/node/detail/node.h"
-
-namespace YAML
-{
- namespace detail
- {
- void memory_holder::merge(memory_holder& rhs)
- {
- if(m_pMemory == rhs.m_pMemory)
- return;
-
- m_pMemory->merge(*rhs.m_pMemory);
- rhs.m_pMemory = m_pMemory;
- }
-
- node& memory::create_node()
- {
- shared_node pNode(new node);
- m_nodes.insert(pNode);
- return *pNode;
- }
-
- void memory::merge(const memory& rhs)
- {
- m_nodes.insert(rhs.m_nodes.begin(), rhs.m_nodes.end());
- }
- }
-}
diff --git a/ext/src/yaml-cpp/node.cpp b/ext/src/yaml-cpp/node.cpp
deleted file mode 100644
index 2d21aa9..0000000
--- a/ext/src/yaml-cpp/node.cpp
+++ /dev/null
@@ -1,14 +0,0 @@
-#include "yaml-cpp/node/node.h"
-#include "nodebuilder.h"
-#include "nodeevents.h"
-
-namespace YAML
-{
- Node Clone(const Node& node)
- {
- NodeEvents events(node);
- NodeBuilder builder;
- events.Emit(builder);
- return builder.Root();
- }
-}
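Clone() above is the public deep-copy helper: it replays the node through NodeEvents into a fresh NodeBuilder. A minimal usage sketch, assuming YAML::Load() and the reference-style Node handles of this API:

    #include <yaml-cpp/yaml.h>
    #include <cassert>

    int main() {
        YAML::Node original = YAML::Load("{a: 1}");
        YAML::Node copy = YAML::Clone(original);  // independent deep copy
        copy["a"] = 2;
        assert(original["a"].as<int>() == 1);     // the source node is untouched
        return 0;
    }
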
diff --git a/ext/src/yaml-cpp/node_data.cpp b/ext/src/yaml-cpp/node_data.cpp
deleted file mode 100644
index 822aae1..0000000
--- a/ext/src/yaml-cpp/node_data.cpp
+++ /dev/null
@@ -1,302 +0,0 @@
-#include "yaml-cpp/node/detail/node_data.h"
-#include "yaml-cpp/node/detail/memory.h"
-#include "yaml-cpp/node/detail/node.h"
-#include "yaml-cpp/exceptions.h"
-#include <sstream>
-
-namespace YAML
-{
- namespace detail
- {
- std::string node_data::empty_scalar;
-
- bool node_cmp::operator()(const node *lhs, const node *rhs) const {
- if (lhs->is_defined() && rhs->is_defined())
- return lhs->scalar() < rhs->scalar();
-
- return lhs < rhs;
- }
-
- node_data::node_data(): m_isDefined(false), m_type(NodeType::Null), m_seqSize(0)
- {
- }
-
- void node_data::mark_defined()
- {
- if(m_type == NodeType::Undefined)
- m_type = NodeType::Null;
- m_isDefined = true;
- }
-
- void node_data::set_type(NodeType::value type)
- {
- if(type == NodeType::Undefined) {
- m_type = type;
- m_isDefined = false;
- return;
- }
-
-
- m_isDefined = true;
- if(type == m_type)
- return;
-
- m_type = type;
-
- switch(m_type) {
- case NodeType::Null:
- break;
- case NodeType::Scalar:
- m_scalar.clear();
- break;
- case NodeType::Sequence:
- reset_sequence();
- break;
- case NodeType::Map:
- reset_map();
- break;
- case NodeType::Undefined:
- assert(false);
- break;
- }
- }
-
- void node_data::set_tag(const std::string& tag)
- {
- m_tag = tag;
- }
-
- void node_data::set_null()
- {
- m_isDefined = true;
- m_type = NodeType::Null;
- }
-
- void node_data::set_scalar(const std::string& scalar)
- {
- m_isDefined = true;
- m_type = NodeType::Scalar;
- m_scalar = scalar;
- }
-
- // size/iterator
- std::size_t node_data::size() const
- {
- if(!m_isDefined)
- return 0;
-
- switch(m_type) {
- case NodeType::Sequence: compute_seq_size(); return m_seqSize;
- case NodeType::Map: compute_map_size(); return m_map.size() - m_undefinedPairs.size();
- default:
- return 0;
- }
- return 0;
- }
-
- void node_data::compute_seq_size() const
- {
- while(m_seqSize < m_sequence.size() && m_sequence[m_seqSize]->is_defined())
- m_seqSize++;
- }
-
- void node_data::compute_map_size() const
- {
- kv_pairs::iterator it = m_undefinedPairs.begin();
- while(it != m_undefinedPairs.end()) {
- kv_pairs::iterator jt = boost::next(it);
- if(it->first->is_defined() && it->second->is_defined())
- m_undefinedPairs.erase(it);
- it = jt;
- }
- }
-
- const_node_iterator node_data::begin() const
- {
- if(!m_isDefined)
- return const_node_iterator();
-
- switch(m_type) {
- case NodeType::Sequence: return const_node_iterator(m_sequence.begin());
- case NodeType::Map: return const_node_iterator(m_map.begin(), m_map.end());
- default: return const_node_iterator();
- }
- }
-
- node_iterator node_data::begin()
- {
- if(!m_isDefined)
- return node_iterator();
-
- switch(m_type) {
- case NodeType::Sequence: return node_iterator(m_sequence.begin());
- case NodeType::Map: return node_iterator(m_map.begin(), m_map.end());
- default: return node_iterator();
- }
- }
-
- const_node_iterator node_data::end() const
- {
- if(!m_isDefined)
- return const_node_iterator();
-
- switch(m_type) {
- case NodeType::Sequence: return const_node_iterator(m_sequence.end());
- case NodeType::Map: return const_node_iterator(m_map.end(), m_map.end());
- default: return const_node_iterator();
- }
- }
-
- node_iterator node_data::end()
- {
- if(!m_isDefined)
- return node_iterator();
-
- switch(m_type) {
- case NodeType::Sequence: return node_iterator(m_sequence.end());
- case NodeType::Map: return node_iterator(m_map.end(), m_map.end());
- default: return node_iterator();
- }
- }
-
- // sequence
- void node_data::push_back(node& node, shared_memory_holder /* pMemory */)
- {
- if(m_type == NodeType::Undefined || m_type == NodeType::Null) {
- m_type = NodeType::Sequence;
- reset_sequence();
- }
-
- if(m_type != NodeType::Sequence)
- throw BadPushback();
-
- m_sequence.push_back(&node);
- }
-
- void node_data::insert(node& key, node& value, shared_memory_holder pMemory)
- {
- switch(m_type) {
- case NodeType::Map:
- break;
- case NodeType::Undefined:
- case NodeType::Null:
- case NodeType::Sequence:
- convert_to_map(pMemory);
- break;
- case NodeType::Scalar:
- throw BadSubscript();
- }
-
- insert_map_pair(key, value);
- }
-
- // indexing
- node& node_data::get(node& key, shared_memory_holder pMemory) const
- {
- if(m_type != NodeType::Map)
- return pMemory->create_node();
-
- for(node_map::const_iterator it=m_map.begin();it!=m_map.end();++it) {
- if(it->first->is(key))
- return *it->second;
- }
-
- return pMemory->create_node();
- }
-
- node& node_data::get(node& key, shared_memory_holder pMemory)
- {
- switch(m_type) {
- case NodeType::Map:
- break;
- case NodeType::Undefined:
- case NodeType::Null:
- case NodeType::Sequence:
- convert_to_map(pMemory);
- break;
- case NodeType::Scalar:
- throw BadSubscript();
- }
-
- for(node_map::const_iterator it=m_map.begin();it!=m_map.end();++it) {
- if(it->first->is(key))
- return *it->second;
- }
-
- node& value = pMemory->create_node();
- insert_map_pair(key, value);
- return value;
- }
-
- bool node_data::remove(node& key, shared_memory_holder /* pMemory */)
- {
- if(m_type != NodeType::Map)
- return false;
-
- for(node_map::iterator it=m_map.begin();it!=m_map.end();++it) {
- if(it->first->is(key)) {
- m_map.erase(it);
- return true;
- }
- }
-
- return false;
- }
-
- void node_data::reset_sequence()
- {
- m_sequence.clear();
- m_seqSize = 0;
- }
-
- void node_data::reset_map()
- {
- m_map.clear();
- m_undefinedPairs.clear();
- }
-
- void node_data::insert_map_pair(node& key, node& value)
- {
- m_map[&key] = &value;
- if(!key.is_defined() || !value.is_defined())
- m_undefinedPairs.push_back(kv_pair(&key, &value));
- }
-
- void node_data::convert_to_map(shared_memory_holder pMemory)
- {
- switch(m_type) {
- case NodeType::Undefined:
- case NodeType::Null:
- reset_map();
- m_type = NodeType::Map;
- break;
- case NodeType::Sequence:
- convert_sequence_to_map(pMemory);
- break;
- case NodeType::Map:
- break;
- case NodeType::Scalar:
- assert(false);
- break;
- }
- }
-
- void node_data::convert_sequence_to_map(shared_memory_holder pMemory)
- {
- assert(m_type == NodeType::Sequence);
-
- reset_map();
- for(std::size_t i=0;i<m_sequence.size();i++) {
- std::stringstream stream;
- stream << i;
-
- node& key = pMemory->create_node();
- key.set_scalar(stream.str());
- insert_map_pair(key, *m_sequence[i]);
- }
-
- reset_sequence();
- m_type = NodeType::Map;
- }
- }
-}
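The convert_to_map()/convert_sequence_to_map() logic above is what makes mixed use of push_back() and operator[] legal: existing sequence elements are re-keyed by their index ("0", "1", ...). A sketch, assuming the public Node interface:

    #include <yaml-cpp/yaml.h>
    #include <cassert>

    int main() {
        YAML::Node n;             // starts out null
        n.push_back("first");     // node_data::push_back(): becomes a sequence
        n["name"] = "value";      // insert() on a sequence -> convert_sequence_to_map()
        assert(n.IsMap());
        assert(n.size() == 2);    // {"0": "first", "name": "value"}
        return 0;
    }
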
diff --git a/ext/src/yaml-cpp/nodebuilder.cpp b/ext/src/yaml-cpp/nodebuilder.cpp
deleted file mode 100644
index 6735f73..0000000
--- a/ext/src/yaml-cpp/nodebuilder.cpp
+++ /dev/null
@@ -1,138 +0,0 @@
-#include "nodebuilder.h"
-#include "yaml-cpp/mark.h"
-#include "yaml-cpp/node/node.h"
-#include "yaml-cpp/node/impl.h"
-#include <cassert>
-
-namespace YAML
-{
- NodeBuilder::NodeBuilder(): m_pMemory(new detail::memory_holder), m_pRoot(0), m_mapDepth(0)
- {
- m_anchors.push_back(0); // since the anchors start at 1
- }
-
- NodeBuilder::~NodeBuilder()
- {
- }
-
- Node NodeBuilder::Root()
- {
- if(!m_pRoot)
- return Node();
-
- return Node(*m_pRoot, m_pMemory);
- }
-
- void NodeBuilder::OnDocumentStart(const Mark&)
- {
- }
-
- void NodeBuilder::OnDocumentEnd()
- {
- }
-
- void NodeBuilder::OnNull(const Mark& /* mark */, anchor_t anchor)
- {
- detail::node& node = Push(anchor);
- node.set_null();
- Pop();
- }
-
- void NodeBuilder::OnAlias(const Mark& /* mark */, anchor_t anchor)
- {
- detail::node& node = *m_anchors[anchor];
- Push(node);
- Pop();
- }
-
- void NodeBuilder::OnScalar(const Mark& /* mark */, const std::string& tag, anchor_t anchor, const std::string& value)
- {
- detail::node& node = Push(anchor);
- node.set_scalar(value);
- node.set_tag(tag);
- Pop();
- }
-
- void NodeBuilder::OnSequenceStart(const Mark& /* mark */, const std::string& tag, anchor_t anchor)
- {
- detail::node& node = Push(anchor);
- node.set_tag(tag);
- node.set_type(NodeType::Sequence);
- }
-
- void NodeBuilder::OnSequenceEnd()
- {
- Pop();
- }
-
- void NodeBuilder::OnMapStart(const Mark& /* mark */, const std::string& tag, anchor_t anchor)
- {
- detail::node& node = Push(anchor);
- node.set_type(NodeType::Map);
- node.set_tag(tag);
- m_mapDepth++;
- }
-
- void NodeBuilder::OnMapEnd()
- {
- assert(m_mapDepth > 0);
- m_mapDepth--;
- Pop();
- }
-
- detail::node& NodeBuilder::Push(anchor_t anchor)
- {
- detail::node& node = m_pMemory->create_node();
- RegisterAnchor(anchor, node);
- Push(node);
- return node;
- }
-
- void NodeBuilder::Push(detail::node& node)
- {
- const bool needsKey = (!m_stack.empty() && m_stack.back()->type() == NodeType::Map && m_keys.size() < m_mapDepth);
-
- m_stack.push_back(&node);
- if(needsKey)
- m_keys.push_back(PushedKey(&node, false));
- }
-
- void NodeBuilder::Pop()
- {
- assert(!m_stack.empty());
- if(m_stack.size() == 1) {
- m_pRoot = m_stack[0];
- m_stack.pop_back();
- return;
- }
-
- detail::node& node = *m_stack.back();
- m_stack.pop_back();
-
- detail::node& collection = *m_stack.back();
-
- if(collection.type() == NodeType::Sequence) {
- collection.push_back(node, m_pMemory);
- } else if(collection.type() == NodeType::Map) {
- assert(!m_keys.empty());
- PushedKey& key = m_keys.back();
- if(key.second) {
- collection.insert(*key.first, node, m_pMemory);
- m_keys.pop_back();
- } else {
- key.second = true;
- }
- } else {
- assert(false);
- m_stack.clear();
- }
- }
-
- void NodeBuilder::RegisterAnchor(anchor_t anchor, detail::node& node)
- {
- if(anchor) {
- assert(anchor == m_anchors.size());
- m_anchors.push_back(&node);
- }
- }
-}
diff --git a/ext/src/yaml-cpp/nodebuilder.h b/ext/src/yaml-cpp/nodebuilder.h
deleted file mode 100644
index cce91d3..0000000
--- a/ext/src/yaml-cpp/nodebuilder.h
+++ /dev/null
@@ -1,58 +0,0 @@
-#ifndef NODE_NODEBUILDER_H_62B23520_7C8E_11DE_8A39_0800200C9A66
-#define NODE_NODEBUILDER_H_62B23520_7C8E_11DE_8A39_0800200C9A66
-
-#if defined(_MSC_VER) || (defined(__GNUC__) && (__GNUC__ == 3 && __GNUC_MINOR__ >= 4) || (__GNUC__ >= 4)) // GCC supports "pragma once" correctly since 3.4
-#pragma once
-#endif
-
-#include "yaml-cpp/eventhandler.h"
-#include "yaml-cpp/node/ptr.h"
-#include <vector>
-
-namespace YAML
-{
- class Node;
-
- class NodeBuilder: public EventHandler
- {
- public:
- NodeBuilder();
- virtual ~NodeBuilder();
-
- Node Root();
-
- virtual void OnDocumentStart(const Mark& mark);
- virtual void OnDocumentEnd();
-
- virtual void OnNull(const Mark& mark, anchor_t anchor);
- virtual void OnAlias(const Mark& mark, anchor_t anchor);
- virtual void OnScalar(const Mark& mark, const std::string& tag, anchor_t anchor, const std::string& value);
-
- virtual void OnSequenceStart(const Mark& mark, const std::string& tag, anchor_t anchor);
- virtual void OnSequenceEnd();
-
- virtual void OnMapStart(const Mark& mark, const std::string& tag, anchor_t anchor);
- virtual void OnMapEnd();
-
- private:
- detail::node& Push(anchor_t anchor);
- void Push(detail::node& node);
- void Pop();
- void RegisterAnchor(anchor_t anchor, detail::node& node);
-
- private:
- detail::shared_memory_holder m_pMemory;
- detail::node *m_pRoot;
-
- typedef std::vector<detail::node *> Nodes;
- Nodes m_stack;
- Nodes m_anchors;
-
- typedef std::pair<detail::node *, bool> PushedKey;
- std::vector<PushedKey> m_keys;
- std::size_t m_mapDepth;
- };
-}
-
-#endif // NODE_NODEBUILDER_H_62B23520_7C8E_11DE_8A39_0800200C9A66
-
diff --git a/ext/src/yaml-cpp/nodeevents.cpp b/ext/src/yaml-cpp/nodeevents.cpp
deleted file mode 100644
index 721f9c6..0000000
--- a/ext/src/yaml-cpp/nodeevents.cpp
+++ /dev/null
@@ -1,99 +0,0 @@
-#include "nodeevents.h"
-#include "yaml-cpp/node/node.h"
-#include "yaml-cpp/node/impl.h"
-#include "yaml-cpp/eventhandler.h"
-#include "yaml-cpp/mark.h"
-
-namespace YAML
-{
- void NodeEvents::AliasManager::RegisterReference(const detail::node& node)
- {
- m_anchorByIdentity.insert(std::make_pair(node.ref(), _CreateNewAnchor()));
- }
-
- anchor_t NodeEvents::AliasManager::LookupAnchor(const detail::node& node) const
- {
- AnchorByIdentity::const_iterator it = m_anchorByIdentity.find(node.ref());
- if(it == m_anchorByIdentity.end())
- return 0;
- return it->second;
- }
-
- NodeEvents::NodeEvents(const Node& node): m_pMemory(node.m_pMemory), m_root(*node.m_pNode)
- {
- Setup(m_root);
- }
-
- void NodeEvents::Setup(const detail::node& node)
- {
- int& refCount = m_refCount[node.ref()];
- refCount++;
- if(refCount > 1)
- return;
-
- if(node.type() == NodeType::Sequence) {
- for(detail::const_node_iterator it=node.begin();it!=node.end();++it)
- Setup(**it);
- } else if(node.type() == NodeType::Map) {
- for(detail::const_node_iterator it=node.begin();it!=node.end();++it) {
- Setup(*it->first);
- Setup(*it->second);
- }
- }
- }
-
- void NodeEvents::Emit(EventHandler& handler)
- {
- AliasManager am;
-
- handler.OnDocumentStart(Mark());
- Emit(m_root, handler, am);
- handler.OnDocumentEnd();
- }
-
- void NodeEvents::Emit(const detail::node& node, EventHandler& handler, AliasManager& am) const
- {
- anchor_t anchor = NullAnchor;
- if(IsAliased(node)) {
- anchor = am.LookupAnchor(node);
- if(anchor) {
- handler.OnAlias(Mark(), anchor);
- return;
- }
-
- am.RegisterReference(node);
- anchor = am.LookupAnchor(node);
- }
-
- switch(node.type()) {
- case NodeType::Undefined:
- break;
- case NodeType::Null:
- handler.OnNull(Mark(), anchor);
- break;
- case NodeType::Scalar:
- handler.OnScalar(Mark(), node.tag(), anchor, node.scalar());
- break;
- case NodeType::Sequence:
- handler.OnSequenceStart(Mark(), node.tag(), anchor);
- for(detail::const_node_iterator it=node.begin();it!=node.end();++it)
- Emit(**it, handler, am);
- handler.OnSequenceEnd();
- break;
- case NodeType::Map:
- handler.OnMapStart(Mark(), node.tag(), anchor);
- for(detail::const_node_iterator it=node.begin();it!=node.end();++it) {
- Emit(*it->first, handler, am);
- Emit(*it->second, handler, am);
- }
- handler.OnMapEnd();
- break;
- }
- }
-
- bool NodeEvents::IsAliased(const detail::node& node) const
- {
- RefCount::const_iterator it = m_refCount.find(node.ref());
- return it != m_refCount.end() && it->second > 1;
- }
-}
diff --git a/ext/src/yaml-cpp/nodeevents.h b/ext/src/yaml-cpp/nodeevents.h
deleted file mode 100644
index d142115..0000000
--- a/ext/src/yaml-cpp/nodeevents.h
+++ /dev/null
@@ -1,57 +0,0 @@
-#ifndef NODE_NODEEVENTS_H_62B23520_7C8E_11DE_8A39_0800200C9A66
-#define NODE_NODEEVENTS_H_62B23520_7C8E_11DE_8A39_0800200C9A66
-
-#if defined(_MSC_VER) || (defined(__GNUC__) && (__GNUC__ == 3 && __GNUC_MINOR__ >= 4) || (__GNUC__ >= 4)) // GCC supports "pragma once" correctly since 3.4
-#pragma once
-#endif
-
-#include "yaml-cpp/anchor.h"
-#include "yaml-cpp/node/ptr.h"
-#include <map>
-#include <vector>
-
-namespace YAML
-{
- class EventHandler;
- class Node;
-
- class NodeEvents
- {
- public:
- explicit NodeEvents(const Node& node);
-
- void Emit(EventHandler& handler);
-
- private:
- class AliasManager {
- public:
- AliasManager(): m_curAnchor(0) {}
-
- void RegisterReference(const detail::node& node);
- anchor_t LookupAnchor(const detail::node& node) const;
-
- private:
- anchor_t _CreateNewAnchor() { return ++m_curAnchor; }
-
- private:
- typedef std::map<const detail::node_ref*, anchor_t> AnchorByIdentity;
- AnchorByIdentity m_anchorByIdentity;
-
- anchor_t m_curAnchor;
- };
-
- void Setup(const detail::node& node);
- void Emit(const detail::node& node, EventHandler& handler, AliasManager& am) const;
- bool IsAliased(const detail::node& node) const;
-
- private:
- detail::shared_memory_holder m_pMemory;
- detail::node& m_root;
-
- typedef std::map<const detail::node_ref *, int> RefCount;
- RefCount m_refCount;
- };
-}
-
-#endif // NODE_NODEEVENTS_H_62B23520_7C8E_11DE_8A39_0800200C9A66
-
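For reference, a minimal sketch of how the two classes removed above fit together: replaying a node's events into a NodeBuilder produces an independent copy of the tree. The umbrella header yaml-cpp/yaml.h and having the internal headers nodebuilder.h/nodeevents.h on the include path are assumptions for illustration.

    // Sketch: NodeEvents walks an existing tree and feeds it, event by
    // event, into a NodeBuilder, which rebuilds an independent copy.
    #include "yaml-cpp/yaml.h"
    #include "nodebuilder.h"
    #include "nodeevents.h"
    #include <iostream>

    int main() {
        YAML::Node original = YAML::Load("{a: 1, b: [2, 3]}");

        YAML::NodeEvents events(original);   // counts how often each node is referenced
        YAML::NodeBuilder builder;           // an EventHandler that rebuilds a node tree
        events.Emit(builder);                // OnMapStart/OnScalar/... go to the builder

        YAML::Node copy = builder.Root();
        std::cout << copy["b"][1].as<int>() << "\n";   // prints 3
        return 0;
    }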
diff --git a/ext/src/yaml-cpp/null.cpp b/ext/src/yaml-cpp/null.cpp
deleted file mode 100644
index 37f3197..0000000
--- a/ext/src/yaml-cpp/null.cpp
+++ /dev/null
@@ -1,6 +0,0 @@
-#include "yaml-cpp/null.h"
-
-namespace YAML
-{
- _Null Null;
-}
diff --git a/ext/src/yaml-cpp/ostream_wrapper.cpp b/ext/src/yaml-cpp/ostream_wrapper.cpp
deleted file mode 100644
index 15d9f0f..0000000
--- a/ext/src/yaml-cpp/ostream_wrapper.cpp
+++ /dev/null
@@ -1,56 +0,0 @@
-#include "yaml-cpp/ostream_wrapper.h"
-#include <cstring>
-#include <iostream>
-
-namespace YAML
-{
- ostream_wrapper::ostream_wrapper(): m_pStream(0), m_pos(0), m_row(0), m_col(0), m_comment(false)
- {
- }
-
- ostream_wrapper::ostream_wrapper(std::ostream& stream): m_pStream(&stream), m_pos(0), m_row(0), m_col(0), m_comment(false)
- {
- }
-
- ostream_wrapper::~ostream_wrapper()
- {
- }
-
- void ostream_wrapper::write(const std::string& str)
- {
- if(m_pStream) {
- m_pStream->write(str.c_str(), str.size());
- } else {
- m_buffer.resize(std::max(m_buffer.size(), m_pos + str.size() + 1));
- std::copy(str.begin(), str.end(), &m_buffer[m_pos]);
- }
-
- for(std::size_t i=0;i<str.size();i++)
- update_pos(str[i]);
- }
-
- void ostream_wrapper::write(const char *str, std::size_t size)
- {
- if(m_pStream) {
- m_pStream->write(str, size);
- } else {
- m_buffer.resize(std::max(m_buffer.size(), m_pos + size + 1));
- std::copy(str, str + size, &m_buffer[m_pos]);
- }
-
- for(std::size_t i=0;i<size;i++)
- update_pos(str[i]);
- }
-
- void ostream_wrapper::update_pos(char ch)
- {
- m_pos++;
- m_col++;
-
- if(ch == '\n') {
- m_row++;
- m_col = 0;
- m_comment = false;
- }
- }
-}
diff --git a/ext/src/yaml-cpp/parse.cpp b/ext/src/yaml-cpp/parse.cpp
deleted file mode 100644
index 1537d55..0000000
--- a/ext/src/yaml-cpp/parse.cpp
+++ /dev/null
@@ -1,68 +0,0 @@
-#include "yaml-cpp/node/parse.h"
-#include "yaml-cpp/node/node.h"
-#include "yaml-cpp/node/impl.h"
-#include "yaml-cpp/parser.h"
-#include "nodebuilder.h"
-
-#include <fstream>
-#include <sstream>
-
-namespace YAML
-{
- Node Load(const std::string& input) {
- std::stringstream stream(input);
- return Load(stream);
- }
-
- Node Load(const char *input) {
- std::stringstream stream(input);
- return Load(stream);
- }
-
- Node Load(std::istream& input) {
- Parser parser(input);
- NodeBuilder builder;
- if(!parser.HandleNextDocument(builder))
- return Node();
-
- return builder.Root();
- }
-
- Node LoadFile(const std::string& filename) {
- std::ifstream fin(filename.c_str());
- if(!fin)
- throw BadFile();
- return Load(fin);
- }
-
- std::vector<Node> LoadAll(const std::string& input) {
- std::stringstream stream(input);
- return LoadAll(stream);
- }
-
- std::vector<Node> LoadAll(const char *input) {
- std::stringstream stream(input);
- return LoadAll(stream);
- }
-
- std::vector<Node> LoadAll(std::istream& input) {
- std::vector<Node> docs;
-
- Parser parser(input);
- while(1) {
- NodeBuilder builder;
- if(!parser.HandleNextDocument(builder))
- break;
- docs.push_back(builder.Root());
- }
-
- return docs;
- }
-
- std::vector<Node> LoadAllFromFile(const std::string& filename) {
- std::ifstream fin(filename.c_str());
- if(!fin)
- throw BadFile();
- return LoadAll(fin);
- }
-}
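A minimal sketch of how the Load helpers defined above were typically called; the umbrella header yaml-cpp/yaml.h and the file name config.yaml are assumptions for illustration.

    #include "yaml-cpp/yaml.h"
    #include <iostream>
    #include <vector>

    int main() {
        // Single document from a string.
        YAML::Node cfg = YAML::Load("k: 55\nmode: careful");
        std::cout << cfg["k"].as<int>() << "\n";                     // 55

        // Every document in a multi-document stream.
        std::vector<YAML::Node> docs = YAML::LoadAll("---\na: 1\n---\nb: 2\n");
        std::cout << docs.size() << " documents\n";                  // 2 documents

        // LoadFile throws YAML::BadFile if the file cannot be opened.
        try {
            YAML::Node fromDisk = YAML::LoadFile("config.yaml");     // hypothetical path
            (void)fromDisk;
        } catch(const YAML::BadFile&) {
            std::cout << "no config.yaml here\n";
        }
        return 0;
    }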
diff --git a/ext/src/yaml-cpp/parser.cpp b/ext/src/yaml-cpp/parser.cpp
deleted file mode 100644
index 7861ec1..0000000
--- a/ext/src/yaml-cpp/parser.cpp
+++ /dev/null
@@ -1,141 +0,0 @@
-#include "yaml-cpp/parser.h"
-#include "yaml-cpp/eventhandler.h"
-#include "yaml-cpp/exceptions.h"
-#include "directives.h"
-#include "scanner.h"
-#include "singledocparser.h"
-#include "tag.h"
-#include "token.h"
-#include <sstream>
-#include <cstdio>
-
-namespace YAML
-{
- Parser::Parser()
- {
- }
-
- Parser::Parser(std::istream& in)
- {
- Load(in);
- }
-
- Parser::~Parser()
- {
- }
-
- Parser::operator bool() const
- {
- return m_pScanner.get() && !m_pScanner->empty();
- }
-
- void Parser::Load(std::istream& in)
- {
- m_pScanner.reset(new Scanner(in));
- m_pDirectives.reset(new Directives);
- }
-
- // HandleNextDocument
- // . Handles the next document
- // . Throws a ParserException on error.
- // . Returns false if there are no more documents
- bool Parser::HandleNextDocument(EventHandler& eventHandler)
- {
- if(!m_pScanner.get())
- return false;
-
- ParseDirectives();
- if(m_pScanner->empty())
- return false;
-
- SingleDocParser sdp(*m_pScanner, *m_pDirectives);
- sdp.HandleDocument(eventHandler);
- return true;
- }
-
- // ParseDirectives
- // . Reads any directives that are next in the queue.
- void Parser::ParseDirectives()
- {
- bool readDirective = false;
-
- while(1) {
- if(m_pScanner->empty())
- break;
-
- Token& token = m_pScanner->peek();
- if(token.type != Token::DIRECTIVE)
- break;
-
- // we keep the directives from the last document if none are specified;
-			// but if any directives are specified, then we reset them
- if(!readDirective)
- m_pDirectives.reset(new Directives);
-
- readDirective = true;
- HandleDirective(token);
- m_pScanner->pop();
- }
- }
-
- void Parser::HandleDirective(const Token& token)
- {
- if(token.value == "YAML")
- HandleYamlDirective(token);
- else if(token.value == "TAG")
- HandleTagDirective(token);
- }
-
- // HandleYamlDirective
- // . Should be of the form 'major.minor' (like a version number)
- void Parser::HandleYamlDirective(const Token& token)
- {
- if(token.params.size() != 1)
- throw ParserException(token.mark, ErrorMsg::YAML_DIRECTIVE_ARGS);
-
- if(!m_pDirectives->version.isDefault)
- throw ParserException(token.mark, ErrorMsg::REPEATED_YAML_DIRECTIVE);
-
- std::stringstream str(token.params[0]);
- str >> m_pDirectives->version.major;
- str.get();
- str >> m_pDirectives->version.minor;
- if(!str || str.peek() != EOF)
- throw ParserException(token.mark, std::string(ErrorMsg::YAML_VERSION) + token.params[0]);
-
- if(m_pDirectives->version.major > 1)
- throw ParserException(token.mark, ErrorMsg::YAML_MAJOR_VERSION);
-
- m_pDirectives->version.isDefault = false;
- // TODO: warning on major == 1, minor > 2?
- }
-
- // HandleTagDirective
- // . Should be of the form 'handle prefix', where 'handle' is converted to 'prefix' in the file.
- void Parser::HandleTagDirective(const Token& token)
- {
- if(token.params.size() != 2)
- throw ParserException(token.mark, ErrorMsg::TAG_DIRECTIVE_ARGS);
-
- const std::string& handle = token.params[0];
- const std::string& prefix = token.params[1];
- if(m_pDirectives->tags.find(handle) != m_pDirectives->tags.end())
- throw ParserException(token.mark, ErrorMsg::REPEATED_TAG_DIRECTIVE);
-
- m_pDirectives->tags[handle] = prefix;
- }
-
- void Parser::PrintTokens(std::ostream& out)
- {
- if(!m_pScanner.get())
- return;
-
- while(1) {
- if(m_pScanner->empty())
- break;
-
- out << m_pScanner->peek() << "\n";
- m_pScanner->pop();
- }
- }
-}
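A minimal sketch of driving the Parser directly with an EventHandler, which is what Load()/LoadAll() above do internally; yaml-cpp/yaml.h and the internal nodebuilder.h header are assumed to be available.

    #include "yaml-cpp/yaml.h"
    #include "yaml-cpp/parser.h"
    #include "nodebuilder.h"
    #include <iostream>
    #include <sstream>
    #include <string>

    int main() {
        std::stringstream input("---\nname: first\n---\nname: second\n");
        YAML::Parser parser(input);

        while(true) {
            YAML::NodeBuilder builder;                  // fresh builder per document
            if(!parser.HandleNextDocument(builder))     // false once no documents remain
                break;
            YAML::Node doc = builder.Root();
            std::cout << doc["name"].as<std::string>() << "\n";
        }
        return 0;
    }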
diff --git a/ext/src/yaml-cpp/ptr_stack.h b/ext/src/yaml-cpp/ptr_stack.h
deleted file mode 100644
index eec0fb8..0000000
--- a/ext/src/yaml-cpp/ptr_stack.h
+++ /dev/null
@@ -1,49 +0,0 @@
-#ifndef PTR_STACK_H_62B23520_7C8E_11DE_8A39_0800200C9A66
-#define PTR_STACK_H_62B23520_7C8E_11DE_8A39_0800200C9A66
-
-#if defined(_MSC_VER) || (defined(__GNUC__) && (__GNUC__ == 3 && __GNUC_MINOR__ >= 4) || (__GNUC__ >= 4)) // GCC supports "pragma once" correctly since 3.4
-#pragma once
-#endif
-
-#include "yaml-cpp/noncopyable.h"
-#include <cstddef>
-#include <cstdlib>
-#include <memory>
-#include <vector>
-
-template <typename T>
-class ptr_stack: private YAML::noncopyable
-{
-public:
- ptr_stack() {}
- ~ptr_stack() { clear(); }
-
- void clear() {
- for(unsigned i=0;i<m_data.size();i++)
- delete m_data[i];
- m_data.clear();
- }
-
- std::size_t size() const { return m_data.size(); }
- bool empty() const { return m_data.empty(); }
-
- void push(std::auto_ptr<T> t) {
- m_data.push_back(NULL);
- m_data.back() = t.release();
- }
- std::auto_ptr<T> pop() {
- std::auto_ptr<T> t(m_data.back());
- m_data.pop_back();
- return t;
- }
- T& top() { return *m_data.back(); }
- const T& top() const { return *m_data.back(); }
-
- T& top(std::ptrdiff_t diff) { return **(m_data.end() - 1 + diff); }
- const T& top(std::ptrdiff_t diff) const { return **(m_data.end() - 1 + diff); }
-
-private:
- std::vector<T*> m_data;
-};
-
-#endif // PTR_STACK_H_62B23520_7C8E_11DE_8A39_0800200C9A66
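A minimal sketch of the ownership contract of ptr_stack, assuming the internal header is on the include path (pre-C++11 std::auto_ptr, exactly as the header itself requires):

    #include "ptr_stack.h"
    #include <iostream>
    #include <memory>
    #include <string>

    int main() {
        ptr_stack<std::string> stack;

        // push() takes ownership of the heap allocation...
        stack.push(std::auto_ptr<std::string>(new std::string("first")));
        stack.push(std::auto_ptr<std::string>(new std::string("second")));

        std::cout << stack.top() << "\n";       // "second"
        std::cout << stack.top(-1) << "\n";     // "first" (offset back from the top)

        // ...and pop() hands it back to the caller.
        std::auto_ptr<std::string> popped = stack.pop();
        std::cout << *popped << " popped, " << stack.size() << " left\n";
        return 0;
    }   // whatever is still on the stack is deleted here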
diff --git a/ext/src/yaml-cpp/ptr_vector.h b/ext/src/yaml-cpp/ptr_vector.h
deleted file mode 100644
index 7b936cb..0000000
--- a/ext/src/yaml-cpp/ptr_vector.h
+++ /dev/null
@@ -1,47 +0,0 @@
-#ifndef PTR_VECTOR_H_62B23520_7C8E_11DE_8A39_0800200C9A66
-#define PTR_VECTOR_H_62B23520_7C8E_11DE_8A39_0800200C9A66
-
-#if defined(_MSC_VER) || (defined(__GNUC__) && (__GNUC__ == 3 && __GNUC_MINOR__ >= 4) || (__GNUC__ >= 4)) // GCC supports "pragma once" correctly since 3.4
-#pragma once
-#endif
-
-#include "yaml-cpp/noncopyable.h"
-#include <cstddef>
-#include <cstdlib>
-#include <memory>
-#include <vector>
-
-namespace YAML {
-
- template <typename T>
- class ptr_vector: private YAML::noncopyable
- {
- public:
- ptr_vector() {}
- ~ptr_vector() { clear(); }
-
- void clear() {
- for(unsigned i=0;i<m_data.size();i++)
- delete m_data[i];
- m_data.clear();
- }
-
- std::size_t size() const { return m_data.size(); }
- bool empty() const { return m_data.empty(); }
-
- void push_back(std::auto_ptr<T> t) {
- m_data.push_back(NULL);
- m_data.back() = t.release();
- }
- T& operator[](std::size_t i) { return *m_data[i]; }
- const T& operator[](std::size_t i) const { return *m_data[i]; }
-
- T& back() { return *m_data.back(); }
- const T& back() const { return *m_data.back(); }
-
- private:
- std::vector<T*> m_data;
- };
-}
-
-#endif // PTR_VECTOR_H_62B23520_7C8E_11DE_8A39_0800200C9A66
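The same idea for ptr_vector, which the Scanner further down uses to keep its IndentMarker objects alive; again a sketch assuming the internal header is available.

    #include "ptr_vector.h"
    #include <iostream>
    #include <memory>
    #include <string>

    int main() {
        YAML::ptr_vector<std::string> names;
        names.push_back(std::auto_ptr<std::string>(new std::string("alpha")));
        names.push_back(std::auto_ptr<std::string>(new std::string("beta")));

        for(std::size_t i = 0; i < names.size(); i++)
            std::cout << names[i] << "\n";      // elements are returned by reference

        std::cout << names.back() << "\n";      // "beta"
        return 0;
    }   // both strings are deleted when names goes out of scope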
diff --git a/ext/src/yaml-cpp/regex.cpp b/ext/src/yaml-cpp/regex.cpp
deleted file mode 100644
index b35b1f4..0000000
--- a/ext/src/yaml-cpp/regex.cpp
+++ /dev/null
@@ -1,60 +0,0 @@
-#include "regex.h"
-
-namespace YAML
-{
- // constructors
- RegEx::RegEx(): m_op(REGEX_EMPTY)
- {
- }
-
- RegEx::RegEx(REGEX_OP op): m_op(op)
- {
- }
-
- RegEx::RegEx(char ch): m_op(REGEX_MATCH), m_a(ch)
- {
- }
-
- RegEx::RegEx(char a, char z): m_op(REGEX_RANGE), m_a(a), m_z(z)
- {
- }
-
- RegEx::RegEx(const std::string& str, REGEX_OP op): m_op(op)
- {
- for(std::size_t i=0;i<str.size();i++)
- m_params.push_back(RegEx(str[i]));
- }
-
- // combination constructors
- RegEx operator ! (const RegEx& ex)
- {
- RegEx ret(REGEX_NOT);
- ret.m_params.push_back(ex);
- return ret;
- }
-
- RegEx operator || (const RegEx& ex1, const RegEx& ex2)
- {
- RegEx ret(REGEX_OR);
- ret.m_params.push_back(ex1);
- ret.m_params.push_back(ex2);
- return ret;
- }
-
- RegEx operator && (const RegEx& ex1, const RegEx& ex2)
- {
- RegEx ret(REGEX_AND);
- ret.m_params.push_back(ex1);
- ret.m_params.push_back(ex2);
- return ret;
- }
-
- RegEx operator + (const RegEx& ex1, const RegEx& ex2)
- {
- RegEx ret(REGEX_SEQ);
- ret.m_params.push_back(ex1);
- ret.m_params.push_back(ex2);
- return ret;
- }
-}
-
diff --git a/ext/src/yaml-cpp/regex.h b/ext/src/yaml-cpp/regex.h
deleted file mode 100644
index 8722e62..0000000
--- a/ext/src/yaml-cpp/regex.h
+++ /dev/null
@@ -1,67 +0,0 @@
-#ifndef REGEX_H_62B23520_7C8E_11DE_8A39_0800200C9A66
-#define REGEX_H_62B23520_7C8E_11DE_8A39_0800200C9A66
-
-#if defined(_MSC_VER) || (defined(__GNUC__) && (__GNUC__ == 3 && __GNUC_MINOR__ >= 4) || (__GNUC__ >= 4)) // GCC supports "pragma once" correctly since 3.4
-#pragma once
-#endif
-
-
-#include <vector>
-#include <string>
-
-namespace YAML
-{
- class Stream;
-
- enum REGEX_OP { REGEX_EMPTY, REGEX_MATCH, REGEX_RANGE, REGEX_OR, REGEX_AND, REGEX_NOT, REGEX_SEQ };
-
- // simplified regular expressions
- // . Only straightforward matches (no repeated characters)
- // . Only matches from start of string
- class RegEx
- {
- public:
- RegEx();
- RegEx(char ch);
- RegEx(char a, char z);
- RegEx(const std::string& str, REGEX_OP op = REGEX_SEQ);
- ~RegEx() {}
-
- friend RegEx operator ! (const RegEx& ex);
- friend RegEx operator || (const RegEx& ex1, const RegEx& ex2);
- friend RegEx operator && (const RegEx& ex1, const RegEx& ex2);
- friend RegEx operator + (const RegEx& ex1, const RegEx& ex2);
-
- bool Matches(char ch) const;
- bool Matches(const std::string& str) const;
- bool Matches(const Stream& in) const;
- template <typename Source> bool Matches(const Source& source) const;
-
- int Match(const std::string& str) const;
- int Match(const Stream& in) const;
- template <typename Source> int Match(const Source& source) const;
-
- private:
- RegEx(REGEX_OP op);
-
- template <typename Source> bool IsValidSource(const Source& source) const;
- template <typename Source> int MatchUnchecked(const Source& source) const;
-
- template <typename Source> int MatchOpEmpty(const Source& source) const;
- template <typename Source> int MatchOpMatch(const Source& source) const;
- template <typename Source> int MatchOpRange(const Source& source) const;
- template <typename Source> int MatchOpOr(const Source& source) const;
- template <typename Source> int MatchOpAnd(const Source& source) const;
- template <typename Source> int MatchOpNot(const Source& source) const;
- template <typename Source> int MatchOpSeq(const Source& source) const;
-
- private:
- REGEX_OP m_op;
- char m_a, m_z;
- std::vector <RegEx> m_params;
- };
-}
-
-#include "regeximpl.h"
-
-#endif // REGEX_H_62B23520_7C8E_11DE_8A39_0800200C9A66
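A minimal sketch of how these simplified regular expressions are composed and matched, assuming the example is built inside the yaml-cpp source tree so the internal headers and regex.cpp resolve:

    #include "regex.h"
    #include <iostream>

    int main() {
        using YAML::RegEx;

        RegEx digit('0', '9');                      // character range
        RegEx sign = RegEx('+') || RegEx('-');      // alternation
        RegEx signedDigit = sign + digit;           // sequence

        std::cout << digit.Matches("7x") << "\n";         // 1: matches at the start of the string
        std::cout << signedDigit.Match("-3abc") << "\n";  // 2: number of characters matched
        std::cout << signedDigit.Match("abc") << "\n";    // -1: no match
        return 0;
    }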
diff --git a/ext/src/yaml-cpp/regeximpl.h b/ext/src/yaml-cpp/regeximpl.h
deleted file mode 100644
index d5c20d7..0000000
--- a/ext/src/yaml-cpp/regeximpl.h
+++ /dev/null
@@ -1,186 +0,0 @@
-#ifndef REGEXIMPL_H_62B23520_7C8E_11DE_8A39_0800200C9A66
-#define REGEXIMPL_H_62B23520_7C8E_11DE_8A39_0800200C9A66
-
-#if defined(_MSC_VER) || (defined(__GNUC__) && (__GNUC__ == 3 && __GNUC_MINOR__ >= 4) || (__GNUC__ >= 4)) // GCC supports "pragma once" correctly since 3.4
-#pragma once
-#endif
-
-
-#include "stream.h"
-#include "stringsource.h"
-#include "streamcharsource.h"
-
-namespace YAML
-{
- // query matches
- inline bool RegEx::Matches(char ch) const {
- std::string str;
- str += ch;
- return Matches(str);
- }
-
- inline bool RegEx::Matches(const std::string& str) const {
- return Match(str) >= 0;
- }
-
- inline bool RegEx::Matches(const Stream& in) const {
- return Match(in) >= 0;
- }
-
- template <typename Source>
- inline bool RegEx::Matches(const Source& source) const {
- return Match(source) >= 0;
- }
-
- // Match
- // . Matches the given string against this regular expression.
- // . Returns the number of characters matched.
- // . Returns -1 if no characters were matched (the reason for
- // not returning zero is that we may have an empty regex
- // which is ALWAYS successful at matching zero characters).
- // . REMEMBER that we only match from the start of the buffer!
- inline int RegEx::Match(const std::string& str) const
- {
- StringCharSource source(str.c_str(), str.size());
- return Match(source);
- }
-
- inline int RegEx::Match(const Stream& in) const
- {
- StreamCharSource source(in);
- return Match(source);
- }
-
- template <typename Source>
- inline bool RegEx::IsValidSource(const Source& source) const
- {
- return source;
- }
-
- template<>
- inline bool RegEx::IsValidSource<StringCharSource>(const StringCharSource&source) const
- {
- switch(m_op) {
- case REGEX_MATCH:
- case REGEX_RANGE:
- return source;
- default:
- return true;
- }
- }
-
- template <typename Source>
- inline int RegEx::Match(const Source& source) const
- {
- return IsValidSource(source) ? MatchUnchecked(source) : -1;
- }
-
- template <typename Source>
- inline int RegEx::MatchUnchecked(const Source& source) const
- {
- switch(m_op) {
- case REGEX_EMPTY:
- return MatchOpEmpty(source);
- case REGEX_MATCH:
- return MatchOpMatch(source);
- case REGEX_RANGE:
- return MatchOpRange(source);
- case REGEX_OR:
- return MatchOpOr(source);
- case REGEX_AND:
- return MatchOpAnd(source);
- case REGEX_NOT:
- return MatchOpNot(source);
- case REGEX_SEQ:
- return MatchOpSeq(source);
- }
-
- return -1;
- }
-
- //////////////////////////////////////////////////////////////////////////////
- // Operators
-	// Note: the convention for MatchOp*<Source> is that we can assume IsValidSource(source),
-	// so we do all our checks *before* we call these functions.
-
- // EmptyOperator
- template <typename Source>
- inline int RegEx::MatchOpEmpty(const Source& source) const {
- return source[0] == Stream::eof() ? 0 : -1;
- }
-
- template <>
- inline int RegEx::MatchOpEmpty<StringCharSource>(const StringCharSource& source) const {
-		return !source ? 0 : -1; // the empty regex is only successful on the empty string
- }
-
- // MatchOperator
- template <typename Source>
- inline int RegEx::MatchOpMatch(const Source& source) const {
- if(source[0] != m_a)
- return -1;
- return 1;
- }
-
- // RangeOperator
- template <typename Source>
- inline int RegEx::MatchOpRange(const Source& source) const {
- if(m_a > source[0] || m_z < source[0])
- return -1;
- return 1;
- }
-
- // OrOperator
- template <typename Source>
- inline int RegEx::MatchOpOr(const Source& source) const {
- for(std::size_t i=0;i<m_params.size();i++) {
- int n = m_params[i].MatchUnchecked(source);
- if(n >= 0)
- return n;
- }
- return -1;
- }
-
- // AndOperator
- // Note: 'AND' is a little funny, since we may be required to match things
- // of different lengths. If we find a match, we return the length of
- // the FIRST entry on the list.
- template <typename Source>
- inline int RegEx::MatchOpAnd(const Source& source) const {
- int first = -1;
- for(std::size_t i=0;i<m_params.size();i++) {
- int n = m_params[i].MatchUnchecked(source);
- if(n == -1)
- return -1;
- if(i == 0)
- first = n;
- }
- return first;
- }
-
- // NotOperator
- template <typename Source>
- inline int RegEx::MatchOpNot(const Source& source) const {
- if(m_params.empty())
- return -1;
- if(m_params[0].MatchUnchecked(source) >= 0)
- return -1;
- return 1;
- }
-
- // SeqOperator
- template <typename Source>
- inline int RegEx::MatchOpSeq(const Source& source) const {
- int offset = 0;
- for(std::size_t i=0;i<m_params.size();i++) {
- int n = m_params[i].Match(source + offset); // note Match, not MatchUnchecked because we need to check validity after the offset
- if(n == -1)
- return -1;
- offset += n;
- }
-
- return offset;
- }
-}
-
-#endif // REGEXIMPL_H_62B23520_7C8E_11DE_8A39_0800200C9A66
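A small sketch of the less obvious operator semantics implemented above ('&&' requires every operand to match and reports the length of the FIRST one, '!' succeeds only when its operand fails), under the same build assumptions as the previous sketch:

    #include "regex.h"
    #include <iostream>

    int main() {
        using YAML::RegEx;

        RegEx notSpace = !RegEx(' ');
        std::cout << notSpace.Match("a") << "\n";       // 1
        std::cout << notSpace.Match(" ") << "\n";       // -1

        RegEx lowerButNotX = RegEx('a', 'z') && !RegEx('x');
        std::cout << lowerButNotX.Match("q") << "\n";   // 1: both operands match; length of the first
        std::cout << lowerButNotX.Match("x") << "\n";   // -1: the !RegEx('x') operand fails
        return 0;
    }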
diff --git a/ext/src/yaml-cpp/scanner.cpp b/ext/src/yaml-cpp/scanner.cpp
deleted file mode 100644
index 66cce0a..0000000
--- a/ext/src/yaml-cpp/scanner.cpp
+++ /dev/null
@@ -1,394 +0,0 @@
-#include "scanner.h"
-#include "token.h"
-#include "yaml-cpp/exceptions.h"
-#include "exp.h"
-#include <cassert>
-#include <memory>
-
-namespace YAML
-{
- Scanner::Scanner(std::istream& in)
- : INPUT(in), m_startedStream(false), m_endedStream(false), m_simpleKeyAllowed(false), m_canBeJSONFlow(false)
- {
- }
-
- Scanner::~Scanner()
- {
- }
-
- // empty
- // . Returns true if there are no more tokens to be read
- bool Scanner::empty()
- {
- EnsureTokensInQueue();
- return m_tokens.empty();
- }
-
- // pop
- // . Simply removes the next token on the queue.
- void Scanner::pop()
- {
- EnsureTokensInQueue();
- if(!m_tokens.empty())
- m_tokens.pop();
- }
-
- // peek
- // . Returns (but does not remove) the next token on the queue.
- Token& Scanner::peek()
- {
- EnsureTokensInQueue();
-		assert(!m_tokens.empty()); // should we be asserting here? Really we should just be
-		                           // checking whether the queue is empty before peeking.
-
-#if 0
- static Token *pLast = 0;
- if(pLast != &m_tokens.front())
- std::cerr << "peek: " << m_tokens.front() << "\n";
- pLast = &m_tokens.front();
-#endif
-
- return m_tokens.front();
- }
-
- // mark
- // . Returns the current mark in the stream
- Mark Scanner::mark() const
- {
- return INPUT.mark();
- }
-
- // EnsureTokensInQueue
- // . Scan until there's a valid token at the front of the queue,
- // or we're sure the queue is empty.
- void Scanner::EnsureTokensInQueue()
- {
- while(1) {
- if(!m_tokens.empty()) {
- Token& token = m_tokens.front();
-
- // if this guy's valid, then we're done
- if(token.status == Token::VALID)
- return;
-
- // here's where we clean up the impossible tokens
- if(token.status == Token::INVALID) {
- m_tokens.pop();
- continue;
- }
-
- // note: what's left are the unverified tokens
- }
-
- // no token? maybe we've actually finished
- if(m_endedStream)
- return;
-
- // no? then scan...
- ScanNextToken();
- }
- }
-
- // ScanNextToken
- // . The main scanning function; here we branch out and
- // scan whatever the next token should be.
- void Scanner::ScanNextToken()
- {
- if(m_endedStream)
- return;
-
- if(!m_startedStream)
- return StartStream();
-
-		// get rid of whitespace, etc. (in between tokens it should be irrelevant)
- ScanToNextToken();
-
- // maybe need to end some blocks
- PopIndentToHere();
-
- // *****
- // And now branch based on the next few characters!
- // *****
-
- // end of stream
- if(!INPUT)
- return EndStream();
-
- if(INPUT.column() == 0 && INPUT.peek() == Keys::Directive)
- return ScanDirective();
-
- // document token
- if(INPUT.column() == 0 && Exp::DocStart().Matches(INPUT))
- return ScanDocStart();
-
- if(INPUT.column() == 0 && Exp::DocEnd().Matches(INPUT))
- return ScanDocEnd();
-
- // flow start/end/entry
- if(INPUT.peek() == Keys::FlowSeqStart || INPUT.peek() == Keys::FlowMapStart)
- return ScanFlowStart();
-
- if(INPUT.peek() == Keys::FlowSeqEnd || INPUT.peek() == Keys::FlowMapEnd)
- return ScanFlowEnd();
-
- if(INPUT.peek() == Keys::FlowEntry)
- return ScanFlowEntry();
-
- // block/map stuff
- if(Exp::BlockEntry().Matches(INPUT))
- return ScanBlockEntry();
-
- if((InBlockContext() ? Exp::Key() : Exp::KeyInFlow()).Matches(INPUT))
- return ScanKey();
-
- if(GetValueRegex().Matches(INPUT))
- return ScanValue();
-
- // alias/anchor
- if(INPUT.peek() == Keys::Alias || INPUT.peek() == Keys::Anchor)
- return ScanAnchorOrAlias();
-
- // tag
- if(INPUT.peek() == Keys::Tag)
- return ScanTag();
-
- // special scalars
- if(InBlockContext() && (INPUT.peek() == Keys::LiteralScalar || INPUT.peek() == Keys::FoldedScalar))
- return ScanBlockScalar();
-
- if(INPUT.peek() == '\'' || INPUT.peek() == '\"')
- return ScanQuotedScalar();
-
- // plain scalars
- if((InBlockContext() ? Exp::PlainScalar() : Exp::PlainScalarInFlow()).Matches(INPUT))
- return ScanPlainScalar();
-
- // don't know what it is!
- throw ParserException(INPUT.mark(), ErrorMsg::UNKNOWN_TOKEN);
- }
-
- // ScanToNextToken
- // . Eats input until we reach the next token-like thing.
- void Scanner::ScanToNextToken()
- {
- while(1) {
- // first eat whitespace
- while(INPUT && IsWhitespaceToBeEaten(INPUT.peek())) {
- if(InBlockContext() && Exp::Tab().Matches(INPUT))
- m_simpleKeyAllowed = false;
- INPUT.eat(1);
- }
-
- // then eat a comment
- if(Exp::Comment().Matches(INPUT)) {
- // eat until line break
- while(INPUT && !Exp::Break().Matches(INPUT))
- INPUT.eat(1);
- }
-
- // if it's NOT a line break, then we're done!
- if(!Exp::Break().Matches(INPUT))
- break;
-
- // otherwise, let's eat the line break and keep going
- int n = Exp::Break().Match(INPUT);
- INPUT.eat(n);
-
- // oh yeah, and let's get rid of that simple key
- InvalidateSimpleKey();
-
- // new line - we may be able to accept a simple key now
- if(InBlockContext())
- m_simpleKeyAllowed = true;
- }
- }
-
- ///////////////////////////////////////////////////////////////////////
- // Misc. helpers
-
- // IsWhitespaceToBeEaten
- // . We can eat whitespace if it's a space or tab
- // . Note: originally tabs in block context couldn't be eaten
- // "where a simple key could be allowed
- // (i.e., not at the beginning of a line, or following '-', '?', or ':')"
- // I think this is wrong, since tabs can be non-content whitespace; it's just
- // that they can't contribute to indentation, so once you've seen a tab in a
- // line, you can't start a simple key
- bool Scanner::IsWhitespaceToBeEaten(char ch)
- {
- if(ch == ' ')
- return true;
-
- if(ch == '\t')
- return true;
-
- return false;
- }
-
- // GetValueRegex
- // . Get the appropriate regex to check if it's a value token
- const RegEx& Scanner::GetValueRegex() const
- {
- if(InBlockContext())
- return Exp::Value();
-
- return m_canBeJSONFlow ? Exp::ValueInJSONFlow() : Exp::ValueInFlow();
- }
-
- // StartStream
- // . Set the initial conditions for starting a stream.
- void Scanner::StartStream()
- {
- m_startedStream = true;
- m_simpleKeyAllowed = true;
- std::auto_ptr<IndentMarker> pIndent(new IndentMarker(-1, IndentMarker::NONE));
- m_indentRefs.push_back(pIndent);
- m_indents.push(&m_indentRefs.back());
- }
-
- // EndStream
- // . Close out the stream, finish up, etc.
- void Scanner::EndStream()
- {
- // force newline
- if(INPUT.column() > 0)
- INPUT.ResetColumn();
-
- PopAllIndents();
- PopAllSimpleKeys();
-
- m_simpleKeyAllowed = false;
- m_endedStream = true;
- }
-
- Token *Scanner::PushToken(Token::TYPE type)
- {
- m_tokens.push(Token(type, INPUT.mark()));
- return &m_tokens.back();
- }
-
- Token::TYPE Scanner::GetStartTokenFor(IndentMarker::INDENT_TYPE type) const
- {
- switch(type) {
- case IndentMarker::SEQ: return Token::BLOCK_SEQ_START;
- case IndentMarker::MAP: return Token::BLOCK_MAP_START;
- case IndentMarker::NONE: assert(false); break;
- }
- assert(false);
- throw std::runtime_error("yaml-cpp: internal error, invalid indent type");
- }
-
- // PushIndentTo
- // . Pushes an indentation onto the stack, and enqueues the
- // proper token (sequence start or mapping start).
- // . Returns the indent marker it generates (if any).
- Scanner::IndentMarker *Scanner::PushIndentTo(int column, IndentMarker::INDENT_TYPE type)
- {
- // are we in flow?
- if(InFlowContext())
- return 0;
-
- std::auto_ptr<IndentMarker> pIndent(new IndentMarker(column, type));
- IndentMarker& indent = *pIndent;
- const IndentMarker& lastIndent = *m_indents.top();
-
- // is this actually an indentation?
- if(indent.column < lastIndent.column)
- return 0;
- if(indent.column == lastIndent.column && !(indent.type == IndentMarker::SEQ && lastIndent.type == IndentMarker::MAP))
- return 0;
-
- // push a start token
- indent.pStartToken = PushToken(GetStartTokenFor(type));
-
- // and then the indent
- m_indents.push(&indent);
- m_indentRefs.push_back(pIndent);
- return &m_indentRefs.back();
- }
-
- // PopIndentToHere
- // . Pops indentations off the stack until we reach the current indentation level,
- // and enqueues the proper token each time.
- // . Then pops all invalid indentations off.
- void Scanner::PopIndentToHere()
- {
- // are we in flow?
- if(InFlowContext())
- return;
-
- // now pop away
- while(!m_indents.empty()) {
- const IndentMarker& indent = *m_indents.top();
- if(indent.column < INPUT.column())
- break;
- if(indent.column == INPUT.column() && !(indent.type == IndentMarker::SEQ && !Exp::BlockEntry().Matches(INPUT)))
- break;
-
- PopIndent();
- }
-
- while(!m_indents.empty() && m_indents.top()->status == IndentMarker::INVALID)
- PopIndent();
- }
-
- // PopAllIndents
- // . Pops all indentations (except for the base empty one) off the stack,
- // and enqueues the proper token each time.
- void Scanner::PopAllIndents()
- {
- // are we in flow?
- if(InFlowContext())
- return;
-
- // now pop away
- while(!m_indents.empty()) {
- const IndentMarker& indent = *m_indents.top();
- if(indent.type == IndentMarker::NONE)
- break;
-
- PopIndent();
- }
- }
-
- // PopIndent
- // . Pops a single indent, pushing the proper token
- void Scanner::PopIndent()
- {
- const IndentMarker& indent = *m_indents.top();
- m_indents.pop();
-
- if(indent.status != IndentMarker::VALID) {
- InvalidateSimpleKey();
- return;
- }
-
- if(indent.type == IndentMarker::SEQ)
- m_tokens.push(Token(Token::BLOCK_SEQ_END, INPUT.mark()));
- else if(indent.type == IndentMarker::MAP)
- m_tokens.push(Token(Token::BLOCK_MAP_END, INPUT.mark()));
- }
-
- // GetTopIndent
- int Scanner::GetTopIndent() const
- {
- if(m_indents.empty())
- return 0;
- return m_indents.top()->column;
- }
-
- // ThrowParserException
- // . Throws a ParserException with the current token location
- // (if available).
- // . Does not parse any more tokens.
- void Scanner::ThrowParserException(const std::string& msg) const
- {
- Mark mark = Mark::null_mark();
- if(!m_tokens.empty()) {
- const Token& token = m_tokens.front();
- mark = token.mark;
- }
- throw ParserException(mark, msg);
- }
-}
-
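A minimal sketch of the token-queue interface (empty/peek/pop) that the parser layer drives; Parser::PrintTokens above does essentially the same thing. Assumes the program is linked against the library's internal objects.

    #include "scanner.h"
    #include "token.h"
    #include <iostream>
    #include <sstream>

    int main() {
        std::stringstream input("- one\n- two\n");
        YAML::Scanner scanner(input);

        while(!scanner.empty()) {
            YAML::Token& token = scanner.peek();   // look at the front token
            std::cout << token.type;               // numeric Token::TYPE value
            if(!token.value.empty())
                std::cout << " '" << token.value << "'";
            std::cout << "\n";
            scanner.pop();                         // then discard it
        }
        return 0;
    }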
diff --git a/ext/src/yaml-cpp/scanner.h b/ext/src/yaml-cpp/scanner.h
deleted file mode 100644
index fe71124..0000000
--- a/ext/src/yaml-cpp/scanner.h
+++ /dev/null
@@ -1,133 +0,0 @@
-#ifndef SCANNER_H_62B23520_7C8E_11DE_8A39_0800200C9A66
-#define SCANNER_H_62B23520_7C8E_11DE_8A39_0800200C9A66
-
-#if defined(_MSC_VER) || (defined(__GNUC__) && (__GNUC__ == 3 && __GNUC_MINOR__ >= 4) || (__GNUC__ >= 4)) // GCC supports "pragma once" correctly since 3.4
-#pragma once
-#endif
-
-
-#include <ios>
-#include <string>
-#include <queue>
-#include <stack>
-#include <set>
-#include <map>
-#include "ptr_vector.h"
-#include "stream.h"
-#include "token.h"
-
-namespace YAML
-{
- class Node;
- class RegEx;
-
- class Scanner
- {
- public:
- Scanner(std::istream& in);
- ~Scanner();
-
- // token queue management (hopefully this looks kinda stl-ish)
- bool empty();
- void pop();
- Token& peek();
- Mark mark() const;
-
- private:
- struct IndentMarker {
- enum INDENT_TYPE { MAP, SEQ, NONE };
- enum STATUS { VALID, INVALID, UNKNOWN };
- IndentMarker(int column_, INDENT_TYPE type_): column(column_), type(type_), status(VALID), pStartToken(0) {}
-
- int column;
- INDENT_TYPE type;
- STATUS status;
- Token *pStartToken;
- };
-
- enum FLOW_MARKER { FLOW_MAP, FLOW_SEQ };
-
- private:
- // scanning
- void EnsureTokensInQueue();
- void ScanNextToken();
- void ScanToNextToken();
- void StartStream();
- void EndStream();
- Token *PushToken(Token::TYPE type);
-
- bool InFlowContext() const { return !m_flows.empty(); }
- bool InBlockContext() const { return m_flows.empty(); }
- int GetFlowLevel() const { return m_flows.size(); }
-
- Token::TYPE GetStartTokenFor(IndentMarker::INDENT_TYPE type) const;
- IndentMarker *PushIndentTo(int column, IndentMarker::INDENT_TYPE type);
- void PopIndentToHere();
- void PopAllIndents();
- void PopIndent();
- int GetTopIndent() const;
-
- // checking input
- bool CanInsertPotentialSimpleKey() const;
- bool ExistsActiveSimpleKey() const;
- void InsertPotentialSimpleKey();
- void InvalidateSimpleKey();
- bool VerifySimpleKey();
- void PopAllSimpleKeys();
-
- void ThrowParserException(const std::string& msg) const;
-
- bool IsWhitespaceToBeEaten(char ch);
- const RegEx& GetValueRegex() const;
-
- struct SimpleKey {
- SimpleKey(const Mark& mark_, int flowLevel_);
-
- void Validate();
- void Invalidate();
-
- Mark mark;
- int flowLevel;
- IndentMarker *pIndent;
- Token *pMapStart, *pKey;
- };
-
- // and the tokens
- void ScanDirective();
- void ScanDocStart();
- void ScanDocEnd();
- void ScanBlockSeqStart();
- void ScanBlockMapSTart();
- void ScanBlockEnd();
- void ScanBlockEntry();
- void ScanFlowStart();
- void ScanFlowEnd();
- void ScanFlowEntry();
- void ScanKey();
- void ScanValue();
- void ScanAnchorOrAlias();
- void ScanTag();
- void ScanPlainScalar();
- void ScanQuotedScalar();
- void ScanBlockScalar();
-
- private:
- // the stream
- Stream INPUT;
-
- // the output (tokens)
- std::queue<Token> m_tokens;
-
- // state info
- bool m_startedStream, m_endedStream;
- bool m_simpleKeyAllowed;
- bool m_canBeJSONFlow;
- std::stack<SimpleKey> m_simpleKeys;
- std::stack<IndentMarker *> m_indents;
- ptr_vector<IndentMarker> m_indentRefs; // for "garbage collection"
- std::stack<FLOW_MARKER> m_flows;
- };
-}
-
-#endif // SCANNER_H_62B23520_7C8E_11DE_8A39_0800200C9A66
-
diff --git a/ext/src/yaml-cpp/scanscalar.cpp b/ext/src/yaml-cpp/scanscalar.cpp
deleted file mode 100644
index 064c086..0000000
--- a/ext/src/yaml-cpp/scanscalar.cpp
+++ /dev/null
@@ -1,214 +0,0 @@
-#include "scanscalar.h"
-#include "scanner.h"
-#include "exp.h"
-#include "yaml-cpp/exceptions.h"
-#include "token.h"
-
-namespace YAML
-{
- // ScanScalar
- // . This is where the scalar magic happens.
- //
- // . We do the scanning in three phases:
- // 1. Scan until newline
- // 2. Eat newline
- // 3. Scan leading blanks.
- //
- // . Depending on the parameters given, we store or stop
-	//   at different places in the above flow.
- std::string ScanScalar(Stream& INPUT, ScanScalarParams& params)
- {
- bool foundNonEmptyLine = false;
- bool pastOpeningBreak = (params.fold == FOLD_FLOW);
- bool emptyLine = false, moreIndented = false;
- int foldedNewlineCount = 0;
- bool foldedNewlineStartedMoreIndented = false;
- std::size_t lastEscapedChar = std::string::npos;
- std::string scalar;
- params.leadingSpaces = false;
-
- while(INPUT) {
- // ********************************
- // Phase #1: scan until line ending
-
- std::size_t lastNonWhitespaceChar = scalar.size();
- bool escapedNewline = false;
- while(!params.end.Matches(INPUT) && !Exp::Break().Matches(INPUT)) {
- if(!INPUT)
- break;
-
- // document indicator?
- if(INPUT.column() == 0 && Exp::DocIndicator().Matches(INPUT)) {
- if(params.onDocIndicator == BREAK)
- break;
- else if(params.onDocIndicator == THROW)
- throw ParserException(INPUT.mark(), ErrorMsg::DOC_IN_SCALAR);
- }
-
- foundNonEmptyLine = true;
- pastOpeningBreak = true;
-
- // escaped newline? (only if we're escaping on slash)
- if(params.escape == '\\' && Exp::EscBreak().Matches(INPUT)) {
- // eat escape character and get out (but preserve trailing whitespace!)
- INPUT.get();
- lastNonWhitespaceChar = scalar.size();
- lastEscapedChar = scalar.size();
- escapedNewline = true;
- break;
- }
-
- // escape this?
- if(INPUT.peek() == params.escape) {
- scalar += Exp::Escape(INPUT);
- lastNonWhitespaceChar = scalar.size();
- lastEscapedChar = scalar.size();
- continue;
- }
-
- // otherwise, just add the damn character
- char ch = INPUT.get();
- scalar += ch;
- if(ch != ' ' && ch != '\t')
- lastNonWhitespaceChar = scalar.size();
- }
-
- // eof? if we're looking to eat something, then we throw
- if(!INPUT) {
- if(params.eatEnd)
- throw ParserException(INPUT.mark(), ErrorMsg::EOF_IN_SCALAR);
- break;
- }
-
- // doc indicator?
- if(params.onDocIndicator == BREAK && INPUT.column() == 0 && Exp::DocIndicator().Matches(INPUT))
- break;
-
- // are we done via character match?
- int n = params.end.Match(INPUT);
- if(n >= 0) {
- if(params.eatEnd)
- INPUT.eat(n);
- break;
- }
-
- // do we remove trailing whitespace?
- if(params.fold == FOLD_FLOW)
- scalar.erase(lastNonWhitespaceChar);
-
- // ********************************
- // Phase #2: eat line ending
- n = Exp::Break().Match(INPUT);
- INPUT.eat(n);
-
- // ********************************
- // Phase #3: scan initial spaces
-
- // first the required indentation
- while(INPUT.peek() == ' ' && (INPUT.column() < params.indent || (params.detectIndent && !foundNonEmptyLine)))
- INPUT.eat(1);
-
- // update indent if we're auto-detecting
- if(params.detectIndent && !foundNonEmptyLine)
- params.indent = std::max(params.indent, INPUT.column());
-
- // and then the rest of the whitespace
- while(Exp::Blank().Matches(INPUT)) {
- // we check for tabs that masquerade as indentation
- if(INPUT.peek() == '\t'&& INPUT.column() < params.indent && params.onTabInIndentation == THROW)
- throw ParserException(INPUT.mark(), ErrorMsg::TAB_IN_INDENTATION);
-
- if(!params.eatLeadingWhitespace)
- break;
-
- INPUT.eat(1);
- }
-
- // was this an empty line?
- bool nextEmptyLine = Exp::Break().Matches(INPUT);
- bool nextMoreIndented = Exp::Blank().Matches(INPUT);
- if(params.fold == FOLD_BLOCK && foldedNewlineCount == 0 && nextEmptyLine)
- foldedNewlineStartedMoreIndented = moreIndented;
-
- // for block scalars, we always start with a newline, so we should ignore it (not fold or keep)
- if(pastOpeningBreak) {
- switch(params.fold) {
- case DONT_FOLD:
- scalar += "\n";
- break;
- case FOLD_BLOCK:
- if(!emptyLine && !nextEmptyLine && !moreIndented && !nextMoreIndented && INPUT.column() >= params.indent)
- scalar += " ";
- else if(nextEmptyLine)
- foldedNewlineCount++;
- else
- scalar += "\n";
-
- if(!nextEmptyLine && foldedNewlineCount > 0) {
- scalar += std::string(foldedNewlineCount - 1, '\n');
-							if(foldedNewlineStartedMoreIndented || nextMoreIndented || !foundNonEmptyLine)
- scalar += "\n";
- foldedNewlineCount = 0;
- }
- break;
- case FOLD_FLOW:
- if(nextEmptyLine)
- scalar += "\n";
- else if(!emptyLine && !nextEmptyLine && !escapedNewline)
- scalar += " ";
- break;
- }
- }
-
- emptyLine = nextEmptyLine;
- moreIndented = nextMoreIndented;
- pastOpeningBreak = true;
-
- // are we done via indentation?
- if(!emptyLine && INPUT.column() < params.indent) {
- params.leadingSpaces = true;
- break;
- }
- }
-
- // post-processing
- if(params.trimTrailingSpaces) {
- std::size_t pos = scalar.find_last_not_of(' ');
- if(lastEscapedChar != std::string::npos) {
- if(pos < lastEscapedChar || pos == std::string::npos)
- pos = lastEscapedChar;
- }
- if(pos < scalar.size())
- scalar.erase(pos + 1);
- }
-
- switch(params.chomp) {
- case CLIP: {
- std::size_t pos = scalar.find_last_not_of('\n');
- if(lastEscapedChar != std::string::npos) {
- if(pos < lastEscapedChar || pos == std::string::npos)
- pos = lastEscapedChar;
- }
- if(pos == std::string::npos)
- scalar.erase();
- else if(pos + 1 < scalar.size())
- scalar.erase(pos + 2);
- } break;
- case STRIP: {
- std::size_t pos = scalar.find_last_not_of('\n');
- if(lastEscapedChar != std::string::npos) {
- if(pos < lastEscapedChar || pos == std::string::npos)
- pos = lastEscapedChar;
- }
- if(pos == std::string::npos)
- scalar.erase();
- else if(pos < scalar.size())
- scalar.erase(pos + 1);
- } break;
- default:
- break;
- }
-
- return scalar;
- }
-}
diff --git a/ext/src/yaml-cpp/scanscalar.h b/ext/src/yaml-cpp/scanscalar.h
deleted file mode 100644
index c198cb1..0000000
--- a/ext/src/yaml-cpp/scanscalar.h
+++ /dev/null
@@ -1,45 +0,0 @@
-#ifndef SCANSCALAR_H_62B23520_7C8E_11DE_8A39_0800200C9A66
-#define SCANSCALAR_H_62B23520_7C8E_11DE_8A39_0800200C9A66
-
-#if defined(_MSC_VER) || (defined(__GNUC__) && (__GNUC__ == 3 && __GNUC_MINOR__ >= 4) || (__GNUC__ >= 4)) // GCC supports "pragma once" correctly since 3.4
-#pragma once
-#endif
-
-
-#include <string>
-#include "regex.h"
-#include "stream.h"
-
-namespace YAML
-{
- enum CHOMP { STRIP = -1, CLIP, KEEP };
- enum ACTION { NONE, BREAK, THROW };
- enum FOLD { DONT_FOLD, FOLD_BLOCK, FOLD_FLOW };
-
- struct ScanScalarParams {
-		ScanScalarParams(): eatEnd(false), indent(0), detectIndent(false), eatLeadingWhitespace(false), escape(0), fold(DONT_FOLD),
-			trimTrailingSpaces(false), chomp(CLIP), onDocIndicator(NONE), onTabInIndentation(NONE), leadingSpaces(false) {}
-
- // input:
- RegEx end; // what condition ends this scalar?
- bool eatEnd; // should we eat that condition when we see it?
- int indent; // what level of indentation should be eaten and ignored?
- bool detectIndent; // should we try to autodetect the indent?
- bool eatLeadingWhitespace; // should we continue eating this delicious indentation after 'indent' spaces?
- char escape; // what character do we escape on (i.e., slash or single quote) (0 for none)
- FOLD fold; // how do we fold line ends?
- bool trimTrailingSpaces; // do we remove all trailing spaces (at the very end)
- CHOMP chomp; // do we strip, clip, or keep trailing newlines (at the very end)
- // Note: strip means kill all, clip means keep at most one, keep means keep all
- ACTION onDocIndicator; // what do we do if we see a document indicator?
- ACTION onTabInIndentation; // what do we do if we see a tab where we should be seeing indentation spaces
-
- // output:
- bool leadingSpaces;
- };
-
- std::string ScanScalar(Stream& INPUT, ScanScalarParams& info);
-}
-
-#endif // SCANSCALAR_H_62B23520_7C8E_11DE_8A39_0800200C9A66
-
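A sketch of how callers parameterize ScanScalar(); it roughly mirrors the single-quoted setup in Scanner::ScanQuotedScalar() further down, and assumes the internal headers (stream.h, exp.h) and their implementation files are available in the build.

    #include "scanscalar.h"
    #include "exp.h"
    #include "stream.h"
    #include <iostream>
    #include <sstream>

    int main() {
        std::stringstream input("'a ''quoted'' scalar'\nrest");
        YAML::Stream stream(input);
        stream.get();                                 // eat the opening quote ourselves

        YAML::ScanScalarParams params;
        params.end = YAML::RegEx('\'') && !YAML::Exp::EscSingleQuote();
        params.eatEnd = true;                         // consume the closing quote
        params.escape = '\'';                         // '' is the escape for a literal '
        params.fold = YAML::FOLD_FLOW;
        params.eatLeadingWhitespace = true;

        std::cout << YAML::ScanScalar(stream, params) << "\n";   // a 'quoted' scalar
        return 0;
    }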
diff --git a/ext/src/yaml-cpp/scantag.cpp b/ext/src/yaml-cpp/scantag.cpp
deleted file mode 100644
index b71cbcc..0000000
--- a/ext/src/yaml-cpp/scantag.cpp
+++ /dev/null
@@ -1,84 +0,0 @@
-#include "scanner.h"
-#include "regex.h"
-#include "exp.h"
-#include "yaml-cpp/exceptions.h"
-
-namespace YAML
-{
- const std::string ScanVerbatimTag(Stream& INPUT)
- {
- std::string tag;
-
- // eat the start character
- INPUT.get();
-
- while(INPUT) {
- if(INPUT.peek() == Keys::VerbatimTagEnd) {
- // eat the end character
- INPUT.get();
- return tag;
- }
-
- int n = Exp::URI().Match(INPUT);
- if(n <= 0)
- break;
-
- tag += INPUT.get(n);
- }
-
- throw ParserException(INPUT.mark(), ErrorMsg::END_OF_VERBATIM_TAG);
- }
-
- const std::string ScanTagHandle(Stream& INPUT, bool& canBeHandle)
- {
- std::string tag;
- canBeHandle = true;
- Mark firstNonWordChar;
-
- while(INPUT) {
- if(INPUT.peek() == Keys::Tag) {
- if(!canBeHandle)
- throw ParserException(firstNonWordChar, ErrorMsg::CHAR_IN_TAG_HANDLE);
- break;
- }
-
- int n = 0;
- if(canBeHandle) {
- n = Exp::Word().Match(INPUT);
- if(n <= 0) {
- canBeHandle = false;
- firstNonWordChar = INPUT.mark();
- }
- }
-
- if(!canBeHandle)
- n = Exp::Tag().Match(INPUT);
-
- if(n <= 0)
- break;
-
- tag += INPUT.get(n);
- }
-
- return tag;
- }
-
- const std::string ScanTagSuffix(Stream& INPUT)
- {
- std::string tag;
-
- while(INPUT) {
- int n = Exp::Tag().Match(INPUT);
- if(n <= 0)
- break;
-
- tag += INPUT.get(n);
- }
-
- if(tag.empty())
- throw ParserException(INPUT.mark(), ErrorMsg::TAG_WITH_NO_SUFFIX);
-
- return tag;
- }
-}
-
diff --git a/ext/src/yaml-cpp/scantag.h b/ext/src/yaml-cpp/scantag.h
deleted file mode 100644
index 38437c0..0000000
--- a/ext/src/yaml-cpp/scantag.h
+++ /dev/null
@@ -1,20 +0,0 @@
-#ifndef SCANTAG_H_62B23520_7C8E_11DE_8A39_0800200C9A66
-#define SCANTAG_H_62B23520_7C8E_11DE_8A39_0800200C9A66
-
-#if defined(_MSC_VER) || (defined(__GNUC__) && (__GNUC__ == 3 && __GNUC_MINOR__ >= 4) || (__GNUC__ >= 4)) // GCC supports "pragma once" correctly since 3.4
-#pragma once
-#endif
-
-
-#include <string>
-#include "stream.h"
-
-namespace YAML
-{
- const std::string ScanVerbatimTag(Stream& INPUT);
- const std::string ScanTagHandle(Stream& INPUT, bool& canBeHandle);
- const std::string ScanTagSuffix(Stream& INPUT);
-}
-
-#endif // SCANTAG_H_62B23520_7C8E_11DE_8A39_0800200C9A66
-
diff --git a/ext/src/yaml-cpp/scantoken.cpp b/ext/src/yaml-cpp/scantoken.cpp
deleted file mode 100644
index 06d9cd6..0000000
--- a/ext/src/yaml-cpp/scantoken.cpp
+++ /dev/null
@@ -1,439 +0,0 @@
-#include "scanner.h"
-#include "token.h"
-#include "yaml-cpp/exceptions.h"
-#include "exp.h"
-#include "scanscalar.h"
-#include "scantag.h"
-#include "tag.h"
-#include <sstream>
-
-namespace YAML
-{
- ///////////////////////////////////////////////////////////////////////
- // Specialization for scanning specific tokens
-
- // Directive
- // . Note: no semantic checking is done here (that's for the parser to do)
- void Scanner::ScanDirective()
- {
- std::string name;
- std::vector <std::string> params;
-
- // pop indents and simple keys
- PopAllIndents();
- PopAllSimpleKeys();
-
- m_simpleKeyAllowed = false;
- m_canBeJSONFlow = false;
-
- // store pos and eat indicator
- Token token(Token::DIRECTIVE, INPUT.mark());
- INPUT.eat(1);
-
- // read name
- while(INPUT && !Exp::BlankOrBreak().Matches(INPUT))
- token.value += INPUT.get();
-
- // read parameters
- while(1) {
- // first get rid of whitespace
- while(Exp::Blank().Matches(INPUT))
- INPUT.eat(1);
-
- // break on newline or comment
- if(!INPUT || Exp::Break().Matches(INPUT) || Exp::Comment().Matches(INPUT))
- break;
-
- // now read parameter
- std::string param;
- while(INPUT && !Exp::BlankOrBreak().Matches(INPUT))
- param += INPUT.get();
-
- token.params.push_back(param);
- }
-
- m_tokens.push(token);
- }
-
- // DocStart
- void Scanner::ScanDocStart()
- {
- PopAllIndents();
- PopAllSimpleKeys();
- m_simpleKeyAllowed = false;
- m_canBeJSONFlow = false;
-
- // eat
- Mark mark = INPUT.mark();
- INPUT.eat(3);
- m_tokens.push(Token(Token::DOC_START, mark));
- }
-
- // DocEnd
- void Scanner::ScanDocEnd()
- {
- PopAllIndents();
- PopAllSimpleKeys();
- m_simpleKeyAllowed = false;
- m_canBeJSONFlow = false;
-
- // eat
- Mark mark = INPUT.mark();
- INPUT.eat(3);
- m_tokens.push(Token(Token::DOC_END, mark));
- }
-
- // FlowStart
- void Scanner::ScanFlowStart()
- {
- // flows can be simple keys
- InsertPotentialSimpleKey();
- m_simpleKeyAllowed = true;
- m_canBeJSONFlow = false;
-
- // eat
- Mark mark = INPUT.mark();
- char ch = INPUT.get();
- FLOW_MARKER flowType = (ch == Keys::FlowSeqStart ? FLOW_SEQ : FLOW_MAP);
- m_flows.push(flowType);
- Token::TYPE type = (flowType == FLOW_SEQ ? Token::FLOW_SEQ_START : Token::FLOW_MAP_START);
- m_tokens.push(Token(type, mark));
- }
-
- // FlowEnd
- void Scanner::ScanFlowEnd()
- {
- if(InBlockContext())
- throw ParserException(INPUT.mark(), ErrorMsg::FLOW_END);
-
- // we might have a solo entry in the flow context
- if(InFlowContext()) {
- if(m_flows.top() == FLOW_MAP && VerifySimpleKey())
- m_tokens.push(Token(Token::VALUE, INPUT.mark()));
- else if(m_flows.top() == FLOW_SEQ)
- InvalidateSimpleKey();
- }
-
- m_simpleKeyAllowed = false;
- m_canBeJSONFlow = true;
-
- // eat
- Mark mark = INPUT.mark();
- char ch = INPUT.get();
-
- // check that it matches the start
- FLOW_MARKER flowType = (ch == Keys::FlowSeqEnd ? FLOW_SEQ : FLOW_MAP);
- if(m_flows.top() != flowType)
- throw ParserException(mark, ErrorMsg::FLOW_END);
- m_flows.pop();
-
-		Token::TYPE type = (flowType == FLOW_SEQ ? Token::FLOW_SEQ_END : Token::FLOW_MAP_END);
- m_tokens.push(Token(type, mark));
- }
-
- // FlowEntry
- void Scanner::ScanFlowEntry()
- {
- // we might have a solo entry in the flow context
- if(InFlowContext()) {
- if(m_flows.top() == FLOW_MAP && VerifySimpleKey())
- m_tokens.push(Token(Token::VALUE, INPUT.mark()));
- else if(m_flows.top() == FLOW_SEQ)
- InvalidateSimpleKey();
- }
-
- m_simpleKeyAllowed = true;
- m_canBeJSONFlow = false;
-
- // eat
- Mark mark = INPUT.mark();
- INPUT.eat(1);
- m_tokens.push(Token(Token::FLOW_ENTRY, mark));
- }
-
- // BlockEntry
- void Scanner::ScanBlockEntry()
- {
- // we better be in the block context!
- if(InFlowContext())
- throw ParserException(INPUT.mark(), ErrorMsg::BLOCK_ENTRY);
-
- // can we put it here?
- if(!m_simpleKeyAllowed)
- throw ParserException(INPUT.mark(), ErrorMsg::BLOCK_ENTRY);
-
- PushIndentTo(INPUT.column(), IndentMarker::SEQ);
- m_simpleKeyAllowed = true;
- m_canBeJSONFlow = false;
-
- // eat
- Mark mark = INPUT.mark();
- INPUT.eat(1);
- m_tokens.push(Token(Token::BLOCK_ENTRY, mark));
- }
-
- // Key
- void Scanner::ScanKey()
- {
-		// handle keys differently in the block context (and manage indents)
- if(InBlockContext()) {
- if(!m_simpleKeyAllowed)
- throw ParserException(INPUT.mark(), ErrorMsg::MAP_KEY);
-
- PushIndentTo(INPUT.column(), IndentMarker::MAP);
- }
-
- // can only put a simple key here if we're in block context
- m_simpleKeyAllowed = InBlockContext();
-
- // eat
- Mark mark = INPUT.mark();
- INPUT.eat(1);
- m_tokens.push(Token(Token::KEY, mark));
- }
-
- // Value
- void Scanner::ScanValue()
- {
- // and check that simple key
- bool isSimpleKey = VerifySimpleKey();
- m_canBeJSONFlow = false;
-
- if(isSimpleKey) {
- // can't follow a simple key with another simple key (dunno why, though - it seems fine)
- m_simpleKeyAllowed = false;
- } else {
-			// handle values differently in the block context (and manage indents)
- if(InBlockContext()) {
- if(!m_simpleKeyAllowed)
- throw ParserException(INPUT.mark(), ErrorMsg::MAP_VALUE);
-
- PushIndentTo(INPUT.column(), IndentMarker::MAP);
- }
-
- // can only put a simple key here if we're in block context
- m_simpleKeyAllowed = InBlockContext();
- }
-
- // eat
- Mark mark = INPUT.mark();
- INPUT.eat(1);
- m_tokens.push(Token(Token::VALUE, mark));
- }
-
- // AnchorOrAlias
- void Scanner::ScanAnchorOrAlias()
- {
- bool alias;
- std::string name;
-
- // insert a potential simple key
- InsertPotentialSimpleKey();
- m_simpleKeyAllowed = false;
- m_canBeJSONFlow = false;
-
- // eat the indicator
- Mark mark = INPUT.mark();
- char indicator = INPUT.get();
- alias = (indicator == Keys::Alias);
-
- // now eat the content
- while(INPUT && Exp::Anchor().Matches(INPUT))
- name += INPUT.get();
-
- // we need to have read SOMETHING!
- if(name.empty())
- throw ParserException(INPUT.mark(), alias ? ErrorMsg::ALIAS_NOT_FOUND : ErrorMsg::ANCHOR_NOT_FOUND);
-
- // and needs to end correctly
- if(INPUT && !Exp::AnchorEnd().Matches(INPUT))
- throw ParserException(INPUT.mark(), alias ? ErrorMsg::CHAR_IN_ALIAS : ErrorMsg::CHAR_IN_ANCHOR);
-
- // and we're done
- Token token(alias ? Token::ALIAS : Token::ANCHOR, mark);
- token.value = name;
- m_tokens.push(token);
- }
-
- // Tag
- void Scanner::ScanTag()
- {
- // insert a potential simple key
- InsertPotentialSimpleKey();
- m_simpleKeyAllowed = false;
- m_canBeJSONFlow = false;
-
- Token token(Token::TAG, INPUT.mark());
-
- // eat the indicator
- INPUT.get();
-
- if(INPUT && INPUT.peek() == Keys::VerbatimTagStart){
- std::string tag = ScanVerbatimTag(INPUT);
-
- token.value = tag;
- token.data = Tag::VERBATIM;
- } else {
- bool canBeHandle;
- token.value = ScanTagHandle(INPUT, canBeHandle);
- if(!canBeHandle && token.value.empty())
- token.data = Tag::NON_SPECIFIC;
- else if(token.value.empty())
- token.data = Tag::SECONDARY_HANDLE;
- else
- token.data = Tag::PRIMARY_HANDLE;
-
- // is there a suffix?
- if(canBeHandle && INPUT.peek() == Keys::Tag) {
- // eat the indicator
- INPUT.get();
- token.params.push_back(ScanTagSuffix(INPUT));
- token.data = Tag::NAMED_HANDLE;
- }
- }
-
- m_tokens.push(token);
- }
-
- // PlainScalar
- void Scanner::ScanPlainScalar()
- {
- std::string scalar;
-
- // set up the scanning parameters
- ScanScalarParams params;
- params.end = (InFlowContext() ? Exp::EndScalarInFlow() : Exp::EndScalar()) || (Exp::BlankOrBreak() + Exp::Comment());
- params.eatEnd = false;
- params.indent = (InFlowContext() ? 0 : GetTopIndent() + 1);
- params.fold = FOLD_FLOW;
- params.eatLeadingWhitespace = true;
- params.trimTrailingSpaces = true;
- params.chomp = STRIP;
- params.onDocIndicator = BREAK;
- params.onTabInIndentation = THROW;
-
- // insert a potential simple key
- InsertPotentialSimpleKey();
-
- Mark mark = INPUT.mark();
- scalar = ScanScalar(INPUT, params);
-
- // can have a simple key only if we ended the scalar by starting a new line
- m_simpleKeyAllowed = params.leadingSpaces;
- m_canBeJSONFlow = false;
-
- // finally, check and see if we ended on an illegal character
- //if(Exp::IllegalCharInScalar.Matches(INPUT))
- // throw ParserException(INPUT.mark(), ErrorMsg::CHAR_IN_SCALAR);
-
- Token token(Token::PLAIN_SCALAR, mark);
- token.value = scalar;
- m_tokens.push(token);
- }
-
- // QuotedScalar
- void Scanner::ScanQuotedScalar()
- {
- std::string scalar;
-
-		// peek at the single or double quote (don't eat it yet; we need to preserve the input position for now)
- char quote = INPUT.peek();
- bool single = (quote == '\'');
-
- // setup the scanning parameters
- ScanScalarParams params;
- params.end = (single ? RegEx(quote) && !Exp::EscSingleQuote() : RegEx(quote));
- params.eatEnd = true;
- params.escape = (single ? '\'' : '\\');
- params.indent = 0;
- params.fold = FOLD_FLOW;
- params.eatLeadingWhitespace = true;
- params.trimTrailingSpaces = false;
- params.chomp = CLIP;
- params.onDocIndicator = THROW;
-
- // insert a potential simple key
- InsertPotentialSimpleKey();
-
- Mark mark = INPUT.mark();
-
- // now eat that opening quote
- INPUT.get();
-
- // and scan
- scalar = ScanScalar(INPUT, params);
- m_simpleKeyAllowed = false;
- m_canBeJSONFlow = true;
-
- Token token(Token::NON_PLAIN_SCALAR, mark);
- token.value = scalar;
- m_tokens.push(token);
- }
-
- // BlockScalarToken
- // . These need a little extra processing beforehand.
- // . We need to scan the line where the indicator is (this doesn't count as part of the scalar),
- // and then we need to figure out what level of indentation we'll be using.
- void Scanner::ScanBlockScalar()
- {
- std::string scalar;
-
- ScanScalarParams params;
- params.indent = 1;
- params.detectIndent = true;
-
- // eat block indicator ('|' or '>')
- Mark mark = INPUT.mark();
- char indicator = INPUT.get();
- params.fold = (indicator == Keys::FoldedScalar ? FOLD_BLOCK : DONT_FOLD);
-
- // eat chomping/indentation indicators
- params.chomp = CLIP;
- int n = Exp::Chomp().Match(INPUT);
- for(int i=0;i<n;i++) {
- char ch = INPUT.get();
- if(ch == '+')
- params.chomp = KEEP;
- else if(ch == '-')
- params.chomp = STRIP;
- else if(Exp::Digit().Matches(ch)) {
- if(ch == '0')
- throw ParserException(INPUT.mark(), ErrorMsg::ZERO_INDENT_IN_BLOCK);
-
- params.indent = ch - '0';
- params.detectIndent = false;
- }
- }
-
- // now eat whitespace
- while(Exp::Blank().Matches(INPUT))
- INPUT.eat(1);
-
- // and comments to the end of the line
- if(Exp::Comment().Matches(INPUT))
- while(INPUT && !Exp::Break().Matches(INPUT))
- INPUT.eat(1);
-
- // if it's not a line break, then we ran into a bad character inline
- if(INPUT && !Exp::Break().Matches(INPUT))
- throw ParserException(INPUT.mark(), ErrorMsg::CHAR_IN_BLOCK);
-
- // set the initial indentation
- if(GetTopIndent() >= 0)
- params.indent += GetTopIndent();
-
- params.eatLeadingWhitespace = false;
- params.trimTrailingSpaces = false;
- params.onTabInIndentation = THROW;
-
- scalar = ScanScalar(INPUT, params);
-
- // simple keys always ok after block scalars (since we're gonna start a new line anyways)
- m_simpleKeyAllowed = true;
- m_canBeJSONFlow = false;
-
- Token token(Token::NON_PLAIN_SCALAR, mark);
- token.value = scalar;
- m_tokens.push(token);
- }
-}
diff --git a/ext/src/yaml-cpp/setting.h b/ext/src/yaml-cpp/setting.h
deleted file mode 100644
index 806ccda..0000000
--- a/ext/src/yaml-cpp/setting.h
+++ /dev/null
@@ -1,105 +0,0 @@
-#ifndef SETTING_H_62B23520_7C8E_11DE_8A39_0800200C9A66
-#define SETTING_H_62B23520_7C8E_11DE_8A39_0800200C9A66
-
-#if defined(_MSC_VER) || (defined(__GNUC__) && (__GNUC__ == 3 && __GNUC_MINOR__ >= 4) || (__GNUC__ >= 4)) // GCC supports "pragma once" correctly since 3.4
-#pragma once
-#endif
-
-
-#include <memory>
-#include <vector>
-#include "yaml-cpp/noncopyable.h"
-
-namespace YAML
-{
- class SettingChangeBase;
-
- template <typename T>
- class Setting
- {
- public:
- Setting(): m_value() {}
-
- const T get() const { return m_value; }
- std::auto_ptr <SettingChangeBase> set(const T& value);
- void restore(const Setting<T>& oldSetting) {
- m_value = oldSetting.get();
- }
-
- private:
- T m_value;
- };
-
- class SettingChangeBase
- {
- public:
- virtual ~SettingChangeBase() {}
- virtual void pop() = 0;
- };
-
- template <typename T>
- class SettingChange: public SettingChangeBase
- {
- public:
- SettingChange(Setting<T> *pSetting): m_pCurSetting(pSetting) {
- // copy old setting to save its state
- m_oldSetting = *pSetting;
- }
-
- virtual void pop() {
- m_pCurSetting->restore(m_oldSetting);
- }
-
- private:
- Setting<T> *m_pCurSetting;
- Setting<T> m_oldSetting;
- };
-
- template <typename T>
- inline std::auto_ptr <SettingChangeBase> Setting<T>::set(const T& value) {
- std::auto_ptr <SettingChangeBase> pChange(new SettingChange<T> (this));
- m_value = value;
- return pChange;
- }
-
- class SettingChanges: private noncopyable
- {
- public:
- SettingChanges() {}
- ~SettingChanges() { clear(); }
-
- void clear() {
- restore();
-
- for(setting_changes::const_iterator it=m_settingChanges.begin();it!=m_settingChanges.end();++it)
- delete *it;
- m_settingChanges.clear();
- }
-
- void restore() {
- for(setting_changes::const_iterator it=m_settingChanges.begin();it!=m_settingChanges.end();++it)
- (*it)->pop();
- }
-
- void push(std::auto_ptr <SettingChangeBase> pSettingChange) {
- m_settingChanges.push_back(pSettingChange.release());
- }
-
- // like std::auto_ptr - assignment is transfer of ownership
- SettingChanges& operator = (SettingChanges& rhs) {
- if(this == &rhs)
- return *this;
-
- clear();
- m_settingChanges = rhs.m_settingChanges;
- rhs.m_settingChanges.clear();
- return *this;
- }
-
- private:
- typedef std::vector <SettingChangeBase *> setting_changes;
- setting_changes m_settingChanges;
- };
-}
-
-#endif // SETTING_H_62B23520_7C8E_11DE_8A39_0800200C9A66
diff --git a/ext/src/yaml-cpp/simplekey.cpp b/ext/src/yaml-cpp/simplekey.cpp
deleted file mode 100644
index 857a9e0..0000000
--- a/ext/src/yaml-cpp/simplekey.cpp
+++ /dev/null
@@ -1,139 +0,0 @@
-#include "scanner.h"
-#include "token.h"
-#include "yaml-cpp/exceptions.h"
-#include "exp.h"
-
-namespace YAML
-{
- Scanner::SimpleKey::SimpleKey(const Mark& mark_, int flowLevel_)
- : mark(mark_), flowLevel(flowLevel_), pIndent(0), pMapStart(0), pKey(0)
- {
- }
-
- void Scanner::SimpleKey::Validate()
- {
- // Note: pIndent will *not* be garbage here;
- // we "garbage collect" them so we can
- // always refer to them
- if(pIndent)
- pIndent->status = IndentMarker::VALID;
- if(pMapStart)
- pMapStart->status = Token::VALID;
- if(pKey)
- pKey->status = Token::VALID;
- }
-
- void Scanner::SimpleKey::Invalidate()
- {
- if(pIndent)
- pIndent->status = IndentMarker::INVALID;
- if(pMapStart)
- pMapStart->status = Token::INVALID;
- if(pKey)
- pKey->status = Token::INVALID;
- }
-
- // CanInsertPotentialSimpleKey
- bool Scanner::CanInsertPotentialSimpleKey() const
- {
- if(!m_simpleKeyAllowed)
- return false;
-
- return !ExistsActiveSimpleKey();
- }
-
- // ExistsActiveSimpleKey
- // . Returns true if there's a potential simple key at our flow level
- // (there's allowed at most one per flow level, i.e., at the start of the flow start token)
- bool Scanner::ExistsActiveSimpleKey() const
- {
- if(m_simpleKeys.empty())
- return false;
-
- const SimpleKey& key = m_simpleKeys.top();
- return key.flowLevel == GetFlowLevel();
- }
-
- // InsertPotentialSimpleKey
- // . If we can, add a potential simple key to the queue,
- // and save it on a stack.
- void Scanner::InsertPotentialSimpleKey()
- {
- if(!CanInsertPotentialSimpleKey())
- return;
-
- SimpleKey key(INPUT.mark(), GetFlowLevel());
-
- // first add a map start, if necessary
- if(InBlockContext()) {
- key.pIndent = PushIndentTo(INPUT.column(), IndentMarker::MAP);
- if(key.pIndent) {
- key.pIndent->status = IndentMarker::UNKNOWN;
- key.pMapStart = key.pIndent->pStartToken;
- key.pMapStart->status = Token::UNVERIFIED;
- }
- }
-
- // then add the (now unverified) key
- m_tokens.push(Token(Token::KEY, INPUT.mark()));
- key.pKey = &m_tokens.back();
- key.pKey->status = Token::UNVERIFIED;
-
- m_simpleKeys.push(key);
- }
-
- // InvalidateSimpleKey
- // . Automatically invalidate the simple key in our flow level
- void Scanner::InvalidateSimpleKey()
- {
- if(m_simpleKeys.empty())
- return;
-
- // grab top key
- SimpleKey& key = m_simpleKeys.top();
- if(key.flowLevel != GetFlowLevel())
- return;
-
- key.Invalidate();
- m_simpleKeys.pop();
- }
-
- // VerifySimpleKey
- // . Determines whether the latest simple key to be added is valid,
- // and if so, makes it valid.
- bool Scanner::VerifySimpleKey()
- {
- if(m_simpleKeys.empty())
- return false;
-
- // grab top key
- SimpleKey key = m_simpleKeys.top();
-
- // only validate if we're in the correct flow level
- if(key.flowLevel != GetFlowLevel())
- return false;
-
- m_simpleKeys.pop();
-
- bool isValid = true;
-
- // needs to be less than 1024 characters and inline
- if(INPUT.line() != key.mark.line || INPUT.pos() - key.mark.pos > 1024)
- isValid = false;
-
- // invalidate key
- if(isValid)
- key.Validate();
- else
- key.Invalidate();
-
- return isValid;
- }
-
- void Scanner::PopAllSimpleKeys()
- {
- while(!m_simpleKeys.empty())
- m_simpleKeys.pop();
- }
-}
-
diff --git a/ext/src/yaml-cpp/singledocparser.cpp b/ext/src/yaml-cpp/singledocparser.cpp
deleted file mode 100644
index 0431b4b..0000000
--- a/ext/src/yaml-cpp/singledocparser.cpp
+++ /dev/null
@@ -1,387 +0,0 @@
-#include "singledocparser.h"
-#include "collectionstack.h"
-#include "directives.h"
-#include "yaml-cpp/eventhandler.h"
-#include "yaml-cpp/exceptions.h"
-#include "scanner.h"
-#include "tag.h"
-#include "token.h"
-#include <sstream>
-#include <cstdio>
-#include <algorithm>
-
-namespace YAML
-{
- SingleDocParser::SingleDocParser(Scanner& scanner, const Directives& directives): m_scanner(scanner), m_directives(directives), m_pCollectionStack(new CollectionStack), m_curAnchor(0)
- {
- }
-
- SingleDocParser::~SingleDocParser()
- {
- }
-
- // HandleDocument
- // . Handles the next document
- // . Throws a ParserException on error.
- void SingleDocParser::HandleDocument(EventHandler& eventHandler)
- {
- assert(!m_scanner.empty()); // guaranteed that there are tokens
- assert(!m_curAnchor);
-
- eventHandler.OnDocumentStart(m_scanner.peek().mark);
-
- // eat doc start
- if(m_scanner.peek().type == Token::DOC_START)
- m_scanner.pop();
-
- // recurse!
- HandleNode(eventHandler);
-
- eventHandler.OnDocumentEnd();
-
- // and finally eat any doc ends we see
- while(!m_scanner.empty() && m_scanner.peek().type == Token::DOC_END)
- m_scanner.pop();
- }
-
- void SingleDocParser::HandleNode(EventHandler& eventHandler)
- {
- // an empty node *is* a possibility
- if(m_scanner.empty()) {
- eventHandler.OnNull(m_scanner.mark(), NullAnchor);
- return;
- }
-
- // save location
- Mark mark = m_scanner.peek().mark;
-
- // special case: a value node by itself must be a map, with no header
- if(m_scanner.peek().type == Token::VALUE) {
- eventHandler.OnMapStart(mark, "?", NullAnchor);
- HandleMap(eventHandler);
- eventHandler.OnMapEnd();
- return;
- }
-
- // special case: an alias node
- if(m_scanner.peek().type == Token::ALIAS) {
- eventHandler.OnAlias(mark, LookupAnchor(mark, m_scanner.peek().value));
- m_scanner.pop();
- return;
- }
-
- std::string tag;
- anchor_t anchor;
- ParseProperties(tag, anchor);
-
- const Token& token = m_scanner.peek();
-
- // add non-specific tags
- if(tag.empty())
- tag = (token.type == Token::NON_PLAIN_SCALAR ? "!" : "?");
-
- // now split based on what kind of node we should be
- switch(token.type) {
- case Token::PLAIN_SCALAR:
- case Token::NON_PLAIN_SCALAR:
- eventHandler.OnScalar(mark, tag, anchor, token.value);
- m_scanner.pop();
- return;
- case Token::FLOW_SEQ_START:
- case Token::BLOCK_SEQ_START:
- eventHandler.OnSequenceStart(mark, tag, anchor);
- HandleSequence(eventHandler);
- eventHandler.OnSequenceEnd();
- return;
- case Token::FLOW_MAP_START:
- case Token::BLOCK_MAP_START:
- eventHandler.OnMapStart(mark, tag, anchor);
- HandleMap(eventHandler);
- eventHandler.OnMapEnd();
- return;
- case Token::KEY:
- // compact maps can only go in a flow sequence
- if(m_pCollectionStack->GetCurCollectionType() == CollectionType::FlowSeq) {
- eventHandler.OnMapStart(mark, tag, anchor);
- HandleMap(eventHandler);
- eventHandler.OnMapEnd();
- return;
- }
- break;
- default:
- break;
- }
-
- if(tag == "?")
- eventHandler.OnNull(mark, anchor);
- else
- eventHandler.OnScalar(mark, tag, anchor, "");
- }
-
- void SingleDocParser::HandleSequence(EventHandler& eventHandler)
- {
- // split based on start token
- switch(m_scanner.peek().type) {
- case Token::BLOCK_SEQ_START: HandleBlockSequence(eventHandler); break;
- case Token::FLOW_SEQ_START: HandleFlowSequence(eventHandler); break;
- default: break;
- }
- }
-
- void SingleDocParser::HandleBlockSequence(EventHandler& eventHandler)
- {
- // eat start token
- m_scanner.pop();
- m_pCollectionStack->PushCollectionType(CollectionType::BlockSeq);
-
- while(1) {
- if(m_scanner.empty())
- throw ParserException(m_scanner.mark(), ErrorMsg::END_OF_SEQ);
-
- Token token = m_scanner.peek();
- if(token.type != Token::BLOCK_ENTRY && token.type != Token::BLOCK_SEQ_END)
- throw ParserException(token.mark, ErrorMsg::END_OF_SEQ);
-
- m_scanner.pop();
- if(token.type == Token::BLOCK_SEQ_END)
- break;
-
- // check for null
- if(!m_scanner.empty()) {
- const Token& token = m_scanner.peek();
- if(token.type == Token::BLOCK_ENTRY || token.type == Token::BLOCK_SEQ_END) {
- eventHandler.OnNull(token.mark, NullAnchor);
- continue;
- }
- }
-
- HandleNode(eventHandler);
- }
-
- m_pCollectionStack->PopCollectionType(CollectionType::BlockSeq);
- }
-
- void SingleDocParser::HandleFlowSequence(EventHandler& eventHandler)
- {
- // eat start token
- m_scanner.pop();
- m_pCollectionStack->PushCollectionType(CollectionType::FlowSeq);
-
- while(1) {
- if(m_scanner.empty())
- throw ParserException(m_scanner.mark(), ErrorMsg::END_OF_SEQ_FLOW);
-
- // first check for end
- if(m_scanner.peek().type == Token::FLOW_SEQ_END) {
- m_scanner.pop();
- break;
- }
-
- // then read the node
- HandleNode(eventHandler);
-
- if(m_scanner.empty())
- throw ParserException(m_scanner.mark(), ErrorMsg::END_OF_SEQ_FLOW);
-
- // now eat the separator (or could be a sequence end, which we ignore - but if it's neither, then it's a bad node)
- Token& token = m_scanner.peek();
- if(token.type == Token::FLOW_ENTRY)
- m_scanner.pop();
- else if(token.type != Token::FLOW_SEQ_END)
- throw ParserException(token.mark, ErrorMsg::END_OF_SEQ_FLOW);
- }
-
- m_pCollectionStack->PopCollectionType(CollectionType::FlowSeq);
- }
-
- void SingleDocParser::HandleMap(EventHandler& eventHandler)
- {
- // split based on start token
- switch(m_scanner.peek().type) {
- case Token::BLOCK_MAP_START: HandleBlockMap(eventHandler); break;
- case Token::FLOW_MAP_START: HandleFlowMap(eventHandler); break;
- case Token::KEY: HandleCompactMap(eventHandler); break;
- case Token::VALUE: HandleCompactMapWithNoKey(eventHandler); break;
- default: break;
- }
- }
-
- void SingleDocParser::HandleBlockMap(EventHandler& eventHandler)
- {
- // eat start token
- m_scanner.pop();
- m_pCollectionStack->PushCollectionType(CollectionType::BlockMap);
-
- while(1) {
- if(m_scanner.empty())
- throw ParserException(m_scanner.mark(), ErrorMsg::END_OF_MAP);
-
- Token token = m_scanner.peek();
- if(token.type != Token::KEY && token.type != Token::VALUE && token.type != Token::BLOCK_MAP_END)
- throw ParserException(token.mark, ErrorMsg::END_OF_MAP);
-
- if(token.type == Token::BLOCK_MAP_END) {
- m_scanner.pop();
- break;
- }
-
- // grab key (if non-null)
- if(token.type == Token::KEY) {
- m_scanner.pop();
- HandleNode(eventHandler);
- } else {
- eventHandler.OnNull(token.mark, NullAnchor);
- }
-
- // now grab value (optional)
- if(!m_scanner.empty() && m_scanner.peek().type == Token::VALUE) {
- m_scanner.pop();
- HandleNode(eventHandler);
- } else {
- eventHandler.OnNull(token.mark, NullAnchor);
- }
- }
-
- m_pCollectionStack->PopCollectionType(CollectionType::BlockMap);
- }
-
- void SingleDocParser::HandleFlowMap(EventHandler& eventHandler)
- {
- // eat start token
- m_scanner.pop();
- m_pCollectionStack->PushCollectionType(CollectionType::FlowMap);
-
- while(1) {
- if(m_scanner.empty())
- throw ParserException(m_scanner.mark(), ErrorMsg::END_OF_MAP_FLOW);
-
- Token& token = m_scanner.peek();
- // first check for end
- if(token.type == Token::FLOW_MAP_END) {
- m_scanner.pop();
- break;
- }
-
- // grab key (if non-null)
- if(token.type == Token::KEY) {
- m_scanner.pop();
- HandleNode(eventHandler);
- } else {
- eventHandler.OnNull(token.mark, NullAnchor);
- }
-
- // now grab value (optional)
- if(!m_scanner.empty() && m_scanner.peek().type == Token::VALUE) {
- m_scanner.pop();
- HandleNode(eventHandler);
- } else {
- eventHandler.OnNull(token.mark, NullAnchor);
- }
-
- if(m_scanner.empty())
- throw ParserException(m_scanner.mark(), ErrorMsg::END_OF_MAP_FLOW);
-
- // now eat the separator (or could be a map end, which we ignore - but if it's neither, then it's a bad node)
- Token& nextToken = m_scanner.peek();
- if(nextToken.type == Token::FLOW_ENTRY)
- m_scanner.pop();
- else if(nextToken.type != Token::FLOW_MAP_END)
- throw ParserException(nextToken.mark, ErrorMsg::END_OF_MAP_FLOW);
- }
-
- m_pCollectionStack->PopCollectionType(CollectionType::FlowMap);
- }
-
- // . Single "key: value" pair in a flow sequence
- void SingleDocParser::HandleCompactMap(EventHandler& eventHandler)
- {
- m_pCollectionStack->PushCollectionType(CollectionType::CompactMap);
-
- // grab key
- Mark mark = m_scanner.peek().mark;
- m_scanner.pop();
- HandleNode(eventHandler);
-
- // now grab value (optional)
- if(!m_scanner.empty() && m_scanner.peek().type == Token::VALUE) {
- m_scanner.pop();
- HandleNode(eventHandler);
- } else {
- eventHandler.OnNull(mark, NullAnchor);
- }
-
- m_pCollectionStack->PopCollectionType(CollectionType::CompactMap);
- }
-
- // . Single ": value" pair in a flow sequence
- void SingleDocParser::HandleCompactMapWithNoKey(EventHandler& eventHandler)
- {
- m_pCollectionStack->PushCollectionType(CollectionType::CompactMap);
-
- // null key
- eventHandler.OnNull(m_scanner.peek().mark, NullAnchor);
-
- // grab value
- m_scanner.pop();
- HandleNode(eventHandler);
-
- m_pCollectionStack->PopCollectionType(CollectionType::CompactMap);
- }
-
- // ParseProperties
- // . Grabs any tag or anchor tokens and deals with them.
- void SingleDocParser::ParseProperties(std::string& tag, anchor_t& anchor)
- {
- tag.clear();
- anchor = NullAnchor;
-
- while(1) {
- if(m_scanner.empty())
- return;
-
- switch(m_scanner.peek().type) {
- case Token::TAG: ParseTag(tag); break;
- case Token::ANCHOR: ParseAnchor(anchor); break;
- default: return;
- }
- }
- }
-
- void SingleDocParser::ParseTag(std::string& tag)
- {
- Token& token = m_scanner.peek();
- if(!tag.empty())
- throw ParserException(token.mark, ErrorMsg::MULTIPLE_TAGS);
-
- Tag tagInfo(token);
- tag = tagInfo.Translate(m_directives);
- m_scanner.pop();
- }
-
- void SingleDocParser::ParseAnchor(anchor_t& anchor)
- {
- Token& token = m_scanner.peek();
- if(anchor)
- throw ParserException(token.mark, ErrorMsg::MULTIPLE_ANCHORS);
-
- anchor = RegisterAnchor(token.value);
- m_scanner.pop();
- }
-
- anchor_t SingleDocParser::RegisterAnchor(const std::string& name)
- {
- if(name.empty())
- return NullAnchor;
-
- return m_anchors[name] = ++m_curAnchor;
- }
-
- anchor_t SingleDocParser::LookupAnchor(const Mark& mark, const std::string& name) const
- {
- Anchors::const_iterator it = m_anchors.find(name);
- if(it == m_anchors.end())
- throw ParserException(mark, ErrorMsg::UNKNOWN_ANCHOR);
-
- return it->second;
- }
-}
diff --git a/ext/src/yaml-cpp/singledocparser.h b/ext/src/yaml-cpp/singledocparser.h
deleted file mode 100644
index 3798dcc..0000000
--- a/ext/src/yaml-cpp/singledocparser.h
+++ /dev/null
@@ -1,65 +0,0 @@
-#ifndef SINGLEDOCPARSER_H_62B23520_7C8E_11DE_8A39_0800200C9A66
-#define SINGLEDOCPARSER_H_62B23520_7C8E_11DE_8A39_0800200C9A66
-
-#if defined(_MSC_VER) || (defined(__GNUC__) && (__GNUC__ == 3 && __GNUC_MINOR__ >= 4) || (__GNUC__ >= 4)) // GCC supports "pragma once" correctly since 3.4
-#pragma once
-#endif
-
-
-#include "yaml-cpp/anchor.h"
-#include "yaml-cpp/noncopyable.h"
-#include <string>
-#include <map>
-#include <memory>
-
-namespace YAML
-{
- struct Directives;
- struct Mark;
- struct Token;
- class CollectionStack;
- class EventHandler;
- class Node;
- class Scanner;
-
- class SingleDocParser: private noncopyable
- {
- public:
- SingleDocParser(Scanner& scanner, const Directives& directives);
- ~SingleDocParser();
-
- void HandleDocument(EventHandler& eventHandler);
-
- private:
- void HandleNode(EventHandler& eventHandler);
-
- void HandleSequence(EventHandler& eventHandler);
- void HandleBlockSequence(EventHandler& eventHandler);
- void HandleFlowSequence(EventHandler& eventHandler);
-
- void HandleMap(EventHandler& eventHandler);
- void HandleBlockMap(EventHandler& eventHandler);
- void HandleFlowMap(EventHandler& eventHandler);
- void HandleCompactMap(EventHandler& eventHandler);
- void HandleCompactMapWithNoKey(EventHandler& eventHandler);
-
- void ParseProperties(std::string& tag, anchor_t& anchor);
- void ParseTag(std::string& tag);
- void ParseAnchor(anchor_t& anchor);
-
- anchor_t RegisterAnchor(const std::string& name);
- anchor_t LookupAnchor(const Mark& mark, const std::string& name) const;
-
- private:
- Scanner& m_scanner;
- const Directives& m_directives;
- std::auto_ptr<CollectionStack> m_pCollectionStack;
-
- typedef std::map<std::string, anchor_t> Anchors;
- Anchors m_anchors;
-
- anchor_t m_curAnchor;
- };
-}
-
-#endif // SINGLEDOCPARSER_H_62B23520_7C8E_11DE_8A39_0800200C9A66
diff --git a/ext/src/yaml-cpp/stream.cpp b/ext/src/yaml-cpp/stream.cpp
deleted file mode 100644
index efab7e2..0000000
--- a/ext/src/yaml-cpp/stream.cpp
+++ /dev/null
@@ -1,447 +0,0 @@
-#include "stream.h"
-#include <iostream>
-#include "exp.h"
-
-#ifndef YAML_PREFETCH_SIZE
-#define YAML_PREFETCH_SIZE 2048
-#endif
-
-#define S_ARRAY_SIZE( A ) (sizeof(A)/sizeof(*(A)))
-#define S_ARRAY_END( A ) ((A) + S_ARRAY_SIZE(A))
-
-#define CP_REPLACEMENT_CHARACTER (0xFFFD)
-
-namespace YAML
-{
- enum UtfIntroState {
- uis_start,
- uis_utfbe_b1,
- uis_utf32be_b2,
- uis_utf32be_bom3,
- uis_utf32be,
- uis_utf16be,
- uis_utf16be_bom1,
- uis_utfle_bom1,
- uis_utf16le_bom2,
- uis_utf32le_bom3,
- uis_utf16le,
- uis_utf32le,
- uis_utf8_imp,
- uis_utf16le_imp,
- uis_utf32le_imp3,
- uis_utf8_bom1,
- uis_utf8_bom2,
- uis_utf8,
- uis_error
- };
-
- enum UtfIntroCharType {
- uict00,
- uictBB,
- uictBF,
- uictEF,
- uictFE,
- uictFF,
- uictAscii,
- uictOther,
- uictMax
- };
-
- static bool s_introFinalState[] = {
- false, //uis_start
- false, //uis_utfbe_b1
- false, //uis_utf32be_b2
- false, //uis_utf32be_bom3
- true, //uis_utf32be
- true, //uis_utf16be
- false, //uis_utf16be_bom1
- false, //uis_utfle_bom1
- false, //uis_utf16le_bom2
- false, //uis_utf32le_bom3
- true, //uis_utf16le
- true, //uis_utf32le
- false, //uis_utf8_imp
- false, //uis_utf16le_imp
- false, //uis_utf32le_imp3
- false, //uis_utf8_bom1
- false, //uis_utf8_bom2
- true, //uis_utf8
- true, //uis_error
- };
-
- static UtfIntroState s_introTransitions[][uictMax] = {
- // uict00, uictBB, uictBF, uictEF, uictFE, uictFF, uictAscii, uictOther
- {uis_utfbe_b1, uis_utf8, uis_utf8, uis_utf8_bom1, uis_utf16be_bom1, uis_utfle_bom1, uis_utf8_imp, uis_utf8},
- {uis_utf32be_b2, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf16be, uis_utf8},
- {uis_utf32be, uis_utf8, uis_utf8, uis_utf8, uis_utf32be_bom3, uis_utf8, uis_utf8, uis_utf8},
- {uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf32be, uis_utf8, uis_utf8},
- {uis_utf32be, uis_utf32be, uis_utf32be, uis_utf32be, uis_utf32be, uis_utf32be, uis_utf32be, uis_utf32be},
- {uis_utf16be, uis_utf16be, uis_utf16be, uis_utf16be, uis_utf16be, uis_utf16be, uis_utf16be, uis_utf16be},
- {uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf16be, uis_utf8, uis_utf8},
- {uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf16le_bom2, uis_utf8, uis_utf8, uis_utf8},
- {uis_utf32le_bom3, uis_utf16le, uis_utf16le, uis_utf16le, uis_utf16le, uis_utf16le, uis_utf16le, uis_utf16le},
- {uis_utf32le, uis_utf16le, uis_utf16le, uis_utf16le, uis_utf16le, uis_utf16le, uis_utf16le, uis_utf16le},
- {uis_utf16le, uis_utf16le, uis_utf16le, uis_utf16le, uis_utf16le, uis_utf16le, uis_utf16le, uis_utf16le},
- {uis_utf32le, uis_utf32le, uis_utf32le, uis_utf32le, uis_utf32le, uis_utf32le, uis_utf32le, uis_utf32le},
- {uis_utf16le_imp, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8},
- {uis_utf32le_imp3, uis_utf16le, uis_utf16le, uis_utf16le, uis_utf16le, uis_utf16le, uis_utf16le, uis_utf16le},
- {uis_utf32le, uis_utf16le, uis_utf16le, uis_utf16le, uis_utf16le, uis_utf16le, uis_utf16le, uis_utf16le},
- {uis_utf8, uis_utf8_bom2, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8},
- {uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8},
- {uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8},
- };
-
- static char s_introUngetCount[][uictMax] = {
- // uict00, uictBB, uictBF, uictEF, uictFE, uictFF, uictAscii, uictOther
- {0, 1, 1, 0, 0, 0, 0, 1},
- {0, 2, 2, 2, 2, 2, 2, 2},
- {3, 3, 3, 3, 0, 3, 3, 3},
- {4, 4, 4, 4, 4, 0, 4, 4},
- {1, 1, 1, 1, 1, 1, 1, 1},
- {1, 1, 1, 1, 1, 1, 1, 1},
- {2, 2, 2, 2, 2, 0, 2, 2},
- {2, 2, 2, 2, 0, 2, 2, 2},
- {0, 1, 1, 1, 1, 1, 1, 1},
- {0, 2, 2, 2, 2, 2, 2, 2},
- {1, 1, 1, 1, 1, 1, 1, 1},
- {1, 1, 1, 1, 1, 1, 1, 1},
- {0, 2, 2, 2, 2, 2, 2, 2},
- {0, 3, 3, 3, 3, 3, 3, 3},
- {4, 4, 4, 4, 4, 4, 4, 4},
- {2, 0, 2, 2, 2, 2, 2, 2},
- {3, 3, 0, 3, 3, 3, 3, 3},
- {1, 1, 1, 1, 1, 1, 1, 1},
- };
-
- inline UtfIntroCharType IntroCharTypeOf(std::istream::int_type ch)
- {
- if (std::istream::traits_type::eof() == ch) {
- return uictOther;
- }
-
- switch (ch) {
- case 0: return uict00;
- case 0xBB: return uictBB;
- case 0xBF: return uictBF;
- case 0xEF: return uictEF;
- case 0xFE: return uictFE;
- case 0xFF: return uictFF;
- }
-
- if ((ch > 0) && (ch < 0xFF)) {
- return uictAscii;
- }
-
- return uictOther;
- }
-
- inline char Utf8Adjust(unsigned long ch, unsigned char lead_bits, unsigned char rshift)
- {
- const unsigned char header = ((1 << lead_bits) - 1) << (8 - lead_bits);
- const unsigned char mask = (0xFF >> (lead_bits + 1));
- return static_cast<char>(static_cast<unsigned char>(
- header | ((ch >> rshift) & mask)
- ));
- }
-
- inline void QueueUnicodeCodepoint(std::deque<char>& q, unsigned long ch)
- {
- // We are not allowed to queue the Stream::eof() codepoint, so
- // replace it with CP_REPLACEMENT_CHARACTER
- if (static_cast<unsigned long>(Stream::eof()) == ch)
- {
- ch = CP_REPLACEMENT_CHARACTER;
- }
-
- if (ch < 0x80)
- {
- q.push_back(Utf8Adjust(ch, 0, 0));
- }
- else if (ch < 0x800)
- {
- q.push_back(Utf8Adjust(ch, 2, 6));
- q.push_back(Utf8Adjust(ch, 1, 0));
- }
- else if (ch < 0x10000)
- {
- q.push_back(Utf8Adjust(ch, 3, 12));
- q.push_back(Utf8Adjust(ch, 1, 6));
- q.push_back(Utf8Adjust(ch, 1, 0));
- }
- else
- {
- q.push_back(Utf8Adjust(ch, 4, 18));
- q.push_back(Utf8Adjust(ch, 1, 12));
- q.push_back(Utf8Adjust(ch, 1, 6));
- q.push_back(Utf8Adjust(ch, 1, 0));
- }
- }
-
- Stream::Stream(std::istream& input)
- : m_input(input),
- m_pPrefetched(new unsigned char[YAML_PREFETCH_SIZE]),
- m_nPrefetchedAvailable(0), m_nPrefetchedUsed(0)
- {
- typedef std::istream::traits_type char_traits;
-
- if(!input)
- return;
-
- // Determine (or guess) the character-set by reading the BOM, if any. See
- // the YAML specification for the determination algorithm.
- char_traits::int_type intro[4];
- int nIntroUsed = 0;
- UtfIntroState state = uis_start;
- for(; !s_introFinalState[state]; ) {
- std::istream::int_type ch = input.get();
- intro[nIntroUsed++] = ch;
- UtfIntroCharType charType = IntroCharTypeOf(ch);
- UtfIntroState newState = s_introTransitions[state][charType];
- int nUngets = s_introUngetCount[state][charType];
- if(nUngets > 0) {
- input.clear();
- for(; nUngets > 0; --nUngets) {
- if(char_traits::eof() != intro[--nIntroUsed])
- input.putback(char_traits::to_char_type(intro[nIntroUsed]));
- }
- }
- state = newState;
- }
-
- switch (state) {
- case uis_utf8: m_charSet = utf8; break;
- case uis_utf16le: m_charSet = utf16le; break;
- case uis_utf16be: m_charSet = utf16be; break;
- case uis_utf32le: m_charSet = utf32le; break;
- case uis_utf32be: m_charSet = utf32be; break;
- default: m_charSet = utf8; break;
- }
-
- ReadAheadTo(0);
- }
-
- Stream::~Stream()
- {
- delete[] m_pPrefetched;
- }
-
- char Stream::peek() const
- {
- if (m_readahead.empty())
- {
- return Stream::eof();
- }
-
- return m_readahead[0];
- }
-
- Stream::operator bool() const
- {
- return m_input.good() || (!m_readahead.empty() && m_readahead[0] != Stream::eof());
- }
-
- // get
- // . Extracts a character from the stream and updates our position
- char Stream::get()
- {
- char ch = peek();
- AdvanceCurrent();
- m_mark.column++;
-
- if(ch == '\n') {
- m_mark.column = 0;
- m_mark.line++;
- }
-
- return ch;
- }
-
- // get
- // . Extracts 'n' characters from the stream and updates our position
- std::string Stream::get(int n)
- {
- std::string ret;
- ret.reserve(n);
- for(int i=0;i<n;i++)
- ret += get();
- return ret;
- }
-
- // eat
- // . Eats 'n' characters and updates our position.
- void Stream::eat(int n)
- {
- for(int i=0;i<n;i++)
- get();
- }
-
- void Stream::AdvanceCurrent()
- {
- if (!m_readahead.empty())
- {
- m_readahead.pop_front();
- m_mark.pos++;
- }
-
- ReadAheadTo(0);
- }
-
- bool Stream::_ReadAheadTo(size_t i) const
- {
- while (m_input.good() && (m_readahead.size() <= i))
- {
- switch (m_charSet)
- {
- case utf8: StreamInUtf8(); break;
- case utf16le: StreamInUtf16(); break;
- case utf16be: StreamInUtf16(); break;
- case utf32le: StreamInUtf32(); break;
- case utf32be: StreamInUtf32(); break;
- }
- }
-
- // signal end of stream
- if(!m_input.good())
- m_readahead.push_back(Stream::eof());
-
- return m_readahead.size() > i;
- }
-
- void Stream::StreamInUtf8() const
- {
- unsigned char b = GetNextByte();
- if (m_input.good())
- {
- m_readahead.push_back(b);
- }
- }
-
- void Stream::StreamInUtf16() const
- {
- unsigned long ch = 0;
- unsigned char bytes[2];
- int nBigEnd = (m_charSet == utf16be) ? 0 : 1;
-
- bytes[0] = GetNextByte();
- bytes[1] = GetNextByte();
- if (!m_input.good())
- {
- return;
- }
- ch = (static_cast<unsigned long>(bytes[nBigEnd]) << 8) |
- static_cast<unsigned long>(bytes[1 ^ nBigEnd]);
-
- if (ch >= 0xDC00 && ch < 0xE000)
- {
- // Trailing (low) surrogate...ugh, wrong order
- QueueUnicodeCodepoint(m_readahead, CP_REPLACEMENT_CHARACTER);
- return;
- }
- else if (ch >= 0xD800 && ch < 0xDC00)
- {
- // ch is a leading (high) surrogate
-
- // Four byte UTF-8 code point
-
- // Read the trailing (low) surrogate
- for (;;)
- {
- bytes[0] = GetNextByte();
- bytes[1] = GetNextByte();
- if (!m_input.good())
- {
- QueueUnicodeCodepoint(m_readahead, CP_REPLACEMENT_CHARACTER);
- return;
- }
- unsigned long chLow = (static_cast<unsigned long>(bytes[nBigEnd]) << 8) |
- static_cast<unsigned long>(bytes[1 ^ nBigEnd]);
- if (chLow < 0xDC00 || ch >= 0xE000)
- {
- // Trouble...not a low surrogate. Dump a REPLACEMENT CHARACTER into the stream.
- QueueUnicodeCodepoint(m_readahead, CP_REPLACEMENT_CHARACTER);
-
- // Deal with the next UTF-16 unit
- if (chLow < 0xD800 || ch >= 0xE000)
- {
- // Easiest case: queue the codepoint and return
- QueueUnicodeCodepoint(m_readahead, ch);
- return;
- }
- else
- {
- // Start the loop over with the new high surrogate
- ch = chLow;
- continue;
- }
- }
-
- // Select the payload bits from the high surrogate
- ch &= 0x3FF;
- ch <<= 10;
-
- // Include bits from low surrogate
- ch |= (chLow & 0x3FF);
-
- // Add the surrogacy offset
- ch += 0x10000;
- }
- }
-
- QueueUnicodeCodepoint(m_readahead, ch);
- }
-
- inline char* ReadBuffer(unsigned char* pBuffer)
- {
- return reinterpret_cast<char*>(pBuffer);
- }
-
- unsigned char Stream::GetNextByte() const
- {
- if (m_nPrefetchedUsed >= m_nPrefetchedAvailable)
- {
- std::streambuf *pBuf = m_input.rdbuf();
- m_nPrefetchedAvailable = static_cast<std::size_t>(pBuf->sgetn(ReadBuffer(m_pPrefetched), YAML_PREFETCH_SIZE));
- m_nPrefetchedUsed = 0;
- if (!m_nPrefetchedAvailable)
- {
- m_input.setstate(std::ios_base::eofbit);
- }
-
- if (0 == m_nPrefetchedAvailable)
- {
- return 0;
- }
- }
-
- return m_pPrefetched[m_nPrefetchedUsed++];
- }
-
- void Stream::StreamInUtf32() const
- {
- static int indexes[2][4] = {
- {3, 2, 1, 0},
- {0, 1, 2, 3}
- };
-
- unsigned long ch = 0;
- unsigned char bytes[4];
- int* pIndexes = (m_charSet == utf32be) ? indexes[1] : indexes[0];
-
- bytes[0] = GetNextByte();
- bytes[1] = GetNextByte();
- bytes[2] = GetNextByte();
- bytes[3] = GetNextByte();
- if (!m_input.good())
- {
- return;
- }
-
- for (int i = 0; i < 4; ++i)
- {
- ch <<= 8;
- ch |= bytes[pIndexes[i]];
- }
-
- QueueUnicodeCodepoint(m_readahead, ch);
- }
-}
diff --git a/ext/src/yaml-cpp/stream.h b/ext/src/yaml-cpp/stream.h
deleted file mode 100644
index 87f48dc..0000000
--- a/ext/src/yaml-cpp/stream.h
+++ /dev/null
@@ -1,79 +0,0 @@
-#ifndef STREAM_H_62B23520_7C8E_11DE_8A39_0800200C9A66
-#define STREAM_H_62B23520_7C8E_11DE_8A39_0800200C9A66
-
-#if defined(_MSC_VER) || (defined(__GNUC__) && (__GNUC__ == 3 && __GNUC_MINOR__ >= 4) || (__GNUC__ >= 4)) // GCC supports "pragma once" correctly since 3.4
-#pragma once
-#endif
-
-
-#include "yaml-cpp/noncopyable.h"
-#include "yaml-cpp/mark.h"
-#include <cstddef>
-#include <deque>
-#include <ios>
-#include <iostream>
-#include <set>
-#include <string>
-
-namespace YAML
-{
- class Stream: private noncopyable
- {
- public:
- friend class StreamCharSource;
-
- Stream(std::istream& input);
- ~Stream();
-
- operator bool() const;
- bool operator !() const { return !static_cast <bool>(*this); }
-
- char peek() const;
- char get();
- std::string get(int n);
- void eat(int n = 1);
-
- static char eof() { return 0x04; }
-
- const Mark mark() const { return m_mark; }
- int pos() const { return m_mark.pos; }
- int line() const { return m_mark.line; }
- int column() const { return m_mark.column; }
- void ResetColumn() { m_mark.column = 0; }
-
- private:
- enum CharacterSet {utf8, utf16le, utf16be, utf32le, utf32be};
-
- std::istream& m_input;
- Mark m_mark;
-
- CharacterSet m_charSet;
- mutable std::deque<char> m_readahead;
- unsigned char* const m_pPrefetched;
- mutable size_t m_nPrefetchedAvailable;
- mutable size_t m_nPrefetchedUsed;
-
- void AdvanceCurrent();
- char CharAt(size_t i) const;
- bool ReadAheadTo(size_t i) const;
- bool _ReadAheadTo(size_t i) const;
- void StreamInUtf8() const;
- void StreamInUtf16() const;
- void StreamInUtf32() const;
- unsigned char GetNextByte() const;
- };
-
- // CharAt
- // . Unchecked access
- inline char Stream::CharAt(size_t i) const {
- return m_readahead[i];
- }
-
- inline bool Stream::ReadAheadTo(size_t i) const {
- if(m_readahead.size() > i)
- return true;
- return _ReadAheadTo(i);
- }
-}
-
-#endif // STREAM_H_62B23520_7C8E_11DE_8A39_0800200C9A66
diff --git a/ext/src/yaml-cpp/streamcharsource.h b/ext/src/yaml-cpp/streamcharsource.h
deleted file mode 100644
index 21fae4e..0000000
--- a/ext/src/yaml-cpp/streamcharsource.h
+++ /dev/null
@@ -1,48 +0,0 @@
-#ifndef STREAMCHARSOURCE_H_62B23520_7C8E_11DE_8A39_0800200C9A66
-#define STREAMCHARSOURCE_H_62B23520_7C8E_11DE_8A39_0800200C9A66
-
-#if defined(_MSC_VER) || (defined(__GNUC__) && (__GNUC__ == 3 && __GNUC_MINOR__ >= 4) || (__GNUC__ >= 4)) // GCC supports "pragma once" correctly since 3.4
-#pragma once
-#endif
-
-
-#include "yaml-cpp/noncopyable.h"
-#include <cstddef>
-
-namespace YAML
-{
- class StreamCharSource
- {
- public:
- StreamCharSource(const Stream& stream): m_offset(0), m_stream(stream) {}
- StreamCharSource(const StreamCharSource& source): m_offset(source.m_offset), m_stream(source.m_stream) {}
- ~StreamCharSource() {}
-
- operator bool() const;
- char operator [] (std::size_t i) const { return m_stream.CharAt(m_offset + i); }
- bool operator !() const { return !static_cast<bool>(*this); }
-
- const StreamCharSource operator + (int i) const;
-
- private:
- std::size_t m_offset;
- const Stream& m_stream;
-
- StreamCharSource& operator = (const StreamCharSource&); // non-assignable
- };
-
- inline StreamCharSource::operator bool() const {
- return m_stream.ReadAheadTo(m_offset);
- }
-
- inline const StreamCharSource StreamCharSource::operator + (int i) const {
- StreamCharSource source(*this);
- if(static_cast<int> (source.m_offset) + i >= 0)
- source.m_offset += i;
- else
- source.m_offset = 0;
- return source;
- }
-}
-
-#endif // STREAMCHARSOURCE_H_62B23520_7C8E_11DE_8A39_0800200C9A66
diff --git a/ext/src/yaml-cpp/stringsource.h b/ext/src/yaml-cpp/stringsource.h
deleted file mode 100644
index 21be3c9..0000000
--- a/ext/src/yaml-cpp/stringsource.h
+++ /dev/null
@@ -1,47 +0,0 @@
-#ifndef STRINGSOURCE_H_62B23520_7C8E_11DE_8A39_0800200C9A66
-#define STRINGSOURCE_H_62B23520_7C8E_11DE_8A39_0800200C9A66
-
-#if defined(_MSC_VER) || (defined(__GNUC__) && (__GNUC__ == 3 && __GNUC_MINOR__ >= 4) || (__GNUC__ >= 4)) // GCC supports "pragma once" correctly since 3.4
-#pragma once
-#endif
-
-
-#include <cstddef>
-
-namespace YAML
-{
- class StringCharSource
- {
- public:
- StringCharSource(const char *str, std::size_t size): m_str(str), m_size(size), m_offset(0) {}
-
- operator bool() const { return m_offset < m_size; }
- char operator [] (std::size_t i) const { return m_str[m_offset + i]; }
- bool operator !() const { return !static_cast<bool>(*this); }
-
- const StringCharSource operator + (int i) const {
- StringCharSource source(*this);
- if(static_cast<int> (source.m_offset) + i >= 0)
- source.m_offset += i;
- else
- source.m_offset = 0;
- return source;
- }
-
- StringCharSource& operator ++ () {
- ++m_offset;
- return *this;
- }
-
- StringCharSource& operator += (std::size_t offset) {
- m_offset += offset;
- return *this;
- }
- private:
- const char *m_str;
- std::size_t m_size;
- std::size_t m_offset;
- };
-}
-
-#endif // STRINGSOURCE_H_62B23520_7C8E_11DE_8A39_0800200C9A66
diff --git a/ext/src/yaml-cpp/tag.cpp b/ext/src/yaml-cpp/tag.cpp
deleted file mode 100644
index 82a4704..0000000
--- a/ext/src/yaml-cpp/tag.cpp
+++ /dev/null
@@ -1,52 +0,0 @@
-#include "tag.h"
-#include "directives.h"
-#include "token.h"
-#include <cassert>
-#include <stdexcept>
-
-namespace YAML
-{
- Tag::Tag(const Token& token): type(static_cast<TYPE>(token.data))
- {
- switch(type) {
- case VERBATIM:
- value = token.value;
- break;
- case PRIMARY_HANDLE:
- value = token.value;
- break;
- case SECONDARY_HANDLE:
- value = token.value;
- break;
- case NAMED_HANDLE:
- handle = token.value;
- value = token.params[0];
- break;
- case NON_SPECIFIC:
- break;
- default:
- assert(false);
- }
- }
-
- const std::string Tag::Translate(const Directives& directives)
- {
- switch(type) {
- case VERBATIM:
- return value;
- case PRIMARY_HANDLE:
- return directives.TranslateTagHandle("!") + value;
- case SECONDARY_HANDLE:
- return directives.TranslateTagHandle("!!") + value;
- case NAMED_HANDLE:
- return directives.TranslateTagHandle("!" + handle + "!") + value;
- case NON_SPECIFIC:
- // TODO:
- return "!";
- default:
- assert(false);
- }
- throw std::runtime_error("yaml-cpp: internal error, bad tag type");
- }
-}
-
diff --git a/ext/src/yaml-cpp/tag.h b/ext/src/yaml-cpp/tag.h
deleted file mode 100644
index 5f77548..0000000
--- a/ext/src/yaml-cpp/tag.h
+++ /dev/null
@@ -1,28 +0,0 @@
-#ifndef TAG_H_62B23520_7C8E_11DE_8A39_0800200C9A66
-#define TAG_H_62B23520_7C8E_11DE_8A39_0800200C9A66
-
-#if defined(_MSC_VER) || (defined(__GNUC__) && (__GNUC__ == 3 && __GNUC_MINOR__ >= 4) || (__GNUC__ >= 4)) // GCC supports "pragma once" correctly since 3.4
-#pragma once
-#endif
-
-#include <string>
-
-namespace YAML
-{
- struct Token;
- struct Directives;
-
- struct Tag {
- enum TYPE {
- VERBATIM, PRIMARY_HANDLE, SECONDARY_HANDLE, NAMED_HANDLE, NON_SPECIFIC
- };
-
- Tag(const Token& token);
- const std::string Translate(const Directives& directives);
-
- TYPE type;
- std::string handle, value;
- };
-}
-
-#endif // TAG_H_62B23520_7C8E_11DE_8A39_0800200C9A66
diff --git a/ext/src/yaml-cpp/token.h b/ext/src/yaml-cpp/token.h
deleted file mode 100644
index 9807e25..0000000
--- a/ext/src/yaml-cpp/token.h
+++ /dev/null
@@ -1,85 +0,0 @@
-#ifndef TOKEN_H_62B23520_7C8E_11DE_8A39_0800200C9A66
-#define TOKEN_H_62B23520_7C8E_11DE_8A39_0800200C9A66
-
-#if defined(_MSC_VER) || (defined(__GNUC__) && (__GNUC__ == 3 && __GNUC_MINOR__ >= 4) || (__GNUC__ >= 4)) // GCC supports "pragma once" correctly since 3.4
-#pragma once
-#endif
-
-
-#include "yaml-cpp/mark.h"
-#include <iostream>
-#include <string>
-#include <vector>
-
-namespace YAML
-{
- const std::string TokenNames[] = {
- "DIRECTIVE",
- "DOC_START",
- "DOC_END",
- "BLOCK_SEQ_START",
- "BLOCK_MAP_START",
- "BLOCK_SEQ_END",
- "BLOCK_MAP_END",
- "BLOCK_ENTRY",
- "FLOW_SEQ_START",
- "FLOW_MAP_START",
- "FLOW_SEQ_END",
- "FLOW_MAP_END",
- "FLOW_MAP_COMPACT",
- "FLOW_ENTRY",
- "KEY",
- "VALUE",
- "ANCHOR",
- "ALIAS",
- "TAG",
- "SCALAR"
- };
-
- struct Token {
- // enums
- enum STATUS { VALID, INVALID, UNVERIFIED };
- enum TYPE {
- DIRECTIVE,
- DOC_START,
- DOC_END,
- BLOCK_SEQ_START,
- BLOCK_MAP_START,
- BLOCK_SEQ_END,
- BLOCK_MAP_END,
- BLOCK_ENTRY,
- FLOW_SEQ_START,
- FLOW_MAP_START,
- FLOW_SEQ_END,
- FLOW_MAP_END,
- FLOW_MAP_COMPACT,
- FLOW_ENTRY,
- KEY,
- VALUE,
- ANCHOR,
- ALIAS,
- TAG,
- PLAIN_SCALAR,
- NON_PLAIN_SCALAR
- };
-
- // data
- Token(TYPE type_, const Mark& mark_): status(VALID), type(type_), mark(mark_), data(0) {}
-
- friend std::ostream& operator << (std::ostream& out, const Token& token) {
- out << TokenNames[token.type] << std::string(": ") << token.value;
- for(std::size_t i=0;i<token.params.size();i++)
- out << std::string(" ") << token.params[i];
- return out;
- }
-
- STATUS status;
- TYPE type;
- Mark mark;
- std::string value;
- std::vector <std::string> params;
- int data;
- };
-}
-
-#endif // TOKEN_H_62B23520_7C8E_11DE_8A39_0800200C9A66
diff --git a/manual.html b/manual.html
index cacb6a9..0ded83c 100644
--- a/manual.html
+++ b/manual.html
@@ -1,6 +1,6 @@
<html>
<head>
- <title>SPAdes 3.7.1 Manual</title>
+ <title>SPAdes 3.8.0 Manual</title>
<style type="text/css">
.code {
background-color: lightgray;
@@ -8,7 +8,7 @@
</style>
</head>
<body>
-<h1>SPAdes 3.7.1 Manual</h1>
+<h1>SPAdes 3.8.0 Manual</h1>
1. <a href="#sec1">About SPAdes</a><br>
1.1. <a href="#sec1.1">Supported data types</a><br>
@@ -25,8 +25,8 @@
3.3. <a href="#sec3.3">Assembling IonTorrent reads</a><br>
3.4. <a href="#sec3.4">Assembling long Illumina paired reads (2x150 and 2x250)</a><br>
3.5. <a href="#sec3.5">SPAdes output</a><br>
-
- 3.6. <a href="#sec3.6">Assembly evaluation</a><br>
+ 3.6. <a href="#sec3.6">plasmidSPAdes output</a><br>
+ 3.7. <a href="#sec3.7">Assembly evaluation</a><br>
4. <a href="#sec4">Citation</a><br>
5. <a href="#sec5">Feedback and bug reports</a><br>
<br>
@@ -35,16 +35,18 @@
<h2>1. About SPAdes</h2>
<p>
SPAdes – St. Petersburg genome assembler – is intended for both standard isolates and single-cell MDA bacteria assemblies. This manual will help you to install and run SPAdes.
-SPAdes version 3.7.1 was released under GPLv2 on March 8, 2016 and can be downloaded from <a href="http://bioinf.spbau.ru/en/spades" target="_blank">http://bioinf.spbau.ru/en/spades</a>.
+SPAdes version 3.8.0 was released under GPLv2 on March 8, 2016 and can be downloaded from <a href="http://bioinf.spbau.ru/en/spades" target="_blank">http://bioinf.spbau.ru/en/spades</a>.
<a name="sec1.1"></a>
<h3>1.1 Supported data types</h3>
<p>
The current version of SPAdes works with Illumina or IonTorrent reads and is capable of providing hybrid assemblies using PacBio, Oxford Nanopore and Sanger reads. You can also provide additional contigs that will be used as long reads.
<p>
- Version 3.7.1 of SPAdes supports paired-end reads, mate-pairs and unpaired reads. SPAdes can take as input several paired-end and mate-pair libraries simultaneously. Note, that SPAdes was initially designed for small genomes. It was tested on single-cell and standard bacterial and fungal data sets. SPAdes is not intended for larger genomes (e.g. mammalian size genomes). For such purposes you can use it at your own risk.
+ Version 3.8.0 of SPAdes supports paired-end reads, mate-pairs and unpaired reads. SPAdes can take as input several paired-end and mate-pair libraries simultaneously. Note that SPAdes was initially designed for small genomes. It was tested on single-cell and standard bacterial and fungal data sets. SPAdes is not intended for larger genomes (e.g. mammalian-size genomes); for such data sets you can use it at your own risk.
+<p>
+ SPAdes 3.8.0 includes metaSPAdes – a pipeline designed specially for metagenomic data sets. To learn more see <a href="#meta">options</a>.
<p>
- SPAdes 3.7.1 also includes metaSPAdes – a pipeline designed specially for metagenomic data sets. To learn more see <a href="#meta">options</a>.
+ Also, SPAdes 3.8.0 includes plasmidSPAdes – a pipeline designed for extracting and assembling plasmids from WGS data sets. To learn more see <a href="#plasmid">options</a>.
<p>
Additionally, SPAdes has separate modules for assembling highly polymorphic diploid genomes and for TruSeq barcode assembly. For more information, see the <a href="dipspades_manual.html" target="_blank">dipSPAdes manual</a> and the <a href="truspades_manual.html" target="_blank">truSPAdes manual</a>.
@@ -141,7 +143,7 @@ SPAdes comes in several separate modules:
<li> Running SPAdes without preliminary read error correction (e.g. without BayesHammer or IonHammer) will likely require more time and memory. </li>
<li> Each module removes its temporary files as soon as it finishes. </li>
<li> SPAdes uses 512 Mb per thread for buffers, which results in higher memory consumption. If you set memory limit manually, SPAdes will use smaller buffers and thus less RAM. </li>
- <li> Performance statistics is given for SPAdes version 3.7.1. </li>
+ <li> Performance statistics are given for SPAdes version 3.8.0. </li>
</ul>
@@ -155,13 +157,13 @@ SPAdes comes in several separate modules:
<h3>2.1 Downloading SPAdes Linux binaries</h3>
<p>
- To download <a href="http://spades.bioinf.spbau.ru/release3.7.1/SPAdes-3.7.1-Linux.tar.gz">SPAdes Linux binaries</a> and extract them, go to the directory in which you wish SPAdes to be installed and run:
+ To download <a href="http://spades.bioinf.spbau.ru/release3.8.0/SPAdes-3.8.0-Linux.tar.gz">SPAdes Linux binaries</a> and extract them, go to the directory in which you wish SPAdes to be installed and run:
<pre class="code">
<code>
- wget http://spades.bioinf.spbau.ru/release3.7.1/SPAdes-3.7.1-Linux.tar.gz
- tar -xzf SPAdes-3.7.1-Linux.tar.gz
- cd SPAdes-3.7.1-Linux/bin/
+ wget http://spades.bioinf.spbau.ru/release3.8.0/SPAdes-3.8.0-Linux.tar.gz
+ tar -xzf SPAdes-3.8.0-Linux.tar.gz
+ cd SPAdes-3.8.0-Linux/bin/
</code>
</pre>
@@ -187,13 +189,13 @@ SPAdes comes in several separate modules:
<h3>2.2 Downloading SPAdes binaries for Mac</h3>
<p>
- To obtain <a href="http://spades.bioinf.spbau.ru/release3.7.1/SPAdes-3.7.1-Darwin.tar.gz">SPAdes binaries for Mac</a>, go to the directory in which you wish SPAdes to be installed and run:
+ To obtain <a href="http://spades.bioinf.spbau.ru/release3.8.0/SPAdes-3.8.0-Darwin.tar.gz">SPAdes binaries for Mac</a>, go to the directory in which you wish SPAdes to be installed and run:
<pre class="code">
<code>
- curl http://spades.bioinf.spbau.ru/release3.7.1/SPAdes-3.7.1-Darwin.tar.gz -o SPAdes-3.7.1-Darwin.tar.gz
- tar -zxf SPAdes-3.7.1-Darwin.tar.gz
- cd SPAdes-3.7.1-Darwin/bin/
+ curl http://spades.bioinf.spbau.ru/release3.8.0/SPAdes-3.8.0-Darwin.tar.gz -o SPAdes-3.8.0-Darwin.tar.gz
+ tar -zxf SPAdes-3.8.0-Darwin.tar.gz
+ cd SPAdes-3.8.0-Darwin/bin/
</code>
</pre>
@@ -228,13 +230,13 @@ SPAdes comes in several separate modules:
</ul>
<p>
- If you meet these requirements, you can download the <a href="http://spades.bioinf.spbau.ru/release3.7.1/SPAdes-3.7.1.tar.gz">SPAdes source code</a>:
+ If you meet these requirements, you can download the <a href="http://spades.bioinf.spbau.ru/release3.8.0/SPAdes-3.8.0.tar.gz">SPAdes source code</a>:
<pre class="code">
<code>
- wget http://spades.bioinf.spbau.ru/release3.7.1/SPAdes-3.7.1.tar.gz
- tar -xzf SPAdes-3.7.1.tar.gz
- cd SPAdes-3.7.1
+ wget http://spades.bioinf.spbau.ru/release3.8.0/SPAdes-3.8.0.tar.gz
+ tar -xzf SPAdes-3.8.0.tar.gz
+ cd SPAdes-3.8.0
</code>
</pre>
@@ -338,7 +340,7 @@ Thank you for using SPAdes!
SPAdes takes as input paired-end reads, mate-pairs and single (unpaired) reads in FASTA and FASTQ. For IonTorrent data SPAdes also supports unpaired reads in unmapped BAM format (like the one produced by Torrent Server). However, in order to run read error correction, reads should be in FASTQ or BAM format. Sanger, Oxford Nanopore and PacBio CLR reads can be provided in both formats since SPAdes does not run error correction for these types of data.
<p>
- To run SPAdes 3.7.1 you need at least one library of the following types:
+ To run SPAdes 3.8.0 you need at least one library of the following types:
<ul>
<li>Illumina paired-end/high-quality mate-pairs/unpaired reads</li>
<li>IonTorrent paired-end/high-quality mate-pairs/unpaired reads</li>
@@ -447,6 +449,7 @@ Note that we assume that SPAdes installation directory is added to the <code>PAT
Specify the output directory. Required option.
</p>
+<a name="sc"></a>
<p>
<code>--sc </code><br>
This flag is required for MDA (single-cell) data.
@@ -454,8 +457,15 @@ Note that we assume that SPAdes installation directory is added to the <code>PAT
<a name="meta"></a>
<p>
- <code>--meta </code><br>
- This flag is required when assembling metagenomic data sets (runs metaSPAdes). Note, that metaSPAdes supports only a <b>single</b> paired-end library and does not support <a href="#correctoropt">careful mode</a> (mismatch correction is not available). In addition, you cannot specify coverage cutoff for metaSPAdes.
+ <code>--meta </code> (same as <code>metaspades.py</code>)<br>
+ This flag is recommended when assembling metagenomic data sets (runs metaSPAdes, see <a href="https://arxiv.org/abs/1604.03071">paper</a> for more details). Currently metaSPAdes supports only a <b>single</b> library which has to be <b>paired-end</b> (we hope to remove this restriction soon). It does not support <a href="#correctoropt">careful mode</a> (mismatch correction is not available). In addition, you cannot specify coverage cutoff for metaSPAdes. Note t [...]
+</p>
+
+<a name="plasmid"></a>
+<p>
+ <code>--plasmid </code> (same as <code>plasmidspades.py</code>)<br>
+ This flag is required when assembling only plasmids from WGS data sets (runs plasmidSPAdes, see <a href="http://biorxiv.org/content/early/2016/04/20/048942">paper</a> for the algorithm details). Note that plasmidSPAdes is not compatible with <a href="#meta">metaSPAdes</a> and <a href="#sc">single-cell mode</a>. Additionally, we do not recommend running plasmidSPAdes on more than one library.
+ See <a href="#sec3.6">section 3.6</a> for plasmidSPAdes output details.
</p>
<p>
@@ -831,7 +841,7 @@ and PacBio CCS and CLR reads:
<p>
<code>-k <int,int,...></code><br>
- Comma-separated list of k-mer sizes to be used (all values must be odd, less than 128 and listed in ascending order). If <code>--sc</code> is set the default value are 21,33,55. For multicell data sets K values are automatically selected using maximum read length (<a href="#sec3.4">see note for assembling long Illumina paired reads for details</a>). To properly select K values for IonTorrent data read <a href="#sec3.3">section 3.3</a>.
+ Comma-separated list of k-mer sizes to be used (all values must be odd, less than 128 and listed in ascending order). If <code>--sc</code> is set, the default values are 21,33,55. For multicell data sets K values are automatically selected using the maximum read length (<a href="#sec3.4">see the note on assembling long Illumina paired reads for details</a>). To properly select K values for IonTorrent data, read <a href="#sec3.3">section 3.3</a>.
</p>
<p>
@@ -1139,9 +1149,14 @@ The full list of <code><output_dir></code> content is presented below:
<p>
SPAdes will overwrite these files and directories if they exist in the specified <code><output_dir></code>.
-
<a name="sec3.6">
-<h3>3.6 Assembly evaluation</h3>
+<h3>3.6 plasmidSPAdes output</h3>
+<p>
+plasmidSPAdes outputs only DNA sequences from putative plasmids. Output file names and formats remain the same as in SPAdes (see the <a href="#sec3.5">previous</a> section), with the following difference: to all contig names in <code>contigs.fasta</code>, <code>scaffolds.fasta</code> and <code>assembly_graph.fastg</code>
+we append the suffix <code>_component_X</code>, where <code>X</code> is the id of the putative plasmid to which the contig belongs. Note that plasmidSPAdes may not be able to separate similar plasmids, so their contigs may appear with the same id.
+
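+<p>
+ For illustration only (this helper is not part of SPAdes, and the header layout shown in the comment is an assumption made for the example), a minimal Python sketch that groups contig names from <code>contigs.fasta</code> by this suffix:
+<pre class="code">
+<code>
+from collections import defaultdict
+
+def group_by_component(fasta_path):
+    # headers are assumed to look like
+    # >NODE_1_length_50000_cov_12.3_component_0
+    components = defaultdict(list)
+    with open(fasta_path) as f:
+        for line in f:
+            if line.startswith('>'):
+                name = line[1:].strip()
+                if '_component_' in name:
+                    components[name.rsplit('_component_', 1)[1]].append(name)
+    return components
+</code>
+</pre>
+<p>
+ Calling <code>group_by_component("contigs.fasta")</code> returns a dictionary mapping each putative plasmid id to the names of its contigs.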
+<a name="sec3.7">
+<h3>3.7 Assembly evaluation</h3>
<p>
@@ -1156,7 +1171,7 @@ The full list of <code><output_dir></code> content is presented below:
If you use SPAdes in your research, please include <a href="http://link.springer.com/chapter/10.1007%2F978-3-642-37195-0_13" target="_blank">Nurk, Bankevich et al., 2013</a> in your reference list. You may also add <a href="http://online.liebertpub.com/doi/abs/10.1089/cmb.2012.0021" target="_blank">Bankevich, Nurk et al., 2012</a> instead.
<p>
- If you use PacBio or Nanopore reads, you may also cite <a href="http://bioinformatics.oxfordjournals.org/content/early/2015/11/20/bioinformatics.btv688.short" target="_blank">Antipov et al., 2014</a>. If you use multiple paired-end and/or mate-pair libraries you may also cite papers describing SPAdes repeat resolution algorithms <a href="http://bioinformatics.oxfordjournals.org/content/30/12/i293.short" target="_blank">Prjibelski et al., 2014</a> and <a href="http://bioinformatics.o [...]
+ If you use PacBio or Nanopore reads, you may also cite <a href="http://bioinformatics.oxfordjournals.org/content/early/2015/11/20/bioinformatics.btv688.short" target="_blank">Antipov et al., 2015</a>. If you use multiple paired-end and/or mate-pair libraries you may also cite papers describing SPAdes repeat resolution algorithms <a href="http://bioinformatics.oxfordjournals.org/content/30/12/i293.short" target="_blank">Prjibelski et al., 2014</a> and <a href="http://bioinformatics.o [...]
<p>
For the information about dipSPAdes and truSPAdes papers see <a href="dipspades_manual.html" target="_blank">dipSPAdes manual</a> and <a href="truspades_manual.html" target="_blank">truSPAdes manual</a> respectively.
diff --git a/metaspades.py b/metaspades.py
new file mode 100755
index 0000000..d06205f
--- /dev/null
+++ b/metaspades.py
@@ -0,0 +1,951 @@
+#!/usr/bin/env python
+
+############################################################################
+# Copyright (c) 2015 Saint Petersburg State University
+# Copyright (c) 2011-2014 Saint Petersburg Academic University
+# All Rights Reserved
+# See file LICENSE for details.
+############################################################################
+
+import os
+import shutil
+from site import addsitedir
+from distutils import dir_util
+from os.path import abspath, expanduser
+import sys
+import getopt
+import logging
+import platform
+import errno
+
+import spades_init
+spades_init.init()
+spades_home = spades_init.spades_home
+bin_home = spades_init.bin_home
+python_modules_home = spades_init.python_modules_home
+ext_python_modules_home = spades_init.ext_python_modules_home
+spades_version = spades_init.spades_version
+
+import support
+support.check_python_version()
+
+from process_cfg import merge_configs, empty_config, load_config_from_file
+import hammer_logic
+import spades_logic
+import options_storage
+addsitedir(ext_python_modules_home)
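+# pick the PyYAML port matching the running Python major version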
+if sys.version.startswith('2.'):
+ import pyyaml2 as pyyaml
+elif sys.version.startswith('3.'):
+ import pyyaml3 as pyyaml
+
+import moleculo_postprocessing
+import alignment
+
+
+def print_used_values(cfg, log):
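+ # helper: pretty-print a single config value, deriving a human-readable label from the parameter name when none is given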
+ def print_value(cfg, section, param, pretty_param="", margin=" "):
+ if not pretty_param:
+ pretty_param = param.capitalize().replace('_', ' ')
+ line = margin + pretty_param
+ if param in cfg[section].__dict__:
+ line += ": " + str(cfg[section].__dict__[param])
+ else:
+ if param.find("offset") != -1:
+ line += " will be auto-detected"
+ log.info(line)
+
+ log.info("")
+
+ # system info
+ log.info("System information:")
+ try:
+ log.info(" SPAdes version: " + str(spades_version).strip())
+ log.info(" Python version: " + ".".join(map(str, sys.version_info[0:3])))
+ # for more details: '[' + str(sys.version_info) + ']'
+ log.info(" OS: " + platform.platform())
+ # for more details: '[' + str(platform.uname()) + ']'
+ except Exception:
+ log.info(" Problem occurred when getting system information")
+ log.info("")
+
+ # main
+ print_value(cfg, "common", "output_dir", "", "")
+ if ("error_correction" in cfg) and (not "assembly" in cfg):
+ log.info("Mode: ONLY read error correction (without assembling)")
+ elif (not "error_correction" in cfg) and ("assembly" in cfg):
+ log.info("Mode: ONLY assembling (without read error correction)")
+ else:
+ log.info("Mode: read error correction and assembling")
+ if ("common" in cfg) and ("developer_mode" in cfg["common"].__dict__):
+ if cfg["common"].developer_mode:
+ log.info("Debug mode is turned ON")
+ else:
+ log.info("Debug mode is turned OFF")
+ log.info("")
+
+ # dataset
+ if "dataset" in cfg:
+ log.info("Dataset parameters:")
+
+ if options_storage.iontorrent:
+ log.info(" IonTorrent data")
+
+ if options_storage.meta:
+ log.info(" Metagenomic mode")
+ elif options_storage.large_genome:
+ log.info(" Large genome mode")
+ elif options_storage.truseq_mode:
+ log.info(" Illumina TruSeq mode")
+ elif options_storage.rna:
+ log.info(" RNA-seq mode")
+ elif options_storage.single_cell:
+ log.info(" Single-cell mode")
+ else:
+ log.info(" Multi-cell mode (you should set '--sc' flag if input data"\
+ " was obtained with MDA (single-cell) technology"\
+ " or --meta flag if processing metagenomic dataset)")
+
+ log.info(" Reads:")
+ dataset_data = pyyaml.load(open(cfg["dataset"].yaml_filename, 'r'))
+ dataset_data = support.relative2abs_paths(dataset_data, os.path.dirname(cfg["dataset"].yaml_filename))
+ support.pretty_print_reads(dataset_data, log)
+
+ # error correction
+ if "error_correction" in cfg:
+ log.info("Read error correction parameters:")
+ print_value(cfg, "error_correction", "max_iterations", "Iterations")
+ print_value(cfg, "error_correction", "qvoffset", "PHRED offset")
+
+ if cfg["error_correction"].gzip_output:
+ log.info(" Corrected reads will be compressed (with gzip)")
+ else:
+ log.info(" Corrected reads will NOT be compressed (with gzip)")
+
+ # assembly
+ if "assembly" in cfg:
+ log.info("Assembly parameters:")
+ if options_storage.auto_K_allowed():
+ log.info(" k: automatic selection based on read length")
+ else:
+ print_value(cfg, "assembly", "iterative_K", "k")
+ if options_storage.plasmid:
+ log.info(" Plasmid mode is turned ON")
+ if cfg["assembly"].disable_rr:
+ log.info(" Repeat resolution is DISABLED")
+ else:
+ log.info(" Repeat resolution is enabled")
+ if options_storage.careful:
+ log.info(" Mismatch careful mode is turned ON")
+ else:
+ log.info(" Mismatch careful mode is turned OFF")
+ if "mismatch_corrector" in cfg:
+ log.info(" MismatchCorrector will be used")
+ else:
+ log.info(" MismatchCorrector will be SKIPPED")
+ if cfg["assembly"].cov_cutoff == 'off':
+ log.info(" Coverage cutoff is turned OFF")
+ elif cfg["assembly"].cov_cutoff == 'auto':
+ log.info(" Coverage cutoff is turned ON and threshold will be auto-detected")
+ else:
+ log.info(" Coverage cutoff is turned ON and threshold is " + str(cfg["assembly"].cov_cutoff))
+
+ log.info("Other parameters:")
+ print_value(cfg, "common", "tmp_dir", "Dir for temp files")
+ print_value(cfg, "common", "max_threads", "Threads")
+ print_value(cfg, "common", "max_memory", "Memory limit (in Gb)", " ")
+ log.info("")
+
+
+def fill_cfg(options_to_parse, log, secondary_filling=False):
+ skip_output_dir=secondary_filling
+ skip_stop_after = secondary_filling
+ load_processed_dataset=secondary_filling
+
+ try:
+ options, not_options = getopt.gnu_getopt(options_to_parse, options_storage.short_options, options_storage.long_options)
+ except getopt.GetoptError:
+ _, exc, _ = sys.exc_info()
+ sys.stderr.write(str(exc) + "\n")
+ sys.stderr.flush()
+ show_usage(1)
+
+ if not options:
+ show_usage(1)
+
+ if len(not_options) > 1:
+ for opt, arg in options:
+ if opt == "-k" and arg.strip().endswith(','):
+ support.error("Do not put spaces after commas in the list of k-mers sizes! Correct example: -k 21,33,55", log)
+ support.error("Please specify option (e.g. -1, -2, -s, etc) for the following paths: " + ", ".join(not_options[1:]) + "\n", log)
+
+ # all parameters are stored here
+ cfg = dict()
+ # dataset is stored here. We are prepared for up to MAX_LIBS_NUMBER libraries of each short-read type
+ dataset_data = [{} for i in range(options_storage.MAX_LIBS_NUMBER *
+ len(options_storage.SHORT_READS_TYPES.keys()) +
+ len(options_storage.LONG_READS_TYPES))] # "[{}]*num" doesn't work here!
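+ # Note: "[{}] * num" would create num references to the *same* dict, so filling one
+ # library slot would appear to fill them all; the comprehension above creates
+ # independent dicts. E.g. ([{}] * 2)[0]['x'] = 1 leaves both entries as {'x': 1}.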
+
+ # for parsing options from "previous run command"
+ options_storage.continue_mode = False
+ options_storage.k_mers = None
+
+ for opt, arg in options:
+ if opt == '-o':
+ if not skip_output_dir:
+ if options_storage.output_dir is not None:
+ support.error('-o option was specified at least twice')
+ options_storage.output_dir = abspath(expanduser(arg))
+ options_storage.dict_of_rel2abs[arg] = options_storage.output_dir
+ elif opt == "--tmp-dir":
+ options_storage.tmp_dir = abspath(expanduser(arg))
+ options_storage.dict_of_rel2abs[arg] = options_storage.tmp_dir
+ elif opt == "--configs-dir":
+ options_storage.configs_dir = support.check_dir_existence(arg)
+ elif opt == "--reference":
+ options_storage.reference = support.check_file_existence(arg, 'reference', log)
+ elif opt == "--dataset":
+ options_storage.dataset_yaml_filename = support.check_file_existence(arg, 'dataset', log)
+
+ elif opt in options_storage.reads_options:
+ support.add_to_dataset(opt, arg, dataset_data)
+
+ elif opt == '-k':
+ if arg == 'auto':
+ options_storage.k_mers = arg
+ else:
+ options_storage.k_mers = list(map(int, arg.split(",")))
+ for k in options_storage.k_mers:
+ if k < options_storage.MIN_K or k > options_storage.MAX_K:
+ support.error('wrong k value ' + str(k) + ': all k values should be between %d and %d' %
+ (options_storage.MIN_K, options_storage.MAX_K), log)
+ if k % 2 == 0:
+ support.error('wrong k value ' + str(k) + ': all k values should be odd', log)
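+ # e.g. "-k 21,33,55" yields k_mers == [21, 33, 55]; "-k auto" postpones the choice of k
+ # (see the iterative_K handling further below and auto_K_allowed() in options_storage)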
+
+ elif opt == "--sc":
+ options_storage.single_cell = True
+ elif opt == "--meta":
+ #FIXME temporary solution
+ options_storage.single_cell = True
+ options_storage.meta = True
+ elif opt == "--large-genome":
+ options_storage.large_genome = True
+ elif opt == "--plasmid":
+ options_storage.plasmid = True
+ elif opt == "--rna":
+ #FIXME temporary solution
+ options_storage.single_cell = True
+ options_storage.rna = True
+ elif opt == "--iontorrent":
+ options_storage.iontorrent = True
+ elif opt == "--disable-gzip-output":
+ options_storage.disable_gzip_output = True
+ elif opt == "--disable-gzip-output:false":
+ options_storage.disable_gzip_output = False
+ elif opt == "--disable-rr":
+ options_storage.disable_rr = True
+ elif opt == "--disable-rr:false":
+ options_storage.disable_rr = False
+
+ elif opt == "--only-error-correction":
+ if options_storage.only_assembler:
+ support.error('you cannot specify --only-error-correction and --only-assembler simultaneously')
+ options_storage.only_error_correction = True
+ elif opt == "--only-assembler":
+ if options_storage.only_error_correction:
+ support.error('you cannot specify --only-error-correction and --only-assembler simultaneously')
+ options_storage.only_assembler = True
+
+ elif opt == "--read-buffer-size":
+ options_storage.read_buffer_size = int(arg)
+ elif opt == "--bh-heap-check":
+ options_storage.bh_heap_check = arg
+ elif opt == "--spades-heap-check":
+ options_storage.spades_heap_check = arg
+
+ elif opt == "--continue":
+ options_storage.continue_mode = True
+ elif opt == "--restart-from":
+ if arg not in ['ec', 'as', 'mc', 'scc', 'tpp'] and not arg.startswith('k'):
+ support.error("wrong value for --restart-from option: " + arg +
+ " (should be 'ec', 'as', 'k<int>', or 'mc'", log)
+ options_storage.continue_mode = True
+ options_storage.restart_from = arg
+ elif opt == "--stop-after":
+ if not skip_stop_after:
+ if arg not in ['ec', 'as', 'mc', 'scc', 'tpp'] and not arg.startswith('k'):
+ support.error("wrong value for --stop-after option: " + arg +
+ " (should be 'ec', 'as', 'k<int>', or 'mc'", log)
+ options_storage.stop_after = arg
+
+ elif opt == '-t' or opt == "--threads":
+ options_storage.threads = int(arg)
+ elif opt == '-m' or opt == "--memory":
+ options_storage.memory = int(arg)
+ elif opt == "--phred-offset":
+ if arg == 'auto':
+ options_storage.qvoffset = arg
+ elif arg in ['33', '64']:
+ options_storage.qvoffset = int(arg)
+ else:
+ support.error('wrong PHRED quality offset value: ' + arg +
+ ' (should be either 33, 64, or \'auto\')', log)
+ elif opt == "--cov-cutoff":
+ if arg == 'auto' or arg == 'off':
+ options_storage.cov_cutoff = arg
+ elif support.is_float(arg) and float(arg) > 0.0:
+ options_storage.cov_cutoff = float(arg)
+ else:
+ support.error('wrong value for --cov-cutoff option: ' + arg +
+ ' (should be a positive float number, or \'auto\', or \'off\')', log)
+ elif opt == '-i' or opt == "--iterations":
+ options_storage.iterations = int(arg)
+
+ elif opt == "--debug":
+ options_storage.developer_mode = True
+ elif opt == "--debug:false":
+ options_storage.developer_mode = False
+
+ #corrector
+ elif opt == "--mismatch-correction":
+ options_storage.mismatch_corrector = True
+ elif opt == "--mismatch-correction:false":
+ options_storage.mismatch_corrector = False
+
+ elif opt == "--careful":
+ options_storage.mismatch_corrector = True
+ options_storage.careful = True
+ elif opt == "--careful:false":
+ options_storage.mismatch_corrector = False
+ options_storage.careful = False
+
+ elif opt == '-v' or opt == "--version":
+ show_version()
+ elif opt == '-h' or opt == "--help":
+ show_usage(0)
+ elif opt == "--help-hidden":
+ show_usage(0, show_hidden=True)
+
+ elif opt == "--test":
+ options_storage.set_test_options()
+ support.add_to_dataset('-1', os.path.join(spades_home, "test_dataset/ecoli_1K_1.fq.gz"), dataset_data)
+ support.add_to_dataset('-2', os.path.join(spades_home, "test_dataset/ecoli_1K_2.fq.gz"), dataset_data)
+ #break
+ elif opt == "--diploid":
+ options_storage.diploid_mode = True
+ elif opt == "--truseq":
+ options_storage.enable_truseq_mode()
+ else:
+ raise ValueError
+
+ if not options_storage.output_dir:
+ support.error("the output_dir is not set! It is a mandatory parameter (-o output_dir).", log)
+ if not os.path.isdir(options_storage.output_dir):
+ if options_storage.continue_mode:
+ support.error("the output_dir should exist for --continue and for --restart-from!", log)
+ os.makedirs(options_storage.output_dir)
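+ # --restart-from implies two passes of fill_cfg(): the first pass (current command line)
+ # saves the overriding options, the second pass (previous run's command line,
+ # secondary_filling=True) reloads them via load_restart_options()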
+ if options_storage.restart_from:
+ if options_storage.continue_mode: # saving parameters specified with --restart-from
+ if not support.dataset_is_empty(dataset_data):
+ support.error("you cannot specify reads with --restart-from option!", log)
+ options_storage.save_restart_options(log)
+ else: # overriding previous run parameters
+ options_storage.load_restart_options()
+ if options_storage.meta:
+ if options_storage.careful or options_storage.mismatch_corrector or options_storage.cov_cutoff != "off":
+ support.error("you cannot specify --careful, --mismatch-correction or --cov-cutoff in metagenomic mode!", log)
+ if options_storage.continue_mode:
+ return None, None
+
+ existing_dataset_data = None
+ processed_dataset_fpath = os.path.join(options_storage.output_dir, "input_dataset.yaml")
+ if load_processed_dataset:
+ if os.path.isfile(processed_dataset_fpath):
+ try:
+ existing_dataset_data = pyyaml.load(open(processed_dataset_fpath, 'r'))
+ except pyyaml.YAMLError:
+ existing_dataset_data = None
+ if existing_dataset_data is not None:
+ dataset_data = existing_dataset_data
+ options_storage.dataset_yaml_filename = processed_dataset_fpath
+ else:
+ if options_storage.dataset_yaml_filename:
+ try:
+ dataset_data = pyyaml.load(open(options_storage.dataset_yaml_filename, 'r'))
+ except pyyaml.YAMLError:
+ _, exc, _ = sys.exc_info()
+ support.error('exception caught while parsing YAML file (' + options_storage.dataset_yaml_filename + '):\n' + str(exc))
+ dataset_data = support.relative2abs_paths(dataset_data, os.path.dirname(options_storage.dataset_yaml_filename))
+ else:
+ dataset_data = support.correct_dataset(dataset_data)
+ dataset_data = support.relative2abs_paths(dataset_data, os.getcwd())
+ options_storage.dataset_yaml_filename = processed_dataset_fpath
+ pyyaml.dump(dataset_data, open(options_storage.dataset_yaml_filename, 'w'))
+
+ support.check_dataset_reads(dataset_data, options_storage.only_assembler, log)
+ if not support.get_lib_ids_by_type(dataset_data, spades_logic.READS_TYPES_USED_IN_CONSTRUCTION):
+ support.error('you should specify at least one unpaired, paired-end, or high-quality mate-pairs library!')
+
+ options_storage.set_default_values()
+ ### FILLING cfg
+ cfg["common"] = empty_config()
+ cfg["dataset"] = empty_config()
+ if not options_storage.only_assembler:
+ cfg["error_correction"] = empty_config()
+ if not options_storage.only_error_correction:
+ cfg["assembly"] = empty_config()
+
+ # common
+ cfg["common"].__dict__["output_dir"] = options_storage.output_dir
+ cfg["common"].__dict__["tmp_dir"] = options_storage.tmp_dir
+ cfg["common"].__dict__["max_threads"] = options_storage.threads
+ cfg["common"].__dict__["max_memory"] = options_storage.memory
+ cfg["common"].__dict__["developer_mode"] = options_storage.developer_mode
+
+ # dataset section
+ cfg["dataset"].__dict__["yaml_filename"] = options_storage.dataset_yaml_filename
+ if options_storage.developer_mode and options_storage.reference:
+ cfg["dataset"].__dict__["reference"] = options_storage.reference
+
+ # error correction
+ if (not options_storage.only_assembler) and (options_storage.iterations > 0):
+ cfg["error_correction"].__dict__["output_dir"] = os.path.join(cfg["common"].output_dir, "corrected")
+ cfg["error_correction"].__dict__["max_iterations"] = options_storage.iterations
+ cfg["error_correction"].__dict__["gzip_output"] = not options_storage.disable_gzip_output
+ if options_storage.qvoffset:
+ cfg["error_correction"].__dict__["qvoffset"] = options_storage.qvoffset
+ if options_storage.bh_heap_check:
+ cfg["error_correction"].__dict__["heap_check"] = options_storage.bh_heap_check
+ cfg["error_correction"].__dict__["iontorrent"] = options_storage.iontorrent
+ if options_storage.meta or options_storage.large_genome:
+ cfg["error_correction"].__dict__["count_filter_singletons"] = 1
+
+ # assembly
+ if not options_storage.only_error_correction:
+ if options_storage.k_mers == 'auto' and options_storage.restart_from is None:
+ options_storage.k_mers = None
+ if options_storage.k_mers:
+ cfg["assembly"].__dict__["iterative_K"] = options_storage.k_mers
+ else:
+ cfg["assembly"].__dict__["iterative_K"] = options_storage.K_MERS_SHORT
+ cfg["assembly"].__dict__["disable_rr"] = options_storage.disable_rr
+ cfg["assembly"].__dict__["diploid_mode"] = options_storage.diploid_mode
+ cfg["assembly"].__dict__["cov_cutoff"] = options_storage.cov_cutoff
+ if options_storage.spades_heap_check:
+ cfg["assembly"].__dict__["heap_check"] = options_storage.spades_heap_check
+ if options_storage.read_buffer_size:
+ cfg["assembly"].__dict__["read_buffer_size"] = options_storage.read_buffer_size
+ cfg["assembly"].__dict__["correct_scaffolds"] = options_storage.correct_scaffolds
+ if options_storage.large_genome:
+ cfg["assembly"].__dict__["bwa_paired"] = True
+ cfg["assembly"].__dict__["scaffolding_mode"] = "old_pe_2015"
+ # the corrector can run only if contigs exist (i.e. not in error-correction-only mode)
+ if (not options_storage.only_error_correction) and options_storage.mismatch_corrector:
+ cfg["mismatch_corrector"] = empty_config()
+ cfg["mismatch_corrector"].__dict__["skip-masked"] = None
+ cfg["mismatch_corrector"].__dict__["bwa"] = os.path.join(bin_home, "bwa-spades")
+ cfg["mismatch_corrector"].__dict__["threads"] = options_storage.threads
+ cfg["mismatch_corrector"].__dict__["output-dir"] = options_storage.output_dir
+ cfg["run_truseq_postprocessing"] = options_storage.run_truseq_postprocessing
+ return cfg, dataset_data
+
+def check_cfg_for_partial_run(cfg, type='restart-from'): # restart-from or stop-after
+ if type == 'restart-from':
+ check_point = options_storage.restart_from
+ action = 'restart from'
+ verb = 'was'
+ elif type == 'stop-after':
+ check_point = options_storage.stop_after
+ action = 'stop after'
+ verb = 'is'
+ else:
+ return
+
+ if check_point == 'ec' and ("error_correction" not in cfg):
+ support.error("failed to " + action + " 'read error correction' ('" + check_point + "') because this stage " + verb + " not specified!")
+ if check_point == 'mc' and ("mismatch_corrector" not in cfg):
+ support.error("failed to " + action + " 'mismatch correction' ('" + check_point + "') because this stage " + verb + " not specified!")
+ if check_point == 'as' or check_point.startswith('k'):
+ if "assembly" not in cfg:
+ support.error("failed to " + action + " 'assembling' ('" + check_point + "') because this stage " + verb + " not specified!")
+ if check_point.startswith('k'):
+ correct_k = False
+ k_to_check = options_storage.k_mers
+ if not k_to_check:
+ if options_storage.auto_K_allowed():
+ k_to_check = list(set(options_storage.K_MERS_SHORT + options_storage.K_MERS_150 + options_storage.K_MERS_250))
+ else:
+ k_to_check = options_storage.K_MERS_SHORT
+ for k in k_to_check:
+ if check_point == ("k%d" % k) or check_point.startswith("k%d:" % k):
+ correct_k = True
+ break
+ if not correct_k:
+ k_str = check_point[1:]
+ if k_str.find(":") != -1:
+ k_str = k_str[:k_str.find(":")]
+ support.error("failed to " + action + " K=%s because this K " % k_str + verb + " not specified!")
+
+
+def get_options_from_params(params_filename, spades_py_name=None):
+ if not os.path.isfile(params_filename):
+ return None, None
+ params = open(params_filename, 'r')
+ cmd_line = params.readline().strip()
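+ # the first line of params.txt is the command line of the previous run with
+ # tab-separated arguments (written by main() below), hence the split('\t') at the end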
+ spades_prev_version = None
+ for line in params:
+ if line.find('SPAdes version:') != -1:
+ spades_prev_version = line.split('SPAdes version:')[1]
+ break
+ params.close()
+ if spades_prev_version is None:
+ support.error("failed to parse SPAdes version of the previous run! "
+ "Please restart from the beginning or specify another output directory.")
+ if spades_prev_version.strip() != spades_version.strip():
+ support.error("SPAdes version of the previous run (%s) is not equal to the current version of SPAdes (%s)! "
+ "Please restart from the beginning or specify another output directory."
+ % (spades_prev_version.strip(), spades_version.strip()))
+ if spades_py_name is None or cmd_line.find(os.path.basename(spades_py_name)) == -1:
+ spades_py_name = 'spades.py' # try default name
+ else:
+ spades_py_name = os.path.basename(spades_py_name)
+ spades_py_pos = cmd_line.find(spades_py_name)
+ if spades_py_pos == -1:
+ return None, None
+ return cmd_line, cmd_line[spades_py_pos + len(spades_py_name):].split('\t')
+
+
+def show_version():
+ options_storage.version(spades_version)
+ sys.exit(0)
+
+
+def show_usage(code, show_hidden=False):
+ options_storage.usage(spades_version, show_hidden=show_hidden)
+ sys.exit(code)
+
+
+def main(args):
+ os.environ["LC_ALL"] = "C"
+
+ if len(args) == 1:
+ show_usage(0)
+
+ log = logging.getLogger('spades')
+ log.setLevel(logging.DEBUG)
+
+ console = logging.StreamHandler(sys.stdout)
+ console.setFormatter(logging.Formatter('%(message)s'))
+ console.setLevel(logging.DEBUG)
+ log.addHandler(console)
+
+ support.check_binaries(bin_home, log)
+
+ # auto-detecting SPAdes mode (rna, meta, etc.)
+ mode = options_storage.get_mode()
+ if mode is not None:
+ args.append('--' + mode)
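+ # (get_mode() presumably infers the mode from the name of the launcher script,
+ # e.g. plasmidspades.py -> 'plasmid'; the flag is appended so fill_cfg() sees it)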
+
+ # parse options and save all parameters to cfg
+ options = args
+ cfg, dataset_data = fill_cfg(options, log)
+
+ if options_storage.continue_mode:
+ cmd_line, options = get_options_from_params(os.path.join(options_storage.output_dir, "params.txt"), args[0])
+ if not options:
+ support.error("failed to parse command line of the previous run! Please restart from the beginning or specify another output directory.")
+ cfg, dataset_data = fill_cfg(options, log, secondary_filling=True)
+ if options_storage.restart_from:
+ check_cfg_for_partial_run(cfg, type='restart-from')
+ options_storage.continue_mode = True
+ if options_storage.stop_after:
+ check_cfg_for_partial_run(cfg, type='stop-after')
+
+ log_filename = os.path.join(cfg["common"].output_dir, "spades.log")
+ if options_storage.continue_mode:
+ log_handler = logging.FileHandler(log_filename, mode='a')
+ else:
+ log_handler = logging.FileHandler(log_filename, mode='w')
+ log.addHandler(log_handler)
+
+ if options_storage.continue_mode:
+ log.info("\n======= SPAdes pipeline continued. Log can be found here: " + log_filename + "\n")
+ log.info("Restored from " + cmd_line)
+ if options_storage.restart_from:
+ updated_params = ""
+ skip_next = False
+ for v in args[1:]:
+ if v == '-o' or v == '--restart-from':
+ skip_next = True
+ continue
+ if skip_next or v.startswith('--restart-from='): # you can specify '--restart-from=k33' but not '-o=out_dir'
+ skip_next = False
+ continue
+ updated_params += "\t" + v
+ updated_params = updated_params.strip()
+ log.info("with updated parameters: " + updated_params)
+ cmd_line += "\t" + updated_params
+ log.info("")
+
+ params_filename = os.path.join(cfg["common"].output_dir, "params.txt")
+ params_handler = logging.FileHandler(params_filename, mode='w')
+ log.addHandler(params_handler)
+
+ if options_storage.continue_mode:
+ log.info(cmd_line)
+ else:
+ command = "Command line: "
+ for v in args:
+ # substituting relative paths with absolute ones (read paths, output dir path, etc)
+ v, prefix = support.get_option_prefix(v)
+ if v in options_storage.dict_of_rel2abs.keys():
+ v = options_storage.dict_of_rel2abs[v]
+ if prefix:
+ command += prefix + ":"
+ command += v + "\t"
+ log.info(command)
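+ # everything logged while params_handler is attached ends up in params.txt, so this
+ # command line is what get_options_from_params() re-parses on --continue/--restart-from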
+
+ # special case
+# if "mismatch_corrector" in cfg and not support.get_lib_ids_by_type(dataset_data, 'paired-end'):
+# support.warning('cannot perform mismatch correction without at least one paired-end library! Skipping this step.', log)
+# del cfg["mismatch_corrector"]
+
+ print_used_values(cfg, log)
+ log.removeHandler(params_handler)
+
+ support.check_single_reads_in_options(options, log)
+
+ if not options_storage.continue_mode:
+ log.info("\n======= SPAdes pipeline started. Log can be found here: " + log_filename + "\n")
+
+ # splitting interlaced reads and processing Ns in additional contigs if needed
+ if support.dataset_has_interlaced_reads(dataset_data) or support.dataset_has_additional_contigs(dataset_data)\
+ or support.dataset_has_nxmate_reads(dataset_data):
+ dir_for_split_reads = os.path.join(options_storage.output_dir, 'split_input')
+ if support.dataset_has_interlaced_reads(dataset_data) or support.dataset_has_nxmate_reads(dataset_data):
+ if not os.path.isdir(dir_for_split_reads):
+ os.makedirs(dir_for_split_reads)
+ if support.dataset_has_interlaced_reads(dataset_data):
+ dataset_data = support.split_interlaced_reads(dataset_data, dir_for_split_reads, log)
+ if support.dataset_has_nxmate_reads(dataset_data):
+ dataset_data = support.process_nxmate_reads(dataset_data, dir_for_split_reads, log)
+ if support.dataset_has_additional_contigs(dataset_data):
+ dataset_data = support.process_Ns_in_additional_contigs(dataset_data, dir_for_split_reads, log)
+ options_storage.dataset_yaml_filename = os.path.join(options_storage.output_dir, "input_dataset.yaml")
+ pyyaml.dump(dataset_data, open(options_storage.dataset_yaml_filename, 'w'))
+ cfg["dataset"].yaml_filename = options_storage.dataset_yaml_filename
+
+ try:
+ # copying configs before all computations (to prevent them from being changed at run time)
+ tmp_configs_dir = os.path.join(cfg["common"].output_dir, "configs")
+ if os.path.isdir(tmp_configs_dir) and not options_storage.continue_mode:
+ shutil.rmtree(tmp_configs_dir)
+ if not os.path.isdir(tmp_configs_dir):
+ if options_storage.configs_dir:
+ dir_util.copy_tree(options_storage.configs_dir, tmp_configs_dir, preserve_times=False, preserve_mode=False)
+ else:
+ dir_util.copy_tree(os.path.join(spades_home, "configs"), tmp_configs_dir, preserve_times=False, preserve_mode=False)
+
+ corrected_dataset_yaml_filename = ''
+ if "error_correction" in cfg:
+ STAGE_NAME = "Read error correction"
+ bh_cfg = merge_configs(cfg["error_correction"], cfg["common"])
+ corrected_dataset_yaml_filename = os.path.join(bh_cfg.output_dir, "corrected.yaml")
+ ec_is_needed = True
+ only_compressing_is_needed = False
+ if os.path.isfile(corrected_dataset_yaml_filename) and options_storage.continue_mode \
+ and not options_storage.restart_from == "ec":
+ if not bh_cfg.gzip_output or \
+ support.dataset_has_gzipped_reads(pyyaml.load(open(corrected_dataset_yaml_filename, 'r'))):
+ log.info("\n===== Skipping %s (already processed). \n" % STAGE_NAME)
+ ec_is_needed = False
+ else:
+ only_compressing_is_needed = True
+ if ec_is_needed:
+ if not only_compressing_is_needed:
+ support.continue_from_here(log)
+
+ if "HEAPCHECK" in os.environ:
+ del os.environ["HEAPCHECK"]
+ if "heap_check" in bh_cfg.__dict__:
+ os.environ["HEAPCHECK"] = bh_cfg.heap_check
+
+ if os.path.exists(bh_cfg.output_dir):
+ shutil.rmtree(bh_cfg.output_dir)
+ os.makedirs(bh_cfg.output_dir)
+
+ bh_cfg.__dict__["dataset_yaml_filename"] = cfg["dataset"].yaml_filename
+ log.info("\n===== %s started. \n" % STAGE_NAME)
+
+ hammer_logic.run_hammer(corrected_dataset_yaml_filename, tmp_configs_dir, bin_home, bh_cfg, dataset_data,
+ ext_python_modules_home, only_compressing_is_needed, log)
+ log.info("\n===== %s finished. \n" % STAGE_NAME)
+ if options_storage.stop_after == 'ec':
+ support.finish_here(log)
+
+ result_contigs_filename = os.path.join(cfg["common"].output_dir, options_storage.contigs_name)
+ result_scaffolds_filename = os.path.join(cfg["common"].output_dir, options_storage.scaffolds_name)
+ result_assembly_graph_filename = os.path.join(cfg["common"].output_dir, options_storage.assembly_graph_name)
+ result_contigs_paths_filename = os.path.join(cfg["common"].output_dir, options_storage.contigs_paths)
+ result_scaffolds_paths_filename = os.path.join(cfg["common"].output_dir, options_storage.scaffolds_paths)
+ truseq_long_reads_file_base = os.path.join(cfg["common"].output_dir, "truseq_long_reads")
+ truseq_long_reads_file = truseq_long_reads_file_base + ".fasta"
+ misc_dir = os.path.join(cfg["common"].output_dir, "misc")
+ ### if mismatch correction is enabled then the assembled contigs (and scaffolds) are moved to the misc directory
+ assembled_contigs_filename = os.path.join(misc_dir, "assembled_contigs.fasta")
+ assembled_scaffolds_filename = os.path.join(misc_dir, "assembled_scaffolds.fasta")
+ if "assembly" in cfg and not options_storage.run_completed:
+ STAGE_NAME = "Assembling"
+ spades_cfg = merge_configs(cfg["assembly"], cfg["common"])
+ spades_cfg.__dict__["result_contigs"] = result_contigs_filename
+ spades_cfg.__dict__["result_scaffolds"] = result_scaffolds_filename
+ spades_cfg.__dict__["result_graph"] = result_assembly_graph_filename
+ spades_cfg.__dict__["result_contigs_paths"] = result_contigs_paths_filename
+ spades_cfg.__dict__["result_scaffolds_paths"] = result_scaffolds_paths_filename
+
+ if options_storage.continue_mode and (os.path.isfile(spades_cfg.result_contigs)
+ or ("mismatch_corrector" in cfg and
+ os.path.isfile(assembled_contigs_filename))
+ or (options_storage.truseq_mode and os.path.isfile(assembled_scaffolds_filename)))\
+ and not options_storage.restart_from == 'as' \
+ and not options_storage.restart_from == 'scc' \
+ and not (options_storage.restart_from and options_storage.restart_from.startswith('k')):
+
+ log.info("\n===== Skipping %s (already processed). \n" % STAGE_NAME)
+ # calculating latest_dir for the next stages
+ latest_dir = support.get_latest_dir(os.path.join(spades_cfg.output_dir, "K*"))
+ if not latest_dir:
+ support.error("failed to continue the previous run! Please restart from previous stages or from the beginning.", log)
+ else:
+ old_result_files = [result_contigs_filename, result_scaffolds_filename,
+ assembled_contigs_filename, assembled_scaffolds_filename]
+ for old_result_file in old_result_files:
+ if os.path.isfile(old_result_file):
+ os.remove(old_result_file)
+
+ if options_storage.restart_from == 'as':
+ support.continue_from_here(log)
+
+ if os.path.isfile(corrected_dataset_yaml_filename):
+ dataset_data = pyyaml.load(open(corrected_dataset_yaml_filename, 'r'))
+ dataset_data = support.relative2abs_paths(dataset_data, os.path.dirname(corrected_dataset_yaml_filename))
+ if spades_cfg.disable_rr:
+ spades_cfg.__dict__["rr_enable"] = False
+ else:
+ spades_cfg.__dict__["rr_enable"] = True
+
+ if "HEAPCHECK" in os.environ:
+ del os.environ["HEAPCHECK"]
+ if "heap_check" in spades_cfg.__dict__:
+ os.environ["HEAPCHECK"] = spades_cfg.heap_check
+
+ log.info("\n===== %s started.\n" % STAGE_NAME)
+
+ # creating dataset
+ dataset_filename = os.path.join(spades_cfg.output_dir, "dataset.info")
+ if not os.path.isfile(dataset_filename) or not options_storage.continue_mode:
+ dataset_file = open(dataset_filename, 'w')
+ import process_cfg
+ if os.path.isfile(corrected_dataset_yaml_filename):
+ dataset_file.write("reads" + '\t' + process_cfg.process_spaces(corrected_dataset_yaml_filename) + '\n')
+ else:
+ dataset_file.write("reads" + '\t' + process_cfg.process_spaces(cfg["dataset"].yaml_filename) + '\n')
+ if spades_cfg.developer_mode and "reference" in cfg["dataset"].__dict__:
+ dataset_file.write("reference_genome" + '\t')
+ dataset_file.write(process_cfg.process_spaces(cfg["dataset"].reference) + '\n')
+ dataset_file.close()
+ spades_cfg.__dict__["dataset"] = dataset_filename
+
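+ # run_spades() performs the iterative assembly over the selected K values and returns
+ # the output directory of the final K iteration (latest_dir, as in the skip branch above)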
+ latest_dir = spades_logic.run_spades(tmp_configs_dir, bin_home, spades_cfg, dataset_data, ext_python_modules_home, log)
+
+ if os.path.isdir(misc_dir) and not options_storage.continue_mode:
+ shutil.rmtree(misc_dir)
+ if not os.path.isdir(misc_dir):
+ os.makedirs(misc_dir)
+
+ if options_storage.continue_mode and options_storage.restart_from and options_storage.restart_from.startswith('k'):
+ k_str = options_storage.restart_from[1:]
+ if k_str.find(":") != -1:
+ k_str = k_str[:k_str.find(":")]
+ support.error("failed to continue from K=%s because this K was not processed in the original run!" % k_str, log)
+ log.info("\n===== %s finished. \n" % STAGE_NAME)
+ if not options_storage.run_completed:
+ if options_storage.stop_after == 'as' or options_storage.stop_after == 'scc' or (options_storage.stop_after and options_storage.stop_after.startswith('k')):
+ support.finish_here(log)
+
+ # TruSeq postprocessing
+ if cfg["run_truseq_postprocessing"] and not options_storage.run_completed:
+ if options_storage.continue_mode and os.path.isfile(truseq_long_reads_file_base + ".fastq") and not options_storage.restart_from == 'tpp':
+ log.info("\n===== Skipping %s (already processed). \n" % "TruSeq postprocessing")
+ else:
+ support.continue_from_here(log)
+ if os.path.isfile(result_scaffolds_filename):
+ shutil.move(result_scaffolds_filename, assembled_scaffolds_filename)
+ reads_library = dataset_data[0]
+ alignment_bin = os.path.join(bin_home, "bwa-spades")
+ alignment_dir = os.path.join(cfg["common"].output_dir, "alignment")
+ sam_files = alignment.align_bwa(alignment_bin, assembled_scaffolds_filename, dataset_data, alignment_dir, log, options_storage.threads)
+ moleculo_postprocessing.moleculo_postprocessing(assembled_scaffolds_filename, truseq_long_reads_file_base, sam_files, log)
+ if options_storage.stop_after == 'tpp':
+ support.finish_here(log)
+
+ # mismatch corrector
+ if "mismatch_corrector" in cfg and not options_storage.run_completed and \
+ (os.path.isfile(result_contigs_filename) or
+ (options_storage.continue_mode and os.path.isfile(assembled_contigs_filename))):
+ STAGE_NAME = "Mismatch correction"
+ to_correct = dict()
+ to_correct["contigs"] = (result_contigs_filename, assembled_contigs_filename)
+ if os.path.isfile(result_scaffolds_filename) or (options_storage.continue_mode and
+ os.path.isfile(assembled_scaffolds_filename)):
+ to_correct["scaffolds"] = (result_scaffolds_filename, assembled_scaffolds_filename)
+
+ # moving assembled contigs (scaffolds) to misc dir
+ for assembly_type, (old, new) in to_correct.items():
+ if options_storage.continue_mode and os.path.isfile(new):
+ continue
+ if os.path.isfile(old):
+ shutil.move(old, new)
+
+ if options_storage.continue_mode and os.path.isfile(result_contigs_filename) and \
+ (os.path.isfile(result_scaffolds_filename) or not os.path.isfile(assembled_scaffolds_filename)) \
+ and not options_storage.restart_from == 'mc':
+ log.info("\n===== Skipping %s (already processed). \n" % STAGE_NAME)
+ else:
+ if options_storage.restart_from == 'mc':
+ support.continue_from_here(log)
+
+ log.info("\n===== %s started." % STAGE_NAME)
+ # detecting paired-end library with the largest insert size
+ cfg["mismatch_corrector"].__dict__["dataset"] = cfg["dataset"].yaml_filename
+ #TODO: add reads orientation
+
+ import corrector_logic
+ corrector_cfg = cfg["mismatch_corrector"]
+ # processing contigs and scaffolds (or only contigs)
+ for assembly_type, (corrected, assembled) in to_correct.items():
+ if options_storage.continue_mode and os.path.isfile(corrected):
+ log.info("\n== Skipping processing of " + assembly_type + " (already processed)\n")
+ continue
+
+ support.continue_from_here(log)
+ log.info("\n== Processing of " + assembly_type + "\n")
+
+ tmp_dir_for_corrector = os.path.join(cfg["common"].output_dir, "mismatch_corrector", assembly_type)
+
+ cfg["mismatch_corrector"].__dict__["output_dir"] = tmp_dir_for_corrector
+ # correcting
+ corr_cfg = merge_configs(cfg["mismatch_corrector"], cfg["common"])
+
+ result_corrected_filename = os.path.join(tmp_dir_for_corrector, "corrected_contigs.fasta")
+
+ corrector_logic.run_corrector(tmp_configs_dir, bin_home, corr_cfg,
+ ext_python_modules_home, log, assembled, result_corrected_filename)
+
+ if os.path.isfile(result_corrected_filename):
+ shutil.copyfile(result_corrected_filename, corrected)
+ tmp_d = os.path.join(tmp_dir_for_corrector, "tmp")
+ if os.path.isdir(tmp_d) and not cfg["common"].developer_mode:
+ shutil.rmtree(tmp_d)
+ log.info("\n===== %s finished.\n" % STAGE_NAME)
+ if options_storage.stop_after == 'mc':
+ support.finish_here(log)
+
+ if not cfg["common"].developer_mode and os.path.isdir(tmp_configs_dir):
+ shutil.rmtree(tmp_configs_dir)
+
+ if not options_storage.run_completed:
+ #log.info("")
+ if "error_correction" in cfg and os.path.isdir(os.path.dirname(corrected_dataset_yaml_filename)):
+ log.info(" * Corrected reads are in " + support.process_spaces(os.path.dirname(corrected_dataset_yaml_filename) + "/"))
+ if "assembly" in cfg and os.path.isfile(result_contigs_filename):
+ message = " * Assembled contigs are in " + support.process_spaces(result_contigs_filename)
+ log.info(message)
+ if "assembly" in cfg and os.path.isfile(result_scaffolds_filename):
+ message = " * Assembled scaffolds are in " + support.process_spaces(result_scaffolds_filename)
+ log.info(message)
+ if "assembly" in cfg and os.path.isfile(result_assembly_graph_filename):
+ message = " * Assembly graph is in " + support.process_spaces(result_assembly_graph_filename)
+ log.info(message)
+ if "assembly" in cfg and os.path.isfile(result_contigs_paths_filename):
+ message = " * Paths in the assembly graph corresponding to the contigs are in " + \
+ support.process_spaces(result_contigs_paths_filename)
+ log.info(message)
+ if "assembly" in cfg and os.path.isfile(result_scaffolds_paths_filename):
+ message = " * Paths in the assembly graph corresponding to the scaffolds are in " + \
+ support.process_spaces(result_scaffolds_paths_filename)
+ log.info(message)
+ #log.info("")
+
+ #breaking scaffolds
+ if os.path.isfile(result_scaffolds_filename):
+ if not os.path.isdir(misc_dir):
+ os.makedirs(misc_dir)
+ result_broken_scaffolds = os.path.join(misc_dir, "broken_scaffolds.fasta")
+ if not os.path.isfile(result_broken_scaffolds) or not options_storage.continue_mode:
+ modified, broken_scaffolds = support.break_scaffolds(result_scaffolds_filename,
+ options_storage.THRESHOLD_FOR_BREAKING_SCAFFOLDS)
+ if modified:
+ support.write_fasta(result_broken_scaffolds, broken_scaffolds)
+ #log.info(" * Scaffolds broken by " + str(options_storage.THRESHOLD_FOR_BREAKING_SCAFFOLDS) +
+ # " Ns are in " + result_broken_scaffolds)
+
+ ### printing WARNINGS SUMMARY
+ if not support.log_warnings(log):
+ log.info("\n======= SPAdes pipeline finished.") # otherwise it finished WITH WARNINGS
+
+ if options_storage.test_mode:
+ if options_storage.truseq_mode:
+ if not os.path.isfile(truseq_long_reads_file):
+ support.error("TEST FAILED: %s does not exist!" % truseq_long_reads_file)
+ else:
+ for result_filename in [result_contigs_filename, result_scaffolds_filename]:
+ if os.path.isfile(result_filename):
+ result_fasta = list(support.read_fasta(result_filename))
+ # correctness check: should be one contig of length 1000 bp
+ correct_number = 1
+ correct_length = 1000
+ if not len(result_fasta):
+ support.error("TEST FAILED: %s does not contain contigs!" % result_filename)
+ elif len(result_fasta) > correct_number:
+ support.error("TEST FAILED: %s contains more than %d contig (%d)!" %
+ (result_filename, correct_number, len(result_fasta)))
+ elif len(result_fasta[0][1]) != correct_length:
+ if len(result_fasta[0][1]) > correct_length:
+ relation = "more"
+ else:
+ relation = "less"
+ support.error("TEST FAILED: %s contains %s than %d bp (%d bp)!" %
+ (result_filename, relation, correct_length, len(result_fasta[0][1])))
+ else:
+ support.error("TEST FAILED: " + result_filename + " does not exist!")
+ log.info("\n========= TEST PASSED CORRECTLY.")
+
+
+ log.info("\nSPAdes log can be found here: " + log_filename)
+ log.info("")
+ log.info("Thank you for using SPAdes!")
+ log.removeHandler(log_handler)
+
+ except Exception:
+ exc_type, exc_value, _ = sys.exc_info()
+ if exc_type == SystemExit:
+ sys.exit(exc_value)
+ else:
+ if exc_type == OSError and exc_value.errno == errno.ENOEXEC: # Exec format error
+ support.error("It looks like you are using SPAdes binaries for another platform.\n" +
+ support.get_spades_binaries_info_message())
+ else:
+ log.exception(exc_value)
+ support.error("exception caught: %s" % exc_type, log)
+ except BaseException: # since Python 2.5, system-exiting exceptions (e.g. KeyboardInterrupt) derive directly from BaseException and are not caught by "except Exception"
+ exc_type, exc_value, _ = sys.exc_info()
+ if exc_type == SystemExit:
+ sys.exit(exc_value)
+ else:
+ log.exception(exc_value)
+ support.error("exception caught: %s" % exc_type, log)
+
+
+if __name__ == '__main__':
+ main(sys.argv)
diff --git a/plasmidspades.py b/plasmidspades.py
new file mode 100755
index 0000000..d06205f
--- /dev/null
+++ b/plasmidspades.py
@@ -0,0 +1,951 @@
+#!/usr/bin/env python
+
+############################################################################
+# Copyright (c) 2015 Saint Petersburg State University
+# Copyright (c) 2011-2014 Saint Petersburg Academic University
+# All Rights Reserved
+# See file LICENSE for details.
+############################################################################
+
+import os
+import shutil
+from site import addsitedir
+from distutils import dir_util
+from os.path import abspath, expanduser
+import sys
+import getopt
+import logging
+import platform
+import errno
+
+import spades_init
+spades_init.init()
+spades_home = spades_init.spades_home
+bin_home = spades_init.bin_home
+python_modules_home = spades_init.python_modules_home
+ext_python_modules_home = spades_init.ext_python_modules_home
+spades_version = spades_init.spades_version
+
+import support
+support.check_python_version()
+
+from process_cfg import merge_configs, empty_config, load_config_from_file
+import hammer_logic
+import spades_logic
+import options_storage
+addsitedir(ext_python_modules_home)
+if sys.version.startswith('2.'):
+ import pyyaml2 as pyyaml
+elif sys.version.startswith('3.'):
+ import pyyaml3 as pyyaml
+
+import moleculo_postprocessing
+import alignment
+
+
+def print_used_values(cfg, log):
+ def print_value(cfg, section, param, pretty_param="", margin=" "):
+ if not pretty_param:
+ pretty_param = param.capitalize().replace('_', ' ')
+ line = margin + pretty_param
+ if param in cfg[section].__dict__:
+ line += ": " + str(cfg[section].__dict__[param])
+ else:
+ if param.find("offset") != -1:
+ line += " will be auto-detected"
+ log.info(line)
+
+ log.info("")
+
+ # system info
+ log.info("System information:")
+ try:
+ log.info(" SPAdes version: " + str(spades_version).strip())
+ log.info(" Python version: " + ".".join(map(str, sys.version_info[0:3])))
+ # for more details: '[' + str(sys.version_info) + ']'
+ log.info(" OS: " + platform.platform())
+ # for more details: '[' + str(platform.uname()) + ']'
+ except Exception:
+ log.info(" Problem occurred when getting system information")
+ log.info("")
+
+ # main
+ print_value(cfg, "common", "output_dir", "", "")
+ if ("error_correction" in cfg) and (not "assembly" in cfg):
+ log.info("Mode: ONLY read error correction (without assembling)")
+ elif (not "error_correction" in cfg) and ("assembly" in cfg):
+ log.info("Mode: ONLY assembling (without read error correction)")
+ else:
+ log.info("Mode: read error correction and assembling")
+ if ("common" in cfg) and ("developer_mode" in cfg["common"].__dict__):
+ if cfg["common"].developer_mode:
+ log.info("Debug mode is turned ON")
+ else:
+ log.info("Debug mode is turned OFF")
+ log.info("")
+
+ # dataset
+ if "dataset" in cfg:
+ log.info("Dataset parameters:")
+
+ if options_storage.iontorrent:
+ log.info(" IonTorrent data")
+
+ if options_storage.meta:
+ log.info(" Metagenomic mode")
+ elif options_storage.large_genome:
+ log.info(" Large genome mode")
+ elif options_storage.truseq_mode:
+ log.info(" Illumina TruSeq mode")
+ elif options_storage.rna:
+ log.info(" RNA-seq mode")
+ elif options_storage.single_cell:
+ log.info(" Single-cell mode")
+ else:
+ log.info(" Multi-cell mode (you should set '--sc' flag if input data"\
+ " was obtained with MDA (single-cell) technology"\
+ " or --meta flag if processing metagenomic dataset)")
+
+ log.info(" Reads:")
+ dataset_data = pyyaml.load(open(cfg["dataset"].yaml_filename, 'r'))
+ dataset_data = support.relative2abs_paths(dataset_data, os.path.dirname(cfg["dataset"].yaml_filename))
+ support.pretty_print_reads(dataset_data, log)
+
+ # error correction
+ if "error_correction" in cfg:
+ log.info("Read error correction parameters:")
+ print_value(cfg, "error_correction", "max_iterations", "Iterations")
+ print_value(cfg, "error_correction", "qvoffset", "PHRED offset")
+
+ if cfg["error_correction"].gzip_output:
+ log.info(" Corrected reads will be compressed (with gzip)")
+ else:
+ log.info(" Corrected reads will NOT be compressed (with gzip)")
+
+ # assembly
+ if "assembly" in cfg:
+ log.info("Assembly parameters:")
+ if options_storage.auto_K_allowed():
+ log.info(" k: automatic selection based on read length")
+ else:
+ print_value(cfg, "assembly", "iterative_K", "k")
+ if options_storage.plasmid:
+ log.info(" Plasmid mode is turned ON")
+ if cfg["assembly"].disable_rr:
+ log.info(" Repeat resolution is DISABLED")
+ else:
+ log.info(" Repeat resolution is enabled")
+ if options_storage.careful:
+ log.info(" Mismatch careful mode is turned ON")
+ else:
+ log.info(" Mismatch careful mode is turned OFF")
+ if "mismatch_corrector" in cfg:
+ log.info(" MismatchCorrector will be used")
+ else:
+ log.info(" MismatchCorrector will be SKIPPED")
+ if cfg["assembly"].cov_cutoff == 'off':
+ log.info(" Coverage cutoff is turned OFF")
+ elif cfg["assembly"].cov_cutoff == 'auto':
+ log.info(" Coverage cutoff is turned ON and threshold will be auto-detected")
+ else:
+ log.info(" Coverage cutoff is turned ON and threshold is " + str(cfg["assembly"].cov_cutoff))
+
+ log.info("Other parameters:")
+ print_value(cfg, "common", "tmp_dir", "Dir for temp files")
+ print_value(cfg, "common", "max_threads", "Threads")
+ print_value(cfg, "common", "max_memory", "Memory limit (in Gb)", " ")
+ log.info("")
+
+
+def fill_cfg(options_to_parse, log, secondary_filling=False):
+ skip_output_dir=secondary_filling
+ skip_stop_after = secondary_filling
+ load_processed_dataset=secondary_filling
+
+ try:
+ options, not_options = getopt.gnu_getopt(options_to_parse, options_storage.short_options, options_storage.long_options)
+ except getopt.GetoptError:
+ _, exc, _ = sys.exc_info()
+ sys.stderr.write(str(exc) + "\n")
+ sys.stderr.flush()
+ show_usage(1)
+
+ if not options:
+ show_usage(1)
+
+ if len(not_options) > 1:
+ for opt, arg in options:
+ if opt == "-k" and arg.strip().endswith(','):
+ support.error("Do not put spaces after commas in the list of k-mers sizes! Correct example: -k 21,33,55", log)
+ support.error("Please specify option (e.g. -1, -2, -s, etc) for the following paths: " + ", ".join(not_options[1:]) + "\n", log)
+
+ # all parameters are stored here
+ cfg = dict()
+ # dataset is stored here. We are prepared for up to MAX_LIBS_NUMBER libraries of each short-read type
+ dataset_data = [{} for i in range(options_storage.MAX_LIBS_NUMBER *
+ len(options_storage.SHORT_READS_TYPES.keys()) +
+ len(options_storage.LONG_READS_TYPES))] # "[{}]*num" doesn't work here!
+
+ # for parsing options from "previous run command"
+ options_storage.continue_mode = False
+ options_storage.k_mers = None
+
+ for opt, arg in options:
+ if opt == '-o':
+ if not skip_output_dir:
+ if options_storage.output_dir is not None:
+ support.error('-o option was specified at least twice')
+ options_storage.output_dir = abspath(expanduser(arg))
+ options_storage.dict_of_rel2abs[arg] = options_storage.output_dir
+ elif opt == "--tmp-dir":
+ options_storage.tmp_dir = abspath(expanduser(arg))
+ options_storage.dict_of_rel2abs[arg] = options_storage.tmp_dir
+ elif opt == "--configs-dir":
+ options_storage.configs_dir = support.check_dir_existence(arg)
+ elif opt == "--reference":
+ options_storage.reference = support.check_file_existence(arg, 'reference', log)
+ elif opt == "--dataset":
+ options_storage.dataset_yaml_filename = support.check_file_existence(arg, 'dataset', log)
+
+ elif opt in options_storage.reads_options:
+ support.add_to_dataset(opt, arg, dataset_data)
+
+ elif opt == '-k':
+ if arg == 'auto':
+ options_storage.k_mers = arg
+ else:
+ options_storage.k_mers = list(map(int, arg.split(",")))
+ for k in options_storage.k_mers:
+ if k < options_storage.MIN_K or k > options_storage.MAX_K:
+ support.error('wrong k value ' + str(k) + ': all k values should be between %d and %d' %
+ (options_storage.MIN_K, options_storage.MAX_K), log)
+ if k % 2 == 0:
+ support.error('wrong k value ' + str(k) + ': all k values should be odd', log)
+
+ elif opt == "--sc":
+ options_storage.single_cell = True
+ elif opt == "--meta":
+ #FIXME temporary solution
+ options_storage.single_cell = True
+ options_storage.meta = True
+ elif opt == "--large-genome":
+ options_storage.large_genome = True
+ elif opt == "--plasmid":
+ options_storage.plasmid = True
+ elif opt == "--rna":
+ #FIXME temporary solution
+ options_storage.single_cell = True
+ options_storage.rna = True
+ elif opt == "--iontorrent":
+ options_storage.iontorrent = True
+ elif opt == "--disable-gzip-output":
+ options_storage.disable_gzip_output = True
+ elif opt == "--disable-gzip-output:false":
+ options_storage.disable_gzip_output = False
+ elif opt == "--disable-rr":
+ options_storage.disable_rr = True
+ elif opt == "--disable-rr:false":
+ options_storage.disable_rr = False
+
+ elif opt == "--only-error-correction":
+ if options_storage.only_assembler:
+ support.error('you cannot specify --only-error-correction and --only-assembler simultaneously')
+ options_storage.only_error_correction = True
+ elif opt == "--only-assembler":
+ if options_storage.only_error_correction:
+ support.error('you cannot specify --only-error-correction and --only-assembler simultaneously')
+ options_storage.only_assembler = True
+
+ elif opt == "--read-buffer-size":
+ options_storage.read_buffer_size = int(arg)
+ elif opt == "--bh-heap-check":
+ options_storage.bh_heap_check = arg
+ elif opt == "--spades-heap-check":
+ options_storage.spades_heap_check = arg
+
+ elif opt == "--continue":
+ options_storage.continue_mode = True
+ elif opt == "--restart-from":
+ if arg not in ['ec', 'as', 'mc', 'scc', 'tpp'] and not arg.startswith('k'):
+ support.error("wrong value for --restart-from option: " + arg +
+ " (should be 'ec', 'as', 'k<int>', or 'mc'", log)
+ options_storage.continue_mode = True
+ options_storage.restart_from = arg
+ elif opt == "--stop-after":
+ if not skip_stop_after:
+ if arg not in ['ec', 'as', 'mc', 'scc', 'tpp'] and not arg.startswith('k'):
+ support.error("wrong value for --stop-after option: " + arg +
+ " (should be 'ec', 'as', 'k<int>', or 'mc'", log)
+ options_storage.stop_after = arg
+
+ elif opt == '-t' or opt == "--threads":
+ options_storage.threads = int(arg)
+ elif opt == '-m' or opt == "--memory":
+ options_storage.memory = int(arg)
+ elif opt == "--phred-offset":
+ if arg == 'auto':
+ options_storage.qvoffset = arg
+ elif arg in ['33', '64']:
+ options_storage.qvoffset = int(arg)
+ else:
+ support.error('wrong PHRED quality offset value: ' + arg +
+ ' (should be either 33, 64, or \'auto\')', log)
+ elif opt == "--cov-cutoff":
+ if arg == 'auto' or arg == 'off':
+ options_storage.cov_cutoff = arg
+ elif support.is_float(arg) and float(arg) > 0.0:
+ options_storage.cov_cutoff = float(arg)
+ else:
+ support.error('wrong value for --cov-cutoff option: ' + arg +
+ ' (should be a positive float number, or \'auto\', or \'off\')', log)
+ elif opt == '-i' or opt == "--iterations":
+ options_storage.iterations = int(arg)
+
+ elif opt == "--debug":
+ options_storage.developer_mode = True
+ elif opt == "--debug:false":
+ options_storage.developer_mode = False
+
+ #corrector
+ elif opt == "--mismatch-correction":
+ options_storage.mismatch_corrector = True
+ elif opt == "--mismatch-correction:false":
+ options_storage.mismatch_corrector = False
+
+ elif opt == "--careful":
+ options_storage.mismatch_corrector = True
+ options_storage.careful = True
+ elif opt == "--careful:false":
+ options_storage.mismatch_corrector = False
+ options_storage.careful = False
+
+ elif opt == '-v' or opt == "--version":
+ show_version()
+ elif opt == '-h' or opt == "--help":
+ show_usage(0)
+ elif opt == "--help-hidden":
+ show_usage(0, show_hidden=True)
+
+ elif opt == "--test":
+ options_storage.set_test_options()
+ support.add_to_dataset('-1', os.path.join(spades_home, "test_dataset/ecoli_1K_1.fq.gz"), dataset_data)
+ support.add_to_dataset('-2', os.path.join(spades_home, "test_dataset/ecoli_1K_2.fq.gz"), dataset_data)
+ #break
+ elif opt == "--diploid":
+ options_storage.diploid_mode = True
+ elif opt == "--truseq":
+ options_storage.enable_truseq_mode()
+ else:
+ raise ValueError
+
+ if not options_storage.output_dir:
+ support.error("the output_dir is not set! It is a mandatory parameter (-o output_dir).", log)
+ if not os.path.isdir(options_storage.output_dir):
+ if options_storage.continue_mode:
+ support.error("the output_dir should exist for --continue and for --restart-from!", log)
+ os.makedirs(options_storage.output_dir)
+ if options_storage.restart_from:
+ if options_storage.continue_mode: # saving parameters specified with --restart-from
+ if not support.dataset_is_empty(dataset_data):
+ support.error("you cannot specify reads with --restart-from option!", log)
+ options_storage.save_restart_options(log)
+ else: # overriding previous run parameters
+ options_storage.load_restart_options()
+ if options_storage.meta:
+ if options_storage.careful or options_storage.mismatch_corrector or options_storage.cov_cutoff != "off":
+ support.error("you cannot specify --careful, --mismatch-correction or --cov-cutoff in metagenomic mode!", log)
+ if options_storage.continue_mode:
+ return None, None
+
+ existing_dataset_data = None
+ processed_dataset_fpath = os.path.join(options_storage.output_dir, "input_dataset.yaml")
+ if load_processed_dataset:
+ if os.path.isfile(processed_dataset_fpath):
+ try:
+ existing_dataset_data = pyyaml.load(open(processed_dataset_fpath, 'r'))
+ except pyyaml.YAMLError:
+ existing_dataset_data = None
+ if existing_dataset_data is not None:
+ dataset_data = existing_dataset_data
+ options_storage.dataset_yaml_filename = processed_dataset_fpath
+ else:
+ if options_storage.dataset_yaml_filename:
+ try:
+ dataset_data = pyyaml.load(open(options_storage.dataset_yaml_filename, 'r'))
+ except pyyaml.YAMLError:
+ _, exc, _ = sys.exc_info()
+ support.error('exception caught while parsing YAML file (' + options_storage.dataset_yaml_filename + '):\n' + str(exc))
+ dataset_data = support.relative2abs_paths(dataset_data, os.path.dirname(options_storage.dataset_yaml_filename))
+ else:
+ dataset_data = support.correct_dataset(dataset_data)
+ dataset_data = support.relative2abs_paths(dataset_data, os.getcwd())
+ options_storage.dataset_yaml_filename = processed_dataset_fpath
+ pyyaml.dump(dataset_data, open(options_storage.dataset_yaml_filename, 'w'))
+
+ support.check_dataset_reads(dataset_data, options_storage.only_assembler, log)
+ if not support.get_lib_ids_by_type(dataset_data, spades_logic.READS_TYPES_USED_IN_CONSTRUCTION):
+ support.error('you should specify at least one unpaired, paired-end, or high-quality mate-pairs library!')
+
+ options_storage.set_default_values()
+ ### FILLING cfg
+ cfg["common"] = empty_config()
+ cfg["dataset"] = empty_config()
+ if not options_storage.only_assembler:
+ cfg["error_correction"] = empty_config()
+ if not options_storage.only_error_correction:
+ cfg["assembly"] = empty_config()
+
+ # common
+ cfg["common"].__dict__["output_dir"] = options_storage.output_dir
+ cfg["common"].__dict__["tmp_dir"] = options_storage.tmp_dir
+ cfg["common"].__dict__["max_threads"] = options_storage.threads
+ cfg["common"].__dict__["max_memory"] = options_storage.memory
+ cfg["common"].__dict__["developer_mode"] = options_storage.developer_mode
+
+ # dataset section
+ cfg["dataset"].__dict__["yaml_filename"] = options_storage.dataset_yaml_filename
+ if options_storage.developer_mode and options_storage.reference:
+ cfg["dataset"].__dict__["reference"] = options_storage.reference
+
+ # error correction
+ if (not options_storage.only_assembler) and (options_storage.iterations > 0):
+ cfg["error_correction"].__dict__["output_dir"] = os.path.join(cfg["common"].output_dir, "corrected")
+ cfg["error_correction"].__dict__["max_iterations"] = options_storage.iterations
+ cfg["error_correction"].__dict__["gzip_output"] = not options_storage.disable_gzip_output
+ if options_storage.qvoffset:
+ cfg["error_correction"].__dict__["qvoffset"] = options_storage.qvoffset
+ if options_storage.bh_heap_check:
+ cfg["error_correction"].__dict__["heap_check"] = options_storage.bh_heap_check
+ cfg["error_correction"].__dict__["iontorrent"] = options_storage.iontorrent
+ if options_storage.meta or options_storage.large_genome:
+ cfg["error_correction"].__dict__["count_filter_singletons"] = 1
+
+ # assembly
+ if not options_storage.only_error_correction:
+ if options_storage.k_mers == 'auto' and options_storage.restart_from is None:
+ options_storage.k_mers = None
+ if options_storage.k_mers:
+ cfg["assembly"].__dict__["iterative_K"] = options_storage.k_mers
+ else:
+ cfg["assembly"].__dict__["iterative_K"] = options_storage.K_MERS_SHORT
+ cfg["assembly"].__dict__["disable_rr"] = options_storage.disable_rr
+ cfg["assembly"].__dict__["diploid_mode"] = options_storage.diploid_mode
+ cfg["assembly"].__dict__["cov_cutoff"] = options_storage.cov_cutoff
+ if options_storage.spades_heap_check:
+ cfg["assembly"].__dict__["heap_check"] = options_storage.spades_heap_check
+ if options_storage.read_buffer_size:
+ cfg["assembly"].__dict__["read_buffer_size"] = options_storage.read_buffer_size
+ cfg["assembly"].__dict__["correct_scaffolds"] = options_storage.correct_scaffolds
+ if options_storage.large_genome:
+ cfg["assembly"].__dict__["bwa_paired"] = True
+ cfg["assembly"].__dict__["scaffolding_mode"] = "old_pe_2015"
+ # the corrector can run only if contigs exist (i.e. not in error-correction-only mode)
+ if (not options_storage.only_error_correction) and options_storage.mismatch_corrector:
+ cfg["mismatch_corrector"] = empty_config()
+ cfg["mismatch_corrector"].__dict__["skip-masked"] = None
+ cfg["mismatch_corrector"].__dict__["bwa"] = os.path.join(bin_home, "bwa-spades")
+ cfg["mismatch_corrector"].__dict__["threads"] = options_storage.threads
+ cfg["mismatch_corrector"].__dict__["output-dir"] = options_storage.output_dir
+ cfg["run_truseq_postprocessing"] = options_storage.run_truseq_postprocessing
+ return cfg, dataset_data
+
+def check_cfg_for_partial_run(cfg, type='restart-from'): # restart-from or stop-after
+ if type == 'restart-from':
+ check_point = options_storage.restart_from
+ action = 'restart from'
+ verb = 'was'
+ elif type == 'stop-after':
+ check_point = options_storage.stop_after
+ action = 'stop after'
+ verb = 'is'
+ else:
+ return
+
+ if check_point == 'ec' and ("error_correction" not in cfg):
+ support.error("failed to " + action + " 'read error correction' ('" + check_point + "') because this stage " + verb + " not specified!")
+ if check_point == 'mc' and ("mismatch_corrector" not in cfg):
+ support.error("failed to " + action + " 'mismatch correction' ('" + check_point + "') because this stage " + verb + " not specified!")
+ if check_point == 'as' or check_point.startswith('k'):
+ if "assembly" not in cfg:
+ support.error("failed to " + action + " 'assembling' ('" + check_point + "') because this stage " + verb + " not specified!")
+ if check_point.startswith('k'):
+ correct_k = False
+ k_to_check = options_storage.k_mers
+ if not k_to_check:
+ if options_storage.auto_K_allowed():
+ k_to_check = list(set(options_storage.K_MERS_SHORT + options_storage.K_MERS_150 + options_storage.K_MERS_250))
+ else:
+ k_to_check = options_storage.K_MERS_SHORT
+ for k in k_to_check:
+ if check_point == ("k%d" % k) or check_point.startswith("k%d:" % k):
+ correct_k = True
+ break
+ if not correct_k:
+ k_str = check_point[1:]
+ if k_str.find(":") != -1:
+ k_str = k_str[:k_str.find(":")]
+ support.error("failed to " + action + " K=%s because this K " % k_str + verb + " not specified!")
+
+
+def get_options_from_params(params_filename, spades_py_name=None):
+ if not os.path.isfile(params_filename):
+ return None, None
+ params = open(params_filename, 'r')
+ cmd_line = params.readline().strip()
+ spades_prev_version = None
+ for line in params:
+ if line.find('SPAdes version:') != -1:
+ spades_prev_version = line.split('SPAdes version:')[1]
+ break
+ params.close()
+ if spades_prev_version is None:
+ support.error("failed to parse SPAdes version of the previous run! "
+ "Please restart from the beginning or specify another output directory.")
+ if spades_prev_version.strip() != spades_version.strip():
+ support.error("SPAdes version of the previous run (%s) is not equal to the current version of SPAdes (%s)! "
+ "Please restart from the beginning or specify another output directory."
+ % (spades_prev_version.strip(), spades_version.strip()))
+ if spades_py_name is None or cmd_line.find(os.path.basename(spades_py_name)) == -1:
+ spades_py_name = 'spades.py' # try default name
+ else:
+ spades_py_name = os.path.basename(spades_py_name)
+ spades_py_pos = cmd_line.find(spades_py_name)
+ if spades_py_pos == -1:
+ return None, None
+ return cmd_line, cmd_line[spades_py_pos + len(spades_py_name):].split('\t')
+
+
+def show_version():
+ options_storage.version(spades_version)
+ sys.exit(0)
+
+
+def show_usage(code, show_hidden=False):
+ options_storage.usage(spades_version, show_hidden=show_hidden)
+ sys.exit(code)
+
+
+def main(args):
+ os.environ["LC_ALL"] = "C"
+
+ if len(args) == 1:
+ show_usage(0)
+
+ log = logging.getLogger('spades')
+ log.setLevel(logging.DEBUG)
+
+ console = logging.StreamHandler(sys.stdout)
+ console.setFormatter(logging.Formatter('%(message)s'))
+ console.setLevel(logging.DEBUG)
+ log.addHandler(console)
+
+ support.check_binaries(bin_home, log)
+
+ # auto detecting SPAdes mode (rna, meta, etc)
+ mode = options_storage.get_mode()
+ if mode is not None:
+ args.append('--' + mode)
+
+ # parse options and save all parameters to cfg
+ options = args
+ cfg, dataset_data = fill_cfg(options, log)
+
+ if options_storage.continue_mode:
+ cmd_line, options = get_options_from_params(os.path.join(options_storage.output_dir, "params.txt"), args[0])
+ if not options:
+ support.error("failed to parse command line of the previous run! Please restart from the beginning or specify another output directory.")
+ cfg, dataset_data = fill_cfg(options, log, secondary_filling=True)
+ if options_storage.restart_from:
+ check_cfg_for_partial_run(cfg, type='restart-from')
+ options_storage.continue_mode = True
+ if options_storage.stop_after:
+ check_cfg_for_partial_run(cfg, type='stop-after')
+
+ log_filename = os.path.join(cfg["common"].output_dir, "spades.log")
+ if options_storage.continue_mode:
+ log_handler = logging.FileHandler(log_filename, mode='a')
+ else:
+ log_handler = logging.FileHandler(log_filename, mode='w')
+ log.addHandler(log_handler)
+
+ if options_storage.continue_mode:
+ log.info("\n======= SPAdes pipeline continued. Log can be found here: " + log_filename + "\n")
+ log.info("Restored from " + cmd_line)
+ if options_storage.restart_from:
+ updated_params = ""
+ skip_next = False
+ for v in args[1:]:
+ if v == '-o' or v == '--restart-from':
+ skip_next = True
+ continue
+ if skip_next or v.startswith('--restart-from='): # you can specify '--restart-from=k33' but not '-o=out_dir'
+ skip_next = False
+ continue
+ updated_params += "\t" + v
+ updated_params = updated_params.strip()
+ log.info("with updated parameters: " + updated_params)
+ cmd_line += "\t" + updated_params
+ log.info("")
+
+ params_filename = os.path.join(cfg["common"].output_dir, "params.txt")
+ params_handler = logging.FileHandler(params_filename, mode='w')
+ log.addHandler(params_handler)
+
+ if options_storage.continue_mode:
+ log.info(cmd_line)
+ else:
+ command = "Command line: "
+ for v in args:
+ # substituting relative paths with absolute ones (read paths, output dir path, etc)
+ v, prefix = support.get_option_prefix(v)
+ if v in options_storage.dict_of_rel2abs.keys():
+ v = options_storage.dict_of_rel2abs[v]
+ if prefix:
+ command += prefix + ":"
+ command += v + "\t"
+ log.info(command)
+
+ # special case
+# if "mismatch_corrector" in cfg and not support.get_lib_ids_by_type(dataset_data, 'paired-end'):
+# support.warning('cannot perform mismatch correction without at least one paired-end library! Skipping this step.', log)
+# del cfg["mismatch_corrector"]
+
+ print_used_values(cfg, log)
+ log.removeHandler(params_handler)
+
+ support.check_single_reads_in_options(options, log)
+
+ if not options_storage.continue_mode:
+ log.info("\n======= SPAdes pipeline started. Log can be found here: " + log_filename + "\n")
+
+ # splitting interlaced reads and processing Ns in additional contigs if needed
+ if support.dataset_has_interlaced_reads(dataset_data) or support.dataset_has_additional_contigs(dataset_data)\
+ or support.dataset_has_nxmate_reads(dataset_data):
+ dir_for_split_reads = os.path.join(options_storage.output_dir, 'split_input')
+ if support.dataset_has_interlaced_reads(dataset_data) or support.dataset_has_nxmate_reads(dataset_data):
+ if not os.path.isdir(dir_for_split_reads):
+ os.makedirs(dir_for_split_reads)
+ if support.dataset_has_interlaced_reads(dataset_data):
+ dataset_data = support.split_interlaced_reads(dataset_data, dir_for_split_reads, log)
+ if support.dataset_has_nxmate_reads(dataset_data):
+ dataset_data = support.process_nxmate_reads(dataset_data, dir_for_split_reads, log)
+ if support.dataset_has_additional_contigs(dataset_data):
+ dataset_data = support.process_Ns_in_additional_contigs(dataset_data, dir_for_split_reads, log)
+ options_storage.dataset_yaml_filename = os.path.join(options_storage.output_dir, "input_dataset.yaml")
+ pyyaml.dump(dataset_data, open(options_storage.dataset_yaml_filename, 'w'))
+ cfg["dataset"].yaml_filename = options_storage.dataset_yaml_filename
+
+ try:
+ # copying configs before all computations (to prevent them from changing at run time)
+ tmp_configs_dir = os.path.join(cfg["common"].output_dir, "configs")
+ if os.path.isdir(tmp_configs_dir) and not options_storage.continue_mode:
+ shutil.rmtree(tmp_configs_dir)
+ if not os.path.isdir(tmp_configs_dir):
+ if options_storage.configs_dir:
+ dir_util.copy_tree(options_storage.configs_dir, tmp_configs_dir, preserve_times=False, preserve_mode=False)
+ else:
+ dir_util.copy_tree(os.path.join(spades_home, "configs"), tmp_configs_dir, preserve_times=False, preserve_mode=False)
+
+ corrected_dataset_yaml_filename = ''
+ if "error_correction" in cfg:
+ STAGE_NAME = "Read error correction"
+ bh_cfg = merge_configs(cfg["error_correction"], cfg["common"])
+ corrected_dataset_yaml_filename = os.path.join(bh_cfg.output_dir, "corrected.yaml")
+ ec_is_needed = True
+ only_compressing_is_needed = False
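+ # when continuing: skip error correction if corrected reads already exist (and are gzipped when requested);
+ # if they exist but still need gzipping, only the compression step is re-run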
+ if os.path.isfile(corrected_dataset_yaml_filename) and options_storage.continue_mode \
+ and not options_storage.restart_from == "ec":
+ if not bh_cfg.gzip_output or \
+ support.dataset_has_gzipped_reads(pyyaml.load(open(corrected_dataset_yaml_filename, 'r'))):
+ log.info("\n===== Skipping %s (already processed). \n" % STAGE_NAME)
+ ec_is_needed = False
+ else:
+ only_compressing_is_needed = True
+ if ec_is_needed:
+ if not only_compressing_is_needed:
+ support.continue_from_here(log)
+
+ if "HEAPCHECK" in os.environ:
+ del os.environ["HEAPCHECK"]
+ if "heap_check" in bh_cfg.__dict__:
+ os.environ["HEAPCHECK"] = bh_cfg.heap_check
+
+ if os.path.exists(bh_cfg.output_dir):
+ shutil.rmtree(bh_cfg.output_dir)
+ os.makedirs(bh_cfg.output_dir)
+
+ bh_cfg.__dict__["dataset_yaml_filename"] = cfg["dataset"].yaml_filename
+ log.info("\n===== %s started. \n" % STAGE_NAME)
+
+ hammer_logic.run_hammer(corrected_dataset_yaml_filename, tmp_configs_dir, bin_home, bh_cfg, dataset_data,
+ ext_python_modules_home, only_compressing_is_needed, log)
+ log.info("\n===== %s finished. \n" % STAGE_NAME)
+ if options_storage.stop_after == 'ec':
+ support.finish_here(log)
+
+ result_contigs_filename = os.path.join(cfg["common"].output_dir, options_storage.contigs_name)
+ result_scaffolds_filename = os.path.join(cfg["common"].output_dir, options_storage.scaffolds_name)
+ result_assembly_graph_filename = os.path.join(cfg["common"].output_dir, options_storage.assembly_graph_name)
+ result_contigs_paths_filename = os.path.join(cfg["common"].output_dir, options_storage.contigs_paths)
+ result_scaffolds_paths_filename = os.path.join(cfg["common"].output_dir, options_storage.scaffolds_paths)
+ truseq_long_reads_file_base = os.path.join(cfg["common"].output_dir, "truseq_long_reads")
+ truseq_long_reads_file = truseq_long_reads_file_base + ".fasta"
+ misc_dir = os.path.join(cfg["common"].output_dir, "misc")
+ ### if mismatch correction is enabled, the resulting contigs are copied to the misc directory
+ assembled_contigs_filename = os.path.join(misc_dir, "assembled_contigs.fasta")
+ assembled_scaffolds_filename = os.path.join(misc_dir, "assembled_scaffolds.fasta")
+ if "assembly" in cfg and not options_storage.run_completed:
+ STAGE_NAME = "Assembling"
+ spades_cfg = merge_configs(cfg["assembly"], cfg["common"])
+ spades_cfg.__dict__["result_contigs"] = result_contigs_filename
+ spades_cfg.__dict__["result_scaffolds"] = result_scaffolds_filename
+ spades_cfg.__dict__["result_graph"] = result_assembly_graph_filename
+ spades_cfg.__dict__["result_contigs_paths"] = result_contigs_paths_filename
+ spades_cfg.__dict__["result_scaffolds_paths"] = result_scaffolds_paths_filename
+
+ if options_storage.continue_mode and (os.path.isfile(spades_cfg.result_contigs)
+ or ("mismatch_corrector" in cfg and
+ os.path.isfile(assembled_contigs_filename))
+ or (options_storage.truseq_mode and os.path.isfile(assembled_scaffolds_filename)))\
+ and not options_storage.restart_from == 'as' \
+ and not options_storage.restart_from == 'scc' \
+ and not (options_storage.restart_from and options_storage.restart_from.startswith('k')):
+
+ log.info("\n===== Skipping %s (already processed). \n" % STAGE_NAME)
+ # calculating latest_dir for the next stages
+ latest_dir = support.get_latest_dir(os.path.join(spades_cfg.output_dir, "K*"))
+ if not latest_dir:
+ support.error("failed to continue the previous run! Please restart from previous stages or from the beginning.", log)
+ else:
+ old_result_files = [result_contigs_filename, result_scaffolds_filename,
+ assembled_contigs_filename, assembled_scaffolds_filename]
+ for old_result_file in old_result_files:
+ if os.path.isfile(old_result_file):
+ os.remove(old_result_file)
+
+ if options_storage.restart_from == 'as':
+ support.continue_from_here(log)
+
+ if os.path.isfile(corrected_dataset_yaml_filename):
+ dataset_data = pyyaml.load(open(corrected_dataset_yaml_filename, 'r'))
+ dataset_data = support.relative2abs_paths(dataset_data, os.path.dirname(corrected_dataset_yaml_filename))
+ if spades_cfg.disable_rr:
+ spades_cfg.__dict__["rr_enable"] = False
+ else:
+ spades_cfg.__dict__["rr_enable"] = True
+
+ if "HEAPCHECK" in os.environ:
+ del os.environ["HEAPCHECK"]
+ if "heap_check" in spades_cfg.__dict__:
+ os.environ["HEAPCHECK"] = spades_cfg.heap_check
+
+ log.info("\n===== %s started.\n" % STAGE_NAME)
+
+ # creating dataset
+ dataset_filename = os.path.join(spades_cfg.output_dir, "dataset.info")
+ if not os.path.isfile(dataset_filename) or not options_storage.continue_mode:
+ dataset_file = open(dataset_filename, 'w')
+ import process_cfg
+ if os.path.isfile(corrected_dataset_yaml_filename):
+ dataset_file.write("reads" + '\t' + process_cfg.process_spaces(corrected_dataset_yaml_filename) + '\n')
+ else:
+ dataset_file.write("reads" + '\t' + process_cfg.process_spaces(cfg["dataset"].yaml_filename) + '\n')
+ if spades_cfg.developer_mode and "reference" in cfg["dataset"].__dict__:
+ dataset_file.write("reference_genome" + '\t')
+ dataset_file.write(process_cfg.process_spaces(cfg["dataset"].reference) + '\n')
+ dataset_file.close()
+ spades_cfg.__dict__["dataset"] = dataset_filename
+
+ latest_dir = spades_logic.run_spades(tmp_configs_dir, bin_home, spades_cfg, dataset_data, ext_python_modules_home, log)
+
+ if os.path.isdir(misc_dir) and not options_storage.continue_mode:
+ shutil.rmtree(misc_dir)
+ if not os.path.isdir(misc_dir):
+ os.makedirs(misc_dir)
+
+ if options_storage.continue_mode and options_storage.restart_from and options_storage.restart_from.startswith('k'):
+ k_str = options_storage.restart_from[1:]
+ if k_str.find(":") != -1:
+ k_str = k_str[:k_str.find(":")]
+ support.error("failed to continue from K=%s because this K was not processed in the original run!" % k_str, log)
+ log.info("\n===== %s finished. \n" % STAGE_NAME)
+ if not options_storage.run_completed:
+ if options_storage.stop_after == 'as' or options_storage.stop_after == 'scc' or (options_storage.stop_after and options_storage.stop_after.startswith('k')):
+ support.finish_here(log)
+
+ #postprocessing
+ if cfg["run_truseq_postprocessing"] and not options_storage.run_completed:
+ if options_storage.continue_mode and os.path.isfile(truseq_long_reads_file_base + ".fastq") and not options_storage.restart_from == 'tpp':
+ log.info("\n===== Skipping %s (already processed). \n" % "TruSeq postprocessing")
+ else:
+ support.continue_from_here(log)
+ if os.path.isfile(result_scaffolds_filename):
+ shutil.move(result_scaffolds_filename, assembled_scaffolds_filename)
+ reads_library = dataset_data[0]
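+ # align the reads to the assembled scaffolds with bwa-spades and run Moleculo/TruSeq postprocessing on the alignments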
+ alignment_bin = os.path.join(bin_home, "bwa-spades")
+ alignment_dir = os.path.join(cfg["common"].output_dir, "alignment")
+ sam_files = alignment.align_bwa(alignment_bin, assembled_scaffolds_filename, dataset_data, alignment_dir, log, options_storage.threads)
+ moleculo_postprocessing.moleculo_postprocessing(assembled_scaffolds_filename, truseq_long_reads_file_base, sam_files, log)
+ if options_storage.stop_after == 'tpp':
+ support.finish_here(log)
+
+ #corrector
+ if "mismatch_corrector" in cfg and not options_storage.run_completed and \
+ (os.path.isfile(result_contigs_filename) or
+ (options_storage.continue_mode and os.path.isfile(assembled_contigs_filename))):
+ STAGE_NAME = "Mismatch correction"
+ to_correct = dict()
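+ # maps assembly type to a pair: (final corrected output path, uncorrected copy kept in the misc dir)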
+ to_correct["contigs"] = (result_contigs_filename, assembled_contigs_filename)
+ if os.path.isfile(result_scaffolds_filename) or (options_storage.continue_mode and
+ os.path.isfile(assembled_scaffolds_filename)):
+ to_correct["scaffolds"] = (result_scaffolds_filename, assembled_scaffolds_filename)
+
+ # moving assembled contigs (scaffolds) to misc dir
+ for assembly_type, (old, new) in to_correct.items():
+ if options_storage.continue_mode and os.path.isfile(new):
+ continue
+ if os.path.isfile(old):
+ shutil.move(old, new)
+
+ if options_storage.continue_mode and os.path.isfile(result_contigs_filename) and \
+ (os.path.isfile(result_scaffolds_filename) or not os.path.isfile(assembled_scaffolds_filename)) \
+ and not options_storage.restart_from == 'mc':
+ log.info("\n===== Skipping %s (already processed). \n" % STAGE_NAME)
+ else:
+ if options_storage.restart_from == 'mc':
+ support.continue_from_here(log)
+
+ log.info("\n===== %s started." % STAGE_NAME)
+ # detecting paired-end library with the largest insert size
+ cfg["mismatch_corrector"].__dict__["dataset"] = cfg["dataset"].yaml_filename
+ #TODO: add reads orientation
+
+ import corrector_logic
+ corrector_cfg = cfg["mismatch_corrector"]
+ # processing contigs and scaffolds (or only contigs)
+ for assembly_type, (corrected, assembled) in to_correct.items():
+ if options_storage.continue_mode and os.path.isfile(corrected):
+ log.info("\n== Skipping processing of " + assembly_type + " (already processed)\n")
+ continue
+
+ support.continue_from_here(log)
+ log.info("\n== Processing of " + assembly_type + "\n")
+
+ tmp_dir_for_corrector = os.path.join(cfg["common"].output_dir, "mismatch_corrector", assembly_type)
+
+ cfg["mismatch_corrector"].__dict__["output_dir"] = tmp_dir_for_corrector
+ # correcting
+ corr_cfg = merge_configs(cfg["mismatch_corrector"], cfg["common"])
+
+ result_corrected_filename = os.path.join(tmp_dir_for_corrector, "corrected_contigs.fasta")
+
+ corrector_logic.run_corrector( tmp_configs_dir, bin_home, corr_cfg,
+ ext_python_modules_home, log, assembled, result_corrected_filename)
+
+ if os.path.isfile(result_corrected_filename):
+ shutil.copyfile(result_corrected_filename, corrected)
+ tmp_d = os.path.join(tmp_dir_for_corrector, "tmp")
+ if os.path.isdir(tmp_d) and not cfg["common"].developer_mode:
+ shutil.rmtree(tmp_d)
+ log.info("\n===== %s finished.\n" % STAGE_NAME)
+ if options_storage.stop_after == 'mc':
+ support.finish_here(log)
+
+ if not cfg["common"].developer_mode and os.path.isdir(tmp_configs_dir):
+ shutil.rmtree(tmp_configs_dir)
+
+ if not options_storage.run_completed:
+ #log.info("")
+ if "error_correction" in cfg and os.path.isdir(os.path.dirname(corrected_dataset_yaml_filename)):
+ log.info(" * Corrected reads are in " + support.process_spaces(os.path.dirname(corrected_dataset_yaml_filename) + "/"))
+ if "assembly" in cfg and os.path.isfile(result_contigs_filename):
+ message = " * Assembled contigs are in " + support.process_spaces(result_contigs_filename)
+ log.info(message)
+ if "assembly" in cfg and os.path.isfile(result_scaffolds_filename):
+ message = " * Assembled scaffolds are in " + support.process_spaces(result_scaffolds_filename)
+ log.info(message)
+ if "assembly" in cfg and os.path.isfile(result_assembly_graph_filename):
+ message = " * Assembly graph is in " + support.process_spaces(result_assembly_graph_filename)
+ log.info(message)
+ if "assembly" in cfg and os.path.isfile(result_contigs_paths_filename):
+ message = " * Paths in the assembly graph corresponding to the contigs are in " + \
+ support.process_spaces(result_contigs_paths_filename)
+ log.info(message)
+ if "assembly" in cfg and os.path.isfile(result_scaffolds_paths_filename):
+ message = " * Paths in the assembly graph corresponding to the scaffolds are in " + \
+ support.process_spaces(result_scaffolds_paths_filename)
+ log.info(message)
+ #log.info("")
+
+ #breaking scaffolds
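+ # scaffolds are split at long runs of Ns (THRESHOLD_FOR_BREAKING_SCAFFOLDS) and saved to misc/broken_scaffolds.fasta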
+ if os.path.isfile(result_scaffolds_filename):
+ if not os.path.isdir(misc_dir):
+ os.makedirs(misc_dir)
+ result_broken_scaffolds = os.path.join(misc_dir, "broken_scaffolds.fasta")
+ if not os.path.isfile(result_broken_scaffolds) or not options_storage.continue_mode:
+ modified, broken_scaffolds = support.break_scaffolds(result_scaffolds_filename,
+ options_storage.THRESHOLD_FOR_BREAKING_SCAFFOLDS)
+ if modified:
+ support.write_fasta(result_broken_scaffolds, broken_scaffolds)
+ #log.info(" * Scaffolds broken by " + str(options_storage.THRESHOLD_FOR_BREAKING_SCAFFOLDS) +
+ # " Ns are in " + result_broken_scaffolds)
+
+ ### printing WARNINGS SUMMARY
+ if not support.log_warnings(log):
+ log.info("\n======= SPAdes pipeline finished.") # otherwise it finished WITH WARNINGS
+
+ if options_storage.test_mode:
+ if options_storage.truseq_mode:
+ if not os.path.isfile(truseq_long_reads_file):
+ support.error("TEST FAILED: %s does not exist!" % truseq_long_reads_file)
+ else:
+ for result_filename in [result_contigs_filename, result_scaffolds_filename]:
+ if os.path.isfile(result_filename):
+ result_fasta = list(support.read_fasta(result_filename))
+ # correctness check: should be one contig of length 1000 bp
+ correct_number = 1
+ correct_length = 1000
+ if not len(result_fasta):
+ support.error("TEST FAILED: %s does not contain contigs!" % result_filename)
+ elif len(result_fasta) > correct_number:
+ support.error("TEST FAILED: %s contains more than %d contig (%d)!" %
+ (result_filename, correct_number, len(result_fasta)))
+ elif len(result_fasta[0][1]) != correct_length:
+ if len(result_fasta[0][1]) > correct_length:
+ relation = "more"
+ else:
+ relation = "less"
+ support.error("TEST FAILED: %s contains %s than %d bp (%d bp)!" %
+ (result_filename, relation, correct_length, len(result_fasta[0][1])))
+ else:
+ support.error("TEST FAILED: " + result_filename + " does not exist!")
+ log.info("\n========= TEST PASSED CORRECTLY.")
+
+
+ log.info("\nSPAdes log can be found here: " + log_filename)
+ log.info("")
+ log.info("Thank you for using SPAdes!")
+ log.removeHandler(log_handler)
+
+ except Exception:
+ exc_type, exc_value, _ = sys.exc_info()
+ if exc_type == SystemExit:
+ sys.exit(exc_value)
+ else:
+ if exc_type == OSError and exc_value.errno == errno.ENOEXEC: # Exec format error
+ support.error("It looks like you are using SPAdes binaries for another platform.\n" +
+ support.get_spades_binaries_info_message())
+ else:
+ log.exception(exc_value)
+ support.error("exception caught: %s" % exc_type, log)
+ except BaseException: # since python 2.5 system-exiting exceptions (e.g. KeyboardInterrupt) are derived from BaseException
+ exc_type, exc_value, _ = sys.exc_info()
+ if exc_type == SystemExit:
+ sys.exit(exc_value)
+ else:
+ log.exception(exc_value)
+ support.error("exception caught: %s" % exc_type, log)
+
+
+if __name__ == '__main__':
+ main(sys.argv)
diff --git a/spades.py b/spades.py
index 2c4a95a..d06205f 100755
--- a/spades.py
+++ b/spades.py
@@ -88,18 +88,23 @@ def print_used_values(cfg, log):
if "dataset" in cfg:
log.info("Dataset parameters:")
- if cfg["dataset"].single_cell:
- log.info(" Single-cell mode")
- elif cfg["dataset"].meta:
+ if options_storage.iontorrent:
+ log.info(" IonTorrent data")
+
+ if options_storage.meta:
log.info(" Metagenomic mode")
- elif cfg["dataset"].truseq:
- log.info(" Illumina TruSeq mode")
+ elif options_storage.large_genome:
+ log.info(" Large genome mode")
+ elif options_storage.truseq_mode:
+ log.info(" Illumina TruSeq mode")
+ elif options_storage.rna:
+ log.info(" RNA-seq mode")
+ elif options_storage.single_cell:
+ log.info(" Single-cell mode")
else:
log.info(" Multi-cell mode (you should set '--sc' flag if input data"\
" was obtained with MDA (single-cell) technology"\
" or --meta flag if processing metagenomic dataset)")
- if cfg["dataset"].iontorrent:
- log.info(" IonTorrent data")
log.info(" Reads:")
dataset_data = pyyaml.load(open(cfg["dataset"].yaml_filename, 'r'))
@@ -124,14 +129,16 @@ def print_used_values(cfg, log):
log.info(" k: automatic selection based on read length")
else:
print_value(cfg, "assembly", "iterative_K", "k")
- if cfg["assembly"].careful:
- log.info(" Mismatch careful mode is turned ON")
- else:
- log.info(" Mismatch careful mode is turned OFF")
+ if options_storage.plasmid:
+ log.info(" Plasmid mode is turned ON")
if cfg["assembly"].disable_rr:
log.info(" Repeat resolution is DISABLED")
else:
log.info(" Repeat resolution is enabled")
+ if options_storage.careful:
+ log.info(" Mismatch careful mode is turned ON")
+ else:
+ log.info(" Mismatch careful mode is turned OFF")
if "mismatch_corrector" in cfg:
log.info(" MismatchCorrector will be used")
else:
@@ -218,7 +225,17 @@ def fill_cfg(options_to_parse, log, secondary_filling=False):
elif opt == "--sc":
options_storage.single_cell = True
elif opt == "--meta":
+ #FIXME temporary solution
+ options_storage.single_cell = True
options_storage.meta = True
+ elif opt == "--large-genome":
+ options_storage.large_genome = True
+ elif opt == "--plasmid":
+ options_storage.plasmid = True
+ elif opt == "--rna":
+ #FIXME temporary solution
+ options_storage.single_cell = True
+ options_storage.rna = True
elif opt == "--iontorrent":
options_storage.iontorrent = True
elif opt == "--disable-gzip-output":
@@ -229,7 +246,7 @@ def fill_cfg(options_to_parse, log, secondary_filling=False):
options_storage.disable_rr = True
elif opt == "--disable-rr:false":
options_storage.disable_rr = False
-
+
elif opt == "--only-error-correction":
if options_storage.only_assembler:
support.error('you cannot specify --only-error-correction and --only-assembler simultaneously')
@@ -386,11 +403,7 @@ def fill_cfg(options_to_parse, log, secondary_filling=False):
cfg["common"].__dict__["developer_mode"] = options_storage.developer_mode
# dataset section
- cfg["dataset"].__dict__["single_cell"] = options_storage.single_cell
- cfg["dataset"].__dict__["iontorrent"] = options_storage.iontorrent
- cfg["dataset"].__dict__["meta"] = options_storage.meta
cfg["dataset"].__dict__["yaml_filename"] = options_storage.dataset_yaml_filename
- cfg["dataset"].__dict__["truseq"] = options_storage.truseq_mode
if options_storage.developer_mode and options_storage.reference:
cfg["dataset"].__dict__["reference"] = options_storage.reference
@@ -404,7 +417,7 @@ def fill_cfg(options_to_parse, log, secondary_filling=False):
if options_storage.bh_heap_check:
cfg["error_correction"].__dict__["heap_check"] = options_storage.bh_heap_check
cfg["error_correction"].__dict__["iontorrent"] = options_storage.iontorrent
- if options_storage.meta:
+ if options_storage.meta or options_storage.large_genome:
cfg["error_correction"].__dict__["count_filter_singletons"] = 1
# assembly
@@ -415,7 +428,6 @@ def fill_cfg(options_to_parse, log, secondary_filling=False):
cfg["assembly"].__dict__["iterative_K"] = options_storage.k_mers
else:
cfg["assembly"].__dict__["iterative_K"] = options_storage.K_MERS_SHORT
- cfg["assembly"].__dict__["careful"] = options_storage.careful
cfg["assembly"].__dict__["disable_rr"] = options_storage.disable_rr
cfg["assembly"].__dict__["diploid_mode"] = options_storage.diploid_mode
cfg["assembly"].__dict__["cov_cutoff"] = options_storage.cov_cutoff
@@ -424,7 +436,9 @@ def fill_cfg(options_to_parse, log, secondary_filling=False):
if options_storage.read_buffer_size:
cfg["assembly"].__dict__["read_buffer_size"] = options_storage.read_buffer_size
cfg["assembly"].__dict__["correct_scaffolds"] = options_storage.correct_scaffolds
-
+ if options_storage.large_genome:
+ cfg["assembly"].__dict__["bwa_paired"] = True
+ cfg["assembly"].__dict__["scaffolding_mode"] = "old_pe_2015"
#corrector can work only if contigs exist (not only error correction)
if (not options_storage.only_error_correction) and options_storage.mismatch_corrector:
cfg["mismatch_corrector"] = empty_config()
@@ -498,7 +512,7 @@ def get_options_from_params(params_filename, spades_py_name=None):
spades_py_pos = cmd_line.find(spades_py_name)
if spades_py_pos == -1:
return None, None
- return cmd_line, cmd_line[spades_py_pos + len(spades_py_name):].split()
+ return cmd_line, cmd_line[spades_py_pos + len(spades_py_name):].split('\t')
def show_version():
@@ -527,6 +541,11 @@ def main(args):
support.check_binaries(bin_home, log)
+ # auto detecting SPAdes mode (rna, meta, etc)
+ mode = options_storage.get_mode()
+ if mode is not None:
+ args.append('--' + mode)
+
# parse options and safe all parameters to cfg
options = args
cfg, dataset_data = fill_cfg(options, log)
@@ -562,10 +581,10 @@ def main(args):
if skip_next or v.startswith('--restart-from='): # you can specify '--restart-from=k33' but not '-o=out_dir'
skip_next = False
continue
- updated_params += " " + v
+ updated_params += "\t" + v
updated_params = updated_params.strip()
log.info("with updated parameters: " + updated_params)
- cmd_line += " " + updated_params
+ cmd_line += "\t" + updated_params
log.info("")
params_filename = os.path.join(cfg["common"].output_dir, "params.txt")
@@ -583,7 +602,7 @@ def main(args):
v = options_storage.dict_of_rel2abs[v]
if prefix:
command += prefix + ":"
- command += v + " "
+ command += v + "\t"
log.info(command)
# special case
@@ -623,44 +642,43 @@ def main(args):
shutil.rmtree(tmp_configs_dir)
if not os.path.isdir(tmp_configs_dir):
if options_storage.configs_dir:
- dir_util.copy_tree(options_storage.configs_dir, tmp_configs_dir, preserve_times=False)
+ dir_util.copy_tree(options_storage.configs_dir, tmp_configs_dir, preserve_times=False, preserve_mode=False)
else:
- dir_util.copy_tree(os.path.join(spades_home, "configs"), tmp_configs_dir, preserve_times=False)
+ dir_util.copy_tree(os.path.join(spades_home, "configs"), tmp_configs_dir, preserve_times=False, preserve_mode=False)
corrected_dataset_yaml_filename = ''
if "error_correction" in cfg:
STAGE_NAME = "Read error correction"
bh_cfg = merge_configs(cfg["error_correction"], cfg["common"])
corrected_dataset_yaml_filename = os.path.join(bh_cfg.output_dir, "corrected.yaml")
+ ec_is_needed = True
+ only_compressing_is_needed = False
if os.path.isfile(corrected_dataset_yaml_filename) and options_storage.continue_mode \
- and not options_storage.restart_from == "ec":
- log.info("\n===== Skipping %s (already processed). \n" % STAGE_NAME)
- else:
- support.continue_from_here(log)
-
- if "HEAPCHECK" in os.environ:
- del os.environ["HEAPCHECK"]
- if "heap_check" in bh_cfg.__dict__:
- os.environ["HEAPCHECK"] = bh_cfg.heap_check
-
- if os.path.exists(bh_cfg.output_dir):
- shutil.rmtree(bh_cfg.output_dir)
- os.makedirs(bh_cfg.output_dir)
-
- if support.get_lib_ids_by_type(dataset_data, options_storage.LONG_READS_TYPES):
- not_used_dataset_data = support.get_libs_by_type(dataset_data, options_storage.LONG_READS_TYPES)
- to_correct_dataset_data = support.rm_libs_by_type(dataset_data, options_storage.LONG_READS_TYPES)
- to_correct_dataset_yaml_filename = os.path.join(bh_cfg.output_dir, "to_correct.yaml")
- pyyaml.dump(to_correct_dataset_data, open(to_correct_dataset_yaml_filename, 'w'))
- bh_cfg.__dict__["dataset_yaml_filename"] = to_correct_dataset_yaml_filename
+ and not options_storage.restart_from == "ec":
+ if not bh_cfg.gzip_output or \
+ support.dataset_has_gzipped_reads(pyyaml.load(open(corrected_dataset_yaml_filename, 'r'))):
+ log.info("\n===== Skipping %s (already processed). \n" % STAGE_NAME)
+ ec_is_needed = False
else:
- not_used_dataset_data = None
- bh_cfg.__dict__["dataset_yaml_filename"] = cfg["dataset"].yaml_filename
+ only_compressing_is_needed = True
+ if ec_is_needed:
+ if not only_compressing_is_needed:
+ support.continue_from_here(log)
+
+ if "HEAPCHECK" in os.environ:
+ del os.environ["HEAPCHECK"]
+ if "heap_check" in bh_cfg.__dict__:
+ os.environ["HEAPCHECK"] = bh_cfg.heap_check
+
+ if os.path.exists(bh_cfg.output_dir):
+ shutil.rmtree(bh_cfg.output_dir)
+ os.makedirs(bh_cfg.output_dir)
+ bh_cfg.__dict__["dataset_yaml_filename"] = cfg["dataset"].yaml_filename
log.info("\n===== %s started. \n" % STAGE_NAME)
- hammer_logic.run_hammer(corrected_dataset_yaml_filename, tmp_configs_dir, bin_home, bh_cfg, not_used_dataset_data,
- ext_python_modules_home, log)
+ hammer_logic.run_hammer(corrected_dataset_yaml_filename, tmp_configs_dir, bin_home, bh_cfg, dataset_data,
+ ext_python_modules_home, only_compressing_is_needed, log)
log.info("\n===== %s finished. \n" % STAGE_NAME)
if options_storage.stop_after == 'ec':
support.finish_here(log)
@@ -687,7 +705,8 @@ def main(args):
if options_storage.continue_mode and (os.path.isfile(spades_cfg.result_contigs)
or ("mismatch_corrector" in cfg and
- os.path.isfile(assembled_contigs_filename)))\
+ os.path.isfile(assembled_contigs_filename))
+ or (options_storage.truseq_mode and os.path.isfile(assembled_scaffolds_filename)))\
and not options_storage.restart_from == 'as' \
and not options_storage.restart_from == 'scc' \
and not (options_storage.restart_from and options_storage.restart_from.startswith('k')):
@@ -727,9 +746,6 @@ def main(args):
if not os.path.isfile(dataset_filename) or not options_storage.continue_mode:
dataset_file = open(dataset_filename, 'w')
import process_cfg
- dataset_file.write("single_cell" + '\t' + process_cfg.bool_to_str(cfg["dataset"].single_cell) + '\n')
- dataset_file.write("meta" + '\t' + process_cfg.bool_to_str(cfg["dataset"].meta) + '\n')
- dataset_file.write("moleculo" + '\t' + process_cfg.bool_to_str(cfg["dataset"].truseq) + '\n')
if os.path.isfile(corrected_dataset_yaml_filename):
dataset_file.write("reads" + '\t' + process_cfg.process_spaces(corrected_dataset_yaml_filename) + '\n')
else:
diff --git a/spades_compile.sh b/spades_compile.sh
index 0cc1ee3..580f4b6 100755
--- a/spades_compile.sh
+++ b/spades_compile.sh
@@ -15,11 +15,11 @@ fi
BUILD_DIR=build_spades
BASEDIR=`pwd`/`dirname $0`
-rm -rf $BASEDIR/$BUILD_DIR
-mkdir -p $BASEDIR/$BUILD_DIR
+rm -rf "$BASEDIR/$BUILD_DIR"
+mkdir -p "$BASEDIR/$BUILD_DIR"
set -e
-cd $BASEDIR/$BUILD_DIR
-cmake -G "Unix Makefiles" -DCMAKE_INSTALL_PREFIX=$PREFIX $BASEDIR/src $*
+cd "$BASEDIR/$BUILD_DIR"
+cmake -G "Unix Makefiles" -DCMAKE_INSTALL_PREFIX="$PREFIX" "$BASEDIR/src" $*
make -j 8
make install
cd $PREFIX
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index d971bc3..fd7ad2e 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -20,6 +20,7 @@ set(CMAKE_MODULE_PATH
"${CMAKE_CURRENT_SOURCE_DIR}/cmake/Modules")
# Define various dirs
set(SPADES_MAIN_SRC_DIR ${CMAKE_CURRENT_SOURCE_DIR})
+set(SPADES_MODULES_DIR ${SPADES_MAIN_SRC_DIR}/modules)
set(SPADES_MAIN_INCLUDE_DIR ${SPADES_MAIN_SRC_DIR}/include)
set(SPADES_BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR})
set(SPADES_TOOLS_BINARY_DIR ${SPADES_BINARY_DIR}/bin)
@@ -81,25 +82,31 @@ if (NOT OPENMP_FOUND)
endif()
# sub projects
-add_subdirectory(io)
-add_subdirectory(debruijn)
-add_subdirectory(hammer)
-add_subdirectory(ionhammer)
+add_subdirectory(modules)
+add_subdirectory(projects)
add_subdirectory(spades_pipeline)
-add_subdirectory(dipspades)
-add_subdirectory(corrector)
-add_subdirectory(scaffold_correction)
+
# Main pipeline script
install(PROGRAMS "${CMAKE_CURRENT_SOURCE_DIR}/../spades.py"
DESTINATION bin
COMPONENT runtime)
-install(PROGRAMS "${CMAKE_CURRENT_SOURCE_DIR}/../truspades.py"
+install(PROGRAMS "${CMAKE_CURRENT_SOURCE_DIR}/../dipspades.py"
DESTINATION bin
COMPONENT runtime)
-install(PROGRAMS "${CMAKE_CURRENT_SOURCE_DIR}/../dipspades.py"
+install(PROGRAMS "${CMAKE_CURRENT_SOURCE_DIR}/../metaspades.py"
+ DESTINATION bin
+ COMPONENT runtime)
+install(PROGRAMS "${CMAKE_CURRENT_SOURCE_DIR}/../plasmidspades.py"
DESTINATION bin
COMPONENT runtime)
+#install(PROGRAMS "${CMAKE_CURRENT_SOURCE_DIR}/../rnaspades.py"
+# DESTINATION bin
+# COMPONENT runtime)
+install(PROGRAMS "${CMAKE_CURRENT_SOURCE_DIR}/../truspades.py"
+ DESTINATION bin
+ COMPONENT runtime)
+
install(FILES "${CMAKE_CURRENT_SOURCE_DIR}/../spades_init.py"
DESTINATION bin
COMPONENT runtime)
diff --git a/src/cmake/includes.cmake b/src/cmake/includes.cmake
index e745f2e..ed3f984 100644
--- a/src/cmake/includes.cmake
+++ b/src/cmake/includes.cmake
@@ -3,7 +3,7 @@
set(CMAKE_INCLUDE_CURRENT_DIR ON)
set(CMAKE_INCLUDE_SYSTEM_FLAG_C "-isystem ")
set(CMAKE_INCLUDE_SYSTEM_FLAG_CXX "-isystem ")
-include_directories(${SPADES_MAIN_INCLUDE_DIR} ${SPADES_BUILT_INCLUDE_DIR})
+include_directories(${SPADES_MAIN_INCLUDE_DIR} ${SPADES_BUILT_INCLUDE_DIR} ${CMAKE_SOURCE_DIR} ${SPADES_MODULES_DIR})
include_directories(SYSTEM "${EXT_DIR}/include")
include_directories(SYSTEM "${ZLIB_INCLUDE_DIRS}")
include_directories(SYSTEM "${Boost_INCLUDE_DIRS}")
diff --git a/src/cmake/pack.cmake b/src/cmake/pack.cmake
index 089b9f2..08b37da 100644
--- a/src/cmake/pack.cmake
+++ b/src/cmake/pack.cmake
@@ -12,10 +12,10 @@ set(CPACK_PACKAGE_NAME "SPAdes")
set(CPACK_PACKAGE_VENDOR "Saint Petersburg Academic University")
set(CPACK_PACKAGE_DESCRIPTION_FILE "${SPADES_MAIN_SRC_DIR}/../README")
set(CPACK_RESOURCE_FILE_LICENSE "${SPADES_MAIN_SRC_DIR}/../LICENSE")
-set(CPACK_PACKAGE_VERSION "3.7.1")
+set(CPACK_PACKAGE_VERSION "3.8.0")
set(CPACK_PACKAGE_VERSION_MAJOR "3")
-set(CPACK_PACKAGE_VERSION_MINOR "7")
-set(CPACK_PACKAGE_VERSION_PATCH "1")
+set(CPACK_PACKAGE_VERSION_MINOR "8")
+set(CPACK_PACKAGE_VERSION_PATCH "0")
set(CPACK_STRIP_FILES bin/spades bin/hammer bin/ionhammer bin/dipspades bin/spades-bwa bin/corrector bin/scaffold_correction)
# Source stuff
diff --git a/src/corrector/CMakeLists.txt b/src/corrector/CMakeLists.txt
deleted file mode 100644
index 42c66bb..0000000
--- a/src/corrector/CMakeLists.txt
+++ /dev/null
@@ -1,34 +0,0 @@
-############################################################################
-# Copyright (c) 2015 Saint Petersburg State University
-# Copyright (c) 2011-2014 Saint Petersburg Academic University
-# All Rights Reserved
-# See file LICENSE for details.
-############################################################################
-
-project(corrector CXX)
-
-include_directories(${CMAKE_CURRENT_SOURCE_DIR})
-
-
-add_executable(corrector
- positional_read.cpp
- interesting_pos_processor.cpp
- contig_processor.cpp
- dataset_processor.cpp
- config_struct.cpp
- main.cpp)
-
-target_link_libraries(corrector input yaml-cpp ${COMMON_LIBRARIES})
-
-
-
-if (SPADES_STATIC_BUILD)
- set_target_properties(corrector PROPERTIES LINK_SEARCH_END_STATIC 1)
-endif()
-
-install(TARGETS corrector
- DESTINATION bin
- COMPONENT runtime)
-install(DIRECTORY "${SPADES_CFG_DIR}/corrector"
- DESTINATION share/spades/configs
- FILES_MATCHING PATTERN "*.info.template")
diff --git a/src/corrector/config_struct.cpp b/src/corrector/config_struct.cpp
deleted file mode 100644
index 8259b40..0000000
--- a/src/corrector/config_struct.cpp
+++ /dev/null
@@ -1,51 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#include "config_struct.hpp"
-
-#include "openmp_wrapper.h"
-
-#include <yaml-cpp/yaml.h>
-#include <string>
-
-namespace YAML {
-template<>
-struct convert<corrector::Strategy> {
- static bool decode(const YAML::Node &node, corrector::Strategy &rhs) {
- std::string strategy_str = node.as<std::string>();
- if (strategy_str == "all_reads") {
- rhs = corrector::Strategy::AllReads;
- return true;
- } else if (strategy_str == "majority_only") {
- rhs = corrector::Strategy::MajorityOnly;
- return true;
- } else if (strategy_str == "not_started") {
- rhs = corrector::Strategy::AllExceptJustStarted;
- return true;
- } else if (strategy_str == "mapped_squared") {
- rhs = corrector::Strategy::MappedSquared;
- return true;
- }
- return false;
- }
-};
-}
-
-namespace corrector {
-void load(corrector_config& cfg, const std::string &filename) {
- YAML::Node config = YAML::LoadFile(filename);
- cfg.dataset.load(config["dataset"].as<std::string>());
- cfg.work_dir = config["work_dir"].as<std::string>(".");
- cfg.output_dir = config["output_dir"].as<std::string>(".");
- cfg.max_nthreads = config["max_nthreads"].as<unsigned>();
-
- cfg.max_nthreads = std::min(cfg.max_nthreads, (unsigned) omp_get_max_threads());
- cfg.strat = config["strategy"].as<Strategy>();
- cfg.bwa = config["bwa"].as<std::string>(".");
- omp_set_num_threads(cfg.max_nthreads);
-}
-}
diff --git a/src/corrector/config_struct.hpp b/src/corrector/config_struct.hpp
deleted file mode 100644
index 575bffb..0000000
--- a/src/corrector/config_struct.hpp
+++ /dev/null
@@ -1,33 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include "config_singl.hpp"
-
-#include "io/library.hpp"
-
-namespace corrector {
-enum class Strategy {
- AllReads = 1,
- MappedSquared = 2,
- AllExceptJustStarted = 3,
- MajorityOnly = 4
-};
-struct corrector_config {
- io::DataSet<> dataset;
- std::string work_dir;
- std::string output_dir;
- unsigned max_nthreads;
- Strategy strat;
- std::string bwa;
-};
-
-void load(corrector::corrector_config& cfg, const std::string &filename);
-}
-
-typedef config_common::config<corrector::corrector_config> corr_cfg;
diff --git a/src/corrector/contig_processor.cpp b/src/corrector/contig_processor.cpp
deleted file mode 100644
index 325d797..0000000
--- a/src/corrector/contig_processor.cpp
+++ /dev/null
@@ -1,302 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#include "contig_processor.hpp"
-#include "config_struct.hpp"
-#include "variants_table.hpp"
-
-#include "io/ireader.hpp"
-#include "io/osequencestream.hpp"
-#include "io/file_reader.hpp"
-#include "io/single_read.hpp"
-#include "path_helper.hpp"
-
-#include <boost/algorithm/string.hpp>
-
-using namespace std;
-
-namespace corrector {
-
-void ContigProcessor::ReadContig() {
- io::FileReadStream frs(contig_file_);
- io::SingleRead cur_read;
- frs >> cur_read;
- if (!frs.eof()) {
-#pragma omp critical
- {
- ERROR("Non unique sequnce in one contig fasta!");
- }
- }
- contig_name_ = cur_read.name();
- contig_ = cur_read.GetSequenceString();
-
- output_contig_file_ = path::append_path(path::parent_path(contig_file_), path::basename(contig_file_) + ".ref.fasta");
- charts_.resize(contig_.length());
-}
-
-void ContigProcessor::UpdateOneRead(const SingleSamRead &tmp, MappedSamStream &sm) {
- unordered_map<size_t, position_description> all_positions;
- if (tmp.contig_id() < 0) {
- return;
- }
- auto cur_s = sm.get_contig_name(tmp.contig_id());
- if (contig_name_.compare(cur_s) != 0) {
- return;
- }
- CountPositions(tmp, all_positions);
- size_t error_num = 0;
-
- for (auto &pos : all_positions) {
- charts_[pos.first].update(pos.second);
- if (pos.second.FoundOptimal(contig_[pos.first]) != var_to_pos[(int) contig_[pos.first]]) {
- error_num++;
- }
- }
-
- if (error_num >= error_counts_.size())
- error_counts_[error_counts_.size() - 1]++;
- else
- error_counts_[error_num]++;
-}
-
-//returns: number of changed nucleotides;
-size_t ContigProcessor::UpdateOneBase(size_t i, stringstream &ss, const unordered_map<size_t, position_description> &interesting_positions) const{
- char old = (char) toupper(contig_[i]);
- auto strat = corr_cfg::get().strat;
- size_t maxi = charts_[i].FoundOptimal(contig_[i]);
- auto i_position = interesting_positions.find(i);
- if (i_position != interesting_positions.end()) {
- size_t maxj = i_position->second.FoundOptimal(contig_[i]);
- if (maxj != maxi) {
- DEBUG("Interesting positions differ with majority!");
- DEBUG("On position " << i << " old: " << old << " majority: " << pos_to_var[maxi] << "interesting: " << pos_to_var[maxj]);
- if (strat != Strategy::MajorityOnly)
- maxi = maxj;
- }
- }
- if (old != pos_to_var[maxi]) {
- DEBUG("On position " << i << " changing " << old << " to " << pos_to_var[maxi]);
- DEBUG(charts_[i].str());
- if (maxi < Variants::Deletion) {
- ss << pos_to_var[maxi];
- return 1;
- } else if (maxi == Variants::Deletion) {
- return 1;
- } else if (maxi == Variants::Insertion) {
- string maxj = "";
- //first base before insertion;
- size_t new_maxi = var_to_pos[(int) contig_[i]];
- int new_maxx = charts_[i].votes[new_maxi];
- for (size_t k = 0; k < MAX_VARIANTS; k++) {
- if (new_maxx < charts_[i].votes[k] && (k != Variants::Insertion) && (k != Variants::Deletion)) {
- new_maxx = charts_[i].votes[k];
- new_maxi = k;
- }
- }
- ss << pos_to_var[new_maxi];
- int max_ins = 0;
- for (const auto &ic : charts_[i].insertions) {
- if (ic.second > max_ins) {
- max_ins = ic.second;
- maxj = ic.first;
- }
- }
- DEBUG("most popular insertion: " << maxj);
- ss << maxj;
- if (old == maxj[0]) {
- return (int) maxj.length() - 1;
- } else {
- return (int) maxj.length();
- }
- } else {
- //something strange happened
- WARN("While processing base " << i << " unknown decision was made");
- return 0;
- }
- } else {
- ss << old;
- return 0;
- }
-}
-
-
-bool ContigProcessor::CountPositions(const SingleSamRead &read, unordered_map<size_t, position_description> &ps) const {
-
- if (read.contig_id() < 0) {
- DEBUG("not this contig");
- return false;
- }
- //TODO: maybe change to read.is_properly_aligned() ?
- if (read.map_qual() == 0) {
- DEBUG("zero qual");
- return false;
- }
- int pos = read.pos();
- if (pos < 0) {
- WARN("Negative position " << pos << " found on read " << read.name() << ", skipping");
- return false;
- }
- size_t position = size_t(pos);
- int mate = 1; // bonus for mate mapped can be here;
- size_t l_read = (size_t) read.data_len();
- size_t l_cigar = read.cigar_len();
-
- int aligned_length = 0;
- uint32_t *cigar = read.cigar_ptr();
- //* in cigar;
- if (l_cigar == 0)
- return false;
- if (bam_cigar_opchr(cigar[0]) == '*')
- return false;
- for (size_t i = 0; i < l_cigar; i++)
- if (bam_cigar_opchr(cigar[i]) == 'M')
- aligned_length += bam_cigar_oplen(cigar[i]);
-//It's about bad aligned reads, but whether it is necessary?
- double read_len_double = (double) l_read;
- if ((aligned_length < min(read_len_double * 0.4, 40.0)) && (position > read_len_double / 2) && (contig_.length() > read_len_double / 2 + (double) position)) {
- return false;
- }
- int state_pos = 0;
- int shift = 0;
- size_t skipped = 0;
- size_t deleted = 0;
- string insertion_string = "";
- auto seq = read.seq_ptr();
- for (size_t i = 0; i < l_read; i++) {
- DEBUG(i << " " << position << " " << skipped);
- if (shift + bam_cigar_oplen(cigar[state_pos]) <= i) {
- shift += bam_cigar_oplen(cigar[state_pos]);
- state_pos += 1;
- }
- if (insertion_string != "" and bam_cigar_opchr(cigar[state_pos]) != 'I') {
- VERIFY(i + position >= skipped + 1);
- size_t ind = i + position - skipped - 1;
- if (ind >= contig_.length())
- break;
- ps[ind].insertions[insertion_string] += 1;
- insertion_string = "";
- }
- char cur_state = bam_cigar_opchr(cigar[state_pos]);
- if (cur_state == 'M') {
- VERIFY(i >= deleted);
- if (i + position < skipped) {
- WARN(i << " " << position << " " << skipped);
- INFO(read.name());
- }
- VERIFY(i + position >= skipped);
-
- size_t ind = i + position - skipped;
- size_t cur = var_to_pos[(int) bam_nt16_rev_table[bam1_seqi(seq, i - deleted)]];
- if (ind >= contig_.length())
- continue;
- ps[ind].votes[cur] = ps[ind].votes[cur] + mate;
-
- } else {
- if (cur_state == 'I' || cur_state == 'H' || cur_state == 'S' ) {
- if (cur_state == 'I') {
- if (insertion_string == "") {
- size_t ind = i + position - skipped - 1;
- if (ind >= contig_.length())
- break;
- ps[ind].votes[Variants::Insertion] += mate;
- }
- insertion_string += bam_nt16_rev_table[bam1_seqi(seq, i - deleted)];
- }
- skipped += 1;
- } else if (bam_cigar_opchr(cigar[state_pos]) == 'D') {
- if (i + position - skipped >= contig_.length())
- break;
- ps[i + position - skipped].votes[Variants::Deletion] += mate;
- deleted += 1;
- }
- }
- }
- if (insertion_string != "" and bam_cigar_opchr(cigar[state_pos]) != 'I') {
- VERIFY(l_read + position >= skipped + 1);
- size_t ind = l_read + position - skipped - 1;
- if (ind < contig_.length()) {
- ps[ind].insertions[insertion_string] += 1;
- }
- insertion_string = "";
- }
- return true;
-}
-
-
-bool ContigProcessor::CountPositions(const PairedSamRead &read, unordered_map<size_t, position_description> &ps) const {
-
- TRACE("starting pairing");
- bool t1 = CountPositions(read.Left(), ps );
- unordered_map<size_t, position_description> tmp;
- bool t2 = CountPositions(read.Right(), tmp);
- //overlaps.. multimap? Look on qual?
- if (ps.size() == 0 || tmp.size() == 0) {
- //We do not need paired reads which are not really paired
- ps.clear();
- return false;
- }
- TRACE("counted, uniting maps of " << tmp.size() << " and " << ps.size());
- ps.insert(tmp.begin(), tmp.end());
- TRACE("united");
- return (t1 && t2);
-}
-
-size_t ContigProcessor::ProcessMultipleSamFiles() {
- error_counts_.resize(kMaxErrorNum);
- for (const auto &sf : sam_files_) {
- MappedSamStream sm(sf.first);
- while (!sm.eof()) {
- SingleSamRead tmp;
- sm >> tmp;
-
- UpdateOneRead(tmp, sm);
- }
- sm.close();
- }
-
- ipp_.FillInterestingPositions(charts_);
- for (const auto &sf : sam_files_) {
- MappedSamStream sm(sf.first);
- while (!sm.eof()) {
- unordered_map<size_t, position_description> ps;
- if (sf.second == io::LibraryType::PairedEnd ) {
- PairedSamRead tmp;
- sm >> tmp;
- CountPositions(tmp, ps);
- } else {
- SingleSamRead tmp;
- sm >> tmp;
- CountPositions(tmp, ps);
- }
- ipp_.UpdateInterestingRead(ps);
- }
- sm.close();
- }
- ipp_.UpdateInterestingPositions();
- unordered_map<size_t, position_description> interesting_positions = ipp_.get_weights();
- stringstream s_new_contig;
- size_t total_changes = 0;
- for (size_t i = 0; i < contig_.length(); i++) {
- total_changes += UpdateOneBase(i, s_new_contig, interesting_positions);
- }
- vector<string> contig_name_splitted;
- boost::split(contig_name_splitted, contig_name_, boost::is_any_of("_"));
- if (contig_name_splitted.size() >= 8) {
- io::osequencestream_with_manual_node_id oss(output_contig_file_);
- oss.setNodeID(std::stoi(contig_name_splitted[1]));
- oss.setCoverage(std::stod(contig_name_splitted[5]));
- oss.setID(std::stoi(contig_name_splitted[7]));
- oss << s_new_contig.str();
- } else {
- io::osequencestream oss(output_contig_file_);
- oss << io::SingleRead(contig_name_, s_new_contig.str());
- }
- return total_changes;
-}
-
-}
-;
diff --git a/src/corrector/contig_processor.hpp b/src/corrector/contig_processor.hpp
deleted file mode 100644
index 1011323..0000000
--- a/src/corrector/contig_processor.hpp
+++ /dev/null
@@ -1,65 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-/*
- * contig_processor.hpp
- *
- * Created on: Jun 27, 2014
- * Author: lab42
- */
-
-#pragma once
-#include "interesting_pos_processor.hpp"
-#include "positional_read.hpp"
-#include "openmp_wrapper.h"
-
-#include <io/sam/sam_reader.hpp>
-#include <io/sam/read.hpp>
-#include <io/library.hpp>
-
-#include <string>
-#include <vector>
-#include <unordered_map>
-
-namespace corrector {
-
-using namespace sam_reader;
-
-typedef std::vector<std::pair<std::string, io::LibraryType> > sam_files_type;
-class ContigProcessor {
- sam_files_type sam_files_;
- std::string contig_file_;
- std::string contig_name_;
- std::string output_contig_file_;
- std::string contig_;
- std::vector<position_description> charts_;
- InterestingPositionProcessor ipp_;
- std::vector<int> error_counts_;
-
- const size_t kMaxErrorNum = 20;
-
-public:
- ContigProcessor(const sam_files_type &sam_files, const std::string &contig_file)
- : sam_files_(sam_files), contig_file_(contig_file) {
- ReadContig();
- ipp_.set_contig(contig_);
- }
- size_t ProcessMultipleSamFiles();
-private:
- void ReadContig();
-//Moved from read.hpp
- bool CountPositions(const SingleSamRead &read, std::unordered_map<size_t, position_description> &ps) const;
- bool CountPositions(const PairedSamRead &read, std::unordered_map<size_t, position_description> &ps) const;
-
- void UpdateOneRead(const SingleSamRead &tmp, MappedSamStream &sm);
- //returns: number of changed nucleotides;
-
- size_t UpdateOneBase(size_t i, std::stringstream &ss, const std::unordered_map<size_t, position_description> &interesting_positions) const ;
-
-};
-}
-;
diff --git a/src/corrector/dataset_processor.cpp b/src/corrector/dataset_processor.cpp
deleted file mode 100644
index 8cb2d38..0000000
--- a/src/corrector/dataset_processor.cpp
+++ /dev/null
@@ -1,278 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#include "dataset_processor.hpp"
-#include "variants_table.hpp"
-#include "contig_processor.hpp"
-#include "config_struct.hpp"
-
-#include "io/file_reader.hpp"
-#include "path_helper.hpp"
-#include "io/osequencestream.hpp"
-#include "openmp_wrapper.h"
-
-#include <boost/algorithm/string.hpp>
-
-#include <iostream>
-#include <unistd.h>
-
-using namespace std;
-
-namespace corrector {
-std::string DatasetProcessor::GetLibDir(const size_t lib_count) {
- if (lib_dirs_.find(lib_count) != lib_dirs_.end())
- return lib_dirs_[lib_count];
- std::string res = path::make_temp_dir(corr_cfg::get().work_dir, "lib" + to_string(lib_count));
- lib_dirs_[lib_count] = res;
- return res;
-}
-
-void DatasetProcessor::SplitGenome(const string &genome_splitted_dir) {
- io::FileReadStream frs(genome_file_);
- size_t cur_id = 0;
- while (!frs.eof()) {
- io::SingleRead cur_read;
- frs >> cur_read;
- string contig_name = cur_read.name();
- string contig_seq = cur_read.GetSequenceString();
- if (all_contigs_.find(contig_name) != all_contigs_.end()) {
- WARN("Duplicated contig names! Multiple contigs with name" << contig_name);
- }
- string full_path = path::append_path(genome_splitted_dir, contig_name + ".fasta");
- string out_full_path = path::append_path(genome_splitted_dir, contig_name + ".ref.fasta");
- string sam_filename = path::append_path(genome_splitted_dir, contig_name + ".pair.sam");
- all_contigs_[contig_name] = {full_path, out_full_path, contig_seq.length(), sam_files_type(), sam_filename, cur_id};
- cur_id ++;
- buffered_reads_[contig_name].clear();
- io::osequencestream oss(full_path);
- oss << io::SingleRead(contig_name, contig_seq);
- DEBUG("full_path " + full_path)
- }
-}
-
-//contigs - set of aligned contig names
-void DatasetProcessor::GetAlignedContigs(const string &read, set<string> &contigs) const {
- vector<string> arr;
- boost::split(arr, read, boost::is_any_of("\t"));
- if (arr.size() > 5) {
- if (arr[2] != "*" && stoi(arr[4]) > 0) {
-// here can be multuple aligned parsing if neeeded;
- contigs.insert(arr[2]);
- }
- }
-
-}
-
-void DatasetProcessor::SplitSingleLibrary(const string &all_reads_filename, const size_t lib_count) {
- ifstream fs(all_reads_filename);
- while (!fs.eof()) {
- set<string> contigs;
- string r1;
- getline(fs, r1);
- if (r1[0] == '@')
- continue;
- GetAlignedContigs(r1, contigs);
- for (auto &contig : contigs) {
- VERIFY_MSG(all_contigs_.find(contig) != all_contigs_.end(), "wrong contig name in SAM file header: " + contig);
- BufferedOutputRead(r1, contig, lib_count);
- }
- }
- FlushAll(lib_count);
-}
-
-void DatasetProcessor::FlushAll(const size_t lib_count) {
- for (const auto &ac : all_contigs_) {
- if (buffered_reads_[ac.first].size() > 0) {
- ofstream stream(ac.second.sam_filenames[lib_count].first.c_str(), std::ios_base::app | std::ios_base::out);
- for (const string &read : buffered_reads_[ac.first]) {
- stream << read;
- stream << '\n';
- }
- buffered_reads_[ac.first].clear();
- }
- }
-}
-
-void DatasetProcessor::BufferedOutputRead(const string &read, const string &contig_name, const size_t lib_count) {
- buffered_reads_[contig_name].push_back(read);
- buffered_count_++;
- if (buffered_count_ % kBuffSize == 0) {
- if (buffered_count_ % (10 * kBuffSize) == 0)
- INFO("processed " << buffered_count_ << "reads, flushing");
- FlushAll(lib_count);
- }
-}
-
-void DatasetProcessor::SplitPairedLibrary(const string &all_reads_filename, const size_t lib_count) {
- ifstream fs(all_reads_filename);
- while (!fs.eof()) {
- set<string> contigs;
- string r1;
- string r2;
- getline(fs, r1);
- if (r1[0] == '@')
- continue;
- getline(fs, r2);
- GetAlignedContigs(r1, contigs);
- GetAlignedContigs(r2, contigs);
- for (const auto &contig : contigs) {
- VERIFY_MSG(all_contigs_.find(contig) != all_contigs_.end(), "wrong contig name in SAM file header: " + contig);
- if (all_contigs_.find(contig) != all_contigs_.end()) {
- BufferedOutputRead(r1, contig, lib_count);
- BufferedOutputRead(r2, contig, lib_count);
- }
- }
- }
- FlushAll(lib_count);
-}
-
-string DatasetProcessor::RunPairedBwa(const string &left, const string &right, const size_t lib) {
- string cur_dir = GetLibDir(lib);
- int run_res = 0;
- string tmp1_sai_filename = path::append_path(cur_dir, "tmp1.sai");
- string tmp2_sai_filename = path::append_path(cur_dir, "tmp2.sai");
- string tmp_sam_filename = path::append_path(cur_dir, "tmp.sam");
- string isize_txt_filename = path::append_path(cur_dir, "isize.txt");
- string tmp_file = path::append_path(cur_dir, "bwa.flood");
-
- string index_line = corr_cfg::get().bwa + string(" index ") + "-a " + "is " + genome_file_ ;
- INFO("Running bwa index ...: " << index_line);
- run_res = system(index_line.c_str());
- if (run_res != 0) {
- INFO("bwa failed, skipping sublib");
- return "";
- }
- string nthreads_str = to_string(nthreads_);
- string last_line = corr_cfg::get().bwa + string(" mem ") + " -v 1 -t " + nthreads_str + " "+ genome_file_ + " " + left + " " + right + " > "
- + tmp_sam_filename ;
- INFO("Running bwa mem ...:" << last_line);
- run_res = system(last_line.c_str());
- if (run_res != 0) {
- INFO("bwa failed, skipping sublib");
- return "";
- }
- return tmp_sam_filename;
-}
-
-string DatasetProcessor::RunSingleBwa(const string &single, const size_t lib) {
- int run_res = 0;
- string cur_dir = GetLibDir(lib);
- string tmp_sai_filename = path::append_path(cur_dir, "tmp1.sai");
- string tmp_sam_filename = path::append_path(cur_dir, "tmp.sam");
- string isize_txt_filename = path::append_path(cur_dir, "isize.txt");
- string tmp_file = path::append_path(cur_dir, "bwa.flood");
-
- string index_line = corr_cfg::get().bwa + string(" index ") + "-a " + "is " + genome_file_ ;
- INFO("Running bwa index ...: " << index_line);
- run_res = system(index_line.c_str());
- if (run_res != 0) {
- INFO("bwa failed, skipping sublib");
- return "";
- }
- string nthreads_str = to_string(nthreads_);
- string last_line = corr_cfg::get().bwa + " mem "+ " -v 1 -t " + nthreads_str + " " + genome_file_ + " " + single + " > " + tmp_sam_filename;
- INFO("Running bwa mem ...:" << last_line);
- run_res = system(last_line.c_str());
- if (run_res != 0) {
- INFO("bwa failed, skipping sublib");
- return "";
- }
- return tmp_sam_filename;
-}
-
-void DatasetProcessor::PrepareContigDirs(const size_t lib_count) {
- string out_dir = GetLibDir(lib_count);
- for (auto &ac : all_contigs_) {
- auto contig_name = ac.first;
- string out_name = path::append_path(out_dir, contig_name + ".sam");
- ac.second.sam_filenames.push_back(make_pair(out_name, unsplitted_sam_files_[lib_count].second));
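- // Start each per-contig SAM file with a minimal header: an @SQ line carrying the contig name and length.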
- BufferedOutputRead("@SQ\tSN:" + contig_name + "\tLN:" + to_string(all_contigs_[contig_name].contig_length), contig_name, lib_count);
- }
- FlushAll(lib_count);
-}
-
-void DatasetProcessor::ProcessDataset() {
- size_t lib_num = 0;
- INFO("Splitting assembly...");
- INFO("Assembly file: " + genome_file_);
- SplitGenome(work_dir_);
- for (size_t i = 0; i < corr_cfg::get().dataset.lib_count(); ++i) {
- const auto& dataset = corr_cfg::get().dataset[i];
- auto lib_type = dataset.type();
- if (lib_type == io::LibraryType::PairedEnd || lib_type == io::LibraryType::HQMatePairs || lib_type == io::LibraryType::SingleReads) {
- for (auto iter = dataset.paired_begin(); iter != dataset.paired_end(); iter++) {
- INFO("Processing paired sublib of number " << lib_num);
- string left = iter->first;
- string right = iter->second;
- INFO(left + " " + right);
- string samf = RunPairedBwa(left, right, lib_num);
- if (samf != "") {
- INFO("Adding samfile " << samf);
- unsplitted_sam_files_.push_back(make_pair(samf, lib_type));
- PrepareContigDirs(lib_num);
- SplitPairedLibrary(samf, lib_num);
- lib_num++;
- } else {
- FATAL_ERROR("Failed to align paired reads " << left << " and " << right);
- }
- }
- for (auto iter = dataset.single_begin(); iter != dataset.single_end(); iter++) {
- INFO("Processing single sublib of number " << lib_num);
- string left = *iter;
- INFO(left);
- string samf = RunSingleBwa(left, lib_num);
- if (samf != "") {
- INFO("Adding samfile " << samf);
- unsplitted_sam_files_.push_back(make_pair(samf, io::LibraryType::SingleReads));
- PrepareContigDirs(lib_num);
- SplitSingleLibrary(samf, lib_num);
- lib_num++;
- } else {
- FATAL_ERROR("Failed to align single reads " << left);
- }
- }
- }
- }
- INFO("Processing contigs");
- vector<pair<size_t, string> > ordered_contigs;
- for (const auto &ac : all_contigs_) {
- ordered_contigs.push_back(make_pair(ac.second.contig_length, ac.first));
- }
- size_t cont_num = ordered_contigs.size();
- sort(ordered_contigs.begin(), ordered_contigs.end(), std::greater<pair<size_t, string> >());
- auto all_contigs_ptr = &all_contigs_;
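- // Contigs were sorted by decreasing length above; with schedule(dynamic,1) this helps balance load across threads.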
-# pragma omp parallel for shared(all_contigs_ptr, ordered_contigs) num_threads(nthreads_) schedule(dynamic,1)
- for (size_t i = 0; i < cont_num; i++) {
- bool long_enough = (*all_contigs_ptr)[ordered_contigs[i].second].contig_length > kMinContigLengthForInfo;
- ContigProcessor pc((*all_contigs_ptr)[ordered_contigs[i].second].sam_filenames, (*all_contigs_ptr)[ordered_contigs[i].second].input_contig_filename);
- size_t changes = pc.ProcessMultipleSamFiles();
- if (long_enough) {
-#pragma omp critical
- {
- INFO("Contig " << ordered_contigs[i].second << " processed with " << changes << " changes in thread " << omp_get_thread_num());
- }
- }
- }
- INFO("Gluing processed contigs");
- GlueSplittedContigs(output_contig_file_);
-}
-
-void DatasetProcessor::GlueSplittedContigs(string &out_contigs_filename) {
- ofstream of_c(out_contigs_filename, std::ios_base::binary);
- vector<string> ordered_names;
- ordered_names.resize(all_contigs_.size());
- for (const auto &ac : all_contigs_) {
- ordered_names[ac.second.id] = ac.first;
- }
- for (size_t i = 0; i < ordered_names.size(); i++) {
- ifstream a_f(all_contigs_[ordered_names[i]].output_contig_filename, std::ios_base::binary);
- of_c << a_f.rdbuf();
- }
-}
-
-}
-;
diff --git a/src/corrector/dataset_processor.hpp b/src/corrector/dataset_processor.hpp
deleted file mode 100644
index 1012c8f..0000000
--- a/src/corrector/dataset_processor.hpp
+++ /dev/null
@@ -1,71 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include "path_helper.hpp"
-
-#include "io/file_reader.hpp"
-#include "path_helper.hpp"
-
-#include <io/library.hpp>
-
-#include <string>
-#include <set>
-#include <vector>
-#include <unordered_map>
-
-namespace corrector {
-
-typedef std::vector<std::pair<std::string, io::LibraryType> > sam_files_type;
-
-struct OneContigDescription {
- std::string input_contig_filename;
- std::string output_contig_filename;
- size_t contig_length;
- sam_files_type sam_filenames;
- std::string sam_filename;
- size_t id;
-};
-typedef std::unordered_map<std::string, OneContigDescription> ContigInfoMap;
-
-class DatasetProcessor {
-
- const std::string &genome_file_;
- std::string output_contig_file_;
- ContigInfoMap all_contigs_;
- sam_files_type unsplitted_sam_files_;
- const std::string &work_dir_;
- std::unordered_map<std::string, std::vector<std::string> > buffered_reads_;
- size_t nthreads_;
- size_t buffered_count_;
- std::unordered_map<size_t, std::string> lib_dirs_;
- const size_t kBuffSize = 100000;
- const size_t kMinContigLengthForInfo = 20000;
-public:
- DatasetProcessor(const std::string &genome_file, const std::string &work_dir, const std::string &output_dir, const size_t &thread_num)
- : genome_file_(genome_file), work_dir_(work_dir), nthreads_(thread_num) {
- output_contig_file_ = path::append_path(output_dir, "corrected_contigs.fasta");
- buffered_count_ = 0;
- }
-
- void ProcessDataset();
-private:
- void SplitGenome(const std::string &genome_splitted_dir);
- void FlushAll(const size_t lib_count);
- void BufferedOutputRead(const std::string &read, const std::string &contig_name, const size_t lib_count);
- void GetAlignedContigs(const std::string &read, std::set<std::string> &contigs) const;
- void SplitSingleLibrary(const std::string &out_contigs_filename, const size_t lib_count);
- void SplitPairedLibrary(const std::string &all_reads, const size_t lib_count);
- void GlueSplittedContigs(std::string &out_contigs_filename);
- std::string RunPairedBwa(const std::string &left, const std::string &right, const size_t lib);
- std::string RunSingleBwa(const std::string &single, const size_t lib);
- void PrepareContigDirs(const size_t lib_count);
- std::string GetLibDir(const size_t lib_count);
-};
-}
-;
diff --git a/src/corrector/interesting_pos_processor.cpp b/src/corrector/interesting_pos_processor.cpp
deleted file mode 100644
index 222ef79..0000000
--- a/src/corrector/interesting_pos_processor.cpp
+++ /dev/null
@@ -1,125 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#include "interesting_pos_processor.hpp"
-#include "config_struct.hpp"
-
-using namespace std;
-
-namespace corrector {
-bool InterestingPositionProcessor::FillInterestingPositions(const vector<position_description> &charts) {
- bool any_interesting = false;
- for (size_t i = 0; i < contig_.length(); i++) {
- int sum_total = 0;
- for (size_t j = 0; j < MAX_VARIANTS; j++) {
- if (j != Variants::Insertion && j != Variants::Deletion) {
- sum_total += charts[i].votes[j];
- }
- }
- int variants = 0;
- for (size_t j = 0; j < MAX_VARIANTS; j++) {
- //TODO::For IT reconsider this condition
- if (j != Variants::Insertion && j != Variants::Deletion && (charts[i].votes[j] > 0.1 * sum_total) && (charts[i].votes[j] < 0.9 * sum_total) && (sum_total > 20)) {
- variants++;
- }
- }
- if (variants > 1 || contig_[i] == Variants::Undefined) {
- DEBUG("Adding interesting position: " << i << " " << charts[i].str());
- any_interesting = true;
- is_interesting_[i] = true;
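- // Also mark nearby "anchor" positions (multiples of kAnchorGap within kAnchorNum anchors of i) as interesting.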
- for (int j = -kAnchorNum; j <= kAnchorNum; j++) {
- int additional = (int) (i / kAnchorGap + j) * kAnchorGap;
- if (additional >= 0 && additional < (int) contig_.length())
- is_interesting_[additional] = true;
- }
- }
- }
-
- return any_interesting;
-}
-
-void InterestingPositionProcessor::UpdateInterestingRead(const PositionDescriptionMap &ps) {
- vector<size_t> interesting_in_read;
- for (const auto &pos : ps) {
- if (is_interesting(pos.first)) {
- interesting_in_read.push_back(pos.first);
- }
- }
- if (interesting_in_read.size() >= 2) {
- WeightedPositionalRead wr(interesting_in_read, ps, contig_);
- size_t cur_id = wr_storage_.size();
- wr_storage_.push_back(wr);
- for (size_t i = 0; i < interesting_in_read.size(); i++) {
- TRACE(interesting_in_read[i] << " " << contig_.length());
- read_ids_[interesting_in_read[i]].push_back(cur_id);
- }
- }
-}
-
-void InterestingPositionProcessor::set_contig(const string &ctg) {
- contig_ = ctg;
- size_t len = contig_.length();
- is_interesting_.resize(len);
- read_ids_.resize(len);
-}
-
-void InterestingPositionProcessor::UpdateInterestingPositions() {
- auto strat = corr_cfg::get().strat;
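- // Two passes over the contig, forward (dir == 1) and then backward (dir == -1); per-read counters are reset after each pass.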
- for (int dir = 1; dir >= -1; dir -= 2) {
- int start_pos = (dir == 1) ? 0 : (int) contig_.length() - 1;
- int current_pos = start_pos;
- for (; current_pos >= 0 && current_pos < (int) contig_.length(); current_pos += dir) {
- if (is_interesting_[current_pos]) {
- DEBUG("reads on position: " << read_ids_[current_pos].size());
- for (size_t i = 0; i < read_ids_[current_pos].size(); i++) {
- size_t current_read_id = read_ids_[current_pos][i];
- size_t current_variant = wr_storage_[current_read_id].positions[current_pos];
- {
- int coef = 1;
- if (strat == Strategy::AllReads)
- coef = 1;
- else if (strat == Strategy::MappedSquared)
- coef = wr_storage_[current_read_id].processed_positions * wr_storage_[current_read_id].processed_positions;
- else if (strat == Strategy::AllExceptJustStarted)
- coef = wr_storage_[current_read_id].is_first(current_pos, dir);
- interesting_weights[current_pos].votes[current_variant] += get_error_weight(
- wr_storage_[current_read_id].error_num ) * coef;
- }
- }
- size_t maxi = interesting_weights[current_pos].FoundOptimal(contig_[current_pos]);
- for (size_t i = 0; i < read_ids_[current_pos].size(); i++) {
- size_t current_read_id = read_ids_[current_pos][i];
- size_t current_variant = wr_storage_[current_read_id].positions[current_pos];
- if (current_variant != maxi) {
- wr_storage_[current_read_id].error_num++;
- } else {
- wr_storage_[current_read_id].processed_positions++;
- }
-
- }
-
- if ((char) toupper(contig_[current_pos]) != pos_to_var[maxi]) {
- DEBUG("Interesting positions differ at position " << current_pos);
- DEBUG("Was " << (char) toupper(contig_[current_pos]) << "new " << pos_to_var[maxi]);
- DEBUG("weights" << interesting_weights[current_pos].str());
- changed_weights_[current_pos] = interesting_weights[current_pos];
- }
- //for backward pass
- interesting_weights[current_pos].clear();
- }
- }
- if (dir == 1)
- DEBUG("reversing the order...");
- for (size_t i = 0; i < wr_storage_.size(); i++) {
- wr_storage_[i].error_num = 0;
- wr_storage_[i].processed_positions = 0;
- }
- }
-}
-}
-;
diff --git a/src/corrector/main.cpp b/src/corrector/main.cpp
deleted file mode 100644
index 52d345e..0000000
--- a/src/corrector/main.cpp
+++ /dev/null
@@ -1,62 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#include "dataset_processor.hpp"
-#include "config_struct.hpp"
-
-#include "logger/log_writers.hpp"
-#include "segfault_handler.hpp"
-
-#include "version.hpp"
-
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <string>
-
-using namespace std;
-void create_console_logger() {
- using namespace logging;
-
- logger *lg = create_logger("");
- lg->add_writer(std::make_shared<console_writer>());
- attach_logger(lg);
-}
-
-int main(int argc, char** argv) {
- perf_counter pc;
-
- srand(42);
- srandom(42);
-
- create_console_logger();
-
- if (argc != 3) {
- WARN("Wrong argument number");
- return 1;
- }
- string contig_name(argv[2]);
- string cfg_file(argv[1]);
- corr_cfg::create_instance(cfg_file);
- string work_dir = corr_cfg::get().work_dir;
- if (!path::check_existence(corr_cfg::get().output_dir))
- path::make_dir(corr_cfg::get().output_dir);
- if (!path::check_existence(corr_cfg::get().work_dir))
- path::make_dir(corr_cfg::get().work_dir);
-
- INFO("Starting MismatchCorrector, built from " SPADES_GIT_REFSPEC ", git revision " SPADES_GIT_SHA1);
-
- corrector::DatasetProcessor dp(contig_name, corr_cfg::get().work_dir, corr_cfg::get().output_dir, corr_cfg::get().max_nthreads);
- dp.ProcessDataset();
- unsigned ms = (unsigned) pc.time_ms();
- unsigned secs = (ms / 1000) % 60;
- unsigned mins = (ms / 1000 / 60) % 60;
- unsigned hours = (ms / 1000 / 60 / 60);
-
- INFO("Correcting time: " << hours << " hours " << mins << " minutes " << secs << " seconds");
-
- return 0;
-}
diff --git a/src/debruijn/CMakeLists.txt b/src/debruijn/CMakeLists.txt
deleted file mode 100644
index 2bd2da0..0000000
--- a/src/debruijn/CMakeLists.txt
+++ /dev/null
@@ -1,53 +0,0 @@
-############################################################################
-# Copyright (c) 2015 Saint Petersburg State University
-# Copyright (c) 2011-2014 Saint Petersburg Academic University
-# All Rights Reserved
-# See file LICENSE for details.
-############################################################################
-
-project(spades CXX)
-
-add_library(debruijn STATIC
- kmer_coverage_model.cpp
- config_struct.cpp
- path_extend/pe_config_struct.cpp
- path_extend/bidirectional_path.cpp
- path_extend/scaffolder2015/scaff_supplementary.cpp
- path_extend/scaffolder2015/extension_chooser2015.cpp
- path_extend/scaffolder2015/scaffold_graph.cpp
- path_extend/scaffolder2015/scaffold_graph_constructor.cpp
- path_extend/scaffolder2015/scaffold_graph_visualizer.cpp
- path_extend/scaffolder2015/connection_condition2015.cpp
- genome_consistance_checker.cpp
- stage.cpp
- construction.cpp
- gap_closer.cpp
- simplification.cpp
- mismatch_correction.cpp
- pair_info_count.cpp
- second_phase_setup.cpp
- distance_estimation.cpp
- repeat_resolving.cpp
- genomic_info_filler.cpp
- pacbio_aligning.cpp
- bwa_pair_info_filler.cpp
- genome_storage.cpp)
-
-target_include_directories(debruijn PRIVATE ${EXT_DIR}/include/ConsensusCore)
-target_link_libraries(debruijn ConsensusCore input cityhash nlopt BamTools ssw yaml-cpp ${COMMON_LIBRARIES})
-
-add_executable(spades
- main.cpp)
-target_link_libraries(spades debruijn)
-
-
-if (SPADES_STATIC_BUILD)
- set_target_properties(spades PROPERTIES LINK_SEARCH_END_STATIC 1)
-endif()
-
-install(TARGETS spades
- DESTINATION bin
- COMPONENT runtime)
-install(DIRECTORY "${SPADES_CFG_DIR}/debruijn"
- DESTINATION share/spades/configs
- FILES_MATCHING PATTERN "*.info.template")
diff --git a/src/debruijn/bwa_pair_info_filler.cpp b/src/debruijn/bwa_pair_info_filler.cpp
deleted file mode 100644
index 36511fc..0000000
--- a/src/debruijn/bwa_pair_info_filler.cpp
+++ /dev/null
@@ -1,407 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#include "bwa_pair_info_filler.hpp"
-
-
-namespace bwa_pair_info {
-
-
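- // Extract soft/hard clip lengths from both ends of the CIGAR string; the first non-clip operation marks the end of the left-side clips.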
-void MapperReadT::ParseCigar(const string& cigar) {
- string num = "";
- bool left_side = true;
- for (size_t i = 0; i < cigar.length(); ++i) {
- if (isdigit(cigar[i])) {
- num += cigar[i];
- }
- else {
- if (cigar[i] == 'H') {
- if (left_side)
- left_hard_clip_ = (uint16_t) std::stoi(num);
- else
- right_hard_clip_ = (uint16_t) std::stoi(num);
- num = "";
- }
- else if (cigar[i] == 'S') {
- if (left_side)
- left_soft_clip_ = (uint16_t) std::stoi(num);
- else
- right_soft_clip_ = (uint16_t) std::stoi(num);
- num = "";
- }
- else {
- left_side = false;
- num = "";
- }
- }
- }
-}
-
-//Correct read alignment according to orientation and clippings
-void BWACorrectingProcessor::ProcessPairedRead(const MapperReadT& l, const MapperReadT& r) {
- using io::LibraryOrientation;
-
- if (!l.IsValid() || !r.IsValid()) {
- return;
- }
- ++count_;
-
- MappedPositionT left_pos(edge_id_map_.at(stoi(l.get_contig_id())), l.pos());
- MappedPositionT right_pos(edge_id_map_.at(stoi(r.get_contig_id())), r.pos());
-
- //This function is overridden in BWAISCounter and BWAIndexFiller
- if (!CheckAlignments(left_pos, right_pos)) {
- return;
- }
-
- int r_from_pos_to_right_end = r.len() + r.right_hard_clip() - r.left_soft_clip();
- int l_from_pos_to_left_end = l.left_soft_clip() + l.left_hard_clip();
-
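- // If a read's strand disagrees with the expected library orientation, remap its position onto the conjugate edge and mirror the clip offsets.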
- if ((!l.is_forward() && (lib_.orientation() == LibraryOrientation::FF || lib_.orientation() == LibraryOrientation::FR)) ||
- (l.is_forward() && (lib_.orientation() == LibraryOrientation::RF || lib_.orientation() == LibraryOrientation::RR))) {
- left_pos.e = g_.conjugate(left_pos.e);
- left_pos.pos = (int) g_.length(left_pos.e) - left_pos.pos - (l.len() - l.left_soft_clip() - l.right_soft_clip()) + (int) g_.k();
- l_from_pos_to_left_end = l.right_soft_clip() + l.right_hard_clip();
- }
- if ((!r.is_forward() && (lib_.orientation() == LibraryOrientation::FF || lib_.orientation() == LibraryOrientation::RF)) ||
- (r.is_forward() && (lib_.orientation() == LibraryOrientation::FR || lib_.orientation() == LibraryOrientation::RR))) {
- right_pos.e = g_.conjugate(right_pos.e);
- right_pos.pos = (int) g_.length(right_pos.e) - right_pos.pos - (r.len() - r.left_soft_clip() - r.right_soft_clip()) + (int) g_.k();
- r_from_pos_to_right_end = r.len() + r.left_hard_clip() - r.right_soft_clip();
- }
-
- right_pos.pos = right_pos.pos + r_from_pos_to_right_end;
- left_pos.pos = left_pos.pos - l_from_pos_to_left_end;
-
- //This function is overridden in BWAISCounter and BWAIndexFiller
- ProcessAlignments(left_pos, right_pos);
-}
-
-// ==== insert size counter overloads ====
-bool BWAISCounter::CheckAlignments(const MappedPositionT& l, const MappedPositionT& r) {
- return l.e == r.e && g_.length(l.e) >= min_contig_len_;
-}
-
-void BWAISCounter::ProcessAlignments(const MappedPositionT& l, const MappedPositionT& r) {
- ++mapped_count_;
-
- int is = r.pos - l.pos;
- if (is > 0 || !ignore_negative_) {
- hist_[is] += 1;
- } else {
- ++negative_count_;
- }
-}
-
-bool BWAISCounter::RefineInsertSize(SequencingLibraryT& reads) const {
- using namespace omnigraph;
- size_t correctly_mapped = mapped_count_ - negative_count_;
- INFO(correctly_mapped << " paired reads (" << ((double) correctly_mapped * 100.0 / (double) count_) << "% of all) aligned to long edges");
-
- if (negative_count_ > 3 * correctly_mapped)
- WARN("Too much reads aligned with negative insert size. Is the library orientation set properly?");
- if (mapped_count_ == 0)
- return false;
-
- std::map<size_t, size_t> percentiles;
- find_mean(hist_, reads.data().mean_insert_size, reads.data().insert_size_deviation, percentiles);
- find_median(hist_, reads.data().median_insert_size, reads.data().insert_size_mad, reads.data().insert_size_distribution);
- if (reads.data().median_insert_size < reads.data().read_length) {
- return false;
- }
-
- std::tie(reads.data().insert_size_left_quantile, reads.data().insert_size_right_quantile) =
- GetISInterval(0.8, reads.data().insert_size_distribution);
-
- return !reads.data().insert_size_distribution.empty();
-}
-
-// ==== pair info index filler overloads ====
-EdgePair BWAIndexFiller::ConjugatePair(EdgePair ep) const {
- return make_pair(g_.conjugate(ep.second), g_.conjugate(ep.first));
-}
-
-void BWAIndexFiller::ProcessAlignments(const MappedPositionT& l, const MappedPositionT& r) {
- EdgePair ep{l.e, r.e};
- TRACE("Lpos " << l.pos << ", Rpos " << r.pos);
- int edge_distance = (int) lib_.data().mean_insert_size - r.pos + l.pos;
- TRACE("Distance " << edge_distance);
-
- paired_index_.Add(ep.first, ep.second, { (double) edge_distance, 1.0 });
-}
-
-bool BWAIndexFiller::CheckAlignments(const MappedPositionT& l, const MappedPositionT& r) {
- return g_.length(l.e) >= min_contig_len_ && g_.length(r.e) >= min_contig_len_;
-}
-
-
-//Main class implementation
-void BWAPairInfoFiller::OutputEdges(const string &filename) const {
- io::osequencestream_simple oss(filename);
- for (auto it = g_.ConstEdgeBegin(true); !it.IsEnd(); ++it) {
- debruijn_graph::EdgeId e = *it;
- oss.set_header(ToString(g_.int_id(e)));
- oss << g_.EdgeNucls(e);
- }
-}
-void BWAPairInfoFiller::FillEdgeIdMap() {
- for (auto it = g_.ConstEdgeBegin(true); !it.IsEnd(); ++it) {
- debruijn_graph::EdgeId e = *it;
- edge_id_map_.insert(make_pair(g_.int_id(e), e));
- }
-}
-
-bool BWAPairInfoFiller::CreateIndex(const string& contigs) {
- int run_res = 0;
- string err_log = path::append_path(work_dir_, "index.err");
- string index_line = bwa_path_ + string(" index ") + "-a is " + contigs + " 2>" + err_log;
- INFO("Running bwa index ... ");
- INFO("Command line: " << index_line);
- run_res = system(index_line.c_str());
- if (run_res != 0) {
- ERROR("bwa index failed, cannot align reads");
- return false;
- }
- return true;
-}
-
-
-bool BWAPairInfoFiller::RunBWA(const string& reads_file, const string& out_sam_file) const {
- string run_command = bwa_path_ + " mem -t " + ToString(nthreads_) + " " + index_base_ + " " + reads_file + " > " + out_sam_file + " 2>"
- + out_sam_file + ".txt";
- INFO("Running bwa mem ...");
- INFO("Command line: " << run_command);
-
- int run_res = system(run_command.c_str());
- if (run_res != 0) {
- ERROR("bwa mem failed, cannot align reads");
- return false;
- }
- return true;
-}
-
-bool BWAPairInfoFiller::AlignLib(const SequencingLibraryT& lib,
- const string& sam_file_base,
- vector<pair<string, string>>& resulting_sam_files) {
-
- VERIFY_MSG(Init(), "BWA index was not constructed properly");
- resulting_sam_files.clear();
- size_t file_index = 0;
- bool any_aligned = false;
-
- for (auto iter = lib.paired_begin(); iter != lib.paired_end(); iter++) {
- string left_reads = iter->first;
- string left_sam = sam_file_base + "_1_" + ToString(file_index) + ".sam";
- bool res = RunBWA(left_reads, left_sam);
- if (!res) {
- WARN("Failed to align left reads " << left_reads);
- continue;
- }
- string right_reads = iter->second;
- string right_sam = sam_file_base + "_2_" + ToString(file_index) + ".sam";
- res = RunBWA(right_reads, right_sam);
- if (!res) {
- WARN("Failed to align right reads " << right_reads);
- continue;
- }
-
- resulting_sam_files.push_back(make_pair(left_sam, right_sam));
- any_aligned = true;
- }
- return any_aligned;
-}
-
-
-void BWAPairInfoFiller::ProcessSAMFiles(const string &left_sam, const string &right_sam,
- BWAPairedReadProcessor& processor) {
-
- //Left and right reads are stored in maps until a pair is detected
- unordered_map<string, MapperReadT> left_reads;
- unordered_map<string, MapperReadT> right_reads;
- size_t counter = 0;
- //Check for duplicated read IDs
- bool left_duplicated = false;
- bool right_duplicated = false;
-
- INFO("Reading SAM files " << left_sam << " and " << right_sam);
- MappedSamStream lf(left_sam);
- MappedSamStream rf(right_sam);
- while (!lf.eof() || !rf.eof()) {
- SingleSamRead left_read;
- MapperReadT left_data;
- string l_name = "";
-
- SingleSamRead right_read;
- MapperReadT right_data;
- string r_name = "";
-
- if (!lf.eof()) {
- lf >> left_read;
- l_name = left_read.name();
- if (left_read.is_properly_aligned()) {
- TRACE("Left read " << l_name);
- left_data = MapperReadT(string(lf.get_contig_name(left_read.contig_id())),
- left_read.pos(),
- left_read.data_len(),
- left_read.strand(),
- left_read.cigar());
- }
- else if (!left_read.is_main_alignment()) {
- //If not primary alignment ignore mapping
- TRACE("Ignoring left read");
- l_name = "";
- }
- }
- if (!rf.eof()) {
- rf >> right_read;
- r_name = right_read.name();
- if (right_read.is_properly_aligned()) {
- TRACE("Right read " << r_name);
- right_data = MapperReadT(string(rf.get_contig_name(right_read.contig_id())),
- right_read.pos(),
- right_read.data_len(),
- right_read.strand(),
- right_read.cigar());
- }
- else if (!right_read.is_main_alignment()) {
- //If not primary alignment ignore mapping
- TRACE("Ignoring right read");
- r_name = "";
- }
- }
-
- //Think about custom read names
- if (l_name == r_name) {
- TRACE("Equal processing");
- //Process immediately if IDs are equal in both SAM entries
- processor.ProcessPairedRead(left_data, right_data);
- VERBOSE_POWER2(++counter, "Processed " << counter << " paired reads");
- continue;
- }
-
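- // Names differ: try to pair each read with a mate parked earlier in the opposite map; otherwise park it, flagging duplicated IDs.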
- if (r_name != "") {
- auto it = left_reads.find(r_name);
- if (it != left_reads.end()) {
- //Right read's mate found in map
- TRACE("Right read's mate found, processing");
- processor.ProcessPairedRead(it->second, right_data);
- VERBOSE_POWER2(++counter, "Processed " << counter << " paired reads");
- //Remove mate as used
- left_reads.erase(it);
- }
- else {
- TRACE("Right read's mate not found, adding to map");
- if (right_reads.count(r_name) == 0) {
- //Insert read without mate for further analysis
- //TODO inspect map size and performance
- right_reads.emplace(r_name, right_data);
- } else {
- DEBUG("Right read " << r_name << " is duplicated!");
- //Report duplication
- right_duplicated = true;
- }
- }
- }
-
- if (l_name != "") {
- auto it = right_reads.find(l_name);
- if (it != right_reads.end()) {
- //Left read's mate found in map
- TRACE("Left read's mate found, processing");
- processor.ProcessPairedRead(left_data, it->second);
- VERBOSE_POWER2(++counter, "Processed " << counter << " paired reads");
- //Remove mate as used
- right_reads.erase(it);
- }
- else {
- TRACE("Left read's mate not found, adding to map");
- if (left_reads.count(l_name) == 0) {
- //Insert read without mate for further analysis
- //TODO inspect map size and performance
- left_reads.emplace(l_name, left_data);
- } else {
- DEBUG("Left read " << r_name << " is duplicated!");
- //Report duplication
- left_duplicated = true;
- }
-
- }
- }
- }
-
- if (left_duplicated)
- WARN("SAM file " << left_sam << " contains duplicated read ids");
- if (right_duplicated)
- WARN("SAM file " << right_sam << " contains duplicated read ids");
-}
-
-bool BWAPairInfoFiller::Init() {
- if (!index_constructed_) {
- INFO("Initializing bwa pair info counter, working dir " << work_dir_);
- path::make_dir(base_dir_);
- work_dir_ = path::make_temp_dir(base_dir_, "");
- index_base_= path::append_path(work_dir_, "long_edges.fasta");
- INFO("Saving edges to " << index_base_);
- OutputEdges(index_base_);
- FillEdgeIdMap();
- index_constructed_ = CreateIndex(index_base_);
- }
- return index_constructed_;
-}
-
-bool BWAPairInfoFiller::ProcessLib(size_t lib_index,
- SequencingLibraryT& lib,
- PairedInfoIndexT& paired_index,
- size_t counter_edge_len,
- size_t index_filler_edge_len) {
- //Initialize if needed
- Init();
- string lib_dir = path::append_path(work_dir_, ToString(lib_index));
- path::make_dir(lib_dir);
- vector<pair<string, string>> sam_files;
- bool result = false;
-
- INFO("Mapping lib #" << lib_index << " using BWA");
- if (!AlignLib(lib, path::append_path(lib_dir, "single"), sam_files)) {
- WARN("Failed to align lib #" << lib_index);
- return false;
- }
-
- INFO("Estimating insert size for library #" << lib_index);
- BWAISCounter counter(lib, edge_id_map_, g_, counter_edge_len);
- for (const auto& sam_pair : sam_files) {
- ProcessSAMFiles(sam_pair.first, sam_pair.second, counter);
- }
-
- if (!counter.RefineInsertSize(lib)) {
- lib.data().mean_insert_size = 0.0;
- WARN("Unable to estimate insert size paired library #" << lib_index);
- }
- else {
- INFO(" Estimated insert size for paired library #" << lib_index);
- INFO(" Insert size = " << lib.data().mean_insert_size <<
- ", deviation = " << lib.data().insert_size_deviation <<
- ", left quantile = " << lib.data().insert_size_left_quantile <<
- ", right quantile = " << lib.data().insert_size_right_quantile <<
- ", read length = " << lib.data().read_length);
-
- INFO("Collecting paired information for library #" << lib_index);
- paired_index.Init();
-
- BWAIndexFiller filler(lib, edge_id_map_, g_, paired_index, index_filler_edge_len);
- for (const auto& sam_pair : sam_files) {
- ProcessSAMFiles(sam_pair.first, sam_pair.second, filler);
- }
- result = true;
- }
- if (remove_tmp_files_)
- path::remove_dir(lib_dir);
- return result;
-}
-
-
-}
diff --git a/src/debruijn/bwa_pair_info_filler.hpp b/src/debruijn/bwa_pair_info_filler.hpp
deleted file mode 100644
index 92eedeb..0000000
--- a/src/debruijn/bwa_pair_info_filler.hpp
+++ /dev/null
@@ -1,254 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#include "standard.hpp"
-#include "debruijn_graph.hpp"
-#include "config_struct.hpp"
-
-#include <io/sam/sam_reader.hpp>
-#include <io/sam/read.hpp>
-
-#include <io/osequencestream.hpp>
-#include <de/paired_info.hpp>
-#include <de/insert_size_refiner.hpp>
-
-#ifndef PROJECT_BWA_PAIR_INFO_FILLER_HPP_H
-#define PROJECT_BWA_PAIR_INFO_FILLER_HPP_H
-
-namespace bwa_pair_info {
-
-using namespace sam_reader;
-using debruijn_graph::EdgeId;
-
-typedef omnigraph::de::UnclusteredPairedInfoIndexT<debruijn_graph::Graph> PairedInfoIndexT;
-typedef io::SequencingLibrary<debruijn_graph::debruijn_config::DataSetData> SequencingLibraryT;
-typedef std::pair<debruijn_graph::EdgeId, debruijn_graph::EdgeId> EdgePair;
-typedef unordered_map<size_t, debruijn_graph::EdgeId> EdgeIdMap;
-
-//More compact representation of aligned read for storing in map
-class MapperReadT {
-public:
- MapperReadT(): contig_id_(""), pos_(-1), len_(-1), is_forward_(true),
- left_hard_clip_(0), right_hard_clip_(0), left_soft_clip_(0), right_soft_clip_(0){}
-
- MapperReadT(const string& ctg_id, int32_t pos, int32_t len, bool is_forward, const string& cigar):
- contig_id_(ctg_id), pos_(pos), len_(len), is_forward_(is_forward),
- left_hard_clip_(0), right_hard_clip_(0), left_soft_clip_(0), right_soft_clip_(0) {
-
- ParseCigar(cigar);
- }
-
- bool IsValid() const {
- return contig_id_ != "";
- }
-
-private:
-
- void ParseCigar(const string& cigar);
-
-public:
- const string &get_contig_id() const {
- return contig_id_;
- }
- int32_t pos() const {
- return pos_;
- }
- int32_t len() const {
- return len_;
- }
- bool is_forward() const {
- return is_forward_;
- }
- uint32_t left_soft_clip() const {
- return left_soft_clip_;
- }
- uint32_t right_soft_clip() const {
- return right_soft_clip_;
- }
- uint32_t left_hard_clip() const {
- return left_hard_clip_;
- }
- uint32_t right_hard_clip() const {
- return right_hard_clip_;
- }
-
-private:
- string contig_id_;
- int32_t pos_;
- int32_t len_;
- bool is_forward_;
- uint32_t left_hard_clip_:16, right_hard_clip_:16;
- uint32_t left_soft_clip_:16, right_soft_clip_:16;
-};
-
-//Base class for aligned read processor (simple analog of SequenceMapperListener)
-class BWAPairedReadProcessor {
-public:
- virtual void ProcessPairedRead(const MapperReadT& l, const MapperReadT& r) = 0;
-
- virtual ~BWAPairedReadProcessor() {
-
- }
-};
-
-//Class that corrects mapping positions according to lib orientation and clippings
-class BWACorrectingProcessor: public BWAPairedReadProcessor {
-protected:
- const SequencingLibraryT& lib_;
-
- const EdgeIdMap& edge_id_map_;
-
- const debruijn_graph::Graph& g_;
-
- size_t count_;
-
-public:
-
- struct MappedPositionT {
- EdgeId e;
- int pos;
-
- MappedPositionT(EdgeId e_, int pos_): e(e_), pos(pos_) {
-
- }
- };
-
- BWACorrectingProcessor(const SequencingLibraryT& lib, const EdgeIdMap& edge_id_map, const debruijn_graph::Graph& g):
- lib_(lib), edge_id_map_(edge_id_map), g_(g), count_(0) {
- }
-
- virtual bool CheckAlignments(const MappedPositionT& l, const MappedPositionT& r) = 0;
-
- virtual void ProcessAlignments(const MappedPositionT& l, const MappedPositionT& r) = 0;
-//Correct read alignment according to orientation and clippings
- virtual void ProcessPairedRead(const MapperReadT& l, const MapperReadT& r);
-};
-
-//Insert size counter
-class BWAISCounter: public BWACorrectingProcessor {
-private:
- HistType hist_;
- size_t min_contig_len_;
- bool ignore_negative_;
- size_t mapped_count_;
- size_t negative_count_;
-
-public:
- BWAISCounter(const SequencingLibraryT& lib, const EdgeIdMap& edge_id_map, const debruijn_graph::Graph& g,
- size_t min_contig_len, bool ignore_negative = false):
- BWACorrectingProcessor(lib, edge_id_map, g), hist_(), min_contig_len_(min_contig_len),
- ignore_negative_(ignore_negative), mapped_count_(0), negative_count_(0) {
- }
-
- bool CheckAlignments(const MappedPositionT& l, const MappedPositionT& r) override;
-
- void ProcessAlignments(const MappedPositionT& l, const MappedPositionT& r) override;
-
- bool RefineInsertSize(SequencingLibraryT& reads) const ;
-
-};
-
-//Pair info filler
-class BWAIndexFiller: public BWACorrectingProcessor {
-
-private:
- PairedInfoIndexT& paired_index_;
-
- size_t min_contig_len_;
-
- EdgePair ConjugatePair(EdgePair ep) const;
-
-public:
- BWAIndexFiller(const SequencingLibraryT& lib, const EdgeIdMap& edge_id_map, const debruijn_graph::Graph& g,
- PairedInfoIndexT& paired_index, size_t min_contig_len = 0):
- BWACorrectingProcessor(lib, edge_id_map, g), paired_index_(paired_index), min_contig_len_(min_contig_len) {
- }
-
- bool CheckAlignments(const MappedPositionT& l, const MappedPositionT& r) override;
-
- void ProcessAlignments(const MappedPositionT& l, const MappedPositionT& r) override;
-};
-
-//Class for running BWA, managing and parsing SAM files
-class BWAPairInfoFiller {
-public:
- DECL_LOGGER("BWAPairInfo");
-
-private:
- const debruijn_graph::Graph& g_;
-
- string bwa_path_;
-
- string base_dir_;
-
- string work_dir_;
-
- size_t nthreads_;
-
- string index_base_;
-
- bool index_constructed_;
-
- bool remove_tmp_files_;
-
- unordered_map<size_t, debruijn_graph::EdgeId> edge_id_map_;
-
-private:
-
- //Save graph in fasta format
- void OutputEdges(const string& filename) const;
-
- //Construct int_id -> EdgeId map
- void FillEdgeIdMap();
-
- //Run bwa index
- bool CreateIndex(const string& contigs);
-
- //Initialize for read alignment (includes all of the above)
- bool Init();
-
- //Run bwa mem on single file
- bool RunBWA(const string& reads_file, const string& out_sam_file) const;
-
- //Process single read library
- bool AlignLib(const SequencingLibraryT& lib,
- const string& sam_file_base,
- vector<pair<string, string>>& resulting_sam_files);
-
- //Parse a pair of SAM files and analyze alignments with the given processor
- void ProcessSAMFiles(const string &left_sam, const string &right_sam,
- BWAPairedReadProcessor& processor);
-
-public:
-
- BWAPairInfoFiller(const debruijn_graph::Graph& g,
- const string& bwa_path,
- const string& work_dir,
- size_t nthreads = 1,
- bool remove_tmp = true):
- g_(g), bwa_path_(bwa_path), base_dir_(work_dir), work_dir_(""),
- nthreads_(nthreads), index_base_(""), index_constructed_(false),
- remove_tmp_files_(remove_tmp),
- edge_id_map_() {
- }
-
- ~BWAPairInfoFiller() {
- if (remove_tmp_files_)
- path::remove_if_exists(work_dir_);
- }
-
- //Count IS and fill pair info index for the given lib
- bool ProcessLib(size_t lib_index,
- SequencingLibraryT& lib,
- PairedInfoIndexT& paired_index,
- size_t counter_edge_len,
- size_t index_filler_edge_len);
-};
-
-}
-
-#endif //PROJECT_BWA_PAIR_INFO_FILLER_HPP_H
diff --git a/src/debruijn/config_struct.cpp b/src/debruijn/config_struct.cpp
deleted file mode 100644
index b5b8fbd..0000000
--- a/src/debruijn/config_struct.cpp
+++ /dev/null
@@ -1,799 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#include "config_struct.hpp"
-
-#include "config_common.hpp"
-#include "openmp_wrapper.h"
-
-#include "logger/logger.hpp"
-
-#include "io/file_reader.hpp"
-
-#include <string>
-#include <vector>
-
-namespace YAML {
-template<>
-struct convert<io::SequencingLibrary<debruijn_graph::debruijn_config::DataSetData> > {
- static Node encode(const io::SequencingLibrary<debruijn_graph::debruijn_config::DataSetData> &rhs) {
- // First, save the "common" stuff
- Node node = convert<io::SequencingLibraryBase>::encode(rhs);
-
- // Now, save the remaining stuff
- auto const& data = rhs.data();
- node["read length"] = data.read_length;
- node["average read length"] = data.avg_read_length;
- node["insert size mean"] = data.mean_insert_size;
- node["insert size deviation"] = data.insert_size_deviation;
- node["insert size left quantile"] = data.insert_size_left_quantile;
- node["insert size right quantile"] = data.insert_size_right_quantile;
- node["insert size median"] = data.median_insert_size;
- node["insert size mad"] = data.insert_size_mad;
- node["insert size distribution"] = data.insert_size_distribution;
- node["average coverage"] = data.average_coverage;
- node["pi threshold"] = data.pi_threshold;
- node["binary converted"] = data.binary_coverted;
- node["single reads mapped"] = data.single_reads_mapped;
-
- return node;
- }
-
- static bool decode(const Node& node, io::SequencingLibrary<debruijn_graph::debruijn_config::DataSetData> &rhs) {
- // First, load the "common" stuff
- rhs.load(node);
-
- // Now load the remaining stuff
- auto& data = rhs.data();
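- // Fields missing from the YAML node fall back to neutral defaults (0, 0.0, false or an empty distribution).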
- data.read_length = node["read length"].as<size_t>(0);
- data.avg_read_length = node["average read length"].as<double>(0.0);
- data.mean_insert_size = node["insert size mean"].as<double>(0.0);
- data.insert_size_deviation = node["insert size deviation"].as<double>(0.0);
- data.insert_size_left_quantile = node["insert size left quantile"].as<double>(0.0);
- data.insert_size_right_quantile = node["insert size right quantile"].as<double>(0.0);
- data.median_insert_size = node["insert size median"].as<double>(0.0);
- data.insert_size_mad = node["insert size mad"].as<double>(0.0);
- data.insert_size_distribution = node["insert size distribution"].as<decltype(data.insert_size_distribution)>(decltype(data.insert_size_distribution)());
-
- data.average_coverage = node["average coverage"].as<double>(0.0);
- data.pi_threshold = node["pi threshold"].as<double>(0.0);
- data.binary_coverted = node["binary converted"].as<bool>(false);
- data.single_reads_mapped = node["single reads mapped"].as<bool>(false);
-
- return true;
- }
-};
-
-template<>
-struct convert<debruijn_graph::debruijn_config::dataset> {
- static Node encode(const debruijn_graph::debruijn_config::dataset &rhs) {
- Node node;
-
- node["reads"] = rhs.reads;
- node["max read length"] = rhs.RL();
- node["avg read length"] = rhs.aRL();
- node["average coverage"] = rhs.avg_coverage();
-
- return node;
- }
-
- static bool decode(const Node& node, debruijn_graph::debruijn_config::dataset &rhs) {
- rhs.set_RL(node["max read length"].as<size_t>(0));
- rhs.set_aRL(node["avg read length"].as<double>(0.0));
- rhs.set_avg_coverage(node["average coverage"].as<double>(0.0));
- rhs.reads = node["reads"];
-
- return true;
- }
-};
-}
-
-namespace debruijn_graph {
-static std::string MakeLaunchTimeDirName() {
- time_t rawtime;
- struct tm * timeinfo;
- char buffer[80];
-
- time(&rawtime);
- timeinfo = localtime(&rawtime);
-
- strftime(buffer, 80, "%m.%d_%H.%M.%S", timeinfo);
- return std::string(buffer);
-}
-
-void load_lib_data(const std::string& prefix) {
- // First, load the data into separate libs
- cfg::get_writable().ds.reads.load(prefix + ".lib_data");
-
- // Now, infer the common parameters
- size_t max_rl = 0;
- double avg_cov = 0.0;
- double avg_rl = 0.0;
- for (const auto& lib : cfg::get().ds.reads.libraries()) {
- auto const& data = lib.data();
- if (lib.is_graph_contructable())
- max_rl = std::max(max_rl, data.read_length);
- if (data.average_coverage > 0)
- avg_cov = data.average_coverage;
- if (data.avg_read_length > 0)
- avg_rl = data.avg_read_length;
- }
-
- cfg::get_writable().ds.set_RL(max_rl);
- cfg::get_writable().ds.set_aRL(avg_rl);
- cfg::get_writable().ds.set_avg_coverage(avg_cov);
-}
-
-void write_lib_data(const std::string& prefix) {
- cfg::get().ds.reads.save(prefix + ".lib_data");
-}
-
-void load(debruijn_config::simplification::tip_clipper& tc,
- boost::property_tree::ptree const& pt, bool /*complete*/) {
- using config_common::load;
- load(tc.condition, pt, "condition");
-}
-
-void load(resolving_mode& rm, boost::property_tree::ptree const& pt,
- std::string const& key, bool complete) {
- if (complete || pt.find(key) != pt.not_found()) {
- std::string ep = pt.get<std::string>(key);
- rm = debruijn_config::resolving_mode_id(ep);
- }
-}
-
-void load(single_read_resolving_mode& rm, boost::property_tree::ptree const& pt,
- std::string const& key, bool complete) {
- if (complete || pt.find(key) != pt.not_found()) {
- std::string ep = pt.get<std::string>(key);
- rm = debruijn_config::single_read_resolving_mode_id(ep);
- }
-}
-
-inline void load(construction_mode& con_mode,
- boost::property_tree::ptree const& pt, std::string const& key,
- bool complete) {
- if (complete || pt.find(key) != pt.not_found()) {
- std::string ep = pt.get<std::string>(key);
- con_mode = debruijn_config::construction_mode_id(ep);
- }
-}
-
-inline void load(debruijn_config::construction::early_tip_clipper& etc,
- boost::property_tree::ptree const& pt, bool /*complete*/) {
- using config_common::load;
- load(etc.enable, pt, "enable");
- etc.length_bound = pt.get_optional<size_t>("length_bound");
-}
-
-inline void load(debruijn_config::construction& con,
- boost::property_tree::ptree const& pt, bool complete) {
- using config_common::load;
- load(con.con_mode, pt, "mode", complete);
- load(con.keep_perfect_loops, pt, "keep_perfect_loops", complete);
- load(con.read_buffer_size, pt, "read_buffer_size", complete);
- con.read_buffer_size *= 1024 * 1024;
- load(con.early_tc, pt, "early_tip_clipper", complete);
-}
-
-inline void load(debruijn_config::sensitive_mapper& sensitive_map,
- boost::property_tree::ptree const& pt, bool complete) {
- using config_common::load;
- load(sensitive_map.k, pt, "k", complete);
-}
-
-inline void load(estimation_mode& est_mode,
- boost::property_tree::ptree const& pt, std::string const& key,
- bool complete) {
- if (complete || pt.find(key) != pt.not_found()) {
- std::string ep = pt.get<std::string>(key);
- est_mode = debruijn_config::estimation_mode_id(ep);
- }
-}
-
-void load(debruijn_config::simplification::bulge_remover& br,
- boost::property_tree::ptree const& pt, bool complete) {
- using config_common::load;
-
- load(br.enabled , pt, "enabled" , complete);
- load(br.main_iteration_only , pt, "main_iteration_only" , complete);
- load(br.max_bulge_length_coefficient , pt, "max_bulge_length_coefficient", complete);
- load(br.max_additive_length_coefficient , pt,
- "max_additive_length_coefficient", complete);
- load(br.max_coverage, pt, "max_coverage", complete);
- load(br.max_relative_coverage, pt, "max_relative_coverage", complete);
- load(br.max_delta, pt, "max_delta", complete);
- load(br.max_relative_delta, pt, "max_relative_delta", complete);
- load(br.max_number_edges, pt, "max_number_edges", complete);
- load(br.parallel, pt, "parallel", complete);
- load(br.buff_size, pt, "buff_size", complete);
- load(br.buff_cov_diff, pt, "buff_cov_diff", complete);
- load(br.buff_cov_rel_diff, pt, "buff_cov_rel_diff", complete);
-}
-
-void load(debruijn_config::simplification::topology_tip_clipper& ttc,
- boost::property_tree::ptree const& pt, bool /*complete*/) {
- using config_common::load;
- load(ttc.length_coeff, pt, "length_coeff");
- load(ttc.plausibility_length, pt, "plausibility_length");
- load(ttc.uniqueness_length, pt, "uniqueness_length");
-}
-
-void load(debruijn_config::simplification::complex_tip_clipper& ctc,
- boost::property_tree::ptree const& pt, bool /*complete*/) {
- using config_common::load;
- load(ctc.enabled, pt, "enabled");
-}
-
-void load(debruijn_config::simplification::relative_coverage_edge_disconnector& relative_ed,
- boost::property_tree::ptree const& pt, bool complete) {
- using config_common::load;
- load(relative_ed.enabled, pt, "enabled", complete);
- load(relative_ed.diff_mult, pt, "diff_mult", complete);
-}
-
-void load(debruijn_config::simplification::relative_coverage_comp_remover& rcc,
- boost::property_tree::ptree const& pt, bool complete) {
- using config_common::load;
- load(rcc.enabled, pt, "enabled", complete);
- load(rcc.coverage_gap, pt, "coverage_gap", complete);
- load(rcc.length_coeff, pt, "max_length_coeff", complete);
- load(rcc.tip_allowing_length_coeff, pt, "max_length_with_tips_coeff", complete);
- load(rcc.vertex_count_limit, pt, "max_vertex_cnt", complete);
- load(rcc.max_ec_length_coefficient, pt, "max_ec_length_coefficient", complete);
- load(rcc.max_coverage_coeff, pt, "max_coverage_coeff", complete);
-}
-
-void load(debruijn_config::simplification::isolated_edges_remover& ier,
- boost::property_tree::ptree const& pt, bool complete) {
- using config_common::load;
- load(ier.enabled, pt, "enabled", complete);
- load(ier.max_length, pt, "max_length", complete);
- load(ier.max_coverage, pt, "max_coverage", complete);
- load(ier.max_length_any_cov, pt, "max_length_any_cov", complete);
-}
-
-void load(debruijn_config::simplification::init_cleaning& init_clean,
- boost::property_tree::ptree const& pt, bool complete) {
- using config_common::load;
- load(init_clean.self_conj_condition, pt, "self_conj_condition", complete);
- load(init_clean.early_it_only, pt, "early_it_only", complete);
- load(init_clean.activation_cov, pt, "activation_cov", complete);
- load(init_clean.ier, pt, "ier", complete);
- load(init_clean.tip_condition, pt, "tip_condition", complete);
- load(init_clean.ec_condition, pt, "ec_condition", complete);
- load(init_clean.disconnect_flank_cov, pt, "disconnect_flank_cov", complete);
-}
-
-void load(debruijn_config::simplification::complex_bulge_remover& cbr,
- boost::property_tree::ptree const& pt, bool complete) {
- using config_common::load;
-
- load(cbr.enabled, pt, "enabled");
- load(cbr.max_relative_length, pt, "max_relative_length", complete);
- load(cbr.max_length_difference, pt, "max_length_difference", complete);
-}
-
-void load(debruijn_config::simplification::erroneous_connections_remover& ec,
- boost::property_tree::ptree const& pt, bool /*complete*/) {
- using config_common::load;
-
- load(ec.condition, pt, "condition");
-}
-
-void load(debruijn_config::simplification::topology_based_ec_remover& tec,
- boost::property_tree::ptree const& pt, bool /*complete*/) {
- using config_common::load;
-
- load(tec.max_ec_length_coefficient, pt, "max_ec_length_coefficient");
- load(tec.plausibility_length, pt, "plausibility_length");
- load(tec.uniqueness_length, pt, "uniqueness_length");
-}
-
-void load(debruijn_config::simplification::interstrand_ec_remover &isec,
- boost::property_tree::ptree const &pt, bool /*complete*/) {
- using config_common::load;
- load(isec.max_ec_length_coefficient, pt, "max_ec_length_coefficient");
- load(isec.uniqueness_length, pt, "uniqueness_length");
- load(isec.span_distance, pt, "span_distance");
-}
-
-void load(debruijn_config::simplification::tr_based_ec_remover &trec,
- boost::property_tree::ptree const &pt, bool /*complete*/) {
- using config_common::load;
- load(trec.max_ec_length_coefficient, pt, "max_ec_length_coefficient");
- load(trec.unreliable_coverage, pt, "unreliable_coverage");
- load(trec.uniqueness_length, pt, "uniqueness_length");
-}
-
-void load(debruijn_config::simplification::max_flow_ec_remover& mfec,
- boost::property_tree::ptree const& pt, bool /*complete*/) {
- using config_common::load;
-
- load(mfec.enabled, pt, "enabled");
- load(mfec.max_ec_length_coefficient, pt, "max_ec_length_coefficient");
- load(mfec.plausibility_length, pt, "plausibility_length");
- load(mfec.uniqueness_length, pt, "uniqueness_length");
-}
-
-void load(debruijn_config::simplification::hidden_ec_remover& her,
- boost::property_tree::ptree const& pt, bool /*complete*/) {
- using config_common::load;
-
- load(her.enabled, pt, "enabled");
- load(her.uniqueness_length, pt, "uniqueness_length");
- load(her.unreliability_threshold, pt, "unreliability_threshold");
- load(her.relative_threshold, pt, "relative_threshold");
-}
-
-void load(debruijn_config::distance_estimator& de,
- boost::property_tree::ptree const& pt, bool /*complete*/) {
- using config_common::load;
-
- load(de.linkage_distance_coeff, pt, "linkage_distance_coeff");
- load(de.max_distance_coeff, pt, "max_distance_coeff");
- load(de.max_distance_coeff_scaff, pt, "max_distance_coeff_scaff");
- load(de.filter_threshold, pt, "filter_threshold");
-}
-
-void load(debruijn_config::smoothing_distance_estimator& ade,
- boost::property_tree::ptree const& pt, bool /*complete*/) {
- using config_common::load;
-
- load(ade.threshold, pt, "threshold");
- load(ade.range_coeff, pt, "range_coeff");
- load(ade.delta_coeff, pt, "delta_coeff");
- load(ade.percentage, pt, "percentage");
- load(ade.cutoff, pt, "cutoff");
- load(ade.min_peak_points, pt, "min_peak_points");
- load(ade.inv_density, pt, "inv_density");
- load(ade.derivative_threshold, pt, "derivative_threshold");
-}
-
-inline void load(debruijn_config::ambiguous_distance_estimator& amde,
- boost::property_tree::ptree const& pt, bool){
- using config_common::load;
-
- load(amde.enabled, pt, "enabled");
- load(amde.haplom_threshold, pt, "haplom_threshold");
- load(amde.relative_length_threshold, pt, "relative_length_threshold");
- load(amde.relative_seq_threshold, pt, "relative_seq_threshold");
-}
-
-void load(debruijn_config::scaffold_correction& sc_corr,
- boost::property_tree::ptree const& pt, bool /*complete*/) {
- using config_common::load;
- load(sc_corr.scaffolds_file, pt, "scaffolds_file");
- load(sc_corr.output_unfilled, pt, "output_unfilled");
- load(sc_corr.max_insert, pt, "max_insert");
- load(sc_corr.max_cut_length, pt, "max_cut_length");
-}
-
-void load(debruijn_config::truseq_analysis& tsa,
- boost::property_tree::ptree const& pt, bool /*complete*/) {
- using config_common::load;
- load(tsa.scaffolds_file, pt, "scaffolds_file");
- load(tsa.genome_file, pt, "genome_file");
-}
-
-void load(debruijn_config::bwa_aligner& bwa,
- boost::property_tree::ptree const& pt, bool /*complete*/) {
- using config_common::load;
- load(bwa.enabled, pt, "enabled");
- load(bwa.debug, pt, "debug");
- load(bwa.path_to_bwa, pt, "path_to_bwa");
- load(bwa.min_contig_len, pt, "min_contig_len");
-}
-
-void load(debruijn_config::pacbio_processor& pb,
- boost::property_tree::ptree const& pt, bool /*complete*/) {
- using config_common::load;
- load(pb.pacbio_k, pt, "pacbio_k");
- load(pb.additional_debug_info, pt, "additional_debug_info");
- load(pb.compression_cutoff, pt, "compression_cutoff");
- load(pb.domination_cutoff, pt, "domination_cutoff");
- load(pb.path_limit_stretching, pt, "path_limit_stretching");
- load(pb.path_limit_pressing, pt, "path_limit_pressing");
- load(pb.ignore_middle_alignment, pt, "ignore_middle_alignment");
- load(pb.long_seq_limit, pt, "long_seq_limit");
- load(pb.pacbio_min_gap_quantity, pt, "pacbio_min_gap_quantity");
- load(pb.contigs_min_gap_quantity, pt, "contigs_min_gap_quantity");
- load(pb.max_contigs_gap_length, pt, "max_contigs_gap_length");
-
-}
-
-
-void load(debruijn_config::position_handler& pos,
- boost::property_tree::ptree const& pt, bool /*complete*/) {
- using config_common::load;
- load(pos.max_mapping_gap, pt, "max_mapping_gap");
- load(pos.max_gap_diff, pt, "max_gap_diff");
- load(pos.contigs_for_threading, pt, "contigs_for_threading");
- load(pos.contigs_to_analyze, pt, "contigs_to_analyze");
- load(pos.late_threading, pt, "late_threading");
- load(pos.careful_labeling, pt, "careful_labeling");
-}
-
-void load(debruijn_config::gap_closer& gc,
- boost::property_tree::ptree const& pt, bool /*complete*/) {
- using config_common::load;
- load(gc.minimal_intersection, pt, "minimal_intersection");
- load(gc.before_simplify, pt, "before_simplify");
- load(gc.in_simplify, pt, "in_simplify");
- load(gc.after_simplify, pt, "after_simplify");
- load(gc.weight_threshold, pt, "weight_threshold");
-}
-
-void load(debruijn_config::graph_read_corr_cfg& graph_read_corr,
- boost::property_tree::ptree const& pt, bool /*complete*/) {
- using config_common::load;
- load(graph_read_corr.enable, pt, "enable");
- load(graph_read_corr.output_dir, pt, "output_dir");
- load(graph_read_corr.binary, pt, "binary");
-}
-
-void load(debruijn_config::kmer_coverage_model& kcm,
- boost::property_tree::ptree const& pt, bool /*complete*/) {
- using config_common::load;
- load(kcm.probability_threshold, pt, "probability_threshold");
- load(kcm.strong_probability_threshold, pt, "strong_probability_threshold");
- load(kcm.coverage_threshold, pt, "coverage_threshold");
- load(kcm.use_coverage_threshold, pt, "use_coverage_threshold");
-}
-
-void load(debruijn_config::dataset& ds,
- boost::property_tree::ptree const& pt, bool /*complete*/) {
- using config_common::load;
-
- load(ds.reads_filename, pt, "reads");
-
- ds.single_cell = pt.get("single_cell", false);
- ds.meta = pt.get("meta", false);
- ds.moleculo = pt.get("moleculo", false);
-
- //fixme temporary solution
- if (ds.meta)
- ds.single_cell = true;
-
- ds.reference_genome_filename = "";
- boost::optional<std::string> refgen =
- pt.get_optional<std::string>("reference_genome");
- if (refgen && *refgen != "N/A") {
- ds.reference_genome_filename = *refgen;
- }
-}
-
-void load_reads(debruijn_config::dataset& ds,
- std::string input_dir) {
- if (ds.reads_filename[0] != '/')
- ds.reads_filename = input_dir + ds.reads_filename;
- path::CheckFileExistenceFATAL(ds.reads_filename);
- ds.reads.load(ds.reads_filename);
-}
-
-void load_reference_genome(debruijn_config::dataset& ds,
- std::string input_dir) {
- if (ds.reference_genome_filename == "") {
- ds.reference_genome = "";
- return;
- }
- if (ds.reference_genome_filename[0] != '/')
- ds.reference_genome_filename = input_dir + ds.reference_genome_filename;
- path::CheckFileExistenceFATAL(ds.reference_genome_filename);
- io::FileReadStream genome_stream(ds.reference_genome_filename);
- io::SingleRead genome;
- genome_stream >> genome;
- ds.reference_genome = genome.GetSequenceString();
-}
-
-void load(debruijn_config::simplification& simp,
- boost::property_tree::ptree const& pt, bool complete) {
- using config_common::load;
-
- load(simp.cycle_iter_count, pt, "cycle_iter_count", complete);
- load(simp.post_simplif_enabled, pt, "post_simplif_enabled", complete);
- load(simp.topology_simplif_enabled, pt, "topology_simplif_enabled", complete);
- load(simp.tc, pt, "tc", complete); // tip clipper:
- load(simp.ttc, pt, "ttc", complete); // topology tip clipper:
- load(simp.br, pt, "br", complete); // bulge remover:
- load(simp.ec, pt, "ec", complete); // erroneous connections remover:
- load(simp.rcc, pt, "rcc", complete); // relative coverage component remover:
- load(simp.relative_ed, pt, "relative_ed", complete); // relative edge disconnector:
- load(simp.tec, pt, "tec", complete); // topology aware erroneous connections remover:
- load(simp.trec, pt, "trec", complete); // topology and reliability based erroneous connections remover:
- load(simp.isec, pt, "isec", complete); // interstrand erroneous connections remover (thorn remover):
- load(simp.mfec, pt, "mfec", complete); // max flow erroneous connections remover:
- load(simp.ier, pt, "ier", complete); // isolated edges remover
- load(simp.cbr, pt, "cbr", complete); // complex bulge remover
- load(simp.her, pt, "her", complete); // hidden ec remover
- load(simp.init_clean, pt, "init_clean", complete); // presimplification
- load(simp.final_tc, pt, "final_tc", complete);
- load(simp.final_br, pt, "final_br", complete);
- simp.second_final_br = simp.final_br;
- load(simp.second_final_br, pt, "second_final_br", false);
-}
-
-void load(debruijn_config::info_printer& printer,
- boost::property_tree::ptree const& pt, bool complete) {
- using config_common::load;
- load(printer.basic_stats, pt, "basic_stats", complete);
- load(printer.extended_stats, pt, "extended_stats", complete);
- load(printer.write_components, pt, "write_components", complete);
- load(printer.components_for_kmer, pt, "components_for_kmer", complete);
- load(printer.components_for_genome_pos, pt, "components_for_genome_pos",
- complete);
- load(printer.write_components_along_genome, pt,
- "write_components_along_genome", complete);
- load(printer.write_components_along_contigs, pt,
- "write_components_along_contigs", complete);
- load(printer.save_full_graph, pt, "save_full_graph", complete);
- load(printer.write_full_graph, pt, "write_full_graph", complete);
- load(printer.write_full_nc_graph, pt, "write_full_nc_graph", complete);
- load(printer.write_error_loc, pt, "write_error_loc", complete);
-}
-
-//void clear(debruijn_config::info_printer& printer) {
-// printer.print_stats = false;
-// printer.write_components = false;
-// printer.components_for_kmer = "";
-// printer.components_for_genome_pos = "";
-// printer.write_components_along_genome = false;
-// printer.save_full_graph = false;
-// printer.write_full_graph = false;
-// printer.write_full_nc_graph = false;
-// printer.write_error_loc = false;
-//}
-
-
-void load(debruijn_config::info_printers_t& printers,
- boost::property_tree::ptree const& pt, bool /*complete*/) {
- using config_common::load;
- using details::info_printer_pos_name;
-
- debruijn_config::info_printer def;
- load(def, pt, info_printer_pos_name(ipp_default), true);
-
- for (size_t pos = ipp_default + 1; pos != ipp_total; ++pos) {
- debruijn_config::info_printer printer(def);
- load(printer, pt, info_printer_pos_name(pos), false);
-
- printers[info_printer_pos(pos)] = printer;
- }
-}
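
The info-printer loader above is a defaults-then-override scheme: the default section is loaded strictly, and each remaining position is loaded non-strictly on top of a copy of the default, so a section only needs to list the keys it changes. A stand-alone sketch of that idea follows; the section and key names are invented, and the project's config_common::load helper is replaced by plain ptree lookups with fallbacks.

    #include <boost/property_tree/info_parser.hpp>
    #include <boost/property_tree/ptree.hpp>
    #include <iostream>
    #include <sstream>
    #include <string>

    struct printer_cfg {
        bool basic_stats;
        bool save_full_graph;
    };

    // Every lookup falls back to the value already present in 'base',
    // so a section may specify only the keys it wants to change.
    printer_cfg load_section(const boost::property_tree::ptree& pt,
                             const std::string& section, printer_cfg base) {
        base.basic_stats     = pt.get(section + ".basic_stats", base.basic_stats);
        base.save_full_graph = pt.get(section + ".save_full_graph", base.save_full_graph);
        return base;
    }

    int main() {
        std::stringstream ss("default\n"
                             "{\n"
                             "    basic_stats true\n"
                             "    save_full_graph false\n"
                             "}\n"
                             "final_simplified\n"
                             "{\n"
                             "    save_full_graph true\n"
                             "}\n");
        boost::property_tree::ptree pt;
        boost::property_tree::read_info(ss, pt);

        printer_cfg def = load_section(pt, "default", printer_cfg());
        printer_cfg fin = load_section(pt, "final_simplified", def);
        std::cout << fin.basic_stats << ' ' << fin.save_full_graph << '\n';  // prints "1 1"
        return 0;
    }
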
-
-// main debruijn config load function
-void load(debruijn_config& cfg, boost::property_tree::ptree const& pt,
- bool /*complete*/) {
- using config_common::load;
-
- load(cfg.K, pt, "K");
-
- // input options:
- load(cfg.dataset_file, pt, "dataset");
- // input dir is based on dataset file location (all paths in datasets are relative to its location)
- cfg.input_dir = path::parent_path(cfg.dataset_file);
- if (cfg.input_dir[cfg.input_dir.length() - 1] != '/')
- cfg.input_dir += '/';
-
- load(cfg.output_base, pt, "output_base");
- if (cfg.output_base[cfg.output_base.length() - 1] != '/')
- cfg.output_base += '/';
-
- // TODO: remove this option
- load(cfg.run_mode, pt, "run_mode");
- load(cfg.scaffold_correction_mode, pt, "scaffold_correction_mode");
- load(cfg.sc_cor, pt, "sc_cor");
- load(cfg.tsa, pt, "tsa");
-
- if (cfg.run_mode) {
- load(cfg.project_name, pt, "project_name");
- cfg.output_root =
- cfg.project_name.empty() ?
- (cfg.output_base + "/K" + ToString(cfg.K) + "/") :
- (cfg.output_base + cfg.project_name + "/K"
- + ToString(cfg.K) + "/");
- cfg.output_suffix = MakeLaunchTimeDirName() + "/";
- cfg.output_dir = cfg.output_root + cfg.output_suffix;
- } else {
- // TODO: remove scaffold_correction_mode from the config once it is refactored, and move this logic to main or spades.py
- if(!cfg.scaffold_correction_mode)
- cfg.output_root = cfg.output_base + "/K" + ToString(cfg.K) + "/";
- else
- cfg.output_root = cfg.output_base + "/SCC/";
- cfg.output_dir = cfg.output_root;
- }
-
-
- cfg.output_saves = cfg.output_dir + "saves/";
-
- load(cfg.log_filename, pt, "log_filename");
-
- load(cfg.developer_mode, pt, "developer_mode");
- if (cfg.developer_mode) {
- load(cfg.output_pictures, pt, "output_pictures");
- load(cfg.output_nonfinal_contigs, pt, "output_nonfinal_contigs");
- load(cfg.compute_paths_number, pt, "compute_paths_number");
- } else {
- cfg.output_pictures = false;
- cfg.output_nonfinal_contigs = false;
- cfg.compute_paths_number = false;
- }
-
- load(cfg.load_from, pt, "load_from");
- if (cfg.load_from[0] != '/') { // relative path
- if (cfg.run_mode)
- cfg.load_from = cfg.output_root + cfg.load_from;
- else
- cfg.load_from = cfg.output_dir + cfg.load_from;
- }
-
- load(cfg.tmp_dir, pt, "tmp_dir");
- if (cfg.tmp_dir[0] != '/') { // relative path
- if (cfg.run_mode)
- cfg.tmp_dir = cfg.output_root + cfg.tmp_dir;
- else
- cfg.tmp_dir = cfg.output_dir + cfg.tmp_dir;
- }
-
- load(cfg.main_iteration, pt, "main_iteration");
-
- load(cfg.entry_point, pt, "entry_point");
-
- load(cfg.use_additional_contigs, pt, "use_additional_contigs");
- load(cfg.use_unipaths, pt, "use_unipaths");
-
- load(cfg.pb, pt, "pacbio_processor");
-
- load(cfg.additional_contigs, pt, "additional_contigs");
-
- load(cfg.rr_enable, pt, "rr_enable");
- load(cfg.two_step_rr, pt, "two_step_rr");
- load(cfg.use_intermediate_contigs, pt, "use_intermediate_contigs");
- load(cfg.single_reads_rr, pt, "single_reads_rr");
- cfg.use_single_reads = false;
-
- load(cfg.mismatch_careful, pt, "mismatch_careful");
- load(cfg.correct_mismatches, pt, "correct_mismatches");
- load(cfg.paired_info_statistics, pt, "paired_info_statistics");
- load(cfg.paired_info_scaffolder, pt, "paired_info_scaffolder");
- load(cfg.cut_bad_connections, pt, "cut_bad_connections");
- load(cfg.gap_closer_enable, pt, "gap_closer_enable");
-
- load(cfg.max_repeat_length, pt, (cfg.ds.single_cell ? "max_repeat_length_sc" : "max_repeat_length"));
-
- load(cfg.buffer_size, pt, "buffer_size");
- cfg.buffer_size <<= 20; // convert MB to bytes
-
- load(cfg.temp_bin_reads_dir, pt, "temp_bin_reads_dir");
- if (cfg.temp_bin_reads_dir[cfg.temp_bin_reads_dir.length() - 1] != '/')
- cfg.temp_bin_reads_dir += '/';
- cfg.temp_bin_reads_path =
- cfg.project_name.empty() ?
- (cfg.output_base + "/" + cfg.temp_bin_reads_dir) :
- (cfg.output_base + cfg.project_name + "/"
- + cfg.temp_bin_reads_dir);
- cfg.temp_bin_reads_info = cfg.temp_bin_reads_path + "INFO";
-
- cfg.paired_read_prefix = cfg.temp_bin_reads_path + "_paired";
- cfg.single_read_prefix = cfg.temp_bin_reads_path + "_single";
-
- load(cfg.max_threads, pt, "max_threads");
- // Fix number of threads according to OMP capabilities.
- cfg.max_threads = std::min(cfg.max_threads, (size_t) omp_get_max_threads());
- // Inform OpenMP runtime about this :)
- omp_set_num_threads((int) cfg.max_threads);
-
- load(cfg.max_memory, pt, "max_memory");
-
- load(cfg.diploid_mode, pt, "diploid_mode");
-
- path::CheckFileExistenceFATAL(cfg.dataset_file);
- boost::property_tree::ptree ds_pt;
- boost::property_tree::read_info(cfg.dataset_file, ds_pt);
- load(cfg.ds, ds_pt, true);
-
- load(cfg.ade, pt, (cfg.ds.single_cell ? "sc_ade" : "usual_ade")); // advanced distance estimator:
-
- load(cfg.pos, pt, "pos"); // position handler:
-
- load(cfg.est_mode, pt, "estimation_mode");
-
- load(cfg.amb_de, pt, "amb_de");
- cfg.amb_de.enabled = cfg.diploid_mode;
-
- load(cfg.rm, pt, "resolving_mode");
-
- if (cfg.rm == rm_path_extend) {
- load(cfg.de, pt, (cfg.ds.single_cell ? "sc_de" : "usual_de"));
- }
- else {
- load(cfg.de, pt, (cfg.ds.single_cell ? "old_sc_de" : "old_usual_de"));
- }
-
- load(cfg.pe_params, pt, "default_pe");
- if (cfg.ds.single_cell) {
- VERIFY(pt.count("sc_pe"));
- load(cfg.pe_params, pt, "sc_pe", false);
- }
- if (cfg.ds.meta) {
- VERIFY(pt.count("meta_pe"));
- load(cfg.pe_params, pt, "meta_pe", false);
- }
- if (cfg.ds.moleculo) {
- VERIFY(pt.count("moleculo_pe"));
- load(cfg.pe_params, pt, "moleculo_pe", false);
- }
-
- cfg.prelim_pe_params = cfg.pe_params;
- VERIFY(pt.count("prelim_pe"));
- load(cfg.prelim_pe_params, pt, "prelim_pe", false);
-
- if (!cfg.developer_mode) {
- cfg.pe_params.debug_output = false;
- cfg.pe_params.viz.DisableAll();
- cfg.pe_params.output.DisableAll();
- }
- load(cfg.use_scaffolder, pt, "use_scaffolder");
- if (!cfg.use_scaffolder) {
- cfg.pe_params.param_set.scaffolder_options.on = false;
- }
- load(cfg.avoid_rc_connections, pt, "avoid_rc_connections");
-
- load(cfg.con, pt, "construction");
- load(cfg.gc, pt, "gap_closer");
- load(cfg.graph_read_corr, pt, "graph_read_corr");
- load(cfg.kcm, pt, "kmer_coverage_model");
- load(cfg.need_consensus, pt, "need_consensus");
- load(cfg.uncorrected_reads, pt, "uncorrected_reads");
- load(cfg.mismatch_ratio, pt, "mismatch_ratio");
-
- load(cfg.con, pt, "construction");
- load(cfg.sensitive_map, pt, "sensitive_mapper");
- load(cfg.flanking_range, pt, "flanking_range");
- if (cfg.ds.meta) {
- INFO("Flanking range overwritten to 30 for meta mode");
- cfg.flanking_range = 30;
- }
-
- load(cfg.info_printers, pt, "info_printers");
- load_reads(cfg.ds, cfg.input_dir);
-
- load_reference_genome(cfg.ds, cfg.input_dir);
-
- cfg.need_mapping = cfg.developer_mode || cfg.correct_mismatches
- || cfg.gap_closer_enable || cfg.rr_enable || cfg.scaffold_correction_mode;
-
- load(cfg.simp, pt, "default");
-
- if (cfg.ds.single_cell)
- load(cfg.simp, pt, "sc", false);
-
- if (cfg.mismatch_careful)
- load(cfg.simp, pt, "careful", false);
-
- if (cfg.ds.moleculo)
- load(cfg.simp, pt, "moleculo", false);
-
- if (cfg.diploid_mode)
- load(cfg.simp, pt, "diploid_simp", false);
-
- if (cfg.ds.meta)
- load(cfg.simp, pt, "meta", false);
-
- cfg.preliminary_simp = cfg.simp;
- load(cfg.preliminary_simp, pt, "preliminary", false);
- load(cfg.bwa, pt, "bwa_aligner", false);
-}
-
-void load(debruijn_config& cfg, const std::string &filename) {
- boost::property_tree::ptree pt;
- boost::property_tree::read_info(filename, pt);
-
- load(cfg, pt, true);
-}
-
-};
diff --git a/src/debruijn/config_struct.hpp b/src/debruijn/config_struct.hpp
deleted file mode 100644
index 7a8d7b3..0000000
--- a/src/debruijn/config_struct.hpp
+++ /dev/null
@@ -1,641 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#ifndef CONFIG_STRUCT_HDIP_
-#define CONFIG_STRUCT_HDIP_
-
-#include "config_singl.hpp"
-#include "cpp_utils.hpp"
-#include "sequence/sequence.hpp"
-#include "path_extend/pe_config_struct.hpp"
-
-#include "io/library.hpp"
-#include "io/binary_streams.hpp"
-#include "io/rc_reader_wrapper.hpp"
-#include "io/read_stream_vector.hpp"
-
-#include <boost/bimap.hpp>
-#include "xmath.h"
-
-namespace debruijn_graph {
-
-enum construction_mode {
- con_old, con_extention
-};
-
-enum estimation_mode {
- em_simple, em_weighted, em_extensive, em_smoothing
-};
-
-enum resolving_mode {
- rm_none,
- rm_path_extend,
-};
-
-enum single_read_resolving_mode {
- sr_none,
- sr_only_single_libs,
- sr_all
-};
-
-enum info_printer_pos {
- ipp_default = 0,
- ipp_before_first_gap_closer,
- ipp_before_simplification,
- ipp_before_post_simplification,
- ipp_final_simplified,
- ipp_final_gap_closed,
- ipp_before_repeat_resolution,
-
- ipp_total
-};
-
-namespace details {
-
-inline const char* info_printer_pos_name(size_t pos) {
- const char* names[] = { "default", "before_first_gap_closer",
- "before_simplification", "before_post_simplification",
- "final_simplified", "final_gap_closed", "before_repeat_resolution" };
-
- utils::check_array_size < ipp_total > (names);
- return names[pos];
-}
-
-} // namespace details
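
info_printer_pos_name above keeps a name table parallel to the info_printer_pos enum and guards the pairing with the project's check_array_size helper. An equivalent guard written with only standard C++ is sketched below; the enum and names are invented and the project helper is deliberately not used.

    #include <cstddef>

    enum stage_pos { sp_default = 0, sp_before, sp_after, sp_total };

    inline const char* stage_pos_name(size_t pos) {
        static const char* names[] = { "default", "before", "after" };
        // Fails to compile if the name table and the enum ever drift apart.
        static_assert(sizeof(names) / sizeof(names[0]) == sp_total,
                      "name table must have one entry per enum value");
        return names[pos];
    }
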
-
-// struct for debruijn project's configuration file
-struct debruijn_config {
- typedef boost::bimap<std::string, construction_mode> construction_mode_id_mapping;
- typedef boost::bimap<std::string, estimation_mode> estimation_mode_id_mapping;
- typedef boost::bimap<std::string, resolving_mode> resolve_mode_id_mapping;
- typedef boost::bimap<std::string, single_read_resolving_mode> single_read_resolving_mode_id_mapping;
-
-
- // Bad fix, to be removed! Determines whether the run was started from run.sh or from spades.py
- bool run_mode;
- bool scaffold_correction_mode;
-
- bool developer_mode;
-
- static const construction_mode_id_mapping FillConstructionModeInfo() {
- construction_mode_id_mapping::value_type info[] = {
- construction_mode_id_mapping::value_type("old", con_old),
- construction_mode_id_mapping::value_type("extension", con_extention), };
- return construction_mode_id_mapping(info, utils::array_end(info));
- }
-
- static const estimation_mode_id_mapping FillEstimationModeInfo() {
- estimation_mode_id_mapping::value_type info[] = {
- estimation_mode_id_mapping::value_type("simple", em_simple),
- estimation_mode_id_mapping::value_type("weighted", em_weighted),
- estimation_mode_id_mapping::value_type("extensive", em_extensive),
- estimation_mode_id_mapping::value_type("smoothing", em_smoothing)
- };
- return estimation_mode_id_mapping(info, utils::array_end(info));
- }
-
- static const resolve_mode_id_mapping FillResolveModeInfo() {
- resolve_mode_id_mapping::value_type info[] = {
- resolve_mode_id_mapping::value_type("none", rm_none),
- resolve_mode_id_mapping::value_type("path_extend", rm_path_extend),
- };
-
- return resolve_mode_id_mapping(info, utils::array_end(info));
- }
-
- static const single_read_resolving_mode_id_mapping FillSingleReadResolveModeInfo() {
- single_read_resolving_mode_id_mapping::value_type info[] = {
- single_read_resolving_mode_id_mapping::value_type("none", sr_none),
- single_read_resolving_mode_id_mapping::value_type("all", sr_all),
- single_read_resolving_mode_id_mapping::value_type("only_single_libs", sr_only_single_libs),
- };
-
- return single_read_resolving_mode_id_mapping(info, utils::array_end(info));
- }
-
- static const construction_mode_id_mapping& construction_mode_info() {
- static construction_mode_id_mapping con_mode_info =
- FillConstructionModeInfo();
- return con_mode_info;
- }
-
- static const estimation_mode_id_mapping& estimation_mode_info() {
- static estimation_mode_id_mapping est_mode_info = FillEstimationModeInfo();
- return est_mode_info;
- }
-
- static const resolve_mode_id_mapping& resolve_mode_info() {
- static resolve_mode_id_mapping info = FillResolveModeInfo();
- return info;
- }
-
- static const single_read_resolving_mode_id_mapping& single_read_resolve_mode_info() {
- static single_read_resolving_mode_id_mapping info = FillSingleReadResolveModeInfo();
- return info;
- }
-
- static const std::string& construction_mode_name(construction_mode con_id) {
- auto it = construction_mode_info().right.find(con_id);
- VERIFY_MSG(it != construction_mode_info().right.end(),
- "No name for construction mode id = " << con_id);
- return it->second;
- }
-
- static construction_mode construction_mode_id(std::string name) {
- auto it = construction_mode_info().left.find(name);
- VERIFY_MSG(it != construction_mode_info().left.end(),
- "There is no construction mode with name = " << name);
-
- return it->second;
- }
-
- static const std::string& estimation_mode_name(estimation_mode est_id) {
- auto it = estimation_mode_info().right.find(est_id);
- VERIFY_MSG(it != estimation_mode_info().right.end(),
- "No name for estimation mode id = " << est_id);
- return it->second;
- }
-
- static estimation_mode estimation_mode_id(std::string name) {
- auto it = estimation_mode_info().left.find(name);
- VERIFY_MSG(it != estimation_mode_info().left.end(),
- "There is no estimation mode with name = " << name);
-
- return it->second;
- }
-
- static const std::string& resolving_mode_name(resolving_mode mode_id) {
- auto it = resolve_mode_info().right.find(mode_id);
- VERIFY_MSG(it != resolve_mode_info().right.end(),
- "No name for resolving mode id = " << mode_id);
-
- return it->second;
- }
-
- static resolving_mode resolving_mode_id(std::string name) {
- auto it = resolve_mode_info().left.find(name);
- VERIFY_MSG(it != resolve_mode_info().left.end(),
- "There is no resolving mode with name = " << name);
-
- return it->second;
- }
-
- static const std::string& single_read_resolving_mode_name(single_read_resolving_mode mode_id) {
- auto it = single_read_resolve_mode_info().right.find(mode_id);
- VERIFY_MSG(it != single_read_resolve_mode_info().right.end(),
- "No name for single read resolving mode id = " << mode_id);
-
- return it->second;
- }
-
- static single_read_resolving_mode single_read_resolving_mode_id(std::string name) {
- auto it = single_read_resolve_mode_info().left.find(name);
- VERIFY_MSG(it != single_read_resolve_mode_info().left.end(),
- "There is no resolving mode with name = " << name);
-
- return it->second;
- }
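
All four mode tables above use the same boost::bimap idiom: a single array of value_type pairs provides both the name-to-id and the id-to-name direction. A minimal, self-contained illustration follows; it redeclares the resolving_mode enum locally and mirrors the contents of FillResolveModeInfo.

    #include <boost/bimap.hpp>
    #include <cassert>
    #include <string>

    enum resolving_mode { rm_none, rm_path_extend };

    int main() {
        typedef boost::bimap<std::string, resolving_mode> mapping;
        mapping::value_type table[] = {
            mapping::value_type("none", rm_none),
            mapping::value_type("path_extend", rm_path_extend),
        };
        mapping info(table, table + 2);

        assert(info.left.find("path_extend")->second == rm_path_extend);  // name -> id
        assert(info.right.find(rm_none)->second == "none");               // id -> name
        return 0;
    }
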
-
- struct simplification {
- struct tip_clipper {
- std::string condition;
- tip_clipper() {}
- tip_clipper(std::string condition_) : condition(condition_) {}
- };
-
- struct topology_tip_clipper {
- double length_coeff;
- size_t uniqueness_length;
- size_t plausibility_length;
- };
-
- struct complex_tip_clipper {
- bool enabled;
- };
-
- struct bulge_remover {
- bool enabled;
- bool main_iteration_only;
- double max_bulge_length_coefficient;
- size_t max_additive_length_coefficient;
- double max_coverage;
- double max_relative_coverage;
- size_t max_delta;
- double max_relative_delta;
- size_t max_number_edges;
- bool parallel;
- size_t buff_size;
- double buff_cov_diff;
- double buff_cov_rel_diff;
- };
-
- struct erroneous_connections_remover {
- std::string condition;
- erroneous_connections_remover() {}
- erroneous_connections_remover(std::string condition_) : condition(condition_) {}
- };
-
- struct relative_coverage_ec_remover {
- size_t max_ec_length_coefficient;
- double max_coverage_coeff;
- double coverage_gap;
- };
-
- struct topology_based_ec_remover {
- size_t max_ec_length_coefficient;
- size_t uniqueness_length;
- size_t plausibility_length;
- };
-
- struct tr_based_ec_remover {
- size_t max_ec_length_coefficient;
- size_t uniqueness_length;
- double unreliable_coverage;
- };
-
- struct interstrand_ec_remover {
- size_t max_ec_length_coefficient;
- size_t uniqueness_length;
- size_t span_distance;
- };
-
- struct max_flow_ec_remover {
- bool enabled;
- double max_ec_length_coefficient;
- size_t uniqueness_length;
- size_t plausibility_length;
- };
-
- struct isolated_edges_remover {
- bool enabled;
- size_t max_length;
- double max_coverage;
- size_t max_length_any_cov;
- };
-
- struct complex_bulge_remover {
- bool enabled;
- double max_relative_length;
- size_t max_length_difference;
- };
-
- struct hidden_ec_remover {
- bool enabled;
- size_t uniqueness_length;
- double unreliability_threshold;
- double relative_threshold;
- };
-
- struct relative_coverage_edge_disconnector {
- bool enabled;
- double diff_mult;
- };
-
- struct relative_coverage_comp_remover {
- bool enabled;
- double coverage_gap;
- double length_coeff;
- double tip_allowing_length_coeff;
- size_t max_ec_length_coefficient;
- double max_coverage_coeff;
- size_t vertex_count_limit;
- };
-
- struct init_cleaning {
- std::string self_conj_condition;
-
- bool early_it_only;
- double activation_cov;
- isolated_edges_remover ier;
- std::string tip_condition;
- std::string ec_condition;
- double disconnect_flank_cov;
- };
-
- size_t cycle_iter_count;
- bool post_simplif_enabled;
- bool topology_simplif_enabled;
- tip_clipper tc;
- complex_tip_clipper complex_tc;
- topology_tip_clipper ttc;
- bulge_remover br;
- erroneous_connections_remover ec;
- relative_coverage_comp_remover rcc;
- relative_coverage_edge_disconnector relative_ed;
- topology_based_ec_remover tec;
- tr_based_ec_remover trec;
- interstrand_ec_remover isec;
- max_flow_ec_remover mfec;
- isolated_edges_remover ier;
- complex_bulge_remover cbr;
- hidden_ec_remover her;
-
- tip_clipper final_tc;
- bulge_remover final_br;
- bulge_remover second_final_br;
-
- init_cleaning init_clean;
- };
-
- struct construction {
- struct early_tip_clipper {
- bool enable;
- boost::optional<size_t> length_bound;
- };
-
- construction_mode con_mode;
- early_tip_clipper early_tc;
- bool keep_perfect_loops;
- size_t read_buffer_size;
- };
-
- std::string uncorrected_reads;
- bool need_consensus;
- double mismatch_ratio;
- simplification simp;
- simplification preliminary_simp;
-
- struct sensitive_mapper {
- size_t k;
- };
-
- struct distance_estimator {
- double linkage_distance_coeff;
- double max_distance_coeff;
- double max_distance_coeff_scaff;
- double filter_threshold;
- };
-
- struct smoothing_distance_estimator {
- size_t threshold;
- double range_coeff;
- double delta_coeff;
- double percentage;
- size_t cutoff;
- size_t min_peak_points;
- double inv_density;
- double derivative_threshold;
- };
-
- struct ambiguous_distance_estimator {
- bool enabled;
- double haplom_threshold;
- double relative_length_threshold;
- double relative_seq_threshold;
- };
-
- struct pacbio_processor {
- //align and traverse.
- size_t pacbio_k; //13
- bool additional_debug_info; //false
- double compression_cutoff;// 0.6
- double domination_cutoff; //1.5
- double path_limit_stretching; //1.3
- double path_limit_pressing;//0.7
- bool ignore_middle_alignment; //true; false for stats and mate_pairs;
- //gap_closer
- size_t long_seq_limit; //400
- size_t pacbio_min_gap_quantity; //2
- size_t contigs_min_gap_quantity; //1
- size_t max_contigs_gap_length; // 10000
- };
-
- struct DataSetData {
- size_t read_length;
- double avg_read_length;
- double mean_insert_size;
- double insert_size_deviation;
- double insert_size_left_quantile;
- double insert_size_right_quantile;
- double median_insert_size;
- double insert_size_mad;
- std::map<int, size_t> insert_size_distribution;
-
- bool binary_coverted;
- bool single_reads_mapped;
-
- uint64_t total_nucls;
- double average_coverage;
- double pi_threshold;
-
- std::string paired_read_prefix;
- std::string single_read_prefix;
- size_t thread_num;
-
- DataSetData(): read_length(0), avg_read_length(0.0),
- mean_insert_size(0.0),
- insert_size_deviation(0.0),
- insert_size_left_quantile(0.0),
- insert_size_right_quantile(0.0),
- median_insert_size(0.0),
- insert_size_mad(0.0),
- binary_coverted(false),
- single_reads_mapped(false),
- total_nucls(0),
- average_coverage(0.0),
- pi_threshold(0.0) {
- }
- };
-
- struct dataset {
- io::DataSet<DataSetData> reads;
-
- size_t max_read_length;
- double average_coverage;
- double average_read_length;
-
- size_t RL() const { return max_read_length; }
- void set_RL(size_t RL) {
- max_read_length = RL;
- }
-
- double aRL() const { return average_read_length; }
- void set_aRL(double aRL) {
- average_read_length = aRL;
- for (size_t i = 0; i < reads.lib_count(); ++i) {
- reads[i].data().avg_read_length = aRL;
- }
- }
-
- double avg_coverage() const { return average_coverage; }
- void set_avg_coverage(double avg_coverage) {
- average_coverage = avg_coverage;
- for (size_t i = 0; i < reads.lib_count(); ++i) {
- reads[i].data().average_coverage = avg_coverage;
- }
- }
-
- bool single_cell;
- bool meta;
- bool moleculo;
- std::string reference_genome_filename;
- std::string reads_filename;
-
- std::string reference_genome;
-
- dataset(): max_read_length(0), average_coverage(0.0) {
- }
- };
-
- struct position_handler {
- size_t max_mapping_gap;
- size_t max_gap_diff;
- std::string contigs_for_threading;
- std::string contigs_to_analyze;
- bool late_threading;
- bool careful_labeling;
- };
-
- struct gap_closer {
- int minimal_intersection;
- bool before_simplify;
- bool in_simplify;
- bool after_simplify;
- double weight_threshold;
- };
-
- struct info_printer {
- bool basic_stats;
- bool extended_stats;
- bool write_components;
- std::string components_for_kmer;
- std::string components_for_genome_pos;
- bool write_components_along_genome;
- bool write_components_along_contigs;
- bool save_full_graph;
- bool write_error_loc;
- bool write_full_graph;
- bool write_full_nc_graph;
- };
-
- struct graph_read_corr_cfg {
- bool enable;
- std::string output_dir;
- bool binary;
- };
-
- struct kmer_coverage_model {
- double probability_threshold;
- double strong_probability_threshold;
- double coverage_threshold;
- bool use_coverage_threshold;
- };
-
- struct bwa_aligner {
- bool enabled;
- bool debug;
- std::string path_to_bwa;
- size_t min_contig_len;
- };
-
- typedef std::map<info_printer_pos, info_printer> info_printers_t;
-
- std::string dataset_file;
- std::string project_name;
- std::string input_dir;
- std::string output_base;
- std::string output_root;
- std::string output_dir;
- std::string tmp_dir;
- std::string output_suffix;
- std::string output_saves;
- std::string final_contigs_file;
- std::string log_filename;
-
- bool output_pictures;
- bool output_nonfinal_contigs;
- bool compute_paths_number;
-
- bool use_additional_contigs;
- bool use_unipaths;
- std::string additional_contigs;
-
- struct scaffold_correction {
- std::string scaffolds_file;
- bool output_unfilled;
- size_t max_insert;
- size_t max_cut_length;
- };
-
- struct truseq_analysis {
- std::string scaffolds_file;
- std::string genome_file;
- };
-
- scaffold_correction sc_cor;
- truseq_analysis tsa;
- std::string load_from;
-
- std::string entry_point;
-
- bool rr_enable;
- bool two_step_rr;
- bool use_intermediate_contigs;
-
- single_read_resolving_mode single_reads_rr;
- bool use_single_reads;
-
- bool mismatch_careful;
- bool correct_mismatches;
- bool paired_info_statistics;
- bool paired_info_scaffolder;
- bool cut_bad_connections;
- bool gap_closer_enable;
-
- size_t max_repeat_length;
-
- // Conversion options
- size_t buffer_size;
- std::string temp_bin_reads_dir;
- std::string temp_bin_reads_path;
- std::string temp_bin_reads_info;
- std::string paired_read_prefix;
- std::string single_read_prefix;
-
- size_t K;
-
- bool main_iteration;
-
- size_t max_threads;
- size_t max_memory;
-
- estimation_mode est_mode;
-
- resolving_mode rm;
- path_extend::pe_config::MainPEParamsT pe_params;
- path_extend::pe_config::MainPEParamsT prelim_pe_params;
- bool avoid_rc_connections;
-
- construction con;
- sensitive_mapper sensitive_map;
- distance_estimator de;
- smoothing_distance_estimator ade;
- ambiguous_distance_estimator amb_de;
- pacbio_processor pb;
- bool use_scaffolder;
- dataset ds;
- position_handler pos;
- gap_closer gc;
- graph_read_corr_cfg graph_read_corr;
- info_printers_t info_printers;
- kmer_coverage_model kcm;
- bwa_aligner bwa;
-
- size_t flanking_range;
-
- bool diploid_mode;
- bool need_mapping;
-};
-
-void load(debruijn_config& cfg, const std::string &filename);
-void load_lib_data(const std::string& prefix);
-void write_lib_data(const std::string& prefix);
-} // debruijn_graph
-
-typedef config_common::config<debruijn_graph::debruijn_config> cfg;
-
-#endif
diff --git a/src/debruijn/construction.cpp b/src/debruijn/construction.cpp
deleted file mode 100644
index 545acb9..0000000
--- a/src/debruijn/construction.cpp
+++ /dev/null
@@ -1,73 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#include "standard.hpp"
-#include "io/easy_reader.hpp"
-#include "io/vector_reader.hpp"
-#include "dataset_readers.hpp"
-#include "graph_pack.hpp"
-#include "read_converter.hpp"
-#include "omni/visualization/graph_labeler.hpp"
-
-#include "graph_construction.hpp"
-#include "stats/debruijn_stats.hpp"
-#include "positions.hpp"
-#include "construction.hpp"
-
-namespace debruijn_graph {
-
-template<class Read>
-void construct_graph(io::ReadStreamList<Read>& streams,
- conj_graph_pack& gp, io::SingleStreamPtr contigs_stream = io::SingleStreamPtr()) {
- debruijn_config::construction params = cfg::get().con;
- params.early_tc.enable &= !cfg::get().gap_closer_enable;
-
- ReadStatistics stats = ConstructGraphWithCoverage(params, streams, gp.g,
- gp.index, gp.flanking_cov, contigs_stream);
- size_t rl = stats.max_read_length_;
-
- if (!cfg::get().ds.RL()) {
- INFO("Figured out: read length = " << rl);
- cfg::get_writable().ds.set_RL(rl);
- cfg::get_writable().ds.set_aRL(1.0 * stats.bases_ / stats.reads_);
- } else if (cfg::get().ds.RL() != rl)
- WARN("In datasets.info, wrong RL is specified: " << cfg::get().ds.RL() << ", not " << rl);
-}
-
-void Construction::run(conj_graph_pack &gp, const char*) {
- // Has to be a separate stream so that it is not counted in coverage
- io::ReadStreamList<io::SingleRead> trusted_contigs;
- if (cfg::get().use_additional_contigs) {
- INFO("Contigs from previous K will be used");
- trusted_contigs.push_back(io::EasyStream(cfg::get().additional_contigs, true));
- }
-
- bool trusted_contigs_exist = false;
- for (const auto& lib : cfg::get().ds.reads) {
- if (lib.type() != io::LibraryType::TrustedContigs)
- continue;
-
- for (const auto& read : lib.single_reads()) {
- trusted_contigs.push_back(io::EasyStream(read, true));
- trusted_contigs_exist = true;
- }
- }
-
- if (trusted_contigs_exist)
- INFO("Trusted contigs will be used in graph construction");
- auto contigs_stream = MultifileWrap(trusted_contigs);
-
- std::vector<size_t> libs_for_construction;
- for (size_t i = 0; i < cfg::get().ds.reads.lib_count(); ++i)
- if (cfg::get().ds.reads[i].is_graph_contructable())
- libs_for_construction.push_back(i);
-
- auto streams = single_binary_readers_for_libs(libs_for_construction, true, true);
- construct_graph<io::SingleReadSeq>(streams, gp, contigs_stream);
-}
-
-} //namespace debruijn_graph
diff --git a/src/debruijn/construction.hpp b/src/debruijn/construction.hpp
deleted file mode 100644
index e4b365e..0000000
--- a/src/debruijn/construction.hpp
+++ /dev/null
@@ -1,23 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include "stage.hpp"
-
-namespace debruijn_graph {
-
-class Construction : public spades::AssemblyStage {
- public:
- Construction()
- : AssemblyStage("Construction", "construction") {}
-
- void run(conj_graph_pack &gp, const char*);
-};
-
-}
-
diff --git a/src/debruijn/contig_output.hpp b/src/debruijn/contig_output.hpp
deleted file mode 100644
index ee36c8d..0000000
--- a/src/debruijn/contig_output.hpp
+++ /dev/null
@@ -1,418 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include "standard.hpp"
-#include "utils.hpp"
-
-namespace debruijn_graph {
-
- // This class corrects mismatches, masks repeat differences, or applies similar fixes to the sequence of an edge
-template<class Graph>
-class ContigCorrector {
-private:
- typedef typename Graph::EdgeId EdgeId;
- const Graph &graph_;
-protected:
- const Graph &graph() const {
- return graph_;
- }
-
-public:
- ContigCorrector(const Graph &graph) : graph_(graph) {
- }
-
- virtual string correct(EdgeId e) = 0;
-
- virtual ~ContigCorrector() {
- }
-};
-
-template<class Graph>
-class DefaultContigCorrector : public ContigCorrector<Graph> {
-private:
- typedef typename Graph::EdgeId EdgeId;
-public:
- DefaultContigCorrector(const Graph &graph) : ContigCorrector<Graph>(graph) {
- }
-
- string correct(EdgeId e) {
- return this->graph().EdgeNucls(e).str();
- }
-};
-
- // This class builds a contig from corrected sequences (return it as is, find a unipath, or trim the contig)
-template<class Graph>
-class ContigConstructor {
-private:
- typedef typename Graph::EdgeId EdgeId;
- const Graph &graph_;
- ContigCorrector<Graph> &corrector_;
-protected:
- string correct(EdgeId e) {
- return corrector_.correct(e);
- }
-
- const Graph &graph() const {
- return graph_;
- }
-
-public:
-
- ContigConstructor(const Graph &graph, ContigCorrector<Graph> &corrector) : graph_(graph), corrector_(corrector) {
- }
-
- virtual pair<string, double> construct(EdgeId e) = 0;
-
- virtual ~ContigConstructor(){
- }
-};
-
-template<class Graph>
-class DefaultContigConstructor : public ContigConstructor<Graph> {
-private:
- typedef typename Graph::EdgeId EdgeId;
-public:
-
- DefaultContigConstructor(const Graph &graph, ContigCorrector<Graph> &corrector) : ContigConstructor<Graph>(graph, corrector) {
- }
-
- pair<string, double> construct(EdgeId e) {
- return make_pair(this->correct(e), this->graph().coverage(e));
- }
-};
-
-template<class Graph>
-vector<typename Graph::EdgeId> Unipath(const Graph& g, typename Graph::EdgeId e) {
- UniquePathFinder<Graph> unipath_finder(g);
- vector<typename Graph::EdgeId> answer = unipath_finder.UniquePathBackward(e);
- const vector<typename Graph::EdgeId>& forward = unipath_finder.UniquePathForward(e);
- for (size_t i = 1; i < forward.size(); ++i) {
- answer.push_back(forward[i]);
- }
- return answer;
-}
-
-template<class Graph>
-class UnipathConstructor : public ContigConstructor<Graph> {
-private:
- typedef typename Graph::EdgeId EdgeId;
-
-
-
- string MergeOverlappingSequences(std::vector<string>& ss, size_t overlap) {
- if (ss.empty()) {
- return "";
- }
- stringstream result;
- result << ss.front().substr(0, overlap);
-// prev_end = ss.front().substr(0, overlap);
- for (auto it = ss.begin(); it != ss.end(); ++it) {
-// VERIFY(prev_end == it->substr(0, overlap));
- result << it->substr(overlap);
-// prev_end = it->substr(it->size() - overlap);
- }
- return result.str();
- }
-
-
- string MergeSequences(const Graph& g,
- const vector<typename Graph::EdgeId>& continuous_path) {
- vector<string> path_sequences;
- for (size_t i = 0; i < continuous_path.size(); ++i) {
- if(i > 0)
- VERIFY(
- g.EdgeEnd(continuous_path[i - 1])
- == g.EdgeStart(continuous_path[i]));
- path_sequences.push_back(this->correct(continuous_path[i]));
- }
- return MergeOverlappingSequences(path_sequences, g.k());
- }
-
-public:
-
- UnipathConstructor(const Graph &graph, ContigCorrector<Graph> &corrector) : ContigConstructor<Graph>(graph, corrector) {
- }
-
- pair<string, double> construct(EdgeId e) {
- vector<EdgeId> unipath = Unipath(this->graph(), e);
- return make_pair(MergeSequences(this->graph(), unipath), stats::AvgCoverage(this->graph(), unipath));
- }
-};
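
MergeOverlappingSequences and MergeSequences above glue consecutive edge sequences that share a k-character overlap. The same algorithm on plain std::string is shown below as a self-contained toy; the function name and the k=3 example are illustrative only.

    #include <cassert>
    #include <string>
    #include <vector>

    // Keep the first k characters of the first piece, then append everything
    // past the k-character overlap of every piece (including the first one).
    std::string merge_overlapping(const std::vector<std::string>& pieces, size_t k) {
        if (pieces.empty())
            return "";
        std::string result = pieces.front().substr(0, k);
        for (const std::string& s : pieces)
            result += s.substr(k);
        return result;
    }

    int main() {
        // Two consecutive edges of a k=3 graph sharing the 3-mer "CGT".
        std::vector<std::string> path = { "ACGT", "CGTTA" };
        assert(merge_overlapping(path, 3) == "ACGTTA");
        return 0;
    }
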
-
-template<class Graph>
-class CuttingContigConstructor : public ContigConstructor<Graph> {
-private:
- typedef typename Graph::EdgeId EdgeId;
-
- bool ShouldCut(VertexId v) const {
- const Graph &g = this->graph();
- vector<EdgeId> edges;
- push_back_all(edges, g.OutgoingEdges(v));
- if(edges.size() == 0)
- return false;
- for(size_t i = 1; i < edges.size(); i++) {
- if(g.EdgeNucls(edges[i])[g.k()] != g.EdgeNucls(edges[0])[g.k()])
- return false;
- }
- edges.clear();
- push_back_all(edges, g.IncomingEdges(v));
- for(size_t i = 0; i < edges.size(); i++)
- for(size_t j = i + 1; j < edges.size(); j++) {
- if(g.EdgeNucls(edges[i])[g.length(edges[i]) - 1] != g.EdgeNucls(edges[j])[g.length(edges[j]) - 1])
- return true;
- }
- return false;
- }
-
-public:
-
- CuttingContigConstructor(const Graph &graph, ContigCorrector<Graph> &corrector) : ContigConstructor<Graph>(graph, corrector) {
- }
-
- pair<string, double> construct(EdgeId e) {
- string result = this->correct(e);
- if(result.size() > this->graph().k() && ShouldCut(this->graph().EdgeEnd(e))) {
- result = result.substr(0, result.size() - this->graph().k());
- }
- if(result.size() > this->graph().k() && ShouldCut(this->graph().conjugate(this->graph().EdgeStart(e)))) {
- result = result.substr(this->graph().k(), result.size());
- }
- return make_pair(result, this->graph().coverage(e));
- }
-};
-
-struct ExtendedContigIdT {
- string full_id_;
- string short_id_;
-
- ExtendedContigIdT(): full_id_(""), short_id_("") {}
-
- ExtendedContigIdT(string full_id, string short_id): full_id_(full_id), short_id_(short_id) {}
-};
-
-template <class Graph>
-void MakeContigIdMap(const Graph& graph, map<EdgeId, ExtendedContigIdT>& ids) {
- int counter = 0;
- for (auto it = graph.ConstEdgeBegin(); !it.IsEnd(); ++it) {
- EdgeId e = *it;
- if (ids.count(e) == 0) {
- string id = io::MakeContigId(++counter, graph.length(e) + graph.k(), graph.coverage(e), "EDGE");
- ids[e] = ExtendedContigIdT(id, ToString(counter) + "+");
- if (e != graph.conjugate(e))
- ids[graph.conjugate(e)] = ExtendedContigIdT(id + "'", ToString(counter) + "-");
- }
- }
-}
-
-template<class Graph>
-class ContigPrinter {
-private:
- const Graph &graph_;
- ContigConstructor<Graph> &constructor_;
-
- template<class sequence_stream>
- void ReportEdge(sequence_stream& oss
- , const pair<string, double> sequence_data) {
- oss << sequence_data.second;
- oss << sequence_data.first;
- }
-
- void ReportEdge(io::osequencestream_for_fastg& oss,
- const string& sequence,
- const string& id,
- const set<string>& nex_ids) {
- oss.set_header(id);
- oss << nex_ids;
- oss << sequence;
- }
-
-public:
- ContigPrinter(const Graph &graph, ContigConstructor<Graph> &constructor) : graph_(graph), constructor_(constructor) {
- }
-
- template<class sequence_stream>
- void PrintContigs(sequence_stream &os) {
- for (auto it = graph_.ConstEdgeBegin(true); !it.IsEnd(); ++it) {
- ReportEdge<sequence_stream>(os, constructor_.construct(*it));
- }
- }
-
- template<class sequence_stream>
- void PrintContigsFASTG(sequence_stream &os) {
- map<EdgeId, ExtendedContigIdT> ids;
- MakeContigIdMap(graph_, ids);
-
- for (auto it = graph_.SmartEdgeBegin(); !it.IsEnd(); ++it) {
- set<string> next;
- VertexId v = graph_.EdgeEnd(*it);
- auto edges = graph_.OutgoingEdges(v);
- for (auto next_it = edges.begin(); next_it != edges.end(); ++next_it) {
- next.insert(ids[*next_it].full_id_);
- }
- ReportEdge(os, constructor_.construct(*it).first, ids[*it].full_id_, next);
- //FASTG always needs both sets of edges
- //it.HandleDelete(graph_.conjugate(*it));
- }
- }
-};
-
-template<class Graph>
-bool PossibleECSimpleCheck(const Graph& g
- , typename Graph::EdgeId e) {
- return g.OutgoingEdgeCount(g.EdgeStart(e)) > 1 && g.IncomingEdgeCount(g.EdgeEnd(e)) > 1;
-}
-
-template<class Graph>
-void ReportEdge(io::osequencestream_cov& oss
- , const Graph& g
- , typename Graph::EdgeId e
- , bool output_unipath = false
- , size_t solid_edge_length_bound = 0) {
- typedef typename Graph::EdgeId EdgeId;
- if (!output_unipath || (PossibleECSimpleCheck(g, e) && g.length(e) <= solid_edge_length_bound)) {
- TRACE("Outputting edge " << g.str(e) << " as single edge");
- oss << g.coverage(e);
- oss << g.EdgeNucls(e);
- } else {
- TRACE("Outputting edge " << g.str(e) << " as part of unipath");
- vector<EdgeId> unipath = Unipath(g, e);
- TRACE("Unipath is " << g.str(unipath));
- oss << stats::AvgCoverage(g, unipath);
- TRACE("Merged sequence is of length " << MergeSequences(g, unipath).size());
- oss << MergeSequences(g, unipath);
- }
-}
-
-void OutputContigs(ConjugateDeBruijnGraph& g,
- const string& contigs_output_filename,
- bool output_unipath = false,
- size_t /*solid_edge_length_bound*/ = 0,
- bool cut_bad_connections = false) {
- INFO("Outputting contigs to " << contigs_output_filename << ".fasta");
- DefaultContigCorrector<ConjugateDeBruijnGraph> corrector(g);
- io::osequencestream_cov oss(contigs_output_filename + ".fasta");
-
- if(!output_unipath) {
- if(!cut_bad_connections) {
- DefaultContigConstructor<ConjugateDeBruijnGraph> constructor(g, corrector);
- ContigPrinter<ConjugateDeBruijnGraph>(g, constructor).PrintContigs(oss);
- } else {
- CuttingContigConstructor<ConjugateDeBruijnGraph> constructor(g, corrector);
- ContigPrinter<ConjugateDeBruijnGraph>(g, constructor).PrintContigs(oss);
- }
- } else {
- UnipathConstructor<ConjugateDeBruijnGraph> constructor(g, corrector);
- ContigPrinter<ConjugateDeBruijnGraph>(g, constructor).PrintContigs(oss);
- }
-
-// {
-// osequencestream_cov oss(contigs_output_filename);
-// set<ConjugateDeBruijnGraph::EdgeId> edges;
-// for (auto it = g.SmartEdgeBegin(); !it.IsEnd(); ++it) {
-// if (edges.count(*it) == 0) {
-// ReportEdge(oss, g, *it, output_unipath, solid_edge_length_bound + ".oppa.fasta");
-// edges.insert(g.conjugate(*it));
-// }
-// // oss << g.EdgeNucls(*it);
-// }
-// DEBUG("Contigs written");
-// }
-// if(!output_unipath) {
-// OutputContigs(g, contigs_output_filename + ".2.fasta", true, solid_edge_length_bound);
-// }
-}
-
-void OutputContigsToFASTG(ConjugateDeBruijnGraph& g,
- const string& contigs_output_filename) {
-
- INFO("Outputting graph to " << contigs_output_filename << ".fastg");
- DefaultContigCorrector<ConjugateDeBruijnGraph> corrector(g);
- DefaultContigConstructor<ConjugateDeBruijnGraph> constructor(g, corrector);
- io::osequencestream_for_fastg ossfg(contigs_output_filename + ".fastg");
- ContigPrinter<ConjugateDeBruijnGraph>(g, constructor).PrintContigsFASTG(ossfg);
-}
-
-
-
-
- bool ShouldCut(ConjugateDeBruijnGraph& g, VertexId v) {
- vector<EdgeId> edges;
- push_back_all(edges, g.OutgoingEdges(v));
-
- if(edges.size() == 0)
- return false;
- for(size_t i = 1; i < edges.size(); i++) {
- if(g.EdgeNucls(edges[i])[g.k()] != g.EdgeNucls(edges[0])[g.k()])
- return false;
- }
- edges.clear();
- push_back_all(edges, g.IncomingEdges(v));
- for(size_t i = 0; i < edges.size(); i++)
- for(size_t j = i + 1; j < edges.size(); j++) {
- if(g.EdgeNucls(edges[i])[g.length(edges[i]) - 1] != g.EdgeNucls(edges[j])[g.length(edges[j]) - 1])
- return true;
- }
- return false;
-}
-
-void OutputCutContigs(ConjugateDeBruijnGraph& g,
- const string& contigs_output_filename,
- bool /*output_unipath*/ = false,
- size_t /*solid_edge_length_bound*/ = 0) {
- INFO("Outputting contigs to " << contigs_output_filename);
- DefaultContigCorrector<ConjugateDeBruijnGraph> corrector(g);
- io::osequencestream_cov oss(contigs_output_filename);
- CuttingContigConstructor<ConjugateDeBruijnGraph> constructor(g, corrector);
- ContigPrinter<ConjugateDeBruijnGraph>(g, constructor).PrintContigs(oss);
-
-// osequencestream_cov oss(contigs_output_filename);
-// set<ConjugateDeBruijnGraph::EdgeId> edges;
-// for (auto it = g.SmartEdgeBegin(); !it.IsEnd(); ++it) {
-// EdgeId e = *it;
-// cout << g.length(e) << endl;
-// if (edges.count(e) == 0) {
-// Sequence s = g.EdgeNucls(e);
-// cout << s.size() << endl;
-// cout << "oppa " << ShouldCut(g, g.EdgeEnd(e)) << endl;
-// if(s.size() > g.k() && ShouldCut(g, g.EdgeEnd(e))) {
-// s = s.Subseq(0, s.size() - g.k());
-// cout << s.size() << endl;
-// }
-// cout << "oppa1 " << ShouldCut(g, g.conjugate(g.EdgeStart(e))) << endl;
-// if(s.size() > g.k() && ShouldCut(g, g.conjugate(g.EdgeStart(e)))) {
-// s = s.Subseq(g.k(), s.size());
-// cout << s.size() << endl;
-// }
-// oss << g.coverage(e);
-// oss << s;
-// edges.insert(g.conjugate(*it));
-// }
-// // oss << g.EdgeNucls(*it);
-// }
-}
-
-void OutputSingleFileContigs(ConjugateDeBruijnGraph& g,
- const string& contigs_output_dir) {
- INFO("Outputting contigs to " << contigs_output_dir);
- int n = 0;
- make_dir(contigs_output_dir);
- char n_str[20];
- set<ConjugateDeBruijnGraph::EdgeId> edges;
- for (auto it = g.SmartEdgeBegin(); !it.IsEnd(); ++it) {
- if (edges.count(*it) == 0) {
- sprintf(n_str, "%d.fa", n);
- edges.insert(g.conjugate(*it));
- io::osequencestream oss(contigs_output_dir + n_str);
- oss << g.EdgeNucls(*it);
- n++;
- }
- }
- DEBUG("SingleFileContigs(Conjugate) written");
-}
-
-}
diff --git a/src/debruijn/dataset_readers.hpp b/src/debruijn/dataset_readers.hpp
deleted file mode 100644
index 2d69b6e..0000000
--- a/src/debruijn/dataset_readers.hpp
+++ /dev/null
@@ -1,122 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include "logger/logger.hpp"
-#include "simple_tools.hpp"
-#include "io/io_helper.hpp"
-#include "io/library.hpp"
-
-#include "config_struct.hpp"
-
-namespace debruijn_graph {
-
-inline
-io::PairedStreamPtr paired_easy_reader(const io::SequencingLibrary<debruijn_config::DataSetData> &lib,
- bool followed_by_rc,
- size_t insert_size,
- bool change_read_order = false,
- bool use_orientation = true,
- io::OffsetType offset_type = io::PhredOffset) {
- io::ReadStreamList<io::PairedRead> streams;
- for (auto read_pair : lib.paired_reads()) {
- streams.push_back(io::PairedEasyStream(read_pair.first, read_pair.second, followed_by_rc, insert_size, change_read_order,
- use_orientation, lib.orientation(), offset_type));
- }
- return io::MultifileWrap<io::PairedRead>(streams);
-}
-
-inline
-io::ReadStreamList<io::SingleRead> single_easy_readers(const io::SequencingLibrary<debruijn_config::DataSetData> &lib,
- bool followed_by_rc,
- bool including_paired_reads,
- bool handle_Ns = true,
- io::OffsetType offset_type = io::PhredOffset) {
- io::ReadStreamList<io::SingleRead> streams;
- if (including_paired_reads) {
- for (const auto& read : lib.reads()) {
- //do we need input_file function here?
- streams.push_back(io::EasyStream(read, followed_by_rc, handle_Ns, offset_type));
- }
- } else {
- for (const auto& read : lib.single_reads()) {
- streams.push_back(io::EasyStream(read, followed_by_rc, handle_Ns, offset_type));
- }
- }
- return streams;
-}
-
-inline
-io::SingleStreamPtr single_easy_reader(const io::SequencingLibrary<debruijn_config::DataSetData> &lib,
- bool followed_by_rc,
- bool including_paired_reads,
- bool handle_Ns = true,
- io::OffsetType offset_type = io::PhredOffset) {
- return io::MultifileWrap<io::SingleRead>(
- single_easy_readers(lib, followed_by_rc, including_paired_reads, handle_Ns, offset_type));
-}
-
-inline
-io::PairedStreamPtr paired_easy_reader_for_libs(std::vector<size_t> libs,
- bool followed_by_rc,
- size_t insert_size,
- bool change_read_order = false,
- bool use_orientation = true,
- io::OffsetType offset_type = io::PhredOffset) {
- io::ReadStreamList<io::PairedRead> streams;
- for (size_t i = 0; i < libs.size(); ++i) {
- streams.push_back(paired_easy_reader(cfg::get().ds.reads[libs[i]],
- followed_by_rc, insert_size, change_read_order, use_orientation, offset_type));
- }
- return io::MultifileWrap<io::PairedRead>(streams);
-}
-
-
-inline
-io::PairedStreamPtr paired_easy_reader(bool followed_by_rc,
- size_t insert_size,
- bool change_read_order = false,
- bool use_orientation = true,
- io::OffsetType offset_type = io::PhredOffset) {
-
- std::vector<size_t> all_libs(cfg::get().ds.reads.lib_count());
- for (size_t i = 0; i < cfg::get().ds.reads.lib_count(); ++i)
- all_libs[i] = i;
-
- // FIXME: Should we use only first library?
- // No, this one is for all libs together
- return paired_easy_reader_for_libs(all_libs, followed_by_rc, insert_size, change_read_order, use_orientation, offset_type);
-}
-
-
-inline
-io::SingleStreamPtr single_easy_reader_for_libs(vector<size_t> libs,
- bool followed_by_rc,
- bool including_paired_reads,
- io::OffsetType offset_type = io::PhredOffset) {
- io::ReadStreamList<io::SingleRead> streams;
- for (size_t i = 0; i < libs.size(); ++i) {
- streams.push_back(single_easy_reader(cfg::get().ds.reads[libs[i]],
- followed_by_rc, including_paired_reads, offset_type));
- }
- return io::MultifileWrap<io::SingleRead>(streams);
-}
-
-inline
-io::SingleStreamPtr single_easy_reader(bool followed_by_rc,
- bool including_paired_reads,
- io::OffsetType offset_type = io::PhredOffset) {
-
- std::vector<size_t> all_libs(cfg::get().ds.reads.lib_count());
- for (size_t i = 0; i < cfg::get().ds.reads.lib_count(); ++i)
- all_libs[i] = i;
-
- return single_easy_reader_for_libs(all_libs, followed_by_rc, including_paired_reads, offset_type);
-}
-
-}
diff --git a/src/debruijn/debruijn debug.launch.template b/src/debruijn/debruijn debug.launch.template
deleted file mode 100644
index 1ab5a89..0000000
--- a/src/debruijn/debruijn debug.launch.template
+++ /dev/null
@@ -1,32 +0,0 @@
-<?xml version="1.0" encoding="UTF-8" standalone="no"?>
-<launchConfiguration type="org.eclipse.cdt.launch.applicationLaunchType">
-<booleanAttribute key="org.eclipse.cdt.dsf.gdb.AUTO_SOLIB" value="true"/>
-<listAttribute key="org.eclipse.cdt.dsf.gdb.AUTO_SOLIB_LIST"/>
-<stringAttribute key="org.eclipse.cdt.dsf.gdb.DEBUG_NAME" value="gdb"/>
-<booleanAttribute key="org.eclipse.cdt.dsf.gdb.DEBUG_ON_FORK" value="false"/>
-<stringAttribute key="org.eclipse.cdt.dsf.gdb.GDB_INIT" value=".gdbinit"/>
-<booleanAttribute key="org.eclipse.cdt.dsf.gdb.NON_STOP" value="false"/>
-<booleanAttribute key="org.eclipse.cdt.dsf.gdb.REVERSE" value="false"/>
-<listAttribute key="org.eclipse.cdt.dsf.gdb.SOLIB_PATH"/>
-<booleanAttribute key="org.eclipse.cdt.dsf.gdb.UPDATE_THREADLIST_ON_SUSPEND" value="false"/>
-<booleanAttribute key="org.eclipse.cdt.dsf.gdb.internal.ui.launching.LocalApplicationCDebuggerTab.DEFAULTS_SET" value="true"/>
-<intAttribute key="org.eclipse.cdt.launch.ATTR_BUILD_BEFORE_LAUNCH_ATTR" value="2"/>
-<stringAttribute key="org.eclipse.cdt.launch.COREFILE_PATH" value=""/>
-<stringAttribute key="org.eclipse.cdt.launch.DEBUGGER_ID" value="gdb"/>
-<stringAttribute key="org.eclipse.cdt.launch.DEBUGGER_START_MODE" value="run"/>
-<booleanAttribute key="org.eclipse.cdt.launch.DEBUGGER_STOP_AT_MAIN" value="true"/>
-<stringAttribute key="org.eclipse.cdt.launch.DEBUGGER_STOP_AT_MAIN_SYMBOL" value="main"/>
-<stringAttribute key="org.eclipse.cdt.launch.PROGRAM_NAME" value="../../build/debug/debruijn/debruijn"/>
-<stringAttribute key="org.eclipse.cdt.launch.PROJECT_ATTR" value="debruijn"/>
-<booleanAttribute key="org.eclipse.cdt.launch.PROJECT_BUILD_CONFIG_AUTO_ATTR" value="true"/>
-<stringAttribute key="org.eclipse.cdt.launch.PROJECT_BUILD_CONFIG_ID_ATTR" value=""/>
-<stringAttribute key="org.eclipse.cdt.launch.WORKING_DIRECTORY" value="${workspace_loc:debruijn}/../../"/>
-<booleanAttribute key="org.eclipse.cdt.launch.use_terminal" value="true"/>
-<listAttribute key="org.eclipse.debug.core.MAPPED_RESOURCE_PATHS">
-<listEntry value="/debruijn"/>
-</listAttribute>
-<listAttribute key="org.eclipse.debug.core.MAPPED_RESOURCE_TYPES">
-<listEntry value="4"/>
-</listAttribute>
-<stringAttribute key="org.eclipse.dsf.launch.MEMORY_BLOCKS" value="<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<memoryBlockExpressionList context="reserved-for-future-use"/>
"/>
-</launchConfiguration>
diff --git a/src/debruijn/debruijn release.launch.template b/src/debruijn/debruijn release.launch.template
deleted file mode 100644
index 40abc9e..0000000
--- a/src/debruijn/debruijn release.launch.template
+++ /dev/null
@@ -1,17 +0,0 @@
-<?xml version="1.0" encoding="UTF-8" standalone="no"?>
-<launchConfiguration type="org.eclipse.cdt.launch.applicationLaunchType">
-<intAttribute key="org.eclipse.cdt.launch.ATTR_BUILD_BEFORE_LAUNCH_ATTR" value="2"/>
-<stringAttribute key="org.eclipse.cdt.launch.COREFILE_PATH" value=""/>
-<stringAttribute key="org.eclipse.cdt.launch.PROGRAM_NAME" value="../../build/release/debruijn/debruijn"/>
-<stringAttribute key="org.eclipse.cdt.launch.PROJECT_ATTR" value="debruijn"/>
-<booleanAttribute key="org.eclipse.cdt.launch.PROJECT_BUILD_CONFIG_AUTO_ATTR" value="true"/>
-<stringAttribute key="org.eclipse.cdt.launch.PROJECT_BUILD_CONFIG_ID_ATTR" value=""/>
-<stringAttribute key="org.eclipse.cdt.launch.WORKING_DIRECTORY" value="${workspace_loc:debruijn}/../../"/>
-<booleanAttribute key="org.eclipse.cdt.launch.use_terminal" value="true"/>
-<listAttribute key="org.eclipse.debug.core.MAPPED_RESOURCE_PATHS">
-<listEntry value="/debruijn"/>
-</listAttribute>
-<listAttribute key="org.eclipse.debug.core.MAPPED_RESOURCE_TYPES">
-<listEntry value="4"/>
-</listAttribute>
-</launchConfiguration>
diff --git a/src/debruijn/debruijn_data.hpp b/src/debruijn/debruijn_data.hpp
deleted file mode 100644
index abfa4d6..0000000
--- a/src/debruijn/debruijn_data.hpp
+++ /dev/null
@@ -1,169 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include <vector>
-#include <set>
-#include <cstring>
-#include "verify.hpp"
-#include "logger/logger.hpp"
-#include "sequence/sequence_tools.hpp"
-
-namespace omnigraph {
-class DeBruijnMaster;
-
-class DeBruijnVertexData {
- friend class DeBruijnMaster;
-public:
- DeBruijnVertexData() {
-
- }
-};
-
-class CoverageData {
- private:
- unsigned coverage_;
-
- public:
- CoverageData()
- : coverage_(0) {
- }
-
- void inc_coverage(int value) {
- VERIFY(value >= 0 || coverage_ > unsigned(-value));
- coverage_ += value;
- }
-
- void set_coverage(unsigned coverage) {
- coverage_ = coverage;
- }
-
- //not length normalized
- unsigned coverage() const {
- return coverage_;
- }
-};
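
CoverageData::inc_coverage above takes a signed increment on an unsigned counter and uses VERIFY to guard against underflow. A toy equivalent with a plain assert is sketched below; the names are invented.

    #include <cassert>

    // Negative increments are allowed only while they cannot drive the
    // unsigned counter below zero, mirroring the VERIFY condition above.
    struct counter {
        unsigned value = 0;
        void inc(int delta) {
            assert(delta >= 0 || value > unsigned(-delta));
            value += delta;  // well-defined: delta converts to unsigned (modular arithmetic)
        }
    };

    int main() {
        counter c;
        c.inc(5);
        c.inc(-2);
        assert(c.value == 3);
        return 0;
    }
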
-
-class DeBruijnEdgeData {
- friend class DeBruijnMaster;
- CoverageData coverage_;
- CoverageData flanking_cov_;
- Sequence nucls_;
-public:
-
- DeBruijnEdgeData(const Sequence &nucls) :
- nucls_(nucls) {
- }
-
- const Sequence& nucls() const {
- return nucls_;
- }
-
- void inc_raw_coverage(int value) {
- coverage_.inc_coverage(value);
- }
-
- void set_raw_coverage(unsigned coverage) {
- coverage_.set_coverage(coverage);
- }
-
- unsigned raw_coverage() const {
- return coverage_.coverage();
- }
-
- void inc_flanking_coverage(int value) {
- flanking_cov_.inc_coverage(value);
- }
-
- void set_flanking_coverage(unsigned flanking_coverage) {
- flanking_cov_.set_coverage(flanking_coverage);
- }
-
- //not length normalized
- unsigned flanking_coverage() const {
- return flanking_cov_.coverage();
- }
-
- size_t size() const {
- return nucls_.size();
- }
-};
-
-class DeBruijnDataMaster {
-private:
- const size_t k_;
-
-public:
- typedef DeBruijnVertexData VertexData;
- typedef DeBruijnEdgeData EdgeData;
-
- DeBruijnDataMaster(size_t k) :
- k_(k) {
- }
-
- const EdgeData MergeData(const std::vector<const EdgeData*>& to_merge, bool safe_merging = true) const;
-
- std::pair<VertexData, std::pair<EdgeData, EdgeData>> SplitData(const EdgeData& edge, size_t position, bool is_self_conj = false) const;
-
- EdgeData GlueData(const EdgeData&, const EdgeData& data2) const;
-
- bool isSelfConjugate(const EdgeData &data) const {
- return data.nucls() == !(data.nucls());
- }
-
- EdgeData conjugate(const EdgeData &data) const {
- return EdgeData(!(data.nucls()));
- }
-
- VertexData conjugate(const VertexData & /*data*/) const {
- return VertexData();
- }
-
- size_t length(const EdgeData& data) const {
- return data.nucls().size() - k_;
- }
-
- size_t length(const VertexData& ) const {
- return k_;
- }
-
- size_t k() const {
- return k_;
- }
-
-};
-
-//typedef DeBruijnVertexData VertexData;
-//typedef DeBruijnEdgeData EdgeData;
-//typedef DeBruijnDataMaster DataMaster;
-
-inline const DeBruijnEdgeData DeBruijnDataMaster::MergeData(const std::vector<const DeBruijnEdgeData*>& to_merge, bool safe_merging) const {
- std::vector<Sequence> ss;
- ss.reserve(to_merge.size());
- for (auto it = to_merge.begin(); it != to_merge.end(); ++it) {
- ss.push_back((*it)->nucls());
- }
- return EdgeData(MergeOverlappingSequences(ss, k_, safe_merging));
-}
-
-inline std::pair<DeBruijnVertexData, std::pair<DeBruijnEdgeData, DeBruijnEdgeData>> DeBruijnDataMaster::SplitData(const EdgeData& edge,
- size_t position,
- bool is_self_conj) const {
- const Sequence& nucls = edge.nucls();
- size_t end = nucls.size();
- if (is_self_conj) {
- VERIFY(position < end);
- end -= position;
- }
- return std::make_pair(VertexData(), std::make_pair(EdgeData(edge.nucls().Subseq(0, position + k_)), EdgeData(nucls.Subseq(position, end))));
-}
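
SplitData above cuts an edge at a given position so that the two resulting edges still share k characters around the new vertex. The same arithmetic on a plain string, for the non-self-conjugate case, is shown below; the values are illustrative.

    #include <cassert>
    #include <string>

    int main() {
        const size_t k = 3;
        std::string nucls = "ACGTACGT";  // an edge of length nucls.size() - k = 5
        size_t position = 2;             // split after the second edge position

        std::string left  = nucls.substr(0, position + k);           // "ACGTA"
        std::string right = nucls.substr(position);                  // "GTACGT"
        assert(left.substr(left.size() - k) == right.substr(0, k));  // the k-overlap is preserved
        return 0;
    }
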
-
-inline DeBruijnEdgeData DeBruijnDataMaster::GlueData(const DeBruijnEdgeData&, const DeBruijnEdgeData& data2) const {
- return data2;
-}
-
-}
diff --git a/src/debruijn/debruijn_graph.hpp b/src/debruijn/debruijn_graph.hpp
deleted file mode 100644
index 716c41d..0000000
--- a/src/debruijn/debruijn_graph.hpp
+++ /dev/null
@@ -1,110 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include "omni/observable_graph.hpp"
-#include "omni/coverage.hpp"
-#include "omni/id_track_handler.hpp"
-#include "debruijn_data.hpp"
-
-namespace debruijn_graph {
-
-class DeBruijnGraph: public omnigraph::ObservableGraph<DeBruijnDataMaster> {
-public:
- typedef omnigraph::ObservableGraph<DeBruijnDataMaster> base;
- typedef base::DataMasterT DataMasterT;
- typedef base::VertexData VertexData;
- typedef base::EdgeData EdgeData;
- typedef base::EdgeId EdgeId;
- typedef base::VertexId VertexId;
- typedef base::VertexIt VertexIt;
- typedef VertexIt VertexIterator;
- typedef VertexIterator iterator; // for for_each
- typedef const VertexIterator const_iterator; // for for_each
-private:
- CoverageIndex<DeBruijnGraph> coverage_index_;
-
-public:
- DeBruijnGraph(size_t k) :
- base(k), coverage_index_(*this) {
- }
-
- CoverageIndex<DeBruijnGraph>& coverage_index() {
- return coverage_index_;
- }
-
- const CoverageIndex<DeBruijnGraph>& coverage_index() const {
- return coverage_index_;
- }
-
- /**
- * Method returns average coverage of the edge
- */
- double coverage(EdgeId edge) const {
- return coverage_index_.coverage(edge);
- }
-
- using base::AddVertex;
- using base::AddEdge;
-
- VertexId AddVertex() {
- return AddVertex(VertexData());
- }
-
- EdgeId AddEdge(VertexId from, VertexId to, const Sequence &nucls) {
- VERIFY(nucls.size() > k());
- return AddEdge(from, to, EdgeData(nucls));
- }
-
- size_t k() const {
- return master().k();
- }
-
- /**
- * Method returns Sequence stored in the edge
- */
- const Sequence& EdgeNucls(EdgeId edge) const {
- return this->data(edge).nucls();
- }
-
- const Sequence VertexNucls(VertexId v) const {
- //todo add verify on vertex nucls consistency
- if (this->OutgoingEdgeCount(v) > 0) {
- return EdgeNucls(*(this->out_begin(v))).Subseq(0, k());
- } else if (this->IncomingEdgeCount(v) > 0) {
- EdgeId inc = *(this->in_begin(v));
- size_t length = EdgeNucls(inc).size();
- return EdgeNucls(inc).Subseq(length - k(), length);
- }
- VERIFY(false);
- return Sequence();
- }
-
- Sequence PathNucls(const vector<EdgeId> &path) const {
- if(path.empty())
- return Sequence("");
- SequenceBuilder result;
- result.append(Sequence(""));
- result.append(this->EdgeNucls(path[0]).Subseq(0, this->k()));
- for (size_t i = 0; i < path.size(); ++i) {
- result.append(this->EdgeNucls(path[i]).Subseq(this->k()));
- }
-
- return result.BuildSequence();
- }
-
-private:
- DECL_LOGGER("DeBruijnGraph")
-};
-
-typedef DeBruijnGraph ConjugateDeBruijnGraph;
-
-typedef ConjugateDeBruijnGraph Graph;
-typedef Graph::EdgeId EdgeId;
-typedef Graph::VertexId VertexId;
-}
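
For orientation, a minimal usage sketch of the DeBruijnGraph interface removed above; the include path, k value and nucleotide literal are illustrative assumptions, not part of this commit.

    #include "debruijn_graph.hpp"  // header path as it existed before this commit

    using namespace debruijn_graph;

    void TinyGraphSketch() {
        const size_t k = 3;
        DeBruijnGraph g(k);                      // Graph == ConjugateDeBruijnGraph == DeBruijnGraph
        VertexId v1 = g.AddVertex();             // vertices carry no data (VertexData())
        VertexId v2 = g.AddVertex();
        // Edge sequences must be longer than k (see the VERIFY in AddEdge above).
        EdgeId e = g.AddEdge(v1, v2, Sequence("ACGTA"));
        const Sequence &nucls = g.EdgeNucls(e);  // the stored "ACGTA"
        Sequence path = g.PathNucls({e});        // reconstructs nucleotides along a path of edges
        (void) nucls; (void) path;
    }
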
diff --git a/src/debruijn/debruijn_graph_constructor.hpp b/src/debruijn/debruijn_graph_constructor.hpp
deleted file mode 100644
index 6e19d17..0000000
--- a/src/debruijn/debruijn_graph_constructor.hpp
+++ /dev/null
@@ -1,556 +0,0 @@
-#pragma once
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-/*
- * debruijn_graph_constructor.hpp
- *
- * Created on: Apr 5, 2011
- * Author: sergey
- */
-
-#include "utils.hpp"
-#include "debruijn_graph.hpp"
-#include "omni/construction_helper.hpp"
-#include "standard_base.hpp"
-#include "indices/kmer_extension_index.hpp"
-#include "openmp_wrapper.h"
-#include "parallel_wrapper.hpp"
-
-namespace debruijn_graph {
-
-/*
- * Constructs a DeBruijnGraph from a DeBruijn index using "new DeBruijnGraphConstructor(DeBruijn).ConstructGraph(DeBruijnGraph, Index)"
- */
-template<class Graph, class Index>
-class DeBruijnGraphConstructor {
-private:
- typedef typename Graph::EdgeId EdgeId;
- typedef Index DeBruijn;
- typedef typename Graph::VertexId VertexId;
- typedef typename Index::KMer Kmer;
- typedef typename DeBruijn::KeyWithHash KeyWithHash;
- typedef typename DeBruijn::kmer_iterator kmer_iterator;
-
- Graph &graph_;
- DeBruijn &origin_;
- size_t kmer_size_;
-
- bool StepRightIfPossible(KeyWithHash &kwh) {
- // VERIFY(origin_.contains(edge));
- if (origin_.RivalEdgeCount(kwh) == 1
- && origin_.NextEdgeCount(kwh) == 1) {
- kwh = origin_.NextEdge(kwh);
- // VERIFY(origin_.contains(next_edge));
- return true;
- }
- return false;
- }
-
- KeyWithHash &GoRight(KeyWithHash &kwh) {
- KeyWithHash initial = kwh;
- while (StepRightIfPossible(kwh) && kwh != initial) {
- ;
- }
- return kwh;
- }
-
- KeyWithHash &GoLeft(KeyWithHash &kwh) {
- //These strange things are in order to avoid making copies of kwh
- kwh = !kwh;
- kwh = !GoRight(kwh);
- return kwh;
- }
-
- Sequence ConstructSeqGoingRight(KeyWithHash &kwh) {
- SequenceBuilder s;
- s.append(kwh.key());
- KeyWithHash initial = kwh;
- while (StepRightIfPossible(kwh) && kwh != initial) {
- s.append(kwh[kmer_size_]);
- }
- return s.BuildSequence();
- }
-
- Sequence ConstructSequenceWithEdge(const KeyWithHash &kwh) {
- KeyWithHash tmp = kwh;
- return ConstructSeqGoingRight(GoLeft(tmp));
- }
-
- VertexId FindVertexByOutgoingEdges(Kmer kmer) {
- for (char c = 0; c < 4; ++c) {
- KeyWithHash edge = origin_.ConstructKWH(kmer.pushBack(c));
- if (origin_.contains(edge))
- return graph_.EdgeStart(origin_.get_value(edge).edge_id);
- }
- return VertexId(NULL);
- }
-
- VertexId FindVertexByIncomingEdges(Kmer kmer) {
- for (char c = 0; c < 4; ++c) {
- KeyWithHash edge = origin_.ConstructKWH(kmer.pushFront(c));
- if (origin_.contains(edge)) {
- return graph_.EdgeEnd(origin_.get_value(edge).edge_id);
- }
- }
- return VertexId(NULL);
- }
-
- VertexId FindVertex(Kmer kmer) {
- VertexId v = FindVertexByOutgoingEdges(kmer);
- return v == VertexId(NULL) ? FindVertexByIncomingEdges(kmer) : v;
- }
-
- VertexId FindVertexMaybeMissing(Kmer kmer) {
- VertexId v = FindVertex(kmer);
- return v != VertexId(NULL) ? v : graph_.AddVertex();
- }
-
- VertexId FindEndMaybeMissing(const ConjugateDeBruijnGraph& graph,
- VertexId start, Kmer start_kmer, Kmer end_kmer) {
- if (start_kmer == end_kmer) {
- return start;
- } else if (start_kmer == !end_kmer) {
- return graph.conjugate(start);
- } else {
- return FindVertexMaybeMissing(end_kmer);
- }
- }
-
- void ConstructPart(const std::vector<KeyWithHash>& kwh_list,
- std::vector<Sequence>& sequences) {
- for (size_t i = 0; i < sequences.size(); ++i) {
- if (origin_.contains(kwh_list[i])) {
- continue;
- }
-
- Kmer start_kmer = sequences[i].start < Kmer > (kmer_size_);
- Kmer end_kmer = sequences[i].end < Kmer > (kmer_size_);
-
- VertexId start = FindVertexMaybeMissing(start_kmer);
- VertexId end = FindEndMaybeMissing(graph_, start, start_kmer,
- end_kmer);
-
- graph_.AddEdge(start, end, sequences[i]);
- }
- }
-
- void AddKmers(kmer_iterator &it, kmer_iterator &end, size_t queueSize,
- std::vector<KeyWithHash>& kwh_list) {
- for (; kwh_list.size() != queueSize && it != end; ++it) {
- KeyWithHash kwh = origin_.ConstructKWH(Kmer(unsigned(kmer_size_ + 1), (*it).data()));
-
- if (!origin_.contains(kwh))
- kwh_list.push_back(kwh);
- }
- }
-
- void CalculateSequences(std::vector<KeyWithHash> &kwh_list,
- std::vector<Sequence> &sequences) {
- size_t size = kwh_list.size();
- sequences.resize(size);
-
-# pragma omp parallel for schedule(guided)
- for (size_t i = 0; i < size; ++i) {
- sequences[i] = ConstructSequenceWithEdge(kwh_list[i]);
- }
- }
-
-public:
- DeBruijnGraphConstructor(Graph& graph, DeBruijn &origin) :
- graph_(graph), origin_(origin), kmer_size_(graph_.k()) {
- }
-
- void ConstructGraph(size_t queueMinSize, size_t queueMaxSize,
- double queueGrowthRate) {
- kmer_iterator it = origin_.kmer_begin();
- kmer_iterator end = origin_.kmer_end();
- size_t queueSize = queueMinSize;
- std::vector<KeyWithHash> kwh_list;
- std::vector<Sequence> sequences;
- kwh_list.reserve(queueSize);
- sequences.reserve(queueMaxSize);
- while (it != end) {
-            AddKmers(it, end, queueSize, kwh_list); // form a queue of kmers that are not in the index
- CalculateSequences(kwh_list, sequences); // in parallel
- ConstructPart(kwh_list, sequences);
- kwh_list.clear();
- queueSize = min(size_t(double(queueSize) * queueGrowthRate), queueMaxSize);
- }
- }
-
-private:
- DECL_LOGGER("DeBruijnGraphConstructor")
-};
-
-class UnbranchingPathFinder {
-private:
- typedef DeBruijnExtensionIndex<> Index;
- typedef runtime_k::RtSeq Kmer;
- typedef Index::kmer_iterator kmer_iterator;
- typedef Index::KeyWithHash KeyWithHash;
- typedef Index::DeEdge DeEdge;
-
- Index &origin_;
- size_t kmer_size_;
- bool clean_condensed_;
-
-
-public:
- UnbranchingPathFinder(Index &origin, size_t kmer_size) : origin_(origin), kmer_size_(kmer_size) {
- }
-
- bool StepRightIfPossible(DeEdge &edge) {
- if (origin_.CheckUniqueOutgoing(edge.end) && origin_.CheckUniqueIncoming(edge.end)) {
- edge = DeEdge(edge.end, origin_.GetUniqueOutgoing(edge.end));
- return true;
- }
- return false;
- }
-
- Sequence ConstructSeqGoingRight(DeEdge edge) {
- SequenceBuilder s;
- s.append(edge.start.key());
- s.append(edge.end[kmer_size_ - 1]);
- DeEdge initial = edge;
- while (StepRightIfPossible(edge) && edge != initial) {
- s.append(edge.end[kmer_size_ - 1]);
- }
- return s.BuildSequence();
- }
-
- Sequence ConstructSequenceWithEdge(DeEdge edge) {
- return ConstructSeqGoingRight(edge);
- }
-
-//TODO Think about what happens to self-rc perfect loops
- Sequence ConstructLoopFromVertex(const KeyWithHash &kh) {
- DeEdge break_point(kh, origin_.GetUniqueOutgoing(kh));
- Sequence result = ConstructSequenceWithEdge(break_point);
- if (clean_condensed_)
- origin_.IsolateVertex(kh);
- return result;
- }
-};
-
-class UnbranchingPathExtractor {
-private:
- typedef DeBruijnExtensionIndex<> Index;
- typedef runtime_k::RtSeq Kmer;
- typedef Index::kmer_iterator kmer_iterator;
- typedef Index::DeEdge DeEdge;
- typedef Index::KeyWithHash KeyWithHash;
-
- Index &origin_;
- size_t kmer_size_;
-
- bool IsJunction(KeyWithHash kh) const {
- return !(origin_.CheckUniqueOutgoing(kh) && origin_.CheckUniqueIncoming(kh));
- }
-
- void AddStartDeEdgesForVertex(KeyWithHash kh, std::vector<DeEdge>& start_edges) const {
- for (char next = 0; next < 4; next++) {
- if (origin_.CheckOutgoing(kh, next)) {
- TRACE("Added to queue " << DeEdge(kh, origin_.GetOutgoing(kh, next)));
- start_edges.push_back(DeEdge(kh, origin_.GetOutgoing(kh, next)));
- }
- }
- }
-
- void AddStartDeEdges(kmer_iterator &it, size_t queueSize,
- std::vector<DeEdge>& start_edges) const {
- for (; start_edges.size() < queueSize && it.good(); ++it) {
- KeyWithHash kh = origin_.ConstructKWH(Kmer(kmer_size_, *it));
- if (IsJunction(kh)) {
- AddStartDeEdgesForVertex(kh, start_edges);
- KeyWithHash kh_inv = !kh;
- if(!(kh_inv.is_minimal())) {
- AddStartDeEdgesForVertex(kh_inv, start_edges);
- }
- }
- }
- }
-
- void CalculateSequences(std::vector<DeEdge> &edges,
- std::vector<Sequence> &sequences, UnbranchingPathFinder &finder) const {
- size_t size = edges.size();
- size_t start = sequences.size();
- sequences.resize(start + size);
-
-# pragma omp parallel for schedule(guided)
- for (size_t i = 0; i < size; ++i) {
- sequences[start + i] = finder.ConstructSequenceWithEdge(edges[i]);
- TRACE("From " << edges[i] << " calculated sequence");
- TRACE(sequences[start + i]);
- }
- }
-
- void CleanCondensed(const Sequence &sequence) {
- Kmer kmer = sequence.start<Kmer>(kmer_size_);
- KeyWithHash kwh = origin_.ConstructKWH(kmer);
- origin_.IsolateVertex(kwh);
- for(size_t pos = kmer_size_; pos < sequence.size(); pos++) {
- kwh = kwh << sequence[pos];
- origin_.IsolateVertex(kwh);
- }
- }
-
- void CleanCondensed(const std::vector<Sequence> &sequences) {
-# pragma omp parallel for schedule(guided)
- for (size_t i = 0; i < sequences.size(); ++i) {
- CleanCondensed(sequences[i]);
- }
- }
-
-    //This method collects all loops that were not extracted during unbranching path search because loops contain no junctions.
- //TODO make parallel
- const std::vector<Sequence> CollectLoops() {
- INFO("Collecting perfect loops");
- UnbranchingPathFinder finder(origin_, kmer_size_);
- std::vector<Sequence> result;
- for (kmer_iterator it = origin_.kmer_begin(); it.good(); ++it) {
- KeyWithHash kh = origin_.ConstructKWH(Kmer(kmer_size_, *it));
- if (!IsJunction(kh)) {
- Sequence loop = finder.ConstructLoopFromVertex(kh);
- result.push_back(loop);
- CleanCondensed(loop);
- if(loop != (!loop)) {
- CleanCondensed(!loop);
- result.push_back(!loop);
- }
- }
- }
- INFO("Collecting perfect loops finished. " << result.size() << " loops collected");
- return result;
- }
-
-public:
- UnbranchingPathExtractor(Index &origin, size_t k) : origin_(origin), kmer_size_(k) {
- }
-
-    //TODO a very large vector is returned here, but fixing this would require awkward interface changes.
- const std::vector<Sequence> ExtractUnbranchingPaths(size_t queueMinSize, size_t queueMaxSize,
- double queueGrowthRate) {
- INFO("Extracting unbranching paths");
- UnbranchingPathFinder finder(origin_, kmer_size_);
- std::vector<Sequence> result;
- size_t queueSize = queueMinSize;
- std::vector<DeEdge> start_edges;
- std::vector<Sequence> sequences;
- start_edges.reserve(queueSize);
- auto it = origin_.kmer_begin();
- while (it.good()) {
-            AddStartDeEdges(it, queueSize, start_edges); // form a queue of junction kmers
- CalculateSequences(start_edges, sequences, finder); // in parallel
- start_edges.clear();
- queueSize = min((size_t) ((double) queueSize * queueGrowthRate), queueMaxSize);
- }
- INFO("Extracting unbranching paths finished. " << sequences.size() << " sequences extracted");
- return sequences;
- }
-
- const std::vector<Sequence> ExtractUnbranchingPathsAndLoops(size_t queueMinSize, size_t queueMaxSize,
- double queueGrowthRate) {
- std::vector<Sequence> result = ExtractUnbranchingPaths(queueMinSize, queueMaxSize, queueGrowthRate);
- CleanCondensed(result);
- std::vector<Sequence> loops = CollectLoops();
- for(auto it = loops.begin(); it != loops.end(); ++it) {
- result.push_back(*it);
- }
- return result;
- }
-
-private:
- DECL_LOGGER("UnbranchingPathExtractor")
-};
-
-/*
- * Only works for Conjugate dbg
- */
-template<class Graph>
-class FastGraphFromSequencesConstructor {
-private:
- typedef typename Graph::EdgeId EdgeId;
- typedef typename Graph::VertexId VertexId;
- typedef runtime_k::RtSeq Kmer;
- typedef DeBruijnExtensionIndex<> Index;
- size_t kmer_size_;
- Index &origin_;
-
- class LinkRecord {
- private:
- size_t hash_and_mask_;
- EdgeId edge_;
-
- size_t BitBool(bool flag) const {
- if(flag)
- return 1;
- return 0;
- }
-
- public:
- size_t GetHash() const {
- return hash_and_mask_ >> 2;
- }
-
- bool IsRC() const {
- return hash_and_mask_ & 2;
- }
-
- bool IsStart() const {
- return hash_and_mask_ & 1;
- }
-
-
- EdgeId GetEdge() const {
- return edge_;
- }
-
- LinkRecord(size_t hash, EdgeId edge, bool is_start, bool is_rc) :
- hash_and_mask_((hash << 2) | (BitBool(is_rc) << 1)| BitBool(is_start)), edge_(edge) {
- }
-
- LinkRecord() :
- hash_and_mask_(-1ul), edge_(0) {
- }
-
- bool IsInvalid() {
- return hash_and_mask_ + 1 == 0 && edge_ == EdgeId(0);
- }
-
- bool operator<(const LinkRecord &other) const {
- if(this->hash_and_mask_ == other.hash_and_mask_)
- return this->edge_ < other.edge_;
- return this->hash_and_mask_ < other.hash_and_mask_;
- }
- };
-
- LinkRecord StartLink(const EdgeId &edge, const Sequence &sequence) const {
- Kmer kmer(kmer_size_, sequence);
- Kmer kmer_rc = !kmer;
- if(kmer < kmer_rc)
- return LinkRecord(origin_.ConstructKWH(kmer).idx(), edge, true, false);
- else
- return LinkRecord(origin_.ConstructKWH(kmer_rc).idx(), edge, true, true);
- }
-
- LinkRecord EndLink(const EdgeId &edge, const Sequence &sequence) const {
- Kmer kmer(kmer_size_, sequence, sequence.size() - kmer_size_);
- Kmer kmer_rc = !kmer;
- if(kmer < kmer_rc)
- return LinkRecord(origin_.ConstructKWH(kmer).idx(), edge, false, false);
- else
- return LinkRecord(origin_.ConstructKWH(kmer_rc).idx(), edge, false, true);
- }
-
- void CollectLinkRecords(typename Graph::HelperT &helper, const Graph &graph, vector<LinkRecord> &records, const vector<Sequence> &sequences) const {
- size_t size = sequences.size();
- records.resize(size * 2, LinkRecord(0, EdgeId(0), false, false));
- restricted::IdSegmentStorage id_storage = helper.graph().GetGraphIdDistributor().Reserve(size * 2);
-# pragma omp parallel for schedule(guided)
- for (size_t i = 0; i < size; ++i) {
- size_t j = i << 1;
- auto id_distributor = id_storage.GetSegmentIdDistributor(j, j + 2);//indices for two edges are required
- EdgeId edge = helper.AddEdge(DeBruijnEdgeData(sequences[i]), id_distributor);
- records[j] = StartLink(edge, sequences[i]);
- if(graph.conjugate(edge) != edge)
- records[j + 1] = EndLink(edge, sequences[i]);
- else
- records[j + 1] = LinkRecord();
- }
- }
-
- void LinkEdge(typename Graph::HelperT &helper, const Graph &graph, const VertexId v, const EdgeId edge, const bool is_start, const bool is_rc) const {
- VertexId v1 = v;
- if(is_rc) {
- v1 = graph.conjugate(v);
- }
- if(is_start) {
- helper.LinkOutgoingEdge(v1, edge);
- } else {
- helper.LinkIncomingEdge(v1, edge);
- }
- }
-
-public:
- FastGraphFromSequencesConstructor(size_t k, Index &origin) : kmer_size_(k), origin_(origin) {
- }
-
- void ConstructGraph(Graph &graph, const vector<Sequence> &sequences) const {
- typename Graph::HelperT helper = graph.GetConstructionHelper();
- vector<LinkRecord> records;
- CollectLinkRecords(helper, graph, records, sequences);//TODO make parallel
- parallel::sort(records.begin(), records.end());
- size_t size = records.size();
- vector<vector<VertexId>> vertices_list(omp_get_max_threads());
- restricted::IdSegmentStorage id_storage = helper.graph().GetGraphIdDistributor().Reserve(size * 2);
-# pragma omp parallel for schedule(guided)
- for(size_t i = 0; i < size; i++) {
- if(i != 0 && records[i].GetHash() == records[i - 1].GetHash()) {
- continue;
- }
- if(records[i].IsInvalid())
- continue;
- auto id_distributor = id_storage.GetSegmentIdDistributor(i << 1, (i << 1) + 2);
- VertexId v = helper.CreateVertex(DeBruijnVertexData(), id_distributor);
- vertices_list[omp_get_thread_num()].push_back(v);
- for(size_t j = i; j < size && records[j].GetHash() == records[i].GetHash(); j++) {
- LinkEdge(helper, graph, v, records[j].GetEdge(), records[j].IsStart(), records[j].IsRC());
- }
- }
- for(size_t i = 0; i < vertices_list.size(); i++)
- helper.AddVerticesToGraph(vertices_list[i].begin(), vertices_list[i].end());
- }
-};
-
-/*
- * Constructs a DeBruijnGraph from a DeBruijnExtensionIndex using "new DeBruijnGraphExtentionConstructor(DeBruijn).ConstructGraph(DeBruijnGraph, Index)"
- */
-template<class Graph>
-class DeBruijnGraphExtentionConstructor {
-private:
- typedef typename Graph::EdgeId EdgeId;
- typedef DeBruijnExtensionIndex<> DeBruijn;
- typedef typename Graph::VertexId VertexId;
- typedef runtime_k::RtSeq Kmer;
-
- Graph &graph_;
- DeBruijn &origin_;
- size_t kmer_size_;
-
- void FilterRC(std::vector<Sequence> &edge_sequences) {
- size_t size = 0;
- for(size_t i = 0; i < edge_sequences.size(); i++) {
- if(!(edge_sequences[i] < !edge_sequences[i])) {
- edge_sequences[size] = edge_sequences[i];
- size++;
- }
- }
- edge_sequences.resize(size);
- }
-
-public:
- DeBruijnGraphExtentionConstructor(Graph& graph, DeBruijn &origin) :
- graph_(graph), origin_(origin), kmer_size_(graph.k()) {
- }
-
- void ConstructGraph(size_t queueMinSize, size_t queueMaxSize,
- double queueGrowthRate, bool keep_perfect_loops) {
- std::vector<Sequence> edge_sequences;
- if(keep_perfect_loops)
- edge_sequences = UnbranchingPathExtractor(origin_, kmer_size_).ExtractUnbranchingPathsAndLoops(queueMinSize, queueMaxSize, queueGrowthRate);
- else
- edge_sequences = UnbranchingPathExtractor(origin_, kmer_size_).ExtractUnbranchingPaths(queueMinSize, queueMaxSize, queueGrowthRate);
- FilterRC(edge_sequences);
- FastGraphFromSequencesConstructor<Graph>(kmer_size_, origin_).ConstructGraph(graph_, edge_sequences);
- }
-
-private:
- DECL_LOGGER("DeBruijnGraphConstructor")
-};
-
-}
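
A sketch of how the condensing constructor removed above was typically driven; filling the extension index from reads is omitted, and the queue parameters are illustrative values rather than SPAdes defaults.

    #include "debruijn_graph.hpp"
    #include "debruijn_graph_constructor.hpp"

    using namespace debruijn_graph;

    void CondenseSketch(DeBruijnExtensionIndex<> &index, size_t k) {
        ConjugateDeBruijnGraph graph(k);
        DeBruijnGraphExtentionConstructor<ConjugateDeBruijnGraph> constructor(graph, index);
        // queueMinSize/queueMaxSize/queueGrowthRate control the batches of junction
        // k-mers condensed in parallel; keep_perfect_loops retains unbranching cycles
        // that have no junction k-mer to start from.
        constructor.ConstructGraph(/*queueMinSize*/ 100000,
                                   /*queueMaxSize*/ 10000000,
                                   /*queueGrowthRate*/ 1.5,
                                   /*keep_perfect_loops*/ true);
    }
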
diff --git a/src/debruijn/debruijn_stats.cpp b/src/debruijn/debruijn_stats.cpp
deleted file mode 100644
index 96f2e67..0000000
--- a/src/debruijn/debruijn_stats.cpp
+++ /dev/null
@@ -1,525 +0,0 @@
-////***************************************************************************
-////* Copyright (c) 2015 Saint Petersburg State University
-////* Copyright (c) 2011-2014 Saint Petersburg Academic University
-////* All Rights Reserved
-////* See file LICENSE for details.
-////***************************************************************************
-//
-//// FIXME: Refactor and turn into stage
-//
-////todo rewrite with extended sequence mapper!
-//template<class Graph, class Index>
-//class EtalonPairedInfoCounter {
-// typedef typename Graph::EdgeId EdgeId;
-//
-// const Graph& g_;
-// const Index& index_;
-// const KmerMapper<Graph>& kmer_mapper_;
-// size_t k_;
-//
-// size_t insert_size_;
-// size_t read_length_;
-// int gap_;
-// size_t delta_;
-//
-// void AddEtalonInfo(PairedInfoIndexT<Graph>& index, EdgeId e1, EdgeId e2, double d) {
-// index.AddPairInfo(e1, e2, d, 1000., 0.);
-// }
-//
-// void ProcessSequence(const Sequence& sequence, PairedInfoIndexT<Graph>& index)
-// {
-// int mod_gap = (gap_ + (int) k_ > (int) delta_ ) ? gap_ - (int) delta_ : 0 - (int) k_;
-// runtime_k::RtSeq left(k_ +1, sequence);
-// left >>= 0;
-// for (size_t left_idx = 0;
-// left_idx + 2 * (k_ + 1) + mod_gap <= sequence.size();
-// ++left_idx) {
-// left <<= sequence[left_idx + k_];
-// runtime_k::RtSeq left_upd = kmer_mapper_.Substitute(left);
-// if (!index_.contains(left_upd)) {
-// continue;
-// }
-// pair<EdgeId, size_t> left_pos = index_.get(left_upd);
-//
-// size_t right_idx = left_idx + k_ + 1 + mod_gap;
-// runtime_k::RtSeq right(k_ + 1, sequence, right_idx);
-// right >>= 0;
-// for (;
-// right_idx + k_ + 1 <= left_idx + insert_size_ + delta_ && right_idx + k_ + 1 <= sequence.size();
-// ++right_idx) {
-// right <<= sequence[right_idx + k_];
-// runtime_k::RtSeq right_upd = kmer_mapper_.Substitute(right);
-// if (!index_.contains(right_upd)) {
-// continue;
-// }
-// pair<EdgeId, size_t> right_pos = index_.get(right_upd);
-//
-// AddEtalonInfo(index, left_pos.first, right_pos.first,
-// 0. + (double) right_idx - (double) left_idx +
-// (double) left_pos.second - (double) right_pos.second);
-// }
-// }
-// }
-//
-//public:
-// EtalonPairedInfoCounter(const Graph& g, const Index& index,
-// const KmerMapper<Graph>& kmer_mapper,
-// size_t insert_size, size_t read_length,
-// size_t delta, size_t k)
-// : g_(g),
-// index_(index),
-// kmer_mapper_(kmer_mapper),
-// k_(k),
-// insert_size_(insert_size),
-// read_length_(read_length),
-// gap_((int) (insert_size_ - 2 * read_length_)),
-// delta_(delta) {
-//// VERIFY(insert_size_ >= 2 * read_length_);
-// }
-//
-// void FillEtalonPairedInfo(const Sequence& genome,
-// omnigraph::de::PairedInfoIndexT<Graph>& paired_info) {
-// ProcessSequence(genome, paired_info);
-// ProcessSequence(!genome, paired_info);
-// }
-//};
-//
-//template<class Graph>
-//void GetAllDistances(const PairedInfoIndexT<Graph>& paired_index,
-// PairedInfoIndexT<Graph>& result,
-// const GraphDistanceFinder<Graph>& dist_finder) {
-// for (auto iter = paired_index.begin(); iter != paired_index.end(); ++iter) {
-// EdgeId e1 = iter.first();
-// EdgeId e2 = iter.second();
-// vector<size_t> forward = dist_finder.GetGraphDistancesLengths(e1, e2);
-// for (size_t i = 0; i < forward.size(); ++i)
-// result.AddPairInfo(e1, e2, (double) forward[i], -10.0, 0.0, false);
-// }
-//}
-//
-//template<class Graph>
-//void GetAllDistances(const Graph& g,
-// const PairedInfoIndexT<Graph>& paired_index,
-// const PairedInfoIndexT<Graph>& clustered_index,
-// const GraphDistanceFinder<Graph>& dist_finder,
-// PairedInfoIndexT<Graph>& result)
-//{
-// typedef typename Graph::EdgeId EdgeId;
-// typedef vector<EdgeId> Path;
-// for (auto iter = paired_index.begin(); iter != paired_index.end(); ++iter) {
-// EdgeId first = iter.first();
-// EdgeId second = iter.second();
-// const vector<Path>& raw_paths = dist_finder.GetGraphDistances(first, second);
-// // adding first edge to every path
-// vector<Path> paths;
-// for (size_t i = 0; i < raw_paths.size(); ++i) {
-// Path path;
-// path.push_back(first);
-// for (size_t j = 0; j < raw_paths[i].size(); ++j)
-// path.push_back(raw_paths[i][j]);
-// path.push_back(second);
-//
-// paths.push_back(path);
-// }
-// vector<size_t> path_lengths;
-// vector<double> path_weights;
-// for (size_t i = 0; i < paths.size(); ++i) {
-// size_t len_total = 0 ;
-// double weight_total = 0.;
-// for (size_t j = 0; j < paths[i].size(); ++j) {
-// len_total += g.length(paths[i][j]);
-// size_t cur_length = 0;
-// for (size_t l = j + 1; l < paths[i].size(); ++l) {
-// cur_length += g.length(paths[i][l - 1]);
-// const de::Histogram& infos = clustered_index.GetEdgePairInfo(paths[i][j], paths[i][l]);
-// for (auto iterator = infos.begin(); iterator != infos.end(); ++iterator) {
-// const Point& info = *iterator;
-// if (info.d == cur_length) {
-// weight_total += info.weight;
-// break;
-// }
-// }
-// }
-// }
-// path_lengths.push_back(len_total - g.length(second));
-// path_weights.push_back(weight_total);
-// }
-//
-// for (size_t i = 0; i < paths.size(); ++i) {
-// cout << first.int_id() << "(" << g.length(first) << ") "
-// << second.int_id() << "(" << g.length(second) << ") : "
-// << (i + 1) << "-th path (" << path_lengths[i] << ", " << path_weights[i] << ") ::: ";
-// for (size_t j = 0; j < paths[i].size(); ++j) {
-// cout << paths[i][j].int_id() << "(" << g.length(paths[i][j]) << ") ";
-// }
-// cout << endl;
-// }
-// }
-//}
-//
-//template<class Graph, class Index>
-//void FillEtalonPairedIndex(PairedInfoIndexT<Graph>& etalon_paired_index,
-// const Graph &g, const Index& index,
-// const KmerMapper<Graph>& kmer_mapper, size_t is, size_t rs,
-// size_t delta, const Sequence& genome, size_t k)
-//{
-// VERIFY_MSG(genome.size() > 0,
-// "The genome seems not to be loaded, program will exit");
-// INFO((string) (FormattedString("Counting etalon paired info for genome of length=%i, k=%i, is=%i, rs=%i, delta=%i")
-// << genome.size() << k << is << rs << delta));
-//
-// EtalonPairedInfoCounter<Graph, Index> etalon_paired_info_counter(g, index, kmer_mapper, is, rs, delta, k);
-// etalon_paired_info_counter.FillEtalonPairedInfo(genome, etalon_paired_index);
-//
-// DEBUG("Etalon paired info counted");
-//}
-//
-//template<class Graph, class Index>
-//void FillEtalonPairedIndex(PairedInfoIndexT<Graph>& etalon_paired_index,
-// const Graph &g, const Index& index,
-// const KmerMapper<Graph>& kmer_mapper, const Sequence& genome,
-// const io::SequencingLibrary<debruijn_config::DataSetData> &lib,
-// size_t k) {
-//
-// FillEtalonPairedIndex(etalon_paired_index, g, index, kmer_mapper,
-// size_t(lib.data().mean_insert_size), lib.data().read_length, size_t(lib.data().insert_size_deviation),
-// genome, k);
-//
-// //////////////////DEBUG
-// // SimpleSequenceMapper<k + 1, Graph> simple_mapper(g, index);
-// // Path<EdgeId> path = simple_mapper.MapSequence(genome);
-// // SequenceBuilder sequence_builder;
-// // sequence_builder.append(Seq<k>(g.EdgeNucls(path[0])));
-// // for (auto it = path.begin(); it != path.end(); ++it) {
-// // sequence_builder.append(g.EdgeNucls(*it).Subseq(k));
-// // }
-// // Sequence new_genome = sequence_builder.BuildSequence();
-// // NewEtalonPairedInfoCounter<k, Graph> new_etalon_paired_info_counter(g, index,
-// // insert_size, read_length, insert_size * 0.1);
-// // PairedInfoIndexT<Graph> new_paired_info_index(g);
-// // new_etalon_paired_info_counter.FillEtalonPairedInfo(new_genome, new_paired_info_index);
-// // CheckInfoEquality(etalon_paired_index, new_paired_info_index);
-// //////////////////DEBUG
-// // INFO("Etalon paired info counted");
-//}
-//
-//template<class Graph>
-//void CountPairedInfoStats(const Graph& g,
-// const io::SequencingLibrary<debruijn_config::DataSetData> &lib,
-// const PairedInfoIndexT<Graph>& paired_index,
-// const PairedInfoIndexT<Graph>& etalon_index,
-// const string& output_folder) {
-// PairedInfoIndexT<Graph> filtered_index = paired_index;
-//// PairInfoWeightFilter<Graph>(g, 40).Filter(filtered_index);
-// PairInfoFilter<Graph>(PairInfoWeightChecker<Graph>(g, 40)).Filter(filtered_index);
-// INFO("Counting paired info stats");
-// EdgePairStat<Graph>(g, paired_index, output_folder).Count();
-//
-// //todo remove filtration if launch on etalon info is ok
-// UniquePathStat<Graph>(g, filtered_index,
-// (size_t)math::round(lib.data().mean_insert_size),
-// lib.data().read_length,
-// 0.1 * lib.data().mean_insert_size).Count();
-// UniqueDistanceStat<Graph>(etalon_index).Count();
-// INFO("Paired info stats counted");
-//}
-//
-//// leave only those pairs, which edges have no path in the graph between them
-//template<class Graph>
-//void FilterIndexWithExistingPaths(PairedIndexT& scaf_clustered_index,
-// const PairedIndexT& index,
-// const conj_graph_pack &gp,
-// const GraphDistanceFinder<Graph>& dist_finder) {
-// for (auto it = index.begin(); it != index.end(); ++it) {
-// const de::Histogram& histogram = *it;
-// EdgeId e1 = it.first();
-// EdgeId e2 = it.second();
-// if (gp.g.OutgoingEdgeCount(gp.g.EdgeEnd(e1)) == 0 && gp.g.IncomingEdgeCount(gp.g.EdgeEnd(e1)) == 1 &&
-// gp.g.IncomingEdgeCount(gp.g.EdgeStart(e2)) == 0 && gp.g.OutgoingEdgeCount(gp.g.EdgeStart(e2)) == 1) {
-// vector<size_t> dists = dist_finder.GetGraphDistancesLengths(e1, e2);
-// if (dists.size() == 0)
-// for (auto point_iter = histogram.begin(); point_iter != histogram.end(); ++point_iter)
-// if (math::gr(point_iter->d, 0.)) {
-// scaf_clustered_index.AddPairInfo(it.first(), it.second(),
-// point_iter->d, point_iter->weight, 20.);
-// }
-// }
-// }
-//}
-//
-//inline
-//void tSeparatedStats(conj_graph_pack& gp, const Sequence& contig,
-// PairedInfoIndex<conj_graph_pack::graph_t> &ind,
-// const io::SequencingLibrary<debruijn_config::DataSetData> &lib,
-// size_t /*k*/) {
-// typedef omnigraph::de::PairInfo<EdgeId> PairInfo;
-//
-// MappingPath<Graph::EdgeId> m_path1 = FindGenomeMappingPath(contig, gp.g,
-// gp.index, gp.kmer_mapper);
-//
-// map<Graph::EdgeId, vector<pair<int, int>>> inGenomeWay;
-// int CurI = 0;
-// int gaps = 0;
-// for (size_t i = 0; i < m_path1.size(); i++) {
-// bool new_edge_added = false;
-// EdgeId ei = m_path1[i].first;
-// MappingRange mr = m_path1[i].second;
-// int start = (int)(mr.initial_range.start_pos - mr.mapped_range.start_pos);
-// if (inGenomeWay.find(ei) == inGenomeWay.end()) {
-// vector<pair<int, int>> tmp;
-// tmp.push_back(make_pair(CurI, start));
-// inGenomeWay[ei] = tmp;
-// CurI++;
-// new_edge_added = true;
-// DEBUG("Edge " << gp.g.str(ei) << " num " << CurI << " pos " << start);
-// } else {
-// if (m_path1[i - 1].first == ei) {
-// if (abs(start - inGenomeWay[ei][(inGenomeWay[ei].size() - 1)].second) > 50) {
-// inGenomeWay[ei].push_back(make_pair(CurI, start));
-// CurI++;
-// new_edge_added = true;
-// DEBUG("Edge " << gp.g().str(ei) << " num " << CurI << " pos " << start);
-// }
-// } else {
-// inGenomeWay[ei].push_back(make_pair(CurI, start));
-// CurI++;
-// new_edge_added = true;
-// DEBUG("Edge " << gp.g.str(ei) << " num " << CurI << " pos " << start);
-// }
-// }
-// if (new_edge_added && (i > 0)) {
-// if (gp.g.EdgeStart(ei) != gp.g.EdgeEnd(m_path1[i - 1].first)) {
-// gaps++;
-// }
-// }
-// }
-// INFO("Totaly " << CurI << " edges in genome path, with " << gaps << "not adjacent conequences");
-//
-// vector<int> stats(10);
-// vector<int> stats_d(10);
-// int PosInfo = 0;
-// int AllignedPI = 0;
-// int ExactDPI = 0;
-// int OurD = int(lib.data().mean_insert_size) - int(lib.data().read_length);
-// for (auto p_iter = ind.begin(), p_end_iter = ind.end();
-// p_iter != p_end_iter; ++p_iter) {
-// vector<PairInfo> pi = *p_iter;
-// for (size_t j = 0; j < pi.size(); j++) {
-// EdgeId left_edge = pi[j].first;
-// EdgeId right_edge = pi[j].second;
-// double d = pi[j].d();
-// if (d < 0.001)
-// continue;
-// int best_d = 100;
-// int best_t = 0;
-// PosInfo++;
-// DEBUG(
-// "PairInfo " << gp.g().str(left_edge) << " -- " << gp.g().str(right_edge) << " d " << d);
-// bool ExactOnD = false;
-// for (size_t left_i = 0; left_i < inGenomeWay[left_edge].size();
-// left_i++)
-// for (size_t right_i = 0;
-// right_i < inGenomeWay[right_edge].size(); right_i++) {
-// if (best_d
-// > abs(
-// inGenomeWay[right_edge][right_i].second
-// - inGenomeWay[left_edge][left_i].second
-// - d)) {
-// best_d = (int)math::round(abs(
-// inGenomeWay[right_edge][right_i].second
-// - inGenomeWay[left_edge][left_i].second
-// - d));
-// best_t = inGenomeWay[right_edge][right_i].first
-// - inGenomeWay[left_edge][left_i].first;
-// DEBUG("best d " << best_d);
-// if ((inGenomeWay[right_edge][right_i].second
-// - inGenomeWay[left_edge][left_i].second
-// - (int) gp.g.length(left_edge) <= OurD)
-// && (inGenomeWay[right_edge][right_i].second
-// - inGenomeWay[left_edge][left_i].second
-// + (int) gp.g.length(right_edge) >= OurD))
-// ExactOnD = true;
-// else
-// ExactOnD = false;
-// }
-// }
-// if (best_t > 5)
-// best_t = 5;
-// if (best_d < 100) {
-// AllignedPI++;
-// stats[best_t]++;
-// if (ExactOnD) {
-// stats_d[best_t]++;
-// ExactDPI++;
-// }
-// }
-//
-// }
-// }INFO(
-// "Total positive pair info " << PosInfo << " alligned to genome " << AllignedPI << " with exact distance " << ExactDPI);
-// INFO(
-// "t-separated stats Alligneg: 1 - " << stats[1] << " 2 - " << stats[2] << " 3 - " << stats[3] << " 4 - " << stats[4] << " >4 - " << stats[5]);
-// INFO(
-// "t-separated stats Exact: 1 - " << stats_d[1] << " 2 - " << stats_d[2] << " 3 - " << stats_d[3] << " 4 - " << stats_d[4] << " >4 - " << stats[5]);
-//}
-//
-//template<class Graph>
-//void CountAndSaveAllPaths(const Graph& g, const io::SequencingLibrary<debruijn_config::DataSetData> &lib,
-// const PairedInfoIndexT<Graph>& paired_index, const PairedInfoIndexT<Graph>& /*clustered_index*/) {
-// PairedIndexT all_paths(g);
-// GetAllDistances<Graph>(paired_index,
-// all_paths,
-// GraphDistanceFinder<Graph>(g,
-// size_t(lib.data().mean_insert_size),
-// lib.data().read_length,
-// size_t(lib.data().insert_size_deviation)));
-//
-// std::string dir_name = cfg::get().output_dir + "estimation_qual/";
-// make_dir(dir_name);
-//
-// graphio::ConjugateDataPrinter<Graph> printer(g);
-// printer.savePaired(dir_name + "paths", all_paths);
-//
-// //PairedIndexT& all_paths_2(g);
-// //GetAllDistances<Graph>(g,
-// //paired_index, clustered_index,
-// //all_paths_2,
-// //GraphDistanceFinder<Graph>(g, *cfg::get().ds.IS, *cfg::get().ds.RL,
-// //size_t(*cfg::get().ds.is_var)));
-// //printer.savePaired(dir_name + "paths_all", all_paths_2);
-//}
-//
-//void FillAndCorrectEtalonPairedInfo(PairedIndexT& corrected_etalon_index,
-// const conj_graph_pack& gp,
-// const PairedIndexT& paired_index, size_t insert_size,
-// size_t read_length, size_t delta,
-// bool save_etalon_info_history = false) {
-// INFO("Filling etalon paired index");
-// PairedIndexT etalon_index(gp.g);
-// bool successful_load = false;
-// if (cfg::get().entry_point >= ws_distance_estimation) {
-// string p = path::append_path(cfg::get().load_from, "../etalon");
-// if (!path::is_regular_file(p + ".prd")) {
-// DEBUG("file " << p + ".prd" << " does not exist");
-// }
-// else {
-// INFO("Loading etalon pair info from the previous run...");
-// Graph& graph = const_cast<Graph&>(gp.g);
-// graphio::ConjugateDataScanner<Graph> scanner(graph);
-// scanner.loadPaired(p, etalon_index);
-// path::files_t files;
-// files.push_back(p);
-// path::copy_files_by_prefix(files, cfg::get().output_dir);
-// successful_load = true;
-// }
-// }
-// if (!successful_load)
-// FillEtalonPairedIndex(etalon_index, gp.g,
-// gp.index, gp.kmer_mapper, insert_size, read_length, delta,
-// gp.genome, gp.k_value);
-// INFO("Etalon paired index filled");
-//
-// INFO("Correction of etalon paired info has been started");
-//
-// INFO("Filtering etalon info");
-// //leave only info between edges both present in paired_index
-// PairedIndexT filtered_etalon_index(gp.g);
-// for (auto iter = etalon_index.begin(); iter != etalon_index.end(); ++iter) {
-// const de::Histogram& histogram = *iter;
-// EdgeId first_edge = iter.first();
-// EdgeId second_edge = iter.second();
-// if (paired_index.GetEdgePairInfo(first_edge, second_edge).size() > 0) {
-// for (auto point = histogram.begin(); point != histogram.end(); ++point)
-// filtered_etalon_index.AddPairInfo(first_edge, second_edge, *point);
-// }
-// else
-// DEBUG("Filtering out pair_info " << gp.g.int_id(first_edge) << " "
-// << gp.g.int_id(second_edge));
-// }
-//
-// INFO("Pushing etalon info through estimator");
-// GraphDistanceFinder<Graph> dist_finder(gp.g, insert_size, read_length, delta);
-// DistanceEstimator<Graph> estimator(gp.g, filtered_etalon_index, dist_finder, 0., 4.);
-// estimator.Estimate(corrected_etalon_index);
-// if (save_etalon_info_history) {
-// INFO("Saving etalon paired info indices on different stages");
-// ConjugateDataPrinter<Graph> data_printer(gp.g);
-// data_printer.savePaired(cfg::get().output_dir + "etalon", etalon_index);
-// data_printer.savePaired(cfg::get().output_dir + "etalon_filtered_by_index",
-// filtered_etalon_index);
-// data_printer.savePaired(cfg::get().output_dir + "etalon_corrected_by_graph",
-// corrected_etalon_index);
-// INFO("Everything is saved");
-//
-// if (cfg::get().paired_info_scaffolder) {
-// GraphDistanceFinder<Graph> dist_finder(gp.g, insert_size, read_length, delta);
-// INFO("Saving paired information statistics for a scaffolding");
-// PairedIndexT scaf_etalon_index(gp.g);
-// FilterIndexWithExistingPaths(scaf_etalon_index, etalon_index, gp, dist_finder);
-// data_printer.savePaired(
-// cfg::get().output_dir + "scaf_etalon",
-// scaf_etalon_index);
-// PairedIndexT scaf_filtered_etalon_index(gp.g);
-// FilterIndexWithExistingPaths(scaf_filtered_etalon_index, filtered_etalon_index, gp, dist_finder);
-// data_printer.savePaired(
-// cfg::get().output_dir + "scaf_etalon_filtered",
-// scaf_filtered_etalon_index);
-// }
-//
-// INFO("Everything saved");
-// }
-// INFO("Correction finished");
-//}
-//
-//void CountClusteredPairedInfoStats(const conj_graph_pack &gp,
-// const io::SequencingLibrary<debruijn_config::DataSetData> &lib,
-// const PairedInfoIndexT<Graph> &paired_index,
-// const PairedInfoIndexT<Graph> &clustered_index) {
-// PairedIndexT etalon_index(gp.g);
-//
-// FillAndCorrectEtalonPairedInfo(etalon_index, gp, paired_index,
-// (size_t)math::round(lib.data().mean_insert_size),
-// lib.data().read_length,
-// (size_t)math::round(lib.data().insert_size_deviation), true);
-//
-// CountAndSaveAllPaths(gp.g, lib, paired_index, clustered_index);
-//
-// INFO("Counting clustered info stats");
-// EdgeQuality<Graph, Index> edge_qual(gp.g, gp.index, gp.kmer_mapper, gp.genome);
-// //EstimationQualityStat<Graph> estimation_stat(gp.g, edge_qual,
-// //paired_index, clustered_index, etalon_index);
-// //estimation_stat.Count();
-// //estimation_stat.SaveStats(cfg::get().output_dir + "estimation_qual/");
-//
-// INFO("Counting overall cluster stat");
-// ClusterStat<Graph>(clustered_index).Count();
-// INFO("Overall cluster stat");
-//
-// if (cfg::get().paired_info_scaffolder) {
-// ConjugateDataPrinter<Graph> data_printer(gp.g);
-// INFO("Generating the statistics of pair info for scaffolding");
-// PairedIndexT scaf_clustered_index(gp.g);
-// FilterIndexWithExistingPaths(scaf_clustered_index,
-// clustered_index, gp,
-// GraphDistanceFinder<Graph>(gp.g,
-// (size_t)math::round(lib.data().mean_insert_size),
-// lib.data().read_length,
-// (size_t)math::round(lib.data().insert_size_deviation)));
-// data_printer.savePaired(cfg::get().output_dir + "scaf_clustered",
-// scaf_clustered_index);
-// }
-// // PairedInfoIndexT<Graph> etalon_clustered_index;
-// // DistanceEstimator<Graph> estimator(g, etalon_index, insert_size,
-// // max_read_length, cfg::get().de.delta,
-// // cfg::get().de.linkage_distance, cfg::get().de.max_distance);
-// // estimator.Estimate(etalon_clustered_index);
-//
-// // PairedInfoIndexT<Graph> filtered_clustered_index(g);
-// // PairInfoFilter<Graph> (g, 1000.).Filter(
-// // clustered_index[>etalon_clustered_index<], filtered_clustered_index);
-// INFO("Counting mate-pair transformation stat");
-// MatePairTransformStat<Graph>(gp.g, //filtered_
-// clustered_index).Count();
-// INFO("Mate-pair transformation stat counted");
-// INFO("Clustered info stats counted");
-//}
diff --git a/src/debruijn/detail_coverage.hpp b/src/debruijn/detail_coverage.hpp
deleted file mode 100644
index b559239..0000000
--- a/src/debruijn/detail_coverage.hpp
+++ /dev/null
@@ -1,257 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include "indices/perfect_hash_map.hpp"
-#include "omni/coverage.hpp"
-#include "verify.hpp"
-#include <vector>
-#include <map>
-#include <set>
-#include <string>
-#include <iostream>
-#include <fstream>
-
-namespace debruijn_graph {
-
-template<class Graph>
-class FlankingCoverage : public GraphActionHandler<Graph>,
- public omnigraph::AbstractFlankingCoverage<Graph> {
- typedef GraphActionHandler<Graph> base;
- typedef typename Graph::EdgeId EdgeId;
- typedef typename Graph::VertexId VertexId;
- typedef pair<EdgeId, unsigned> Pos;
-
- Graph& g_;
- const size_t averaging_range_;
-
- void SetRawCoverage(EdgeId e, unsigned cov) {
- g_.data(e).set_flanking_coverage(cov);
- }
-
- unsigned RawCoverage(EdgeId e) const {
- return g_.data(e).flanking_coverage();
- }
-
- size_t EdgeAveragingRange(EdgeId e) const {
- return std::min(this->g().length(e), averaging_range_);
- }
-
- double AverageFlankingCoverage(EdgeId e) const {
- return double(RawCoverage(e)) / double(EdgeAveragingRange(e));
- }
-
- unsigned InterpolateCoverage(EdgeId e, size_t l) const {
- VERIFY(l <= averaging_range_);
- VERIFY(l < g_.length(e));
- return unsigned(math::round(AverageFlankingCoverage(e) * double(l)));
- }
-
- void SetCoverageSimilarToAverageFlanking(EdgeId target, EdgeId source) {
- SetRawCoverage(target, unsigned(math::round(AverageFlankingCoverage(source) * double(EdgeAveragingRange(target)))));
- }
-
- void SetCoverageSimilarToAverageGlobal(EdgeId target, EdgeId source) {
- SetRawCoverage(target, unsigned(math::round(g_.coverage(source) * double(EdgeAveragingRange(target)))));
- }
-
-public:
-
- //todo think about interactions with gap closer
- FlankingCoverage(Graph& g, size_t averaging_range)
- : base(g, "FlankingCoverage"), g_(g),
- averaging_range_(averaging_range) {
- }
-
- size_t averaging_range() const {
- return averaging_range_;
- }
-
-    //todo currently left for compatibility with saves! remove later!
- template<class CoverageIndex>
- void Fill(const CoverageIndex& count_index) {
- TRACE("Filling flanking coverage from index");
-
- for (auto I = count_index.value_cbegin(), E = count_index.value_cend();
- I != E; ++I) {
- const auto& edge_info = *I;
- EdgeId e = edge_info.edge_id;
- unsigned offset = edge_info.offset;
- unsigned count = edge_info.count;
- VERIFY(offset != -1u);
- VERIFY(e.get() != NULL);
- if (offset < averaging_range_) {
- IncRawCoverage(e, count);
- }
- }
- }
-
- void IncRawCoverage(EdgeId e, unsigned count) {
- g_.data(e).inc_flanking_coverage(count);
- }
-
- double CoverageOfStart(EdgeId e) const {
- return AverageFlankingCoverage(e);
- }
-
- double CoverageOfEnd(EdgeId e) const {
- return CoverageOfStart(this->g().conjugate(e));
- }
-
- virtual void HandleAdd(EdgeId /*e*/) {
- }
-
- virtual void HandleMerge(const vector<EdgeId>& old_edges, EdgeId new_edge) {
-// SetRawCoverage(new_edge, RawCoverage(old_edges.front()));
- size_t kpomers_left = averaging_range_;
- unsigned acc = 0;
- for (EdgeId e : old_edges) {
- if (kpomers_left >= g_.length(e)) {
- acc += RawCoverage(e);
- kpomers_left -= g_.length(e);
- } else {
- if (kpomers_left != 0)
- acc += InterpolateCoverage(e, kpomers_left);
- break;
- }
- }
- SetRawCoverage(new_edge, acc);
- }
-
- virtual void HandleGlue(EdgeId new_edge, EdgeId edge1, EdgeId edge2) {
- SetRawCoverage(new_edge, RawCoverage(edge1) + RawCoverage(edge2));
- }
-
- virtual void HandleSplit(EdgeId old_edge, EdgeId new_edge_1,
- EdgeId new_edge_2) {
- //todo maybe improve later
- SetCoverageSimilarToAverageFlanking(new_edge_1, old_edge);
- SetCoverageSimilarToAverageGlobal(new_edge_2, old_edge);
- if (old_edge == g_.conjugate(old_edge)) {
- SetCoverageSimilarToAverageGlobal(g_.conjugate(new_edge_1), old_edge);
- }
- }
-
- virtual void HandleDelete(EdgeId e) {
- SetRawCoverage(e, 0);
- }
-
- double LocalCoverage(EdgeId e, VertexId v) const {
- if (this->g().EdgeStart(e) == v) {
- return GetInCov(e);
- } else if (this->g().EdgeEnd(e) == v) {
- return GetOutCov(e);
- } else {
- VERIFY(false);
- return 0.0;
- }
- }
-
- //left for compatibility
- //todo rename
- double GetInCov(EdgeId e) const {
- return CoverageOfStart(e);
- }
-
- //todo rename
- double GetOutCov(EdgeId e) const {
- return CoverageOfEnd(e);
- }
-
- //////////////////////////
-
- void Save(EdgeId e, ostream& out) const {
- out << RawCoverage(e);
- }
-
- void Load(EdgeId e, istream& in) {
- unsigned cov;
- in >> cov;
- SetRawCoverage(e, cov);
- }
-
- /*
- * Is thread safe if different threads process different edges.
- */
- bool IsThreadSafe() const {
- return true;
- }
-
-private:
- DECL_LOGGER("FlankingCoverage")
- ;
-};
-
-template<class StoringType>
-struct SimultaneousCoverageCollector {
-};
-
-template<>
-struct SimultaneousCoverageCollector<SimpleStoring> {
- template<class SimultaneousCoverageFiller, class Info>
- static void CollectCoverage(SimultaneousCoverageFiller& filler, const Info &edge_info) {
- filler.inc_coverage(edge_info);
- }
-};
-
-template<>
-struct SimultaneousCoverageCollector<InvertableStoring> {
- template<class SimultaneousCoverageFiller, class Info>
- static void CollectCoverage(SimultaneousCoverageFiller& filler, const Info &edge_info) {
- filler.inc_coverage(edge_info);
- filler.inc_coverage(edge_info.conjugate(filler.k()));
- }
-};
-
-template<class Graph, class CountIndex>
-class SimultaneousCoverageFiller {
- const Graph& g_;
- const CountIndex& count_index_;
- FlankingCoverage<Graph>& flanking_coverage_;
- CoverageIndex<Graph>& coverage_index_;
- typedef typename CountIndex::Value Value;
-public:
- SimultaneousCoverageFiller(const Graph& g, const CountIndex& count_index,
- FlankingCoverage<Graph>& flanking_coverage,
- CoverageIndex<Graph>& coverage_index) :
- g_(g),
- count_index_(count_index),
- flanking_coverage_(flanking_coverage),
- coverage_index_(coverage_index) {
- }
-
- size_t k() const {
- return count_index_.k();
- }
-
- void inc_coverage(const Value &edge_info) {
- coverage_index_.IncRawCoverage(edge_info.edge_id, edge_info.count);
- if (edge_info.offset < flanking_coverage_.averaging_range()) {
- flanking_coverage_.IncRawCoverage(edge_info.edge_id, edge_info.count);
- }
- }
-
- void Fill() {
- for (auto I = count_index_.value_cbegin(), E = count_index_.value_cend();
- I != E; ++I) {
- const auto& edge_info = *I;
- VERIFY(edge_info.valid());
- VERIFY(edge_info.edge_id.get() != NULL);
- SimultaneousCoverageCollector<typename CountIndex::storing_type>::CollectCoverage(*this, edge_info);
- }
- }
-};
-
-template<class Graph, class CountIndex>
-void FillCoverageAndFlanking(const CountIndex& count_index, Graph& g,
- FlankingCoverage<Graph>& flanking_coverage) {
- SimultaneousCoverageFiller<Graph, CountIndex> filler(g, count_index, flanking_coverage, g.coverage_index());
- filler.Fill();
-}
-
-}
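
A sketch of how the helper removed above is wired together; CountIndex stands for the perfect-hash k-mer count index built elsewhere in the pipeline, and the averaging range is an illustrative value.

    #include "detail_coverage.hpp"

    using namespace debruijn_graph;

    template<class Graph, class CountIndex>
    void FillCoverageSketch(const CountIndex &counts, Graph &graph) {
        // One handler tracks per-edge flanking coverage over the first
        // averaging_range k-mers of each edge.
        FlankingCoverage<Graph> flanking(graph, /*averaging_range*/ 50);
        // Fills both graph.coverage_index() and the flanking coverage in a
        // single pass over the count index values.
        FillCoverageAndFlanking(counts, graph, flanking);
    }
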
diff --git a/src/debruijn/distance_estimation.cpp b/src/debruijn/distance_estimation.cpp
deleted file mode 100644
index ef846f7..0000000
--- a/src/debruijn/distance_estimation.cpp
+++ /dev/null
@@ -1,242 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#include "standard.hpp"
-#include "dataset_readers.hpp"
-#include "pair_info_improver.hpp"
-
-#include "de/paired_info_helpers.hpp"
-#include "de/pair_info_filters.hpp"
-#include "de/distance_estimation.hpp"
-#include "de/weighted_distance_estimation.hpp"
-#include "de/extensive_distance_estimation.hpp"
-#include "de/smoothing_distance_estimation.hpp"
-
-#include "utils.hpp"
-
-#include "distance_estimation.hpp"
-
-#include <set>
-
-namespace debruijn_graph {
-
-using namespace omnigraph::de;
-
-template<class Graph>
-void estimate_with_estimator(const Graph &graph,
- const omnigraph::de::AbstractDistanceEstimator<Graph>& estimator,
- omnigraph::de::AbstractPairInfoChecker<Graph>& checker,
- PairedIndexT& clustered_index) {
- using debruijn_graph::estimation_mode;
- DEBUG("Estimating distances");
-
- estimator.Estimate(clustered_index, cfg::get().max_threads);
-
- INFO("Filtering info");
- if(cfg::get().amb_de.enabled){
- AmbiguousPairInfoChecker<Graph> amb_de_checker(graph,
- clustered_index,
- checker,
- cfg::get().amb_de.haplom_threshold,
- cfg::get().amb_de.relative_length_threshold,
- cfg::get().amb_de.relative_seq_threshold);
- PairInfoFilter<Graph>(amb_de_checker).Filter(clustered_index);
- }
- else
- PairInfoFilter<Graph>(checker).Filter(clustered_index);
-// filter.Filter(clustered_index);
- DEBUG("Info Filtered");
-}
-
-
-// Postprocessing, checking that clusters do not intersect
-template<class Graph>
-void RefinePairedInfo(const Graph& graph, PairedInfoIndexT<Graph>& clustered_index) {
- for (auto iter = pair_begin(clustered_index); iter != pair_end(clustered_index); ++iter) {
- EdgeId first_edge = iter.first();
- EdgeId second_edge = iter.second();
- auto infos = iter->Unwrap(); //we need an ordered histogram here
- if (infos.empty())
- continue;
-
- auto prev_it = infos.begin();
- auto it = prev_it;
- ++it;
- for (auto end_it = infos.end(); it != end_it; ++it) {
- if (math::le(abs(it->d - prev_it->d), it->var + prev_it->var)) {
- WARN("Clusters intersect, edges -- " << graph.int_id(first_edge)
- << " " << graph.int_id(second_edge));
- INFO("Trying to handle this case");
- // seeking the symmetric pair info to [i - 1]
- bool success = false;
- double total_weight = prev_it->weight;
- for (auto inner_it = it; inner_it != end_it; ++inner_it) {
- total_weight += inner_it->weight;
- if (math::eq(inner_it->d + prev_it->d, 0.f)) {
- success = true;
- double center = 0.;
- double var = inner_it->d + inner_it->var;
- for (auto inner_it_2 = prev_it; inner_it_2 != inner_it; ++inner_it_2) {
- TRACE("Removing pair info " << *inner_it_2);
- clustered_index.Remove(first_edge, second_edge, *inner_it_2);
- }
- clustered_index.Remove(first_edge, second_edge, *inner_it);
- Point new_point(center, total_weight, var);
- TRACE("Adding new pair info " << first_edge << " " << second_edge << " " << new_point);
- clustered_index.Add(first_edge, second_edge, new_point);
- break;
- }
- }
- INFO("Pair information was resolved");
-
- if (!success)
- WARN("This intersection can not be handled in the right way");
-
- break;
- }
- }
- }
-}
-
-void estimate_distance(conj_graph_pack& gp,
- const io::SequencingLibrary<debruijn_config::DataSetData> &lib,
- const UnclusteredPairedIndexT& paired_index,
- PairedIndexT& clustered_index,
- PairedIndexT& scaffolding_index) {
- using debruijn_graph::estimation_mode;
-
- const debruijn_config& config = cfg::get();
- size_t delta = size_t(lib.data().insert_size_deviation);
- size_t linkage_distance = size_t(config.de.linkage_distance_coeff * lib.data().insert_size_deviation);
- GraphDistanceFinder<Graph> dist_finder(gp.g, (size_t)math::round(lib.data().mean_insert_size), lib.data().read_length, delta);
- size_t max_distance = size_t(config.de.max_distance_coeff * lib.data().insert_size_deviation);
-
- std::function<double(int)> weight_function;
-
- if (config.est_mode == em_weighted || // in these cases we need a weight function
- config.est_mode == em_smoothing || // to estimate graph distances in the
- config.est_mode == em_extensive) { // histogram
- if (lib.data().insert_size_distribution.size() == 0) {
- WARN("No insert size distribution found, stopping distance estimation");
- return;
- }
-
- WeightDEWrapper wrapper(lib.data().insert_size_distribution, lib.data().mean_insert_size);
- DEBUG("Weight Wrapper Done");
- weight_function = std::bind(&WeightDEWrapper::CountWeight, wrapper, std::placeholders::_1);
- } else
- weight_function = UnityFunction;
-
-// PairInfoWeightFilter<Graph> filter(gp.g, config.de.filter_threshold);
- PairInfoWeightChecker<Graph> checker(gp.g, config.de.filter_threshold);
-
- INFO("Weight Filter Done");
-
- switch (config.est_mode) {
- case em_simple: {
- const AbstractDistanceEstimator<Graph>&
- estimator =
- DistanceEstimator<Graph>(gp.g, paired_index, dist_finder,
- linkage_distance, max_distance);
-
- estimate_with_estimator<Graph>(gp.g, estimator, checker, clustered_index);
- break;
- }
- case em_weighted: {
- const AbstractDistanceEstimator<Graph>&
- estimator =
- WeightedDistanceEstimator<Graph>(gp.g, paired_index,
- dist_finder, weight_function, linkage_distance, max_distance);
-
- estimate_with_estimator<Graph>(gp.g, estimator, checker, clustered_index);
- break;
- }
- case em_extensive: {
- const AbstractDistanceEstimator<Graph>&
- estimator =
- ExtensiveDistanceEstimator<Graph>(gp.g, paired_index,
- dist_finder, weight_function, linkage_distance, max_distance);
-
- estimate_with_estimator<Graph>(gp.g, estimator, checker, clustered_index);
- break;
- }
- case em_smoothing: {
- const AbstractDistanceEstimator<Graph>&
- estimator =
- SmoothingDistanceEstimator<Graph>(gp.g, paired_index,
- dist_finder, weight_function, linkage_distance, max_distance,
- config.ade.threshold,
- config.ade.range_coeff,
- config.ade.delta_coeff, config.ade.cutoff,
- config.ade.min_peak_points,
- config.ade.inv_density,
- config.ade.percentage,
- config.ade.derivative_threshold);
-
- estimate_with_estimator<Graph>(gp.g, estimator, checker, clustered_index);
- break;
- }
- }
-
- INFO("Refining clustered pair information "); // this procedure checks, whether index
- RefinePairedInfo(gp.g, clustered_index); // contains intersecting paired info clusters,
- INFO("The refining of clustered pair information has been finished "); // if so, it resolves such conflicts.
-
- INFO("Improving paired information");
- PairInfoImprover<Graph> improver(gp.g, clustered_index, lib);
- improver.ImprovePairedInfo((unsigned) config.max_threads);
-
- if (cfg::get().pe_params.param_set.scaffolder_options.cluster_info) {
- INFO("Filling scaffolding index");
-
- double is_var = lib.data().insert_size_deviation;
- size_t delta = size_t(is_var);
- size_t linkage_distance = size_t(cfg::get().de.linkage_distance_coeff * is_var);
- GraphDistanceFinder<Graph> dist_finder(gp.g, (size_t) math::round(lib.data().mean_insert_size),
- lib.data().read_length, delta);
- size_t max_distance = size_t(cfg::get().de.max_distance_coeff_scaff * is_var);
- std::function<double(int)> weight_function;
-
- DEBUG("Retaining insert size distribution for it");
- if (lib.data().insert_size_distribution.size() == 0) {
- WARN("The library will not be used for scaffolding");
- return;
- }
-
-
- WeightDEWrapper wrapper(lib.data().insert_size_distribution, lib.data().mean_insert_size);
- DEBUG("Weight Wrapper Done");
- weight_function = std::bind(&WeightDEWrapper::CountWeight, wrapper, std::placeholders::_1);
-
-// PairInfoWeightFilter<Graph> filter(gp.g, 0.);
- PairInfoWeightChecker<Graph> checker(gp.g, 0.);
- DEBUG("Weight Filter Done");
-
- const AbstractDistanceEstimator<Graph>& estimator =
- SmoothingDistanceEstimator<Graph>(gp.g, paired_index, dist_finder,
- weight_function, linkage_distance, max_distance,
- cfg::get().ade.threshold, cfg::get().ade.range_coeff,
- cfg::get().ade.delta_coeff, cfg::get().ade.cutoff,
- cfg::get().ade.min_peak_points, cfg::get().ade.inv_density,
- cfg::get().ade.percentage,
- cfg::get().ade.derivative_threshold, true);
- estimate_with_estimator<Graph>(gp.g, estimator, checker, scaffolding_index);
- }
-}
-
-void DistanceEstimation::run(conj_graph_pack &gp, const char*) {
- for (size_t i = 0; i < cfg::get().ds.reads.lib_count(); ++i)
- if (cfg::get().ds.reads[i].type() == io::LibraryType::PairedEnd) {
- if (cfg::get().ds.reads[i].data().mean_insert_size != 0.0) {
- INFO("Processing library #" << i);
- estimate_distance(gp, cfg::get().ds.reads[i], gp.paired_indices[i], gp.clustered_indices[i], gp.scaffolding_indices[i]);
- }
- gp.paired_indices[i].Clear();
- }
-}
-
-}
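
For reference, a sketch of the estimator/checker pattern used throughout the file removed above, in its simplest (em_simple) form and as it would appear inside that translation unit; the numeric thresholds are illustrative, not the configured SPAdes values.

    using namespace debruijn_graph;
    using namespace omnigraph::de;

    void SimpleEstimationSketch(const Graph &g,
                                const UnclusteredPairedIndexT &paired_index,
                                PairedIndexT &clustered_index,
                                size_t insert_size, size_t read_length, size_t delta) {
        GraphDistanceFinder<Graph> dist_finder(g, insert_size, read_length, delta);
        DistanceEstimator<Graph> estimator(g, paired_index, dist_finder,
                                           /*linkage_distance*/ 10, /*max_distance*/ 100);
        // The weight checker discards weak pair info after estimation.
        PairInfoWeightChecker<Graph> checker(g, /*weight_threshold*/ 0.);
        estimate_with_estimator<Graph>(g, estimator, checker, clustered_index);
    }
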
diff --git a/src/debruijn/distance_estimation.hpp b/src/debruijn/distance_estimation.hpp
deleted file mode 100644
index b9d8158..0000000
--- a/src/debruijn/distance_estimation.hpp
+++ /dev/null
@@ -1,24 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include "stage.hpp"
-
-namespace debruijn_graph {
-
-class DistanceEstimation : public spades::AssemblyStage {
- public:
- DistanceEstimation(bool preliminary = false)
- : AssemblyStage(preliminary ? "Preliminary Distance Estimation" : "Distance Estimation",
- preliminary ? "distance_estimation_preliminary" : "distance_estimation") {}
-
- void run(conj_graph_pack &gp, const char*);
-};
-
-}
-
diff --git a/src/debruijn/early_simplification.hpp b/src/debruijn/early_simplification.hpp
deleted file mode 100644
index 4ee6f20..0000000
--- a/src/debruijn/early_simplification.hpp
+++ /dev/null
@@ -1,269 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-#include "standard.hpp"
-#include "indices/perfect_hash_map.hpp"
-#include "runtime_k.hpp"
-#include "mph_index/kmer_index.hpp"
-
-namespace debruijn_graph {
-
-class LinkCleaner {
-private:
- typedef DeBruijnExtensionIndex<> Index;
- typedef Index::KMer Kmer;
- typedef Index::KeyWithHash KeyWithHash;
- Index &index_;
-
- void CleanForwardLinks(KeyWithHash &kh, char i) {
- if(index_.CheckOutgoing(kh, i)) {
- KeyWithHash next_kh = index_.GetOutgoing(kh, i);
- if(!index_.CheckIncoming(next_kh, kh[0])) {
- index_.DeleteOutgoing(kh, i);
- }
- }
- }
-
- void CleanBackwardLinks(KeyWithHash &kh, char i) {
- if(index_.CheckIncoming(kh, i)) {
- KeyWithHash prev_kh = index_.GetIncoming(kh, i);
- if(!index_.CheckOutgoing(prev_kh, kh[index_.k() - 1])) {
- index_.DeleteIncoming(kh, i);
- }
- }
- }
-
-public:
- LinkCleaner(Index &index) : index_(index) {}
-
- //TODO make parallel
- void CleanLinks() {
- vector<Index::kmer_iterator> iters = index_.kmer_begin(10 * cfg::get().max_threads);
-# pragma omp parallel for schedule(guided)
- for(size_t i = 0; i < iters.size(); i++) {
- for (Index::kmer_iterator &it = iters[i]; it.good(); ++it) {
- KeyWithHash kh = index_.ConstructKWH(runtime_k::RtSeq(index_.k(), *it));
- if (kh.is_minimal()) {
- for (char i = 0; i < 4; i++) {
- CleanForwardLinks(kh, i);
- CleanBackwardLinks(kh, i);
- }
- }
- }
- }
- }
-};
-
-
-class EarlyTipClipper {
-private:
- typedef DeBruijnExtensionIndex<> Index;
- typedef Index::KMer Kmer;
- typedef Index::KeyWithHash KeyWithHash;
- Index &index_;
- size_t length_bound_;
-
-//Not optimal with respect to the number of large array queries (the one that contains adjacency masks). Should be OK though, assuming the cache works the way I think it does.
- size_t RemoveForward(KeyWithHash kh) {
- std::vector<KeyWithHash> tip;
- do {
- tip.push_back(kh);
- kh = index_.GetUniqueOutgoing(kh);
- } while (tip.size() < length_bound_ && index_.CheckUniqueIncoming(kh) && index_.CheckUniqueOutgoing(kh));
-
- if (!index_.CheckUniqueIncoming(kh)) {
- for (size_t i = 0; i < tip.size(); i++) {
- index_.IsolateVertex(tip[i]);
- }
- return tip.size();
- }
-
- return 0;
- }
-
- size_t RemoveBackward(KeyWithHash kh) {
- std::vector<KeyWithHash> tip;
- do {
- tip.push_back(kh);
- kh = index_.GetUniqueIncoming(kh);
- } while(tip.size() < length_bound_ && index_.CheckUniqueIncoming(kh) && index_.CheckUniqueOutgoing(kh));
-
- if (!index_.CheckUniqueOutgoing(kh)) {
- for (size_t i = 0; i < tip.size(); i++) {
- index_.IsolateVertex(tip[i]);
- }
- return tip.size();
- }
- return 0;
- }
-
- //TODO make parallel
- size_t RoughClipTips() {
- size_t result = 0;
- for (auto it = index_.kmer_begin(); it.good(); ++it) {
- KeyWithHash kh = index_.ConstructKWH(runtime_k::RtSeq(index_.k(), *it));
- if (index_.IsDeadEnd(kh) && index_.CheckUniqueIncoming(kh)) {
- result += RemoveBackward(kh);
- } else if(index_.IsDeadStart(kh) && index_.CheckUniqueOutgoing(kh)) {
- result += RemoveForward(kh);
- }
- }
- return result;
- }
-
-
-public:
- EarlyTipClipper(Index &index, size_t length_bound) :
- index_(index), length_bound_(length_bound) {}
-
- /*
- * Returns the number of (k+1)-mers removed by the tip clipper
- */
- size_t ClipTips() {
- INFO("Early tip clipping");
- size_t result = RoughClipTips();
- LinkCleaner(index_).CleanLinks();
- INFO(result << " " << (index_.k()+1) <<"-mers were removed by early tip clipper");
- return result;
- }
-protected:
- DECL_LOGGER("Early tip clipping");
-};
-
-
-class AlternativeEarlyTipClipper {
-private:
- typedef DeBruijnExtensionIndex<> Index;
- typedef Index::KMer Kmer;
- typedef Index::KeyWithHash KeyWithHash;
- Index &index_;
- size_t length_bound_;
-
- /*
- * This method starts from the k-mer that is second in the tip, counting from the junction vertex. It records all k-mers of the tip into the tip vector.
- * The method returns the length of the tip.
- * If the walk does not end as a tip, or if the tip is too long, the tip vector is cleared and an infinite length is returned.
- * Thus the tip vector contains only k-mers to be removed, while the returned length value indicates what happened.
- */
- size_t FindForward(KeyWithHash kh, vector<KeyWithHash> &tip) {
- while(tip.size() < length_bound_ && index_.CheckUniqueIncoming(kh) && index_.CheckUniqueOutgoing(kh)) {
- tip.push_back(kh);
- kh = index_.GetUniqueOutgoing(kh);
- }
- tip.push_back(kh);
- if(index_.CheckUniqueIncoming(kh) && index_.IsDeadEnd(kh)) {
- return tip.size();
- }
- tip.clear();
- return -1;
- }
-
- size_t FindBackward(KeyWithHash kh, vector<KeyWithHash> &tip) {
- while(tip.size() < length_bound_ && index_.CheckUniqueOutgoing(kh) && index_.CheckUniqueIncoming(kh)) {
- tip.push_back(kh);
- kh = index_.GetUniqueIncoming(kh);
- }
- tip.push_back(kh);
- if(index_.CheckUniqueOutgoing(kh) && index_.IsDeadStart(kh)) {
- return tip.size();
- }
- tip.clear();
- return -1;
- }
-
- size_t RemoveTip(vector<KeyWithHash > &tip) {
- for(size_t i = 0; i < tip.size(); i++)
- index_.IsolateVertex(tip[i]);
- return tip.size();
- }
-
- size_t RemoveTips(vector<vector<KeyWithHash > > tips, size_t max) {
- size_t result = 0;
- for(char c = 0; c < 4; c++) {
- if(tips[c].size() < max) {
- result += RemoveTip(tips[c]);
- }
- }
- return result;
- }
-
- size_t RemoveForward(KeyWithHash kh) {
- vector<vector<KeyWithHash >> tips;
- tips.resize(4);
- size_t max = 0;
- for(char c = 0; c < 4; c++) {
- if(index_.CheckOutgoing(kh, c)) {
- KeyWithHash khc = index_.GetOutgoing(kh, c);
- size_t len = FindForward(khc, tips[c]);
- if(len > max)
- max = len;
- }
- }
- return RemoveTips(tips, max);
- }
-
- size_t RemoveBackward(KeyWithHash kh) {
- vector<vector<KeyWithHash >> tips;
- tips.resize(4);
- size_t max = 0;
- for(char c = 0; c < 4; c++) {
- if(index_.CheckIncoming(kh, c)) {
- KeyWithHash khc = index_.GetIncoming(kh, c);
- size_t len = FindBackward(khc, tips[c]);
- if(len > max)
- max = len;
- }
- }
- return RemoveTips(tips, max);
- }
-
- //TODO make parallel
- size_t RoughClipTips() {
- vector<Index::kmer_iterator> iters = index_.kmer_begin(10 * cfg::get().max_threads);
- vector<size_t> result(iters.size());
-# pragma omp parallel for schedule(guided)
- for(size_t i = 0; i < iters.size(); i++) {
- for(Index::kmer_iterator &it = iters[i]; it.good(); ++it) {
- KeyWithHash kh = index_.ConstructKWH(runtime_k::RtSeq(index_.k(), *it));
- if(kh.is_minimal()) {
- if (index_.OutgoingEdgeCount(kh) >= 2) {
- result[i] += RemoveForward(kh);
- }
- if (index_.IncomingEdgeCount(kh) >= 2) {
- result[i] += RemoveBackward(kh);
- }
- }
- }
- }
- size_t sum = 0;
- for(size_t i = 0; i < result.size(); i++)
- sum += result[i];
- return sum;
- }
-
-
-public:
- AlternativeEarlyTipClipper(Index &index, size_t length_bound) : index_(index), length_bound_(length_bound) {
- }
-
- /*
- * Returns the number of (k+1)-mers removed by the tip clipper
- */
- size_t ClipTips() {
- INFO("Early tip clipping");
- size_t result = RoughClipTips();
- LinkCleaner(index_).CleanLinks();
- INFO(result << " " << (index_.k()+1) <<"-mers were removed by early tip clipper");
- return result;
- }
-protected:
- DECL_LOGGER("Early tip clipping");
-};
-
-}
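
A minimal standalone sketch of the tip-clipping walk implemented by EarlyTipClipper above, using a toy directed graph of plain ints instead of the SPAdes extension index (all identifiers below are illustrative, not part of the codebase):

// Starting from a dead-end node, follow unique predecessors up to a length
// bound; if the walk reaches a junction (a node with alternative outgoing
// edges), the visited nodes form a tip and would be removed.
#include <iostream>
#include <unordered_map>
#include <vector>

struct ToyGraph {
    std::unordered_map<int, std::vector<int>> out, in;
    void AddEdge(int u, int v) { out[u].push_back(v); in[v].push_back(u); }
    size_t OutDeg(int v) const { auto it = out.find(v); return it == out.end() ? 0 : it->second.size(); }
    size_t InDeg(int v) const { auto it = in.find(v); return it == in.end() ? 0 : it->second.size(); }
};

// Returns the nodes of the tip hanging backwards from dead end 'start', or an
// empty vector if the walk does not end at a junction within 'bound' steps.
std::vector<int> FindTip(const ToyGraph& g, int start, size_t bound) {
    std::vector<int> tip;
    int v = start;
    while (tip.size() < bound && g.InDeg(v) == 1 && g.OutDeg(v) <= 1) {
        tip.push_back(v);
        v = g.in.at(v)[0];
    }
    if (g.OutDeg(v) > 1) return tip;  // attached to a junction: a real tip
    return {};
}

int main() {
    ToyGraph g;
    // Main path 1->2->3->4 with a short dead-end branch 2->5->6.
    g.AddEdge(1, 2); g.AddEdge(2, 3); g.AddEdge(3, 4);
    g.AddEdge(2, 5); g.AddEdge(5, 6);
    std::cout << "tip length: " << FindTip(g, 6, 10).size() << "\n";  // 2
}
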
diff --git a/src/debruijn/edge_index.hpp b/src/debruijn/edge_index.hpp
deleted file mode 100644
index 5496114..0000000
--- a/src/debruijn/edge_index.hpp
+++ /dev/null
@@ -1,113 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include "openmp_wrapper.h"
-
-#include "omni/omni_utils.hpp"
-#include "adt/kmer_map.hpp"
-
-#include "debruijn_graph.hpp"
-#include "standard.hpp"
-#include "indices/edge_index_builders.hpp"
-
-namespace debruijn_graph {
-
-/**
- * EdgeIndex is a structure that stores information about the location of certain k-mers in the graph. It delegates all
- * container procedures to inner_index_ and all handling procedures to
- * renewer_, which is a DataHashRenewer.
- * @see DeBruijnKMerIndex
- * @see DataHashRenewer
- */
-//fixme template params
-template<class Graph, class Seq /*= runtime_k::RtSeq*/,
- class Index /*= KmerFreeEdgeIndex<Graph, Seq>*/>
-class EdgeIndex: public GraphActionHandler<Graph> {
-
-public:
- typedef typename Graph::EdgeId EdgeId;
- typedef Index InnerIndexT;
- typedef Graph GraphT;
- typedef typename Index::KMer KMer;
- typedef typename Index::KMerIdx KMerIdx;
- typedef typename Index::Value Value;
-
-private:
- Index inner_index_;
- EdgeInfoUpdater<Index, Graph> updater_;
- bool delete_index_;
-
-public:
-
- EdgeIndex(const Graph& g, const std::string &workdir)
- : GraphActionHandler<Graph>(g, "EdgeIndex"),
- inner_index_(g, workdir),
- updater_(g, inner_index_),
- delete_index_(true) {
- }
-
- virtual ~EdgeIndex() {
- TRACE("~EdgeIndex OK")
- }
-
- Index &inner_index() {
- return inner_index_;
- }
-
- size_t k() const {
- return inner_index_.k();
- }
-
- const Index &inner_index() const {
- VERIFY(this->IsAttached());
- return inner_index_;
- }
-
- virtual void HandleAdd(EdgeId e) {
- updater_.UpdateKmers(e);
- }
-
- virtual void HandleDelete(EdgeId e) {
- updater_.DeleteKmers(e);
- }
-
- bool contains(const KMer& kmer) const {
- VERIFY(this->IsAttached());
- return inner_index_.contains(inner_index_.ConstructKWH(kmer));
- }
-
- const pair<EdgeId, size_t> get(const KMer& kmer) const {
- VERIFY(this->IsAttached());
- auto kwh = inner_index_.ConstructKWH(kmer);
- if (!inner_index_.contains(kwh)) {
- return make_pair(EdgeId(0), -1u);
- } else {
- EdgeInfo<EdgeId> entry = inner_index_.get_value(kwh);
- return std::make_pair(entry.edge_id, (size_t)entry.offset);
- }
- }
-
- void Refill() {
- clear();
- typedef typename EdgeIndexHelper<InnerIndexT>::GraphPositionFillingIndexBuilderT IndexBuilder;
- //also makes an update!
- //todo pass appropriate 3-rd arg
- IndexBuilder().BuildIndexFromGraph(inner_index_, this->g());
- }
-
- void Update() {
- updater_.UpdateAll();
- }
-
- void clear() {
- inner_index_.clear();
- }
-
-};
-}
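
The lookup that EdgeIndex exposes maps a k-mer to the edge containing it plus the offset inside that edge. A rough usage sketch with a plain hash map standing in for the perfect-hash inner index (hypothetical code, not the SPAdes API):

#include <cstddef>
#include <iostream>
#include <string>
#include <unordered_map>
#include <utility>

int main() {
    // k-mer -> (edge id, offset of the k-mer inside that edge)
    std::unordered_map<std::string, std::pair<int, std::size_t>> index;
    index["ACGT"] = {42, 7};

    const std::string kmer = "ACGT";
    auto it = index.find(kmer);
    if (it != index.end())
        std::cout << "edge " << it->second.first
                  << " offset " << it->second.second << "\n";
    else
        std::cout << "k-mer not present\n";  // EdgeIndex::get returns (EdgeId(0), -1u)
}
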
diff --git a/src/debruijn/gap_closer.cpp b/src/debruijn/gap_closer.cpp
deleted file mode 100644
index 616d631..0000000
--- a/src/debruijn/gap_closer.cpp
+++ /dev/null
@@ -1,505 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#include "gap_closer.hpp"
-
-#include "standard.hpp"
-
-#include "omni/omni_tools.hpp"
-#include "io/io_helper.hpp"
-#include "omni/visualization/graph_labeler.hpp"
-#include "dataset_readers.hpp"
-#include "read_converter.hpp"
-#include "sequence_mapper.hpp"
-#include "short_read_mapper.hpp"
-#include "adt/kmer_set.hpp"
-
-#include "de/paired_info.hpp"
-
-#include <set>
-#include <stack>
-
-namespace debruijn_graph {
-
-template<class Graph, class SequenceMapper>
-class GapCloserPairedIndexFiller {
- private:
- typedef typename Graph::EdgeId EdgeId;
- typedef typename Graph::VertexId VertexId;
- const Graph &graph_;
- const SequenceMapper& mapper_;
-
- size_t CorrectLength(Path<EdgeId> path, size_t idx) const {
- size_t answer = graph_.length(path[idx]);
- if (idx == 0)
- answer -= path.start_pos();
- if (idx == path.size() - 1)
- answer -= graph_.length(path[idx]) - path.end_pos();
- return answer;
- }
-
- template<typename PairedRead>
- void ProcessPairedRead(omnigraph::de::PairedInfoBuffer<Graph> &paired_index,
- const PairedRead& p_r,
- const std::unordered_map<EdgeId, pair<EdgeId, int> >& OutTipMap,
- const std::unordered_map<EdgeId, pair<EdgeId, int> >& InTipMap) const {
- Sequence read1 = p_r.first().sequence();
- Sequence read2 = p_r.second().sequence();
-
- Path<EdgeId> path1 = mapper_.MapSequence(read1).path();
- Path<EdgeId> path2 = mapper_.MapSequence(read2).path();
- for (size_t i = 0; i < path1.size(); ++i) {
- auto OutTipIter = OutTipMap.find(path1[i]);
- if (OutTipIter != OutTipMap.end()) {
- for (size_t j = 0; j < path2.size(); ++j) {
- auto InTipIter = InTipMap.find(path2[j]);
- if (InTipIter != InTipMap.end()) {
- auto e1 = OutTipIter->second.first;
- auto e2 = InTipIter->second.first;
- paired_index.SwapConj(e1, e2);
- paired_index.Add(e1, e2, omnigraph::de::RawPoint(1000000., 1.));
- }
- }
- }
- }
- }
-
- void PrepareShiftMaps(std::unordered_map<EdgeId, pair<EdgeId, int> >& OutTipMap,
- std::unordered_map<EdgeId, pair<EdgeId, int> >& InTipMap) {
- std::stack<pair<EdgeId, int>> edge_stack;
- for (auto iterator = graph_.ConstEdgeBegin(); !iterator.IsEnd();) {
- EdgeId edge = *iterator;
- if (graph_.IncomingEdgeCount(graph_.EdgeStart(edge)) == 0) {
- InTipMap.insert(std::make_pair(edge, std::make_pair(edge, 0)));
- edge_stack.push(std::make_pair(edge, 0));
- while (edge_stack.size() > 0) {
- pair<EdgeId, int> checking_pair = edge_stack.top();
- edge_stack.pop();
- if (graph_.IncomingEdgeCount(graph_.EdgeEnd(checking_pair.first)) == 1) {
- VertexId v = graph_.EdgeEnd(checking_pair.first);
- if (graph_.OutgoingEdgeCount(v)) {
- for (auto I = graph_.out_begin(v), E = graph_.out_end(v); I != E; ++I) {
- EdgeId Cur_edge = *I;
- InTipMap.insert(
- std::make_pair(Cur_edge,
- std::make_pair(edge,
- graph_.length(checking_pair.first) + checking_pair.second)));
- edge_stack.push(
- std::make_pair(Cur_edge,
- graph_.length(checking_pair.first) + checking_pair.second));
-
- }
- }
- }
- }
- }
-
- if (graph_.OutgoingEdgeCount(graph_.EdgeEnd(edge)) == 0) {
- OutTipMap.insert(std::make_pair(edge, std::make_pair(edge, 0)));
- edge_stack.push(std::make_pair(edge, 0));
- while (edge_stack.size() > 0) {
- std::pair<EdgeId, int> checking_pair = edge_stack.top();
- edge_stack.pop();
- if (graph_.OutgoingEdgeCount(graph_.EdgeStart(checking_pair.first)) == 1) {
- if (graph_.IncomingEdgeCount(graph_.EdgeStart(checking_pair.first))) {
- for (EdgeId e : graph_.IncomingEdges(graph_.EdgeStart(checking_pair.first))) {
- OutTipMap.insert(std::make_pair(e,
- std::make_pair(edge,
- graph_.length(e) + checking_pair.second)));
- edge_stack.push(std::make_pair(e,
- graph_.length(e) + checking_pair.second));
- }
- }
- }
-
- }
- }
- ++iterator;
- }
- }
-
- template<class Streams>
- void MapReads(omnigraph::de::PairedInfoIndexT<Graph> &paired_index, Streams& streams,
- const std::unordered_map<EdgeId, pair<EdgeId, int> >& OutTipMap,
- const std::unordered_map<EdgeId, pair<EdgeId, int> >& InTipMap) const {
- INFO("Processing paired reads (takes a while)");
-
- size_t nthreads = streams.size();
- omnigraph::de::PairedInfoBuffersT<Graph> buffer_pi(graph_, nthreads);
-
- size_t counter = 0;
-# pragma omp parallel for num_threads(nthreads) reduction(+ : counter)
- for (size_t i = 0; i < nthreads; ++i) {
- typename Streams::ReadT r;
- auto& stream = streams[i];
- stream.reset();
-
- while (!stream.eof()) {
- stream >> r;
- ++counter;
- ProcessPairedRead(buffer_pi[i], r, OutTipMap, InTipMap);
- }
- }
-
- INFO("Used " << counter << " paired reads");
-
- INFO("Merging paired indices");
- for (auto& index: buffer_pi) {
- paired_index.Merge(index);
- index.Clear();
- }
- }
-
- public:
-
- GapCloserPairedIndexFiller(const Graph &graph, const SequenceMapper& mapper)
- : graph_(graph), mapper_(mapper) {}
-
- /**
- * Reads paired data from the streams, maps it to the assembly graph and stores the result in the provided paired index.
- */
- template<class Streams>
- void FillIndex(omnigraph::de::PairedInfoIndexT<Graph> &paired_index, Streams& streams) {
- std::unordered_map<EdgeId, pair<EdgeId, int> > OutTipMap, InTipMap;
-
- INFO("Preparing shift maps");
- PrepareShiftMaps(OutTipMap, InTipMap);
-
- MapReads(paired_index, streams, OutTipMap, InTipMap);
- }
-
-};
-
-template<class Graph, class SequenceMapper>
-class GapCloser {
- public:
- typedef std::function<bool (const Sequence&)> SequenceCheckF;
- private:
- typedef typename Graph::EdgeId EdgeId;
- typedef typename Graph::VertexId VertexId;
-
- Graph& g_;
- int k_;
- omnigraph::de::PairedInfoIndexT<Graph>& tips_paired_idx_;
- const size_t min_intersection_;
- const size_t hamming_dist_bound_;
- const int init_gap_val_;
- const omnigraph::de::DEWeight weight_threshold_;
-
- SequenceMapper mapper_;
- runtime_k::KmerSet new_kmers_;
-
- bool CheckNoKmerClash(const Sequence& s) {
- runtime_k::RtSeq kmer(k_ + 1, s);
- kmer >>= 'A';
- for (size_t i = k_; i < s.size(); ++i) {
- kmer <<= s[i];
- if (new_kmers_.contains(kmer)) {
- return false;
- }
- }
- std::vector<EdgeId> path = mapper_.MapSequence(s).simple_path();
- return path.empty();
- }
-
- std::vector<size_t> DiffPos(const Sequence& s1, const Sequence& s2) const {
- VERIFY(s1.size() == s2.size());
- std::vector<size_t> answer;
- for (size_t i = 0; i < s1.size(); ++i)
- if (s1[i] != s2[i])
- answer.push_back(i);
- return answer;
- }
-
- size_t HammingDistance(const Sequence& s1, const Sequence& s2) const {
- VERIFY(s1.size() == s2.size());
- size_t dist = 0;
- for (size_t i = 0; i < s1.size(); ++i)
- if (s1[i] != s2[i])
- dist++;
- return dist;
- }
-
- // size_t HammingDistance(const Sequence& s1, const Sequence& s2) const {
- // return DiffPos(s1, s2).size();
- // }
-
- vector<size_t> PosThatCanCorrect(size_t overlap_length/*in nucls*/,
- const vector<size_t>& mismatch_pos, size_t edge_length/*in nucls*/,
- bool left_edge) const {
- TRACE("Try correct left edge " << left_edge);
- TRACE("Overlap length " << overlap_length);
- TRACE("Edge length " << edge_length);
- TRACE("Mismatches " << mismatch_pos);
-
- vector < size_t > answer;
- for (size_t i = 0; i < mismatch_pos.size(); ++i) {
- size_t relative_mm_pos =
- left_edge ?
- mismatch_pos[i] :
- overlap_length - 1 - mismatch_pos[i];
- if (overlap_length - relative_mm_pos + g_.k() < edge_length)
- //can correct mismatch
- answer.push_back(mismatch_pos[i]);
- }
- TRACE("Can correct mismatches: " << answer);
- return answer;
- }
-
- //todo write easier
- bool CanCorrectLeft(EdgeId e, int overlap, const vector<size_t>& mismatch_pos) const {
- return PosThatCanCorrect(overlap, mismatch_pos, g_.length(e) + g_.k(), true).size() == mismatch_pos.size();
- }
-
- //todo write easier
- bool CanCorrectRight(EdgeId e, int overlap,
- const vector<size_t>& mismatch_pos) const {
- return PosThatCanCorrect(overlap, mismatch_pos, g_.length(e) + g_.k(), false).size() == mismatch_pos.size();
- }
-
- bool MatchesEnd(const Sequence& long_seq, const Sequence& short_seq, bool from_begin) const {
- return from_begin ? long_seq.Subseq(0, short_seq.size()) == short_seq
- : long_seq.Subseq(long_seq.size() - short_seq.size()) == short_seq;
- }
-
- void AddEdge(VertexId start, VertexId end, const Sequence& s) {
- runtime_k::RtSeq kmer(k_ + 1, s);
- kmer >>= 'A';
- for (size_t i = k_; i < s.size(); ++i) {
- kmer <<= s[i];
- new_kmers_.insert(kmer);
- new_kmers_.insert(!kmer);
- }
- g_.AddEdge(start, end, s);
- }
-
- bool CorrectLeft(EdgeId first, EdgeId second, int overlap, const vector<size_t>& diff_pos) {
- DEBUG("Can correct first with sequence from second.");
- Sequence new_sequence = g_.EdgeNucls(first).Subseq(g_.length(first) - overlap + diff_pos.front(), g_.length(first) + k_ - overlap)
- + g_.EdgeNucls(second).First(k_);
- DEBUG("Checking new k+1-mers.");
- if (CheckNoKmerClash(new_sequence)) {
- DEBUG("Check ok.");
- DEBUG("Splitting first edge.");
- pair<EdgeId, EdgeId> split_res = g_.SplitEdge(first, g_.length(first) - overlap + diff_pos.front());
- first = split_res.first;
- tips_paired_idx_.Remove(split_res.second);
- DEBUG("Adding new edge.");
- VERIFY(MatchesEnd(new_sequence, g_.VertexNucls(g_.EdgeEnd(first)), true));
- VERIFY(MatchesEnd(new_sequence, g_.VertexNucls(g_.EdgeStart(second)), false));
- AddEdge(g_.EdgeEnd(first), g_.EdgeStart(second),
- new_sequence);
- return true;
- } else {
- DEBUG("Check fail.");
- DEBUG("Filled k-mer already present in graph");
- return false;
- }
- }
-
- bool CorrectRight(EdgeId first, EdgeId second, int overlap, const vector<size_t>& diff_pos) {
- DEBUG("Can correct second with sequence from first.");
- Sequence new_sequence = g_.EdgeNucls(first).Last(k_) + g_.EdgeNucls(second).Subseq(overlap, diff_pos.back() + 1 + k_);
- DEBUG("Checking new k+1-mers.");
- if (CheckNoKmerClash(new_sequence)) {
- DEBUG("Check ok.");
- DEBUG("Splitting second edge.");
- pair<EdgeId, EdgeId> split_res = g_.SplitEdge(second, diff_pos.back() + 1);
- second = split_res.second;
- tips_paired_idx_.Remove(split_res.first);
- DEBUG("Adding new edge.");
- VERIFY(MatchesEnd(new_sequence, g_.VertexNucls(g_.EdgeEnd(first)), true));
- VERIFY(MatchesEnd(new_sequence, g_.VertexNucls(g_.EdgeStart(second)), false));
-
- AddEdge(g_.EdgeEnd(first), g_.EdgeStart(second),
- new_sequence);
- return true;
- } else {
- DEBUG("Check fail.");
- DEBUG("Filled k-mer already present in graph");
- return false;
- }
- }
-
- bool HandlePositiveHammingDistanceCase(EdgeId first, EdgeId second, int overlap) {
- DEBUG("Match was imperfect. Trying to correct one of the tips");
- vector<size_t> diff_pos = DiffPos(g_.EdgeNucls(first).Last(overlap),
- g_.EdgeNucls(second).First(overlap));
- if (CanCorrectLeft(first, overlap, diff_pos)) {
- return CorrectLeft(first, second, overlap, diff_pos);
- } else if (CanCorrectRight(second, overlap, diff_pos)) {
- return CorrectRight(first, second, overlap, diff_pos);
- } else {
- DEBUG("Can't correct tips due to the graph structure");
- return false;
- }
- }
-
- bool HandleSimpleCase(EdgeId first, EdgeId second, int overlap) {
- DEBUG("Match was perfect. No correction needed");
- //strange info guard
- VERIFY(overlap <= k_);
- if (overlap == k_) {
- DEBUG("Tried to close zero gap");
- return false;
- }
- //old code
- Sequence edge_sequence = g_.EdgeNucls(first).Last(k_)
- + g_.EdgeNucls(second).Subseq(overlap, k_);
- if (CheckNoKmerClash(edge_sequence)) {
- DEBUG("Gap filled: Gap size = " << k_ - overlap << " Result seq "
- << edge_sequence.str());
- AddEdge(g_.EdgeEnd(first), g_.EdgeStart(second), edge_sequence);
- return true;
- } else {
- DEBUG("Filled k-mer already present in graph");
- return false;
- }
- }
-
- bool ProcessPair(EdgeId first, EdgeId second) {
- TRACE("Processing edges " << g_.str(first) << " and " << g_.str(second));
- TRACE("first " << g_.EdgeNucls(first) << " second " << g_.EdgeNucls(second));
-
- if (cfg::get().avoid_rc_connections &&
- (first == g_.conjugate(second) || first == second)) {
- DEBUG("Trying to join conjugate edges " << g_.int_id(first));
- return false;
- }
- //may be negative!
- int gap = max(init_gap_val_,
- -1 * (int)(min(g_.length(first), g_.length(second)) - 1));
-
- Sequence seq1 = g_.EdgeNucls(first);
- Sequence seq2 = g_.EdgeNucls(second);
- TRACE("Checking possible gaps from " << gap << " to " << k_ - min_intersection_);
- for (; gap <= k_ - (int)min_intersection_; ++gap) {
- int overlap = k_ - gap;
- size_t hamming_distance = HammingDistance(g_.EdgeNucls(first).Last(overlap)
- , g_.EdgeNucls(second).First(overlap));
- if (hamming_distance <= hamming_dist_bound_) {
- DEBUG("For edges " << g_.str(first) << " and " << g_.str(second)
- << ". For gap value " << gap << " (overlap " << overlap << "bp) hamming distance was " << hamming_distance);
- // DEBUG("Sequences of distance " << tip_distance << " :"
- // << seq1.Subseq(seq1.size() - k).str() << " "
- // << seq2.Subseq(0, k).str());
-
- if (hamming_distance > 0) {
- return HandlePositiveHammingDistanceCase(first, second, overlap);
- } else {
- return HandleSimpleCase(first, second, overlap);
- }
- }
- }
- return false;
- }
-
- public:
- //TODO extract methods
- void CloseShortGaps() {
- INFO("Closing short gaps");
- size_t gaps_filled = 0;
- size_t gaps_checked = 0;
- for (auto edge = g_.SmartEdgeBegin(); !edge.IsEnd(); ++edge) {
- EdgeId first_edge = *edge;
- for (auto i : tips_paired_idx_.Get(first_edge)) {
- EdgeId second_edge = i.first;
- if (first_edge == second_edge)
- continue;
-
- if (!g_.IsDeadEnd(g_.EdgeEnd(first_edge)) || !g_.IsDeadStart(g_.EdgeStart(second_edge))) {
- // WARN("Topologically wrong tips");
- continue;
- }
-
- bool closed = false;
- for (auto point : i.second) {
- if (math::ls(point.d, 0))
- continue;
- if (math::ls(point.weight, weight_threshold_))
- continue;
-
- ++gaps_checked;
- closed = ProcessPair(first_edge, second_edge);
- if (closed) {
- ++gaps_filled;
- break;
- }
- }
- if (closed)
- break;
- } // second edge
- } // first edge
-
- INFO("Closing short gaps complete: filled " << gaps_filled
- << " gaps after checking " << gaps_checked
- << " candidates");
- omnigraph::CompressAllVertices(g_);
- }
-
- GapCloser(Graph& g, omnigraph::de::PairedInfoIndexT<Graph>& tips_paired_idx,
- size_t min_intersection, double weight_threshold,
- const SequenceMapper& mapper,
- size_t hamming_dist_bound = 0 /*min_intersection_ / 5*/)
- : g_(g),
- k_((int) g_.k()),
- tips_paired_idx_(tips_paired_idx),
- min_intersection_(min_intersection),
- hamming_dist_bound_(hamming_dist_bound),
- init_gap_val_(-10),
- weight_threshold_(weight_threshold),
- mapper_(mapper),
- new_kmers_(k_ + 1) {
- VERIFY(min_intersection_ < g_.k());
- DEBUG("weight_threshold=" << weight_threshold_);
- DEBUG("min_intersect=" << min_intersection_);
- DEBUG("paired_index size=" << tips_paired_idx_.size());
- }
-
- private:
- DECL_LOGGER("GapCloser");
-};
-
-template<class Streams>
-void CloseGaps(conj_graph_pack& gp, Streams& streams) {
- typedef NewExtendedSequenceMapper<Graph, Index> Mapper;
- auto mapper = MapperInstance(gp);
- GapCloserPairedIndexFiller<Graph, Mapper> gcpif(gp.g, *mapper);
- PairedIndexT tips_paired_idx(gp.g);
- gcpif.FillIndex(tips_paired_idx, streams);
- GapCloser<Graph, Mapper> gap_closer(gp.g, tips_paired_idx,
- cfg::get().gc.minimal_intersection, cfg::get().gc.weight_threshold,
- *mapper);
- gap_closer.CloseShortGaps();
-}
-
-void GapClosing::run(conj_graph_pack &gp, const char*) {
-
- bool pe_exist = false;
- for (size_t i = 0; i < cfg::get().ds.reads.lib_count(); ++i) {
- if (cfg::get().ds.reads[i].type() == io::LibraryType::PairedEnd) {
- pe_exist = true;
- break;
- }
- }
- if (!pe_exist) {
- INFO("No paired-end libraries exist, skipping gap closer");
- return;
- }
- gp.EnsureIndex();
-
- for (size_t i = 0; i < cfg::get().ds.reads.lib_count(); ++i) {
- if (cfg::get().ds.reads[i].type() == io::LibraryType::PairedEnd) {
- auto streams = paired_binary_readers(cfg::get().ds.reads[i], false, 0);
- CloseGaps(gp, streams);
- }
- }
-}
-
-}
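
The core of GapCloser::ProcessPair above tries overlaps from largest to smallest and accepts the first one whose Hamming distance between the two tip ends stays within the bound. A self-contained sketch of that search on plain strings (simplified; not the actual SPAdes types):

#include <algorithm>
#include <iostream>
#include <string>

static size_t Hamming(const std::string& a, const std::string& b) {
    size_t d = 0;
    for (size_t i = 0; i < a.size(); ++i)
        if (a[i] != b[i]) ++d;
    return d;
}

// Returns the chosen overlap length, or -1 if no acceptable overlap exists.
int FindOverlap(const std::string& left, const std::string& right,
                size_t min_overlap, size_t hamming_bound) {
    size_t max_overlap = std::min(left.size(), right.size());
    for (size_t ov = max_overlap; ov >= min_overlap; --ov) {
        if (Hamming(left.substr(left.size() - ov), right.substr(0, ov)) <= hamming_bound)
            return int(ov);
        if (ov == min_overlap) break;  // avoid size_t underflow when min_overlap == 0
    }
    return -1;
}

int main() {
    // The last 4 characters of the left tip match the first 4 of the right tip.
    std::cout << FindOverlap("TTTTACGT", "ACGTCCCC", 3, 0) << "\n";  // 4
}
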
diff --git a/src/debruijn/gap_closer.hpp b/src/debruijn/gap_closer.hpp
deleted file mode 100644
index 980e052..0000000
--- a/src/debruijn/gap_closer.hpp
+++ /dev/null
@@ -1,33 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#ifndef GAP_CLOSER_HPP_
-#define GAP_CLOSER_HPP_
-
-#include "stage.hpp"
-
-namespace debruijn_graph {
-
-class GapClosing : public spades::AssemblyStage {
- public:
- GapClosing(const char* id)
- : AssemblyStage("Gap Closer", id) {}
-
- void run(conj_graph_pack &gp, const char*);
-};
-
-}
-
-
-
-#endif /* GAP_CLOSER_HPP_ */
diff --git a/src/debruijn/genome_consistance_checker.cpp b/src/debruijn/genome_consistance_checker.cpp
deleted file mode 100644
index cc954d5..0000000
--- a/src/debruijn/genome_consistance_checker.cpp
+++ /dev/null
@@ -1,236 +0,0 @@
-#include "genome_consistance_checker.hpp"
-#include "debruijn_graph.hpp"
-#include <algorithm>
-#include <limits>
-namespace debruijn_graph {
-using omnigraph::MappingRange;
-using namespace std;
-
-//gap or overlap size. WITHOUT SIGN!
-static size_t gap(const Range &a, const Range &b) {
- return max(a.end_pos, b.start_pos) - min (a.end_pos, b.start_pos);
-}
-bool GenomeConsistenceChecker::consequent(const Range &mr1, const Range &mr2) const{
- if (mr1.end_pos > mr2.start_pos + absolute_max_gap_)
- return false;
- if (mr1.end_pos + absolute_max_gap_ < mr2.start_pos)
- return false;
- return true;
-
-}
-bool GenomeConsistenceChecker::consequent(const MappingRange &mr1, const MappingRange &mr2) const {
- //do not want to think about handling gaps near 0 position.
- if (!consequent(mr1.initial_range, mr2.initial_range) || !consequent(mr1.mapped_range, mr2.mapped_range))
- return false;
- size_t initial_gap = gap(mr1.initial_range, mr2.initial_range);
- size_t mapped_gap = gap(mr1.mapped_range, mr2.mapped_range);
- size_t max_gap = max(initial_gap, mapped_gap);
- if ( max_gap > relative_max_gap_* double (max (min(mr1.initial_range.size(), mr1.mapped_range.size()), min(mr2.initial_range.size(), mr2.mapped_range.size()))))
- return false;
- return true;
-}
-
-PathScore GenomeConsistenceChecker::CountMisassemblies(const BidirectionalPath &path) const {
- PathScore straight = CountMisassembliesWithStrand(path, "0");
- PathScore reverse = CountMisassembliesWithStrand(path, "1");
- size_t total_length = path.LengthAt(0);
-//TODO: constant;
- if (total_length > std::max(straight.mapped_length, reverse.mapped_length) * 2) {
- DEBUG("mapped less than half of the path, skipping");
- return PathScore(0,0,0);
- } else {
- if (straight.mapped_length > reverse.mapped_length) {
- return straight;
- } else {
- return reverse;
- }
- }
-}
-
-void GenomeConsistenceChecker::SpellGenome() {
- vector<pair<EdgeId, MappingRange> > to_sort;
- for(auto e: storage_) {
- if (excluded_unique_.find(e) == excluded_unique_.end() ) {
- set<MappingRange> mappings = gp_.edge_pos.GetEdgePositions(e, "fxd0");
- if (mappings.size() > 1) {
- INFO("edge " << e << "smth strange");
- } else if (mappings.size() == 0) {
- continue;
- } else {
- to_sort.push_back(make_pair(e, *mappings.begin()));
- }
- }
- }
- sort(to_sort.begin(), to_sort.end(), [](const pair<EdgeId, MappingRange> & a, const pair<EdgeId, MappingRange> & b) -> bool
- {
- return a.second.initial_range.start_pos < b.second.initial_range.start_pos;
- }
- );
- size_t count = 0;
- for(auto p: to_sort) {
- INFO("edge " << gp_.g.int_id(p.first) << " length "<< gp_.g.length(p.first) << " coverage " << gp_.g.coverage(p.first) << " mapped to " << p.second.mapped_range.start_pos << " - " << p.second.mapped_range.end_pos << " init_range " << p.second.initial_range.start_pos << " - " << p.second.initial_range.end_pos );
- genome_spelled_[p.first] = count;
- count++;
- }
-}
-
-PathScore GenomeConsistenceChecker::CountMisassembliesWithStrand(const BidirectionalPath &path, const string strand) const {
- if (strand == "1") {
- return (CountMisassembliesWithStrand(*path.GetConjPath(), "0"));
- }
- PathScore res(0, 0, 0);
- EdgeId prev;
- size_t prev_in_genome = std::numeric_limits<std::size_t>::max();
- size_t prev_in_path = std::numeric_limits<std::size_t>::max();
- MappingRange prev_range;
- for (int i = 0; i < (int) path.Size(); i++) {
- if (genome_spelled_.find(path.At(i)) != genome_spelled_.end()) {
- size_t cur_in_genome = genome_spelled_[path.At(i)];
- MappingRange cur_range = *gp_.edge_pos.GetEdgePositions(path.At(i), "fxd0").begin();
- if (prev_in_genome != std::numeric_limits<std::size_t>::max()) {
- if (cur_in_genome == prev_in_genome + 1) {
- int dist_in_genome = (int) cur_range.initial_range.start_pos - (int) prev_range.initial_range.end_pos;
- int dist_in_path = (int) path.LengthAt(prev_in_path) - (int) path.LengthAt(i) + (int) cur_range.mapped_range.start_pos - (int) prev_range.mapped_range.end_pos;
- DEBUG("Edge " << prev.int_id() << " position in genome ordering: " << prev_in_genome);
- DEBUG("Gap in genome / gap in path: " << dist_in_genome << " / " << dist_in_path);
- if (abs(dist_in_genome - dist_in_path) >absolute_max_gap_ && (dist_in_genome * (1 + relative_max_gap_) < dist_in_path || dist_in_path * (1 + relative_max_gap_) < dist_in_genome)) {
-
- res.wrong_gap_size ++;
- }
- } else {
- if (path.At(i) != circular_edge_ && path.At(prev_in_path) != circular_edge_)
- res.misassemblies++;
- else
- INFO("Skipping fake(circular) misassembly");
- }
- }
- res.mapped_length += cur_range.mapped_range.size();
- prev = path.At(i);
- prev_in_genome = cur_in_genome;
- prev_range = cur_range;
- prev_in_path = i;
- }
- }
- if (prev_in_path != std::numeric_limits<std::size_t>::max())
- DEBUG("Edge " << prev.int_id() << " position in genome ordering: " << prev_in_genome);
- return res;
-}
-void GenomeConsistenceChecker::RefillPos() {
- RefillPos("0");
- RefillPos("1");
-}
-
-
-void GenomeConsistenceChecker::RefillPos(const string &strand) {
- for (auto e: storage_) {
- RefillPos(strand, e);
- }
-}
-
-void GenomeConsistenceChecker::FindBestRangeSequence(const set<MappingRange>& old_mappings, vector<MappingRange>& used_mappings) const {
- vector<MappingRange> to_process (old_mappings.begin(), old_mappings.end());
- sort(to_process.begin(), to_process.end(), [](const MappingRange & a, const MappingRange & b) -> bool
- {
- return a.mapped_range.start_pos < b.mapped_range.start_pos;
- } );
- size_t sz = to_process.size();
-//max weight path in the directed graph of mappings
- TRACE("constructing mapping graph of " << sz << " vertices");
- vector<vector<size_t>> consecutive_mappings(sz);
- for(size_t i = 0; i < sz; i++) {
- for (size_t j = i + 1; j < sz; j++) {
- if (consequent(to_process[i], to_process[j])) {
- consecutive_mappings[i].push_back(j);
- } else {
- if (to_process[j].mapped_range.start_pos > to_process[i].mapped_range.end_pos + absolute_max_gap_) {
- break;
- }
- }
- }
- }
- vector<size_t> scores(sz), prev(sz);
- for(size_t i = 0; i < sz; i++) {
- scores[i] = to_process[i].initial_range.size();
- prev[i] = std::numeric_limits<std::size_t>::max();
- }
- for(size_t i = 0; i < sz; i++) {
- for (size_t j = 0; j < consecutive_mappings[i].size(); j++) {
- TRACE(consecutive_mappings[i][j]);
- if (scores[consecutive_mappings[i][j]] < scores[i] + to_process[consecutive_mappings[i][j]].initial_range.size()) {
- scores[consecutive_mappings[i][j]] = scores[i] + to_process[consecutive_mappings[i][j]].initial_range.size();
- prev[consecutive_mappings[i][j]] = i;
- }
- }
- }
- size_t cur_max = 0;
- size_t cur_i = 0;
- for(size_t i = 0; i < sz; i++) {
- if (scores[i] > cur_max) {
- cur_max = scores[i];
- cur_i = i;
- }
- }
- used_mappings.clear();
- while (cur_i != std::numeric_limits<std::size_t>::max()) {
- used_mappings.push_back(to_process[cur_i]);
- cur_i = prev[cur_i];
- }
- reverse(used_mappings.begin(), used_mappings.end());
-};
-
-void GenomeConsistenceChecker::RefillPos(const string &strand, const EdgeId &e) {
- set<MappingRange> old_mappings = gp_.edge_pos.GetEdgePositions(e, strand);
- TRACE("old mappings sz " << old_mappings.size() );
- size_t total_mapped = 0;
- for (auto mp:old_mappings) {
- total_mapped += mp.initial_range.size();
- }
- if (total_mapped > (double) gp_.g.length(e) * 1.5) {
- INFO ("Edge " << gp_.g.int_id(e) << "is not unique, excluding");
- excluded_unique_.insert(e);
- return;
- }
-//TODO: support non-unique edges;
- if (total_mapped < (double) gp_.g.length(e) * 0.5) {
- DEBUG ("Edge " << gp_.g.int_id(e) << "is not mapped on strand "<< strand <<", not used");
- return;
- }
- TRACE(total_mapped << " " << gp_.g.length(e));
- string new_strand = "fxd" + strand;
- vector<MappingRange> used_mappings;
- FindBestRangeSequence(old_mappings, used_mappings);
-
- size_t cur_i = 0;
- MappingRange new_mapping;
- new_mapping = used_mappings[cur_i];
- size_t used_mapped = new_mapping.initial_range.size();
- TRACE ("Edge " << gp_.g.int_id(e) << " length "<< gp_.g.length(e));
- TRACE ("new_mapping mp_range "<< new_mapping.mapped_range.start_pos << " - " << new_mapping.mapped_range.end_pos
- << " init_range " << new_mapping.initial_range.start_pos << " - " << new_mapping.initial_range.end_pos );
- while (cur_i < used_mappings.size() - 1) {
- cur_i ++;
- used_mapped += used_mappings[cur_i].initial_range.size();
- new_mapping = new_mapping.Merge(used_mappings[cur_i]);
- TRACE("new_mapping mp_range "<< new_mapping.mapped_range.start_pos << " - " << new_mapping.mapped_range.end_pos
- << " init_range " << new_mapping.initial_range.start_pos << " - " << new_mapping.initial_range.end_pos );
- }
-//less than 0.9 of the aligned length was used
- if (total_mapped * 10 >= used_mapped * 10 + gp_.g.length(e)) {
- INFO ("Edge " << gp_.g.int_id(e) << " length "<< gp_.g.length(e) << "is potentially misassembled! mappings: ");
- for (auto mp:old_mappings) {
- INFO("mp_range "<< mp.mapped_range.start_pos << " - " << mp.mapped_range.end_pos << " init_range " << mp.initial_range.start_pos << " - " << mp.initial_range.end_pos );
- if (mp.initial_range.start_pos < absolute_max_gap_) {
- INFO ("Fake(linear order) misassembly on edge "<< e.int_id());
- if (strand == "0") {
- circular_edge_ = e;
- }
- }
- }
-
- }
- gp_.edge_pos.AddEdgePosition(e, new_strand, new_mapping);
-}
-
-
-
-}
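
FindBestRangeSequence above selects the heaviest chain of pairwise "consequent" mappings via a simple dynamic programme with backtracking. A standalone sketch of the same scheme on plain intervals, where the chaining condition is simplified to a single maximum-gap check and weights are interval lengths:

#include <algorithm>
#include <iostream>
#include <limits>
#include <vector>

struct Interval { size_t start, end; };  // [start, end)

std::vector<Interval> BestChain(std::vector<Interval> v, size_t max_gap) {
    std::sort(v.begin(), v.end(),
              [](const Interval& a, const Interval& b) { return a.start < b.start; });
    const size_t n = v.size(), NONE = std::numeric_limits<size_t>::max();
    if (n == 0) return {};
    std::vector<size_t> score(n), prev(n, NONE);
    for (size_t i = 0; i < n; ++i) score[i] = v[i].end - v[i].start;
    // Relax every admissible pair (i precedes j within max_gap).
    for (size_t i = 0; i < n; ++i)
        for (size_t j = i + 1; j < n; ++j)
            if (v[j].start >= v[i].end && v[j].start <= v[i].end + max_gap &&
                score[i] + (v[j].end - v[j].start) > score[j]) {
                score[j] = score[i] + (v[j].end - v[j].start);
                prev[j] = i;
            }
    size_t best = 0;
    for (size_t i = 1; i < n; ++i)
        if (score[i] > score[best]) best = i;
    std::vector<Interval> chain;
    for (size_t cur = best; cur != NONE; cur = prev[cur]) chain.push_back(v[cur]);
    std::reverse(chain.begin(), chain.end());
    return chain;
}

int main() {
    for (const Interval& i : BestChain({{0, 100}, {95, 300}, {110, 180}, {190, 400}}, 20))
        std::cout << "[" << i.start << ", " << i.end << ") ";
    std::cout << "\n";  // [0, 100) [110, 180) [190, 400)
}
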
diff --git a/src/debruijn/genome_consistance_checker.hpp b/src/debruijn/genome_consistance_checker.hpp
deleted file mode 100644
index e2a1ba5..0000000
--- a/src/debruijn/genome_consistance_checker.hpp
+++ /dev/null
@@ -1,78 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-
-#pragma once
-#include "omni/visualization/graph_labeler.hpp"
-#include "omni/edges_position_handler.hpp"
-#include "omni/mapping_path.hpp"
-#include "omni/edges_position_handler.hpp"
-#include "sequence/sequence.hpp"
-#include "graph_pack.hpp"
-#include "positions.hpp"
-#include "path_extend/bidirectional_path.hpp"
-#include "path_extend/scaffolder2015/scaff_supplementary.hpp"
-
-namespace debruijn_graph {
-
-
-using path_extend::BidirectionalPath;
-using path_extend::ScaffoldingUniqueEdgeStorage;
-
-struct PathScore{
- size_t misassemblies;
- size_t wrong_gap_size;
- size_t mapped_length;
- PathScore(size_t m, size_t w, size_t ml): misassemblies(m), wrong_gap_size(w), mapped_length(ml) {}
-};
-class GenomeConsistenceChecker {
-
-private:
- const conj_graph_pack &gp_;
- const Graph &graph_;
- //EdgesPositionHandler<Graph> &position_handler_;
- Sequence genome_;
- ScaffoldingUniqueEdgeStorage storage_;
- size_t absolute_max_gap_;
- double relative_max_gap_;
- set<EdgeId> excluded_unique_;
- EdgeId circular_edge_;
-//map from unique edges to their order in genome spelling;
- mutable map<EdgeId, size_t> genome_spelled_;
- bool consequent(const Range &mr1, const Range &mr2) const;
- bool consequent(const MappingRange &mr1, const MappingRange &mr2) const ;
-
- PathScore CountMisassembliesWithStrand(const BidirectionalPath &path, const string strand) const;
-//constructs the longest sequence of consecutive ranges, stores result in used_mappings
- void FindBestRangeSequence(const set<MappingRange>& old_mappings, vector<MappingRange>& used_mappings) const;
-//Refills genomic positions, uniting alignments separated by small gaps
- void RefillPos();
- void RefillPos(const string &strand);
- void RefillPos(const string &strand, const EdgeId &e);
-DECL_LOGGER("GenomeConsistenceChecker");
-
-
-public:
- GenomeConsistenceChecker(const conj_graph_pack &gp, ScaffoldingUniqueEdgeStorage &storage, size_t max_gap, double relative_max_gap /*= 0.2*/) : gp_(gp),
- graph_(gp.g), /*position_handler_(gp.edge_pos),*/ genome_(gp.genome.GetSequence()), storage_(storage),
- absolute_max_gap_(max_gap), relative_max_gap_(relative_max_gap), excluded_unique_(), circular_edge_() {
- if (!gp.edge_pos.IsAttached()) {
- gp.edge_pos.Attach();
- }
- gp.edge_pos.clear();
- FillPos(gp_, gp_.genome.GetSequence(), "0");
- FillPos(gp_, !gp_.genome.GetSequence(), "1");
- RefillPos();
- }
- PathScore CountMisassemblies(const BidirectionalPath &path) const;
-//spells genome in language of long unique edges from storage;
- void SpellGenome();
-
-};
-
-
-}
diff --git a/src/debruijn/genome_storage.cpp b/src/debruijn/genome_storage.cpp
deleted file mode 100644
index 17decdf..0000000
--- a/src/debruijn/genome_storage.cpp
+++ /dev/null
@@ -1,45 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-//
-// Created by lab42 on 8/19/15.
-//
-
-#include "genome_storage.hpp"
-#include "sequence/nucl.hpp"
-using namespace std;
-
-namespace debruijn_graph {
-//TODO exterminate this where possible
- Sequence GenomeStorage::GetSequence() const{
- stringstream ss;
- size_t l = 0, r = 0;
- for(size_t i = 0; i < s_.size(); i++) {
- if (! is_nucl(s_[i]) ) {
- if (r > l) {
- ss << s_.substr(l, r - l);
- }
- r = i + 1;
- l = i + 1;
- } else {
- r++;
- }
- }
- if (r > l) {
- ss << s_.substr(l, r - l);
- }
- return Sequence(ss.str());
- }
- void GenomeStorage::SetSequence(const Sequence &s) {
- s_ = s.str();
- }
- string GenomeStorage::str() const{
- return s_;
- }
- size_t GenomeStorage::size() const {
- return s_.size();
- }
-}
\ No newline at end of file
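
The effect of GenomeStorage::GetSequence above is that runs of A/C/G/T are kept and everything in between (e.g. N) is dropped, so the nucleotide runs end up concatenated. A tiny illustration, assuming for simplicity only upper-case nucleotides as valid:

#include <iostream>
#include <string>

static bool IsNucl(char c) { return c == 'A' || c == 'C' || c == 'G' || c == 'T'; }

// Keep only nucleotide characters; equivalent to concatenating the runs.
std::string StripNonNucl(const std::string& s) {
    std::string res;
    for (char c : s)
        if (IsNucl(c)) res += c;
    return res;
}

int main() {
    std::cout << StripNonNucl("ACGTNNNNACGxxGT") << "\n";  // ACGTACGGT
}
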
diff --git a/src/debruijn/genome_storage.hpp b/src/debruijn/genome_storage.hpp
deleted file mode 100644
index aaff952..0000000
--- a/src/debruijn/genome_storage.hpp
+++ /dev/null
@@ -1,33 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-//
-// Created by lab42 on 8/19/15.
-//
-
-#ifndef GENOME_STORAGE_HPP_
-#define GENOME_STORAGE_HPP_
-
-#include <string>
-#include "sequence/sequence.hpp"
-namespace debruijn_graph {
- class GenomeStorage {
- private:
- std::string s_;
- public:
- GenomeStorage():s_(""){
- }
-
- GenomeStorage(const std::string &s): s_(s){
- }
-
- Sequence GetSequence() const;
- void SetSequence(const Sequence &s);
- std::string str() const;
- size_t size() const;
- };
-}
-#endif //PROJECT_GENOME_STORAGE_HPP
diff --git a/src/debruijn/genomic_info.hpp b/src/debruijn/genomic_info.hpp
deleted file mode 100644
index 13e1646..0000000
--- a/src/debruijn/genomic_info.hpp
+++ /dev/null
@@ -1,44 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#ifndef __GENOMIC_INFO_HPP__
-#define __GENOMIC_INFO_HPP__
-
-#include <vector>
-
-class GenomicInfo {
- public:
- GenomicInfo()
- : genome_size_(0), estimated_mean_(0), ec_bound_(0), trusted_bound_(0) {}
-
- const std::vector<size_t>& cov_histogram() const { return cov_histogram_; }
- void set_cov_histogram(const std::vector<size_t> &hist) { cov_histogram_ = hist; }
-
- size_t genome_size() const { return genome_size_; }
- void set_genome_size(size_t genome_size) { genome_size_ = genome_size; }
-
- double estimated_mean() const { return estimated_mean_; }
- void set_estimated_mean(double estimated_mean) { estimated_mean_ = estimated_mean; }
-
- double ec_bound() const { return ec_bound_; }
- void set_ec_bound(double ec_bound) { ec_bound_ = ec_bound; }
-
- double trusted_bound() const { return trusted_bound_; }
- void set_trusted_bound(double trusted_bound) { trusted_bound_ = trusted_bound; }
-
- bool Load(const std::string &filename);
- void Save(const std::string &filename) const;
-
- private:
- std::vector<size_t> cov_histogram_;
- size_t genome_size_;
- double estimated_mean_;
- double ec_bound_;
- double trusted_bound_;
-};
-
-#endif
diff --git a/src/debruijn/genomic_info_filler.cpp b/src/debruijn/genomic_info_filler.cpp
deleted file mode 100644
index 0f5a9cb..0000000
--- a/src/debruijn/genomic_info_filler.cpp
+++ /dev/null
@@ -1,121 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#include "genomic_info_filler.hpp"
-
-#include "graph_pack.hpp"
-#include "kmer_coverage_model.hpp"
-#include "omni/omni_tools.hpp"
-#include "../include/config_singl.hpp"
-
-#include <yaml-cpp/yaml.h>
-
-#include <map>
-#include <vector>
-
-using namespace debruijn_graph;
-
-static std::vector<size_t> extract(const std::map<size_t, size_t> &hist) {
- std::map<size_t, size_t> tmp = hist;
-
- size_t maxcov = 0;
- for (auto it = tmp.cbegin(), et = tmp.cend(); it != et; ++it)
- maxcov = std::max(maxcov, it->first);
-
- // Touch all the values until maxcov to make sure all the values exist in the map
- for (size_t i = 0; i <= maxcov; ++i)
- tmp[i];
-
- // Extract the values
- std::vector<size_t> res(maxcov);
- for (size_t i = 0; i < maxcov; ++i)
- res[i] = tmp[i + 1];
-
- return res;
-}
-
-bool GenomicInfo::Load(const std::string &filename) {
- std::ifstream ifs(filename.c_str());
- if (!ifs)
- return false;
-
- YAML::Node node = YAML::Load(ifs);
-
- ec_bound_ = node["ec bound"].as<double>(0);
- estimated_mean_ = node["estimated mean"].as<double>(0);
- trusted_bound_ = node["trusted bound"].as<double>(0);
- genome_size_ = node["genome size"].as<size_t>(0);
- cov_histogram_ = node["coverage histogram"].as<std::vector<size_t> >(std::vector<size_t>());
-
- return true;
-}
-
-void GenomicInfo::Save(const std::string &filename) const {
- std::ofstream ofs(filename.c_str());
-
- YAML::Node node;
- node["ec bound"] = ec_bound_;
- node["estimated mean"] = estimated_mean_;
- node["trusted bound"] = trusted_bound_;
- node["genome size"] = genome_size_;
- node["coverage histogram"] = cov_histogram_;
-
- ofs << node;
-}
-
-void GenomicInfoFiller::run(conj_graph_pack &gp, const char*) {
- if (cfg::get().ds.single_cell) {
- ErroneousConnectionThresholdFinder<decltype(gp.g)> finder(gp.g);
- std::map<size_t, size_t> hist = finder.ConstructHistogram();
- double avg = finder.AvgCoverage();
- double gthr = finder.FindThreshold(hist);
- INFO("Average edge coverage: " << avg);
- INFO("Graph threshold: " << gthr);
-
- gp.ginfo.set_cov_histogram(extract(hist));
- gp.ginfo.set_ec_bound(std::min(avg, gthr));
- } else {
- // First, get k-mer coverage histogram
- std::map<size_t, size_t> tmp;
- size_t maxcov = 0;
- size_t kmer_per_record = 1;
- if (conj_graph_pack::index_t::InnerIndexT::storing_type::IsInvertable())
- kmer_per_record = 2;
-
- for (auto I = gp.index.inner_index().value_cbegin(), E = gp.index.inner_index().value_cend(); I != E; ++I) {
- size_t ccov = I->count;
- maxcov = std::max(ccov, maxcov);
- tmp[ccov] += kmer_per_record;
- }
-
- gp.ginfo.set_cov_histogram(extract(tmp));
-
- // Fit the coverage model and get the threshold
- cov_model::KMerCoverageModel CovModel(gp.ginfo.cov_histogram(), cfg::get().kcm.probability_threshold, cfg::get().kcm.strong_probability_threshold);
- CovModel.Fit();
-
- gp.ginfo.set_genome_size(CovModel.GetGenomeSize());
- gp.ginfo.set_ec_bound((double)CovModel.GetErrorThreshold());
- if (CovModel.converged()) {
- gp.ginfo.set_estimated_mean((double)CovModel.GetMeanCoverage());
- INFO("Mean coverage was calculated as " << gp.ginfo.estimated_mean());
- } else
- INFO("Failed to estimate mean coverage");
-
- if (cfg::get().kcm.use_coverage_threshold) {
- double coef = (cfg::get().ds.aRL() - double(cfg::get().K) + 1) / cfg::get().ds.aRL();
- if (coef < 0)
- coef = double(cfg::get().ds.RL() - cfg::get().K + 1) / double(cfg::get().ds.RL());
- gp.ginfo.set_trusted_bound(CovModel.converged() && cfg::get().kcm.coverage_threshold == 0.0 ?
- double(CovModel.GetLowThreshold()) :
- cfg::get().kcm.coverage_threshold * coef);
- }
-
- }
- INFO("EC coverage threshold value was calculated as " << gp.ginfo.ec_bound());
- INFO("Trusted kmer low bound: " << gp.ginfo.trusted_bound());
-}
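
extract() above densifies a sparse coverage histogram: index i of the result holds the count for coverage i + 1, and coverages that never occurred become zeros. A standalone equivalent for illustration:

#include <iostream>
#include <map>
#include <vector>

std::vector<size_t> Extract(const std::map<size_t, size_t>& hist) {
    size_t maxcov = hist.empty() ? 0 : hist.rbegin()->first;
    std::vector<size_t> res(maxcov, 0);
    for (size_t i = 0; i < maxcov; ++i) {
        auto it = hist.find(i + 1);          // shift by one: res[i] is coverage i + 1
        if (it != hist.end()) res[i] = it->second;
    }
    return res;
}

int main() {
    // Coverage 1 seen 5 times, coverage 3 seen 2 times, nothing at coverage 2.
    for (size_t x : Extract({{1, 5}, {3, 2}})) std::cout << x << " ";
    std::cout << "\n";  // 5 0 2
}
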
diff --git a/src/debruijn/genomic_info_filler.hpp b/src/debruijn/genomic_info_filler.hpp
deleted file mode 100644
index aaf0289..0000000
--- a/src/debruijn/genomic_info_filler.hpp
+++ /dev/null
@@ -1,23 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include "stage.hpp"
-
-namespace debruijn_graph {
-
-class GenomicInfoFiller : public spades::AssemblyStage {
- public:
- GenomicInfoFiller()
- : AssemblyStage("EC Threshold Finding", "ec_threshold_finder") {}
-
- void run(conj_graph_pack &gp, const char*);
-};
-
-}
-
diff --git a/src/debruijn/genomic_quality.hpp b/src/debruijn/genomic_quality.hpp
deleted file mode 100644
index 3821369..0000000
--- a/src/debruijn/genomic_quality.hpp
+++ /dev/null
@@ -1,553 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include "omni/visualization/visualization.hpp"
-#include "omni/basic_edge_conditions.hpp"
-#include "sequence_mapper.hpp"
-
-namespace debruijn_graph {
-
-template<class Graph>
-class EdgeQuality: public GraphLabeler<Graph>, public GraphActionHandler<Graph> {
- typedef typename Graph::EdgeId EdgeId;
- typedef typename Graph::VertexId VertexId;
- map<EdgeId, size_t> quality_;
- size_t k_;
-
- template<class Index>
- void FillQuality(const Index &index
- , const KmerMapper<Graph>& kmer_mapper, const Sequence &genome) {
- if (genome.size() < k_)
- return;
- runtime_k::RtSeq cur = genome.start<runtime_k::RtSeq>(k_);
- cur >>= 0;
- for (size_t i = 0; i + k_ - 1 < genome.size(); i++) {
- cur <<= genome[i + k_ - 1];
- auto corr_cur = kmer_mapper.Substitute(cur);
- if (index.contains(corr_cur)) {
- quality_[index.get(corr_cur).first]++;
- }
- }
- }
-
-public:
-
- template<class Index>
- void Fill(const Index &index
- , const KmerMapper<Graph>& kmer_mapper
- , const Sequence &genome) {
- FillQuality(index, kmer_mapper, genome);
- FillQuality(index, kmer_mapper, !genome);
- }
-
- EdgeQuality(const Graph &graph) :
- GraphActionHandler<Graph>(graph, "EdgeQuality"),
- k_(graph.k() + 1) {
- }
-
- virtual ~EdgeQuality() {
- }
-
- virtual void HandleAdd(EdgeId /*e*/) {
- }
-
- virtual void HandleDelete(EdgeId e) {
- quality_.erase(e);
- }
-
- virtual void HandleMerge(const vector<EdgeId>& old_edges, EdgeId new_edge) {
- size_t res = 0;
- for (size_t i = 0; i < old_edges.size(); i++) {
- res += quality_[old_edges[i]];
- }
- quality_[new_edge] += res;
- }
-
- virtual void HandleGlue(EdgeId new_edge, EdgeId edge1, EdgeId edge2) {
- quality_[new_edge] += quality_[edge2];
- quality_[new_edge] += quality_[edge1];
- }
-
- virtual void HandleSplit(EdgeId old_edge, EdgeId new_edge1,
- EdgeId new_edge2) {
- if (old_edge == this->g().conjugate(old_edge)) {
- WARN("EdgeQuality does not support self-conjugate splits");
- return;
- }
- VERIFY(old_edge != this->g().conjugate(old_edge));
- quality_[new_edge1] = quality_[old_edge] * this->g().length(new_edge1)
- / (this->g().length(new_edge1) + this->g().length(new_edge2));
- quality_[new_edge2] = quality_[old_edge] * this->g().length(new_edge2)
- / (this->g().length(new_edge1) + this->g().length(new_edge2));
- }
-
- double quality(EdgeId edge) const {
- auto it = quality_.find(edge);
- if (it == quality_.end())
- return 0.;
- else
- return 1. * (double) it->second / (double) this->g().length(edge);
- }
-
- bool IsPositiveQuality(EdgeId edge) const {
- return math::gr(quality(edge), 0.);
- }
-
- bool IsZeroQuality(EdgeId edge) const {
- return math::eq(quality(edge), 0.);
- }
-
- virtual std::string label(VertexId /*vertexId*/) const {
- return "";
- }
-
- virtual std::string label(EdgeId edge) const {
- double q = quality(edge);
- return (q == 0) ? "" : "quality: " + ToString(q);
- }
-
- void clear() {
- quality_.clear();
- }
-
-};
-
-template<class Graph>
-class QualityLoggingRemovalHandler {
- typedef typename Graph::EdgeId EdgeId;
- const Graph& g_;
- const EdgeQuality<Graph>& quality_handler_;
- size_t black_removed_;
- size_t total_;
- bool handle_all_;
-
- virtual void HandlePositiveQuality(EdgeId /*e*/) {
-
- }
-
-public:
- QualityLoggingRemovalHandler(const Graph& g, const EdgeQuality<Graph>& quality_handler,
- bool handle_all = false) :
- g_(g), quality_handler_(quality_handler), black_removed_(0), total_(0), handle_all_(handle_all) {
- }
-
- void HandleDelete(EdgeId e) {
- total_++;
- if (handle_all_ || math::gr(quality_handler_.quality(e), 0.)) {
- TRACE("Deleting good edge id = " << g_.int_id(e)
- << "; length = " << g_.length(e)
- << "; quality = " << quality_handler_.quality(e)
- << "; cov = " << g_.coverage(e));
- HandlePositiveQuality(e);
- } else {
- black_removed_++;
- }
- }
-
- const Graph& g() const {
- return g_;
- }
-
- const EdgeQuality<Graph>& quality_handler() const {
- return quality_handler_;
- }
-
- virtual ~QualityLoggingRemovalHandler() {
- TRACE("Overall stats: total removed = " << total_
- << "; bad removed = " << black_removed_
- << "; good removed = " << total_ - black_removed_);
- }
-
-private:
- DECL_LOGGER("QualityLoggingRemovalHandler");
-};
-
-template<class Graph>
-class QualityEdgeLocalityPrintingRH : public QualityLoggingRemovalHandler<Graph> {
- typedef QualityLoggingRemovalHandler<Graph> base;
- typedef typename Graph::EdgeId EdgeId;
- typedef typename Graph::VertexId VertexId;
- omnigraph::visualization::LocalityPrintingRH<Graph> printing_rh_;
-public:
- QualityEdgeLocalityPrintingRH(const Graph& g
- , const EdgeQuality<Graph>& quality_handler
- , const GraphLabeler<Graph>& labeler
- , std::shared_ptr<visualization::GraphColorer<Graph>> colorer
- , const string& output_folder, bool handle_all = false) :
- base(g, quality_handler, handle_all),
- printing_rh_(g, labeler, colorer, output_folder)
- {}
-
- virtual void HandlePositiveQuality(EdgeId e) {
- printing_rh_.HandleDelete(e, "_" + ToString(this->quality_handler().quality(e)));
- }
-
-private:
- DECL_LOGGER("QualityEdgeLocalityPrintingRH");
-};
-
-//earlier version from rel_cov branch
-//template<class Graph>
-//class EdgeNeighborhoodFinder: public omnigraph::GraphSplitter<Graph> {
-//private:
-// typedef typename Graph::EdgeId EdgeId;
-// typedef typename Graph::VertexId VertexId;
-// EdgeId edge_;
-// size_t max_size_;
-// size_t edge_length_bound_;
-// bool finished_;
-//public:
-// EdgeNeighborhoodFinder(const Graph &graph, EdgeId edge, size_t max_size
-// , size_t edge_length_bound) :
-// GraphSplitter<Graph>(graph), edge_(edge), max_size_(
-// max_size), edge_length_bound_(edge_length_bound), finished_(
-// false) {
-// }
-//
-// GraphComponent<Graph> NextComponent() {
-// CountingDijkstra<Graph> cf(this->graph(), max_size_,
-// edge_length_bound_);
-// set<VertexId> result_set;
-// cf.run(this->graph().EdgeStart(edge_));
-// vector<VertexId> result_start = cf.ReachedVertices();
-// result_set.insert(result_start.begin(), result_start.end());
-// cf.run(this->graph().EdgeEnd(edge_));
-// vector<VertexId> result_end = cf.ReachedVertices();
-// result_set.insert(result_end.begin(), result_end.end());
-//
-// ComponentCloser<Graph> cc(this->graph(), edge_length_bound_);
-// cc.CloseComponent(result_set);
-//
-// finished_ = true;
-// return GraphComponent<Graph>(this->graph(), result_set.begin(), result_set.end());
-// }
-//
-// /*virtual*/ bool Finished() {
-// return finished_;
-// }
-//};
-//
-//template<class Graph>
-//class EdgeLocalityPrintingRH {
-// typedef typename Graph::EdgeId EdgeId;
-// typedef typename Graph::VertexId VertexId;
-// const Graph& g_;
-// const GraphLabeler<Graph>& labeler_;
-// const string& output_folder_;
-// std::function<double (EdgeId)>& quality_f_;
-//// size_t black_removed_;
-//// size_t colored_removed_;
-//public:
-// EdgeLocalityPrintingRH(const Graph& g
-// , const GraphLabeler<Graph>& labeler
-// , const string& output_folder
-// , std::function<double (EdgeId)> quality_f = 0) :
-// g_(g),
-// labeler_(labeler), output_folder_(output_folder),
-// quality_f_(quality_f){
-// }
-//
-// void HandleDelete(EdgeId edge) {
-// TRACE("Deleting edge " << g_.str(edge));
-// if (quality_f_ && math::gr(quality_f_(edge), 0.))
-// INFO("EdgeLocalityPrintRH handling the edge with positive quality : " << quality_f_(edge) << " " << g_.str(edge));
-//
-// string folder = output_folder_ + "edges_deleted/";
-// path::make_dir(folder);
-// //todo magic constant
-// map<EdgeId, string> empty_coloring;
-// omnigraph::visualization::WriteComponent(g_, EdgeNeighborhood<Graph>(g_, edge, 50, 250),
-// folder + "edge_" + ToString(g_.int_id(edge)) + ".dot", empty_coloring, labeler_);
-// }
-//
-//private:
-// DECL_LOGGER("QualityEdgeLocalityPrintingRH")
-// ;
-//};
-
-//template<class Graph, class Index>
-//class EdgeQuality: public GraphLabeler<Graph>, public GraphActionHandler<Graph> {
-// typedef typename Graph::EdgeId EdgeId;
-// typedef typename Graph::VertexId VertexId;
-// map<EdgeId, size_t> quality_;
-// size_t k_;
-//
-//public:
-//
-// void FillQuality(const Index &index
-// , const KmerMapper<Graph>& kmer_mapper, const Sequence &genome) {
-// if (genome.size() < k_)
-// return;
-// runtime_k::RtSeq cur = genome.start<runtime_k::RtSeq>(k_);
-// cur >>= 0;
-// for (size_t i = 0; i + k_ - 1 < genome.size(); i++) {
-// cur <<= genome[i + k_ - 1];
-// auto corr_cur = kmer_mapper.Substitute(cur);
-// if (index.contains(corr_cur)) {
-// quality_[index.get(corr_cur).first]++;
-// }
-// }
-// }
-//
-// EdgeQuality(const Graph &graph, const Index &index,
-// const KmerMapper<Graph>& kmer_mapper,
-// const Sequence &genome) :
-//
-// GraphActionHandler<Graph>(graph, "EdgeQualityLabeler"),
-// k_(kmer_mapper.get_k()) {
-// FillQuality(index, kmer_mapper, genome);
-// FillQuality(index, kmer_mapper, !genome);
-// }
-//
-// virtual ~EdgeQuality() {
-// }
-//
-// virtual void HandleAdd(EdgeId /*e*/) {
-// }
-//
-// virtual void HandleDelete(EdgeId e) {
-// quality_.erase(e);
-// }
-//
-// virtual void HandleMerge(const vector<EdgeId>& old_edges, EdgeId new_edge) {
-// size_t res = 0;
-// for (size_t i = 0; i < old_edges.size(); i++) {
-// res += quality_[old_edges[i]];
-// }
-// quality_[new_edge] += res;
-// }
-//
-// virtual void HandleGlue(EdgeId new_edge, EdgeId edge1, EdgeId edge2) {
-// quality_[new_edge] += quality_[edge2];
-// quality_[new_edge] += quality_[edge1];
-// }
-//
-// virtual void HandleSplit(EdgeId old_edge, EdgeId new_edge1,
-// EdgeId new_edge2) {
-// quality_[new_edge1] = quality_[old_edge] * this->g().length(new_edge1)
-// / (this->g().length(new_edge1) + this->g().length(new_edge2));
-// quality_[new_edge2] = quality_[old_edge] * this->g().length(new_edge2)
-// / (this->g().length(new_edge1) + this->g().length(new_edge2));
-// }
-//
-// double quality(EdgeId edge) const {
-// auto it = quality_.find(edge);
-// if (it == quality_.end())
-// return 0.;
-// else
-// return 1. * (double) it->second / (double) this->g().length(edge);
-// }
-//
-// bool IsPositiveQuality(EdgeId edge) const {
-// return math::gr(quality(edge), 0.);
-// }
-//
-// virtual std::string label(VertexId /*vertexId*/) const {
-// return "";
-// }
-//
-// virtual std::string label(EdgeId edge) const {
-// double q = quality(edge);
-// return (q == 0) ? "" : "quality: " + ToString(q);
-// }
-//
-//};
-//
-//template<class Graph, class Index>
-//class QualityLoggingRemovalHandler {
-// typedef typename Graph::EdgeId EdgeId;
-// const Graph& g_;
-// const EdgeQuality<Graph, Index>& quality_handler_;
-//// size_t black_removed_;
-//// size_t colored_removed_;
-//public:
-// QualityLoggingRemovalHandler(const Graph& g, const EdgeQuality<Graph, Index>& quality_handler) :
-// g_(g), quality_handler_(quality_handler)/*, black_removed_(0), colored_removed_(
-// 0)*/{
-//
-// }
-//
-// void HandleDelete(EdgeId edge) {
-// if (math::gr(quality_handler_.quality(edge), 0.)) {
-// TRACE("Deleting edge " << g_.str(edge) << " with quality " << quality_handler_.quality(edge));
-// } else {
-//// TRACE("Deleting edge " << g_.int_id(edge) << " with zero quality");
-// }
-//// if (math::gr(quality_handler_.quality(edge), 0.))
-//// colored_removed_++;
-//// else
-//// black_removed_++;
-// }
-//
-//private:
-// DECL_LOGGER("QualityLoggingRemovalHandler")
-// ;
-//};
-//
-//template<class Graph, class Index>
-//class QualityLoggingRemovalCountHandler {
-// typedef typename Graph::EdgeId EdgeId;
-// const Graph& g_;
-// const EdgeQuality<Graph, Index>& quality_handler_;
-// size_t black_removed_;
-// size_t total;
-//
-//public:
-// QualityLoggingRemovalCountHandler(const Graph& g, const EdgeQuality<Graph, Index>& quality_handler) :
-// g_(g), quality_handler_(quality_handler)/*, black_removed_(0), colored_removed_(
-// 0)*/{
-// black_removed_ = 0;
-// total = 0;
-// }
-//
-// void HandleDelete(EdgeId edge) {
-// total++;
-// if (math::gr(quality_handler_.quality(edge), 0.)) {
-// TRACE("Deleting good edge " << g_.int_id(edge) << " with quality " << quality_handler_.quality(edge) << " cov " << g_.coverage(edge) << " length " << g_.length(edge));
-// }else{
-// black_removed_++;
-// }
-// if ((total % (1<<10)) != 0)
-// TRACE("Removed still " << black_removed_ << " " << total);
-// }
-//
-//private:
-//};
-//
-//template<class Graph, class Index>
-//class QualityEdgeLocalityPrintingRH {
-// typedef typename Graph::EdgeId EdgeId;
-// typedef typename Graph::VertexId VertexId;
-// const Graph& g_;
-// const EdgeQuality<Graph, Index>& quality_handler_;
-// const omnigraph::GraphLabeler<Graph>& labeler_;
-// const omnigraph::visualization::GraphColorer<Graph>& colorer_;
-// const string& output_folder_;
-//// size_t black_removed_;
-//// size_t colored_removed_;
-//public:
-// QualityEdgeLocalityPrintingRH(const Graph& g
-// , const EdgeQuality<Graph, Index>& quality_handler
-// , const omnigraph::GraphLabeler<Graph>& labeler
-// , const omnigraph::visualization::GraphColorer<Graph>& colorer
-// , const string& output_folder) :
-// g_(g), quality_handler_(quality_handler),
-// labeler_(labeler), colorer_(colorer), output_folder_(output_folder){
-// }
-//
-// void HandleDelete(EdgeId edge) {
-// if (quality_handler_.IsPositiveQuality(edge)) {
-// DEBUG("Deleting edge " << g_.str(edge) << " with quality " << quality_handler_.quality(edge));
-// string folder = output_folder_ + "colored_edges_deleted/";
-// path::make_dir(folder);
-// //todo magic constant
-//// map<EdgeId, string> empty_coloring;
-// shared_ptr<GraphSplitter<Graph>> splitter = EdgeNeighborhoodFinder<Graph>(g_, edge, 50, 250);
-// omnigraph::visualization::WriteComponents(g_, *splitter/*, "locality_of_edge_" + ToString(g_.int_id(edge))*/
-// , folder + "edge_" + ToString(g_.int_id(edge)) + "_" + ToString(quality_handler_.quality(edge)) + ".dot"
-// , colorer_, labeler_);
-// } else {
-// TRACE("Deleting edge " << g_.str(edge) << " with zero quality");
-// }
-// }
-//
-//private:
-// DECL_LOGGER("QualityEdgeLocalityPrintingRH")
-// ;
-//};
-//
-//template<class Graph, class Index>
-//class QualityPairInfoHandler {
-// typedef typename Graph::EdgeId EdgeId;
-// typedef typename Graph::VertexId VertexId;
-// typedef omnigraph::PairInfo<EdgeId> PairInfo;
-// typedef vector<PairInfo> PairInfos;
-// const Graph& g_;
-// const EdgeQuality<Graph, Index>& quality_handler_;
-// const GraphLabeler<Graph>& labeler_;
-// const string& output_folder_;
-// const PairedInfoIndex<ConjugateDeBruijnGraph>& index_;
-//// size_t black_removed_;
-//// size_t colored_removed_;
-//public:
-// QualityPairInfoHandler(const Graph& g
-// , const EdgeQuality<Graph, Index>& quality_handler
-// , const GraphLabeler<Graph>& labeler
-// , const string& output_folder
-// , const PairedInfoIndex<ConjugateDeBruijnGraph>& index) :
-// g_(g), quality_handler_(quality_handler),
-// labeler_(labeler), output_folder_(output_folder), index_(index) {
-// }
-//
-// void HandleDelete(EdgeId edge) {
-// if (quality_handler_.IsPositiveQuality(edge)) {
-// cout << "Deleting edge " << g_.str(edge) << " with quality " << quality_handler_.quality(edge) << endl;
-// string folder = output_folder_ + "colored_edges_deleted/";
-// path::make_dir(folder);
-// //todo magic constant
-// PairInfos infos = index_.GetEdgeInfo(edge);
-// if (infos.size() > 0){
-// for (size_t i = 0; i<infos.size(); i++){
-// cout << "Tip Info " << g_.int_id(infos[i].first) << " " << g_.int_id(infos[i].second) << " " << infos[i].d << " " << infos[i].weight << " " << infos[i].variance << endl;
-// }
-// }
-// map<EdgeId, string> empty_coloring;
-// shared_ptr<GraphSplitter<Graph>> splitter = EdgeNeighborhoodFinder<Graph>(g_, edge, 50,
-// 250);
-//
-// omnigraph::visualization::WriteComponents(g_, *splitter, TrueFilter<vector<VertexId>>(), "locality_of_edge_" + ToString(g_.int_id(edge))
-// , folder + "edge_" + ToString(g_.int_id(edge)) + "_" + ToString(quality_handler_.quality(edge)) + ".dot"
-// , empty_coloring, labeler_);
-// }
-// }
-//
-//private:
-//};
-//
-////todo what is the difference with QELPRH?!
-//template<class Graph>
-//class EdgeLocalityPrintingRH {
-// typedef typename Graph::EdgeId EdgeId;
-// typedef typename Graph::VertexId VertexId;
-// const Graph& g_;
-// const GraphLabeler<Graph>& labeler_;
-// const string& output_folder_;
-// std::function<double (EdgeId)>& quality_f_;
-//// size_t black_removed_;
-//// size_t colored_removed_;
-//public:
-// EdgeLocalityPrintingRH(const Graph& g
-// , const GraphLabeler<Graph>& labeler
-// , const string& output_folder
-// , std::function<double (EdgeId)> quality_f = 0) :
-// g_(g),
-// labeler_(labeler), output_folder_(output_folder),
-// quality_f_(quality_f){
-// }
-//
-// void HandleDelete(EdgeId edge) {
-// TRACE("Deleting edge " << g_.str(edge));
-// if (quality_f_ && math::gr(quality_f_(edge), 0.))
-// INFO("Handling the edge with positive quality : " << quality_f_(edge) << " " << g_.str(edge));
-//
-// string folder = output_folder_ + "edges_deleted/";
-// path::make_dir(folder);
-// //todo magic constant
-// map<EdgeId, string> empty_coloring;
-// shared_ptr<GraphSplitter<Graph>> splitter = EdgeNeighborhoodFinder<Graph>(g_, edge, 50, 250);
-// omnigraph::visualization::WriteComponents(g_, *splitter, TrueFilter<vector<VertexId>>(), "locality_of_edge_" + ToString(g_.int_id(edge))
-// , folder + "edge_" + ToString(g_.int_id(edge)) + ".dot", empty_coloring, labeler_);
-// }
-//
-//private:
-// DECL_LOGGER("EdgeLocalityPrintingRH")
-// ;
-//};
-
-}
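
For orientation: EdgeQuality above scores each edge by the fraction of reference
k-mers that land on it, and QualityLoggingRemovalHandler counts how many deleted
edges still carried positive quality, logging the totals from its destructor. A
minimal wiring sketch (illustrative only, not taken from the sources in this
commit; g, index, kmer_mapper and genome are assumed to be the usual graph-pack
members) could look like:

    // Illustrative sketch; the simplification routine that consumes the
    // callback is left abstract.
    EdgeQuality<Graph> edge_qual(g);
    edge_qual.Fill(index, kmer_mapper, genome);   // genome is a Sequence
    QualityLoggingRemovalHandler<Graph> rh(g, edge_qual);
    auto removal_callback = [&rh](Graph::EdgeId e) { rh.HandleDelete(e); };
    // pass removal_callback to any simplification routine that reports deleted
    // edges; good/bad removal totals are logged when rh goes out of scope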
diff --git a/src/debruijn/graph_construction.hpp b/src/debruijn/graph_construction.hpp
deleted file mode 100644
index c25299e..0000000
--- a/src/debruijn/graph_construction.hpp
+++ /dev/null
@@ -1,190 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-/*
- * graph_construction.hpp
- *
- * Created on: Aug 12, 2011
- * Author: sergey
- */
-#pragma once
-
-#include "openmp_wrapper.h"
-
-#include "io/io_helper.hpp"
-#include "omni/edges_position_handler.hpp"
-
-#include "debruijn_graph_constructor.hpp"
-#include "indices/edge_index_builders.hpp"
-#include "debruijn_graph.hpp"
-#include "graph_pack.hpp"
-#include "utils.hpp"
-#include "perfcounter.hpp"
-#include "early_simplification.hpp"
-
-#include "read_converter.hpp"
-#include "detail_coverage.hpp"
-#include "indices/storing_traits.hpp"
-
-namespace debruijn_graph {
-
-template<class StoringType>
-struct CoverageCollector {
-};
-
-template<>
-struct CoverageCollector<SimpleStoring> {
- template<class Info>
- static void CollectCoverage(Info edge_info) {
- edge_info.edge_id->IncCoverage(edge_info.count);
- }
-};
-
-template<>
-struct CoverageCollector<InvertableStoring> {
- template<class Info>
- static void CollectCoverage(Info edge_info) {
- edge_info.edge_id->IncCoverage(edge_info.count);
- edge_info.edge_id->conjugate()->IncCoverage(edge_info.count);
- }
-};
-
-
-template<class Index>
-void FillCoverageFromIndex(const Index &index) {
- for (auto I = index.value_cbegin(), E = index.value_cend();
- I != E; ++I) {
- const auto& edge_info = *I;
- VERIFY(edge_info.offset != -1u);
-// VERIFY(edge_info.edge_id.get() != NULL);
- if(edge_info.offset != -1u) {
- CoverageCollector<typename Index::storing_type>::CollectCoverage(edge_info);
- }
- }
- DEBUG("Coverage counted");
-}
-
-template<class Graph, class Readers, class Index>
-size_t ConstructGraphUsingOldIndex(Readers& streams, Graph& g,
- Index& index, io::SingleStreamPtr contigs_stream = io::SingleStreamPtr()) {
- INFO("Constructing DeBruijn graph");
-
- TRACE("Filling indices");
- size_t rl = 0;
- VERIFY_MSG(streams.size(), "No input streams specified");
-
- TRACE("... in parallel");
- typedef typename Index::InnerIndexT InnerIndex;
- typedef typename EdgeIndexHelper<InnerIndex>::CoverageFillingEdgeIndexBuilderT IndexBuilder;
- InnerIndex& debruijn = index.inner_index();
- //fixme hack
- rl = IndexBuilder().BuildIndexFromStream(debruijn, streams, (contigs_stream == 0) ? 0 : &(*contigs_stream));
-
-    VERIFY(g.k() + 1 == debruijn.k());
- // FIXME: output_dir here is damn ugly!
-
- TRACE("Filled indices");
-
- INFO("Condensing graph");
- DeBruijnGraphConstructor<Graph, InnerIndex> g_c(g, debruijn);
- TRACE("Constructor ok");
- VERIFY(!index.IsAttached());
- index.Attach();
- g_c.ConstructGraph(100, 10000, 1.2); // TODO: move magic constants to config
- INFO("Graph condensed");
-
- return rl;
-}
-
-inline debruijn_config::construction CreateDefaultConstructionConfig() {
- debruijn_config::construction config;
- config.con_mode = construction_mode::con_extention;
- debruijn_config::construction::early_tip_clipper early_tc;
- early_tc.enable = false;
- config.early_tc = early_tc;
- config.keep_perfect_loops = true;
- config.read_buffer_size = 0;
- return config;
-}
-
-template<class ExtensionIndex>
-void EarlyClipTips(size_t k, const debruijn_config::construction params, size_t rl, ExtensionIndex& ext) {
- if (params.early_tc.enable) {
- size_t length_bound = rl - k;
- if (params.early_tc.length_bound)
- length_bound = params.early_tc.length_bound.get();
- AlternativeEarlyTipClipper(ext, length_bound).ClipTips();
- }
-}
-
-template<class Graph, class Read, class Index>
-ReadStatistics ConstructGraphUsingExtentionIndex(const debruijn_config::construction params,
- io::ReadStreamList<Read>& streams, Graph& g,
- Index& index, io::SingleStreamPtr contigs_stream = io::SingleStreamPtr()) {
-
- size_t k = g.k();
- INFO("Constructing DeBruijn graph for k=" << k);
-
- TRACE("Filling indices");
- VERIFY_MSG(streams.size(), "No input streams specified");
-
- TRACE("... in parallel");
- // FIXME: output_dir here is damn ugly!
- typedef DeBruijnExtensionIndex<> ExtensionIndex;
- typedef typename ExtensionIndexHelper<ExtensionIndex>::DeBruijnExtensionIndexBuilderT ExtensionIndexBuilder;
- ExtensionIndex ext((unsigned) k, index.inner_index().workdir());
-
- //fixme hack
- ReadStatistics stats = ExtensionIndexBuilder().BuildExtensionIndexFromStream(ext, streams, (contigs_stream == 0) ? 0 : &(*contigs_stream), params.read_buffer_size);
-
- EarlyClipTips(k, params, stats.max_read_length_, ext);
-
- INFO("Condensing graph");
- VERIFY(!index.IsAttached());
- DeBruijnGraphExtentionConstructor<Graph> g_c(g, ext);
- g_c.ConstructGraph(100, 10000, 1.2, params.keep_perfect_loops);//TODO move these parameters to config
-
-    INFO("Building index from graph")
- //todo pass buffer size
- index.Refill();
- index.Attach();
-
- return stats;
-}
-
-template<class Graph, class Index, class Streams>
-ReadStatistics ConstructGraph(const debruijn_config::construction &params,
- Streams& streams, Graph& g,
- Index& index, io::SingleStreamPtr contigs_stream = io::SingleStreamPtr()) {
- if (params.con_mode == construction_mode::con_extention) {
- return ConstructGraphUsingExtentionIndex(params, streams, g, index, contigs_stream);
-// } else if(params.con_mode == construction_mode::con_old){
-// return ConstructGraphUsingOldIndex(k, streams, g, index, contigs_stream);
- } else {
- INFO("Invalid construction mode")
- VERIFY(false);
- return {0,0,0};
- }
-}
-
-template<class Graph, class Index, class Streams>
-ReadStatistics ConstructGraphWithCoverage(const debruijn_config::construction &params,
- Streams& streams, Graph& g,
- Index& index, FlankingCoverage<Graph>& flanking_cov,
- io::SingleStreamPtr contigs_stream = io::SingleStreamPtr()) {
- ReadStatistics rs = ConstructGraph(params, streams, g, index, contigs_stream);
-
- typedef typename Index::InnerIndexT InnerIndex;
- typedef typename EdgeIndexHelper<InnerIndex>::CoverageAndGraphPositionFillingIndexBuilderT IndexBuilder;
- INFO("Filling coverage index")
- IndexBuilder().ParallelFillCoverage(index.inner_index(), streams);
- INFO("Filling coverage and flanking coverage from index");
- FillCoverageAndFlanking(index.inner_index(), g, flanking_cov);
- return rs;
-}
-
-}
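
For orientation: the construction entry points above are driven by a
debruijn_config::construction object and write their results into the graph,
the edge index and the flanking-coverage structure of a graph pack. A call
sketch (illustrative only; gp is assumed to be a conj_graph_pack and streams an
already prepared io::ReadStreamList of the input reads):

    // Illustrative sketch; gp and streams come from the pipeline setup.
    debruijn_config::construction params = CreateDefaultConstructionConfig();
    ReadStatistics stats =
        ConstructGraphWithCoverage(params, streams, gp.g, gp.index, gp.flanking_cov);
    INFO("Graph constructed; max read length = " << stats.max_read_length_);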
diff --git a/src/debruijn/graph_pack.hpp b/src/debruijn/graph_pack.hpp
deleted file mode 100644
index d608417..0000000
--- a/src/debruijn/graph_pack.hpp
+++ /dev/null
@@ -1,154 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include "omni/id_track_handler.hpp"
-#include "omni/edges_position_handler.hpp"
-#include "debruijn_graph.hpp"
-#include "de/paired_info.hpp"
-#include "config_struct.hpp"
-#include "edge_index.hpp"
-#include "genomic_quality.hpp"
-#include "sequence_mapper.hpp"
-#include "genomic_info.hpp"
-#include "long_read_storage.hpp"
-#include "detail_coverage.hpp"
-#include "genome_storage.hpp"
-
-namespace debruijn_graph {
-
-/*KmerFree*//*KmerStoring*/
-template<class Graph, class SeqType, class KmerEdgeIndex = KmerStoringEdgeIndex<Graph, SeqType, kmer_index_traits<SeqType>, DefaultStoring>>
-struct graph_pack: private boost::noncopyable {
- typedef Graph graph_t;
- typedef typename Graph::VertexId VertexId;
- typedef typename Graph::EdgeId EdgeId;
- typedef SeqType seq_t;
- typedef EdgeIndex<graph_t, seq_t, KmerEdgeIndex> index_t;
- using PairedInfoIndicesT = omnigraph::de::PairedInfoIndicesT<Graph>;
- //typedef omnigraph::de::PairedInfoIndicesT<Graph> PairedInfoIndicesT;
- typedef omnigraph::de::UnclusteredPairedInfoIndicesT<Graph> UnclusteredPairedInfoIndicesT;
- typedef LongReadContainer<Graph> LongReadContainerT;
-
- size_t k_value;
-
- graph_t g;
- index_t index;
- KmerMapper<graph_t, seq_t> kmer_mapper;
- FlankingCoverage<graph_t> flanking_cov;
- UnclusteredPairedInfoIndicesT paired_indices;
- PairedInfoIndicesT clustered_indices;
- PairedInfoIndicesT scaffolding_indices;
- LongReadContainerT single_long_reads;
- GenomicInfo ginfo;
-
- GenomeStorage genome;
- EdgeQuality<Graph> edge_qual;
- mutable EdgesPositionHandler<graph_t> edge_pos;
-
- graph_pack(size_t k, const std::string &workdir, size_t lib_count,
- const std::string &genome = "",
- size_t flanking_range = 50,
- size_t max_mapping_gap = 0,
- size_t max_gap_diff = 0,
- bool detach_indices = true)
- : k_value(k), g(k), index(g, workdir),
- kmer_mapper(g),
- flanking_cov(g, flanking_range),
- paired_indices(g, lib_count),
- clustered_indices(g, lib_count),
- scaffolding_indices(g, lib_count),
- single_long_reads(g, lib_count),
- genome(genome),
- edge_qual(g),
- edge_pos(g, max_mapping_gap + k, max_gap_diff)
- {
- if (detach_indices) {
- DetachAll();
- }
- }
-
- void FillQuality() {
- edge_qual.Fill(index, kmer_mapper, genome.GetSequence());
- }
-
- //todo remove with usages after checking
- void ClearQuality() {
- edge_qual.clear();
- }
-
- void EnsureIndex() {
- if (!index.IsAttached()) {
- INFO("Index refill");
- index.Refill();
- index.Attach();
- }
- }
-
- void EnsureBasicMapping() {
- VERIFY(kmer_mapper.IsAttached());
- EnsureIndex();
- }
-
- void EnsureQuality() {
- if (!edge_qual.IsAttached()) {
- ClearQuality();
- FillQuality();
- edge_qual.Attach();
- }
- }
-
- //positions are refilled every time
- void EnsurePos() {
- if (!edge_pos.IsAttached()) {
- edge_pos.Attach();
- }
- edge_pos.clear();
- FillPos(*this, genome.GetSequence(), "ref0");
- FillPos(*this, !genome.GetSequence(), "ref1");
- }
-
- void EnsureDebugInfo() {
- EnsureBasicMapping();
- EnsureQuality();
- EnsurePos();
- }
-
- void InitRRIndices() {
- clustered_indices.Init();
- scaffolding_indices.Init();
- }
-
- void ClearRRIndices() {
- for (auto& pi : paired_indices) {
- pi.Clear();
- }
- clustered_indices.Clear();
- scaffolding_indices.Clear();
- single_long_reads.Clear();
- }
-
- void DetachAll() {
- index.Detach();
- kmer_mapper.Detach();
- edge_pos.Detach();
- edge_qual.Detach();
- }
-
-};
-
-typedef graph_pack<ConjugateDeBruijnGraph, runtime_k::RtSeq, KmerFreeEdgeIndex<Graph, runtime_k::RtSeq, kmer_index_traits<runtime_k::RtSeq>, DefaultStoring>> conj_graph_pack;
-typedef conj_graph_pack::index_t Index;
-
-typedef conj_graph_pack::PairedInfoIndicesT PairedIndicesT;
-typedef conj_graph_pack::UnclusteredPairedInfoIndicesT UnclusteredPairedIndicesT;
-typedef conj_graph_pack::LongReadContainerT LongReadContainerT;
-typedef omnigraph::de::PairedInfoIndexT<ConjugateDeBruijnGraph> PairedIndexT;
-typedef omnigraph::de::UnclusteredPairedInfoIndexT<ConjugateDeBruijnGraph> UnclusteredPairedIndexT;
-
-} // namespace debruijn_graph
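
For orientation: a conj_graph_pack bundles the graph with its indices and
handlers, constructed with everything detached by default; each stage then
attaches only what it needs via the Ensure* helpers. A life-cycle sketch
(illustrative only; k, the working directory and the reference string are
placeholders):

    // Illustrative sketch; values are placeholders.
    conj_graph_pack gp(55 /* k */, "/tmp/spades_workdir", 1 /* lib_count */, reference_str);
    gp.EnsureIndex();    // refill and attach the edge index if it is detached
    gp.EnsureQuality();  // fill edge_qual from the reference (uses index and kmer_mapper)
    gp.EnsurePos();      // (re)fill reference positions on the edges
    // ... stage-specific work on gp.g ...
    gp.ClearQuality();   // drop reference-quality bookkeeping when no longer needed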
diff --git a/src/debruijn/graph_read_correction.hpp b/src/debruijn/graph_read_correction.hpp
deleted file mode 100644
index b827ac7..0000000
--- a/src/debruijn/graph_read_correction.hpp
+++ /dev/null
@@ -1,183 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include "standard.hpp"
-#include "omni/path_processor.hpp"
-#include "io/modifying_reader_wrapper.hpp"
-
-namespace debruijn_graph {
-
-template<class gp_t>
-class TipsProjector {
- typedef typename gp_t::graph_t Graph;
- typedef typename Graph::EdgeId EdgeId;
-
- gp_t& gp_;
-
- const UniquePathFinder<Graph> unique_path_finder_;
-
- optional<EdgeId> UniqueAlternativeEdge(EdgeId tip, bool outgoing_tip) {
- vector<EdgeId> edges;
- if (outgoing_tip) {
- push_back_all(edges, gp_.g.OutgoingEdges(gp_.g.EdgeStart(tip)));
- } else {
- push_back_all(edges, gp_.g.IncomingEdges(gp_.g.EdgeEnd(tip)));
- }
- restricted::set<EdgeId> edges_set(edges.begin(), edges.end());
- edges_set.erase(tip);
- if (edges_set.size() == 1)
- return optional < EdgeId > (*edges_set.begin());
- else
- return boost::none;
- }
-
- vector<EdgeId> UniqueAlternativePath(EdgeId tip, bool outgoing_tip) {
- optional<EdgeId> alt_edge = UniqueAlternativeEdge(tip, outgoing_tip);
- if (alt_edge) {
- if (outgoing_tip) {
- return unique_path_finder_.UniquePathForward(*alt_edge);
- } else {
- return unique_path_finder_.UniquePathBackward(*alt_edge);
- }
- }
- return vector<EdgeId>();
- }
-
- void AlignAndProject(const Sequence& tip_seq, const Sequence& alt_seq,
- bool outgoing_tip) {
- //todo refactor
- Sequence aligned_tip = tip_seq;
- Sequence aligned_alt = alt_seq;
- if (outgoing_tip) {
- if (tip_seq.size() >= alt_seq.size()) {
- aligned_tip = tip_seq.Subseq(0, alt_seq.size());
- } else {
- aligned_alt = alt_seq.Subseq(0, tip_seq.size());
- }
- } else {
- if (tip_seq.size() >= alt_seq.size()) {
- aligned_tip = tip_seq.Subseq(tip_seq.size() - alt_seq.size());
- } else {
- aligned_alt = alt_seq.Subseq(alt_seq.size() - tip_seq.size());
- }
- }
-
- INFO(
- "Remapping " << aligned_tip.size()
- << " kmers of aligned_tip to aligned_alt");
- gp_.kmer_mapper.RemapKmers(aligned_tip, aligned_alt);
- }
-
-public:
- TipsProjector(gp_t& gp) :
- gp_(gp), unique_path_finder_(gp.g) {
-
- }
-
- void ProjectTip(EdgeId tip) {
- TRACE("Trying to project tip " << gp_.g.str(tip));
- bool outgoing_tip = gp_.g.IsDeadEnd(gp_.g.EdgeEnd(tip));
- Sequence tip_seq = gp_.g.EdgeNucls(tip);
- vector<EdgeId> alt_path = UniqueAlternativePath(tip, outgoing_tip);
- if (alt_path.empty()) {
- TRACE(
- "Failed to find unique alt path for tip " << gp_.g.str(tip)
- << ". Wasn't projected!!!");
- } else {
- Sequence alt_seq = MergeSequences(gp_.g, alt_path);
- if (tip_seq.size() > alt_seq.size()) {
- TRACE(
- "Can't fully project tip " << gp_.g.str(tip)
- << " with seq length " << tip_seq.size()
- << " because alt path length is "
- << alt_seq.size()
- << ". Trying to project partially");
- }
- AlignAndProject(tip_seq, alt_seq, outgoing_tip);
- AlignAndProject(!tip_seq, !alt_seq, !outgoing_tip);
- TRACE("Tip projected");
- }
- }
-private:
- DECL_LOGGER("TipsProjector")
- ;
-};
-
-//todo improve logging
-template<class Graph, class Mapper>
-class GraphReadCorrector: public io::SequenceModifier {
- typedef typename Graph::EdgeId EdgeId;
- typedef typename Graph::VertexId VertexId;
-
- const Graph& graph_;
- const Mapper mapper_;
- const MappingPathFixer<Graph> path_fixer_;
-
-public:
- /*virtual*/
- Sequence Modify(const Sequence& s) {
-// if(s < !s)
-// return !Refine(!s);
- MappingPath<EdgeId> mapping_path = mapper_.MapSequence(s);
-
- if (mapping_path.size() == 0 || s.size() < graph_.k() + 1
- || mapping_path.front().second.initial_range.start_pos != 0
- || mapping_path.back().second.initial_range.end_pos
- != s.size() - graph_.k()) {
- //todo reduce concat unmapped beginning and end in future???
- TRACE(
-                    "Won't fix because the read wasn't mapped or its start/end fell on an unprojected tip/erroneous connection");
-// TRACE(
-// "For sequence of length " << s.size()
-// << " returning empty sequence");
- return s;
-// return Sequence();
- }
-
- Path<EdgeId> path = path_fixer_.TryFixPath(mapping_path.path());
-// TRACE("Mapped sequence to path " << graph_.str(path.sequence()));
-
- if (!path_fixer_.CheckContiguous(path.sequence())) {
- TRACE("Even fixed path wasn't contiguous");
- return s;
- } else {
- TRACE("Fixed path is contiguous");
- Sequence answer = PathSequence(graph_, path);
-// if (answer != s) {
-// if (answer.size() < 1000) {
-// TRACE(
-// "Initial sequence modified, edit distance= "
-// << EditDistance(answer, s));
-// } else {
-// TRACE("Sequence too large, won't count edit distance");
-// }
-// }
- return answer;
- }
-
-// else {
-// TRACE("Initial sequence unmodified!");
-// }
- }
-
- GraphReadCorrector(const Graph& graph, const Mapper& mapper) :
- graph_(graph), mapper_(mapper), path_fixer_(graph) {
- }
-
-private:
- DECL_LOGGER("ContigRefiner");
-};
-
-template<class Graph, class Mapper>
-shared_ptr<GraphReadCorrector<Graph, Mapper>> GraphReadCorrectorInstance(
- const Graph& graph, const Mapper& mapper) {
- return std::make_shared<GraphReadCorrector<Graph, Mapper>>(graph, mapper);
-}
-
-}
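
For orientation: GraphReadCorrector is an io::SequenceModifier that maps a read
onto the graph and, when the whole read corresponds to a contiguous path,
replaces it with that path's sequence; otherwise the read is returned unchanged.
A usage sketch (illustrative only; mapper is a sequence mapper over g built
elsewhere, read_seq is a Sequence):

    // Illustrative sketch; mapper and read_seq are assumed inputs.
    auto corrector = GraphReadCorrectorInstance(g, mapper);
    Sequence corrected = corrector->Modify(read_seq);
    // corrected == read_seq whenever the read could not be mapped onto a
    // single contiguous path of the graph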
diff --git a/src/debruijn/graphio.hpp b/src/debruijn/graphio.hpp
deleted file mode 100644
index 2534204..0000000
--- a/src/debruijn/graphio.hpp
+++ /dev/null
@@ -1,1017 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include "standard.hpp"
-#include "omni/omni_utils.hpp"
-
-#include "omni/omni_tools.hpp"
-
-#include "omni/id_track_handler.hpp"
-#include "omni/edges_position_handler.hpp"
-#include "omni/graph_component.hpp"
-
-#include "de/paired_info.hpp"
-
-#include "debruijn_graph.hpp"
-#include "detail_coverage.hpp"
-#include "long_read_storage.hpp"
-
-#include "omni/order_and_law.hpp"
-
-#include <cmath>
-#include <set>
-#include <map>
-#include <algorithm>
-#include <fstream>
-#include <cstdio>
-
-namespace debruijn_graph {
-
-namespace graphio {
-
-using namespace omnigraph;
-using namespace omnigraph::de;
-//todo think of inner namespace
-
-template<class KmerMapper>
-void SaveKmerMapper(const string& file_name,
- const KmerMapper& mapper) {
- std::ofstream file;
- file.open((file_name + ".kmm").c_str(),
- std::ios_base::binary | std::ios_base::out);
- DEBUG("Saving kmer mapper, " << file_name <<" created");
- VERIFY(file.is_open());
-
- uint32_t k_ = (uint32_t) mapper.get_k();
- file.write((char *) &k_, sizeof(uint32_t));
- mapper.BinWrite(file);
-
- file.close();
- DEBUG("kmer mapper saved ")
-}
-
-template<class KmerMapper>
-bool LoadKmerMapper(const string& file_name,
- KmerMapper& kmer_mapper) {
- kmer_mapper.clear();
- std::ifstream file;
- file.open((file_name + ".kmm").c_str(),
- std::ios_base::binary | std::ios_base::in);
- if (!file.is_open()) {
- return false;
- }
- INFO("Reading kmer mapper, " << file_name <<" started");
-
- uint32_t k_;
- file.read((char *) &k_, sizeof(uint32_t));
-
- VERIFY_MSG(k_ == kmer_mapper.get_k(), "Cannot read kmer mapper, different Ks");
- kmer_mapper.BinRead(file);
-
- file.close();
- return true;
-}
-
-template<class EdgeIndex>
-void SaveEdgeIndex(const std::string& file_name,
- const EdgeIndex& index) {
- std::ofstream file;
- file.open((file_name + ".kmidx").c_str(),
- std::ios_base::binary | std::ios_base::out);
- DEBUG("Saving kmer index, " << file_name <<" created");
- VERIFY(file.is_open());
-
- uint32_t k_ = index.k();
- file.write((char *) &k_, sizeof(uint32_t));
- index.BinWrite(file);
-
- file.close();
- DEBUG("index saved ")
-}
-
-template<class EdgeIndex>
-bool LoadEdgeIndex(const std::string& file_name,
- EdgeIndex& index) {
- std::ifstream file;
- file.open((file_name + ".kmidx").c_str(),
- std::ios_base::binary | std::ios_base::in);
- INFO("Reading kmer index, " << file_name <<" started");
- if (!file.is_open())
- return false;
-
- uint32_t k_;
- file.read((char *) &k_, sizeof(uint32_t));
- VERIFY_MSG(k_ == index.k(), "Cannot read edge index, different Ks:");
-
- index.BinRead(file, file_name + ".kmidx");
-
- file.close();
-
- return true;
-}
-
-inline
-void SaveMapCoverage(const std::string& path, const std::map<int, int>& data ) {
- std::ofstream outFile;
- outFile.open(path.c_str());
-
- INFO("Saving detailed coverage in file " << path <<" started");
- outFile << data.size() << "\n";
- for (auto dataIterator = data.begin(); dataIterator != data.end(); ++dataIterator){
- outFile << dataIterator->first << " " << dataIterator->second << " .\n";
- }
-}
-
-template<class KmerIndex>
-void SaveDetailCoverage(const std::string& pathInCov, const std::string& pathOutCov, const KmerIndex& index ) {
- SaveMapCoverage(pathInCov, index.inCoverage);
- SaveMapCoverage(pathOutCov, index.outCoverage);
-}
-
-
-template<class Graph>
-class DataPrinter {
- typedef typename Graph::EdgeId EdgeId;
- typedef typename Graph::VertexId VertexId;
-
- //todo reduce duplication
- template<class T>
- void SaveEdgeAssociatedInfo(std::function<T (EdgeId)> access_f, ostream& out) const {
- out << component_.e_size() << endl;
- for (auto iter = component_.e_begin(); iter != component_.e_end(); ++iter) {
- EdgeId e = *iter;
-            //todo fixme: currently matches the old .cvr format
- out << e.int_id()/* << endl*/;
- out << " " << access_f(e) << " ." << endl;
- }
- }
-
-// template<class C>
-// void SaveEdgeAssociatedInfo(const C& c, ostream& out) const {
-// SaveEdgeAssociatedInfo<decltype(C::operator[])>(boost::bind(&C::operator[], c, _1), out);
-// }
-
- template<class C>
- void SaveEdgeAssociatedInfo(const C& c, ostream& out) const {
- out << component_.e_size() << endl;
- for (auto iter = component_.e_begin(); iter != component_.e_end(); ++iter) {
- EdgeId e = *iter;
-            //todo fixme: currently matches the old .cvr format
- out << e.int_id()/* << endl*/;
- out << " ";
- c.Save(e, out);
- out << " ." << endl;
- }
- }
-
- public:
-
- void SaveGraph(const string& file_name) const {
- FILE* gid_file = fopen((file_name + ".gid").c_str(), "w");
- size_t max_id = this->component().g().GetGraphIdDistributor().GetMax();
- fprintf(gid_file, "%zu\n", max_id);
- fclose(gid_file);
- FILE* file = fopen((file_name + ".grp").c_str(), "w");
- DEBUG("Graph saving to " << file_name << " started");
- VERIFY_MSG(file != NULL,
- "Couldn't open file " << (file_name + ".grp") << " on write");
- size_t vertex_count = component_.v_size();
- size_t edge_count = component_.e_size();
- fprintf(file, "%zu %zu \n", vertex_count, edge_count);
- for (auto iter = component_.v_begin(); iter != component_.v_end(); ++iter) {
- Save(file, *iter);
- }
-
- fprintf(file, "\n");
-
- for (auto iter = component_.e_begin(); iter != component_.e_end(); ++iter) {
- Save(file, *iter);
- }
- DEBUG("Graph saving to " << file_name << " finished");
-
- fclose(file);
- }
-
- void SaveEdgeSequences(const string& file_name) const {
- ofstream out(file_name + ".sqn");
- //todo switch to general function after its switching to fasta
- //SaveEdgeAssociatedInfo<Sequence>(boost::bind(&Graph::EdgeNucls, component_.g(), _1), out);
- DEBUG("Saving sequences, " << file_name <<" created");
- for (auto iter = component_.e_begin(); iter != component_.e_end(); ++iter) {
- EdgeId e = *iter;
- out << ">" << e.int_id() << endl;
- out << component_.g().EdgeNucls(e) << endl;
- }
- }
-
- void SaveCoverage(const string& file_name) const {
- ofstream out(file_name + ".cvr");
- DEBUG("Saving coverage, " << file_name <<" created");
- SaveEdgeAssociatedInfo(component_.g().coverage_index(), out);
- }
-
- void SaveFlankingCoverage(const string& file_name, const FlankingCoverage<Graph>& flanking_cov) const {
- ofstream out(file_name + ".flcvr");
- DEBUG("Saving flanking coverage, " << file_name <<" created");
- SaveEdgeAssociatedInfo(flanking_cov, out);
- }
-
- template<class Index>
- void SavePaired(const string& file_name,
- Index const& paired_index) const {
- FILE* file = fopen((file_name + ".prd").c_str(), "w");
- DEBUG("Saving paired info, " << file_name <<" created");
- VERIFY(file != NULL);
-
- size_t comp_size = 0;
- for (auto I = component_.e_begin(), E = component_.e_end(); I != E; ++I) {
- EdgeId e1 = *I;
- auto inner_map = paired_index.Get(e1);
- for (auto entry : inner_map) {
- if (component_.contains(entry.first)) { // if the second edge also lies in the same component
- comp_size += entry.second.size();
- continue;
- }
- }
- }
-
- fprintf(file, "%zu\n", comp_size);
-
- for (auto I = component_.e_begin(), E = component_.e_end(); I != E; ++I) {
- EdgeId e1 = *I;
- const auto& inner_map = paired_index.RawGet(e1);
- std::map<typename Graph::EdgeId, typename Index::RawHistProxy> ordermap(inner_map.begin(), inner_map.end());
- for (auto entry : ordermap) {
- EdgeId e2 = entry.first;
- if (component_.contains(e2))
- for (auto point : entry.second)
- fprintf(file, "%zu %zu %.2f %.2f %.2f .\n",
- e1.int_id(), e2.int_id(), math::eq((double)point.d, .0) ? .0 : (double)point.d, (double)point.weight, (double)point.variation());
- }
- }
-
- fclose(file);
- }
-
- void SavePositions(const string& file_name,
- EdgesPositionHandler<Graph> const& ref_pos) const {
- ofstream file((file_name + ".pos").c_str());
- DEBUG("Saving edges positions, " << file_name << " created");
- VERIFY(file.is_open());
- file << component_.e_size() << endl;
- for (auto it = component_.e_begin(); it != component_.e_end(); ++it) {
- vector<omnigraph::EdgePosition> pos_it = ref_pos.GetEdgePositions(*it);
- file << it->int_id() << " " << pos_it.size() << endl;
- for (size_t i = 0; i < pos_it.size(); i++) {
- file << " " << pos_it[i].contigId << " " << pos_it[i].mr << endl;
- }
- }
- }
-
- private:
- void Save(FILE* file, EdgeId eid) const {
- fprintf(file, "%s\n", ToPrint(eid).c_str());
- }
-
- void Save(FILE* file, VertexId vid) const {
- fprintf(file, "%s\n", ToPrint(vid).c_str());
- }
-
- const GraphComponent<Graph> component_;
-
- virtual std::string ToPrint(VertexId v) const = 0;
- virtual std::string ToPrint(EdgeId e) const = 0;
-
- protected:
-
- //todo optimize component copy
- DataPrinter(const GraphComponent<Graph>& component) :
- component_(component) {
- }
-
- const GraphComponent<Graph>& component() const {
- return component_;
- }
-
- public:
- virtual ~DataPrinter() {
- }
-};
-
-template<class Graph>
-class ConjugateDataPrinter: public DataPrinter<Graph> {
- typedef DataPrinter<Graph> base;
- typedef typename Graph::EdgeId EdgeId;
- typedef typename Graph::VertexId VertexId;
- public:
- ConjugateDataPrinter(Graph const& g) :
- base(g) {
- }
-
- ConjugateDataPrinter(const GraphComponent<Graph>& graph_component) :
- base(GraphComponent<Graph>(graph_component, true)) {
- }
-
- template<class VertexIt>
- ConjugateDataPrinter(const Graph& g, VertexIt begin, VertexIt end) :
- base(GraphComponent<Graph>(g, begin, end, true)) {
- }
-
- std::string ToPrint(VertexId v) const {
- stringstream ss;
- ss
- << "Vertex "
- << v.int_id()
- << " ~ "
- << this->component().g().conjugate(v).int_id() << " .";
- return ss.str();
- }
-
- std::string ToPrint(EdgeId e) const {
- stringstream ss;
- ss
- << "Edge "
- << e.int_id()
- << " : "
- << this->component().g().EdgeStart(e).int_id()
- << " -> "
- << this->component().g().EdgeEnd(e).int_id()
- << ", l = "
- << this->component().g().length(e)
- << " ~ "
- << this->component().g().conjugate(e).int_id() << " .";
- return ss.str();
- }
-
-};
-
-template<class Graph>
-class DataScanner {
- typedef typename Graph::EdgeId EdgeId;
- typedef typename Graph::VertexId VertexId;
-
- template<class T>
- void LoadEdgeAssociatedInfo(std::function<void (EdgeId, T)> setting_f, istream& in) const {
- size_t cnt;
- in >> cnt;
- for (size_t i = 0 ; i < cnt; ++i) {
- size_t edge_id;
- T t;
- string delim;
- in >> edge_id;
- in >> t;
- in >> delim;
- VERIFY(delim == ".");
- VERIFY(this->edge_id_map().find(edge_id) != this->edge_id_map().end());
- setting_f(this->edge_id_map()[edge_id], t);
- }
- }
-
- template<class T>
- void LoadEdgeAssociatedInfo(T& t, istream& in) const {
- size_t cnt;
- in >> cnt;
- for (size_t i = 0 ; i < cnt; ++i) {
- size_t edge_id;
- in >> edge_id;
- VERIFY(this->edge_id_map().find(edge_id) != this->edge_id_map().end());
- EdgeId eid = this->edge_id_map().find(edge_id)->second;
- t.Load(eid, in);
- string delim;
- in >> delim;
- VERIFY(delim == ".");
- }
- }
-
-// template<class C>
-// void LoadEdgeAssociatedInfo(const C& c, ostream& out) const {
-// SaveEdgeAssociatedInfo<decltype(C::operator[])>(boost::bind(&C::operator[], c, _1), out);
-// }
-
- public:
- virtual void LoadGraph(const string& file_name) = 0;
-
- void LoadCoverage(const string& file_name) {
- INFO("Reading coverage from " << file_name);
- ifstream in(file_name + ".cvr");
- LoadEdgeAssociatedInfo(g_.coverage_index(), in);
- }
-
- bool LoadFlankingCoverage(const string& file_name, FlankingCoverage<Graph>& flanking_cov) {
- if (!path::FileExists(file_name + ".flcvr")) {
- INFO("Flanking coverage saves are absent");
- return false;
- }
- INFO("Reading flanking coverage from " << file_name);
- ifstream in(file_name + ".flcvr");
- LoadEdgeAssociatedInfo(flanking_cov, in);
- return true;
- }
-
- template<typename Index>
- void LoadPaired(const string& file_name,
- Index& paired_index,
- bool force_exists = true) {
- typedef typename Graph::EdgeId EdgeId;
- FILE* file = fopen((file_name + ".prd").c_str(), "r");
- INFO((file_name + ".prd"));
- if (force_exists) {
- VERIFY(file != NULL);
- } else if (file == NULL) {
- INFO("Paired info not found, skipping");
- return;
- }
- INFO("Reading paired info from " << file_name << " started");
-
- size_t paired_count;
- int read_count = fscanf(file, "%zu \n", &paired_count);
- VERIFY(read_count == 1);
- while (!feof(file)) {
- size_t first_real_id, second_real_id;
- double w, d, v;
- read_count = fscanf(file, "%zu %zu %lf %lf %lf .\n",
- &first_real_id, &second_real_id, &d, &w, &v);
- VERIFY(read_count == 5);
- TRACE(first_real_id<< " " << second_real_id << " " << d << " " << w << " " << v);
- VERIFY(this->edge_id_map().find(first_real_id) != this->edge_id_map().end())
- EdgeId e1 = this->edge_id_map()[first_real_id];
- EdgeId e2 = this->edge_id_map()[second_real_id];
- if (e1 == EdgeId(NULL) || e2 == EdgeId(NULL))
- continue;
- TRACE(e1 << " " << e2 << " " << d << " " << w);
- paired_index.Add(e1, e2, { d, w, v });
- }
- DEBUG("PII SIZE " << paired_index.size());
- fclose(file);
- }
-
- bool LoadPositions(const string& file_name,
- EdgesPositionHandler<Graph>& edge_pos) {
- FILE* file = fopen((file_name + ".pos").c_str(), "r");
- if (file == NULL) {
- INFO("No positions were saved");
- return false;
- }
- VERIFY(!edge_pos.IsAttached());
- edge_pos.Attach();
- INFO("Reading edges positions, " << file_name <<" started");
- VERIFY(file != NULL);
- size_t pos_count;
- int read_count = fscanf(file, "%zu\n", &pos_count);
- VERIFY(read_count == 1);
- for (size_t i = 0; i < pos_count; i++) {
- size_t edge_real_id, pos_info_count;
- char contigId[500];
- char cur_str[500];
- read_count = fscanf(file, "%zu %zu\n", &edge_real_id, &pos_info_count);
- VERIFY(read_count == 2);
- // INFO( edge_real_id);
- for (size_t j = 0; j < pos_info_count; j++) {
- int start_pos, end_pos;
- int m_start_pos, m_end_pos;
- read_count = fscanf(file, "%[^\n]s", cur_str);
- read_count = fscanf(file, "\n");
- read_count = sscanf(cur_str, "%s [%d - %d] --> [%d - %d]", contigId,
- &start_pos, &end_pos, &m_start_pos, &m_end_pos);
- // INFO(cur_str);
- // INFO (contigId<<" "<< start_pos<<" "<<end_pos);
- // VERIFY(read_count == 3);
- VERIFY(read_count == 5);
- VERIFY(this->edge_id_map().find(edge_real_id) != this->edge_id_map().end());
- EdgeId eid = this->edge_id_map()[edge_real_id];
- edge_pos.AddEdgePosition(eid, string(contigId), start_pos - 1, end_pos, m_start_pos - 1, m_end_pos);
- }
- }
- fclose(file);
- return true;
- }
-
- private:
- Graph& g_;
- // int edge_count_;
- map<size_t, EdgeId> edge_id_map_;
- map<size_t, VertexId> vertex_id_map_;
-
- protected:
- DataScanner(Graph &g) : g_(g) {
-        INFO("Scanner creation started");
- // edge_count_ = 0;
- }
-
- Graph& g() {
- return g_;
- }
-
- map<size_t, EdgeId> &edge_id_map() {
- return edge_id_map_;
- }
-
- map<size_t, VertexId> &vertex_id_map() {
- return vertex_id_map_;
- }
-
- const map<size_t, EdgeId> &edge_id_map() const {
- return edge_id_map_;
- }
-
- const map<size_t, VertexId> &vertex_id_map() const {
- return vertex_id_map_;
- }
-
- public:
- virtual ~DataScanner() {
-
- }
-};
-
-template<class Graph>
-class ConjugateDataScanner: public DataScanner<Graph> {
- typedef DataScanner<Graph> base;
-public:
- typedef typename Graph::EdgeId EdgeId;
- typedef typename Graph::VertexId VertexId;
-private:
- restricted::IdSegmentStorage CreateIdStorage(const string& file_name) {
- FILE* file = fopen((file_name + ".gid").c_str(), "r");
-        //This is to support compatibility with old saves. Will be removed soon.
- if(file == NULL) {
- return this->g().GetGraphIdDistributor().ReserveUpTo(1000000000);
- }
- VERIFY_MSG(file != NULL, "Couldn't find file " << (file_name + ".gid"));
- size_t max;
- int flag = fscanf(file, "%zu\n", &max);
- VERIFY(flag == 1);
- fclose(file);
- return this->g().GetGraphIdDistributor().ReserveUpTo(max);
- }
-
- public:
- /*virtual*/
- void LoadGraph(const string& file_name) {
- restricted::IdSegmentStorage id_storage = CreateIdStorage(file_name);
- INFO("Trying to read conjugate de bruijn graph from " << file_name << ".grp");
- FILE* file = fopen((file_name + ".grp").c_str(), "r");
- VERIFY_MSG(file != NULL, "Couldn't find file " << (file_name + ".grp"));
- FILE* sequence_file = fopen((file_name + ".sqn").c_str(), "r");
-        VERIFY_MSG(sequence_file != NULL, "Couldn't find file " << (file_name + ".sqn"));
- INFO("Reading conjugate de bruijn graph from " << file_name << " started");
- size_t vertex_count;
- size_t edge_count;
- int flag = fscanf(file, "%zu %zu \n", &vertex_count, &edge_count);
- VERIFY(flag == 2);
- for (size_t i = 0; i < vertex_count; i++) {
- size_t vertex_real_id, conjugate_id;
- flag = fscanf(file, "Vertex %zu ~ %zu .\n", &vertex_real_id, &conjugate_id);
- TRACE("Vertex "<<vertex_real_id<<" ~ "<<conjugate_id<<" .");
- VERIFY(flag == 2);
-
- if (this->vertex_id_map().find((int) vertex_real_id) == this->vertex_id_map().end()) {
- size_t ids[2] = {vertex_real_id, conjugate_id};
- auto id_distributor = id_storage.GetSegmentIdDistributor(ids, ids + 2);
- VertexId vid = this->g().AddVertex(typename Graph::VertexData(), id_distributor);
- VertexId conj_vid = this->g().conjugate(vid);
-
- this->vertex_id_map()[vertex_real_id] = vid;
- this->vertex_id_map()[conjugate_id] = conj_vid;
- }
- }
-
- char first_char = (char) getc(sequence_file);
- VERIFY(!ferror(sequence_file));
- ungetc(first_char, sequence_file);
- bool fasta = (first_char == '>'); // if it's not fasta, then it's old .sqn
-
-
- if (!fasta) {
- size_t tmp_edge_count;
- flag = fscanf(sequence_file, "%zu", &tmp_edge_count);
- VERIFY(flag == 1);
- VERIFY(edge_count == tmp_edge_count);
- }
-
- const size_t longstring_size = 1000500; // TODO: O RLY magic constant? => Can't load edges >= 1Mbp
- char longstring[longstring_size];
- for (size_t i = 0; i < edge_count; i++) {
- size_t e_real_id, start_id, fin_id, length, conjugate_edge_id;
- flag = fscanf(file, "Edge %zu : %zu -> %zu, l = %zu ~ %zu .\n",
- &e_real_id, &start_id, &fin_id, &length, &conjugate_edge_id);
- VERIFY(flag == 5);
- VERIFY(length < longstring_size);
- if (fasta) {
- flag = fscanf(sequence_file, ">%zu\n%s\n", &e_real_id, longstring);
- }
- else {
- flag = fscanf(sequence_file, "%zu %s .", &e_real_id, longstring);
- }
- VERIFY(flag == 2);
- TRACE("Edge " << e_real_id << " : " << start_id << " -> "
- << fin_id << " l = " << length << " ~ " << conjugate_edge_id);
- if (this->edge_id_map().find((int) e_real_id) == this->edge_id_map().end()) {
- size_t ids[2] = {e_real_id, conjugate_edge_id};
- auto id_distributor = id_storage.GetSegmentIdDistributor(ids, ids + 2);
- Sequence tmp(longstring);
- EdgeId eid = this->g().AddEdge(this->vertex_id_map()[start_id], this->vertex_id_map()[fin_id], tmp, id_distributor);
- this->edge_id_map()[e_real_id] = eid;
- this->edge_id_map()[conjugate_edge_id] = this->g().conjugate(eid);
- }
- }
- fclose(file);
- fclose(sequence_file);
- }
- public:
- ConjugateDataScanner(Graph& g) :
- base(g) {
- }
-};
-
-inline std::string MakeSingleReadsFileName(const std::string& file_name,
- size_t index) {
- return file_name + "_paths_" + ToString(index) + ".mpr";
-}
-
-//helper methods
-// todo think how to organize them in the most natural way
-
-template<class Graph>
-void PrintBasicGraph(const string& file_name, DataPrinter<Graph>& printer) {
- printer.SaveGraph(file_name);
- printer.SaveEdgeSequences(file_name);
- printer.SaveCoverage(file_name);
-}
-
-template<class graph_pack>
-void PrintGraphPack(const string& file_name,
- DataPrinter<typename graph_pack::graph_t>& printer,
- const graph_pack& gp) {
- PrintBasicGraph(file_name, printer);
- // printer.SavePaired(file_name + "_et", gp.etalon_paired_index);
- if (gp.edge_pos.IsAttached())
- printer.SavePositions(file_name, gp.edge_pos);
- if (gp.index.IsAttached())
- SaveEdgeIndex(file_name, gp.index.inner_index());
- if (gp.kmer_mapper.IsAttached())
- SaveKmerMapper(file_name, gp.kmer_mapper);
- if (gp.flanking_cov.IsAttached())
- printer.SaveFlankingCoverage(file_name, gp.flanking_cov);
-}
-
-template<class graph_pack>
-void PrintGraphPack(const string& file_name, const graph_pack& gp) {
- ConjugateDataPrinter<typename graph_pack::graph_t> printer(gp.g);
- PrintGraphPack(file_name, printer, gp);
-}
-
-template<class Graph>
-void PrintPairedIndex(const string& file_name, DataPrinter<Graph>& printer,
- const PairedInfoIndexT<Graph>& paired_index) {
- printer.SavePaired(file_name, paired_index);
-}
-
-template<class Graph>
-void PrintUnclusteredIndex(const string& file_name, DataPrinter<Graph>& printer,
- const UnclusteredPairedInfoIndexT<Graph>& paired_index) {
- printer.SavePaired(file_name, paired_index);
-}
-
-template<class Graph>
-void PrintClusteredIndex(const string& file_name, DataPrinter<Graph>& printer,
- const PairedInfoIndexT<Graph>& clustered_index) {
- PrintPairedIndex(file_name + "_cl", printer, clustered_index);
-}
-
-template<class Graph>
-void PrintScaffoldingIndex(const string& file_name, DataPrinter<Graph>& printer,
- const PairedInfoIndexT<Graph>& clustered_index) {
- PrintPairedIndex(file_name + "_scf", printer, clustered_index);
-}
-
-template<class Graph>
-void PrintScaffoldIndex(const string& file_name, DataPrinter<Graph>& printer,
- const PairedInfoIndexT<Graph>& scaffold_index) {
- PrintPairedIndex(file_name + "_scf", printer, scaffold_index);
-}
-
-template<class Graph>
-void PrintUnclusteredIndices(const string& file_name, DataPrinter<Graph>& printer,
- const UnclusteredPairedInfoIndicesT<Graph>& paired_indices) {
- for (size_t i = 0; i < paired_indices.size(); ++i)
- PrintUnclusteredIndex(file_name + "_" + ToString(i), printer, paired_indices[i]);
-}
-
-template<class Graph>
-void PrintClusteredIndices(const string& file_name, DataPrinter<Graph>& printer,
- const PairedInfoIndicesT<Graph>& paired_indices) {
- for (size_t i = 0; i < paired_indices.size(); ++i)
- PrintClusteredIndex(file_name + "_" + ToString(i), printer, paired_indices[i]);
-}
-
-template<class Graph>
-void PrintScaffoldingIndices(const string& file_name, DataPrinter<Graph>& printer,
- const PairedInfoIndicesT<Graph>& paired_indices) {
- for (size_t i = 0; i < paired_indices.size(); ++i)
- PrintScaffoldingIndex(file_name + "_" + ToString(i), printer, paired_indices[i]);
-}
-
-template<class graph_pack>
-void PrintWithPairedIndex(const string& file_name,
- DataPrinter<typename graph_pack::graph_t>& printer,
- const graph_pack& gp,
- const PairedInfoIndexT<typename graph_pack::graph_t>& paired_index,
- bool clustered_index = false) {
-
- PrintGraphPack(file_name, printer, gp);
- if (!clustered_index) {
- PrintPairedIndex(file_name, printer, paired_index);
- } else {
- PrintClusteredIndex(file_name, printer, paired_index);
- }
-}
-
-template<class graph_pack>
-void PrintWithClusteredIndex(const string& file_name,
- DataPrinter<typename graph_pack::graph_t>& printer,
- const graph_pack& gp,
- const PairedInfoIndexT<typename graph_pack::graph_t>& paired_index) {
- PrintWithPairedIndex(file_name, printer, gp, paired_index, true);
-}
-
-template<class graph_pack>
-void PrintWithPairedIndices(const string& file_name,
- DataPrinter<typename graph_pack::graph_t>& printer,
- const graph_pack& gp,
- const PairedInfoIndicesT<typename graph_pack::graph_t>& paired_indices,
- bool clustered_index = false) {
- PrintGraphPack(file_name, printer, gp);
- if (!clustered_index)
- PrintPairedIndices(file_name, printer, paired_indices);
- else
- PrintClusteredIndices(file_name, printer, paired_indices);
-}
-
-template<class graph_pack>
-void PrintWithClusteredIndices(const string& file_name,
- DataPrinter<typename graph_pack::graph_t>& printer,
- const graph_pack& gp,
- const PairedInfoIndicesT<typename graph_pack::graph_t>& paired_indices) {
- PrintWithPairedIndices(file_name, printer, gp, paired_indices, true);
-}
-
-template<class Graph>
-void PrintSingleLongReads(const string& file_name, const LongReadContainer<Graph>& single_long_reads) {
- for (size_t i = 0; i < single_long_reads.size(); ++i){
- single_long_reads[i].DumpToFile(MakeSingleReadsFileName(file_name, i));
- }
-}
-
-template<class graph_pack>
-void PrintAll(const string& file_name, const graph_pack& gp) {
- ConjugateDataPrinter<typename graph_pack::graph_t> printer(gp.g, gp.g.begin(), gp.g.end());
- PrintGraphPack(file_name, printer, gp);
- PrintUnclusteredIndices(file_name, printer, gp.paired_indices);
- PrintClusteredIndices(file_name, printer, gp.clustered_indices);
- PrintScaffoldingIndices(file_name, printer, gp.scaffolding_indices);
- PrintSingleLongReads(file_name, gp.single_long_reads);
- gp.ginfo.Save(file_name + ".ginfo");
-}
-
-template<class graph_pack, class VertexIt>
-void PrintWithPairedIndex(const string& file_name, const graph_pack& gp,
- VertexIt begin, VertexIt end,
- const PairedInfoIndexT<typename graph_pack::graph_t>& paired_index,
- bool clustered_index = false) {
- ConjugateDataPrinter<typename graph_pack::graph_t> printer(gp.g,
- begin, end);
- PrintWithPairedIndex(file_name, printer, gp, paired_index, clustered_index);
-}
-
-template<class graph_pack, class VertexIt>
-void PrintWithClusteredIndex(const string& file_name, const graph_pack& gp,
- VertexIt begin, VertexIt end,
- const PairedInfoIndexT<typename graph_pack::graph_t>& clustered_index) {
- ConjugateDataPrinter<typename graph_pack::graph_t> printer(gp.g,
- begin, end);
- PrintWithPairedIndex(file_name, printer, gp, clustered_index, true);
-}
-
-template<class graph_pack>
-void PrintWithPairedIndex(const string& file_name, const graph_pack& gp,
- const PairedInfoIndexT<typename graph_pack::graph_t>& paired_index,
- bool clustered_index = false) {
- PrintWithPairedIndex(file_name, gp, gp.g.begin(), gp.g.end(), paired_index,
- clustered_index);
-}
-
-template<class graph_pack, class VertexIt>
-void PrinGraphPack(const string& file_name, const graph_pack& gp,
- VertexIt begin, VertexIt end) {
- ConjugateDataPrinter<typename graph_pack::graph_t> printer(gp.g,
- begin, end);
- PrintGraphPack(file_name, printer, gp);
-}
-
-template<class graph_pack>
-void PrintWithClusteredIndex(const string& file_name, const graph_pack& gp,
- const PairedInfoIndexT<typename graph_pack::graph_t>& clustered_index) {
- PrintWithPairedIndex(file_name, gp, clustered_index, true);
-}
-
-template<class graph_pack>
-void PrintWithPairedIndices(const string& file_name, const graph_pack& gp,
- const PairedInfoIndicesT<typename graph_pack::graph_t>& paired_indices,
- bool clustered_index = false) {
-
- ConjugateDataPrinter<typename graph_pack::graph_t> printer(gp.g, gp.g.begin(), gp.g.end());
-
- PrintWithPairedIndices(file_name, printer, gp, paired_indices, clustered_index);
-}
-
-template<class graph_pack>
-void PrintWithClusteredIndices(const string& file_name, const graph_pack& gp,
- const PairedInfoIndicesT<typename graph_pack::graph_t>& paired_indices) {
- PrintWithPairedIndices(file_name, gp, paired_indices, true);
-}
-
-template<class Graph>
-void ScanBasicGraph(const string& file_name, DataScanner<Graph>& scanner) {
- scanner.LoadGraph(file_name);
- scanner.LoadCoverage(file_name);
-}
-
-template<class graph_pack>
-void ScanGraphPack(const string& file_name,
- DataScanner<typename graph_pack::graph_t>& scanner, graph_pack& gp) {
- ScanBasicGraph(file_name, scanner);
- gp.index.Attach();
- if (LoadEdgeIndex(file_name, gp.index.inner_index())) {
- gp.index.Update();
- } else {
- WARN("Cannot load edge index, kmer coverages will be missed");
- gp.index.Refill();
- }
- // scanner.LoadPaired(file_name + "_et", gp.etalon_paired_index);
- scanner.LoadPositions(file_name, gp.edge_pos);
- //load kmer_mapper only if needed
- if (gp.kmer_mapper.IsAttached())
- if (!LoadKmerMapper(file_name, gp.kmer_mapper)) {
- WARN("Cannot load kmer_mapper, information on projected kmers will be missed");
- }
- if (!scanner.LoadFlankingCoverage(file_name, gp.flanking_cov)) {
- gp.flanking_cov.Fill(gp.index.inner_index());
- }
-}
-
-template<class Graph>
-void ScanPairedIndex(const string& file_name, DataScanner<Graph>& scanner,
- UnclusteredPairedInfoIndexT<Graph>& paired_index,
- bool force_exists = true) {
- scanner.LoadPaired(file_name, paired_index, force_exists);
-}
-
-template<class Graph>
-void ScanClusteredIndex(const string& file_name, DataScanner<Graph>& scanner,
- PairedInfoIndexT<Graph>& clustered_index,
- bool force_exists = true) {
- scanner.LoadPaired(file_name + "_cl", clustered_index, force_exists);
-}
-
-template<class Graph>
-void ScanScaffoldingIndex(const string& file_name, DataScanner<Graph>& scanner,
- PairedInfoIndexT<Graph>& clustered_index,
- bool force_exists = true) {
- scanner.LoadPaired(file_name + "_scf", clustered_index, force_exists);
-}
-
-template<class Graph>
-void ScanPairedIndices(const std::string& file_name, DataScanner<Graph>& scanner,
- UnclusteredPairedInfoIndicesT<Graph>& paired_indices,
- bool force_exists = true) {
- for (size_t i = 0; i < paired_indices.size(); ++i)
- ScanPairedIndex(file_name + "_" + ToString(i), scanner, paired_indices[i], force_exists);
-}
-
-template<class Graph>
-void ScanClusteredIndices(const std::string& file_name, DataScanner<Graph>& scanner,
- PairedInfoIndicesT<Graph>& paired_indices,
- bool force_exists = true) {
- for (size_t i = 0; i < paired_indices.size(); ++i)
- ScanClusteredIndex(file_name + "_" + ToString(i), scanner, paired_indices[i], force_exists);
-}
-
-template<class Graph>
-void ScanScaffoldingIndices(const std::string& file_name, DataScanner<Graph>& scanner,
- PairedInfoIndicesT<Graph>& paired_indices,
- bool force_exists = true) {
- for (size_t i = 0; i < paired_indices.size(); ++i)
- ScanScaffoldingIndex(file_name + "_" + ToString(i), scanner, paired_indices[i], force_exists);
-}
-
-template<class Graph>
-void ScanScaffoldIndices(const string& file_name, DataScanner<Graph>& scanner,
- PairedInfoIndicesT<Graph>& scaffold_indices) {
-
- for (size_t i = 0; i < scaffold_indices.size(); ++i) {
- ScanScaffoldIndex(file_name + "_" + ToString(i), scanner, scaffold_indices[i]);
- }
-}
-
-template<class graph_pack>
-void ScanWithPairedIndex(const string& file_name,
- DataScanner<typename graph_pack::graph_t>& scanner, graph_pack& gp,
- PairedInfoIndexT<typename graph_pack::graph_t>& paired_index,
- bool clustered_index = false) {
- ScanGraphPack(file_name, scanner, gp);
- if (!clustered_index) {
- ScanPairedIndex(file_name, scanner, paired_index);
- } else {
- ScanClusteredIndex(file_name, scanner, paired_index);
- }
-}
-
-template<class graph_pack>
-void ScanWithPairedIndices(const string& file_name,
- DataScanner<typename graph_pack::graph_t>& scanner, graph_pack& gp,
- PairedInfoIndicesT<typename graph_pack::graph_t>& paired_indices,
- bool clustered_index = false) {
-
- ScanGraphPack(file_name, scanner, gp);
- if (!clustered_index) {
- ScanPairedIndices(file_name, scanner, paired_indices);
- } else {
- ScanClusteredIndices(file_name, scanner, paired_indices);
- }
-}
-
-template<class graph_pack>
-void ScanWithPairedIndex(const string& file_name, graph_pack& gp,
- PairedInfoIndexT<typename graph_pack::graph_t>& paired_index,
- bool clustered_index = false) {
- ConjugateDataScanner<typename graph_pack::graph_t> scanner(gp.g);
- ScanWithPairedIndex(file_name, scanner, gp, paired_index, clustered_index);
-}
-
-template<class graph_pack>
-void ScanWithClusteredIndex(const string& file_name, graph_pack& gp,
- PairedInfoIndexT<typename graph_pack::graph_t>& clustered_index) {
- ScanWithPairedIndex(file_name, gp, clustered_index, true);
-}
-
-template<class graph_pack>
-void ScanWithClusteredIndices(const string& file_name,
- DataScanner<typename graph_pack::graph_t>& scanner, graph_pack& gp,
- PairedInfoIndicesT<typename graph_pack::graph_t>& paired_indices) {
- ScanWithPairedIndices(file_name, scanner, gp, paired_indices, true);
-}
-
-template<class graph_pack>
-void ScanWithPairedIndices(const string& file_name, graph_pack& gp,
- PairedInfoIndicesT<typename graph_pack::graph_t>& paired_indices,
- bool clustered_index = false) {
- ConjugateDataScanner<typename graph_pack::graph_t> scanner(gp.g);
- ScanWithPairedIndices(file_name, scanner, gp, paired_indices, clustered_index);
-}
-
-
-template<class graph_pack>
-void ScanWithClusteredIndices(const string& file_name, graph_pack& gp,
- PairedInfoIndicesT<typename graph_pack::graph_t>& paired_indices) {
- ScanWithPairedIndices(file_name, gp, paired_indices, true);
-}
-
-template<class Graph>
-void ScanBasicGraph(const string& file_name, Graph& g) {
- ConjugateDataScanner<Graph> scanner(g);
- ScanBasicGraph<Graph>(file_name, scanner);
-}
-
-template<class Graph>
-void ScanSingleLongReads(const string& file_name, LongReadContainer<Graph>& single_long_reads) {
- for (size_t i = 0; i < single_long_reads.size(); ++i){
- single_long_reads[i].LoadFromFile(MakeSingleReadsFileName(file_name, i), false);
- }
-}
-
-template<class graph_pack>
-void ScanGraphPack(const string& file_name, graph_pack& gp) {
- ConjugateDataScanner<typename graph_pack::graph_t> scanner(gp.g);
- ScanGraphPack(file_name, scanner, gp);
-}
-
-template<class graph_pack>
-void ScanAll(const std::string& file_name, graph_pack& gp,
- bool force_exists = true) {
- ConjugateDataScanner<typename graph_pack::graph_t> scanner(gp.g);
- ScanGraphPack(file_name, scanner, gp);
- ScanPairedIndices(file_name, scanner, gp.paired_indices, force_exists);
- ScanClusteredIndices(file_name, scanner, gp.clustered_indices, force_exists);
- ScanScaffoldingIndices(file_name, scanner, gp.scaffolding_indices, force_exists);
- ScanSingleLongReads(file_name, gp.single_long_reads);
- gp.ginfo.Load(file_name + ".ginfo");
-}
-}
-}
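
The Print*/Scan* helpers above all follow one convention: a single worker takes a boolean clustered_index switch (or appends a suffix such as "_cl"/"_scf"), and thin named wrappers fix that switch so call sites read as intent. A self-contained toy sketch of the same convention (illustrative names only, not SPAdes code):

    // Toy illustration of the wrapper convention used by the Print*/Scan* helpers above.
    #include <iostream>
    #include <string>

    static void ScanIndexImpl(const std::string &file, bool clustered) {
        // The worker decides the actual file name from the switch.
        std::cout << "loading " << (clustered ? file + "_cl" : file) << "\n";
    }

    static void ScanPaired(const std::string &file)    { ScanIndexImpl(file, false); }
    static void ScanClustered(const std::string &file) { ScanIndexImpl(file, true);  }

    int main() {
        ScanPaired("graph_pack_0");      // would read "graph_pack_0"
        ScanClustered("graph_pack_0");   // would read "graph_pack_0_cl"
    }

The same shape keeps the per-library "_0", "_1", ... suffixing in one loop (ScanPairedIndices and friends) rather than at every call site.
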
diff --git a/src/debruijn/indices/edge_index_builders.hpp b/src/debruijn/indices/edge_index_builders.hpp
deleted file mode 100644
index 71fffa0..0000000
--- a/src/debruijn/indices/edge_index_builders.hpp
+++ /dev/null
@@ -1,179 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include "edge_position_index.hpp"
-
-namespace debruijn_graph {
-
-template <class Builder>
-class GraphPositionFillingIndexBuilder : public Builder {
- typedef Builder base;
-public:
- typedef typename Builder::IndexT IndexT;
- typedef typename IndexT::KMer Kmer;
-// typedef typename IndexT::GraphT GraphT;
-
- template<class Graph>
- void BuildIndexFromGraph(IndexT &index,
- const Graph/*T*/ &g, size_t read_buffer_size = 0) const {
- base::BuildIndexFromGraph(index, g, read_buffer_size);
-
- // Now use the index to fill the coverage and EdgeId's
- INFO("Collecting k-mer coverage information from graph, this takes a while.");
- EdgeInfoUpdater<IndexT, Graph> updater(g, index);
- updater.UpdateAll();
- }
-
-};
-
-template<typename> struct Void { typedef void type; };
-
-template<typename T, typename Sfinae = void>
-struct has_contains: std::false_type {};
-
-template<typename T>
-struct has_contains<
- T
- , typename Void<
- //decltype( std::declval<T&>().contains(typename T::KMerIdx(0), typename T::KMer()) )
- decltype( ((T*)(0))->contains(*((typename T::KeyWithHash*)(0))) )
- >::type
->: std::true_type {};
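
has_contains above is the pre-C++17 void_t member-detection idiom: the partial specialization is viable only when T::contains(KeyWithHash) is a well-formed expression, and the ContainsWrap overloads below dispatch on the result with true_type/false_type tags. A self-contained sketch of the same idiom with toy types (all names here are illustrative):

    #include <iostream>
    #include <type_traits>
    #include <utility>

    template<typename> struct VoidDemo { typedef void type; };

    // Primary template: assume no suitable contains() member.
    template<typename T, typename = void>
    struct has_contains_demo : std::false_type {};

    // Selected only when T::contains(const T::Key&) is well-formed.
    template<typename T>
    struct has_contains_demo<T,
        typename VoidDemo<decltype(std::declval<T&>().contains(
            std::declval<const typename T::Key&>()))>::type> : std::true_type {};

    struct WithContains    { typedef int Key; bool contains(const Key &) const { return true; } };
    struct WithoutContains { typedef int Key; };

    int main() {
        std::cout << has_contains_demo<WithContains>::value << " "
                  << has_contains_demo<WithoutContains>::value << "\n";   // prints "1 0"
    }
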
-
-template <class Builder>
-class CoverageFillingEdgeIndexBuilder : public Builder {
- typedef Builder base;
- public:
- typedef typename Builder::IndexT IndexT;
- typedef typename IndexT::KMer Kmer;
- typedef typename IndexT::KMerIdx KmerIdx;
- typedef typename IndexT::KeyWithHash KeyWithHash;
-
- private:
-
-
- bool ContainsWrap(bool check_contains, IndexT& index, const KeyWithHash &kwh, std::true_type) const {
- return !check_contains || index.contains(kwh);
- }
-
- bool ContainsWrap(bool /*check_contains*/, IndexT&/* index*/, const KeyWithHash &/*kwh*/, std::false_type) const {
- VERIFY(false);
-// VERIFY(!check_contains);
- return true;
- }
-
- template<class ReadStream>
- size_t FillCoverageFromStream(ReadStream &stream,
- IndexT &index, bool check_contains) const {
- unsigned k = index.k();
- size_t rl = 0;
-
- while (!stream.eof()) {
- typename ReadStream::ReadT r;
- stream >> r;
- rl = std::max(rl, r.size());
-
- const Sequence &seq = r.sequence();
- if (seq.size() < k)
- continue;
-
- KeyWithHash kwh = index.ConstructKWH(seq.start<Kmer>(k) >> 'A');
- for (size_t j = k - 1; j < seq.size(); ++j) {
- kwh <<= seq[j];
-                //contains is not used since the index might still be empty here
- if (kwh.is_minimal() && index.valid(kwh) && ContainsWrap(check_contains, index, kwh, has_contains<IndexT>())) {
-# pragma omp atomic
- index.get_raw_value_reference(kwh).count += 1;
- }
- }
- }
-
- return rl;
- }
-
- public:
-
- template<class Streams>
- size_t ParallelFillCoverage(IndexT &index,
- Streams &streams,
- bool check_contains = true) const {
- INFO("Collecting k-mer coverage information from reads, this takes a while.");
- unsigned nthreads = (unsigned) streams.size();
- size_t rl = 0;
- streams.reset();
-#pragma omp parallel for num_threads(nthreads) shared(rl)
- for (size_t i = 0; i < nthreads; ++i) {
- size_t crl = FillCoverageFromStream(streams[i], index, check_contains);
-
- // There is no max reduction in C/C++ OpenMP... Only in FORTRAN :(
-#pragma omp flush(rl)
- if (crl > rl)
-#pragma omp critical
- {
- rl = std::max(rl, crl);
- }
- }
-
- // Contigs have zero coverage!
-#if 0
- if (contigs_stream) {
- contigs_stream->reset();
- FillCoverageFromStream(*contigs_stream, index, check_contains);
- }
-#endif
-
-//todo if this verify is needed, put it outside
-//#ifndef NDEBUG
-// for (auto idx = index.kmer_idx_begin(), eidx = index.kmer_idx_end();
-// idx != eidx; ++idx) {
-//
-// Kmer k = index.kmer(idx);
-//
-// VERIFY(index[k].count == index[!k].count);
-// }
-//#endif
-
- return rl;
- }
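
ParallelFillCoverage merges the per-thread maximum read lengths by hand: a flush, an unguarded comparison, then the real maximum inside a critical section. (OpenMP 3.1 did add reduction(max:...) for C/C++, so the comment above concerns older compilers; the manual pattern is still correct, because rl only grows, so a stale read can at worst trigger a redundant critical section.) A self-contained sketch of that pattern:

    #include <algorithm>
    #include <cstddef>
    #include <cstdio>

    int main() {
        std::size_t global_max = 0;
    #pragma omp parallel for shared(global_max)
        for (int i = 0; i < 100; ++i) {
            std::size_t local = std::size_t(i * 37 % 101);   // stand-in for a per-chunk result
    #pragma omp flush(global_max)
            if (local > global_max)                          // cheap unguarded check first
    #pragma omp critical
            {
                global_max = std::max(global_max, local);    // the real update, serialized
            }
        }
        std::printf("max = %zu\n", global_max);
    }
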
-
- template<class Streams>
- size_t BuildIndexFromStream(IndexT &index,
- Streams &streams,
- io::SingleStream* contigs_stream = 0) const {
- base::BuildIndexFromStream(index, streams, contigs_stream);
-
- return ParallelFillCoverage(index, streams, false);
- }
-
-// template<class Streams>
-// size_t BuildIndexWithCoverageFromGraph(
-// GraphT &graph, IndexT &index,
-// Streams &streams,
-// SingleReadStream* contigs_stream = 0) const {
-// this->BuildIndexFromGraph(index, graph);
-//
-// return ParallelFillCoverage(index, streams, contigs_stream, true);
-// }
-};
-
-template<class Index>
-struct EdgeIndexHelper {
- typedef Index IndexT;
- typedef typename IndexT::KMer Kmer;
- typedef typename IndexT::KMerIdx KMerIdx;
- typedef typename IndexT::traits_t traits_t;
-// typedef typename IndexT::IdType IdType;
- typedef DeBruijnStreamKMerIndexBuilder<Kmer, IndexT> DeBruijnStreamKMerIndexBuilderT;
- typedef CoverageFillingEdgeIndexBuilder<DeBruijnStreamKMerIndexBuilderT> CoverageFillingEdgeIndexBuilderT;
- typedef DeBruijnGraphKMerIndexBuilder<IndexT> DeBruijnGraphKMerIndexBuilderT;
- typedef GraphPositionFillingIndexBuilder<DeBruijnGraphKMerIndexBuilderT> GraphPositionFillingIndexBuilderT;
- typedef CoverageFillingEdgeIndexBuilder<GraphPositionFillingIndexBuilderT> CoverageAndGraphPositionFillingIndexBuilderT;
-};
-
-}
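
EdgeIndexHelper assembles its builders by stacking class templates: each layer takes the builder it extends as a template parameter, forwards to it, and adds one concern (coverage filling, position filling). A self-contained toy of that stacking pattern (the toy index and layer names are not the real classes):

    #include <iostream>

    struct ToyIndex { int kmers = 0; int coverage = 0; };

    // Bottom layer: builds the bare index.
    struct BaseBuilder {
        typedef ToyIndex IndexT;
        void Build(IndexT &index) const { index.kmers = 42; }
    };

    // Decorating layer: reuses whatever Builder it is stacked on, then adds coverage.
    template<class Builder>
    struct CoverageFillingBuilder : Builder {
        typedef typename Builder::IndexT IndexT;
        void Build(IndexT &index) const {
            Builder::Build(index);
            index.coverage = 7;
        }
    };

    // The final builder is assembled by a typedef, as in EdgeIndexHelper.
    typedef CoverageFillingBuilder<BaseBuilder> FullBuilderT;

    int main() {
        ToyIndex idx;
        FullBuilderT().Build(idx);
        std::cout << idx.kmers << " " << idx.coverage << "\n";   // prints "42 7"
    }
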
diff --git a/src/debruijn/indices/edge_info_updater.hpp b/src/debruijn/indices/edge_info_updater.hpp
deleted file mode 100644
index 011befe..0000000
--- a/src/debruijn/indices/edge_info_updater.hpp
+++ /dev/null
@@ -1,96 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include "standard.hpp"
-
-namespace debruijn_graph {
-
-template<typename Index, typename Graph>
-class EdgeInfoUpdater {
- typedef typename Index::KMer Kmer;
- typedef typename Graph::EdgeId EdgeId;
- typedef typename Index::KeyWithHash KeyWithHash;
- typedef typename Index::Value EdgeInfo;
-
- const Graph &g_;
- Index &index_;
-
-
- void PutInIndex(const KeyWithHash &kwh, EdgeId id, size_t offset) {
- if (index_.valid(kwh)) {
- auto &entry = index_.get_raw_value_reference(kwh);
- if (!entry.valid() || index_.contains(kwh)) {
- index_.put_value(kwh, EdgeInfo(id, (unsigned)offset, entry.count));
- }
- }
- }
-
- //todo why do we need to check equality???!!!
- bool DeleteIfEqual(const KeyWithHash &kwh, EdgeId e) {
- if (!index_.contains(kwh))
- return false;
- if (index_.get_value(kwh).edge_id == e) {
- index_.get_raw_value_reference(kwh).invalidate();
- return true;
- }
- return false;
- }
-
- void UpdateKMers(const Sequence &nucls, EdgeId e) {
- VERIFY(nucls.size() >= index_.k());
- KeyWithHash kwh = index_.ConstructKWH(Kmer(index_.k(), nucls));
- index_.PutInIndex(kwh, e, 0);
- for (size_t i = index_.k(), n = nucls.size(); i < n; ++i) {
- kwh <<= nucls[i];
- index_.PutInIndex(kwh, e, i - index_.k() + 1);
- }
- }
-
- void DeleteKMers(const Sequence &nucls, EdgeId e) {
- VERIFY(nucls.size() >= index_.k());
- KeyWithHash kwh = index_.ConstructKWH(Kmer(index_.k(), nucls));
- DeleteIfEqual(kwh, e);
- for (size_t i = index_.k(), n = nucls.size(); i < n; ++i) {
- kwh <<= nucls[i];
- DeleteIfEqual(kwh, e);
- }
- }
-
- public:
- /**
- * Creates DataHashRenewer for specified graph and index
- * @param g graph to be indexed
- * @param index index to be synchronized with graph
- */
- EdgeInfoUpdater(const Graph& g, Index& index)
- : g_(g),
- index_(index) {
- }
-
- void UpdateKmers(EdgeId e) {
- Sequence nucls = g_.EdgeNucls(e);
- UpdateKMers(nucls, e);
- }
-
- void DeleteKmers(EdgeId e) {
- Sequence nucls = g_.EdgeNucls(e);
- DeleteKMers(nucls, e);
- }
-
- void UpdateAll() {
- for (auto it = g_.ConstEdgeBegin(); !it.IsEnd(); ++it) {
- UpdateKmers(*it);
- }
- }
-
- private:
- DECL_LOGGER("EdgeInfoUpdater")
-};
-
-}
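
UpdateKMers and DeleteKMers walk an edge with a rolling window: one k-mer is seeded from the first k symbols and each following symbol is shifted in, so every k-mer of the edge and its offset are visited exactly once. The same traversal with plain strings standing in for Sequence/KeyWithHash (illustration only):

    #include <cstddef>
    #include <iostream>
    #include <string>

    int main() {
        const std::string nucls = "ACGTACGA";
        const std::size_t k = 4;
        std::string kmer = nucls.substr(0, k);               // seed: first k symbols, offset 0
        std::cout << 0 << " " << kmer << "\n";
        for (std::size_t i = k; i < nucls.size(); ++i) {
            kmer = kmer.substr(1) + nucls[i];                // the string analogue of kwh <<= nucls[i]
            std::cout << i - k + 1 << " " << kmer << "\n";   // the offset passed to PutInIndex
        }
    }
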
diff --git a/src/debruijn/indices/edge_multi_index.hpp b/src/debruijn/indices/edge_multi_index.hpp
deleted file mode 100644
index e33c919..0000000
--- a/src/debruijn/indices/edge_multi_index.hpp
+++ /dev/null
@@ -1,152 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-/*
- * edge_multi_index.hpp
- *
- * Created on: May 24, 2013
- * Author: anton
- */
-#include "perfect_hash_map.hpp"
-#include "edge_info_updater.hpp"
-#include "kmer_splitters.hpp"
-#include "edge_position_index.hpp"
-
-namespace debruijn_graph {
-
-template<class IdType>
-class EdgeInfoStorage {
-public:
- typedef vector<EdgeInfo<IdType>> Content;
- typedef typename Content::iterator iterator;
- typedef typename Content::const_iterator const_iterator;
- Content content_;
-
- EdgeInfoStorage(const Content &content) : content_(content) {
- }
-
- EdgeInfoStorage() {
- }
-
- EdgeInfo<IdType> &operator[](size_t i) {
- return content_[i];
- }
-
- iterator begin() {
- return content_.begin();
- }
-
- iterator end() {
- return content_.end();
- }
-
- const_iterator begin() const {
- return content_.cbegin();
- }
-
- const_iterator end() const {
- return content_.cend();
- }
-
- iterator find(const EdgeInfo<IdType> &info) {
- return content_.find(info);
- }
-
- const_iterator find(const EdgeInfo<IdType> &info) const {
- return content_.find(info);
- }
-
- void push_back(const EdgeInfo<IdType> &info) {
- content_.push_back(info);
- }
-
- size_t size() const{
- return content_.size();
- }
-
- bool valid() const {
- //what's invalid edge info storage?
- return true;
- }
-
- EdgeInfoStorage conjugate(size_t k) const {
- EdgeInfoStorage result;
- for(auto it = content_.rbegin(); it != content_.rend(); ++it) {
- result.push_back(it->conjugate(k));
- }
- return result;
- }
-};
-
-//todo it is not handling graph events!!!
-template<class IdType, class Seq = runtime_k::RtSeq,
- class traits = kmer_index_traits<Seq>, class StoringType = SimpleStoring >
-class DeBruijnEdgeMultiIndex : public KeyStoringMap<Seq, EdgeInfoStorage<IdType>, traits, StoringType > {
- typedef KeyStoringMap<Seq, EdgeInfoStorage<IdType>, traits, StoringType > base;
- public:
- typedef StoringType storing_type;
- typedef typename base::traits_t traits_t;
- typedef typename base::KMer KMer;
- typedef typename base::KMerIdx KMerIdx;
- typedef typename base::KeyWithHash KeyWithHash;
- typedef EdgeInfoStorage<IdType> Value;
-
- using base::ConstructKWH;
-// typedef typename base::IdType IdType;
- //todo move this typedef up in hierarchy (need some c++ tricks)
-
- DeBruijnEdgeMultiIndex(unsigned k, const std::string &workdir)
- : base(k, workdir) {
- INFO("DeBruijnEdgeMultiIndex constructing");
- }
-
- ~DeBruijnEdgeMultiIndex() {}
-
-
- Value get(const KeyWithHash &kwh) const {
- VERIFY(contains(kwh));
- return base::get_value(kwh);
- }
-
- bool contains(const KeyWithHash &kwh) const {
- if (!base::valid(kwh))
- return false;
- return this->get_raw_value_reference(kwh).valid();
- }
-
- bool valid(const KMer &kmer) const {
- KeyWithHash kwh = base::ConstructKWH(kmer);
- return base::valid(kwh);
- }
-
- void PutInIndex(const KeyWithHash &kwh, IdType id, size_t offset) {
- //KeyWithHash kwh = base::ConstructKWH(kmer);
- if (contains(kwh)) {
- EdgeInfoStorage<IdType> &entry = this->get_raw_value_reference(kwh);
- EdgeInfo<IdType> new_entry;
- new_entry.edge_id = id;
- new_entry.offset = (unsigned int) offset;
- entry.push_back(new_entry);
- }
- }
-
- const EdgeInfoStorage<IdType> get(const KMer& kmer) const {
-// VERIFY(this->IsAttached());
- auto kwh = base::ConstructKWH(kmer);
- auto entry = this->get_value(kwh);
- return entry;
- }
- //todo delete if equal seems to work improperly!!!
- bool DeleteIfEqual(const KeyWithHash &, IdType) {
- VERIFY(false);
- return false;
- }
-
-};
-
-}
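
DeBruijnEdgeMultiIndex differs from the single-entry edge indices in that one k-mer may map to several (edge, offset) records (repeats), and the storage for the reverse-complement k-mer is the same list conjugated and traversed in reverse order, as EdgeInfoStorage::conjugate does. A simplified, self-contained sketch of that bookkeeping (the offset arithmetic offset' = (edge_len - k) - offset and the negated edge id are simplifications, not the real EdgeInfo::conjugate):

    #include <iostream>
    #include <vector>

    struct Occ { int edge_id; unsigned offset; };   // one (edge, offset) record

    std::vector<Occ> Conjugate(const std::vector<Occ> &occ, unsigned edge_len, unsigned k) {
        std::vector<Occ> res;
        // Reverse the list and conjugate each record.
        for (auto it = occ.rbegin(); it != occ.rend(); ++it)
            res.push_back(Occ{-it->edge_id, edge_len - k - it->offset});
        return res;
    }

    int main() {
        std::vector<Occ> occ = {Occ{1, 0}, Occ{2, 3}};   // a repeated k-mer seen on two edges
        for (const Occ &o : Conjugate(occ, /*edge_len=*/10, /*k=*/4))
            std::cout << o.edge_id << " @ " << o.offset << "\n";   // -2 @ 3, then -1 @ 6
    }
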
diff --git a/src/debruijn/indices/edge_position_index.hpp b/src/debruijn/indices/edge_position_index.hpp
deleted file mode 100644
index 18e9ab2..0000000
--- a/src/debruijn/indices/edge_position_index.hpp
+++ /dev/null
@@ -1,191 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-/*
- * edge_index.hpp
- *
- * Created on: May 24, 2013
- * Author: anton
- */
-
-#include "perfect_hash_map.hpp"
-#include "edge_info_updater.hpp"
-#include "kmer_splitters.hpp"
-
-namespace debruijn_graph {
-
-template<class IdType>
-struct EdgeInfo {
- IdType edge_id;
- unsigned offset;
- unsigned count;
-
- EdgeInfo(IdType edge_id_ = IdType(), unsigned offset_ = -1u, unsigned count_ = 0) :
- edge_id(edge_id_), offset(offset_), count(count_) { }
-
- template<class KWH>
- EdgeInfo conjugate(const KWH &kwh) const {
- return conjugate(kwh.key().size());
- }
-
- EdgeInfo conjugate(size_t k) const {
- if(!valid()) {
- return EdgeInfo(IdType(0), unsigned(-1), count);
- } else {
- return EdgeInfo(edge_id->conjugate(), (unsigned)edge_id->length(k) - offset, count);
- }
- }
-
- void invalidate() {
- offset = unsigned(-1);
- }
-
- bool valid() const {
- return offset != unsigned(-1);
- }
-};
-
-template<class stream, class IdType>
-stream &operator<<(stream &s, const EdgeInfo<IdType> &info) {
- return s << "EdgeInfo[" << info.edge_id << ", " << info.offset << ", " << info.count << "]";
-}
-
-template<class Graph, class Seq = runtime_k::RtSeq, class traits = kmer_index_traits<Seq>, class StoringType = DefaultStoring>
-class KmerFreeEdgeIndex : public KeyIteratingMap<Seq, EdgeInfo<typename Graph::EdgeId>, traits, StoringType> {
- typedef KeyIteratingMap<Seq, EdgeInfo<typename Graph::EdgeId>, traits, StoringType> base;
- const Graph &graph_;
-
-public:
- typedef typename base::traits_t traits_t;
- typedef StoringType storing_type;
- typedef typename base::KMer KMer;
- typedef typename base::KMerIdx KMerIdx;
- typedef Graph GraphT;
- typedef typename Graph::EdgeId IdType;
- typedef typename base::KeyWithHash KeyWithHash;
- typedef EdgeInfo<typename Graph::EdgeId> Value;
- using base::valid;
- using base::ConstructKWH;
-
-public:
-
- KmerFreeEdgeIndex(const Graph &graph, const std::string &workdir)
- : base(unsigned(graph.k() + 1), workdir), graph_(graph) {}
-
- /**
- * Shows if kmer has some entry associated with it
- */
- bool contains(const KeyWithHash &kwh) const {
- // Sanity check
- if (!valid(kwh)) {
- return false;
- }
-
- Value entry = base::get_value(kwh);
-
- if (entry.offset == -1u) {
- return false;
- }
-
- return kwh.key() == KMer(this->k(), graph_.EdgeNucls(entry.edge_id), entry.offset);
- }
-
- void PutInIndex(KeyWithHash &kwh, IdType id, size_t offset) {
- if (valid(kwh)) {
- auto &entry = this->get_raw_value_reference(kwh);
- if (!entry.valid() || contains(kwh)) {
- this->put_value(kwh, Value(id, (unsigned)offset, entry.count));
- }
- }
- }
-
- //Only coverage is loaded
- template<class Writer>
- void BinWrite(Writer &writer) const {
- this->index_.serialize(writer);
- size_t sz = this->data_.size();
- writer.write((char*)&sz, sizeof(sz));
- for (size_t i = 0; i < sz; ++i)
- writer.write((char*)&(this->data_[i].count), sizeof(this->data_[0].count));
- }
-
- template<class Reader>
- void BinRead(Reader &reader, const std::string/* &FileName*/) {
- this->clear();
- this->index_.deserialize(reader);
- size_t sz = 0;
- reader.read((char*)&sz, sizeof(sz));
- this->data_.resize(sz);
- for (size_t i = 0; i < sz; ++i)
- reader.read((char*)&(this->data_[i].count), sizeof(this->data_[0].count));
- }
-};
-
-template<class Graph, class Seq = runtime_k::RtSeq, class traits = kmer_index_traits<Seq>, class StoringType = DefaultStoring>
-class KmerStoringEdgeIndex : public KeyStoringMap<Seq, EdgeInfo<typename Graph::EdgeId>, traits, StoringType> {
- typedef KeyStoringMap<Seq, EdgeInfo<typename Graph::EdgeId>, traits, StoringType> base;
-
-public:
- typedef typename base::traits_t traits_t;
- typedef StoringType storing_type;
- typedef typename base::KMer KMer;
- typedef typename base::KMerIdx KMerIdx;
- typedef Graph GraphT;
- typedef typename Graph::EdgeId IdType;
- typedef typename base::KeyWithHash KeyWithHash;
- typedef EdgeInfo<typename Graph::EdgeId> Value;
- using base::valid;
- using base::ConstructKWH;
-
-
- KmerStoringEdgeIndex(const Graph& g, const std::string &workdir)
- : base(unsigned(g.k() + 1), workdir) {}
-
- ~KmerStoringEdgeIndex() {}
-
- /**
- * Shows if kmer has some entry associated with it
- */
- bool contains(const KeyWithHash &kwh) const {
- if (!base::valid(kwh))
- return false;
- return this->get_raw_value_reference(kwh).valid();
- }
-
- template<class Writer>
- void BinWrite(Writer &writer) const {
- this->index_.serialize(writer);
- size_t sz = this->data_.size();
- writer.write((char*)&sz, sizeof(sz));
- for (size_t i = 0; i < sz; ++i)
- writer.write((char*)&(this->data_[i].count), sizeof(this->data_[0].count));
- this->BinWriteKmers(writer);
- }
-
- template<class Reader>
- void BinRead(Reader &reader, const std::string &FileName) {
- this->clear();
- this->index_.deserialize(reader);
- size_t sz = 0;
- reader.read((char*)&sz, sizeof(sz));
- this->data_.resize(sz);
- for (size_t i = 0; i < sz; ++i)
- reader.read((char*)&(this->data_[i].count), sizeof(this->data_[0].count));
- this->BinReadKmers(reader, FileName);
- }
- void PutInIndex(KeyWithHash &kwh, IdType id, size_t offset) {
- if (valid(kwh)) {
- auto &entry = this->get_raw_value_reference(kwh);
- if (!entry.valid() || contains(kwh)) {
- this->put_value(kwh, Value(id, (unsigned)offset, entry.count));
- }
- }
- }
-};
-
-}
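
KmerFreeEdgeIndex saves memory by not storing the k-mers at all: an entry keeps only (edge_id, offset, count), and a lookup is confirmed by re-reading the k-mer from that edge's nucleotide sequence at that offset and comparing it with the query, which is what contains() does above. The same check with strings standing in for Sequence/KMer:

    #include <cstddef>
    #include <iostream>
    #include <string>
    #include <vector>

    struct Entry { std::size_t edge; unsigned offset; };

    bool Contains(const std::string &query, const Entry &e,
                  const std::vector<std::string> &edge_nucls) {
        const std::string &nucls = edge_nucls[e.edge];
        if (e.offset + query.size() > nucls.size()) return false;
        // kwh.key() == KMer(k, EdgeNucls(edge_id), offset) in the real code
        return nucls.compare(e.offset, query.size(), query) == 0;
    }

    int main() {
        std::vector<std::string> edges = {"ACGTACGT"};
        Entry hit{0, 2};
        std::cout << Contains("GTAC", hit, edges) << " "    // 1: edge 0 at offset 2 reads GTAC
                  << Contains("GGGG", hit, edges) << "\n";  // 0: a colliding key is rejected
    }
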
diff --git a/src/debruijn/indices/kmer_extension_index.hpp b/src/debruijn/indices/kmer_extension_index.hpp
deleted file mode 100644
index 457b506..0000000
--- a/src/debruijn/indices/kmer_extension_index.hpp
+++ /dev/null
@@ -1,413 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-/*
- * kmer_extension_index.hpp
- *
- * Created on: May 24, 2013
- * Author: anton
- */
-#include "perfect_hash_map.hpp"
-#include "kmer_splitters.hpp"
-#include "simple_tools.hpp"
-#include "storing_traits.hpp"
-#include <bitset>
-
-namespace debruijn_graph {
-
-inline uint8_t invert_byte_slow(uint8_t a) {
- size_t res = 0;
- for(size_t i = 0; i < 8; i++) {
- res <<= 1;
- res += a & 1;
- a = uint8_t(a >> 1);
- }
- return uint8_t(res);
-}
-
-inline vector<uint8_t> count_invert_byte() {
- vector<uint8_t> result;
- for(size_t a = 0; a < 256; a++) {
- result.push_back(invert_byte_slow((uint8_t)a));
- }
- return result;
-}
-
-inline uint8_t invert_byte(uint8_t a) {
- static vector<uint8_t> precalc = count_invert_byte();
- return precalc[a];
-}
-
-class InOutMask {
-private:
- uint8_t mask_;
-
- bool CheckUnique(uint8_t mask) const {
- static bool unique[] =
- { 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0 };
- return unique[mask];
- }
-
- char GetUnique(uint8_t mask) const {
- static char next[] = { -1, 0, 1, -1, 2, -1, -1, -1, 3, -1, -1, -1, -1,
- -1, -1, -1 };
-        VERIFY(next[mask] != -1);
- return next[mask];
- }
-
- size_t Count(uint8_t mask) const {
- static char count[] = { 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4 };
- return count[mask];
- }
-
-
- char inv_position(char nucl, bool as_is) const {
- if(as_is)
- return nucl;
- else
- return char(7 - nucl);
- }
-
-public:
- explicit InOutMask(uint8_t mask = 0) : mask_(mask){
- }
-
- uint8_t get_mask() const {
- return mask_;
- }
-
- template<class Key>
- InOutMask conjugate(const Key & /*k*/) const {
- return InOutMask(invert_byte(mask_));
- }
-
- void AddOutgoing(char nnucl, bool as_is) {
- unsigned nmask = (unsigned) (1 << inv_position(nnucl, as_is));
- if (!(mask_ & nmask)) {
-# pragma omp atomic
- mask_ |= (unsigned char) nmask;
- }
- }
-
- void AddIncoming(char pnucl, bool as_is) {
- unsigned pmask = (unsigned) (1 << inv_position(char(pnucl + 4), as_is));
- if (!(mask_ & pmask)) {
-# pragma omp atomic
- mask_|= (unsigned char) pmask;
- }
- }
-
- void DeleteOutgoing(char nnucl, bool as_is) {
- unsigned nmask = (1 << inv_position(nnucl, as_is));
- if (mask_ & nmask) {
-# pragma omp atomic
- mask_ &= (unsigned char) ~nmask;
- }
- }
-
- void DeleteIncoming(char pnucl, bool as_is) {
- unsigned pmask = (1 << inv_position(char(pnucl + 4), as_is));
- if (mask_ & pmask) {
-# pragma omp atomic
- mask_ &= (unsigned char) ~pmask;
- }
- }
-
- void IsolateVertex() {
- mask_ = 0;
- }
-
- bool CheckOutgoing(char nucl) const {
- return mask_ & (1 << nucl);
- }
-
- bool CheckIncoming(char nucl) const {
- return mask_ & (1 << (4 + nucl));
- }
-
- bool IsDeadEnd() const {
- return !(mask_ & 15);
- }
-
- bool IsDeadStart() const {
- return !(mask_ >> 4);
- }
-
- bool CheckUniqueOutgoing() const {
- return CheckUnique(mask_ & 15);
- }
-
- bool CheckUniqueIncoming() const {
- return CheckUnique(uint8_t(mask_ >> 4));
- }
-
- char GetUniqueOutgoing() const {
- return GetUnique(mask_ & 15);
- }
-
- char GetUniqueIncoming() const {
- return GetUnique(uint8_t(mask_ >> 4));
- }
-
- size_t OutgoingEdgeCount() const {
- return Count(mask_ & 15);
- }
-
- size_t IncomingEdgeCount() const {
- return Count(uint8_t(mask_ >> 4));
- }
-};
-
-template<class Stream>
-Stream &operator<<(Stream& stream, const InOutMask &mask) {
- return stream << std::bitset<8>(mask.get_mask());
-}
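
InOutMask packs the whole neighbourhood of a k-mer into one byte: bit n (n = 0..3) marks an outgoing extension by nucleotide n, bit 4+n an incoming one. With the usual A=0, C=1, G=2, T=3 coding, reversing the bits of the byte turns "outgoing by X" into "incoming by complement(X)" (since complement(n) = 3-n), which is why conjugate() is just invert_byte. A self-contained sketch:

    #include <bitset>
    #include <cstdint>
    #include <iostream>

    std::uint8_t reverse_bits(std::uint8_t a) {        // same idea as invert_byte_slow above
        std::uint8_t r = 0;
        for (int i = 0; i < 8; ++i) { r = std::uint8_t(r << 1 | (a & 1)); a = std::uint8_t(a >> 1); }
        return r;
    }

    int main() {
        std::uint8_t mask = 0;
        mask |= std::uint8_t(1 << 1);        // outgoing 'C' (nucleotide code 1)
        mask |= std::uint8_t(1 << (4 + 3));  // incoming 'T' (nucleotide code 3)
        std::cout << std::bitset<8>(mask) << "\n"                  // 10000010
                  << std::bitset<8>(reverse_bits(mask)) << "\n";   // 01000001: outgoing 'A', incoming 'G'
    }
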
-
-template<class Seq>
-struct slim_kmer_index_traits : public kmer_index_traits<Seq> {
- typedef kmer_index_traits<Seq> __super;
-
- typedef MMappedRecordReader<typename Seq::DataType> FinalKMerStorage;
-
- template<class Writer>
- static void raw_serialize(Writer&, typename __super::RawKMerStorage*) {
- VERIFY(false && "Cannot save extension index");
- }
-
- template<class Reader>
- static typename __super::RawKMerStorage *raw_deserialize(
- Reader&, const std::string &) {
- VERIFY(false && "Cannot load extension index");
- return NULL;
- }
-
-};
-
-template<typename KeyWithHash>
-struct AbstractDeEdge {
- KeyWithHash start;
- KeyWithHash end;
- AbstractDeEdge(KeyWithHash _start, KeyWithHash _end) : start(_start), end(_end) {
- }
-
- AbstractDeEdge<KeyWithHash> &operator=(const AbstractDeEdge<KeyWithHash> &that) {
- this->start = that.start;
- this->end = that.end;
- return *this;
- }
-
- bool operator==(const AbstractDeEdge &other) {
- return start.idx() == other.start.idx() && end.idx() == other.end.idx();
- }
-
- bool operator!=(const AbstractDeEdge &other) {
- return !(*this == other);
- }
-};
-
-template<class stream, class KWH>
-stream &operator<<(stream &s, const AbstractDeEdge<KWH> de_edge) {
- return s << "DeEdge[" << de_edge.start << ", " << de_edge.end << "]";
-}
-
-template<class traits = slim_kmer_index_traits<runtime_k::RtSeq>, class StoringType = DefaultStoring>
-class DeBruijnExtensionIndex : public KeyIteratingMap<typename traits::SeqType, InOutMask, traits, StoringType> {
- typedef KeyIteratingMap<typename traits::SeqType, InOutMask, traits, StoringType> base;
-
-public:
- typedef typename base::traits_t traits_t;
- typedef StoringType storing_type;
- typedef typename base::KeyType KMer;
- typedef typename base::IdxType KMerIdx;
- typedef typename base::KeyWithHash KeyWithHash;
- typedef AbstractDeEdge<KeyWithHash> DeEdge;
- using base::ConstructKWH;
-
- DeBruijnExtensionIndex(unsigned K, const std::string &workdir)
- : base((size_t) K, workdir) {
- }
-
- void AddOutgoing(const KeyWithHash &kwh, char nucl) {
- TRACE("Add outgoing " << kwh << " " << size_t(nucl) << " " << kwh.is_minimal());
- this->get_raw_value_reference(kwh).AddOutgoing(nucl, kwh.is_minimal());
- }
-
- void AddIncoming(const KeyWithHash &kwh, char nucl) {
- TRACE("Add incoming " << kwh << " " << size_t(nucl) << " " << kwh.is_minimal());
- this->get_raw_value_reference(kwh).AddIncoming(nucl, kwh.is_minimal());
- }
-
- void DeleteOutgoing(const KeyWithHash &kwh, char nucl) {
- TRACE("Delete outgoing " << kwh << " " << size_t(nucl) << " " << kwh.is_minimal());
- this->get_raw_value_reference(kwh).DeleteOutgoing(nucl, kwh.is_minimal());
- }
-
- void DeleteIncoming(const KeyWithHash &kwh, char nucl) {
- TRACE("Delete incoming " << kwh << " " << size_t(nucl) << " " << kwh.is_minimal());
- this->get_raw_value_reference(kwh).DeleteIncoming(nucl, kwh.is_minimal());
- }
-
- void IsolateVertex(const KeyWithHash &kwh) {
- TRACE("Isolate vertex " << kwh);
- this->get_raw_value_reference(kwh).IsolateVertex();
- }
-
- bool CheckOutgoing(const KeyWithHash &kwh, char nucl) const {
- return this->get_value(kwh).CheckOutgoing(nucl);
- }
-
- KeyWithHash GetOutgoing(const KeyWithHash &kwh, char nucl) const {
- return kwh << nucl;
- }
-
- bool CheckIncoming(const KeyWithHash &kwh, char nucl) const {
- return this->get_value(kwh).CheckIncoming(nucl);
- }
-
- KeyWithHash GetIncoming(const KeyWithHash &kwh, char nucl) const {
- return kwh >> nucl;
- }
-
- bool IsDeadEnd(const KeyWithHash &kwh) const {
- return this->get_value(kwh).IsDeadEnd();
- }
-
- bool IsDeadStart(const KeyWithHash &kwh) const {
- return this->get_value(kwh).IsDeadStart();
- }
-
- bool CheckUniqueOutgoing(const KeyWithHash &kwh) const {
- return this->get_value(kwh).CheckUniqueOutgoing();
- }
-
- KeyWithHash GetUniqueOutgoing(const KeyWithHash &kwh) const {
- return GetOutgoing(kwh, this->get_value(kwh).GetUniqueOutgoing());
- }
-
- bool CheckUniqueIncoming(const KeyWithHash &kwh) const {
- return this->get_value(kwh).CheckUniqueIncoming();
- }
-
- KeyWithHash GetUniqueIncoming(const KeyWithHash &kwh) const {
- return GetIncoming(kwh, this->get_value(kwh).GetUniqueIncoming());
- }
-
- size_t OutgoingEdgeCount(const KeyWithHash &kwh) const {
- return this->get_value(kwh).OutgoingEdgeCount();
- }
-
- size_t IncomingEdgeCount(const KeyWithHash &kwh) const {
- return this->get_value(kwh).IncomingEdgeCount();
- }
-
- ~DeBruijnExtensionIndex() {
- }
-
-private:
- DECL_LOGGER("ExtentionIndex");
-};
-
-template<class Builder>
-class DeBruijnExtensionIndexBuilder : public Builder {
- typedef Builder base;
-public:
- typedef typename Builder::IndexT IndexT;
-
- template<class ReadStream>
- size_t FillExtensionsFromStream(ReadStream &stream, IndexT &index) const {
- unsigned k = index.k();
- size_t rl = 0;
-
- while (!stream.eof()) {
- typename ReadStream::read_type r;
- stream >> r;
- rl = std::max(rl, r.size());
-
- const Sequence &seq = r.sequence();
- if (seq.size() < k + 1)
- continue;
-
- typename IndexT::KeyWithHash kwh = index.ConstructKWH(seq.start<runtime_k::RtSeq>(k));
- for (size_t j = k; j < seq.size(); ++j) {
- char nnucl = seq[j], pnucl = kwh[0];
- index.AddOutgoing(kwh, nnucl);
- kwh <<= nnucl;
- index.AddIncoming(kwh, pnucl);
- }
- }
-
- return rl;
- }
-
- void FillExtensionsFromIndex(const std::string &KPlusOneMersFilename,
- IndexT &index) const {
- unsigned KPlusOne = index.k() + 1;
-
- typename IndexT::kmer_iterator it(
- KPlusOneMersFilename, runtime_k::RtSeq::GetDataSize(KPlusOne));
- for (; it.good(); ++it) {
- runtime_k::RtSeq kpomer(KPlusOne, *it);
-
- char pnucl = kpomer[0], nnucl = kpomer[KPlusOne - 1];
- TRACE("processing k+1-mer " << kpomer);
- index.AddOutgoing(index.ConstructKWH(runtime_k::RtSeq(KPlusOne - 1, kpomer)),
- nnucl);
-            // FIXME: This is extremely ugly. We need start/end methods to extract the first/last N symbols...
- index.AddIncoming(index.ConstructKWH(runtime_k::RtSeq(KPlusOne - 1, kpomer << 0)),
- pnucl);
- }
- }
-
-public:
- template<class Streams>
- ReadStatistics BuildExtensionIndexFromStream(
- IndexT &index, Streams &streams, io::SingleStream* contigs_stream = 0,
- size_t read_buffer_size = 0) const {
- unsigned nthreads = (unsigned) streams.size();
-
- // First, build a k+1-mer index
- DeBruijnReadKMerSplitter<typename Streams::ReadT, StoringTypeFilter<typename IndexT::storing_type>> splitter(
- index.workdir(), index.k() + 1, 0xDEADBEEF, streams,
- contigs_stream, read_buffer_size);
- KMerDiskCounter<runtime_k::RtSeq> counter(index.workdir(), splitter);
- counter.CountAll(nthreads, nthreads, /* merge */false);
-
- // Now, count unique k-mers from k+1-mers
- DeBruijnKMerKMerSplitter<StoringTypeFilter<typename IndexT::storing_type> > splitter2(index.workdir(), index.k(),
- index.k() + 1, IndexT::storing_type::IsInvertable(), read_buffer_size);
- for (unsigned i = 0; i < nthreads; ++i)
- splitter2.AddKMers(counter.GetMergedKMersFname(i));
- KMerDiskCounter<runtime_k::RtSeq> counter2(index.workdir(), splitter2);
-
- index.BuildIndex(counter2, 16, nthreads);
-
- // Build the kmer extensions
- INFO("Building k-mer extensions from k+1-mers");
-# pragma omp parallel for num_threads(nthreads)
- for (unsigned i = 0; i < nthreads; ++i)
- FillExtensionsFromIndex(counter.GetMergedKMersFname(i), index);
- INFO("Building k-mer extensions from k+1-mers finished.");
-
- return splitter.stats();
- }
-
-private:
- DECL_LOGGER("DeBruijnExtensionIndexBuilder");
-};
-
-template<class Index>
-struct ExtensionIndexHelper {
- typedef Index IndexT;
- typedef typename IndexT::traits_t traits_t;
- typedef typename IndexT::KMer Kmer;
- typedef typename IndexT::KMerIdx KMerIdx;
- typedef DeBruijnStreamKMerIndexBuilder<Kmer, IndexT> DeBruijnStreamKMerIndexBuilderT;
- typedef DeBruijnExtensionIndexBuilder<DeBruijnStreamKMerIndexBuilderT> DeBruijnExtensionIndexBuilderT;
-};
-
-}
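
FillExtensionsFromIndex derives the extension information from (k+1)-mers: every (k+1)-mer contributes an outgoing extension (its last symbol) to its k-prefix and an incoming extension (its first symbol) to its k-suffix. The same derivation with strings in place of RtSeq:

    #include <cstddef>
    #include <iostream>
    #include <string>

    int main() {
        const std::string kpomer = "ACGTA";               // a (k+1)-mer, so k = 4
        const std::size_t k = kpomer.size() - 1;
        const std::string prefix = kpomer.substr(0, k);   // "ACGT"
        const std::string suffix = kpomer.substr(1, k);   // "CGTA"
        std::cout << prefix << " --outgoing--> " << kpomer.back()  << "\n"
                  << suffix << " <--incoming-- " << kpomer.front() << "\n";
    }
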
diff --git a/src/debruijn/indices/kmer_splitters.hpp b/src/debruijn/indices/kmer_splitters.hpp
deleted file mode 100644
index f347be5..0000000
--- a/src/debruijn/indices/kmer_splitters.hpp
+++ /dev/null
@@ -1,444 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include "io/io_helper.hpp"
-#include "storing_traits.hpp"
-
-#include "file_limit.hpp"
-
-namespace debruijn_graph {
-
-template<class StoringType>
-struct StoringTypeFilter {
-};
-
-template<>
-struct StoringTypeFilter<SimpleStoring> {
- template<class Kmer>
- bool filter(const Kmer &/*kmer*/) const {
- return true;
- }
-};
-
-template<>
-struct StoringTypeFilter<InvertableStoring> {
- template<class Kmer>
- bool filter(const Kmer &kmer) const {
- return kmer.IsMinimal();
- }
-};
-
-// used for temporary reads storage during parallel reading
-static const size_t READS_BUFFER_SIZE = 536870912; // 512 MB in bytes
-
-typedef ::KMerSplitter<runtime_k::RtSeq> RtSeqKMerSplitter;
-
-typedef KMerVector<runtime_k::RtSeq> RtSeqKMerVector;
-typedef std::vector<RtSeqKMerVector> KMerBuffer;
-
-template<class KmerFilter>
-class DeBruijnKMerSplitter : public RtSeqKMerSplitter {
- private:
- bool skip_not_minimal_;
- KmerFilter kmer_filter_;
- protected:
- size_t read_buffer_size_;
- protected:
- size_t FillBufferFromSequence(const Sequence &seq,
- KMerBuffer &buffer, unsigned num_files) const {
- size_t kmers = 0;
-
- if (seq.size() < this->K_)
- return kmers;
-
- runtime_k::RtSeq kmer = seq.start<runtime_k::RtSeq>(this->K_) >> 'A';
- for (size_t j = this->K_ - 1; j < seq.size(); ++j) {
- kmer <<= seq[j];
- if(kmer_filter_.filter(kmer)) {
- buffer[this->GetFileNumForSeq(kmer, num_files)].push_back(kmer);
- kmers++;
- }
- }
- return kmers;
- }
-
-
- void DumpBuffers(size_t num_files, size_t nthreads,
- std::vector<KMerBuffer> &buffers,
- const path::files_t &ostreams) const{
- # pragma omp parallel for
- for (unsigned k = 0; k < num_files; ++k) {
- size_t sz = 0;
- for (size_t i = 0; i < nthreads; ++i)
- sz += buffers[i][k].size();
-
- KMerVector<runtime_k::RtSeq> SortBuffer(this->K_, sz);
- for (size_t i = 0; i < nthreads; ++i) {
- KMerBuffer &entry = buffers[i];
- for (size_t j = 0; j < entry[k].size(); ++j)
- SortBuffer.push_back(entry[k][j]);
- }
- libcxx::sort(SortBuffer.begin(), SortBuffer.end(), KMerVector<runtime_k::RtSeq>::less2_fast());
- auto it = std::unique(SortBuffer.begin(), SortBuffer.end(), KMerVector<runtime_k::RtSeq>::equal_to());
-
- # pragma omp critical
- {
- FILE *f = fopen(ostreams[k].c_str(), "ab");
- VERIFY_MSG(f, "Cannot open temporary file to write");
- fwrite(SortBuffer.data(), SortBuffer.el_data_size(), it - SortBuffer.begin(), f);
- fclose(f);
- }
- }
-
- for (unsigned i = 0; i < nthreads; ++i) {
- for (unsigned j = 0; j < num_files; ++j) {
- buffers[i][j].clear();
- }
- }
- }
-
- public:
- DeBruijnKMerSplitter(const std::string &work_dir,
- unsigned K, KmerFilter kmer_filter, size_t read_buffer_size = 0, uint32_t seed = 0)
- : RtSeqKMerSplitter(work_dir, K, seed), kmer_filter_(kmer_filter), read_buffer_size_(read_buffer_size) {
- }
- protected:
- DECL_LOGGER("DeBruijnKMerSplitter");
-};
-
-struct ReadStatistics {
- size_t reads_;
- size_t max_read_length_;
- size_t bases_;
-};
-
-template<class Read, class KmerFilter>
-class DeBruijnReadKMerSplitter : public DeBruijnKMerSplitter<KmerFilter> {
- io::ReadStreamList<Read> &streams_;
- io::SingleStream *contigs_;
-
- template<class ReadStream>
- ReadStatistics
- FillBufferFromStream(ReadStream& stream,
- KMerBuffer &tmp_entries,
- unsigned num_files, size_t cell_size) const;
-
- ReadStatistics rs_;
-
- public:
- DeBruijnReadKMerSplitter(const std::string &work_dir,
- unsigned K, uint32_t seed,
- io::ReadStreamList<Read>& streams,
- io::SingleStream* contigs_stream = 0,
- size_t read_buffer_size = 0)
- : DeBruijnKMerSplitter<KmerFilter>(work_dir, K, KmerFilter(), read_buffer_size, seed),
- streams_(streams), contigs_(contigs_stream), rs_({0 ,0 ,0}) {
- }
-
- virtual path::files_t Split(size_t num_files);
-
- size_t read_length() const { return rs_.max_read_length_; }
- ReadStatistics stats() const { return rs_; }
-};
-
-template<class Read, class KmerFilter> template<class ReadStream>
-ReadStatistics
-DeBruijnReadKMerSplitter<Read, KmerFilter>::FillBufferFromStream(ReadStream &stream,
- KMerBuffer &buffer,
- unsigned num_files, size_t cell_size) const {
- typename ReadStream::ReadT r;
- size_t reads = 0, kmers = 0, rl = 0, bases = 0;
-
- while (!stream.eof() && kmers < num_files * cell_size) {
- stream >> r;
- rl = std::max(rl, r.size());
- reads += 1;
- bases += r.size();
-
- kmers += this->FillBufferFromSequence(r.sequence(), buffer, num_files);
- }
- return { reads, rl, bases };
-}
-
-template<class Read, class KmerFilter>
-path::files_t DeBruijnReadKMerSplitter<Read, KmerFilter>::Split(size_t num_files) {
- unsigned nthreads = (unsigned) streams_.size();
-
- INFO("Splitting kmer instances into " << num_files << " buckets. This might take a while.");
-
- // Determine the set of output files
- path::files_t out;
- for (unsigned i = 0; i < num_files; ++i)
- out.push_back(this->GetRawKMersFname(i));
-
- size_t file_limit = num_files + 2*nthreads;
- size_t res = limit_file(file_limit);
- if (res < file_limit) {
- WARN("Failed to setup necessary limit for number of open files. The process might crash later on.");
- WARN("Do 'ulimit -n " << file_limit << "' in the console to overcome the limit");
- }
-
- size_t reads_buffer_size = DeBruijnKMerSplitter<KmerFilter>::read_buffer_size_;
- if (reads_buffer_size == 0) {
- reads_buffer_size = READS_BUFFER_SIZE;
- size_t mem_limit = (size_t)((double)(get_free_memory()) / (nthreads * 3));
- INFO("Memory available for splitting buffers: " << (double)mem_limit / 1024.0 / 1024.0 / 1024.0 << " Gb");
- reads_buffer_size = std::min(reads_buffer_size, mem_limit);
- }
- size_t cell_size = reads_buffer_size /
- (num_files * runtime_k::RtSeq::GetDataSize(this->K_) * sizeof(runtime_k::RtSeq::DataType));
-
- // Set sane minimum cell size
- if (cell_size < 16384)
- cell_size = 16384;
- INFO("Using cell size of " << cell_size);
-
- std::vector<KMerBuffer> tmp_entries(nthreads);
- for (unsigned i = 0; i < nthreads; ++i) {
- KMerBuffer &entry = tmp_entries[i];
- entry.resize(num_files, RtSeqKMerVector(this->K_, (size_t) (1.1 * (double) cell_size)));
- }
-
- size_t counter = 0, rl = 0, bases = 0, n = 15;
- streams_.reset();
- while (!streams_.eof()) {
-# pragma omp parallel for num_threads(nthreads) reduction(+ : counter) reduction(+ : bases) shared(rl)
- for (size_t i = 0; i < nthreads; ++i) {
- ReadStatistics stats = FillBufferFromStream(streams_[i], tmp_entries[i], (unsigned) num_files, cell_size);
- counter += stats.reads_;
- bases += stats.bases_;
-
- // There is no max reduction in C/C++ OpenMP... Only in FORTRAN :(
-# pragma omp flush(rl)
- if (stats.max_read_length_ > rl)
-# pragma omp critical
- {
- rl = std::max(rl, stats.max_read_length_);
- }
- }
-
- this->DumpBuffers(num_files, nthreads, tmp_entries, out);
-
- if (counter >> n) {
- INFO("Processed " << counter << " reads");
- n += 1;
- }
- }
-
- if (contigs_) {
- INFO("Adding contigs from previous K");
- size_t cnt = 0;
- contigs_->reset();
- while (!contigs_->eof()) {
- FillBufferFromStream(*contigs_, tmp_entries[cnt], (unsigned) num_files, cell_size);
- this->DumpBuffers(num_files, nthreads, tmp_entries, out);
- if (++cnt >= nthreads)
- cnt = 0;
- }
- }
-
- INFO("Used " << counter << " reads. Maximum read length " << rl);
- INFO("Average read length " << double(bases) / double(counter));
- rs_ = { counter, rl, bases };
-
- return out;
-}
-
-template<class Graph, class KmerFilter>
-class DeBruijnGraphKMerSplitter : public DeBruijnKMerSplitter<KmerFilter> {
- typedef typename Graph::ConstEdgeIt EdgeIt;
- typedef typename Graph::EdgeId EdgeId;
-
- const Graph &g_;
-
- size_t FillBufferFromEdges(EdgeIt &edge,
- KMerBuffer &tmp_entries,
- unsigned num_files, size_t cell_size) const;
-
- public:
- DeBruijnGraphKMerSplitter(const std::string &work_dir,
- unsigned K, const Graph &g, size_t read_buffer_size = 0)
- : DeBruijnKMerSplitter<KmerFilter>(work_dir, K, KmerFilter(), read_buffer_size), g_(g) {}
-
- virtual path::files_t Split(size_t num_files);
-};
-
-template<class Graph, class KmerFilter>
-size_t
-DeBruijnGraphKMerSplitter<Graph, KmerFilter>::FillBufferFromEdges(EdgeIt &edge,
- KMerBuffer &buffer,
- unsigned num_files, size_t cell_size) const {
- size_t seqs = 0;
- for (size_t kmers = 0; !edge.IsEnd() && kmers < num_files * cell_size; ++edge) {
- const Sequence &nucls = g_.EdgeNucls(*edge);
-
- kmers += this->FillBufferFromSequence(nucls, buffer, num_files);
- seqs += 1;
- }
-
- return seqs;
-}
-
-template<class Graph, class KmerFilter>
-path::files_t DeBruijnGraphKMerSplitter<Graph, KmerFilter>::Split(size_t num_files) {
- INFO("Splitting kmer instances into " << num_files << " buckets. This might take a while.");
-
- // Determine the set of output files
- path::files_t out;
- for (unsigned i = 0; i < num_files; ++i)
- out.push_back(this->GetRawKMersFname(i));
-
- size_t file_limit = num_files + 2*16;
- size_t res = limit_file(file_limit);
- if (res < file_limit) {
- WARN("Failed to setup necessary limit for number of open files. The process might crash later on.");
- WARN("Do 'ulimit -n " << file_limit << "' in the console to overcome the limit");
- }
-
- size_t reads_buffer_size = DeBruijnKMerSplitter<KmerFilter>::read_buffer_size_;
- if (reads_buffer_size == 0) {
- reads_buffer_size = READS_BUFFER_SIZE;
- size_t mem_limit = (size_t)((double)(get_free_memory()) / (3));
- INFO("Memory available for splitting buffers: " << (double)mem_limit / 1024.0 / 1024.0 / 1024.0 << " Gb");
- reads_buffer_size = std::min(reads_buffer_size, mem_limit);
- }
- size_t cell_size = reads_buffer_size /
- (num_files * runtime_k::RtSeq::GetDataSize(this->K_) * sizeof(runtime_k::RtSeq::DataType));
- INFO("Using cell size of " << cell_size);
-
- std::vector<KMerBuffer> tmp_entries(1);
- KMerBuffer &entry = tmp_entries[0];
- entry.resize(num_files, RtSeqKMerVector(this->K_, (size_t) (1.1 * (double) cell_size)));
-
- size_t counter = 0, n = 10;
- for (auto it = g_.ConstEdgeBegin(); !it.IsEnd(); ) {
- counter += FillBufferFromEdges(it, tmp_entries[0], (unsigned) num_files, cell_size);
-
- this->DumpBuffers(num_files, 1, tmp_entries, out);
-
- if (counter >> n) {
- INFO("Processed " << counter << " edges");
- n += 1;
- }
- }
-
- INFO("Used " << counter << " sequences.");
-
- return out;
-}
-
-
-template<class KmerFilter>
-class DeBruijnKMerKMerSplitter : public DeBruijnKMerSplitter<KmerFilter> {
- typedef MMappedFileRecordArrayIterator<runtime_k::RtSeq::DataType> kmer_iterator;
-
- unsigned K_source_;
- std::vector<std::string> kmers_;
- bool add_rc_;
-
- size_t FillBufferFromKMers(kmer_iterator &kmer,
- KMerBuffer &tmp_entries,
- unsigned num_files, size_t cell_size) const;
-
- public:
- DeBruijnKMerKMerSplitter(const std::string &work_dir,
- unsigned K_target, unsigned K_source, bool add_rc, size_t read_buffer_size = 0)
- : DeBruijnKMerSplitter<KmerFilter>(work_dir, K_target, KmerFilter(), read_buffer_size), K_source_(K_source), add_rc_(add_rc) {}
-
- void AddKMers(const std::string &file) {
- kmers_.push_back(file);
- }
-
- virtual path::files_t Split(size_t num_files);
-};
-
-template<class KmerFilter>
-inline size_t DeBruijnKMerKMerSplitter<KmerFilter>::FillBufferFromKMers(kmer_iterator &kmer,
- KMerBuffer &buffer,
- unsigned num_files, size_t cell_size) const {
- size_t seqs = 0;
- for (size_t kmers = 0; kmer.good() && kmers < num_files * cell_size; ++kmer) {
- Sequence nucls(runtime_k::RtSeq(K_source_, *kmer));
- kmers += this->FillBufferFromSequence(nucls, buffer, num_files);
- if(add_rc_)
- kmers += this->FillBufferFromSequence(!nucls, buffer, num_files);
- seqs += 1;
- }
-
- return seqs;
-}
-
-template<class KmerFilter>
-inline path::files_t DeBruijnKMerKMerSplitter<KmerFilter>::Split(size_t num_files) {
- unsigned nthreads = (unsigned) kmers_.size();
- INFO("Splitting kmer instances into " << num_files << " buckets. This might take a while.");
-
- // Determine the set of output files
- path::files_t out;
- for (unsigned i = 0; i < num_files; ++i)
- out.push_back(this->GetRawKMersFname(i));
-
- size_t file_limit = num_files + 2*nthreads;
- size_t res = limit_file(file_limit);
- if (res < file_limit) {
- WARN("Failed to setup necessary limit for number of open files. The process might crash later on.");
- WARN("Do 'ulimit -n " << file_limit << "' in the console to overcome the limit");
- }
-
- size_t reads_buffer_size = DeBruijnKMerSplitter<KmerFilter>::read_buffer_size_;
- if (reads_buffer_size == 0) {
- reads_buffer_size = READS_BUFFER_SIZE;
- size_t mem_limit = (size_t)((double)(get_free_memory()) / (nthreads * 3));
- INFO("Memory available for splitting buffers: " << (double)mem_limit / 1024.0 / 1024.0 / 1024.0 << " Gb");
- reads_buffer_size = std::min(reads_buffer_size, mem_limit);
- }
- size_t cell_size = reads_buffer_size /
- (num_files * runtime_k::RtSeq::GetDataSize(this->K_) * sizeof(runtime_k::RtSeq::DataType));
- // Set sane minimum cell size
- if (cell_size < 16384)
- cell_size = 16384;
- INFO("Using cell size of " << cell_size);
-
- std::vector<KMerBuffer> tmp_entries(nthreads);
- for (unsigned i = 0; i < nthreads; ++i) {
- KMerBuffer &entry = tmp_entries[i];
- entry.resize(num_files, RtSeqKMerVector(this->K_, (size_t) (1.1 * (double) cell_size)));
- }
-
- size_t counter = 0, n = 10;
- std::vector<kmer_iterator> its;
- its.reserve(nthreads);
- for (auto it = kmers_.begin(), et = kmers_.end(); it != et; ++it)
- its.emplace_back(*it, runtime_k::RtSeq::GetDataSize(K_source_));
-
- bool anygood = false;
- do {
-# pragma omp parallel for num_threads(nthreads) reduction(+ : counter)
- for (size_t i = 0; i < nthreads; ++i)
- counter += FillBufferFromKMers(its[i], tmp_entries[i], (unsigned) num_files, cell_size);
-
- this->DumpBuffers(num_files, nthreads, tmp_entries, out);
-
- if (counter >> n) {
- INFO("Processed " << counter << " kmers");
- n += 1;
- }
-
- anygood = false;
- for (auto it = its.begin(), et = its.end(); it != et; ++it)
- anygood |= it->good();
- } while (anygood);
-
- INFO("Used " << counter << " kmers.");
-
- return out;
-}
-
-
-}
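
All three splitters above share the same disk-bucketing step: every k-mer is routed to a bucket by GetFileNumForSeq, and DumpBuffers sorts and deduplicates each per-bucket buffer before appending it to the bucket's temporary file. An in-memory sketch of that step (vectors and std::hash standing in for the temporary files and the real bucketing function):

    #include <algorithm>
    #include <cstddef>
    #include <functional>
    #include <iostream>
    #include <string>
    #include <vector>

    int main() {
        const std::size_t num_buckets = 4;
        const std::vector<std::string> kmers = {"ACGT", "CGTA", "ACGT", "GTAC", "TACG", "CGTA"};

        // Route every k-mer to a bucket.
        std::vector<std::vector<std::string>> buckets(num_buckets);
        for (const std::string &kmer : kmers)
            buckets[std::hash<std::string>{}(kmer) % num_buckets].push_back(kmer);

        // Per bucket: sort, then drop duplicates (what DumpBuffers does before fwrite).
        for (std::size_t b = 0; b < num_buckets; ++b) {
            std::sort(buckets[b].begin(), buckets[b].end());
            buckets[b].erase(std::unique(buckets[b].begin(), buckets[b].end()), buckets[b].end());
            std::cout << "bucket " << b << ":";
            for (const std::string &kmer : buckets[b]) std::cout << " " << kmer;
            std::cout << "\n";
        }
    }
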
diff --git a/src/debruijn/indices/perfect_hash_map.hpp b/src/debruijn/indices/perfect_hash_map.hpp
deleted file mode 100644
index 7da07d8..0000000
--- a/src/debruijn/indices/perfect_hash_map.hpp
+++ /dev/null
@@ -1,397 +0,0 @@
-#pragma once
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#include "openmp_wrapper.h"
-#include "standard.hpp"
-
-#include "io/io_helper.hpp"
-
-#include "mph_index/kmer_index.hpp"
-#include "adt/kmer_vector.hpp"
-
-#include "libcxx/sort.hpp"
-
-#include "kmer_splitters.hpp"
-#include "key_with_hash.hpp"
-#include "values.hpp"
-#include "storing_traits.hpp"
-
-#include <vector>
-#include <cstdlib>
-#include <cstdio>
-#include <cstdint>
-#include <io/kmer_iterator.hpp>
-
-namespace debruijn_graph {
-
-template<class K, class traits>
-class IndexWrapper {
- static const size_t InvalidIdx = size_t(-1);
-public:
- typedef size_t IdxType;
- typedef K KeyType;
- typedef traits traits_t;
-protected:
- typedef KMerIndex<traits> KMerIndexT;
- //these fields are protected only for reduction of storage in edge indices BinWrite
- KMerIndexT index_;
-private:
- std::string workdir_;
- unsigned k_;
-
-protected:
- size_t raw_seq_idx(const typename KMerIndexT::KMerRawReference s) const {
- return index_.raw_seq_idx(s);
- }
-
- bool valid(const size_t idx) const {
- return idx != InvalidIdx && idx < index_.size();
- }
-public:
- IndexWrapper(size_t k, const std::string &workdir) : k_((unsigned) k) {
- //fixme string literal
- workdir_ = path::make_temp_dir(workdir, "kmeridx");
- }
-
- ~IndexWrapper() {
- path::remove_dir(workdir_);
- }
-
- void clear() {
- index_.clear();
- }
-
- unsigned k() const { return k_; }
-
-public:
- template<class Writer>
- void BinWrite(Writer &writer) const {
- index_.serialize(writer);
- }
-
- template<class Reader>
- void BinRead(Reader &reader, const std::string &) {
- clear();
- index_.deserialize(reader);
- }
-
- const std::string &workdir() const {
- return workdir_;
- }
-};
-
-template<class K, class V, class traits, class StoringType>
-class PerfectHashMap : public ValueArray<V>, public IndexWrapper<K, traits> {
-public:
- typedef size_t IdxType;
- typedef K KeyType;
- typedef ValueArray<V> ValueBase;
- typedef IndexWrapper<KeyType, traits> KeyBase;
- using KeyBase::index_;
- typedef typename KeyBase::KMerIndexT KMerIndexT;
- typedef typename StoringTraits<K, KMerIndexT, StoringType>::KeyWithHash KeyWithHash;
-
- KeyWithHash ConstructKWH(const KeyType &key) const {
- return KeyWithHash(key, index_);
- }
-
- bool valid(const KeyWithHash &kwh) const {
- return KeyBase::valid(kwh.idx());
- }
-
- PerfectHashMap(size_t k, const std::string &workdir) : KeyBase(k, workdir) {
- }
-
- ~PerfectHashMap() {
- }
-
- void clear() {
- KeyBase::clear();
- ValueBase::clear();
- }
-
- const V get_value(const KeyWithHash &kwh) const {
- return StoringType::get_value(*this, kwh);
- }
-
- //Think twice or ask AntonB if you want to use it!
- V &get_raw_value_reference(const KeyWithHash &kwh) {
- return ValueBase::operator[](kwh.idx());
- }
-
- const V &get_raw_value_reference(const KeyWithHash &kwh) const {
- return ValueBase::operator[](kwh.idx());
- }
-
- void put_value(const KeyWithHash &kwh, const V &value) {
- StoringType::set_value(*this, kwh, value);
- }
-
- template<class Writer>
- void BinWrite(Writer &writer) const {
- ValueBase::BinWrite(writer);
- KeyBase::BinWrite(writer);
- }
-
- template<class Reader>
- void BinRead(Reader &reader, const std::string &tmp) {
- KeyBase::BinRead(reader, tmp);
- ValueBase::BinRead(reader, tmp);
- }
-//todo think more about hierarchy
-protected:
- template <class KmerCounter>
- void BuildIndex(KmerCounter& counter, size_t bucket_num, size_t thread_num, bool save_final = true) {
- KMerIndexBuilder<KMerIndexT> builder(this->workdir(),
- (unsigned) bucket_num,
- (unsigned) thread_num);
- size_t sz = builder.BuildIndex(index_, counter, save_final);
- ValueBase::resize(sz);
- }
-};
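
PerfectHashMap couples a minimal perfect hash function (the KMerIndex member) that maps every key of a fixed key set to a distinct dense index with a flat ValueArray addressed by that index. Because the MPHF stores no keys, a key outside the set silently lands on some slot, which is why the edge indices built on top re-validate hits with their contains()/valid() checks. A toy sketch of the layout, with an unordered_map standing in for the MPHF purely for illustration:

    #include <cstddef>
    #include <cstdint>
    #include <iostream>
    #include <string>
    #include <unordered_map>
    #include <vector>

    int main() {
        const std::vector<std::string> keys = {"ACGT", "CGTA", "GTAC"};   // the fixed key set

        // Stand-in for the MPHF: each key of the set gets a distinct index in [0, n).
        std::unordered_map<std::string, std::size_t> mph;
        for (std::size_t i = 0; i < keys.size(); ++i) mph[keys[i]] = i;

        // The values live in a flat array addressed by that index (ValueArray<V>).
        std::vector<std::uint32_t> values(keys.size(), 0);
        values[mph["CGTA"]] += 5;                                   // put_value / get_raw_value_reference
        std::cout << "CGTA -> " << values[mph["CGTA"]] << "\n";     // prints 5
    }
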
-
-
-template<class K, class V, class traits, class StoringType>
-class KeyStoringMap : public PerfectHashMap<K, V, traits, StoringType> {
-private:
- typedef PerfectHashMap<K, V, traits, StoringType> base;
-
-public:
- typedef traits traits_t;
- typedef K KMer;
- typedef typename base::IdxType KMerIdx;
- typedef typename traits::FinalKMerStorage::iterator kmer_iterator;
- typedef typename traits::FinalKMerStorage::const_iterator const_kmer_iterator;
- typedef typename base::KeyWithHash KeyWithHash;
- using base::ConstructKWH;
-
-private:
- typename traits::FinalKMerStorage *kmers_;
-
- void SortUniqueKMers() const {
- size_t swaps = 0;
- INFO("Arranging kmers in hash map order");
- for (auto I = kmers_->begin(), E = kmers_->end(); I != E; ++I) {
- size_t cidx = I - kmers_->begin();
- size_t kidx = this->raw_seq_idx(*I);
- while (cidx != kidx) {
- auto J = kmers_->begin() + kidx;
- using std::swap;
- swap(*I, *J);
- swaps += 1;
- kidx = this->raw_seq_idx(*I);
- }
- }
- INFO("Done. Total swaps: " << swaps);
- }
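
SortUniqueKMers applies the hash-order permutation in place by cycle-following swaps: the element at position i is swapped towards the slot its hash assigns it until position i finally holds the right element, and the hash is recomputed after every swap. A self-contained sketch of that rearrangement, with an explicit target array standing in for raw_seq_idx:

    #include <cstddef>
    #include <iostream>
    #include <utility>
    #include <vector>

    int main() {
        // target[i]: the slot the element currently at position i must end up in.
        std::vector<int>         data   = {30, 10, 40, 20};
        std::vector<std::size_t> target = { 2,  0,  3,  1};
        for (std::size_t i = 0; i < data.size(); ++i) {
            while (target[i] != i) {
                std::swap(data[i],   data[target[i]]);
                std::swap(target[i], target[target[i]]);   // keep the targets travelling with the data
            }
        }
        for (int x : data) std::cout << x << " ";   // prints "10 20 30 40"
        std::cout << "\n";
    }
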
-
-protected:
- template<class Writer>
- void BinWriteKmers(Writer &writer) const {
- traits::raw_serialize(writer, this->kmers_);
- }
-
- template<class Reader>
- void BinReadKmers(Reader &reader, const std::string &FileName) {
- this->kmers_ = traits_t::raw_deserialize(reader, FileName);
- }
-
- template<class Writer>
- void BinWrite(Writer &writer) const {
- base::BinWrite(writer);
- BinWriteKmers(writer);
- }
-
- template<class Reader>
- void BinRead(Reader &reader, const std::string &FileName) {
- base::BinRead(reader, FileName);
- BinReadKmers(reader, FileName);
- }
-
-public:
-
- KeyStoringMap(size_t k, const std::string &workdir)
- : base(k, workdir),
- kmers_(NULL) {
- }
-
- ~KeyStoringMap() {
- delete kmers_;
- }
-
- KMer true_kmer(KeyWithHash kwh) const {
- VERIFY(this->valid(kwh));
-
- auto it = this->kmers_->begin() + kwh.idx();
- return (typename traits_t::raw_create()(this->k(), *it));
- }
-
- void clear() {
- base::clear();
- delete kmers_;
- kmers_ = NULL;
- }
-
- kmer_iterator kmer_begin() {
- return kmers_->begin();
- }
- const_kmer_iterator kmer_begin() const {
- return kmers_->cbegin();
- }
-
- kmer_iterator kmer_end() {
- return kmers_->end();
- }
- const_kmer_iterator kmer_end() const {
- return kmers_->cend();
- }
-
- bool valid(const KeyWithHash &kwh) const {
- if (!base::valid(kwh))
- return false;
-
- auto it = this->kmers_->begin() + kwh.idx();
- if(!kwh.is_minimal())
- return (typename traits_t::raw_equal_to()(!kwh.key(), *it));
- else
- return (typename traits_t::raw_equal_to()(kwh.key(), *it));
- }
-
- /**
- * Number of edges going out of the param edge's end
- */
- unsigned NextEdgeCount(const KeyWithHash &kwh) const {
- unsigned res = 0;
- for (char c = 0; c < 4; ++c)
- if (valid(kwh << c))
- res += 1;
-
- return res;
- }
-
- KeyWithHash NextEdge(const KeyWithHash &kwh) const { // returns any next edge
- for (char c = 0; c < 4; ++c) {
- if (valid(kwh << c))
-                //hack for this code to work with long seqs! (otherwise return s is totally fine)
- return ConstructKWH(true_kmer(kwh));//s;
- }
-
- VERIFY_MSG(false, "Couldn't find requested edge!");
- return ConstructKWH(KMer(this->k()));
-        // no next edges (we should not have requested one here).
- }
-
- /**
-     * Number of edges coming into the end of the given edge
- */
- unsigned RivalEdgeCount(const KeyWithHash &kwh) const {
- KeyWithHash next = kwh << 'A';
- unsigned res = 0;
- for (char c = 0; c < 4; ++c)
- if (valid(next >> c))
- res += 1;
-
- return res;
- }
-
- template<class KmerCounter>
- void BuildIndex(KmerCounter& counter, size_t bucket_num,
- size_t thread_num) {
- base::BuildIndex(counter, bucket_num, thread_num);
- VERIFY(!kmers_);
- kmers_ = counter.GetFinalKMers();
- VERIFY(kmers_);
- SortUniqueKMers();
- }
-};
-
-template<class K, class V, class traits, class StoringType>
-class KeyIteratingMap : public PerfectHashMap<K, V, traits, StoringType> {
- typedef PerfectHashMap<K, V, traits, StoringType> base;
-
- std::string KMersFilename_;
-
-public:
- typedef StoringType storing_type;
- typedef typename base::traits_t traits_t;
- typedef typename base::KeyType KMer;
- typedef typename base::IdxType KMerIdx;
- using base::ConstructKWH;
-
-public:
-
- KeyIteratingMap(size_t k, const std::string &workdir)
- : base(k, workdir),
- KMersFilename_("") {
- }
-
- ~KeyIteratingMap() {
- }
-
- typedef MMappedFileRecordArrayIterator<typename KMer::DataType> kmer_iterator;
-
- kmer_iterator kmer_begin() const {
- return kmer_iterator(this->KMersFilename_, KMer::GetDataSize(base::k()));
- }
-
- std::vector<kmer_iterator> kmer_begin(size_t parts) const {
- return io::make_kmer_iterator<KMer>(this->KMersFilename_, base::k(), parts);
- }
-
-
- template<class KmerCounter>
- void BuildIndex(KmerCounter& counter, size_t bucket_num,
- size_t thread_num) {
- base::BuildIndex(counter, bucket_num, thread_num);
- KMersFilename_ = counter.GetFinalKMersFname();
- }
-};
-
-//Seq is here for partial specialization
-template <class Seq, class Index>
-class DeBruijnStreamKMerIndexBuilder {
-
-};
-
-template<class Index>
-class DeBruijnStreamKMerIndexBuilder<runtime_k::RtSeq, Index> {
- public:
- typedef Index IndexT;
-
- template <class Streams>
- size_t BuildIndexFromStream(IndexT &index,
- Streams &streams,
- io::SingleStream* contigs_stream = 0) const {
- DeBruijnReadKMerSplitter<typename Streams::ReadT, StoringTypeFilter<typename IndexT::storing_type>>
- splitter(index.workdir(), index.k(), 0, streams, contigs_stream);
- KMerDiskCounter<runtime_k::RtSeq> counter(index.workdir(), splitter);
-
- index.BuildIndex(counter, 16, streams.size());
- return 0;
- }
-};
-
-//fixme makes hierarchy a bit strange
-template <class Index, class Seq = typename Index::KMer>
-class DeBruijnGraphKMerIndexBuilder;
-
-template <class Index>
-class DeBruijnGraphKMerIndexBuilder<Index, runtime_k::RtSeq> {
- public:
- typedef Index IndexT;
-
- template<class Graph>
- void BuildIndexFromGraph(IndexT &index, const Graph &g, size_t read_buffer_size = 0) const {
- DeBruijnGraphKMerSplitter<Graph, StoringTypeFilter<typename Index::storing_type>> splitter(index.workdir(), index.k(),
- g, read_buffer_size);
- KMerDiskCounter<runtime_k::RtSeq> counter(index.workdir(), splitter);
- index.BuildIndex(counter, 16, 1);
- }
-};
-
-}
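
The SortUniqueKMers() routine deleted above rearranges the stored k-mers in place so that each one ends up in the slot dictated by the perfect-hash index, fixing one element per swap while following permutation cycles. A self-contained sketch of that cycle-following rearrangement; ArrangeBySlot and the explicit slot map are illustrative stand-ins, not SPAdes API:

    #include <cassert>
    #include <cstddef>
    #include <string>
    #include <unordered_map>
    #include <utility>
    #include <vector>

    // Rearrange 'items' in place so that each element lands at the slot given by
    // 'slot_of' (a bijection onto 0..items.size()-1), mirroring the swap loop of
    // SortUniqueKMers: follow each permutation cycle until the current position
    // holds the element that belongs there.
    void ArrangeBySlot(std::vector<std::string>& items,
                       const std::unordered_map<std::string, std::size_t>& slot_of) {
        for (std::size_t cur = 0; cur < items.size(); ++cur) {
            std::size_t target = slot_of.at(items[cur]);
            while (target != cur) {
                std::swap(items[cur], items[target]);  // items[target] is now final
                target = slot_of.at(items[cur]);
            }
        }
    }

    int main() {
        std::vector<std::string> kmers = {"ACG", "CGT", "GTA"};
        std::unordered_map<std::string, std::size_t> slot = {
            {"ACG", 2}, {"CGT", 0}, {"GTA", 1}};  // hypothetical perfect-hash slots
        ArrangeBySlot(kmers, slot);
        assert(kmers[0] == "CGT" && kmers[1] == "GTA" && kmers[2] == "ACG");
        return 0;
    }
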
diff --git a/src/debruijn/indices/storing_traits.hpp b/src/debruijn/indices/storing_traits.hpp
deleted file mode 100644
index 207b73a..0000000
--- a/src/debruijn/indices/storing_traits.hpp
+++ /dev/null
@@ -1,61 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-/*
- * key_with_hash.hpp
- *
- * Created on: Nov 7, 2013
- * Author: anton
- */
-
-#include "values.hpp"
-
-namespace debruijn_graph {
-
-
-struct SimpleStoring {
- template<class K, class V>
- static V get_value(const ValueArray<V> &values, const K& key) {
- return values[key.idx()];
- }
-
- template<class K, class V>
- static void set_value(ValueArray<V> &values, const K& key, const V& value) {
- values[key.idx()] = value;
- }
-
- static bool IsInvertable() {
- return false;
- }
-};
-
-struct InvertableStoring {
- template<class K, class V>
- static V get_value(const ValueArray<V> &values, const K& key) {
- if(key.is_minimal())
- return values[key.idx()];
- else
- return values[key.idx()].conjugate(key);
- }
-
- template<class K, class V>
- static void set_value(ValueArray<V> &values, const K& key, const V& value) {
- if(key.is_minimal())
- values[key.idx()] = value;
- else
- values[key.idx()] = value.conjugate(key);
- }
-
- static bool IsInvertable() {
- return true;
- }
-};
-
-typedef InvertableStoring DefaultStoring;
-
-}
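
The two storing policies above differ only in how a value is resolved for a non-canonical k-mer: SimpleStoring returns the stored slot as-is, while InvertableStoring conjugates it when the queried key is not the minimal one of its reverse-complement pair. A toy standalone model of that distinction; ToyValue and its argument-free conjugate() are simplifications (the real conjugate() also receives the key):

    #include <cassert>
    #include <cstddef>
    #include <vector>

    // Toy stand-in for a per-k-mer value whose orientation can be flipped.
    struct ToyValue {
        int fwd;
        int rev;
        ToyValue conjugate() const { return {rev, fwd}; }  // swap strand counts
    };

    // SimpleStoring: return the slot unchanged.
    ToyValue GetSimple(const std::vector<ToyValue>& values, std::size_t idx) {
        return values[idx];
    }

    // InvertableStoring: conjugate the slot when the k-mer is not canonical.
    ToyValue GetInvertable(const std::vector<ToyValue>& values, std::size_t idx,
                           bool is_minimal) {
        return is_minimal ? values[idx] : values[idx].conjugate();
    }

    int main() {
        std::vector<ToyValue> values = {{3, 7}};
        assert(GetSimple(values, 0).fwd == 3);
        assert(GetInvertable(values, 0, /*is_minimal=*/false).fwd == 7);
        return 0;
    }
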
diff --git a/src/debruijn/is_counter.hpp b/src/debruijn/is_counter.hpp
deleted file mode 100644
index ace7681..0000000
--- a/src/debruijn/is_counter.hpp
+++ /dev/null
@@ -1,173 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-/*
- * is_counter.hpp
- *
- * Created on: May 25, 2014
- * Author: andrey
- */
-
-#ifndef IS_COUNTER_HPP_
-#define IS_COUNTER_HPP_
-
-
-#include "de/insert_size_refiner.hpp"
-#include "sequence_mapper_notifier.hpp"
-
-namespace debruijn_graph {
-
-using namespace omnigraph;
-
-class InsertSizeCounter: public SequenceMapperListener {
-
-public:
-
- InsertSizeCounter(const conj_graph_pack& gp,
- size_t edge_length_threshold,
- bool ignore_negative = false)
- : gp_(gp), hist_(), tmp_hists_(),
- total_(), counted_(), negative_(),
- edge_length_threshold_(edge_length_threshold),
- ignore_negative_(ignore_negative) {
- }
-
- HistType hist() { return hist_; }
- size_t total() const { return total_.total_; }
- size_t mapped() const { return counted_.total_; }
- size_t negative() const { return negative_.total_; }
-
-
- virtual void StartProcessLibrary(size_t threads_count) {
- hist_.clear();
- tmp_hists_ = vector<HistType*>(threads_count);
- tmp_hists_[0] = &hist_;
- for (size_t i = 1; i < threads_count; ++i)
- tmp_hists_[i] = new HistType();
-
- total_ = count_data(threads_count);
- counted_ = count_data(threads_count);
- negative_ = count_data(threads_count);
- }
-
- virtual void StopProcessLibrary() {
- for (size_t i = 1; i < tmp_hists_.size(); ++i) {
- MergeBuffer(i);
- delete tmp_hists_[i];
- }
- total_.merge();
- counted_.merge();
- negative_.merge();
- }
-
- virtual void ProcessPairedRead(size_t thread_index,
- const io::PairedRead& r,
- const MappingPath<EdgeId>& read1,
- const MappingPath<EdgeId>& read2) {
- ProcessPairedRead(thread_index, read1, read2, (int) r.second().size(),
- (int) r.first().GetLeftOffset() + (int) r.second().GetRightOffset());
- }
-
- virtual void ProcessPairedRead(size_t thread_index,
- const io::PairedReadSeq& r,
- const MappingPath<EdgeId>& read1,
- const MappingPath<EdgeId>& read2) {
- ProcessPairedRead(thread_index, read1, read2, (int) r.second().size(),
- (int) r.first().GetLeftOffset() + (int) r.second().GetRightOffset());
- }
-
- virtual void ProcessSingleRead(size_t /*thread_index*/, const io::SingleRead&, const MappingPath<EdgeId>& /*read*/) {
- }
-
- virtual void ProcessSingleRead(size_t /*thread_index*/, const io::SingleReadSeq&, const MappingPath<EdgeId>& /*read*/) {
- }
-
- virtual void MergeBuffer(size_t thread_index) {
- if (thread_index != 0) {
- for (auto it = tmp_hists_[thread_index]->begin(); it != tmp_hists_[thread_index]->end(); ++it) {
- (*tmp_hists_[0])[it->first] += it->second;
- }
- tmp_hists_[thread_index]->clear();
- }
- }
-
- void FindMean(double& mean, double& delta, std::map<size_t, size_t>& percentiles) const {
- find_mean(hist_, mean, delta, percentiles);
- }
-
- void FindMedian(double& median, double& mad, HistType& histogram) const {
- find_median(hist_, median, mad, histogram);
- }
-
-private:
- virtual void ProcessPairedRead(size_t thread_index,
- const MappingPath<EdgeId>& read1,
- const MappingPath<EdgeId>& read2,
- int read2_size,
- int is_delta) {
-
- ++total_.arr_[thread_index];
-
- if (read1.size() == 1 && read2.size() == 1 &&
- read2.simple_path().front() == read1.simple_path().front() &&
- gp_.g.length(read1.simple_path().front()) >= edge_length_threshold_) {
-
- auto mapping_edge_1 = read1.front().second;
- auto mapping_edge_2 = read2.front().second;
-
- int read1_start = (int) mapping_edge_1.mapped_range.start_pos - (int) mapping_edge_1.initial_range.start_pos ;
- TRACE("Read 1: " << (int) mapping_edge_1.mapped_range.start_pos << " - " << (int) mapping_edge_1.initial_range.start_pos << " = " << read1_start);
- int read2_start = (int) mapping_edge_2.mapped_range.start_pos - (int) mapping_edge_2.initial_range.start_pos;
- TRACE("Read 2: " << (int) mapping_edge_2.mapped_range.start_pos << " - " << (int) mapping_edge_2.initial_range.start_pos << " = " << read2_start);
- int is = read2_start - read1_start + read2_size + is_delta;
- TRACE("IS: " << read2_start << " - " << read1_start << " + " << (int) is_delta << " = " << is);
-
- if (is > 0 || !ignore_negative_) {
- (*tmp_hists_[thread_index])[is] += 1;
- ++counted_.arr_[thread_index];
- } else {
- ++negative_.arr_[thread_index];
- }
-
- }
-
- }
- struct count_data {
- size_t total_;
- vector<size_t> arr_;
- count_data(): total_(0) {
- }
- count_data(size_t nthreads): total_(0), arr_(nthreads, 0) {
- }
- void inc(size_t i) {
- ++arr_[i];
- }
- void merge() {
- for (size_t i = 0; i < arr_.size(); ++i) {
- total_ += arr_[i];
- }
- }
- };
-
-private:
- const conj_graph_pack& gp_;
-
- HistType hist_;
- vector<HistType*> tmp_hists_;
-
- count_data total_;
- count_data counted_;
- count_data negative_;
-
- size_t edge_length_threshold_;
- bool ignore_negative_;
-};
-
-}
-
-
-#endif /* IS_COUNTER_HPP_ */
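
The heart of InsertSizeCounter is the formula applied when both reads of a pair map to the same sufficiently long edge: take the difference of the projected start positions, then add back the second read's length plus the clipped offsets (is_delta). A tiny self-contained sketch of just that arithmetic; the function and parameter names are illustrative only:

    #include <cassert>

    // Model of the insert-size computation in ProcessPairedRead above:
    // projected start = mapped start on the edge minus the offset inside the read.
    int EstimateInsertSize(int read1_mapped_start, int read1_initial_start,
                           int read2_mapped_start, int read2_initial_start,
                           int read2_size, int is_delta) {
        int read1_start = read1_mapped_start - read1_initial_start;
        int read2_start = read2_mapped_start - read2_initial_start;
        return read2_start - read1_start + read2_size + is_delta;
    }

    int main() {
        // Read 1 projects to edge position 100, read 2 to 350; read 2 is 100 bp
        // and nothing was clipped, so the estimated insert size is 350.
        assert(EstimateInsertSize(100, 0, 350, 0, 100, 0) == 350);
        return 0;
    }
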
diff --git a/src/debruijn/kmer_coverage_model.cpp b/src/debruijn/kmer_coverage_model.cpp
deleted file mode 100644
index ae6fb3a..0000000
--- a/src/debruijn/kmer_coverage_model.cpp
+++ /dev/null
@@ -1,379 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#include "kmer_coverage_model.hpp"
-
-#include "xmath.h"
-#include "logger/logger.hpp"
-#include "smooth.hpp"
-#include "verify.hpp"
-
-#include <boost/math/special_functions/zeta.hpp>
-#include <boost/math/distributions/normal.hpp>
-#include <boost/math/distributions/skew_normal.hpp>
-#include <boost/math/distributions/geometric.hpp>
-#include <boost/math/distributions/pareto.hpp>
-
-#include <nlopt/nlopt.hpp>
-
-#include <vector>
-
-#include <cstring>
-#include <cstdint>
-#include <cstddef>
-#include <cmath>
-
-namespace cov_model {
-using std::isfinite;
-
-static const size_t MaxCopy = 10;
-
-static double dzeta(double x, double p) {
- return pow(x, -p-1) / boost::math::zeta(p + 1);
-}
-
-static double perr(size_t i, double scale, double shape) {
- return pow((1 + shape*((double)(i-1))/scale), -1.0/shape) - pow((1 + shape*((double)i)/scale), -1.0/shape);
-}
-
-static double pgood(size_t i, double zp, double u, double sd, double shape,
- double *mixprobs = NULL) {
- double res = 0;
-
- for (unsigned copy = 0; copy < MaxCopy; ++copy) {
- boost::math::skew_normal snormal((copy + 1)* u, sd * sqrt(copy + 1), shape);
- // res += (mixprobs ? mixprobs[copy] : dzeta(copy + 1, zp)) * (boost::math::cdf(snormal, i + 1) - boost::math::cdf(snormal, i));
- res += (mixprobs ? mixprobs[copy] : dzeta(copy + 1, zp)) * boost::math::pdf(snormal, i);
- }
-
- return res;
-}
-
-class CovModelLogLike {
- const std::vector<size_t> &cov;
-
- public:
- CovModelLogLike(const std::vector<size_t> &cov)
- : cov(cov) {}
-
- int getN() const { return 7; };
-
- private:
-
- double eval_(const double *x) const {
- double zp = x[0], p = x[1], shape = x[2], u = x[3], sd = x[4], scale = x[5], shape2 = x[6];
-
- if (zp <= 1 || shape <= 0 || sd <= 0 || p < 1e-9 || p > 1-1e-9 || u <= 0 || scale <= 0 ||
- !isfinite(zp) || !isfinite(shape) || !isfinite(sd) || !isfinite(p) || !isfinite(u) ||
- !isfinite(scale) || !isfinite(shape2))
- return +std::numeric_limits<double>::infinity();
-
- std::vector<double> kmer_probs(cov.size());
-
- // Error
- for (size_t i = 0; i < kmer_probs.size(); ++i)
- kmer_probs[i] += p * perr(i + 1, scale, shape);
-
- // Good
- for (size_t i = 0; i < kmer_probs.size(); ++i)
- kmer_probs[i] += (1 - p) * pgood(i + 1, zp, u, sd, shape2);
-
- double res = 0;
- for (size_t i = 0; i < kmer_probs.size(); ++i)
- res += (double)(cov[i]) * log(kmer_probs[i]);
-
- return -res;
- }
-};
-
-struct CovModelLogLikeEMData {
- const std::vector<size_t> &cov;
- const std::vector<double> &z;
-};
-
-static double CovModelLogLikeEM(unsigned, const double *x, double *, void *data) {
- double zp = x[0], shape = x[1], u = x[2], sd = x[3], scale = x[4], shape2 = x[5];
-
- // INFO("Entry: " << x[0] << " " << x[1] << " " << x[2] << " " << x[3] << " " << x[4]);
-
- if (zp <= 1 || shape <= 0 || sd <= 0 || u <= 0 || scale <= 0 ||
- !isfinite(zp) || !isfinite(shape) || !isfinite(sd) || !isfinite(u) ||
- !isfinite(scale) || !isfinite(shape2))
- return -std::numeric_limits<double>::infinity();
-
- const std::vector<size_t> &cov = static_cast<CovModelLogLikeEMData*>(data)->cov;
- const std::vector<double> &z = static_cast<CovModelLogLikeEMData*>(data)->z;
-
- std::vector<double> kmer_probs(cov.size(), 0);
-
- // Error
- for (size_t i = 0; i < kmer_probs.size(); ++i) {
- if (cov[i] == 0)
- continue;
-
- kmer_probs[i] += z[i] * log(perr(i + 1, scale, shape));
- }
-
- // Good
- // Pre-compute mixing probabilities
- std::vector<double> mixprobs(MaxCopy, 0);
- for (unsigned copy = 0; copy < MaxCopy; ++copy)
- mixprobs[copy] = dzeta(copy + 1, zp);
-
- // Compute the density
- for (size_t i = 0; i < kmer_probs.size(); ++i) {
- if (cov[i] == 0)
- continue;
-
- double val = log(pgood(i + 1, zp, u, sd, shape2, &mixprobs[0]));
- if (!isfinite(val))
- val = -1000.0;
- kmer_probs[i] += (1 - z[i]) * val;
- }
-
- double res = 0;
- for (size_t i = 0; i < kmer_probs.size(); ++i)
- res += (double)(cov[i]) * kmer_probs[i];
-
- // INFO("f: " << res);
- return res;
- }
-
-
-static std::vector<double> EStep(const std::vector<double> &x,
- double p, size_t N) {
- double zp = x[0], shape = x[1], u = x[2], sd = x[3], scale = x[4], shape2 = x[5];
-
- std::vector<double> res(N);
- for (size_t i = 0; i < N; ++i) {
- double pe = p * perr(i + 1, scale, shape);
- res[i] = pe / (pe + (1 - p) * pgood(i + 1, zp, u, sd, shape2));
- if (!isfinite(res[i]))
- res[i] = 1.0;
- }
-
- return res;
-}
-
-// Estimate the coverage mean by finding the max past the
-// first valley.
-size_t KMerCoverageModel::EstimateValley() const {
- // Smooth the histogram
- std::vector<size_t> scov;
- math::Smooth3RS3R(scov, cov_);
-
- size_t Valley = scov[0];
-
- // Start finding the valley
- size_t Idx = 1;
- while (scov[Idx] < Valley && Idx < scov.size()) {
- Valley = scov[Idx];
- Idx += 1;
- }
- Idx -= 1;
-
- INFO("Kmer coverage valley at: " << Idx);
-
- return Idx;
-}
-
-void KMerCoverageModel::Fit() {
- VERIFY_MSG(cov_.size() > 10, "Invalid kmer coverage histogram");
-
- // Find the minimal coverage point using smoothed histogram.
- Valley_ = EstimateValley();
-
- // First estimate of coverage is the first maximum after the valley.
- MaxCov_ = Valley_ + 1;
- size_t MaxHist = cov_[MaxCov_];
- for (size_t i = Valley_ + 1; i < cov_.size(); ++i) {
- if (cov_[i] > MaxHist) {
- MaxHist = cov_[i];
- MaxCov_ = i;
- }
- }
- INFO("K-mer histogram maximum: " << MaxCov_);
-
- // Refine the estimate via median
- size_t AfterValley = 0, SecondValley = std::min(2*MaxCov_ - Valley_, cov_.size());
- for (size_t i = Valley_ + 1; i < SecondValley ; ++i)
- AfterValley += cov_[i];
-
- size_t ccov = 0;
- for (size_t i = Valley_ + 1; i < SecondValley; ++i) {
- if (ccov > AfterValley / 2) {
- MaxCov_ = std::max(i, MaxCov_);
- break;
- }
- ccov += cov_[i];
- }
-
- if (MaxCov_ - Valley_ < 3)
-        WARN("Too many erroneous kmers, the estimates might be unreliable");
-
- std::vector<size_t> mvals(1 + MaxCov_ - Valley_);
- mvals[0] = cov_[MaxCov_];
- size_t tmadcov = mvals[0];
- for (size_t i = 1; i < std::min(MaxCov_ - Valley_, cov_.size() - MaxCov_); ++i) {
- mvals[i] = cov_[MaxCov_ + i] + cov_[MaxCov_ - i];
- tmadcov += mvals[i];
- }
- size_t madcov = 0;
- double CovSd = sqrt(5.0 * (double)MaxCov_);
- for (size_t i = 0; i < MaxCov_ - Valley_ ; ++i) {
- if (madcov > tmadcov / 2) {
- CovSd = i;
- break;
- }
- madcov += mvals[i];
- }
- CovSd *= 1.4826;
- INFO("Estimated median coverage: " << MaxCov_ << ". Coverage mad: " << CovSd);
-
- // Estimate error probability as ratio of kmers before the valley.
- size_t BeforeValley = 0, Total = 0;
- double ErrorProb = 0;
- for (size_t i = 0; i < cov_.size(); ++i) {
- if (i <= Valley_)
- BeforeValley += cov_[i];
- Total += cov_[i];
- }
- ErrorProb = (double)BeforeValley / (double)Total;
- // Allow some erroneous / good kmers.
- ErrorProb = std::min(1-1e-3, ErrorProb);
- ErrorProb = std::max(1e-3, ErrorProb);
-
- TRACE("Total: " << Total << ". Before: " << BeforeValley);
- TRACE("p: " << ErrorProb);
-
- std::vector<double> x(6), lb(6), ub(6);
-
- x[0] = 3; lb[0] = 0; ub[0] = 2000;
- x[1] = 3; lb[1] = 0; ub[1] = 2000;
- x[2] = MaxCov_; lb[2] = 0; ub[2] = 2 * MaxCov_;
- x[3] = CovSd; lb[3] = MaxCov_ - Valley_; ub[3] = SecondValley;
- x[4] = 1; lb[4] = 0; ub[4] = 2000;
- x[5] = 0; lb[5] = -6; ub[5] = 6;
-
- INFO("Fitting coverage model");
- // Ensure that there will be at least 2 iterations.
- double PrevErrProb = 2;
- const double ErrProbThr = 1e-8;
- auto GoodCov = cov_;
- GoodCov.resize(std::min(cov_.size(), 5 * MaxCopy * MaxCov_ / 4));
- converged_ = true;
- unsigned it = 1;
- while (fabs(PrevErrProb - ErrorProb) > ErrProbThr) {
- // Recalculate the vector of posterior error probabilities
- std::vector<double> z = EStep(x, ErrorProb, GoodCov.size());
-
- // Recalculate the probability of error
- PrevErrProb = ErrorProb; ErrorProb = 0;
- for (size_t i=0; i < GoodCov.size(); ++i)
- ErrorProb += z[i] * (double)GoodCov[i];
- ErrorProb /= (double)Total;
-
- bool LastIter = fabs(PrevErrProb - ErrorProb) <= ErrProbThr;
-
- nlopt::opt opt(nlopt::LN_NELDERMEAD, 6);
- CovModelLogLikeEMData data = { GoodCov, z };
- opt.set_max_objective(CovModelLogLikeEM, &data);
- if (!LastIter)
- opt.set_maxeval(5 * 6 * it);
- opt.set_xtol_rel(1e-8);
- opt.set_ftol_rel(1e-8);
-
- double fMin;
- nlopt::result Results = nlopt::FAILURE;
- try {
- Results = opt.optimize(x, fMin);
- } catch (nlopt::roundoff_limited&) {
- }
-
- VERBOSE_POWER_T2(it, 1, "... iteration " << it);
- TRACE("Results: ");
- TRACE("Converged: " << Results << " " << "F: " << fMin);
-
- double zp = x[0], shape = x[1], u = x[2], sd = x[3], scale = x[4], shape2 = x[5];
- TRACE("zp: " << zp << " p: " << ErrorProb << " shape: " << shape << " u: " << u << " sd: " << sd << " scale: " << scale << " shape2: " << shape2);
-
- it += 1;
- }
-
- double delta = x[5] / sqrt(1 + x[5]*x[5]);
- mean_coverage_ = x[2] + x[3]*delta*sqrt(2/M_PI);
- sd_coverage_ = x[3]*sqrt(1-2*delta*delta/M_PI);
- INFO("Fitted mean coverage: " << mean_coverage_ << ". Fitted coverage std. dev: " << sd_coverage_);
-
- // Now let us check whether we have sane results
- for (size_t i = 0; i < x.size(); ++i)
- if (!isfinite(x[i])) {
- converged_ = false;
- break;
- }
-
- if (!isfinite(ErrorProb))
- converged_ = false;
-
-    // See if we can deduce a proper threshold
-
- // First, check whether initial estimate of Valley was sane.
- ErrorThreshold_ = 0;
- if (converged_ && Valley_ > x[2] && x[2] > 2) {
- Valley_ = (size_t)math::round(x[2] / 2.0);
- WARN("Valley value was estimated improperly, reset to " << Valley_);
- }
-
- // If the model converged, then use it to estimate the thresholds.
- if (converged_) {
- std::vector<double> z = EStep(x, ErrorProb, GoodCov.size());
-
- INFO("Probability of erroneous kmer at valley: " << z[Valley_]);
- converged_ = false;
- for (size_t i = 0; i < z.size(); ++i)
- if (z[i] > strong_probability_threshold_) //0.999
- LowThreshold_ = std::min(i + 1, Valley_);
- else if (z[i] < probability_threshold_) {//0.05?
- ErrorThreshold_ = std::max(i + 1, Valley_);
- converged_ = true;
- break;
- }
-
- #if 0
- for (size_t i = 0; i < z.size(); ++i) {
- double zp = x[0], shape = x[1], u = x[2], sd = x[3], scale = x[4], shape2 = x[5];
- double pe = ErrorProb * perr(i + 1, scale, shape);
- double pg = (1 - ErrorProb) * pgood(i + 1, zp, u, sd, shape2);
-
- fprintf(stderr, "%e %e %e %e\n", pe, pg, z[i], perr(i + 1, scale, shape));
- }
- #endif
- }
-
-    // See if we have a sane ErrorThreshold_ and fall back to something conservative if not.
- if (converged_) {
- INFO("Preliminary threshold calculated as: " << ErrorThreshold_);
- ErrorThreshold_ = (Valley_ < mean_coverage_ ?
- std::min(Valley_ + (size_t)(mean_coverage_ - Valley_) / 2, ErrorThreshold_) :
- Valley_);
- INFO("Threshold adjusted to: " << ErrorThreshold_);
- } else {
- ErrorThreshold_ = Valley_;
- LowThreshold_ = 1;
- WARN("Failed to determine erroneous kmer threshold. Threshold set to: " << ErrorThreshold_);
- }
-
- // Now the bonus: estimate the genome size!
- GenomeSize_ = 0;
- for (size_t i = ErrorThreshold_ - 1; i < GoodCov.size(); ++i)
- GenomeSize_ += GoodCov[i];
- GenomeSize_ /= 2;
-
- INFO("Estimated genome size (ignoring repeats): " << GenomeSize_);
-}
-
-};
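
Before any model fitting, KMerCoverageModel::Fit() above derives its starting point directly from the k-mer coverage histogram: walk down from the low-coverage end until the histogram stops decreasing (the valley separating error k-mers from genomic ones) and take the highest bin after the valley as the first coverage estimate. A simplified standalone sketch of that scan; the real code smooths the histogram first and later refines the estimate via a median:

    #include <algorithm>
    #include <cassert>
    #include <cstddef>
    #include <vector>

    struct CoverageEstimate {
        std::size_t valley;   // first local minimum of the histogram
        std::size_t max_cov;  // highest bin after the valley
    };

    CoverageEstimate EstimateFromHistogram(const std::vector<std::size_t>& hist) {
        // Walk down while the histogram keeps decreasing.
        std::size_t valley = 0;
        while (valley + 1 < hist.size() && hist[valley + 1] < hist[valley])
            ++valley;

        // The mode after the valley is the first coverage estimate.
        std::size_t max_cov = std::min(valley + 1, hist.size() - 1);
        for (std::size_t i = max_cov + 1; i < hist.size(); ++i)
            if (hist[i] > hist[max_cov])
                max_cov = i;

        return {valley, max_cov};
    }

    int main() {
        // Error k-mers pile up at low coverage; genomic k-mers peak around 8x.
        std::vector<std::size_t> hist = {900, 400, 120, 40, 60, 150, 300, 420, 500, 410, 200};
        CoverageEstimate e = EstimateFromHistogram(hist);
        assert(e.valley == 3 && e.max_cov == 8);
        return 0;
    }
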
diff --git a/src/debruijn/kmer_coverage_model.hpp b/src/debruijn/kmer_coverage_model.hpp
deleted file mode 100644
index 30a78fe..0000000
--- a/src/debruijn/kmer_coverage_model.hpp
+++ /dev/null
@@ -1,43 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#ifndef __KMER_COVERAGE_MODEL_HPP__
-#define __KMER_COVERAGE_MODEL_HPP__
-
-#include <vector>
-#include <cstddef>
-
-namespace cov_model {
-
-class KMerCoverageModel {
- const std::vector<size_t> &cov_;
- size_t MaxCov_, Valley_, ErrorThreshold_, LowThreshold_, GenomeSize_;
- double probability_threshold_, strong_probability_threshold_, mean_coverage_, sd_coverage_;
- bool converged_;
-
- public:
- KMerCoverageModel(const std::vector<size_t> &cov, double probability_threshold, double strong_probability_threshold)
- : cov_(cov), LowThreshold_(0), probability_threshold_(probability_threshold), strong_probability_threshold_(strong_probability_threshold),
- mean_coverage_(0.0), sd_coverage_(0.0), converged_(false) {}
-
- void Fit();
-
- size_t GetErrorThreshold() const { return ErrorThreshold_; }
- size_t GetLowThreshold() const { return LowThreshold_; }
- size_t GetGenomeSize() const { return GenomeSize_; }
- double GetMeanCoverage() const { return mean_coverage_; }
- double GetSdCoverage() const { return sd_coverage_; }
- bool converged() const { return converged_; }
-
- private:
- size_t EstimateValley() const;
-};
-
-};
-
-
-#endif
diff --git a/src/debruijn/kmer_mapper.hpp b/src/debruijn/kmer_mapper.hpp
deleted file mode 100644
index 355c71d..0000000
--- a/src/debruijn/kmer_mapper.hpp
+++ /dev/null
@@ -1,224 +0,0 @@
-/*
- * kmer_mapper.hpp
- *
- * Created on: Dec 4, 2013
- * Author: andrey
- */
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include "omni/omni_utils.hpp"
-#include "sequence/sequence_tools.hpp"
-#include "omni/path_processor.hpp"
-
-#include "runtime_k.hpp"
-#include "edge_index.hpp"
-
-#include <cstdlib>
-
-namespace debruijn_graph {
-template <class Graph, class Seq = runtime_k::RtSeq>
-class KmerMapper : public omnigraph::GraphActionHandler<Graph> {
- typedef omnigraph::GraphActionHandler<Graph> base;
- typedef typename Graph::EdgeId EdgeId;
- typedef Seq Kmer;
- typedef typename runtime_k::KmerMap<Kmer, Seq> MapType;
-
- MapType mapping_;
- size_t k_;
- bool verification_on_;
-
- bool CheckAllDifferent(const Sequence& old_s, const Sequence& new_s) const {
- set<Kmer> kmers;
- Kmer kmer = old_s.start<Kmer>(k_) >> 0;
- for (size_t i = k_ - 1; i < old_s.size(); ++i) {
- kmer <<= old_s[i];
- kmers.insert(kmer);
- }
- kmer = new_s.start<Kmer>(k_) >> 0;
- for (size_t i = k_ - 1; i < new_s.size(); ++i) {
- kmer <<= new_s[i];
- kmers.insert(kmer);
- }
- return kmers.size() == old_s.size() - k_ + 1 + new_s.size() - k_ + 1;
- }
-
- public:
-
- KmerMapper(const Graph& g, bool verification_on = true) :
- base(g, "KmerMapper"), mapping_(g.k() + 1), k_(g.k() + 1), verification_on_(verification_on) {}
-
- virtual ~KmerMapper() { }
-
- size_t get_k() const { return k_; }
-
- typename MapType::const_iterator begin() const {
- return mapping_.begin();
- }
-
- typename MapType::const_iterator end() const {
- return mapping_.end();
- }
-
- void Normalize() {
- std::vector<Kmer> all;
- for (auto it = begin(); it != end(); ++it) {
- all.push_back(it->first);
- }
- for (auto it = all.begin(); it != all.end(); ++it) {
- Normalize(*it);
- }
- }
-
- void Revert(const Kmer &kmer) {
- Kmer old_value = Substitute(kmer);
- if (old_value != kmer) {
- mapping_.erase(kmer);
- mapping_[old_value] = kmer;
- }
- }
-
- void Normalize(const Kmer &kmer) {
- mapping_[kmer] = Substitute(kmer);
- }
-
- bool CheckCanRemap(const Sequence& old_s, const Sequence& new_s) const {
- if(!CheckAllDifferent(old_s, new_s))
- return false;
- size_t old_length = old_s.size() - k_ + 1;
- size_t new_length = new_s.size() - k_ + 1;
- UniformPositionAligner aligner(old_s.size() - k_ + 1,
- new_s.size() - k_ + 1);
- Kmer old_kmer = old_s.start<Kmer>(k_);
- old_kmer >>= 0;
- for (size_t i = k_ - 1; i < old_s.size(); ++i) {
- old_kmer <<= old_s[i];
- size_t old_kmer_offset = i - k_ + 1;
- size_t new_kmer_offest = aligner.GetPosition(old_kmer_offset);
- if(old_kmer_offset * 2 + 1 == old_length && new_length % 2 == 0) {
- Kmer middle(k_ - 1, new_s, new_length / 2);
- if (typename Kmer::less2()(middle, !middle)) {
- new_kmer_offest = new_length - 1 - new_kmer_offest;
- }
- }
- Kmer new_kmer(k_, new_s, new_kmer_offest);
- auto it = mapping_.find(new_kmer);
- if (it != mapping_.end()) {
- if (Substitute(new_kmer) != old_kmer) {
- return false;
- }
- }
- }
- return true;
- }
-
- void RemapKmers(const Sequence& old_s, const Sequence& new_s) {
- VERIFY(this->IsAttached());
- size_t old_length = old_s.size() - k_ + 1;
- size_t new_length = new_s.size() - k_ + 1;
- UniformPositionAligner aligner(old_s.size() - k_ + 1,
- new_s.size() - k_ + 1);
- Kmer old_kmer = old_s.start<Kmer>(k_);
-
- for (size_t i = k_ - 1; i < old_s.size(); ++i) {
- // Instead of shifting right
- if (i != k_ - 1) {
- old_kmer <<= old_s[i];
- }
-
- size_t old_kmer_offset = i - k_ + 1;
- size_t new_kmer_offest = aligner.GetPosition(old_kmer_offset);
- if(old_kmer_offset * 2 + 1 == old_length && new_length % 2 == 0) {
- Kmer middle(unsigned(k_ - 1), new_s, new_length / 2);
- if(typename Kmer::less2()(middle, !middle)) {
- new_kmer_offest = new_length - 1 - new_kmer_offest;
- }
- }
- Kmer new_kmer(unsigned(k_), new_s, new_kmer_offest);
- auto it = mapping_.find(new_kmer);
- if (it != mapping_.end()) {
- if(verification_on_)
- VERIFY(Substitute(new_kmer) == old_kmer);
- mapping_.erase(it);
- }
- if(old_kmer != new_kmer)
- mapping_[old_kmer] = new_kmer;
- }
- }
-
- virtual void HandleGlue(EdgeId new_edge, EdgeId edge1, EdgeId edge2) {
- VERIFY(this->g().EdgeNucls(new_edge) == this->g().EdgeNucls(edge2));
- RemapKmers(this->g().EdgeNucls(edge1), this->g().EdgeNucls(edge2));
- }
-
- Kmer Substitute(const Kmer& kmer) const {
- VERIFY(this->IsAttached());
- Kmer answer = kmer;
- auto it = mapping_.find(answer);
- while (it != mapping_.end()) {
- if(verification_on_)
- VERIFY(it.first() != it.second());
- answer = it.second();
- it = mapping_.find(answer);
- }
- return answer;
- }
-
- void BinWrite(std::ostream& file) const {
- u_int32_t size = (u_int32_t) mapping_.size();
- file.write((const char *) &size, sizeof(u_int32_t));
-
- for (auto iter = mapping_.begin(); iter != mapping_.end(); ++iter) {
- Kmer::BinWrite(file, iter.first());
- Kmer::BinWrite(file, iter.second());
- }
- }
-
- void BinRead(std::istream& file) {
- mapping_.clear();
- u_int32_t size;
- file.read((char *) &size, sizeof(u_int32_t));
-
- for (u_int32_t i = 0; i < size; ++i) {
- Kmer key(k_);
- Kmer value(k_);
- Kmer::BinRead(file, &key);
- Kmer::BinRead(file, &value);
- mapping_[key] = value;
- }
- }
-
- bool CompareTo(KmerMapper<Graph, Kmer> const& m) {
- if (mapping_.size() != m.mapping_.size()) {
- INFO("Unequal sizes");
- }
- for (auto iter = mapping_.begin(); iter != mapping_.end(); ++iter) {
- auto cmp = m.mapping_.find(iter.first());
- if (cmp == m.mapping_.end() || cmp.second() != iter.second()) {
- return false;
- }
- }
- return true;
- }
-
- void clear() {
- mapping_.clear();
- }
-
- size_t size() const {
- return mapping_.size();
- }
-
-    // "turn_on = true" disables all verification checks
- void SetUnsafeMode(bool turn_on){
- verification_on_ = !turn_on;
- }
-};
-
-}
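
KmerMapper::Substitute() above resolves a k-mer by repeatedly following the remappings recorded during graph simplification until the current k-mer is no longer a key of the map. A minimal standalone model using std::string keys instead of RtSeq; it assumes the mapping has no cycles, which the original enforces with a per-step VERIFY:

    #include <cassert>
    #include <map>
    #include <string>

    // Follow the remapping chain until the k-mer maps to itself (i.e. is absent).
    std::string Substitute(const std::map<std::string, std::string>& mapping,
                           std::string kmer) {
        auto it = mapping.find(kmer);
        while (it != mapping.end()) {
            kmer = it->second;       // hop to the k-mer this one was glued onto
            it = mapping.find(kmer);
        }
        return kmer;
    }

    int main() {
        // Two rounds of simplification remapped ACG -> CGT and then CGT -> GTA.
        std::map<std::string, std::string> mapping = {{"ACG", "CGT"}, {"CGT", "GTA"}};
        assert(Substitute(mapping, "ACG") == "GTA");
        assert(Substitute(mapping, "TTT") == "TTT");  // unmapped k-mers stay put
        return 0;
    }
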
diff --git a/src/debruijn/kmer_mapper_logger.hpp b/src/debruijn/kmer_mapper_logger.hpp
deleted file mode 100644
index 1c373b7..0000000
--- a/src/debruijn/kmer_mapper_logger.hpp
+++ /dev/null
@@ -1,44 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-/*
- * sequence_mapping_logger.h
- *
- * Created on: Nov 27, 2012
- * Author: alex
- */
-
-#ifndef KMER_MAPPER_LOGGER_H_
-#define KMER_MAPPER_LOGGER_H_
-
-#include "omni/omni_utils.hpp"
-#include "standard_base.hpp"
-
-namespace debruijn {
-
-template<class Graph>
-class KmerMapperLogger : public omnigraph::GraphActionHandler<Graph> {
-public:
- typedef pair<Sequence, Sequence> MappedSeq;
- typedef typename Graph::EdgeId EdgeId;
-
- KmerMapperLogger(Graph& graph) : GraphActionHandler<Graph>(graph, "KmerMapperLogger") {}
- virtual ~KmerMapperLogger() {}
-
- virtual void HandleGlue(EdgeId new_edge, EdgeId edge1, EdgeId edge2) {
- log_.push_back(MappedSeq(this->g().EdgeNucls(edge1), this->g().EdgeNucls(edge2)));
- }
-
- const vector<MappedSeq>& log() const {
- return log_;
- }
-
- vector<MappedSeq> log_;
-};
-
-} /* namespace debruijn */
-#endif /* KMER_MAPPER_LOGGER_H_ */
diff --git a/src/debruijn/launch.hpp b/src/debruijn/launch.hpp
deleted file mode 100644
index 38fe4a1..0000000
--- a/src/debruijn/launch.hpp
+++ /dev/null
@@ -1,117 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include "standard.hpp"
-
-#include "config_struct.hpp"
-
-#include "graph_pack.hpp"
-#include "construction.hpp"
-#include "genomic_info_filler.hpp"
-#include "gap_closer.hpp"
-#include "simplification.hpp"
-#include "mismatch_correction.hpp"
-#include "pair_info_count.hpp"
-#include "second_phase_setup.hpp"
-#include "repeat_resolving.hpp"
-#include "distance_estimation.hpp"
-#include "pacbio_aligning.hpp"
-#include "stage.hpp"
-
-namespace spades {
-
-void assemble_genome() {
- INFO("SPAdes started");
- if (cfg::get().ds.meta && cfg::get().ds.reads.lib_count() != 1) {
-        ERROR("Sorry, the current version of metaSPAdes supports only a single paired-end library.");
- exit(239);
- }
-
- INFO("Starting from stage: " << cfg::get().entry_point);
-
- bool two_step_rr = cfg::get().two_step_rr && cfg::get().rr_enable && cfg::get().ds.meta;
- INFO("Two-step RR enabled: " << two_step_rr);
-
- StageManager SPAdes({cfg::get().developer_mode,
- cfg::get().load_from,
- cfg::get().output_saves});
-
- size_t read_index_cnt = cfg::get().ds.reads.lib_count();
- if (two_step_rr)
- read_index_cnt++;
-
- debruijn_graph::conj_graph_pack conj_gp(cfg::get().K,
- cfg::get().tmp_dir,
- read_index_cnt,
- cfg::get().ds.reference_genome,
- cfg::get().flanking_range,
- cfg::get().pos.max_mapping_gap,
- cfg::get().pos.max_gap_diff);
-
- if (cfg::get().need_mapping) {
- INFO("Will need read mapping, kmer mapper will be attached");
- conj_gp.kmer_mapper.Attach();
- }
- // Build the pipeline
- SPAdes.add(new debruijn_graph::Construction())
- .add(new debruijn_graph::GenomicInfoFiller());
- if (cfg::get().gap_closer_enable && cfg::get().gc.before_simplify)
- SPAdes.add(new debruijn_graph::GapClosing("early_gapcloser"));
-
- SPAdes.add(new debruijn_graph::Simplification(two_step_rr));
-
- if (cfg::get().gap_closer_enable && cfg::get().gc.after_simplify)
- SPAdes.add(new debruijn_graph::GapClosing("late_gapcloser"));
-
- SPAdes.add(new debruijn_graph::SimplificationCleanup());
- //currently cannot be used with two step rr
- if (cfg::get().correct_mismatches && !cfg::get().ds.meta)
- SPAdes.add(new debruijn_graph::MismatchCorrection());
- if (cfg::get().rr_enable) {
- if (two_step_rr) {
- if (cfg::get().use_intermediate_contigs)
- SPAdes.add(new debruijn_graph::PairInfoCount(true))
- .add(new debruijn_graph::DistanceEstimation(true))
- .add(new debruijn_graph::RepeatResolution(true))
- .add(new debruijn_graph::SecondPhaseSetup());
-
- SPAdes.add(new debruijn_graph::Simplification());
- }
-
- //begin pacbio
- bool run_pacbio = false;
- for (size_t i = 0; i < cfg::get().ds.reads.lib_count(); ++i) {
- if (cfg::get().ds.reads[i].is_pacbio_alignable()) {
- run_pacbio = true;
- break;
- }
- }
- if (run_pacbio) {
- //currently not integrated with two step rr process
- VERIFY(!two_step_rr);
- SPAdes.add(new debruijn_graph::PacBioAligning());
- }
- //end pacbio
-
- SPAdes.add(new debruijn_graph::PairInfoCount())
- .add(new debruijn_graph::DistanceEstimation())
- .add(new debruijn_graph::RepeatResolution());
- } else {
- SPAdes.add(new debruijn_graph::ContigOutput());
- }
-
- SPAdes.run(conj_gp, cfg::get().entry_point.c_str());
-
- // For informing spades.py about estimated params
- debruijn_graph::write_lib_data(path::append_path(cfg::get().output_dir, "final"));
-
- INFO("SPAdes finished");
-}
-
-}
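
assemble_genome() above composes the assembly pipeline by chaining add() calls on a stage manager and then running the queued stages in order. A minimal standalone sketch of that builder-style pattern; StagePipeline and the stage names are illustrative, not the real SPAdes classes:

    #include <iostream>
    #include <memory>
    #include <string>
    #include <utility>
    #include <vector>

    struct Stage {
        virtual ~Stage() = default;
        virtual void run() = 0;
    };

    struct NamedStage : Stage {
        std::string name;
        explicit NamedStage(std::string n) : name(std::move(n)) {}
        void run() override { std::cout << "Running " << name << "\n"; }
    };

    class StagePipeline {
        std::vector<std::unique_ptr<Stage>> stages_;
    public:
        // Chainable add(), mirroring how stages are queued above.
        StagePipeline& add(Stage* stage) {
            stages_.emplace_back(stage);
            return *this;
        }
        void run() {
            for (auto& stage : stages_) stage->run();
        }
    };

    int main() {
        StagePipeline pipeline;
        pipeline.add(new NamedStage("construction"))
                .add(new NamedStage("simplification"))
                .add(new NamedStage("repeat_resolution"));
        pipeline.run();
        return 0;
    }
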
diff --git a/src/debruijn/long_read_mapper.hpp b/src/debruijn/long_read_mapper.hpp
deleted file mode 100644
index acc1fc9..0000000
--- a/src/debruijn/long_read_mapper.hpp
+++ /dev/null
@@ -1,100 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-/*
- * long_read_mapper.hpp
- *
- * Created on: Jun 17, 2013
- * Author: andrey
- */
-
-#ifndef LONG_READ_MAPPER_HPP_
-#define LONG_READ_MAPPER_HPP_
-
-#include "long_read_storage.hpp"
-#include "sequence_mapper_notifier.hpp"
-
-namespace debruijn_graph {
-
-class SimpleLongReadMapper: public SequenceMapperListener {
-public:
- SimpleLongReadMapper(conj_graph_pack& gp, PathStorage<conj_graph_pack::graph_t>& storage)
- : gp_(gp), storage_(storage), path_finder_(gp_.g) {
- mapper_ = MapperInstance(gp_);
- }
-
- virtual ~SimpleLongReadMapper() {}
-
- void StartProcessLibrary(size_t threads_count) override {
- for (size_t i = 0; i < threads_count; ++i)
- buffer_storages_.emplace_back(gp_.g);
- }
-
- void StopProcessLibrary() override {
- for (size_t i = 0; i < buffer_storages_.size(); ++i) {
- MergeBuffer(i);
- }
- buffer_storages_.clear();
- }
-
- void MergeBuffer(size_t thread_index) override {
- DEBUG("Merge buffer " << thread_index << " with size " << buffer_storages_[thread_index].size());
- storage_.AddStorage(buffer_storages_[thread_index]);
- buffer_storages_[thread_index].Clear();
- DEBUG("Now size " << storage_.size());
- }
-
- void ProcessPairedRead(size_t ,
- const io::PairedReadSeq&,
- const MappingPath<EdgeId>& ,
- const MappingPath<EdgeId>&) override {
- //nothing to do
- }
-
- void ProcessPairedRead(size_t ,
- const io::PairedRead&,
- const MappingPath<EdgeId>& ,
- const MappingPath<EdgeId>&) override {
- //nothing to do
- }
-
- void ProcessSingleRead(size_t thread_index,
- const io::SingleRead&,
- const MappingPath<EdgeId>& read) override {
- ProcessSingleRead(thread_index, read);
- }
-
- void ProcessSingleRead(size_t thread_index,
- const io::SingleReadSeq&,
- const MappingPath<EdgeId>& read) override {
- ProcessSingleRead(thread_index, read);
- }
-
- PathStorage<conj_graph_pack::graph_t>& GetPaths() {
- return storage_;
- }
-
-private:
-
- void ProcessSingleRead(size_t thread_index, const MappingPath<EdgeId>& read) {
- vector<vector<EdgeId>> paths = path_finder_.FindReadPathWithGaps(read);
- for(auto path : paths) {
- buffer_storages_[thread_index].AddPath(path, 1, false);
- }
- }
-
- conj_graph_pack& gp_;
- PathStorage<conj_graph_pack::graph_t>& storage_;
- std::shared_ptr<const NewExtendedSequenceMapper<conj_graph_pack::graph_t,
- conj_graph_pack::index_t> > mapper_;
- ReadPathFinder<conj_graph_pack::graph_t> path_finder_;
- std::vector<PathStorage<conj_graph_pack::graph_t> > buffer_storages_;
-};
-
-}/*longreads*/
-
-#endif /* LONG_READ_MAPPER_HPP_ */
diff --git a/src/debruijn/long_read_storage.hpp b/src/debruijn/long_read_storage.hpp
deleted file mode 100644
index 26f1706..0000000
--- a/src/debruijn/long_read_storage.hpp
+++ /dev/null
@@ -1,376 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-/*
- * long_edge_storage.hpp
- *
- * Created on: Feb 7, 2013
- * Author: lab42
- */
-
-#pragma once
-
-#include <algorithm>
-
-namespace debruijn_graph {
-
-template<class Graph>
-class PathInfo {
-public:
- typedef typename Graph::EdgeId EdgeId;
- vector<EdgeId> path;
-
-private:
- mutable size_t w;
-
-public:
- vector<EdgeId> getPath() const {
- return path;
- }
-
- size_t getWeight() const {
- return w;
- }
-
- void increaseWeight(int addition = 1) const {
- w += addition;
- }
-
- bool operator<(const PathInfo<Graph> &other) const {
- return path < other.path;
- }
-
- PathInfo(const vector<EdgeId> &p, size_t weight = 0) :
- path(p), w(weight) {
- }
- PathInfo(const PathInfo<Graph> &other) {
- path = other.path;
- w = other.w;
- }
-
- string str(Graph &g_) {
- stringstream s;
- for(auto iter = path.begin(); iter != path.end(); iter ++ ){
- s << g_.int_id(*iter) << " ";
- }
- return s.str();
- }
-
-};
-
-template<class Graph>
-class PathStorage {
- friend class PathInfo<Graph> ;
- typedef typename Graph::EdgeId EdgeId;
- typedef map<EdgeId, set<PathInfo<Graph> > > InnerIndex;
-private:
- Graph &g_;
- InnerIndex inner_index_;
- const size_t kLongEdgeForStats = 500;
-
- void HiddenAddPath(const vector<EdgeId> &p, int w){
- if (p.size() == 0 ) return;
- for (typename set<PathInfo<Graph> >::iterator iter = inner_index_[p[0]].begin(); iter != inner_index_[p[0]].end(); ++iter) {
-
- if (iter->path == p) {
- iter->increaseWeight(w);
- return;
- }
- }
- inner_index_[p[0]].insert(PathInfo<Graph>(p, w));
- size_++;
- }
-
-public:
-
- PathStorage(Graph &g)
- : g_(g),
- inner_index_(),
- size_(0) {
- }
- PathStorage(const PathStorage & p)
- : g_(p.g_),
- inner_index_(),
- size_(0) {
- for (auto iter = p.inner_index_.begin(); iter != p.inner_index_.end();
- iter++) {
- for (auto j_iter = iter->second.begin();
- j_iter != iter->second.end(); j_iter++) {
- this->AddPath(j_iter->path, (int) j_iter->getWeight());
- }
- }
- }
- void ReplaceEdges(map<EdgeId, EdgeId> &old_to_new){
- map<int, EdgeId> tmp_map;
-// for (auto iter = g_.SmartEdgeBegin(); !iter.IsEnd(); ++iter ){
-// tmp_map[g_.int_id(*iter)] = *iter;
-// }
- InnerIndex new_index;
- for (auto iter = inner_index_.begin(); iter != inner_index_.end(); iter++) {
- auto tmp = iter->second;
- EdgeId new_first;
- if (old_to_new.find(iter->first) == old_to_new.end())
- new_first = iter->first;
- else {
- DEBUG("new first edge: "<< g_.int_id(old_to_new[iter->first]) << " with " << tmp.size() << " edges ");
- new_first = old_to_new[iter->first];
- }
- set<PathInfo<Graph> > new_tmp;
- for (auto j_iter = tmp.begin(); j_iter != tmp.end(); j_iter++) {
- PathInfo<Graph> pi = *(j_iter);
- for (size_t k = 0; k < pi.path.size(); k++)
- if (old_to_new.find(pi.path[k]) != old_to_new.end()) {
-// INFO(g_.int_id(old_to_new[pi.path[k]]));
- pi.path[k] = old_to_new[pi.path[k]];
- }
- DEBUG(pi.str(g_));
- new_tmp.insert(pi);
-
- }
- if (new_first != iter->first) {
-            TRACE("and new_tmp.size: "<< new_tmp.size());
- }
- if (new_index.find(new_first) == new_index.end()) {
- new_index[new_first] = new_tmp;
- } else {
- for (auto j_iter = new_tmp.begin(); j_iter != new_tmp.end(); j_iter++) {
- new_index[new_first].insert(*j_iter);
- }
- }
-
- }
-
- inner_index_ = new_index;
- }
-
- void AddPath(const vector<EdgeId> &p, int w, bool add_rc = false) {
- HiddenAddPath(p, w);
- if (add_rc) {
- vector<EdgeId> rc_p(p.size());
- for (size_t i = 0; i < p.size(); i++)
- rc_p[i] = g_.conjugate(p[p.size() - 1 - i]);
- HiddenAddPath(rc_p, w);
- }
- }
- void DumpToFile(const string filename) const{
- map <EdgeId, EdgeId> auxilary;
- DumpToFile(filename, auxilary);
- }
- void DumpToFile(const string filename, map<EdgeId, EdgeId> &replacement, size_t stats_weight_cutoff = 1, bool need_log = false) const {
- ofstream filestr(filename);
- set<EdgeId> continued_edges;
-
- for(auto iter = inner_index_.begin(); iter != inner_index_.end(); ++iter){
- filestr<< iter->second.size() << endl;
- int non1 = 0;
- for (auto j_iter = iter->second.begin(); j_iter != iter->second.end(); ++j_iter) {
- filestr << " Weight: " << j_iter->getWeight();
- if (j_iter->getWeight() > stats_weight_cutoff)
- non1++;
-
- filestr << " length: " << j_iter->path.size() << " ";
- for (auto p_iter = j_iter->path.begin(); p_iter != j_iter->path.end(); ++p_iter) {
- if (p_iter != j_iter->path.end() - 1 && j_iter->getWeight() > stats_weight_cutoff) {
- continued_edges.insert(*p_iter);
- }
-
- filestr << g_.int_id(*p_iter) << "(" << g_.length(*p_iter) << ") ";
- }
- filestr << endl;
- }
- filestr << endl;
- }
-
- int noncontinued = 0;
- int long_gapped = 0;
- int continued = 0;
- if (need_log) {
- for (auto iter = g_.ConstEdgeBegin(); !iter.IsEnd(); ++iter) {
- if (g_.length(*iter) > kLongEdgeForStats) {
- if (!g_.IsDeadEnd(g_.EdgeEnd(*iter))) {
- if (continued_edges.find(*iter) == continued_edges.end()) {
- if ((replacement.find(*iter) != replacement.end() &&
- continued_edges.find(replacement[*iter]) != continued_edges.end())) {
-                            TRACE("found in replacement, edges " << g_.int_id(*iter) << " " <<
- g_.int_id(replacement[*iter]) << " skipping ");
- continue;
- }
- TRACE("noncontinued end left " << g_.int_id(*iter));
- noncontinued++;
- } else
- continued++;
- } else {
- TRACE("dead end left " << g_.int_id(*iter));
- long_gapped++;
- }
- }
- }
-            INFO("After PacBio (long read) alignment, for edges longer than " << kLongEdgeForStats << ":");
- INFO("No continuation found for " << noncontinued + long_gapped << " edges of " <<
- noncontinued + continued + long_gapped);
- }
- }
-
- vector<PathInfo<Graph> > GetAllPaths() const {
- vector<PathInfo<Graph> > res;
- for (auto iter = inner_index_.begin(); iter != inner_index_.end();
- ++iter) {
- for (auto j_iter = iter->second.begin();
- j_iter != iter->second.end(); ++j_iter) {
-
- res.push_back(*j_iter);
- }
- }
- return res;
- }
-
-
- vector<PathInfo<Graph> > GetAllPathsNoConjugate() {
- vector<PathInfo<Graph> > res;
-
- std::set< PathInfo<Graph> > added;
- for (auto iter = inner_index_.begin(); iter != inner_index_.end(); ++iter) {
- for (auto j_iter = iter->second.begin(); j_iter != iter->second.end(); ++j_iter) {
- if (added.count(*j_iter) > 0) {
- continue;
- }
-
- added.insert(*j_iter);
- vector<EdgeId> rc_p(j_iter->path.size()) ;
- for (size_t i = 0; i < j_iter->path.size(); i++) {
- rc_p[i] = g_.conjugate(j_iter->path[j_iter->path.size() - 1 - i]);
- }
- added.insert(PathInfo<Graph>(rc_p, j_iter->getWeight()));
-
- res.push_back(*j_iter);
- }
- }
- return res;
- }
-
-
- void LoadFromFile(const string s, bool force_exists = true) {
- FILE* file = fopen(s.c_str(), "r");
- if (force_exists) {
- VERIFY(file != NULL);
- } else if (file == NULL) {
- INFO("Long reads not found, skipping");
- return;
- }
- fclose(file);
-
- INFO("Loading long reads alignment...");
- ifstream filestr(s);
- INFO("loading from " << s);
- map<size_t, EdgeId> tmp_map;
- for (auto iter = g_.ConstEdgeBegin(); !iter.IsEnd(); ++iter) {
- tmp_map[g_.int_id(*iter)] = *iter;
- }
- int fl;
-
- file = fopen((s).c_str(), "r");
- char ss[14];
- while (!feof(file)) {
- int n;
-
- fl = fscanf(file, "%d\n", &n);
- if (fl != 1)
- break;
- TRACE(n);
- for (int i = 0; i < n; i++) {
-
- int w = -1, l = -1;
- fl = fscanf(file, "Weight: %d length: %d", &w, &l);
- TRACE(w << " " << l);
- VERIFY(fl == 2);
- vector<EdgeId> p;
- for (int j = 0; j < l; j++) {
- size_t e;
- int x;
- fl = fscanf(file, "%zu(%d)", &e, &x);
- VERIFY(fl == 2);
- VERIFY(tmp_map.find(e) != tmp_map.end());
- p.push_back(tmp_map[e]);
- }
- fl = fscanf(file, "%[^\n]\n", ss);
- TRACE(ss[0]);
- AddPath(p, w);
- }
- }
- fclose(file);
- INFO("Loading finished.");
- }
-
- void AddStorage(PathStorage<Graph> & to_add) {
-
- for(auto iter = to_add.inner_index_.begin(); iter != to_add.inner_index_.end(); iter++) {
- for(auto j_iter = iter->second.begin(); j_iter != iter->second.end(); j_iter ++) {
- this->AddPath(j_iter->path, (int) j_iter->getWeight());
- }
- }
- }
-
- void Clear() {
- inner_index_.clear();
- size_ = 0;
- }
-
- size_t size() {
- return size_;
- }
-
-// typename InnerIndex::iterator begin() const {
-// return inner_index.begin();
-// }
-//
-// typename InnerIndex::iterator end() const {
-// return inner_index.end();
-// }
-// typename InnerIndex::iterator operator*(){
-// return this->first;
-// }
-private:
- size_t size_;
-};
-
-template<class Graph>
-class LongReadContainer {
- Graph& g_;
- vector<PathStorage<Graph>> data_;
-
-public:
-
- LongReadContainer(Graph& g, size_t count = 0): g_(g) {
- for (size_t i = 0; i < count; ++i) {
- data_.emplace_back(g_);
- }
- }
-
- PathStorage<Graph>& operator[](size_t index) {
- return data_[index];
- }
-
- const PathStorage<Graph>& operator[](size_t index) const {
- return data_[index];
- }
-
- size_t size() const {
- return data_.size();
- }
-
- void Clear() {
- for (auto& storage : data_) {
- storage.Clear();
- }
- }
-
-};
-
-
-}
-
-
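
PathStorage above accumulates long-read paths by weight: re-adding a path that is already stored only bumps its weight instead of creating a duplicate entry. A toy standalone model of that bookkeeping; the original additionally keys paths by their first edge and can insert the reverse-complement path as well:

    #include <cassert>
    #include <map>
    #include <vector>

    // Edges are plain ints here instead of graph EdgeIds.
    class ToyPathStorage {
        std::map<std::vector<int>, int> weights_;
    public:
        void AddPath(const std::vector<int>& path, int weight = 1) {
            if (path.empty()) return;
            weights_[path] += weight;   // existing path: just increase its weight
        }
        int Weight(const std::vector<int>& path) const {
            auto it = weights_.find(path);
            return it == weights_.end() ? 0 : it->second;
        }
    };

    int main() {
        ToyPathStorage storage;
        storage.AddPath({1, 5, 7});
        storage.AddPath({1, 5, 7});   // same path seen in another long read
        storage.AddPath({1, 9});
        assert(storage.Weight({1, 5, 7}) == 2);
        assert(storage.Weight({1, 9}) == 1);
        return 0;
    }
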
diff --git a/src/debruijn/main.cpp b/src/debruijn/main.cpp
deleted file mode 100644
index 4306f57..0000000
--- a/src/debruijn/main.cpp
+++ /dev/null
@@ -1,173 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-/*
- * Assembler Main
- */
-#include "standard.hpp"
-#include "logger/log_writers.hpp"
-
-#include "segfault_handler.hpp"
-#include "stacktrace.hpp"
-#include "launch.hpp"
-#include "memory_limit.hpp"
-#include "copy_file.hpp"
-#include "perfcounter.hpp"
-#include "runtime_k.hpp"
-
-#include "config_struct.hpp"
-#include "version.hpp"
-
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <unistd.h>
-
-void link_output(std::string const& link_name) {
- if (!cfg::get().run_mode)
- return;
-
- std::string link = cfg::get().output_root + link_name;
- unlink(link.c_str());
- if (symlink(cfg::get().output_suffix.c_str(), link.c_str()) != 0)
-        WARN("Failed to create symlink \"" << link << "\" to the current launch");
-}
-
-void link_previous_run(std::string const& previous_link_name, std::string const& link_name) {
- if (!cfg::get().run_mode)
- return;
-
- char buf[255];
-
- std::string link = cfg::get().output_dir + previous_link_name;
- unlink(link.c_str());
- ssize_t count = readlink((cfg::get().output_root + link_name).c_str(), buf, sizeof(buf) - 1);
- if (count >= 0){
- buf[count] = '\0';
- std::string previous_run("../");
- previous_run = previous_run + buf;
- if (symlink(previous_run.c_str(), link.c_str()) != 0) {
-            DEBUG("Failed to create symlink \"" << link << "\" to the previous launch: " << previous_run);
- }
- } else {
-        DEBUG("Failed to create symlink \"" << link << "\" to the previous launch");
- }
-}
-
-struct on_exit_output_linker {
- on_exit_output_linker(std::string const& link_name) :
- link_name_(link_name) { }
-
- ~on_exit_output_linker() {
- link_previous_run("previous", link_name_);
- link_output(link_name_);
- }
-
- private:
- std::string link_name_;
-};
-
-void copy_configs(string cfg_filename, string to) {
- if (!cfg::get().run_mode)
- return;
-
- using namespace debruijn_graph;
-
- if (!make_dir(to)) {
-        WARN("Could not create directory to copy config files into");
- }
- path::copy_files_by_ext(path::parent_path(cfg_filename), to, ".info", true);
-}
-
-void load_config(string cfg_filename) {
- path::CheckFileExistenceFATAL(cfg_filename);
-
- cfg::create_instance(cfg_filename);
-
- if (!cfg::get().project_name.empty()) {
- make_dir(cfg::get().output_base + cfg::get().project_name);
- }
-
- make_dir(cfg::get().output_root);
- make_dir(cfg::get().tmp_dir);
-
- make_dir(cfg::get().output_dir);
- if (cfg::get().developer_mode)
- make_dir(cfg::get().output_saves);
-
- make_dir(cfg::get().temp_bin_reads_path);
-
- string path_to_copy = path::append_path(cfg::get().output_dir, "configs");
- copy_configs(cfg_filename, path_to_copy);
-}
-
-void create_console_logger(string cfg_filename) {
- using namespace logging;
-
- string log_props_file = cfg::get().log_filename;
-
- if (!path::FileExists(log_props_file))
- log_props_file = path::append_path(path::parent_path(cfg_filename), cfg::get().log_filename);
-
- logger *lg = create_logger(path::FileExists(log_props_file) ? log_props_file : "");
- lg->add_writer(std::make_shared<console_writer>());
- attach_logger(lg);
-}
-
-int main(int /*argc*/, char** argv) {
- perf_counter pc;
-
- const size_t GB = 1 << 30;
-
- segfault_handler sh(bind(link_output, "latest"));
-
- srand(42);
- srandom(42);
-
- try {
- using namespace debruijn_graph;
-
- string cfg_filename = argv[1];
-
- load_config (cfg_filename);
- create_console_logger(cfg_filename);
-
- on_exit_output_linker try_linker("latest");
-
- VERIFY(cfg::get().K >= runtime_k::MIN_K && cfg::get().K < runtime_k::MAX_K);
- VERIFY(cfg::get().K % 2 != 0);
-
- // read configuration file (dataset path etc.)
-
- limit_memory(cfg::get().max_memory * GB);
-
- // assemble it!
- INFO("Starting SPAdes, built from " SPADES_GIT_REFSPEC ", git revision " SPADES_GIT_SHA1);
- INFO("Assembling dataset (" << cfg::get().dataset_file << ") with K=" << cfg::get().K);
-
- spades::assemble_genome();
-
- link_output("latest_success");
- } catch (std::bad_alloc const& e) {
- std::cerr << "Not enough memory to run SPAdes. " << e.what() << std::endl;
- return EINTR;
- } catch (std::exception const& e) {
- std::cerr << "Exception caught " << e.what() << std::endl;
- return EINTR;
- } catch (...) {
- std::cerr << "Unknown exception caught " << std::endl;
- return EINTR;
- }
-
- unsigned ms = (unsigned)pc.time_ms();
- unsigned secs = (ms / 1000) % 60;
- unsigned mins = (ms / 1000 / 60) % 60;
- unsigned hours = (ms / 1000 / 60 / 60);
- INFO("Assembling time: " << hours << " hours " << mins << " minutes " << secs << " seconds");
-
- // OK
- return 0;
-}
diff --git a/src/debruijn/mismatch_correction.cpp b/src/debruijn/mismatch_correction.cpp
deleted file mode 100644
index 8371205..0000000
--- a/src/debruijn/mismatch_correction.cpp
+++ /dev/null
@@ -1,27 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#include "mismatch_correction.hpp"
-
-#include "mismatch_shall_not_pass.hpp"
-#include "read_converter.hpp"
-
-namespace debruijn_graph {
-
-void MismatchCorrection::run(conj_graph_pack &gp, const char*) {
- gp.EnsureBasicMapping();
- std::vector<size_t> libs;
- for (size_t i = 0; i < cfg::get().ds.reads.lib_count(); ++i) {
- if (cfg::get().ds.reads[i].is_mismatch_correctable())
- libs.push_back(i);
- }
- auto streams = single_binary_readers_for_libs(libs, true, true);
- size_t corrected = MismatchShallNotPass<conj_graph_pack, io::SingleReadSeq>(gp, 2).ParallelStopAllMismatches(streams, 1);
- INFO("Corrected " << corrected << " nucleotides");
-}
-
-}
diff --git a/src/debruijn/mismatch_correction.hpp b/src/debruijn/mismatch_correction.hpp
deleted file mode 100644
index 6f94baa..0000000
--- a/src/debruijn/mismatch_correction.hpp
+++ /dev/null
@@ -1,23 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include "stage.hpp"
-
-namespace debruijn_graph {
-
-class MismatchCorrection : public spades::AssemblyStage {
- public:
- MismatchCorrection()
- : AssemblyStage("Mismatch Correction", "mismatch_correction") {}
-
- void run(conj_graph_pack &gp, const char*);
-};
-
-}
-
diff --git a/src/debruijn/mismatch_shall_not_pass.hpp b/src/debruijn/mismatch_shall_not_pass.hpp
deleted file mode 100644
index 4861aac..0000000
--- a/src/debruijn/mismatch_shall_not_pass.hpp
+++ /dev/null
@@ -1,339 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include "omni/omni_utils.hpp"
-#include "omni/omni_tools.hpp"
-#include "omni/id_track_handler.hpp"
-#include "logger/logger.hpp"
-
-#include "runtime_k.hpp"
-#include "sequence_mapper.hpp"
-
-namespace debruijn_graph {
-
-namespace mismatches {
-struct NuclCount {
- size_t counts_[4];
- NuclCount() {
- memset(counts_, 0, sizeof(counts_));
- }
-
- size_t &operator[](size_t nucl) {
- return counts_[nucl];
- }
-
- NuclCount &operator+=(const NuclCount &other) {
- counts_[0] += other.counts_[0];
- counts_[1] += other.counts_[1];
- counts_[2] += other.counts_[2];
- counts_[3] += other.counts_[3];
- return *this;
- }
-};
-
-struct MismatchEdgeInfo {
- NuclCount operator[](size_t i) const {
- auto it = info_.find(i);
- if(it == info_.end())
- return NuclCount();
- else
- return it->second;
- }
-
- void operator+=(const MismatchEdgeInfo &other) {
- for(auto it = other.info_.begin(); it != other.info_.end(); ++it) {
- info_[it->first] += it->second;
- }
- }
-
- void IncIfContains(size_t position, size_t nucl) {
- auto it = info_.find(position);
- if(it != info_.end()) {
- it->second[nucl]++;
- }
- }
-
- void AddPosition(size_t position) {
-        info_[position]; // creates an entry with a default value if the key is not already present
- }
-
- public:
- map<size_t, NuclCount> info_;
-};
-
-template<typename EdgeId>
-class MismatchStatistics {
- private:
- typedef typename map<EdgeId, MismatchEdgeInfo>::const_iterator const_iterator;
- map<EdgeId, MismatchEdgeInfo> statistics_;
-
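- // Scans the k-mer remapping table: for every remapped k-mer pair that differs in a
- // moderate number of positions (and whose mismatches are not concentrated in the
- // flanks, which would indicate an erroneous mapping), registers each substituted
- // position on the target edge as a potential mismatch to be voted on later.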
- template<class graph_pack>
- void CollectPotensialMismatches(const graph_pack &gp) {
- auto &kmer_mapper = gp.kmer_mapper;
- for(auto it = kmer_mapper.begin(); it != kmer_mapper.end(); ++it) {
- runtime_k::RtSeq from = it.first();
- runtime_k::RtSeq to = it.second();
- size_t cnt = 0;
- size_t cnt_arr[4];
- for(size_t i = 0; i < 4; i++)
- cnt_arr[i] = 0;
- for(size_t i = 0; i < from.size(); i++) {
- if(from[i] != to[i]) {
- cnt++;
- cnt_arr[(i * 4)/from.size()] ++;
- }
- }
- //last two conditions - to avoid excessive indels.
- //if two-thirds of the nucleotides in the first/last quarter are mismatches, the mapping is considered erroneous
-
- if(cnt >= 1 && cnt <= from.size()/3 && cnt_arr[0] <= from.size()/6 && cnt_arr[3] <= from.size()/6) {
- for(size_t i = 0; i < from.size(); i++) {
- if(from[i] != to[i] && gp.index.contains(to)) {
- pair<EdgeId, size_t> position = gp.index.get(to);
- statistics_[position.first].AddPosition(position.second + i);
- }
- }
- }
- }
- for (auto it = gp.g.ConstEdgeBegin(); !it.IsEnd(); ++it){
- if (gp.g.length(*it) < cfg::get().max_repeat_length) {
- // INFO("edge id " <<gp.g.int_id(*it) << " added to stat" );
- // for(size_t i = 0; i < gp.g.length(*it) + gp.g.k(); i++)
- // statistics_[*it].AddPosition(i);
- }
- }
- }
-
- void operator+=(const MismatchStatistics<EdgeId> &other) {
- for(auto it = other.statistics_.begin(); it != other.statistics_.end(); ++it) {
- statistics_[it->first] += it->second;
- }
- }
-
- public:
- template<class graph_pack>
- MismatchStatistics(const graph_pack &gp) {
- CollectPotensialMismatches(gp);
- }
-
- const_iterator begin() const {
- return statistics_.begin();
- }
-
- const_iterator end() const {
- return statistics_.end();
- }
-
- const_iterator find(const EdgeId &edge) const {
- return statistics_.find(edge);
- }
-
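- // Maps each read to the graph; for reads mapped to a single edge with at most k/3
- // mismatches, votes for the observed nucleotide at every covered position that was
- // previously registered as a potential mismatch.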
- template<class graph_pack, class read_type>
- void Count(io::ReadStream<read_type>& stream, const graph_pack &gp) {
- stream.reset();
- DEBUG("count started");
- auto sm = MapperInstance(gp);
- DEBUG("seq mapper created");
- while(!stream.eof()) {
- read_type read;
- stream >> read;
- const Sequence &s_read = read.sequence();
- omnigraph::MappingPath<EdgeId> path = sm->MapSequence(s_read);
- TRACE("read mapped");
- if(path.size() == 1 && path[0].second.initial_range.size() == path[0].second.mapped_range.size()) {
- Range initial_range = path[0].second.initial_range;
- Range mapped_range = path[0].second.mapped_range;
- const Sequence &s_edge = gp.g.EdgeNucls(path[0].first);
- size_t len = initial_range.size() + gp.g.k();
- size_t cnt = 0;
- for(size_t i = 0; i < len; i++) {
- if(s_read[initial_range.start_pos + i] != s_edge[mapped_range.start_pos + i]) {
- cnt++;
- }
- }
- if(cnt <= gp.g.k() / 3) {
- TRACE("statistics changing");
- auto it = statistics_.find(path[0].first);
- if(it == statistics_.end()) {
- // if (gp.g.length(path[0].first) < 4000)
- // WARN ("id "<< gp.g.length(path[0].first)<<" " << len);
- continue;
- }
- for(size_t i = 0; i < len; i++) {
- size_t nucl_code = s_read[initial_range.start_pos + i];
- it->second.IncIfContains(mapped_range.start_pos + i, nucl_code);
- }
- }
- }
- }
- }
-
- template<class graph_pack, class read_type>
- void ParallelCount(io::ReadStreamList<read_type> &streams, const graph_pack &gp) {
- size_t nthreads = streams.size();
- std::vector<MismatchStatistics<EdgeId>*> statistics(nthreads);
-#pragma omp parallel for num_threads(nthreads) shared(streams, statistics)
- for (size_t i = 0; i < nthreads; ++i) {
- statistics[i] = new MismatchStatistics<EdgeId>(*this);
- DEBUG("statistics created thread " << i);
- statistics[i]->Count(streams[i], gp);
- DEBUG("count finished thread " << i);
- }
-
- INFO("Finished collecting potential mismatches positions");
- for (size_t i = 0; i < statistics.size(); i++) {
- *this += *statistics[i];
- delete statistics[i];
- }
- }
-};
-}
-
-template<class graph_pack, class read_type>
-class MismatchShallNotPass {
- private:
- typedef typename graph_pack::graph_t Graph;
- typedef typename Graph::EdgeId EdgeId;
- typedef typename Graph::VertexId VertexId;
- typedef runtime_k::RtSeq Kmer;
-
- graph_pack &gp_;
- double relative_threshold_;
-
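- // Splits the edge so that the affected (2k+1)-mer becomes a separate edge, builds its
- // corrected copy and glues the erroneous edge onto it; if the corrected k-mers cannot
- // be remapped safely the edge is left unchanged. Returns the remaining prefix edge
- // (or the glued edge when the mismatch lies close to the edge start).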
- EdgeId CorrectNucl(EdgeId edge, size_t position, char nucl) {
- VERIFY(position >= gp_.g.k());
- if(position + 1 < gp_.g.length(edge)) {
- edge = gp_.g.SplitEdge(edge, position + 1).first;
- }
- EdgeId mismatch = edge;
- if(position > gp_.g.k()) {
- auto tmp = gp_.g.SplitEdge(edge, position - gp_.g.k());
- edge = tmp.first;
- mismatch = tmp.second;
- }
- const Sequence& s_mm = gp_.g.EdgeNucls(mismatch);
- Sequence correct = s_mm.Subseq(0, gp_.g.k()) + Sequence(string(1, nucl)) + s_mm.Subseq(gp_.g.k() + 1, gp_.g.k() * 2 + 1);
- if(!gp_.kmer_mapper.CheckCanRemap(s_mm, correct)) {
- return edge;
- }
- VERIFY(nucl != s_mm[gp_.g.k()]);
- EdgeId correct_edge = gp_.g.AddEdge(gp_.g.EdgeStart(mismatch), gp_.g.EdgeEnd(mismatch), correct);
- if (position > gp_.g.k()) {
- gp_.g.GlueEdges(mismatch, correct_edge);
- return edge;
- } else {
- return gp_.g.GlueEdges(mismatch, correct_edge);
- }
- }
-
- EdgeId CorrectNucls(EdgeId edge, const std::vector<pair<size_t, char>> &mismatches) {
- for (auto it = mismatches.rbegin(); it != mismatches.rend(); ++it) {
- edge = CorrectNucl(edge, it->first, it->second);
- }
- EdgeId tmp = Compressor<Graph>(gp_.g).CompressVertexEdgeId(gp_.g.EdgeEnd(edge));
- if (tmp == EdgeId(0))
- return edge;
- else
- return tmp;
- }
-
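- // For every edge position, picks the best-supported alternative nucleotide; a
- // correction is scheduled when its support exceeds relative_threshold_ times the
- // support of the current nucleotide (plus one), after which the next k positions
- // are skipped to avoid overlapping corrections.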
- vector<pair<size_t, char>> FindMismatches(EdgeId edge, const mismatches::MismatchEdgeInfo &statistics) {
- vector<pair<size_t, char>> to_correct;
- const Sequence& s_edge = gp_.g.EdgeNucls(edge);
- for (size_t i = gp_.g.k(); i < gp_.g.length(edge); i++) {
- size_t cur_best = 0;
- mismatches::NuclCount nc = statistics[i];
- for(size_t j = 1; j < 4; j++) {
- if(nc[j] > nc[cur_best]) {
- cur_best = j;
- }
- }
- size_t nucl_code = s_edge[i];
- if ((double) nc[cur_best] > relative_threshold_ * (double) nc[nucl_code] + 1.) {
- to_correct.push_back(make_pair(i, cur_best));
- i += gp_.g.k();
- }
-
- }
- return to_correct;
- }
-
- size_t CorrectEdge(EdgeId edge, const mismatches::MismatchEdgeInfo &statistics) {
- vector<pair<size_t, char>> to_correct = FindMismatches(edge, statistics);
- EdgeId new_edge = CorrectNucls(edge, to_correct);
- if (new_edge == EdgeId(0))
- new_edge = edge;
-
- return to_correct.size();
- }
-
- size_t CorrectAllEdges(const mismatches::MismatchStatistics<typename Graph::EdgeId> &statistics) {
- size_t res = 0;
- set<EdgeId> conjugate_fix;
- for (auto it = gp_.g.ConstEdgeBegin(); !it.IsEnd(); ++it) {
- if (conjugate_fix.find(gp_.g.conjugate(*it)) == conjugate_fix.end()){
- conjugate_fix.insert(*it);
- }
- }
- for(auto it = conjugate_fix.begin(); it != conjugate_fix.end(); ++it) {
- DEBUG("processing edge" << gp_.g.int_id(*it));
-
- if (statistics.find(*it) != statistics.end()) {
- if (!gp_.g.RelatedVertices(gp_.g.EdgeStart(*it), gp_.g.EdgeEnd(*it)))
- res += CorrectEdge(*it, statistics.find(*it)->second);
- }
- }
- INFO("All edges processed");
- return res;
- }
-
- size_t StopMismatchIteration(io::ReadStream<read_type>& stream) {
- mismatches::MismatchStatistics<typename Graph::EdgeId> statistics(gp_);
- statistics.Count(stream, gp_);
- return CorrectAllEdges(statistics);
- }
-
- size_t ParallelStopMismatchIteration(io::ReadStreamList<read_type> &streams) {
- mismatches::MismatchStatistics<typename Graph::EdgeId> statistics(gp_);
- statistics.ParallelCount(streams, gp_);
- return CorrectAllEdges(statistics);
- }
-
- public:
- MismatchShallNotPass(graph_pack &gp, double relative_threshold = 1.5) : gp_(gp), relative_threshold_(relative_threshold) {
- VERIFY(relative_threshold >= 1);
- }
-
-
- size_t StopAllMismatches(io::ReadStream<read_type>& stream, size_t max_iterations = 1) {
- size_t res = 0;
- while(max_iterations > 0) {
- size_t last = StopMismatchIteration(stream);
- res += last;
- if(last == 0)
- break;
- max_iterations--;
- }
- return res;
- }
-
- size_t ParallelStopAllMismatches(io::ReadStreamList<read_type> &streams, size_t max_iterations = 1) {
- size_t res = 0;
- while(max_iterations > 0) {
- size_t last = ParallelStopMismatchIteration(streams);
- res += last;
- if(last == 0)
- break;
- max_iterations--;
- }
- return res;
- }
-};
-
-}
diff --git a/src/debruijn/moleculo.hpp b/src/debruijn/moleculo.hpp
deleted file mode 100644
index 4fa340f..0000000
--- a/src/debruijn/moleculo.hpp
+++ /dev/null
@@ -1,36 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-#include "debruijn_graph.hpp"
-#include "omni/basic_edge_conditions.hpp"
-
-namespace debruijn_graph {
-
-class ForbiddenPatternCondition : public EdgeCondition<Graph> {
- typedef EdgeCondition<Graph> base;
-
- typedef typename Graph::EdgeId EdgeId;
- typedef typename Graph::VertexId VertexId;
- Sequence pattern_;
- size_t max_offset_;
-public:
- ForbiddenPatternCondition(const Graph& g, Sequence pattern, size_t max_offset) : base(g), pattern_(pattern), max_offset_(max_offset) {
- }
-
- /*virtual*/ bool Check(EdgeId e) const {
- Sequence nucls = this->g().EdgeNucls(e);
- for(size_t i = 0; i < max_offset_ && i + pattern_.size() < nucls.size(); i++) {
- if(nucls.Subseq(i, i + pattern_.size()) == pattern_ || (!nucls).Subseq(i, i + pattern_.size()) == pattern_) {
- return false;
- }
- }
- return true;
- }
-
-};
-
-}
diff --git a/src/debruijn/overlap_analysis.hpp b/src/debruijn/overlap_analysis.hpp
deleted file mode 100644
index 5809b36..0000000
--- a/src/debruijn/overlap_analysis.hpp
+++ /dev/null
@@ -1,113 +0,0 @@
-#pragma once
-
-#include "logger/logger.hpp"
-#include "omni/range.hpp"
-#include "ssw/ssw_cpp.h"
-
-namespace debruijn_graph {
-using omnigraph::Range;
-
-struct OverlapInfo {
- Range r1;
- Range r2;
- size_t match_cnt;
-
- OverlapInfo(const Range& r1_, const Range& r2_, size_t match_cnt_)
- : r1(r1_),
- r2(r2_),
- match_cnt(match_cnt_) {
- VERIFY(match_cnt <= std::min(r1.size(), r2.size()));
- }
-
- OverlapInfo()
- : match_cnt(0) {
- }
-
- double identity() const {
- if (match_cnt == 0)
- return 0.;
- return (double)match_cnt / (double)size();
- }
-
- size_t size() const {
- return std::max(r1.size(), r2.size());
- }
-
- bool operator==(const OverlapInfo &that) const {
- return r1 == that.r1 && r2 == that.r2 && match_cnt == that.match_cnt;
- }
-
- bool operator!=(const OverlapInfo &that) const {
- return !(*this == that);
- }
-};
-
-std::ostream& operator<<(std::ostream& os, const OverlapInfo& info) {
- return os << "R1: [" << info.r1.start_pos << ", " << info.r1.end_pos
- << "]; R2: [" << info.r2.start_pos << ", " << info.r2.end_pos << "]"
- << "; match_cnt: " << info.match_cnt;
-}
-
-class SWOverlapAnalyzer {
- static const uint32_t CIGAR_FLAG_MASK = (1 << 4) - 1;
- static const uint32_t CIGAR_MATCH_FLAG = 7;
- typedef typename Graph::EdgeId EdgeId;
- size_t flank_length_;
-
- const StripedSmithWaterman::Aligner aligner_;
- const StripedSmithWaterman::Filter filter_;
-
- size_t CountMatches(std::vector<uint32_t> cigar) const {
- size_t match_cnt = 0;
- for (uint32_t entry : cigar) {
- if ((entry & CIGAR_FLAG_MASK) == CIGAR_MATCH_FLAG) {
- match_cnt += (entry >> 4);
- }
- }
- return match_cnt;
- }
-
- OverlapInfo InnerAnalyze(const Sequence& s1, const Sequence& s2) const {
- if (s1.size() == 0 || s2.size() == 0) {
- return OverlapInfo();
- }
- StripedSmithWaterman::Alignment alignment;
- if (aligner_.Align(s1.str().c_str(), s2.str().c_str(), int(s2.size()), filter_, &alignment)) {
- if (alignment.sw_score > 0) {
- return OverlapInfo(Range(alignment.query_begin, alignment.query_end + 1),
- Range(alignment.ref_begin, alignment.ref_end + 1),
- CountMatches(alignment.cigar));
- }
- }
- return OverlapInfo();
- }
-
-public:
- SWOverlapAnalyzer(size_t flank_length)
- : flank_length_(flank_length),
- aligner_(/*match_score*/2,
- /*mismatch_penalty*/6,
- /*gap_opening_penalty*/8,
- /*gap_extending_penalty*/8) {
- }
-
-
- OverlapInfo AnalyzeOverlap(const Sequence& s1, const Sequence& s2) const {
- size_t start1 = flank_length_ > s1.size() ? 0 : s1.size() - flank_length_;
- size_t end2 = flank_length_ > s2.size() ? s2.size() : flank_length_;
-
- OverlapInfo result = InnerAnalyze(s1.Subseq(start1, s1.size()), s2.Subseq(0, end2));
- if (result == OverlapInfo())
- return result;
-
- result.r1.shift(int(start1));
- return result;
- }
-
- template<class Graph>
- OverlapInfo AnalyzeOverlap(const Graph& g, EdgeId e1, EdgeId e2) const {
- return AnalyzeOverlap(g.EdgeNucls(e1), g.EdgeNucls(e2));
- }
-};
-
-}
diff --git a/src/debruijn/pacbio/pac_index.hpp b/src/debruijn/pacbio/pac_index.hpp
deleted file mode 100644
index 51e9d4f..0000000
--- a/src/debruijn/pacbio/pac_index.hpp
+++ /dev/null
@@ -1,833 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-/*
- * pac_index.hpp
- *
- * Created on: Jan 21, 2013
- * Author: lab42
- */
-#pragma once
-
-#include "indices/edge_multi_index.hpp"
-#include "indices/edge_index_builders.hpp"
-#include <algorithm>
-#include "pacbio_read_structures.hpp"
-
-namespace pacbio {
-#define UNDEF_COLOR -1
-#define DELETED_COLOR -2
-
-template<class Graph>
-struct MappingDescription {
-
-};
-
-template<class Graph>
-class PacBioMappingIndex {
-public:
- typedef map<typename Graph::EdgeId, vector<MappingInstance> > MappingDescription;
- typedef pair<typename Graph::EdgeId, vector<MappingInstance> > ClusterDescription;
- typedef set<KmerCluster<Graph> > ClustersSet;
- typedef typename Graph::VertexId VertexId;
- typedef typename Graph::EdgeId EdgeId;
- typedef debruijn_graph::DeBruijnEdgeMultiIndex<typename Graph::EdgeId> Index;
- typedef typename Index::KeyWithHash KeyWithHash;
-
-private:
- DECL_LOGGER("PacIndex")
-
- const Graph &g_;
- size_t pacbio_k;
- size_t debruijn_k;
- const static int short_edge_cutoff = 0;
- const static size_t min_cluster_size = 8;
- const static int max_similarity_distance = 500;
- int good_follow = 0;
- int half_bad_follow = 0;
- int bad_follow = 0;
-
- double compression_cutoff;
- double domination_cutoff;
- set<Sequence> banned_kmers;
- debruijn_graph::DeBruijnEdgeMultiIndex<typename Graph::EdgeId> tmp_index;
- map<pair<VertexId, VertexId>, vector<size_t> > distance_cashed;
- size_t read_count;
- bool ignore_map_to_middle;
-
-public:
- MappingDescription Locate(const Sequence &s) const;
-
- PacBioMappingIndex(const Graph &g, size_t k, size_t debruijn_k_, bool ignore_map_to_middle)
- : g_(g),
- pacbio_k(k),
- debruijn_k(debruijn_k_),
- tmp_index((unsigned) pacbio_k, cfg::get().output_dir), ignore_map_to_middle(ignore_map_to_middle) {
- DEBUG("PB Mapping Index construction started");
-
- typedef typename debruijn_graph::EdgeIndexHelper<debruijn_graph::DeBruijnEdgeMultiIndex<typename Graph::EdgeId>>::GraphPositionFillingIndexBuilderT Builder;
-
- Builder().BuildIndexFromGraph(tmp_index, g_);
- FillBannedKmers();
- compression_cutoff = cfg::get().pb.compression_cutoff; // 0.6
- domination_cutoff = cfg::get().pb.domination_cutoff; //1.5
- //INFO(tmp_index.size());
- read_count = 0;
- }
- ~PacBioMappingIndex(){
- DEBUG("good/ugly/bad counts:" << good_follow << " "<<half_bad_follow << " " << bad_follow);
-
- }
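- // Bans homopolymer k-mers and all their single-substitution variants; hits on these
- // low-complexity seeds are skipped in Locate().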
- void FillBannedKmers() {
- for (int i = 0; i < 4; i++) {
- auto base = nucl((unsigned char) i);
- for (int j = 0; j < 4; j++) {
- auto other = nucl((unsigned char) j);
- for (size_t other_pos = 0; other_pos < pacbio_k; other_pos++) {
- string s = "";
- for (size_t k = 0; k < pacbio_k; k++) {
- if (k != other_pos)
- s += base;
- else
- s += other;
- }
- banned_kmers.insert(Sequence(s));
- }
- }
- }
- }
-
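- // Two anchors are compatible (may belong to one cluster) when the difference of their
- // edge positions agrees with the difference of their read positions up to the
- // compression_cutoff stretch factor (or within 2 bp when the read positions coincide).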
- bool similar(const MappingInstance &a, const MappingInstance &b,
- int shift = 0) const {
- if (b.read_position + shift < a.read_position) {
- return similar(b, a, -shift);
- } else if (b.read_position == a.read_position) {
- return (abs(int(b.edge_position) + shift - int(a.edge_position)) < 2);
- } else {
- return ((b.edge_position + shift - a.edge_position >= (b.read_position - a.read_position) * compression_cutoff) &&
- ((b.edge_position + shift - a.edge_position) * compression_cutoff <= (b.read_position - a.read_position)));
- }
- }
-
- void dfs_cluster(vector<int> &used, vector<MappingInstance> &to_add,
- const int cur_ind,
- const typename MappingDescription::iterator iter) const {
- size_t len = iter->second.size();
- for (size_t k = 0; k < len; k++) {
- if (!used[k] && similar(iter->second[cur_ind], iter->second[k])) {
- to_add.push_back(iter->second[k]);
- used[k] = 1;
- dfs_cluster(used, to_add, (int) k, iter);
- }
- }
- }
-
- void dfs_cluster_norec(vector<int> &used, vector<MappingInstance> &to_add,
- const size_t cur_ind,
- const typename MappingDescription::iterator iter, vector<vector<size_t> > &similarity_list) const {
- std::deque<size_t> stack;
- stack.push_back(cur_ind);
- used[cur_ind] = 1;
- while (stack.size() > 0) {
- size_t k = stack.back();
- stack.pop_back();
- to_add.push_back(iter->second[k]);
-
- for (size_t i = 0; i < similarity_list[k].size(); i++) {
- if (!used[similarity_list[k][i]]) {
- stack.push_back(similarity_list[k][i]);
- used[similarity_list[k][i]] = 1;
- }
- }
- }
- }
-
- ClustersSet GetOrderClusters(const Sequence &s) const {
- MappingDescription descr = Locate(s);
- ClustersSet res;
- TRACE(read_count << " read_count");
-
- DEBUG(descr.size() <<" clusters");
- for (auto iter = descr.begin(); iter != descr.end(); ++iter) {
- size_t edge_id = g_.int_id(iter->first);
- DEBUG(edge_id);
- sort(iter->second.begin(), iter->second.end(), ReadPositionComparator());
- set<vector<MappingInstance> > edge_cluster_set;
- size_t len = iter->second.size();
- vector<vector<size_t> > similarity_list(len);
- int cnt = 0;
- for (size_t i = 0; i < len; i++){
- for (size_t j = i + 1; j < len; j++){
- if (iter->second[i].read_position + max_similarity_distance < iter->second[j].read_position) {
- break;
- }
- if (similar(iter->second[i], iter->second[j])) {
- similarity_list[i].push_back(j);
- cnt ++;
- if (cnt % 10000 == 0) {
- DEBUG(cnt);
- }
- }
- }
- }
-
- DEBUG(len <<" kmers in cluster");
- vector<int> used(len);
- for (size_t i = 0; i < len; i++) {
- if (!used[i]) {
- vector<size_t> new_cluster(len);
- vector<size_t> prev(len);
- for(size_t j = i; j < len; j++) {
- if (!used[j]) {
- if (new_cluster[j] == 0) new_cluster[j] = 1, prev[j] = size_t(-1);
- for(size_t k = 0; k < similarity_list[j].size(); k++) {
- size_t next_ind = similarity_list[j][k];
- if (!used[next_ind]) {
- if (new_cluster[next_ind] < new_cluster[j] + 1){
- new_cluster[next_ind] = new_cluster[j] + 1;
- prev[next_ind] = j;
- }
- }
- }
- }
- }
- size_t maxx = 0;
- size_t maxj = i;
- for(size_t j = i; j < len; j++) {
- if (new_cluster[j] > maxx) maxj = j, maxx = new_cluster[j];
- }
- vector<MappingInstance> to_add;
- size_t real_maxj = maxj, first_j = maxj;
- while (maxj != size_t(-1)) {
- to_add.push_back(iter->second[maxj]);
- first_j = maxj;
- maxj = prev[maxj];
- }
- for (auto j = first_j; j < real_maxj; j++)
- used[j] = 1;
- reverse(to_add.begin(), to_add.end());
- TRACE("adding cluster "" edge "<< edge_id << " len " <<to_add.size() )
- res.insert(KmerCluster<Graph>(iter->first, to_add));
- }
- }
- }
- FilterClusters(res);
- return res;
- }
- //filter clusters that are too small or fully located on a vertex or dominated by some other cluster.
- void FilterClusters(ClustersSet &clusters) const {
- for (auto i_iter = clusters.begin(); i_iter != clusters.end();) {
- size_t edge_id = g_.int_id(i_iter->edgeId);
-
- int len = (int) g_.length(i_iter->edgeId);
- auto sorted_by_edge = i_iter->sorted_positions;
- sort(sorted_by_edge.begin(), sorted_by_edge.end());
- double good = 0;
- DEBUG("filtering cluster of size " << sorted_by_edge.size());
- DEBUG(edge_id <<" : edgeId");
- for (auto iter = sorted_by_edge.begin();
- iter < sorted_by_edge.end(); iter++) {
- if (iter->IsUnique())
- good++;
- //good += 1.0 / (iter->quality * iter->quality);
- }
- DEBUG("good " << good);
-
- if (good < min_cluster_size || (len < short_edge_cutoff)) {
- if (len < short_edge_cutoff) {
- DEBUG("Life is too long, and edge is too short!");
- }
- auto tmp_iter = i_iter;
- tmp_iter++;
- clusters.erase(i_iter);
- i_iter = tmp_iter;
- } else {
- if (sorted_by_edge[0].edge_position >= len
- || sorted_by_edge[i_iter->size - 1].edge_position
- <= int(debruijn_k) - int(pacbio_k)) {
- DEBUG("All anchors in vertex");
- auto tmp_iter = i_iter;
- tmp_iter++;
- clusters.erase(i_iter);
- i_iter = tmp_iter;
- } else {
- i_iter++;
- }
- }
- }
- for (auto i_iter = clusters.begin(); i_iter != clusters.end();) {
- size_t edge_id = g_.int_id(i_iter->edgeId);
- auto sorted_by_edge = i_iter->sorted_positions;
-
- DEBUG("filtering with cluster edge, stage 2 "<< edge_id << " len " << sorted_by_edge.size() << " clusters still alive: "<< clusters.size());
- for (auto j_iter = clusters.begin(); j_iter != clusters.end();) {
- if (i_iter != j_iter) {
- if (dominates(*i_iter, *j_iter)) {
- TRACE("cluster is dominated");
- auto tmp_iter = j_iter;
- tmp_iter++;
- TRACE("cluster on edge " << g_.int_id(j_iter->edgeId));
- TRACE("erased - dominated");
- clusters.erase(j_iter);
- j_iter = tmp_iter;
- } else {
- j_iter++;
- }
- } else {
- j_iter++;
- }
- }
- DEBUG("cluster size "<< i_iter->sorted_positions.size() << "survived filtering");
- i_iter++;
- }
- }
-
- // is "non strictly dominates" required?
- inline bool dominates(const KmerCluster<Graph> &a,
- const KmerCluster<Graph> &b) const {
- size_t a_size = a.size;
- size_t b_size = b.size;
- if ((double) a_size < (double) b_size * domination_cutoff
- || a.sorted_positions[a.first_trustable_index].read_position
- > b.sorted_positions[b.first_trustable_index].read_position
- || a.sorted_positions[a.last_trustable_index].read_position
- < b.sorted_positions[b.last_trustable_index].read_position) {
- return false;
- } else {
- return true;
- }
- }
-
- vector<EdgeId> FillGapsInCluster(vector<pair<size_t, typename ClustersSet::iterator> > &cur_cluster,
- const Sequence &s) {
- vector<EdgeId> cur_sorted;
- EdgeId prev_edge = EdgeId(0);
-
- for (auto iter = cur_cluster.begin(); iter != cur_cluster.end();
- ++iter) {
- EdgeId cur_edge = iter->second->edgeId;
- if (prev_edge != EdgeId(0)) {
-//Need to find sequence of edges between clusters
- VertexId start_v = g_.EdgeEnd(prev_edge);
- VertexId end_v = g_.EdgeStart(cur_edge);
- auto prev_iter = iter - 1;
- MappingInstance cur_first_index =
- iter->second->sorted_positions[iter->second
- ->first_trustable_index];
- MappingInstance prev_last_index = prev_iter->second
- ->sorted_positions[prev_iter->second
- ->last_trustable_index];
-
- if (start_v != end_v ||
- (start_v == end_v &&
- (double) (cur_first_index.read_position - prev_last_index.read_position) >
- (double) (cur_first_index.edge_position + (int) g_.length(prev_edge) - prev_last_index.edge_position) * 1.3)) {
- DEBUG(" traversing tangled hregion between "<< g_.int_id(prev_edge)<< " " << g_.int_id(cur_edge));
- DEBUG(" first pair" << cur_first_index.str() << " edge_len" << g_.length(cur_edge));
- DEBUG(" last pair" << prev_last_index.str() << " edge_len" << g_.length(prev_edge));
- string s_add = "";
- string e_add = "";
- int seq_end = cur_first_index.read_position;
- int seq_start = prev_last_index.read_position;
- string tmp = g_.EdgeNucls(prev_edge).str();
- s_add = tmp.substr(prev_last_index.edge_position,
- g_.length(prev_edge) - prev_last_index.edge_position);
- tmp = g_.EdgeNucls(cur_edge).str();
- e_add = tmp.substr(0, cur_first_index.edge_position);
- pair<int, int> limits = GetPathLimits(*(prev_iter->second),
- *(iter->second),
- (int) s_add.length(),
- (int) e_add.length());
- if (limits.first == -1)
- return vector<EdgeId>(0);
-
- vector<EdgeId> intermediate_path = BestScoredPath(s, start_v, end_v, limits.first, limits.second, seq_start, seq_end, s_add, e_add);
- if (intermediate_path.size() == 0) {
- DEBUG("Tangled region between edgees "<< g_.int_id(prev_edge) << " " << g_.int_id(cur_edge) << " is not closed, additions from edges: " << int(g_.length(prev_edge)) - int(prev_last_index.edge_position) <<" " << int(cur_first_index.edge_position) - int(debruijn_k - pacbio_k ) << " and seq "<< - seq_start + seq_end);
- if (cfg::get().pb.additional_debug_info) {
- DEBUG(" escpected gap length: " << -int(g_.length(prev_edge)) + int(prev_last_index.edge_position) - int(cur_first_index.edge_position) + int(debruijn_k - pacbio_k ) - seq_start + seq_end);
- PathStorageCallback<Graph> callback(g_);
- ProcessPaths(g_, 0, 4000,
- start_v, end_v,
- callback);
- vector<vector<EdgeId> > paths = callback.paths();
- stringstream s_buf;
- for (auto p_iter = paths.begin();
- p_iter != paths.end(); p_iter++) {
- size_t tlen = 0;
- for (auto path_iter = p_iter->begin();
- path_iter != p_iter->end();
- path_iter++) {
- tlen += g_.length(*path_iter);
- }
- s_buf << tlen << " ";
- }
- DEBUG(s_buf.str());
- }
- return intermediate_path;
- }
- for (auto j_iter = intermediate_path.begin(); j_iter != intermediate_path.end(); j_iter++) {
- cur_sorted.push_back(*j_iter);
- }
- }
- }
- cur_sorted.push_back(cur_edge);
- prev_edge = cur_edge;
- }
- return cur_sorted;
- }
-
- bool TopologyGap(EdgeId first, EdgeId second, bool oriented) const {
- bool res = (g_.IsDeadStart(g_.EdgeStart(first)) && g_.IsDeadEnd(g_.EdgeEnd(second)));
- if (!oriented)
- res |= g_.IsDeadEnd(g_.EdgeEnd(first)) && g_.IsDeadStart(g_.EdgeStart(second));
- return res;
- }
-
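- // Partitions clusters into compatible chains ("colors"): a pairwise consistency table
- // is filled via IsConsistent(), then the heaviest chain (by total cluster size) is
- // repeatedly extracted with a simple DP and assigned a new color; clusters left
- // uncolored between the endpoints of an extracted chain are marked DELETED_COLOR.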
- vector<int> GetWeightedColors(ClustersSet &mapping_descr, Sequence &s) {
- int len = (int) mapping_descr.size();
- DEBUG("getting colors, table size "<< len);
- vector<vector<int> > cons_table(len);
-
- vector<int> colors(len);
- vector<int> cluster_size(len);
- vector<int> max_size(len);
- vector<int> prev(len);
-
- for (int i = 0; i < len; i++) {
- cons_table[i].resize(len);
- cons_table[i][i] = 0;
- prev[i] = -1;
- }
- int i = 0;
-
- for (int i = 0; i < len; i++) {
-//UNDEF_COLOR (-1) - not initialized, DELETED_COLOR (-2) - removed as trash
- colors[i] = UNDEF_COLOR;
- }
- for (auto i_iter = mapping_descr.begin(); i_iter != mapping_descr.end();
- ++i_iter, ++i) {
- cluster_size[i] = i_iter->size;
- }
- i = 0;
- if (len > 1) {
- TRACE(len << "clusters");
- }
-
- for (auto i_iter = mapping_descr.begin(); i_iter != mapping_descr.end();
- ++i_iter, ++i) {
- int j = i;
- for (auto j_iter = i_iter;
- j_iter != mapping_descr.end(); ++j_iter, ++j) {
- if (i_iter == j_iter)
- continue;
- cons_table[i][j] = IsConsistent(s, *i_iter, *j_iter);
- }
- }
- i = 0;
- int cur_color = 0;
-
- while (true) {
- for (i = 0; i < len; i++) {
- max_size[i] = 0;
- prev[i] = -1;
- }
- i = 0;
- for (auto i_iter = mapping_descr.begin(); i_iter != mapping_descr.end();
- ++i_iter, ++i) {
- if (colors[i] != UNDEF_COLOR) continue;
- max_size[i] = cluster_size[i];
- for (int j = 0; j < i; j ++) {
- if (colors[j] != -1) continue;
- if (cons_table[j][i] && max_size[i] < cluster_size[i] + max_size[j]) {
- max_size[i] = max_size[j] + cluster_size[i];
- prev[i] = j;
- }
- }
- }
- int maxx = 0;
- int maxi = -1;
- for (int j = 0; j < len; j++) {
- if (max_size[j] > maxx) {
- maxx = max_size[j];
- maxi = j;
- }
- }
- if (maxi == -1) {
- break;
- }
- colors[maxi] = cur_color;
- int real_maxi = maxi, min_i = maxi;
-
- while (prev[maxi] != -1) {
- min_i = maxi;
- maxi = prev[maxi];
- colors[maxi] = cur_color;
- }
- while (real_maxi >= min_i) {
- if (colors[real_maxi] == UNDEF_COLOR) {
- colors[real_maxi] = DELETED_COLOR;
- }
- real_maxi --;
- }
- cur_color ++;
-
- }
- return colors;
- }
-
-
-
-
- OneReadMapping<Graph> GetReadAlignment(Sequence &s) {
- ClustersSet mapping_descr = GetOrderClusters(s);
- DEBUG("clusters got");
- int len = (int) mapping_descr.size();
- vector<size_t> real_length;
-
- vector<int> colors = GetWeightedColors(mapping_descr, s);
- vector<vector<EdgeId> > sortedEdges;
- vector<typename ClustersSet::iterator> start_clusters, end_clusters;
- vector<GapDescription<Graph> > illumina_gaps;
- vector<int> used(len);
- size_t used_seed_count = 0;
- auto iter = mapping_descr.begin();
- for (int i = 0; i < len; i++, iter ++) {
- used[i] = 0;
- DEBUG(colors[i] <<" " << iter->str(g_));
- }
- for (int i = 0; i < len; i++) {
- if (!used[i]) {
- DEBUG("starting new subread");
- size_t cur_seed_count = 0;
- vector<pair<size_t, typename ClustersSet::iterator> > cur_cluster;
- used[i] = 1;
- int j = 0;
- int cur_color = colors[i];
- if (cur_color == DELETED_COLOR)
- continue;
- for (auto i_iter = mapping_descr.begin();
- i_iter != mapping_descr.end(); ++i_iter, ++j) {
- if (colors[j] == cur_color) {
- cur_cluster.push_back(
- make_pair(
- i_iter->average_read_position,
- i_iter));
- used[j] = 1;
- cur_seed_count += i_iter->sorted_positions.size();
- }
- }
- sort(cur_cluster.begin(), cur_cluster.end(),
- pair_iterator_less<typename ClustersSet::iterator>());
- VERIFY(cur_cluster.size() > 0);
- //if (cur_seed_count > used_seed_count)
- used_seed_count += cur_seed_count;
- auto cur_cluster_start = cur_cluster.begin();
- for (auto iter = cur_cluster.begin(); iter != cur_cluster.end();
- ++iter) {
- auto next_iter = iter + 1;
- if (next_iter == cur_cluster.end()
- || !IsConsistent(s, *(iter->second),
- *(next_iter->second))) {
- if (next_iter != cur_cluster.end()) {
- DEBUG("clusters splitted:");
- DEBUG("on "<< iter->second->str(g_));
- DEBUG("and " << next_iter->second->str(g_));
- }
- vector<pair<size_t, typename ClustersSet::iterator> > splitted_cluster(
- cur_cluster_start, next_iter);
- vector<EdgeId> cur_sorted = FillGapsInCluster(
- splitted_cluster, s);
- if (cur_sorted.size() > 0) {
- start_clusters.push_back(cur_cluster_start->second);
- end_clusters.push_back(iter->second);
- sortedEdges.push_back(cur_sorted);
- }
- cur_cluster_start = next_iter;
- } else {
- DEBUG("connected consequtive clusters:");
- DEBUG("on "<< iter->second->str(g_));
- DEBUG("and " << next_iter->second->str(g_));
-
- }
-
- }
- }
- }
- DEBUG("adding gaps between subreads");
- int alignments = int(sortedEdges.size());
- for (int i = 0; i < alignments; i++) {
- for (int j = 0; j < alignments; j++) {
- EdgeId before_gap = sortedEdges[j][sortedEdges[j].size() - 1];
- EdgeId after_gap = sortedEdges[i][0];
-//do not add "gap" for rc-jumping
- if (before_gap != after_gap
- && before_gap != g_.conjugate(after_gap)) {
- if (i != j && TopologyGap(before_gap, after_gap, true)) {
- if (start_clusters[j]->CanFollow(*end_clusters[i])) {
- illumina_gaps.push_back(
- GapDescription<Graph>(*end_clusters[i],
- *start_clusters[j], s,
- (int) pacbio_k));
- }
-
- }
- }
- }
- }
- return OneReadMapping<Graph>(sortedEdges, illumina_gaps, real_length, used_seed_count);
- }
-
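- // Converts the read-coordinate distance between two consecutive clusters into
- // admissible min/max graph path lengths (scaled by pb.path_limit_pressing and
- // pb.path_limit_stretching, minus the already known flanking sequence); returns
- // (-1, -1) when the read-coordinate gap is negative.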
- std::pair<int, int> GetPathLimits(const KmerCluster<Graph> &a,
- const KmerCluster<Graph> &b,
- int s_add_len, int e_add_len) {
- int start_pos = a.sorted_positions[a.last_trustable_index].read_position;
- int end_pos = b.sorted_positions[b.first_trustable_index].read_position;
- int seq_len = -start_pos + end_pos;
- //int new_seq_len =
-//TODO: something more reasonable
- int path_min_len = max(int(floor((seq_len - int(debruijn_k)) * cfg::get().pb.path_limit_pressing)), 0);
- int path_max_len = (int) ((double) (seq_len + (int) debruijn_k) * cfg::get().pb.path_limit_stretching);
- if (seq_len < 0) {
- DEBUG("suspicious negative seq_len " << start_pos << " " << end_pos << " " << path_min_len << " " << path_max_len);
- return std::make_pair(-1, -1);
- }
- path_min_len = max(path_min_len - int(s_add_len + e_add_len), 0);
- path_max_len = max(path_max_len - int(s_add_len + e_add_len), 0);
- return std::make_pair(path_min_len, path_max_len);
- }
-
-//0 - No, 1 - Yes
- int IsConsistent(Sequence &s, const KmerCluster<Graph> &a,
- const KmerCluster<Graph> &b) {
- EdgeId a_edge = a.edgeId;
- EdgeId b_edge = b.edgeId;
- size_t a_id = g_.int_id(a_edge);
- size_t b_id = g_.int_id(b_edge);
- DEBUG("clusters on " << a_id << " and " << b_id );
- if (abs(a.sorted_positions[a.last_trustable_index].read_position - b.sorted_positions[b.first_trustable_index].read_position) > 5000) {
- DEBUG("...to far5000");
- return 0;
- }
- VertexId start_v = g_.EdgeEnd(a_edge);
- size_t addition = g_.length(a_edge);
- VertexId end_v = g_.EdgeStart(b_edge);
- pair<VertexId, VertexId> vertex_pair = make_pair(start_v, end_v);
- vector<size_t> result;
- DEBUG("seq dist:" << s.size()/3);
- if (distance_cashed.find(vertex_pair) == distance_cashed.end()) {
- DistancesLengthsCallback<Graph> callback(g_);
- ProcessPaths(g_, 0, s.size() / 3, start_v,
- end_v, callback);
- result = callback.distances();
- distance_cashed[vertex_pair] = result;
- } else {
- DEBUG("taking from cashed");
- }
- DEBUG("addition: " << addition << " found " << result.size() << " lengths:" );
- for (size_t i = 0; i < result.size(); i++) {
- DEBUG(result[i]);
- }
- result = distance_cashed[vertex_pair];
- //TODO: Serious optimization possible
- for (size_t i = 0; i < result.size(); i++) {
- for (auto a_iter = a.sorted_positions.begin();
- a_iter != a.sorted_positions.end(); ++a_iter) {
- if (a_iter - a.sorted_positions.begin() > 500 && a.sorted_positions.end() - a_iter >500) continue;
- int cnt = 0;
- for (auto b_iter = b.sorted_positions.begin();
- b_iter != b.sorted_positions.end() && cnt <500; ++b_iter, cnt ++) {
- if (similar(*a_iter, *b_iter,
- (int) (result[i] + addition))) {
- return 1;
- }
- }
- cnt = 0;
- if (b.sorted_positions.size() > 500) {
- for (auto b_iter = b.sorted_positions.end() - 1;
- b_iter != b.sorted_positions.begin() && cnt < 500; --b_iter, cnt ++) {
- if (similar(*a_iter, *b_iter,
- (int) (result[i] + addition))) {
- return 1;
- }
- }
- }
- }
- }
- return 0;
-
- }
-
- string PathToString(const vector<EdgeId>& path) const {
- string res = "";
- for (auto iter = path.begin(); iter != path.end(); iter++) {
- size_t len = g_.length(*iter);
- string tmp = g_.EdgeNucls(*iter).First(len).str();
- res = res + tmp;
- }
- return res;
- }
-
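- // Enumerates all graph paths between start_v and end_v within the given length limits
- // and returns the one whose nucleotide string (padded with s_add/e_add) yields the
- // smallest StringDistance score against the corresponding read fragment.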
- vector<EdgeId> BestScoredPath(const Sequence &s, VertexId start_v, VertexId end_v,
- int path_min_length, int path_max_length,
- int start_pos, int end_pos, string &s_add,
- string &e_add) {
- DEBUG(" Traversing tangled region. Start and end vertices resp: " << g_.int_id(start_v) <<" " << g_.int_id(end_v));
- PathStorageCallback<Graph> callback(g_);
- ProcessPaths(g_,
- path_min_length, path_max_length,
- start_v, end_v,
- callback);
- vector<vector<EdgeId> > paths = callback.paths();
- DEBUG("taking subseq" << start_pos <<" "<< end_pos <<" " << s.size());
- int s_len = int(s.size());
- string seq_string = s.Subseq(start_pos, min(end_pos + 1, s_len)).str();
- size_t best_path_ind = paths.size();
- size_t best_score = 1000000000;
- DEBUG("need to find best scored path between "<<paths.size()<<" , seq_len " << seq_string.length());
- if (paths.size() == 0)
- return vector<EdgeId>(0);
- for (size_t i = 0; i < paths.size(); i++) {
- string cur_string = s_add + PathToString(paths[i]) + e_add;
- if (paths.size() > 1 && paths.size() < 10) {
- TRACE("candidate path number "<< i << " , len " << cur_string.length());
- TRACE("graph candidate: " << cur_string);
- TRACE("in pacbio read: " << seq_string);
- for (auto j_iter = paths[i].begin(); j_iter != paths[i].end();
- ++j_iter) {
- DEBUG(g_.int_id(*j_iter));
- }
- }
- size_t cur_score = StringDistance(cur_string, seq_string);
- if (paths.size() > 1 && paths.size() < 10) {
- DEBUG("score: "<< cur_score);
- }
- if (cur_score < best_score) {
- best_score = cur_score;
- best_path_ind = i;
- }
- }
- if (best_score == 1000000000)
- return vector<EdgeId>(0);
- if (paths.size() > 1 && paths.size() < 10) {
- DEBUG("best score found! Path " <<best_path_ind <<" score "<< best_score);
- }
- return paths[best_path_ind];
- }
-
- // Short read alignment
- MappingPath<EdgeId> GetShortReadAlignment(const Sequence &s) const {
- ClustersSet mapping_descr = GetOrderClusters(s);
- map<EdgeId, KmerCluster<Graph> > largest_clusters;
-
- //Selecting the biggest cluster for each edge
- for (auto iter = mapping_descr.begin(); iter != mapping_descr.end(); ++iter) {
-
- auto first_cluster = iter->sorted_positions[iter->first_trustable_index];
- auto last_cluster = iter->sorted_positions[iter->last_trustable_index];
- int read_range = last_cluster.read_position - first_cluster.read_position;
- int edge_range = last_cluster.edge_position - first_cluster.edge_position;
- int cluster_size = iter->last_trustable_index - iter->first_trustable_index;
- if (cluster_size > 2 * read_range || edge_range < 0 || 2 * edge_range < read_range || edge_range > 2 * read_range) {
- //skipping cluster
- continue;
- }
-
- auto edge_cluster = largest_clusters.find(iter->edgeId);
- if (edge_cluster != largest_clusters.end()) {
- if (edge_cluster->second.last_trustable_index - edge_cluster->second.first_trustable_index
- < iter->last_trustable_index - iter->first_trustable_index) {
-
- edge_cluster->second = *iter;
- }
- }
- else {
- largest_clusters.insert(make_pair(iter->edgeId, *iter));
- }
- }
-
- MappingPath<EdgeId> result;
- for (auto iter = largest_clusters.begin(); iter != largest_clusters.end(); ++iter) {
- auto first_cluster = iter->second.sorted_positions[iter->second.first_trustable_index];
- auto last_cluster = iter->second.sorted_positions[iter->second.last_trustable_index];
- MappingRange range(Range(first_cluster.read_position, last_cluster.read_position),
- Range(first_cluster.edge_position, last_cluster.edge_position));
- result.join(MappingPath<EdgeId>(vector<EdgeId>(1, iter->second.edgeId), vector<MappingRange>(1, range)));
- }
-
- return result;
- }
-
- pair<EdgeId, size_t> GetUniqueKmerPos(const runtime_k::RtSeq& kmer) const {
- KeyWithHash kwh = tmp_index.ConstructKWH(kmer);
-
- if (tmp_index.valid(kwh.key())) {
- auto keys = tmp_index.get(kwh);
- if (keys.size() == 1) {
- return make_pair(keys[0].edge_id, keys[0].offset);
- }
- }
- return make_pair(EdgeId(0), -1u);
- }
-
-
-};
-
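- // Slides a k-mer window over the read, looks each k-mer up in the edge multi-index and
- // records a MappingInstance per occurrence, skipping banned low-complexity k-mers and
- // positions falling into the vertex overlap; with ignore_map_to_middle set, only hits
- // near the edge ends are kept.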
-template<class Graph>
-typename PacBioMappingIndex<Graph>::MappingDescription PacBioMappingIndex<Graph>::Locate(const Sequence &s) const {
- MappingDescription res;
- //WARNING: read_count was removed from here to keep the methods const
- int local_read_count = 0;
- ++local_read_count;
- if (s.size() < pacbio_k)
- return res;
-
- //runtime_k::RtSeq kmer = s.start<runtime_k::RtSeq>(pacbio_k);
- KeyWithHash kwh = tmp_index.ConstructKWH(s.start<runtime_k::RtSeq>(pacbio_k));
-
- for (size_t j = pacbio_k; j < s.size(); ++j) {
- kwh = kwh << s[j];
- if (!tmp_index.valid(kwh.key())) {
-// INFO("not valid kmer");
- continue;
- }
- auto keys = tmp_index.get(kwh);
- TRACE("Valid key, size: "<< keys.size());
-
- for (auto iter = keys.begin(); iter != keys.end(); ++iter) {
-
- int quality = (int) keys.size();
- TRACE("and quality:" << quality);
- if (banned_kmers.find(Sequence(kwh.key())) != banned_kmers.end())
- continue;
- int offset = (int)iter->offset;
- int s_stretched = int ((double)s.size() * 1.2 + 50);
- int edge_len = int(g_.length(iter->edge_id));
- //No alignment in vertex, and further than s+eps bp from edge ends;
- bool correct_alignment = offset > int(debruijn_k - pacbio_k) && offset < edge_len;
- if (ignore_map_to_middle) {
- correct_alignment &= (offset < int(debruijn_k - pacbio_k) + s_stretched || offset > edge_len - s_stretched);
- }
- if (correct_alignment) {
- res[iter->edge_id].push_back(MappingInstance((int) iter->offset, (int) (j - pacbio_k + 1), quality));
- }
- }
- }
-
- for (auto iter = res.begin(); iter != res.end(); ++iter) {
- sort(iter->second.begin(), iter->second.end());
- DEBUG("read count "<< local_read_count);
- DEBUG("edge: " << g_.int_id(iter->first) << "size: " << iter->second.size());
- for (auto j_iter = iter->second.begin(); j_iter != iter->second.end(); j_iter++) {
- DEBUG(j_iter->str());
- }
- }
-
- return res;
-}
-
-}
diff --git a/src/debruijn/pacbio/pacbio_gap_closer.hpp b/src/debruijn/pacbio/pacbio_gap_closer.hpp
deleted file mode 100644
index 544962b..0000000
--- a/src/debruijn/pacbio/pacbio_gap_closer.hpp
+++ /dev/null
@@ -1,394 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include "pacbio_read_structures.hpp"
-
-#include "ConsensusCore/Poa/PoaConfig.hpp"
-#include "ConsensusCore/Poa/PoaConsensus.hpp"
-
-#include <algorithm>
-
-namespace pacbio {
-template<class Graph>
-class PacbioGapCloser;
-
-template<class Graph>
-class GapStorage {
- friend class PacbioGapCloser<Graph> ;
- typedef typename Graph::EdgeId EdgeId;
-private:
- DECL_LOGGER("PacbioGaps")
- ;
- Graph &g_;
- map<EdgeId, vector<GapDescription<Graph> > > inner_index;
- void HiddenAddGap(const GapDescription<Graph> &p) {
- inner_index[p.start].push_back(p);
- }
- vector<EdgeId> index;
- set<pair<EdgeId, EdgeId> > nonempty_pairs;
- set<pair<EdgeId, EdgeId> > transitively_ignored_pairs;
- set<pair<EdgeId, EdgeId> > symmetrically_ignored_pairs;
-
-public:
- size_t min_gap_quantity;
- GapStorage(Graph &g, size_t min_gap_quantity)
- : g_(g),
- inner_index(), min_gap_quantity(min_gap_quantity){
- }
-
- size_t FillIndex() {
- index.resize(0);
- set<EdgeId> tmp;
- for (auto iter = inner_index.begin(); iter != inner_index.end(); iter++) {
- index.push_back(iter->first);
- }
- return index.size();
- }
-
- EdgeId operator[](size_t i) {
- return index.at(i);
- }
-
- size_t size() const {
- return index.size();
- }
-
- bool IsTransitivelyIgnored(pair<EdgeId, EdgeId> p) {
- return (transitively_ignored_pairs.find(p) != transitively_ignored_pairs.end());
- }
- bool IsSymmetricallyIgnored(pair<EdgeId, EdgeId> p) {
- return (symmetrically_ignored_pairs.find(p) != symmetrically_ignored_pairs.end());
- }
-
- bool IsIgnored(pair<EdgeId, EdgeId> p) {
- return (IsTransitivelyIgnored(p) || IsSymmetricallyIgnored(p));
- }
- void AddGap(const GapDescription<Graph> &p, bool add_rc = false) {
- HiddenAddGap(p);
- if (add_rc) {
- TRACE("Adding conjugate");
- HiddenAddGap(p.conjugate(g_, (int) cfg::get().K));
- }
- }
-
- void AddStorage(const GapStorage<Graph> & to_add) {
- const auto& idx = to_add.inner_index;
- for (auto iter = idx.begin(); iter != idx.end(); ++iter)
- inner_index[iter->first].insert(inner_index[iter->first].end(), iter->second.begin(), iter->second.end());
- }
-
- void PostProcess() {
- FillIndex();
-
- for (auto j_iter = index.begin(); j_iter != index.end(); j_iter++) {
- EdgeId e = *j_iter;
- auto cl_start = inner_index[e].begin();
- auto iter = inner_index[e].begin();
- vector<GapDescription<Graph> > padded_gaps;
- while (iter != inner_index[e].end()) {
- auto next_iter = ++iter;
- if (next_iter == inner_index[e].end() || next_iter->end != cl_start->end) {
- size_t len = next_iter - cl_start;
- if (len >= min_gap_quantity) {
- nonempty_pairs.insert(make_pair(cl_start->start, cl_start->end));
- }
- cl_start = next_iter;
- }
- }
- }
-
- set<pair<EdgeId, EdgeId> > used_rc_pairs;
- for (auto iter = nonempty_pairs.begin(); iter != nonempty_pairs.end(); ++iter) {
- if (used_rc_pairs.find(*iter) != used_rc_pairs.end()) {
- DEBUG("skipping pair " << g_.int_id(iter->first) << "," << g_.int_id(iter->second));
- symmetrically_ignored_pairs.insert(make_pair(iter->first, iter->second));
- } else {
- DEBUG("Using pair" << g_.int_id(iter->first) << "," << g_.int_id(iter->second));
- }
-
- for (size_t i = 0; i < index.size(); i++) {
- if (nonempty_pairs.find(make_pair(iter->first, index[i])) != nonempty_pairs.end()
- && nonempty_pairs.find(make_pair(index[i], iter->second)) != nonempty_pairs.end()) {
- DEBUG("pair " << g_.int_id(iter->first) << "," << g_.int_id(iter->second) << " is ignored because of edge between " << g_.int_id(index[i]));
- transitively_ignored_pairs.insert(make_pair(iter->first, iter->second));
- }
- }
- used_rc_pairs.insert(make_pair(g_.conjugate(iter->second), g_.conjugate(iter->first)));
- }
- }
-
- void DumpToFile(const string filename) {
- ofstream filestr(filename);
- for (auto iter = inner_index.begin(); iter != inner_index.end(); ++iter) {
- DEBUG( g_.int_id(iter->first)<< " " <<iter->second.size());
- filestr << g_.int_id(iter->first) << " " << iter->second.size() << endl;
- sort(iter->second.begin(), iter->second.end());
- for (auto j_iter = iter->second.begin(); j_iter != iter->second.end(); ++j_iter) {
- filestr << j_iter->str(g_);
- }
- filestr << endl;
- }
- }
-
- void LoadFromFile(const string s) {
- FILE* file = fopen((s).c_str(), "r");
- int res;
- char ss[5000];
- map<int, EdgeId> tmp_map;
- for (auto iter = g_.SmartEdgeBegin(); !iter.IsEnd(); ++iter) {
- tmp_map[g_.int_id(*iter)] = *iter;
- }
- while (!feof(file)) {
- int first_id, second_id, first_ind, second_ind;
- int size;
- res = fscanf(file, "%d %d\n", &first_id, &size);
- VERIFY(res == 2);
- for (int i = 0; i < size; i++) {
- res = fscanf(file, "%d %d\n", &first_id, &first_ind);
- VERIFY(res == 2);
- res = fscanf(file, "%d %d\n", &second_id, &second_ind);
- VERIFY(res == 2);
- res = fscanf(file, "%s\n", ss);
- VERIFY(res == 1);
- GapDescription<Graph> gap(tmp_map[first_id], tmp_map[second_id], Sequence(ss), first_ind, second_ind);
- this->AddGap(gap);
- }
- }
- }
-
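- // For each group of gaps sharing the same (start, end) edge pair, pads every gap
- // sequence with edge flanks so that all variants begin and end at common coordinates
- // (start_min / end_max); alignments reaching deep into the edge middle are dropped,
- // and overly long variants are excluded when enough short ones exist.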
- void PadGapStrings(EdgeId e) {
- sort(inner_index[e].begin(), inner_index[e].end());
- auto cl_start = inner_index[e].begin();
- auto iter = inner_index[e].begin();
- vector<GapDescription<Graph> > padded_gaps;
- while (iter != inner_index[e].end()) {
- auto next_iter = ++iter;
- if (next_iter == inner_index[e].end() || next_iter->end != cl_start->end) {
- int start_min = 1000000000;
- int end_max = 0;
- size_t long_seqs = 0;
- size_t short_seqs = 0;
- size_t long_seq_limit = cfg::get().pb.long_seq_limit; //400
- bool exclude_long_seqs = false;
- for (auto j_iter = cl_start; j_iter != next_iter; j_iter++) {
- if (g_.length(j_iter->start) - j_iter->edge_gap_start_position > 500 || j_iter->edge_gap_end_position > 500) {
- DEBUG("ignoring alingment to the middle of edge");
- continue;
- }
- if (j_iter->gap_seq.size() > long_seq_limit)
- long_seqs++;
- else
- short_seqs++;
-
- if (j_iter->edge_gap_start_position < start_min)
- start_min = j_iter->edge_gap_start_position;
- if (j_iter->edge_gap_end_position > end_max)
- end_max = j_iter->edge_gap_end_position;
- }
-
- if (short_seqs >= min_gap_quantity && short_seqs > long_seqs)
- exclude_long_seqs = true;
-
- for (auto j_iter = cl_start; j_iter != next_iter; j_iter++) {
- if (g_.length(j_iter->start) - j_iter->edge_gap_start_position > 500 || j_iter->edge_gap_end_position > 500)
- continue;
-
- if (exclude_long_seqs && j_iter->gap_seq.size() > long_seq_limit)
- continue;
-
- string s = g_.EdgeNucls(j_iter->start).Subseq(start_min, j_iter->edge_gap_start_position).str();
- s += j_iter->gap_seq.str();
- s += g_.EdgeNucls(j_iter->end).Subseq(j_iter->edge_gap_end_position, end_max).str();
- padded_gaps.push_back(GapDescription<Graph>(j_iter->start, j_iter->end, Sequence(s), start_min, end_max));
- }
- cl_start = next_iter;
- }
- }
- inner_index[e] = padded_gaps;
- }
-
- void PadGapStrings() {
- for (auto iter = inner_index.begin(); iter != inner_index.end(); ++iter) {
- DEBUG("Padding gaps for first edge " << g_.int_id(iter->first));
- PadGapStrings(iter->first);
- }
- PostProcess();
- }
-};
-
-template<class Graph>
-class PacbioGapCloser {
- typedef typename Graph::EdgeId EdgeId;
- typedef runtime_k::RtSeq Kmer;
- typedef vector<map<Kmer, int> > KmerStorage;
-private:
- DECL_LOGGER("PacbioGaps")
- ;
- Graph &g_;
- //first edge, second edge, weight, seq
- map<EdgeId, map<EdgeId, pair<size_t, string> > > new_edges_;
- int closed_gaps;
- int not_unique_gaps;
- int chained_gaps;
- bool consensus_gap_closing;
-public:
- void CloseGapsInGraph(map<EdgeId, EdgeId> &replacement) {
- for (auto iter = new_edges_.begin(); iter != new_edges_.end(); ++iter) {
- if (iter->second.size() != 1) {
- DEBUG("non-unique gap!!");
- not_unique_gaps ++;
- continue;
- }
- EdgeId first = iter->first;
- EdgeId second = (iter->second.begin()->first);
- if (replacement.find(first) != replacement.end() || replacement.find(second) != replacement.end()) {
- DEBUG("sorry, gap chains are not supported yet");
- chained_gaps++;
- continue;
- }
-
- EdgeId first_conj = g_.conjugate(first);
- EdgeId second_conj = g_.conjugate(second);
- size_t first_id = g_.int_id(first);
- size_t second_id = g_.int_id(second);
- size_t first_id_conj = g_.int_id(g_.conjugate(first));
- size_t second_id_conj = g_.int_id(g_.conjugate(second));
- DEBUG("closing gaps between "<< first_id << " " << second_id);
- size_t len_f = g_.length(first);
- size_t len_s = g_.length(second);
- size_t len_sum = iter->second.begin()->second.second.length();
- double cov = (double)g_.length(first) * g_.coverage(first) + (double)g_.length(second) * g_.coverage(second);
-
- DEBUG("coverage was " << g_.coverage(first) << " " << g_.coverage(second));
-
- EdgeId newEdge = g_.AddEdge(g_.EdgeStart(first), g_.EdgeEnd(second), Sequence(iter->second.begin()->second.second));
- if (cov > UINT_MAX * 0.75 ) cov = UINT_MAX*0.75;
- cov /= (double) g_.length(newEdge);
- TRACE(g_.int_id(newEdge));
- int len_split = int(((double) len_f * (double) len_sum) / ((double)len_s + (double)len_f));
- if (len_split == 0) {
- DEBUG(" zero split length, length are:" << len_f <<" " << len_sum <<" " << len_s);
- len_split = 1;
- }
- g_.DeleteEdge(first);
- g_.DeleteEdge(second);
- g_.coverage_index().SetAvgCoverage(newEdge, cov);
- g_.coverage_index().SetAvgCoverage(g_.conjugate(newEdge), cov);
- size_t next_id = g_.int_id(newEdge);
- DEBUG("and new coverage is " << g_.coverage(newEdge));
- closed_gaps ++;
- size_t next_id_conj = g_.int_id(g_.conjugate(newEdge));
- TRACE(first_id << " " << second_id << " " << next_id << " " << first_id_conj << " " << second_id_conj << " " << next_id_conj << " ");
- replacement[first] = newEdge;
- replacement[second] = newEdge;
- replacement[first_conj] = g_.conjugate(newEdge);
- replacement[second_conj] = g_.conjugate(newEdge);
- }
- INFO("Closed " << closed_gaps << " gaps");
- INFO("Total " << not_unique_gaps << " were not closed due to more than one possible pairing");
- INFO("Total " << chained_gaps << " were skipped because of gap chains");
- //TODO: chains of gaps!
- }
-private:
-
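- // Groups the stored gaps by their (start, end) edge pair; for each sufficiently
- // supported, non-ignored pair either runs a POA consensus (ConsensusCore) over the
- // collected gap sequences or, when consensus closing is disabled, takes the first
- // variant, and records the flank-padded result in new_edges.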
- void ConstructConsensus(EdgeId e, GapStorage<Graph> &storage, map<EdgeId, map<EdgeId, pair<size_t, string> > > & new_edges) {
- auto cl_start = storage.inner_index[e].begin();
- auto iter = storage.inner_index[e].begin();
- size_t cur_len = 0;
- while (iter != storage.inner_index[e].end()) {
- auto next_iter = ++iter;
- cur_len++;
- if (next_iter == storage.inner_index[e].end() || next_iter->end != cl_start->end) {
- if (cur_len >= storage.min_gap_quantity && !storage.IsIgnored(make_pair(cl_start->start, cl_start->end))) {
- vector<string> gap_variants;
-
- for (auto j_iter = cl_start; j_iter != next_iter; j_iter++) {
- string s = j_iter->gap_seq.str();
- transform(s.begin(), s.end(), s.begin(), ::toupper);
- gap_variants.push_back(s);
- }
- if (consensus_gap_closing || (gap_variants.size() > 0 && gap_variants[0].length() < cfg::get().pb.max_contigs_gap_length)) {
- map <EdgeId, pair<size_t, string>> tmp;
- string tmp_string;
- string s = g_.EdgeNucls(cl_start->start).Subseq(0, cl_start->edge_gap_start_position).str();
- if (consensus_gap_closing) {
- const ConsensusCore::PoaConsensus *pc = ConsensusCore::PoaConsensus::FindConsensus(
- gap_variants,
- ConsensusCore::PoaConfig::GLOBAL_ALIGNMENT);
- tmp_string = pc->Sequence();
- } else {
- tmp_string = gap_variants[0];
- if (gap_variants.size() > 1) {
-
- stringstream ss;
- for (size_t i = 0; i < gap_variants.size(); i++)
- ss << gap_variants[i].length() << " ";
- INFO(gap_variants.size() << " gap closing variants for contigs, lengths: " << ss.str());
- }
- }
-
- DEBUG("consenus for " << g_.int_id(cl_start->start) << " and " << g_.int_id(cl_start->end) <<
- "found: ");
- DEBUG(tmp_string);
- s += tmp_string;
- s += g_.EdgeNucls(cl_start->end).Subseq(cl_start->edge_gap_end_position,
- g_.length(cl_start->end) + g_.k()).str();
- tmp.insert(make_pair(cl_start->end, make_pair(cur_len, s)));
- new_edges[cl_start->start] = tmp;
- } else {
- INFO ("Skipping gap of size " << gap_variants[0].length() << " multiplicity " << gap_variants.size());
- }
- }
- cl_start = next_iter;
- cur_len = 0;
- }
- }
- }
-
-public:
- PacbioGapCloser(Graph &g, bool consensus_gap )
- : g_(g), consensus_gap_closing(consensus_gap) {
- closed_gaps = 0;
- not_unique_gaps = 0;
- chained_gaps = 0;
- }
-
- void ConstructConsensus(size_t nthreads, GapStorage<Graph> &storage) {
- vector<map<EdgeId, map<EdgeId, pair<size_t, string> > > > new_edges_by_thread;
- new_edges_by_thread.resize(nthreads);
- size_t storage_size = storage.size();
-# pragma omp parallel for shared(storage, new_edges_by_thread) num_threads(nthreads)
- for (size_t i = 0; i < storage_size; i++) {
- EdgeId e = storage[i];
- size_t thread_num = omp_get_thread_num();
- DEBUG("constructing consenus for first edge " << g_.int_id(e) << " in thread " <<thread_num);
- ConstructConsensus(e, storage, new_edges_by_thread[thread_num]);
- }
- for (size_t i = 0; i < nthreads; i++) {
- for (auto iter = new_edges_by_thread[i].begin(); iter != new_edges_by_thread[i].end(); ++iter) {
- new_edges_.insert(*iter);
- }
- }
- }
- void DumpToFile(const string filename) {
- ofstream filestr(filename);
- for (auto iter = new_edges_.begin(); iter != new_edges_.end(); ++iter) {
- if (iter->second.size() > 1) {
- DEBUG("nontrivial gap closing for edge" <<g_.int_id(iter->first));
- }
- for (auto j_iter = iter->second.begin(); j_iter != iter->second.end(); ++j_iter) {
- filestr << ">" << g_.int_id(iter->first) << "_" << iter->second.size() << "_" << g_.int_id(j_iter->first) << "_" << j_iter->second.first << endl;
- filestr << j_iter->second.second << endl;
- }
- }
- }
-
-};
-
-}
diff --git a/src/debruijn/pacbio/pacbio_read_structures.hpp b/src/debruijn/pacbio/pacbio_read_structures.hpp
deleted file mode 100644
index 8c4d4c5..0000000
--- a/src/debruijn/pacbio/pacbio_read_structures.hpp
+++ /dev/null
@@ -1,326 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-/*
- * pac_index.hpp
- *
- * Created on: Jan 21, 2013
- * Author: lab42
- */
-#pragma once
-
-#include "indices/perfect_hash_map.hpp"
-#include "graph_pack.hpp"
-#include <algorithm>
-using std::map;
-using std::set;
-namespace pacbio {
-template<class T>
-struct pair_iterator_less {
- bool operator ()(pair<size_t, T> const& a, pair<size_t, T> const& b) const {
- return (a.first < b.first);
- }
-};
-
-struct MappingInstance {
- int edge_position;
- int read_position;
- //Currently quality equals the k-mer multiplicity, so the best quality is 1.
- int quality;
- MappingInstance(int edge_position, int read_position, int quality) :
- edge_position(edge_position), read_position(read_position), quality(quality) {
- }
-
- inline bool IsUnique() const {
- return (quality == 1);
- }
-
- string str() {
- stringstream s;
- s << "E: " << edge_position << " R: " << read_position << " Q: " << quality;
- return s.str();
- }
-
-//Less by EDGE position
- bool operator <(MappingInstance const& b) const {
- if (edge_position < b.edge_position || (edge_position == b.edge_position && read_position < b.read_position))
- return true;
- else
- return false;
- }
-private:
- DECL_LOGGER("MappingInstance")
- ;
-};
-
-//Less by READ position
-struct ReadPositionComparator {
- bool operator ()(MappingInstance const& a, MappingInstance const& b) const {
- return (a.read_position < b.read_position || (a.read_position == b.read_position && a.edge_position < b.edge_position));
- }
-};
-
-template<class Graph>
-struct KmerCluster {
- typedef typename Graph::EdgeId EdgeId;
- int last_trustable_index;
- int first_trustable_index;
- size_t average_read_position;
- size_t average_edge_position;
- EdgeId edgeId;
- vector<MappingInstance> sorted_positions;
- int size;
-
- KmerCluster(EdgeId e, const vector<MappingInstance>& v) {
- last_trustable_index = 0;
- first_trustable_index = 0;
- average_read_position = 0;
- edgeId = e;
- size = (int) v.size();
- sorted_positions = v;
- FillTrustableIndeces();
- }
-
- bool operator <(const KmerCluster & b) const {
- return (average_read_position < b.average_read_position ||(average_read_position == b.average_read_position && edgeId < b.edgeId) ||
- (average_read_position == b.average_read_position && edgeId == b.edgeId && sorted_positions < b.sorted_positions));
- }
-
- bool CanFollow(const KmerCluster &b) const {
- return (b.sorted_positions[b.last_trustable_index].read_position < sorted_positions[first_trustable_index].read_position);
- }
-
- void FillTrustableIndeces() {
- //ignore non-unique kmers for distance determination
- int first_unique_ind = 0;
- while (first_unique_ind != size - 1 && !(sorted_positions[first_unique_ind].IsUnique())) {
- first_unique_ind += 1;
- }
- int last_unique_ind = size - 1;
- while (last_unique_ind != 0 && !(sorted_positions[last_unique_ind].IsUnique())) {
- last_unique_ind -= 1;
- }
- last_trustable_index = last_unique_ind;
- first_trustable_index = first_unique_ind;
- double tmp_read_position = 0, tmp_edge_position = 0;
- vector<int> diffs;
- for (auto mp : sorted_positions) {
- tmp_read_position += mp.read_position;
- tmp_edge_position += mp.edge_position;
- diffs.push_back(mp.read_position - mp.edge_position);
- }
- sort(diffs.begin(), diffs.end());
- int median_diff = diffs[size/2];
-
- tmp_read_position /= size;
- tmp_edge_position /= size;
- average_read_position = (size_t)trunc(tmp_read_position);
- average_edge_position = (size_t)trunc(tmp_edge_position);
-
- if (size > 10) {
- int max_debug_size = 10;
- vector<int> distances(max_debug_size);
- for (int df: diffs) {
- int ind = abs(df - median_diff)/ 50;
- if (ind > max_debug_size - 1) ind = max_debug_size - 1;
- distances [ind] ++;
- }
- if (size > 100 || distances[0] * 5 < size * 4) {
- stringstream s;
-
- for (int d: distances) {
- s << d << " ";
- }
-// INFO(s.str());
-
- }
- }
- }
-
- string str(const Graph &g) const{
- stringstream s;
- s << "Edge: " << g.int_id(edgeId) << " on edge: " << sorted_positions[first_trustable_index].edge_position<< " - " << sorted_positions[last_trustable_index].edge_position<< ";on read: " << sorted_positions[first_trustable_index].read_position<< " - " << sorted_positions[last_trustable_index].read_position<< ";size "<< size;
- return s.str();
- }
-private:
- DECL_LOGGER("KmerCluster")
- ;
-};
-
-template<class Graph>
-struct GapDescription {
- typedef typename Graph::EdgeId EdgeId;
- typename Graph::EdgeId start, end;
- Sequence gap_seq;
- int edge_gap_start_position, edge_gap_end_position;
-
-
- GapDescription(EdgeId start_e, EdgeId end_e, const Sequence &gap, int gap_start, int gap_end) :
- start(start_e), end(end_e), gap_seq(gap.str()), edge_gap_start_position(gap_start), edge_gap_end_position(gap_end) {
- }
-
- GapDescription(const KmerCluster<Graph> &a, const KmerCluster<Graph> & b, Sequence read, int pacbio_k) {
- edge_gap_start_position = a.sorted_positions[a.last_trustable_index].edge_position;
- edge_gap_end_position = b.sorted_positions[b.first_trustable_index].edge_position + pacbio_k - 1;
- start = a.edgeId;
- end = b.edgeId;
- DEBUG(read.str());
- gap_seq = read.Subseq(a.sorted_positions[a.last_trustable_index].read_position, b.sorted_positions[b.first_trustable_index].read_position + pacbio_k - 1);
- DEBUG(gap_seq.str());
- DEBUG("gap added");
- }
-
- GapDescription<Graph> conjugate(Graph &g_, int shift) const {
- GapDescription<Graph> res(
- g_.conjugate(end), g_.conjugate(start), (!gap_seq),
- (int) g_.length(end) + shift - edge_gap_end_position,
- (int) g_.length(start) + shift - edge_gap_start_position);
- DEBUG("conjugate created" << res.str(g_));
- return res;
- }
-
- string str(Graph &g_) const {
- stringstream s;
- s << g_.int_id(start) << " " << edge_gap_start_position <<endl << g_.int_id(end) << " " << edge_gap_end_position << endl << gap_seq.str()<< endl;
- return s.str();
- }
-
- bool operator <(const GapDescription & b) const {
- return (start < b.start || (start == b.start && end < b.end) ||
- (start == b.start && end == b.end && edge_gap_start_position < b.edge_gap_start_position));
- }
-
-private:
- DECL_LOGGER("PacIndex")
- ;
-};
-
-template<class Graph>
-struct OneReadMapping {
- typedef typename Graph::EdgeId EdgeId;
- vector<vector<EdgeId> > main_storage;
- vector<GapDescription<Graph> > gaps;
- vector<size_t> real_length;
-//Total number of used seeds, summed over all subreads.
- size_t seed_num;
- OneReadMapping(vector<vector<EdgeId> > &paths_description, vector<GapDescription<Graph> > &gaps_description, vector<size_t> real_length, size_t seed_num) :
- main_storage(paths_description), gaps(gaps_description), real_length(real_length), seed_num(seed_num) {
- }
-
-};
-
-
-struct StatsCounter{
-
- map<size_t,size_t> path_len_in_edges;
- vector<size_t> subreads_length;
- size_t total_len ;
- size_t reads_with_conjugate;
- size_t subreads_count;
- map<size_t, size_t> seeds_percentage;
- StatsCounter() {
- total_len = 0;
- reads_with_conjugate = 0;
- }
-
- void AddStorage(StatsCounter &other) {
- total_len += other.total_len;
- reads_with_conjugate += other.reads_with_conjugate;
- for (auto iter = other.subreads_length.begin(); iter != other.subreads_length.end(); ++iter) {
- subreads_length.push_back(*iter);
- }
-
- for (auto iter = other.path_len_in_edges.begin(); iter != other.path_len_in_edges.end(); ++iter){
- auto j_iter = iter;
- if (( j_iter = path_len_in_edges.find(iter->first)) == path_len_in_edges.end()){
- path_len_in_edges.insert(make_pair(iter->first, iter->second));
- } else {
- path_len_in_edges[j_iter->first] += iter->second;
- }
- }
- for (auto iter = other.seeds_percentage.begin(); iter != other.seeds_percentage.end(); ++iter){
- auto j_iter = iter;
- if (( j_iter = seeds_percentage.find(iter->first)) == seeds_percentage.end()){
- seeds_percentage.insert(make_pair(iter->first, iter->second));
- } else {
- seeds_percentage[j_iter->first] += iter->second;
- }
- }
- }
-
- void report(){
- size_t total = 0;
- for (auto iter = seeds_percentage.begin(); iter != seeds_percentage.end(); ++iter){
- total += iter->second;
- }
- size_t cur = 0;
- size_t percentage = 0;
- for (auto iter = seeds_percentage.begin(); iter != seeds_percentage.end(); ++iter){
- cur += iter->second;
- percentage = iter->first;
- if (cur * 2 > total) break;
- }
- INFO("Median fraction of present seeds in maximal alignmnent among reads aligned to the graph: " << double(percentage) * 0.001);
- }
-private:
- DECL_LOGGER("StatsCounter");
-
-};
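
The report() above takes a weighted median over the seeds_percentage histogram: buckets are visited in increasing key order and the walk stops once the cumulative count exceeds half of the total. A minimal standalone restatement of that loop, using only the standard library (the helper name is illustrative, not part of SPAdes):

#include <cstdio>
#include <map>

// Weighted median of a histogram: smallest key whose cumulative
// count exceeds half of the total count (same loop as report() above).
size_t WeightedMedian(const std::map<size_t, size_t>& hist) {
    size_t total = 0;
    for (const auto& kv : hist) total += kv.second;
    size_t cur = 0, median = 0;
    for (const auto& kv : hist) {
        cur += kv.second;
        median = kv.first;
        if (cur * 2 > total) break;
    }
    return median;
}

int main() {
    // Buckets are seed fractions scaled by 1000, as in the report above.
    std::map<size_t, size_t> seeds_percentage = {{250, 10}, {500, 30}, {750, 15}};
    std::printf("median fraction: %.3f\n", (double) WeightedMedian(seeds_percentage) / 1000.0);
    return 0;
}
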
-
-inline int StringDistance(string &a, string &b) {
- int a_len = (int) a.length();
- int b_len = (int) b.length();
- int d = min(a_len / 3, b_len / 3);
- d = max(d, 10);
- DEBUG(a_len << " " << b_len << " " << d);
- vector<vector<int> > table(a_len);
- //int d =
- for (int i = 0; i < a_len; i++) {
- table[i].resize(b_len);
- int low = max(max(0, i - d - 1), i + b_len - a_len - d - 1);
- int high = min(min(b_len, i + d + 1), i + a_len - b_len + d + 1);
- TRACE(low << " " <<high);
- for (int j = low; j < high; j++)
- table[i][j] = 1000000;
- }
- table[a_len - 1][b_len - 1] = 1000000;
- table[0][0] = 0;
-//free deletions on begin
-// for(int j = 0; j < b_len; j++)
-// table[0][j] = 0;
-
- for (int i = 0; i < a_len; i++) {
- int low = max(max(0, i - d), i + b_len - a_len - d);
- int high = min(min(b_len, i + d), i + a_len - b_len + d);
-
- TRACE(low << " " <<high);
- for (int j = low; j < high; j++) {
-
- if (i > 0)
- table[i][j] = min(table[i][j], table[i - 1][j] + 1);
- if (j > 0)
- table[i][j] = min(table[i][j], table[i][j - 1] + 1);
- if (i > 0 && j > 0) {
- int add = 1;
- if (a[i] == b[j])
- add = 0;
- table[i][j] = min(table[i][j], table[i - 1][j - 1] + add);
- }
- }
- }
- //return table[a_len - 1][b_len - 1];
-//free deletions on end
- int res = table[a_len - 1][b_len - 1];
- DEBUG(res);
-// for(int j = 0; j < b_len; j++){
-// res = min(table[a_len - 1][j], res);
-// }
- return res;
-}
-
-
-}
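
StringDistance above is a band-limited edit distance: only cells within roughly d of the diagonal are initialized and relaxed, which bounds the work by the band width rather than by the full |a|*|b| table. A self-contained sketch of the same idea, assuming a fixed band and standard-library types only (BandedEditDistance is an illustrative name, not part of SPAdes):

#include <algorithm>
#include <iostream>
#include <string>
#include <vector>

// Band-limited Levenshtein distance: only cells within +/- band of the
// diagonal are relaxed; everything else stays "infinitely" costly.
int BandedEditDistance(const std::string& a, const std::string& b, int band) {
    const int INF = 1000000;
    int n = (int) a.size(), m = (int) b.size();
    std::vector<std::vector<int>> dp(n + 1, std::vector<int>(m + 1, INF));
    dp[0][0] = 0;
    for (int i = 0; i <= n; ++i) {
        int lo = std::max(0, i - band), hi = std::min(m, i + band);
        for (int j = lo; j <= hi; ++j) {
            if (i > 0 && dp[i - 1][j] + 1 < dp[i][j]) dp[i][j] = dp[i - 1][j] + 1;   // deletion
            if (j > 0 && dp[i][j - 1] + 1 < dp[i][j]) dp[i][j] = dp[i][j - 1] + 1;   // insertion
            if (i > 0 && j > 0) {
                int sub = dp[i - 1][j - 1] + (a[i - 1] == b[j - 1] ? 0 : 1);          // match/mismatch
                if (sub < dp[i][j]) dp[i][j] = sub;
            }
        }
    }
    // If the length difference exceeds the band, no alignment fits and INF is returned.
    return dp[n][m];
}

int main() {
    std::cout << BandedEditDistance("ACGTACGT", "ACGTTCGT", 10) << "\n";  // prints 1
    return 0;
}
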
diff --git a/src/debruijn/pacbio_aligning.cpp b/src/debruijn/pacbio_aligning.cpp
deleted file mode 100644
index 5b78881..0000000
--- a/src/debruijn/pacbio_aligning.cpp
+++ /dev/null
@@ -1,186 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#include "standard.hpp"
-#include "pacbio/pac_index.hpp"
-#include "pacbio/pacbio_gap_closer.hpp"
-#include "long_read_storage.hpp"
-#include "io/wrapper_collection.hpp"
-#include "stats/debruijn_stats.hpp"
-#include "pacbio_aligning.hpp"
-
-namespace debruijn_graph {
-
-void ProcessReadsBatch(conj_graph_pack &gp,
- std::vector<io::SingleRead>& reads,
- pacbio::PacBioMappingIndex<ConjugateDeBruijnGraph>& pac_index,
- PathStorage<Graph>& long_reads, pacbio::GapStorage<Graph>& gaps,
- size_t buf_size, int n, size_t min_gap_quantity, pacbio::StatsCounter& stats) {
- vector<PathStorage<Graph> > long_reads_by_thread(cfg::get().max_threads,
- PathStorage<Graph>(gp.g));
- vector<pacbio::GapStorage<Graph> > gaps_by_thread(cfg::get().max_threads,
- pacbio::GapStorage<Graph>(gp.g, min_gap_quantity));
- vector<pacbio::StatsCounter> stats_by_thread(cfg::get().max_threads);
-
- size_t longer_500 = 0;
- size_t aligned = 0;
- size_t nontrivial_aligned = 0;
-
-# pragma omp parallel for shared(reads, long_reads_by_thread, pac_index, n, aligned, nontrivial_aligned)
- for (size_t i = 0; i < buf_size; ++i) {
- if (i % 1000 == 0) {
- DEBUG("thread number " << omp_get_thread_num());
- }
- size_t thread_num = omp_get_thread_num();
- Sequence seq(reads[i].sequence());
-# pragma omp atomic
- n++;
- auto current_read_mapping = pac_index.GetReadAlignment(seq);
- auto aligned_edges = current_read_mapping.main_storage;
- auto gaps = current_read_mapping.gaps;
- for (auto iter = gaps.begin(); iter != gaps.end(); ++iter)
- gaps_by_thread[thread_num].AddGap(*iter, true);
-
- for (auto iter = aligned_edges.begin(); iter != aligned_edges.end(); ++iter)
- long_reads_by_thread[thread_num].AddPath(*iter, 1, true);
- //counting stats:
- for (auto iter = aligned_edges.begin(); iter != aligned_edges.end(); ++iter) {
- stats_by_thread[thread_num].path_len_in_edges[iter->size()]++;
- }
-# pragma omp critical
- {
-// INFO(current_read_mapping.seed_num);
- if (seq.size() > 500) {
- longer_500++;
- if (aligned_edges.size() > 0) {
- aligned++;
- stats_by_thread[thread_num].seeds_percentage[size_t(
- floor(double(current_read_mapping.seed_num) * 1000.0 / (double) seq.size()))]++;
- for (size_t j = 0; j < aligned_edges.size(); j++) {
- if (aligned_edges[j].size() > 1) {
- nontrivial_aligned++;
- break;
- }
- }
- }
- }
- }
-# pragma omp critical
- {
- VERBOSE_POWER(n, " reads processed");
- }
- }
- INFO("Read batch of size: " << buf_size << " processed; "<< longer_500 << " of them longer than 500; among long reads aligned: " << aligned << "; paths of more than one edge received: " << nontrivial_aligned );
-
- for (size_t i = 0; i < cfg::get().max_threads; i++) {
- long_reads.AddStorage(long_reads_by_thread[i]);
- gaps.AddStorage(gaps_by_thread[i]);
- stats.AddStorage(stats_by_thread[i]);
- }
-}
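
ProcessReadsBatch follows the usual OpenMP accumulate-then-merge pattern: each thread writes only into its own PathStorage/GapStorage/StatsCounter slot, and the per-thread results are merged serially after the parallel loop, so the shared storages never need locking. A minimal standalone sketch of that pattern with plain vectors (illustrative only, no SPAdes types):

#include <omp.h>
#include <cstdio>
#include <vector>

int main() {
    const int n_items = 10000;
    int nthreads = omp_get_max_threads();
    // One result buffer per thread; written without synchronization.
    std::vector<std::vector<int>> per_thread(nthreads);

#pragma omp parallel for
    for (int i = 0; i < n_items; ++i) {
        int t = omp_get_thread_num();
        if (i % 3 == 0)                      // stand-in for "this read aligned"
            per_thread[t].push_back(i);
    }

    // Serial merge into the shared storage, mirroring the AddStorage() calls above.
    std::vector<int> merged;
    for (int t = 0; t < nthreads; ++t)
        merged.insert(merged.end(), per_thread[t].begin(), per_thread[t].end());

    std::printf("collected %zu items\n", merged.size());
    return 0;
}
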
-
-void align_pacbio(conj_graph_pack &gp, int lib_id, bool make_additional_saves) {
- io::ReadStreamList<io::SingleRead> streams;
- for (const auto& reads : cfg::get().ds.reads[lib_id].single_reads())
- //do we need input_file function here?
- streams.push_back(make_shared<io::FixingWrapper>(make_shared<io::FileReadStream>(reads)));
-
- //make_shared<io::FixingWrapper>(make_shared<io::FileReadStream>(file));
- // auto pacbio_read_stream = single_easy_reader(cfg::get().ds.reads[lib_id],
-// false, false);
-
-// io::ReadStreamList<io::SingleRead> streams(pacbio_read_stream);
- // pacbio_read_stream.release();
- int n = 0;
- PathStorage<Graph>& long_reads = gp.single_long_reads[lib_id];
- pacbio::StatsCounter stats;
- size_t min_gap_quantity = 2;
- size_t rtype = 0;
- bool consensus_gap_closing = false;
- if (cfg::get().ds.reads[lib_id].type() == io::LibraryType::PacBioReads ||
- cfg::get().ds.reads[lib_id].type() == io::LibraryType::SangerReads ||
- cfg::get().ds.reads[lib_id].type() == io::LibraryType::NanoporeReads) {
- min_gap_quantity = cfg::get().pb.pacbio_min_gap_quantity;
- rtype = 1;
- consensus_gap_closing = true;
- } else {
- min_gap_quantity = cfg::get().pb.contigs_min_gap_quantity;
- rtype = 2;
- }
- pacbio::GapStorage<ConjugateDeBruijnGraph> gaps(gp.g, min_gap_quantity);
- size_t read_buffer_size = 50000;
- std::vector<io::SingleRead> reads(read_buffer_size);
- io::SingleRead read;
- size_t buffer_no = 0;
- INFO("Usign seed size: " << cfg::get().pb.pacbio_k);
- pacbio::PacBioMappingIndex<ConjugateDeBruijnGraph> pac_index(gp.g,
- cfg::get().pb.pacbio_k,
- cfg::get().K, cfg::get().pb.ignore_middle_alignment);
-
-// path_extend::ContigWriter cw(gp.g);
-// cw.WriteEdges("before_rr_with_ids.fasta");
-// ofstream filestr("pacbio_mapped.mpr");
-// filestr.close();
- for (auto iter = streams.begin(); iter != streams.end(); ++iter) {
- auto &stream = *iter;
- while (!stream.eof()) {
- size_t buf_size = 0;
- for (; buf_size < read_buffer_size && !stream.eof(); ++buf_size)
- stream >> reads[buf_size];
- INFO("Prepared batch " << buffer_no << " of " << buf_size << " reads.");
- DEBUG("master thread number " << omp_get_thread_num());
- ProcessReadsBatch(gp, reads, pac_index, long_reads, gaps, buf_size, n, min_gap_quantity, stats);
- // INFO("Processed batch " << buffer_no);
- ++buffer_no;
- }
- }
- string ss = (rtype == 1 ? "long reads": "contigs");
- INFO("For lib " << lib_id << " of " << ss <<" :");
- stats.report();
- map<EdgeId, EdgeId> replacement;
- size_t min_stats_cutoff =(rtype == 1 ? 1 : 0);
- if (make_additional_saves)
- long_reads.DumpToFile(cfg::get().output_saves + "long_reads_before_rep.mpr",
- replacement, min_stats_cutoff, true);
- gaps.DumpToFile(cfg::get().output_saves + "gaps.mpr");
- gaps.PadGapStrings();
- if (make_additional_saves)
- gaps.DumpToFile(cfg::get().output_saves + "gaps_padded.mpr");
- pacbio::PacbioGapCloser<Graph> gap_closer(gp.g, consensus_gap_closing);
- gap_closer.ConstructConsensus(cfg::get().max_threads, gaps);
- gap_closer.CloseGapsInGraph(replacement);
- long_reads.ReplaceEdges(replacement);
- for(int j = 0; j < lib_id; j++) {
- gp.single_long_reads[j].ReplaceEdges(replacement);
- }
-
- gap_closer.DumpToFile(cfg::get().output_saves + "gaps_pb_closed.fasta");
- INFO("PacBio aligning finished");
- return;
-}
-
-void PacBioAligning::run(conj_graph_pack &gp, const char*) {
- using namespace omnigraph;
- omnigraph::DefaultLabeler<Graph> labeler(gp.g, gp.edge_pos);
- int lib_id = -1;
- bool make_additional_saves = parent_->saves_policy().make_saves_;
- for (size_t i = 0; i < cfg::get().ds.reads.lib_count(); ++i) {
- if ( cfg::get().ds.reads[i].is_pacbio_alignable() ) {
- lib_id = (int) i;
- align_pacbio(gp, lib_id, make_additional_saves);
- }
- }
-
- if (lib_id == -1)
- INFO("no PacBio lib found");
-
- stats::detail_info_printer printer(gp, labeler, cfg::get().output_dir);
- printer(ipp_final_gap_closed);
-}
-
-}
-
diff --git a/src/debruijn/pacbio_aligning.hpp b/src/debruijn/pacbio_aligning.hpp
deleted file mode 100644
index 620e76d..0000000
--- a/src/debruijn/pacbio_aligning.hpp
+++ /dev/null
@@ -1,23 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include "stage.hpp"
-
-namespace debruijn_graph {
-
-class PacBioAligning : public spades::AssemblyStage {
-public:
- PacBioAligning()
- : AssemblyStage("PacBio Aligning", "pacbio_aligning") {
- }
- void run(conj_graph_pack &gp, const char*);
-};
-
-}
-
diff --git a/src/debruijn/pair_info_count.cpp b/src/debruijn/pair_info_count.cpp
deleted file mode 100644
index 433b3ee..0000000
--- a/src/debruijn/pair_info_count.cpp
+++ /dev/null
@@ -1,249 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#include "standard.hpp"
-#include "dataset_readers.hpp"
-#include "read_converter.hpp"
-
-#include "de/paired_info.hpp"
-
-#include "utils.hpp"
-#include "stats/debruijn_stats.hpp"
-
-#include "is_counter.hpp"
-#include "pair_info_count.hpp"
-#include "sequence_mapper.hpp"
-#include "short_read_mapper.hpp"
-#include "long_read_mapper.hpp"
-#include "pair_info_filler.hpp"
-#include "stats/debruijn_stats.hpp"
-#include "path_extend/split_graph_pair_info.hpp"
-#include "bwa_pair_info_filler.hpp"
-
-namespace debruijn_graph {
-
-typedef io::SequencingLibrary<debruijn_config::DataSetData> SequencingLib;
-
-bool RefineInsertSizeForLib(conj_graph_pack& gp, size_t ilib, size_t edge_length_threshold) {
-
- INFO("Estimating insert size (takes a while)");
- InsertSizeCounter hist_counter(gp, edge_length_threshold, /* ignore negative */ true);
- SequenceMapperNotifier notifier(gp);
- notifier.Subscribe(ilib, &hist_counter);
-
- SequencingLib& reads = cfg::get_writable().ds.reads[ilib];
- VERIFY(reads.data().read_length != 0);
- auto paired_streams = paired_binary_readers(reads, false);
- notifier.ProcessLibrary(paired_streams, ilib, *ChooseProperMapper(gp, reads));
-
- INFO(hist_counter.mapped() << " paired reads (" <<
- ((double) hist_counter.mapped() * 100.0 / (double) hist_counter.total()) << "% of all) aligned to long edges");
- if (hist_counter.negative() > 3 * hist_counter.mapped())
- WARN("Too much reads aligned with negative insert size. Is the library orientation set properly?");
- if (hist_counter.mapped() == 0)
- return false;
-
- std::map<size_t, size_t> percentiles;
- hist_counter.FindMean(reads.data().mean_insert_size, reads.data().insert_size_deviation, percentiles);
- hist_counter.FindMedian(reads.data().median_insert_size, reads.data().insert_size_mad, reads.data().insert_size_distribution);
- if (reads.data().median_insert_size < gp.k_value + 2) {
- return false;
- }
-
- std::tie(reads.data().insert_size_left_quantile, reads.data().insert_size_right_quantile) = omnigraph::GetISInterval(0.8, reads.data().insert_size_distribution);
-
- return !reads.data().insert_size_distribution.empty();
-}
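
The statistics stored here (mean, median, MAD and an inner quantile interval of the insert size) can all be read off a distance-to-count histogram by walking cumulative counts. A hedged standalone sketch of that computation; FindQuantile and the map layout are assumptions for illustration, not the SPAdes API:

#include <cstdio>
#include <cstdlib>
#include <map>

// Returns the smallest insert size whose cumulative count reaches frac of the total.
static int FindQuantile(const std::map<int, size_t>& hist, double frac) {
    size_t total = 0;
    for (const auto& kv : hist) total += kv.second;
    size_t cum = 0;
    for (const auto& kv : hist) {
        cum += kv.second;
        if ((double) cum >= frac * (double) total) return kv.first;
    }
    return hist.empty() ? 0 : hist.rbegin()->first;
}

int main() {
    // Toy insert-size histogram: distance -> number of read pairs.
    std::map<int, size_t> hist = {{240, 5}, {250, 20}, {260, 40}, {270, 20}, {300, 5}};
    int median = FindQuantile(hist, 0.5);

    // Median absolute deviation: a histogram of |d - median| reuses the same routine.
    std::map<int, size_t> dev;
    for (const auto& kv : hist) dev[std::abs(kv.first - median)] += kv.second;
    int mad = FindQuantile(dev, 0.5);

    // Inner 80% interval, analogous in spirit to GetISInterval(0.8, ...) above.
    int left = FindQuantile(hist, 0.1), right = FindQuantile(hist, 0.9);
    std::printf("median=%d mad=%d quantiles=[%d,%d]\n", median, mad, left, right);
    return 0;
}
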
-
-void ProcessSingleReads(conj_graph_pack& gp, size_t ilib,
- bool use_binary = true) {
- const SequencingLib& reads = cfg::get().ds.reads[ilib];
- SequenceMapperNotifier notifier(gp);
- SimpleLongReadMapper read_mapper(gp, gp.single_long_reads[ilib]);
- notifier.Subscribe(ilib, &read_mapper);
-
- auto mapper_ptr = ChooseProperMapper(gp, reads);
- if (use_binary) {
- auto single_streams = single_binary_readers(reads, false, true);
- notifier.ProcessLibrary(single_streams, ilib, *mapper_ptr);
- } else {
- auto single_streams = single_easy_readers(reads, false,
- true, /*handle Ns*/false);
- notifier.ProcessLibrary(single_streams, ilib, *mapper_ptr);
- }
- cfg::get_writable().ds.reads[ilib].data().single_reads_mapped = true;
-}
-
-void ProcessPairedReads(conj_graph_pack& gp, size_t ilib, bool map_single_reads) {
- const SequencingLib& reads = cfg::get().ds.reads[ilib];
- bool calculate_threshold = (reads.type() == io::LibraryType::PairedEnd);
- SequenceMapperNotifier notifier(gp);
- INFO("Left insert size qauntile " << reads.data().insert_size_left_quantile << ", right insert size quantile " << reads.data().insert_size_right_quantile);
-
- SimpleLongReadMapper read_mapper(gp, gp.single_long_reads[ilib]);
- if (map_single_reads) {
- notifier.Subscribe(ilib, &read_mapper);
- }
-
- path_extend::SplitGraphPairInfo split_graph(
- gp, (size_t) reads.data().median_insert_size,
- (size_t) reads.data().insert_size_deviation,
- (size_t) reads.data().insert_size_left_quantile,
- (size_t) reads.data().insert_size_right_quantile,
- reads.data().read_length, gp.g.k(),
- cfg::get().pe_params.param_set.split_edge_length,
- reads.data().insert_size_distribution);
- if (calculate_threshold) {
- notifier.Subscribe(ilib, &split_graph);
- }
-
- LatePairedIndexFiller pif(gp.g, PairedReadCountWeight, gp.paired_indices[ilib]);
- notifier.Subscribe(ilib, &pif);
-
- auto paired_streams = paired_binary_readers(reads, false, (size_t) reads.data().mean_insert_size);
- notifier.ProcessLibrary(paired_streams, ilib, *ChooseProperMapper(gp, reads));
- cfg::get_writable().ds.reads[ilib].data().pi_threshold = split_graph.GetThreshold();
-
- if (map_single_reads) {
- ProcessSingleReads(gp, ilib);
- }
-}
-
-bool HasGoodRRLibs() {
- for (size_t i = 0; i < cfg::get().ds.reads.lib_count(); ++i) {
- const auto& lib = cfg::get().ds.reads[i];
- if (lib.is_contig_lib())
- continue;
- if (lib.is_paired() &&
- lib.data().mean_insert_size == 0.0) {
- continue;
- }
- if (lib.is_repeat_resolvable()) {
- return true;
- }
- }
- return false;
-}
-
-bool HasOnlyMP() {
- for (size_t i = 0; i < cfg::get().ds.reads.lib_count(); ++i) {
- if (cfg::get().ds.reads[i].type() == io::LibraryType::PathExtendContigs)
- continue;
- if (cfg::get().ds.reads[i].type() != io::LibraryType::MatePairs && cfg::get().ds.reads[i].type() != io::LibraryType::HQMatePairs) {
- return false;
- }
- }
- return true;
-}
-
-//todo improve logic
-bool ShouldMapSingleReads(size_t ilib) {
- switch (cfg::get().single_reads_rr) {
- case sr_none: {
- return false;
- }
- case sr_all: {
- return true;
- }
- case sr_only_single_libs: {
- //Map when there are no PacBio/paired libs, when only mate-pairs are present, or for a single-read lib itself
- return !HasGoodRRLibs() || HasOnlyMP() || (cfg::get().ds.reads[ilib].type() == io::LibraryType::SingleReads);
- }
- }
- return false;
-}
-
-void PairInfoCount::run(conj_graph_pack &gp, const char*) {
- gp.InitRRIndices();
- gp.EnsureBasicMapping();
-
- //fixme implement better universal logic
- size_t edge_length_threshold = cfg::get().ds.meta ? 1000 : stats::Nx(gp.g, 50);
- INFO("Min edge length for estimation: " << edge_length_threshold);
- bwa_pair_info::BWAPairInfoFiller bwa_counter(gp.g,
- cfg::get().bwa.path_to_bwa,
- path::append_path(cfg::get().output_dir, "bwa_count"),
- cfg::get().max_threads, !cfg::get().bwa.debug);
-
- for (size_t i = 0; i < cfg::get().ds.reads.lib_count(); ++i) {
- const auto& lib = cfg::get().ds.reads[i];
-
- if (cfg::get().bwa.enabled && lib.is_bwa_alignable()) {
- //Run insert size estimation and pair index filler together to save disk space (removes the SAM file right after processing the lib)
- bwa_counter.ProcessLib(i, cfg::get_writable().ds.reads[i], gp.paired_indices[i],
- edge_length_threshold, cfg::get().bwa.min_contig_len);
- }
- else if (lib.is_paired()) {
- INFO("Estimating insert size for library #" << i);
- const auto& lib_data = lib.data();
- size_t rl = lib_data.read_length;
- size_t k = cfg::get().K;
- bool insert_size_refined = RefineInsertSizeForLib(gp, i, edge_length_threshold);
-
- if (!insert_size_refined) {
- cfg::get_writable().ds.reads[i].data().mean_insert_size = 0.0;
- WARN("Unable to estimate insert size for paired library #" << i);
- if (rl > 0 && rl <= k) {
- WARN("Maximum read length (" << rl << ") should be greater than K (" << k << ")");
- } else if (rl <= k * 11 / 10) {
- WARN("Maximum read length (" << rl << ") is probably too close to K (" << k << ")");
- } else {
- WARN("None of paired reads aligned properly. Please, check orientation of your read pairs.");
- }
- continue;
- } else {
- INFO(" Insert size = " << lib_data.mean_insert_size <<
- ", deviation = " << lib_data.insert_size_deviation <<
- ", left quantile = " << lib_data.insert_size_left_quantile <<
- ", right quantile = " << lib_data.insert_size_right_quantile <<
- ", read length = " << lib_data.read_length);
-
- if (lib_data.mean_insert_size < 1.1 * (double) rl) {
- WARN("Estimated mean insert size " << lib_data.mean_insert_size
- << " is very small compared to read length " << rl);
- }
- }
- }
- }
-
- for (size_t i = 0; i < cfg::get().ds.reads.lib_count(); ++i) {
- const auto& lib = cfg::get().ds.reads[i];
- if (lib.is_pacbio_alignable()) {
- INFO("Library #" << i << " was mapped by PacBio mapper, skipping");
- continue;
- }
- else if (lib.is_contig_lib()) {
- INFO("Mapping contigs library #" << i);
- ProcessSingleReads(gp, i, false);
- }
- else if (cfg::get().bwa.enabled && lib.is_bwa_alignable()) {
- INFO("Library #" << i << " was mapped by BWA, skipping");
- continue;
- }
- else {
- INFO("Mapping library #" << i);
- bool map_single_reads = ShouldMapSingleReads(i);
- cfg::get_writable().use_single_reads |= map_single_reads;
-
- if (lib.is_paired() && lib.data().mean_insert_size != 0.0) {
- INFO("Mapping paired reads (takes a while) ");
- ProcessPairedReads(gp, i, map_single_reads);
- } else if (map_single_reads) {
- INFO("Mapping single reads (takes a while) ");
- ProcessSingleReads(gp, i);
- }
-
- if (map_single_reads) {
- INFO("Total paths obtained from single reads: " << gp.single_long_reads[i].size());
- }
- }
- }
-
- SensitiveReadMapper<Graph>::EraseIndices();
-}
-
-}
diff --git a/src/debruijn/pair_info_count.hpp b/src/debruijn/pair_info_count.hpp
deleted file mode 100644
index a9d1f6c..0000000
--- a/src/debruijn/pair_info_count.hpp
+++ /dev/null
@@ -1,24 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include "stage.hpp"
-
-namespace debruijn_graph {
-
-class PairInfoCount : public spades::AssemblyStage {
- public:
- PairInfoCount(bool preliminary = false)
- : AssemblyStage(preliminary ? "Preliminary Paired Information Counting" : "Paired Information Counting",
- preliminary ? "late_pair_info_count_preliminary" : "late_pair_info_count") {}
-
- void run(conj_graph_pack &gp, const char*);
-};
-
-}
-
diff --git a/src/debruijn/pair_info_filler.hpp b/src/debruijn/pair_info_filler.hpp
deleted file mode 100644
index 11a3b7e..0000000
--- a/src/debruijn/pair_info_filler.hpp
+++ /dev/null
@@ -1,119 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-/*
- * pair_info_filler.hpp
- *
- * Created on: Oct 3, 2013
- * Author: andrey
- */
-
-#ifndef PAIR_INFO_FILLER_HPP_
-#define PAIR_INFO_FILLER_HPP_
-
-#include "sequence_mapper_notifier.hpp"
-
-namespace debruijn_graph {
-
-/**
- * For now this ignores the sophisticated case of repeated consecutive
- * occurrences of an edge in a path caused by gaps in the mapping.
- *
- * todo talk with Anton about simplification and speed-up of procedure with little quality loss
- */
-class LatePairedIndexFiller : public SequenceMapperListener {
- typedef std::function<double(MappingRange, MappingRange)> WeightF;
- typedef std::pair<EdgeId, EdgeId> EdgePair;
-public:
- LatePairedIndexFiller(const Graph &graph, WeightF weight_f, omnigraph::de::UnclusteredPairedInfoIndexT<Graph>& paired_index)
- : graph_(graph),
- weight_f_(weight_f),
- paired_index_(paired_index) {
- }
-
- virtual void StartProcessLibrary(size_t threads_count) {
- paired_index_.Init();
- buffer_pi_ = {graph_, threads_count};
- }
-
- virtual void StopProcessLibrary() {
- for (size_t i = 0; i < buffer_pi_.size(); ++i)
- MergeBuffer(i);
-
- buffer_pi_.Clear();
- }
-
- virtual void ProcessPairedRead(size_t thread_index,
- const io::PairedRead& r,
- const MappingPath<EdgeId>& read1,
- const MappingPath<EdgeId>& read2) {
- ProcessPairedRead(buffer_pi_[thread_index], read1, read2, r.distance());
- }
-
- virtual void ProcessPairedRead(size_t thread_index,
- const io::PairedReadSeq& r,
- const MappingPath<EdgeId>& read1,
- const MappingPath<EdgeId>& read2) {
- ProcessPairedRead(buffer_pi_[thread_index], read1, read2, r.distance());
- }
-
- virtual void ProcessSingleRead(size_t,
- const io::SingleReadSeq&,
- const MappingPath<EdgeId>&) {}
-
- virtual void ProcessSingleRead(size_t,
- const io::SingleRead&,
- const MappingPath<EdgeId>&) {}
-
- virtual void MergeBuffer(size_t thread_index) {
- paired_index_.Merge(buffer_pi_[thread_index]);
- buffer_pi_[thread_index].Clear();
- }
-
- virtual ~LatePairedIndexFiller() {}
-
-private:
- void ProcessPairedRead(omnigraph::de::PairedInfoBuffer<Graph>& paired_index,
- const MappingPath<EdgeId>& path1,
- const MappingPath<EdgeId>& path2, size_t read_distance) const {
- for (size_t i = 0; i < path1.size(); ++i) {
- std::pair<EdgeId, MappingRange> mapping_edge_1 = path1[i];
- for (size_t j = 0; j < path2.size(); ++j) {
- std::pair<EdgeId, MappingRange> mapping_edge_2 = path2[j];
-
- EdgePair ep{mapping_edge_1.first, mapping_edge_2.first};
-
-
- double weight = weight_f_(mapping_edge_1.second,
- mapping_edge_2.second);
- size_t kmer_distance = read_distance
- + mapping_edge_2.second.initial_range.end_pos
- - mapping_edge_1.second.initial_range.start_pos;
- int edge_distance = (int) kmer_distance
- + (int) mapping_edge_1.second.mapped_range.start_pos
- - (int) mapping_edge_2.second.mapped_range.end_pos;
-
- paired_index.Add(mapping_edge_1.first, mapping_edge_2.first,
- omnigraph::de::RawPoint(edge_distance, weight));
- }
- }
- }
-
-private:
- const Graph& graph_;
- WeightF weight_f_;
- omnigraph::de::UnclusteredPairedInfoIndexT<Graph>& paired_index_;
- omnigraph::de::PairedInfoBuffersT<Graph> buffer_pi_;
-
- DECL_LOGGER("LatePairedIndexFiller");
-};
-
-
-}
-
-
-#endif /* PAIR_INFO_FILLER_HPP_ */
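
The heart of LatePairedIndexFiller is the distance arithmetic in the private ProcessPairedRead: for each pair of mapped edges, the read-pair separation is converted into a distance between edge positions using the initial (read) and mapped (edge) ranges. Below is a minimal worked example of exactly that formula with plain structs; the types and numbers are illustrative only:

#include <cstdio>

// Toy stand-ins for a mapping range: positions of the mapped block
// on the read (initial_range) and on the edge (mapped_range).
struct Range { int start_pos, end_pos; };
struct Mapping { Range initial_range, mapped_range; };

// Same formula as in ProcessPairedRead above: distance between the two
// edges implied by one read pair, in k-mer coordinates.
int EdgeDistance(const Mapping& m1, const Mapping& m2, int read_distance) {
    int kmer_distance = read_distance
                        + m2.initial_range.end_pos
                        - m1.initial_range.start_pos;
    return kmer_distance
           + m1.mapped_range.start_pos
           - m2.mapped_range.end_pos;
}

int main() {
    // Left read: k-mers 0..80 of the read map to positions 500..580 on edge 1.
    Mapping m1{{0, 80}, {500, 580}};
    // Right read: k-mers 10..90 of its read map to positions 20..100 on edge 2.
    Mapping m2{{10, 90}, {20, 100}};
    int read_distance = 120;  // separation between the two reads of the pair
    std::printf("estimated edge distance: %d\n", EdgeDistance(m1, m2, read_distance));
    return 0;
}
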
diff --git a/src/debruijn/pair_info_improver.hpp b/src/debruijn/pair_info_improver.hpp
deleted file mode 100644
index 8c529f4..0000000
--- a/src/debruijn/pair_info_improver.hpp
+++ /dev/null
@@ -1,235 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include "standard.hpp"
-#include "graph_pack.hpp"
-#include "path_utils.hpp"
-#include "split_path_constructor.hpp"
-#include "de/paired_info_helpers.hpp"
-#include <math.h>
-
-namespace debruijn_graph {
-
-template<class Graph>
-static
-bool TryToAddPairInfo(omnigraph::de::PairedInfoIndexT<Graph>& clustered_index,
- typename Graph::EdgeId e1, typename Graph::EdgeId e2,
- const omnigraph::de::Point& point_to_add) {
- auto histogram = clustered_index.Get(e1, e2);
- for (auto i : histogram)
- if (ClustersIntersect(i, point_to_add))
- return false;
-
- clustered_index.Add(e1, e2, point_to_add);
- return true;
-}
-
-template<class Graph>
-class PairInfoImprover {
- typedef typename Graph::EdgeId EdgeId;
- typedef std::vector<omnigraph::de::PairInfo<EdgeId> > PairInfos;
- typedef std::pair<EdgeId, EdgeId> EdgePair;
- typedef omnigraph::de::PairedInfoIndexT<Graph> Index;
-
- public:
- PairInfoImprover(const Graph& g,
- Index& clustered_index,
- const io::SequencingLibrary<debruijn_config::DataSetData> &lib)
- : graph_(g), index_(clustered_index), lib_(lib) { }
-
- void ImprovePairedInfo(unsigned num_threads = 1) {
- CorrectPairedInfo(num_threads);
- CorrectPairedInfo(num_threads);
- }
-
- private:
- void CorrectPairedInfo(unsigned nthreads) {
- size_t missing_paired_info_count = 0;
- size_t extra_paired_info_count = 0;
- extra_paired_info_count = RemoveContradictional(nthreads);
- missing_paired_info_count = FillMissing(nthreads);
-
- INFO("Paired info stats: missing = " << missing_paired_info_count
- << "; contradictional = " << extra_paired_info_count);
- }
-
- class ContradictionalRemover {
- public:
- ContradictionalRemover(omnigraph::de::PairedInfoIndicesT<Graph> &to_remove,
- const Graph &g,
- omnigraph::de::PairedInfoIndexT<Graph>& index)
- : to_remove_(to_remove), graph_(g), index_(index) {}
-
- bool operator()(EdgeId e) {
- omnigraph::de::PairedInfoIndexT<Graph> &to_remove = to_remove_[omp_get_thread_num()];
-
- if (graph_.length(e)>= cfg::get().max_repeat_length && index_.contains(e))
- FindInconsistent(e, to_remove);
-
- return false;
- }
-
- private:
- bool IsConsistent(EdgeId /*e*/, EdgeId e1, EdgeId e2,
- const omnigraph::de::Point& p1, const omnigraph::de::Point& p2) const {
- if (math::le(p1.d, 0.f) || math::le(p2.d, 0.f) || math::gr(p1.d, p2.d))
- return true;
-
- double pi_dist = p2.d - p1.d;
- int first_length = (int) graph_.length(e1);
- double var = p1.var + p2.var;
-
- TRACE(" PI " << p1 << " tr " << omp_get_thread_num());
- TRACE("vs PI " << p2 << " tr " << omp_get_thread_num());
-
- if (math::le(pi_dist, double(first_length) + var) &&
- math::le(double(first_length), pi_dist + var)) {
- if (graph_.EdgeEnd(e1) == graph_.EdgeStart(e2))
- return true;
-
- auto paths = GetAllPathsBetweenEdges(graph_, e1, e2, 0, (size_t) ceil(pi_dist - first_length + var));
- return (paths.size() > 0);
- } else {
- if (math::gr(p2.d, p1.d + first_length)) {
- auto paths = GetAllPathsBetweenEdges(graph_, e1, e2,
- (size_t) floor(pi_dist - first_length - var),
- (size_t) ceil(pi_dist - first_length + var));
- return (paths.size() > 0);
- }
- return false;
- }
- }
-
- // Checking the consistency of two edge pairs (e, e_1) and (e, e_2) for all pairs (base_edge, <some_edge>)
- void FindInconsistent(EdgeId base_edge,
- Index& pi) const {
- for (auto i1 : index_.Get(base_edge)) {
- auto e1 = i1.first;
- for (auto i2 : index_.Get(base_edge)) {
- auto e2 = i2.first;
- if (e1 == e2)
- continue;
- for (auto p1 : i1.second) {
- for (auto p2 : i2.second) {
- if (!IsConsistent(base_edge, e1, e2, p1, p2)) {
- if (math::le(p1.weight, p2.weight))
- pi.Add(base_edge, e1, p1);
- else
- pi.Add(base_edge, e2, p2);
- }
- }
- }
- }
- }
- }
-
- omnigraph::de::PairedInfoIndicesT<Graph> &to_remove_;
- const Graph &graph_;
- Index& index_;
- };
-
- size_t RemoveContradictional(unsigned nthreads) {
- size_t cnt = 0;
-
- omnigraph::de::PairedInfoIndicesT<Graph> to_remove(graph_, nthreads);
-
- // FIXME: Replace with lambda
- ContradictionalRemover remover(to_remove, graph_, index_);
- ParallelEdgeProcessor<Graph>(graph_, nthreads).Run(remover);
-
- DEBUG("ParallelRemoveContraditional: Threads finished");
-
- DEBUG("Merging maps");
- for (size_t i = 1; i < nthreads; ++i) {
- to_remove[0].Merge(to_remove[i]);
- to_remove[i].Clear();
- }
- DEBUG("Resulting size " << to_remove[0].size());
-
- DEBUG("Deleting paired infos, liable to removing");
- for (auto I = omnigraph::de::raw_pair_begin(to_remove[0]);
- I != omnigraph::de::raw_pair_end(to_remove[0]); ++I) {
- cnt += DeleteIfExist(I.first(), I.second(), *I);
- }
- to_remove[0].Clear();
-
- DEBUG("Size of index " << index_.size());
- DEBUG("ParallelRemoveContraditional: Clean finished");
- return cnt;
-
- }
-
- size_t FillMissing(unsigned nthreads) {
- DEBUG("Fill missing: Creating indexes");
- const size_t NUM_CHUNKS = nthreads * 16;
- omnigraph::de::PairedInfoIndicesT<Graph> to_add(graph_, NUM_CHUNKS);
-
- SplitPathConstructor<Graph> spc(graph_);
- IterationHelper<Graph, EdgeId> edges(graph_);
- auto iters = edges.Chunks(NUM_CHUNKS);
-
- DEBUG("Fill missing: Start threads");
- #pragma omp parallel for schedule(guided)
- for (size_t i = 0; i < iters.size() - 1; ++i) {
- TRACE("Processing chunk #" << i);
- for (auto e = iters[i]; e != iters[i + 1]; ++e) {
- TRACE("Checking for edge " << *e);
- auto paths = spc.ConvertPIToSplitPaths(*e, index_,
- lib_.data().mean_insert_size,
- lib_.data().insert_size_deviation);
- for (const auto &path : paths) {
- TRACE("Path " << path.PrintPath(graph_));
- for (const auto &pi : path)
- TryToAddPairInfo(to_add[i], pi.first, pi.second, pi.point);
- }
- }
- }
- //ParallelEdgeProcessor<Graph>(graph_, nthreads).Run(filler);
- DEBUG("Fill missing: Threads finished");
-
- size_t cnt = 0;
- for (size_t i = 0; i < iters.size() - 1; ++i) {
- DEBUG("Adding map #" << i);
- for (auto I = omnigraph::de::raw_pair_begin(to_add[i]);
- I != omnigraph::de::raw_pair_end(to_add[i]);
- ++I) {
- EdgeId e1 = I.first();
- EdgeId e2 = I.second();
- for (auto p : *I)
- cnt += TryToAddPairInfo(index_, e1, e2, p);
- }
- to_add[i].Clear();
- }
-
- DEBUG("Size of paired index " << index_.size());
-
- DEBUG("Fill missing: Clean finished");
- DEBUG("Added " << cnt);
- return cnt;
- }
-
- private:
- size_t DeleteIfExist(EdgeId e1, EdgeId e2, const typename Index::FullHistProxy& infos) {
- size_t cnt = 0;
- for (auto point : infos) {
- cnt += index_.Remove(e1, e2, point);
- TRACE("cnt += " << cnt);
- }
-
- return cnt;
- }
-
- const Graph& graph_;
- Index& index_;
- const io::SequencingLibrary<debruijn_config::DataSetData>& lib_;
-
- DECL_LOGGER("PairInfoImprover")
-};
-
-}
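
The numeric core of IsConsistent above is an interval test: two points (base, e1, d1) and (base, e2, d2) with d1 < d2 are compatible only if a path whose length is close to d2 - d1 - length(e1), within the summed variances, can connect the end of e1 to the start of e2; when that interval contains zero, direct adjacency already suffices. A hedged sketch of just the numeric part, leaving out the graph search done by GetAllPathsBetweenEdges (names are illustrative):

#include <cstdio>

struct Interval { double lo, hi; };

// Interval of connecting-path lengths that would make the two paired-info
// points consistent: the implied gap from the end of e1 to the start of e2,
// widened by the summed variances of the two points.
Interval RequiredPathLength(double d1, double d2, double len_e1,
                            double var1, double var2) {
    double gap = d2 - d1 - len_e1;
    double var = var1 + var2;
    return {gap - var, gap + var};
}

int main() {
    // Points 1500 +/- 20 and 2600 +/- 30 from the same base edge, length(e1) = 1000:
    Interval iv = RequiredPathLength(1500, 2600, 1000, 20, 30);
    std::printf("a connecting path of length in [%.0f, %.0f] is required\n", iv.lo, iv.hi);
    // If 0 falls inside the interval, direct adjacency of e1 and e2 already suffices.
    return 0;
}
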
diff --git a/src/debruijn/paired_statistics.hpp b/src/debruijn/paired_statistics.hpp
deleted file mode 100644
index 6493c73..0000000
--- a/src/debruijn/paired_statistics.hpp
+++ /dev/null
@@ -1,1058 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-template<class Graph>
-class PairInfoChecker {
-private:
- typedef typename Graph::EdgeId EdgeId;
- const EdgesPositionHandler<Graph> &positions_;
- size_t first_bound_;
- const size_t second_bound_;
- vector<double> perfect_matches_;
- vector<double> good_matches_;
- vector<double> mismatches_;
- vector<double> imperfect_matches_;
-
-public:
- PairInfoChecker(const EdgesPositionHandler<Graph> &positions,
- size_t first_bound, size_t second_bound) :
- positions_(positions), first_bound_(first_bound), second_bound_(
- second_bound) {
- }
-
- void Check(const de::PairedInfoIndex<Graph> &paired_index) {
- for (auto it = paired_index.begin(); it != paired_index.end(); ++it) {
- auto vec = *it;
- for (auto vec_it = vec.begin(); vec_it != vec.end(); ++vec_it) {
- size_t code = CheckSingleInfo(*vec_it);
- if (code == 0) {
- perfect_matches_.push_back(vec_it->weight);
- } else if (code == 1) {
- good_matches_.push_back(vec_it->weight);
- } else if (code == 2) {
- mismatches_.push_back(vec_it->weight);
- } else if (code == 3) {
- imperfect_matches_.push_back(vec_it->weight);
- }
- }
- }
- }
-
- size_t CheckSingleInfo(de::PairInfo<EdgeId> info) {
- const vector<EdgePosition> &pos1 = positions_.GetEdgePositions(
- info.first);
- const vector<EdgePosition> &pos2 = positions_.GetEdgePositions(
- info.second);
- bool good_match_found = false;
- for (size_t i = 0; i < pos1.size(); i++)
- for (size_t j = 0; j < pos2.size(); j++) {
- if (abs(pos1[i].mr.initial_range.start_pos + info.d - pos2[j].mr.initial_range.start_pos)
- <= first_bound_ + info.variance) {
- if (info.variance == 0) {
- return 0;
- } else {
- return 3;
- }
- } else if (abs(pos1[i].mr.initial_range.start_pos + info.d - pos2[j].mr.initial_range.start_pos)
- <= second_bound_) {
- good_match_found = true;
- }
- }
- if (good_match_found) {
- return 1;
- } else {
- return 2;
- }
- }
-
- void WriteResultsToFile(vector<double> results, const string &file_name) {
- sort(results.begin(), results.end());
- ofstream os;
- os.open(file_name.c_str());
- for (size_t i = 0; i < results.size(); i++) {
- os << results[i] << endl;
- }
- os.close();
- }
-
- void WriteResults(const string &folder_name) {
- path::make_dir(folder_name);
- WriteResultsToFile(perfect_matches_,
- folder_name + "/perfect_matches.txt");
- WriteResultsToFile(good_matches_, folder_name + "/good_matches.txt");
- WriteResultsToFile(mismatches_, folder_name + "/mismatches.txt");
- WriteResultsToFile(imperfect_matches_,
- folder_name + "/imperfect_matches.txt");
- }
-};
-
-template<class Graph>
-class TrivialEdgePairChecker {
-private:
- typedef typename Graph::VertexId VertexId;
- typedef typename Graph::EdgeId EdgeId;
- const Graph &graph_;
- const size_t bound_;
-public:
- TrivialEdgePairChecker(const Graph &graph, size_t bound = (size_t) - 1) :
- graph_(graph), bound_(bound) {
- }
-
- /*
- * Very bad code. Shame on me.
- */
- bool GoForward(EdgeId &edge) {
- if (!graph_.CheckUniqueOutgoingEdge(graph_.EdgeEnd(edge))) {
- return false;
- }
- edge = graph_.GetUniqueOutgoingEdge(graph_.EdgeEnd(edge));
- return true;
- }
-
- bool GoBackward(EdgeId &edge) {
- if (!graph_.CheckUniqueIncomingEdge(graph_.EdgeStart(edge))) {
- return false;
- }
- edge = graph_.GetUniqueIncomingEdge(graph_.EdgeStart(edge));
- return true;
- }
-
- bool CheckForward(EdgeId edge1, EdgeId edge2) {
- set<EdgeId> was;
- size_t length = 0;
- do {
- if (edge1 == edge2)
- return true;
- if (was.count(edge1) != 0)
- return false;
- was.insert(edge1);
- length += graph_.length(edge1);
- } while (length <= bound_ && GoForward(edge1));
- return false;
- }
-
- bool CheckBackward(EdgeId edge1, EdgeId edge2) {
- set<EdgeId> was;
- size_t length = 0;
- do {
- if (edge1 == edge2)
- return true;
- if (was.count(edge1) != 0)
- return false;
- was.insert(edge1);
- length += graph_.length(edge1);
- } while (length <= bound_ && GoBackward(edge1));
- return false;
- }
-
- bool Check(EdgeId edge1, EdgeId edge2) {
- return CheckForward(edge1, edge2) || CheckBackward(edge2, edge1)
- /*|| CheckForward(edge2, edge1) || CheckBackward(edge1, edge2)*/;
- }
-};
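
TrivialEdgePairChecker treats an edge pair as trivially connected when one edge is reachable from the other by repeatedly taking the unique outgoing (or incoming) edge without exceeding a length bound. A standalone sketch of the forward walk on a toy adjacency structure (illustrative types, not the SPAdes graph API):

#include <cstdio>
#include <map>
#include <set>
#include <vector>

// Toy graph: edge id -> (length, list of successor edge ids).
struct ToyEdge { size_t length; std::vector<int> next; };
using ToyGraph = std::map<int, ToyEdge>;

// Follow unique successors from e1; report whether e2 is reached
// before the accumulated length exceeds the bound or a cycle repeats.
bool TriviallyConnectedForward(const ToyGraph& g, int e1, int e2, size_t bound) {
    std::set<int> seen;
    size_t length = 0;
    int cur = e1;
    while (true) {
        if (cur == e2) return true;
        if (!seen.insert(cur).second) return false;              // cycle
        const ToyEdge& e = g.at(cur);
        length += e.length;
        if (length > bound || e.next.size() != 1) return false;  // too far or branching
        cur = e.next[0];
    }
}

int main() {
    ToyGraph g = {{1, {100, {2}}}, {2, {50, {3}}}, {3, {200, {}}}};
    std::printf("%d\n", TriviallyConnectedForward(g, 1, 3, 1000));  // prints 1
    return 0;
}
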
-
-template<class Graph>
-class EdgePairStat: public AbstractStatCounter {
-
-private:
- typedef typename Graph::EdgeId EdgeId;
- typedef pair<EdgeId, EdgeId> EdgePair;
- const Graph& graph_;
- const PairedInfoIndexT<Graph>& pair_info_;
- const string& output_folder_;
-
-public:
- EdgePairStat(const Graph &graph, const PairedInfoIndexT<Graph> &pair_info,
- const string &output_folder) :
- graph_(graph), pair_info_(pair_info), output_folder_(output_folder) {
- }
-
- virtual ~EdgePairStat() {
- }
-
-private:
- vector<double> GetWeights(map<EdgePair, double>& edge_pairs) {
- vector<double> weights;
- for (auto it = edge_pairs.begin(); it != edge_pairs.end(); ++it) {
- weights.push_back(it->second);
- }
- sort(weights.begin(), weights.end());
- return weights;
- }
-
- void GetPairInfo(map<EdgePair, double> &edge_pairs, PairedInfoIndexT<Graph>& index) {
- for (auto it = index.begin(); it != index.end(); ++it) {
- de::Histogram v = *it;
- size_t w = 0;
- for (auto I = v.begin(); I != v.end(); ++I)
- w += (size_t) I->weight;
-
- edge_pairs.insert(make_pair(make_pair(it.first(), it.second()), w));
- }
- }
-
- void RemoveTrivial(map<pair<EdgeId, EdgeId> , double> &edge_pairs) {
- TrivialEdgePairChecker<Graph> checker(graph_);
- for (auto iterator = edge_pairs.begin(); iterator != edge_pairs.end();
- ) {
- if (checker.Check(iterator->first.first, iterator->first.second)) {
- edge_pairs.erase(iterator++);
- } else {
- ++iterator;
- }
- }
- }
-
- // void RemoveUntrustful(map<pair<EdgeId, EdgeId> , double> &edge_pairs, double bound) {
- // vector<double> weights;
- // for (auto iterator = edge_pairs.begin(); iterator != edge_pairs.end(); ++iterator) {
- // weights.push_back(iterator->second);
- // }
- // sort(weights.begin(), weights.end());
- //
- // for (auto iterator = edge_pairs.begin(); iterator != edge_pairs.end();) {
- // if(iterator->second < bound) {
- // edge_pairs.erase(iterator++);
- // } else {
- // ++iterator;
- // }
- // }
- // }
-
-public:
- vector<pair<int, double>> ComulativeHistogram(vector<double> weights) {
- vector<pair<int, double>> result;
- int cur = weights.size() - 1;
- size_t max = 1000;
- vector<double> res(max);
- for (int i = max - 1; i >= 0; i--) {
- while (cur >= 0 && weights[cur] >= i + 1) {
- cur--;
- }
- res[i] = weights.size() - 1 - cur;
- }
- for (size_t i = 0; i < weights.size(); i++) {
- result.push_back(make_pair(i + 1, res[i]));
- }
- return result;
- }
-
- // void OutputWeights(vector<double> weights, string file_name) {
- // ofstream os(file_name);
- // size_t cur = weights.size() - 1;
- // size_t max = 1000;
- // vector<double> res(max);
- // for(int i = max - 1; i >= 0; i--) {
- // while(cur >= 0 && weights[cur] >= i + 1) {
- // cur--;
- // }
- // res[i] = weights.size() - 1 - cur;
- // }
- // for(size_t i = 0; i < weights.size(); i++) {
- // os << i + 1 << " " << res[i] << endl;
- // }
- // os.close();
- // }
-
- bool ContainsPositiveDistance(EdgeId e1, const Histogram& infos) const {
- int first_len = int(graph_.length(e1));
- for (auto point : infos) {
- if (rounded_d(point) > first_len)
- return true;
- }
- return false;
- }
-
- virtual void Count() {
- typedef pair<EdgeId, EdgeId> EdgePair;
- PairedInfoIndexT<Graph> new_index = pair_info_;
- PairInfoWeightFilter<Graph>(graph_, 40).Filter(new_index);
- map<EdgePair, double> edge_pairs;
- TrivialEdgePairChecker<Graph> checker(graph_);
- size_t nontrivial = 0;
- size_t pair_number = 0;
- for (auto iterator = new_index.begin(); iterator != new_index.end(); ++iterator) {
- Histogram info = *iterator;
- if (ContainsPositiveDistance(iterator.first(), info)) {
- ++pair_number;
- if (checker.Check(iterator.first(), iterator.second())) {
- ++nontrivial;
- }
- }
- }
- GetPairInfo(edge_pairs, new_index);
- INFO("Number of edge pairs connected with paired info: " << pair_number);
- RemoveTrivial(edge_pairs);
- INFO("Number of nontrivial edge pairs connected with paired info: " << nontrivial);
- }
-};
-
-template<class Graph>
-class UniquePathStat: public AbstractStatCounter {
-
- typedef typename Graph::EdgeId EdgeId;
- const Graph& g_;
- const PairedInfoIndexT<Graph>& filtered_index_;
- size_t insert_size_;
- size_t max_read_length_;
- size_t gap_;
- double variance_delta_;
-
- size_t considered_edge_pair_cnt_;
- size_t unique_distance_cnt_;
- size_t non_unique_distance_cnt_;
-
- bool ContainsPositiveDistance(EdgeId e1, const de::Histogram& infos) const {
- int first_len = int(g_.length(e1));
- for (auto it = infos.begin(); it != infos.end(); ++it) {
- if (rounded_d(*it) > first_len)
- return true;
- }
- return false;
- }
-
-public:
-
- UniquePathStat(const Graph& g, const PairedInfoIndexT<Graph>& filtered_index,
- size_t insert_size, size_t max_read_length, double variance_delta) :
- g_(g), filtered_index_(filtered_index), insert_size_(insert_size), max_read_length_(
- max_read_length), gap_(insert_size_ - 2 * max_read_length_), variance_delta_(
- variance_delta), considered_edge_pair_cnt_(0), unique_distance_cnt_(
- 0), non_unique_distance_cnt_(0) {
-
- }
-
- virtual void Count() {
- // PairedInfoIndexT<Graph> filtered_index(g_);
- // PairInfoFilter < Graph > (g_, 40).Filter(pair_info_, filtered_index);
-
- for (auto it = filtered_index_.begin(); it != filtered_index_.end(); ++it)
- {
- if (ContainsPositiveDistance(it.first(), *it)) {
- considered_edge_pair_cnt_++;
- EdgeId e1 = it.first();
- EdgeId e2 = it.second();
-
- // cout << "Finding paths between edges " << e1 << " and " << e2 << endl;
- NonEmptyPathCounter<Graph> counter(g_);
- // VertexLablerCallback<Graph> graph_labeler(g_);
- // CompositeCallback<Graph> composite_callback;
- // composite_callback.AddProcessor(counter);
- // composite_callback.AddProcessor(graph_labeler);
- PathProcessor<Graph> path_processor(
- g_,
- omnigraph::PairInfoPathLengthLowerBound(g_.k(),
- g_.length(e1), g_.length(e2), (int) gap_,
- variance_delta_),
- omnigraph::PairInfoPathLengthUpperBound(g_.k(),
- insert_size_, variance_delta_),
- g_.EdgeEnd(e1),
- g_.EdgeStart(e2),
- counter);
- path_processor.Process();
- if (counter.count() == 1) {
- unique_distance_cnt_++;
- }
- if (counter.count() > 1) {
- non_unique_distance_cnt_++;
-
- }
- }
- }
- INFO("Considered " << considered_edge_pair_cnt_ << " edge pairs")INFO(
- unique_distance_cnt_ << " edge pairs connected with unique path of appropriate length")
- INFO(
- non_unique_distance_cnt_ << " edge pairs connected with non-unique path of appropriate length")
- }
-
- size_t considered_edge_pair_count() {
- return considered_edge_pair_cnt_;
- }
-
- size_t unique_distance_count() {
- return unique_distance_cnt_;
- }
-
- size_t non_unique_distance_count() {
- return non_unique_distance_cnt_;
- }
-private:
- DECL_LOGGER("UniquePathStat")
-};
-
-template<class Graph>
-class MatePairTransformStat: public AbstractStatCounter {
-
- typedef typename Graph::EdgeId EdgeId;
-
- public:
- MatePairTransformStat(const Graph& g, const PairedInfoIndexT<Graph>& pair_info) :
- g_(g), pair_info_(pair_info), considered_dist_cnt_(0),
- unique_distance_cnt_(0), non_unique_distance_cnt_(0)
- {
- }
-
- virtual void Count() {
- for (auto it = pair_info_.begin(); it != pair_info_.end(); ++it) {
- de::Histogram infos = *it;
- EdgeId e1 = it.first();
- EdgeId e2 = it.second();
- ProcessInfos(e1, e2, infos);
- }
- INFO("Considered " << considered_dist_cnt_ << " edge pair distances (including trivial)");
- INFO(unique_distance_cnt_ << " edge distances connected with unique path of appropriate length");
- INFO(non_unique_distance_cnt_ << " edge distances connected with non-unique path of appropriate length");
- }
-
- size_t considered_edge_pair_count() {
- return considered_dist_cnt_;
- }
-
- size_t unique_distance_count() {
- return unique_distance_cnt_;
- }
-
- size_t non_unique_distance_count() {
- return non_unique_distance_cnt_;
- }
-
- private:
- const Graph& g_;
- const PairedInfoIndexT<Graph>& pair_info_;
-
- size_t considered_dist_cnt_;
- size_t unique_distance_cnt_;
- size_t non_unique_distance_cnt_;
-
- void ProcessInfos(EdgeId e1, EdgeId e2, const Histogram& infos) {
- for (auto it = infos.begin(); it != infos.end(); ++it) {
- Point point = *it;
- if (gr(point.d, 0.)) {
- if (eq(point.var, 0.)) {
-
- PathStorageCallback<Graph> counter(g_);
-
- PathProcessor<Graph> path_processor(g_,
- (size_t) (point.d - (double) g_.length(e1)),
- (size_t) (point.d - (double) g_.length(e1)),
- g_.EdgeEnd(e1), g_.EdgeStart(e2), counter);
- path_processor.Process();
-
- TRACE("Edges" << e1 << " : " << e2 << ": " << point.weight << " : " << point.d);
- TRACE("Path Numbs" << counter.size());
-
- if (counter.size() == 1)
- ++unique_distance_cnt_;
- if (counter.size() > 1)
- ++non_unique_distance_cnt_;
- }
- else
- non_unique_distance_cnt_++;
-
- considered_dist_cnt_++;
- }
- }
- }
-
- DECL_LOGGER("MatePairTransformStat")
-};
-
-template<class Graph>
-class UniqueDistanceStat: public AbstractStatCounter {
- typedef omnigraph::de::PairedInfoIndexT<Graph> PairedIndex;
-
- const PairedIndex& paired_info_;
- size_t unique_;
- size_t non_unique_;
-public:
-
- UniqueDistanceStat(const PairedIndex& paired_info) :
- paired_info_(paired_info), unique_(0), non_unique_(0) {
-
- }
-
- virtual ~UniqueDistanceStat() {
-
- }
-
- virtual void Count() {
- for (auto it = paired_info_.begin(); it != paired_info_.end(); ++it) {
- VERIFY((*it).size() > 0);
- if ((*it).size() > 1) {
- non_unique_++;
- // for (auto info_it = (*it).begin(); info_it != (*it).end(); ++info_it) {
- // //todo
- // }
- } else {
- unique_++;
- }
- }
- INFO(unique_ << " unique edge distances");
- INFO(non_unique_ << " non-unique edge distances");
- }
-
- size_t unique() {
- return unique_;
- }
-
- size_t non_unique() {
- return non_unique_;
- }
-};
-
-template<class Graph, class Index>
-class EstimationQualityStat: public AbstractStatCounter {
-private:
- typedef typename Graph::EdgeId EdgeId;
- typedef PairInfo<EdgeId> Info;
- typedef vector<Info> Infos;
- //input fields
- const Graph &graph_;
- const EdgeQuality<Graph, Index>& quality_;
- const PairedInfoIndex<Graph>& pair_info_;
- const PairedInfoIndex<Graph>& estimated_pair_info_;
- const PairedInfoIndex<Graph>& etalon_pair_info_;
-
- //output fields
- PairedInfoIndex<Graph> false_positives_;
- PairedInfoIndex<Graph> perfect_matches_;
- PairedInfoIndex<Graph> imperfect_matches_;
- PairedInfoIndex<Graph> false_negatives_;
-
-// PairedInfoIndexT<Graph> false_positive_weights_;
-// set<Info> false_positive_infos_;
-// vector<double> perfect_match_weights_;
-////(weight, estimated_variance - actual_variance, number of etalon points)
-// vector<pair<pair<double, double> , size_t>> imperfect_match_stat_;
-// size_t false_negative_count_;
-// vector<Info> false_negative_infos_;
-
- bool CheckInterestInInfo(const Info& info) {
- if (math::ls(info.d, 0.)) return false;
- if (info.first == info.second && math::eq(info.d, 0.)) return false;
- return quality_.IsPositiveQuality(info.first)
- && quality_.IsPositiveQuality(info.second) && math::gr(info.weight, 0.);
- }
-
- void HandleFalsePositive(const Info& estimated) {
-// DEBUG("Handling false positive " << estimated);
- if (CheckInterestInInfo(estimated))
- false_positives_.AddPairInfo(estimated, false);
- }
-
- void HandleFalseNegative(const Info& etalon) {
- if (CheckInterestInInfo(etalon))
- false_negatives_.AddPairInfo(etalon, false);
- }
-
- void HandlePerfectMatch(const Info& etalon, const Info& estimated) {
- if (CheckInterestInInfo(estimated))
- perfect_matches_.AddPairInfo(estimated, false);
- }
-
- void HandleImperfectMatch(const Info &estimated_cluster,
- const Infos& etalon_matches) {
- if (CheckInterestInInfo(estimated_cluster))
- imperfect_matches_.AddPairInfo(estimated_cluster, false);
-// double etalon_variance = etalon_matches[etalon_matches.size() - 1].d
-// - etalon_matches[0].d;
-// imperfect_match_stat_.push_back(
-// make_pair(
-// make_pair(estimated_cluster.weight,
-// estimated_cluster.variance - etalon_variance),
-// etalon_matches.size()));
- }
-
-// void Flush() {
-// ProcessImperfectMatch(last_estimated_imperfect_match_,
-// last_etalon_imperfect_matches_);
-// }
-
- void HandlePairsNotInEtalon(
- const set<pair<EdgeId, EdgeId>>& pairs_in_etalon) {
- for (auto it = estimated_pair_info_.begin();
- it != estimated_pair_info_.end(); ++it) {
- Infos estimated_infos = *it;
- EdgeId first = estimated_infos[0].first;
- EdgeId second = estimated_infos[0].second;
- if (pairs_in_etalon.count(make_pair(first, second)) == 0) {
- // for_each(estimated_infos.begin(), estimated_infos.end(),
- // boost::bind(&EstimationQualityStat::HandleFalsePositive, this, _1));
-
- for (auto it2 = estimated_infos.begin();
- it2 != estimated_infos.end(); ++it2) {
- HandleFalsePositive(*it2);
- }
- }
- }
- }
-
- bool InfoLess(const Info& a, const Info& b) {
- if (eq(a.variance, 0.) && eq(b.variance, 0.)) {
- return ls(a.d, b.d);
- }
- return ls(a.d + a.variance, b.d - b.variance);
- }
-
- bool IsPerfectMatch(const Info& etalon, const Info& estimated) {
- return le(etalon.d, estimated.d) && ge(etalon.d, estimated.d)
- && eq(estimated.variance, 0.);
- }
-
- bool IsImperfectMatch(const Info& etalon, const Info& estimated) {
- return ge(etalon.d, estimated.d - estimated.variance)
- && le(etalon.d, estimated.d + estimated.variance);
- }
-
- size_t Move(size_t estimated_idx, const Infos &estimated_infos) {
- estimated_idx++;
- while (estimated_idx < estimated_infos.size()
- && math::eq(estimated_infos[estimated_idx].weight, 0.))
- estimated_idx++;
- return estimated_idx;
- }
-
- size_t InitIdx(const Infos &pair_infos) {
- return Move(-1, pair_infos);
- }
-
- void ProcessInfos(const Infos& etalon_infos, const Infos& estimated_infos) {
- size_t etalon_idx = InitIdx(etalon_infos);
- for (size_t estimated_idx = InitIdx(estimated_infos);
- estimated_idx < estimated_infos.size();
- estimated_idx = Move(estimated_idx, estimated_infos)) {
- while (estimated_idx < estimated_infos.size()
- && (etalon_idx == etalon_infos.size()
- || InfoLess(estimated_infos[estimated_idx],
- etalon_infos[etalon_idx]))) {
- HandleFalsePositive(estimated_infos[estimated_idx]);
- estimated_idx = Move(estimated_idx, estimated_infos);
- }
- if (estimated_idx == estimated_infos.size()) {
- break;
- }
- while (etalon_idx < etalon_infos.size()
- && InfoLess(etalon_infos[etalon_idx],
- estimated_infos[estimated_idx])) {
- HandleFalseNegative(etalon_infos[etalon_idx]);
- etalon_idx = Move(etalon_idx, etalon_infos);
- }
- if (etalon_idx == etalon_infos.size()) {
- continue;
- }
- if (IsPerfectMatch(etalon_infos[etalon_idx],
- estimated_infos[estimated_idx])) {
- while (etalon_idx < etalon_infos.size()
- && IsPerfectMatch(etalon_infos[etalon_idx],
- estimated_infos[estimated_idx])) {
- HandlePerfectMatch(etalon_infos[etalon_idx],
- estimated_infos[estimated_idx]);
- etalon_idx = Move(etalon_idx, etalon_infos);
- }
- } else {
- vector<PairInfo<EdgeId> > cluster_hits;
- while (etalon_idx < etalon_infos.size()
- && IsImperfectMatch(etalon_infos[etalon_idx],
- estimated_infos[estimated_idx])) {
- cluster_hits.push_back(etalon_infos[etalon_idx]);
- etalon_idx = Move(etalon_idx, etalon_infos);
- }
- if (cluster_hits.size() == 0) {
- HandleFalsePositive(estimated_infos[estimated_idx]);
- } else {
- HandleImperfectMatch(estimated_infos[estimated_idx],
- cluster_hits);
- }
- }
- }
- // for (size_t etalon_idx = 0; etalon_idx < etalon_infos.size(); ++etalon_idx) {
- // Info etalon_info = etalon_infos[etalon_idx];
- //// cout << "here" << endl;
- // while (estimated_idx < estimated_infos.size() && InfoLess(estimated_infos[estimated_idx], etalon_info)) {
- // HandleFalsePositive(estimated_infos[estimated_idx]);
- // estimated_idx++;
- //// cout << "here1" << endl;
- // }
- //// cout << "here2" << endl;
- // if (estimated_idx != estimated_infos.size()
- // && (HandleIfPerfectMatch(etalon_info, estimated_infos[estimated_idx])
- // || HandleIfImperfectMatch(etalon_info, estimated_infos[estimated_idx]))) {
- // last_matched = true;
- // } else {
- // HandleFalseNegative(etalon_info);
- // }
- // }
- // if (last_matched)
- // estimated_idx++;
- while (etalon_idx < etalon_infos.size()) {
-            // DEBUG("Handling false negatives for etalon entries beyond all estimated infos");
- HandleFalseNegative(etalon_infos[etalon_idx]);
- etalon_idx = Move(etalon_idx, etalon_infos);
- }
- // Flush();
- }
-
-// void ReportFalsePositiveWeights() {
-// sort(false_positive_weights_.begin(), false_positive_weights_.end());
-//
-// INFO("False positive count: " << false_positive_weights_.size());
-// }
-//
-// void ReportPerfectMatchWeights() {
-// sort(perfect_match_weights_.begin(), perfect_match_weights_.end());
-// INFO("Perfect match count: " << perfect_match_weights_.size());
-// }
-//
-// void ReportImperfectMatchWeights() {
-// sort(imperfect_match_stat_.begin(), imperfect_match_stat_.end());
-// //todo do something better
-// INFO("Imperfect match count: " << imperfect_match_stat_.size());
-// }
-//
-// void FalseNegativeCount() {
-// INFO("False negative count: " << false_negative_count_);
-// }
-
-public:
- EstimationQualityStat(const Graph &graph,
- const EdgeQuality<Graph, Index>& quality,
- const PairedInfoIndex<Graph>& pair_info,
- const PairedInfoIndex<Graph>& estimated_pair_info,
- const PairedInfoIndex<Graph>& etalon_pair_info) :
-            graph_(graph), quality_(quality), pair_info_(pair_info),
-            estimated_pair_info_(estimated_pair_info), etalon_pair_info_(etalon_pair_info),
-            false_positives_(graph_), perfect_matches_(graph_),
-            imperfect_matches_(graph_), false_negatives_(graph_) {
- }
-
- virtual ~EstimationQualityStat() {
- }
-
- virtual void Count() {
- INFO("Counting distance estimation statistics");
- set<pair<EdgeId, EdgeId>> pairs_in_etalon;
- // DEBUG("Handling pairs present in etalon information");
- for (auto it = etalon_pair_info_.begin(); it != etalon_pair_info_.end(); ++it) {
- Infos etalon_infos = *it;
- EdgeId first = etalon_infos[0].first;
- EdgeId second = etalon_infos[0].second;
- pairs_in_etalon.insert(make_pair(first, second));
-
- Infos estimated_infos = estimated_pair_info_.GetEdgePairInfo(first, second);
- // DEBUG("Processing distances for pair " << first << ", " << second);
- ProcessInfos(etalon_infos, estimated_infos);
- }
- // DEBUG("Handling pairs that are not in etalon information");
- HandlePairsNotInEtalon(pairs_in_etalon);
-
- INFO("FPR: " << fpr());
- INFO("FNR: " << fnr());
- INFO("Distance estimation statistics counted");
- }
-
- const PairedInfoIndexT<Graph>& false_positives() {
- return false_positives_;
- }
-
- const PairedInfoIndexT<Graph>& perfect_matches() {
- return perfect_matches_;
- }
-
- const PairedInfoIndexT<Graph>& imperfect_matches() {
- return imperfect_matches_;
- }
-
- const PairedInfoIndexT<Graph>& false_negatives() {
- return false_negatives_;
- }
-
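-    // Rates are reported relative to the corresponding index sizes:
-    // FPR = |false positives| / |estimated pairs|, FNR = |false negatives| / |etalon pairs|.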
- double fpr() {
- return 1. * false_positives_.size() / estimated_pair_info_.size();
- }
-
- double fnr() {
- return 1. * false_negatives_.size() / etalon_pair_info_.size();
- }
-
- void SaveStats(const string& dir_name) {
- //saving results
-        INFO("Saving estimation statistics");
- make_dir(dir_name);
- graphio::ConjugateDataPrinter<Graph> printer(graph_);
- printer.savePaired(dir_name + "fp", false_positives_);
- printer.savePaired(dir_name + "pm", perfect_matches_);
- printer.savePaired(dir_name + "im", imperfect_matches_);
- printer.savePaired(dir_name + "fn", false_negatives_);
- INFO("Estimation statistics saved");
- }
-
-// vector<double> false_positive_weights() {
-// sort(false_positive_weights_.begin(), false_positive_weights_.end());
-// return false_positive_weights_;
-// }
-// vector<double> perfect_match_weights() {
-// sort(perfect_match_weights_.begin(), perfect_match_weights_.end());
-// return perfect_match_weights_;
-// }
-//
-// vector<pair<pair<double, double> , size_t>> imperfect_match_weights() {
-// sort(imperfect_match_stat_.begin(), imperfect_match_stat_.end());
-// return imperfect_match_stat_;
-// }
-//
-// size_t false_negative_count() {
-// return false_negative_count_;
-// }
-
-// void WriteFalseNegativeGaps(const string &file_name) {
-// ofstream stream;
-// stream.open(file_name);
-// vector<double> to_print;
-// // for (size_t i = 0; i < false_negative_infos_.size(); i++) {
-// // if (false_negative_infos_[i].d > 0)
-// // to_print.push_back(
-// // false_negative_infos_[i].d - graph_.length(
-// // false_negative_infos_[i].first));
-// // }
-// // sort(to_print.begin(), to_print.end());
-// // copy(to_print.begin(), to_print.end(),
-// // ostream_iterator<double> (stream, "\n"));
-// for (size_t i = 0; i < false_negative_infos_.size(); i++) {
-// stream << false_negative_infos_[i] << endl;
-// }
-// stream.close();
-// }
-//
-// void WriteEstmationStats(const string &output_folder) {
-// ofstream stream;
-// stream.open(output_folder + "/perfect.inf");
-// copy(perfect_match_weights_.begin(), perfect_match_weights_.end(),
-// ostream_iterator<double>(stream, "\n"));
-// stream.close();
-//
-// stream.open(output_folder + "/false_positive.inf");
-// copy(false_positive_weights_.begin(), false_positive_weights_.end(),
-// ostream_iterator<double>(stream, "\n"));
-// stream.close();
-// WriteWorstEdgesStat(output_folder, 1000000);
-// }
-
-// void WriteEdgePairInfo(const string &file_name, Infos infos) {
-// ofstream stream;
-// stream.open(file_name);
-// for (size_t i = 0; i < infos.size(); i++) {
-// stream << infos[i] << endl;
-// }
-// stream.close();
-// }
-//
-// string ConstructEdgePairFileName(const string output_folder,
-// const string &name, const string &modifier, size_t index) {
-// stringstream ss;
-// ss.clear();
-// ss << output_folder << "/" << name << "_" << index << "_" << modifier
-// << ".inf";
-// return ss.str();
-// }
-
-// void WriteWorstEdgesStat(const string &output_folder, double bound) {
-// size_t count = 0;
-// WriteFalseNegativeGaps(output_folder + "/gaps.inf");
-// for (auto iterator = false_positive_infos_.begin();
-// iterator != false_positive_infos_.end(); ++iterator) {
-// if (iterator->weight > bound) {
-// WriteEdgePairInfo(
-// ConstructEdgePairFileName(output_folder, "fp",
-// "histogram", count),
-// pair_info_.GetEdgePairInfo(iterator->first,
-// iterator->second));
-// WriteEdgePairInfo(
-// ConstructEdgePairFileName(output_folder, "fp",
-// "estimated", count),
-// estimated_pair_info_.GetEdgePairInfo(iterator->first,
-// iterator->second));
-// WriteEdgePairInfo(
-// ConstructEdgePairFileName(output_folder, "fp", "etalon",
-// count),
-// etalon_pair_info_.GetEdgePairInfo(iterator->first,
-// iterator->second));
-// count++;
-// }
-// }
-// for (auto iterator = false_negative_infos_.begin();
-// iterator != false_negative_infos_.end(); ++iterator) {
-// if (iterator->weight > bound) {
-// WriteEdgePairInfo(
-// ConstructEdgePairFileName(output_folder, "fp",
-// "histogram", count),
-// pair_info_.GetEdgePairInfo(iterator->first,
-// iterator->second));
-// WriteEdgePairInfo(
-// ConstructEdgePairFileName(output_folder, "fp",
-// "estimated", count),
-// estimated_pair_info_.GetEdgePairInfo(iterator->first,
-// iterator->second));
-// WriteEdgePairInfo(
-// ConstructEdgePairFileName(output_folder, "fp", "etalon",
-// count),
-// etalon_pair_info_.GetEdgePairInfo(iterator->first,
-// iterator->second));
-// count++;
-// }
-// }
-// }
-
-};
-
-template<class Graph>
-class ClusterStat: public AbstractStatCounter {
-
- typedef typename Graph::EdgeId EdgeId;
- typedef pair<double, double> DoublePair;
-
- public:
- ClusterStat(const PairedInfoIndexT<Graph>& estimated_pair_info) :
- estimated_pair_info_(estimated_pair_info)
- {
- }
-
- virtual ~ClusterStat()
- {
- }
-
- virtual void Count() {
- for (auto it = estimated_pair_info_.begin(); it != estimated_pair_info_.end(); ++it) {
- de::Histogram infos = *it;
- for (auto it2 = infos.begin(); it2 != infos.end(); ++it2) {
- Point point = *it2;
- if (gr(point.var, 0.))
- weight_variance_stat_.push_back(make_pair(point.weight, point.var));
- }
- //todo talk with Anton!!!
- // for (auto it2 = (*it).begin(); it2 != (*it).end(); ++it2) {
- // Info info = *it2;
- //// if (gr(info.variance, 0)) {
- // weight_variance_stat_.push_back(make_pair(info.weight, info.variance));
- //// }
- // }
- }
- stringstream ss;
- copy(weight_variance_stat_.begin(), weight_variance_stat_.end(),
- ostream_iterator<DoublePair>(ss, ", "));
- INFO("Estimated cluster stat: " << ss.str());
- }
-
- vector<DoublePair> weight_variance_stat() {
- sort(weight_variance_stat_.begin(), weight_variance_stat_.end());
- return weight_variance_stat_;
- }
-
-private:
- const PairedInfoIndexT<Graph>& estimated_pair_info_;
- vector<DoublePair> weight_variance_stat_;
-
- DECL_LOGGER("EstimatedClusterStat");
-};
-
-template<class Graph, class Index>
-class CoverageStatistics{
-
-private:
- Graph& graph_;
- EdgeQuality<Graph, Index> & edge_qual_;
-
- bool DoesSuit(VertexId vertex){
- bool ans = true;
- for (size_t i = 0; ans && i<graph_.OutgoingEdgeCount(vertex); i++)
- ans = ans & math::gr(edge_qual_.quality(graph_.OutgoingEdges(vertex)[i]), 0.);
- for (size_t i = 0; ans && i<graph_.IncomingEdgeCount(vertex); i++)
- ans = ans & math::gr(edge_qual_.quality(graph_.IncomingEdges(vertex)[i]), 0.);
- return ans;
- }
-
-public:
- CoverageStatistics(Graph& graph, EdgeQuality<Graph, Index>& edge_qual):
- graph_(graph), edge_qual_(edge_qual){
- }
-
- virtual ~CoverageStatistics(){}
-
- virtual void Count(){
-
- map<double, size_t> cov_map;
- map<double, size_t> ratio_map;
- map<double, size_t> len_map;
- size_t area = 0;
- size_t area15 = 0;
- size_t area10 = 0;
- size_t area5 = 0;
- size_t area2 = 0;
- for (auto iter = graph_.ConstEdgeBegin(); !iter.IsEnd(); ++iter){
- len_map[graph_.length(*iter)]++;
- }
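-        // NB: the "true ||" below disables the DoesSuit() quality filter, so every vertex with
-        // both incoming and outgoing edges is considered regardless of edge quality.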
- for (auto iter = graph_.begin(); iter != graph_.end(); ++iter)
- if (true || DoesSuit(*iter) ){
-
- double plus_cov = 0.;
- double min_cov = 0.;
- double plus_all_cov = 0.;
- double min_all_cov = 0.;
- bool suit_us = true;
-
- if (graph_.IncomingEdgeCount(*iter)*graph_.OutgoingEdgeCount(*iter) == 0) continue;
-
- for (size_t i = 0; suit_us && i<graph_.IncomingEdgeCount(*iter); i++)
- if (graph_.length(graph_.IncomingEdges(*iter)[i]) < 80){
- if (math::ge(edge_qual_.quality(graph_.IncomingEdges(*iter)[i]), 1.))
- plus_cov += graph_.coverage(graph_.IncomingEdges(*iter)[i]);
- plus_all_cov += graph_.coverage(graph_.IncomingEdges(*iter)[i]);
- }else suit_us = false;
- for (size_t i = 0; suit_us && i<graph_.OutgoingEdgeCount(*iter); i++)
- if (graph_.length(graph_.OutgoingEdges(*iter)[i]) < 80){
- if (math::ge(edge_qual_.quality(graph_.OutgoingEdges(*iter)[i]), 1.))
- min_cov += graph_.coverage(graph_.OutgoingEdges(*iter)[i]);
- min_all_cov += graph_.coverage(graph_.OutgoingEdges(*iter)[i]);
- }else suit_us = false;
-
- if (!suit_us) continue;
-
- if (math::eq(min_cov, 0.) || math::eq(plus_cov, 0.)) continue;
-
- double delta_cov = math::round(1000.*(plus_cov - min_cov)/(plus_cov + min_cov));
-
- double ratio_cov = math::round(1000.*(plus_cov + min_cov)/(plus_all_cov + min_all_cov));
-
- if (math::ls(abs(delta_cov), 150.)) area15++;
- if (math::ls(abs(delta_cov), 100.)) area10++;
- if (math::ls(abs(delta_cov), 50.)) area5++;
- if (math::ls(abs(delta_cov), 20.)) area2++;
- area++;
-
- cov_map[delta_cov/10.]++;
- ratio_map[ratio_cov/10.]++;
-
- }
-
- for (auto iter = ratio_map.begin(); iter != ratio_map.end(); ++iter){
- INFO("Ratio " << (*iter).first << " " << (*iter).second);
- }
-
- for (auto iter = cov_map.begin(); iter != cov_map.end(); ++iter){
- INFO("Cov " << (*iter).first << " " << (*iter).second);
- }
-
- INFO("stats_cov " << area << " " << area2 << " " << area5 << " " << area10 << " " << area15);
-
- for (auto iter = len_map.begin(); iter != len_map.end(); ++iter){
- INFO("Len " << (*iter).first << " " << (*iter).second);
- }
-
- }
-
-};
diff --git a/src/debruijn/path_extend/bidirectional_path.cpp b/src/debruijn/path_extend/bidirectional_path.cpp
deleted file mode 100644
index fd2bb56..0000000
--- a/src/debruijn/path_extend/bidirectional_path.cpp
+++ /dev/null
@@ -1,21 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-/*
- * bidirectional_path.cpp
- *
- * Created on: Jun 25, 2015
- * Author: andrey
- */
-
-#include "standard.hpp"
-#include "bidirectional_path.hpp"
-
-namespace path_extend {
-
-std::atomic<uint64_t> BidirectionalPath::path_id_{0};
-
-}
diff --git a/src/debruijn/path_extend/bidirectional_path.hpp b/src/debruijn/path_extend/bidirectional_path.hpp
deleted file mode 100644
index 8ea4edb..0000000
--- a/src/debruijn/path_extend/bidirectional_path.hpp
+++ /dev/null
@@ -1,1065 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-/*
- * bidirectional_path.h
- *
- * Created on: Nov 14, 2011
- * Author: andrey
- */
-
-#ifndef BIDIRECTIONAL_PATH_H_
-#define BIDIRECTIONAL_PATH_H_
-
-#include <algorithm>
-
-#include "../debruijn_graph.hpp"
-
-using debruijn_graph::Graph;
-using debruijn_graph::EdgeId;
-using debruijn_graph::VertexId;
-
-namespace path_extend {
-
-class BidirectionalPath;
-
-struct Gap {
- int gap_;
- uint32_t trash_previous_;
- uint32_t trash_current_;
- Gap(int gap)
- : gap_(gap), trash_previous_(0), trash_current_(0)
- { }
-
- Gap(int gap, uint32_t trash_previous, uint32_t trash_current)
- : gap_(gap), trash_previous_(trash_previous), trash_current_(trash_current)
- { }
-};
-
-
-class PathListener {
-public:
- virtual void FrontEdgeAdded(EdgeId e, BidirectionalPath * path, Gap gap) = 0;
- virtual void BackEdgeAdded(EdgeId e, BidirectionalPath * path, Gap gap) = 0;
- virtual void FrontEdgeRemoved(EdgeId e, BidirectionalPath * path) = 0;
- virtual void BackEdgeRemoved(EdgeId e, BidirectionalPath * path) = 0;
- virtual ~PathListener() {
- }
-};
-
-
-class BidirectionalPath : public PathListener {
-private:
- static std::atomic<uint64_t> path_id_;
-
-
-public:
- BidirectionalPath(const Graph& g)
- : g_(g),
- data_(),
- conj_path_(NULL),
- cumulative_len_(),
- gap_len_(),
- listeners_(),
- id_(path_id_++),
- weight_(1.0),
- has_overlaped_begin_(false),
- has_overlaped_end_(false),
- overlap_(false) {
- }
-
- BidirectionalPath(const Graph& g, const std::vector<EdgeId>& path)
- : BidirectionalPath(g) {
- for (size_t i = 0; i < path.size(); ++i) {
- PushBack(path[i]);
- }
- RecountLengths();
- }
-
- BidirectionalPath(const Graph& g, EdgeId startingEdge)
- : BidirectionalPath(g) {
- PushBack(startingEdge);
- }
-
- BidirectionalPath(const BidirectionalPath& path)
- : g_(path.g_),
- data_(path.data_),
- conj_path_(NULL),
- cumulative_len_(path.cumulative_len_),
- gap_len_(path.gap_len_),
- listeners_(),
- id_(path_id_++),
- weight_(path.weight_),
- has_overlaped_begin_(path.has_overlaped_begin_),
- has_overlaped_end_(path.has_overlaped_end_),
- overlap_(path.overlap_) {
- }
-
-public:
- void Subscribe(PathListener * listener) {
- listeners_.push_back(listener);
- }
-
-    void Unsubscribe(PathListener * listener) {
-        // remove the listener (erase-remove idiom)
-        listeners_.erase(std::remove(listeners_.begin(), listeners_.end(), listener), listeners_.end());
-    }
-
- void SetConjPath(BidirectionalPath* path) {
- conj_path_ = path;
- }
-
- const BidirectionalPath* GetConjPath() const {
- return conj_path_;
- }
-
- BidirectionalPath* GetConjPath() {
- return conj_path_;
- }
-
- void SetWeight(float w) {
- weight_ = w;
- }
-
- double GetWeight() const {
- return weight_;
- }
-
- size_t Size() const {
- return data_.size();
- }
-
- const Graph& graph() const {
- return g_;
- }
-
- bool Empty() const {
- return data_.empty();
- }
-
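-    // Total path length: cumulative length measured from the first edge plus the leading gap.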
- size_t Length() const {
- if (gap_len_.size() == 0 || cumulative_len_.size() == 0) {
- return 0;
- }
- return cumulative_len_[0] + gap_len_[0].gap_;
- }
-
- //TODO iterators forward/reverse
- EdgeId operator[](size_t index) const {
- return data_[index];
- }
-
- EdgeId At(size_t index) const {
- return data_[index];
- }
-
- EdgeId ReverseAt(size_t index) const {
- return data_[data_.size() - index - 1];
- }
-
-
- // Length from beginning of i-th edge to path end for forward directed path: L(e1 + e2 + ... + eN)
- size_t LengthAt(size_t index) const {
- return cumulative_len_[index];
- }
-
- int GapAt(size_t index) const {
- return gap_len_[index].gap_;
- }
-
- uint32_t TrashCurrentAt(size_t index) const {
- return gap_len_[index].trash_current_;
- }
-
- uint32_t TrashPreviousAt(size_t index) const {
- return gap_len_[index].trash_previous_;
- }
-
- size_t GetId() const {
- return id_;
- }
-
- EdgeId Back() const {
- return data_.back();
- }
-
- EdgeId Front() const {
- return data_.front();
- }
-
- void PushBack(EdgeId e, int gap = 0, uint32_t trash_previous = 0, uint32_t trash_current = 0) {
- data_.push_back(e);
- Gap gap_struct(gap, trash_previous, trash_current);
- gap_len_.push_back(gap_struct);
- IncreaseLengths(g_.length(e), gap_struct);
- NotifyBackEdgeAdded(e, gap_struct);
- }
-
- void PushBack(EdgeId e, Gap gap) {
- data_.push_back(e);
- gap_len_.push_back(gap);
- IncreaseLengths(g_.length(e), gap);
- NotifyBackEdgeAdded(e, gap);
- }
-
- void PushBack(const BidirectionalPath& path) {
- for (size_t i = 0; i < path.Size(); ++i) {
- PushBack(path.At(i), path.GapAt(i), path.TrashPreviousAt(i), path.TrashCurrentAt(i));
- }
- }
-
- void PopBack() {
- if (data_.empty()) {
- return;
- }
- EdgeId e = data_.back();
- DecreaseLengths();
- gap_len_.pop_back();
- data_.pop_back();
- NotifyBackEdgeRemoved(e);
- }
-
- void PopBack(size_t count) {
- for (size_t i = 0; i < count; ++i) {
- PopBack();
- }
- }
-
- void Clear() {
- while (!Empty()) {
- PopBack();
- }
- }
-
- virtual void FrontEdgeAdded(EdgeId, BidirectionalPath*, int) {
- }
-
- virtual void FrontEdgeAdded(EdgeId, BidirectionalPath*, Gap) {
- }
-
-
- virtual void BackEdgeAdded(EdgeId e, BidirectionalPath*, int gap) {
- PushFront(g_.conjugate(e), gap);
- }
-
- virtual void BackEdgeAdded(EdgeId e, BidirectionalPath*, Gap gap) {
- PushFront(g_.conjugate(e), gap);
- }
-
- virtual void FrontEdgeRemoved(EdgeId, BidirectionalPath*) {
- }
-
- virtual void BackEdgeRemoved(EdgeId, BidirectionalPath *) {
- PopFront();
- }
-
- int FindFirst(EdgeId e) const {
- for (size_t i = 0; i < Size(); ++i) {
- if (data_[i] == e) {
- return (int) i;
- }
- }
- return -1;
- }
-
- int FindLast(EdgeId e) const {
- for (int i = (int) Size() - 1; i >= 0; --i) {
- if (data_[i] == e) {
- return i;
- }
- }
- return -1;
- }
-
- bool Contains(EdgeId e) const {
- return FindFirst(e) != -1;
- }
-
- bool Contains(VertexId v) const {
- for(auto edge : data_) {
- if(g_.EdgeEnd(edge) == v || g_.EdgeStart(edge) == v ) {
- return true;
- }
- }
- return false;
- }
-
- vector<size_t> FindAll(EdgeId e, size_t start = 0) const {
- vector<size_t> result;
- for (size_t i = start; i < Size(); ++i) {
- if (data_[i] == e) {
- result.push_back(i);
- }
- }
- return result;
- }
-
- bool CompareFrom(size_t from, const BidirectionalPath& sample) const {
- if (from + sample.Size() > Size()) {
- return false;
- }
-
- for (size_t i = 0; i < sample.Size(); ++i) {
- if (At(from + i) != sample[i]) {
- return false;
- }
- }
- return true;
- }
-
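-    // Number of edges in a prefix of p that coincides with a suffix of this path (0 if none).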
- size_t CommonEndSize(const BidirectionalPath& p) const {
- if (p.Size() == 0) {
- return 0;
- }
- std::vector<size_t> begins = FindAll(p.At(0));
- for (size_t i = 0; i < begins.size(); ++i) {
- size_t it1 = begins[i];
- size_t it2 = 0;
- while (it2 < p.Size() and At(it1) == p.At(it2)) {
- it1++;
- it2++;
- if (it1 == Size()) {
- return it2;
- }
- }
- }
- return 0;
- }
-
- size_t OverlapEndSize(const BidirectionalPath* path2) const {
- if (Size() == 0) {
- return 0;
- }
- int last1 = (int) Size() - 1;
- int max_over = 0;
- vector<size_t> begins2 = path2->FindAll(At(last1));
- for (size_t i = 0; i < begins2.size(); ++i) {
- int begin2 = (int) begins2[i];
- int cur1 = last1;
- while (begin2 > 0 && cur1 > 0 && path2->At(begin2 - 1) == At(cur1 - 1)) {
- cur1--;
- begin2--;
- }
- int over = last1 - cur1 + 1;
- if (begin2 == 0 && cur1 > 0 && over > max_over) {
- max_over = over;
- }
- }
- return (size_t) max_over;
- }
-
- int FindFirst(const BidirectionalPath& path, size_t from = 0) const {
- if (path.Size() > Size()) {
- return -1;
- }
- for (size_t i = from; i <= Size() - path.Size(); ++i) {
- if (CompareFrom(i, path)) {
- return (int) i;
- }
- }
- return -1;
- }
-//TODO: Why just naive search?
- int FindLast(const BidirectionalPath& path) const {
- if (path.Size() > Size()) {
- return -1;
- }
- for (int i = (int) (Size() - path.Size()); i >= 0; --i) {
- if (CompareFrom((size_t) i, path)) {
- return i;
- }
- }
- return -1;
- }
-
- bool Contains(const BidirectionalPath& path) const {
- return FindFirst(path) != -1;
- }
-
- bool Equal(const BidirectionalPath& path) const {
- return operator==(path);
- }
-
- bool operator==(const BidirectionalPath& path) const {
- return Size() == path.Size() && CompareFrom(0, path);
- }
-
- bool operator!=(const BidirectionalPath& path) const {
- return !operator==(path);
- }
-
- void CheckConjugateEnd(size_t max_repeat_length) {
- size_t prev_size = 0;
- while (prev_size != Size()) {
- prev_size = Size();
- FindConjEdges(max_repeat_length);
- }
- }
-
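-    // Detects a palindromic fragment (a run of edges immediately followed by its own reverse
-    // complement) and, roughly, removes the shorter flank together with half of the palindrome,
-    // either from this path or from its conjugate.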
- void FindConjEdges(size_t max_repeat_length) {
- for (size_t begin_pos = 0; begin_pos < Size(); ++begin_pos) {
- size_t begin = begin_pos;
- vector<size_t> conj_pos = FindAll(g_.conjugate(At(begin_pos)), begin + 1);
- for (auto end_pos = conj_pos.rbegin(); end_pos != conj_pos.rend(); ++end_pos) {
- VERIFY(*end_pos < Size());
- size_t end = *end_pos;
- if (end <= begin) {
- continue;
- }
- while (begin < end && At(begin) == g_.conjugate(At(end))) {
- begin++;
- end--;
- }
- DEBUG("Found palindromic fragment from " << begin_pos << " to " << *end_pos);
- Print();
- VERIFY(*end_pos < Size());
- size_t tail_size = Size() - *end_pos - 1;
- size_t head_size = begin_pos;
- size_t palindrom_half_size = begin - begin_pos;
- size_t head_len = Length() - LengthAt(begin_pos);
- size_t tail_len = *end_pos < Size() - 1 ? LengthAt(*end_pos + 1) : 0;
-//TODO : this is not true in case of gaps inside the palindrom_len;
- size_t palindrom_len = (size_t) max((int) LengthAt(begin_pos) - (int) LengthAt(begin), 0);
- size_t between = (size_t) max(0, (int) LengthAt(begin) - (int) (end < Size() - 1 ? LengthAt(end + 1) : 0));
- DEBUG("tail len " << tail_len << " head len " << head_len << " palindrom_len "<< palindrom_len << " between " << between);
- if (palindrom_len <= max_repeat_length) {
- if (palindrom_len < head_len && palindrom_len < tail_len) {
- DEBUG("too big head and end");
- continue;
- }
- if (between > palindrom_len) {
- DEBUG("too big part between");
- continue;
- }
- }
- bool delete_tail = tail_size < head_size;
- if (tail_size == head_size) {
- delete_tail = tail_len < head_len;
- }
- if (delete_tail) {
- PopBack(tail_size + palindrom_half_size);
-                    DEBUG("Deleting tail because of palindrome removal");
- return;
- } else {
- GetConjPath()->PopBack(head_size + palindrom_half_size);
-                    DEBUG("Deleting head because of palindrome removal");
- return;
- }
- }
- }
- }
-
- BidirectionalPath SubPath(size_t from, size_t to) const {
- BidirectionalPath result(g_);
- for (size_t i = from; i < min(to, Size()); ++i) {
- result.PushBack(data_[i], gap_len_[i]);
- }
- return result;
- }
-
- BidirectionalPath SubPath(size_t from) const {
- return SubPath(from, Size());
- }
-
- double Coverage() const {
- double cov = 0.0;
-
- for (size_t i = 0; i < Size(); ++i) {
- cov += g_.coverage(data_[i]) * (double) g_.length(data_[i]);
- }
- return cov / (double) Length();
- }
-
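-    // Builds the reverse-complement path: edges are conjugated and appended in reverse order,
-    // and the gap originally stored before edge i+1 is re-attached in front of the conjugate of
-    // edge i, with its trash counts swapped and the gap length adjusted accordingly.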
- BidirectionalPath Conjugate() const {
- BidirectionalPath result(g_);
- if (Empty()) {
- return result;
- }
- result.PushBack(g_.conjugate(Back()), 0);
- for (int i = ((int) Size()) - 2; i >= 0; --i) {
- result.PushBack(g_.conjugate(data_[i]), gap_len_[i + 1].gap_ + gap_len_[i + 1].trash_current_ - gap_len_[i + 1].trash_previous_, gap_len_[i + 1].trash_current_, gap_len_[i + 1].trash_previous_);
- }
-
- return result;
- }
-
- vector<EdgeId> ToVector() const {
- return vector<EdgeId>(data_.begin(), data_.end());
- }
-
- bool CameToInterstrandBulge() const {
- if (Empty())
- return false;
-
- EdgeId lastEdge = Back();
- VertexId lastVertex = g_.EdgeEnd(lastEdge);
-
- if (g_.OutgoingEdgeCount(lastVertex) == 2) {
- vector<EdgeId> bulgeEdges(g_.out_begin(lastVertex), g_.out_end(lastVertex));
- VertexId nextVertex = g_.EdgeEnd(bulgeEdges[0]);
-
- if (bulgeEdges[0] == g_.conjugate(bulgeEdges[1]) && nextVertex == g_.EdgeEnd(bulgeEdges[1]) && g_.CheckUniqueOutgoingEdge(nextVertex)
- && *(g_.out_begin(nextVertex)) == g_.conjugate(lastEdge)) {
-
- DEBUG("Came to interstrand bulge " << g_.int_id(lastEdge));
- return true;
- }
- }
- return false;
- }
-
- bool IsInterstrandBulge() const {
- if (Empty())
- return false;
-
- EdgeId lastEdge = Back();
- VertexId lastVertex = g_.EdgeEnd(lastEdge);
- VertexId prevVertex = g_.EdgeStart(lastEdge);
-
- if (g_.OutgoingEdgeCount(prevVertex) == 2 && g_.IncomingEdgeCount(lastVertex) == 2 && g_.CheckUniqueOutgoingEdge(lastVertex)
- && g_.CheckUniqueIncomingEdge(prevVertex) && *(g_.in_begin(prevVertex)) == g_.conjugate(*(g_.out_begin(lastVertex)))) {
-
- vector<EdgeId> bulgeEdges(g_.out_begin(prevVertex), g_.out_end(prevVertex));
- EdgeId bulgeEdge = bulgeEdges[0] == lastEdge ? bulgeEdges[1] : bulgeEdges[0];
-
- if (bulgeEdge == g_.conjugate(lastEdge)) {
- DEBUG("In interstrand bulge " << g_.int_id(lastEdge));
- return true;
- }
- }
- return false;
- }
-
- void Print() const {
- DEBUG("Path " << id_);
- DEBUG("Length " << Length());
- DEBUG("Weight " << weight_);
- DEBUG("#, edge, length, gap length, trash length, total length, total length from begin");
- for (size_t i = 0; i < Size(); ++i) {
- DEBUG(i << ", " << g_.int_id(At(i)) << ", "
- << g_.length(At(i)) << ", " << GapAt(i) << ", "
- << TrashPreviousAt(i) << "-" << TrashCurrentAt(i)
- << ", " << LengthAt(i) << ", "
- << ((Length() < LengthAt(i)) ? 0 : Length() - LengthAt(i)));
- }
- }
-
- void PrintInString() const {
- stringstream str;
- for (size_t i = 0; i < Size(); ++i) {
- str << g_.int_id(At(i)) << " ";
- }
- DEBUG(str.str());
- }
- void PrintInfo() const {
- INFO("Path " << id_);
- INFO("Length " << Length());
- INFO("Weight " << weight_);
- INFO("#, edge, length, gap length, total length");
- for (size_t i = 0; i < Size(); ++i) {
- INFO(i << ", " << g_.int_id(At(i)) << ", " << g_.length(At(i)) << ", " << GapAt(i) << ", " << LengthAt(i));
- }
- }
-
- void Print(std::ostream& os) {
- if (Empty()) {
- return;
- }
- os << "Path " << GetId() << endl;
- os << "Length " << Length() << endl;
- os << "#, edge, length, gap, total length" << endl;
- for (size_t i = 0; i < Size(); ++i) {
- os << i << ", " << g_.int_id(At(i)) << ", " << g_.length(At(i)) << ", " << GapAt(i) << ", " << LengthAt(i) << endl;
- }
- }
-
- void SetOverlapedBeginTo(BidirectionalPath* to) {
- if (has_overlaped_begin_) {
- to->SetOverlapBegin();
- }
- SetOverlapBegin();
- to->SetOverlapEnd();
- }
-
- void SetOverlapedEndTo(BidirectionalPath* to) {
- if (has_overlaped_end_) {
- to->SetOverlapEnd();
- }
- SetOverlapEnd();
- to->SetOverlapBegin();
- }
-
- void SetOverlap(bool overlap = true) {
- overlap_ = overlap;
- conj_path_->overlap_ = overlap;
- }
-
- bool HasOverlapedBegin() const {
- return has_overlaped_begin_;
- }
-
- bool HasOverlapedEnd() const {
- return has_overlaped_end_;
- }
-
- bool IsOverlap() const {
- return overlap_;
- }
-
- void ResetOverlaps() {
- overlap_ = false;
- has_overlaped_begin_ = false;
- has_overlaped_end_ = false;
- conj_path_->overlap_ = false;
- conj_path_->has_overlaped_begin_ = false;
- conj_path_->has_overlaped_end_ = false;
- }
-private:
-
- void RecountLengths() {
- cumulative_len_.clear();
- size_t currentLength = 0;
- for (auto iter = data_.rbegin(); iter != data_.rend(); ++iter) {
- currentLength += g_.length((EdgeId) *iter);
- cumulative_len_.push_front(currentLength);
- }
- }
-
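-    // cumulative_len_[i] holds the length from the start of the i-th edge to the path end, so
-    // appending an edge extends every existing entry by the edge length plus its gap (minus the
-    // overlap trimmed from the previous edge) and adds a fresh entry for the new edge itself.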
- void IncreaseLengths(size_t length, Gap gap_struct) {
- for (auto iter = cumulative_len_.begin(); iter != cumulative_len_.end(); ++iter) {
- *iter += length + gap_struct.gap_ - gap_struct.trash_previous_;
- }
- cumulative_len_.push_back(length);
- }
-
- void DecreaseLengths() {
- size_t length = g_.length(data_.back()) + gap_len_.back().gap_ - gap_len_.back().trash_previous_;
-
- for (auto iter = cumulative_len_.begin(); iter != cumulative_len_.end(); ++iter) {
- *iter -= length;
- }
- cumulative_len_.pop_back();
- }
-
- void NotifyFrontEdgeAdded(EdgeId e, int gap) {
- for (auto i = listeners_.begin(); i != listeners_.end(); ++i) {
- (*i)->FrontEdgeAdded(e, this, gap);
- }
- }
-
- void NotifyFrontEdgeAdded(EdgeId e, Gap gap) {
- for (auto i = listeners_.begin(); i != listeners_.end(); ++i) {
- (*i)->FrontEdgeAdded(e, this, gap);
- }
- }
-
- void NotifyBackEdgeAdded(EdgeId e, int gap) {
- for (auto i = listeners_.begin(); i != listeners_.end(); ++i) {
- (*i)->BackEdgeAdded(e, this, gap);
- }
- }
-
- void NotifyBackEdgeAdded(EdgeId e, Gap gap) {
- for (auto i = listeners_.begin(); i != listeners_.end(); ++i) {
- (*i)->BackEdgeAdded(e, this, gap);
- }
- }
-
- void NotifyFrontEdgeRemoved(EdgeId e) {
- for (auto i = listeners_.begin(); i != listeners_.end(); ++i) {
- (*i)->FrontEdgeRemoved(e, this);
- }
- }
-
- void NotifyBackEdgeRemoved(EdgeId e) {
- for (auto i = listeners_.begin(); i != listeners_.end(); ++i) {
- (*i)->BackEdgeRemoved(e, this);
- }
- }
-
- void PushFront(EdgeId e, Gap gap) {
- PushFront(e, gap.gap_ + gap.trash_current_ - gap.trash_previous_, gap.trash_current_, gap.trash_previous_);
- }
-
- void PushFront(EdgeId e, int gap = 0, uint32_t trash_previous = 0, uint32_t trash_current = 0) {
- data_.push_front(e);
- if (gap_len_.size() > 0) {
- gap_len_[0].gap_ += gap;
- gap_len_[0].trash_previous_ += trash_previous;
- gap_len_[0].trash_current_ += trash_current;
- }
- gap_len_.push_front(Gap(0, 0, 0));
-
- int length = (int) g_.length(e);
- if (cumulative_len_.empty()) {
- cumulative_len_.push_front(length);
- } else {
- cumulative_len_.push_front(length + cumulative_len_.front() + gap - trash_previous );
- }
- NotifyFrontEdgeAdded(e, gap);
- }
-
- void PopFront() {
- EdgeId e = data_.front();
- if (gap_len_.size() > 1) {
- gap_len_[1].gap_ = 0;
- gap_len_[1].trash_previous_ = 0;
- gap_len_[1].trash_current_ = 0;
- }
- data_.pop_front();
- gap_len_.pop_front();
-
- cumulative_len_.pop_front();
- NotifyFrontEdgeRemoved(e);
- }
-
- void SetOverlapBegin(bool overlap = true) {
- if (has_overlaped_begin_ != overlap) {
- has_overlaped_begin_ = overlap;
- }
- if (GetConjPath()->has_overlaped_end_ != overlap) {
- GetConjPath()->has_overlaped_end_ = overlap;
- }
- }
-
- void SetOverlapEnd(bool overlap = true) {
- GetConjPath()->SetOverlapBegin(overlap);
- }
-
- const Graph& g_;
- std::deque<EdgeId> data_;
- BidirectionalPath* conj_path_;
- std::deque<size_t> cumulative_len_; // Length from beginning of i-th edge to path end for forward directed path: L(e1 + e2 + ... + eN) ... L(eN)
- std::deque<Gap> gap_len_; // e1 - gap2 - e2 - ... - gapN - eN
- std::vector<PathListener *> listeners_;
- const uint64_t id_; //Unique ID
- float weight_;
- bool has_overlaped_begin_;
- bool has_overlaped_end_;
- bool overlap_;
- DECL_LOGGER("BidirectionalPath");
-};
-
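-// Helpers for comparing paths that may contain scaffolding gaps: when one path has a gap at the
-// current position and the other does not, the gapless path is scanned (within roughly twice the
-// gap length) for the edge that follows the gap, so that the two positions can be re-synchronised.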
-inline int SkipOneGap(EdgeId end, const BidirectionalPath& path, int gap, int pos, bool forward) {
- size_t len = 0;
- while (pos < (int) path.Size() && pos >= 0 && end != path.At(pos) && (int) len < 2 * gap) {
- len += path.graph().length(path.At(pos));
- forward ? pos++ : pos--;
- }
- if (pos < (int) path.Size() && pos >= 0 && end == path.At(pos)) {
- return pos;
- }
- return -1;
-}
-
-inline void SkipGaps(const BidirectionalPath& path1, size_t& cur_pos1, int gap1, const BidirectionalPath& path2, size_t& cur_pos2, int gap2, bool use_gaps,
- bool forward) {
- if (use_gaps) {
- if (gap1 > 0 && gap2 <= 0) {
- int temp2 = SkipOneGap(path1.At(cur_pos1), path2, gap1, (int) cur_pos2, forward);
- if (temp2 >= 0) {
- cur_pos2 = (size_t) temp2;
- }
- } else if (gap2 > 0 && gap1 <= 0) {
- int temp1 = SkipOneGap(path2.At(cur_pos2), path1, gap2, (int) cur_pos1, forward);
- if (temp1 >= 0) {
- cur_pos1 = (size_t) temp1;
- }
- } else if (gap1 > 0 && gap2 > 0 && gap1 != gap2) {
- DEBUG("not equal gaps in two paths!!!");
- }
- }
-}
-
-inline size_t FirstNotEqualPosition(const BidirectionalPath& path1, size_t pos1, const BidirectionalPath& path2, size_t pos2, bool use_gaps) {
- int cur_pos1 = (int) pos1;
- int cur_pos2 = (int) pos2;
- int gap1 = path1.GapAt(cur_pos1);
- int gap2 = path2.GapAt(cur_pos2);
- while (cur_pos1 >= 0 && cur_pos2 >= 0) {
- if (path1.At(cur_pos1) == path2.At(cur_pos2)) {
- cur_pos1--;
- cur_pos2--;
- } else {
- DEBUG("Not Equal at " << cur_pos1 << " and " << cur_pos2);
- return cur_pos1;
- }
- if (cur_pos1 >= 0 && cur_pos2 >= 0) {
- size_t p1 = (size_t) cur_pos1;
- size_t p2 = (size_t) cur_pos2;
- SkipGaps(path1, p1, gap1, path2, p2, gap2, use_gaps, false);
- cur_pos1 = (int) p1;
- cur_pos2 = (int) p2;
- gap1 = path1.GapAt(cur_pos1);
- gap2 = path2.GapAt(cur_pos2);
- }
- }
- DEBUG("Equal!!");
- return -1UL;
-}
-inline bool EqualBegins(const BidirectionalPath& path1, size_t pos1, const BidirectionalPath& path2, size_t pos2, bool use_gaps) {
- DEBUG("Checking for equal begins");
- return FirstNotEqualPosition(path1, pos1, path2, pos2, use_gaps) == -1UL;
-}
-
-inline size_t LastNotEqualPosition(const BidirectionalPath& path1, size_t pos1, const BidirectionalPath& path2, size_t pos2, bool use_gaps) {
- size_t cur_pos1 = pos1;
- size_t cur_pos2 = pos2;
- while (cur_pos1 < path1.Size() && cur_pos2 < path2.Size()) {
- if (path1.At(cur_pos1) == path2.At(cur_pos2)) {
- cur_pos1++;
- cur_pos2++;
- } else {
- return cur_pos1;
- }
- int gap1 = cur_pos1 < path1.Size() ? path1.GapAt(cur_pos1) : 0;
- int gap2 = cur_pos2 < path2.Size() ? path2.GapAt(cur_pos2) : 0;
- SkipGaps(path1, cur_pos1, gap1, path2, cur_pos2, gap2, use_gaps, true);
- }
- return -1UL;
-}
-
-inline bool EqualEnds(const BidirectionalPath& path1, size_t pos1, const BidirectionalPath& path2, size_t pos2, bool use_gaps) {
- return LastNotEqualPosition(path1, pos1, path2, pos2, use_gaps) == -1UL;
-}
-
-inline bool PathIdCompare(const BidirectionalPath* p1, const BidirectionalPath* p2) {
- return p1->GetId() < p2->GetId();
-}
-
-
-
-typedef std::pair<BidirectionalPath*, BidirectionalPath*> PathPair;
-
-inline bool compare_path_pairs(const PathPair& p1, const PathPair& p2) {
- if (p1.first->Length() != p2.first->Length() || p1.first->Size() == 0 || p2.first->Size() == 0) {
- return p1.first->Length() > p2.first->Length();
- }
- const Graph& g = p1.first->graph();
- return g.int_id(p1.first->Front()) < g.int_id(p2.first->Front());
-}
-
-class PathComparator {
-public:
- bool operator()(const BidirectionalPath& p1, const BidirectionalPath& p2) const {
- return p1.GetId() < p2.GetId();
- }
-
- bool operator()(const BidirectionalPath* p1, const BidirectionalPath* p2) const {
- return p1->GetId() < p2->GetId();
- }
-};
-
-typedef set<BidirectionalPath*, PathComparator> BidirectionalPathSet;
-
-template<class Value>
-using BidirectionalPathMap = map<BidirectionalPath*, Value, PathComparator>;
-
-typedef std::multiset <BidirectionalPath *, PathComparator> BidirectionalPathMultiset;
-
-class PathContainer {
-
-public:
-
- typedef std::vector<PathPair> PathContainerT;
-
- class Iterator : public PathContainerT::iterator {
- public:
- Iterator(const PathContainerT::iterator& iter)
- : PathContainerT::iterator(iter) {
- }
- BidirectionalPath* get() const {
- return this->operator *().first;
- }
- BidirectionalPath* getConjugate() const {
- return this->operator *().second;
- }
- };
-
- class ConstIterator : public PathContainerT::const_iterator {
- public:
- ConstIterator(const PathContainerT::const_iterator& iter)
- : PathContainerT::const_iterator(iter) {
- }
- BidirectionalPath* get() const {
- return this->operator *().first;
- }
- BidirectionalPath* getConjugate() const {
- return this->operator *().second;
- }
- };
-
- PathContainer() {
- }
-
- BidirectionalPath& operator[](size_t index) const {
- return *(data_[index].first);
- }
-
- BidirectionalPath* Get(size_t index) const {
- return data_[index].first;
- }
-
- BidirectionalPath* GetConjugate(size_t index) const {
- return data_[index].second;
- }
-
- void DeleteAllPaths() {
- for (size_t i = 0; i < data_.size(); ++i) {
- delete data_[i].first;
- delete data_[i].second;
- }
- clear();
- }
-
- size_t size() const {
- return data_.size();
- }
-
- void clear() {
- data_.clear();
- }
-
- void reserve(size_t size) {
- data_.reserve(size);
- }
-
- bool AddPair(BidirectionalPath* p, BidirectionalPath* cp) {
- p->SetConjPath(cp);
- cp->SetConjPath(p);
- p->Subscribe(cp);
- cp->Subscribe(p);
- data_.push_back(std::make_pair(p, cp));
- return true;
- }
-
- void SortByLength() {
- std::stable_sort(data_.begin(), data_.end(), compare_path_pairs);
- }
-
- Iterator begin() {
- return Iterator(data_.begin());
- }
-
- Iterator end() {
- return Iterator(data_.end());
- }
-
-
- ConstIterator begin() const {
- return ConstIterator(data_.begin());
- }
-
- ConstIterator end() const {
- return ConstIterator(data_.end());
- }
-
- Iterator erase(Iterator iter) {
- return Iterator(data_.erase(iter));
- }
-
- void print() const {
- for (size_t i = 0; i < size(); ++i) {
- Get(i)->Print();
- GetConjugate(i)->Print();
- }
- }
-
- void FilterEmptyPaths() {
-        DEBUG("Trying to delete empty paths");
- for (Iterator iter = begin(); iter != end();) {
- if (iter.get()->Size() == 0) {
- iter = erase(iter);
- } else {
- ++iter;
- }
- }
- DEBUG("empty paths are removed");
- }
-
- void FilterInterstandBulges() {
-        DEBUG("Trying to delete paths with interstrand bulges");
- for (Iterator iter = begin(); iter != end(); ++iter) {
- if (iter.get()->IsInterstrandBulge()) {
- iter.get()->PopBack();
- }
- if (iter.getConjugate()->IsInterstrandBulge()) {
- iter.getConjugate()->PopBack();
- }
- }
-        DEBUG("Deleted paths with interstrand bulges");
- }
-
-private:
- std::vector<PathPair> data_;
-
-protected:
- DECL_LOGGER("BidirectionalPath");
-
-};
-
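-// Greedy comparison starting from (start_pos1, start_pos2): edges of path1 are matched against
-// later occurrences in path2 while the accumulated unmatched length stays within max_diff;
-// returns the last pair of matched positions.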
-inline pair<size_t, size_t> ComparePaths(size_t start_pos1, size_t start_pos2, const BidirectionalPath& path1, const BidirectionalPath& path2,
- size_t max_diff) {
- path1.Print();
- path2.Print();
- if (start_pos1 >= path1.Size() || start_pos2 >= path2.Size()) {
- return make_pair(start_pos1, start_pos2);
- }
- const Graph& g = path1.graph();
- size_t cur_pos = start_pos1;
- size_t last2 = start_pos2;
- size_t last1 = cur_pos;
- cur_pos++;
- size_t diff_len = 0;
- while (cur_pos < path1.Size()) {
- if (diff_len > max_diff) {
- return make_pair(last1, last2);
- }
- EdgeId e = path1[cur_pos];
- vector<size_t> poses2 = path2.FindAll(e);
- bool found = false;
- for (size_t pos2 = 0; pos2 < poses2.size(); ++pos2) {
- if (poses2[pos2] > last2) {
- if (path2.LengthAt(last2) - path2.LengthAt(poses2[pos2]) - g.length(path2.At(last2)) - path2.GapAt(poses2[pos2]) > max_diff) {
- break;
- }
- last2 = poses2[pos2];
- last1 = cur_pos;
- DEBUG("found " << cur_pos);
- found = true;
- break;
- }
- }
- if (!found) {
- diff_len += g.length(e) + path1.GapAt(cur_pos);
- DEBUG("not found " << cur_pos << " now diff len " << diff_len);
- } else {
- diff_len = 0;
- }
- cur_pos++;
- }
- return make_pair(last1, last2);
-}
-
-inline void DeletePaths(BidirectionalPathSet& paths) {
- for (auto i = paths.begin(); i != paths.end(); ++i) {
- delete (*i);
- }
-}
-
-inline void DeletePaths(vector<BidirectionalPath*>& paths) {
- for (auto i = paths.begin(); i != paths.end(); ++i) {
- delete (*i);
- }
-}
-
-inline void DeleteMapWithPaths(map<EdgeId, BidirectionalPath*> m) {
- for (auto i = m.begin(); i != m.end(); ++i){
- delete i->second;
- }
-}
-
-} // path extend
-
-#endif /* BIDIRECTIONAL_PATH_H_ */
diff --git a/src/debruijn/path_extend/extension_chooser.hpp b/src/debruijn/path_extend/extension_chooser.hpp
deleted file mode 100644
index cb6d897..0000000
--- a/src/debruijn/path_extend/extension_chooser.hpp
+++ /dev/null
@@ -1,1443 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-/*
- * extension.hpp
- *
- * Created on: Mar 5, 2012
- * Author: andrey
- */
-
-#ifndef EXTENSION_HPP_
-#define EXTENSION_HPP_
-
-#include <cfloat>
-#include <iostream>
-#include <fstream>
-#include "weight_counter.hpp"
-#include "pe_utils.hpp"
-#include "next_path_searcher.hpp"
-
-//#include "scaff_supplementary.hpp"
-
-namespace path_extend {
-
-typedef std::multimap<double, EdgeWithDistance> AlternativeContainer;
-
-class PathAnalyzer {
- const Graph& g_;
-
-public:
- PathAnalyzer(const Graph& g): g_(g) {
- }
-
- int ExcludeTrivial(const BidirectionalPath& path, std::set<size_t>& edges, int from = -1) const {
- int edgeIndex = (from == -1) ? (int) path.Size() - 1 : from;
- if ((int) path.Size() <= from) {
- return edgeIndex;
- }
- VertexId currentVertex = g_.EdgeEnd(path[edgeIndex]);
- while (edgeIndex >= 0 && g_.CheckUniqueIncomingEdge(currentVertex)) {
- EdgeId e = g_.GetUniqueIncomingEdge(currentVertex);
- currentVertex = g_.EdgeStart(e);
-
- edges.insert((size_t) edgeIndex);
- --edgeIndex;
- }
- return edgeIndex;
- }
-
- int ExcludeTrivialWithBulges(const BidirectionalPath& path, std::set<size_t>& edges) const {
-
- if (path.Empty()) {
- return 0;
- }
-
- int lastEdge = (int) path.Size() - 1;
- do {
- lastEdge = ExcludeTrivial(path, edges, lastEdge);
-
- if (lastEdge >= 0) {
- VertexId v = g_.EdgeEnd(path[lastEdge]);
- VertexId u = g_.EdgeStart(path[lastEdge]);
- auto bulgeCandidates = g_.IncomingEdges(v);
- bool bulge = true;
-
- for (auto iter = bulgeCandidates.begin(); iter != bulgeCandidates.end(); ++iter) {
- if (g_.EdgeStart(*iter) != u) {
- bulge = false;
- break;
- }
- }
-
- if (!bulge) {
- break;
- }
-
- --lastEdge;
- }
- } while (lastEdge >= 0);
-
- return lastEdge;
- }
-
-protected:
- DECL_LOGGER("PathAnalyzer")
-};
-
-
-class ExtensionChooserListener {
-
-public:
-
- virtual void ExtensionChosen(double weight) = 0;
-
- virtual void ExtensionChosen(const AlternativeContainer& alts) = 0;
-
- virtual ~ExtensionChooserListener() {
-
- }
-};
-
-
-class ExtensionChooser {
-
-public:
- typedef std::vector<EdgeWithDistance> EdgeContainer;
-
-protected:
- const Graph& g_;
- shared_ptr<WeightCounter> wc_;
- //FIXME memory leak?!
- std::vector<ExtensionChooserListener *> listeners_;
-
-private:
- double weight_threshold_;
- PathAnalyzer analyzer_;
-
- bool excludeTrivial_;
- bool excludeTrivialWithBulges_;
-
-
-public:
- ExtensionChooser(const Graph& g, shared_ptr<WeightCounter> wc = nullptr, double weight_threshold = -1.):
- g_(g), wc_(wc),
- weight_threshold_(weight_threshold), analyzer_(g),
- excludeTrivial_(true), excludeTrivialWithBulges_(true) {
- }
-
- virtual ~ExtensionChooser() {
-
- }
-
- virtual EdgeContainer Filter(const BidirectionalPath& path, const EdgeContainer& edges) const = 0;
-
- bool isExcludeTrivial() const
- {
- return excludeTrivial_;
- }
-
- bool isExcludeTrivialWithBulges() const
- {
- return excludeTrivialWithBulges_;
- }
-
- void setExcludeTrivial(bool excludeTrivial) {
- this->excludeTrivial_ = excludeTrivial;
- }
-
- void setExcludeTrivialWithBulges(bool excludeTrivialWithBulges) {
- this->excludeTrivialWithBulges_ = excludeTrivialWithBulges;
- }
-
- bool CheckThreshold(double weight) const {
- return math::ge(weight, weight_threshold_);
- }
-
- void Subscribe(ExtensionChooserListener * listener) {
- listeners_.push_back(listener);
- }
-
- void NotifyAll(double weight) const {
- for (auto listener_ptr : listeners_) {
- listener_ptr->ExtensionChosen(weight);
- }
- }
-
- void NotifyAll(const AlternativeContainer& alts) const {
- for (auto listener_ptr : listeners_) {
- listener_ptr->ExtensionChosen(alts);
- }
- }
-
- bool WeightCounterBased() const {
- return wc_ != nullptr;
- }
-
- const WeightCounter& wc() const {
- VERIFY(wc_);
- return *wc_;
- }
-
-protected:
- void RemoveTrivial(const BidirectionalPath& path, std::set<size_t>& to_exclude) const {
- if (excludeTrivialWithBulges_) {
- analyzer_.ExcludeTrivialWithBulges(path, to_exclude);
- } else if (excludeTrivial_) {
- analyzer_.ExcludeTrivial(path, to_exclude);
- }
- }
-
- bool HasIdealInfo(EdgeId e1, EdgeId e2, size_t dist) const {
- return math::gr(wc_->lib().IdealPairedInfo(e1, e2, (int) dist), 0.);
- }
-
- bool HasIdealInfo(const BidirectionalPath& p, EdgeId e, size_t gap) const {
- for (int i = (int) p.Size() - 1; i >= 0; --i)
- if (HasIdealInfo(p[i], e, gap + p.LengthAt(i)))
- return true;
- return false;
- }
-
-private:
- DECL_LOGGER("ExtensionChooser");
-};
-
-
-class JointExtensionChooser: public ExtensionChooser {
-
-protected:
- shared_ptr<ExtensionChooser> first_;
-
- shared_ptr<ExtensionChooser> second_;
-
-public:
- JointExtensionChooser(Graph& g, shared_ptr<ExtensionChooser> first, shared_ptr<ExtensionChooser> second): ExtensionChooser(g),
- first_(first), second_(second)
- {
- }
-
- EdgeContainer Filter(const BidirectionalPath& path, const EdgeContainer& edges) const override {
- EdgeContainer e1 = first_->Filter(path, edges);
- return second_->Filter(path, e1);
- }
-};
-
-
-class TrivialExtensionChooser: public ExtensionChooser {
-
-public:
- TrivialExtensionChooser(Graph& g): ExtensionChooser(g) {
- }
-
- EdgeContainer Filter(const BidirectionalPath& /*path*/, const EdgeContainer& edges) const override {
- if (edges.size() == 1) {
- return edges;
- }
- return EdgeContainer();
- }
-};
-
-
-class TrivialExtensionChooserWithPI: public ExtensionChooser {
-
-public:
- TrivialExtensionChooserWithPI(Graph& g, shared_ptr<WeightCounter> wc, double weight_threshold):
- ExtensionChooser(g, wc, weight_threshold) {
- }
-
- EdgeContainer Filter(const BidirectionalPath& path, const EdgeContainer& edges) const override {
- if (edges.size() == 1) {
- double weight = wc_->CountWeight(path, edges.back().e_, std::set<size_t>());
- NotifyAll(weight);
-
- if (CheckThreshold(weight)) {
- return edges;
- }
- }
- return EdgeContainer();
- }
-};
-
-class ExcludingExtensionChooser: public ExtensionChooser {
- //FIXME what is the logic behind it?
- double prior_coeff_;
-
- AlternativeContainer FindWeights(const BidirectionalPath& path, const EdgeContainer& edges, const std::set<size_t>& to_exclude) const {
- AlternativeContainer weights;
- for (auto iter = edges.begin(); iter != edges.end(); ++iter) {
- double weight = wc_->CountWeight(path, iter->e_, to_exclude);
- weights.insert(std::make_pair(weight, *iter));
- DEBUG("Candidate " << g_.int_id(iter->e_) << " weight " << weight << " length " << g_.length(iter->e_));
- }
- NotifyAll(weights);
- return weights;
- }
-
- EdgeContainer FindPossibleEdges(const AlternativeContainer& weights,
- double max_weight) const {
- EdgeContainer top;
- auto possible_edge = weights.lower_bound(max_weight / prior_coeff_);
- for (auto iter = possible_edge; iter != weights.end(); ++iter) {
- top.push_back(iter->second);
- }
- return top;
- }
-
- EdgeContainer FindFilteredEdges(const BidirectionalPath& path,
- const EdgeContainer& edges, const std::set<size_t>& to_exclude) const {
- AlternativeContainer weights = FindWeights(path, edges, to_exclude);
- auto max_weight = (--weights.end())->first;
- EdgeContainer top = FindPossibleEdges(weights, max_weight);
- EdgeContainer result;
- if (top.size() >= 1 && CheckThreshold(max_weight)) {
- result = top;
- }
- return result;
- }
-
-protected:
-
- virtual void ExcludeEdges(const BidirectionalPath& path, const EdgeContainer& edges, std::set<size_t>& to_exclude) const = 0;
-
-public:
- ExcludingExtensionChooser(const Graph& g, shared_ptr<WeightCounter> wc, double weight_threshold, double priority) :
- ExtensionChooser(g, wc, weight_threshold), prior_coeff_(priority) {
-
- }
-
- virtual EdgeContainer Filter(const BidirectionalPath& path,
- const EdgeContainer& edges) const {
- DEBUG("Paired-end extension chooser");
- if (edges.empty()) {
- return edges;
- }
- std::set<size_t> to_exclude;
- RemoveTrivial(path, to_exclude);
- path.Print();
- EdgeContainer result = edges;
- ExcludeEdges(path, result, to_exclude);
- result = FindFilteredEdges(path, result, to_exclude);
- if (result.size() == 1) {
- DEBUG("Paired-end extension chooser helped");
- }
- return result;
- }
-
-private:
- DECL_LOGGER("ExcludingExtensionChooser");
-
-};
-
-class SimpleExtensionChooser: public ExcludingExtensionChooser {
-protected:
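-    // A path position is excluded if at least one candidate edge has no ideal (theoretical)
-    // paired info with it at that distance, or if its observed paired info supports every
-    // candidate at once and is therefore uninformative.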
- void ExcludeEdges(const BidirectionalPath& path, const EdgeContainer& edges, std::set<size_t>& to_exclude) const override {
- if (edges.size() < 2) {
- return;
- }
-        //excluding based on absence of ideal info
- int index = (int) path.Size() - 1;
- while (index >= 0) {
- if (to_exclude.count(index)) {
- index--;
- continue;
- }
- EdgeId path_edge = path[index];
-
- for (size_t i = 0; i < edges.size(); ++i) {
- if (!HasIdealInfo(path_edge,
- edges.at(i).e_,
- path.LengthAt(index))) {
- to_exclude.insert((size_t) index);
- }
- }
-
- index--;
- }
-
-        //excluding based on presence of ambiguous paired info
- map<size_t, unsigned> edge_2_extension_cnt;
- for (size_t i = 0; i < edges.size(); ++i) {
- for (size_t e : wc_->PairInfoExist(path, edges.at(i).e_)) {
- edge_2_extension_cnt[e] += 1;
- }
- }
-
- for (auto e_w_ec : edge_2_extension_cnt) {
- if (e_w_ec.second == edges.size()) {
- to_exclude.insert(e_w_ec.first);
- }
- }
- }
-
-public:
-
- SimpleExtensionChooser(const Graph& g, shared_ptr<WeightCounter> wc, double weight_threshold, double priority) :
- ExcludingExtensionChooser(g, wc, weight_threshold, priority) {
- }
-
-private:
- DECL_LOGGER("SimpleExtensionChooser");
-};
-
-class LongEdgeExtensionChooser: public ExcludingExtensionChooser {
-protected:
- virtual void ExcludeEdges(const BidirectionalPath& path, const EdgeContainer& edges, std::set<size_t>& to_exclude) const {
- if (edges.size() < 2) {
- return;
- }
- int index = (int) path.Size() - 1;
- while (index >= 0) {
- if (to_exclude.count(index)) {
- index--;
- continue;
- }
- EdgeId path_edge = path[index];
- //FIXME configure!
- if (path.graph().length(path_edge) < 200)
- to_exclude.insert((size_t) index);
- index--;
- }
- }
-public:
- LongEdgeExtensionChooser(const Graph& g, shared_ptr<WeightCounter> wc, double weight_threshold, double priority) :
- ExcludingExtensionChooser(g, wc, weight_threshold, priority) {
- }
-};
-
-class ScaffoldingExtensionChooser : public ExtensionChooser {
-
-protected:
- typedef ExtensionChooser base;
- double raw_weight_threshold_;
- double cl_weight_threshold_;
- const double is_scatter_coeff_ = 3.0;
-
- void AddInfoFromEdge(const std::vector<int>& distances, const std::vector<double>& weights,
- std::vector<pair<int, double>>& histogram, size_t len_to_path_end) const {
- for (size_t l = 0; l < distances.size(); ++l) {
- //todo commented out condition seems unnecessary and should be library dependent! do we need "max(0" there?
- if (/*distances[l] > max(0, (int) len_to_path_end - int(1000)) && */math::ge(weights[l], raw_weight_threshold_)) {
- histogram.push_back(make_pair(distances[l] - (int) len_to_path_end, weights[l]));
- }
- }
- }
-
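-    // Weighted mean of the (distance, weight) histogram, rounded to the nearest integer.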
- int CountMean(const vector<pair<int, double> >& histogram) const {
- double dist = 0.0;
- double sum = 0.0;
- for (size_t i = 0; i < histogram.size(); ++i) {
- dist += histogram[i].first * histogram[i].second;
- sum += histogram[i].second;
- }
- dist /= sum;
- return (int) round(dist);
- }
-
- void GetDistances(EdgeId e1, EdgeId e2, std::vector<int>& dist,
- std::vector<double>& w) const {
- wc_->lib().CountDistances(e1, e2, dist, w);
- }
-
- void CountAvrgDists(const BidirectionalPath& path, EdgeId e, std::vector<pair<int, double>> & histogram) const {
- for (size_t j = 0; j < path.Size(); ++j) {
- std::vector<int> distances;
- std::vector<double> weights;
- GetDistances(path.At(j), e, distances, weights);
- if (distances.size() > 0) {
- AddInfoFromEdge(distances, weights, histogram, path.LengthAt(j));
- }
- }
- }
-
- void FindBestFittedEdgesForClustered(const BidirectionalPath& path, const set<EdgeId>& edges, EdgeContainer& result) const {
- for (EdgeId e : edges) {
- std::vector<pair<int, double>> histogram;
- CountAvrgDists(path, e, histogram);
- double sum = 0.0;
- for (size_t j = 0; j < histogram.size(); ++j) {
- sum += histogram[j].second;
- }
- if (sum <= cl_weight_threshold_) {
- continue;
- }
- int gap = CountMean(histogram);
- if (HasIdealInfo(path, e, gap)) {
- DEBUG("scaffolding " << g_.int_id(e) << " gap " << gap);
- result.push_back(EdgeWithDistance(e, gap));
- }
- }
- }
-
- bool IsTip(EdgeId e) const {
- return g_.IncomingEdgeCount(g_.EdgeStart(e)) == 0;
- }
-
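-    // Candidate edges for scaffolding: edges reachable via paired (jump) info from roughly the
-    // last insert-size worth of the path, padded by is_scatter_coeff_ times the insert-size
-    // variance, and whose start vertex has no incoming edges (i.e. they open a new stretch).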
- set<EdgeId> FindCandidates(const BidirectionalPath& path) const {
- set<EdgeId> jumping_edges;
- const auto& lib = wc_->lib();
- //todo lib (and FindJumpEdges) knows its var so it can be counted there
- int is_scatter = int(math::round(double(lib.GetIsVar()) * is_scatter_coeff_));
- for (int i = (int) path.Size() - 1; i >= 0 && path.LengthAt(i) - g_.length(path.At(i)) <= lib.GetISMax(); --i) {
- set<EdgeId> jump_edges_i;
- lib.FindJumpEdges(path.At(i), jump_edges_i,
- std::max(0, (int)path.LengthAt(i) - is_scatter),
- //FIXME do we need is_scatter here?
- int((path.LengthAt(i) + lib.GetISMax() + is_scatter)),
- 0);
- for (EdgeId e : jump_edges_i) {
- if (IsTip(e)) {
- jumping_edges.insert(e);
- }
- }
- }
- return jumping_edges;
- }
-
-public:
-
- ScaffoldingExtensionChooser(const Graph& g, shared_ptr<WeightCounter> wc, double is_scatter_coeff) :
- ExtensionChooser(g, wc), raw_weight_threshold_(0.0),
- cl_weight_threshold_(cfg::get().pe_params.param_set.scaffolder_options.cl_threshold),
- is_scatter_coeff_(is_scatter_coeff) {
- }
-
- EdgeContainer Filter(const BidirectionalPath& path, const EdgeContainer& edges) const override {
- if (edges.empty()) {
- return edges;
- }
- set<EdgeId> candidates = FindCandidates(path);
- EdgeContainer result;
- FindBestFittedEdgesForClustered(path, candidates, result);
- return result;
- }
-private:
- DECL_LOGGER("ScaffoldingExtensionChooser");
-};
-
-inline bool EdgeWithWeightCompareReverse(const pair<EdgeId, double>& p1,
- const pair<EdgeId, double>& p2) {
- return p1.second > p2.second;
-}
-
-class LongReadsUniqueEdgeAnalyzer {
-private:
- DECL_LOGGER("LongReadsUniqueEdgeAnalyzer")
-public:
- LongReadsUniqueEdgeAnalyzer(const Graph& g, const GraphCoverageMap& cov_map,
- double filter_threshold, double prior_threshold)
- : g_(g),
- cov_map_(cov_map),
- filter_threshold_(filter_threshold),
- prior_threshold_(prior_threshold) {
- FindAllUniqueEdges();
- }
-
- bool IsUnique(EdgeId e) const {
- return unique_edges_.count(e) > 0;
- }
-
-private:
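-    // An edge is considered unique if it is longer than max_repeat_length, or if no covering
-    // long-read path visits it twice and all covering paths are mutually consistent around it
-    // (inconsistencies with comparable weights disqualify the edge).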
- bool UniqueEdge(EdgeId e) const {
- if (g_.length(e) > cfg::get().max_repeat_length)
- return true;
- DEBUG("Analyze unique edge " << g_.int_id(e));
- if (cov_map_.size() == 0) {
- return false;
- }
- auto cov_paths = cov_map_.GetCoveringPaths(e);
- for (auto it1 = cov_paths.begin(); it1 != cov_paths.end(); ++it1) {
- auto pos1 = (*it1)->FindAll(e);
- if (pos1.size() > 1) {
- DEBUG("***not unique " << g_.int_id(e) << " len " << g_.length(e) << "***");
- return false;
- }
- for (auto it2 = it1; it2 != cov_paths.end(); it2++) {
- auto pos2 = (*it2)->FindAll(e);
- if (pos2.size() > 1) {
- DEBUG("***not unique " << g_.int_id(e) << " len " << g_.length(e) << "***");
- return false;
- }
- if (!ConsistentPath(**it1, pos1[0], **it2, pos2[0])) {
- DEBUG("Checking inconsistency");
- if (CheckInconsistence(**it1, pos1[0], **it2, pos2[0],
- cov_paths)) {
- DEBUG("***not unique " << g_.int_id(e) << " len " << g_.length(e) << "***");
- return false;
- }
- }
- }
- }
- DEBUG("***edge " << g_.int_id(e) << " is unique.***");
- return true;
- }
-
- bool ConsistentPath(const BidirectionalPath& path1, size_t pos1,
- const BidirectionalPath& path2, size_t pos2) const {
- return EqualBegins(path1, pos1, path2, pos2, false)
- && EqualEnds(path1, pos1, path2, pos2, false);
- }
- bool SignificantlyDiffWeights(double w1, double w2) const {
- if (w1 > filter_threshold_ and w2 > filter_threshold_) {
- if (w1 > w2 * prior_threshold_ or w2 > w1 * prior_threshold_) {
- return true;
- }
- return false;
- }
- return true;
- }
-
- bool CheckInconsistence(
- const BidirectionalPath& path1, size_t pos1,
- const BidirectionalPath& path2, size_t pos2,
- const BidirectionalPathSet& cov_paths) const {
- size_t first_diff_pos1 = FirstNotEqualPosition(path1, pos1, path2, pos2, false);
- size_t first_diff_pos2 = FirstNotEqualPosition(path2, pos2, path1, pos1, false);
- if (first_diff_pos1 != -1UL && first_diff_pos2 != -1UL) {
- const BidirectionalPath cand1 = path1.SubPath(first_diff_pos1,
- pos1 + 1);
- const BidirectionalPath cand2 = path2.SubPath(first_diff_pos2,
- pos2 + 1);
- std::pair<double, double> weights = GetSubPathsWeights(cand1, cand2,
- cov_paths);
- DEBUG("Not equal begin " << g_.int_id(path1.At(first_diff_pos1)) << " weight " << weights.first << "; " << g_.int_id(path2.At(first_diff_pos2)) << " weight " << weights.second);
- if (!SignificantlyDiffWeights(weights.first, weights.second)) {
- DEBUG("not significantly different");
- return true;
- }
- }
- size_t last_diff_pos1 = LastNotEqualPosition(path1, pos1, path2, pos2, false);
- size_t last_diff_pos2 = LastNotEqualPosition(path2, pos2, path1, pos1, false);
-        if (last_diff_pos1 != -1UL && last_diff_pos2 != -1UL) {
- const BidirectionalPath cand1 = path1.SubPath(pos1,
- last_diff_pos1 + 1);
- const BidirectionalPath cand2 = path2.SubPath(pos2,
- last_diff_pos2 + 1);
- std::pair<double, double> weights = GetSubPathsWeights(cand1, cand2,
- cov_paths);
- DEBUG("Not equal end " << g_.int_id(path1.At(last_diff_pos1)) << " weight " << weights.first << "; " << g_.int_id(path2.At(last_diff_pos2)) << " weight " << weights.second);
- if (!SignificantlyDiffWeights(weights.first, weights.second)) {
- DEBUG("not significantly different");
- return true;
- }
- }
- return false;
- }
-
- std::pair<double, double> GetSubPathsWeights(
- const BidirectionalPath& cand1, const BidirectionalPath& cand2,
- const BidirectionalPathSet& cov_paths) const {
- double weight1 = 0.0;
- double weight2 = 0.0;
- for (auto iter = cov_paths.begin(); iter != cov_paths.end(); ++iter) {
- BidirectionalPath* path = *iter;
- if (ContainSubPath(*path, cand1)) {
- weight1 += path->GetWeight();
- } else if (ContainSubPath(*path, cand2)) {
- weight2 += path->GetWeight();
- }
- }
- return std::make_pair(weight1, weight2);
- }
-
- bool ContainSubPath(const BidirectionalPath& path,
- const BidirectionalPath& subpath) const {
- for (size_t i = 0; i < path.Size(); ++i) {
- if (path.CompareFrom(i, subpath))
- return true;
- }
- return false;
- }
-
- void FindAllUniqueCoverageEdges() {
- if (cfg::get().ds.single_cell) {
- return;
- }
- double sum_cov = 0;
- size_t sum_len = 0;
- size_t total_len = 0;
- for (auto iter = g_.ConstEdgeBegin(); !iter.IsEnd(); ++iter) {
- total_len += g_.length(*iter);
- if (g_.length(*iter) >= cfg::get().max_repeat_length) {
- sum_cov += g_.coverage(*iter) * (double)g_.length(*iter);
- sum_len += g_.length(*iter);
- }
- }
- if (sum_len * 4 < total_len) return;
- sum_cov /= (double)sum_len;
- DEBUG("average coverage of long edges: " << sum_cov) ;
- for (auto iter = g_.ConstEdgeBegin(); !iter.IsEnd(); ++iter) {
- if (g_.length(*iter) > 500 && (double)g_.coverage(*iter) < 1.2 * sum_cov) {
- if (unique_edges_.find(*iter) == unique_edges_.end()) {
- unique_edges_.insert(*iter);
- unique_edges_.insert(g_.conjugate(*iter));
- DEBUG("Added coverage based unique edge " << g_.int_id(*iter) << " len "<< g_.length(*iter) << " " << g_.coverage(*iter));
- }
- }
- }
- }
-
-
- void FindAllUniqueEdges() {
- DEBUG("Looking for unique edges");
- for (auto iter = g_.ConstEdgeBegin(); !iter.IsEnd(); ++iter) {
- if (UniqueEdge(*iter)) {
- unique_edges_.insert(*iter);
- unique_edges_.insert(g_.conjugate(*iter));
- }
- }
- DEBUG("coverage based uniqueness started");
- FindAllUniqueCoverageEdges();
- DEBUG("Unique edges are found");
- }
-
- const Graph& g_;
- const GraphCoverageMap& cov_map_;
- double filter_threshold_;
- double prior_threshold_;
- std::set<EdgeId> unique_edges_;
-
-};
-
-class SimpleScaffolding {
-public:
- SimpleScaffolding(const Graph& g) : g_(g) {}
-
- BidirectionalPath FindMaxCommonPath(const vector<BidirectionalPath*>& paths,
- size_t max_diff_len) const {
- BidirectionalPath max_end(g_);
- for (auto it1 = paths.begin(); it1 != paths.end(); ++it1) {
- BidirectionalPath* p1 = *it1;
- for (size_t i = 0; i < p1->Size(); ++i) {
- if (p1->Length() - p1->LengthAt(i) > max_diff_len) {
- break;
- }
- bool contain_all = true;
- for (size_t i1 = i + 1; i1 <= p1->Size() && contain_all; ++i1) {
- BidirectionalPath subpath = p1->SubPath(i, i1);
- for (auto it2 = paths.begin(); it2 != paths.end() && contain_all; ++it2) {
- BidirectionalPath* p2 = *it2;
- vector<size_t> positions2 = p2->FindAll(subpath.At(0));
- bool contain = false;
- for (size_t ipos2 = 0; ipos2 < positions2.size(); ++ipos2) {
- size_t pos2 = positions2[ipos2];
- if (p2->Length() - p2->LengthAt(pos2) <= max_diff_len
- && EqualEnds(subpath, 0, *p2, pos2, false)) {
- contain = true;
- break;
- }
- }
- if (!contain) {
- contain_all = false;
- }
- }
- if (contain_all && (i1 - i) >= max_end.Size()) {
- max_end.Clear();
- max_end.PushBack(subpath);
- }
- }
- }
- }
- return max_end;
- }
-
-private:
- const Graph& g_;
-};
-
-class LongReadsExtensionChooser : public ExtensionChooser {
-public:
- LongReadsExtensionChooser(const Graph& g, PathContainer& pc,
- double filtering_threshold,
- double weight_priority_threshold,
- double unique_edge_priority_threshold)
- : ExtensionChooser(g),
- filtering_threshold_(filtering_threshold),
- weight_priority_threshold_(weight_priority_threshold),
- cov_map_(g, pc),
- unique_edge_analyzer_(g, cov_map_, filtering_threshold, unique_edge_priority_threshold),
- simple_scaffolding_(g) {
-
- }
-
- /* Choose extension as correct only if we have reads that traverse a unique edge from the path and this extension.
- * Edge is unique if all reads mapped to this edge are consistent.
- * Two reads are consistent if they can form one path in the graph.
- */
- EdgeContainer Filter(const BidirectionalPath& path,
- const EdgeContainer& edges) const override {
- if (edges.empty()) {
- return edges;
- }DEBUG("We in Filter of LongReadsExtensionChooser");
- path.Print();
- map<EdgeId, double> weights_cands;
- for (auto it = edges.begin(); it != edges.end(); ++it) {
- weights_cands.insert(make_pair(it->e_, 0.0));
- }
- set<EdgeId> filtered_cands;
- map<EdgeId, BidirectionalPathSet > support_paths_ends;
- auto support_paths = cov_map_.GetCoveringPaths(path.Back());
- DEBUG("Found " << support_paths.size() << " covering paths!!!");
- for (auto it = support_paths.begin(); it != support_paths.end(); ++it) {
- auto positions = (*it)->FindAll(path.Back());
- (*it)->Print();
- for (size_t i = 0; i < positions.size(); ++i) {
- if ((int) positions[i] < (int) (*it)->Size() - 1
- && EqualBegins(path, (int) path.Size() - 1, **it,
- positions[i], false)) {
- DEBUG("Checking unique path_back for " << (*it)->GetId());
-
- if (UniqueBackPath(**it, positions[i])) {
- DEBUG("Success");
-
- EdgeId next = (*it)->At(positions[i] + 1);
- weights_cands[next] += (*it)->GetWeight();
- filtered_cands.insert(next);
- if (support_paths_ends.count(next) == 0){
- support_paths_ends[next] = BidirectionalPathSet();
- }
- support_paths_ends[next].insert(new BidirectionalPath((*it)->SubPath(positions[i] + 1)));
- }
- }
- }
- }
- DEBUG("Candidates");
- for (auto iter = weights_cands.begin(); iter != weights_cands.end(); ++iter) {
- DEBUG("Candidate " << g_.int_id(iter->first) << " weight " << iter->second);
- }
- vector<pair<EdgeId, double> > sort_res = MapToSortVector(weights_cands);
- DEBUG("sort res " << sort_res.size() << " tr " << weight_priority_threshold_);
- if (sort_res.size() < 1 || sort_res[0].second < filtering_threshold_) {
- filtered_cands.clear();
- } else if (sort_res.size() > 1
- && sort_res[0].second > weight_priority_threshold_ * sort_res[1].second) {
- filtered_cands.clear();
- filtered_cands.insert(sort_res[0].first);
- } else if (sort_res.size() > 1) {
- for (size_t i = 0; i < sort_res.size(); ++i) {
- if (sort_res[i].second * weight_priority_threshold_ < sort_res[0].second) {
- filtered_cands.erase(sort_res[i].first);
- }
- }
- }
- EdgeContainer result;
- for (auto it = edges.begin(); it != edges.end(); ++it) {
- if (filtered_cands.find(it->e_) != filtered_cands.end()) {
- result.push_back(*it);
- }
- }
- if (result.size() != 1) {
- DEBUG("Long reads doesn't help =(");
- }
- return result;
- }
-
-private:
- bool UniqueBackPath(const BidirectionalPath& path, size_t pos) const {
- int int_pos = (int) pos;
- while (int_pos >= 0) {
-            if (unique_edge_analyzer_.IsUnique(path.At(int_pos)))
- return true;
- int_pos--;
- }
- return false;
- }
-
- vector<pair<EdgeId, double> > MapToSortVector(
- map<EdgeId, double>& map) const {
- vector<pair<EdgeId, double> > result1(map.begin(), map.end());
- std::sort(result1.begin(), result1.end(), EdgeWithWeightCompareReverse);
- return result1;
- }
-
- double filtering_threshold_;
- double weight_priority_threshold_;
- const GraphCoverageMap cov_map_;
- LongReadsUniqueEdgeAnalyzer unique_edge_analyzer_;
- SimpleScaffolding simple_scaffolding_;
-
- DECL_LOGGER("LongReadsExtensionChooser");
-};
-
-class MatePairExtensionChooser : public ExtensionChooser {
-public:
- MatePairExtensionChooser(const Graph& g, shared_ptr<PairedInfoLibrary> lib,
- const PathContainer& paths, size_t max_number_of_paths_to_search)
- : ExtensionChooser(g),
- g_(g),
- lib_(lib),
- search_dist_(lib->GetISMax()),
- weight_counter_(g, lib, 10),
- cov_map_(g_, paths),
- path_searcher_(g_, cov_map_, lib_->GetISMax(), PathsWeightCounter(g, lib, (size_t) lib->GetSingleThreshold()), max_number_of_paths_to_search),
- unique_edge_analyzer_(g, cov_map_, 0., 1000.),
- simple_scaffolder_(g) {
- }
-
- //Attention! Uses const_cast to modify path!!!
- EdgeContainer Filter(const BidirectionalPath& path,
- const EdgeContainer& init_edges) const override {
- DEBUG("mp chooser");
- path.Print();
- if (path.Length() < lib_->GetISMin()) {
- return EdgeContainer();
- }
- EdgeContainer edges = TryResolveBulge(path, init_edges);
- map<EdgeId, BidirectionalPath*> best_paths;
- for (size_t iedge = 0; iedge < edges.size(); ++iedge) {
- BidirectionalPathSet following_paths = path_searcher_.FindNextPaths(path, edges[iedge].e_);
- vector<BidirectionalPath*> max_weighted = MaxWeightedPath(path, following_paths);
- if (max_weighted.size() == 0) {
- DEBUG("too much paths or tip");
- DeleteMapWithPaths(best_paths);
- DeletePaths(following_paths);
- best_paths.clear();
- break;
- } else {
- best_paths[edges[iedge].e_] = new BidirectionalPath(*max_weighted[0]);
- }
- DeletePaths(following_paths);
- }
-
- BidirectionalPathSet next_paths;
- if (edges.size() == 0) {
- DEBUG("scaffolding edges size " << edges.size())
- next_paths = path_searcher_.FindNextPaths(path, path.Back());
- } else if (best_paths.size() == edges.size()) {
- for (size_t iedge = 0; iedge < edges.size(); ++iedge) {
- if (best_paths.count(edges[iedge].e_) > 0){
- next_paths.insert(best_paths[edges[iedge].e_]);
- }
- }
- }
- EdgeContainer result = ChooseBest(path, next_paths);
- if (result.size() != 1) {
- DEBUG("scaffold tree");
- result = ScaffoldTree(const_cast<BidirectionalPath&>(path));
- }
- DeletePaths(next_paths);
- if (result.size() != 1) {
- DEBUG("nobody can extend " << g_.int_id(path.Back()));
- }
- return result;
- }
-
-private:
- EdgeContainer ScaffoldTree(BidirectionalPath& path) const {
- DEBUG("try scaffold tree");
- vector<BidirectionalPath*> next_paths = path_searcher_.ScaffoldTree(path);
- VERIFY(next_paths.size() <= 1);
- EdgeContainer result;
- if (!next_paths.empty() && next_paths.back()->Size() > 0) {
- BidirectionalPath* res = next_paths.back();
- for (size_t i = 0; i < res->Size() - 1; ++i) {
- path.PushBack(res->At(i), res->GapAt(i), res->TrashPreviousAt(i), res->TrashCurrentAt(i));
- }
- result = EdgeContainer(1, EdgeWithDistance(res->Back(), res->GapAt(res->Size() - 1)));
- }
- DeletePaths(next_paths);
- return result;
- }
-
- bool IsBulge(const EdgeContainer& edges) const {
- if (edges.size() == 0)
- return false;
- for (EdgeWithDistance e : edges) {
- if (!InBuble(e.e_, g_))
- return false;
- }
- return true;
- }
-
- map<EdgeId, double> FindBulgeWeights(const BidirectionalPath& p, const EdgeContainer& edges) const {
- map<EdgeId, double> result;
- for (size_t i = 0; i < edges.size(); ++i) {
- result[edges[i].e_] = 0.0;
- }
- for (size_t i = 0; i < p.Size(); ++i) {
- bool common = true;
- bool common_ideal = true;
- for (EdgeWithDistance e : edges) {
- common_ideal = common_ideal && weight_counter_.HasIdealPI(p.At(i), e.e_, (int) p.LengthAt(i));
- common = common && weight_counter_.HasPI(p.At(i), e.e_, (int) p.LengthAt(i));
- }
- if (!common_ideal || common) {
- continue;
- }
- for (size_t j = 0; j < edges.size(); ++j) {
- result[edges[j].e_] += weight_counter_.PI(p.At(i), edges[j].e_, (int) p.LengthAt(i));
- }
- }
- return result;
- }
-
- EdgeContainer TryResolveBulge(const BidirectionalPath& p, const EdgeContainer& edges) const {
- if (!IsBulge(edges))
- return edges;
- map<EdgeId, double> weights = FindBulgeWeights(p, edges);
- double max_w = 0.0;
- EdgeContainer result;
- for (EdgeWithDistance e : edges) {
- double w = weights[e.e_];
- DEBUG("bulge " << g_.int_id(e.e_) << " w = " << w);
- if (math::gr(w, max_w)) {
- max_w = w;
- result.clear();
- result.push_back(e);
- } else if (math::eq(w, max_w)) {
- result.push_back(e);
- }
- }
- if (result.size() != 1) {
- result = edges;
- }
- return result;
- }
-
- EdgeContainer ChooseBest(const BidirectionalPath& path, const BidirectionalPathSet& next_paths) const {
- DEBUG("Try to choose from best paths...");
- vector<BidirectionalPath*> best_path = MaxWeightedPath(path, next_paths);
- EdgeContainer result;
- if (best_path.size() == 1) {
- result.push_back(EdgeWithDistance((*best_path.begin())->At(0), (*best_path.begin())->GapAt(0)));
- } else if (best_path.size() > 1) {
- result = TryToScaffold(path, best_path);
- }
- return result;
- }
-
- bool HasPIFromUniqueEdges(const BidirectionalPath& p1, const BidirectionalPath& p2, const set<size_t>& p1_unique_edges) const {
- for (size_t i1 = 0; i1 < p1.Size(); ++i1) {
- if (p1_unique_edges.find(i1) == p1_unique_edges.end()) {
- continue;
- }
- for (size_t i2 = 0; i2 < p2.Size(); ++i2) {
- int gap = (int) p1.LengthAt(i1) + (int) p2.Length() - (int) p2.LengthAt(i2);
- if (unique_edge_analyzer_.IsUnique(p2.At(i2)) && weight_counter_.HasPI(p1.At(i1), p2.At(i2), gap)) {
- DEBUG("has unique edge " << g_.int_id(p1.At(i1)) << " " << g_.int_id(p2.At(i2)));
- return true;
- }
- }
- }
- return false;
- }
-
- bool SignificallyDifferentEdges(const BidirectionalPath& init_path, const BidirectionalPath& path1, const map<size_t, double>& pi1,
- const BidirectionalPath& path2, const map<size_t, double>& pi2, const set<size_t>& unique_init_edges) const {
- double not_common_w1 = 0.0;
- double common_w = 0.0;
- for (auto iter = pi1.begin(); iter != pi1.end(); ++iter) {
- auto iter2 = pi2.find(iter->first);
- double w = 0.0;
- if (iter2 != pi2.end() && !math::eq(iter2->second, 0.0)) {
- w = min(iter2->second, iter->second);
- }
- not_common_w1 += iter->second - w;
- common_w += w;
- }
- if (common_w < 0.8 * (not_common_w1 + common_w)
- || (HasPIFromUniqueEdges(init_path, path1, unique_init_edges) && !HasPIFromUniqueEdges(init_path, path2, unique_init_edges))) {
- DEBUG("common_w " << common_w << " sum * 0.8 = " << 0.8 * (not_common_w1 + common_w))
- return true;
- }
- return false;
- }
-
- set<size_t> FindNotCommonEdges(const BidirectionalPath& path, const BidirectionalPathMap< map<size_t, double> >& all_pi) const {
- set<size_t> res;
- for (size_t i = 0; i < path.Size(); ++i) {
- if (!unique_edge_analyzer_.IsUnique(path.At(i))) {
- continue;
- }
- size_t pi_count = 0;
- for (auto iter = all_pi.begin(); iter != all_pi.end(); ++iter) {
- const map<size_t, double>& info = iter->second;
- if (info.count(i) > 0 && math::gr(info.at(i), 0.0)) {
- pi_count++;
- }
- }
- if (pi_count == 1)
- res.insert(i);
- }
- return res;
- }
-
- void DeleteSmallWeights(const BidirectionalPath& path, BidirectionalPathSet& paths, BidirectionalPathMap< map<size_t, double> >& all_pi) const {
- double max_weight = 0.0;
- BidirectionalPath* max_path = NULL;
- for (auto iter = paths.begin(); iter != paths.end(); ++iter) {
- if ((*iter)->GetWeight() >= max_weight) {
- max_weight = max(max_weight, (*iter)->GetWeight());
- max_path = *iter;
- }
- }
- BidirectionalPathSet to_del;
- for (auto iter = paths.begin(); iter != paths.end(); ++iter) {
- if (math::gr(max_weight, (*iter)->GetWeight() * 1.5) //TODO: move 1.5 to config
- && SignificallyDifferentEdges(path, *max_path, all_pi.find(max_path)->second, **iter, all_pi.find(*iter)->second,
- FindNotCommonEdges(path, all_pi)))
- to_del.insert(*iter);
- }
- for (BidirectionalPath* p : to_del) {
- paths.erase(p);
- all_pi.erase(p);
- }
- }
-
- void DeleteCommonPi(const BidirectionalPath& p, BidirectionalPathMap< map<size_t, double> >& all_pi) const {
- weight_counter_.ClearCommonWeight();
- for (size_t i = 0; i < p.Size(); ++i) {
- double common = DBL_MAX;
- for (auto iter = all_pi.begin(); iter != all_pi.end(); ++iter) {
- common = iter->second.count(i) == 0 ? 0.0 : min(common, iter->second.at(i));
- }
- weight_counter_.SetCommonWeightFrom(i, common);
- }
- }
-
- size_t FindCommonBegin(const BidirectionalPathSet& paths) const {
- if (paths.size() == 0) {
- return 0;
- }
- size_t common_begin = 0;
- BidirectionalPath* p = *paths.begin();
- while (common_begin < p->Size()) {
- EdgeId e = p->At(common_begin);
- for (BidirectionalPath* next : paths) {
- if (common_begin >= next->Size() || next->At(common_begin) != e) {
- return common_begin;
- }
- }
- common_begin++;
- }
- return common_begin;
- }
-
- void CountAllPairInfo(const BidirectionalPath& path, const BidirectionalPathSet& next_paths,
- BidirectionalPathMap<map<size_t, double>>& result) const {
- result.clear();
- size_t common_begin = FindCommonBegin(next_paths);
- DEBUG("common begin " << common_begin);
- for (BidirectionalPath* next : next_paths) {
- result[next] = weight_counter_.FindPairInfoFromPath(path, 0, path.Size(), *next, common_begin, next->Size());
- }
- }
-
- void CountWeightsAndFilter(const BidirectionalPath& path, BidirectionalPathSet& next_paths, bool delete_small_w) const {
- BidirectionalPathMap<map<size_t, double> > all_pi;
- CountAllPairInfo(path, next_paths, all_pi);
- DeleteCommonPi(path, all_pi);
- for (BidirectionalPath* next : next_paths) {
- next->SetWeight((float) weight_counter_.CountPairInfo(path, 0, path.Size(), *next, 0, next->Size()));
- }
- if (delete_small_w) {
- DeleteSmallWeights(path, next_paths, all_pi);
- }
- }
-
- struct PathWithWeightSort {
- PathWithWeightSort(const MatePairExtensionChooser& mp_chooser, const BidirectionalPath& path, BidirectionalPathMap< map<size_t, double> >& all_pi)
- : mp_chooser_(mp_chooser),
- path_(path),
- not_common_(mp_chooser_.FindNotCommonEdges(path_, all_pi)) {
- }
-
- bool operator()(const BidirectionalPath* p1, const BidirectionalPath* p2) {
- if (mp_chooser_.HasPIFromUniqueEdges(path_, *p1, not_common_) && !mp_chooser_.HasPIFromUniqueEdges(path_, *p2, not_common_)) {
- return true;
- }
- if (mp_chooser_.HasPIFromUniqueEdges(path_, *p2, not_common_) && !mp_chooser_.HasPIFromUniqueEdges(path_, *p1, not_common_)) {
- return false;
- }
- if (!math::eq(p1->GetWeight(), p2->GetWeight())) {
- return math::gr(p1->GetWeight(), p2->GetWeight());
- }
- if (p1->Length() != p2->Length()) {
- return p1->Length() > p2->Length();
- }
- return p1->Size() > p2->Size();
- }
- const MatePairExtensionChooser& mp_chooser_;
- const BidirectionalPath& path_;
- const set<size_t> not_common_;
- };
-
- vector<BidirectionalPath*> SortResult(const BidirectionalPath& path, BidirectionalPathSet& next_paths) const {
- BidirectionalPathMap< map<size_t, double> > all_pi;
- CountAllPairInfo(path, next_paths, all_pi);
- CountWeightsAndFilter(path, next_paths, false);
- vector<BidirectionalPath*> to_sort(next_paths.begin(), next_paths.end());
- PathWithWeightSort comparator(*this, path, all_pi);
- std::sort(to_sort.begin(), to_sort.end(), comparator);
- return to_sort;
- }
-
- vector<BidirectionalPath*> MaxWeightedPath(const BidirectionalPath& path, const BidirectionalPathSet& following_paths) const {
- BidirectionalPathSet result(following_paths);
- BidirectionalPathSet prev_result;
- while (prev_result.size() != result.size()) {
- prev_result = result;
- DEBUG("iteration with paths " << result.size());
- CountWeightsAndFilter(path, result, true);
- if (result.size() == 0)
- result = prev_result;
- if (result.size() == 1)
- break;
- }
- if (result.size() == 0) {
- DEBUG("bad case");
- return vector<BidirectionalPath*>();
- }
- return SortResult(path, result);
- }
-
-    BidirectionalPath ChooseFromEnds(const BidirectionalPath& path, const vector<BidirectionalPath*>& paths, const BidirectionalPath& end) const { //TODO: rewrite
- DEBUG("choose from ends " << paths.size());
- end.Print();
- vector<BidirectionalPath*> new_paths;
- vector<BidirectionalPath*> paths_to_cover;
- for (BidirectionalPath* p : paths) {
- int from = 0;
- int pos = p->FindFirst(end, from);
- while (pos > -1) {
- BidirectionalPath* new_p = new BidirectionalPath(path);
- BidirectionalPath* new_end = new BidirectionalPath(p->SubPath(0, pos + end.Size()));
- new_p->PushBack(*new_end);
- new_paths.push_back(new_p);
- paths_to_cover.push_back(new_end);
- from = pos + 1;
- pos = p->FindFirst(end, from);
- }
- }
- BidirectionalPath max = **new_paths.begin();
- size_t covered_edges_max = 0;
- size_t min_size = max.Size();
- for (BidirectionalPath* p : new_paths) {
- size_t cov_edges = 0;
- for (BidirectionalPath* e : paths_to_cover) {
- vector<size_t> poses = p->FindAll(e->Back());
- for (size_t pos : poses) {
- if (EqualBegins(*p, pos, *e, e->Size() - 1, true)) {
- cov_edges++;
- break;
- }
- }
- }
- if (cov_edges > covered_edges_max || (cov_edges == covered_edges_max && min_size > p->Size())) {
- DEBUG("cov_e " << cov_edges << " s " << p->Size());
- max.Clear();
- max.PushBack(*p);
- covered_edges_max = cov_edges;
- min_size = max.Size();
- }
- }
- for (BidirectionalPath* p : new_paths) {
- delete p;
- }
- for (BidirectionalPath* p : paths_to_cover) {
- delete p;
- }
- BidirectionalPath result = max.SubPath(path.Size());
- DEBUG("res");
- result.Print();
- return result;
- }
-
- int CheckPairInfo(const BidirectionalPath& path, const BidirectionalPath& result_end, int to_add) const {
- while (to_add < (int)result_end.Size()) {
- map<size_t, double> weights = weight_counter_.FindPairInfoFromPath(path, 0, path.Size(), result_end, to_add, to_add + 1);
- double weight_to_edge = 0.0;
- for (auto iter = weights.begin(); iter != weights.end(); ++iter) {
- weight_to_edge += iter->second;
- }
- if (math::gr(weight_to_edge, 0.0)) {
- break;
- }
- to_add++;
- }
- return to_add;
- }
-
- EdgeContainer TryToScaffold(const BidirectionalPath& path, const vector<BidirectionalPath*>& paths) const {
- if (paths.size() == 0) {
- return EdgeContainer();
- }
- DEBUG("Simple Scaffolding")
- for (BidirectionalPath* p : paths) {
- p->Print();
- }
- BidirectionalPath max_end = simple_scaffolder_.FindMaxCommonPath(paths, search_dist_);
- if (max_end.Size() == 0) {
- return EdgeContainer();
- }
- BidirectionalPath result_end = ChooseFromEnds(path, paths, max_end);
- int to_add = result_end.FindFirst(max_end);
- result_end.Print();
- EdgeContainer result;
- to_add = CheckPairInfo(path, result_end, to_add);
- if (to_add < 0 || to_add >= (int) result_end.Size()) {
- return EdgeContainer();
- }
- size_t gap_length = result_end.Length() - result_end.LengthAt(to_add);
- DEBUG(" edge to add " << g_.int_id(result_end.At(to_add)) << " with length " << gap_length);
- result.push_back(EdgeWithDistance(result_end.At(to_add), gap_length));
- return result;
- }
-
- const Graph& g_;
- shared_ptr<PairedInfoLibrary> lib_;
- size_t search_dist_;
- mutable PathsWeightCounter weight_counter_;
- const GraphCoverageMap cov_map_;
- NextPathSearcher path_searcher_;
- LongReadsUniqueEdgeAnalyzer unique_edge_analyzer_;
- SimpleScaffolding simple_scaffolder_;
-
- DECL_LOGGER("MatePairExtensionChooser");
-};
-
-class CoordinatedCoverageExtensionChooser: public ExtensionChooser {
-public:
- CoordinatedCoverageExtensionChooser(const Graph& g, CoverageAwareIdealInfoProvider& coverage_provider, size_t max_edge_length_in_repeat, double delta) :
- ExtensionChooser(g), provider_(coverage_provider), max_edge_length_in_repeat_(max_edge_length_in_repeat), delta_(delta) {
- }
-
- EdgeContainer Filter(const BidirectionalPath& path,
- const EdgeContainer& edges) const override {
-
-
- double path_coverage = provider_.EstimatePathCoverage(path);
- if (math::eq(path_coverage, -1.0)) {
- DEBUG("Path coverage can't be calculated");
- return EdgeContainer();
- }
- DEBUG("Path coverage is " << path_coverage);
-
- for (auto e_d : edges) {
- if (path.Contains(g_.EdgeEnd(e_d.e_))) {
- DEBUG("Avoid to create loops");
- return EdgeContainer();
- }
- }
- return FindExtensionTroughRepeat(edges, path_coverage);
- }
-
-private:
-
- void UpdateCanBeProcessed(VertexId v,
- std::queue<VertexId>& can_be_processed) const {
- DEBUG("Updating can be processed");
- for (EdgeId e : g_.OutgoingEdges(v)) {
- VertexId neighbour_v = this->g_.EdgeEnd(e);
- if (g_.length(e) < max_edge_length_in_repeat_) {
-                DEBUG("Adding vertex " << neighbour_v.int_id()
-                        << " through edge " << g_.str(e));
- can_be_processed.push(neighbour_v);
- }
- }
- }
-
- GraphComponent<Graph> GetRepeatComponent(const VertexId start) const {
- set<VertexId> vertices_of_component;
- vertices_of_component.insert(start);
- std::queue<VertexId> can_be_processed;
- UpdateCanBeProcessed(start, can_be_processed);
- while (!can_be_processed.empty()) {
- VertexId v = can_be_processed.front();
- can_be_processed.pop();
- if (vertices_of_component.count(v) != 0) {
-
- DEBUG("Component is too complex");
- return GraphComponent<Graph>(g_, false);
- }
- DEBUG("Adding vertex " << g_.str(v) << " to component set");
- vertices_of_component.insert(v);
- UpdateCanBeProcessed(v, can_be_processed);
- }
-
- GraphComponent<Graph> gc(g_, vertices_of_component.begin(),
- vertices_of_component.end());
- return gc;
- }
-
- EdgeContainer FinalFilter(const EdgeContainer& edges,
- EdgeId edge_to_extend) const {
- EdgeContainer result;
- for (auto e_with_d : edges) {
- if (e_with_d.e_ == edge_to_extend) {
- result.push_back(e_with_d);
- }
- }
- return result;
- }
-
- bool GoodExtension(EdgeId e, double path_coverage) const {
- if (math::ge(g_.coverage(e), path_coverage - path_coverage * delta_)) {
- return true;
- }
- else {
- return false;
- }
- }
-
- EdgeContainer FindExtensionTroughRepeat(const EdgeContainer& edges, double path_coverage) const {
- set<EdgeId> good_extensions;
- for(auto edge : edges) {
-
- if(g_.length(edge.e_) > max_edge_length_in_repeat_) {
- if(GoodExtension(edge.e_, path_coverage)) {
- good_extensions.insert(edge.e_);
- continue;
- }
- }
-
- GraphComponent<Graph> gc = GetRepeatComponent(g_.EdgeEnd(edge.e_));
- if(gc.v_size() == 0) {
- return EdgeContainer();
- }
-
- for (auto e : gc.edges()) {
- if (g_.length(e) > max_edge_length_in_repeat_) {
- DEBUG("Repeat component contains long edges");
- return EdgeContainer();
- }
- }
-
- for (auto v : gc.sinks()) {
- for (auto e : g_.OutgoingEdges(v)) {
- if(GoodExtension(e, path_coverage)) {
- good_extensions.insert(edge.e_);
- }
- }
- }
- }
-
- DEBUG("Number of good extensions is " << good_extensions.size());
-
- if (good_extensions.size() != 1) {
- DEBUG("Returning");
- return EdgeContainer();
- }
-
- DEBUG("Filtering... Extend with edge " << good_extensions.begin()->int_id());
- return FinalFilter(edges, *good_extensions.begin());
- }
-
- //fixme codestyle
- CoverageAwareIdealInfoProvider provider_;
- const size_t max_edge_length_in_repeat_;
- const double delta_;
-protected:
- DECL_LOGGER("CoordCoverageExtensionChooser");
-};
-
-}
-#endif /* EXTENSION_HPP_ */
diff --git a/src/debruijn/path_extend/ideal_pair_info.hpp b/src/debruijn/path_extend/ideal_pair_info.hpp
deleted file mode 100644
index 34c10f7..0000000
--- a/src/debruijn/path_extend/ideal_pair_info.hpp
+++ /dev/null
@@ -1,129 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-/*
- * ideal_pair_info.hpp
- *
- * Created on: Oct 10, 2013
- * Author: ira
- */
-
-#ifndef IDEAL_PAIR_INFO_HPP_
-#define IDEAL_PAIR_INFO_HPP_
-#include <vector>
-#include "graph_pack.hpp"
-
-namespace path_extend {
-
-using debruijn_graph::Graph;
-using debruijn_graph::EdgeId;
-
-class IdealPairInfoCounter {
-public:
- IdealPairInfoCounter(const Graph& g, int d_min, int d_max, size_t read_size,
- const std::map<int, size_t>& is_distribution)
- : g_(g),
- d_min_(d_min),
- d_max_(d_max),
- read_size_(read_size) {
- size_t sum = 0;
- for (auto iter = is_distribution.begin(); iter != is_distribution.end();
- ++iter) {
- sum += iter->second;
- }
- for (auto iter = is_distribution.begin(); iter != is_distribution.end();
- ++iter) {
- insert_size_distrib_[iter->first] = (double) iter->second
- / (double) sum;
- }
- PreCalculateNotTotalReadsWeight();
- }
-
- double IdealPairedInfo(EdgeId e1, EdgeId e2, int dist, bool additive = false) const {
- std::pair<size_t, size_t> lengths = make_pair(g_.length(e1), g_.length(e2));
- if (pi_.find(lengths) == pi_.end()) {
- pi_.insert(make_pair(lengths, std::map<int, double>()));
- }
- std::map<int, double>& weights = pi_[lengths];
- if (weights.find(dist) == weights.end()) {
- weights.insert(make_pair(dist, IdealPairedInfo(g_.length(e1), g_.length(e2), dist, additive)));
- }
- return weights[dist];
- }
-
- double IdealPairedInfo(size_t len1, size_t len2, int dist, bool additive = false) const {
- double result = 0.0;
- for (auto it = insert_size_distrib_.lower_bound(max(d_min_, 0)); it != insert_size_distrib_.upper_bound(d_max_); ++it) {
- result += it->second * (double) IdealReads(len1, len2, dist, it->first, additive);
- }
- return result;
- }
-
-private:
-
- double IdealReads(size_t len1_1, size_t len2_1, int dist,
- size_t is_1, bool additive) const {
- int len1 = (int) len1_1;
- int len2 = (int) len2_1;
- int is = (int) is_1;
- int k = (int) g_.k();
- int rs = (int) read_size_;
- double w = 0.0;
- if (dist == 0) {
- return len1 - is + 2 * rs - 2 - k + 1;
- }
- if (dist < 0) {
- int tmp = len1;
- len1 = len2;
- len2 = tmp;
- dist = -dist;
- }
- int gap_len = dist - len1;
- int right_long = is - rs - 1;
- int right_short = gap_len + len2 - 1;
- int left_short = gap_len + k + 1 - rs;
- int left_long = is - rs - len1 - rs + (k + 1);
- int right = std::min(right_long, right_short);
- int left = std::max(left_short, left_long);
- int result = std::max(right - left + 1, 0);
- int right_e2 = std::min(gap_len + len2 - rs + k, right_long);
- int left_e2 = std::max(left_long, gap_len);
- int right_not_full = std::max(right - right_e2, 0);
- int left_not_full = std::max(left_e2 - left, 0);
- w = result;
- if (additive){
- w = w - not_total_weights_right_[right_not_full]- not_total_weights_left_[left_not_full];
- }
- return w > 0.0 ? w : 0.0;
- }
-
- void PreCalculateNotTotalReadsWeight() {
- not_total_weights_right_.push_back(0.0);
- not_total_weights_left_.push_back(0.0);
- for (int i = 1; i < int(read_size_) - int(g_.k()) + 1; ++i) {
- double right = (double(i) + double(g_.k()) /2.0) / (double) read_size_;
- double left = 1 - right;
- not_total_weights_right_.push_back(not_total_weights_right_[i-1] + right);
- not_total_weights_left_.push_back(not_total_weights_left_[i-1] + left);
- }
- }
-
- const Graph& g_;
- int d_min_;
- int d_max_;
- size_t read_size_;
- std::vector<double> weights_;
- std::map<int, double> insert_size_distrib_;
- mutable std::map<std::pair<size_t, size_t>, std::map<int, double> > pi_;
- std::vector<double> not_total_weights_right_;
- std::vector<double> not_total_weights_left_;
-protected:
- DECL_LOGGER("PathExtendPI");
-};
-} // path extend
-
-#endif /* IDEAL_PAIR_INFO_HPP_ */
diff --git a/src/debruijn/path_extend/loop_traverser.hpp b/src/debruijn/path_extend/loop_traverser.hpp
deleted file mode 100644
index e0b04a9..0000000
--- a/src/debruijn/path_extend/loop_traverser.hpp
+++ /dev/null
@@ -1,213 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-/*
- * loop_traverser.hpp
- *
- * Created on: Jan 28, 2013
- * Author: ira
- */
-
-#ifndef LOOP_TRAVERSER_H_
-#define LOOP_TRAVERSER_H_
-
-#include "path_extender.hpp"
-#include "pe_resolver.hpp"
-#include "path_visualizer.hpp"
-
-namespace path_extend {
-
-class LoopTraverser {
-
- const Graph& g_;
- GraphCoverageMap& covMap_;
- shared_ptr<ContigsMaker> extender_;
-private:
- EdgeId FindStart(const set<VertexId>& component_set) const{
- EdgeId result;
- for (auto it = component_set.begin(); it != component_set.end(); ++it) {
- for (auto eit = g_.in_begin(*it); eit != g_.in_end(*it); ++eit) {
- if (component_set.count(g_.EdgeStart(*eit)) == 0) {
- if (result != EdgeId()) {
- return EdgeId();
- }
- result = *eit;
- }
- }
- }
- return result;
- }
-
- EdgeId FindFinish(const set<VertexId>& component_set) {
- EdgeId result;
- for (auto it = component_set.begin(); it != component_set.end(); ++it) {
- for (auto I = g_.out_begin(*it), E = g_.out_end(*it);
- I != E; ++I) {
- if (component_set.count(g_.EdgeEnd(*I)) == 0) {
- if (result != EdgeId()) {
- return EdgeId();
- }
- result = *I;
- }
- }
- }
- return result;
- }
-
- void TryToGrow(BidirectionalPath* path, EdgeId component_entrance) {
- BidirectionalPath clone = *path;
- extender_->GrowPathSimple(*path);
- if (!path->Contains(component_entrance)) {
- DEBUG("Grown paths do not contain initial edges, rolling back");
- path->Clear();
- path->PushBack(clone);
- }
- }
-
- bool IsEndInsideComponent(const BidirectionalPath &path,
- const set <VertexId> &component_set) {
- if (component_set.count(g_.EdgeStart(path.Front())) == 0) {
- return false;
- }
- for (size_t i = 0; i < path.Size(); ++i) {
- if (component_set.count(g_.EdgeEnd(path.At(i))) == 0)
- return false;
- }
- return true;
- }
-
-
- bool IsEndInsideComponent(const BidirectionalPath &path, EdgeId component_entrance,
- const set <VertexId> &component_set,
- bool conjugate = false) {
- int i = path.FindLast(component_entrance);
- VERIFY_MSG(i != -1, "Component edge is not found in the path")
-
- if ((size_t) i == path.Size() - 1) {
- if (conjugate)
- return component_set.count(g_.conjugate(g_.EdgeEnd(path.Back()))) > 0;
- else
- return component_set.count(g_.EdgeEnd(path.Back())) > 0;
- }
-
- if (conjugate)
- return IsEndInsideComponent(path.SubPath((size_t) i + 1).Conjugate(), component_set);
- else
- return IsEndInsideComponent(path.SubPath((size_t) i + 1), component_set);
- }
-
- void TraverseLoop(EdgeId start, EdgeId end, const set<VertexId>& component_set) {
- DEBUG("start " << g_.int_id(start) << " end " << g_.int_id(end));
- BidirectionalPathSet coveredStartPaths =
- covMap_.GetCoveringPaths(start);
- BidirectionalPathSet coveredEndPaths =
- covMap_.GetCoveringPaths(end);
-
- for (auto it_path = coveredStartPaths.begin();
- it_path != coveredStartPaths.end(); ++it_path) {
- if ((*it_path)->FindAll(end).size() > 0) {
- return;
- }
- }
- if (coveredStartPaths.size() < 1 or coveredEndPaths.size() < 1) {
- DEBUG("TraverseLoop STRANGE SITUATION: start " << coveredStartPaths.size() << " end " << coveredEndPaths.size());
- return;
- }
-
- if (coveredStartPaths.size() > 1 or coveredEndPaths.size() > 1) {
- DEBUG("Ambiguous situation in path joining, quitting");
- return;
- }
-
- BidirectionalPath* startPath = *coveredStartPaths.begin();
- BidirectionalPath* endPath = *coveredEndPaths.begin();
- if ((*startPath) == endPath->Conjugate()){
- return;
- }
-
- TryToGrow(startPath, start);
- TryToGrow(endPath->GetConjPath(), g_.conjugate(end));
-
- //Checking that paths ends are within component
- if (!IsEndInsideComponent(*startPath, start, component_set) ||
- !IsEndInsideComponent(*endPath->GetConjPath(), g_.conjugate(end), component_set, true)) {
- DEBUG("Some path goes outside of the component")
- return;
- }
-
- size_t commonSize = startPath->CommonEndSize(*endPath);
- size_t nLen = 0;
- DEBUG("Str " << startPath->Size() << ", end" << endPath->Size());
- if (commonSize == 0 && !startPath->Empty() > 0 && !endPath->Empty()) {
- DEBUG("Estimating gap size");
- VertexId lastVertex = g_.EdgeEnd(startPath->Back());
- VertexId firstVertex = g_.EdgeStart(endPath->Front());
-
- if (firstVertex == lastVertex) {
- nLen = 0;
- } else {
- DijkstraHelper<Graph>::BoundedDijkstra dijkstra(DijkstraHelper<Graph>::CreateBoundedDijkstra(g_, 1000, 3000));
- dijkstra.Run(lastVertex);
- vector<EdgeId> shortest_path = dijkstra.GetShortestPathTo(g_.EdgeStart(endPath->Front()));
-
- if (shortest_path.size() == 0) {
- DEBUG("Failed to find closing path");
- return;
- } else if (!IsEndInsideComponent(BidirectionalPath(g_, shortest_path), component_set)) {
- DEBUG("Closing path is outside the component");
- return;
- } else {
- for (size_t i = 0; i < shortest_path.size(); ++i) {
- nLen += g_.length(shortest_path[i]);
- }
- nLen += g_.k();
- }
- }
- }
- if (commonSize < endPath->Size()){
- startPath->PushBack(endPath->At(commonSize), (int) nLen);
- }
- for (size_t i = commonSize + 1; i < endPath->Size(); ++i) {
- startPath->PushBack(endPath->At(i), endPath->GapAt(i), endPath->TrashPreviousAt(i), endPath->TrashCurrentAt(i));
- }
- DEBUG("travers");
- startPath->Print();
- endPath->Print();
- DEBUG("conj");
- endPath->GetConjPath()->Print();
- endPath->Clear();
- }
-
-public:
- LoopTraverser(const Graph& g, GraphCoverageMap& coverageMap, shared_ptr<ContigsMaker> extender) :
- g_(g), covMap_(coverageMap), extender_(extender) {
- }
-
- void TraverseAllLoops() {
- DEBUG("TraverseAllLoops");
- shared_ptr<GraphSplitter<Graph>> splitter = LongEdgesExclusiveSplitter<Graph>(g_, 1000);
- while (splitter->HasNext()) {
- GraphComponent<Graph> component = splitter->Next();
- if (component.v_size() > 10)
- continue;
- set<VertexId> component_set(component.v_begin(), component.v_end());
- EdgeId start = FindStart(component_set);
- EdgeId finish = FindFinish(component_set);
- if (start == EdgeId() || finish == EdgeId()) {
- continue;
- }
- TraverseLoop(start, finish, component_set);
- }
-
- }
-protected:
- DECL_LOGGER("LoopTraverser");
-};
-
-}
-
-#endif /* LOOP_TRAVERSER_H_ */
diff --git a/src/debruijn/path_extend/next_path_searcher.hpp b/src/debruijn/path_extend/next_path_searcher.hpp
deleted file mode 100644
index 37f458c..0000000
--- a/src/debruijn/path_extend/next_path_searcher.hpp
+++ /dev/null
@@ -1,1031 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-/*
- * next_path_searcher.hpp
- *
- * Created on: Sep 27, 2013
- * Author: ira
- */
-#pragma once
-
-#include <set>
-#include <vector>
-#include <map>
-
-#include "../graph_pack.hpp"
-#include "../debruijn_graph.hpp"
-#include "bidirectional_path.hpp"
-#include "pe_utils.hpp"
-
-namespace path_extend {
-using debruijn_graph::Graph;
-using std::set;
-using std::vector;
-using std::multimap;
-
-class Edge {
-public:
- Edge(const Graph& g, EdgeId id, Edge* prev_e, size_t dist, int gap = 0)
- : g_(g),
- id_(id),
- prev_edge_(prev_e),
- dist_(dist),
- gap_(gap) {
- }
- ~Edge() {
- for (size_t i = 0; i < out_edges_.size(); ++i) {
- delete out_edges_[i];
- }
- for (size_t i = 0; i < not_out_edges_.size(); ++i) {
- delete not_out_edges_[i];
- }
- }
- Edge* AddOutEdge(EdgeId edge, int gap = 0) {
- return AddIfNotExist(edge, gap, out_edges_);
- }
- Edge* AddIncorrectOutEdge(EdgeId edge, int gap = 0) {
- for (size_t i = 0; i < out_edges_.size(); ++i) {
- if (out_edges_[i]->GetId() == edge) {
- not_out_edges_.push_back(out_edges_[i]);
- out_edges_.erase(out_edges_.begin() + i);
- break;
- }
- }
- return AddIfNotExist(edge, gap, not_out_edges_);
- }
- Edge* AddPath(const BidirectionalPath& path, size_t from) {
- Edge* e = this;
- for (size_t i = from; i < path.Size(); ++i) {
- e = e->AddOutEdge(path.At(i), path.GapAt(i));
- }
- return e;
- }
-
- int GetOutEdgeIndex(EdgeId edge) const {
- return GetEdgeIndex(edge, out_edges_);
- }
-
- int GetIncorrectEdgeIndex(EdgeId edge) const {
- return GetEdgeIndex(edge, not_out_edges_);
- }
-
- size_t OutSize() const {
- return out_edges_.size();
- }
-
- Edge* GetOutEdge(size_t i) const {
- return out_edges_[i];
- }
-
- BidirectionalPath GetPrevPath(size_t from) const {
- BidirectionalPath result(g_);
- vector<pair<EdgeId, int> > edges_wgaps;
- const Edge* e = this;
- edges_wgaps.push_back(make_pair(e->GetId(), e->Gap()));
- while (e->prev_edge_) {
- e = e->prev_edge_;
- edges_wgaps.push_back(make_pair(e->GetId(), e->Gap()));
- }
- for (int i = (int) edges_wgaps.size() - 1 - (int) from; i >= 0; i--) {
- result.PushBack(edges_wgaps[i].first, edges_wgaps[i].second);
- }
- return result;
- }
-
- bool IsCorrect() {
- Edge* e = this;
- while (e->prev_edge_) {
- if (e->prev_edge_->GetOutEdgeIndex(e->GetId()) == -1) {
- TRACE("after " << g_.int_id(e->prev_edge_->GetId()) << " souldn't go " << g_.int_id(e->GetId()));
- return false;
- }
- e = e->prev_edge_;
- }
- return true;
- }
-
- bool EqualBegins(const BidirectionalPath& path, int pos) {
- BidirectionalPath p = this->GetPrevPath(0);
- return path_extend::EqualBegins(path, (size_t) pos, p, p.Size() - 1, true);
- }
- size_t Length() const {
- return dist_;
- }
- set<Edge*> GetPrevEdges(size_t dist) {
- size_t init_len = Length();
- Edge* e = this;
- set<Edge*> result;
- while (e && init_len - e->Length() < dist) {
- result.insert(e);
- e = e->prev_edge_;
- }
- return result;
- }
- EdgeId GetId() const {
- return id_;
- }
- int Gap() const {
- return gap_;
- }
-private:
- Edge* AddIfNotExist(EdgeId e, int gap, vector<Edge*>& vect) {
- int i = GetEdgeIndex(e, vect);
- if (i != -1) {
- return vect[i];
- }
- size_t dist = dist_ + gap + g_.length(e);
- vect.push_back(new Edge(g_, e, this, dist, gap));
- return vect.back();
- }
- int GetEdgeIndex(EdgeId e, const vector<Edge*>& vect) const {
- for (size_t i = 0; i < vect.size(); ++i) {
- if (vect[i]->GetId() == e)
- return (int) i;
- }
- return -1;
- }
- const Graph& g_;
- EdgeId id_;
- vector<Edge*> out_edges_;
- vector<Edge*> not_out_edges_;
- Edge* prev_edge_;
- size_t dist_;
- int gap_;
-
-protected:
- DECL_LOGGER("NextPathSearcher")
-};
-struct PathWithDistance {
- PathWithDistance(BidirectionalPath p, int dist)
- : p_(p),
- dist_(dist) {
-
- }
- BidirectionalPath p_;
- int dist_;
-};
-class NextPathSearcher {
-public:
- typedef set<EdgeWithDistance, EdgeWithDistance::DistanceComparator> EdgeSet;
- typedef multimap<EdgeId, PathWithDistance> ConstructedPathT;
-
- NextPathSearcher(const Graph& g, const GraphCoverageMap& cover_map, size_t search_dist, PathsWeightCounter weight_counter, size_t max_number_of_paths_to_search);
- BidirectionalPathSet FindNextPaths(const BidirectionalPath& path, EdgeId begin_edge, bool jump = true) const ;
- vector<BidirectionalPath*> ScaffoldTree(const BidirectionalPath& path) const;
-private:
- bool IsOutTip(VertexId v) const;
- bool IsInTip(VertexId v) const;
- vector<Edge*> GrowPath(const BidirectionalPath& init_path, Edge* e) const;
- Edge* AddEdge(const BidirectionalPath& init_path, Edge* prev_e, EdgeId e_to_add, int gap) const;
- bool AnalyzeBubble(const BidirectionalPath& p, EdgeId buldge_edge, size_t gap, Edge* prev_edge) const;
-
- void ScaffoldTip(const BidirectionalPath& path, Edge * current_path, vector<Edge*>& result_edges, vector<Edge*>& stopped_paths, vector<Edge*>& to_add,
- bool jump) const;
- void ScaffoldChristmasTree(const BidirectionalPath& path, Edge * current_path, vector<Edge*>& to_add, size_t min_length_from) const;
- void Scaffold(const BidirectionalPath& init_path, Edge* current_path, ConstructedPathT& constructed_paths, set<EdgeId>& seeds, bool is_gap) const;
- void FindScaffoldingCandidates(const BidirectionalPath& init_path, Edge* current_path, EdgeSet& candidate_set, size_t min_length_from) const;
- void FindScaffoldingCandidates(EdgeId e, size_t distance_to_tip, vector<EdgeWithDistance>& jump_edges) const;
- void OrderScaffoldingCandidates(EdgeSet& candidate_set, const BidirectionalPath& init_path, Edge* current_path, ConstructedPathT& constructed_paths, set<EdgeId>& seeds, bool is_gap) const;
- void RemoveRedundant(ConstructedPathT& constructed_paths) const;
- void ConvertPaths(const ConstructedPathT& constructed_paths, Edge* current_path, vector<Edge*>& to_add) const;
- void ProcessScaffoldingCandidate(EdgeWithDistance& e, EdgeSet& candidate_set, Edge* current_path, size_t grown_path_len,
- ConstructedPathT& constructed_paths, bool is_gap) const;
- int EstimateGapForPath(EdgeSet& candidate_set, const BidirectionalPath& p) const;
- void AddConstructedPath(const BidirectionalPath& cp, size_t from, int gap, ConstructedPathT& constructed_paths) const;
- void FilterBackPaths(BidirectionalPathSet& back_paths, EdgeId edge_to_reach, BidirectionalPathSet& reached_paths, size_t max_len = -1UL) const;
- void JoinPathsByGraph(ConstructedPathT& constructed_paths) const;
- void JoinPathsByPI(ConstructedPathT& constructed_paths) const;
- void JoinPathsByDejikstra(const BidirectionalPath& init_path, ConstructedPathT& constructed_paths) const;
- map<PathWithDistance*, size_t> FindDistances(const BidirectionalPath& p, vector<PathWithDistance*>& paths) const;
- void FindConnections(vector<PathWithDistance*>& all_paths, map<PathWithDistance*, set<PathWithDistance*> >& connections) const;
- vector<vector<PathWithDistance*> > FilterConnections(vector<PathWithDistance*>& all_paths, map<PathWithDistance*, set<PathWithDistance*> >& connections) const;
- void ConnectPaths(const BidirectionalPath& init_path, vector<vector<PathWithDistance*> >& variants) const;
-
- const Graph& g_;
- const GraphCoverageMap& cover_map_;
- size_t search_dist_;
- PathsWeightCounter weight_counter_;
- size_t long_edge_len_;
- size_t max_paths_;
-
-protected:
- DECL_LOGGER("NextPathSearcher")
-};
-
-inline NextPathSearcher::NextPathSearcher(const Graph& g, const GraphCoverageMap& cover_map, size_t search_dist, PathsWeightCounter weight_counter, size_t max_number_of_paths_to_search)
- : g_(g),
- cover_map_(cover_map),
- search_dist_(search_dist),
- weight_counter_(weight_counter),
- long_edge_len_(500),
- max_paths_(max_number_of_paths_to_search) {
-
-}
-
-inline vector<BidirectionalPath*> NextPathSearcher::ScaffoldTree(const BidirectionalPath& path) const {
- Edge* start_e = new Edge(g_, path.At(0), NULL, g_.length(path.At(0)) + path.GapAt(0), path.GapAt(0));
- Edge* e = start_e->AddPath(path, 1);
-    //jump forward when there are too many paths
- DEBUG("Scaffolding tree for edge " << g_.int_id(start_e->GetId()));
- path.Print();
- vector<Edge*> result_edges;
- ScaffoldChristmasTree(path, e, result_edges, 0);
- std::vector<BidirectionalPath*> result_paths;
- for (size_t i = 0; i < result_edges.size(); ++i) {
- BidirectionalPath result_path = result_edges[i]->GetPrevPath(path.Size());
- if (!result_path.Empty())
- result_paths.push_back(new BidirectionalPath(result_path));
- }
- if (result_paths.size() != 1) {
- for (size_t i = 0; i < result_paths.size(); ++i) {
- delete result_paths[i];
- }
- result_paths.clear();
- result_edges.clear();
- ScaffoldChristmasTree(path, e, result_edges, long_edge_len_);
- for (size_t i = 0; i < result_edges.size(); ++i) {
- BidirectionalPath result_path = result_edges[i]->GetPrevPath(path.Size());
- if (!result_path.Empty())
- result_paths.push_back(new BidirectionalPath(result_path));
- }
- }
- delete start_e;
- DEBUG( "for path " << path.GetId() << " several extension " << result_paths.size());
- return result_paths;
-}
-
-inline BidirectionalPathSet NextPathSearcher::FindNextPaths(const BidirectionalPath& path, EdgeId begin_edge, bool jump) const {
- TRACE("begin find next paths");
- vector<Edge*> grow_paths;
- vector<Edge*> result_edges;
- vector<Edge*> stopped_paths;
- size_t max_len = search_dist_ + path.Length();
- std::set<Edge*> used_edges;
- int count_to_grow = 1;
-
- Edge* start_e = new Edge(g_, path.At(0), NULL, g_.length(path.At(0)) + path.GapAt(0), path.GapAt(0));
- Edge* e = start_e->AddPath(path, 1);
- if (begin_edge != path.Back()) {
- e = e->AddOutEdge(begin_edge);
- DEBUG( "Try to find next path for path with edge " << g_.int_id(begin_edge));
- } else {
- DEBUG( "Try to search for path with last edge " << g_.int_id(path.Back()) << " Scaffolding: " << jump << ", next edges " << g_.OutgoingEdgeCount(g_.EdgeEnd(path.Back())));
- }
- grow_paths.push_back(e);
-
- size_t ipath = 0;
- DEBUG("Processing paths");
- while (ipath < grow_paths.size()) {
- DEBUG("Processing path " << ipath << " of " << grow_paths.size() << " need to grow " << count_to_grow);
- Edge* current_path = grow_paths[ipath++];
- DEBUG(" edge " << g_.int_id(current_path->GetId()));
- if (used_edges.count(current_path) > 0) {
- count_to_grow--;
- continue;
- }
- used_edges.insert(current_path);
- if (current_path->Length() >= max_len && current_path->IsCorrect()) {
- result_edges.push_back(current_path);
- count_to_grow--;
- continue;
- }
- DEBUG("Growing path");
- vector<Edge*> to_add = GrowPath(path, current_path);
- DEBUG("Path grown");
- if (to_add.empty() && current_path->IsCorrect()) {
- DEBUG("scaffold tip");
- ScaffoldTip(path, current_path, result_edges, stopped_paths, to_add, jump);
- }
- count_to_grow--;
- for (Edge* e_to_add : to_add) {
- grow_paths.push_back(e_to_add);
- count_to_grow++;
- }
-
- if (count_to_grow > (int) max_paths_ || ipath > max_paths_ * 10) {
- DEBUG("too many paths");
- delete start_e;
- return BidirectionalPathSet();
- }
- }
- DEBUG("Paths processed");
-
- BidirectionalPathSet result_paths;
- TRACE("adding paths " << result_edges.size());
- for (size_t i = 0; i < result_edges.size(); ++i) {
- BidirectionalPath result_path = result_edges[i]->GetPrevPath(path.Size());
- if (!result_path.Empty()) {
- result_paths.insert(new BidirectionalPath(result_path));
- }
- }
- delete start_e;
- DEBUG( "for path " << path.GetId() << " several extension " << result_paths.size());
- return result_paths;
-}
-
-inline bool NextPathSearcher::AnalyzeBubble(const BidirectionalPath& p, EdgeId buldge_edge, size_t gap, Edge* prev_edge) const {
- EdgeId max_edge = buldge_edge;
- if (prev_edge->GetOutEdgeIndex(buldge_edge) != -1 || prev_edge->GetIncorrectEdgeIndex(buldge_edge) != -1) {
- return prev_edge->GetOutEdgeIndex(buldge_edge) != -1;
- }
- double max_w = 0.0;
- for (EdgeId e : g_.OutgoingEdges(g_.EdgeStart(buldge_edge))) {
- double w = weight_counter_.CountPairInfo(p, 0, p.Size(), e, gap);
- if (math::gr(w, max_w) || (math::eq(w, max_w) && g_.int_id(e) < g_.int_id(max_edge))) {
- max_w = w;
- max_edge = e;
- }
- }
- for (EdgeId e : g_.OutgoingEdges(g_.EdgeStart(buldge_edge))) {
- if (e == max_edge) {
- prev_edge->AddOutEdge(e);
- } else {
- prev_edge->AddIncorrectOutEdge(e);
- }
- }
- return max_edge == buldge_edge;
-}
-
-inline Edge* NextPathSearcher::AddEdge(const BidirectionalPath& init_path, Edge* prev_e, EdgeId e_to_add, int gap) const {
- Edge* e = prev_e;
- if (e->GetIncorrectEdgeIndex(e_to_add) != -1) {
- return e;
- }
- int inext = e->GetOutEdgeIndex(e_to_add);
- if (inext != -1) {
- return e->GetOutEdge(inext);
- }
- if (InBuble(e_to_add, g_)) {
- if (AnalyzeBubble(init_path, e_to_add, gap, e)) {
- return e->AddOutEdge(e_to_add);
- }
- } else if (e->GetId() != e_to_add) {
- return e->AddOutEdge(e_to_add);
- }
- return e;
-}
-
-inline vector<Edge*> NextPathSearcher::GrowPath(const BidirectionalPath& init_path, Edge* e) const {
- TRACE("in growing path");
- vector<Edge*> to_add;
- if (!e->IsCorrect()) {
- TRACE("incorrect");
- return to_add;
- }
- for (EdgeId next_edge : g_.OutgoingEdges(g_.EdgeEnd(e->GetId()))) {
- TRACE("Analyze outgoing edge " << g_.int_id(next_edge));
- BidirectionalPathSet cov_paths = cover_map_.GetCoveringPaths(next_edge);
- TRACE("cov_map size " << cov_paths.size());
- bool already_added = false;
- for (auto inext_path = cov_paths.begin(); inext_path != cov_paths.end() && !already_added; ++inext_path) {
- vector<size_t> positions = (*inext_path)->FindAll(next_edge);
- for (size_t pos : positions) {
- if (pos == 0 || e->EqualBegins(**inext_path, (int) pos - 1)) {
- TRACE("Found equal begin");
- Edge* new_edge = AddEdge(init_path, e, (*inext_path)->At(pos), (*inext_path)->GapAt(pos));
- if (new_edge && new_edge != e) {
- TRACE("Add edge")
- to_add.push_back(new_edge);
- already_added = true;
- break;
- }
- }
- }
- }
- }
- if (to_add.size() == 0) {
- for (EdgeId next_edge : g_.OutgoingEdges(g_.EdgeEnd(e->GetId()))) {
- if (next_edge != e->GetId()) {
- to_add.push_back(e->AddOutEdge(next_edge));
- }
- }
- }
- stringstream str;
- str << " for edge " << g_.int_id(e->GetId()) << " add ";
- for (Edge* e1 : to_add) {
- str << " " << g_.int_id(e1->GetId());
- }
- TRACE(str.str());
- return to_add;
-}
-
-inline void NextPathSearcher::ScaffoldTip(const BidirectionalPath& path, Edge * current_path, vector<Edge*>& result_edges, vector<Edge*>& stopped_paths,
- vector<Edge*>& to_add, bool jump) const {
-
- if (jump) {
- //jump forward when tip
- DEBUG("Scaffolding");
- ConstructedPathT constructed_paths;
- set<EdgeId> seeds;
- Scaffold(path, current_path, constructed_paths, seeds, true);
- if (constructed_paths.empty()) {
- stopped_paths.push_back(current_path);
- } else {
- DEBUG("Jumped! " << to_add.size());
- ConvertPaths(constructed_paths, current_path, to_add);
- }
- } else {
- DEBUG("Not scaffolding because going back");
- result_edges.push_back(current_path);
- }
-}
-
-inline void NextPathSearcher::ScaffoldChristmasTree(const BidirectionalPath& path, Edge * current_path, vector<Edge*>& to_add, size_t min_length_from) const {
-    //jump forward when there are too many paths
- DEBUG("========= Scaffolding when too many paths =========");
- ConstructedPathT constructed_paths;
- set<EdgeId> seeds;
- //Scaffold(path, current_path, constructed_paths, seeds, false);
- EdgeSet candidate_set;
- FindScaffoldingCandidates(path, current_path, candidate_set, min_length_from);
- for (EdgeWithDistance e : candidate_set) {
- constructed_paths.insert(make_pair(e.e_,PathWithDistance(BidirectionalPath(g_, e.e_), e.d_)));
- }
- RemoveRedundant(constructed_paths);
- JoinPathsByDejikstra(path, constructed_paths);
-
- RemoveRedundant(constructed_paths);
- DEBUG("Scafolding candidates");
- for (EdgeWithDistance e : candidate_set) {
- DEBUG( "Edge " << g_.int_id(e.e_) << " (" << g_.length(e.e_) << ")" << ", distance " << e.d_);
- }
-
- DEBUG("scaffolding candidates for tree " << constructed_paths.size());
- for (auto iter = constructed_paths.begin(); iter != constructed_paths.end(); ++iter){
- iter->second.p_.Print();
- }
-
- if (constructed_paths.size() > 0 && constructed_paths.upper_bound(constructed_paths.begin()->first) == constructed_paths.end()) {
- DEBUG("All paths from one seed");
- int first_seed_pos = 0;
- auto p = constructed_paths.begin();
- if (constructed_paths.size() > 1) {
- //Searching for path with max number of seeds
- DEBUG("Many paths from one seed " << constructed_paths.size());
- int max_seeds = 0;
- for (auto it = constructed_paths.begin(); it != constructed_paths.end(); ++it) {
- int seed_count = 0;
- for (EdgeId e : seeds) {
- if (it->second.p_.Contains(e)) {
- ++seed_count;
- }
- }
- if (seed_count > max_seeds) {
- max_seeds = seed_count;
- p = it;
- }
- }
- DEBUG("Max seed containing contains " << max_seeds << " seeds");
- //Looking for first seed in that path
- PathWithDistance& winner(p->second);
- first_seed_pos = (int) winner.p_.Size() + 1;
- for (EdgeId e : seeds) {
- int pos = winner.p_.FindFirst(e);
- if (pos != -1)
- first_seed_pos = min(pos, first_seed_pos);
- }
- VERIFY(first_seed_pos != (int) winner.p_.Size() + 1);
- DEBUG("First seed position " << first_seed_pos << " seeds");
- }
- PathWithDistance& path_to_add(p->second);
- int distance = path_to_add.dist_ + (int) path_to_add.p_.Length() - (int) path_to_add.p_.LengthAt(first_seed_pos);
- to_add.push_back(current_path->AddOutEdge(path_to_add.p_[first_seed_pos], distance));
- to_add.back() = to_add.back()->AddPath(path_to_add.p_, first_seed_pos + 1);
- }
- DEBUG("========= Done scaffolding when too many paths =========");
-}
-
-inline void NextPathSearcher::Scaffold(const BidirectionalPath& init_path, Edge* current_path,
- ConstructedPathT& constructed_paths, set<EdgeId>& seeds, bool is_gap) const {
-
- EdgeSet candidate_set;
- FindScaffoldingCandidates(init_path, current_path, candidate_set, 0);
-
- DEBUG("Scafolding candidates");
- for (EdgeWithDistance e : candidate_set) {
- DEBUG( "Edge " << g_.int_id(e.e_) << " (" << g_.length(e.e_) << ")" << ", distance " << e.d_);
- }
-
- OrderScaffoldingCandidates(candidate_set, init_path, current_path, constructed_paths, seeds, is_gap);
-}
-
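-//Collects scaffolding candidates for the whole initial path: every edge longer
-//than min_length_from contributes jump edges found through paired info, edges
-//already at the end of the current path are skipped, and each surviving
-//candidate is stored with the average of its recorded distances (note that the
-//per-hit distance is currently a fixed placeholder, see the commented-out
-//expression below).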
-inline void NextPathSearcher::FindScaffoldingCandidates(const BidirectionalPath& init_path, Edge* current_path, EdgeSet& candidate_set, size_t min_length_from) const {
- set<EdgeId> path_end;
- set<Edge*> prev_edges = current_path->GetPrevEdges(search_dist_);
- for (Edge* e : prev_edges) {
- path_end.insert(e->GetId());
- path_end.insert(g_.conjugate(e->GetId()));
- }
- map<EdgeId, vector<int> > candidates;
- //current_path->GetPrevPath(0).Print();
- TRACE(current_path->Length() << " " << init_path.Length());
- VERIFY(current_path->Length() >= init_path.Length());
- size_t grown_path_len = current_path->Length() - init_path.Length();
- TRACE("Path already grown to " << grown_path_len);
-
- for (size_t i = 0; i < init_path.Size(); ++i) {
- if (g_.length(init_path[i]) <= min_length_from) {
- continue;
- }
- vector<EdgeWithDistance> jump_edges;
- size_t distance_to_tip = init_path.LengthAt(i) + grown_path_len;
- FindScaffoldingCandidates(init_path[i], distance_to_tip, jump_edges);
- for (EdgeWithDistance e : jump_edges) {
- if (candidates.find(e.e_) == candidates.end()) {
- candidates[e.e_] = vector<int>();
- }
- DEBUG("ADD JUMP EDGE FROM " << g_.int_id(init_path[i]) << " TO " << g_.int_id(e.e_))
- candidates[e.e_].push_back(/*max(e.d_ - (int) distance_to_tip, 100)*/100);
- }
- }
-
- for (std::pair<EdgeId, vector<int> > e : candidates) {
- if (path_end.count(e.first) > 0) {
- continue;
- }
- int avg_distance = 0;
- TRACE( "All distances for edge " << g_.int_id(e.first) << " (" << g_.length(e.first) << ")");
- for (int dist : e.second) {
- TRACE(dist);
- avg_distance += dist;
- }
- avg_distance /= (int) e.second.size();
- candidate_set.insert(EdgeWithDistance(e.first, avg_distance));
- }
-}
-
-inline void NextPathSearcher::FindScaffoldingCandidates(EdgeId e, size_t distance_to_tip, vector<EdgeWithDistance>& jump_edges) const {
- if (g_.length(e) < long_edge_len_ || distance_to_tip - g_.length(e) >= search_dist_)
- return;
-
- TRACE("Edge " << g_.int_id(e) << ", length " << g_.length(e));
- TRACE( distance_to_tip << " " << distance_to_tip - g_.length(e) << " " << search_dist_);
-
- set<EdgeId> candidate_edges;
- int min_distance = std::max((int) distance_to_tip - (int) weight_counter_.GetLib()->GetLeftVar(), 0);
- int max_distance = (int) search_dist_ + (int) g_.length(e);
- TRACE("Looking in range " << min_distance << " " << max_distance);
- weight_counter_.FindJumpCandidates(e, min_distance, max_distance, long_edge_len_, candidate_edges);
- weight_counter_.FindJumpEdges(e, candidate_edges, min_distance, max_distance, jump_edges);
- TRACE("Found " << jump_edges.size() << " candidate(s) from this edge");
-}
-
-inline void NextPathSearcher::OrderScaffoldingCandidates(EdgeSet& candidate_set, const BidirectionalPath& init_path,
- Edge* current_path, ConstructedPathT& constructed_paths,
- set<EdgeId>& seeds, bool is_gap) const {
- size_t grown_path_len = current_path->Length() - init_path.Length();
-
- TRACE("Order Scaffolding Candidates, is gap " << is_gap);
- for (EdgeWithDistance e : candidate_set) {
- TRACE("e " << g_.int_id(e.e_));
- if (constructed_paths.count(e.e_) > 0) {
- TRACE("visited");
- continue;
- }
- ProcessScaffoldingCandidate(e, candidate_set, current_path, grown_path_len, constructed_paths, is_gap);
- for (auto p1 = constructed_paths.begin(); p1 != constructed_paths.end(); ++p1) {
- TRACE("current constructed paths " << g_.int_id(p1->first));
- //p1->second.p_.Print();
- }
-
- }
- RemoveRedundant(constructed_paths);
- for (auto it = constructed_paths.begin(); it != constructed_paths.end(); ++it) {
- seeds.insert(it->first);
- }
- JoinPathsByGraph(constructed_paths);
- JoinPathsByPI(constructed_paths);
-
- RemoveRedundant(constructed_paths);
-}
-
-inline void NextPathSearcher::ConvertPaths(const ConstructedPathT& constructed_paths, Edge* current_path, vector<Edge*>& to_add) const {
- for (auto edge = constructed_paths.begin(); edge != constructed_paths.end(); ++edge) {
- to_add.push_back(current_path->AddOutEdge(edge->second.p_[0], edge->second.dist_));
- to_add.back() = to_add.back()->AddPath(edge->second.p_, 1);
- }
-}
-
-inline void NextPathSearcher::RemoveRedundant(ConstructedPathT& constructed_paths) const {
- for (auto edge = constructed_paths.begin(); edge != constructed_paths.end();) {
- if (edge->second.p_.Empty()) {
- edge = constructed_paths.erase(edge);
- } else {
- ++edge;
- }
- }
-}
-
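-//For a single scaffolding candidate: searches backwards from the conjugate of
-//the candidate edge within the remaining search distance; depending on whether
-//the back paths reach the current path or a tip, the candidate (possibly with
-//the recovered intermediate path) is stored in constructed_paths together with
-//an estimated gap.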
-inline void NextPathSearcher::ProcessScaffoldingCandidate(EdgeWithDistance& e, EdgeSet& candidate_set, Edge* current_path, size_t grown_path_len,
- ConstructedPathT& constructed_paths, bool is_gap) const {
- bool looking_for_tip = is_gap;
- //Search back from e until a tip or the maximum allowed length
- TRACE(" === Searching back === ");
- TRACE( "Distances: search = " << search_dist_ << ", grown = " << grown_path_len << ", estimated gap = " << e.d_);
- VERIFY(search_dist_ >= grown_path_len);
- VERIFY((int) search_dist_ >= e.d_);
-
- size_t max_length_back = search_dist_ - grown_path_len;
- TRACE(search_dist_ << " " << grown_path_len);
- TRACE( "Searchin for edge of length " << g_.length(e.e_) << " to dist " << max_length_back);
- NextPathSearcher back_searcher(g_, cover_map_, max_length_back, weight_counter_, max_paths_);
- BidirectionalPath jumped_edge(g_, g_.conjugate(e.e_));
- BidirectionalPathSet back_paths = back_searcher.FindNextPaths(jumped_edge, jumped_edge.Back(), false);
- TRACE(" === DONE SEARCHING === ");
- TRACE("Found " << back_paths.size() << " is tip " << IsInTip(g_.EdgeStart(e.e_)) << " look for tip " << looking_for_tip);
-
- if (back_paths.empty()) {
- if (IsInTip(g_.EdgeStart(e.e_)) && looking_for_tip) {
- TRACE( "Added tip edge " << g_.int_id(e.e_) << " (" << g_.length(e.e_) << ")" << ", distance " << e.d_);
- constructed_paths.insert(make_pair(e.e_, PathWithDistance(BidirectionalPath(g_, e.e_), e.d_)));
- } else if (!IsInTip(g_.EdgeStart(e.e_)) && !looking_for_tip) {
- constructed_paths.insert(make_pair(e.e_, PathWithDistance(BidirectionalPath(g_, e.e_), e.d_)));
- }
- } else {
- TRACE("Found several back paths " << back_paths.size());
- BidirectionalPathSet reached_paths;
- FilterBackPaths(back_paths, g_.conjugate(current_path->GetId()), reached_paths, search_dist_ - grown_path_len);
- //Found a path back to the init path
- if (reached_paths.size() > 0 && !looking_for_tip) {
- TRACE("Found " << reached_paths.size() << " direct path(s) back");
- int i = 0;
- for (BidirectionalPath* p : reached_paths) {
- TRACE("Processing reached path " << i++);
- BidirectionalPath cp = p->Conjugate();
- //Adding the jumped edge since it's not included in the path
- cp.PushBack(e.e_);
- //cp.Print();
- int reached_edge_pos = cp.FindLast(current_path->GetId());
- VERIFY(reached_edge_pos != -1);
- AddConstructedPath(cp, reached_edge_pos + 1, 0, constructed_paths);
- }
- } else if (reached_paths.size() > 0 && looking_for_tip) {
- DEBUG("Impossible: back path reaches tip");
- } else if (looking_for_tip) {
- TRACE( "Found " << back_paths.size() << " path(s) going back to tip");
- int i = 0;
- for (BidirectionalPath* p : back_paths) {
- DEBUG("Processing tip path " << i++);
- BidirectionalPath cp = p->Conjugate();
- //Adding the jumped edge since it's not included in the path
- cp.PushBack(e.e_);
- AddConstructedPath(cp, 0, EstimateGapForPath(candidate_set, cp), constructed_paths);
- }
- }
- }
- for (BidirectionalPath* p : back_paths) {
- delete p;
- }
-}
-
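-//Rough gap estimate averaged over the candidate edges that occur in the given
-//path; non-positive estimates fall back to 100.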
-inline int NextPathSearcher::EstimateGapForPath(EdgeSet& candidate_set, const BidirectionalPath& p) const {
- int gap = 0;
- int count = 0;
- for (EdgeWithDistance e : candidate_set) {
- int pos = p.FindFirst(e.e_);
- if (pos != -1) {
- size_t length_to_e = 0;
- for (int i = 0; i < pos; ++i) {
- length_to_e += p.LengthAt(i);
- }
- gap += e.d_ - (int) length_to_e;
- }
- ++count;
- }
- gap /= count;
- return gap > 0 ? gap : 100;
-}
-
-inline void NextPathSearcher::AddConstructedPath(const BidirectionalPath& cp, size_t from, int gap, ConstructedPathT& constructed_paths) const {
- VERIFY(!cp.Empty());
-
- //Add only if this (candidate, front edge) combination has not been seen yet
- EdgeId candidate = cp.Back();
- for (auto it = constructed_paths.lower_bound(candidate); it != constructed_paths.upper_bound(candidate); ++it) {
- if (it->second.p_.Front() == cp.Front()) {
- return;
- }
- }
-
- TRACE("Adding path starting from " << from);
- constructed_paths.insert(make_pair(candidate, PathWithDistance(cp.SubPath(from), gap)));
- TRACE("add constructed path " << g_.int_id(candidate));
- //cp.Print();
-
- for (size_t i = 0; i < cp.Size() - 1; ++i) {
- EdgeId edge = cp[i];
- for (auto it = constructed_paths.lower_bound(edge); it != constructed_paths.upper_bound(edge); ++it) {
- TRACE("found " << g_.int_id(edge));
- //it->second.p_.Print();
- TRACE("clear");
- it->second.p_.Clear();
- }
- }
-}
-inline bool NextPathSearcher::IsOutTip(VertexId v) const {
- if (g_.OutgoingEdgeCount(v) == 0) {
- return true;
- }
- if (g_.OutgoingEdgeCount(v) != 1) {
- return false;
- }
- EdgeId oute = *g_.OutgoingEdges(v).begin();
- for (EdgeId ine : g_.IncomingEdges(v)) {
- if (oute == ine) {
- return true;
- }
- }
- return false;
-}
-inline bool NextPathSearcher::IsInTip(VertexId v) const {
- if (g_.IncomingEdgeCount(v) == 0) {
- return true;
- }
- if (g_.IncomingEdgeCount(v) != 1) {
- return false;
- }
- EdgeId ine = *g_.IncomingEdges(v).begin();
- for (EdgeId oute : g_.OutgoingEdges(v)) {
- if (oute == ine) {
- return true;
- }
- }
- return false;
-}
-inline void NextPathSearcher::FilterBackPaths(BidirectionalPathSet& back_paths, EdgeId edge_to_reach, BidirectionalPathSet& reached_paths,
- size_t max_len) const {
- TRACE("Searching for proper back paths");
-
- int i = 0;
- for (auto piter = back_paths.begin(); piter != back_paths.end();) {
- BidirectionalPath* p = *piter;
- VERIFY(!p->Empty());
- EdgeId last_e = p->Back();
- VertexId last_v = g_.EdgeEnd(last_e);
- TRACE("Processing path " << i++);
- //p->Print();
- if (p->FindFirst(edge_to_reach) != -1) {
- reached_paths.insert(p);
- ++piter;
- } else if (!IsInTip(last_v) && p->Length() < max_len) {
- ++piter;
- } else {
- delete p;
- piter = back_paths.erase(piter);
- }
- }
-}
-
-inline void NextPathSearcher::JoinPathsByGraph(ConstructedPathT& constructed_paths) const {
- TRACE("== try to join paths using graph ==");
- for (auto p1 = constructed_paths.begin(); p1 != constructed_paths.end(); ++p1) {
- //p1->second.p_.Print();
- }
- TRACE("== printed ==");
-
- //Removing edges whose seed is contained in any other path
- set<EdgeId> to_remove;
- for (auto p1 = constructed_paths.begin(); p1 != constructed_paths.end(); ++p1) {
- if (to_remove.count(p1->first) > 0) {
- continue;
- }
- for (auto p2 = constructed_paths.begin(); p2 != constructed_paths.end(); ++p2) {
- if (p1->first == p2->first || to_remove.count(p2->first) > 0) {
- continue;
- }
- if (p1->second.p_.Contains(p2->first)) {
- to_remove.insert(p2->first);
- }
- }
- }
- for (auto p = constructed_paths.begin(); p != constructed_paths.end(); ) {
- if (to_remove.count(p->first) > 0) {
- p = constructed_paths.erase(p);
- } else {
- ++p;
- }
- }
-}
-
-inline void NextPathSearcher::JoinPathsByPI(ConstructedPathT& constructed_paths) const {
- DEBUG("== try to join paths ===");
- for (auto p1 = constructed_paths.begin(); p1 != constructed_paths.end(); ++p1) {
- p1->second.p_.Print();
- }
- DEBUG("== printed ===");
-
- //Checking paired info
- set<EdgeId> visited;
- for (auto p1 = constructed_paths.begin(); p1 != constructed_paths.end(); ++p1) {
- if (visited.count(p1->first) > 0) {
- continue;
- }
- for (auto p2 = constructed_paths.begin(); p2 != constructed_paths.end(); ++p2) {
- if (p1->first == p2->first) {
- continue;
- }
- BidirectionalPath& path1 = p1->second.p_;
- BidirectionalPath& path2 = p2->second.p_;
- bool has_pi = false;
- for (size_t i = 0; i < path1.Size(); ++i) {
-
- for (size_t j = 0; j < path2.Size(); ++j) {
- size_t len_to_e2 = path2.Length() - path2.LengthAt(j);
- size_t dist = path1.LengthAt(i) + len_to_e2;
- size_t min_dist = (size_t) max(0, (int) dist - (int) weight_counter_.GetLib()->GetLeftVar());
- size_t max_dist = dist + search_dist_;
- DEBUG("try to find pair info between " << g_.int_id(path1[i]) << " and " << g_.int_id(path2[j])
- << " distance from " << min_dist
- <<" to " << max_dist);
- if (path1[i] != path2[j] &&
- weight_counter_.HasPI(path1[i], path2[j], min_dist, max_dist)) {
- has_pi = true;
- break;
- }
- }
- if (has_pi) {
- break;
- }
- }
-
- set<EdgeId> edges_path1;
- for (size_t i = 0; i < path1.Size(); ++i) {
- edges_path1.insert(path1.At(i));
- }
- for (size_t i = 0; i < path2.Size(); ++i) {
- if (edges_path1.count(path2.At(i)) > 0 || edges_path1.count(g_.conjugate(path2.At(i))) > 0) {
- has_pi = false;
- }
- }
- if (has_pi) {
- DEBUG("has pi from ");
- path1.Print();
- DEBUG("to");
- path2.Print();
- path1.PushBack(path2.Front(), 100);
- for (int i = 1; i < (int) path2.Size(); ++i) {
- path1.PushBack(path2[i], path2.GapAt(i), path2.TrashPreviousAt(i), path2.TrashCurrentAt(i));
- }
- DEBUG("new path");
- path1.Print();
- path2.Clear();
- visited.insert(p2->first);
- }
- }
- }
-}
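-//Recursively enumerates orderings of the constructed paths, keeping only
-//permutations in which every path is connected to its predecessor; the
-//wrapper overload below gives up (returns no orderings) for more than five
-//paths to avoid a factorial blow-up.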
-inline void Generate(size_t l, size_t r, vector<size_t> a,
- vector<vector<size_t> >& res, vector<PathWithDistance*>& all_paths, map<PathWithDistance*, set<PathWithDistance*> >& connections) {
- if (l == r) {
- DEBUG("result " << a.size())
- res.push_back(a);
- } else {
- for (size_t i = l; i < r; ++i) {
- if (l > 0 && connections[all_paths[a[l - 1]]].count(all_paths[a[i]]) == 0) {
- DEBUG(" not connected " << a[l-1] << " and " << a[i])
- continue;
- }
- DEBUG(" connected " << l-1 << " and " << i)
- size_t v = a[l];
- a[l] = a[i];
- a[i] = v;
- Generate(l + 1, r, a, res, all_paths, connections);
- v = a[l];
- a[l] = a[i];
- a[i] = v;
- }
- }
-}
-
-inline vector<vector<size_t> > Generate(size_t n, vector<PathWithDistance*>& all_paths, map<PathWithDistance*, set<PathWithDistance*> >& connections) {
- vector<vector<size_t> > result;
- if (n > 5) {
- return result;
- }
- vector<size_t> a;
- for (size_t i = 0; i < n; ++i) {
- a.push_back(i);
- }
- Generate(0, n, a, result, all_paths, connections);
- return result;
-}
-
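-//Runs a bounded Dijkstra from the end of the given path and records, for every
-//reachable target path, the length of the shortest connecting walk plus k;
-//this value is later used as a gap estimate when joining paths.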
-inline map<PathWithDistance*, size_t> NextPathSearcher::FindDistances(const BidirectionalPath& p, vector<PathWithDistance*>& paths) const {
- DEBUG("find distances from e " << g_.int_id(p.Back()))
- map<PathWithDistance*, size_t> result;
- DijkstraHelper<Graph>::BoundedDijkstra dijkstra(DijkstraHelper<Graph>::CreateBoundedDijkstra(g_, search_dist_, 3000));
- dijkstra.Run(g_.EdgeEnd(p.Back()));
- DEBUG("paths size " << paths.size());
- for (auto ipath = paths.begin(); ipath != paths.end(); ++ipath) {
- vector<EdgeId> shortest_path = dijkstra.GetShortestPathTo(g_.EdgeStart((*ipath)->p_.Front()));
- if (shortest_path.size() != 0) {
- int gap = 0;
- for (size_t i = 0; i < shortest_path.size(); ++i) {
- gap += (int) g_.length(shortest_path[i]);
- }
- gap += (int) g_.k();
- result[*ipath] = gap;
- }
- }
- DEBUG("return result " << result.size());
- return result;
-}
-
-inline void NextPathSearcher::FindConnections(vector<PathWithDistance*>& all_paths, map<PathWithDistance*, set<PathWithDistance*> >& connections) const {
- for (auto p1 = all_paths.begin(); p1 != all_paths.end(); ++p1) {
- map<PathWithDistance*, size_t> distances = FindDistances((*p1)->p_, all_paths);
- connections[*p1] = set<PathWithDistance*>();
- for (auto iter = distances.begin(); iter != distances.end(); ++iter) {
- if ((*p1)->p_.Length() + iter->second < search_dist_){
- connections[*p1].insert(iter->first);
- }
- }
- }
-}
-
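-//Joins the ordered paths of a single consistent variant, inserting
-//Dijkstra-estimated gaps (or 100 + k when no distance was found); if several
-//variants remain but they all end in the same edge, only that last edge is
-//kept and attached with a large artificial gap.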
-inline void NextPathSearcher::ConnectPaths(const BidirectionalPath& init_path, vector<vector<PathWithDistance*> >& variants) const {
- if (variants.size() == 1 && variants[0].size() > 0) {
- vector<PathWithDistance*> res = variants[0];
- vector<PathWithDistance*> for_dijkstra;
- BidirectionalPath& path1 = res[0]->p_;
- for_dijkstra.push_back(res[0]);
- map<PathWithDistance*, size_t> distances = FindDistances(init_path, for_dijkstra);
- size_t gap = distances.count(res[0]) > 0 ? distances[res[0]] : 100 + g_.k();
- BidirectionalPath p(path1);
- path1.Clear();
- path1.PushBack(p.Front(), (int)gap);
- path1.PushBack(p.SubPath(1));
- for (size_t i = 1; i < res.size(); ++i) {
- for_dijkstra.clear();
- for_dijkstra.push_back(res[i]);
- BidirectionalPath& path2 = res[i]->p_;
- distances = FindDistances(path1, for_dijkstra);
- gap = distances.count(res[i]) > 0 ? distances[res[i]] : 100 + g_.k();
- path1.PushBack(path2.Front(), (int)gap);
- for (int j = 1; j < (int) path2.Size(); ++j) {
- path1.PushBack(path2[j], path2.GapAt(j), path2.TrashPreviousAt(j), path2.TrashCurrentAt(j));
- }
- path2.Clear();
- }
- } else if (variants.size() > 1) {
- vector<PathWithDistance*> res = variants[0];
- EdgeId last = res.back()->p_.Back();
- for (size_t i = 1; i < variants.size(); ++i) {
- if (last != variants[i].back()->p_.Back()) {
- return;
- }
- }
- for (size_t i = 0; i < res.size(); ++i) {
- res[i]->p_.Clear();
- }
- int gap = (int) 1000 + (int) g_.k();
- res[0]->p_.PushBack(last, gap);
- }
-}
-
-inline vector<vector<PathWithDistance*> > NextPathSearcher::FilterConnections(vector<PathWithDistance*>& all_paths, map<PathWithDistance*, set<PathWithDistance*> >& connections) const {
- vector<vector<PathWithDistance*> > variants;
- DEBUG("filter connections " << connections.size() << " all paths size " << all_paths.size())
- vector<vector<size_t> > permutations = Generate(all_paths.size(), all_paths, connections);
- DEBUG("generated all permutations " << permutations.size());
- for (size_t i = 0; i < permutations.size(); ++i) {
- vector<PathWithDistance*> variant;
- for (size_t j = 0; j < permutations[i].size(); ++j) {
- variant.push_back(all_paths[permutations[i][j]]);
- }
- variants.push_back(variant);
- }
- return variants;
-}
-
-inline void NextPathSearcher::JoinPathsByDejikstra(const BidirectionalPath& init_path, ConstructedPathT& constructed_paths) const {
- DEBUG("== try to join paths by dejikstra ===");
- for (auto p1 = constructed_paths.begin(); p1 != constructed_paths.end(); ++p1) {
- p1->second.p_.Print();
- }
- DEBUG("== printed ===");
-
- vector<PathWithDistance*> all_paths;
- for (auto p1 = constructed_paths.begin(); p1 != constructed_paths.end(); ++p1) {
- if (p1->second.p_.Size() != 0) {
- all_paths.push_back(&p1->second);
- }
- }
- map<PathWithDistance*, set<PathWithDistance*> > connections;
- FindConnections(all_paths, connections);
- vector<vector<PathWithDistance*> > variants = FilterConnections(all_paths, connections);
- ConnectPaths(init_path, variants);
-
- DEBUG("== after to join paths ===");
- for (auto p1 = constructed_paths.begin(); p1 != constructed_paths.end(); ++p1) {
- p1->second.p_.Print();
- }
- DEBUG("== printed ===");
-}
-
-} // namespace path_extend
diff --git a/src/debruijn/path_extend/paired_library.hpp b/src/debruijn/path_extend/paired_library.hpp
deleted file mode 100644
index a940098..0000000
--- a/src/debruijn/path_extend/paired_library.hpp
+++ /dev/null
@@ -1,180 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-/*
- * paired_library.hpp
- *
- * Created on: Feb 19, 2012
- * Author: andrey
- */
-
-#ifndef PAIRED_LIBRARY_HPP_
-#define PAIRED_LIBRARY_HPP_
-
-#include "graph_pack.hpp"
-#include "de/paired_info.hpp"
-#include "ideal_pair_info.hpp"
-
-#include "xmath.h"
-
-namespace path_extend {
-
-using debruijn_graph::Graph;
-using debruijn_graph::EdgeId;
-
-using omnigraph::de::PairedInfoIndexT;
-typedef omnigraph::de::PairInfo<EdgeId> DePairInfo;
-using omnigraph::de::Point;
-
-struct PairedInfoLibrary {
- PairedInfoLibrary(size_t k, const Graph& g, size_t readS, size_t is,
- size_t is_min, size_t is_max, size_t is_var,
- bool is_mp,
- const std::map<int, size_t>& is_distribution)
- : g_(g),
- k_(k),
- read_size_(readS),
- is_(is),
- is_min_(is_min),
- is_max_(is_max),
- is_var_(is_var),
- is_mp_(is_mp),
- single_threshold_(-1.0),
- coverage_coeff_(1.0),
- ideal_pi_counter_(g, (int) is_min, (int) is_max, readS, is_distribution) {
- }
-
- virtual ~PairedInfoLibrary() {}
-
- void SetCoverage(double cov) { coverage_coeff_ = cov; }
- void SetSingleThreshold(double threshold) { single_threshold_ = threshold; }
-
- virtual size_t FindJumpEdges(EdgeId e, set<EdgeId>& result, int min_dist, int max_dist, size_t min_len = 0) const = 0;
- virtual void CountDistances(EdgeId e1, EdgeId e2, vector<int>& dist, vector<double>& w) const = 0;
- virtual double CountPairedInfo(EdgeId e1, EdgeId e2, int distance, bool from_interval = false) const = 0;
- virtual double CountPairedInfo(EdgeId e1, EdgeId e2, int dist_min, int dist_max) const = 0;
-
- double IdealPairedInfo(EdgeId e1, EdgeId e2, int distance, bool additive = false) const {
- return ideal_pi_counter_.IdealPairedInfo(e1, e2, distance, additive);
- }
-
- size_t GetISMin() const { return is_min_; }
- double GetSingleThreshold() const { return single_threshold_; }
- double GetCoverageCoeff() const { return coverage_coeff_; }
- size_t GetISMax() const { return is_max_; }
- size_t GetIsVar() const { return is_var_; }
- size_t GetLeftVar() const { return is_ - is_min_; }
- size_t GetRightVar() const { return is_max_ - is_; }
- size_t GetReadSize() const { return read_size_; }
- bool IsMp() const { return is_mp_; }
-
- const Graph& g_;
- size_t k_;
- size_t read_size_;
- size_t is_;
- size_t is_min_;
- size_t is_max_;
- size_t is_var_;
- bool is_mp_;
- double single_threshold_;
- double coverage_coeff_;
- IdealPairInfoCounter ideal_pi_counter_;
-protected:
- DECL_LOGGER("PathExtendPI");
-};
-
-template<class Index>
-struct PairedInfoLibraryWithIndex : public PairedInfoLibrary {
-
- PairedInfoLibraryWithIndex(size_t k, const Graph& g, size_t readS, size_t is, size_t is_min, size_t is_max, size_t is_div,
- const Index& index, bool is_mp,
- const std::map<int, size_t>& is_distribution)
- : PairedInfoLibrary(k, g, readS, is, is_min, is_max, is_div, is_mp, is_distribution),
- index_(index) {}
-
- size_t FindJumpEdges(EdgeId e, std::set<EdgeId>& result, int min_dist, int max_dist, size_t min_len = 0) const override {
- VERIFY(index_.size() > 0);
- result.clear();
-
- auto infos = index_.Get(e);
- // We do not care about iteration order here - all the edges collected
- // will be inside std::set<EdgeId>
- for (auto it : infos) {
- EdgeId e2 = it.first;
- if (e2 == e)
- continue;
- if (g_.length(e2) < min_len)
- continue;
- for (auto point : it.second) {
- omnigraph::de::DEDistance dist = point.d;
- if (math::le(dist, (omnigraph::de::DEDistance) max_dist) &&
- math::ge(dist, (omnigraph::de::DEDistance) min_dist)) {
- result.insert(e2);
- }
- }
- }
- return result.size();
- }
-
-
- void CountDistances(EdgeId e1, EdgeId e2, vector<int>& dist, vector<double>& w) const override {
- VERIFY(index_.size() > 0);
- if (e1 == e2)
- return;
-
- for (auto point : index_.Get(e1, e2)) {
- int pairedDistance = rounded_d(point);
- dist.push_back(pairedDistance);
- w.push_back(point.weight);
- }
- }
-
- double CountPairedInfo(EdgeId e1, EdgeId e2, int distance,
- bool from_interval = false) const override {
- VERIFY(index_.size() != 0);
- double weight = 0.0;
-
- for (auto point : index_.Get(e1, e2)) {
- int pairedDistance = rounded_d(point);
- int distanceDev = (int) point.variation(); //max((int) pointIter->var, (int) is_variation_);
- //Can be modified according to distance comparison
- int d_min = distance - distanceDev;
- int d_max = distance + distanceDev;
-
- if (from_interval) {
- d_min -= (int) (is_ - is_min_);
- d_max += (int) (is_max_ - is_);
- }
- if (pairedDistance >= d_min && pairedDistance <= d_max) {
- weight += point.weight;
- }
- }
- return weight;
- }
-
- double CountPairedInfo(EdgeId e1, EdgeId e2, int dist_min, int dist_max) const override {
- VERIFY(index_.size() != 0);
- double weight = 0.0;
-
- for (auto point : index_.Get(e1, e2)) {
- int dist = rounded_d(point);
- if (dist >= dist_min && dist <= dist_max)
- weight += point.weight;
- }
- return weight;
- }
-
- const Index& index_;
-protected:
- DECL_LOGGER("PathExtendPI");
-};
-
-typedef std::vector<shared_ptr<PairedInfoLibrary> > PairedInfoLibraries;
-
-} // path extend
-
-#endif /* PAIRED_LIBRARY_HPP_ */
diff --git a/src/debruijn/path_extend/path_extend_launch.hpp b/src/debruijn/path_extend/path_extend_launch.hpp
deleted file mode 100644
index 360f7d6..0000000
--- a/src/debruijn/path_extend/path_extend_launch.hpp
+++ /dev/null
@@ -1,851 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-/*
- * lc_launch.hpp
- *
- * Created on: Dec 1, 2011
- * Author: andrey
- */
-
-#ifndef PATH_EXTEND_LAUNCH_HPP_
-#define PATH_EXTEND_LAUNCH_HPP_
-
-#include <path_extend/scaffolder2015/scaffold_graph_constructor.hpp>
-#include "pe_config_struct.hpp"
-#include "pe_resolver.hpp"
-#include "path_extender.hpp"
-#include "pe_io.hpp"
-#include "path_visualizer.hpp"
-#include "loop_traverser.hpp"
-#include "long_read_storage.hpp"
-#include "next_path_searcher.hpp"
-#include "scaffolder2015/extension_chooser2015.hpp"
-#include "genome_consistance_checker.hpp"
-#include "scaffolder2015/scaffold_graph.hpp"
-#include "scaffolder2015/scaffold_graph_visualizer.hpp"
-
-namespace path_extend {
-
-using namespace debruijn_graph;
-typedef omnigraph::de::PairedInfoIndicesT<Graph> PairedInfoIndicesT;
-
-inline size_t FindMaxOverlapedLen(const vector<shared_ptr<PairedInfoLibrary> >& libs) {
- size_t max = 0;
- for (size_t i = 0; i < libs.size(); ++i) {
- max = std::max(libs[i]->GetISMax(), max);
- }
- return max;
-}
-
-inline string GetEtcDir(const std::string& output_dir) {
- return output_dir + cfg::get().pe_params.etc_dir + "/";
-}
-
-inline void DebugOutputPaths(const conj_graph_pack& gp,
- const std::string& output_dir, const PathContainer& paths,
- const string& name) {
- PathInfoWriter path_writer;
- PathVisualizer visualizer;
-
- DefaultContigCorrector<ConjugateDeBruijnGraph> corrector(gp.g);
- DefaultContigConstructor<ConjugateDeBruijnGraph> constructor(gp.g, corrector);
- ContigWriter writer(gp.g, constructor);
-
- string etcDir = GetEtcDir(output_dir);
- if (!cfg::get().pe_params.debug_output) {
- return;
- }
- writer.OutputPaths(paths, etcDir + name);
- if (cfg::get().pe_params.output.write_paths) {
- path_writer.WritePaths(paths, etcDir + name + ".dat");
- }
- if (cfg::get().pe_params.viz.print_paths) {
- visualizer.writeGraphWithPathsSimple(gp, etcDir + name + ".dot", name,
- paths);
- }
-}
-
-inline double GetWeightThreshold(shared_ptr<PairedInfoLibrary> lib, const pe_config::ParamSetT& pset) {
- return lib->IsMp() ? pset.mate_pair_options.weight_threshold : pset.extension_options.weight_threshold;
-}
-
-inline double GetPriorityCoeff(shared_ptr<PairedInfoLibrary> lib, const pe_config::ParamSetT& pset) {
- return lib->IsMp() ? pset.mate_pair_options.priority_coeff : pset.extension_options.priority_coeff;
-}
-
-inline void SetSingleThresholdForLib(shared_ptr<PairedInfoLibrary> lib, const pe_config::ParamSetT &pset, double threshold, double correction_coeff = 1.0) {
- if (lib->IsMp()) {
- lib->SetSingleThreshold(pset.mate_pair_options.use_default_single_threshold || math::le(threshold, 0.0) ?
- pset.mate_pair_options.single_threshold : threshold);
- }
- else {
- double t = pset.extension_options.use_default_single_threshold || math::le(threshold, 0.0) ?
- pset.extension_options.single_threshold : threshold;
- t = correction_coeff * t;
- lib->SetSingleThreshold(t);
- }
-}
-
-
-inline string MakeNewName(const std::string& contigs_name, const std::string& subname) {
- return contigs_name.substr(0, contigs_name.rfind(".fasta")) + "_" + subname + ".fasta";
-}
-
-inline void OutputBrokenScaffolds(PathContainer& paths, int k,
- const ContigWriter& writer,
- const std::string& filename) {
- if (!cfg::get().pe_params.param_set.scaffolder_options.on
- or !cfg::get().use_scaffolder
- or cfg::get().pe_params.obs == obs_none) {
- return;
- }
-
- int min_gap = cfg::get().pe_params.obs == obs_break_all ? k / 2 : k;
-
- ScaffoldBreaker breaker(min_gap);
- breaker.Split(paths);
- breaker.container().SortByLength();
- writer.OutputPaths(breaker.container(), filename);
-}
-
-inline void AddPathsToContainer(const conj_graph_pack& gp,
- const std::vector<PathInfo<Graph> >& paths,
- size_t size_threshold, PathContainer& result) {
- for (size_t i = 0; i < paths.size(); ++i) {
- auto path = paths.at(i);
- vector<EdgeId> edges = path.getPath();
- if (edges.size() <= size_threshold) {
- continue;
- }
- BidirectionalPath* new_path = new BidirectionalPath(gp.g, edges);
- BidirectionalPath* conj_path = new BidirectionalPath(new_path->Conjugate());
- new_path->SetWeight((float) path.getWeight());
- conj_path->SetWeight((float) path.getWeight());
- result.AddPair(new_path, conj_path);
- }
- DEBUG("Long reads paths " << result.size() << " == ");
-}
-
-double GetSingleReadsFilteringThreshold(const io::LibraryType& type) {
- if (type == io::LibraryType::PacBioReads || type == io::LibraryType::SangerReads || type == io::LibraryType::NanoporeReads) {
- return cfg::get().pe_params.long_reads.pacbio_reads.filtering;
- } else if (io::SequencingLibraryBase::IsContigLib(type)) {
- return cfg::get().pe_params.long_reads.contigs.filtering;
- }
- return cfg::get().pe_params.long_reads.single_reads.filtering;
-}
-
-double GetSingleReadsWeightPriorityThreshold(const io::LibraryType& type) {
- if (type == io::LibraryType::PacBioReads || type == io::LibraryType::SangerReads || type == io::LibraryType::NanoporeReads) {
- return cfg::get().pe_params.long_reads.pacbio_reads.weight_priority;
- } else if (io::SequencingLibraryBase::IsContigLib(type)) {
- return cfg::get().pe_params.long_reads.contigs.weight_priority;
- }
- return cfg::get().pe_params.long_reads.single_reads.weight_priority;
-}
-
-double GetSingleReadsUniqueEdgePriorityThreshold(const io::LibraryType& type) {
- if (cfg::get().ds.single_cell &&
- (type == io::LibraryType::PacBioReads || type == io::LibraryType::SangerReads || type == io::LibraryType::NanoporeReads)) {
- return 10000.0;
- }
- if (type == io::LibraryType::PacBioReads || type == io::LibraryType::SangerReads || type == io::LibraryType::NanoporeReads) {
- return cfg::get().pe_params.long_reads.pacbio_reads.unique_edge_priority;
- } else if (io::SequencingLibraryBase::IsContigLib(type)) {
- return cfg::get().pe_params.long_reads.contigs.unique_edge_priority;
- }
- return cfg::get().pe_params.long_reads.single_reads.unique_edge_priority;
-}
-
-bool HasOnlyMPLibs() {
- for (const auto& lib : cfg::get().ds.reads) {
- if (!((lib.type() == io::LibraryType::MatePairs || lib.type() == io::LibraryType::HQMatePairs) &&
- lib.data().mean_insert_size > 0.0)) {
- return false;
- }
- }
- return true;
-}
-
-bool UseCoverageResolverForSingleReads(const io::LibraryType& type) {
- return HasOnlyMPLibs() && (type == io::LibraryType::HQMatePairs);
-}
-
-inline size_t CountEdgesInGraph(const Graph& g) {
- size_t count = 0;
- for (auto iter = g.ConstEdgeBegin(); !iter.IsEnd(); ++iter) {
- count++;
- }
- return count;
-}
-
-inline size_t GetNumberMPPaths(const Graph& g) {
- size_t count_edge = CountEdgesInGraph(g);
- if (count_edge < 1000) {
- return 1000;
- }
- if (count_edge < 10000) {
- return 100;
- }
- return 50;
-}
-
-inline string LibStr(size_t count) {
- return count == 1 ? "library" : "libraries";
-}
-
-inline void ClonePathContainer(PathContainer& spaths, PathContainer& tpaths, GraphCoverageMap& tmap) {
- tpaths.clear();
- tmap.Clear();
-
- for (auto iter = spaths.begin(); iter != spaths.end(); ++iter) {
- BidirectionalPath& path = *iter.get();
- BidirectionalPath* new_path = new BidirectionalPath(path.graph());
- new_path->Subscribe(&tmap);
- new_path->PushBack(path);
-
- BidirectionalPath& cpath = *iter.getConjugate();
- BidirectionalPath* new_cpath = new BidirectionalPath(cpath.graph());
- new_cpath->Subscribe(&tmap);
- new_cpath->PushBack(cpath);
-
- tpaths.AddPair(new_path, new_cpath);
- }
-}
-
-inline void FinalizePaths(PathContainer& paths, GraphCoverageMap& cover_map, size_t min_edge_len, size_t max_path_diff, bool mate_pairs = false) {
- DefaultContigCorrector<ConjugateDeBruijnGraph> corrector(cover_map.graph());
- DefaultContigConstructor<ConjugateDeBruijnGraph> constructor(cover_map.graph(), corrector);
- ContigWriter writer(cover_map.graph(), constructor);
- PathExtendResolver resolver(cover_map.graph());
-
- resolver.removeOverlaps(paths, cover_map, min_edge_len, max_path_diff,
- cfg::get().pe_params.param_set.remove_overlaps, cfg::get().pe_params.param_set.cut_all_overlaps);
- if (mate_pairs) {
- resolver.RemoveMatePairEnds(paths, min_edge_len);
- }
- if (cfg::get().avoid_rc_connections) {
- paths.FilterInterstandBulges();
- }
- paths.FilterEmptyPaths();
- if (!mate_pairs) {
- resolver.addUncoveredEdges(paths, cover_map);
- }
- paths.SortByLength();
- for(auto& path : paths) {
- path.first->ResetOverlaps();
- }
-
-}
-
-inline void TraverseLoops(PathContainer& paths, GraphCoverageMap& cover_map, shared_ptr<ContigsMaker> extender) {
- INFO("Traversing tandem repeats");
- LoopTraverser loopTraverser(cover_map.graph(), cover_map, extender);
- loopTraverser.TraverseAllLoops();
- paths.SortByLength();
-}
-
-inline bool IsForSingleReadExtender(const io::SequencingLibrary<debruijn_config::DataSetData> &lib) {
- io::LibraryType lt = lib.type();
- return (lib.data().single_reads_mapped ||
- lt == io::LibraryType::PacBioReads ||
- lt == io::LibraryType::SangerReads ||
- lt == io::LibraryType::NanoporeReads ||
- lib.is_contig_lib());
-}
-
-inline bool IsForPEExtender(const io::SequencingLibrary<debruijn_config::DataSetData> &lib) {
- return (lib.type() == io::LibraryType::PairedEnd &&
- lib.data().mean_insert_size > 0.0);
-}
-
-inline bool IsForShortLoopExtender(const io::SequencingLibrary<debruijn_config::DataSetData> &lib) {
- return (lib.type() == io::LibraryType::PairedEnd &&
- lib.data().mean_insert_size > 0.0);
-}
-
-inline bool IsForScaffoldingExtender(const io::SequencingLibrary<debruijn_config::DataSetData> &lib) {
- return (lib.type() == io::LibraryType::PairedEnd &&
- lib.data().mean_insert_size > 0.0);
-}
-
-inline bool IsForMPExtender(const io::SequencingLibrary<debruijn_config::DataSetData> &lib) {
- return lib.data().mean_insert_size > 0.0 &&
- (lib.type() == io::LibraryType::HQMatePairs ||
- lib.type() == io::LibraryType::MatePairs);
-}
-
-enum class PathExtendStage {
- PEStage,
- MPStage,
- FinalizingPEStage,
- Scaffold2015,
-};
-
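-//Builds a PairedInfoLibraryWithIndex for the library with the given index,
-//taking read length and insert-size statistics from the dataset config and
-//marking mate-pair libraries accordingly.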
-template<class Index>
-inline shared_ptr<PairedInfoLibrary> MakeNewLib(const conj_graph_pack::graph_t& g,
- const Index& paired_index,
- size_t index) {
- const auto& lib = cfg::get().ds.reads[index];
- size_t read_length = lib.data().read_length;
- size_t is = (size_t) lib.data().mean_insert_size;
- int is_min = (int) lib.data().insert_size_left_quantile;
- int is_max = (int) lib.data().insert_size_right_quantile;
- int var = (int) lib.data().insert_size_deviation;
- bool is_mp = lib.type() == io::LibraryType::MatePairs || lib.type() == io::LibraryType::HQMatePairs;
- return make_shared< PairedInfoLibraryWithIndex<decltype(paired_index[index])> >(cfg::get().K, g, read_length,
- is, is_min > 0.0 ? size_t(is_min) : 0, is_max > 0.0 ? size_t(is_max) : 0,
- size_t(var),
- paired_index[index], is_mp,
- lib.data().insert_size_distribution);
-}
-
-inline shared_ptr<SimpleExtender> MakeLongReadsExtender(const conj_graph_pack& gp, const GraphCoverageMap& cov_map, size_t lib_index,
- const pe_config::ParamSetT& pset) {
- PathContainer paths;
- AddPathsToContainer(gp, gp.single_long_reads[lib_index].GetAllPaths(), 1, paths);
-
- const auto& lib = cfg::get().ds.reads[lib_index];
- shared_ptr<ExtensionChooser> longReadEC =
- make_shared<LongReadsExtensionChooser>(gp.g, paths, GetSingleReadsFilteringThreshold(lib.type()),
- GetSingleReadsWeightPriorityThreshold(lib.type()),
- GetSingleReadsUniqueEdgePriorityThreshold(lib.type()));
-
- size_t resolvable_repeat_length_bound = 10000ul;
- if (!lib.is_contig_lib()) {
- resolvable_repeat_length_bound = std::max(resolvable_repeat_length_bound, lib.data().read_length);
- }
- INFO("resolvable_repeat_length_bound set to " << resolvable_repeat_length_bound);
- return make_shared<SimpleExtender>(gp, cov_map, longReadEC, resolvable_repeat_length_bound,
- pset.loop_removal.max_loops, true, UseCoverageResolverForSingleReads(lib.type()));
-}
-
-inline shared_ptr<SimpleExtender> MakeLongEdgePEExtender(const conj_graph_pack& gp, const GraphCoverageMap& cov_map,
- size_t lib_index, const pe_config::ParamSetT& pset, bool investigate_loops) {
- shared_ptr<PairedInfoLibrary> lib = MakeNewLib(gp.g, gp.clustered_indices, lib_index);
- SetSingleThresholdForLib(lib, pset, cfg::get().ds.reads[lib_index].data().pi_threshold);
- INFO("Threshold for lib #" << lib_index << ": " << lib->GetSingleThreshold());
-
- shared_ptr<WeightCounter> wc = make_shared<PathCoverWeightCounter>(gp.g, lib, pset.normalize_weight);
- shared_ptr<ExtensionChooser> extension = make_shared<LongEdgeExtensionChooser>(gp.g, wc, GetWeightThreshold(lib, pset), GetPriorityCoeff(lib, pset));
- return make_shared<SimpleExtender>(gp, cov_map, extension, lib->GetISMax(), pset.loop_removal.max_loops, investigate_loops, false);
-}
-
-
-inline shared_ptr<SimpleExtender> MakeMetaExtender(const conj_graph_pack& gp, const GraphCoverageMap& cov_map,
- size_t lib_index, const pe_config::ParamSetT& pset, bool investigate_loops) {
- shared_ptr<PairedInfoLibrary> lib = MakeNewLib(gp.g, gp.clustered_indices, lib_index);
- VERIFY(!lib->IsMp());
-
- shared_ptr<WeightCounter> wc = make_shared<MetagenomicWeightCounter>(gp.g, lib, /*read_length*/cfg::get().ds.RL(),
- /*normalized_threshold*/ 0.3, /*raw_threshold*/ 3, /*estimation_edge_length*/ 300);
- shared_ptr<SimpleExtensionChooser> extension = make_shared<SimpleExtensionChooser>(gp.g, wc,
- pset.extension_options.weight_threshold,
- pset.extension_options.priority_coeff);
- return make_shared<SimpleExtender>(gp, cov_map, extension, lib->GetISMax(), pset.loop_removal.max_loops, investigate_loops, false);
-}
-
-inline shared_ptr<SimpleExtender> MakePEExtender(const conj_graph_pack& gp, const GraphCoverageMap& cov_map,
- size_t lib_index, const pe_config::ParamSetT& pset, bool investigate_loops) {
- shared_ptr<PairedInfoLibrary> lib = MakeNewLib(gp.g, gp.clustered_indices, lib_index);
- SetSingleThresholdForLib(lib, pset, cfg::get().ds.reads[lib_index].data().pi_threshold);
- INFO("Threshold for lib #" << lib_index << ": " << lib->GetSingleThreshold());
-
- shared_ptr<WeightCounter> wc = make_shared<PathCoverWeightCounter>(gp.g, lib, pset.normalize_weight);
- shared_ptr<SimpleExtensionChooser> extension = make_shared<SimpleExtensionChooser>(gp.g, wc, GetWeightThreshold(lib, pset), GetPriorityCoeff(lib, pset));
- return make_shared<SimpleExtender>(gp, cov_map, extension, lib->GetISMax(), pset.loop_removal.max_loops, investigate_loops, false);
-}
-
-inline shared_ptr<PathExtender> MakeScaffoldingExtender(const conj_graph_pack& gp, const GraphCoverageMap& cov_map,
- size_t lib_index, const pe_config::ParamSetT& pset) {
- shared_ptr<PairedInfoLibrary> lib = MakeNewLib(gp.g, gp.scaffolding_indices, lib_index);
-
- shared_ptr<WeightCounter> counter = make_shared<ReadCountWeightCounter>(gp.g, lib);
- //FIXME this variable was not used!
- //double prior_coef = GetPriorityCoeff(lib, pset);
- //FIXME review parameters
- //todo put parameters in config
- //FIXME remove max_must_overlap from config
- double var_coeff = 3.0;
- auto scaff_chooser = std::make_shared<ScaffoldingExtensionChooser>(gp.g, counter, var_coeff);
-
- vector<shared_ptr<GapJoiner>> joiners;
-
- if (pset.scaffolder_options.use_la_gap_joiner) {
- joiners.push_back(std::make_shared<LAGapJoiner>(gp.g, pset.scaffolder_options.min_overlap_length,
- pset.scaffolder_options.flank_multiplication_coefficient,
- pset.scaffolder_options.flank_addition_coefficient));
- }
-
- joiners.push_back(std::make_shared<HammingGapJoiner>(gp.g, pset.scaffolder_options.min_gap_score,
- pset.scaffolder_options.short_overlap,
- (int) 2 * cfg::get().ds.RL()));
-
- auto composite_gap_joiner = std::make_shared<CompositeGapJoiner>(gp.g,
- joiners,
- size_t(pset.scaffolder_options.max_can_overlap * (double) gp.g.k()),
- int(math::round((double) gp.g.k() - var_coeff * (double) lib->GetIsVar())),
- pset.scaffolder_options.artificial_gap);
-
- return make_shared<ScaffoldingPathExtender>(gp, cov_map, scaff_chooser, composite_gap_joiner, lib->GetISMax(), pset.loop_removal.max_loops, false);
-}
-
-
-inline shared_ptr<PathExtender> MakeScaffolding2015Extender(const conj_graph_pack& gp, const GraphCoverageMap& cov_map,
- size_t lib_index, const pe_config::ParamSetT& pset, const shared_ptr<ScaffoldingUniqueEdgeStorage> storage) {
- shared_ptr<PairedInfoLibrary> lib;
- INFO("for lib " << lib_index);
-
- //TODO:: temporary solution
- if (gp.paired_indices[lib_index].size() > gp.clustered_indices[lib_index].size()) {
- INFO("Paired unclustered indices not empty, using them");
- lib = MakeNewLib(gp.g, gp.paired_indices, lib_index);
- } else if (gp.clustered_indices[lib_index].size() != 0 ) {
- INFO("clustered indices not empty, using them");
- lib = MakeNewLib(gp.g, gp.clustered_indices, lib_index);
- } else {
- ERROR("All paired indices are empty!");
- }
-
- shared_ptr<WeightCounter> counter = make_shared<ReadCountWeightCounter>(gp.g, lib);
-//TODO::was copypasted from MakeScaffoldingExtender
-//TODO::REWRITE
- double var_coeff = 3.0;
- DEBUG("here creating extchooser");
-//TODO: 2 is relative weight cutoff, to config!
- auto scaff_chooser = std::make_shared<ExtensionChooser2015>(gp.g, counter, var_coeff, storage, 2, lib_index);
-
- auto gap_joiner = std::make_shared<HammingGapJoiner>(gp.g, pset.scaffolder_options.min_gap_score,
- pset.scaffolder_options.short_overlap,
- (int) 2 * cfg::get().ds.RL());
-
- return make_shared<ScaffoldingPathExtender>(gp, cov_map, scaff_chooser, gap_joiner, lib->GetISMax(), pset.loop_removal.max_loops, false , false);
-}
-
-
-inline shared_ptr<SimpleExtender> MakeMPExtender(const conj_graph_pack& gp, const GraphCoverageMap& cov_map, const PathContainer& paths,
- size_t lib_index, const pe_config::ParamSetT& pset) {
-
- shared_ptr<PairedInfoLibrary> lib = MakeNewLib(gp.g, gp.paired_indices, lib_index);
- SetSingleThresholdForLib(lib, pset, cfg::get().ds.reads[lib_index].data().pi_threshold);
- INFO("Threshold for lib #" << lib_index << ": " << lib->GetSingleThreshold());
-
- size_t max_number_of_paths_to_search = GetNumberMPPaths(gp.g);
- DEBUG("max number of mp paths " << max_number_of_paths_to_search);
-
- shared_ptr<MatePairExtensionChooser> chooser = make_shared<MatePairExtensionChooser>(gp.g, lib, paths, max_number_of_paths_to_search);
- return make_shared<SimpleExtender>(gp, cov_map, chooser, lib->GetISMax(), pset.loop_removal.mp_max_loops, true, false);
-}
-
-inline shared_ptr<SimpleExtender> MakeCoordCoverageExtender(const conj_graph_pack& gp, const GraphCoverageMap& cov_map,
- const pe_config::ParamSetT& pset) {
- shared_ptr<PairedInfoLibrary> lib = MakeNewLib(gp.g, gp.paired_indices, 0);
- CoverageAwareIdealInfoProvider provider(gp.g, lib, 1000, 2000);
- shared_ptr<CoordinatedCoverageExtensionChooser> chooser = make_shared<CoordinatedCoverageExtensionChooser>(gp.g, provider,
- pset.coordinated_coverage.max_edge_length_in_repeat, pset.coordinated_coverage.delta);
- return make_shared<SimpleExtender>(gp, cov_map, chooser, -1ul, pset.loop_removal.mp_max_loops, true, false);
-}
-
-
-inline bool InsertSizeCompare(const shared_ptr<PairedInfoLibrary> lib1,
- const shared_ptr<PairedInfoLibrary> lib2) {
- return lib1->GetISMax() < lib2->GetISMax();
-}
-
-template<typename Base, typename T>
-inline bool instanceof(const T *ptr) {
- return dynamic_cast<const Base*>(ptr) != nullptr;
-}
-
-//Used for debug purpose only
-inline void PrintExtenders(vector<shared_ptr<PathExtender> >& extenders) {
- DEBUG("Extenders in vector:");
- for(size_t i = 0; i < extenders.size(); ++i) {
- string type = typeid(*extenders[i]).name();
- DEBUG("Extender #i" << type);
- if (instanceof<SimpleExtender>(extenders[i].get())) {
- auto ec = ((SimpleExtender *) extenders[i].get())->GetExtensionChooser();
- string chooser_type = typeid(*ec).name();
- DEBUG(" Extender #i" << chooser_type);
- }
- else if (instanceof<ScaffoldingPathExtender>(extenders[i].get())) {
- auto ec = ((ScaffoldingPathExtender *) extenders[i].get())->GetExtensionChooser();
- string chooser_type = typeid(*ec).name();
- DEBUG(" Extender #i" << chooser_type);
- }
- }
-}
-
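-//Creates, per library and in library-priority order, the extenders that are
-//appropriate for the given stage and scaffolding mode (long reads, paired-end,
-//short-loop, scaffolding, mate-pair, 2015 scaffolder) and returns them as one
-//ordered vector; a coordinated-coverage extender is appended if enabled.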
-inline vector<shared_ptr<PathExtender> > MakeAllExtenders(PathExtendStage stage, const conj_graph_pack& gp, const GraphCoverageMap& cov_map,
- const pe_config::ParamSetT& pset, shared_ptr<ScaffoldingUniqueEdgeStorage> storage, const PathContainer& paths_for_mp = PathContainer()) {
-
- vector<shared_ptr<PathExtender> > result;
- vector<shared_ptr<PathExtender> > pes;
- vector<shared_ptr<PathExtender> > pes2015;
- vector<shared_ptr<PathExtender> > pe_loops;
- vector<shared_ptr<PathExtender> > pe_scafs;
- vector<shared_ptr<PathExtender> > mps;
-
- size_t single_read_libs = 0;
- size_t pe_libs = 0;
- size_t scf_pe_libs = 0;
- size_t mp_libs = 0;
-
- for (io::LibraryType lt : io::LibraryPriotity) {
- for (size_t i = 0; i < cfg::get().ds.reads.lib_count(); ++i) {
- const auto& lib = cfg::get().ds.reads[i];
- if (lib.type() != lt)
- continue;
-
- //TODO: scaff2015 does not need any single read libs?
- if (IsForSingleReadExtender(lib) && pset.sm != sm_2015) {
- result.push_back(MakeLongReadsExtender(gp, cov_map, i, pset));
- ++single_read_libs;
- }
- if (IsForPEExtender(lib)) {
- ++pe_libs;
- if (stage == PathExtendStage::PEStage
- && (pset.sm == sm_old_pe_2015 || pset.sm == sm_old || pset.sm == sm_combined)) {
- if (cfg::get().ds.meta)
- //TODO proper configuration via config
- pes.push_back(MakeMetaExtender(gp, cov_map, i, pset, false));
- else if (cfg::get().ds.moleculo)
- pes.push_back(MakeLongEdgePEExtender(gp, cov_map, i, pset, false));
- else
- pes.push_back(MakePEExtender(gp, cov_map, i, pset, false));
- }
- else if (pset.sm == sm_2015) {
- pes2015.push_back(MakeScaffolding2015Extender(gp, cov_map, i, pset, storage));
- }
- }
- if (IsForShortLoopExtender(lib) && (pset.sm == sm_old_pe_2015 || pset.sm == sm_old || pset.sm == sm_combined)) {
- if (cfg::get().ds.meta) {
- pes.push_back(MakeMetaExtender(gp, cov_map, i, pset, true));
- } else {
- pe_loops.push_back(MakePEExtender(gp, cov_map, i, pset, true));
- }
- }
- if (IsForScaffoldingExtender(lib) && cfg::get().use_scaffolder && pset.scaffolder_options.on) {
- ++scf_pe_libs;
- if (pset.sm == sm_old || pset.sm == sm_combined) {
- pe_scafs.push_back(MakeScaffoldingExtender(gp, cov_map, i, pset));
- }
- if (pset.sm == sm_old_pe_2015 || pset.sm == sm_combined) {
- pe_scafs.push_back(MakeScaffolding2015Extender(gp, cov_map, i, pset, storage));
- }
- }
- if (IsForMPExtender(lib) && stage == PathExtendStage::MPStage) {
- ++mp_libs;
- if (pset.sm == sm_old || pset.sm == sm_combined) {
- mps.push_back(MakeMPExtender(gp, cov_map, paths_for_mp, i, pset));
- }
- if (is_2015_scaffolder_enabled(pset.sm)) {
- mps.push_back(MakeScaffolding2015Extender(gp, cov_map, i, pset, storage));
- }
- }
- }
-
- //std::sort(scaff_libs.begin(), scaff_libs.end(), InsertSizeCompare);
- result.insert(result.end(), pes.begin(), pes.end());
- result.insert(result.end(), pes2015.begin(), pes2015.end());
- result.insert(result.end(), pe_loops.begin(), pe_loops.end());
- result.insert(result.end(), pe_scafs.begin(), pe_scafs.end());
- result.insert(result.end(), mps.begin(), mps.end());
- pes.clear();
- pe_loops.clear();
- pe_scafs.clear();
- pes2015.clear();
- mps.clear();
- }
-
- INFO("Using " << pe_libs << " paired-end " << LibStr(pe_libs));
- INFO("Using " << scf_pe_libs << " paired-end scaffolding " << LibStr(scf_pe_libs));
- INFO("Using " << mp_libs << " mate-pair " << LibStr(mp_libs));
- INFO("Using " << single_read_libs << " single read " << LibStr(single_read_libs));
- INFO("Scaffolder is " << (pset.scaffolder_options.on ? "on" : "off"));
-
- if(pset.use_coordinated_coverage) {
- INFO("Using additional coordinated coverage extender");
- result.push_back(MakeCoordCoverageExtender(gp, cov_map, pset));
- }
-
- PrintExtenders(result);
- return result;
-}
-
-inline shared_ptr<scaffold_graph::ScaffoldGraph> ConstructScaffoldGraph(const conj_graph_pack& gp,
- shared_ptr<ScaffoldingUniqueEdgeStorage> edge_storage,
- const pe_config::ParamSetT::ScaffoldGraphParamsT& params) {
- using namespace scaffold_graph;
- vector<shared_ptr<ConnectionCondition>> conditions;
-
- INFO("Constructing connections");
- if (params.graph_connectivity) {
- conditions.push_back(make_shared<AssemblyGraphConnectionCondition>(gp.g, params.max_path_length));
- }
- for (size_t lib_index = 0; lib_index < cfg::get().ds.reads.lib_count(); ++lib_index) {
- auto lib = cfg::get().ds.reads[lib_index];
- if (lib.is_paired()) {
- shared_ptr<PairedInfoLibrary> paired_lib;
- if (IsForMPExtender(lib))
- paired_lib = MakeNewLib(gp.g, gp.paired_indices, lib_index);
- else if (IsForPEExtender(lib))
- paired_lib = MakeNewLib(gp.g, gp.clustered_indices, lib_index);
- else
- INFO("Unusable paired lib #" << lib_index);
- conditions.push_back(make_shared<PairedLibConnectionCondition>(gp.g, paired_lib, lib_index, params.min_read_count));
- }
- }
- INFO("Total conditions " << conditions.size());
-
- INFO("Constructing scaffold graph");
- LengthEdgeCondition edge_condition(gp.g, edge_storage->GetMinLength());
- DefaultScaffoldGraphConstructor constructor(gp.g, edge_storage->GetSet(), conditions, edge_condition);
- auto scaffoldGraph = constructor.Construct();
-
- INFO("Scaffold graph contains " << scaffoldGraph->VertexCount() << " vertices and " << scaffoldGraph->EdgeCount() << " edges");
- return scaffoldGraph;
-}
-
-
-inline void PrintScaffoldGraph(shared_ptr<scaffold_graph::ScaffoldGraph> scaffoldGraph,
- const set<EdgeId> main_edge_set,
- const string& filename) {
- using namespace scaffold_graph;
-
- auto vcolorer = make_shared<ScaffoldVertexSetColorer>(main_edge_set);
- auto ecolorer = make_shared<ScaffoldEdgeColorer>();
- CompositeGraphColorer <ScaffoldGraph> colorer(vcolorer, ecolorer);
-
- INFO("Visualizing single grpah");
- ScaffoldGraphVisualizer singleVisualizer(*scaffoldGraph, false);
- std::ofstream single_dot;
- single_dot.open((filename + "_single.dot").c_str());
- singleVisualizer.Visualize(single_dot, colorer);
- single_dot.close();
-
- INFO("Visualizing paired grpah");
- ScaffoldGraphVisualizer pairedVisualizer(*scaffoldGraph, true);
- std::ofstream paired_dot;
- paired_dot.open((filename + "_paired.dot").c_str());
- pairedVisualizer.Visualize(paired_dot, colorer);
- paired_dot.close();
-
- INFO("Printing scaffold grpah");
- std::ofstream data_stream;
- data_stream.open((filename + ".data").c_str());
- scaffoldGraph->Print(data_stream);
- data_stream.close();
-}
-
-
-inline size_t FindOverlapLenForStage(PathExtendStage stage) {
- size_t res = 0;
- for (const auto& lib : cfg::get().ds.reads) {
- if (IsForPEExtender(lib) && stage == PathExtendStage::PEStage) {
- res = max(res, (size_t) lib.data().insert_size_right_quantile);
- } else if (IsForShortLoopExtender(lib)) {
- res = max(res, (size_t) lib.data().insert_size_right_quantile);
- } else if (IsForMPExtender(lib) && stage == PathExtendStage::MPStage) {
- res = max(res, (size_t) lib.data().insert_size_right_quantile);
- }
- }
- return res;
-}
-
-inline bool MPLibsExist() {
- for (const auto& lib : cfg::get().ds.reads)
- if (IsForMPExtender(lib))
- return true;
-
- return false;
-}
-
-
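-//Top-level repeat resolution: fills the unique-edge storage (optionally
-//auto-detecting its parameters from the available libraries), optionally
-//builds and prints the scaffold graph, then runs the paired-end stage, a
-//mate-pair stage when such libraries exist, and a final polishing stage,
-//writing intermediate and final paths along the way.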
-inline void ResolveRepeatsPe(conj_graph_pack& gp,
- const std::string& output_dir,
- const std::string& contigs_name,
- bool traversLoops,
- boost::optional<std::string> broken_contigs) {
-
- INFO("ExSPAnder repeat resolving tool started");
-
- auto storage = std::make_shared<ScaffoldingUniqueEdgeStorage>();
- auto sc_mode = cfg::get().pe_params.param_set.sm;
-
- if (sc_mode != sm_old) {
-//TODO: Separate function!!
- //Setting scaffolding2015 parameters
- auto min_unique_length = cfg::get().pe_params.param_set.scaffolding2015.min_unique_length;
- auto unique_variaton = cfg::get().pe_params.param_set.scaffolding2015.unique_coverage_variation;
- if (cfg::get().pe_params.param_set.scaffolding2015.autodetect) {
- INFO("Autodetecting unique edge set parameters...");
- bool pe_found = false;
-//TODO constant
- size_t min_MP_IS = 10000;
- for (size_t i = 0; i < cfg::get().ds.reads.lib_count(); ++i) {
-
- if (IsForPEExtender(cfg::get().ds.reads[i])) {
- pe_found = true;
- }
- if (IsForMPExtender(cfg::get().ds.reads[i])) {
- min_MP_IS = min(min_MP_IS, (size_t) cfg::get().ds.reads[i].data().mean_insert_size);
- }
- }
- if (pe_found) {
-//TODO constants;
- unique_variaton = 0.5;
- INFO("PE lib found, we believe in coverage");
- } else {
- unique_variaton = 50;
- INFO("No paired libs found, we do not believe in coverage");
- }
- min_unique_length = min_MP_IS;
- INFO("Minimal unique edge length set to the smallest MP library IS: " << min_unique_length);
-
- } else {
- INFO("Unique edge set constructed with parameters from config : length " << min_unique_length
- << " variation " << unique_variaton);
- }
- ScaffoldingUniqueEdgeAnalyzer unique_edge_analyzer(gp, min_unique_length, unique_variaton);
- unique_edge_analyzer.FillUniqueEdgeStorage(*storage);
- }
-
-
- make_dir(output_dir);
- make_dir(GetEtcDir(output_dir));
- const pe_config::ParamSetT &pset = cfg::get().pe_params.param_set;
-
- //Scaffold graph
- shared_ptr<scaffold_graph::ScaffoldGraph> scaffoldGraph;
- if (cfg::get().pe_params.param_set.scaffold_graph_params.construct) {
- scaffoldGraph = ConstructScaffoldGraph(gp, storage, cfg::get().pe_params.param_set.scaffold_graph_params);
- if (cfg::get().pe_params.param_set.scaffold_graph_params.output) {
- PrintScaffoldGraph(scaffoldGraph, storage->GetSet(), GetEtcDir(output_dir) + "scaffold_graph");
- }
- }
-
-
- DefaultContigCorrector<ConjugateDeBruijnGraph> corrector(gp.g);
- DefaultContigConstructor<ConjugateDeBruijnGraph> constructor(gp.g, corrector);
- ContigWriter writer(gp.g, constructor);
-
-//make pe + long reads extenders
- GraphCoverageMap cover_map(gp.g);
- INFO("SUBSTAGE = paired-end libraries")
- PathExtendStage exspander_stage = PathExtendStage::PEStage;
- vector<shared_ptr<PathExtender> > all_libs = MakeAllExtenders(exspander_stage, gp, cover_map, pset, storage);
-
- //Parameters are subject to change
- size_t max_is_right_quantile = max(FindOverlapLenForStage(exspander_stage), gp.g.k() + 100);
- size_t min_edge_len = 100;
-
- shared_ptr<CompositeExtender> mainPE = make_shared<CompositeExtender>(gp.g, cover_map, all_libs,
- max_is_right_quantile, storage);
-
-//extend pe + long reads
- PathExtendResolver resolver(gp.g);
- auto seeds = resolver.makeSimpleSeeds();
- DebugOutputPaths(gp, output_dir, seeds, "init_paths");
- seeds.SortByLength();
- INFO("Growing paths using paired-end and long single reads");
- auto paths = resolver.extendSeeds(seeds, *mainPE);
- paths.SortByLength();
- DebugOutputPaths(gp, output_dir, paths, "pe_before_overlap");
-
- PathContainer clone_paths;
- GraphCoverageMap clone_map(gp.g);
- bool mp_exist = MPLibsExist();
-
- if (mp_exist) {
- ClonePathContainer(paths, clone_paths, clone_map);
- }
-//We do not run overlap removal in 2015 mode
- if (!(sc_mode == sm_old_pe_2015 || sc_mode == sm_2015 || sc_mode == sm_combined))
- FinalizePaths(paths, cover_map, min_edge_len, max_is_right_quantile);
- if (broken_contigs.is_initialized()) {
- OutputBrokenScaffolds(paths, (int) gp.g.k(), writer,
- output_dir + (mp_exist ? "pe_contigs" : broken_contigs.get()));
- }
- DebugOutputPaths(gp, output_dir, paths, "pe_before_traverse");
- if (traversLoops) {
- TraverseLoops(paths, cover_map, mainPE);
- FinalizePaths(paths, cover_map, min_edge_len, max_is_right_quantile);
- }
- DebugOutputPaths(gp, output_dir, paths, (mp_exist ? "pe_final_paths" : "final_paths"));
- writer.OutputPaths(paths, output_dir + (mp_exist ? "pe_scaffolds" : contigs_name));
-
- cover_map.Clear();
- paths.DeleteAllPaths();
- if (!mp_exist) {
- return;
- }
-
-//MP
- DebugOutputPaths(gp, output_dir, clone_paths, "mp_before_extend");
-
- INFO("SUBSTAGE = mate-pair libraries ")
- exspander_stage = PathExtendStage::MPStage;
- all_libs.clear();
- all_libs = MakeAllExtenders(exspander_stage, gp, clone_map, pset, storage, clone_paths);
- max_is_right_quantile = FindOverlapLenForStage(exspander_stage);
- shared_ptr<CompositeExtender> mp_main_pe = make_shared<CompositeExtender>(gp.g, clone_map, all_libs,
- max_is_right_quantile, storage);
-
- INFO("Growing paths using mate-pairs");
- auto mp_paths = resolver.extendSeeds(clone_paths, *mp_main_pe);
- if (!is_2015_scaffolder_enabled(pset.sm)) {
- DebugOutputPaths(gp, output_dir, mp_paths, "mp_before_overlap");
- FinalizePaths(mp_paths, clone_map, max_is_right_quantile, max_is_right_quantile, true);
- }
- DebugOutputPaths(gp, output_dir, mp_paths, "mp_final_paths");
- DEBUG("Paths are grown with mate-pairs");
-
-//MP end
-
-//pe again
- INFO("SUBSTAGE = polishing paths")
- exspander_stage = PathExtendStage::FinalizingPEStage;
- all_libs.clear();
- all_libs = MakeAllExtenders(exspander_stage, gp, cover_map, pset, storage);
- max_is_right_quantile = FindOverlapLenForStage(exspander_stage);
- shared_ptr<CompositeExtender> last_extender = make_shared<CompositeExtender>(gp.g, clone_map, all_libs,
- max_is_right_quantile, storage);
-
- auto last_paths = resolver.extendSeeds(mp_paths, *last_extender);
- DebugOutputPaths(gp, output_dir, last_paths, "mp2_before_overlap");
- if (!is_2015_scaffolder_enabled(pset.sm)) {
- FinalizePaths(last_paths, clone_map, min_edge_len, max_is_right_quantile);
- DebugOutputPaths(gp, output_dir, last_paths, "mp2_before_traverse");
- }
-
- TraverseLoops(last_paths, clone_map, last_extender);
- FinalizePaths(last_paths, clone_map, min_edge_len, max_is_right_quantile);
-
-//result
- if (broken_contigs.is_initialized()) {
- OutputBrokenScaffolds(last_paths, (int) gp.g.k(), writer, output_dir + broken_contigs.get());
- }
- DebugOutputPaths(gp, output_dir, last_paths, "mp2_final_paths");
- writer.OutputPaths(last_paths, output_dir + contigs_name);
-
- //FinalizeUniquenessPaths();
-
- last_paths.DeleteAllPaths();
- seeds.DeleteAllPaths();
- mp_paths.DeleteAllPaths();
- clone_paths.DeleteAllPaths();
-
- INFO("ExSPAnder repeat resolving tool finished");
-}
-
-} /* path_extend */
-
-
-
-#endif /* PATH_EXTEND_LAUNCH_HPP_ */
diff --git a/src/debruijn/path_extend/path_extender.hpp b/src/debruijn/path_extend/path_extender.hpp
deleted file mode 100644
index 7f40516..0000000
--- a/src/debruijn/path_extend/path_extender.hpp
+++ /dev/null
@@ -1,1390 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2011-2014 Saint-Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//****************************************************************************
-
-/*
- * path_extender.hpp
- *
- * Created on: Mar 5, 2012
- * Author: andrey
- */
-
-#pragma once
-
-
-#include "extension_chooser.hpp"
-#include "path_filter.hpp"
-#include "overlap_analysis.hpp"
-#include "scaffolder2015/scaff_supplementary.hpp"
-#include <cmath>
-
-
-namespace path_extend {
-
-class ShortLoopResolver {
-public:
- ShortLoopResolver(const Graph& g)
- : g_(g) { }
-
- virtual ~ShortLoopResolver() { }
-
- virtual void ResolveShortLoop(BidirectionalPath& path) const = 0;
-
-protected:
- DECL_LOGGER("PathExtender")
- const Graph& g_;
-
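- //Pops trailing (next_edge, back-edge) pairs, i.e. removes loop iterations that were appended earlier.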
- void UndoCycles(BidirectionalPath& p, EdgeId next_edge) const {
- if (p.Size() <= 2) {
- return;
- }
- EdgeId first_edge = p.Back();
- EdgeId second_edge = next_edge;
- while (p.Size() > 2) {
- if (p.At(p.Size() - 1) == first_edge && p.At(p.Size() - 2) == second_edge) {
- p.PopBack(2);
- } else {
- return;
- }
- }
- }
-
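- //Appends one more loop traversal: the loop edge e followed by the current last edge again.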
- void MakeCycleStep(BidirectionalPath& path, EdgeId e) const {
- if (path.Size() == 0) {
- return;
- }
- EdgeId pathEnd = path.Back();
- path.PushBack(e);
- path.PushBack(pathEnd);
- }
-};
-
-class CovShortLoopResolver : public ShortLoopResolver {
-public:
- CovShortLoopResolver(const conj_graph_pack& gp)
- : ShortLoopResolver(gp.g), gp_(gp) {
-
- }
-
- void ResolveShortLoop(BidirectionalPath& path) const override {
- DEBUG("resolve short loop by coverage");
- path.Print();
-
- pair<EdgeId, EdgeId> edges;
- if (path.Size() >= 1 && GetLoopAndExit(g_, path.Back(), edges)) {
- DEBUG("Coverage Short Loop Resolver");
- UndoCycles(path, edges.first);
- EdgeId e1 = path.Back();
- EdgeId e2 = edges.first;
- EdgeId e_out = edges.second;
- auto prob_e_in = g_.IncomingEdges(g_.EdgeEnd(e2));
- EdgeId e_in = *prob_e_in.begin();
- size_t count = 0;
- for (auto edge = prob_e_in.begin(); edge != prob_e_in.end(); ++edge) {
- if (*edge != e2)
- e_in = *edge;
- count++;
- }
- if (count != 2) {
- return;
- }
- double in_cov = gp_.flanking_cov.GetOutCov(e_in); //g_.coverage(e_in);
- double out_cov = gp_.flanking_cov.GetInCov(e_out); //g_.coverage(e_out);
- double cov = (in_cov + out_cov) / 2.0;
- double time1 = math::round(gp_.flanking_cov.GetInCov(e1) / cov);//math::round(gp_.g.coverage(e1) / cov);
- double time2 = math::round(gp_.flanking_cov.GetInCov(e2) / cov);////math::round(gp_.g.coverage(e2) / cov);
- size_t time = (size_t) std::max(0.0, std::min(time1 - 1.0, time2));
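- //The number of extra loop traversals is estimated from the ratio of the loop edges' flanking coverage to the mean entry/exit coverage.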
- for (size_t i = 0; i < time; ++i) {
- MakeCycleStep(path, edges.first);
- }
- path.PushBack(edges.second);
- DEBUG("loop with start " << g_.int_id(e_in)
- <<" e1 " << g_.int_id(e1)
- << " e2 " << g_.int_id(e2)
- << " out " <<g_.int_id(e_out)
- << " cov in = " << in_cov
- << " cov out " << out_cov
- << " cov " << cov
- << " cov e1 = " << gp_.g.coverage(e1)
- << " cov e2 = " << gp_.g.coverage(e2)
- << " time1 = " << time1
- << " time2 = " << time2
- << " time = " << time);
- }
- }
-private:
- const conj_graph_pack& gp_;
-};
-
-class SimpleLoopResolver : public ShortLoopResolver {
-
-public:
- SimpleLoopResolver(Graph& g) : ShortLoopResolver(g) { }
-
- void ResolveShortLoop(BidirectionalPath& path) const override {
- pair<EdgeId, EdgeId> edges;
- if (path.Size() >= 1 && GetLoopAndExit(g_, path.Back(), edges)) {
- DEBUG("Resolving short loop...");
- EdgeId e = path.Back();
- path.PushBack(edges.first);
- path.PushBack(e);
- path.PushBack(edges.second);
- DEBUG("Resolving short loop done");
- }
- }
-
-protected:
- DECL_LOGGER("PathExtender")
-};
-
-class LoopResolver : public ShortLoopResolver {
- static const size_t ITER_COUNT = 10;
- const WeightCounter& wc_;
-
-public:
- LoopResolver(const Graph& g, const WeightCounter& wc)
- : ShortLoopResolver(g),
- wc_(wc) { }
-
- void MakeBestChoice(BidirectionalPath& path, pair<EdgeId, EdgeId>& edges) const {
- UndoCycles(path, edges.first);
- BidirectionalPath experiment(path);
- double max_weight = wc_.CountWeight(experiment, edges.second);
- double diff = max_weight - wc_.CountWeight(experiment, edges.first);
- size_t maxIter = 0;
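- //Try up to ITER_COUNT additional loop iterations and keep the count that maximizes the paired weight of the exit edge (edges.second).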
- for (size_t i = 1; i <= ITER_COUNT; ++i) {
- double weight = wc_.CountWeight(experiment, edges.first);
- if (weight > 0) {
- MakeCycleStep(experiment, edges.first);
- weight = wc_.CountWeight(experiment, edges.second);
- double weight2 = wc_.CountWeight(experiment, edges.first);
- if (weight > max_weight || (weight == max_weight && weight - weight2 > diff)
- || (weight == max_weight && weight - weight2 == diff && i == 1)) {
- max_weight = weight;
- maxIter = i;
- diff = weight - weight2;
- }
- }
- }
- for (size_t i = 0; i < maxIter; ++i) {
- MakeCycleStep(path, edges.first);
- }
- path.PushBack(edges.second);
- }
-
- void ResolveShortLoop(BidirectionalPath& path) const override {
- pair<EdgeId, EdgeId> edges;
- if (path.Size() >=1 && GetLoopAndExit(g_, path.Back(), edges)) {
- DEBUG("Resolving short loop...");
- MakeBestChoice(path, edges);
- DEBUG("Resolving short loop done");
- }
- }
-};
-
-class GapJoiner {
-
-public:
- static const int INVALID_GAP = -1000000;
- GapJoiner(const Graph& g)
- : g_(g) { }
-
- virtual Gap FixGap( EdgeId source, EdgeId sink, int initial_gap) const = 0;
-
- virtual ~GapJoiner() { }
-protected:
- const Graph& g_;
-};
-
-class SimpleGapJoiner : public GapJoiner {
-
-public:
- SimpleGapJoiner(const Graph& g) : GapJoiner(g) { }
-
- Gap FixGap(EdgeId source, EdgeId sink, int initial_gap) const override {
- if (initial_gap > 2 * (int) g_.k()) {
- return Gap(initial_gap);
- }
- for (int l = (int) g_.k(); l > 0; --l) {
- if (g_.EdgeNucls(source).Subseq(g_.length(source) + g_.k() - l) == g_.EdgeNucls(sink).Subseq(0, l)) {
- DEBUG("Found correct gap length");
- DEBUG("Inintial: " << initial_gap << ", new gap: " << g_.k() - l);
- return Gap((int) g_.k() - l);
- }
- }
- DEBUG("Perfect overlap is not found, inintial: " << initial_gap);
- return Gap(initial_gap);
- }
-};
-
-class HammingGapJoiner: public GapJoiner {
- const double min_gap_score_;
- const size_t short_overlap_threshold_;
- const size_t basic_overlap_length_;
-
- vector<size_t> DiffPos(const Sequence& s1, const Sequence& s2) const {
- VERIFY(s1.size() == s2.size());
- vector<size_t> answer;
- for (size_t i = 0; i < s1.size(); ++i)
- if (s1[i] != s2[i])
- answer.push_back(i);
- return answer;
- }
-
- size_t HammingDistance(const Sequence& s1, const Sequence& s2) const {
- VERIFY(s1.size() == s2.size());
- size_t dist = 0;
- for (size_t i = 0; i < s1.size(); ++i) {
- if (s1[i] != s2[i]) {
- dist++;
- }
- }
- return dist;
- }
-
-// double ScoreGap(const Sequence& s1, const Sequence& s2, int gap, int initial_gap) const {
-// VERIFY(s1.size() == s2.size());
-// return 1.0 - (double) HammingDistance(s1, s2) / (double) s1.size()
-// - (double) abs(gap - initial_gap) / (double) (2 * g_.k());
-// }
-
-
- double ScoreGap(const Sequence& s1, const Sequence& s2) const {
- VERIFY(s1.size() == s2.size());
- return 1.0 - (double) HammingDistance(s1, s2) / (double) s1.size();
- }
-
-public:
-
- //todo review parameters in usages
- HammingGapJoiner(const Graph& g,
- double min_gap_score,
- size_t short_overlap_threshold,
- size_t basic_overlap_length):
- GapJoiner(g),
- min_gap_score_(min_gap_score),
- short_overlap_threshold_(short_overlap_threshold),
- basic_overlap_length_(basic_overlap_length)
- {
- DEBUG("HammingGapJoiner params: \n min_gap_score " << min_gap_score_ <<
- "\n short_overlap_threshold " << short_overlap_threshold_ <<
- "\n basic_overlap_length " << basic_overlap_length_);
- }
-
- //estimated_gap is in k-mers
- Gap FixGap(EdgeId source, EdgeId sink, int estimated_gap) const override {
-
- size_t corrected_start_overlap = basic_overlap_length_;
- if (estimated_gap < 0) {
- corrected_start_overlap -= estimated_gap;
- }
-
- corrected_start_overlap = min(corrected_start_overlap,
- g_.k() + min(g_.length(source), g_.length(sink)));
-
- DEBUG("Corrected max overlap " << corrected_start_overlap);
-
- double best_score = min_gap_score_;
- int fixed_gap = INVALID_GAP;
-
- double overlap_coeff = 0.3;
- size_t min_overlap = 1ul;
- if (estimated_gap < 0) {
- size_t estimated_overlap = g_.k() - estimated_gap;
- min_overlap = max(size_t(math::round(overlap_coeff * double(estimated_overlap))), 1ul);
- }
- //todo better usage of estimated overlap
- DEBUG("Min overlap " << min_overlap);
-
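- //Scan candidate overlap lengths from longest to shortest, scoring each by Hamming identity; once a candidate is found, do not descend below the short-overlap threshold.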
- for (size_t l = corrected_start_overlap; l >= min_overlap; --l) {
- //TRACE("Sink: " << g_.EdgeNucls(sink).Subseq(g_.length(sink) + g_.k() - l).str());
- //TRACE("Source: " << g_.EdgeNucls(source).Subseq(0, l));
- double score = 0;
- score = ScoreGap(g_.EdgeNucls(source).Subseq(g_.length(source) + g_.k() - l),
- g_.EdgeNucls(sink).Subseq(0, l));
- if (math::gr(score, best_score)) {
- TRACE("Curr overlap " << l);
- TRACE("Score: " << score);
- best_score = score;
- fixed_gap = int(g_.k() - l);
- }
-
- if (l == short_overlap_threshold_ && fixed_gap != INVALID_GAP) {
- //look at "short" overlaps only if long overlaps couldn't be found
- DEBUG("Not looking at short overlaps");
- break;
- }
- }
-
- if (fixed_gap != INVALID_GAP) {
- DEBUG("Found candidate gap length with score " << best_score);
- DEBUG("Estimated gap: " << estimated_gap <<
- ", fixed gap: " << fixed_gap << " (overlap " << g_.k() - fixed_gap<< ")");
- }
- return Gap(fixed_gap);
- }
-
-private:
- DECL_LOGGER("HammingGapJoiner");
-};
-
-//deprecated!
-//fixme reduce code duplication with HammingGapJoiner
-class LikelihoodHammingGapJoiner: public GapJoiner {
- static const size_t DEFAULT_PADDING_LENGTH = 10;
- const double min_gap_score_;
- const size_t short_overlap_threshold_;
- const size_t basic_overlap_length_;
-
- vector<size_t> DiffPos(const Sequence& s1, const Sequence& s2) const {
- VERIFY(s1.size() == s2.size());
- vector<size_t> answer;
- for (size_t i = 0; i < s1.size(); ++i)
- if (s1[i] != s2[i])
- answer.push_back(i);
- return answer;
- }
-
- size_t HammingDistance(const Sequence& s1, const Sequence& s2) const {
- VERIFY(s1.size() == s2.size());
- size_t dist = 0;
- for (size_t i = 0; i < s1.size(); ++i) {
- if (s1[i] != s2[i]) {
- dist++;
- }
- }
- return dist;
- }
-
-// double ScoreGap(const Sequence& s1, const Sequence& s2, int gap, int initial_gap) const {
-// VERIFY(s1.size() == s2.size());
-// return 1.0 - (double) HammingDistance(s1, s2) / (double) s1.size()
-// - (double) abs(gap - initial_gap) / (double) (2 * g_.k());
-// }
-
- //FIXME use GC content, change match prob and use partition of tip sequence into bad and good part
- double ScoreGap(const Sequence& s1, const Sequence& s2) const {
- static double match_prob = 0.9;
- static double log_match_prob = log2(match_prob);
- static double log_mismatch_prob = log2(1. - match_prob);
- VERIFY(s1.size() == s2.size());
- size_t n = s1.size();
- size_t mismatches = HammingDistance(s1, s2);
- VERIFY(mismatches <= n);
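- //log2-likelihood of the observed matches/mismatches under a fixed match probability, offset by 2*n;
- //e.g. (illustrative numbers) n = 100 with 5 mismatches scores about 200 - 14.4 - 16.6 = 169.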
- return 2.*double(n) + double(n - mismatches) * log_match_prob + double(mismatches) * log_mismatch_prob;
- }
-
-public:
-
- //todo review parameters in usages
- LikelihoodHammingGapJoiner(const Graph& g,
- double min_gap_score,
- size_t short_overlap_threshold,
- size_t basic_overlap_length):
- GapJoiner(g),
- min_gap_score_(min_gap_score),
- short_overlap_threshold_(short_overlap_threshold),
- basic_overlap_length_(basic_overlap_length)
- {
- DEBUG("LikelihoodHammingGapJoiner params: \n min_gap_score " << min_gap_score_ <<
- "\n short_overlap_threshold " << short_overlap_threshold_ <<
- "\n basic_overlap_length " << basic_overlap_length_);
- }
-
- //estimated_gap is in k-mers
- Gap FixGap(EdgeId source, EdgeId sink, int estimated_gap) const override {
-
- size_t corrected_start_overlap = basic_overlap_length_;
- if (estimated_gap < 0) {
- corrected_start_overlap -= estimated_gap;
- }
-
- corrected_start_overlap = min(corrected_start_overlap,
- g_.k() + min(g_.length(source), g_.length(sink)));
-
- DEBUG("Corrected max overlap " << corrected_start_overlap);
-
- double best_score = min_gap_score_;
- int fixed_gap = INVALID_GAP;
-
- double overlap_coeff = 0.3;
- size_t min_overlap = 1ul;
- if (estimated_gap < 0) {
- size_t estimated_overlap = g_.k() - estimated_gap;
- min_overlap = max(size_t(math::round(overlap_coeff * double(estimated_overlap))), 1ul);
- }
- //todo better usage of estimated overlap
- DEBUG("Min overlap " << min_overlap);
-
- for (size_t l = corrected_start_overlap; l >= min_overlap; --l) {
- //TRACE("Sink: " << g_.EdgeNucls(sink).Subseq(g_.length(sink) + g_.k() - l).str());
- //TRACE("Source: " << g_.EdgeNucls(source).Subseq(0, l));
- double score = 0;
- score = ScoreGap(g_.EdgeNucls(source).Subseq(g_.length(source) + g_.k() - l),
- g_.EdgeNucls(sink).Subseq(0, l));
- if (math::gr(score, best_score)) {
- TRACE("Curr overlap " << l);
- TRACE("Score: " << score);
- best_score = score;
- fixed_gap = int(g_.k() - l);
- }
-
- if (l == short_overlap_threshold_ && fixed_gap != INVALID_GAP) {
- //look at "short" overlaps only if long overlaps couldn't be found
- DEBUG("Not looking at short overlaps");
- break;
- }
- }
-
- if (fixed_gap != INVALID_GAP) {
- DEBUG("Found candidate gap length with score " << best_score);
- DEBUG("Estimated gap: " << estimated_gap <<
- ", fixed gap: " << fixed_gap << " (overlap " << g_.k() - fixed_gap<< ")");
- }
- return Gap(fixed_gap);
- }
-
-private:
- DECL_LOGGER("LikelihoodHammingGapJoiner");
-};
-
-//if I were in LA
-class LAGapJoiner: public GapJoiner {
-public:
- LAGapJoiner(const Graph& g, size_t min_la_length,
- double flank_multiplication_coefficient,
- double flank_addition_coefficient) :
- GapJoiner(g), min_la_length_(min_la_length), flank_addition_coefficient_(
- flank_addition_coefficient), flank_multiplication_coefficient_(
- flank_multiplication_coefficient) {
- DEBUG("flank_multiplication_coefficient - " << flank_multiplication_coefficient_); DEBUG("flank_addition_coefficient_ - " << flank_addition_coefficient_ );
- }
-
- Gap FixGap(EdgeId source, EdgeId sink, int initial_gap) const override {
-
- DEBUG("Overlap doesn't exceed " << size_t(abs(initial_gap) * ESTIMATED_GAP_MULTIPLIER) + GAP_ADDITIONAL_COEFFICIENT);
- SWOverlapAnalyzer overlap_analyzer(
- size_t(abs(initial_gap) * ESTIMATED_GAP_MULTIPLIER) + GAP_ADDITIONAL_COEFFICIENT);
-
- auto overlap_info = overlap_analyzer.AnalyzeOverlap(g_, source,
- sink);
-
- DEBUG(overlap_info);
-
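- //Reject the alignment if it is too short, its unaligned flanks are too long relative to its size, its identity is too low, or accepting it would trim an edge below k.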
- if (overlap_info.size() < min_la_length_) {
- DEBUG("Low alignment size");
- return Gap(INVALID_GAP);
- }
-
- size_t max_flank_length = max(overlap_info.r2.start_pos,
- g_.length(source) + g_.k() - overlap_info.r1.end_pos);
- DEBUG("Max flank length - " << max_flank_length);
-
- if ((double) max_flank_length * flank_multiplication_coefficient_
- + flank_addition_coefficient_ > overlap_info.size()) {
- DEBUG("Too long flanks for such alignment");
- return Gap(INVALID_GAP);
- }
-
- if (overlap_info.identity() < IDENTITY_RATIO) {
- DEBUG("Low identity score");
- return Gap(INVALID_GAP);
- }
-
- if ((g_.length(source) + g_.k()) - overlap_info.r1.end_pos > g_.length(source)) {
- DEBUG("Save kmers. Don't want to have edges shorter than k");
- return Gap(INVALID_GAP);
- }
-
- if (overlap_info.r2.start_pos > g_.length(sink)) {
- DEBUG("Save kmers. Don't want to have edges shorter than k");
- return Gap(INVALID_GAP);
- }
-
- return Gap(
- (int) (-overlap_info.r1.size() - overlap_info.r2.start_pos
- + g_.k()),
- (uint32_t) (g_.length(source) + g_.k()
- - overlap_info.r1.end_pos),
- (uint32_t) overlap_info.r2.start_pos);
- }
-
-private:
- DECL_LOGGER("LAGapJoiner");
- const size_t min_la_length_;
- const double flank_addition_coefficient_;
- const double flank_multiplication_coefficient_;
- constexpr static double IDENTITY_RATIO = 0.9;
- constexpr static double ESTIMATED_GAP_MULTIPLIER = 2.0;
- const size_t GAP_ADDITIONAL_COEFFICIENT = 30;
-};
-
-
-class CompositeGapJoiner: public GapJoiner {
-public:
-
- CompositeGapJoiner(const Graph& g,
- const vector<shared_ptr<GapJoiner>>& joiners,
- size_t may_overlap_threshold,
- int must_overlap_threshold,
- size_t artificial_gap) :
- GapJoiner(g),
- joiners_(joiners),
- may_overlap_threshold_(may_overlap_threshold),
- must_overlap_threshold_(must_overlap_threshold),
- artificial_gap_(artificial_gap)
- { }
-
- Gap FixGap(EdgeId source, EdgeId sink, int estimated_gap) const override {
- DEBUG("Trying to fix estimated gap " << estimated_gap <<
- " between " << g_.str(source) << " and " << g_.str(sink));
-
- if (estimated_gap > int(g_.k() + may_overlap_threshold_)) {
- DEBUG("Edges are supposed to be too far to check overlaps");
- return Gap(estimated_gap);
- }
-
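- //Delegate to the configured joiners in priority order; the first one returning a valid gap wins.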
- for (auto joiner : joiners_) {
- Gap gap = joiner->FixGap(source, sink, estimated_gap);
- if (gap.gap_ != GapJoiner::INVALID_GAP) {
- return gap;
- }
- }
-
- //couldn't find decent overlap
- if (estimated_gap < must_overlap_threshold_) {
- DEBUG("Estimated gap looks unreliable");
- return Gap(INVALID_GAP);
- } else {
- DEBUG("Overlap was not found");
- return Gap(max(estimated_gap, int(g_.k() + artificial_gap_)));
- }
- }
-
-private:
- vector<shared_ptr<GapJoiner>> joiners_;
- const size_t may_overlap_threshold_;
- const int must_overlap_threshold_;
- const size_t artificial_gap_;
-
- DECL_LOGGER("CompositeGapJoiner");
-};
-
-//FIXME move to tests
-//Just for tests. See overlap_analysis_tests.
-inline Gap MimicLAGapJoiner(Sequence& s1, Sequence& s2) {
- const int INVALID_GAP = -1000000;
- constexpr static double IDENTITY_RATIO = 0.9;
-
- SWOverlapAnalyzer overlap_analyzer_(10000);
- auto overlap_info = overlap_analyzer_.AnalyzeOverlap(s1, s2);
- size_t min_la_length_ = 4;
- if (overlap_info.size() < min_la_length_) {
- DEBUG("Low alignment size");
- return Gap(INVALID_GAP);
- }
- if (overlap_info.identity() < IDENTITY_RATIO) {
- DEBUG("Low identity score");
- return Gap(INVALID_GAP);
- }
- std::cout << overlap_info;
-
- return Gap(
- (int) (-overlap_info.r1.size() - overlap_info.r2.start_pos),
- (uint32_t) (s1.size() - overlap_info.r1.end_pos),
- (uint32_t) overlap_info.r2.start_pos);
-}
-
-
-//Detects a cycle as a minimal suffix longer than IS that also occurs earlier in the path. Overlap is allowed.
-class InsertSizeLoopDetector {
-protected:
- const Graph& g_;
- const GraphCoverageMap& cov_map_;
- size_t min_cycle_len_;
-
-public:
- InsertSizeLoopDetector(const Graph& g, const GraphCoverageMap& cov_map, size_t is): g_(g), cov_map_(cov_map), min_cycle_len_(is) {
- }
-
- size_t GetMinCycleLenth() const {
- return min_cycle_len_;
- }
-
- bool CheckCycledNonIS(const BidirectionalPath& path) const {
- if (path.Size() <= 2) {
- return false;
- }
- BidirectionalPath last = path.SubPath(path.Size() - 2);
- int pos = path.FindFirst(last);
- VERIFY(pos >= 0);
- return size_t(pos) != path.Size() - 2;
- }
-
- bool CheckCycled(const BidirectionalPath& path) const {
- return FindCycleStart(path) != -1;
- }
-//first suffix longer than min_cycle_len
- int FindPosIS(const BidirectionalPath& path) const {
- int i = (int) path.Size() - 1;
- while (i >= 0 && path.LengthAt(i) < min_cycle_len_) {
- --i;
- }
- return i;
- }
- int FindCycleStart(const BidirectionalPath& path) const {
- TRACE("Looking for IS cycle " << min_cycle_len_);
- int i = FindPosIS(path);
- TRACE("last is pos " << i);
- if (i < 0) return -1;
-//Tail
- BidirectionalPath last = path.SubPath(i);
- //last.Print();
-
- int pos = path.FindFirst(last);
-// not a cycle
- if (pos == i) pos = -1;
- TRACE("looking for 1sr IS cycle " << pos);
- return pos;
- }
-
-//After a cycle is detected, removes the minimal suffix longer than IS.
-//Returns the beginning of the cycle.
- int RemoveCycle(BidirectionalPath& path) const {
- int pos = FindCycleStart(path);
- DEBUG("Found IS cycle " << pos);
- if (pos == -1) {
- return -1;
- }
-
- int last_edge_pos = FindPosIS(path);
- VERIFY(last_edge_pos > -1);
- DEBUG("last edge pos " << last_edge_pos);
- VERIFY(last_edge_pos > pos);
- for (int i = (int) path.Size() - 1; i >= last_edge_pos; --i) {
- path.PopBack();
- }
- VERIFY((int) path.Size() == last_edge_pos);
- VERIFY(pos < (int) path.Size());
- DEBUG("result pos " <<pos);
- return pos;
- }
-};
-
-class RepeatDetector {
-public:
- RepeatDetector(const Graph& g, const GraphCoverageMap& cov_map, size_t max_repeat_len)
- : g_(g),
- cov_map_(cov_map),
- used_paths_(),
- repeat_len_(max_repeat_len){
- empty_ = new BidirectionalPath(g_);
- }
- ~RepeatDetector() {
- delete empty_;
- }
-
- BidirectionalPath* RepeatPath(const BidirectionalPath& p) {
- if (p.Size() == 0) {
- return empty_;
- }
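- //Among previously processed paths covering the last edge, look for one whose common suffix with p is longer than the repeat length threshold.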
- EdgeId last_e = p.Back();
- BidirectionalPathSet cov_paths = cov_map_.GetCoveringPaths(last_e);
- DEBUG("cov paths for e " << g_.int_id(last_e) << " size " << cov_paths.size());
- size_t max_common_size = 0;
- BidirectionalPath* result_p = empty_;
- for (BidirectionalPath* cov_p : cov_paths) {
- if (used_paths_.find(cov_p) == used_paths_.end() || cov_p == &p || cov_p == p.GetConjPath()) {
- continue;
- }
- size_t common_size = MaxCommonSize(p, *cov_p);
- DEBUG("max comon size with path " << cov_p->GetId() << " is " << common_size);
- if (common_size == 0) {
- continue;
- }
- VERIFY(common_size <= p.Size());
- if (p.LengthAt(p.Size() - common_size) > repeat_len_) {
- DEBUG("repeat from " << (p.Size() - common_size) << " length " << p.LengthAt(p.Size() - common_size) << " repeat length " << repeat_len_);
- max_common_size = max(common_size, max_common_size);
- result_p = cov_p;
- }
- }
- used_paths_.insert(&p);
- DEBUG("max common size " << max_common_size);
- return result_p;
- }
- size_t MaxCommonSize(const BidirectionalPath& p1, const BidirectionalPath& p2) const {
- DEBUG("max coomon size ")
- EdgeId last_e = p1.Back();
- vector<size_t> positions2 = p2.FindAll(last_e);
- DEBUG("pos size " << positions2.size())
- size_t max_common_size = 0;
- for (size_t pos2 : positions2) {
- size_t common_size = MaxCommonSize(p1, p1.Size() - 1, p2, pos2);
- DEBUG("max common size from " << pos2 << " is " << common_size);
- max_common_size = max(max_common_size, common_size);
- }
- return max_common_size;
- }
-private:
- size_t MaxCommonSize(const BidirectionalPath& p1, size_t pos1, const BidirectionalPath& p2, size_t pos2) const {
- int i1 = (int) pos1;
- int i2 = (int) pos2;
- while (i1 >= 0 && i2 >= 0 &&
- p1.At((size_t) i1) == p2.At((size_t) i2) &&
- p1.GapAt((size_t) i1) == p2.GapAt((size_t) i2)) {
- i1--;
- i2--;
- }
- if (i1 >=0 && i2>=0 && p1.At((size_t) i1) == p2.At((size_t) i2)) {
- i1--;
- i2--;
- }
-
- VERIFY(i1 <= (int)pos1);
- return std::max(size_t((int) pos1 - i1), (size_t)1);
- }
- const Graph& g_;
- const GraphCoverageMap& cov_map_;
- set<const BidirectionalPath*> used_paths_;
- size_t repeat_len_;
- BidirectionalPath* empty_;
-};
-
-class ContigsMaker {
-public:
- ContigsMaker(const Graph & g)
- : g_(g) { }
-
- virtual ~ContigsMaker() { }
-
- virtual void GrowPath(BidirectionalPath& path) = 0;
-
- virtual void GrowPathSimple(BidirectionalPath& path) = 0;
-
- virtual void GrowAll(PathContainer & paths, PathContainer * result) = 0;
-
-protected:
- const Graph& g_;
- DECL_LOGGER("PathExtender")
-};
-
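-//Tracks unique edges (and their conjugates) already placed into some path, so that 2015 scaffolding modes never reuse them.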
-struct UsedUniqueStorage {
- set<EdgeId> used_;
-
- shared_ptr<ScaffoldingUniqueEdgeStorage> unique_;
- void insert(EdgeId e) {
- if (unique_->IsUnique(e)) {
- used_.insert(e);
- used_.insert(e->conjugate());
- }
- }
- bool IsUsedAndUnique(EdgeId e) {
- return (unique_->IsUnique(e) && used_.find(e) != used_.end());
- }
- UsedUniqueStorage(shared_ptr<ScaffoldingUniqueEdgeStorage> unique): used_(), unique_(unique) {}
-};
-class PathExtender {
-public:
- PathExtender(const Graph & g): g_(g){ }
- virtual ~PathExtender() { }
- virtual bool MakeGrowStep(BidirectionalPath& path) = 0;
- void AddUniqueEdgeStorage(shared_ptr<UsedUniqueStorage> used_storage) {
- used_storage_ = used_storage;
- }
-protected:
- const Graph& g_;
- shared_ptr<UsedUniqueStorage> used_storage_;
- DECL_LOGGER("PathExtender")
-};
-
-class CompositeExtender : public ContigsMaker {
-public:
- CompositeExtender(Graph & g, GraphCoverageMap& cov_map, size_t max_diff_len)
- : ContigsMaker(g),
- cover_map_(cov_map),
- repeat_detector_(g, cover_map_, 2 * cfg::get().max_repeat_length), //TODO: move to config
- extenders_(),
- max_diff_len_(max_diff_len) {
- }
-
- CompositeExtender(Graph & g, GraphCoverageMap& cov_map, vector<shared_ptr<PathExtender> > pes, size_t max_diff_len, shared_ptr<ScaffoldingUniqueEdgeStorage> unique)
- : ContigsMaker(g),
- cover_map_(cov_map),
- repeat_detector_(g, cover_map_, 2 * cfg::get().max_repeat_length), //TODO: move to config
- extenders_(),
- max_diff_len_(max_diff_len) {
- extenders_ = pes;
- used_storage_ = make_shared<UsedUniqueStorage>(UsedUniqueStorage( unique));
- for (auto ex: extenders_) {
- ex->AddUniqueEdgeStorage(used_storage_);
- }
- }
-
- void AddExtender(shared_ptr<PathExtender> pe) {
- extenders_.push_back(pe);
- pe->AddUniqueEdgeStorage(used_storage_);
- }
-
- virtual void GrowAll(PathContainer& paths, PathContainer * result) {
- result->clear();
- PathContainer usedPaths;
- GrowAll(paths, usedPaths, result);
- LengthPathFilter filter(g_, 0);
- filter.filter(*result);
- }
-
- virtual void GrowPath(BidirectionalPath& path) {
- while (MakeGrowStep(path)) { }
- }
-
- virtual void GrowPathSimple(BidirectionalPath& path) {
- while (MakeGrowStep(path, false)) { }
- }
-
- bool MakeGrowStep(BidirectionalPath& path, bool detect_repeats_online = true) {
- DEBUG("make grow step composite extender");
- auto sc_mode = cfg::get().pe_params.param_set.sm;
- if (is_2015_scaffolder_enabled(sc_mode)) {
- DEBUG("force switch off online repeats detect, 2015 on");
- detect_repeats_online = false;
- }
- if (detect_repeats_online) {
- BidirectionalPath *repeat_path = repeat_detector_.RepeatPath(path);
- size_t repeat_size = repeat_detector_.MaxCommonSize(path, *repeat_path);
-
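- //If the path ends with a sufficiently long repeat shared with a previously grown path, stop growing and redistribute the repeat between the two paths based on how their prefixes compare.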
- if (repeat_size > 0) {
- DEBUG("repeat with length " << repeat_size);
- path.Print();
- repeat_path->Print();
- BidirectionalPath repeat = path.SubPath(path.Size() - repeat_size);
- int begin_repeat = repeat_path->FindLast(repeat);
- VERIFY(begin_repeat > -1);
- size_t end_repeat = (size_t) begin_repeat + repeat_size;
- DEBUG("not consistent subpaths ");
- BidirectionalPath begin1 = path.SubPath(0, path.Size() - repeat_size);
- begin1.Print();
- BidirectionalPath begin2 = repeat_path->SubPath(0, begin_repeat);
- begin2.Print();
- int gap_in_repeat_path = repeat_path->GapAt(begin_repeat);
- BidirectionalPath end2 = repeat_path->SubPath(end_repeat);
- BidirectionalPath begin1_conj = path.SubPath(0, path.Size() - repeat_size + 1).Conjugate();
- BidirectionalPath begin2_conj = repeat_path->SubPath(0, begin_repeat + 1).Conjugate();
- pair<size_t, size_t> last = ComparePaths(0, 0, begin1_conj, begin2_conj, max_diff_len_);
- DEBUG("last " << last.first << " last2 " << last.second);
- path.Clear();
- repeat_path->Clear();
- int gap_len = repeat.GapAt(0);
-
- if (begin2.Size() == 0 || last.second != 0) { //TODO: incorrect: common edges, but then different ends
- path.PushBack(begin1);
- repeat_path->PushBack(begin2);
- } else {
- gap_len = gap_in_repeat_path;
- path.PushBack(begin2);
- repeat_path->PushBack(begin1);
- }
-
- path.PushBack(repeat.At(0), gap_len);
- path.PushBack(repeat.SubPath(1));
- path.PushBack(end2);
- DEBUG("new path");
- path.Print();
- return false;
- }
- }
-
- size_t current = 0;
- while (current < extenders_.size()) {
- DEBUG("step " << current << " from " <<extenders_.size());
- if (extenders_[current]->MakeGrowStep(path)) {
- return true;
- }
- ++current;
- }
- return false;
- }
-
-private:
- GraphCoverageMap& cover_map_;
- RepeatDetector repeat_detector_;
- vector<shared_ptr<PathExtender> > extenders_;
- size_t max_diff_len_;
- shared_ptr<UsedUniqueStorage> used_storage_;
- void SubscribeCoverageMap(BidirectionalPath * path) {
- path->Subscribe(&cover_map_);
- for (size_t i = 0; i < path->Size(); ++i) {
- cover_map_.BackEdgeAdded(path->At(i), path, path->GapAt(i));
- }
- }
-
- void GrowAll(PathContainer& paths, PathContainer& usedPaths, PathContainer * result) {
- cover_map_.Clear();
- for (size_t i = 0; i < paths.size(); ++i) {
- VERBOSE_POWER_T2(i, 100, "Processed " << i << " paths from " << paths.size() << " (" << i * 100 / paths.size() << "%)");
- if (paths.size() > 10 && i % (paths.size() / 10 + 1) == 0) {
- INFO("Processed " << i << " paths from " << paths.size() << " (" << i * 100 / paths.size() << "%)");
- }
-//In 2015 modes, do not use a seed that has already been used in other paths.
- auto sc_mode = cfg::get().pe_params.param_set.sm;
- if (sc_mode == sm_old_pe_2015 || sc_mode == sm_2015 || sc_mode == sm_combined) {
- bool was_used = false;
- for (size_t ind =0; ind < paths.Get(i)->Size(); ind++) {
- EdgeId eid = paths.Get(i)->At(ind);
- if (used_storage_->IsUsedAndUnique(eid)) {
- was_used = true; break;
- } else {
- used_storage_->insert(eid);
- }
- }
- if (was_used) {
- DEBUG("skipping already used seed");
- continue;
- }
- }
-//TODO: coverage_map should be exterminated
- if (!cover_map_.IsCovered(*paths.Get(i))) {
- usedPaths.AddPair(paths.Get(i), paths.GetConjugate(i));
- BidirectionalPath * path = new BidirectionalPath(*paths.Get(i));
- BidirectionalPath * conjugatePath = new BidirectionalPath(*paths.GetConjugate(i));
- result->AddPair(path, conjugatePath);
- SubscribeCoverageMap(path);
- SubscribeCoverageMap(conjugatePath);
- size_t count_trying = 0;
- size_t current_path_len = 0;
- do {
- current_path_len = path->Length();
- count_trying++;
- GrowPath(*path);
- GrowPath(*conjugatePath);
- } while (count_trying < 10 && (path->Length() != current_path_len));
- path->CheckConjugateEnd(cfg::get().max_repeat_length);
- DEBUG("result path " << path->GetId());
- path->Print();
- }
- }
- }
-
-};
-
-//All path extenders inherit from this one.
-
-class LoopDetectingPathExtender : public PathExtender {
-
-protected:
- size_t maxLoops_;
- bool investigateShortLoops_;
- bool use_short_loop_cov_resolver_;
- CovShortLoopResolver cov_loop_resolver_;
-
- vector<shared_ptr<BidirectionalPath> > visited_cycles_;
- InsertSizeLoopDetector is_detector_;
- const GraphCoverageMap& cov_map_;
-
-public:
- LoopDetectingPathExtender(const conj_graph_pack& gp, const GraphCoverageMap& cov_map, size_t max_loops, bool investigateShortLoops,
- bool use_short_loop_cov_resolver, size_t is)
- : PathExtender(gp.g),
- maxLoops_(max_loops),
- investigateShortLoops_(investigateShortLoops),
- use_short_loop_cov_resolver_(use_short_loop_cov_resolver),
- cov_loop_resolver_(gp),
- is_detector_(gp.g, cov_map, is),
- cov_map_(cov_map) {
-
- }
-
- size_t getMaxLoops() const {
- return maxLoops_;
- }
-
- bool isInvestigateShortLoops() const {
- return investigateShortLoops_;
- }
-
- void setInvestigateShortLoops(bool investigateShortLoops) {
- this->investigateShortLoops_ = investigateShortLoops;
- }
-
- void setMaxLoops(size_t maxLoops) {
- if (maxLoops != 0) {
- this->maxLoops_ = maxLoops;
- }
- }
-//seems that it is out of date
- bool InExistingLoop(const BidirectionalPath& path) {
- TRACE("Checking existing loops");
- int j = 0;
- for (auto cycle : visited_cycles_) {
- VERBOSE_POWER2(j++, "checking ");
- int pos = path.FindLast(*cycle);
- if (pos == -1)
- continue;
-
- int start_cycle_pos = pos + (int) cycle->Size();
- bool only_cycles_in_tail = true;
- int last_cycle_pos = start_cycle_pos;
- DEBUG("start_cycle pos "<< last_cycle_pos);
- for (int i = start_cycle_pos; i < (int) path.Size() - (int) cycle->Size(); i += (int) cycle->Size()) {
- if (!path.CompareFrom(i, *cycle)) {
- only_cycles_in_tail = false;
- break;
- } else {
- last_cycle_pos = i + (int) cycle->Size();
- DEBUG("last cycle pos changed " << last_cycle_pos);
- }
- }
- DEBUG("last_cycle_pos " << last_cycle_pos);
- only_cycles_in_tail = only_cycles_in_tail && cycle->CompareFrom(0, path.SubPath(last_cycle_pos));
- if (only_cycles_in_tail) {
-// seems that most of this is useless, checking
- VERIFY (last_cycle_pos == start_cycle_pos);
- DEBUG("find cycle " << last_cycle_pos);
- DEBUG("path");
- path.Print();
- DEBUG("last subpath");
- path.SubPath(last_cycle_pos).Print();
- DEBUG("cycle");
- cycle->Print();
- DEBUG("last_cycle_pos " << last_cycle_pos << " path size " << path.Size());
- VERIFY(last_cycle_pos <= (int)path.Size());
- DEBUG("last cycle pos + cycle " << last_cycle_pos + (int)cycle->Size());
- VERIFY(last_cycle_pos + (int)cycle->Size() >= (int)path.Size());
-
- return true;
- }
- }
- return false;
- }
-
- void AddCycledEdges(const BidirectionalPath& path, size_t pos) {
- if (pos >= path.Size()) {
- DEBUG("Wrong position in IS cycle");
- return;
- }
- visited_cycles_.push_back(std::make_shared<BidirectionalPath>(path.SubPath(pos)));
- DEBUG("add cycle");
- path.SubPath(pos).Print();
- }
-
- bool DetectCycle(BidirectionalPath& path) {
- DEBUG("detect cycle");
- if (is_detector_.CheckCycled(path)) {
- DEBUG("Checking IS cycle");
- int loop_pos = is_detector_.RemoveCycle(path);
- DEBUG("Removed IS cycle");
- if (loop_pos != -1) {
- AddCycledEdges(path, loop_pos);
- return true;
- }
- }
- return false;
- }
-
- bool DetectCycleScaffolding(BidirectionalPath& path) {
- return is_detector_.CheckCycledNonIS(path);
- }
-
- virtual bool MakeSimpleGrowStep(BidirectionalPath& path) = 0;
-
- virtual bool ResolveShortLoopByCov(BidirectionalPath& path) = 0;
-
- virtual bool ResolveShortLoopByPI(BidirectionalPath& path) = 0;
-
- virtual bool CanInvestigateShortLoop() const {
- return false;
- }
-
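- //Wrapper around MakeSimpleGrowStep: paths ending in an already recorded cycle are not grown, insert-size cycles are removed, and short loops are resolved before and/or after the simple step.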
- virtual bool MakeGrowStep(BidirectionalPath& path) {
- if (InExistingLoop(path)) {
- DEBUG("in existing loop");
- return false;
- }
- bool result = false;
- LoopDetector loop_detector(&path, cov_map_);
- if (DetectCycle(path)) {
- result = false;
- } else if (path.Size() >= 1 && InvestigateShortLoop() && loop_detector.EdgeInShortLoop(path.Back()) && use_short_loop_cov_resolver_) {
- DEBUG("edge in short loop");
- result = ResolveShortLoop(path);
- } else if (InvestigateShortLoop() && loop_detector.PrevEdgeInShortLoop() && use_short_loop_cov_resolver_) {
- DEBUG("Prev edge in short loop");
- path.PopBack();
- result = ResolveShortLoop(path);
- } else {
- DEBUG("Making step");
- result = MakeSimpleGrowStep(path);
- DEBUG("Made step");
- if (DetectCycle(path)) {
- result = false;
- } else if (path.Size() >= 1 && InvestigateShortLoop() && loop_detector.EdgeInShortLoop(path.Back())) {
- DEBUG("Edge in short loop");
- result = ResolveShortLoop(path);
- } else if (InvestigateShortLoop() && loop_detector.PrevEdgeInShortLoop()) {
- DEBUG("Prev edge in short loop");
- path.PopBack();
- result = ResolveShortLoop(path);
- }
- }
- return result;
- }
-
-private:
- bool ResolveShortLoop(BidirectionalPath& p) {
- if (use_short_loop_cov_resolver_) {
- return ResolveShortLoopByCov(p);
- } else {
- return ResolveShortLoopByPI(p);
- }
- }
-
- bool InvestigateShortLoop() {
- return investigateShortLoops_ && (use_short_loop_cov_resolver_ || CanInvestigateShortLoop());
- }
-protected:
- DECL_LOGGER("LoopDetectingPathExtender")
-};
-
-class SimpleExtender: public LoopDetectingPathExtender {
-
-protected:
-
- shared_ptr<ExtensionChooser> extensionChooser_;
-
- void FindFollowingEdges(BidirectionalPath& path, ExtensionChooser::EdgeContainer * result) {
- DEBUG("Looking for the following edges")
- result->clear();
- vector<EdgeId> edges;
- DEBUG("Pushing back")
- push_back_all(edges, g_.OutgoingEdges(g_.EdgeEnd(path.Back())));
- result->reserve(edges.size());
- for (auto iter = edges.begin(); iter != edges.end(); ++iter) {
- DEBUG("Adding edge w distance " << g_.int_id(*iter));
- result->push_back(EdgeWithDistance(*iter, 0));
- }
- DEBUG("Following edges found");
- }
-
-
-public:
-
- SimpleExtender(const conj_graph_pack& gp, const GraphCoverageMap& cov_map, shared_ptr<ExtensionChooser> ec,
- size_t is, size_t max_loops, bool investigate_short_loops, bool use_short_loop_cov_resolver):
- LoopDetectingPathExtender(gp, cov_map, max_loops, investigate_short_loops, use_short_loop_cov_resolver, is),
- extensionChooser_(ec) {
- }
-
- std::shared_ptr<ExtensionChooser> GetExtensionChooser() const {
- return extensionChooser_;
- }
-
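- //Grow the path by one edge when the extension chooser leaves exactly one candidate; short loops (for weight-based choosers) and, in 2015 modes, reuse of unique edges block the extension.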
- bool MakeSimpleGrowStep(BidirectionalPath& path) override {
- if (path.Size() == 0) {
- return false;
- }
- DEBUG("Simple grow step");
- path.Print();
- ExtensionChooser::EdgeContainer candidates;
- FindFollowingEdges(path, &candidates);
- DEBUG("found candidates");
- DEBUG(candidates.size())
- if (candidates.size() == 1) {
- LoopDetector loop_detector(&path, cov_map_);
- if (!investigateShortLoops_ && (loop_detector.EdgeInShortLoop(path.Back()) or loop_detector.EdgeInShortLoop(candidates.back().e_))
- && extensionChooser_->WeightCounterBased()) {
- return false;
- }
- }
- DEBUG("more filtering");
- candidates = extensionChooser_->Filter(path, candidates);
- DEBUG("found candidates 2");
- DEBUG(candidates.size())
- if (candidates.size() == 1) {
- LoopDetector loop_detector(&path, cov_map_);
- DEBUG("loop detecor");
- if (!investigateShortLoops_ &&
- (loop_detector.EdgeInShortLoop(path.Back()) or loop_detector.EdgeInShortLoop(candidates.back().e_))
- && extensionChooser_->WeightCounterBased()) {
- return false;
- }
- DEBUG("push");
- auto sc_mode = cfg::get().pe_params.param_set.sm;
- EdgeId eid = candidates.back().e_;
-//In 2015 modes, when an already used unique edge is about to be added, it is not added and path growth stops.
-//That allows us to avoid the overlap removal hacks used earlier.
- if (is_2015_scaffolder_enabled(sc_mode)) {
- if (used_storage_->IsUsedAndUnique(eid)) {
- return false;
- } else {
- used_storage_->insert(eid);
- }
- }
- path.PushBack(eid, candidates.back().d_);
- DEBUG("push done");
- return true;
- }
- return false;
- }
-
-
- bool CanInvestigateShortLoop() const override {
- return extensionChooser_->WeightCounterBased();
- }
-
- bool ResolveShortLoopByCov(BidirectionalPath& path) override {
- LoopDetector loop_detector(&path, cov_map_);
- size_t init_len = path.Length();
- bool result = false;
- while (path.Size() >= 1 && loop_detector.EdgeInShortLoop(path.Back())) {
- cov_loop_resolver_.ResolveShortLoop(path);
- if (init_len == path.Length()) {
- return result;
- } else {
- result = true;
- }
- init_len = path.Length();
- }
- return true;
- }
-
- bool ResolveShortLoopByPI(BidirectionalPath& path) override {
- if (extensionChooser_->WeightCounterBased()) {
- LoopResolver loop_resolver(g_, extensionChooser_->wc());
- LoopDetector loop_detector(&path, cov_map_);
- size_t init_len = path.Length();
- bool result = false;
- while (path.Size() >= 1 && loop_detector.EdgeInShortLoop(path.Back())) {
- loop_resolver.ResolveShortLoop(path);
- if (init_len == path.Length()) {
- return result;
- } else {
- result = true;
- }
- init_len = path.Length();
- }
- return true;
- }
- return false;
- }
-
-protected:
- DECL_LOGGER("SimpleExtender")
-
-};
-
-class ScaffoldingPathExtender: public LoopDetectingPathExtender {
- std::shared_ptr<ExtensionChooser> extension_chooser_;
- ExtensionChooser::EdgeContainer sources_;
- std::shared_ptr<GapJoiner> gap_joiner_;
-
-//When check_sink_ is set to false, we can scaffold not only tips.
- bool check_sink_;
-
- void InitSources() {
- sources_.clear();
-
- for (auto iter = g_.ConstEdgeBegin(); !iter.IsEnd(); ++iter) {
- if (g_.IncomingEdgeCount(g_.EdgeStart(*iter)) == 0) {
- sources_.push_back(EdgeWithDistance(*iter, 0));
- }
- }
- }
-
- bool IsSink(EdgeId e) const {
- return g_.OutgoingEdgeCount(g_.EdgeEnd(e)) == 0;
- }
-
-
-public:
-
- ScaffoldingPathExtender(const conj_graph_pack& gp, const GraphCoverageMap& cov_map, std::shared_ptr<ExtensionChooser> extension_chooser,
- std::shared_ptr<GapJoiner> gap_joiner, size_t is, size_t max_loops, bool investigateShortLoops, bool check_sink = true):
- LoopDetectingPathExtender(gp, cov_map, max_loops, investigateShortLoops, false, is),
- extension_chooser_(extension_chooser),
- gap_joiner_(gap_joiner),check_sink_(check_sink)
- {
- InitSources();
- }
-
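- //Scaffolding step: only paths ending in a dead end (when check_sink_ is set) are extended, choosing among potential start edges (sources_) and fixing the gap with the gap joiner when enabled.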
- virtual bool MakeSimpleGrowStep(BidirectionalPath& path) {
- if (path.Size() < 1 || (check_sink_ && !IsSink(path.Back())) ) {
- return false;
- }
- DEBUG("scaffolding:");
- DEBUG("Simple grow step, growing path");
- path.Print();
- ExtensionChooser::EdgeContainer candidates = extension_chooser_->Filter(path, sources_);
- DEBUG("scaffolding candidates " << candidates.size() << " from sources " << sources_.size());
-
- if (candidates.size() == 1) {
- if (candidates[0].e_ == path.Back() || (cfg::get().avoid_rc_connections && candidates[0].e_ == g_.conjugate(path.Back()))) {
- return false;
- }
- BidirectionalPath temp_path(path);
- temp_path.PushBack(candidates[0].e_);
- if(this->DetectCycleScaffolding(temp_path)) {
- return false;
- }
-
- auto sc_mode = cfg::get().pe_params.param_set.sm;
- EdgeId eid = candidates.back().e_;
- if(cfg::get().pe_params.param_set.scaffolder_options.fix_gaps && check_sink_) {
- Gap gap = gap_joiner_->FixGap(path.Back(), candidates.back().e_, candidates.back().d_);
- if (gap.gap_ != GapJoiner::INVALID_GAP) {
- DEBUG("Scaffolding. PathId: " << path.GetId() << " path length: " << path.Length() <<
- ", fixed gap length: " << gap.gap_ << ", trash length: " << gap.trash_previous_ << "-" <<
- gap.trash_current_);
-
- if (is_2015_scaffolder_enabled(sc_mode)) {
- if (used_storage_->IsUsedAndUnique(eid)) {
- return false;
- } else {
- used_storage_->insert(eid);
- }
- }
- path.PushBack(eid, gap);
- return true;
- }
- else {
- DEBUG("Looks like wrong scaffolding. PathId: " << path.GetId() << " path length: " <<
- path.Length() << ", fixed gap length: " << candidates.back().d_);
- return false;
- }
- }
- else {
- DEBUG("Gap joiners off");
- DEBUG("Scaffolding. PathId: " << path.GetId() << " path length: " << path.Length() << ", fixed gap length: " << candidates.back().d_ );
- if (is_2015_scaffolder_enabled(sc_mode)) {
- if (used_storage_->IsUsedAndUnique(eid)) {
- return false;
- } else {
- used_storage_->insert(eid);
- }
- }
- path.PushBack(candidates.back().e_, candidates.back().d_);
- return true;
- }
- }
- DEBUG("scaffolding end");
- return false;
- }
-
- virtual bool ResolveShortLoopByCov(BidirectionalPath&) {
- return false;
- }
-
- virtual bool ResolveShortLoopByPI(BidirectionalPath&) {
- return false;
- }
-
- std::shared_ptr<ExtensionChooser> GetExtensionChooser() const {
- return extension_chooser_;
- }
-
-private:
- DECL_LOGGER("ScaffoldingPathExtender");
-};
-
-}
diff --git a/src/debruijn/path_extend/path_filter.hpp b/src/debruijn/path_extend/path_filter.hpp
deleted file mode 100644
index b9625a4..0000000
--- a/src/debruijn/path_extend/path_filter.hpp
+++ /dev/null
@@ -1,134 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-/*
- * path_filter.hpp
- *
- * Created on: Mar 14, 2012
- * Author: andrey
- */
-
-#ifndef PATH_FILTER_HPP_
-#define PATH_FILTER_HPP_
-
-#include "bidirectional_path.hpp"
-
-namespace path_extend {
-
-class CopyOnWritePathFilter {
-
-protected:
- Graph& g;
-
-public:
- CopyOnWritePathFilter(Graph& g_): g(g_) {
- }
-
- virtual bool predicate(BidirectionalPath& path) = 0;
-
- virtual bool conjugateOperator(bool p, bool cp) {
- return p || cp;
- }
-
- PathContainer filter(PathContainer& paths) {
- PathContainer result;
-
- for (size_t i = 0; i < paths.size(); ++i) {
- if (conjugateOperator(predicate(*paths.Get(i)), predicate(*paths.GetConjugate(i)))) {
- result.AddPair(paths.Get(i), paths.GetConjugate(i));
- }
- }
-
- return result;
- }
-
-};
-
-
-class IdFilter: public CopyOnWritePathFilter {
-
-protected:
- std::set<size_t> ids;
-
-public:
-
- IdFilter(Graph& g_, std::set<size_t> ids_): CopyOnWritePathFilter(g_), ids(ids_) {
- }
-
- virtual bool predicate(BidirectionalPath& path) {
- return ids.count(path.GetId()) > 0;
- }
-};
-
-
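-//In-place filter: a path and its conjugate are erased from the container unless the predicate holds for both of them.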
-class ErasingPathFilter {
-
-protected:
- const Graph& g;
-
-public:
- ErasingPathFilter(const Graph& g_): g(g_) {
- }
-
- virtual bool predicate(BidirectionalPath& path) = 0;
-
- virtual bool conjugateOperator(bool p, bool cp) {
- return p && cp;
- }
-
- void filter(PathContainer& paths) {
- for (PathContainer::Iterator iter = paths.begin(); iter != paths.end(); ) {
- if (!conjugateOperator(predicate(*iter.get()), predicate(*iter.getConjugate()))) {
- iter = paths.erase(iter);
- }
- else {
- ++iter;
- }
- }
- }
-
-};
-
-
-class CoveragePathFilter: public ErasingPathFilter {
-
-protected:
- double minCoverage;
-
-public:
- CoveragePathFilter(Graph& g_, double cov): ErasingPathFilter(g_), minCoverage(cov) {
-
- }
-
- virtual bool predicate(BidirectionalPath& path) {
- for (size_t i = 0; i < path.Size(); ++i) {
- if (math::ls(g.coverage(path[i]), minCoverage)) {
- return false;
- }
- }
- return true;
- }
-};
-
-
-class LengthPathFilter: public ErasingPathFilter {
-
-protected:
- size_t minLength;
-
-public:
- LengthPathFilter(const Graph& g_, size_t len): ErasingPathFilter(g_), minLength(len) {
- }
-
- virtual bool predicate(BidirectionalPath& path) {
- return path.Length() > minLength;
- }
-};
-
-}
-
-#endif /* PATH_FILTER_HPP_ */
diff --git a/src/debruijn/path_extend/path_visualizer.hpp b/src/debruijn/path_extend/path_visualizer.hpp
deleted file mode 100644
index c805ee8..0000000
--- a/src/debruijn/path_extend/path_visualizer.hpp
+++ /dev/null
@@ -1,172 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-/*
- * path_visualizer.hpp
- *
- * Created on: Mar 22, 2012
- * Author: andrey
- */
-
-#ifndef PATH_VISUALIZER_HPP_
-#define PATH_VISUALIZER_HPP_
-
-#include "bidirectional_path.hpp"
-#include "stats/debruijn_stats.hpp"
-
-namespace path_extend {
-
-using namespace debruijn_graph;
-
-template<class Graph>
-class PathGraphLabeler : public AbstractGraphLabeler<Graph> {
- typedef AbstractGraphLabeler<Graph> base;
- typedef typename Graph::EdgeId EdgeId;
- typedef typename Graph::VertexId VertexId;
-
- std::map<EdgeId, std::string> labels_;
-
-public:
- PathGraphLabeler(const Graph& g, const PathContainer& paths) : base(g) {
- for(size_t i = 0; i < paths.size(); ++i) {
- BidirectionalPath * path = paths.Get(i);
- for (size_t j = 0; j < path->Size(); ++j) {
- if (labels_.count(path->At(j)) > 0) {
- labels_[path->At(j)] += ", ";
- }
- labels_[path->At(j)] += "(" + ToString(path->GetId()) + " : " + ToString(j) + ")";
- }
-
- path = paths.GetConjugate(i);
- for (size_t j = 0; j < path->Size(); ++j) {
- if (labels_.count(path->At(j)) > 0) {
- labels_[path->At(j)] += ", ";
- }
- labels_[path->At(j)] += "(" + ToString(path->GetId()) + " : " + ToString(j) + ")";
- }
- }
- }
-
- virtual std::string label(VertexId /*vertexId*/) const {
- return "";
- }
-
- virtual std::string label(EdgeId edgeId) const {
- auto label = labels_.find(edgeId);
- return label == labels_.end() ? "" : label->second;
- }
-};
-
-
-class PathVisualizer {
-
-protected:
- bool writeLength;
- bool writePos;
-
-public:
-
- PathVisualizer(): writeLength(true), writePos(true) {
-
- }
-
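- //Writes a visualization of the whole graph in which every edge is additionally labeled with the ids and positions of the paths passing through it.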
- void writeGraphWithPathsSimple(const conj_graph_pack& gp, const string& file_name, const string& graph_name, const PathContainer& paths) const {
- INFO("Visualizing graph " << graph_name << " to file " << file_name);
- std::fstream filestr;
- filestr.open(file_name.c_str(), std::fstream::out);
-
- StrGraphLabeler<Graph> str_labeler(gp.g);
- PathGraphLabeler<Graph> path_labeler(gp.g, paths);
- CoverageGraphLabeler<Graph> cov_labler(gp.g);
- EdgePosGraphLabeler<Graph> pos_labeler(gp.g, gp.edge_pos);
-
- CompositeLabeler<Graph> composite_labeler(str_labeler, cov_labler, path_labeler, pos_labeler);
- shared_ptr<omnigraph::visualization::GraphColorer<Graph>> colorer;
- if (gp.index.IsAttached()) {
- colorer = stats::DefaultColorer(gp);
- } else {
- colorer = omnigraph::visualization::DefaultColorer(gp.g);
- }
-
- omnigraph::visualization::ComponentVisualizer<Graph> visualizer(gp.g, false);
- omnigraph::visualization::EmptyGraphLinker<Graph> linker;
- visualizer.Visualize(filestr, composite_labeler, *colorer, linker);
- filestr.close();
- INFO("Visualizing graph done");
- }
-
- void writeGraphSimple(const conj_graph_pack& gp, const string& file_name, const string& graph_name) const{
- INFO("Visualizing graph " << graph_name << " to file " << file_name);
- std::fstream filestr;
- filestr.open(file_name.c_str(), std::fstream::out);
-
- StrGraphLabeler<Graph> str_labeler(gp.g);
- EdgePosGraphLabeler<Graph> pos_labeler(gp.g, gp.edge_pos);
- CoverageGraphLabeler<Graph> cov_labler(gp.g);
- CompositeLabeler<Graph> composite_labeler(str_labeler, cov_labler, pos_labeler);
-
- shared_ptr<omnigraph::visualization::GraphColorer<Graph>> colorer;
-
- if (gp.index.IsAttached()) {
- colorer = stats::DefaultColorer(gp);
- } else {
- Path<EdgeId> empty;
- colorer = omnigraph::visualization::DefaultColorer(gp.g, empty, empty);
- }
-
- omnigraph::visualization::ComponentVisualizer<Graph> visualizer(gp.g, false);
- omnigraph::visualization::EmptyGraphLinker<Graph> linker;
- visualizer.Visualize(filestr, composite_labeler, *colorer, linker);
- filestr.close();
- INFO("Visualizing graph done");
- }
-
- void writeGraphSimple(const Graph& g, const string& file_name, const string& graph_name) const{
- INFO("Visualizing graph " << graph_name << " to file " << file_name);
- std::fstream filestr;
- filestr.open(file_name.c_str(), std::fstream::out);
-
- StrGraphLabeler<Graph> str_labeler(g);
- CoverageGraphLabeler<Graph> cov_labler(g);
- CompositeLabeler<Graph> composite_labeler(str_labeler, cov_labler);
-
- shared_ptr<omnigraph::visualization::GraphColorer<Graph>> colorer;
-
- Path<EdgeId> empty;
- colorer = omnigraph::visualization::DefaultColorer(g, empty, empty);
-
- omnigraph::visualization::ComponentVisualizer<Graph> visualizer(g, false);
- omnigraph::visualization::EmptyGraphLinker<Graph> linker;
- visualizer.Visualize(filestr, composite_labeler, *colorer, linker);
- filestr.close();
- INFO("Visualizing graph done");
- }
-
- bool isWriteLength() const
- {
- return writeLength;
- }
-
- bool isWritePos() const
- {
- return writePos;
- }
-
- void setWriteLength(bool writeLength)
- {
- this->writeLength = writeLength;
- }
-
- void setWritePos(bool writePos)
- {
- this->writePos = writePos;
- }
-};
-
-}
-
-#endif /* PATH_VISUALIZER_HPP_ */
diff --git a/src/debruijn/path_extend/pe_config_struct.cpp b/src/debruijn/path_extend/pe_config_struct.cpp
deleted file mode 100644
index cd332ac..0000000
--- a/src/debruijn/path_extend/pe_config_struct.cpp
+++ /dev/null
@@ -1,164 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#include "pe_config_struct.hpp"
-#include "config_common.hpp"
-
-namespace path_extend {
-
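-//Pattern shared by the loaders below: when 'complete' is false, keys missing from the property tree are skipped, which presumably lets mode-specific configs override only selected values.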
-void load(output_broken_scaffolds& obs, boost::property_tree::ptree const& pt, std::string const& key, bool complete) {
- if (complete || pt.find(key) != pt.not_found()) {
- std::string ep = pt.get<std::string>(key);
- obs = pe_config::output_broken_scaffolds_id(ep);
- }
-}
-
-void load(scaffolding_mode &sm, boost::property_tree::ptree const& pt, std::string const& key, bool complete) {
- if (complete || pt.find(key) != pt.not_found()) {
- std::string ep = pt.get<std::string>(key);
- sm = pe_config::scaffolding_mode_id(ep);
- }
-}
-
-void load(pe_config::ParamSetT::ScaffoldGraphParamsT& sg, boost::property_tree::ptree const& pt, bool /*complete*/) {
- using config_common::load;
- load(sg.construct, pt, "construct" );
- load(sg.output, pt, "output" );
- load(sg.min_read_count, pt, "min_read_count" );
- load(sg.graph_connectivity, pt, "graph_connectivity");
- load(sg.max_path_length, pt, "max_path_length" );
-}
-
-void load(pe_config::OutputParamsT& o, boost::property_tree::ptree const& pt, bool complete) {
- using config_common::load;
-
- load(o.write_overlaped_paths, pt, "write_overlaped_paths" , complete);
- load(o.write_paths, pt, "write_paths" , complete);
-}
-
-void load(pe_config::VisualizeParamsT& o, boost::property_tree::ptree const& pt, bool complete) {
- using config_common::load;
- load(o.print_overlaped_paths, pt, "print_overlaped_paths" , complete);
- load(o.print_paths, pt, "print_paths" , complete);
-}
-
-void load(pe_config::ParamSetT::ExtensionOptionsT& es,
- boost::property_tree::ptree const& pt, bool complete) {
- using config_common::load;
- load(es.use_default_single_threshold, pt, "use_default_single_threshold", complete);
- load(es.priority_coeff, pt, "priority_coeff", complete);
- load(es.weight_threshold, pt, "weight_threshold", complete);
- load(es.single_threshold, pt, "single_threshold", complete);
-}
-
-void load(pe_config::ParamSetT::LoopRemovalT& lr,
- boost::property_tree::ptree const& pt, bool complete) {
- using config_common::load;
- load(lr.max_loops, pt, "max_loops", complete);
- load(lr.mp_max_loops, pt, "mp_max_loops", complete);
-}
-
-void load(pe_config::ParamSetT::CoordinatedCoverageT& coord_cov,
- boost::property_tree::ptree const& pt, bool complete) {
- using config_common::load;
- load(coord_cov.max_edge_length_in_repeat, pt, "max_edge_length_repeat", complete);
- load(coord_cov.delta, pt, "delta", complete);
-}
-
-void load(pe_config::ParamSetT::ScaffolderOptionsT& so,
- boost::property_tree::ptree const& pt, bool complete)
-{
- using config_common::load;
- load(so.on , pt, "on" , complete);
- load(so.cutoff , pt, "cutoff", complete);
- load(so.rel_cutoff , pt, "rel_cutoff", complete);
- load(so.sum_threshold , pt, "sum_threshold", complete);
-
- load(so.cluster_info , pt, "cluster_info", complete);
- load(so.cl_threshold , pt, "cl_threshold", complete);
-
- load(so.fix_gaps , pt, "fix_gaps", complete);
- load(so.use_la_gap_joiner , pt, "use_la_gap_joiner", complete);
- load(so.min_gap_score , pt, "min_gap_score", complete);
- load(so.max_must_overlap , pt, "max_must_overlap", complete);
- load(so.max_can_overlap , pt, "max_can_overlap", complete);
- load(so.short_overlap , pt, "short_overlap", complete);
- load(so.artificial_gap , pt, "artificial_gap", complete);
- load(so.use_old_score , pt, "use_old_score", complete);
- load(so.min_overlap_length, pt, "min_overlap_length", complete);
- load(so.flank_addition_coefficient, pt, "flank_addition_coefficient", complete);
- load(so.flank_multiplication_coefficient, pt, "flank_multiplication_coefficient", complete);
-}
-
-void load(pe_config::ParamSetT& p, boost::property_tree::ptree const& pt, bool complete) {
- using config_common::load;
- load(p.sm, pt, "scaffolding_mode", complete);
- load(p.normalize_weight, pt, "normalize_weight", complete);
- load(p.cut_all_overlaps, pt, "cut_all_overlaps", complete);
- load(p.split_edge_length, pt, "split_edge_length", complete);
- load(p.extension_options, pt, "extension_options", complete);
- load(p.mate_pair_options, pt, "mate_pair_options", complete);
- load(p.scaffolder_options, pt, "scaffolder", complete);
- load(p.loop_removal, pt, "loop_removal", complete);
- load(p.coordinated_coverage, pt, "coordinated_coverage", complete);
- load(p.remove_overlaps, pt, "remove_overlaps", complete);
- load(p.use_coordinated_coverage, pt, "use_coordinated_coverage", complete);
- load(p.scaffolding2015, pt, "scaffolding2015", complete);
- load(p.scaffold_graph_params, pt, "scaffold_graph", complete);
-}
-
-
-void load(pe_config::LongReads& p, boost::property_tree::ptree const& pt,
- bool complete) {
- using config_common::load;
- load(p.filtering, pt, "filtering", complete);
- load(p.weight_priority, pt, "weight_priority", complete);
- load(p.unique_edge_priority, pt, "unique_edge_priority", complete);
-}
-
-void load(pe_config::ParamSetT::Scaffolding2015& p, boost::property_tree::ptree const& pt,
- bool) {
- using config_common::load;
- load(p.autodetect, pt, "autodetect");
- load(p.min_unique_length, pt, "min_unique_length");
- load(p.unique_coverage_variation, pt, "unique_coverage_variation");
-}
-
-void load(pe_config::AllLongReads& p, boost::property_tree::ptree const& pt,
- bool complete) {
- using config_common::load;
- load(p.pacbio_reads, pt, "pacbio_reads", complete);
- load(p.single_reads, pt, "single_reads", complete);
- load(p.contigs, pt, "coverage_base_rr", complete);
-}
-
-void load(pe_config::MainPEParamsT& p, boost::property_tree::ptree const& pt,
- bool complete) {
- using config_common::load;
- load(p.debug_output, pt, "debug_output", complete);
- load(p.output, pt, "output", complete);
- load(p.viz, pt, "visualize", complete);
- load(p.obs, pt, "output_broken_scaffolds", complete);
- load(p.param_set, pt, "params", complete);
- load(p.long_reads, pt, "long_reads", complete);
- if (!p.debug_output) {
- p.output.DisableAll();
- p.viz.DisableAll();
- }
- p.etc_dir = "path_extend";
-}
-
-//// main long contigs config load function
-//void load(pe_config& pe_cfg, boost::property_tree::ptree const& pt, bool complete) {
-// using config_common::load;
-//
-// load(pe_cfg.dataset_name , pt, "dataset", complete);
-// load(pe_cfg.params , pt, "pe_params", complete);
-//}
-
-};
-
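
The load() overloads removed above all follow one pattern: read a key from a boost::property_tree, but skip keys that are absent unless the `complete` flag forces a full read. A minimal stand-alone sketch of that pattern, using a toy struct and hypothetical key names rather than the real SPAdes config types:

    #include <boost/property_tree/ptree.hpp>
    #include <boost/property_tree/info_parser.hpp>
    #include <iostream>
    #include <sstream>
    #include <string>

    struct ToyScaffolderParams {     // hypothetical stand-in for a config struct
        bool   on     = false;
        double cutoff = 0.0;
    };

    // Read pt[key] into dest; a missing key is ignored unless complete == true.
    template <class T>
    void load_value(T& dest, const boost::property_tree::ptree& pt,
                    const std::string& key, bool complete) {
        if (complete || pt.find(key) != pt.not_found())
            dest = pt.get<T>(key);
    }

    int main() {
        std::istringstream cfg("on 1\ncutoff 15\n");     // INFO-format config snippet
        boost::property_tree::ptree pt;
        boost::property_tree::read_info(cfg, pt);

        ToyScaffolderParams so;
        load_value(so.on, pt, "on", true);
        load_value(so.cutoff, pt, "cutoff", true);
        load_value(so.cutoff, pt, "absent_key", false);  // silently skipped

        std::cout << so.on << " " << so.cutoff << "\n";  // prints: 1 15
        return 0;
    }

In the removed loader the same helper is applied recursively to nested subtrees such as "scaffolder", "extension_options" and "loop_removal".
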
diff --git a/src/debruijn/path_extend/pe_config_struct.hpp b/src/debruijn/path_extend/pe_config_struct.hpp
deleted file mode 100644
index 286e0bf..0000000
--- a/src/debruijn/path_extend/pe_config_struct.hpp
+++ /dev/null
@@ -1,243 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-/*
- * lc_config_struct.hpp
- *
- * Created on: Aug 16, 2011
- * Author: Alexey.Gurevich
- */
-
-#ifndef LC_CONFIG_STRUCT_HPP_
-#define LC_CONFIG_STRUCT_HPP_
-
-#include "config_singl.hpp"
-#include "cpp_utils.hpp"
-
-#include <boost/optional.hpp>
-#include <boost/property_tree/ptree_fwd.hpp>
-#include <boost/bimap.hpp>
-
-#include <string>
-#include <vector>
-
-namespace path_extend {
-
-enum output_broken_scaffolds {
- obs_none,
- obs_break_gaps,
- obs_break_all
-};
-
-enum scaffolding_mode {
- sm_old,
- sm_2015,
- sm_combined,
- sm_old_pe_2015
-};
-
-inline bool is_2015_scaffolder_enabled(const scaffolding_mode mode) {
- return (mode != sm_old);
-}
-
-// struct for path extend subproject's configuration file
-struct pe_config {
-
- typedef boost::bimap<std::string, output_broken_scaffolds> output_broken_scaffolds_id_mapping;
-
- static const output_broken_scaffolds_id_mapping FillOBSInfo() {
- output_broken_scaffolds_id_mapping::value_type info[] = {
- output_broken_scaffolds_id_mapping::value_type("none", obs_none),
- output_broken_scaffolds_id_mapping::value_type("break_gaps", obs_break_gaps),
- output_broken_scaffolds_id_mapping::value_type("break_all", obs_break_all)
- };
-
- return output_broken_scaffolds_id_mapping(info, utils::array_end(info));
- }
-
- static const output_broken_scaffolds_id_mapping& output_broken_scaffolds_info() {
- static output_broken_scaffolds_id_mapping output_broken_scaffolds_info = FillOBSInfo();
- return output_broken_scaffolds_info;
- }
-
- static const std::string& output_broken_scaffolds_name(output_broken_scaffolds obs) {
- auto it = output_broken_scaffolds_info().right.find(obs);
- VERIFY_MSG(it != output_broken_scaffolds_info().right.end(),
- "No name for output broken scaffolds mode id = " << obs);
-
- return it->second;
- }
-
- static output_broken_scaffolds output_broken_scaffolds_id(std::string name) {
- auto it = output_broken_scaffolds_info().left.find(name);
- VERIFY_MSG(it != output_broken_scaffolds_info().left.end(),
- "There is no output broken scaffolds mode with name = " << name);
-
- return it->second;
- }
-
- typedef boost::bimap<std::string, scaffolding_mode> scaffolding_mode_id_mapping;
-
- static const scaffolding_mode_id_mapping FillSMInfo() {
- scaffolding_mode_id_mapping::value_type info[] = {
- scaffolding_mode_id_mapping::value_type("old", sm_old),
- scaffolding_mode_id_mapping::value_type("2015", sm_2015),
- scaffolding_mode_id_mapping::value_type("combined", sm_combined),
- scaffolding_mode_id_mapping::value_type("old_pe_2015", sm_old_pe_2015)
- };
-
- return scaffolding_mode_id_mapping(info, utils::array_end(info));
- }
-
- static const scaffolding_mode_id_mapping& scaffolding_mode_info() {
- static scaffolding_mode_id_mapping scaffolding_mode_info = FillSMInfo();
- return scaffolding_mode_info;
- }
-
- static const std::string& scaffolding_mode_name(scaffolding_mode sm) {
- auto it = scaffolding_mode_info().right.find(sm);
- VERIFY_MSG(it != scaffolding_mode_info().right.end(),
- "No name for scaffolding mode id = " << sm);
-
- return it->second;
- }
-
- static scaffolding_mode scaffolding_mode_id(std::string name) {
- auto it = scaffolding_mode_info().left.find(name);
- VERIFY_MSG(it != scaffolding_mode_info().left.end(),
- "There is no scaffolding mode with name = " << name);
-
- return it->second;
- }
-
- struct OutputParamsT {
- bool write_overlaped_paths;
- bool write_paths;
-
- void DisableAll() {
- write_overlaped_paths = false;
- write_paths = false;
- }
- };
-
-
-
- struct VisualizeParamsT {
- bool print_overlaped_paths;
- bool print_paths;
-
- void DisableAll() {
- print_overlaped_paths = false;
- print_paths = false;
- }
- };
-
- struct ParamSetT {
- scaffolding_mode sm;
-
- bool normalize_weight;
- size_t split_edge_length;
- bool cut_all_overlaps;
-
- struct ExtensionOptionsT {
- bool use_default_single_threshold;
- double single_threshold;
- double weight_threshold;
- double priority_coeff;
- } extension_options;
-
- ExtensionOptionsT mate_pair_options;
-
-
- struct ScaffolderOptionsT {
- bool on;
- int cutoff;
- double rel_cutoff;
- double sum_threshold;
-
- bool cluster_info;
- double cl_threshold;
-
- bool fix_gaps;
- bool use_la_gap_joiner;
- double min_gap_score;
- double max_must_overlap;
- double max_can_overlap;
- int short_overlap;
- size_t artificial_gap;
-
- bool use_old_score;
-
- size_t min_overlap_length;
- double flank_addition_coefficient;
- double flank_multiplication_coefficient;
- } scaffolder_options;
-
-
- struct LoopRemovalT {
- size_t max_loops;
- size_t mp_max_loops;
- } loop_removal;
-
- bool remove_overlaps;
- bool use_coordinated_coverage;
-
- struct CoordinatedCoverageT {
- size_t max_edge_length_in_repeat;
- double delta;
- } coordinated_coverage;
- struct Scaffolding2015 {
- bool autodetect;
- size_t min_unique_length;
- double unique_coverage_variation;
- } scaffolding2015;
- struct ScaffoldGraphParamsT {
- bool construct;
- bool output;
- size_t min_read_count;
- bool graph_connectivity;
- size_t max_path_length;
- } scaffold_graph_params;
- };
-
- struct LongReads {
- double filtering;
- double weight_priority;
- double unique_edge_priority;
- };
-
- struct AllLongReads{
- LongReads single_reads;
- LongReads pacbio_reads;
- LongReads contigs;
- };
-
-
- struct MainPEParamsT {
- output_broken_scaffolds obs;
-
- bool finalize_paths;
- bool debug_output;
- std::string etc_dir;
-
- OutputParamsT output;
- VisualizeParamsT viz;
- ParamSetT param_set;
- AllLongReads long_reads;
- }; // params;
-
-};
-
-void load(pe_config::ParamSetT& p, boost::property_tree::ptree const& pt, bool complete = true);
-void load(pe_config::MainPEParamsT& p, boost::property_tree::ptree const& pt, bool complete = true);
-//void load(pe_config& pe_cfg, boost::property_tree::ptree const& pt, bool complete);
-
-}
-
-//typedef config_common::config<path_extend::pe_config> pe_cfg;
-
-#endif /* CONFIG_STRUCT_HPP_ */
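
The two-way name <-> enum lookups above (FillOBSInfo, FillSMInfo) are built on boost::bimap. The same idiom in a self-contained toy, with a hypothetical enum in place of the real scaffolding modes:

    #include <boost/bimap.hpp>
    #include <cassert>
    #include <string>

    enum toy_mode { tm_old, tm_2015, tm_combined };   // hypothetical modes

    typedef boost::bimap<std::string, toy_mode> mode_map;

    static mode_map fill_mode_info() {
        mode_map::value_type pairs[] = {
            mode_map::value_type("old",      tm_old),
            mode_map::value_type("2015",     tm_2015),
            mode_map::value_type("combined", tm_combined)
        };
        return mode_map(pairs, pairs + 3);
    }

    static const mode_map& mode_info() {
        static const mode_map info = fill_mode_info();
        return info;
    }

    int main() {
        // string -> enum through the left view, enum -> string through the right view
        auto by_name = mode_info().left.find("2015");
        assert(by_name != mode_info().left.end() && by_name->second == tm_2015);

        auto by_id = mode_info().right.find(tm_combined);
        assert(by_id != mode_info().right.end() && by_id->second == "combined");
        return 0;
    }

Keeping both directions in one container is what lets the header offer both scaffolding_mode_id(name) and scaffolding_mode_name(sm) from a single table.
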
diff --git a/src/debruijn/path_extend/pe_io.hpp b/src/debruijn/path_extend/pe_io.hpp
deleted file mode 100644
index 01f1a91..0000000
--- a/src/debruijn/path_extend/pe_io.hpp
+++ /dev/null
@@ -1,279 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-/*
- * pe_io.hpp
- *
- * Created on: Mar 15, 2012
- * Author: andrey
- */
-
-#ifndef PE_IO_HPP_
-#define PE_IO_HPP_
-
-
-#include "bidirectional_path.hpp"
-#include "contig_output.hpp"
-#include "io/osequencestream.hpp"
-#include "genome_consistance_checker.hpp"
-namespace path_extend {
-
-using namespace debruijn_graph;
-
-class ContigWriter {
-protected:
- DECL_LOGGER("PathExtendIO")
-
-protected:
- const Graph& g_;
- ContigConstructor<Graph> &constructor_;
- size_t k_;
- map<EdgeId, ExtendedContigIdT> ids_;
-
- //TODO: add constructor
- string ToString(const BidirectionalPath& path) const {
- stringstream ss;
- if (path.IsInterstrandBulge() && path.Size() == 1) {
- ss << constructor_.construct(path.Back()).first.substr(k_, g_.length(path.Back()) - k_);
- return ss.str();
- }
-
- if (!path.Empty()) {
- ss << constructor_.construct(path[0]).first.substr(0, k_);
- }
-
- for (size_t i = 0; i < path.Size(); ++i) {
- int gap = i == 0 ? 0 : path.GapAt(i);
- if (gap > (int) k_) {
- for (size_t j = 0; j < gap - k_; ++j) {
- ss << "N";
- }
- ss << constructor_.construct(path[i]).first;
- } else {
- int overlapLen = (int) k_ - gap;
- if (overlapLen >= (int) g_.length(path[i]) + (int) k_) {
- if(overlapLen > (int) g_.length(path[i]) + (int) k_) {
- WARN("Such scaffolding logic leads to local misassemblies");
- }
- continue;
- }
- auto temp_str = g_.EdgeNucls(path[i]).Subseq(overlapLen).str();
- if(i != path.Size() - 1) {
- for(size_t j = 0 ; j < path.TrashPreviousAt(i + 1); ++j) {
- temp_str.pop_back();
- if(temp_str.size() == 0) {
- break;
- }
- }
- }
- ss << temp_str;
- }
- }
- return ss.str();
- }
-
- string ToFASTGString(const BidirectionalPath& path) const {
- if (path.Empty())
- return "";
-
- string res = ids_.at(path.Front()).short_id_;
- for (size_t i = 1; i < path.Size(); ++i) {
- if (g_.EdgeEnd(path[i - 1]) != g_.EdgeStart(path[i])) {
- res += ";\n" + ids_.at(path[i]).short_id_;
- }
- else {
- res += "," + ids_.at(path[i]).short_id_;
- }
- }
- return res;
- }
-
-
-public:
- ContigWriter(const Graph& g, ContigConstructor<Graph> &constructor): g_(g), constructor_(constructor), k_(g.k()), ids_() {
- MakeContigIdMap(g_, ids_);
- }
-
- void WriteEdges(const string &filename) const {
- INFO("Outputting edges to " << filename);
- io::osequencestream_with_id oss(filename);
-
- set<EdgeId> included;
- for (auto iter = g_.ConstEdgeBegin(); !iter.IsEnd(); ++iter) {
- if (included.count(*iter) == 0) {
- oss.setCoverage(g_.coverage(*iter));
- oss.setID((int) g_.int_id(*iter));
- oss << g_.EdgeNucls(*iter);
-
- included.insert(*iter);
- included.insert(g_.conjugate(*iter));
- }
- }
- DEBUG("Contigs written");
- }
-
-
- void WritePaths(const PathContainer &paths, const string &filename) const {
- INFO("Outputting path data to " << filename);
- std::ofstream oss;
- oss.open(filename.c_str());
- int i = 1;
- oss << paths.size() << endl;
- for (auto iter = paths.begin(); iter != paths.end(); ++iter) {
- //oss << i << endl;
- i++;
- BidirectionalPath* path = iter.get();
- if (path->GetId() % 2 != 0) {
- path = path->GetConjPath();
- }
- oss << "PATH " << path->GetId() << " " << path->Size() << " " << path->Length() + k_ << endl;
- for (size_t j = 0; j < path->Size(); ++j) {
- oss << g_.int_id(path->At(j)) << " " << g_.length(path->At(j)) << " " << path->GapAt(j) << " " << path->TrashPreviousAt(j) << " " << path->TrashCurrentAt(j) << endl;
- }
- //oss << endl;
- }
- oss.close();
- DEBUG("Edges written");
- }
-
- void LoadPaths(PathContainer &paths, GraphCoverageMap &cover_map, const string &filename) const {
- paths.clear();
- map<size_t, EdgeId> int_ids;
- for (auto iter = g_.ConstEdgeBegin(); !iter.IsEnd(); ++iter) {
- int_ids.insert(make_pair(g_.int_id(*iter), *iter));
- }
-
- std::ifstream iss;
- iss.open(filename);
- size_t psize;
- iss >> psize;
- for(size_t i = 0; i < psize && !iss.eof(); ++i) {
- string s;
- size_t id;
- size_t size;
- size_t len;
- iss >> s >> id >> size >> len;
- VERIFY(s == "PATH");
-
- BidirectionalPath * path = new BidirectionalPath(g_);
- BidirectionalPath * conjugatePath = new BidirectionalPath(g_);
- paths.AddPair(path, conjugatePath);
- path->Subscribe(&cover_map);
- conjugatePath->Subscribe(&cover_map);
- for (size_t j = 0; !iss.eof() && j < size; ++j) {
- size_t eid;
- size_t elen;
- int gap;
- uint32_t trash_prev;
- uint32_t trash_current;
-
- iss >> eid >> elen >> gap >> trash_prev >> trash_current;
- Gap gap_struct(gap, trash_prev, trash_current);
- EdgeId edge = int_ids[eid];
- conjugatePath->PushBack(edge, gap_struct);
- VERIFY(g_.length(edge) == elen);
- }
- VERIFY(path->Length() + k_ == len);
- }
- VERIFY(psize == paths.size());
- iss.close();
- }
-
- void WritePathsToFASTA(const PathContainer &paths,
- const string &filename_base,
- bool write_fastg = true) const {
-
- INFO("Writing contigs to " << filename_base);
- io::osequencestream_with_id oss(filename_base + ".fasta");
-
- std::ofstream os_fastg;
- if (write_fastg)
- os_fastg.open((filename_base + ".paths").c_str());
-
- int i = 0;
- for (auto iter = paths.begin(); iter != paths.end(); ++iter) {
- if (iter.get()->Length() <= 0)
- continue;
- DEBUG("NODE " << ++i);
- BidirectionalPath* path = iter.get();
- path->Print();
- oss.setID((int) path->GetId());
- oss.setCoverage(path->Coverage());
- string path_string = ToString(*path);
-
- if (write_fastg) {
- os_fastg << oss.GetId(path_string) << endl;
- os_fastg << ToFASTGString(*iter.get()) << endl;
- os_fastg << oss.GetId(path_string) << "'" << endl;
- os_fastg << ToFASTGString(*iter.getConjugate()) << endl;
- }
-
- oss << path_string;
- }
- if (write_fastg)
- os_fastg.close();
- DEBUG("Contigs written");
- }
-
-
- //TODO: DimaA insert somewhere
- /*
- auto map_res = genome_checker.CountMisassemblies(*path);
- if (map_res.misassemblies > 0) {
- INFO ("there are "<< map_res.misassemblies<< " misassemblies in path: ");
- path->PrintInfo();
- total_mis += map_res.misassemblies;
- }
- if (map_res.wrong_gap_size > 0) {
- INFO ("there are "<<map_res.wrong_gap_size <<" wrong gaps in path: ");
- path->PrintInfo();
- gap_mis += map_res.wrong_gap_size;
- }
- */
-
- void WriteFASTGPaths(const PathContainer& paths, const string& filename) const {
- INFO("Writing FASTG paths to " << filename);
- std::ofstream oss(filename.c_str());
-
- for (auto iter = paths.begin(); iter != paths.end(); ++iter) {
- if (iter.get()->Length() <= 0)
- continue;
- oss << ToFASTGString(*iter.get()) << endl;
- oss << ToFASTGString(*iter.getConjugate()) << endl;
- }
- oss.close();
- }
-
- void OutputPaths(const PathContainer& paths, const string& filename_base) const {
- WritePathsToFASTA(paths, filename_base);
- }
-
-};
-
-
-
-class PathInfoWriter {
-protected:
- DECL_LOGGER("PathExtendIO")
-
-
-public:
-
- void WritePaths(const PathContainer &paths, const string &filename){
- std::ofstream oss(filename.c_str());
-
- for (auto iter = paths.begin(); iter != paths.end(); ++iter) {
- iter.get()->Print(oss);
- }
-
- oss.close();
- }
-};
-
-}
-
-#endif /* PE_IO_HPP_ */
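
ContigWriter::ToString() above is the piece that turns a BidirectionalPath into a contig sequence: consecutive edges overlap by k minus the gap, and gaps larger than k are padded with 'N'. A simplified, std-only sketch of that stitching idea (toy types, not the real graph API):

    #include <cstddef>
    #include <iostream>
    #include <string>
    #include <vector>

    struct ToyEdge {
        std::string seq;        // nucleotide sequence of the edge
        int gap_before;         // estimated gap to the previous edge (0 for the first)
    };

    std::string stitch(const std::vector<ToyEdge>& path, int k) {
        if (path.empty()) return "";
        std::string result = path[0].seq;                    // first edge taken whole
        for (std::size_t i = 1; i < path.size(); ++i) {
            int gap = path[i].gap_before;
            if (gap > k) {
                result.append(gap - k, 'N');                 // unresolved gap -> run of Ns
                result += path[i].seq;
            } else {
                int overlap = k - gap;                       // edges share overlap bases
                if (overlap < (int) path[i].seq.size())
                    result += path[i].seq.substr(overlap);
                // otherwise the whole edge is swallowed by the overlap and skipped
            }
        }
        return result;
    }

    int main() {
        std::vector<ToyEdge> path = { {"ACGTACGT", 0}, {"ACGTTTTT", 0}, {"GGGG", 10} };
        // ACGTACGT + TTTT (k-overlap trimmed) + NNNNNN (gap 10 minus k) + GGGG
        std::cout << stitch(path, 4) << "\n";
        return 0;
    }

The removed code additionally trims TrashPreviousAt() bases when gaps were corrected during path construction; the sketch omits that step.
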
diff --git a/src/debruijn/path_extend/pe_resolver.hpp b/src/debruijn/path_extend/pe_resolver.hpp
deleted file mode 100644
index 5c0f976..0000000
--- a/src/debruijn/path_extend/pe_resolver.hpp
+++ /dev/null
@@ -1,518 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-/*
- * pe_resolver.hpp
- *
- * Created on: Mar 12, 2012
- * Author: andrey
- */
-
-#ifndef PE_RESOLVER_HPP_
-#define PE_RESOLVER_HPP_
-
-#include "path_extender.hpp"
-#include "pe_io.hpp"
-
-namespace path_extend {
-
-
-class SimpleOverlapRemover {
-
-public:
- SimpleOverlapRemover(const Graph& g, GraphCoverageMap& cm)
- : g_(g), coverage_map_(cm) {
- }
-
- void RemoveOverlaps(PathContainer& paths) const {
- for (size_t i = 0; i < paths.size(); i++) {
- FindAndRemovePathOverlap(paths, paths.Get(i));
- FindAndRemovePathOverlap(paths, paths.GetConjugate(i));
- }
- }
-
- void CutPseudoSelfConjugatePaths(PathContainer& paths) {
- vector<pair<BidirectionalPath *, BidirectionalPath *>> tmp_paths(paths.begin(), paths.end());
- for(auto it = tmp_paths.begin(); it != tmp_paths.end(); ++it) {
- BidirectionalPath * path1 = it->first;
- BidirectionalPath * path2 = it->second;
- bool ups = false;
- if(path1 != path2) {
- size_t last = 0;
- while(last < path1->Size() && path1->operator [](last) == path2->operator [](last)) {
- last++;
- }
- if(last > 0) {
- AddOverlap(paths, path1, 0, last - 1);
- path1->PopBack(last);
- path2->PopBack(last);
- }
- }
- if(ups) path1->Print();
- }
- }
-
- void RemoveSimilarPaths(PathContainer& paths, size_t min_edge_len, size_t max_path_diff, bool del_only_equal, bool del_subpaths, bool del_begins, bool del_all, bool add_overlap_begins) const {
- DEBUG("== Removing similar paths ==");
- DEBUG("Min edge len " << min_edge_len << ", max path diff " << max_path_diff)
- DEBUG("Only equal " << del_only_equal << ", subpaths " << del_subpaths << ", starts " << del_begins << ", all " << del_all << ", add starts " << add_overlap_begins);
- std::vector<EdgeId> edges = GetSortedEdges();
- for (size_t edgeIndex = 0; edgeIndex < edges.size(); ++edgeIndex) {
- EdgeId edge = edges.at(edgeIndex);
- BidirectionalPathSet cov_paths = coverage_map_.GetCoveringPaths(edge);
- std::vector<BidirectionalPath*> cov_vect(cov_paths.begin(), cov_paths.end());
- std::sort(cov_vect.begin(), cov_vect.end(), PathIdCompare);
- for (size_t vect_i = 0; vect_i < cov_vect.size(); ++vect_i) {
- BidirectionalPath* path1 = cov_vect.at(vect_i);
- if (cov_paths.find(path1) == cov_paths.end()) {
- continue;
- }
- for (size_t vect_i1 = vect_i + 1; vect_i1 < cov_vect.size(); ++vect_i1) {
- BidirectionalPath* path2 = cov_vect.at(vect_i1);
- if (path1 == path2 || path1 == path2->GetConjPath()) {
- continue;
- }
- if (cov_paths.find(path2) == cov_paths.end())
- continue;
- if ((*path1) == (*path2)) {
- if (path2->IsOverlap()) {
- path1->SetOverlap(true);
- }
- DEBUG("Removing path " << path2->GetId() << " because of path " << path1->GetId());
- path2->Print();
- path1->Print();
- path2->Clear();
- cov_paths = coverage_map_.GetCoveringPaths(edge);
- continue;
- }
- if (g_.length(edge) <= min_edge_len || path1->IsOverlap() || path2->IsOverlap() || del_only_equal) {
- continue;
- }
- CompareAndCut(paths, edge, path1, path2, max_path_diff,
- del_subpaths, del_begins, del_all, add_overlap_begins);
- cov_paths = coverage_map_.GetCoveringPaths(edge);
- }
- }
- }
- DEBUG("== End removing similar paths ==");
- }
-
-private:
-
- void SubscribeCoverageMap(BidirectionalPath* path) const {
- path->Subscribe(&coverage_map_);
- for (size_t i = 0; i < path->Size(); ++i) {
- coverage_map_.BackEdgeAdded(path->At(i), path, path->GapAt(i));
- }
- }
-
- void CompareAndCut(PathContainer& paths, EdgeId edge, BidirectionalPath* path1, BidirectionalPath* path2,
- size_t max_path_diff,
- bool del_subpaths, bool del_begins,
- bool del_all, bool add_overlap_begins) const {
- vector<size_t> positions1 = path1->FindAll(edge);
- vector<size_t> positions2 = path2->FindAll(edge);
- size_t i1 = 0;
- size_t i2 = 0;
- bool renewed = false;
- while (i1 < positions1.size()) {
- while (i2 < positions2.size()) {
- DEBUG("CompareAndCutFromPos paths " << g_.int_id(edge));
- CompareAndCutFromPos(paths, path1, (int) positions1[i1], path2,
- (int) positions2[i2], max_path_diff,
- del_subpaths, del_begins, del_all, add_overlap_begins);
-
- if (positions1[i1] >= path1->Size() || path1->At(positions1[i1]) != edge || positions2[i2] >= path2->Size() || path2->At(positions2[i2]) != edge) {
- vector<size_t> new_positions1 = path1->FindAll(edge);
- vector<size_t> new_positions2 = path2->FindAll(edge);
-
- if (new_positions1.size() == positions1.size() && new_positions2.size() == positions2.size()) {
- return;
- }
- else {
- positions1 = new_positions1;
- positions2 = new_positions2;
- i1 = 0;
- i2 = 0;
- renewed = true;
- break;
- }
- ++i2;
- }
- ++i2;
- }
-
- if (renewed) {
- renewed = false;
- continue;
- }
- ++i1;
- }
- }
-
- void CompareAndCutFromPos(PathContainer& paths, BidirectionalPath* path1, int pos1,
- BidirectionalPath* path2, int pos2,
- size_t max_path_diff,
- bool delete_subpaths, bool delete_begins,
- bool delete_all, bool add_overlap_begins) const {
- int last2 = pos2;
- int last1 = pos1;
- if (last1 >= (int) path1->Size() || last2 >= (int) path2->Size()) {
- return;
- }
- vector<int> other_path_end;
- pair<int, int> posRes = ComparePaths(last1, last2, *path1, *path2, max_path_diff);
- last1 = posRes.first;
- last2 = posRes.second;
- BidirectionalPath* conj1 = path1->GetConjPath();
- BidirectionalPath* conj2 = path2->GetConjPath();
- size_t first1 = conj1->Size() - pos1 - 1;
- size_t first2 = conj2->Size() - pos2 - 1;
- posRes = ComparePaths(first1, first2, *conj1, *conj2, max_path_diff);
- first2 = conj2->Size() - posRes.second - 1;
- first1 = conj1->Size() - posRes.first - 1;
- if ((int)path2->LengthAt(last2) - (int)g_.length(path2->At(last2)) < (int) max_path_diff) {
- last2 = (int)path2->Size() - 1;
- }
- if ((int)path2->Length() - (int)path2->LengthAt(first2) < (int) max_path_diff) {
- first2 = 0;
- }
- if ((int)path1->LengthAt(last1) - (int)g_.length(path1->At(last1)) < (int) max_path_diff) {
- last1 = (int)path1->Size() - 1;
- }
- if ((int)path1->Length() - (int)path1->LengthAt(first1) < (int) max_path_diff) {
- first1 = 0;
- }
-
- CutOverlaps(paths, path1, first1, last1, path1->Size(), path2,
- first2, last2, path2->Size(), delete_subpaths,
- delete_begins, delete_all, add_overlap_begins);
- }
-
- void AddOverlap(PathContainer& paths, BidirectionalPath* path1, size_t first1, size_t last1) const {
- BidirectionalPath* overlap = new BidirectionalPath(path1->SubPath(first1, last1 + 1));
- BidirectionalPath* conj_overlap = new BidirectionalPath(overlap->Conjugate());
- SubscribeCoverageMap(overlap);
- SubscribeCoverageMap(conj_overlap);
- paths.AddPair(overlap, conj_overlap);
- }
-
- bool CutOverlaps(PathContainer& paths, BidirectionalPath* path1, size_t first1, size_t last1, size_t size1, BidirectionalPath* path2, size_t first2,
- size_t last2, size_t size2, bool del_subpaths, bool del_begins, bool del_all, bool add_overlap_begins) const {
- if (first1 == 0 && last1 == size1 - 1 && del_subpaths) {
- DEBUG("Removing path " << path1->GetId() << " because of path " << path2->GetId());
- path1->Print();
- path2->Print();
- path1->Clear();
- } else if (first2 == 0 && last2 == size2 - 1 && del_subpaths) {
- DEBUG("Removing path " << path2->GetId() << " because of path " << path1->GetId());
- path2->Print();
- path1->Print();
- path2->Clear();
- } else if (first2 == 0 && first1 == 0 && del_begins) {
- DEBUG("Path " << path1->GetId() << ", len " << path1->Length() << " and path " << path2->GetId() << ", len " << path2->Length() << " have similar starts");
- DEBUG("Path 1: " << last1 << " edges of length " << path1->Length() - path1->LengthAt(min(last1 + 1, path1->Size() - 1)));
- DEBUG("Path 2: " << last2 << " edges of length " << path2->Length() - path2->LengthAt(min(last2 + 1, path2->Size() - 1)));
- DEBUG("Path 1 has overlap start " << path1->HasOverlapedBegin() << ", path 2 has overlap start " << path2->HasOverlapedBegin());
-
- if (add_overlap_begins) {
- AddOverlap(paths, path1, first1, last1);
- DEBUG("Detaching overlap " << path2->GetId() << " and " << path1->GetId());
- path2->Print();
- path1->Print();
- path1->GetConjPath()->PopBack(last1 + 1);
- path2->GetConjPath()->PopBack(last2 + 1);
- } else if (path1->Length() < path2->Length()) {
- DEBUG("Detaching overlap from " << path1->GetId() << " because of "<< path2->GetId());
- path1->Print();
- path2->Print();
- path1->GetConjPath()->PopBack(last1 + 1);
- } else {
- DEBUG("Detaching overlap from " << path2->GetId() << " because of "<< path1->GetId());
- path2->Print();
- path1->Print();
- path2->GetConjPath()->PopBack(last2 + 1);
- }
- } else if ((last1 == size1 - 1 && last2 == size2 - 1) && del_begins) {
- DEBUG("Path " << path1->GetId() << ", len " << path1->Length() << " and path " << path2->GetId() << ", len " << path2->Length() << " have similar ends");
- DEBUG("Path 1: " << path1->Size() - first1 << " edges of length " << path1->LengthAt(first1));
- DEBUG("Path 2: " << path2->Size() - first2 << " edges of length " << path2->LengthAt(first2));
- DEBUG("Path 1 has overlap end " << path1->HasOverlapedEnd() << ", path 2 has overlap end " << path2->HasOverlapedEnd());
-
- if (add_overlap_begins){
- AddOverlap(paths, path1, first1, last1);
- DEBUG("Detaching overlap " << path2->GetId() << " and " << path1->GetId());
- path2->Print();
- path1->Print();
- path1->PopBack(last1 + 1 - first1);
- path2->PopBack(last2 + 1 - first2);
- }
- if (path1->Length() < path2->Length()) {
- DEBUG("Detaching overlap from " << path1->GetId() << " because of "<< path2->GetId());
- path1->Print();
- path2->Print();
- path1->PopBack(last1 + 1 - first1);
- } else {
- DEBUG("Detaching overlap from " << path2->GetId() << " because of "<< path1->GetId());
- path2->Print();
- path1->Print();
- path2->PopBack(last2 + 1 - first2);
- }
- } else if (first2 == 0 && del_all) {
- DEBUG("Detaching overlap from " << path2->GetConjPath()->GetId() << " because of "<< path1->GetId());
- DEBUG("Does it have overlap in the beginning: " << path2->HasOverlapedBegin());
- path2->Print();
- DEBUG(" >>>> ")
- path1->Print();
- DEBUG(" ==== ");
- path2->GetConjPath()->PopBack(last2 + 1);
- } else if (last2 == size2 - 1 && del_all) {
- DEBUG("Detaching overlap from " << path2->GetId() << " because of "<< path1->GetId());
- DEBUG("Does it have overlap in the end: " << path2->HasOverlapedEnd());
- path2->Print();
- DEBUG(" >>>> ")
- path1->Print();
- DEBUG(" ==== ");
- path2->PopBack(last1 + 1 - first1);
- } else if (first1 == 0 && del_all) {
- DEBUG("Detaching overlap from " << path1->GetConjPath()->GetId() << " because of "<< path2->GetId());
- DEBUG("Does it have overlap in the end: " << path1->HasOverlapedBegin());
- path1->Print();
- DEBUG(" >>>> ")
- path2->Print();
- DEBUG(" ==== ");
- path1->GetConjPath()->PopBack(last1 + 1);
- } else if (last1 == size1 - 1 && del_all) {
- DEBUG("Detaching overlap from " << path1->GetId() << " because of "<< path2->GetId());
- DEBUG("Does it have overlap in the end: " << path1->HasOverlapedBegin());
- path1->Print();
- DEBUG(" >>>> ")
- path2->Print();
- DEBUG(" ==== ");
- path1->PopBack(last1 + 1 - first1);
- } else {
- return false;
- }
- return true;
- }
-
- std::vector<EdgeId> GetSortedEdges() const {
- std::set<EdgeId> edges_set;
- for (auto iter = g_.ConstEdgeBegin(); !iter.IsEnd(); ++iter) {
- edges_set.insert(*iter);
- edges_set.insert(g_.conjugate(*iter));
- }
- std::vector<EdgeId> edges(edges_set.begin(), edges_set.end());
- std::sort(edges.begin(), edges.end(), EdgeLengthAndIdComparator(g_));
- return edges;
- }
-
- bool HasAlreadyOverlapedEnd(BidirectionalPath * path) const {
- return !path->IsOverlap() and path->HasOverlapedEnd();
- }
-
- bool HasAlreadyOverlapedBegin(BidirectionalPath * path) const {
- return !path->IsOverlap() and path->HasOverlapedBegin();
- }
-
- bool IsSamePath(BidirectionalPath * path1,
- BidirectionalPath * path2) const {
- return *path2 == *path1 or *path2 == *path1->GetConjPath();
- }
-
- void RemoveOverlap(PathContainer& paths, BidirectionalPath* path1,
- BidirectionalPath* path2, size_t overlap_size) const {
- BidirectionalPath* conj2 = path2->GetConjPath();
- if (path1->IsOverlap() && overlap_size == path1->Size()) {
- DEBUG("Detaching overlap from " << path2->GetConjPath()->GetId() << " because of "<< path1->GetId());
- path2->Print();
- path1->Print();
- conj2->PopBack(overlap_size);
- path2->SetOverlapedBeginTo(path1);
- } else if (path2->IsOverlap() && path2->Size() == overlap_size) {
- DEBUG("Detaching overlap from " << path1->GetId() << " because of "<< path2->GetId());
- path1->Print();
- path2->Print();
- path1->PopBack(overlap_size);
- path1->SetOverlapedEndTo(path2);
- } else if (overlap_size < path2->Size()
- && overlap_size < path1->Size()) {
- BidirectionalPath* overlap = new BidirectionalPath(g_, path1->Back());
- BidirectionalPath* conj_overlap = new BidirectionalPath(
- g_, g_.conjugate(path1->Back()));
- SubscribeCoverageMap(overlap);
- SubscribeCoverageMap(conj_overlap);
- paths.AddPair(overlap, conj_overlap);
- DEBUG("Detaching overlap " << path1->GetId() << " and " << conj2->GetId());
- path1->Print();
- conj2->Print();
- path1->PopBack();
- conj2->PopBack();
-
- for (size_t i = 1; i < overlap_size; ++i) {
- conj_overlap->PushBack(g_.conjugate(path1->Back()));
- path1->PopBack();
- conj2->PopBack();
- }
- overlap->SetOverlap(true);
- path1->SetOverlapedEndTo(overlap);
- path2->SetOverlapedBeginTo(overlap);
- }
- }
-
- void FindAndRemovePathOverlap(PathContainer& all_paths,
- BidirectionalPath* path1) const {
- int last = (int) path1->Size() - 1;
- if (last <= 0 or coverage_map_.GetCoverage(path1->At(last)) <= 1) {
- return;
- }
- BidirectionalPathSet paths =
- coverage_map_.GetCoveringPaths(path1->At(last));
- BidirectionalPath* overlap_path = NULL;
- size_t overlap_size = 0;
- for (auto path_iter = paths.begin(); path_iter != paths.end();
- ++path_iter) {
- if (IsSamePath(*path_iter, path1)) {
- continue;
- }
- size_t over_size = path1->OverlapEndSize(*path_iter);
- if (over_size > overlap_size) {
- overlap_size = over_size;
- overlap_path = *path_iter;
- } else if (over_size == overlap_size &&
- (overlap_path == NULL || (*path_iter)->GetId() < overlap_path->GetId())) {
- overlap_path = *path_iter;
- }
- }
- if (overlap_path == NULL) {
- return;
- }
- if (overlap_size > 0) {
- RemoveOverlap(all_paths, path1, overlap_path, overlap_size);
- }
- }
-
- class EdgeLengthAndIdComparator {
- public:
- EdgeLengthAndIdComparator(const Graph& g)
- : g_(g) {
- }
- bool operator()(const EdgeId& e1, const EdgeId& e2) const {
- if (g_.length(e1) > g_.length(e2)) {
- return true;
- }
- if (g_.length(e2) > g_.length(e1)) {
- return false;
- }
- return e1.int_id() < e2.int_id();
- }
- private:
- const Graph& g_;
- };
-
- const Graph& g_;
- GraphCoverageMap& coverage_map_;
-protected:
- DECL_LOGGER("PEResolver")
-};
-
-class PathExtendResolver {
-
-protected:
- const Graph& g_;
- size_t k_;
-
-public:
- PathExtendResolver(const Graph& g): g_(g), k_(g.k()) {
- }
-
- PathContainer makeSimpleSeeds() {
- std::set<EdgeId> included;
- PathContainer edges;
- for (auto iter = g_.ConstEdgeBegin(); !iter.IsEnd(); ++iter) {
- if (g_.int_id(*iter) <= 0 or InTwoEdgeCycle(*iter, g_))
- continue;
- if (included.count(*iter) == 0) {
- BidirectionalPath * first = new BidirectionalPath(g_, *iter);
- BidirectionalPath * second = new BidirectionalPath(g_, g_.conjugate(*iter));
- edges.AddPair(first,second);
- included.insert(*iter);
- included.insert(g_.conjugate(*iter));
- }
- }
- return edges;
- }
-
- PathContainer extendSeeds(PathContainer& seeds, ContigsMaker& pathExtender) {
- PathContainer paths;
- pathExtender.GrowAll(seeds, &paths);
- return paths;
- }
-
- void removeOverlaps(PathContainer& paths, GraphCoverageMap& coverage_map,
- size_t min_edge_len, size_t max_path_diff, bool cut_overlaps, bool add_overlaps_begin) {
- if (!cut_overlaps) {
- return;
- }
- SimpleOverlapRemover remover(g_, coverage_map);
- if (cfg::get().ds.moleculo)
- remover.CutPseudoSelfConjugatePaths(paths);
- //writer.WritePathsToFASTA(paths, output_dir + "/before.fasta");
- //DEBUG("Removing subpaths");
- //delete not only eq,
- remover.RemoveSimilarPaths(paths, min_edge_len, max_path_diff, false, true, false, false, add_overlaps_begin);
- //writer.WritePathsToFASTA(paths, output_dir + "/remove_similar.fasta");
- //DEBUG("Remove overlaps")
- remover.RemoveOverlaps(paths);
- //writer.WritePathsToFASTA(paths, output_dir + "/after_remove_overlaps.fasta");
- remover.RemoveSimilarPaths(paths, min_edge_len, max_path_diff, true, false, false, false, add_overlaps_begin);
- //writer.WritePathsToFASTA(paths, output_dir + "/remove_equal.fasta");
- //DEBUG("remove similar path. Max difference " << max_overlap);
- remover.RemoveSimilarPaths(paths, min_edge_len, max_path_diff, false, true, true, true, add_overlaps_begin);
- DEBUG("end removing");
- }
-
- void RemoveMatePairEnds(PathContainer& paths, size_t min_edge_len) const {
- DEBUG("remove mp ends");
- for (size_t i = 0; i < paths.size(); ++i) {
- RemoveMatePairEnd(*paths.Get(i), min_edge_len);
- RemoveMatePairEnd(*paths.GetConjugate(i), min_edge_len);
- }
- }
-
- void addUncoveredEdges(PathContainer& paths, GraphCoverageMap& coverageMap) {
- std::set<EdgeId> included;
- for (auto iter = g_.ConstEdgeBegin(); !iter.IsEnd(); ++iter) {
- if (included.count(*iter) == 0 && !coverageMap.IsCovered(*iter)) {
- BidirectionalPath* path = new BidirectionalPath(g_, *iter);
- BidirectionalPath* conj = new BidirectionalPath(g_, g_.conjugate(*iter));
- path->Subscribe(&coverageMap);
- conj->Subscribe(&coverageMap);
- coverageMap.BackEdgeAdded(path->At(0), path, path->GapAt(0));
- coverageMap.BackEdgeAdded(conj->At(0), conj, conj->GapAt(0));
- paths.AddPair(path, conj);
- included.insert(*iter);
- included.insert(g_.conjugate(*iter));
- }
- }
- }
-
-private:
- void RemoveMatePairEnd(BidirectionalPath& path, size_t min_edge_len) const {
- int pos = int(path.Size()) - 1;
- while (pos > 0 and g_.length(path.At(pos)) < min_edge_len) {
- path.PopBack();
- pos--;
- }
- }
-protected:
- DECL_LOGGER("PEResolver")
-};
-
-} /* PE_RESOLVER_HPP_ */
-
-#endif
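
The overlap-removal pass above deliberately walks edges in a fixed order -- longest first, ties broken by id -- which is exactly the ordering EdgeLengthAndIdComparator encodes, so results do not depend on container iteration order. The same ordering on a toy edge type:

    #include <algorithm>
    #include <cstddef>
    #include <iostream>
    #include <vector>

    struct ToyEdge { std::size_t id; std::size_t length; };

    int main() {
        std::vector<ToyEdge> edges = { {7, 120}, {3, 500}, {9, 500}, {1, 80} };
        std::sort(edges.begin(), edges.end(), [](const ToyEdge& a, const ToyEdge& b) {
            if (a.length != b.length) return a.length > b.length;   // longer edges first
            return a.id < b.id;                                     // deterministic tie-break
        });
        for (const ToyEdge& e : edges)
            std::cout << e.id << "(" << e.length << ") ";
        std::cout << "\n";   // 3(500) 9(500) 7(120) 1(80)
        return 0;
    }
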
diff --git a/src/debruijn/path_extend/pe_utils.hpp b/src/debruijn/path_extend/pe_utils.hpp
deleted file mode 100644
index eaf2606..0000000
--- a/src/debruijn/path_extend/pe_utils.hpp
+++ /dev/null
@@ -1,461 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-/*
- * pe_utils.hpp
- *
- * Created on: Nov 27, 2012
- * Author: andrey
- */
-
-#ifndef PE_UTILS_HPP_
-#define PE_UTILS_HPP_
-
-#include "bidirectional_path.hpp"
-
-using namespace debruijn_graph;
-
-namespace path_extend {
-
-//Checks whether we are in a cycle of length 2, used only for seed selection.
-inline bool InTwoEdgeCycle(EdgeId e, const Graph &g) {
- auto v = g.EdgeEnd(e);
- if (g.OutgoingEdgeCount(v) >= 1) {
- auto edges = g.OutgoingEdges(v);
- for (auto it = edges.begin(); it != edges.end(); ++it) {
- if (g.EdgeStart(e) == g.EdgeEnd(*it)) {
- return true;
- }
- }
- }
- return false;
-}
-
-inline bool InBuble(EdgeId e, const Graph& g) {
- auto edges = g.OutgoingEdges(g.EdgeStart(e));
- auto endVertex = g.EdgeEnd(e);
- for (auto it = edges.begin(); it != edges.end(); ++it) {
- if ((g.EdgeEnd(*it) == endVertex) and (*it != e)) {
- return true;
- }
- }
- return false;
-}
-
-
-// Handles all paths in PathContainer.
-// For each edge, outputs all paths that _traverse_ that edge; if a path contains the edge multiple times, each occurrence is counted. The position of the edge within the path is not reported.
-//TODO: The internal logic is convoluted and should be rewritten.
-//TODO: Memory leaks, inefficient data structure.
-class GraphCoverageMap: public PathListener {
-
-public:
- typedef BidirectionalPathMultiset MapDataT;
-
-
-protected:
- const Graph& g_;
-
- std::map <EdgeId, MapDataT * > edgeCoverage_;
-
- MapDataT * empty_;
-
- virtual void EdgeAdded(EdgeId e, BidirectionalPath * path, Gap /*gap*/) {
- auto iter = edgeCoverage_.find(e);
- if (iter == edgeCoverage_.end()) {
- edgeCoverage_.insert(std::make_pair(e, new MapDataT()));
- }
- edgeCoverage_[e]->insert(path);
- }
-
- virtual void EdgeRemoved(EdgeId e, BidirectionalPath * path) {
- auto iter = edgeCoverage_.find(e);
- if (iter != edgeCoverage_.end()) {
- if (iter->second->count(path) == 0) {
- DEBUG("Error erasing path from coverage map");
- } else {
- auto entry = iter->second->find(path);
- iter->second->erase(entry);
- }
- }
- }
-
-public:
- GraphCoverageMap(const Graph& g) : g_(g), edgeCoverage_() {
- empty_ = new MapDataT();
- }
-
- GraphCoverageMap(const Graph& g, const PathContainer& paths) : g_(g), edgeCoverage_() {
- empty_ = new MapDataT();
- for (size_t i = 0; i < paths.size(); ++i) {
- for (size_t j = 0; j < paths.Get(i)->Size(); ++j) {
- EdgeAdded(paths.Get(i)->At(j), paths.Get(i), paths.Get(i)->GapAt(j));
- }
- for (size_t j = 0; j < paths.GetConjugate(i)->Size(); ++j) {
- EdgeAdded(paths.GetConjugate(i)->At(j), paths.GetConjugate(i), paths.GetConjugate(i)->GapAt(j));
- }
- }
- }
-
- virtual ~GraphCoverageMap() {
- delete empty_;
- for (auto iter = edgeCoverage_.begin(); iter != edgeCoverage_.end(); ++iter) {
- delete iter->second;
- }
- }
-
- void Clear() {
- for (auto iter = edgeCoverage_.begin(); iter != edgeCoverage_.end(); ++iter) {
- MapDataT* cover_paths = iter->second;
- for (auto ipath = cover_paths->begin(); ipath != cover_paths->end(); ++ipath) {
- BidirectionalPath* p = *ipath;
- p->Unsubscribe(this);
- }
- delete cover_paths;
- }
- edgeCoverage_.clear();
- }
-
- void Subscribe(BidirectionalPath * path) {
- path->Subscribe(this);
- for (size_t i = 0; i < path->Size(); ++i) {
- BackEdgeAdded(path->At(i), path, path->GapAt(i));
- }
- }
-
- virtual void FrontEdgeAdded(EdgeId e, BidirectionalPath * path, Gap gap) {
- EdgeAdded(e, path, gap);
- }
-
- virtual void BackEdgeAdded(EdgeId e, BidirectionalPath * path, Gap gap) {
- EdgeAdded(e, path, gap);
- }
-
- virtual void FrontEdgeRemoved(EdgeId e, BidirectionalPath * path) {
- EdgeRemoved(e, path);
- }
-
- virtual void BackEdgeRemoved(EdgeId e, BidirectionalPath * path) {
- EdgeRemoved(e, path);
- }
-
- MapDataT * GetEdgePaths(EdgeId e) const {
- auto iter = edgeCoverage_.find(e);
- if (iter != edgeCoverage_.end()) {
- return iter->second;
- }
- return empty_;
- }
-
- int GetCoverage(EdgeId e) const {
- return (int) GetEdgePaths(e)->size();
- }
-
- bool IsCovered(EdgeId e) const {
- return GetCoverage(e) > 0;
- }
-
- bool IsCovered(const BidirectionalPath& path) const {
- for (size_t i = 0; i < path.Size(); ++i) {
- if (!IsCovered(path[i])) {
- return false;
- }
- }
- return true;
- }
-
- int GetCoverage(const BidirectionalPath& path) const {
- if (path.Empty()) {
- return 0;
- }
-
- int cov = GetCoverage(path[0]);
- for (size_t i = 1; i < path.Size(); ++i) {
- int currentCov = GetCoverage(path[i]);
- if (cov > currentCov) {
- cov = currentCov;
- }
- }
-
- return cov;
- }
-
- BidirectionalPathSet GetCoveringPaths(EdgeId e) const {
- auto mapData = GetEdgePaths(e);
- return BidirectionalPathSet(mapData->begin(), mapData->end());
- }
-
- int GetUniqueCoverage(EdgeId e) const {
- return (int) GetCoveringPaths(e).size();
- }
-
- std::map <EdgeId, MapDataT * >::const_iterator begin() const {
- return edgeCoverage_.begin();
- }
-
- std::map <EdgeId, MapDataT * >::const_iterator end() const {
- return edgeCoverage_.end();
- }
-
- // DEBUG
-
- void PrintUncovered() const {
- DEBUG("Uncovered edges");
- int s = 0;
- for (auto iter = g_.ConstEdgeBegin(); !iter.IsEnd(); ++iter) {
- if (!IsCovered(*iter)) {
- DEBUG(g_.int_id(*iter) << " (" << g_.length(*iter) << ") ~ " << g_.int_id(g_.conjugate(*iter)) << " (" << g_.length(g_.conjugate(*iter)) << ")");
- s += 1;
- }
- }
- DEBUG("Uncovered edges " << s / 2);
- }
-
- void PrintMulticovered() const {
- DEBUG("Multicovered edges");
- for (auto iter = g_.ConstEdgeBegin(); !iter.IsEnd(); ++iter) {
- auto paths = GetCoveringPaths(*iter);
- if (paths.size() > 1 && g_.length(*iter) > 1000) {
- DEBUG(g_.int_id(*iter) << " (" << g_.length(*iter) << "). " << " Covered: " << paths.size());
- for (auto path = paths.begin(); path != paths.end(); ++path) {
- (*path)->Print();
- }
- DEBUG("=====");
- }
- }
- }
-
- size_t size() const {
- return edgeCoverage_.size();
- }
-
- const Graph& graph() const {
- return g_;
- }
-
-private:
- GraphCoverageMap(const GraphCoverageMap& t) : g_(t.g_), empty_(t.empty_) {}
-};
-
-inline bool GetLoopAndExit(const Graph& g, EdgeId e, pair<EdgeId, EdgeId>& result) {
- VertexId v = g.EdgeEnd(e);
- VertexId start = g.EdgeStart(e);
- if (g.OutgoingEdgeCount(v) != 2 || g.IncomingEdgeCount(v) != 1 || g.OutgoingEdgeCount(start) != 1 || g.IncomingEdgeCount(start) != 2) {
- return false;
- }
- EdgeId loop;
- EdgeId exit;
- bool loop_found = false;
- bool exit_found = false;
- auto edges = g.OutgoingEdges(v);
- for (auto edge = edges.begin(); edge != edges.end(); ++edge) {
- if (g.EdgeEnd(*edge) == g.EdgeStart(e) && *edge != e) {
- loop = *edge;
- loop_found = true;
- } else if (*edge != e) {
- exit = *edge;
- exit_found = true;
- }
- }
- result = make_pair(loop, exit);
- return exit_found && loop_found;
-}
-
-class LoopDetector {
-public:
- LoopDetector(BidirectionalPath* p, const GraphCoverageMap& cov_map);
- size_t LoopEdges(size_t skip_identical_edges, size_t min_cycle_appearences) const;
- size_t LoopLength(size_t skip_identical_edges, size_t min_cycle_appearences) const;
- bool PathIsLoop(size_t edges) const;
- size_t LastLoopCount(size_t skip_identical_edges, size_t min_cycle_appearences) const;
- size_t LastLoopCount(size_t edges) const;
- bool IsCycled(size_t loopLimit, size_t& skip_identical_edges) const;
- size_t EdgesToRemove(size_t skip_identical_edges, bool fullRemoval = false) const;
- void RemoveLoop(size_t skip_identical_edges, bool fullRemoval = true);
- bool EdgeInShortLoop(EdgeId e) const;
- bool PrevEdgeInShortLoop() const;
-private:
- BidirectionalPath* path_;
- const GraphCoverageMap& cov_map_;
- DECL_LOGGER("BidirectionalPath");
-};
-
-inline LoopDetector::LoopDetector(BidirectionalPath* p, const GraphCoverageMap& cov_map)
- : path_(p),
- cov_map_(cov_map) {
-}
-
-inline size_t LoopDetector::LoopEdges(size_t skip_identical_edges, size_t min_cycle_appearences) const {
- if (path_->Size() == 0) {
- return 0;
- }
- EdgeId e = path_->Back();
- size_t count = cov_map_.GetEdgePaths(e)->count(path_);
- if (count <= 1 || count < min_cycle_appearences * (skip_identical_edges + 1)) {
- return 0;
- }
- vector<size_t> edge_positions = path_->FindAll(e);
- VERIFY(edge_positions.size() == count);
- VERIFY(edge_positions.size() >= skip_identical_edges);
- size_t loopSize = edge_positions.back() - edge_positions[edge_positions.size() - 1 - (skip_identical_edges + 1)];
- return loopSize;
-}
-
-inline bool LoopDetector::PathIsLoop(size_t edges) const {
- if (edges == 0 || path_->Size() <= 1)
- return false;
-
- for (size_t i = 0; i < edges; ++i) {
- EdgeId e = path_->At(i);
- for (int j = (int) path_->Size() - ((int) edges - (int) i); j >= 0; j -= (int) edges) {
- if (path_->operator [](j) != e) {
- return false;
- }
- }
- }
- return true;
-}
-
-inline size_t LoopDetector::LastLoopCount(size_t skip_identical_edges, size_t min_cycle_appearences) const {
- size_t edges = LoopEdges(skip_identical_edges, min_cycle_appearences);
- return LastLoopCount(edges);
-}
-
-inline size_t LoopDetector::LastLoopCount(size_t edges) const {
- if (edges == 0) {
- return 0;
- }
-
- BidirectionalPath loop = path_->SubPath(path_->Size() - edges);
- size_t count = 0;
- int i = (int) path_->Size() - (int) edges;
- int delta = -(int) edges;
-
- while (i >= 0) {
- if (!path_->CompareFrom(i, loop)) {
- break;
- }
- ++count;
- i += delta;
- }
-
- return count;
-}
-
-inline bool LoopDetector::IsCycled(size_t loopLimit, size_t& skip_identical_edges) const {
- if (path_->Size() == 0 or cov_map_.GetEdgePaths(path_->Back())->count(path_) < loopLimit) {
- return false;
- }
- skip_identical_edges = 0;
- size_t loop_count = LastLoopCount(skip_identical_edges, loopLimit);
- while (loop_count > 0) {
- if (loop_count >= loopLimit) {
- return true;
- }
- loop_count = LastLoopCount(++skip_identical_edges, loopLimit);
- }
- return false;
-}
-
-inline size_t LoopDetector::EdgesToRemove(size_t skip_identical_edges, bool fullRemoval) const {
- size_t edges = LoopEdges(skip_identical_edges, 1);
- size_t count = LastLoopCount(edges);
- bool onlyCycle = PathIsLoop(edges);
- int result;
-
- if (onlyCycle || path_->Size() <= count * edges) {
- result = (int) path_->Size() - (int) edges;
- } else if (fullRemoval) {
- result = (int) count * (int) edges;
- } else {
- result = (int) (count - 1) * (int) edges;
- }
-
- return result < 0 ? 0 : result;
-}
-
-inline void LoopDetector::RemoveLoop(size_t skip_identical_edges, bool fullRemoval) {
- size_t toRemove = EdgesToRemove(skip_identical_edges, fullRemoval);
- for (size_t i = 0; i < toRemove; ++i) {
- path_->PopBack();
- }
-}
-
-inline bool LoopDetector::EdgeInShortLoop(EdgeId e) const {
- pair<EdgeId, EdgeId> temp;
- return GetLoopAndExit(path_->graph(), e, temp);
-}
-
-inline bool LoopDetector::PrevEdgeInShortLoop() const {
- if (path_->Size() <= 2) {
- return false;
- }
- const Graph& g = path_->graph();
- EdgeId e2 = path_->At(path_->Size() - 1);
- EdgeId e1 = path_->At(path_->Size() - 2);
- VertexId v2 = g.EdgeEnd(e1);
- if (g.OutgoingEdgeCount(v2) == 2 && g.EdgeEnd(e2) == g.EdgeStart(e1) && g.EdgeEnd(e1) == g.EdgeStart(e2)) {
- return EdgeInShortLoop(e1);
- }
- return false;
-}
-
-class ScaffoldBreaker {
-
-private:
-
- int min_gap_;
-
- PathContainer container_;
-
- void SplitPath(const BidirectionalPath& path) {
- size_t i = 0;
-
- while (i < path.Size()) {
- BidirectionalPath * p = new BidirectionalPath(path.graph(), path[i]);
- ++i;
-
- while(i < path.Size() and path.GapAt(i) <= min_gap_) {
- p->PushBack(path[i], path.GapAt(i), path.TrashPreviousAt(i), path.TrashCurrentAt(i));
- ++i;
- }
- if (i < path.Size()) {
- DEBUG("split path " << i << " gap " << path.GapAt(i));
- p->Print();
- }
-
- BidirectionalPath * cp = new BidirectionalPath(p->Conjugate());
- container_.AddPair(p, cp);
- }
- }
-
-public:
-
- ScaffoldBreaker(int min_gap): min_gap_(min_gap) {
-
- }
-
- void Split(PathContainer& paths) {
- for (auto it = paths.begin(); it != paths.end(); ++it) {
- SplitPath(*it.get());
- }
- }
-
-
- void clear() {
- container_.clear();
- }
-
- PathContainer& container() {
- return container_;
- }
-
-};
-
-}
-
-#endif /* PE_UTILS_HPP_ */
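
LoopDetector::LastLoopCount() above answers a simple question: how many times does the trailing block of `edges` elements repeat at the end of the path? The same check on a plain std::vector, with a hypothetical helper name:

    #include <cstddef>
    #include <iostream>
    #include <vector>

    // Count how many consecutive copies of the last `block` elements end the sequence.
    std::size_t last_loop_count(const std::vector<int>& path, std::size_t block) {
        if (block == 0 || path.size() < block) return 0;
        std::size_t count = 0;
        for (std::size_t start = path.size() - block; ; start -= block) {
            bool match = true;
            for (std::size_t j = 0; j < block; ++j) {
                if (path[start + j] != path[path.size() - block + j]) { match = false; break; }
            }
            if (!match) break;
            ++count;
            if (start < block) break;   // no room for another full block
        }
        return count;
    }

    int main() {
        std::vector<int> path = {5, 1, 2, 1, 2, 1, 2};
        std::cout << last_loop_count(path, 2) << "\n";   // 3: the block {1, 2} repeats three times
        return 0;
    }

IsCycled() compares this count against loopLimit to decide whether the extender has fallen into a cycle that RemoveLoop() should then truncate.
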
diff --git a/src/debruijn/path_extend/scaffolder2015/connection_condition2015.cpp b/src/debruijn/path_extend/scaffolder2015/connection_condition2015.cpp
deleted file mode 100644
index 1e411c5..0000000
--- a/src/debruijn/path_extend/scaffolder2015/connection_condition2015.cpp
+++ /dev/null
@@ -1,111 +0,0 @@
-
-
-#include "connection_condition2015.hpp"
-namespace path_extend {
-
- PairedLibConnectionCondition::PairedLibConnectionCondition(const debruijn_graph::Graph &graph,
- shared_ptr <PairedInfoLibrary> lib,
- size_t lib_index,
- size_t min_read_count) :
- graph_(graph),
- lib_(lib),
- lib_index_(lib_index),
- min_read_count_(min_read_count),
-//TODO reconsider condition
- left_dist_delta_(5 * (int) lib_->GetIsVar()),
- right_dist_delta_(5 * (int) lib_->GetISMax()) {
- }
-
- size_t PairedLibConnectionCondition::GetLibIndex() const {
- return lib_index_;
- }
-
- set <debruijn_graph::EdgeId> PairedLibConnectionCondition::ConnectedWith(debruijn_graph::EdgeId e) const {
- set <debruijn_graph::EdgeId> all_edges;
- int e_length = (int) graph_.length(e);
- lib_->FindJumpEdges(e, all_edges, e_length - left_dist_delta_, e_length + right_dist_delta_);
-
- set <debruijn_graph::EdgeId> result;
- for (auto edge : all_edges) {
- if (edge != e && edge != graph_.conjugate(e) &&
- math::ge(GetWeight(e, edge), (double) min_read_count_)) {
- result.insert(edge);
- }
- }
- return result;
- }
-
- double PairedLibConnectionCondition::GetWeight(debruijn_graph::EdgeId e1, debruijn_graph::EdgeId e2) const {
- int e_length = (int) graph_.length(e1);
- double res = lib_->CountPairedInfo(e1, e2, e_length - left_dist_delta_, e_length + right_dist_delta_);
- VERIFY(res == lib_->CountPairedInfo(graph_.conjugate(e2), graph_.conjugate(e1),
- (int) graph_.length(e2) - left_dist_delta_,
- (int) graph_.length(e2) + right_dist_delta_));
-
- return res;
- }
-
-//TODO: We use the same part of the index twice; is it necessary?
- int PairedLibConnectionCondition::GetMedianGap(debruijn_graph::EdgeId e1, debruijn_graph::EdgeId e2) const {
- std::vector<int> distances;
- std::vector<double> weights;
- int e_length = (int) graph_.length(e1);
- lib_->CountDistances(e1, e2, distances, weights);
- std::vector<pair<int, double> >h(distances.size());
- for (size_t i = 0; i< distances.size(); i++) {
- if (distances[i] >= e_length - left_dist_delta_ && distances[i] <= e_length + right_dist_delta_)
- h.push_back(std::make_pair(distances[i], weights[i]));
- }
-//TODO: is it really necessary?
- std::sort(h.begin(), h.end());
- double sum = 0.0;
- double sum2 = 0.0;
- for (size_t j = 0; j< h.size(); ++j) {
- sum += h[j].second;
- }
- size_t i = 0;
- for (; i < h.size(); ++i) {
- sum2 += h[i].second;
- if (sum2 * 2 > sum)
- break;
- }
- if (i >= h.size()) {
- WARN("Count median error");
- i = h.size() - 1;
- }
- return (int) round(h[i].first - e_length);
- }
-
- AssemblyGraphConnectionCondition::AssemblyGraphConnectionCondition(const debruijn_graph::Graph &g, size_t max_connection_length) :
- g_(g),
- max_connection_length_(max_connection_length) {
- }
-
- set <debruijn_graph::EdgeId> AssemblyGraphConnectionCondition::ConnectedWith(debruijn_graph::EdgeId e) const {
- set <debruijn_graph::EdgeId> result;
-
- for (auto connected: g_.OutgoingEdges(g_.EdgeEnd(e))) {
- result.insert(connected);
- }
-//TODO: optimization possible. Precompute all pairs of interesting connected vertices.
- DijkstraHelper<debruijn_graph::Graph>::BoundedDijkstra dijkstra(
- DijkstraHelper<debruijn_graph::Graph>::CreateBoundedDijkstra(g_, max_connection_length_));
- dijkstra.Run(g_.EdgeEnd(e));
- for (auto v: dijkstra.ReachedVertices()) {
- for (auto connected: g_.OutgoingEdges(v)) {
- result.insert(connected);
- }
- }
-
- return result;
- }
-
- double AssemblyGraphConnectionCondition::GetWeight(debruijn_graph::EdgeId, debruijn_graph::EdgeId) const {
- return 1.0;
- }
-
- size_t AssemblyGraphConnectionCondition::GetLibIndex() const {
- return (size_t) - 1;
- }
-
-}
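
GetMedianGap() above estimates the gap between two edges as a weighted median over the paired-read distance observations it collects. The weighted-median step on its own, with made-up numbers:

    #include <algorithm>
    #include <iostream>
    #include <utility>
    #include <vector>

    // Smallest distance at which the cumulative weight exceeds half of the total.
    int weighted_median(std::vector<std::pair<int, double> > samples) {
        std::sort(samples.begin(), samples.end());          // sort by distance
        double total = 0.0;
        for (const auto& s : samples) total += s.second;
        double acc = 0.0;
        for (const auto& s : samples) {
            acc += s.second;
            if (acc * 2 > total) return s.first;
        }
        return samples.empty() ? 0 : samples.back().first;  // degenerate fallback
    }

    int main() {
        // (distance, weight) observations, e.g. paired-read support for a junction
        std::vector<std::pair<int, double> > obs = { {120, 1.0}, {95, 3.0}, {300, 0.5}, {100, 2.0} };
        std::cout << weighted_median(obs) << "\n";          // 100: cumulative weight 5.0 of 6.5 first passes half here
        return 0;
    }

The removed implementation additionally restricts the observations to the window [e_length - left_dist_delta_, e_length + right_dist_delta_] and subtracts the edge length from the result.
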
diff --git a/src/debruijn/path_extend/scaffolder2015/connection_condition2015.hpp b/src/debruijn/path_extend/scaffolder2015/connection_condition2015.hpp
deleted file mode 100644
index a9842b4..0000000
--- a/src/debruijn/path_extend/scaffolder2015/connection_condition2015.hpp
+++ /dev/null
@@ -1,69 +0,0 @@
-
-#ifndef CONNECTION_CONDITION2015_HPP
-#define CONNECTION_CONDITION2015_HPP
-#include "genome_consistance_checker.hpp"
-#include "logger/logger.hpp"
-#include "path_extend/paired_library.hpp"
-#include <map>
-#include <set>
-
-namespace path_extend {
-
-/* Connection conditions are used by both the scaffolder's extension chooser and the scaffold graph */
-
- class ConnectionCondition {
- public:
-// Outputs the edges e is connected with.
-//TODO performance issue: think about inside filtering. Return only unique connected edges?
- virtual set <debruijn_graph::EdgeId> ConnectedWith(debruijn_graph::EdgeId e) const = 0;
-// Outputs the weight of the pair e1 and e2
- virtual double GetWeight(debruijn_graph::EdgeId e1, debruijn_graph::EdgeId e2) const = 0;
- virtual size_t GetLibIndex() const = 0;
- virtual ~ConnectionCondition() {
- }
- };
-/* Main (mate pair library) connection condition.
- *
- */
- class PairedLibConnectionCondition : public ConnectionCondition {
- private:
- const debruijn_graph::Graph &graph_;
- shared_ptr <PairedInfoLibrary> lib_;
- size_t lib_index_;
-//Minimal number of mate pairs to call connection sound
- size_t min_read_count_;
- public:
-//Only paired info with a gap between e1 and e2 in the range [-left_dist_delta_, right_dist_delta_] is taken into account
- int left_dist_delta_;
- int right_dist_delta_;
-
- PairedLibConnectionCondition(const debruijn_graph::Graph &graph,
- shared_ptr <PairedInfoLibrary> lib,
- size_t lib_index,
- size_t min_read_count);
- size_t GetLibIndex() const override;
- set <debruijn_graph::EdgeId> ConnectedWith(debruijn_graph::EdgeId e) const override;
- double GetWeight(debruijn_graph::EdgeId e1, debruijn_graph::EdgeId e2) const override;
-//Returns median gap size
- int GetMedianGap (debruijn_graph::EdgeId e1, debruijn_graph::EdgeId e2) const;
- };
-
-/* Condition used to find edges connected in the assembly graph.
- *
- */
- class AssemblyGraphConnectionCondition : public ConnectionCondition {
- private:
- const debruijn_graph::Graph &g_;
-//Maximal gap to the connection.
- size_t max_connection_length_;
-
- public:
- AssemblyGraphConnectionCondition(const debruijn_graph::Graph &g, size_t max_connection_length);
-
- set <debruijn_graph::EdgeId> ConnectedWith(debruijn_graph::EdgeId e) const override;
- double GetWeight(debruijn_graph::EdgeId, debruijn_graph::EdgeId) const override;
- size_t GetLibIndex() const override;
- };
-}
-
-#endif //PROJECT_CONNECTION_CONDITION2015_HPP
diff --git a/src/debruijn/path_extend/scaffolder2015/extension_chooser2015.cpp b/src/debruijn/path_extend/scaffolder2015/extension_chooser2015.cpp
deleted file mode 100644
index ca45f49..0000000
--- a/src/debruijn/path_extend/scaffolder2015/extension_chooser2015.cpp
+++ /dev/null
@@ -1,81 +0,0 @@
-//
-// Created by lab42 on 8/26/15.
-//
-
-#include "extension_chooser2015.hpp"
-
-namespace path_extend {
-using namespace std;
-
-std::pair<EdgeId, int> ExtensionChooser2015::FindLastUniqueInPath(const BidirectionalPath& path) const {
- for (int i = (int)path.Size() - 1; i >= 0; --i) {
- if (unique_edges_->IsUnique(path.At(i))) {
- return std::make_pair(path.At(i), i);
- }
- }
- return std::make_pair(EdgeId(0), -1);
-}
-
-ExtensionChooser::EdgeContainer ExtensionChooser2015::FindNextUniqueEdge(const EdgeId from) const {
- VERIFY(unique_edges_->IsUnique(from));
- EdgeContainer result;
- set<EdgeId> candidate_edges = paired_connection_condition_.ConnectedWith(from);
- vector<pair<double, pair<EdgeId, int >>> to_sort;
- for (EdgeId e : candidate_edges) {
- if (!unique_edges_->IsUnique(e)) {
- continue;
- }
- double sum = paired_connection_condition_.GetWeight(from, e);
- DEBUG("edge " << g_.int_id(e) << " weight " << sum);
- if (sum < absolute_weight_threshold_) {
- DEBUG("Edge " << g_.int_id(e) << " weight " << sum << " failed absolute weight threshold " << absolute_weight_threshold_);
- continue;
- }
- int gap = paired_connection_condition_.GetMedianGap(from, e);
-
- auto connected_with = graph_connection_condition_.ConnectedWith(from);
- if (connected_with.find(e) != connected_with.end()) {
- sum *= graph_connection_bonus_;
- }
- to_sort.push_back(make_pair(sum, make_pair(e, gap)));
- }
-//sort in descending order (reverse iterators)
- sort(to_sort.rbegin(), to_sort.rend());
- for(size_t j = 0; j < to_sort.size(); j++) {
- if (j == 0 || to_sort[j].first* relative_weight_threshold_ > to_sort[j - 1].first) {
- result.push_back(EdgeWithDistance(to_sort[j].second.first, to_sort[j].second.second));
- DEBUG("Edge " << g_.int_id(to_sort[j].second.first) << " gap " << to_sort[j].second.second << " weight "<< to_sort[j].first << " passed absolute weight threshold " << absolute_weight_threshold_);
- } else {
- DEBUG ("Edge " << g_.int_id(to_sort[j].second.first) << " weight " << to_sort[j].first << " failed relative weight threshold " << relative_weight_threshold_);
- DEBUG("other removed");
- break;
- }
- }
- return result;
-}
-
-ExtensionChooser::EdgeContainer ExtensionChooser2015::Filter(const BidirectionalPath& path, const ExtensionChooser::EdgeContainer& /*edges*/) const {
-// set<EdgeId> candidates = FindCandidates(path);
- pair<EdgeId, int> last_unique = FindLastUniqueInPath(path);
- EdgeContainer result;
-
- if (last_unique.second < 0) {
-// No unique edge found
- return result;
- }
-
- result = FindNextUniqueEdge(last_unique.first);
-//Backward check: edges are connected only if each is the best continuation of the other.
- if (result.size() == 1) {
- DEBUG("For edge " << g_.int_id(last_unique.first) << " unique next edge "<< result[0].e_ <<" found, doing backwards check ");
- EdgeContainer backwards_check = FindNextUniqueEdge(g_.conjugate(result[0].e_));
- if ((backwards_check.size() != 1) || (g_.conjugate(backwards_check[0].e_) != last_unique.first)) {
- result.clear();
- } else {
-//Reduce the gap size by the total length of the edges that follow the last unique edge in the path.
- result[0].d_ -= int (path.LengthAt(last_unique.second) - g_.length(last_unique.first));
- }
- }
- return result;
-}
-
-}
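
The Filter() above combines three checks: an absolute weight threshold, a relative dominance check against the runner-up, and a backward (reciprocal) check from the conjugate of the winning candidate. The following standalone sketch reproduces only the threshold logic; PickDominantCandidate and the plain int edge ids are hypothetical names, not part of the SPAdes code.

    #include <algorithm>
    #include <optional>
    #include <utility>
    #include <vector>

    // candidates: (weight, edge id) pairs proposed for extending some edge.
    std::optional<int> PickDominantCandidate(std::vector<std::pair<double, int>> candidates,
                                             double absolute_threshold,
                                             double relative_threshold) {
        // Drop everything below the absolute weight threshold.
        candidates.erase(std::remove_if(candidates.begin(), candidates.end(),
                                        [&](const std::pair<double, int>& c) {
                                            return c.first < absolute_threshold;
                                        }),
                         candidates.end());
        if (candidates.empty())
            return std::nullopt;
        std::sort(candidates.rbegin(), candidates.rend());  // descending by weight
        // Mirror of the relative_weight_threshold_ check: if the runner-up times the
        // relative factor still beats the winner, the extension is ambiguous.
        if (candidates.size() > 1 &&
            candidates[1].first * relative_threshold > candidates[0].first)
            return std::nullopt;
        return candidates[0].second;
    }

In the original, a surviving candidate is additionally confirmed by running the same search from its conjugate and requiring that it points back to the last unique edge of the path.
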
diff --git a/src/debruijn/path_extend/scaffolder2015/extension_chooser2015.hpp b/src/debruijn/path_extend/scaffolder2015/extension_chooser2015.hpp
deleted file mode 100644
index 64c9080..0000000
--- a/src/debruijn/path_extend/scaffolder2015/extension_chooser2015.hpp
+++ /dev/null
@@ -1,49 +0,0 @@
-//
-// Created by lab42 on 8/26/15.
-//
-#pragma once
-
-#include "path_extend/extension_chooser.hpp"
-#include "connection_condition2015.hpp"
-#include "genome_consistance_checker.hpp"
-#include "logger/logger.hpp"
-#include <map>
-#include <set>
-namespace path_extend {
-class ExtensionChooser2015: public ScaffoldingExtensionChooser {
-private:
- shared_ptr<ScaffoldingUniqueEdgeStorage> unique_edges_;
-// for candidate connections e1 and e2: if weight(e1) > relative_weight_threshold_ * weight(e2), then e2 is ignored
- double relative_weight_threshold_;
- PairedLibConnectionCondition paired_connection_condition_;
- AssemblyGraphConnectionCondition graph_connection_condition_;
-// connections with weight < absolute_weight_threshold_ are ignored
- size_t absolute_weight_threshold_;
-// multiplier for pairs that are also connected in the assembly graph.
- double graph_connection_bonus_;
-
-protected:
-//If the path contains no unique edges, the returned index is -1
- pair<EdgeId, int> FindLastUniqueInPath(const BidirectionalPath& path) const;
-//Finds all possible next unique edges supported by mate-pair information; absolute_weight_threshold_ and relative_weight_threshold_ are used for filtering
- EdgeContainer FindNextUniqueEdge(const EdgeId from) const;
- DECL_LOGGER("ExtensionChooser2015")
-public:
- ExtensionChooser2015(const Graph& g, shared_ptr<WeightCounter> wc, double is_scatter_coeff,
- shared_ptr<ScaffoldingUniqueEdgeStorage> unique_edges, double relative_threshold, size_t lib_index):
- ScaffoldingExtensionChooser(g, wc, is_scatter_coeff),
- unique_edges_(unique_edges),
- relative_weight_threshold_(relative_threshold),
-//TODO: these constants should be reconsidered
- paired_connection_condition_(g, wc->get_libptr(), lib_index, 0),
- graph_connection_condition_(g, 2 * unique_edges_->GetMinLength()),
- absolute_weight_threshold_(2),
- graph_connection_bonus_(2) {
- INFO("ExtensionChooser2015 created");
- }
-/* @param edges is not used and is kept only for interface compatibility
- * @returns the possible next edge if a unique one exists, otherwise an empty container
- */
-
- EdgeContainer Filter(const BidirectionalPath& path, const EdgeContainer& edges) const override;
-};
-
-
-}
diff --git a/src/debruijn/path_extend/scaffolder2015/scaff_supplementary.hpp b/src/debruijn/path_extend/scaffolder2015/scaff_supplementary.hpp
deleted file mode 100644
index aef1431..0000000
--- a/src/debruijn/path_extend/scaffolder2015/scaff_supplementary.hpp
+++ /dev/null
@@ -1,75 +0,0 @@
-#pragma once
-#include "graph_pack.hpp"
-#include "logger/logger.hpp"
-
-namespace path_extend {
- typedef debruijn_graph::EdgeId EdgeId;
-
-/* Storage of presumably unique, relatively long edges. Filled by ScaffoldingUniqueEdgeAnalyzer
- *
- */
- class ScaffoldingUniqueEdgeStorage {
- friend class ScaffoldingUniqueEdgeAnalyzer;
- private:
- set <EdgeId> unique_edges_;
- size_t min_unique_length_;
- public:
- ScaffoldingUniqueEdgeStorage(): unique_edges_(), min_unique_length_(0) {
- DEBUG("storage created, empty");
- }
-
- bool IsUnique(EdgeId e) const {
- return (unique_edges_.find(e) != unique_edges_.end());
- }
-
- decltype(unique_edges_.begin()) begin() const {
- return unique_edges_.begin();
- }
-
- decltype(unique_edges_.end()) end() const {
- return unique_edges_.end();
- }
-
- size_t size() const {
- return unique_edges_.size();
- }
- size_t GetMinLength() const {
- return min_unique_length_;
- }
- void SetMinLength(size_t min_length) {
- min_unique_length_ = min_length;
- }
-
- const set<EdgeId>& GetSet() const {
- return unique_edges_;
- }
- protected:
- DECL_LOGGER("ScaffoldingUniqueEdgeStorage")
-
- };
-
-/* Auxiliary class required to fill in the unique edge storage.
- *
- */
- class ScaffoldingUniqueEdgeAnalyzer {
- private:
- const debruijn_graph::conj_graph_pack &gp_;
- size_t length_cutoff_;
- double median_coverage_;
- double relative_coverage_variation_;
- protected:
- DECL_LOGGER("ScaffoldingUniqueEdgeAnalyzer")
-
-
- void SetCoverageBasedCutoff();
- public:
- ScaffoldingUniqueEdgeAnalyzer(const debruijn_graph::conj_graph_pack &gp, size_t apriori_length_cutoff, double max_relative_coverage):gp_(gp), length_cutoff_(apriori_length_cutoff), relative_coverage_variation_(max_relative_coverage){
- SetCoverageBasedCutoff();
- }
- void FillUniqueEdgeStorage(ScaffoldingUniqueEdgeStorage &storage_);
- };
-}
-
-
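
The analyzer above selects "unique" edges using an a priori length cutoff and a coverage criterion derived in SetCoverageBasedCutoff(). The sketch below shows one plausible reading of that criterion, keeping long edges whose coverage stays within a relative band around the median coverage of long edges. ToyEdge, SelectUniqueEdges and the band itself are assumptions for illustration; the exact rule implemented by FillUniqueEdgeStorage is not reproduced here.

    #include <algorithm>
    #include <cstddef>
    #include <set>
    #include <vector>

    struct ToyEdge { int id; std::size_t length; double coverage; };

    std::set<int> SelectUniqueEdges(const std::vector<ToyEdge>& edges,
                                    std::size_t length_cutoff,
                                    double max_relative_coverage) {
        // Median coverage over sufficiently long edges.
        std::vector<double> covs;
        for (const auto& e : edges)
            if (e.length >= length_cutoff)
                covs.push_back(e.coverage);
        std::set<int> unique;
        if (covs.empty())
            return unique;
        std::nth_element(covs.begin(), covs.begin() + covs.size() / 2, covs.end());
        double median_cov = covs[covs.size() / 2];

        // Keep long edges whose coverage does not deviate too much from the median.
        for (const auto& e : edges)
            if (e.length >= length_cutoff &&
                e.coverage >= median_cov / max_relative_coverage &&
                e.coverage <= median_cov * max_relative_coverage)
                unique.insert(e.id);
        return unique;
    }
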
diff --git a/src/debruijn/path_extend/scaffolder2015/scaffold_graph.cpp b/src/debruijn/path_extend/scaffolder2015/scaffold_graph.cpp
deleted file mode 100644
index 0dfd8b8..0000000
--- a/src/debruijn/path_extend/scaffolder2015/scaffold_graph.cpp
+++ /dev/null
@@ -1,275 +0,0 @@
-#include "scaffold_graph.hpp"
-
-
-namespace path_extend {
-namespace scaffold_graph {
-
-std::atomic<ScaffoldGraph::ScaffoldEdgeIdT> ScaffoldGraph::ScaffoldEdge::scaffold_edge_id_{0};
-
-void ScaffoldGraph::AddEdgeSimple(const ScaffoldGraph::ScaffoldEdge &e, size_t conjugate_id) {
- edges_.emplace(e.getId(), e);
- outgoing_edges_.emplace(e.getStart(), e.getId());
- incoming_edges_.emplace(e.getEnd(), e.getId());
- conjugate_[e.getId()] = conjugate_id;
-}
-
-void ScaffoldGraph::DeleteOutgoing(const ScaffoldGraph::ScaffoldEdge &e) {
- auto e_range = outgoing_edges_.equal_range(e.getStart());
-//erase() invalidates the erased iterator, so advance it manually instead of in the loop header
- for (auto edge_id = e_range.first; edge_id != e_range.second; ) {
- if (edges_.at(edge_id->second) == e) {
- edge_id = outgoing_edges_.erase(edge_id);
- } else {
- ++edge_id;
- }
- }
-}
-
-void ScaffoldGraph::DeleteIncoming(const ScaffoldGraph::ScaffoldEdge &e) {
- auto e_range = incoming_edges_.equal_range(e.getEnd());
-//erase() invalidates the erased iterator, so advance it manually instead of in the loop header
- for (auto edge_id = e_range.first; edge_id != e_range.second; ) {
- if (edges_.at(edge_id->second) == e) {
- edge_id = incoming_edges_.erase(edge_id);
- } else {
- ++edge_id;
- }
- }
-}
-
-void ScaffoldGraph::DeleteAllOutgoingEdgesSimple(ScaffoldGraph::ScaffoldVertex v) {
- auto e_range = outgoing_edges_.equal_range(v);
- for (auto edge_id = e_range.first; edge_id != e_range.second; ++edge_id) {
- DeleteIncoming(edges_.at(edge_id->second));
- }
- outgoing_edges_.erase(v);
-}
-
-void ScaffoldGraph::DeleteEdgeFromStorage(const ScaffoldGraph::ScaffoldEdge &e) {
- VERIFY(!Exists(e));
-
- size_t conjugate_id = conjugate_[e.getId()];
- edges_.erase(e.getId());
- edges_.erase(conjugate_id);
- conjugate_.erase(e.getId());
- conjugate_.erase(conjugate_id);
-}
-
-void ScaffoldGraph::DeleteAllIncomingEdgesSimple(ScaffoldGraph::ScaffoldVertex v) {
- auto e_range = incoming_edges_.equal_range(v);
- for (auto edge_id = e_range.first; edge_id != e_range.second; ++edge_id) {
- DeleteOutgoing(edges_.at(edge_id->second));
- }
- incoming_edges_.erase(v);
-}
-
-bool ScaffoldGraph::Exists(ScaffoldGraph::ScaffoldVertex assembly_graph_edge) const {
- return vertices_.count(assembly_graph_edge) != 0;
-}
-
-bool ScaffoldGraph::Exists(const ScaffoldGraph::ScaffoldEdge &e) const {
- auto e_range = outgoing_edges_.equal_range(e.getStart());
- for (auto edge_id = e_range.first; edge_id != e_range.second; ++edge_id) {
- if (edges_.at(edge_id->second) == e) {
- return true;
- }
- }
- return false;
-}
-
-ScaffoldGraph::ScaffoldVertex ScaffoldGraph::conjugate(ScaffoldGraph::ScaffoldVertex assembly_graph_edge) const {
- return assembly_graph_.conjugate(assembly_graph_edge);
-}
-
-ScaffoldGraph::ScaffoldEdge ScaffoldGraph::conjugate(const ScaffoldGraph::ScaffoldEdge &e) const {
- auto iter = conjugate_.find(e.getId());
- if (iter != conjugate_.end()) {
- return edges_.at(iter->second);
- }
- return ScaffoldEdge(conjugate(e.getEnd()), conjugate(e.getStart()), e.getColor(), e.getWeight());
-}
-
-bool ScaffoldGraph::AddVertex(ScaffoldGraph::ScaffoldVertex assembly_graph_edge) {
- if (!Exists(assembly_graph_edge)) {
- VERIFY(!Exists(conjugate(assembly_graph_edge)));
- vertices_.insert(assembly_graph_edge);
- vertices_.insert(conjugate(assembly_graph_edge));
- return true;
- }
- return false;
-}
-
-void ScaffoldGraph::AddVertices(const set<ScaffoldGraph::ScaffoldVertex> &vertices) {
- for (auto v : vertices) {
- AddVertex(v);
- }
-}
-
-bool ScaffoldGraph::AddEdge(ScaffoldGraph::ScaffoldVertex v1, ScaffoldGraph::ScaffoldVertex v2, size_t lib_id, double weight) {
- VERIFY(Exists(v1));
- VERIFY(Exists(v2));
-
- ScaffoldEdge e(v1, v2, lib_id, weight);
- if (Exists(e)) {
- VERIFY(Exists(conjugate(e)));
- return false;
- }
-
- auto conj = conjugate(e);
- AddEdgeSimple(e, conj.getId());
- AddEdgeSimple(conj, e.getId());
- return true;
-}
-
-void ScaffoldGraph::Print(ostream &os) const {
- for (auto v: vertices_) {
- os << "Vertex " << int_id(v) << " ~ " << int_id(conjugate(v))
- << ": len = " << assembly_graph_.length(v) << ", cov = " << assembly_graph_.coverage(v) << endl;
- }
- for (auto e_iter = ebegin(); e_iter != eend(); ++e_iter) {
- os << "Edge " << e_iter->getId() << " ~ " << conjugate(*e_iter).getId() <<
- ": " << int_id(e_iter->getStart()) << " -> " << int_id(e_iter->getEnd()) <<
- ", lib index = " << e_iter->getColor() << ", weight " << e_iter->getWeight() << endl;
- }
-}
-
-ScaffoldGraph::ScaffoldEdge ScaffoldGraph::UniqueIncoming(ScaffoldGraph::ScaffoldVertex assembly_graph_edge) const {
- VERIFY(HasUniqueIncoming(assembly_graph_edge));
- return edges_.at(incoming_edges_.find(assembly_graph_edge)->second);
-}
-
-ScaffoldGraph::ScaffoldEdge ScaffoldGraph::UniqueOutgoing(ScaffoldGraph::ScaffoldVertex assembly_graph_edge) const {
- VERIFY(HasUniqueOutgoing(assembly_graph_edge));
- return edges_.at(outgoing_edges_.find(assembly_graph_edge)->second);
-}
-
-bool ScaffoldGraph::HasUniqueIncoming(ScaffoldGraph::ScaffoldVertex assembly_graph_edge) const {
- return IncomingEdgeCount(assembly_graph_edge) == 1;
-}
-
-bool ScaffoldGraph::HasUniqueOutgoing(ScaffoldGraph::ScaffoldVertex assembly_graph_edge) const {
- return OutgoingEdgeCount(assembly_graph_edge) == 1;
-}
-
-size_t ScaffoldGraph::IncomingEdgeCount(ScaffoldGraph::ScaffoldVertex assembly_graph_edge) const {
- return incoming_edges_.count(assembly_graph_edge);
-}
-
-size_t ScaffoldGraph::OutgoingEdgeCount(ScaffoldGraph::ScaffoldVertex assembly_graph_edge) const {
- return outgoing_edges_.count(assembly_graph_edge);
-}
-
-vector<ScaffoldGraph::ScaffoldEdge> ScaffoldGraph::IncomingEdges(ScaffoldGraph::ScaffoldVertex assembly_graph_edge) const {
- vector<ScaffoldEdge> result;
- auto e_range = incoming_edges_.equal_range(assembly_graph_edge);
- for (auto edge_id = e_range.first; edge_id != e_range.second; ++edge_id) {
- result.push_back(edges_.at(edge_id->second));
- }
- return result;
-}
-
-vector<ScaffoldGraph::ScaffoldEdge> ScaffoldGraph::OutgoingEdges(ScaffoldGraph::ScaffoldVertex assembly_graph_edge) const {
- vector<ScaffoldEdge> result;
- auto e_range = outgoing_edges_.equal_range(assembly_graph_edge);
- for (auto edge_id = e_range.first; edge_id != e_range.second; ++edge_id) {
- result.push_back(edges_.at(edge_id->second));
- }
- return result;
-}
-
-const debruijn_graph::Graph &ScaffoldGraph::AssemblyGraph() const {
- return assembly_graph_;
-}
-
-size_t ScaffoldGraph::EdgeCount() const {
- return edges_.size();
-}
-
-size_t ScaffoldGraph::VertexCount() const {
- return vertices_.size();
-}
-
-ScaffoldGraph::ScaffoldVertex ScaffoldGraph::EdgeEnd(ScaffoldEdge e) const {
- return e.getEnd();
-}
-
-ScaffoldGraph::ScaffoldVertex ScaffoldGraph::EdgeStart(ScaffoldEdge e) const {
- return e.getStart();
-}
-
-size_t ScaffoldGraph::int_id(ScaffoldGraph::ScaffoldEdge e) const {
- return e.getId();
-}
-
-size_t ScaffoldGraph::int_id(ScaffoldGraph::ScaffoldVertex v) const {
- return assembly_graph_.int_id(v);
-}
-
-ScaffoldGraph::ConstScaffoldEdgeIterator ScaffoldGraph::eend() const {
- return ConstScaffoldEdgeIterator(edges_.cend());
-}
-
-ScaffoldGraph::ConstScaffoldEdgeIterator ScaffoldGraph::ebegin() const {
- return ConstScaffoldEdgeIterator(edges_.cbegin());
-}
-
-ScaffoldGraph::VertexStorage::const_iterator ScaffoldGraph::vend() const {
- return vertices_.cend();
-}
-
-ScaffoldGraph::VertexStorage::const_iterator ScaffoldGraph::vbegin() const {
- return vertices_.cbegin();
-}
-
-adt::iterator_range<ScaffoldGraph::VertexStorage::const_iterator> ScaffoldGraph::vertices() const {
- return adt::make_range(vbegin(), vend());
-}
-
-adt::iterator_range<ScaffoldGraph::ConstScaffoldEdgeIterator> ScaffoldGraph::edges() const {
- return adt::make_range(ebegin(), eend());
-}
-
-bool ScaffoldGraph::IsVertexIsolated(ScaffoldGraph::ScaffoldVertex assembly_graph_edge) const {
- bool result = incoming_edges_.count(assembly_graph_edge) == 0 && outgoing_edges_.count(assembly_graph_edge) == 0;
- VERIFY((incoming_edges_.count(conjugate(assembly_graph_edge)) == 0
- && incoming_edges_.count(assembly_graph_edge) == 0) == result);
- return result;
-}
-
-bool ScaffoldGraph::RemoveVertex(ScaffoldGraph::ScaffoldVertex assembly_graph_edge) {
- if (Exists(assembly_graph_edge)) {
- VERIFY(Exists(conjugate(assembly_graph_edge)));
-
- DeleteAllOutgoingEdgesSimple(assembly_graph_edge);
- DeleteAllIncomingEdgesSimple(assembly_graph_edge);
- DeleteAllOutgoingEdgesSimple(conjugate(assembly_graph_edge));
- DeleteAllIncomingEdgesSimple(conjugate(assembly_graph_edge));
-
- VERIFY(incoming_edges_.count(assembly_graph_edge) == 0);
- VERIFY(outgoing_edges_.count(assembly_graph_edge) == 0);
- VERIFY(incoming_edges_.count(conjugate(assembly_graph_edge)) == 0);
- VERIFY(outgoing_edges_.count(conjugate(assembly_graph_edge)) == 0);
-
- vertices_.erase(assembly_graph_edge);
- vertices_.erase(conjugate(assembly_graph_edge));
-
- return true;
- }
- return false;
-}
-
-bool ScaffoldGraph::RemoveEdge(const ScaffoldGraph::ScaffoldEdge &e) {
- if (Exists(e)) {
- VERIFY(Exists(conjugate(e)));
- DeleteOutgoing(e);
- DeleteIncoming(e);
- DeleteOutgoing(conjugate(e));
- DeleteIncoming(conjugate(e));
- DeleteEdgeFromStorage(e);
-
- return true;
- }
- return false;
-}
-
-bool ScaffoldGraph::AddEdge(const ScaffoldGraph::ScaffoldEdge &e) {
- return AddEdge(e.getStart(), e.getEnd(), e.getColor(), e.getWeight());
-}
-
-} //scaffold_graph
-} //path_extend
\ No newline at end of file
diff --git a/src/debruijn/path_extend/scaffolder2015/scaffold_graph.hpp b/src/debruijn/path_extend/scaffolder2015/scaffold_graph.hpp
deleted file mode 100644
index 033efea..0000000
--- a/src/debruijn/path_extend/scaffolder2015/scaffold_graph.hpp
+++ /dev/null
@@ -1,233 +0,0 @@
-//
-// Created by andrey on 17.09.15.
-//
-#pragma once
-
-#include "logger/logger.hpp"
-#include "debruijn_graph.hpp"
-#include "path_extend/paired_library.hpp"
-#include "connection_condition2015.hpp"
-
-#include <standard_base.hpp>
-#include <adt/iterator_range.hpp>
-
-namespace path_extend {
-namespace scaffold_graph {
-
-//do NOT add "using namespace debruijn_graph" to avoid confusion between the EdgeId typedefs
-
-class ScaffoldGraph {
-
-public:
- //An EdgeId in the de Bruijn graph is a vertex in the scaffold graph
- typedef debruijn_graph::EdgeId ScaffoldVertex;
-
- //Unique edge id
- typedef size_t ScaffoldEdgeIdT;
-
- //Scaffold edge information class
- struct ScaffoldEdge {
- private:
- //unique id
- ScaffoldEdgeIdT id_;
- //id counter
- static std::atomic<ScaffoldEdgeIdT> scaffold_edge_id_;
-
- ScaffoldVertex start_;
- ScaffoldVertex end_;
- //color = lib#
- size_t color_;
- //read pair weight or anything else
- double weight_;
-
- public:
-
- ScaffoldEdge(ScaffoldVertex start, ScaffoldVertex end, size_t lib_id = (size_t) -1, double weight = 0) :
- id_(scaffold_edge_id_++),
- start_(start), end_(end),
- color_(lib_id),
- weight_(weight) {
- }
-
- ScaffoldEdgeIdT getId() const {
- return id_;
- }
-
-
- size_t getColor() const {
- return color_;
- }
-
- double getWeight() const {
- return weight_;
- }
-
- const ScaffoldVertex getStart() const {
- return start_;
- }
-
- const ScaffoldVertex getEnd() const {
- return end_;
- }
-
- bool operator==(const ScaffoldEdge &e) const {
- return color_ == e.color_ && weight_ == e.weight_ && start_ == e.start_ && end_ == e.end_;
- }
-
- bool operator==(const ScaffoldEdge &e) {
- return color_ == e.color_ && weight_ == e.weight_ && start_ == e.start_ && end_ == e.end_;
- }
- };
-
- //typedefs allowing use in templated graph visualizers
- typedef ScaffoldVertex VertexId;
- typedef ScaffoldEdge EdgeId;
-
- //All vertices are stored in a set
- typedef std::set<ScaffoldVertex> VertexStorage;
- //Edges are stored in map: Id -> Edge Information
- typedef std::unordered_map<ScaffoldEdgeIdT, ScaffoldEdge> EdgeStorage;
- //Adjacency list contains vertex and edge id (instead of the whole edge information)
- typedef std::unordered_multimap<ScaffoldVertex, ScaffoldEdgeIdT> AdjacencyStorage;
-
- struct ConstScaffoldEdgeIterator: public boost::iterator_facade<ConstScaffoldEdgeIterator,
- const ScaffoldEdge,
- boost::forward_traversal_tag> {
- private:
- EdgeStorage::const_iterator iter_;
-
- public:
- ConstScaffoldEdgeIterator(EdgeStorage::const_iterator iter) : iter_(iter) {
- }
-
- private:
- friend class boost::iterator_core_access;
-
- void increment() {
- ++iter_;
- }
-
- bool equal(const ConstScaffoldEdgeIterator &other) const {
- return iter_ == other.iter_;
- }
-
- ScaffoldEdge dereference() const {
- return iter_->second;
- }
- };
-
-private:
- const debruijn_graph::Graph &assembly_graph_;
-
- VertexStorage vertices_;
-
- EdgeStorage edges_;
-
- //Map for storing conjugate scaffolding edges
- std::unordered_map<ScaffoldEdgeIdT, ScaffoldEdgeIdT> conjugate_;
-
- AdjacencyStorage outgoing_edges_;
-
- AdjacencyStorage incoming_edges_;
-
- //Add an edge without any checks and without adding its conjugate
- void AddEdgeSimple(const ScaffoldEdge &e, size_t conjugate_id);
-
- //Delete an outgoing edge from the adjacency list without checks
- //and without removing the conjugate and the respective incoming edge
- void DeleteOutgoing(const ScaffoldEdge &e);
-
- //Delete an incoming edge from the adjacency list without checks
- //and without removing the conjugate and the respective outgoing edge
- void DeleteIncoming(const ScaffoldEdge &e);
-
- //Delete all edge info from storage
- void DeleteEdgeFromStorage(const ScaffoldEdge &e);
-
- //Delete all edges outgoing from v from the adjacency lists
- void DeleteAllOutgoingEdgesSimple(ScaffoldVertex v);
-
- //Delete all edges incoming to v from the adjacency lists
- void DeleteAllIncomingEdgesSimple(ScaffoldVertex v);
-
-public:
- ScaffoldGraph(const debruijn_graph::Graph &g) : assembly_graph_(g) {
- }
-
- bool Exists(ScaffoldVertex assembly_graph_edge) const;
-
- bool Exists(const ScaffoldEdge &e) const;
-
- ScaffoldVertex conjugate(ScaffoldVertex assembly_graph_edge) const;
-
- //Return a structure equal to the conjugate of e (not necessarily the exact structure stored in the graph)
- ScaffoldEdge conjugate(const ScaffoldEdge &e) const;
-
- //Add an isolated vertex to the graph if it does not exist
- bool AddVertex(ScaffoldVertex assembly_graph_edge);
-
- void AddVertices(const set<ScaffoldVertex> &vertices);
-
- //Add an edge (and its conjugate) if it does not exist;
- //v1 and v2 must exist
- bool AddEdge(ScaffoldVertex v1, ScaffoldVertex v2, size_t lib_id, double weight);
-
- bool AddEdge(const ScaffoldEdge &e);
-
- //Remove the edge from the edge container and all adjacency lists
- bool RemoveEdge(const ScaffoldEdge &e);
-
- //Remove vertex and all adjacent edges
- bool RemoveVertex(ScaffoldVertex assembly_graph_edge);
-
- bool IsVertexIsolated(ScaffoldVertex assembly_graph_edge) const;
-
- VertexStorage::const_iterator vbegin() const;
-
- VertexStorage::const_iterator vend() const;
-
- adt::iterator_range<VertexStorage::const_iterator> vertices() const;
-
- ConstScaffoldEdgeIterator ebegin() const;
-
- ConstScaffoldEdgeIterator eend() const;
-
- adt::iterator_range<ScaffoldGraph::ConstScaffoldEdgeIterator> edges() const;
-
- size_t int_id(ScaffoldVertex v) const;
-
- size_t int_id(ScaffoldEdge e) const;
-
- ScaffoldVertex EdgeStart(ScaffoldEdge e) const;
-
- ScaffoldVertex EdgeEnd(ScaffoldEdge e) const;
-
- size_t VertexCount() const;
-
- size_t EdgeCount() const;
-
- const debruijn_graph::Graph & AssemblyGraph() const;
-
- vector<ScaffoldEdge> OutgoingEdges(ScaffoldVertex assembly_graph_edge) const;
-
- vector<ScaffoldEdge> IncomingEdges(ScaffoldVertex assembly_graph_edge) const;
-
- size_t OutgoingEdgeCount(ScaffoldVertex assembly_graph_edge) const;
-
- size_t IncomingEdgeCount(ScaffoldVertex assembly_graph_edge) const;
-
- bool HasUniqueOutgoing(ScaffoldVertex assembly_graph_edge) const;
-
- bool HasUniqueIncoming(ScaffoldVertex assembly_graph_edge) const;
-
- ScaffoldEdge UniqueOutgoing(ScaffoldVertex assembly_graph_edge) const;
-
- ScaffoldEdge UniqueIncoming(ScaffoldVertex assembly_graph_edge) const;
-
- void Print(ostream &os) const;
-
-};
-
-} //scaffold_graph
-} //path_extend
-
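
A small runnable toy that mirrors the storage layout described above: edges in a map keyed by id, adjacency kept in multimaps from vertex to edge id, and a conjugate_ map pairing every edge with its mirror. Vertices here are plain ints whose conjugate is the negated id; ToyScaffoldGraph and conj() are simplifications for illustration, not the SPAdes types.

    #include <cstddef>
    #include <iostream>
    #include <unordered_map>

    struct ToyScaffoldEdge { std::size_t id; int start; int end; double weight; };

    class ToyScaffoldGraph {
        std::unordered_map<std::size_t, ToyScaffoldEdge> edges_;
        std::unordered_multimap<int, std::size_t> outgoing_, incoming_;
        std::unordered_map<std::size_t, std::size_t> conjugate_;
        std::size_t next_id_ = 0;

        static int conj(int v) { return -v; }  // stand-in for g.conjugate(v)

        std::size_t AddSimple(int start, int end, double w) {
            std::size_t id = next_id_++;
            edges_[id] = ToyScaffoldEdge{id, start, end, w};
            outgoing_.emplace(start, id);
            incoming_.emplace(end, id);
            return id;
        }

    public:
        // Adding one scaffold edge always adds its conjugate and cross-links the ids,
        // so a later removal can erase both strands in one step.
        void AddEdge(int start, int end, double weight) {
            std::size_t id = AddSimple(start, end, weight);
            std::size_t cid = AddSimple(conj(end), conj(start), weight);
            conjugate_[id] = cid;
            conjugate_[cid] = id;
        }

        std::size_t OutgoingEdgeCount(int v) const { return outgoing_.count(v); }
    };

    int main() {
        ToyScaffoldGraph g;
        g.AddEdge(1, 2, 5.0);
        std::cout << g.OutgoingEdgeCount(1) << " " << g.OutgoingEdgeCount(-2) << "\n";  // prints: 1 1
    }

Storing the conjugate id explicitly is what lets RemoveEdge and RemoveVertex above clean up both strands symmetrically.
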
diff --git a/src/debruijn/path_extend/scaffolder2015/scaffold_graph_constructor.cpp b/src/debruijn/path_extend/scaffolder2015/scaffold_graph_constructor.cpp
deleted file mode 100644
index 4cc41aa..0000000
--- a/src/debruijn/path_extend/scaffolder2015/scaffold_graph_constructor.cpp
+++ /dev/null
@@ -1,73 +0,0 @@
-//
-// Created by andrey on 04.12.15.
-//
-
-#include "scaffold_graph_constructor.hpp"
-
-namespace path_extend {
-namespace scaffold_graph {
-
-
-bool LengthEdgeCondition::IsSuitable(debruijn_graph::EdgeId e) const {
- return graph_.length(e) >= min_length_;
-}
-
-void BaseScaffoldGraphConstructor::ConstructFromEdgeConditions(const EdgeCondition &edge_condition,
- vector<shared_ptr<ConnectionCondition>> &connection_conditions,
- bool use_terminal_vertices_only) {
- for (auto e = graph_->AssemblyGraph().ConstEdgeBegin(); !e.IsEnd(); ++e) {
- if (edge_condition.IsSuitable(*e)) {
- graph_->AddVertex(*e);
- }
- }
- ConstructFromConditions(connection_conditions, use_terminal_vertices_only);
-}
-
-void BaseScaffoldGraphConstructor::ConstructFromSet(const set<EdgeId> edge_set,
- vector<shared_ptr<ConnectionCondition>> &connection_conditions,
- bool use_terminal_vertices_only) {
- graph_->AddVertices(edge_set);
- ConstructFromConditions(connection_conditions, use_terminal_vertices_only);
-}
-
-void BaseScaffoldGraphConstructor::ConstructFromConditions(vector<shared_ptr<ConnectionCondition>> &connection_conditions,
- bool use_terminal_vertices_only) {
- for (auto condition : connection_conditions) {
- ConstructFromSingleCondition(condition, use_terminal_vertices_only);
- }
-}
-
-void BaseScaffoldGraphConstructor::ConstructFromSingleCondition(const shared_ptr<ConnectionCondition> condition,
- bool use_terminal_vertices_only) {
- for (auto v : graph_->vertices()) {
- TRACE("Vertex " << graph_->int_id(v));
-
- if (use_terminal_vertices_only && graph_->OutgoingEdgeCount(v) > 0)
- continue;
-
- auto connected_with = condition->ConnectedWith(v);
- for (auto connected : connected_with) {
- TRACE("Connected with " << graph_->int_id(connected));
- if (graph_->Exists(connected)) {
- if (use_terminal_vertices_only && graph_->IncomingEdgeCount(connected) > 0)
- continue;
- graph_->AddEdge(v, connected, condition->GetLibIndex(), condition->GetWeight(v, connected));
- }
- }
- }
-}
-
-
-shared_ptr<ScaffoldGraph> SimpleScaffoldGraphConstructor::Construct() {
- ConstructFromSet(edge_set_, connection_conditions_);
- return graph_;
-}
-
-shared_ptr<ScaffoldGraph> DefaultScaffoldGraphConstructor::Construct() {
- ConstructFromSet(edge_set_, connection_conditions_);
- ConstructFromEdgeConditions(edge_condition_, connection_conditions_);
- return graph_;
-}
-
-} //scaffold_graph
-} //path_extend
\ No newline at end of file
diff --git a/src/debruijn/path_extend/scaffolder2015/scaffold_graph_visualizer.cpp b/src/debruijn/path_extend/scaffolder2015/scaffold_graph_visualizer.cpp
deleted file mode 100644
index ed9f254..0000000
--- a/src/debruijn/path_extend/scaffolder2015/scaffold_graph_visualizer.cpp
+++ /dev/null
@@ -1,71 +0,0 @@
-//
-// Created by andrey on 21.09.15.
-//
-
-#include "scaffold_graph_visualizer.hpp"
-
-namespace path_extend{ namespace scaffold_graph {
-
-const map<size_t, string> ScaffoldEdgeColorer::color_map =
- {{(size_t) -1, "black"},
- {0, "red"},
- {1, "blue"},
- {2, "green"},
- {3, "magenta"},
- {4, "orange"},
- {5, "cyan"}};
-
-const string ScaffoldEdgeColorer::default_color = "black";
-
-string ScaffoldGraphLabeler::label(EdgeId e) const {
- return "ID: " + ToString(e.getId()) +
- "\\n Weight: " + ToString(e.getWeight()) +
- "\\n Lib#: " + ToString(e.getColor());
-}
-
-string ScaffoldGraphLabeler::label(VertexId v) const {
- return "ID: " + ToString(graph_.int_id(v)) +
- "\\n Len: " + ToString(graph_.AssemblyGraph().length(v)) +
- "\\n Cov: " + ToString(graph_.AssemblyGraph().coverage(v));
-}
-
-void ScaffoldGraphVisualizer::Visualize(GraphPrinter<ScaffoldGraph> &printer) {
- printer.open();
- printer.AddVertices(graph_.vbegin(), graph_.vend());
- for (auto e : graph_.edges()) {
- printer.AddEdge(e);
- }
- printer.close();
-}
-
-void ScaffoldGraphVisualizer::Visualize(ostream &os, CompositeGraphColorer<ScaffoldGraph>& colorer) {
- ScaffoldGraphLabeler labeler(graph_);
- EmptyGraphLinker<ScaffoldGraph> linker;
-
- if (paired_) {
- PairedGraphPrinter <ScaffoldGraph> printer(graph_, os, labeler, colorer, linker);
- Visualize(printer);
- } else {
- SingleGraphPrinter <ScaffoldGraph> printer(graph_, os, labeler, colorer, linker);
- Visualize(printer);
- }
-}
-
-string ScaffoldEdgeColorer::GetValue(ScaffoldGraph::EdgeId e) const {
- auto it = color_map.find(e.getColor());
- if (it != color_map.end()) {
- return it->second;
- }
- return default_color;
-}
-
-string ScaffoldVertexSetColorer::GetValue(ScaffoldGraph::VertexId v) const {
- if (vertex_set_.count(v) > 0)
- return "white";
- return "yellow";
-}
-} //scaffold_graph
-} //path_extend
-
-
-
diff --git a/src/debruijn/path_extend/scaffolder2015/scaffold_graph_visualizer.hpp b/src/debruijn/path_extend/scaffolder2015/scaffold_graph_visualizer.hpp
deleted file mode 100644
index cd42022..0000000
--- a/src/debruijn/path_extend/scaffolder2015/scaffold_graph_visualizer.hpp
+++ /dev/null
@@ -1,73 +0,0 @@
-//
-// Created by andrey on 21.09.15.
-//
-
-#ifndef PROJECT_SCAFFOLD_GRAPH_VISUALIZER_HPP
-#define PROJECT_SCAFFOLD_GRAPH_VISUALIZER_HPP
-
-#include "graphio.hpp"
-#include "scaffold_graph.hpp"
-
-namespace path_extend { namespace scaffold_graph {
-
-using namespace omnigraph::visualization;
-
-
-class ScaffoldGraphLabeler : public GraphLabeler<ScaffoldGraph> {
-
-private:
- const ScaffoldGraph &graph_;
-
-public:
- ScaffoldGraphLabeler(const ScaffoldGraph &graph) : graph_(graph) {
- }
-
- string label(VertexId v) const;
-
- string label(EdgeId e) const;
-};
-
-
-class ScaffoldEdgeColorer : public ElementColorer<ScaffoldGraph::EdgeId> {
-private:
- static const map<size_t, string> color_map;
-
- static const string default_color;
-
-public:
- string GetValue(ScaffoldGraph::EdgeId e) const;
-};
-
-
-class ScaffoldVertexSetColorer : public ElementColorer<ScaffoldGraph::VertexId> {
- private:
- set<ScaffoldGraph::VertexId> vertex_set_;
-
- public:
- ScaffoldVertexSetColorer(const set<ScaffoldGraph::VertexId>& vertex_set): vertex_set_(vertex_set) {
- }
-
- string GetValue(ScaffoldGraph::VertexId v) const;
-};
-
-class ScaffoldGraphVisualizer {
-
- const ScaffoldGraph &graph_;
- const bool paired_;
-
-private:
- void Visualize(GraphPrinter<ScaffoldGraph> &printer);
-
-public:
- ScaffoldGraphVisualizer(const ScaffoldGraph &graph, bool paired = true) :
- graph_(graph), paired_(paired) {
- }
-
- void Visualize(ostream &os, CompositeGraphColorer<ScaffoldGraph>& colorer);
-};
-
-} //scaffold_graph
-} //path_extend
-
-
-#endif //PROJECT_SCAFFOLD_GRAPH_VISUALIZER_HPP
diff --git a/src/debruijn/path_extend/split_graph_pair_info.hpp b/src/debruijn/path_extend/split_graph_pair_info.hpp
deleted file mode 100644
index 2a739b7..0000000
--- a/src/debruijn/path_extend/split_graph_pair_info.hpp
+++ /dev/null
@@ -1,449 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-/*
- * split_graph_pair_info.hpp
- *
- * Created on: May 14, 2013
- * Author: ira
- */
-
-#ifndef SPLIT_GRAPH_PAIR_INFO_HPP_
-#define SPLIT_GRAPH_PAIR_INFO_HPP_
-
-#include "sequence_mapper_notifier.hpp"
-#include "read_converter.hpp"
-#include "utils.hpp"
-#include "ideal_pair_info.hpp"
-
-using namespace debruijn_graph;
-
-namespace path_extend {
-
-inline double FindIntersection(vector<double>& pi1, vector<double>& pi2) {
- std::sort(pi1.begin(), pi1.end());
- std::sort(pi2.begin(), pi2.end());
- size_t iter1 = 0;
- size_t iter2 = 0;
- double threshold = 0.0;
- double percent1 = 0.0;
- double percent2 = 1.0;
- while (percent1 < percent2 and iter1 < pi1.size() and iter2 < pi2.size()) {
- threshold = pi1[iter1];
- while (iter2 < pi2.size() and pi2[iter2] <= threshold) {
- iter2++;
- }
- percent1 = (double) iter1 / (double) pi1.size();
- percent2 = 1.0 - (double) iter2 / (double) pi2.size();
- iter1 += 1;
- }
- return threshold;
-}
-
-class Basket {
- EdgeId edgeId_;
- size_t index_;
-
-public:
- Basket(EdgeId edgeId, size_t index)
- : edgeId_(edgeId), index_(index) { }
-
- Basket(const Basket& b)
- : edgeId_(b.edgeId_), index_(b.index_) {}
-
- const EdgeId edgeId() const {
- return edgeId_;
- }
-
- size_t index() const {
- return index_;
- }
-
- bool operator<(const Basket& rhs) const {
- if (edgeId() != rhs.edgeId()) {
- return edgeId() < rhs.edgeId();
- }
- return index() < rhs.index();
- }
-
- bool operator==(const Basket& rhs) const {
- return edgeId() == rhs.edgeId() && index() == rhs.index();
- }
-};
-
-struct PairInfo {
- double weight_;
- double distance_;
- size_t count_;
-
- PairInfo()
- : weight_(0.), distance_(0.), count_(0) {}
-
- PairInfo(double weight, double distance, size_t count = 0)
- : weight_(weight), distance_(distance), count_(count) {}
-
-};
-
-class EdgePairInfo {
- EdgeId edgeId_;
- size_t basket_size_;
- vector<map<Basket, PairInfo> > pair_info_;
-
-public:
- EdgePairInfo() {
- basket_size_ = 0;
- }
-
- EdgePairInfo(size_t length, EdgeId edgeId, size_t basket_size)
- : edgeId_(edgeId),
- basket_size_(basket_size) {
- size_t count_baskets = length / basket_size_ + 1;
- for (size_t index = 0; index < count_baskets; ++index) {
- pair_info_.push_back(map<Basket, PairInfo>());
- }
- }
-
- EdgePairInfo(const EdgePairInfo& pairInfo)
- : edgeId_(pairInfo.edgeId_),
- basket_size_(pairInfo.basket_size_) {
- for (size_t index = 0; index < pairInfo.pair_info_.size(); ++index) {
- pair_info_.push_back(pairInfo.pair_info_[index]);
- }
- }
-
- void AddPairInfo(size_t pos_begin1, size_t pos_end1, EdgeId edgeId2,
- size_t pos_begin2, size_t pos_end2, double weight,
- double edge_distance) {
- size_t begin_basket_index1 = GetBasketIndex(pos_begin1);
- size_t end_basket_index1 = GetBasketIndex(pos_end1);
- size_t begin_basket_index2 = GetBasketIndex(pos_begin2);
- size_t end_basket_index2 = GetBasketIndex(pos_end2);
- for (size_t index1 = begin_basket_index1; index1 <= end_basket_index1;
- ++index1) {
- for (size_t index2 = begin_basket_index2;
- index2 <= end_basket_index2; ++index2) {
- AddPairInfoToBasket(index1, edgeId2, index2, weight,
- edge_distance);
- }
- }
- }
-
- void AddPairInfo(const EdgePairInfo& edgePairInfo) {
- for (size_t index = 0; index < pair_info_.size(); ++index) {
- const map<Basket, PairInfo>& basketInfoToAdd = edgePairInfo
- .pair_info_[index];
- map<Basket, PairInfo>& oldBasketInfo = pair_info_[index];
- for (auto iter = basketInfoToAdd.begin();
- iter != basketInfoToAdd.end(); ++iter) {
- if (oldBasketInfo.find(iter->first) == oldBasketInfo.end()) {
- oldBasketInfo[iter->first] = iter->second;
- } else {
- PairInfo& pairInfo = oldBasketInfo[iter->first];
- oldBasketInfo[iter->first] = PairInfo(
- pairInfo.weight_ + iter->second.weight_,
- CountNewDistance(pairInfo, iter->second.distance_,
- iter->second.count_),
- iter->second.count_ + pairInfo.count_);
- }
- }
- }
- }
-
- map<Basket, PairInfo>& GetInfo(size_t index) {
- return pair_info_.at(index);
- }
-
- size_t size() {
- return pair_info_.size();
- }
-
-private:
- size_t GetBasketIndex(size_t pos) const {
- return pos / basket_size_;
- }
-
- void AddPairInfoToBasket(size_t index1, EdgeId edgeId2, size_t index2,
- double weight, double edge_distance) {
- Basket basket2(edgeId2, index2);
- if (pair_info_[index1].find(basket2) == pair_info_[index1].end()) {
- pair_info_[index1][basket2] = PairInfo(0.0, 0);
- }
- PairInfo oldPairInfo = pair_info_[index1][basket2];
- double basket_distance = GetBasketDistance(edge_distance, index1,
- index2);
- pair_info_[index1][basket2] = PairInfo(
- oldPairInfo.weight_ + weight,
- CountNewDistance(oldPairInfo, basket_distance),
- oldPairInfo.count_ + 1);
- }
-
- double CountNewDistance(PairInfo& oldPairInfo, double distance,
- size_t count = 1) {
- return (oldPairInfo.distance_ * (double) oldPairInfo.count_
- + distance * (double) count)
- / (double) (oldPairInfo.count_ + count);
- }
-
- double GetBasketDistance(double edge_distance, size_t index1,
- size_t index2) {
- return edge_distance - (double) index1 * (double) basket_size_
- + (double) index2 * (double) basket_size_;
- }
-};
-
-class BasketsPairInfoIndex {
- const conj_graph_pack& gp_;
- size_t basket_size_;
- map<EdgeId, EdgePairInfo> pair_info_;
-
-public:
- BasketsPairInfoIndex(const conj_graph_pack& gp, size_t basket_size)
- : gp_(gp),
- basket_size_(basket_size) {
- }
-
- void AddPairInfo(EdgeId edgeId1, size_t pos_begin1, size_t pos_end1,
- EdgeId edgeId2, size_t pos_begin2, size_t pos_end2,
- double weight, double edge_distance) {
- if (pair_info_.find(edgeId1) == pair_info_.end()) {
- EdgePairInfo edgePairInfo2(gp_.g.length(edgeId1), edgeId1,
- basket_size_);
- pair_info_.insert(make_pair(edgeId1, edgePairInfo2));
- }
- pair_info_[edgeId1].AddPairInfo(pos_begin1, pos_end1, edgeId2,
- pos_begin2, pos_end2, weight,
- edge_distance);
- }
-
- EdgePairInfo& GetEdgePairInfo(EdgeId edgeId) {
- return pair_info_[edgeId];
- }
-
- void AddAll(const BasketsPairInfoIndex& index) {
- for (auto it = index.pair_info_.begin(); it != index.pair_info_.end();
- ++it) {
- if (pair_info_.find(it->first) == pair_info_.end()) {
- pair_info_.insert(make_pair(it->first, it->second));
- } else {
- pair_info_[it->first].AddPairInfo(it->second);
- }
- }
- }
-
- void Clear() {
- pair_info_.clear();
- }
-
- size_t size() const {
- return pair_info_.size();
- }
-
-};
-
-class SplitGraphPairInfo : public SequenceMapperListener {
-
-public:
- //TODO: d_min = ? d_max = ? for ideal_pi_counter_
- SplitGraphPairInfo(conj_graph_pack& gp, size_t is,
- size_t is_var,
- size_t is_min, size_t is_max,
- size_t read_size, size_t /* k */, size_t basket_size,
- const std::map<int, size_t>& is_distribution)
- : gp_(gp),
- is_(is),
- is_var_(is_var),
- is_min_(is_min),
- is_max_(is_max),
- basket_size_(basket_size),
- basket_index_(gp, basket_size),
- threshold_(-1),
- ideal_pi_counter_(gp.g, (int)is_min_,
- (int)is_max_, read_size, is_distribution) {
-
- }
-
- void StartProcessLibrary(size_t threads_count) override {
- baskets_buffer_.clear();
- for (size_t i = 0; i < threads_count; ++i)
- baskets_buffer_.emplace_back(gp_, basket_size_);
- }
-
- void ProcessPairedRead(size_t thread_index,
- const io::PairedRead& r,
- const MappingPath<EdgeId>& read1,
- const MappingPath<EdgeId>& read2) override {
- ProcessPairedRead(baskets_buffer_[thread_index], r.first().size(), r.second().size(),
- read1, read2, r.distance());
- }
-
- void ProcessPairedRead(size_t thread_index,
- const io::PairedReadSeq& r,
- const MappingPath<EdgeId>& read1,
- const MappingPath<EdgeId>& read2) override {
- ProcessPairedRead(baskets_buffer_[thread_index], r.first().size(), r.second().size(),
- read1, read2, r.distance());
- }
-
- void ProcessSingleRead(size_t, const io::SingleRead&, const MappingPath<EdgeId>&) override {
- //only paired reads are interesting
- }
-
- void ProcessSingleRead(size_t, const io::SingleReadSeq&, const MappingPath<EdgeId>&) override {
- //only paired reads are interesting
- }
- void MergeBuffer(size_t thread_index) override {
- basket_index_.AddAll(baskets_buffer_[thread_index]);
- baskets_buffer_[thread_index].Clear();
- }
-
- void StopProcessLibrary() override {
- for (size_t i = 0; i < baskets_buffer_.size(); ++i)
- MergeBuffer(i);
-
- FindThreshold();
-
- baskets_buffer_.clear();
- }
-
- double GetThreshold() const {
- return threshold_;
- }
-
-private:
- void FindThreshold() {
- size_t min_long_edge = basket_size_;
- const Graph& g = gp_.g;
- vector<double> good_pi;
- vector<double> bad_pi;
- double insert_size_min = (double) is_ - 2. * (double) is_var_;
- double insert_size_max = (double) is_ + 2. * (double) is_var_;
- for (auto e = g.ConstEdgeBegin(); !e.IsEnd(); ++e) {
- EdgeId edge = *e;
-
- if (g.length(edge) > min_long_edge) {
- if (g.int_id(edge) <= 0)
- continue;
-
- EdgePairInfo& edge_pi = basket_index_.GetEdgePairInfo(edge);
- if (edge_pi.size() == 0)
- continue;
- size_t count_baskets = LastBasketIndex(edge, (int) insert_size_max,
- edge_pi);
- for (size_t index = 0; index <= count_baskets; ++index) {
- map<Basket, PairInfo>& basket_info = edge_pi.GetInfo(index);
- set<size_t> pair_baskets = GetBaskets(index,
- (int) insert_size_min,
- (int) insert_size_max,
- edge_pi);
- for (auto iter = basket_info.begin(); iter != basket_info.end(); ++iter) {
- PairInfo& pi = iter->second;
- if (iter->first.edgeId() == edge &&
- pair_baskets.find(iter->first.index()) != pair_baskets.end()) {
- good_pi.push_back(GetNormalizedWeight(pi));
- } else {
- bad_pi.push_back(GetNormalizedWeight(pi));
- }
- }
- }
- }
- }
- DEBUG("good pi size " << good_pi.size() << " bad pi size " << bad_pi.size());
- threshold_ = FindIntersection(good_pi, bad_pi);
- INFO("Threshold for paired information " << threshold_);
- }
-
- size_t LastBasketIndex(EdgeId edgeId, int insert_size_max,
- EdgePairInfo& edge_pair_info) {
- return min((gp_.g.length(edgeId) - insert_size_max) / basket_size_,
- edge_pair_info.size() - 1);
- }
-
- size_t FindBeginPairBasket(size_t index, int insert_size_min,
- EdgePairInfo& edge_pair_info) {
- return min(index + insert_size_min / basket_size_,
- edge_pair_info.size() - 1);
- }
-
- size_t FindEndPairBasket(size_t index, int insert_size_max,
- EdgePairInfo& edge_pair_info) {
- return min(index + insert_size_max / basket_size_,
- edge_pair_info.size() - 1);
- }
-
- set<size_t> GetBaskets(size_t index, int insert_size_min,
- int insert_size_max, EdgePairInfo& edge_pair_info) {
- set<size_t> result;
- size_t begin = FindBeginPairBasket(index, insert_size_min,
- edge_pair_info);
- size_t end = FindEndPairBasket(index, insert_size_max, edge_pair_info);
- for (size_t pair_index = begin; pair_index <= end; ++pair_index) {
- result.insert(pair_index);
- }
- return result;
- }
-
- double GetNormalizedWeight(PairInfo& pi) {
- return pi.weight_
- / ideal_pi_counter_.IdealPairedInfo(basket_size_, basket_size_,
- (int) pi.distance_);
- }
-
- void InnerProcess(BasketsPairInfoIndex& basket_index,
- const MappingPath<EdgeId>& path1,
- const MappingPath<EdgeId>& path2,
- size_t read_distance) {
- for (size_t i = 0; i < path1.size(); ++i) {
- pair<EdgeId, MappingRange> mapping_edge_1 = path1[i];
- for (size_t j = 0; j < path2.size(); ++j) {
- pair<EdgeId, MappingRange> mapping_edge_2 = path2[j];
- double weight = PairedReadCountWeight(mapping_edge_1.second,
- mapping_edge_2.second);
- size_t kmer_distance = read_distance
- + mapping_edge_2.second.initial_range.end_pos
- - mapping_edge_1.second.initial_range.start_pos;
- int edge_distance = (int) kmer_distance
- + (int) mapping_edge_1.second.mapped_range.start_pos
- - (int) mapping_edge_2.second.mapped_range.end_pos;
-
- basket_index.AddPairInfo(
- mapping_edge_1.first,
- mapping_edge_1.second.mapped_range.start_pos,
- mapping_edge_1.second.mapped_range.end_pos,
- mapping_edge_2.first,
- mapping_edge_2.second.mapped_range.start_pos,
- mapping_edge_2.second.mapped_range.end_pos, weight,
- (double) edge_distance);
- }
- }
- }
-
- void ProcessPairedRead(BasketsPairInfoIndex& basket_index,
- size_t r1_length,
- size_t r2_length,
- const MappingPath<EdgeId>& path1,
- const MappingPath<EdgeId>& path2,
- size_t read_distance) {
- InnerProcess(basket_index, path1, path2, read_distance);
- InnerProcess(basket_index, ConjugateMapping(gp_.g, path2, r2_length),
- ConjugateMapping(gp_.g, path1, r1_length), read_distance);
- }
-
- const conj_graph_pack& gp_;
- size_t is_;
- size_t is_var_;
- size_t is_min_;
- size_t is_max_;
- size_t basket_size_;
- BasketsPairInfoIndex basket_index_;
- vector<BasketsPairInfoIndex> baskets_buffer_;
- double threshold_;
- IdealPairInfoCounter ideal_pi_counter_;
-};
-
-} /* path_extend */
-
-#endif /* SPLIT_GRAPH_PAIR_INFO_HPP_ */
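
FindIntersection() near the top of the removed file picks the weight threshold at which the fraction of "good" (same-edge) pair-info weights already at or below the threshold first reaches the fraction of "bad" weights still above it. The example below restates that scan with invented numbers purely for illustration; FindThreshold is a renamed copy of the same loop, not new functionality.

    #include <algorithm>
    #include <cstddef>
    #include <iostream>
    #include <vector>

    double FindThreshold(std::vector<double> good, std::vector<double> bad) {
        std::sort(good.begin(), good.end());
        std::sort(bad.begin(), bad.end());
        std::size_t i = 0, j = 0;
        double threshold = 0.0, frac_good_below = 0.0, frac_bad_above = 1.0;
        while (frac_good_below < frac_bad_above && i < good.size() && j < bad.size()) {
            threshold = good[i];
            while (j < bad.size() && bad[j] <= threshold)
                ++j;                                   // count bad weights not exceeding the threshold
            frac_good_below = double(i) / double(good.size());
            frac_bad_above = 1.0 - double(j) / double(bad.size());
            ++i;
        }
        return threshold;
    }

    int main() {
        std::vector<double> good = {0.8, 0.9, 1.0, 1.2, 1.5};  // normalized weights of consistent pairs
        std::vector<double> bad  = {0.1, 0.2, 0.3, 0.6, 1.1};  // weights of spurious pairs
        std::cout << FindThreshold(good, bad) << "\n";         // prints 0.9 for these made-up values
    }
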
diff --git a/src/debruijn/path_extend/utils/paired_info_checker.cpp b/src/debruijn/path_extend/utils/paired_info_checker.cpp
deleted file mode 100644
index ca6b854..0000000
--- a/src/debruijn/path_extend/utils/paired_info_checker.cpp
+++ /dev/null
@@ -1,204 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-/*
- * paired_info_checker.cpp
- *
- * Created on: Sep 26, 2011
- * Author: andrey
- */
-
-#include "../lc_common.hpp"
-#include "../lc_io.hpp"
-
-using namespace debruijn_graph;
-
-class PairedInfoChecker {
-private:
- Graph& g_;
-
-public:
- PairedInfoChecker(Graph& g) : g_(g) {
-
- }
-
- bool IsSymmetric(PairedInfoIndex<Graph>& index) {
- bool result = true;
- for (auto iter = index.begin(); iter != index.end(); ++iter) {
- auto pi = *iter;
- if (pi.size() == 0) {
- continue;
- }
- EdgeId e1 = pi.back().first;
- EdgeId e2 = pi.back().second;
-
- auto sym_pi = index.GetEdgePairInfo(e2, e1);
-
- for (auto i1 = pi.begin(); i1 != pi.end(); ++i1) {
- for (auto i2 = sym_pi.begin(); i2 != sym_pi.end(); ++i2) {
- if (math::eq(i1->d, - i2->d) && !math::eq(i1->weight, i2->weight)) {
- INFO("No symmetric found ");
- result = false;
- }
- }
- }
-
- }
- return result;
- }
-
- bool IsConjugateSymmetric(PairedInfoIndex<Graph>& index) {
- bool result = true;
- for (auto iter = index.begin(); iter != index.end(); ++iter) {
- auto pi = *iter;
- if (pi.size() == 0) {
- continue;
- }
- EdgeId e1 = pi.back().first;
- EdgeId e2 = pi.back().second;
-
- auto conj_pi = index.GetEdgePairInfo(g_.conjugate(e1), g_.conjugate(e2));
-
- for (auto i1 = pi.begin(); i1 != pi.end(); ++i1) {
- for (auto i2 = conj_pi.begin(); i2 != conj_pi.end(); ++i2) {
- double new_d = i1->d - g_.length(e1) + g_.length(e2);
- if (math::eq(i1->d, - new_d) && !math::eq(i1->weight, i2->weight)) {
- INFO("No conjugate found ");
- result = false;
- }
- }
- }
-
- }
- return result;
- }
-
- bool AreEqual(PairedInfoIndex<Graph>& index1, PairedInfoIndex<Graph>& index2) {
- bool result = true;
- for (auto iter = index1.begin(); iter != index1.end(); ++iter) {
- auto pi = *iter;
- if (pi.size() == 0) {
- continue;
- }
- EdgeId e1 = pi.back().first;
- EdgeId e2 = pi.back().second;
-
- auto pi2 = index2.GetEdgePairInfo(e1, e2);
-
- for (auto i1 = pi.begin(); i1 != pi.end(); ++i1) {
- for (auto i2 = pi2.begin(); i2 != pi2.end(); ++i2) {
- if (math::eq(i1->d, i2->d) && !math::eq(i1->weight, i2->weight)) {
- INFO("Unequal weights");
- result = false;
- }
- }
- }
-
- }
- return result;
- }
-
- void AggregatePairedInfo(PairedInfoIndex<Graph>& clustered, PairedInfoIndex<Graph>& advanced,
- size_t insert_size, size_t read_length,
- PairedInfoIndex<Graph>* result) {
-
- PairedInfoWeightNormalizer<Graph> normalizer(g_, insert_size, read_length, K);
-
- for (auto iter = clustered.begin(); iter != clustered.end(); ++iter) {
- auto pi = *iter;
- if (pi.size() == 0) {
- continue;
- }
-
- EdgeId e1 = pi.back().first;
- EdgeId e2 = pi.back().second;
-
- auto pi2 = advanced.GetEdgePairInfo(e1, e2);
-
- for (auto i1 = pi.begin(); i1 != pi.end(); ++i1) {
-
- auto norm_pi = normalizer.NormalizeWeight(*i1);
-
- for (auto i2 = pi2.begin(); i2 != pi2.end(); ++i2) {
- if (math::ge(i1->d, i2->d - lc_cfg::get().u.dev) && math::le(i1->d, i2->d + lc_cfg::get().u.dev) && math::gr(i2->weight, 0.0)) {
- norm_pi.weight *= lc_cfg::get().es.advanced_coeff;
- }
- }
-
- result->AddPairInfo(norm_pi, false);
- }
-
- }
-
- }
-
-};
-
-
-int main() {
- cfg::create_instance(cfg_filename);
- lc_cfg::create_instance(long_contigs::lc_cfg_filename);
-
- Graph g(K);
- EdgeIndex<K + 1, Graph> index(g);
- PairedInfoIndex<Graph> pairedIndex(g, 0);
- KmerMapper<K+1, Graph> mapper(g);
- Sequence sequence("");
-
- long_contigs::LoadFromFile(lc_cfg::get().ds.graph_file, &g, sequence, &mapper);
- PairedInfoChecker checker(g);
-
- DataScanner<Graph> dataScanner(g);
-
- switch (lc_cfg::get().u.mode) {
- case 1: {
- INFO("Checking " << lc_cfg::get().u.file1);
- dataScanner.loadPaired(lc_cfg::get().u.file1, pairedIndex);
- INFO("Symmetric: " << checker.IsSymmetric(pairedIndex));
- INFO("Conjugate symmetric: " << checker.IsConjugateSymmetric(pairedIndex));
- break;
- }
- case 2: {
- PairedInfoIndex<Graph> pairedIndex2(g, 0);
- dataScanner.loadPaired(lc_cfg::get().u.file1, pairedIndex);
- dataScanner.loadPaired(lc_cfg::get().u.file2, pairedIndex2);
-
- INFO("Checking " << lc_cfg::get().u.file1 << " and " << lc_cfg::get().u.file2);
- INFO("1 is subset of 2 " << checker.AreEqual(pairedIndex, pairedIndex2));
- INFO("2 is subset of 1 " << checker.AreEqual(pairedIndex2, pairedIndex));
- break;
- }
- case 3: {
- INFO("Aggregating paired info");
-
- PairedInfoIndex<Graph> cl(g, 0);
- PairedInfoIndex<Graph> ad(g, 0);
- PairedInfoIndex<Graph> res(g, 0);
-
- dataScanner.loadPaired(lc_cfg::get().u.clustered, cl);
- dataScanner.loadPaired(lc_cfg::get().u.advanced, ad);
-
- checker.AggregatePairedInfo(cl, ad,
- lc_cfg::get().u.insert_size, lc_cfg::get().u.read_size,
- &res);
-
- DataPrinter<Graph> dataPrinter(g);
- dataPrinter.savePaired( "./" + lc_cfg::get().paired_info_file_prefix + "IS" + ToString(lc_cfg::get().u.insert_size) + "_RS" + ToString(lc_cfg::get().u.read_size)
- + "_agregate_" + ToString(lc_cfg::get().es.advanced_coeff), res);
-
- INFO("Done");
- break;
-
- }
- default: {
- INFO("Unknown mode");
- }
- }
-
- return 0;
-}
-
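
IsSymmetric() above verifies that every pair-info record (e1, e2, d) with weight w has a mirror record (e2, e1, -d) with the same weight. The toy below expresses that invariant over a simple map-based index; ToyIndex and the epsilon comparison are stand-ins for illustration, not the PairedInfoIndex API.

    #include <cmath>
    #include <iostream>
    #include <map>
    #include <tuple>

    using ToyIndex = std::map<std::tuple<int, int, int>, double>;  // (e1, e2, d) -> weight

    bool IsSymmetricToy(const ToyIndex& index, double eps = 1e-9) {
        bool ok = true;
        for (const auto& rec : index) {
            int e1 = std::get<0>(rec.first);
            int e2 = std::get<1>(rec.first);
            int d  = std::get<2>(rec.first);
            auto mirror = index.find(std::make_tuple(e2, e1, -d));
            if (mirror == index.end() || std::fabs(mirror->second - rec.second) > eps) {
                std::cerr << "No symmetric record for (" << e1 << ", " << e2 << ", " << d << ")\n";
                ok = false;
            }
        }
        return ok;
    }
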
diff --git a/src/debruijn/path_extend/weight_counter.hpp b/src/debruijn/path_extend/weight_counter.hpp
deleted file mode 100644
index b2d8ef6..0000000
--- a/src/debruijn/path_extend/weight_counter.hpp
+++ /dev/null
@@ -1,543 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-/*
- * weight_counter.hpp
- *
- * Created on: Feb 19, 2012
- * Author: andrey
- */
-
-#ifndef WEIGHT_COUNTER_HPP_
-#define WEIGHT_COUNTER_HPP_
-
-#include "bidirectional_path.hpp"
-#include "paired_library.hpp"
-#include <algorithm>
-#include <boost/math/special_functions/fpclassify.hpp>
-
-namespace path_extend {
-
-inline int median(const vector<int>& dist, const vector<double>& w, int min, int max) {
- VERIFY(dist.size() == w.size());
- double S = 0;
- for (size_t i = 0; i < w.size(); ++i) {
- if (dist[i] >= min && dist[i] <= max)
- S += w[i];
- }
- if (S == 0) {
- DEBUG("Empty histogram");
- return 0;
- }
-
- double sum = S;
- for (size_t i = 0; i < w.size(); ++i) {
- if (dist[i] >= min && dist[i] <= max) {
- sum -= w[i];
- if (sum <= S / 2) {
- return dist[i];
- }
- }
- }
- assert(false);
- return -1;
-}
-
-struct EdgeWithPairedInfo {
- size_t e_;
- double pi_;
-
- EdgeWithPairedInfo(size_t e_, double pi) :
- e_(e_), pi_(pi) {
-
- }
-};
-
-struct EdgeWithDistance {
- EdgeId e_;
- int d_;
-
- EdgeWithDistance(EdgeId e, size_t d) :
- e_(e), d_((int) d) {
- }
-
- struct DistanceComparator {
- bool operator()(const EdgeWithDistance& e1, const EdgeWithDistance& e2) {
- if (e1.d_ == e2.d_)
- return e1.e_ < e2.e_;
- return e1.d_ > e2.d_;
- }
- };
-
- //static DistanceComparator comparator;
-};
-
-class IdealInfoProvider {
-public:
- virtual ~IdealInfoProvider() {}
-
- virtual std::vector<EdgeWithPairedInfo> FindCoveredEdges(const BidirectionalPath& path, EdgeId candidate) const = 0;
-};
-
-class BasicIdealInfoProvider : public IdealInfoProvider {
- const shared_ptr<PairedInfoLibrary> lib_;
-public:
- BasicIdealInfoProvider(const shared_ptr<PairedInfoLibrary>& lib) : lib_(lib) {
- }
-
- std::vector<EdgeWithPairedInfo> FindCoveredEdges(const BidirectionalPath& path, EdgeId candidate) const override {
- std::vector<EdgeWithPairedInfo> covered;
- for (int i = (int) path.Size() - 1; i >= 0; --i) {
- double w = lib_->IdealPairedInfo(path[i], candidate,
- (int) path.LengthAt(i));
- //FIXME: decide whether extremely low ideal weights are needed
- if (math::gr(w, 0.)) {
- covered.push_back(EdgeWithPairedInfo(i, w));
- }
- }
- return covered;
- }
-};
-
-class WeightCounter {
-
-protected:
- const Graph& g_;
- const shared_ptr<PairedInfoLibrary> lib_;
- bool normalize_weight_;
- shared_ptr<IdealInfoProvider> ideal_provider_;
-
-public:
-
- WeightCounter(const Graph& g, const shared_ptr<PairedInfoLibrary>& lib,
- bool normalize_weight = true,
- shared_ptr<IdealInfoProvider> ideal_provider = nullptr) :
- g_(g), lib_(lib), normalize_weight_(normalize_weight), ideal_provider_(ideal_provider) {
- if (!ideal_provider_) {
- ideal_provider_ = make_shared<BasicIdealInfoProvider>(lib);
- }
- }
-
- virtual std::set<size_t> PairInfoExist(const BidirectionalPath& path, EdgeId e,
- int gap = 0) const = 0;
-
- virtual double CountWeight(const BidirectionalPath& path, EdgeId e,
- const std::set<size_t>& excluded_edges = std::set<size_t>(), int gapLength = 0) const = 0;
-
- const PairedInfoLibrary& lib() const {
- return *lib_;
- }
-
- const shared_ptr<PairedInfoLibrary> get_libptr() const {
- return lib_;
- };
-
-private:
- DECL_LOGGER("WeightCounter");
-};
-
-class ReadCountWeightCounter: public WeightCounter {
-
- std::vector<EdgeWithPairedInfo> CountLib(const BidirectionalPath& path, EdgeId e,
- int add_gap = 0) const {
- std::vector<EdgeWithPairedInfo> answer;
-
- for (const EdgeWithPairedInfo& e_w_pi : ideal_provider_->FindCoveredEdges(path, e)) {
- double w = lib_->CountPairedInfo(path[e_w_pi.e_], e,
- (int) path.LengthAt(e_w_pi.e_) + add_gap);
-
- if (normalize_weight_) {
- w /= e_w_pi.pi_;
- }
- answer.push_back(EdgeWithPairedInfo(e_w_pi.e_, w));
- }
-
- return answer;
- }
-
-public:
-
- ReadCountWeightCounter(const Graph& g, const shared_ptr<PairedInfoLibrary>& lib,
- bool normalize_weight = true,
- shared_ptr<IdealInfoProvider> ideal_provider = nullptr) :
- WeightCounter(g, lib, normalize_weight, ideal_provider) {
- }
-
- double CountWeight(const BidirectionalPath& path, EdgeId e,
- const std::set<size_t>& excluded_edges, int gap) const override {
- double weight = 0.0;
-
- for (const auto& e_w_pi : CountLib(path, e, gap)) {
- if (!excluded_edges.count(e_w_pi.e_)) {
- weight += e_w_pi.pi_;
- }
- }
-
- return weight;
- }
-
- std::set<size_t> PairInfoExist(const BidirectionalPath& path, EdgeId e,
- int gap = 0) const override {
- std::set<size_t> answer;
- for (const auto& e_w_pi : CountLib(path, e, gap)) {
- if (math::gr(e_w_pi.pi_, 0.)) {
- answer.insert(e_w_pi.e_);
- }
- }
-
- return answer;
- }
-
-};
-
-class PathCoverWeightCounter: public WeightCounter {
- double single_threshold_;
-
- double TotalIdealNonExcluded(const std::vector<EdgeWithPairedInfo>& ideally_covered_edges,
- const std::set<size_t>& excluded_edges) const {
- double ideal_total = 0.0;
-
- for (const EdgeWithPairedInfo& e_w_pi : ideally_covered_edges) {
- if (!excluded_edges.count(e_w_pi.e_))
- ideal_total += e_w_pi.pi_;
- }
-
- return ideal_total;
- }
-
- std::vector<EdgeWithPairedInfo> CountLib(const BidirectionalPath& path, EdgeId e,
- const std::vector<EdgeWithPairedInfo>& ideally_covered_edges, int add_gap = 0) const {
- std::vector<EdgeWithPairedInfo> answer;
-
- for (const EdgeWithPairedInfo& e_w_pi : ideally_covered_edges) {
- double ideal_weight = e_w_pi.pi_;
-
- double weight = lib_->CountPairedInfo(
- path[e_w_pi.e_], e,
- (int) path.LengthAt(e_w_pi.e_) + add_gap);
-
- if (normalize_weight_) {
- weight /= ideal_weight;
- }
-
- if (math::ge(weight, single_threshold_)) {
- answer.push_back(EdgeWithPairedInfo(e_w_pi.e_, ideal_weight));
- }
- }
-
- return answer;
- }
-
-public:
-
- PathCoverWeightCounter(const Graph& g, const shared_ptr<PairedInfoLibrary>& lib,
- bool normalize_weight = true,
- double single_threshold = -1.,
- shared_ptr<IdealInfoProvider> ideal_provider = nullptr) :
- WeightCounter(g, lib, normalize_weight, ideal_provider), single_threshold_(single_threshold) {
- if (math::ls(single_threshold_, 0.)) {
- single_threshold_ = lib_->GetSingleThreshold();
- }
- }
-
- double CountWeight(const BidirectionalPath& path, EdgeId e,
- const std::set<size_t>& excluded_edges, int gap) const override {
- double lib_weight = 0.;
- const auto ideal_coverage = ideal_provider_->FindCoveredEdges(path, e);
-
- for (const auto& e_w_pi : CountLib(path, e, ideal_coverage, gap)) {
- if (!excluded_edges.count(e_w_pi.e_)) {
- lib_weight += e_w_pi.pi_;
- }
- }
-
- double total_ideal_coverage = TotalIdealNonExcluded(ideal_coverage, excluded_edges);
- return math::eq(total_ideal_coverage, 0.) ? 0. : lib_weight / total_ideal_coverage;
- }
-
- std::set<size_t> PairInfoExist(const BidirectionalPath& path, EdgeId e,
- int gap = 0) const override {
- std::set<size_t> answer;
- for (const auto& e_w_pi : CountLib(path, e, ideal_provider_->FindCoveredEdges(path, e), gap)) {
- if (math::gr(e_w_pi.pi_, 0.)) {
- answer.insert(e_w_pi.e_);
- }
- }
- return answer;
- }
-};
-
-class CoverageAwareIdealInfoProvider : public BasicIdealInfoProvider {
- static constexpr double MAGIC_COEFF = 2.;
- const Graph& g_;
- size_t read_length_;
- size_t estimation_edge_length_;
-
-public:
- //works for single lib only!!!
- double EstimatePathCoverage(const BidirectionalPath& path) const {
- double answer = -1.0;
- for (int i = (int) path.Size() - 1; i >= 0; --i) {
- EdgeId e = path.At(i);
- if (g_.length(e) > estimation_edge_length_) {
- if (answer < 0 || g_.coverage(e) < answer) {
- answer = g_.coverage(e);
- }
- }
- }
- return answer;
- }
-
- CoverageAwareIdealInfoProvider(const Graph& g, const shared_ptr<PairedInfoLibrary>& lib,
- size_t read_length, size_t estimation_edge_length) :
- BasicIdealInfoProvider(lib), g_(g), read_length_(read_length),
- estimation_edge_length_(estimation_edge_length) {
- VERIFY(read_length_ > g_.k());
- }
-
- std::vector<EdgeWithPairedInfo> FindCoveredEdges(const BidirectionalPath& path, EdgeId candidate) const override {
- double estimated_coverage = EstimatePathCoverage(path);
- VERIFY(math::gr(estimated_coverage, 0.));
-
- double correction_coeff = estimated_coverage / ((double(read_length_) - double(g_.k())) * MAGIC_COEFF);
-
- std::vector<EdgeWithPairedInfo> answer = BasicIdealInfoProvider::FindCoveredEdges(path, candidate);
- for (auto& e_w_pi : answer) {
- e_w_pi.pi_ *= correction_coeff;
- }
- return answer;
- }
-};
-
-//FIXME optimize number of calls of EstimatePathCoverage(path)
-class MetagenomicWeightCounter: public WeightCounter {
- static const size_t LENGTH_BOUND = 500;
- shared_ptr<CoverageAwareIdealInfoProvider> cov_info_provider_;
- shared_ptr<WeightCounter> normalizing_wc_;
- shared_ptr<WeightCounter> raw_wc_;
-
-public:
-
- MetagenomicWeightCounter(const Graph& g, const shared_ptr<PairedInfoLibrary>& lib,
- size_t read_length, double normalized_threshold, double raw_threshold,
- size_t estimation_edge_length = LENGTH_BOUND) :
- WeightCounter(g, lib) {
- cov_info_provider_ = make_shared<CoverageAwareIdealInfoProvider>(g, lib, read_length, estimation_edge_length);
- normalizing_wc_ = make_shared<PathCoverWeightCounter>(g, lib, true, normalized_threshold, cov_info_provider_);
- raw_wc_ = make_shared<PathCoverWeightCounter>(g, lib, false, raw_threshold);
- }
-
- double CountWeight(const BidirectionalPath& path, EdgeId e,
- const std::set<size_t>& excluded_edges, int gap = 0) const override {
- if (math::gr(cov_info_provider_->EstimatePathCoverage(path), 0.)) {
- return normalizing_wc_->CountWeight(path, e, excluded_edges, gap);
- } else {
- return raw_wc_->CountWeight(path, e, excluded_edges, gap);
- }
- }
-
- std::set<size_t> PairInfoExist(const BidirectionalPath& path, EdgeId e,
- int gap = 0) const override {
- if (math::gr(cov_info_provider_->EstimatePathCoverage(path), 0.)) {
- return normalizing_wc_->PairInfoExist(path, e, gap);
- } else {
- return raw_wc_->PairInfoExist(path, e, gap);
- }
- }
-};
-
-struct PathsPairIndexInfo {
- PathsPairIndexInfo(size_t edge1_, size_t edge2_, double w_, double dist_)
- : edge1(edge1_),
- edge2(edge2_),
- w(w_),
- dist(dist_) {
-
- }
- size_t edge1;
- size_t edge2;
- double w;
- double dist;
-};
-
-class PathsWeightCounter {
-public:
- PathsWeightCounter(const Graph& g, shared_ptr<PairedInfoLibrary> lib, size_t min_read_count);
- PathsWeightCounter(const PathsWeightCounter& w);
- map<size_t, double> FindPairInfoFromPath(
- const BidirectionalPath& path1, size_t from1, size_t to1,
- const BidirectionalPath& path2, size_t from2, size_t to2) const;
- double CountPairInfo(const BidirectionalPath& path1, size_t from1,
- size_t to1, const BidirectionalPath& path2,
- size_t from2, size_t to2, bool normalize = true) const;
- double CountPairInfo(const BidirectionalPath& path1, size_t from1,
- size_t to1, EdgeId edge, size_t gap) const;
- void SetCommonWeightFrom(size_t iedge, double weight);
- void ClearCommonWeight();
- void FindJumpCandidates(EdgeId e, int min_dist, int max_dist, size_t min_len, set<EdgeId>& result) const;
- void FindJumpEdges(EdgeId e, set<EdgeId>& candidates, int min_dist, int max_dist, vector<EdgeWithDistance>& result) const;
- const shared_ptr<PairedInfoLibrary> GetLib() const {
- return lib_;
- }
- bool HasPI(EdgeId e1, EdgeId e2, int dist) const;
- bool HasPI(EdgeId e1, EdgeId e2, size_t dist_min, size_t dist_max) const;
- double PI(EdgeId e1, EdgeId e2, int dist) const;
- bool HasIdealPI(EdgeId e1, EdgeId e2, int dist) const;
- double IdealPI(EdgeId e1, EdgeId e2, int dist) const;
-
-private:
- void FindPairInfo(const BidirectionalPath& path1, size_t from1, size_t to1,
- const BidirectionalPath& path2, size_t from2, size_t to2,
- map<size_t, double>& pi, double& ideal_pi) const;
- void FindPairInfo(EdgeId e1, EdgeId e2, size_t dist, double& ideal_w,
- double& result_w) const;
-
- const Graph& g_;
- shared_ptr<PairedInfoLibrary> lib_;
- std::map<size_t, double> common_w_;
- size_t min_read_count_;
- DECL_LOGGER("WeightCounter");
-};
-inline PathsWeightCounter::PathsWeightCounter(const Graph& g, shared_ptr<PairedInfoLibrary>lib, size_t min_read_count):g_(g), lib_(lib), min_read_count_(min_read_count){
-
-}
-
-inline PathsWeightCounter::PathsWeightCounter(const PathsWeightCounter& w): g_(w.g_), lib_(w.lib_), min_read_count_(w.min_read_count_) {
-
-}
-
-inline double PathsWeightCounter::CountPairInfo(const BidirectionalPath& path1,
- size_t from1, size_t to1,
- const BidirectionalPath& path2,
- size_t from2, size_t to2, bool normalize) const {
- map<size_t, double> pi;
- double ideal_pi = 0.0;
- FindPairInfo(path1, from1, to1, path2, from2, to2,
- pi, ideal_pi);
- double result = 0.0;
- double all_common = 0.0;
- for (size_t i = from1; i < to1; ++i) {
- if (common_w_.find(i) != common_w_.end()) {
- all_common += common_w_.at(i);
- }
- result += pi[i];
- }
-    DEBUG("ideal_pi " << ideal_pi << " common " << all_common << " result " << result);
- ideal_pi -= all_common;
- result -= all_common;
- double total_result = math::gr(ideal_pi, 0.0) ? result / ideal_pi : 0.0;
- total_result = math::gr(total_result, 0.0) ? total_result : 0.0;
-    DEBUG("ideal_pi " << ideal_pi << " result " << result << " total_result " << total_result);
- return normalize ? total_result : result;
-}
-
-inline double PathsWeightCounter::CountPairInfo(const BidirectionalPath& path1,
- size_t from1, size_t to1, EdgeId edge,
- size_t gap) const {
- double result = 0.0;
- for (size_t i1 = from1; i1 < to1; ++i1) {
- double ideal_w, w;
- FindPairInfo(path1.At(i1), edge, gap + path1.LengthAt(i1), ideal_w, w);
- result += w;
- }
- return result;
-}
-
-inline void PathsWeightCounter::FindPairInfo(const BidirectionalPath& path1,
- size_t from1, size_t to1,
- const BidirectionalPath& path2,
- size_t from2, size_t to2,
- map<size_t, double>& pi,
- double& ideal_pi) const {
- stringstream str;
- for (size_t i = 0; i < path2.Size(); ++i) {
- str << g_.int_id(path2.At(i)) << " ";
- }
- DEBUG("pair info for path " << str.str());
- for (size_t i1 = from1; i1 < to1; ++i1) {
- for (size_t i2 = from2; i2 < to2; ++i2) {
- size_t dist = path1.LengthAt(i1) + path2.Length()
- - path2.LengthAt(i2);
- double ideal_w = 0.0;
- double w = 0.0;
- FindPairInfo(path1.At(i1), path2.At(i2), dist, ideal_w, w);
- ideal_pi += ideal_w;
- if (pi.find(i1) == pi.end()) {
- pi[i1] = 0;
- }
- pi[i1] += w;
- }
- }
-}
-
-inline void PathsWeightCounter::FindPairInfo(EdgeId e1, EdgeId e2, size_t dist,
- double& ideal_w, double& result_w) const {
- ideal_w = lib_->IdealPairedInfo(e1, e2, (int) dist, true);
- result_w = 0.0;
- if (ideal_w == 0.0) {
- return;
- }
- if (HasPI(e1, e2, (int) dist)) {
- result_w = ideal_w;
- }
-}
-inline map<size_t, double> PathsWeightCounter::FindPairInfoFromPath(
- const BidirectionalPath& path1, size_t from1, size_t to1,
- const BidirectionalPath& path2, size_t from2, size_t to2) const {
- map<size_t, double> pi;
- double ideal_pi = 0;
- FindPairInfo(path1, from1, to1, path2, from2, to2, pi, ideal_pi);
- return pi;
-}
-inline void PathsWeightCounter::FindJumpCandidates(EdgeId e, int min_dist, int max_dist, size_t min_len, set<EdgeId>& result) const {
- result.clear();
- lib_->FindJumpEdges(e, result, min_dist, max_dist, min_len);
-}
-inline void PathsWeightCounter::FindJumpEdges(EdgeId e, set<EdgeId>& edges, int min_dist, int max_dist, vector<EdgeWithDistance>& result) const {
- result.clear();
-
- for (auto e2 = edges.begin(); e2 != edges.end(); ++e2) {
- vector<int> distances;
- vector<double> weights;
- lib_->CountDistances(e, *e2, distances, weights);
- int median_distance = median(distances, weights, min_dist, max_dist);
-
- if (HasPI(e, *e2, median_distance)) {
- result.push_back(EdgeWithDistance(*e2, median_distance));
- }
- }
-}
-inline void PathsWeightCounter::SetCommonWeightFrom(size_t iedge, double weight) {
- common_w_[iedge] = weight;
-}
-inline void PathsWeightCounter::ClearCommonWeight() {
- common_w_.clear();
-}
-
-inline double PathsWeightCounter::PI(EdgeId e1, EdgeId e2, int dist) const {
- double w = lib_->CountPairedInfo(e1, e2, dist, true);
- return w > (double) min_read_count_ ? w : 0.0;
-}
-
-inline bool PathsWeightCounter::HasPI(EdgeId e1, EdgeId e2, int dist) const {
- return lib_->CountPairedInfo(e1, e2, dist, true) > (double) min_read_count_;
-}
-
-inline bool PathsWeightCounter::HasIdealPI(EdgeId e1, EdgeId e2, int dist) const {
- return lib_->IdealPairedInfo(e1, e2, dist, true) > 0.0;
-}
-
-inline double PathsWeightCounter::IdealPI(EdgeId e1, EdgeId e2, int dist) const {
- return lib_->IdealPairedInfo(e1, e2, dist, true);
-}
-
-inline bool PathsWeightCounter::HasPI(EdgeId e1, EdgeId e2, size_t dist_min, size_t dist_max) const {
- return lib_->CountPairedInfo(e1, e2, (int) dist_min, (int) dist_max) > min_read_count_;
-}
-};
-
-#endif /* WEIGHT_COUNTER_HPP_ */
diff --git a/src/debruijn/path_utils.hpp b/src/debruijn/path_utils.hpp
deleted file mode 100644
index 4f40077..0000000
--- a/src/debruijn/path_utils.hpp
+++ /dev/null
@@ -1,105 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-/*
- * path_utils.hpp
- *
- */
-
-#pragma once
-
-#include "omni/path_processor.hpp"
-
-namespace debruijn_graph {
-
- // TODO: rewrite this function
- template<class Graph>
- vector<typename Graph::EdgeId> GetCommonPathsEnd(
- const Graph& g,
- typename Graph::EdgeId e1,
- typename Graph::EdgeId e2,
- size_t min_dist,
- size_t max_dist,
- const PathProcessor<Graph>& path_processor)
- {
- typedef typename Graph::EdgeId EdgeId;
- typedef vector<EdgeId> Path;
-
- //PathProcessor<Graph> path_processor(g,
- //min_dist - g.length(e1),
- //max_dist - g.length(e1),
- //g.EdgeEnd(e1), g.EdgeStart(e2), callback);
-
- PathStorageCallback<Graph> callback(g);
- int error_code = path_processor.Process(g.EdgeStart(e2), min_dist - g.length(e1),
- max_dist - g.length(e1), callback);
- vector<Path> paths = callback.paths();
-
- vector<EdgeId> result;
- if (error_code != 0) {
- DEBUG("Edge " << g.int_id(e1) << " path_processor problem")
- return result;
- }
- if (paths.size() == 0)
- return result;
- if (paths.size() == 1)
- return paths[0];
- size_t j = 0;
- while (j < paths[0].size()) {
- for (size_t i = 1; i < paths.size(); ++i) {
- if (j == paths[i].size()) {
- vector<EdgeId> result(paths[0].begin()+(paths[0].size() - j), paths[0].end());
- return result;
- } else {
- if (paths[0][paths[0].size()-1-j] != paths[i][paths[i].size()-1-j]) {
- vector<EdgeId> result(paths[0].begin()+(paths[0].size() - j), paths[0].end());
- return result;
- }
- }
- }
- ++j;
- }
- return paths[0];
- }
-
-
-
- template<class Graph>
- vector<vector<typename Graph::EdgeId> > GetAllPathsBetweenEdges(
- const Graph& g,
- typename Graph::EdgeId& e1,
- typename Graph::EdgeId& e2, size_t min_dist,
- size_t max_dist) {
- PathStorageCallback<Graph> callback(g);
- ProcessPaths(g,
- min_dist,
- max_dist, //0, *cfg::get().ds.IS - K + size_t(*cfg::get().ds.is_var),
- g.EdgeEnd(e1), g.EdgeStart(e2),
- callback);
- auto paths = callback.paths();
- return paths;
- }
-
-template<class graph_pack>
-size_t GetAllPathsQuantity(const graph_pack& origin_gp,
- const typename graph_pack::graph_t::EdgeId& e1,
- const typename graph_pack::graph_t::EdgeId& e2, double d, double is_var) {
- PathStorageCallback<typename graph_pack::graph_t> callback(origin_gp.g);
- PathProcessor<typename graph_pack::graph_t>
- path_processor(origin_gp.g,
- (size_t) d - origin_gp.g.length(e1) - size_t(is_var),
- (size_t) d - origin_gp.g.length(e1) + size_t(is_var),
- origin_gp.g.EdgeEnd(e1),
- origin_gp.g.EdgeStart(e2),
- callback);
- path_processor.Process();
- auto paths = callback.paths();
-    TRACE(e1.int_id() << " " << e2.int_id() << " " << paths.size());
- return paths.size();
-}
-
-}
diff --git a/src/debruijn/positions.hpp b/src/debruijn/positions.hpp
deleted file mode 100644
index bb89632..0000000
--- a/src/debruijn/positions.hpp
+++ /dev/null
@@ -1,112 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include "sequence_mapper.hpp"
-#include "omni/edges_position_handler.hpp"
-#include "io/wrapper_collection.hpp"
-
-namespace debruijn_graph {
-
-template<class Graph>
-class PosFiller {
- typedef typename Graph::EdgeId EdgeId;
- typedef std::shared_ptr<SequenceMapper<Graph>> MapperPtr;
- const Graph& g_;
- MapperPtr mapper_;
- EdgesPositionHandler<Graph>& edge_pos_;
-
- public:
- PosFiller(const Graph& g, MapperPtr mapper,
- EdgesPositionHandler<Graph>& edge_pos) :
- g_(g), mapper_(mapper), edge_pos_(edge_pos) {
-
- }
-
- void Process(const Sequence& s, string name) const {
- //todo stupid conversion!
- return Process(io::SingleRead(name, s.str()));
- }
-
- void Process(const io::SingleRead& read) const {
- MappingPath<EdgeId> path = mapper_->MapRead(read);
- const string name = read.name();
- int cur_pos = 0;
- TRACE("Contig " << name << " mapped on " << path.size()
- << " fragments.");
- for (size_t i = 0; i < path.size(); i++) {
- EdgeId ei = path[i].first;
- MappingRange mr = path[i].second;
- int len = (int) (mr.mapped_range.end_pos - mr.mapped_range.start_pos);
- if (i > 0)
- if (path[i - 1].first != ei)
- if (g_.EdgeStart(ei) != g_.EdgeEnd(path[i - 1].first)) {
- TRACE(
- "Contig " << name
- << " mapped on not adjacent edge. Position in contig is "
- << path[i - 1].second.initial_range.start_pos
- + 1
- << "--"
- << path[i - 1].second.initial_range.end_pos
- << " and "
- << mr.initial_range.start_pos + 1
- << "--" << mr.initial_range.end_pos);
- }
- edge_pos_.AddEdgePosition(ei, name, mr.initial_range.start_pos,
- mr.initial_range.end_pos,
- mr.mapped_range.start_pos,
- mr.mapped_range.end_pos);
- cur_pos += len;
- }
- }
-
- void Process(io::SingleStream& stream) const {
- io::SingleRead read;
- while (!stream.eof()) {
- stream >> read;
- Process(read);
- }
- }
-
- private:
- DECL_LOGGER("PosFiller");
-};
-
-template<class gp_t>
-void FillPos(gp_t& gp, const string& contig_file, string prefix, bool with_rc = false) {
- PosFiller<typename gp_t::graph_t> pos_filler(gp.g, MapperInstance(gp), gp.edge_pos);
- auto irs = std::make_shared<io::PrefixAddingReaderWrapper>(io::EasyStream(contig_file, with_rc), prefix);
- pos_filler.Process(*irs);
-}
-
-template<class gp_t>
-void FillPos(gp_t& gp, const Sequence& s, string name) {
- PosFiller<typename gp_t::graph_t> pos_filler(gp.g, MapperInstance(gp), gp.edge_pos);
- pos_filler.Process(s, name);
-}
-
-
-
-//inline
-//void CollectPositions(conj_graph_pack &gp) {
-// gp.edge_pos.clear();
-// if (gp.genome.size() > 0) {
-// FillPos(gp, gp.genome, "ref0");
-// FillPos(gp, !gp.genome, "ref1");
-// }
-//
-// if (!cfg::get().pos.contigs_for_threading.empty() &&
-// path::FileExists(cfg::get().pos.contigs_for_threading))
-// FillPos(gp, cfg::get().pos.contigs_for_threading, "thr_", true);
-//
-// if (!cfg::get().pos.contigs_to_analyze.empty() &&
-// path::FileExists(cfg::get().pos.contigs_to_analyze))
-// FillPos(gp, cfg::get().pos.contigs_to_analyze, "anlz_", true);
-//}
-
-}
diff --git a/src/debruijn/read_converter.hpp b/src/debruijn/read_converter.hpp
deleted file mode 100644
index 23e3ca0..0000000
--- a/src/debruijn/read_converter.hpp
+++ /dev/null
@@ -1,360 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-/*
- * read_converter.hpp
- *
- * Created on: Apr 13, 2012
- * Author: andrey
- */
-
-#pragma once
-
-#include "io/binary_converter.hpp"
-#include "io/io_helper.hpp"
-#include "dataset_readers.hpp"
-#include "simple_tools.hpp"
-
-#include <fstream>
-
-namespace debruijn_graph {
-
-class ReadConverter {
-
-private:
- const static size_t current_binary_format_version = 10;
-
- void convert_reads_to_binary() {
- if (path::FileExists(cfg::get().temp_bin_reads_info)) {
- std::ifstream info;
- info.open(cfg::get().temp_bin_reads_info.c_str(), std::ios_base::in);
-
- size_t thread_num = 0;
- size_t format = 0;
- size_t lib_count = 0;
-
- info >> format;
- if (!info.eof()) {
- info >> thread_num;
- }
- if (!info.eof()) {
- info >> lib_count;
- }
-
- if (thread_num == cfg::get().max_threads && format == current_binary_format_version && lib_count == cfg::get().ds.reads.lib_count()) {
- INFO("Binary reads detected");
-
- io::ReadStreamStat stat;
- info >> stat.read_count_;
- info >> stat.max_len_;
- info >> stat.total_len_;
-
- auto &dataset = cfg::get_writable().ds.reads;
- for (size_t i = 0; i < dataset.lib_count(); ++i) {
- info >> dataset[i].data().binary_coverted;
- info >> dataset[i].data().read_length;
- info >> dataset[i].data().total_nucls;
-
- dataset[i].data().thread_num = cfg::get().max_threads;
- dataset[i].data().paired_read_prefix = cfg::get().paired_read_prefix + "_" + ToString(i);
- dataset[i].data().single_read_prefix = cfg::get().single_read_prefix + "_" + ToString(i);
- }
- info.close();
- return;
- }
- info.close();
- }
-
- std::ofstream info;
- info.open(cfg::get().temp_bin_reads_info.c_str(), std::ios_base::out);
- info << "0 0";
- info.close();
-
- io::ReadStreamStat total_stat;
- auto& dataset = cfg::get_writable().ds.reads;
-
- INFO("Converting reads to binary format (takes a while)");
- for (size_t i = 0; i < dataset.lib_count(); ++i) {
- if (cfg::get().bwa.enabled && dataset[i].is_bwa_alignable()) {
- INFO("Library #" << i << " will be used by BWA only and thus will not be converted");
- continue;
- }
- else if (dataset[i].is_binary_covertable()) {
- INFO("Paired reads for library #" << i);
- dataset[i].data().thread_num = cfg::get().max_threads;
- dataset[i].data().paired_read_prefix = cfg::get().paired_read_prefix + "_" + ToString(i);
-
- io::PairedStreamPtr paired_reader = paired_easy_reader(dataset[i], false, 0, false, false);
- io::BinaryWriter paired_converter
- (dataset[i].data().paired_read_prefix, cfg::get().max_threads, cfg::get().buffer_size);
- io::ReadStreamStat paired_stat = paired_converter.ToBinary(*paired_reader, dataset[i].orientation());
- paired_stat.read_count_ *= 2;
- total_stat.merge(paired_stat);
-
- INFO("Single reads for library #" << i);
- dataset[i].data().single_read_prefix = cfg::get().single_read_prefix + "_" + ToString(i);
- io::SingleStreamPtr single_reader = single_easy_reader(dataset[i], false, false);
- io::BinaryWriter single_converter
- (dataset[i].data().single_read_prefix, cfg::get().max_threads, cfg::get().buffer_size);
- io::ReadStreamStat single_stat = single_converter.ToBinary(*single_reader);
- total_stat.merge(single_stat);
-
- paired_stat.merge(single_stat);
- dataset[i].data().read_length = paired_stat.max_len_;
- dataset[i].data().total_nucls = paired_stat.total_len_;
- dataset[i].data().binary_coverted = true;
- }
- else {
- INFO("Library #" << i << " doesn't need to be converted");
- }
- }
- info.open(cfg::get().temp_bin_reads_info.c_str(), std::ios_base::out);
- info << current_binary_format_version << " " << cfg::get().max_threads << " " << cfg::get().ds.reads.lib_count() << " " <<
- total_stat.read_count_ << " " << total_stat.max_len_ << " " << total_stat.total_len_ << "\n";
-
- for (size_t i = 0; i < dataset.lib_count(); ++i) {
- info << dataset[i].data().binary_coverted
- << " " << dataset[i].data().read_length
- << " " << dataset[i].data().total_nucls << "\n";
- }
- info.close();
- }
-
-public:
- ReadConverter() {
- convert_reads_to_binary();
- }
-};
-
-
-inline
-void convert_if_needed() {
- static ReadConverter converter;
-}
-
-inline
-io::BinaryPairedStreams raw_paired_binary_readers(const io::SequencingLibrary<debruijn_config::DataSetData> &lib,
- bool followed_by_rc,
- size_t insert_size = 0) {
- convert_if_needed();
- VERIFY_MSG(lib.data().binary_coverted, "Lib was not converted to binary, cannot produce binary stream");
-
- io::ReadStreamList<io::PairedReadSeq> paired_streams;
- for (size_t i = 0; i < lib.data().thread_num; ++i) {
- paired_streams.push_back(make_shared<io::BinaryFilePairedStream>(lib.data().paired_read_prefix, i, insert_size));
- }
- return io::apply_paired_wrappers(followed_by_rc, paired_streams);
-}
-
-inline
-io::BinarySingleStreams raw_single_binary_readers(const io::SequencingLibrary<debruijn_config::DataSetData> &lib,
- bool followed_by_rc,
- bool including_paired_reads) {
- convert_if_needed();
- VERIFY_MSG(lib.data().binary_coverted, "Lib was not converted to binary, cannot produce binary stream");
-
- io::BinarySingleStreams single_streams;
- for (size_t i = 0; i < lib.data().thread_num; ++i) {
- single_streams.push_back(make_shared<io::BinaryFileSingleStream>(lib.data().single_read_prefix, i));
- }
- if (including_paired_reads) {
- io::BinaryPairedStreams paired_streams;
- for (size_t i = 0; i < lib.data().thread_num; ++i) {
- paired_streams.push_back(make_shared<io::BinaryFilePairedStream>(lib.data().paired_read_prefix, i, 0));
- }
-
- return io::apply_single_wrappers(followed_by_rc, single_streams, &paired_streams);
- }
- else {
- return io::apply_single_wrappers(followed_by_rc, single_streams);
- }
-}
-
-
-inline
-io::BinaryPairedStreams paired_binary_readers(const io::SequencingLibrary<debruijn_config::DataSetData> &lib,
- bool followed_by_rc,
- size_t insert_size = 0) {
- convert_if_needed();
- return raw_paired_binary_readers(lib, followed_by_rc, insert_size);
-}
-
-
-inline
-io::BinarySingleStreams single_binary_readers(const io::SequencingLibrary<debruijn_config::DataSetData> &lib,
- bool followed_by_rc,
- bool including_paired_reads) {
- convert_if_needed();
- return raw_single_binary_readers(lib, followed_by_rc, including_paired_reads);
-}
-
-
-inline
-//todo simplify
-io::BinaryPairedStreams paired_binary_readers_for_libs(const std::vector<size_t>& libs,
- bool followed_by_rc,
- size_t insert_size = 0) {
- convert_if_needed();
-
- std::vector<io::BinaryPairedStreams> streams(cfg::get().max_threads);
- for (size_t i = 0; i < libs.size(); ++i) {
- io::BinaryPairedStreams lib_streams = raw_paired_binary_readers(cfg::get().ds.reads[libs[i]], followed_by_rc, insert_size);
-
- for (size_t j = 0; j < cfg::get().max_threads; ++j) {
- streams[j].push_back(lib_streams.ptr_at(j));
- }
- }
-
- io::BinaryPairedStreams joint_streams;
- for (size_t j = 0; j < cfg::get().max_threads; ++j) {
- joint_streams.push_back(io::MultifileWrap<io::PairedReadSeq>(streams[j]));
- }
- return joint_streams;
-}
-
-inline
-io::BinarySingleStreams single_binary_readers_for_libs(const std::vector<size_t>& libs,
- bool followed_by_rc,
- bool including_paired_reads) {
- convert_if_needed();
-
- std::vector<io::BinarySingleStreams> streams(cfg::get().max_threads);
- for (size_t i = 0; i < libs.size(); ++i) {
- io::BinarySingleStreams lib_streams = raw_single_binary_readers(cfg::get().ds.reads[libs[i]], followed_by_rc, including_paired_reads);
-
- for (size_t j = 0; j < cfg::get().max_threads; ++j) {
- streams[j].push_back(lib_streams.ptr_at(j));
- }
- }
-
- io::BinarySingleStreams joint_streams;
- for (size_t j = 0; j < cfg::get().max_threads; ++j) {
- joint_streams.push_back(io::MultifileWrap<io::SingleReadSeq>(streams[j]));
- }
- return joint_streams;
-}
-
-inline
-io::BinaryPairedStreams paired_binary_readers(bool followed_by_rc,
- size_t insert_size = 0) {
- std::vector<size_t> all_libs(cfg::get().ds.reads.lib_count());
- for (size_t i = 0; i < cfg::get().ds.reads.lib_count(); ++i) {
- all_libs[i] = i;
- }
- return paired_binary_readers_for_libs(all_libs, followed_by_rc, insert_size);
-}
-
-inline
-io::BinarySingleStreams single_binary_readers(bool followed_by_rc,
- bool including_paired_reads) {
- std::vector<size_t> all_libs(cfg::get().ds.reads.lib_count());
- for (size_t i = 0; i < cfg::get().ds.reads.lib_count(); ++i) {
- all_libs[i] = i;
- }
- return single_binary_readers_for_libs(all_libs, followed_by_rc, including_paired_reads);
-}
-
-inline
-io::BinarySingleStreamPtr single_binary_multireader(bool followed_by_rc, bool including_paired_reads) {
- return io::MultifileWrap<io::SingleReadSeq>(single_binary_readers(followed_by_rc, including_paired_reads));
-}
-
-inline
-io::BinaryPairedStreamPtr paired_binary_multireader(bool followed_by_rc, size_t insert_size = 0) {
- return io::MultifileWrap<io::PairedReadSeq>(paired_binary_readers(followed_by_rc, insert_size));
-}
-
-/*
-
-class BufferedReadersStorage {
-
-private:
-
- std::vector< SequenceSingleReadStream* > * single_streams_;
-
- std::vector< SequencePairedReadStream* > * paired_streams_;
-
- BufferedReadersStorage() {
- INFO("Creating buffered read storage");
-
- INFO("Buffering single reads... (takes a while)");
- single_streams_ = new std::vector< SequenceSingleReadStream* >(cfg::get().max_threads);
- for (size_t i = 0; i < cfg::get().max_threads; ++i) {
- io::PredictableIReader<io::SingleReadSeq> * s_stream = new io::SeqSingleReadStream(cfg::get().single_read_prefix, i);
- single_streams_->at(i) = new io::ReadBufferedStream<io::SingleReadSeq> (*s_stream);
- }
-
- INFO("Buffering paired reads... (takes a while)");
- paired_streams_ = new std::vector< SequencePairedReadStream* >(cfg::get().max_threads);
- for (size_t i = 0; i < cfg::get().max_threads; ++i) {
- io::PredictableIReader<io::PairedReadSeq> * p_stream = new io::SeqPairedReadStream(cfg::get().paired_read_prefix, i, 0);
- paired_streams_->at(i) = new io::ReadBufferedStream<io::PairedReadSeq> (*p_stream);
- }
- }
-
- BufferedReadersStorage(const BufferedReadersStorage&);
-
- BufferedReadersStorage& operator=(const BufferedReadersStorage&);
-
-public:
-
- static BufferedReadersStorage * GetInstance() {
- static BufferedReadersStorage instance;
- return &instance;
- }
-
-
- std::vector< SequenceSingleReadStream* > * GetSingleReaders() const {
- return single_streams_;
- }
-
- std::vector< SequencePairedReadStream* > * GetPairedReaders() const {
- return paired_streams_;
- }
-
-};
-
-
-std::vector< SequenceSingleReadStream* > single_buffered_binary_readers(bool followed_by_rc, bool including_paired_reads) {
- convert_if_needed();
-
- BufferedReadersStorage * storage = BufferedReadersStorage::GetInstance();
-
- if (including_paired_reads) {
- return apply_single_wrappers(followed_by_rc, *(storage->GetSingleReaders()), storage->GetPairedReaders());
- }
- else {
- return apply_single_wrappers(followed_by_rc, *(storage->GetSingleReaders()));
- }
-}
-
-std::vector< SequencePairedReadStream* > paired_buffered_binary_readers(bool followed_by_rc, size_t insert_size) {
- convert_if_needed();
-
- BufferedReadersStorage * storage = BufferedReadersStorage::GetInstance();
-
- std::vector<SequencePairedReadStream*> paired_streams(cfg::get().max_threads);
- for (size_t i = 0; i < cfg::get().max_threads; ++i) {
- paired_streams[i] = new io::InsertSizeModifyingWrapper(*(storage->GetPairedReaders()->at(i)), insert_size);
- }
- return apply_paired_wrappers(followed_by_rc, paired_streams);
-}
-
-auto_ptr<SequenceSingleReadStream> single_buffered_binary_multireader(bool followed_by_rc, bool including_paired_reads) {
- convert_if_needed();
-
- return auto_ptr<SequenceSingleReadStream>(new io::MultifileReader<io::SingleReadSeq>(single_buffered_binary_readers(followed_by_rc, including_paired_reads)));
-}
-
-auto_ptr<SequencePairedReadStream> paired_buffered_binary_multireader(bool followed_by_rc, size_t insert_size) {
- convert_if_needed();
-
- return auto_ptr<SequencePairedReadStream>(new io::MultifileReader<io::PairedReadSeq>(paired_buffered_binary_readers(followed_by_rc, insert_size)));
-}
-*/
-
-}
diff --git a/src/debruijn/repeat_resolving.cpp b/src/debruijn/repeat_resolving.cpp
deleted file mode 100644
index 3beebd0..0000000
--- a/src/debruijn/repeat_resolving.cpp
+++ /dev/null
@@ -1,99 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#include "standard.hpp"
-
-#include "logger/logger.hpp"
-#include "stats/debruijn_stats.hpp"
-#include "omni/visualization/graph_labeler.hpp"
-#include "de/distance_estimation.hpp"
-#include "de/smoothing_distance_estimation.hpp"
-#include "omni/omni_utils.hpp"
-#include "path_extend/path_extend_launch.hpp"
-#include "contig_output.hpp"
-#include "positions.hpp"
-#include "long_read_storage.hpp"
-#include "repeat_resolving.hpp"
-
-namespace debruijn_graph {
-
-void PEResolving(conj_graph_pack& gp) {
- vector<size_t> indexes;
- std::string name = "scaffolds";
- bool traverse_loops = true;
- if (!(cfg::get().use_scaffolder && cfg::get().pe_params.param_set.scaffolder_options.on)) {
- name = "final_contigs";
- traverse_loops = false;
- }
- path_extend::ResolveRepeatsPe(gp, cfg::get().output_dir, name, traverse_loops, boost::optional<std::string>("final_contigs"));
-}
-
-inline bool HasValidLibs() {
- for (const auto& lib : cfg::get().ds.reads) {
- if (lib.is_repeat_resolvable()) {
- if (!lib.is_paired() || !math::eq(lib.data().mean_insert_size, 0.0)) {
- return true;
- }
- }
- }
- return false;
-}
-
-void RepeatResolution::run(conj_graph_pack &gp, const char*) {
- if (cfg::get().developer_mode) {
- stats::PrepareForDrawing(gp);
- }
-
- omnigraph::DefaultLabeler<Graph> labeler(gp.g, gp.edge_pos);
- stats::detail_info_printer printer(gp, labeler, cfg::get().output_dir);
- printer(ipp_before_repeat_resolution);
-
- //todo awful hack to get around PE using cfg::get everywhere...
- auto tmp_params_storage = cfg::get().pe_params;
- if (preliminary_) {
- INFO("Setting up preliminary path extend settings")
- cfg::get_writable().pe_params = cfg::get().prelim_pe_params;
- }
-
- OutputContigs(gp.g, cfg::get().output_dir + "before_rr");
- OutputContigsToFASTG(gp.g, cfg::get().output_dir + "assembly_graph");
-
- bool no_valid_libs = !HasValidLibs();
-
- bool use_single_reads = cfg::get().use_single_reads;
- if (cfg::get().rr_enable && no_valid_libs && !use_single_reads)
- WARN("Insert size was not estimated for any of the paired libraries, repeat resolution module will not run.");
-
- if ((no_valid_libs || cfg::get().rm == debruijn_graph::resolving_mode::rm_none) && !use_single_reads) {
- OutputContigs(gp.g, cfg::get().output_dir + "final_contigs");
- return;
- }
- if (cfg::get().rm == debruijn_graph::resolving_mode::rm_path_extend) {
- INFO("Using Path-Extend repeat resolving");
- PEResolving(gp);
- } else {
- INFO("Unsupported repeat resolver");
- OutputContigs(gp.g, cfg::get().output_dir + "final_contigs");
- }
- if (preliminary_) {
- INFO("Restoring initial path extend settings")
- cfg::get_writable().pe_params = tmp_params_storage;
- }
-}
-
-void ContigOutput::run(conj_graph_pack &gp, const char*) {
- OutputContigs(gp.g, cfg::get().output_dir + "simplified_contigs", cfg::get().use_unipaths,
- cfg::get().simp.tec.plausibility_length);
- OutputContigs(gp.g, cfg::get().output_dir + "before_rr");
- OutputContigsToFASTG(gp.g, cfg::get().output_dir + "assembly_graph");
- OutputContigs(gp.g, cfg::get().output_dir + "final_contigs");
-}
-
-} // debruijn_graph
-
-
-
diff --git a/src/debruijn/repeat_resolving.hpp b/src/debruijn/repeat_resolving.hpp
deleted file mode 100644
index 6847fef..0000000
--- a/src/debruijn/repeat_resolving.hpp
+++ /dev/null
@@ -1,40 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include "stage.hpp"
-
-namespace debruijn_graph {
-
-class RepeatResolution : public spades::AssemblyStage {
- const bool preliminary_;
- public:
- RepeatResolution(bool preliminary = false)
- : AssemblyStage(preliminary ? "Preliminary Repeat Resolving" : "Repeat Resolving",
- preliminary ? "repeat_resolving_preliminary" : "repeat_resolving"),
- preliminary_(preliminary) {}
-
- void load(conj_graph_pack&, const std::string &, const char*) {}
- void save(const conj_graph_pack&, const std::string &, const char*) const {}
-
- void run(conj_graph_pack &gp, const char*);
-};
-
-class ContigOutput : public spades::AssemblyStage {
- public:
- ContigOutput()
- : AssemblyStage("Contig Output", "contig_output") {}
-
- void load(conj_graph_pack&, const std::string &, const char*) {}
- void save(const conj_graph_pack&, const std::string &, const char*) const {}
-
- void run(conj_graph_pack &gp, const char*);
-};
-
-}
-
diff --git a/src/debruijn/second_phase_setup.cpp b/src/debruijn/second_phase_setup.cpp
deleted file mode 100644
index 2c2e062..0000000
--- a/src/debruijn/second_phase_setup.cpp
+++ /dev/null
@@ -1,52 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#include "standard.hpp"
-#include "dataset_readers.hpp"
-#include "read_converter.hpp"
-
-#include "de/paired_info.hpp"
-
-#include "utils.hpp"
-#include "stats/debruijn_stats.hpp"
-#include "io/library.hpp"
-#include "is_counter.hpp"
-#include "pair_info_count.hpp"
-#include "sequence_mapper.hpp"
-#include "short_read_mapper.hpp"
-#include "long_read_mapper.hpp"
-#include "pair_info_filler.hpp"
-#include "second_phase_setup.hpp"
-#include "stats/debruijn_stats.hpp"
-#include "path_extend/split_graph_pair_info.hpp"
-
-
-namespace debruijn_graph {
-
-void SecondPhaseSetup::run(conj_graph_pack &gp, const char*) {
- INFO("Preparing second phase");
- gp.ClearRRIndices();
-
- std::string old_pe_contigs_filename = cfg::get().output_dir + "final_contigs.fasta";
- std::string new_pe_contigs_filename = cfg::get().output_dir + "first_pe_contigs.fasta";
-
- VERIFY(path::check_existence(old_pe_contigs_filename));
- INFO("Moving preliminary contigs from " << old_pe_contigs_filename << " to " << new_pe_contigs_filename);
- int code = rename(old_pe_contigs_filename.c_str(), new_pe_contigs_filename.c_str());
- VERIFY(code == 0);
-
- io::SequencingLibrary<debruijn_graph::debruijn_config::DataSetData> untrusted_contigs;
- untrusted_contigs.push_back_single(new_pe_contigs_filename);
- untrusted_contigs.set_orientation(io::LibraryOrientation::Undefined);
- untrusted_contigs.set_type(io::LibraryType::PathExtendContigs);
- cfg::get_writable().ds.reads.push_back(untrusted_contigs);
-
- //FIXME get rid of this awful variable
- cfg::get_writable().use_single_reads = false;
- INFO("Ready to run second phase");
-}
-
-}
diff --git a/src/debruijn/second_phase_setup.hpp b/src/debruijn/second_phase_setup.hpp
deleted file mode 100644
index 7a04748..0000000
--- a/src/debruijn/second_phase_setup.hpp
+++ /dev/null
@@ -1,22 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include "stage.hpp"
-
-namespace debruijn_graph {
-
-//todo rename
-class SecondPhaseSetup : public spades::AssemblyStage {
- public:
- SecondPhaseSetup()
- : AssemblyStage("Second Phase Setup", "second_phase_setup") {}
-
- void run(conj_graph_pack &gp, const char*);
-};
-
-}
diff --git a/src/debruijn/sequence_mapper.hpp b/src/debruijn/sequence_mapper.hpp
deleted file mode 100644
index ef9c1c4..0000000
--- a/src/debruijn/sequence_mapper.hpp
+++ /dev/null
@@ -1,431 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include "omni/omni_utils.hpp"
-#include "sequence/sequence_tools.hpp"
-#include "omni/path_processor.hpp"
-
-#include "runtime_k.hpp"
-#include "edge_index.hpp"
-#include "kmer_mapper.hpp"
-
-#include <cstdlib>
-
-namespace debruijn_graph {
-
-template<class Graph>
-MappingPath<typename Graph::EdgeId> ConjugateMapping(const Graph& g,
- const MappingPath<typename Graph::EdgeId>& mp,
- size_t sequence_length) {
- MappingPath<typename Graph::EdgeId> answer;
- for (size_t i = mp.size(); i > 0; --i) {
- auto p = mp[i-1];
- auto e = p.first;
- MappingRange mr = p.second;
- answer.push_back(g.conjugate(e),
- MappingRange(mr.initial_range.Invert(sequence_length - g.k()),
- mr.mapped_range.Invert(g.length(e))));
- }
- return answer;
-}
-
-template<class Graph>
-class SequenceMapper {
-public:
- typedef typename Graph::EdgeId EdgeId;
- typedef runtime_k::RtSeq Kmer;
-
-protected:
- const Graph& g_;
-
-public:
- SequenceMapper(const Graph& g): g_(g) {
-
- }
-
- virtual ~SequenceMapper() {
-
- }
-
- virtual MappingPath<EdgeId> MapSequence(const Sequence &sequence) const = 0;
-
-
- MappingPath<EdgeId> MapRead(const io::SingleRead &read) const {
-// VERIFY(read.IsValid());
- DEBUG(read.name() << " is mapping");
- string s = read.GetSequenceString();
- size_t l = 0, r = 0;
- MappingPath<EdgeId> result;
- for(size_t i = 0; i < s.size(); i++) {
- if (read.GetSequenceString()[i] == 'N') {
- if (r > l) {
- result.join(MapSequence(Sequence(s.substr(l, r - l))), int(l));
- }
- r = i + 1;
- l = i + 1;
- } else {
- r++;
- }
- }
- if (r > l) {
- result.join(MapSequence(Sequence(s.substr(l, r - l))), int(l));
- }
- DEBUG(read.name() << " is mapped");
- DEBUG("Number of edges is " << result.size());
-
- return result;
- }
-
- virtual size_t KmerSize() const = 0;
-};
-
-template<class Graph>
-class MappingPathFixer {
-public:
-
- typedef typename Graph::EdgeId EdgeId;
- typedef typename Graph::VertexId VertexId;
-
- MappingPathFixer(const Graph& graph)
- : g_(graph) {
- }
-
- bool CheckContiguous(const vector<typename Graph::EdgeId>& path) const {
- for (size_t i = 1; i < path.size(); ++i) {
- if (g_.EdgeEnd(path[i - 1]) != g_.EdgeStart(path[i]))
- return false;
- }
- return true;
- }
-
- Path<EdgeId> TryFixPath(const Path<EdgeId>& path, size_t length_bound = 70) const {
- return Path<EdgeId>(TryFixPath(path.sequence(), length_bound), path.start_pos(), path.end_pos());
- }
-
- vector<EdgeId> TryFixPath(const vector<EdgeId>& edges, size_t length_bound = 70) const {
- vector<EdgeId> answer;
- if (edges.empty()) {
- // WARN("Mapping path was empty");
- return vector<EdgeId>();
- }
- answer.push_back(edges[0]);
- for (size_t i = 1; i < edges.size(); ++i) {
- if (g_.EdgeEnd(edges[i - 1]) != g_.EdgeStart(edges[i])) {
- vector<EdgeId> closure = TryCloseGap(g_.EdgeEnd(edges[i - 1]),
- g_.EdgeStart(edges[i]),
- length_bound);
- answer.insert(answer.end(), closure.begin(), closure.end());
- }
- answer.push_back(edges[i]);
- }
- return answer;
- }
-
- vector<EdgeId> DeleteSameEdges(const vector<EdgeId>& path) const {
- vector<EdgeId> result;
- if (path.empty()) {
- return result;
- }
- result.push_back(path[0]);
- for (size_t i = 1; i < path.size(); ++i) {
- if (path[i] != result[result.size() - 1]) {
- result.push_back(path[i]);
- }
- }
- return result;
- }
-
-private:
- vector<EdgeId> TryCloseGap(VertexId v1, VertexId v2, size_t length_bound) const {
- if (v1 == v2)
- return vector<EdgeId>();
- TRACE("Trying to close gap between v1=" << g_.int_id(v1) << " and v2=" << g_.int_id(v2));
- PathStorageCallback<Graph> path_store(g_);
-
- TRACE("Path storage callback created");
- //todo reduce value after investigation
- ProcessPaths(g_, 0, length_bound, v1, v2, path_store);
-
- TRACE("Paths processed");
- if (path_store.size() == 0) {
- TRACE("Failed to find closing path");
- // TRACE("Failed to close gap between v1=" << graph_.int_id(v1)
- // << " (conjugate "
- // << graph_.int_id(g_.conjugate(v1))
- // << ") and v2=" << g_.int_id(v2)
- // << " (conjugate "
- // << g_.int_id(g_.conjugate(v2)) << ")");
- // return boost::none;
- return vector<EdgeId>();
- } else if (path_store.size() == 1) {
- TRACE("Unique closing path found");
- } else {
- TRACE("Several closing paths found, first chosen");
- }
- TRACE("Taking answer ");
- vector<EdgeId> answer = path_store.paths().front();
- TRACE("Gap closed");
- TRACE( "Cumulative closure length is " << CumulativeLength(g_, answer));
- return answer;
- }
- const Graph& g_;
-};
-
-template<class Graph>
-class ReadPathFinder {
-private:
- typedef typename Graph::EdgeId EdgeId;
- typedef typename Graph::VertexId VertexId;
- const Graph& g_;
- typedef MappingPathFixer<Graph> GraphMappingPathFixer;
- const GraphMappingPathFixer path_fixer_;
-public:
- ReadPathFinder (const Graph& g) :
- g_(g), path_fixer_(g)
- { }
-
- vector<EdgeId> FindReadPath(const MappingPath<EdgeId>& mapping_path) const {
- if (!IsMappingPathValid(mapping_path)) {
- TRACE("read unmapped");
- return vector<EdgeId>();
- }
- vector<EdgeId> corrected_path = path_fixer_.DeleteSameEdges(
- mapping_path.simple_path());
- PrintPathInfo(corrected_path);
- if(corrected_path.size() != mapping_path.simple_path().size()) {
- DEBUG("Some edges were deleted");
- }
- vector<EdgeId> fixed_path = path_fixer_.TryFixPath(corrected_path);
- if (!path_fixer_.CheckContiguous(fixed_path)) {
- TRACE("read unmapped");
- std::stringstream debug_stream;
- for (size_t i = 0; i < fixed_path.size(); ++i) {
- debug_stream << g_.int_id(fixed_path[i]) << " ";
- }
- TRACE(debug_stream.str());
- return vector<EdgeId>();
- } else {
- DEBUG("Path fix works");
- }
- return fixed_path;
- }
-
- vector<vector<EdgeId>> FindReadPathWithGaps(const MappingPath<EdgeId>& mapping_path) const {
- if (!IsMappingPathValid(mapping_path)) {
- TRACE("read unmapped");
- return vector<vector<EdgeId>>();
- }
- vector<EdgeId> corrected_path = path_fixer_.DeleteSameEdges(
- mapping_path.simple_path());
- PrintPathInfo(corrected_path);
- if(corrected_path.size() != mapping_path.simple_path().size()) {
- DEBUG("Some edges were deleted");
- }
- vector<EdgeId> fixed_path = path_fixer_.TryFixPath(corrected_path);
- return SplitUnfixedPoints(fixed_path);
- }
-
-private:
-
- vector<vector<EdgeId>> SplitUnfixedPoints(vector<EdgeId>& path) const {
- vector<vector<EdgeId>> result;
- size_t prev_start = 0;
- for (size_t i = 1; i < path.size(); ++i) {
- if (g_.EdgeEnd(path[i - 1]) != g_.EdgeStart(path[i])) {
- result.push_back(vector<EdgeId>(path.begin() + prev_start, path.begin() + i));
- prev_start = i;
- }
- }
- result.push_back(vector<EdgeId>(path.begin() + prev_start, path.end()));
- return result;
- }
-
- bool IsTip(VertexId v) const {
- return g_.IncomingEdgeCount(v) + g_.OutgoingEdgeCount(v) == 1;
- }
-
- bool IsMappingPathValid(const MappingPath<EdgeId>& path) const {
- return path.size() != 0;
- }
-
- void PrintPathInfo(vector<EdgeId>& corrected_path) const {
- for(size_t i = 0; i < corrected_path.size(); ++i) {
- DEBUG(i + 1 << "-th edge is " << corrected_path[i].int_id());
- }
- }
-};
-
-template<class Graph, class Index>
-class NewExtendedSequenceMapper: public SequenceMapper<Graph> {
-
- using SequenceMapper<Graph>::g_;
-
- public:
- typedef std::vector<MappingRange> RangeMappings;
-
- private:
- const Index& index_;
- typedef typename Graph::EdgeId EdgeId;
- typedef typename Graph::VertexId VertexId;
- typedef typename Index::KMer Kmer;
- typedef KmerMapper<Graph, Kmer> KmerSubs;
- const KmerSubs& kmer_mapper_;
- size_t k_;
- bool optimization_on_;
- // mutable size_t mapped_;
- // mutable size_t unmapped_;
-
- bool FindKmer(const Kmer &kmer, size_t kmer_pos, std::vector<EdgeId> &passed,
- RangeMappings& range_mappings) const {
- std::pair<EdgeId, size_t> position = index_.get(kmer);
- if (position.second != -1u/*index contains this k-mer*/) {
- if (passed.empty() || passed.back() != position.first ||
- kmer_pos != range_mappings.back().initial_range.end_pos ||
- position.second + 1 < range_mappings.back().mapped_range.end_pos) {
- passed.push_back(position.first);
- range_mappings.push_back(
- MappingRange(Range(kmer_pos, kmer_pos + 1),
- Range(position.second, position.second + 1)));
- } else {
- range_mappings.back().initial_range.end_pos = kmer_pos + 1;
- range_mappings.back().mapped_range.end_pos = position.second + 1;
- }
- return true;
- }
- return false;
- }
-
- bool TryThread(const Kmer& kmer, size_t kmer_pos, std::vector<EdgeId> &passed,
- RangeMappings& range_mappings) const {
- EdgeId last_edge = passed.back();
- size_t end_pos = range_mappings.back().mapped_range.end_pos;
- if (end_pos < g_.length(last_edge)) {
- if (g_.EdgeNucls(last_edge)[end_pos + k_ - 1] == kmer[k_ - 1]) {
- range_mappings.back().initial_range.end_pos++;
- range_mappings.back().mapped_range.end_pos++;
- return true;
- }
- } else {
- VertexId v = g_.EdgeEnd(last_edge);
-
- if(!optimization_on_)
- if(g_.OutgoingEdgeCount(v) > 1)
- return false;
-
- for (auto I = g_.out_begin(v), E = g_.out_end(v); I != E; ++I) {
- EdgeId edge = *I;
- if (g_.EdgeNucls(edge)[k_ - 1] == kmer[k_ - 1]) {
- passed.push_back(edge);
- range_mappings.push_back(
- MappingRange(Range(kmer_pos, kmer_pos + 1),
- Range(0, 1)));
- return true;
- }
- }
- }
- return false;
- }
-
- bool Substitute(Kmer& kmer) const {
- Kmer subs = kmer_mapper_.Substitute(kmer);
- if (subs != kmer) {
- kmer = subs;
- return true;
- }
- return false;
- }
-
- bool ProcessKmer(Kmer kmer, size_t kmer_pos, std::vector<EdgeId> &passed_edges,
- RangeMappings& range_mapping, bool try_thread) const {
- if (try_thread) {
- if (!TryThread(kmer, kmer_pos, passed_edges, range_mapping)) {
- Substitute(kmer);
- FindKmer(kmer, kmer_pos, passed_edges, range_mapping);
- return false;
- } else {
- return true;
- }
- } else {
- if (!Substitute(kmer)) {
- return FindKmer(kmer, kmer_pos, passed_edges, range_mapping);
- } else {
- FindKmer(kmer, kmer_pos, passed_edges, range_mapping);
- return false;
- }
- }
- // if (!Substitute(kmer)) {
- // if (try_thread) {
- // return TryThread(kmer, kmer_pos, passed_edges, range_mapping);
- // } else {
- // return FindKmer(kmer, kmer_pos, passed_edges, range_mapping);
- // }
- // } else {
- // FindKmer(kmer, kmer_pos, passed_edges, range_mapping);
- // return false;
- // }
- }
-
- public:
- NewExtendedSequenceMapper(const Graph& g,
- const Index& index,
- const KmerSubs& kmer_mapper,
- bool optimization_on = true) :
- SequenceMapper<Graph>(g), index_(index), kmer_mapper_(kmer_mapper), k_(g.k()+1),
- optimization_on_(optimization_on) { }
-
- ~NewExtendedSequenceMapper() {
- // TRACE("In destructor of sequence mapper");
- // TRACE(mapped_ << " sequences were mapped");
- // TRACE(unmapped_ << " sequences couldn't be mapped");
- }
-
- MappingPath<EdgeId> MapSequence(const Sequence &sequence) const {
- std::vector<EdgeId> passed_edges;
- RangeMappings range_mapping;
-
- if (sequence.size() < k_) {
- return MappingPath<EdgeId>();
- }
-
- Kmer kmer = sequence.start<Kmer>(k_);
- //kmer >>= 0;
- bool try_thread = false;
- try_thread = ProcessKmer(kmer, 0, passed_edges,
- range_mapping, try_thread);
- for (size_t i = k_; i < sequence.size(); ++i) {
- kmer <<= sequence[i];
- try_thread = ProcessKmer(kmer, i - k_ + 1, passed_edges,
- range_mapping, try_thread);
- }
-
- // if (passed_edges.empty()) {
- //// TRACE("Sequence " << sequence << "couldn't be mapped");
- // unmapped_++;
- // //todo maybe check path consistency?
- // } else {
- // mapped_++;
- // }
-
- return MappingPath<EdgeId>(passed_edges, range_mapping);
- }
-
- size_t KmerSize() const {
- return k_;
- }
-
- DECL_LOGGER("NewExtendedSequenceMapper");
-};
-
-
-template<class gp_t>
-std::shared_ptr<NewExtendedSequenceMapper<typename gp_t::graph_t, typename gp_t::index_t> > MapperInstance(const gp_t& gp) {
- return std::make_shared<NewExtendedSequenceMapper<typename gp_t::graph_t, typename gp_t::index_t> >(gp.g, gp.index, gp.kmer_mapper);
-}
-
-}
diff --git a/src/debruijn/sequence_mapper_notifier.hpp b/src/debruijn/sequence_mapper_notifier.hpp
deleted file mode 100644
index 13040ea..0000000
--- a/src/debruijn/sequence_mapper_notifier.hpp
+++ /dev/null
@@ -1,181 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#ifndef SEQUENCE_MAPPER_NOTIFIER_HPP_
-#define SEQUENCE_MAPPER_NOTIFIER_HPP_
-
-#include "sequence_mapper.hpp"
-#include "short_read_mapper.hpp"
-#include "io/paired_read.hpp"
-#include "graph_pack.hpp"
-
-#include "io/paired_read.hpp"
-
-#include <vector>
-#include <cstdlib>
-
-namespace debruijn_graph {
-//todo think if we still need all this
-class SequenceMapperListener {
-public:
- virtual void StartProcessLibrary(size_t threads_count) = 0;
- virtual void StopProcessLibrary() = 0;
-
-    //TODO: think about read hierarchy
- virtual void ProcessPairedRead(size_t thread_index, const io::PairedRead& pr, const MappingPath<EdgeId>& read1, const MappingPath<EdgeId>& read2) = 0;
- virtual void ProcessPairedRead(size_t thread_index, const io::PairedReadSeq& pr, const MappingPath<EdgeId>& read1, const MappingPath<EdgeId>& read2) = 0;
- virtual void ProcessSingleRead(size_t thread_index, const io::SingleRead& r, const MappingPath<EdgeId>& read) = 0;
- virtual void ProcessSingleRead(size_t thread_index, const io::SingleReadSeq& r, const MappingPath<EdgeId>& read) = 0;
-
- virtual void MergeBuffer(size_t thread_index) = 0;
- virtual ~SequenceMapperListener() {}
-};
-
-class SequenceMapperNotifier {
-public:
- typedef SequenceMapper<conj_graph_pack::graph_t> SequenceMapperT;
-
- SequenceMapperNotifier(const conj_graph_pack& gp)
- : gp_(gp) { }
-
- void Subscribe(size_t lib_index, SequenceMapperListener* listener) {
- while ((int)lib_index >= (int)listeners_.size() - 1) {
- std::vector<SequenceMapperListener*> vect;
- listeners_.push_back(vect);
- }
- listeners_[lib_index].push_back(listener);
- }
-
- template<class ReadType>
- void ProcessLibrary(io::ReadStreamList<ReadType>& streams,
- size_t lib_index, const SequenceMapperT& mapper, size_t threads_count = 0) {
- if (threads_count == 0)
- threads_count = streams.size();
-
- streams.reset();
- NotifyStartProcessLibrary(lib_index, threads_count);
-
- size_t counter = 0, n = 15;
- size_t fmem = get_free_memory();
-# pragma omp parallel for num_threads(threads_count) shared(counter)
- for (size_t ithread = 0; ithread < threads_count; ++ithread) {
- size_t size = 0;
- size_t limit = 200000;
- ReadType r;
- auto& stream = streams[ithread];
- stream.reset();
- bool end_of_stream = false;
- while (!end_of_stream) {
- end_of_stream = stream.eof();
- while (!end_of_stream && size < limit) {
- stream >> r;
- ++size;
- NotifyProcessRead(r, mapper, lib_index, ithread);
- end_of_stream = stream.eof();
-                    // Stop filling the buffer once free memory drops below
-                    // ~40% of its initial value (10 * free / 4 < fmem).
- if (10 * get_free_memory() / 4 < fmem &&
- size > 10000)
- break;
- }
-# pragma omp critical
- {
- counter += size;
- if (counter >> n) {
- INFO("Processed " << counter << " reads");
- n += 1;
- }
- size = 0;
- NotifyMergeBuffer(lib_index, ithread);
- }
- }
- }
- INFO("Processed " << counter << " reads");
- NotifyStopProcessLibrary(lib_index);
- }
-
-private:
- template<class ReadType>
- void NotifyProcessRead(const ReadType& r, const SequenceMapperT& mapper, size_t ilib, size_t ithread) const;
-
- void NotifyStartProcessLibrary(size_t ilib, size_t thread_count) const {
- for (const auto& listener : listeners_[ilib])
- listener->StartProcessLibrary(thread_count);
- }
-
- void NotifyStopProcessLibrary(size_t ilib) const {
- for (const auto& listener : listeners_[ilib])
- listener->StopProcessLibrary();
- }
-
- void NotifyMergeBuffer(size_t ilib, size_t ithread) const {
- for (const auto& listener : listeners_[ilib])
- listener->MergeBuffer(ithread);
- }
- const conj_graph_pack& gp_;
-
- std::vector<std::vector<SequenceMapperListener*> > listeners_; //first vector's size = count libs
-};
-
-template<>
-inline void SequenceMapperNotifier::NotifyProcessRead(const io::PairedReadSeq& r,
- const SequenceMapperT& mapper,
- size_t ilib,
- size_t ithread) const {
-
- const Sequence& read1 = r.first().sequence();
- const Sequence& read2 = r.second().sequence();
- MappingPath<EdgeId> path1 = mapper.MapSequence(read1);
- MappingPath<EdgeId> path2 = mapper.MapSequence(read2);
- for (const auto& listener : listeners_[ilib]) {
- TRACE("Dist: " << r.second().size() << " - " << r.insert_size() << " = " << r.second().size() - r.insert_size());
- listener->ProcessPairedRead(ithread, r, path1, path2);
- listener->ProcessSingleRead(ithread, r.first(), path1);
- listener->ProcessSingleRead(ithread, r.second(), path2);
- }
-}
-
-template<>
-inline void SequenceMapperNotifier::NotifyProcessRead(const io::PairedRead& r,
- const SequenceMapperT& mapper,
- size_t ilib,
- size_t ithread) const {
- MappingPath<EdgeId> path1 = mapper.MapRead(r.first());
- MappingPath<EdgeId> path2 = mapper.MapRead(r.second());
- for (const auto& listener : listeners_[ilib]) {
- TRACE("Dist: " << r.second().size() << " - " << r.insert_size() << " = " << r.second().size() - r.insert_size());
- listener->ProcessPairedRead(ithread, r, path1, path2);
- listener->ProcessSingleRead(ithread, r.first(), path1);
- listener->ProcessSingleRead(ithread, r.second(), path2);
- }
-}
-
-template<>
-inline void SequenceMapperNotifier::NotifyProcessRead(const io::SingleReadSeq& r,
- const SequenceMapperT& mapper,
- size_t ilib,
- size_t ithread) const {
- const Sequence& read = r.sequence();
- MappingPath<EdgeId> path = mapper.MapSequence(read);
- for (const auto& listener : listeners_[ilib])
- listener->ProcessSingleRead(ithread, r, path);
-}
-
-template<>
-inline void SequenceMapperNotifier::NotifyProcessRead(const io::SingleRead& r,
- const SequenceMapperT& mapper,
- size_t ilib,
- size_t ithread) const {
- MappingPath<EdgeId> path = mapper.MapRead(r);
- for (const auto& listener : listeners_[ilib])
- listener->ProcessSingleRead(ithread, r, path);
-}
-
-} /*debruijn_graph*/
-
-
-#endif /* SEQUENCE_MAPPER_NOTIFIER_HPP_ */
diff --git a/src/debruijn/short_read_mapper.hpp b/src/debruijn/short_read_mapper.hpp
deleted file mode 100644
index c9395c2..0000000
--- a/src/debruijn/short_read_mapper.hpp
+++ /dev/null
@@ -1,98 +0,0 @@
-/*
- * short_read_mapper.hpp
- *
- * Created on: Dec 4, 2013
- * Author: andrey
- */
-
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-
-#include "sequence_mapper.hpp"
-#include "pacbio/pac_index.hpp"
-
-namespace debruijn_graph {
-
-template<class Graph>
-class SensitiveReadMapper: public SequenceMapper<Graph> {
- typedef typename Graph::EdgeId EdgeId;
- using SequenceMapper<Graph>::g_;
-private:
-
- size_t small_k_;
-
- static map<size_t, pacbio::PacBioMappingIndex<Graph>* > indices_;
- static size_t active_mappers_;
-
- pacbio::PacBioMappingIndex<Graph>* index_;
-
-public:
-
- SensitiveReadMapper(const Graph& g, size_t k, size_t graph_k) :
- SequenceMapper<Graph>(g), small_k_(k)
- {
- if (indices_.find(small_k_) == indices_.end()) {
- indices_.insert(make_pair(small_k_,
- new pacbio::PacBioMappingIndex<Graph>(g, small_k_, graph_k, false)));
- }
- index_ = indices_[small_k_];
- ++active_mappers_;
- }
-
- MappingPath<EdgeId> MapSequence(const Sequence &sequence) const {
- return index_->GetShortReadAlignment(sequence);
- }
-
- size_t KmerSize() const {
- return small_k_;
- }
-
- ~SensitiveReadMapper() {
- --active_mappers_;
- }
-
- static void EraseIndices() {
- if (active_mappers_ > 0) {
- WARN("There are still active mappers");
- }
- for (auto iter = indices_.begin(); iter != indices_.end(); ++iter) {
- delete iter->second;
- }
- indices_.clear();
- }
-
-};
-
-template<class Graph>
-map<size_t, pacbio::PacBioMappingIndex<Graph>* > SensitiveReadMapper<Graph>::indices_;
-
-template<class Graph>
-size_t SensitiveReadMapper<Graph>::active_mappers_ = 0;
-
-
-template<class graph_pack, class SequencingLib>
-std::shared_ptr<SequenceMapper<typename graph_pack::graph_t>> ChooseProperMapper(const graph_pack& gp, const SequencingLib& library) {
- if (library.type() == io::LibraryType::MatePairs) {
- INFO("Mapping mate-pair library, selecting sensitive read mapper with k=" << cfg::get().sensitive_map.k);
- return std::make_shared<SensitiveReadMapper<typename graph_pack::graph_t>>(gp.g, cfg::get().sensitive_map.k, gp.k_value);
- }
-
- size_t read_length = library.data().read_length;
- if (read_length < gp.k_value && library.type() == io::LibraryType::PairedEnd) {
- INFO("Read length = " << read_length << ", selecting short read mapper");
- return std::make_shared<SensitiveReadMapper<typename graph_pack::graph_t>>(gp.g, read_length/ 3, gp.k_value);
- }
-
- INFO("Selecting usual mapper");
- return MapperInstance(gp);
-}
-
-}
-
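
The SensitiveReadMapper removed above shares one PacBioMappingIndex per small k between all mapper instances through a static map, and EraseIndices() merely warns if mappers are still alive when the indices are dropped. A self-contained sketch of that sharing pattern follows; it is not the SPAdes API, and Index here is a trivial stand-in for the real PacBio index:

#include <cstddef>
#include <iostream>
#include <map>
#include <memory>

// Stand-in for the expensive per-k index built by the real code.
struct Index {
    explicit Index(std::size_t k) : k(k) { std::cout << "building index for k=" << k << "\n"; }
    std::size_t k;
};

class CachedMapper {
    std::size_t small_k_;
    Index* index_;
    static std::map<std::size_t, std::unique_ptr<Index>> indices_;
    static std::size_t active_mappers_;

public:
    explicit CachedMapper(std::size_t k) : small_k_(k) {
        auto it = indices_.find(small_k_);
        if (it == indices_.end())
            it = indices_.emplace(small_k_, std::make_unique<Index>(small_k_)).first;
        index_ = it->second.get();
        ++active_mappers_;
    }

    ~CachedMapper() { --active_mappers_; }

    static void EraseIndices() {
        if (active_mappers_ > 0)
            std::cout << "warning: erasing indices while mappers are still active\n";
        indices_.clear();
    }
};

std::map<std::size_t, std::unique_ptr<Index>> CachedMapper::indices_;
std::size_t CachedMapper::active_mappers_ = 0;

int main() {
    CachedMapper a(21), b(21), c(15);  // the k=21 index is built once and shared
    CachedMapper::EraseIndices();      // warns: three mappers are still alive
}

Building the index is the expensive part, so every mapper created for the same k reuses the instance built by the first one; that is the only reason for the static map.
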
diff --git a/src/debruijn/simplification.cpp b/src/debruijn/simplification.cpp
deleted file mode 100644
index 0604af9..0000000
--- a/src/debruijn/simplification.cpp
+++ /dev/null
@@ -1,477 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#include "standard.hpp"
-#include "simplification/simplification_settings.hpp"
-#include "simplification/graph_simplification.hpp"
-#include "omni/visualization/graph_labeler.hpp"
-#include "io/single_read.hpp"
-#include "positions.hpp"
-
-#include "simplification.hpp"
-
-namespace debruijn_graph {
-
-using namespace debruijn::simplification;
-
-class GraphSimplifier {
- typedef std::function<void(EdgeId)> HandlerF;
- typedef omnigraph::PersistentEdgeRemovingAlgorithm<Graph,
- omnigraph::ParallelInterestingElementFinder<Graph, EdgeId>,
- LengthComparator<Graph>> TipClipperT;
- typedef omnigraph::PersistentEdgeRemovingAlgorithm<Graph,
- omnigraph::ParallelInterestingElementFinder<Graph, EdgeId>,
- CoverageComparator<Graph>> ECRemoverT;
-
- typedef std::vector<std::pair<AlgoPtr<Graph>, std::string>> AlgoStorageT;
-
- conj_graph_pack& gp_;
- Graph& g_;
- SimplifInfoContainer info_container_;
- const debruijn_config::simplification simplif_cfg_;
-
- CountingCallback<Graph> cnt_callback_;
- HandlerF removal_handler_;
- stats::detail_info_printer& printer_;
-
-// bool FastModeAvailable(const SimplifInfoContainer& info, double activation_cov_threshold) {
-// const auto& cfg = cfg::get();
-//
-// //todo fix logic
-// //also handles meta case for now
-// if (cfg.ds.single_cell) {
-// return !cfg::get().main_iteration;
-// }
-//
-// if (math::eq(info.detected_mean_coverage(), 0.) &&
-// !cfg.kcm.use_coverage_threshold) {
-// WARN("Mean coverage wasn't reliably estimated");
-// return false;
-// }
-//
-// //todo review logic
-// if (math::ls(info.detected_mean_coverage(), activation_cov_threshold) &&
-// !(cfg.kcm.use_coverage_threshold &&
-// math::ge(cfg.kcm.coverage_threshold, activation_cov_threshold))) {
-// INFO("Estimated mean coverage " << info.detected_mean_coverage() <<
-// " is less than fast mode activation coverage " << activation_cov_threshold);
-// return false;
-// }
-//
-// return true;
-// }
-
- bool PerformInitCleaning() {
- if (simplif_cfg_.init_clean.early_it_only && info_container_.main_iteration()) {
- INFO("Most init cleaning disabled on main iteration");
- return false;
- }
-
- if (math::ge(simplif_cfg_.init_clean.activation_cov, 0.)
- && math::ls(info_container_.detected_mean_coverage(), simplif_cfg_.init_clean.activation_cov)) {
- INFO("Most init cleaning disabled since detected mean " << info_container_.detected_mean_coverage()
- << " was less than activation coverage " << simplif_cfg_.init_clean.activation_cov);
- return false;
- }
-
- return true;
- }
-
- void InitialCleaning() {
- INFO("PROCEDURE == InitialCleaning");
-
- AlgoStorageT algos;
-
- PushValid(
- SelfConjugateEdgeRemoverInstance(g_,
- simplif_cfg_.init_clean.self_conj_condition,
- info_container_, removal_handler_),
- "Self conjugate edge remover",
- algos);
-
- if (PerformInitCleaning()) {
- PushValid(
- IsolatedEdgeRemoverInstance(g_,
- simplif_cfg_.init_clean.ier,
- info_container_, removal_handler_),
- "Initial isolated edge remover",
- algos);
-
- PushValid(
- TipClipperInstance(g_,
- debruijn_config::simplification::tip_clipper(simplif_cfg_.init_clean.tip_condition),
- info_container_,
- removal_handler_),
- "Initial tip clipper",
- algos);
-
- PushValid(
- ECRemoverInstance(g_,
- debruijn_config::simplification::erroneous_connections_remover(simplif_cfg_.init_clean.ec_condition),
- info_container_,
- removal_handler_),
- "Initial ec remover",
- algos);
-
- PushValid(
- LowFlankDisconnectorInstance(g_, gp_.flanking_cov,
- simplif_cfg_.init_clean.disconnect_flank_cov, info_container_,
- removal_handler_),
- "Disconnecting edges with low flanking coverage",
- algos);
- }
-
- RunAlgos(algos);
- }
-
- bool AllTopology() {
- bool res = TopologyRemoveErroneousEdges(gp_.g, simplif_cfg_.tec,
- removal_handler_);
- cnt_callback_.Report();
- res |= TopologyReliabilityRemoveErroneousEdges(gp_.g, simplif_cfg_.trec,
- removal_handler_);
- cnt_callback_.Report();
- res |= RemoveThorns(gp_.g, simplif_cfg_.isec, removal_handler_);
- cnt_callback_.Report();
- res |= MultiplicityCountingRemoveErroneousEdges(gp_.g, simplif_cfg_.tec,
- removal_handler_);
- cnt_callback_.Report();
- return res;
- }
-
- bool FinalRemoveErroneousEdges() {
-
- // gp.ClearQuality();
- // gp.FillQuality();
- // auto colorer = debruijn_graph::DefaultGPColorer(gp);
- // omnigraph::DefaultLabeler<typename gp_t::graph_t> labeler(gp.g, gp.edge_pos);
- // QualityEdgeLocalityPrintingRH<Graph> qual_removal_handler(gp.g, gp.edge_qual, labeler, colorer,
- // cfg::get().output_dir + "pictures/colored_edges_deleted/");
- //
- // //positive quality edges removed (folder colored_edges_deleted)
- // std::function<void(EdgeId)> qual_removal_handler_f = boost::bind(
- // // &QualityLoggingRemovalHandler<Graph>::HandleDelete,
- // &QualityEdgeLocalityPrintingRH<Graph>::HandleDelete,
- // boost::ref(qual_removal_handler), _1);
- //
- // std::function<void(set<EdgeId>)> set_removal_handler_f = boost::bind(
- // &omnigraph::simplification::SingleEdgeAdapter<set<EdgeId>>, _1, qual_removal_handler_f);
- //
-
- std::function<void(set<EdgeId>)> set_removal_handler_f(0);
- if (removal_handler_) {
- set_removal_handler_f = std::bind(
- &omnigraph::simplification::SingleEdgeAdapter<set<EdgeId>>, std::placeholders::_1, removal_handler_);
- }
-
- bool changed = RemoveRelativelyLowCoverageComponents(gp_.g, gp_.flanking_cov,
- simplif_cfg_.rcc, info_container_, set_removal_handler_f);
-
- cnt_callback_.Report();
-
- changed |= DisconnectRelativelyLowCoverageEdges(gp_.g, gp_.flanking_cov, simplif_cfg_.relative_ed);
-
- if (simplif_cfg_.topology_simplif_enabled && info_container_.main_iteration()) {
- changed |= AllTopology();
- changed |= MaxFlowRemoveErroneousEdges(gp_.g, simplif_cfg_.mfec,
- removal_handler_);
- cnt_callback_.Report();
- }
- return changed;
- }
-
- void PostSimplification() {
- INFO("PROCEDURE == Post simplification");
- size_t iteration = 0;
-
- AlgoStorageT algos;
-
- PushValid(
- TipClipperInstance(g_, simplif_cfg_.tc,
- info_container_, removal_handler_),
- "Tip clipper",
- algos);
-
- PushValid(
- TipClipperInstance(g_, simplif_cfg_.final_tc,
- info_container_, removal_handler_),
- "Final tip clipper",
- algos);
-
- PushValid(
- BRInstance(g_, simplif_cfg_.br,
- info_container_, removal_handler_),
- "Bulge remover",
- algos);
-
- PushValid(
- BRInstance(g_, simplif_cfg_.final_br,
- info_container_, removal_handler_),
- "Final bulge remover",
- algos);
-
- if (simplif_cfg_.topology_simplif_enabled) {
- PushValid(
- TopologyTipClipperInstance(g_, simplif_cfg_.ttc,
- info_container_, removal_handler_),
- "Topology tip clipper",
- algos);
- }
-
- //FIXME need better configuration
- if (cfg::get().ds.meta) {
- PushValid(
- BRInstance(g_, simplif_cfg_.second_final_br,
- info_container_, removal_handler_),
- "Yet another final bulge remover",
- algos);
- }
-
- bool enable_flag = true;
- while (enable_flag) {
- enable_flag = false;
-
- INFO("Iteration " << iteration);
-
- enable_flag |= FinalRemoveErroneousEdges();
- cnt_callback_.Report();
-
- enable_flag |= ClipComplexTips(gp_.g, simplif_cfg_.complex_tc, removal_handler_);
- cnt_callback_.Report();
-
- enable_flag |= RemoveComplexBulges(gp_.g, simplif_cfg_.cbr, iteration);
- cnt_callback_.Report();
-
- enable_flag |= RunAlgos(algos);
-
- iteration++;
-
- // printer(ipp_before_final_err_con_removal);
- // printer(ipp_final_tip_clipping, str(format("_%d") % iteration));
- // printer(ipp_final_err_con_removal, str(format("_%d") % iteration));
- // printer(ipp_final_bulge_removal, str(format("_%d") % iteration));
- }
-
- //fixme move to AllTopology?
- if (simplif_cfg_.topology_simplif_enabled) {
- RemoveHiddenEC(gp_.g, gp_.flanking_cov, simplif_cfg_.her, info_container_, removal_handler_);
-
- cnt_callback_.Report();
- }
-
- INFO("Disrupting self-conjugate edges");
- SelfConjugateDisruptor<Graph>(gp_.g, removal_handler_).Run();
- cnt_callback_.Report();
- }
-
- //inline
- //void IdealSimplification(Graph& graph,
- // std::function<double(EdgeId)> quality_handler_f) {
- // for (auto iterator = graph.SmartEdgeBegin(); !iterator.IsEnd();
- // ++iterator) {
- // if (math::eq(quality_handler_f(*iterator), 0.))
- // graph.DeleteEdge(*iterator);
- // }
- // CompressAllVertices(graph);
- //}
-
-// std::shared_ptr<Predicate<EdgeId>> ParseCondition(const string& condition) const {
-// ConditionParser<Graph> parser(g_, condition, info_container_);
-// return parser();
-// }
-
- void PushValid(const AlgoPtr<Graph>& algo_ptr, std::string comment, AlgoStorageT& algos) const {
- if (algo_ptr) {
- algos.push_back(std::make_pair(algo_ptr, comment));
- }
- }
-
- bool RunAlgos(AlgoStorageT& algos, bool force_primary_launch = false) {
- bool changed = false;
- for (auto algo_comment : algos) {
- INFO("Running " << algo_comment.second);
- changed |= algo_comment.first->Run(force_primary_launch);
- cnt_callback_.Report();
- }
- return changed;
- }
-
-public:
- GraphSimplifier(conj_graph_pack &gp, const SimplifInfoContainer& info_container,
- const debruijn_config::simplification& simplif_cfg,
- const std::function<void(EdgeId)>& removal_handler,
- stats::detail_info_printer& printer)
- : gp_(gp),
- g_(gp_.g),
- info_container_(info_container),
- simplif_cfg_(simplif_cfg),
- removal_handler_(AddCountingCallback(cnt_callback_, removal_handler)),
- printer_(printer) {
-
- }
-
- void SimplifyGraph() {
- printer_(ipp_before_simplification);
- INFO("Graph simplification started");
-
- InitialCleaning();
-
- AlgoStorageT algos;
-
- PushValid(
- TipClipperInstance(g_, simplif_cfg_.tc, info_container_, removal_handler_, simplif_cfg_.cycle_iter_count),
- "Tip clipper",
- algos);
- PushValid(
- BRInstance(g_, simplif_cfg_.br, info_container_, removal_handler_, simplif_cfg_.cycle_iter_count),
- "Bulge remover",
- algos);
- PushValid(
- ECRemoverInstance(g_, simplif_cfg_.ec, info_container_, removal_handler_, simplif_cfg_.cycle_iter_count),
- "Low coverage edge remover",
- algos);
-
- size_t iteration = 0;
- bool graph_changed = true;
- //cannot simply stop when nothing has changed, since the thresholds change on every iteration
- while (iteration < simplif_cfg_.cycle_iter_count || graph_changed) {
- INFO("PROCEDURE == Simplification cycle, iteration " << iteration + 1);
- graph_changed = RunAlgos(algos);
- ++iteration;
- }
-
- printer_(ipp_before_post_simplification);
-
- if (simplif_cfg_.post_simplif_enabled) {
- PostSimplification();
- } else {
- INFO("PostSimplification disabled");
- }
- }
-};
-
-void Simplification::run(conj_graph_pack &gp, const char*) {
- using namespace omnigraph;
-
- //no other handlers here, todo change with DetachAll
- gp.index.Detach();
- gp.index.clear();
-
- omnigraph::DefaultLabeler<Graph> labeler(gp.g, gp.edge_pos);
-
- stats::detail_info_printer printer(gp, labeler, cfg::get().output_dir);
-
- // QualityLoggingRemovalHandler<Graph> qual_removal_handler(gp.g, edge_qual);
-// auto colorer = debruijn_graph::DefaultGPColorer(gp);
-// QualityEdgeLocalityPrintingRH<Graph> qual_removal_handler(gp.g, gp.edge_qual, labeler, colorer,
-// cfg::get().output_dir + "pictures/colored_edges_deleted/");
-//
-// //positive quality edges removed (folder colored_edges_deleted)
-// std::function<void(EdgeId)> removal_handler_f = boost::bind(
-// // &QualityLoggingRemovalHandler<Graph>::HandleDelete,
-// &QualityEdgeLocalityPrintingRH<Graph>::HandleDelete,
-// boost::ref(qual_removal_handler), _1);
-
-
- SimplifInfoContainer info_container;
- info_container.set_read_length(cfg::get().ds.RL())
- .set_main_iteration(cfg::get().main_iteration)
- .set_chunk_cnt(5 * cfg::get().max_threads);
-
- if (!cfg::get().ds.meta) {
- //0 if model didn't converge
- //todo take max with trusted_bound
- info_container.set_detected_mean_coverage(gp.ginfo.estimated_mean())
- .set_detected_coverage_bound(gp.ginfo.ec_bound());
- }
-
- debruijn::simplification::GraphSimplifier simplifier(gp, info_container,
- preliminary_ ? cfg::get().preliminary_simp : cfg::get().simp,
- nullptr/*removal_handler_f*/,
- printer);
- simplifier.SimplifyGraph();
-}
-
-void SimplificationCleanup::run(conj_graph_pack &gp, const char*) {
- SimplifInfoContainer info_container;
- info_container
- .set_read_length(cfg::get().ds.RL())
- .set_main_iteration(cfg::get().main_iteration)
- .set_chunk_cnt(5 * cfg::get().max_threads);
-
- IsolatedEdgeRemoverInstance(gp.g, cfg::get().simp.ier, info_container, (HandlerF<Graph>)nullptr)->Run();
-
- double low_threshold = gp.ginfo.trusted_bound();
- if (math::gr(low_threshold, 0.0)) {
- INFO("Removing all the edges having coverage " << low_threshold << " and less");
- ParallelEdgeRemovingAlgorithm<Graph, CoverageComparator<Graph>>
- cov_cleaner(gp.g,
- CoverageUpperBound<Graph>(gp.g, low_threshold),
- info_container.chunk_cnt(),
- (HandlerF<Graph>)nullptr,
- /*canonical_only*/true,
- CoverageComparator<Graph>(gp.g));
- cov_cleaner.Run();
- }
-
- omnigraph::DefaultLabeler<Graph> labeler(gp.g, gp.edge_pos);
- stats::detail_info_printer printer(gp, labeler, cfg::get().output_dir);
- printer(ipp_final_simplified);
-
- DEBUG("Graph simplification finished");
-
- INFO("Counting average coverage");
- AvgCovereageCounter<Graph> cov_counter(gp.g);
- cfg::get_writable().ds.set_avg_coverage(cov_counter.Count());
- INFO("Average coverage = " << cfg::get().ds.avg_coverage());
- if (!cfg::get().ds.single_cell) {
- if (cfg::get().ds.avg_coverage() < gp.ginfo.ec_bound())
- WARN("The determined erroneous connection coverage threshold may be determined improperly\n");
- }
-}
-
-
-#if 0
-void corrected_and_save_reads(const conj_graph_pack& gp) {
- //saving corrected reads
- //todo read input files, correct, save and use on the next iteration
-
- auto_ptr<io::IReader<io::PairedReadSeq>> paired_stream =
- paired_binary_multireader(false, /*insert_size*/0);
- io::ModifyingWrapper<io::PairedReadSeq> refined_paired_stream(
- *paired_stream,
- GraphReadCorrectorInstance(gp.g, *MapperInstance(gp)));
-
- auto_ptr<io::IReader<io::SingleReadSeq>> single_stream =
- single_binary_multireader(false, /*include_paired_reads*/false);
- io::ModifyingWrapper<io::SingleReadSeq> refined_single_stream(
- *single_stream,
- GraphReadCorrectorInstance(gp.g, *MapperInstance(gp)));
-
- if (cfg::get().graph_read_corr.binary) {
- INFO("Correcting paired reads");
-
- io::BinaryWriter paired_converter(
- cfg::get().paired_read_prefix + "_cor", cfg::get().max_threads,
- cfg::get().buffer_size);
- paired_converter.ToBinary(refined_paired_stream);
-
- INFO("Correcting single reads");
- io::BinaryWriter single_converter(
- cfg::get().single_read_prefix + "_cor", cfg::get().max_threads,
- cfg::get().buffer_size);
- single_converter.ToBinary(refined_single_stream);
- } else {
- //save in fasta
- VERIFY(false);
- }
-
- INFO("Error correction done");
-}
-#endif
-
-} //debruijn_graph
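
For reference, the main loop of SimplifyGraph() above runs its algorithm list for at least cycle_iter_count rounds (the per-round thresholds keep changing, so it cannot stop early) and then continues while any algorithm still reports a change. A minimal standalone sketch of just that control flow, using plain std::function stand-ins rather than the SPAdes algorithm classes:

#include <cstddef>
#include <functional>
#include <iostream>
#include <string>
#include <utility>
#include <vector>

// Sketch only: an "algorithm" takes the round number and reports whether it changed anything.
using Algo = std::function<bool(std::size_t)>;
using AlgoStorage = std::vector<std::pair<Algo, std::string>>;

// Run at least min_rounds rounds, then keep going while something still changes.
void RunSimplificationRounds(const AlgoStorage& algos, std::size_t min_rounds) {
    std::size_t iteration = 0;
    bool graph_changed = true;
    while (iteration < min_rounds || graph_changed) {
        graph_changed = false;
        for (const auto& algo : algos) {
            std::cout << "Running " << algo.second << " (round " << iteration + 1 << ")\n";
            graph_changed |= algo.first(iteration);
        }
        ++iteration;
    }
}

int main() {
    // Hypothetical algorithms that stop reporting changes after a few rounds.
    AlgoStorage algos = {
        {[](std::size_t it) { return it < 3; }, "tip clipper"},
        {[](std::size_t it) { return it < 5; }, "bulge remover"},
    };
    RunSimplificationRounds(algos, 10);
}
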
diff --git a/src/debruijn/simplification.hpp b/src/debruijn/simplification.hpp
deleted file mode 100644
index e53c724..0000000
--- a/src/debruijn/simplification.hpp
+++ /dev/null
@@ -1,34 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include "stage.hpp"
-
-namespace debruijn_graph {
-
-class Simplification : public spades::AssemblyStage {
- const bool preliminary_;
- public:
- Simplification(bool preliminary = false)
- : AssemblyStage(preliminary ? "Preliminary Simplification" : "Simplification",
- preliminary ? "simplification_preliminary" : "simplification"),
- preliminary_(preliminary) {}
-
- void run(conj_graph_pack &gp, const char*);
-};
-
-class SimplificationCleanup : public spades::AssemblyStage {
- public:
- SimplificationCleanup()
- : AssemblyStage("Simplification Cleanup", "simplification_cleanup") {}
-
- void run(conj_graph_pack &gp, const char*);
-};
-
-}
-
diff --git a/src/debruijn/simplification/graph_simplification.hpp b/src/debruijn/simplification/graph_simplification.hpp
deleted file mode 100644
index 4a35755..0000000
--- a/src/debruijn/simplification/graph_simplification.hpp
+++ /dev/null
@@ -1,825 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-/*
- * graph_simplification.hpp
- *
- * Created on: Aug 12, 2011
- * Author: sergey
- */
-
-#pragma once
-
-#include "standard_base.hpp"
-#include "config_struct.hpp"
-#include "debruijn_graph.hpp"
-#include "stats/debruijn_stats.hpp"
-
-#include "omni/visualization/graph_colorer.hpp"
-#include "omni/omni_utils.hpp"
-#include "omni/omni_tools.hpp"
-#include "omni/tip_clipper.hpp"
-#include "omni/complex_tip_clipper.hpp"
-#include "omni/bulge_remover.hpp"
-#include "omni/complex_bulge_remover.hpp"
-#include "omni/erroneous_connection_remover.hpp"
-#include "omni/relative_coverage_remover.hpp"
-#include "omni/mf_ec_remover.hpp"
-#include "omni/parallel_processing.hpp"
-#include "utils.hpp"
-#include "simplification/simplification_settings.hpp"
-#include "simplification/single_cell_simplification.hpp"
-#include "detail_coverage.hpp"
-#include "graph_read_correction.hpp"
-#include "detail_coverage.hpp"
-
-#include "stats/chimera_stats.hpp"
-#include "moleculo.hpp"
-
-namespace debruijn {
-
-namespace simplification {
-
-//todo remove this line
-using namespace debruijn_graph;
-
-template<class Graph>
-using AlgoPtr = std::shared_ptr<omnigraph::PersistentAlgorithmBase<Graph>>;
-
-template<class Graph>
-using EdgeConditionT = pred::TypedPredicate<typename Graph::EdgeId>;
-
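-//Parses a simplification condition string into a single predicate over edges. The string
-//is a disjunction of brace-enclosed conjunctions, e.g. "{ tc_lb 3.5, cb auto } { lb 10 }";
-//tokens are separated by spaces, commas or semicolons. Supported primitive conditions are
-//tc_lb, to_ec_lb, ec_lb, lb, cb, icb, rctc, mmm and "disabled"; "auto" stands for the
-//detected coverage bound. The length/coverage bounds implied by the parsed conditions are
-//exposed via max_length_bound()/max_coverage_bound().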
-template<class Graph>
-class ConditionParser {
-private:
- typedef typename Graph::EdgeId EdgeId;
-
- const Graph& g_;
- string next_token_;
- string input_;
- const SimplifInfoContainer settings_;
- size_t curr_iteration_;
- size_t iteration_cnt_;
- std::queue<string> tokenized_input_;
-
- size_t max_length_bound_;
- double max_coverage_bound_;
-
- string ReadNext() {
- if (!tokenized_input_.empty()) {
- next_token_ = tokenized_input_.front();
- tokenized_input_.pop();
- } else {
- next_token_ = "";
- }
- return next_token_;
- }
-
- template<typename T>
- bool RelaxMax(T& cur_max, T t) {
- if (t > cur_max) {
- cur_max = t;
- return true;
- }
- return false;
- }
-
- template<typename T>
- bool RelaxMin(T& cur_min, T t) {
- if (t < cur_min) {
- cur_min = t;
- return true;
- }
- return false;
- }
-
- double GetCoverageBound() {
- if (next_token_ == "auto") {
- return settings_.detected_coverage_bound();
- } else {
- return boost::lexical_cast<double>(next_token_);
- }
- }
-
- pred::TypedPredicate<EdgeId> ParseCondition(size_t& min_length_bound,
- double& min_coverage_bound) {
- if (next_token_ == "tc_lb") {
- double length_coeff = boost::lexical_cast<double>(ReadNext());
-
- DEBUG("Creating tip length bound. Coeff " << length_coeff);
- size_t length_bound = LengthThresholdFinder::MaxTipLength(
- settings_.read_length(), g_.k(), length_coeff);
-
- DEBUG("Length bound " << length_bound);
-
- RelaxMin(min_length_bound, length_bound);
- return LengthUpperBound<Graph>(g_, length_bound);
- } else if (next_token_ == "to_ec_lb") {
- double length_coeff = boost::lexical_cast<double>(ReadNext());
-
- DEBUG( "Creating length bound for erroneous connections originated from tip merging. Coeff " << length_coeff);
- size_t length_bound =
- LengthThresholdFinder::MaxTipOriginatedECLength(
- settings_.read_length(), g_.k(), length_coeff);
-
- DEBUG("Length bound " << length_bound);
-
- RelaxMin(min_length_bound, length_bound);
- return LengthUpperBound<Graph>(g_, length_bound);
- } else if (next_token_ == "ec_lb") {
- size_t length_coeff = boost::lexical_cast<size_t>(ReadNext());
-
- DEBUG("Creating ec length bound. Coeff " << length_coeff);
- size_t length_bound =
- LengthThresholdFinder::MaxErroneousConnectionLength(
- g_.k(), length_coeff);
-
- DEBUG("Length bound " << length_bound);
-
- RelaxMin(min_length_bound, length_bound);
- return LengthUpperBound<Graph>(g_, length_bound);
- } else if (next_token_ == "lb") {
- size_t length_bound = boost::lexical_cast<size_t>(ReadNext());
-
- DEBUG("Creating length bound. Value " << length_bound);
-
- RelaxMin(min_length_bound, length_bound);
- return LengthUpperBound<Graph>(g_, length_bound);
- } else if (next_token_ == "cb") {
- ReadNext();
- double cov_bound = GetCoverageBound();
- DEBUG("Creating coverage upper bound " << cov_bound);
- RelaxMin(min_coverage_bound, cov_bound);
- return CoverageUpperBound<Graph>(g_, cov_bound);
- } else if (next_token_ == "icb") {
- VERIFY(iteration_cnt_ != -1ul && curr_iteration_ != -1ul);
- ReadNext();
- double cov_bound = GetCoverageBound();
- cov_bound = cov_bound / (double) iteration_cnt_ * (double) (curr_iteration_ + 1);
- DEBUG("Creating iterative coverage upper bound " << cov_bound);
- RelaxMin(min_coverage_bound, cov_bound);
- return CoverageUpperBound<Graph>(g_, cov_bound);
- } else if (next_token_ == "rctc") {
- ReadNext();
- DEBUG("Creating relative cov tip cond " << next_token_);
- return RelativeCoverageTipCondition<Graph>(g_, boost::lexical_cast<double>(next_token_));
- } else if (next_token_ == "disabled") {
- DEBUG("Creating disabling condition");
- return pred::AlwaysFalse<EdgeId>();
- } else if (next_token_ == "mmm") {
- ReadNext();
- DEBUG("Creating max mismatches cond " << next_token_);
- return MismatchTipCondition<Graph>(g_, lexical_cast<size_t>(next_token_));
- } else {
- VERIFY(false);
- return pred::AlwaysTrue<EdgeId>();
- }
- }
-
- pred::TypedPredicate<EdgeId> ParseConjunction(size_t& min_length_bound,
- double& min_coverage_bound) {
- pred::TypedPredicate<EdgeId> answer = pred::AlwaysTrue<EdgeId>();
- VERIFY(next_token_ == "{");
- ReadNext();
- while (next_token_ != "}") {
- answer = pred::And(answer,
- ParseCondition(min_length_bound, min_coverage_bound));
- ReadNext();
- }
- return answer;
- }
-
-public:
-
- ConditionParser(const Graph& g, string input, const SimplifInfoContainer& settings,
- size_t curr_iteration = -1ul, size_t iteration_cnt = -1ul)
- : g_(g),
- input_(input),
- settings_(settings),
- curr_iteration_(curr_iteration),
- iteration_cnt_(iteration_cnt),
- max_length_bound_(0),
- max_coverage_bound_(0.) {
- DEBUG("Creating parser for string " << input);
- using namespace boost;
- vector<string> tmp_tokenized_input;
- boost::split(tmp_tokenized_input, input_, boost::is_any_of(" ,;"), boost::token_compress_on);
- for (auto it = tmp_tokenized_input.begin();
- it != tmp_tokenized_input.end(); ++it) {
- tokenized_input_.push(*it);
- }
- ReadNext();
- }
-
- pred::TypedPredicate<EdgeId> operator()() {
- DEBUG("Parsing");
- pred::TypedPredicate<EdgeId> answer = pred::AlwaysFalse<EdgeId>();
- VERIFY_MSG(next_token_ == "{", "Expected \"{\", but next token was " << next_token_);
- while (next_token_ == "{") {
- size_t min_length_bound = numeric_limits<size_t>::max();
- double min_coverage_bound = numeric_limits<double>::max();
- answer = pred::Or(answer,
- ParseConjunction(min_length_bound, min_coverage_bound));
- RelaxMax(max_length_bound_, min_length_bound);
- RelaxMax(max_coverage_bound_, min_coverage_bound);
- ReadNext();
- }
- return answer;
- }
-
- size_t max_length_bound() const {
- return max_length_bound_;
- }
-
- double max_coverage_bound() const {
- return max_coverage_bound_;
- }
-
-private:
- DECL_LOGGER("ConditionParser");
-};
-
-//todo move to visualization
-template<class graph_pack>
-shared_ptr<omnigraph::visualization::GraphColorer<typename graph_pack::graph_t>> DefaultGPColorer(
- const graph_pack& gp) {
- auto mapper = MapperInstance(gp);
- auto path1 = mapper->MapSequence(gp.genome).path();
- auto path2 = mapper->MapSequence(!gp.genome).path();
- return omnigraph::visualization::DefaultColorer(gp.g, path1, path2);
-}
-
-template<class Graph>
-class EditDistanceTrackingCallback {
- typedef typename Graph::EdgeId EdgeId;
- typedef typename Graph::EdgeData EdgeData;
- const Graph& g_;
-
-public:
- EditDistanceTrackingCallback(const Graph& g)
- : g_(g) {
- }
-
- bool operator()(EdgeId edge, const vector<EdgeId>& path) const {
- vector<Sequence> path_sequences;
- for (auto it = path.begin(); it != path.end(); ++it) {
- path_sequences.push_back(g_.EdgeNucls(*it));
- }
- Sequence path_sequence(
- MergeOverlappingSequences(path_sequences, g_.k()));
- size_t dist = EditDistance(g_.EdgeNucls(edge), path_sequence);
- TRACE( "Bulge sequences with distance " << dist << " were " << g_.EdgeNucls(edge) << " and " << path_sequence);
- return true;
- }
-
-private:
- DECL_LOGGER("EditDistanceTrackingCallback")
- ;
-};
-
-//template<class Graph, class SmartEdgeIt>
-//bool ClipTips(
-// Graph& g,
-// SmartEdgeIt& it,
-// const debruijn_config::simplification::tip_clipper& tc_config,
-// const SimplifInfoContainer& info,
-// std::function<void(typename Graph::EdgeId)> removal_handler = 0) {
-//
-// INFO("Clipping tips");
-//
-// string condition_str = tc_config.condition;
-//
-// ConditionParser<Graph> parser(g, condition_str, info);
-// auto condition = parser();
-//
-// omnigraph::EdgeRemovingAlgorithm<Graph> tc(g,
-// omnigraph::AddTipCondition(g, condition),
-// removal_handler, true);
-//
-// TRACE("Tip length bound " << parser.max_length_bound());
-// return tc.RunFromIterator(it,
-// make_shared<LengthUpperBound<Graph>>(g, parser.max_length_bound()));
-//}
-
-//template<class Graph>
-//bool ClipTips(
-// Graph& g,
-// const debruijn_config::simplification::tip_clipper& tc_config,
-// const SimplifInfoContainer& info,
-// std::function<void(typename Graph::EdgeId)> removal_handler = 0) {
-//
-// auto it = g.SmartEdgeBegin(LengthComparator<Graph>(g), true);
-// return ClipTips(g, it, tc_config, info, removal_handler);
-//}
-
-//enabling tip projection, todo optimize if hotspot
-template<class gp_t>
-HandlerF<typename gp_t::graph_t> WrapWithProjectionCallback(
- gp_t& gp,
- HandlerF<typename gp_t::graph_t> removal_handler) {
- typedef typename gp_t::graph_t Graph;
- typedef typename Graph::EdgeId EdgeId;
- TipsProjector<gp_t> tip_projector(gp);
-
- HandlerF<Graph> projecting_callback = std::bind(&TipsProjector<gp_t>::ProjectTip,
- tip_projector, std::placeholders::_1);
-
- return func::Composition<EdgeId>(std::ref(removal_handler), projecting_callback);
-}
-
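-//Iterative low-coverage ("erroneous connection") removal: before each pass the condition
-//string is re-parsed with the current iteration number, so iteration-dependent bounds
-//(icb) grow towards their final value; edges are visited in order of increasing coverage
-//and a pass stops once the coverage exceeds the parsed upper bound.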
-template<class Graph, class InterestingEdgeFinder>
-class LowCoverageEdgeRemovingAlgorithm : public PersistentEdgeRemovingAlgorithm<Graph,
- InterestingEdgeFinder, CoverageComparator<Graph>> {
- typedef typename Graph::EdgeId EdgeId;
- typedef PersistentEdgeRemovingAlgorithm<Graph, InterestingEdgeFinder, CoverageComparator<Graph>> base;
- SimplifInfoContainer simplif_info_;
- std::string condition_str_;
- pred::TypedPredicate<EdgeId> remove_condition_;
- pred::TypedPredicate<EdgeId> proceed_condition_;
-
-protected:
-
- void PrepareIteration(size_t it_cnt, size_t total_it_estimate) override {
- TRACE("Preparing iteration " << it_cnt << " out of total estimate " << total_it_estimate);
- ConditionParser<Graph> parser(this->g(), condition_str_,
- simplif_info_, it_cnt, total_it_estimate);
- remove_condition_ = omnigraph::AddAlternativesPresenceCondition(this->g(), parser());
- TRACE("Updated remove condition");
- proceed_condition_ = CoverageUpperBound<Graph>(this->g(), parser.max_coverage_bound());
- TRACE("Updated proceed condition up to coverage " << parser.max_coverage_bound());
- }
-
- bool Proceed(EdgeId e) const override {
- return proceed_condition_(e);
- }
-
- bool ShouldRemove(EdgeId e) const override {
- return remove_condition_(e);
- }
-
-public:
- LowCoverageEdgeRemovingAlgorithm(Graph& g,
- const InterestingEdgeFinder& interest_edge_finder,
- const SimplifInfoContainer& simplif_info,
- const std::string& condition_str,
- std::function<void(EdgeId)> removal_handler = nullptr,
- bool canonical_only = false,
- bool track_changes = true,
- size_t total_iteration_estimate = -1ul)
- : base(g, interest_edge_finder,
- removal_handler,
- canonical_only,
- CoverageComparator<Graph>(g),
- track_changes,
- total_iteration_estimate),
- simplif_info_(simplif_info),
- condition_str_(condition_str),
- remove_condition_(pred::AlwaysFalse<EdgeId>()),
- proceed_condition_(pred::AlwaysTrue<EdgeId>()) {}
-private:
- DECL_LOGGER("LowCoverageEdgeRemovingAlgorithm");
-};
-
-template<class Graph>
-AlternativesAnalyzer<Graph> ParseBRConfig(const Graph& g,
- const debruijn_config::simplification::bulge_remover& config) {
- size_t max_length = LengthThresholdFinder::MaxBulgeLength(
- g.k(), config.max_bulge_length_coefficient,
- config.max_additive_length_coefficient);
-
- DEBUG("Length bound " << max_length);
-
- return AlternativesAnalyzer<Graph>(g, config.max_coverage,
- max_length,
- config.max_relative_coverage,
- config.max_delta,
- config.max_relative_delta,
- config.max_number_edges);
-}
-
-template<class Graph>
-AlgoPtr<Graph> SelfConjugateEdgeRemoverInstance(Graph &g, const string& condition_str,
- const SimplifInfoContainer& info,
- HandlerF<Graph> removal_handler = 0) {
- ConditionParser<Graph> parser(g, condition_str, info);
- auto condition = pred::And(SelfConjugateCondition<Graph>(g), parser());
-
- return std::make_shared<ParallelEdgeRemovingAlgorithm<Graph>>(g,
- condition,
- info.chunk_cnt(),
- removal_handler,
- /*canonical_only*/true);
-}
-
-template<class Graph>
-bool RemoveRelativelyLowCoverageComponents(
- Graph &g,
- const FlankingCoverage<Graph>& flanking_cov,
- const debruijn_config::simplification::relative_coverage_comp_remover& rcc_config,
- const SimplifInfoContainer& info,
- typename ComponentRemover<Graph>::HandlerF removal_handler = 0) {
- if (rcc_config.enabled) {
- INFO("Removing relatively low covered connections");
- size_t connecting_path_length_bound = LengthThresholdFinder::MaxErroneousConnectionLength(
- g.k(), rcc_config.max_ec_length_coefficient);
-
- std::string pics_dir = "";//cfg::get().output_dir + "rel_cov_components/"
-
- double max_coverage = math::ge(rcc_config.max_coverage_coeff, 0.)
- ? info.detected_coverage_bound() * rcc_config.max_coverage_coeff
- : std::numeric_limits<double>::max();
-
- omnigraph::simplification::relative_coverage::
- RelativeCoverageComponentRemover<Graph> rel_rem(
- g,
- std::bind(&FlankingCoverage<Graph>::LocalCoverage,
- std::cref(flanking_cov), std::placeholders::_1, std::placeholders::_2),
- rcc_config.coverage_gap, size_t(double(info.read_length()) * rcc_config.length_coeff),
- size_t(double(info.read_length()) * rcc_config.tip_allowing_length_coeff),
- connecting_path_length_bound,
- max_coverage,
- removal_handler, rcc_config.vertex_count_limit, pics_dir);
- return rel_rem.Run();
- } else {
- INFO("Removal of relatively low covered connections disabled");
- return false;
- }
-}
-
-template<class Graph>
-bool DisconnectRelativelyLowCoverageEdges(Graph &g,
- const FlankingCoverage<Graph>& flanking_cov,
- const debruijn_config::simplification::relative_coverage_edge_disconnector& rced_config) {
- if (rced_config.enabled) {
- INFO("Disconnecting edges with relatively low coverage");
- omnigraph::simplification::relative_coverage::RelativeCoverageDisconnector<
- Graph> disconnector(g, std::bind(&FlankingCoverage<Graph>::LocalCoverage,
- std::cref(flanking_cov), std::placeholders::_1,
- std::placeholders::_2), rced_config.diff_mult);
- return disconnector.Run();
- } else {
- INFO("Disconnection of relatively low covered edges disabled");
- return false;
- }
-}
-
-template<class Graph>
-bool RemoveComplexBulges(
- Graph& g,
- debruijn_config::simplification::complex_bulge_remover cbr_config,
- size_t /*iteration*/ = 0) {
- if (!cbr_config.enabled)
- return false;
- INFO("Removing complex bulges");
- size_t max_length = (size_t) ((double) g.k() * cbr_config.max_relative_length);
- size_t max_diff = cbr_config.max_length_difference;
- omnigraph::complex_br::ComplexBulgeRemover<Graph> complex_bulge_remover(
- g, max_length, max_diff);
- return complex_bulge_remover.Run();
-}
-
-//template<class Graph>
-//bool RemoveIsolatedEdges(Graph &g, size_t max_length, double max_coverage, size_t max_length_any_cov,
-// std::function<void(typename Graph::EdgeId)> removal_handler = 0, size_t chunk_cnt = 1) {
-// typedef typename Graph::EdgeId EdgeId;
-//
-// //todo add info that some other edges might be removed =)
-// INFO("Removing isolated edges");
-// INFO("All edges shorter than " << max_length_any_cov << " will be removed");
-// INFO("Also edges shorter than " << max_length << " and coverage smaller than " << max_coverage << " will be removed");
-// //todo add warn on max_length_any_cov > max_length
-//
-// auto condition = func::And<EdgeId>(
-// make_shared<IsolatedEdgeCondition<Graph>>(g),
-// func::Or<EdgeId>(
-// make_shared<LengthUpperBound<Graph>>(g, max_length_any_cov),
-// func::And<EdgeId>(
-// make_shared<LengthUpperBound<Graph>>(g, max_length),
-// make_shared<CoverageUpperBound<Graph>>(g, max_coverage)
-// )));
-//
-// if (chunk_cnt == 1) {
-// omnigraph::EdgeRemovingAlgorithm<Graph> removing_algo(g, condition, removal_handler);
-//
-// return removing_algo.Run(LengthComparator<Graph>(g),
-// make_shared<LengthUpperBound<Graph>>(g, std::max(max_length, max_length_any_cov)));
-// } else {
-// SemiParallelAlgorithmRunner<Graph, EdgeId> runner(g);
-// SemiParallelEdgeRemovingAlgorithm<Graph> removing_algo(g, condition, removal_handler);
-//
-// return RunEdgeAlgorithm(g, runner, removing_algo, chunk_cnt);
-// }
-//}
-
-template<class Graph>
-bool ClipComplexTips(Graph& g, debruijn_config::simplification::complex_tip_clipper ctc_conf, HandlerF<Graph> removal_handler = 0) {
- if(!ctc_conf.enabled) {
- INFO("Complex tip clipping disabled");
- return false;
- }
-
- std::function<void(set<EdgeId>)> set_removal_handler_f(0);
- if (removal_handler) {
- set_removal_handler_f = std::bind(
- &omnigraph::simplification::SingleEdgeAdapter<set<EdgeId>>, std::placeholders::_1, removal_handler);
- }
-
- INFO("Complex tip clipping");
- size_t max_edge_length = g.k() * 2;
- ComplexTipClipper<Graph> tip_clipper(g, max_edge_length, "", set_removal_handler_f);
- tip_clipper.Run();
- return true;
-}
-
-template<class Graph>
-AlgoPtr<Graph> IsolatedEdgeRemoverInstance(Graph &g,
- debruijn_config::simplification::isolated_edges_remover ier,
- const SimplifInfoContainer& info,
- HandlerF<Graph> removal_handler = 0) {
- if (!ier.enabled) {
- return nullptr;
- }
-
- //todo document behavior
- size_t max_length_any_cov = std::max(info.read_length(), ier.max_length_any_cov);
-
-// INFO("Removing isolated edges");
-// INFO("All edges shorter than " << max_length_any_cov << " will be removed");
-// INFO("Also edges shorter than " << ier.max_length << " and coverage smaller than " << ier.max_coverage << " will be removed");
- //todo add warn on max_length_any_cov > max_length
-
- auto condition = pred::And(IsolatedEdgeCondition<Graph>(g),
- pred::Or(LengthUpperBound<Graph>(g, max_length_any_cov),
- pred::And(LengthUpperBound<Graph>(g, ier.max_length),
- CoverageUpperBound<Graph>(g, ier.max_coverage))));
-
- return std::make_shared<ParallelEdgeRemovingAlgorithm<Graph>>(g,
- condition,
- info.chunk_cnt(),
- removal_handler,
- /*canonical_only*/true);
-}
-
-template<class Graph>
-pred::TypedPredicate<typename Graph::EdgeId> NecessaryBulgeCondition(const Graph& g,
- const debruijn_config::simplification::bulge_remover& br_config,
- const SimplifInfoContainer&) {
- auto analyzer = ParseBRConfig(g, br_config);
- return omnigraph::NecessaryBulgeCondition(g, analyzer.max_length(), analyzer.max_coverage());
-}
-
-template<class Graph>
-pred::TypedPredicate<typename Graph::EdgeId> NecessaryTipCondition(const Graph& g,
- const debruijn_config::simplification::tip_clipper& tc_config,
- const SimplifInfoContainer& info) {
- ConditionParser<Graph> parser(g, tc_config.condition, info);
- auto condition = parser();
- return omnigraph::NecessaryTipCondition(g, parser.max_length_bound(),
- parser.max_coverage_bound());
-}
-
-template<class Graph>
-pred::TypedPredicate<typename Graph::EdgeId> NecessaryECCondition(const Graph& g,
- const debruijn_config::simplification::erroneous_connections_remover& ec_config,
- const SimplifInfoContainer& info, size_t current_iteration = 0, size_t iteration_cnt = 1) {
- ConditionParser<Graph> parser(g, ec_config.condition, info, current_iteration, iteration_cnt);
- auto condition = parser();
- return omnigraph::NecessaryECCondition(g, parser.max_length_bound(),
- parser.max_coverage_bound());
-}
-
-template<class Graph>
-AlgoPtr<Graph> ECRemoverInstance(Graph& g,
- const debruijn_config::simplification::erroneous_connections_remover& ec_config,
- const SimplifInfoContainer& info,
- HandlerF<Graph> removal_handler,
- size_t iteration_cnt = 1) {
- if (ec_config.condition.empty())
- return nullptr;
-
- typedef omnigraph::ParallelInterestingElementFinder<Graph> InterestingFinderT;
- InterestingFinderT interesting_finder(g,
- NecessaryECCondition(g, ec_config, info, iteration_cnt - 1, iteration_cnt),
- info.chunk_cnt());
- return make_shared<LowCoverageEdgeRemovingAlgorithm<Graph, InterestingFinderT>>(
- g, interesting_finder, info, ec_config.condition, removal_handler,
- /*canonical only*/ true, /*track changes*/ true, iteration_cnt);
-}
-
-template<class Graph>
-AlgoPtr<Graph> TipClipperInstance(Graph& g,
- const EdgeConditionT<Graph>& condition,
- const SimplifInfoContainer& info,
- HandlerF<Graph> removal_handler,
- bool track_changes = true,
- size_t /*iteration_cnt*/ = 1) {
- return make_shared<ParallelEdgeRemovingAlgorithm<Graph, LengthComparator<Graph>>>(g,
- AddTipCondition(g, condition),
- info.chunk_cnt(),
- removal_handler,
- /*canonical_only*/true,
- LengthComparator<Graph>(g),
- track_changes);
-}
-
-template<class Graph>
-AlgoPtr<Graph> TipClipperInstance(Graph& g,
- const debruijn_config::simplification::tip_clipper& tc_config,
- const SimplifInfoContainer& info,
- HandlerF<Graph> removal_handler,
- size_t iteration_cnt = 1) {
- if (tc_config.condition.empty())
- return nullptr;
-
- ConditionParser<Graph> parser(g, tc_config.condition, info);
- auto condition = parser();
- return TipClipperInstance(g, condition, info, removal_handler, /*track changes*/true, iteration_cnt);
-}
-
-template<class Graph>
-AlgoPtr<Graph> TopologyTipClipperInstance(
- Graph &g,
- const debruijn_config::simplification::topology_tip_clipper& ttc_config,
- const SimplifInfoContainer& info,
- HandlerF<Graph> removal_handler) {
-
- auto condition
- = pred::And(LengthUpperBound<Graph>(g,
- LengthThresholdFinder::MaxTipLength(info.read_length(), g.k(), ttc_config.length_coeff)),
- DefaultUniquenessPlausabilityCondition<Graph>(g,
- ttc_config.uniqueness_length, ttc_config.plausibility_length));
-
- return TipClipperInstance(g,
- condition, info, removal_handler, /*track changes*/false);
-}
-
-template<class Graph>
-AlgoPtr<Graph> BRInstance(Graph& g,
- const debruijn_config::simplification::bulge_remover& br_config,
- const SimplifInfoContainer& info,
- HandlerF<Graph> removal_handler,
- size_t /*iteration_cnt*/ = 1) {
- typedef ParallelInterestingElementFinder<Graph,
- typename Graph::EdgeId> InterestingEdgeFinder;
- if (!br_config.enabled || (br_config.main_iteration_only && !info.main_iteration())) {
- return nullptr;
- }
-
- auto alternatives_analyzer = ParseBRConfig(g, br_config);
-
-
- InterestingEdgeFinder interesting_edge_finder(g,
- NecessaryBulgeCondition(g,
- alternatives_analyzer.max_length(),
- alternatives_analyzer.max_coverage()),
- info.chunk_cnt());
- if (br_config.parallel) {
- INFO("Creating parallel br instance");
- return make_shared<ParallelBulgeRemover<Graph, InterestingEdgeFinder>>(g,
- interesting_edge_finder,
- br_config.buff_size,
- br_config.buff_cov_diff,
- br_config.buff_cov_rel_diff,
- alternatives_analyzer,
- nullptr,
- removal_handler,
- /*track_changes*/true);
- } else {
- INFO("Creating br instance");
- return make_shared<BulgeRemover<Graph, InterestingEdgeFinder>>(g,
- interesting_edge_finder,
- alternatives_analyzer,
- nullptr,
- removal_handler,
- /*track_changes*/true);
- }
-}
-
-//todo make this all work for the ends of the edges as well? switch to canonical iteration?
-//todo rename, since checking topology also
-template<class Graph>
-class FlankingCovBound : public EdgeCondition<Graph> {
- typedef EdgeCondition<Graph> base;
- typedef typename Graph::EdgeId EdgeId;
- const FlankingCoverage<Graph>& flanking_cov_;
- double max_coverage_;
-public:
- FlankingCovBound(const Graph& g,
- const FlankingCoverage<Graph>& flanking_cov,
- double max_coverage)
- : base(g),
- flanking_cov_(flanking_cov),
- max_coverage_(max_coverage) {
- }
-
- bool Check(EdgeId e) const override {
- return this->g().length(e) > 1
- && this->g().OutgoingEdgeCount(this->g().EdgeStart(e)) > 1
- && math::le(flanking_cov_.CoverageOfStart(e), max_coverage_);
- }
-
-};
-
-template<class Graph, class Comparator = std::less<typename Graph::EdgeId>>
-class ParallelDisconnectionAlgorithm : public PersistentProcessingAlgorithm<Graph,
- typename Graph::EdgeId,
- ParallelInterestingElementFinder<Graph>, Comparator> {
- typedef typename Graph::EdgeId EdgeId;
- typedef PersistentProcessingAlgorithm<Graph, EdgeId,
- ParallelInterestingElementFinder<Graph>, Comparator> base;
- pred::TypedPredicate<EdgeId> condition_;
- omnigraph::simplification::relative_coverage::EdgeDisconnector<Graph> disconnector_;
-
-public:
- ParallelDisconnectionAlgorithm(Graph& g,
- pred::TypedPredicate<EdgeId> condition,
- size_t chunk_cnt,
- HandlerF<Graph> removal_handler,
- const Comparator& comp = Comparator(),
- bool track_changes = true)
- : base(g,
- ParallelInterestingElementFinder<Graph>(g, condition, chunk_cnt),
- /*canonical_only*/false, comp, track_changes),
- condition_(condition),
- disconnector_(g, removal_handler) {
- }
-
- bool Process(EdgeId e) override {
- if (condition_(e)) {
- disconnector_(e);
- return true;
- }
- return false;
- }
-
-};
-
-template<class Graph>
-AlgoPtr<Graph> LowFlankDisconnectorInstance(Graph& g,
- const FlankingCoverage<Graph>& flanking_cov,
- double cov_bound,
- const SimplifInfoContainer& info,
- HandlerF<Graph> removal_handler) {
- if (math::ls(cov_bound, 0.)) {
- INFO("Flanking coverage based disconnection disabled");
- return nullptr;
- }
-
- return make_shared<ParallelDisconnectionAlgorithm<Graph>>(g,
- FlankingCovBound<Graph>(g, flanking_cov, cov_bound),
- info.chunk_cnt(),
- removal_handler);
-}
-
-////todo add chunk_cnt
-//template<class Graph>
-//bool ClipTips(
-// Graph& g,
-// const std::string& condition,
-// const SimplifInfoContainer& info,
-// std::function<void(typename Graph::EdgeId)> removal_handler = 0) {
-//
-// if (condition != "") {
-// ConditionParser<Graph> parser(g, condition, info);
-// auto condition = parser();
-// ParallelEdgeRemovingAlgorithm<Graph, LengthComparator<Graph>> algo(g,
-// AddTipCondition(g, condition),
-// info.chunk_cnt(),
-// removal_handler,
-// /*canonical_only*/true,
-// LengthComparator<Graph>(g));
-// return algo.Run();
-// } else {
-// return false;
-// }
-//}
-
-//template<class Graph>
-//bool RemoveLowCoverageEdges(
-// Graph& g,
-// const std::string& condition,
-// const SimplifInfoContainer& info,
-// std::function<void(typename Graph::EdgeId)> removal_handler = 0) {
-//
-// if (condition != "") {
-// ConditionParser<Graph> parser(g, condition, info);
-// auto condition = parser();
-// blahblahblah
-// ParallelEdgeRemovingAlgorithm<Graph, CoverageComparator<Graph>> algo(g,
-// condition,
-// info.chunk_cnt(),
-// removal_handler,
-// /*canonical_only*/true,
-// CoverageComparator<Graph>(g));
-// return algo.Run();
-// } else {
-// return false;
-// }
-//}
-
-}
-}
diff --git a/src/debruijn/simplification/parallel_simplification_algorithms.hpp b/src/debruijn/simplification/parallel_simplification_algorithms.hpp
deleted file mode 100644
index 113cd02..0000000
--- a/src/debruijn/simplification/parallel_simplification_algorithms.hpp
+++ /dev/null
@@ -1,924 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include "standard_base.hpp"
-#include "omni/graph_processing_algorithm.hpp"
-#include "omni/basic_edge_conditions.hpp"
-#include "omni/bulge_remover.hpp"
-#include "omni/construction_helper.hpp"
-#include "omni/marks_and_locks.hpp"
-#include "simplification_settings.hpp"
-
-namespace debruijn {
-
-namespace simplification {
-
-// bool EnableParallel() {
-// if (simplif_cfg_.presimp.parallel) {
-// INFO("Trying to enable parallel presimplification.");
-// if (gp_.g.AllHandlersThreadSafe()) {
-// return true;
-// } else {
-// WARN("Not all handlers are threadsafe, switching to non-parallel presimplif");
-// //gp.g.PrintHandlersNames();
-// }
-// }
-// return false;
-// }
-
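-//Vertex-level functor for parallel tip clipping: an incoming edge is treated as a tip if
-//it is short, low-covered and its start vertex has no other incident edges. Per-vertex
-//locks guard degree queries and edge removal; if every incoming edge of a vertex turns
-//out to be a tip, the longest one is kept.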
-template<class Graph>
-class ParallelTipClippingFunctor {
- typedef typename Graph::EdgeId EdgeId;
- typedef typename Graph::VertexId VertexId;
- typedef std::function<void(EdgeId)> HandlerF;
- typedef omnigraph::GraphElementLock<VertexId> VertexLockT;
-
- Graph& g_;
- size_t length_bound_;
- double coverage_bound_;
- HandlerF handler_f_;
-
- size_t LockingIncomingCount(VertexId v) const {
- VertexLockT lock(v);
- return g_.IncomingEdgeCount(v);
- }
-
- size_t LockingOutgoingCount(VertexId v) const {
- VertexLockT lock(v);
- return g_.OutgoingEdgeCount(v);
- }
-
- bool IsIncomingTip(EdgeId e) const {
- return g_.length(e) <= length_bound_ && math::le(g_.coverage(e), coverage_bound_)
- && LockingIncomingCount(g_.EdgeStart(e)) + LockingOutgoingCount(g_.EdgeStart(e)) == 1;
- }
-
- void RemoveEdge(EdgeId e) {
- //even full tip locking can't lead to deadlock
- VertexLockT lock1(g_.EdgeStart(e));
- VertexLockT lock2(g_.EdgeEnd(e));
- g_.DeleteEdge(e);
- }
-
-public:
-
- ParallelTipClippingFunctor(Graph& g, size_t length_bound, double coverage_bound, HandlerF handler_f = 0)
- : g_(g),
- length_bound_(length_bound),
- coverage_bound_(coverage_bound),
- handler_f_(handler_f) {
-
- }
-
- bool Process(VertexId v) {
- if (LockingOutgoingCount(v) == 0)
- return false;
-
- vector<EdgeId> tips;
- //don't need lock here after the previous check
- for (EdgeId e : g_.IncomingEdges(v)) {
- if (IsIncomingTip(e)) {
- tips.push_back(e);
- }
- }
-
- //if all incoming edges are tips, keep the longest one
- if (!tips.empty() && tips.size() == g_.IncomingEdgeCount(v)) {
- sort(tips.begin(), tips.end(), omnigraph::LengthComparator<Graph>(g_));
- tips.pop_back();
- }
-
- for (EdgeId e : tips) {
- if (handler_f_) {
- handler_f_(e);
- }
- //don't need any synchronization here!
- RemoveEdge(e);
- }
- return false;
- }
-
- bool ShouldFilterConjugate() const {
- return false;
- }
-};
-
-template<class Graph>
-class ParallelSimpleBRFunctor {
- typedef typename Graph::EdgeId EdgeId;
- typedef typename Graph::VertexId VertexId;
- typedef omnigraph::GraphElementLock<VertexId> VertexLockT;
-
- Graph& g_;
- size_t max_length_;
- double max_coverage_;
- double max_relative_coverage_;
- size_t max_delta_;
- double max_relative_delta_;
- std::function<void(EdgeId)> handler_f_;
-
- bool LengthDiffCheck(size_t l1, size_t l2, size_t delta) const {
- return l1 <= l2 + delta && l2 <= l1 + delta;
- }
-
- EdgeId Alternative(EdgeId e, const vector<EdgeId>& edges) const {
- size_t delta = omnigraph::CountMaxDifference(max_delta_, g_.length(e), max_relative_delta_);
- for (auto it = edges.rbegin(); it != edges.rend(); ++it) {
- EdgeId candidate = *it;
- if (g_.EdgeEnd(candidate) == g_.EdgeEnd(e) && candidate != e && candidate != g_.conjugate(e)
- && LengthDiffCheck(g_.length(candidate), g_.length(e), delta)) {
- return candidate;
- }
- }
- return EdgeId(0);
- }
-
- bool ProcessEdges(const vector<EdgeId>& edges) {
- for (EdgeId e : edges) {
- if (g_.length(e) <= max_length_ && math::le(g_.coverage(e), max_coverage_)) {
- EdgeId alt = Alternative(e, edges);
- if (alt != EdgeId(0) && math::ge(g_.coverage(alt) * max_relative_coverage_, g_.coverage(e))) {
- //todo does not work with multiple threads for now :)
- //Reasons: id distribution, kmer-mapping
- handler_f_(e);
- g_.GlueEdges(e, alt);
- return true;
- }
- }
- }
- return false;
- }
-
- vector<VertexId> MultiEdgeDestinations(VertexId v) const {
- vector<VertexId> answer;
- set<VertexId> destinations;
- for (EdgeId e : g_.OutgoingEdges(v)) {
- VertexId end = g_.EdgeEnd(e);
- if (destinations.count(end) > 0) {
- answer.push_back(end);
- }
- destinations.insert(end);
- }
- return answer;
- }
-
- VertexId SingleMultiEdgeDestination(VertexId v) const {
- vector<VertexId> dests = MultiEdgeDestinations(v);
- if (dests.size() == 1) {
- return dests.front();
- } else {
- return VertexId(0);
- }
- }
-
- void RemoveBulges(VertexId v) {
- bool flag = true;
- while (flag) {
- vector<EdgeId> edges(g_.out_begin(v), g_.out_end(v));
- if (edges.size() == 1)
- return;
- sort(edges.begin(), edges.end(), omnigraph::CoverageComparator<Graph>(g_));
- flag = ProcessEdges(edges);
- }
- }
-
- bool CheckVertex(VertexId v) const {
- VertexLockT lock(v);
- return MultiEdgeDestinations(v).size() == 1 && MultiEdgeDestinations(g_.conjugate(v)).size() == 0;
- }
-
- size_t MinId(VertexId v) const {
- return std::min(v.int_id(), g_.conjugate(v).int_id());
- }
-
- bool IsMinimal(VertexId v1, VertexId v2) const {
- return MinId(v1) < MinId(v2);
- }
-
-public:
-
- ParallelSimpleBRFunctor(Graph& g, size_t max_length, double max_coverage, double max_relative_coverage, size_t max_delta, double max_relative_delta,
- std::function<void(EdgeId)> handler_f = 0)
- : g_(g),
- max_length_(max_length),
- max_coverage_(max_coverage),
- max_relative_coverage_(max_relative_coverage),
- max_delta_(max_delta),
- max_relative_delta_(max_relative_delta),
- handler_f_(handler_f) {
-
- }
-
- bool operator()(VertexId v/*, need number of vertex for stable id distribution*/) {
- vector<VertexId> multi_dest;
-
- {
- VertexLockT lock(v);
- multi_dest = MultiEdgeDestinations(v);
- }
-
- if (multi_dest.size() == 1 && IsMinimal(v, multi_dest.front())) {
- VertexId dest = multi_dest.front();
- if (CheckVertex(v) && CheckVertex(g_.conjugate(dest))) {
- VertexLockT lock1(v);
- VertexLockT lock2(dest);
- RemoveBulges(v);
- }
- }
- return false;
- }
-
- bool ShouldFilterConjugate() const {
- return false;
- }
-};
-
-template<class Graph>
-class CriticalEdgeMarker {
- typedef typename Graph::EdgeId EdgeId;
- typedef typename Graph::VertexId VertexId;
- typedef std::function<void(EdgeId)> HandlerF;
-
- Graph& g_;
- size_t chunk_cnt_;
- omnigraph::GraphElementMarker<EdgeId> edge_marker_;
-
- void ProcessVertex(VertexId v) {
- if (g_.OutgoingEdgeCount(v) > 0) {
- auto max_cov_it =
- std::max_element(g_.out_begin(v), g_.out_end(v), CoverageComparator<Graph>(g_));
- DEBUG("Marking edge " << g_.str(*max_cov_it));
- edge_marker_.mark(*max_cov_it);
- }
- }
-
- template<class It>
- void ProcessVertices(It begin, It end) {
- for (auto it = begin; !(it == end); ++it) {
- ProcessVertex(*it);
- }
- }
-
-public:
-
- CriticalEdgeMarker(Graph& g, size_t chunk_cnt) : g_(g), chunk_cnt_(chunk_cnt) {
- }
-
- void PutMarks() {
- auto chunk_iterators = IterationHelper<Graph, VertexId>(g_).Chunks(chunk_cnt_);
-
- #pragma omp parallel for schedule(guided)
- for (size_t i = 0; i < chunk_iterators.size() - 1; ++i) {
- ProcessVertices(chunk_iterators[i], chunk_iterators[i + 1]);
- }
- }
-
- void ClearMarks() {
- auto chunk_iterators = IterationHelper<Graph, EdgeId>(g_).Chunks(chunk_cnt_);
-
- #pragma omp parallel for schedule(guided)
- for (size_t i = 0; i < chunk_iterators.size() - 1; ++i) {
- for (auto it = chunk_iterators[i]; it != chunk_iterators[i + 1]; ++ it) {
- edge_marker_.unmark(*it);
- }
- }
- }
-private:
- DECL_LOGGER("CriticalEdgeMarker");
-};
-
-template<class Graph>
-class ParallelLowCoverageFunctor {
- typedef typename Graph::EdgeId EdgeId;
- typedef typename Graph::VertexId VertexId;
- typedef std::function<void(EdgeId)> HandlerF;
- typedef omnigraph::GraphElementLock<VertexId> VertexLockT;
-
- Graph& g_;
- typename Graph::HelperT helper_;
- pred::TypedPredicate<EdgeId> ec_condition_;
- HandlerF handler_f_;
-
- omnigraph::GraphElementMarker<EdgeId> edge_marker_;
- vector<EdgeId> edges_to_remove_;
-
- void UnlinkEdgeFromStart(EdgeId e) {
- VertexId start = g_.EdgeStart(e);
- VertexLockT lock(start);
- helper_.DeleteLink(start, e);
- }
-
- void UnlinkEdge(EdgeId e) {
- UnlinkEdgeFromStart(e);
- if (g_.conjugate(e) != e)
- UnlinkEdgeFromStart(g_.conjugate(e));
- }
-
-public:
-
- //should be launched with conjugate copies filtered
- ParallelLowCoverageFunctor(Graph& g, size_t max_length, double max_coverage, HandlerF handler_f = 0)
- : g_(g),
- helper_(g_.GetConstructionHelper()),
- ec_condition_(pred::And(pred::And(omnigraph::LengthUpperBound<Graph>(g, max_length),
- omnigraph::CoverageUpperBound<Graph>(g, max_coverage)),
- omnigraph::AlternativesPresenceCondition<Graph>(g))),
- handler_f_(handler_f) {}
-
- bool IsOfInterest(EdgeId e) const {
- return !edge_marker_.is_marked(e) && ec_condition_(e);
- }
-
- void PrepareForProcessing(size_t /*interesting_cnt*/) {
- }
-
- //no conjugate copies here!
- bool Process(EdgeId e, size_t /*idx*/) {
- if (handler_f_)
- handler_f_(e);
- DEBUG("Removing edge " << g_.str(e));
- g_.FireDeleteEdge(e);
- UnlinkEdge(e);
- helper_.DeleteUnlinkedEdge(e);
- return true;
- }
-
- bool ShouldFilterConjugate() const {
- return true;
- }
-// bool operator()(EdgeId e) {
-// if (ec_condition_->Check(e)) {
-// edges_to_remove_.push_back(e);
-// }
-// return false;
-// }
-//
-// void RemoveCollectedEdges() {
-// omnigraph::SmartSetIterator<Graph, EdgeId> to_delete(g_, edges_to_remove_.begin(), edges_to_remove_.end());
-// while (!to_delete.IsEnd()) {
-// EdgeId e = *to_delete;
-// handler_f_(e);
-// g_.DeleteEdge(e);
-// ++to_delete;
-// }
-// }
-private:
- DECL_LOGGER("ParallelLowCoverageFunctor");
-};
-
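-//Parallel compression of non-branching paths: each branching vertex starts a walk along
-//unique outgoing edges (locking one vertex at a time), the collected path is replaced by
-//a single edge with the merged sequence, and ids for the new elements are taken from a
-//pre-allocated per-task id segment.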
-template<class Graph>
-class ParallelCompressor {
- typedef typename Graph::EdgeId EdgeId;
- typedef typename Graph::EdgeData EdgeData;
- typedef typename Graph::VertexId VertexId;
- typedef omnigraph::GraphElementLock<VertexId> VertexLockT;
-
- Graph& g_;
- typename Graph::HelperT helper_;
- restricted::IdSegmentStorage segment_storage_;
-
- bool IsBranching(VertexId v) const {
-// VertexLockT lock(v);
- return !g_.CheckUniqueOutgoingEdge(v) || !g_.CheckUniqueIncomingEdge(v);
- }
-
- size_t LockingIncomingCount(VertexId v) const {
- VertexLockT lock(v);
- return g_.IncomingEdgeCount(v);
- }
-
- size_t LockingOutgoingCount(VertexId v) const {
- VertexLockT lock(v);
- return g_.OutgoingEdgeCount(v);
- }
-
- vector<VertexId> LockingNextVertices(VertexId v) const {
- VertexLockT lock(v);
- vector<VertexId> answer;
- for (EdgeId e : g_.OutgoingEdges(v)) {
- answer.push_back(g_.EdgeEnd(e));
- }
- return answer;
- }
-
- vector<VertexId> FilterBranchingVertices(const vector<VertexId>& vertices) const {
- vector<VertexId> answer;
- for (VertexId v : vertices) {
- VertexLockT lock(v);
- if (!IsBranching(v)) {
- answer.push_back(v);
- }
- }
- return answer;
- }
-
- //correctly handles self-conjugate case
- bool IsMinimal(VertexId v1, VertexId v2) const {
- return !(g_.conjugate(v2) < v1);
- }
-
- //true if we need to go further, false if we should stop for any reason
- //to_compress is not empty only if compression needs to be done
- //don't need additional checks for v == init | conjugate(init), because init is branching!
- //fixme what about plasmids?! =)
- bool ProcessNextAndGo(VertexId& v, VertexId init, vector<VertexId>& to_compress) {
- VertexLockT lock(v);
- if (!CheckConsistent(v)) {
- to_compress.clear();
- return false;
- }
- if (IsBranching(v)) {
- if (!IsMinimal(init, v)) {
- to_compress.clear();
- }
- return false;
- } else {
- to_compress.push_back(v);
- v = g_.EdgeEnd(g_.GetUniqueOutgoingEdge(v));
- return true;
- }
- }
-
- void UnlinkEdge(VertexId v, EdgeId e) {
- VertexLockT lock(v);
- helper_.DeleteLink(v, e);
- }
-
- void UnlinkEdges(VertexId v) {
- VertexLockT lock(v);
- helper_.DeleteLink(v, g_.GetUniqueOutgoingEdge(v));
- helper_.DeleteLink(g_.conjugate(v), g_.GetUniqueOutgoingEdge(g_.conjugate(v)));
- }
-
- //fixme duplication with abstract conj graph
- //not locking!
- vector<EdgeId> EdgesToDelete(const vector<EdgeId> &path) const {
- set<EdgeId> edgesToDelete;
- edgesToDelete.insert(path[0]);
- for (size_t i = 0; i + 1 < path.size(); i++) {
- EdgeId e = path[i + 1];
- if (edgesToDelete.find(g_.conjugate(e)) == edgesToDelete.end())
- edgesToDelete.insert(e);
- }
- return vector<EdgeId>(edgesToDelete.begin(), edgesToDelete.end());
- }
-
- //not locking!
- //fixme duplication with abstract conj graph
- vector<VertexId> VerticesToDelete(const vector<EdgeId> &path) const {
- set<VertexId> verticesToDelete;
- for (size_t i = 0; i + 1 < path.size(); i++) {
- EdgeId e = path[i + 1];
- VertexId v = g_.EdgeStart(e);
- if (verticesToDelete.find(g_.conjugate(v)) == verticesToDelete.end())
- verticesToDelete.insert(v);
- }
- return vector<VertexId>(verticesToDelete.begin(), verticesToDelete.end());
- }
- //todo end duplication with abstract conj graph
-
- //not locking!
- vector<EdgeId> CollectEdges(const vector<VertexId>& to_compress) const {
- vector<EdgeId> answer;
- answer.push_back(g_.GetUniqueIncomingEdge(to_compress.front()));
- for (VertexId v : to_compress) {
- answer.push_back(g_.GetUniqueOutgoingEdge(v));
- }
- return answer;
- }
-
- void CallHandlers(const vector<EdgeId>& edges, EdgeId new_edge) const {
- g_.FireMerge(edges, new_edge);
- g_.FireDeletePath(EdgesToDelete(edges), VerticesToDelete(edges));
- g_.FireAddEdge(new_edge);
- }
-
- EdgeData MergedData(const vector<EdgeId>& edges) const {
- vector<const EdgeData*> to_merge;
- for (EdgeId e : edges) {
- to_merge.push_back(&(g_.data(e)));
- }
- return g_.master().MergeData(to_merge);
- }
-
- EdgeId SyncAddEdge(VertexId v1, VertexId v2, const EdgeData& data, restricted::IdDistributor& id_distributor) {
- EdgeId new_edge = helper_.AddEdge(data, id_distributor);
- {
- VertexLockT lock(v1);
- helper_.LinkOutgoingEdge(v1, new_edge);
- }
- if (g_.conjugate(new_edge) != new_edge) {
- VertexLockT lock(v2);
- helper_.LinkIncomingEdge(v2, new_edge);
- }
- return new_edge;
- }
-
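-    //walks right from 'next', collecting the non-branching chain into to_compress,
-    //then replaces the collected path with a single merged edge and deletes the old elements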
- void ProcessBranching(VertexId next, VertexId init, size_t idx) {
- vector<VertexId> to_compress;
- while (ProcessNextAndGo(next, init, to_compress)) {
- }
-
- if (!to_compress.empty()) {
- //here we are sure that we are the ones to process the path
- //so we can collect edges without any troubles (and actually without locks todo check!)
- vector<EdgeId> edges = CollectEdges(to_compress);
-
- restricted::ListIdDistributor<restricted::SegmentIterator> id_distributor = segment_storage_.GetSegmentIdDistributor(2 * idx, 2 * idx + 1);
-
- EdgeId new_edge = SyncAddEdge(g_.EdgeStart(edges.front()), g_.EdgeEnd(edges.back()), MergeSequences(g_, edges), id_distributor);
-
- CallHandlers(edges, new_edge);
-
- VertexId final = g_.EdgeEnd(edges.back());
- UnlinkEdge(init, edges.front());
- for (VertexId v : VerticesToDelete(edges/*to_compress*/)) {
- UnlinkEdges(v);
- }
-
- if (g_.conjugate(new_edge) != new_edge) {
- UnlinkEdge(g_.conjugate(final), g_.conjugate(edges.back()));
- }
-
- for (EdgeId e : EdgesToDelete(edges)) {
- helper_.DeleteUnlinkedEdge(e);
- }
- }
- }
-
- //vertex is not consistent if the path has already been compressed or under compression right now
- //not needed here, but could check if vertex is fully isolated
- bool CheckConsistent(VertexId v) const {
- //todo change to incoming edge count
- return g_.OutgoingEdgeCount(g_.conjugate(v)) > 0;
- }
-
- //long, but safe way to get left neighbour
- //heavily relies on the current graph structure!
- VertexId LockingGetInit(VertexId v) {
- VertexLockT lock(v);
- if (!CheckConsistent(v))
- return VertexId(0);
-
- //works even if this edge is already unlinked from the vertex =)
- VERIFY(g_.CheckUniqueIncomingEdge(v));
- return g_.EdgeStart(g_.GetUniqueIncomingEdge(v));
- }
-
-public:
-
- ParallelCompressor(Graph& g)
- : g_(g),
- helper_(g_.GetConstructionHelper()) {
-
- }
-
- //returns true iff v is the "leftmost" vertex to compress in the chain
- bool IsOfInterest(VertexId v) const {
- return !IsBranching(v) && IsBranching(g_.EdgeStart(g_.GetUniqueIncomingEdge(v)));
- }
-
- void PrepareForProcessing(size_t interesting_cnt) {
- segment_storage_ = g_.GetGraphIdDistributor().Reserve(interesting_cnt * 2);
- }
-
- bool Process(VertexId v, size_t idx) {
- VertexId init = LockingGetInit(v);
- if (init != VertexId(0))
- ProcessBranching(v, init, idx);
- return false;
- }
-
- bool ShouldFilterConjugate() const {
- return false;
- }
-
-};
-
-
-//todo add conjugate filtration
-template<class Graph, class ElementType>
-class AlgorithmRunner {
- const Graph& g_;
-
- template<class Algo, class It>
- bool ProcessBucket(Algo& algo, It begin, It end) {
- bool changed = false;
- for (auto it = begin; it != end; ++it) {
- changed |= algo.Process(*it);
- }
- return changed;
- }
-
-public:
-
- const Graph& g() const {
- return g_;
- }
-
- AlgorithmRunner(Graph& g)
- : g_(g) {
-
- }
-
- template<class Algo, class ItVec>
- bool RunFromChunkIterators(Algo& algo, const ItVec& chunk_iterators) {
- DEBUG("Running from " << chunk_iterators.size() - 1 << "chunks");
- VERIFY(chunk_iterators.size() > 1);
- bool changed = false;
- #pragma omp parallel for schedule(guided) reduction(|:changed)
- for (size_t i = 0; i < chunk_iterators.size() - 1; ++i) {
- changed |= ProcessBucket(algo, chunk_iterators[i], chunk_iterators[i + 1]);
- }
- DEBUG("Finished");
- return changed;
- }
-private:
- DECL_LOGGER("AlgorithmRunner")
- ;
-};
-
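-//two-pass runner: the first parallel pass collects elements of interest per chunk,
-//the second pass processes them in parallel using stable global indices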
-template<class Graph, class ElementType>
-class TwoStepAlgorithmRunner {
- typedef typename Graph::VertexId VertexId;
- typedef typename Graph::EdgeId EdgeId;
-
- const Graph& g_;
- const bool filter_conjugate_;
- std::vector<std::vector<ElementType>> elements_of_interest_;
-
- template<class Algo>
- bool ProcessBucket(Algo& algo, const std::vector<ElementType>& bucket, size_t idx_offset) const {
- bool changed = false;
- for (ElementType el : bucket) {
- changed |= algo.Process(el, idx_offset++);
- }
- return changed;
- }
-
- template<class Algo>
- bool Process(Algo& algo) const {
- std::vector<size_t> cumulative_bucket_sizes;
- cumulative_bucket_sizes.push_back(0);
- for (const auto& bucket : elements_of_interest_) {
- cumulative_bucket_sizes.push_back(cumulative_bucket_sizes.back() + bucket.size());
- }
- DEBUG("Preparing for processing");
- algo.PrepareForProcessing(cumulative_bucket_sizes.back());
- bool changed = false;
- DEBUG("Processing buckets");
- #pragma omp parallel for schedule(guided) reduction(|:changed)
- for (size_t i = 0; i < elements_of_interest_.size(); ++i) {
- changed |= ProcessBucket(algo, elements_of_interest_[i], cumulative_bucket_sizes[i]);
- }
- return changed;
- }
-
- template<class Algo>
- void CountElement(Algo& algo, ElementType el, size_t bucket) {
- if (filter_conjugate_ && g_.conjugate(el) < el)
- return;
- if (algo.IsOfInterest(el)) {
- TRACE("Element " << g_.str(el) << " is of interest");
- elements_of_interest_[bucket].push_back(el);
- } else {
- TRACE("Element " << g_.str(el) << " is not interesting");
- }
- }
-
- template<class Algo, class It>
- void CountAll(Algo& algo, It begin, It end, size_t bucket) {
- for (auto it = begin; !(it == end); ++it) {
- CountElement(algo, *it, bucket);
- }
- }
-
-public:
-
- const Graph& g() const {
- return g_;
- }
-
- //conjugate elements are filtered based on ids
- //should be used only if both conjugate elements are simultaneously either interesting or not
- //fixme filter_conjugate is redundant
- TwoStepAlgorithmRunner(Graph& g, bool filter_conjugate)
- : g_(g),
- filter_conjugate_(filter_conjugate) {
-
- }
-
- template<class Algo, class ItVec>
- bool RunFromChunkIterators(Algo& algo, const ItVec& chunk_iterators) {
- DEBUG("Started running from " << chunk_iterators.size() - 1 << " chunks");
- VERIFY(algo.ShouldFilterConjugate() == filter_conjugate_);
- VERIFY(chunk_iterators.size() > 1);
- elements_of_interest_.clear();
- elements_of_interest_.resize(chunk_iterators.size() - 1);
- DEBUG("Searching elements of interest");
- #pragma omp parallel for schedule(guided)
- for (size_t i = 0; i < chunk_iterators.size() - 1; ++i) {
- CountAll(algo, chunk_iterators[i], chunk_iterators[i + 1], i);
- }
- DEBUG("Processing");
- return Process(algo);
- }
-
-// template<class Algo, class It>
-// void RunFromIterator(Algo& algo, It begin, It end) {
-// RunFromChunkIterators(algo, std::vector<It> { begin, end });
-// }
-private:
- DECL_LOGGER("TwoStepAlgorithmRunner")
- ;
-};
-
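-//gathers elements of interest from the chunk iterators, then processes them one by one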
-template<class Graph, class ElementType>
-class SemiParallelAlgorithmRunner {
- typedef typename Graph::VertexId VertexId;
- typedef typename Graph::EdgeId EdgeId;
-
- const Graph& g_;
-
-public:
-
- const Graph& g() const {
- return g_;
- }
-
- SemiParallelAlgorithmRunner(Graph& g)
- : g_(g) {
-
- }
-
- template<class Algo, class ItVec, class Comparator = std::less<ElementType>>
- bool RunFromChunkIterators(Algo& algo, const ItVec& chunk_iterators,
- const Comparator& comp = Comparator()) {
- VERIFY(chunk_iterators.size() > 1);
- SmartSetIterator<Graph, ElementType, Comparator> it(g_, false, comp);
-
- FillInterestingFromChunkIterators(chunk_iterators, it,
- std::bind(&Algo::IsOfInterest, std::ref(algo), std::placeholders::_1));
-
- bool changed = false;
- for (; !it.IsEnd(); ++it) {
- changed |= algo.Process(*it);
- }
- return changed;
- }
-
-private:
- DECL_LOGGER("SemiParallelAlgorithmRunner")
- ;
-};
-
-//todo generalize to use for other algorithms if needed
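-//predicate-driven edge removal intended to be run via SemiParallelAlgorithmRunner:
-//IsOfInterest checks the condition, Process deletes the edge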
-template<class Graph>
-class SemiParallelEdgeRemovingAlgorithm {
- typedef typename Graph::EdgeId EdgeId;
- typedef typename Graph::VertexId VertexId;
- Graph& g_;
- pred::TypedPredicate<EdgeId> condition_;
- EdgeRemover<Graph> edge_remover_;
-
-public:
- SemiParallelEdgeRemovingAlgorithm(Graph& g,
- pred::TypedPredicate<EdgeId> condition,
- std::function<void(EdgeId)> removal_handler = 0) :
- g_(g), condition_(condition), edge_remover_(g, removal_handler) {
- }
-
- bool IsOfInterest(EdgeId e) const {
-        return condition_(e);
- }
-
- bool Process(EdgeId e) {
- edge_remover_.DeleteEdge(e);
- return true;
- }
-};
-
-template<class Graph>
-void ParallelCompress(Graph& g, size_t chunk_cnt, bool loop_post_compression = true) {
- INFO("Parallel compression");
- debruijn::simplification::ParallelCompressor<Graph> compressor(g);
- TwoStepAlgorithmRunner<Graph, typename Graph::VertexId> runner(g, false);
- RunVertexAlgorithm(g, runner, compressor, chunk_cnt);
-
- //have to call cleaner to get rid of new isolated vertices
- CleanGraph(g, chunk_cnt);
-
- if (loop_post_compression) {
- INFO("Launching post-compression to compress loops");
- CompressAllVertices(g, chunk_cnt);
- }
-}
-
-template<class Graph>
-bool ParallelClipTips(Graph& g,
- const string& tip_condition,
- const SimplifInfoContainer& info,
- std::function<void(typename Graph::EdgeId)> removal_handler = 0) {
- INFO("Parallel tip clipping");
-
- string condition_str = tip_condition;
-
- ConditionParser<Graph> parser(g, condition_str, info);
-
- parser();
-
- debruijn::simplification::ParallelTipClippingFunctor<Graph> tip_clipper(g,
- parser.max_length_bound(), parser.max_coverage_bound(), removal_handler);
-
- AlgorithmRunner<Graph, typename Graph::VertexId> runner(g);
-
- RunVertexAlgorithm(g, runner, tip_clipper, info.chunk_cnt());
-
- ParallelCompress(g, info.chunk_cnt());
- //Cleaner is launched inside ParallelCompression
- //CleanGraph(g, info.chunk_cnt());
-
- return true;
-}
-
-//template<class Graph>
-//bool ParallelRemoveBulges(Graph& g,
-// const debruijn_config::simplification::bulge_remover& br_config,
-// size_t /*read_length*/,
-// std::function<void(typename Graph::EdgeId)> removal_handler = 0) {
-// INFO("Parallel bulge remover");
-//
-// size_t max_length = LengthThresholdFinder::MaxBulgeLength(
-// g.k(), br_config.max_bulge_length_coefficient,
-// br_config.max_additive_length_coefficient);
-//
-// DEBUG("Max bulge length " << max_length);
-//
-// debruijn::simplification::ParallelSimpleBRFunctor<Graph> bulge_remover(g,
-// max_length,
-// br_config.max_coverage,
-// br_config.max_relative_coverage,
-// br_config.max_delta,
-// br_config.max_relative_delta,
-// removal_handler);
-// for (VertexId v : g) {
-// bulge_remover(v);
-// }
-//
-// Compress(g);
-// return true;
-//}
-
-template<class Graph>
-bool ParallelEC(Graph& g,
- const string& ec_condition,
- const SimplifInfoContainer& info,
- std::function<void(typename Graph::EdgeId)> removal_handler = 0) {
- INFO("Parallel ec remover");
-
- ConditionParser<Graph> parser(g, ec_condition, info);
-
- auto condition = parser();
-
- size_t max_length = parser.max_length_bound();
- double max_coverage = parser.max_coverage_bound();
-
- debruijn::simplification::CriticalEdgeMarker<Graph> critical_marker(g, info.chunk_cnt());
- critical_marker.PutMarks();
-
- debruijn::simplification::ParallelLowCoverageFunctor<Graph> ec_remover(g,
- max_length,
- max_coverage,
- removal_handler);
-
- TwoStepAlgorithmRunner<Graph, typename Graph::EdgeId> runner(g, true);
-
- RunEdgeAlgorithm(g, runner, ec_remover, info.chunk_cnt());
-
- critical_marker.ClearMarks();
-
- ParallelCompress(g, info.chunk_cnt());
- //called in parallel compress
- //CleanGraph(g, info.chunk_cnt());
- return true;
-}
-
-template<class Graph, class AlgoRunner, class Algo>
-bool RunVertexAlgorithm(Graph& g, AlgoRunner& runner, Algo& algo, size_t chunk_cnt) {
- return runner.RunFromChunkIterators(algo, IterationHelper<Graph, VertexId>(g).Chunks(chunk_cnt));
-}
-
-template<class Graph, class AlgoRunner, class Algo>
-bool RunEdgeAlgorithm(Graph& g, AlgoRunner& runner, Algo& algo, size_t chunk_cnt) {
- return runner.RunFromChunkIterators(algo, IterationHelper<Graph, EdgeId>(g).Chunks(chunk_cnt));
-}
-
-}
-
-}
diff --git a/src/debruijn/simplification/simplification_settings.hpp b/src/debruijn/simplification/simplification_settings.hpp
deleted file mode 100644
index 6a51247..0000000
--- a/src/debruijn/simplification/simplification_settings.hpp
+++ /dev/null
@@ -1,105 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-namespace debruijn {
-
-namespace simplification {
-
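-//computes length thresholds for simplification procedures from k, read length and config coefficients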
-class LengthThresholdFinder {
-public:
- static size_t MaxTipLength(size_t read_length, size_t k, double coeff) {
- return std::max((size_t) math::round((double)std::min(k, read_length / 2) * coeff),
- read_length);
- }
-
- static size_t MaxBulgeLength(size_t k, double coeff,
- size_t additive_coeff) {
- return std::max((size_t) math::round((double)k * coeff), k + additive_coeff);
- }
-
- static size_t MaxErroneousConnectionLength(size_t k, size_t param) {
- return k + param;
- }
-
- static size_t MaxTipOriginatedECLength(size_t read_length, size_t k,
- double coeff) {
- return 2 * MaxTipLength(read_length, k, coeff) - 1;
- }
-};
-
-//todo use GenomicInfo as field!
-class SimplifInfoContainer {
- size_t read_length_;
- double detected_mean_coverage_;
- double detected_coverage_bound_;
- bool main_iteration_;
- size_t chunk_cnt_;
-
-public:
- SimplifInfoContainer() :
- read_length_(-1ul),
- detected_mean_coverage_(-1.0),
- detected_coverage_bound_(-1.0),
- main_iteration_(false),
- chunk_cnt_(-1ul) {
- }
-
- size_t read_length() const {
- VERIFY(read_length_ != -1ul);
- return read_length_;
- }
-
- double detected_mean_coverage() const {
- VERIFY(math::ge(detected_mean_coverage_, 0.));
- return detected_mean_coverage_;
- }
-
- double detected_coverage_bound() const {
- VERIFY(math::ge(detected_coverage_bound_, 0.));
- return detected_coverage_bound_;
- }
-
- bool main_iteration() const {
- return main_iteration_;
- }
-
- size_t chunk_cnt() const {
- VERIFY(chunk_cnt_ != -1ul);
- return chunk_cnt_;
- }
-
- SimplifInfoContainer& set_read_length(size_t read_length) {
- read_length_ = read_length;
- return *this;
- }
-
- SimplifInfoContainer& set_detected_coverage_bound(double detected_coverage_bound) {
- detected_coverage_bound_ = detected_coverage_bound;
- return *this;
- }
-
- SimplifInfoContainer& set_detected_mean_coverage(double detected_mean_coverage) {
- detected_mean_coverage_ = detected_mean_coverage;
- return *this;
- }
-
- SimplifInfoContainer& set_main_iteration(bool main_iteration) {
- main_iteration_ = main_iteration;
- return *this;
- }
-
- SimplifInfoContainer& set_chunk_cnt(size_t chunk_cnt) {
- chunk_cnt_ = chunk_cnt;
- return *this;
- }
-};
-
-}
-
-}
diff --git a/src/debruijn/simplification/single_cell_simplification.hpp b/src/debruijn/simplification/single_cell_simplification.hpp
deleted file mode 100644
index fd9a893..0000000
--- a/src/debruijn/simplification/single_cell_simplification.hpp
+++ /dev/null
@@ -1,110 +0,0 @@
-#pragma once
-
-#include "config_struct.hpp"
-#include "omni/erroneous_connection_remover.hpp"
-#include "omni/mf_ec_remover.hpp"
-#include "simplification_settings.hpp"
-#include "detail_coverage.hpp"
-
-namespace debruijn {
-namespace simplification {
-
-template<class Graph>
-bool TopologyRemoveErroneousEdges(
- Graph &g,
- const debruijn_graph::debruijn_config::simplification::topology_based_ec_remover& tec_config,
- std::function<void(typename Graph::EdgeId)> removal_handler) {
- INFO("Removing connections based on topology");
- size_t max_length = LengthThresholdFinder::MaxErroneousConnectionLength(
- g.k(), tec_config.max_ec_length_coefficient);
-
- pred::TypedPredicate<typename Graph::EdgeId>
- condition(DefaultUniquenessPlausabilityCondition<Graph>(g, tec_config.uniqueness_length, tec_config.plausibility_length));
-
- return omnigraph::RemoveErroneousEdgesInLengthOrder(g, condition, max_length, removal_handler);
-}
-
-template<class Graph>
-bool MultiplicityCountingRemoveErroneousEdges(
- Graph &g,
- const debruijn_graph::debruijn_config::simplification::topology_based_ec_remover& tec_config,
- std::function<void(typename Graph::EdgeId)> removal_handler) {
- INFO("Removing connections based on topological multiplicity counting");
- size_t max_length = LengthThresholdFinder::MaxErroneousConnectionLength(
- g.k(), tec_config.max_ec_length_coefficient);
-
- pred::TypedPredicate<typename Graph::EdgeId>
- condition(MultiplicityCountingCondition<Graph>(g, tec_config.uniqueness_length,
- /*plausibility*/ MakePathLengthLowerBound(g,
- PlausiblePathFinder<Graph>(g, 2 * tec_config.plausibility_length), tec_config.plausibility_length)));
-
- return omnigraph::RemoveErroneousEdgesInLengthOrder(g, condition, max_length, removal_handler);
-}
-
-template<class Graph>
-bool RemoveThorns(
- Graph &g,
- const debruijn_graph::debruijn_config::simplification::interstrand_ec_remover& isec_config,
- std::function<void(typename Graph::EdgeId)> removal_handler) {
- INFO("Removing interstrand connections");
- size_t max_length = LengthThresholdFinder::MaxErroneousConnectionLength(
- g.k(), isec_config.max_ec_length_coefficient);
-
- auto condition
- = pred::And(LengthUpperBound<Graph>(g, max_length),
- ThornCondition<Graph>(g, isec_config.uniqueness_length, isec_config.span_distance));
-
- return omnigraph::RemoveErroneousEdgesInCoverageOrder(g, condition, numeric_limits<double>::max(), removal_handler);
-}
-
-template<class Graph>
-bool TopologyReliabilityRemoveErroneousEdges(
- Graph &g,
- const debruijn_graph::debruijn_config::simplification::tr_based_ec_remover& trec_config,
- std::function<void(typename Graph::EdgeId)> removal_handler) {
- INFO("Removing connections based on topology and reliable coverage");
- size_t max_length = LengthThresholdFinder::MaxErroneousConnectionLength(
- g.k(), trec_config.max_ec_length_coefficient);
-
- auto condition
- = pred::And(CoverageUpperBound<Graph>(g, trec_config.unreliable_coverage),
- PredicateUniquenessPlausabilityCondition<Graph>(g,
- /*uniqueness*/MakePathLengthLowerBound(g, UniquePathFinder<Graph>(g), trec_config.uniqueness_length),
- /*plausibility*/pred::AlwaysTrue<typename Graph::EdgeId>()));
-
- return omnigraph::RemoveErroneousEdgesInLengthOrder(g, condition, max_length, removal_handler);
-}
-
-template<class Graph>
-bool MaxFlowRemoveErroneousEdges(
- Graph &g,
- const debruijn_graph::debruijn_config::simplification::max_flow_ec_remover& mfec_config,
- omnigraph::HandlerF<Graph> removal_handler = 0) {
- if (!mfec_config.enabled)
- return false;
- INFO("Removing connections based on max flow strategy");
- size_t max_length = LengthThresholdFinder::MaxErroneousConnectionLength(
- g.k(), (size_t) mfec_config.max_ec_length_coefficient);
- omnigraph::MaxFlowECRemover<Graph> erroneous_edge_remover(
- g, max_length, mfec_config.uniqueness_length,
- mfec_config.plausibility_length, removal_handler);
- return erroneous_edge_remover.Process();
-}
-
-template<class Graph>
-bool RemoveHiddenEC(Graph& g,
- const debruijn_graph::FlankingCoverage<Graph>& flanking_cov,
- const debruijn_graph::debruijn_config::simplification::hidden_ec_remover& her_config,
- const SimplifInfoContainer& info,
- omnigraph::HandlerF<Graph> removal_handler) {
- if (her_config.enabled) {
- INFO("Removing hidden erroneous connections");
- return HiddenECRemover<Graph>(g, her_config.uniqueness_length, flanking_cov,
- her_config.unreliability_threshold, info.detected_coverage_bound(),
- her_config.relative_threshold, removal_handler).Run();
- }
- return false;
-}
-
-}
-}
diff --git a/src/debruijn/split_path_constructor.hpp b/src/debruijn/split_path_constructor.hpp
deleted file mode 100644
index 822a90b..0000000
--- a/src/debruijn/split_path_constructor.hpp
+++ /dev/null
@@ -1,134 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-/*
- * split_path_constructor.hpp
- *
- * Created on: Jun 14, 2012
- * Author: avsirotkin
- */
-
-#pragma once
-
-#include "standard.hpp"
-#include "logger/logger.hpp"
-#include "de/paired_info.hpp"
-#include "omni/path_processor.hpp"
-#include "../include/de/paired_info.hpp"
-
-namespace debruijn_graph {
-
-template<class Graph>
-class PathInfoClass {
- public:
- typedef typename Graph::EdgeId EdgeId;
- typedef omnigraph::de::PairInfo<EdgeId> PairInfo;
-
- EdgeId base_edge;
- vector<PairInfo> path;
- PathInfoClass(): base_edge(NULL) {};
- PathInfoClass(const EdgeId Edge): base_edge(Edge) {};
- std::pair<EdgeId, double> operator[](const size_t i) const {
- if (i == 0)
- return std::make_pair(base_edge, 0.0);
-
- VERIFY(i < path.size() + 1);
- return std::make_pair(path[i-1].second, path[i-1].d());
- }
- size_t size() const { return path.size() + 1; }
- void push_back(const PairInfo& pi) { path.push_back(pi); }
- typename std::vector<PairInfo>::const_iterator begin() const { return path.begin(); }
- typename std::vector<PairInfo>::const_iterator end() const { return path.end(); }
- std::string PrintPath(const Graph& graph) const {
- std::ostringstream ss;
- ss<<" "<<graph.int_id(base_edge)<<": ";
- for (size_t j=0; j < path.size(); j++){
- ss<<"("<<graph.int_id(path[j].second)<<", "<<path[j].d()<<"), ";
- }
- return ss.str();
- }
-};
-
-template<class Graph>
-class SplitPathConstructor {
- typedef typename Graph::EdgeId EdgeId;
- typedef PathInfoClass<Graph> PathInfo;
- typedef omnigraph::de::PairInfo<EdgeId> PairInfo;
-
- public:
- SplitPathConstructor(const Graph &graph): graph_(graph) {}
-
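-    //converts the paired info of cur_edge into split paths: for each pair info a path towards
-    //the paired edge is threaded through the graph and per-edge distance estimates are recorded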
- vector<PathInfo> ConvertPIToSplitPaths(EdgeId cur_edge, const omnigraph::de::PairedInfoIndexT<Graph> &pi, double is, double is_var) const {
- vector<PairInfo> pair_infos; //TODO: this is an adaptor for the old implementation
- for (auto i : pi.Get(cur_edge))
- for (auto j : i.second)
- pair_infos.emplace_back(cur_edge, i.first, j);
-
- vector<PathInfo> result;
- if (pair_infos.empty())
- return result;
-
- vector<bool> pair_info_used(pair_infos.size());
- TRACE("Preparing path_processor for this base edge");
- size_t path_upper_bound = PairInfoPathLengthUpperBound(graph_.k(), (size_t) is, is_var);
-
- //FIXME is path_upper_bound enough?
- PathProcessor<Graph> path_processor(graph_,
- graph_.EdgeEnd(cur_edge),
- path_upper_bound);
-
- TRACE("Path_processor is done");
-
- for (size_t i = pair_infos.size(); i > 0; --i) {
- const PairInfo& cur_info = pair_infos[i - 1];
- if (math::le(cur_info.d(), 0.))
- continue;
- if (pair_info_used[i - 1])
- continue;
- DEBUG("SPC: pi " << cur_info);
- vector<EdgeId> common_part = GetCommonPathsEnd(graph_, cur_edge, cur_info.second,
- (size_t) (cur_info.d() - cur_info.var()),
- //FIXME is it a bug?!
- (size_t) (cur_info.d() - cur_info.var()),
- path_processor);
- DEBUG("Found common part of size " << common_part.size());
- PathInfoClass<Graph> sub_res(cur_edge);
- if (common_part.size() > 0) {
- size_t total_length = 0;
- for (size_t j = 0; j < common_part.size(); ++j)
- total_length += graph_.length(common_part[j]);
-
- DEBUG("Common part " << ToString(common_part));
- for (size_t j = 0; j < common_part.size(); ++j) {
- PairInfo cur_pi(cur_edge, common_part[j],
- cur_info.d() - (double) total_length,
- cur_info.weight(),
- cur_info.var());
-
- sub_res.push_back(cur_pi);
- total_length -= graph_.length(common_part[j]);
- for (size_t ind = 0; ind + 1 < i; ++ind) {
- if (cur_pi == pair_infos[ind])
- pair_info_used[ind] = true;
- }
- }
- }
-
- sub_res.push_back(cur_info);
- result.push_back(sub_res);
- DEBUG(sub_res.PrintPath(graph_));
- }
- return result;
- }
-
- private:
- const Graph &graph_;
-};
-
-
-
-}
diff --git a/src/debruijn/stage.cpp b/src/debruijn/stage.cpp
deleted file mode 100644
index 20a5b42..0000000
--- a/src/debruijn/stage.cpp
+++ /dev/null
@@ -1,133 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#include "stage.hpp"
-#include "graphio.hpp"
-
-#include "logger/log_writers.hpp"
-
-#include <algorithm>
-#include <cstring>
-
-namespace spades {
-
-void AssemblyStage::load(debruijn_graph::conj_graph_pack& gp,
- const std::string &load_from,
- const char* prefix) {
- std::string p = path::append_path(load_from, prefix == NULL ? id_ : prefix);
- INFO("Loading current state from " << p);
-
- debruijn_graph::graphio::ScanAll(p, gp, false);
- debruijn_graph::load_lib_data(p);
-}
-
-void AssemblyStage::save(const debruijn_graph::conj_graph_pack& gp,
- const std::string &save_to,
- const char* prefix) const {
- std::string p = path::append_path(save_to, prefix == NULL ? id_ : prefix);
- INFO("Saving current state to " << p);
-
- debruijn_graph::graphio::PrintAll(p, gp);
- debruijn_graph::write_lib_data(p);
-}
-
-class StageIdComparator {
- public:
- StageIdComparator(const char* id)
- : id_(id) {
- const char* pos = strstr(id, ":");
- len_ = (pos != NULL ? pos - id : strlen(id));
- }
-
- bool operator()(const std::unique_ptr<AssemblyStage> &stage) const {
- const char* sid = stage->id();
- return (0 == strncmp(id_, sid, len_) && sid[len_] == 0);
- }
-
- private:
- const char* id_;
- size_t len_;
-};
-
-class PhaseIdComparator {
- public:
- PhaseIdComparator(const char* id) {
- const char* pos = strstr(id, ":");
- VERIFY(pos != NULL);
- id_ = pos + 1;
- }
-
- bool operator()(const std::unique_ptr<CompositeStageBase::PhaseBase> &phase) const {
- return 0 == strcmp(id_, phase->id());
- }
-
- private:
- const char* id_;
-};
-
-void CompositeStageBase::run(debruijn_graph::conj_graph_pack& gp,
- const char* started_from) {
- VERIFY(parent_);
- auto start_phase = phases_.begin();
- if (started_from &&
- strstr(started_from, ":") &&
- started_from == strstr(started_from, id())) {
- start_phase = std::find_if(phases_.begin(), phases_.end(), PhaseIdComparator(started_from));
- if (start_phase == phases_.end()) {
- ERROR("Invalid start stage / phase combination specified: " << started_from);
- exit(-1);
- }
- if (start_phase != phases_.begin()) {
- PhaseBase * prev_phase = std::prev(start_phase)->get();
- std::string composite_id(id());
- composite_id += ":";
- composite_id += prev_phase->id();
- prev_phase->load(gp, parent_->saves_policy().load_from_, composite_id.c_str());
- }
- }
-
- for (auto et = phases_.end(); start_phase != et; ++start_phase) {
- PhaseBase *phase = start_phase->get();
-
- INFO("PROCEDURE == " << phase->name());
- phase->run(gp, started_from);
-
- if (parent_->saves_policy().make_saves_) {
- std::string composite_id(id());
- composite_id += ":";
- composite_id += phase->id();
-
- phase->save(gp, parent_->saves_policy().save_to_, composite_id.c_str());
- }
-
- }
-}
-
-void StageManager::run(debruijn_graph::conj_graph_pack& g,
- const char* start_from) {
- auto start_stage = stages_.begin();
- if (start_from) {
- start_stage = std::find_if(stages_.begin(), stages_.end(), StageIdComparator(start_from));
- if (start_stage == stages_.end()) {
- ERROR("Invalid start stage specified: " << start_from);
- exit(-1);
- }
- if (start_stage != stages_.begin())
- (*std::prev(start_stage))->load(g, saves_policy_.load_from_);
- }
-
- for (; start_stage != stages_.end(); ++start_stage) {
- AssemblyStage *stage = start_stage->get();
-
- INFO("STAGE == " << stage->name());
- stage->run(g, start_from);
- if (saves_policy_.make_saves_)
- stage->save(g, saves_policy_.save_to_);
- }
-}
-
-}
diff --git a/src/debruijn/stage.hpp b/src/debruijn/stage.hpp
deleted file mode 100644
index a9c07ac..0000000
--- a/src/debruijn/stage.hpp
+++ /dev/null
@@ -1,155 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#ifndef __STAGE_HPP__
-#define __STAGE_HPP__
-
-#include "graph_pack.hpp"
-
-#include <vector>
-#include <memory>
-
-namespace spades {
-
-class StageManager;
-
-class AssemblyStage {
- public:
- AssemblyStage(const char *name, const char *id)
- : name_(name), id_(id), parent_(NULL) {}
-
- virtual ~AssemblyStage() {}
-
- AssemblyStage(const AssemblyStage&) = delete;
- AssemblyStage& operator=(const AssemblyStage&) = delete;
-
- const char *name() const { return name_; }
- const char *id() const { return id_; }
-
- virtual void load(debruijn_graph::conj_graph_pack&, const std::string &load_from, const char* prefix = NULL);
- virtual void save(const debruijn_graph::conj_graph_pack&, const std::string &save_to, const char* prefix = NULL) const;
- virtual void run(debruijn_graph::conj_graph_pack&, const char* started_from = NULL) = 0;
-
- private:
- const char *name_;
- const char *id_;
-
- protected:
- const StageManager *parent_;
-
- friend class StageManager;
-};
-
-class CompositeStageBase : public AssemblyStage {
- public:
- class PhaseBase : public AssemblyStage {
- public:
- PhaseBase(const char *name, const char *id)
- : AssemblyStage(name, id), parent_stage_(NULL) {}
- protected:
- CompositeStageBase *parent_stage_;
-
- friend class CompositeStageBase;
- };
-
- CompositeStageBase(const char *name, const char *id)
- : AssemblyStage(name, id) {}
-
- CompositeStageBase* add(PhaseBase *phase) {
- phases_.push_back(std::unique_ptr<PhaseBase>(phase));
- phase->parent_stage_ = this;
-
- return this;
- }
-
- CompositeStageBase* add(std::initializer_list<PhaseBase*> phases) {
- for (auto it = phases.begin(), et = phases.end(); it != et; ++it)
- add(*it);
-
- return this;
- }
-
- void run(debruijn_graph::conj_graph_pack& gp, const char* = NULL);
-
- private:
- std::vector<std::unique_ptr<PhaseBase> > phases_;
-};
-
-template<class Storage>
-class CompositeStage : public CompositeStageBase {
- public:
- class Phase : public PhaseBase {
- public:
- Phase(const char *name, const char *id)
- : PhaseBase(name, id) {}
-
- CompositeStage<Storage>* parent() { return static_cast<CompositeStage<Storage>*>(parent_stage_); }
- const CompositeStage<Storage>* parent() const { return static_cast<const CompositeStage<Storage>*>(parent_stage_); }
-
- Storage &storage() { return parent()->storage(); }
- const Storage &storage() const { return parent()->storage(); }
- };
-
- CompositeStage(const char *name, const char *id)
- : CompositeStageBase(name, id) {}
-
- Storage &storage() { return storage_; }
- const Storage &storage() const { return storage_; }
-
- private:
- Storage storage_;
-};
-
-class StageManager {
-
- public:
- struct SavesPolicy {
- bool make_saves_;
- std::string load_from_;
- std::string save_to_;
-
- SavesPolicy()
- : make_saves_(false), load_from_(""), save_to_("") {}
-
- SavesPolicy(bool make_saves, const std::string &load_from, const std::string &save_to)
- : make_saves_(make_saves), load_from_(load_from), save_to_(save_to) {}
- };
-
- StageManager(SavesPolicy policy = SavesPolicy())
- : saves_policy_(policy) {}
-
- StageManager& add(AssemblyStage *stage) {
- stages_.push_back(std::unique_ptr<AssemblyStage>(stage));
- stages_.back()->parent_ = this;
-
- return *this;
- }
- StageManager& add(std::initializer_list<AssemblyStage*> stages) {
- for (auto it = stages.begin(), et = stages.end(); it != et; ++it)
- add(*it);
-
- return *this;
- }
-
- void run(debruijn_graph::conj_graph_pack& g,
- const char* start_from = NULL);
-
- const SavesPolicy& saves_policy() const {
- return saves_policy_;
- }
-
- private:
- std::vector<std::unique_ptr<AssemblyStage> > stages_;
- SavesPolicy saves_policy_;
-
- DECL_LOGGER("StageManager");
-};
-
-
-};
-
-#endif // __STAGE_HPP__
diff --git a/src/debruijn/standard.hpp b/src/debruijn/standard.hpp
deleted file mode 100644
index 7d7d6dc..0000000
--- a/src/debruijn/standard.hpp
+++ /dev/null
@@ -1,22 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include "standard_base.hpp"
-
-// utils
-#include "cpp_utils.hpp"
-#include "path_helper.hpp"
-
-#include "simple_tools.hpp"
-
-// io
-#include "io/ireader.hpp"
-#include "io/converting_reader_wrapper.hpp"
-
-#include "runtime_k.hpp"
diff --git a/src/debruijn/stats/chimera_stats.hpp b/src/debruijn/stats/chimera_stats.hpp
deleted file mode 100644
index d69ade8..0000000
--- a/src/debruijn/stats/chimera_stats.hpp
+++ /dev/null
@@ -1,265 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include "statistics.hpp"
-
-namespace debruijn_graph {
-
-namespace stats {
-
-template<class Graph>
-class ChimericEdgeClassifier {
- typedef typename Graph::EdgeId EdgeId;
- typedef typename Graph::VertexId VertexId;
-
- const Graph& g_;
- size_t length_bound_;
- const EdgeQuality<Graph>& edge_qual_;
- bool real_edges_mode_;
-
- template<class EdgeContainer>
- vector<EdgeId> FilterNotEqual(const EdgeContainer& edges,
- EdgeId edge) const {
- vector<EdgeId> answer;
- for (EdgeId e : edges) {
- if (e != edge) {
- answer.push_back(e);
- }
- }
- return answer;
- }
-
- bool TopologyAndQualCheck(const vector<EdgeId>& edges) const {
- return edges.size() == 1 && edge_qual_.IsPositiveQuality(edges.front());
- }
-
- bool TopologyAndQualCheck(VertexId v, EdgeId e) const {
- return TopologyAndQualCheck(
- FilterNotEqual(g_.OutgoingEdges(v), e))
- && TopologyAndQualCheck(
- FilterNotEqual(g_.IncomingEdges(v), e));
- }
-
- bool TopologyAndQualCheck(EdgeId e) const {
- return TopologyAndQualCheck(g_.EdgeStart(e), e)
- && TopologyAndQualCheck(g_.EdgeEnd(e), e);
- }
-
-public:
- ChimericEdgeClassifier(const Graph& g, size_t length_bound, const EdgeQuality<Graph>& edge_qual, bool real_edges_mode = false)
- : g_(g),
- length_bound_(length_bound),
- edge_qual_(edge_qual),
- real_edges_mode_(real_edges_mode) {
- }
-
- bool IsTrivialChimeric(EdgeId e) const {
- bool correct_qual = real_edges_mode_ ? edge_qual_.IsPositiveQuality(e) : edge_qual_.IsZeroQuality(e);
- return correct_qual && g_.length(e) <= length_bound_
- && TopologyAndQualCheck(e);
- }
-
-private:
- DECL_LOGGER("ChimericEdgeClassifier");
-};
-
-template<class Graph>
-class InterstrandAnalyzer {
- const static size_t infinity = -1u;
- typedef typename Graph::EdgeId EdgeId;
- typedef typename Graph::VertexId VertexId;
-
- const Graph& g_;
- size_t dist_bound_;
- const MappingPath<EdgeId> genome_path_;
-
- bool Relax(size_t& a, size_t b) const {
- if (b < a) {
- a = b;
- return true;
- }
- return false;
- }
-
- size_t GenomicDistance(size_t genome_path_pos, EdgeId e2,
- size_t distance_bound) const {
- for (size_t i = genome_path_pos + 1; i < genome_path_.size(); ++i) {
- int gap =
- (int)(genome_path_[i].second.initial_range.start_pos
- - genome_path_[genome_path_pos].second.initial_range.end_pos);
- VERIFY(gap >= 0);
- if (size_t(gap) > distance_bound)
- return infinity;
- if (genome_path_[i].first == e2)
- return gap;
- }
- return infinity;
- }
-
- size_t ShortestGenomicDistance(EdgeId e1, EdgeId e2,
- size_t distance_bound) const {
- size_t best = infinity;
- for (size_t i = 0; i < genome_path_.size(); ++i) {
- if (genome_path_[i].first == e1) {
- Relax(best, GenomicDistance(i, e2, distance_bound));
- }
- }
- return best;
- }
-
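-    //estimates the interstrand distance around e via the unique edges flanking it,
-    //taking the shortest genomic distance from a flanking edge to the conjugate of the other one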
- size_t InnerInterstrandDistance(EdgeId e) const {
- size_t answer = infinity;
- EdgeId e1 = g_.GetUniqueIncomingEdge(g_.EdgeStart(e));
- EdgeId e2 = g_.GetUniqueOutgoingEdge(g_.EdgeEnd(e));
- if (g_.length(e2) > dist_bound_)
-            return infinity;
- Relax(answer,
- ShortestGenomicDistance(e1, g_.conjugate(e2),
- dist_bound_ - g_.length(e2)));
- Relax(answer,
- ShortestGenomicDistance(e2, g_.conjugate(e1),
- dist_bound_ - g_.length(e2)));
- return answer + g_.length(e2);
- }
-
-
-public:
- InterstrandAnalyzer(const Graph& g, size_t dist_bound, const MappingPath<EdgeId> genome_path)
- : g_(g),
- dist_bound_(dist_bound),
- genome_path_(genome_path) {
- }
-
- //todo rewrite and think of additionally detecting thorns with no path
- //returns -1u if no interstrand path or interstrand distance > dist_bound
- size_t InterstrandDistance(EdgeId e) const {
- size_t answer = infinity;
- Relax(answer, InnerInterstrandDistance(e));
- Relax(answer, InnerInterstrandDistance(g_.conjugate(e)));
- //todo maybe unnecessary check
- return answer <= dist_bound_ ? answer : -1u;
- }
-
-private:
- DECL_LOGGER("InterstrandAnalyzer");
-};
-
-template<class Graph>
-class ChimericEdgeStats {
- const static size_t infinity = -1u;
- typedef typename Graph::EdgeId EdgeId;
- typedef typename Graph::VertexId VertexId;
-
- const Graph& g_;
- const ChimericEdgeClassifier<Graph>& chimeric_edge_classifier_;
- const InterstrandAnalyzer<Graph>& interstrand_analyzer_;
- ostream& out_;
-
-protected:
- virtual string Head() {
- std::stringstream ss;
- ss << "int_id\t"
- << "length\t"
- << "coverage\t"
- << "interstrand_dist"
- << endl;
- return ss.str();
- }
-
- virtual string ReportChimera(EdgeId e, size_t interstrand_dist) {
- std::stringstream ss;
- ss << g_.int_id(e) << "\t"
- << g_.length(e) << "\t"
- << g_.coverage(e) << "\t";
- if (interstrand_dist < infinity) {
- ss << interstrand_dist;
- } else {
- ss << -1;
- }
- ss << endl;
- return ss.str();
- }
-
- const Graph& g() const {
- return g_;
- }
-
-public:
- ChimericEdgeStats(const Graph& g,
- const ChimericEdgeClassifier<Graph>& chimeric_edge_classifier,
- const InterstrandAnalyzer<Graph>& interstrand_analyzer,
- ostream& out)
- : g_(g),
- chimeric_edge_classifier_(chimeric_edge_classifier),
- interstrand_analyzer_(interstrand_analyzer),
- out_(out) {
- }
-
- virtual ~ChimericEdgeStats() {
- }
-
- void operator()() {
- out_ << Head() << endl;
- set<EdgeId> visited;
- for (auto it = g_.SmartEdgeBegin(); !it.IsEnd(); ++it) {
- if (visited.count(*it) > 0)
- continue;
- visited.insert(*it);
- visited.insert(g_.conjugate(*it));
- if (chimeric_edge_classifier_.IsTrivialChimeric(*it)) {
- out_ << ReportChimera(*it, interstrand_analyzer_.InterstrandDistance(*it)) << endl;
- }
- }
- }
-};
-
-template<class Graph>
-class ChimeraRelativeCoverageStats : public ChimericEdgeStats<Graph> {
- typedef ChimericEdgeStats<Graph> base;
- typedef typename Graph::EdgeId EdgeId;
- typedef typename Graph::VertexId VertexId;
- typedef std::function<double(EdgeId, VertexId)> LocalCoverageFT;
-
- simplification::relative_coverage::RelativeCoverageHelper<Graph> rel_helper_;
-
- double RelativeCoverage(VertexId v, EdgeId base_edge) {
- return rel_helper_.RelativeCoverageToReport(v, rel_helper_.LocalCoverage(base_edge, v));
- }
-
-public:
- ChimeraRelativeCoverageStats(const Graph& g,
- const ChimericEdgeClassifier<Graph>& edge_classifier,
- const InterstrandAnalyzer<Graph>& interstrand_analyzer,
- LocalCoverageFT local_coverage_f,
- ostream& out)
- : base(g, edge_classifier, interstrand_analyzer, out),
- rel_helper_(g, local_coverage_f, 2.0/*any value works here*/) {
- }
-
-protected:
- virtual string Head() {
- return base::Head() + "\tmin_rel_cov\tmax_rel_cov";
- }
-
- virtual string ReportChimera(EdgeId e, size_t interstrand_dist) {
- double start_cov = RelativeCoverage(this->g().EdgeStart(e), e);
- double end_cov = RelativeCoverage(this->g().EdgeEnd(e), e);
- stringstream ss;
- ss << base::ReportChimera(e, interstrand_dist) << "\t"
- << std::min(start_cov, end_cov) << "\t"
- << std::max(start_cov, end_cov);
- return ss.str();
- }
-
-private:
- DECL_LOGGER("ChimeraRelativeCoverageStats");
-};
-
-}
-}
diff --git a/src/debruijn/stats/debruijn_stats.hpp b/src/debruijn/stats/debruijn_stats.hpp
deleted file mode 100644
index 73d04c9..0000000
--- a/src/debruijn/stats/debruijn_stats.hpp
+++ /dev/null
@@ -1,417 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include "statistics.hpp"
-#include "debruijn_graph.hpp"
-
-#include "graph_pack.hpp"
-#include "sequence_mapper.hpp"
-#include "graphio.hpp"
-#include "positions.hpp"
-
-#include "omni/visualization/visualization.hpp"
-#include "omni/edges_position_handler.hpp"
-#include "omni/graph_component.hpp"
-#include "io/rc_reader_wrapper.hpp"
-#include "io/delegating_reader_wrapper.hpp"
-#include "io/io_helper.hpp"
-#include "io/wrapper_collection.hpp"
-#include "io/osequencestream.hpp"
-#include "dataset_readers.hpp"
-#include "copy_file.hpp"
-
-#include <boost/algorithm/string.hpp>
-
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <cmath>
-
-namespace debruijn_graph {
-
-namespace stats {
-
-template<class Graph, class Index>
-MappingPath<typename Graph::EdgeId>
-FindGenomeMappingPath(const Sequence& genome, const Graph& g,
- const Index& index,
- const KmerMapper<Graph>& kmer_mapper) {
- NewExtendedSequenceMapper<Graph, Index> srt(g, index, kmer_mapper);
- return srt.MapSequence(genome);
-}
-
-template<class graph_pack>
-MappingPath<typename graph_pack::graph_t::EdgeId>
-FindGenomeMappingPath(const Sequence& genome, const graph_pack& gp) {
- return FindGenomeMappingPath(genome, gp.g, gp.index, gp.kmer_mapper);
-}
-
-template <class graph_pack>
-shared_ptr<omnigraph::visualization::GraphColorer<Graph>> DefaultColorer(const graph_pack& gp) {
- return omnigraph::visualization::DefaultColorer(gp.g,
- FindGenomeMappingPath(gp.genome.GetSequence(), gp.g, gp.index, gp.kmer_mapper).path(),
- FindGenomeMappingPath(!gp.genome.GetSequence(), gp.g, gp.index, gp.kmer_mapper).path());
-}
-
-template <class graph_pack>
-void CollectContigPositions(graph_pack &gp) {
- if (!cfg::get().pos.contigs_for_threading.empty() &&
- path::FileExists(cfg::get().pos.contigs_for_threading))
- FillPos(gp, cfg::get().pos.contigs_for_threading, "thr_", true);
-
- if (!cfg::get().pos.contigs_to_analyze.empty() &&
- path::FileExists(cfg::get().pos.contigs_to_analyze))
- FillPos(gp, cfg::get().pos.contigs_to_analyze, "anlz_", true);
-}
-
-template<class Graph, class Index>
-class GenomeMappingStat: public AbstractStatCounter {
- private:
- typedef typename Graph::EdgeId EdgeId;
- const Graph &graph_;
- const Index& index_;
- Sequence genome_;
- size_t k_;
- public:
- GenomeMappingStat(const Graph &graph, const Index &index, GenomeStorage genome, size_t k) :
- graph_(graph), index_(index), genome_(genome.GetSequence()), k_(k) {}
-
- virtual ~GenomeMappingStat() {}
-
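-    //threads the reference genome through the index and reports (k+1)-mer coverage,
-    //the number of contiguous covered stretches and continuity failures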
- virtual void Count() {
- INFO("Mapping genome");
- size_t break_number = 0;
- size_t covered_kp1mers = 0;
- size_t fail = 0;
- if (genome_.size() <= k_)
- return;
-
- runtime_k::RtSeq cur = genome_.start<runtime_k::RtSeq>(k_ + 1);
- cur >>= 0;
- bool breaked = true;
- pair<EdgeId, size_t> cur_position;
- for (size_t cur_nucl = k_; cur_nucl < genome_.size(); cur_nucl++) {
- cur <<= genome_[cur_nucl];
- if (index_.contains(cur)) {
- pair<EdgeId, size_t> next = index_.get(cur);
- if (!breaked
- && cur_position.second + 1
- < graph_.length(cur_position.first)) {
- if (next.first != cur_position.first
- || cur_position.second + 1 != next.second) {
- fail++;
- }
- }
- cur_position = next;
- covered_kp1mers++;
- breaked = false;
- } else {
- if (!breaked) {
- breaked = true;
- break_number++;
- }
- }
- }
- INFO("Genome mapped");
- INFO("Genome mapping results:");
- INFO("Covered k+1-mers:" << covered_kp1mers << " of " << (genome_.size() - k_) << " which is "
- << (100.0 * (double) covered_kp1mers / (double) (genome_.size() - k_)) << "%");
- INFO("Covered k+1-mers form " << break_number + 1 << " contigious parts");
- INFO("Continuity failtures " << fail);
- }
-};
-
-template<class Graph>
-void WriteErrorLoc(const Graph &g,
- const string& folder_name,
- std::shared_ptr<omnigraph::visualization::GraphColorer<Graph>> genome_colorer,
- const omnigraph::GraphLabeler<Graph>& labeler) {
- INFO("Writing error localities for graph to folder " << folder_name);
- GraphComponent<Graph> all(g, g.begin(), g.end());
- set<typename Graph::EdgeId> edges = genome_colorer->ColoredWith(all.edges().begin(),
- all.edges().end(), "black");
- set<typename Graph::VertexId> to_draw;
- for (auto it = edges.begin(); it != edges.end(); ++it) {
- to_draw.insert(g.EdgeEnd(*it));
- to_draw.insert(g.EdgeStart(*it));
- }
- shared_ptr<GraphSplitter<Graph>> splitter = StandardSplitter(g, to_draw);
- WriteComponents(g, folder_name, splitter, genome_colorer, labeler);
- INFO("Error localities written written to folder " << folder_name);
-}
-
-template<class graph_pack>
-void CountStats(const graph_pack& gp) {
- typedef typename graph_pack::graph_t Graph;
- typedef typename Graph::EdgeId EdgeId;
- INFO("Counting stats");
- StatList stats;
- Path<EdgeId> path1 = FindGenomeMappingPath(gp.genome.GetSequence(), gp.g, gp.index,
- gp.kmer_mapper).path();
- Path<EdgeId> path2 = FindGenomeMappingPath(!gp.genome.GetSequence(), gp.g, gp.index,
- gp.kmer_mapper).path();
- stats.AddStat(new VertexEdgeStat<Graph>(gp.g));
- stats.AddStat(new BlackEdgesStat<Graph>(gp.g, path1, path2));
- stats.AddStat(new NStat<Graph>(gp.g, path1, 50));
- stats.AddStat(new SelfComplementStat<Graph>(gp.g));
- stats.AddStat(
- new GenomeMappingStat<Graph, Index>(gp.g, gp.index,
- gp.genome, gp.k_value));
- stats.AddStat(new IsolatedEdgesStat<Graph>(gp.g, path1, path2));
- stats.Count();
- INFO("Stats counted");
-}
-
-template<class Graph>
-void WriteGraphComponentsAlongGenome(const Graph& g,
- const GraphLabeler<Graph>& labeler,
- const string& folder,
- const Path<typename Graph::EdgeId>& path1,
- const Path<typename Graph::EdgeId>& path2) {
- INFO("Writing graph components along genome");
-
- make_dir(folder);
- omnigraph::visualization::WriteComponentsAlongPath(g, path1, folder, omnigraph::visualization::DefaultColorer(g, path1, path2), labeler);
-
- INFO("Writing graph components along genome finished");
-}
-
-//todo refactoring needed: use graph pack instead!!!
-template<class Graph, class Mapper>
-void WriteGraphComponentsAlongContigs(const Graph& g,
- Mapper &mapper,
- const std::string& folder,
- std::shared_ptr<omnigraph::visualization::GraphColorer<Graph>> colorer,
- const GraphLabeler<Graph>& labeler) {
- INFO("Writing graph components along contigs");
- auto contigs_to_thread = io::EasyStream(cfg::get().pos.contigs_to_analyze, false);
- contigs_to_thread->reset();
- io::SingleRead read;
- while (!contigs_to_thread->eof()) {
- (*contigs_to_thread) >> read;
- make_dir(folder + read.name());
- omnigraph::visualization::WriteComponentsAlongPath(g, mapper.MapSequence(read.sequence()).simple_path(), folder + read.name() + "/",
- colorer, labeler);
- }
- INFO("Writing graph components along contigs finished");
-}
-
-template<class Graph>
-void WriteKmerComponent(conj_graph_pack &gp, runtime_k::RtSeq const& kp1mer, const std::string& file,
- std::shared_ptr<omnigraph::visualization::GraphColorer<Graph>> colorer,
- const omnigraph::GraphLabeler<Graph>& labeler) {
- if(!gp.index.contains(kp1mer)) {
- WARN("no such kmer in the graph");
- return;
- }
- VERIFY(gp.index.contains(kp1mer));
- auto pos = gp.index.get(kp1mer);
- typename Graph::VertexId v = pos.second * 2 < gp.g.length(pos.first) ? gp.g.EdgeStart(pos.first) : gp.g.EdgeEnd(pos.first);
- GraphComponent<Graph> component = omnigraph::VertexNeighborhood<Graph>(gp.g, v);
- omnigraph::visualization::WriteComponent<Graph>(component, file, colorer, labeler);
-}
-
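-//searches up to magic_const positions around genome_pos (in both directions)
-//for a (k+1)-mer that is present in the graph index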
-inline
-optional<runtime_k::RtSeq> FindCloseKP1mer(const conj_graph_pack &gp,
- size_t genome_pos, size_t k) {
- VERIFY(gp.genome.size() > 0);
- VERIFY(genome_pos < gp.genome.size());
- static const size_t magic_const = 200;
- for (size_t diff = 0; diff < magic_const; diff++) {
- for (int dir = -1; dir <= 1; dir += 2) {
- size_t pos = (gp.genome.size() - k + genome_pos + dir * diff) % (gp.genome.size() - k);
- runtime_k::RtSeq kp1mer = gp.kmer_mapper.Substitute(
- runtime_k::RtSeq (k + 1, gp.genome.GetSequence(), pos));
- if (gp.index.contains(kp1mer))
- return optional<runtime_k::RtSeq>(kp1mer);
- }
- }
- return boost::none;
-}
-
-inline
-void PrepareForDrawing(conj_graph_pack &gp) {
- gp.EnsureDebugInfo();
- CollectContigPositions(gp);
-}
-
-
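-//dumps statistics, saves and pictures for the current graph state, as configured via info_printers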
-struct detail_info_printer {
- detail_info_printer(conj_graph_pack &gp,
- const omnigraph::GraphLabeler<Graph>& labeler,
- const string& folder)
- : gp_(gp),
- labeler_(labeler),
- folder_(folder) {
- }
-
- void operator() (info_printer_pos pos,
- const string& folder_suffix = "") {
- string pos_name = details::info_printer_pos_name(pos);
-
- ProduceDetailedInfo(pos_name + folder_suffix, pos);
- }
-
- private:
-
- void ProduceDetailedInfo(const string &pos_name,
- info_printer_pos pos) {
- static size_t call_cnt = 0;
-
- auto it = cfg::get().info_printers.find(pos);
- VERIFY(it != cfg::get().info_printers.end());
-
- const debruijn_config::info_printer & config = it->second;
-
- if (config.basic_stats) {
- VertexEdgeStat<conj_graph_pack::graph_t> stats(gp_.g);
- INFO("Number of vertices : " << stats.vertices() << ", number of edges : "
- << stats.edges() << ", sum length of edges : " << stats.edge_length());
- }
-
- if (config.save_full_graph) {
- string saves_folder = path::append_path(path::append_path(folder_, "saves/"),
- ToString(call_cnt++, 2) + "_" + pos_name + "/");
- path::make_dirs(saves_folder);
- graphio::ConjugateDataPrinter<conj_graph_pack::graph_t> printer(gp_.g);
- graphio::PrintBasicGraph(saves_folder + "graph", printer);
- }
-
- if (config.extended_stats) {
- VERIFY(cfg::get().developer_mode);
- CountStats(gp_);
- }
-
- if (!(config.write_error_loc ||
- config.write_full_graph ||
- config.write_full_nc_graph ||
- config.write_components ||
- !config.components_for_kmer.empty() ||
- config.write_components_along_genome ||
- config.write_components_along_contigs ||
- !config.components_for_genome_pos.empty())) {
- return;
- }
-
- VERIFY(cfg::get().developer_mode);
- string pics_folder = path::append_path(path::append_path(folder_, "pictures/"),
- ToString(call_cnt++, 2) + "_" + pos_name + "/");
- path::make_dirs(pics_folder);
- PrepareForDrawing(gp_);
-
- auto path1 = FindGenomeMappingPath(gp_.genome.GetSequence(), gp_.g, gp_.index,
- gp_.kmer_mapper).path();
-
- auto colorer = DefaultColorer(gp_);
-
- if (config.write_error_loc) {
- make_dir(pics_folder + "error_loc/");
- WriteErrorLoc(gp_.g, pics_folder + "error_loc/", colorer, labeler_);
- }
-
- if (config.write_full_graph) {
- WriteComponent(GraphComponent<Graph>(gp_.g, gp_.g.begin(), gp_.g.end()), pics_folder + "full_graph.dot", colorer, labeler_);
- }
-
- if (config.write_full_nc_graph) {
- WriteSimpleComponent(GraphComponent<Graph>(gp_.g, gp_.g.begin(), gp_.g.end()), pics_folder + "nc_full_graph.dot", colorer, labeler_);
- }
-
- if (config.write_components) {
- make_dir(pics_folder + "components/");
- omnigraph::visualization::WriteComponents(gp_.g, pics_folder + "components/", omnigraph::ReliableSplitter<Graph>(gp_.g), colorer, labeler_);
- }
-
- if (!config.components_for_kmer.empty()) {
- string kmer_folder = path::append_path(pics_folder, "kmer_loc/");
- make_dir(kmer_folder);
- auto kmer = runtime_k::RtSeq(gp_.k_value + 1, config.components_for_kmer.substr(0, gp_.k_value + 1).c_str());
- string file_name = path::append_path(kmer_folder, pos_name + ".dot");
- WriteKmerComponent(gp_, kmer, file_name, colorer, labeler_);
- }
-
- if (config.write_components_along_genome) {
- make_dir(pics_folder + "along_genome/");
- omnigraph::visualization::WriteComponentsAlongPath(gp_.g, path1.sequence(), pics_folder + "along_genome/", colorer, labeler_);
- }
-
- if (config.write_components_along_contigs) {
- make_dir(pics_folder + "along_contigs/");
- NewExtendedSequenceMapper<Graph, Index> mapper(gp_.g, gp_.index, gp_.kmer_mapper);
- WriteGraphComponentsAlongContigs(gp_.g, mapper, pics_folder + "along_contigs/", colorer, labeler_);
- }
-
- if (!config.components_for_genome_pos.empty()) {
- string pos_loc_folder = path::append_path(pics_folder, "pos_loc/");
- make_dir(pos_loc_folder);
- vector<string> positions;
- boost::split(positions, config.components_for_genome_pos,
- boost::is_any_of(" ,"), boost::token_compress_on);
- for (auto it = positions.begin(); it != positions.end(); ++it) {
- boost::optional<runtime_k::RtSeq> close_kp1mer = FindCloseKP1mer(gp_,
- boost::lexical_cast<int>(*it), gp_.k_value);
- if (close_kp1mer) {
- string locality_folder = path::append_path(pos_loc_folder, *it + "/");
- make_dir(locality_folder);
- WriteKmerComponent(gp_, *close_kp1mer, path::append_path(locality_folder, pos_name + ".dot"), colorer, labeler_);
- } else {
-                    WARN("Failed to find a genome (k+1)-mer close to the one at position " << *it
-                         << " in the graph; the (k+1)-mer there is "
-                         << runtime_k::RtSeq(gp_.k_value + 1, gp_.genome.GetSequence(), boost::lexical_cast<int>(*it)));
- }
- }
- }
- }
-
- conj_graph_pack& gp_;
- const omnigraph::GraphLabeler<Graph>& labeler_;
- string folder_;
-};
-
-inline
-std::string ConstructComponentName(std::string file_name, size_t cnt) {
- stringstream ss;
- ss << cnt;
- string res = file_name;
- res.insert(res.length(), ss.str());
- return res;
-}
-
-template<class Graph>
-double AvgCoverage(const Graph& g,
- const std::vector<typename Graph::EdgeId>& edges) {
- double total_cov = 0.;
- size_t total_length = 0;
- for (auto it = edges.begin(); it != edges.end(); ++it) {
- total_cov += g.coverage(*it) * (double) g.length(*it);
- total_length += g.length(*it);
- }
- return total_cov / (double) total_length;
-}
-
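-//computes the Nx length statistic over all graph edges (percent = 50 gives N50)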
-template<class Graph>
-size_t Nx(Graph &g, double percent) {
- size_t sum_edge_length = 0;
- vector<size_t> lengths;
- for (auto iterator = g.ConstEdgeBegin(); !iterator.IsEnd(); ++iterator) {
- lengths.push_back(g.length(*iterator));
- sum_edge_length += g.length(*iterator);
- }
- sort(lengths.begin(), lengths.end());
- double len_perc = (1.0 - percent * 0.01) * (double) (sum_edge_length);
- for (size_t i = 0; i < lengths.size(); i++) {
- if (lengths[i] >= len_perc)
- return lengths[i];
- else
- len_perc -= (double) lengths[i];
- }
- return 0;
-}
-
-}
-}
diff --git a/src/debruijn/stats/statistics.hpp b/src/debruijn/stats/statistics.hpp
deleted file mode 100644
index 9cb73ba..0000000
--- a/src/debruijn/stats/statistics.hpp
+++ /dev/null
@@ -1,272 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include "standard.hpp"
-#include "omni/omni_tools.hpp"
-
-#include "simple_tools.hpp"
-#include "xmath.h"
-#include "config_struct.hpp"
-
-#include <iostream>
-#include <fstream>
-#include <map>
-
-namespace debruijn_graph {
-namespace stats {
-
-using namespace math;
-using namespace omnigraph;
-
-class AbstractStatCounter {
- public:
- AbstractStatCounter() {
- }
-
- virtual ~AbstractStatCounter() {
- }
-
- virtual void Count() = 0;
- //protected:
- // DECL_LOGGER("StatCounter")
-};
-
-class StatList: AbstractStatCounter {
- private:
- vector<AbstractStatCounter*> to_count_;
- public:
- StatList(vector<AbstractStatCounter*> to_count =
- vector<AbstractStatCounter*>()) :
- to_count_(to_count) {
- }
-
- virtual ~StatList() {
- }
-
- void AddStat(AbstractStatCounter* new_stat) {
- to_count_.push_back(new_stat);
- }
-
- const vector<AbstractStatCounter*> stats() {
- return to_count_;
- }
-
- virtual void Count() {
- for (size_t i = 0; i < to_count_.size(); i++) {
- to_count_[i]->Count();
- }
- }
-
- void DeleteStats() {
- for (size_t i = 0; i < to_count_.size(); i++)
- delete to_count_[i];
- to_count_.clear();
- }
-};
-
-template<class Graph>
-class VertexEdgeStat: public AbstractStatCounter {
- private:
- const Graph &graph_;
- public:
- VertexEdgeStat(const Graph &graph) :
- graph_(graph) {
- }
-
- virtual ~VertexEdgeStat() {
- }
-
- size_t vertices() {
- return graph_.size();
- }
-
- size_t edges() {
- size_t edgeNumber = 0;
- size_t sum_edge_length = 0;
- for (auto iterator = graph_.ConstEdgeBegin(); !iterator.IsEnd();
- ++iterator) {
- edgeNumber++;
- // if (graph_.coverage(*iterator) > 30) {
- sum_edge_length += graph_.length(*iterator);
- // }
- }
- return edgeNumber;
- }
-
- size_t edge_length() {
- size_t sum_edge_length = 0;
- for (auto iterator = graph_.ConstEdgeBegin(); !iterator.IsEnd();
- ++iterator) {
- if (graph_.coverage(*iterator) > 30) {
- sum_edge_length += graph_.length(*iterator);
- }
- }
- return sum_edge_length;
- }
-
- virtual void Count() {
- INFO(
- "Vertex count=" << vertices() << "; Edge count=" << edges());
- INFO(
-                "Sum length of edges with coverage > 30: " << edge_length());
- }
-};
-
-template<class Graph>
-class BlackEdgesStat: public AbstractStatCounter {
- private:
- typedef typename Graph::EdgeId EdgeId;
- const Graph &graph_;
- Path<EdgeId> path1_;
- Path<EdgeId> path2_;
- public:
- BlackEdgesStat(const Graph &graph, Path<EdgeId> path1, Path<EdgeId> path2) :
- graph_(graph), path1_(path1), path2_(path2) {
- }
-
- virtual ~BlackEdgesStat() {
- }
-
- virtual void Count() {
- size_t black_count = 0;
- size_t edge_count = 0;
- const vector<EdgeId> path_edges1 = path1_.sequence();
- const vector<EdgeId> path_edges2 = path2_.sequence();
- set<EdgeId> colored_edges;
- colored_edges.insert(path_edges1.begin(), path_edges1.end());
- colored_edges.insert(path_edges2.begin(), path_edges2.end());
- size_t sum_length = 0;
- for (auto it = graph_.ConstEdgeBegin(); !it.IsEnd(); ++it) {
- edge_count++;
- if (colored_edges.count(*it) == 0) {
- black_count++;
- sum_length += graph_.length(*it);
- }
- }
- if (edge_count > 0) {
- INFO("Error edges count: " << black_count << " which is " << 100.0 * (double) black_count / (double) edge_count << "% of all edges");
- INFO("Total length of all black edges: " << sum_length << ". While double genome length is " << (2 * cfg::get().ds.reference_genome.size()));
- } else {
- INFO("Error edges count: " << black_count << " which is 0% of all edges");
- }
- }
-};
-
-template<class Graph>
-class NStat: public AbstractStatCounter {
- private:
- typedef typename Graph::EdgeId EdgeId;
- const Graph &graph_;
- Path<EdgeId> path_;
- size_t perc_;
- public:
- NStat(const Graph &graph, Path<EdgeId> path, size_t perc = 50) :
- graph_(graph), path_(path), perc_(perc) {
- }
-
- virtual ~NStat() {
- }
-
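-    // Compute N<perc> over the path's edge lengths: accumulate lengths from the longest
-    // edge downwards and report the length at which the running sum first reaches
-    // perc% of the total path length.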
- virtual void Count() {
- vector<size_t> lengths;
- size_t sum_all = 0;
- for (size_t i = 0; i < path_.size(); i++) {
- lengths.push_back(graph_.length(path_[i]));
- sum_all += graph_.length(path_[i]);
- }
- sort(lengths.begin(), lengths.end());
- size_t sum = 0;
- size_t current = lengths.size();
- while (current > 0 && (double) sum < (double) perc_ * 0.01 * (double) sum_all) {
- current--;
- sum += lengths[current];
- }
- if(current < lengths.size())
- INFO("N" << perc_ << ": " << lengths[current]);
- }
-};
-
-template<class Graph>
-class IsolatedEdgesStat: public AbstractStatCounter {
- private:
- typedef typename Graph::EdgeId EdgeId;
- const Graph &graph_;
- set<EdgeId> black_edges_;
- vector<size_t> lengths;
- public:
- IsolatedEdgesStat(const Graph &graph, Path<EdgeId> path1,
- Path<EdgeId> path2) :
- graph_(graph) {
- for (auto it = graph.ConstEdgeBegin(); !it.IsEnd(); ++it) {
- black_edges_.insert(*it);
- }
- for (size_t i = 0; i < path1.size(); i++) {
- black_edges_.erase(path1[i]);
- }
- for (size_t i = 0; i < path2.size(); i++) {
- black_edges_.erase(path2[i]);
- }
- }
-
- virtual ~IsolatedEdgesStat() {
- }
-
- virtual void Count() {
- lengths.clear();
- for (auto it = graph_.ConstEdgeBegin(); !it.IsEnd(); ++it) {
- EdgeId edge = *it;
- if (graph_.IsDeadEnd(graph_.EdgeEnd(edge))
- && graph_.IsDeadStart(graph_.EdgeStart(edge))
- && black_edges_.count(edge) == 0) {
- lengths.push_back(graph_.length(edge));
- }
-        }
-        INFO("Isolated non-black edges: " << lengths.size());
- WriteLengths(cfg::get().output_dir, "isolated_edges.txt");
- }
-
- void WriteLengths(string folder_name, string file_name) {
- ofstream os;
- os.open((folder_name + "/" + file_name).c_str());
- WriteLengths(os);
- os.close();
- }
-
- void WriteLengths(ostream &os) {
- sort(lengths.begin(), lengths.end());
- for (size_t i = 0; i < lengths.size(); i++) {
- os << lengths[i] << endl;
- }
- }
-};
-
-template<class Graph>
-class SelfComplementStat: public AbstractStatCounter {
- private:
- typedef typename Graph::EdgeId EdgeId;
- const Graph &graph_;
- public:
- SelfComplementStat(const Graph &graph) :
- graph_(graph) {
- }
-
- virtual ~SelfComplementStat() {
- }
-
- virtual void Count() {
- size_t sc_number = 0;
- for (auto iterator = graph_.ConstEdgeBegin(); !iterator.IsEnd();
- ++iterator)
- if (graph_.conjugate(*iterator) == (*iterator))
- sc_number++;
- // INFO("Self-complement count failed!!! ");
- INFO("Self-complement count=" << sc_number);
- }
-};
-}
-}
diff --git a/src/debruijn/utils.hpp b/src/debruijn/utils.hpp
deleted file mode 100644
index 664a3a7..0000000
--- a/src/debruijn/utils.hpp
+++ /dev/null
@@ -1,138 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include "openmp_wrapper.h"
-#include "standard.hpp"
-
-#include "io/paired_read.hpp"
-#include "omni/omni_utils.hpp"
-#include "omni/visualization/graph_colorer.hpp"
-#include "omni/id_track_handler.hpp"
-#include "omni/splitters.hpp"
-#include "omni/path_processor.hpp"
-
-#include "logger/logger.hpp"
-#include "xmath.h"
-#include "sequence/sequence_tools.hpp"
-
-#include "runtime_k.hpp"
-
-#include "path_helper.hpp"
-
-#include "debruijn_graph.hpp"
-#include "indices/perfect_hash_map.hpp"
-#include "edge_index.hpp"
-
-#include <iostream>
-
-namespace debruijn_graph {
-
-using omnigraph::Path;
-using omnigraph::MappingPath;
-using omnigraph::Range;
-using omnigraph::MappingRange;
-
-inline double PairedReadCountWeight(const MappingRange&, const MappingRange&) {
- return 1.;
-}
-
-inline double KmerCountProductWeight(const MappingRange& mr1,
- const MappingRange& mr2) {
- return (double)(mr1.initial_range.size() * mr2.initial_range.size());
-}
-
-class WeightDEWrapper {
-private:
-
- vector<double> new_hist;
- int left_x;
- int insert_size;
-
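-    // Linearly interpolate the insert-size histogram onto every integer point between
-    // its first and last bins, normalizing each value by the total histogram weight.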
- void ExtendLinear(const std::map<int, size_t> & hist) {
- size_t sum_weight = 0;
-
- for (auto iter = hist.begin(); iter != hist.end(); ++iter)
- sum_weight += iter->second;
- DEBUG(sum_weight);
-
- VERIFY(hist.size() > 0);
- auto iter = hist.begin();
-
- left_x = iter->first;
-
- int prev = iter->first;
- size_t prev_val = iter->second;
-
- new_hist.push_back((double)prev_val / (double)sum_weight);
- ++iter;
-
- for (; iter != hist.end(); ++iter) {
- int x = iter->first;
- size_t y = iter->second;
- double tan = ((double)y - (double)prev_val) / (x - prev);
-
- VERIFY(prev < x);
- for (int i = prev + 1; i <= x; ++i) {
- new_hist.push_back(((double)prev_val + tan * (i - prev)) / (double)sum_weight);
- }
- prev = x;
- prev_val = y;
- DEBUG("hist " << x << " " << y);
- }
- }
-
-public:
- WeightDEWrapper(const map<int, size_t>& hist, double IS) {
- DEBUG("WeightDEWrapper " << IS);
- insert_size = (int) IS;
- DEBUG("Extending linear");
- ExtendLinear(hist);
- }
-
- ~WeightDEWrapper() {
- }
-
-
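-    // Weight for distance x: index the interpolated histogram at
-    // insert_size - left_x + x - 1; indices outside the histogram get weight 0,
-    // in-range values are scaled by 1000.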
- double CountWeight(int x) const {
- int xx = insert_size - left_x + x - 1;
-
- if (!(xx >= 0 && xx < (int) new_hist.size())) return 0.;
- VERIFY(math::le(new_hist[xx], 1.));
- return 1000. * new_hist[xx];
- }
-};
-
-inline double UnityFunction(int /*x*/) {
- return 1.;
-}
-
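-// Concatenate the nucleotide sequences of a connected edge path, collapsing the
-// k-letter overlap shared by consecutive edges.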
-template<class Graph>
-Sequence MergeSequences(const Graph& g,
- const vector<typename Graph::EdgeId>& continuous_path) {
- vector < Sequence > path_sequences;
- path_sequences.push_back(g.EdgeNucls(continuous_path[0]));
- for (size_t i = 1; i < continuous_path.size(); ++i) {
- VERIFY(
- g.EdgeEnd(continuous_path[i - 1])
- == g.EdgeStart(continuous_path[i]));
- path_sequences.push_back(g.EdgeNucls(continuous_path[i]));
- }
- return MergeOverlappingSequences(path_sequences, g.k());
-}
-
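-// Nucleotide sequence spanned by a mapped path: merge the path's edge sequences and
-// trim to the path's start position on its first edge and end position on its last edge.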
-template<class Graph>
-Sequence PathSequence(const Graph& g, const Path<typename Graph::EdgeId>& path) {
- Sequence path_sequence = MergeSequences(g, path.sequence());
- size_t start = path.start_pos();
- size_t end = path_sequence.size()
- - g.length(path[path.size() - 1]) + path.end_pos();
- return path_sequence.Subseq(start, end);
-}
-
-}
diff --git a/src/dipspades/CMakeLists.txt b/src/dipspades/CMakeLists.txt
deleted file mode 100644
index 1f9bc3a..0000000
--- a/src/dipspades/CMakeLists.txt
+++ /dev/null
@@ -1,27 +0,0 @@
-############################################################################
-# Copyright (c) 2015 Saint Petersburg State University
-# Copyright (c) 2011-2014 Saint Petersburg Academic University
-# All Rights Reserved
-# See file LICENSE for details.
-############################################################################
-
-project(dipspades CXX)
-
-add_executable(dipspades
- dipspades_config.cpp
- utils/files_utils.cpp
- main.cpp)
-
-target_include_directories(dipspades PRIVATE ${CMAKE_SOURCE_DIR}/debruijn)
-target_link_libraries(dipspades debruijn)
-
-if (SPADES_STATIC_BUILD)
- set_target_properties(dipspades PROPERTIES LINK_SEARCH_END_STATIC 1)
-endif()
-
-install(TARGETS dipspades
- DESTINATION bin
- COMPONENT runtime)
-install(DIRECTORY "${SPADES_CFG_DIR}/dipspades"
- DESTINATION share/spades/configs
- FILES_MATCHING PATTERN "*.info.template")
diff --git a/src/dipspades/consensus_contigs_constructor/consensus_contigs_constructor.hpp b/src/dipspades/consensus_contigs_constructor/consensus_contigs_constructor.hpp
deleted file mode 100644
index c5f47e7..0000000
--- a/src/dipspades/consensus_contigs_constructor/consensus_contigs_constructor.hpp
+++ /dev/null
@@ -1,332 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include "../../include/io/io_helper.hpp"
-
-#include "../utils/element_printers.hpp"
-#include "../utils/files_utils.hpp"
-
-#include "contig_correctors/close_gaps_corrector.hpp"
-#include "contig_correctors/iterative_redundant_contigs_remover.hpp"
-#include "contig_correctors/overlap_searcher.hpp"
-#include "contig_correctors/same_edge_deletion_corrector.hpp"
-#include "contig_correctors/incorrect_contig_remover.hpp"
-
-using namespace debruijn_graph;
-
-namespace dipspades{
-
-class ConsensusContigsConstructor {
- conj_graph_pack &graph_pack_;
- BaseHistogram<size_t> &bulge_len_hist_;
- NewExtendedSequenceMapper<conj_graph_pack::graph_t, conj_graph_pack::index_t> seq_mapper_;
- VertexPathIndex path_index_;
-
- CorrectionResult correction_result_;
- ContigStoragePtr default_storage_;
- ContigStoragePtr composite_storage_;
-
- set<size_t> ind_zero_paths_;
-
- struct contig_name {
- string fname;
- string name;
-
- contig_name(string new_fname, string new_name) :
- fname(cut_fname_from_path(new_fname)),
- name(new_name) { }
- };
-
- typedef pair<contig_name, Sequence> contig;
-
- vector<contig> ReadContigs(string contig_fname){
- vector<contig> contigs;
- auto fstream = io::SplittingWrap(EasyStream(contig_fname, false));
-
- while(!fstream->eof()){
- SingleRead single_read;
- (*fstream) >> single_read;
- contigs.push_back(contig(contig_name(contig_fname, single_read.name()),
- single_read.sequence()));
- }
- INFO(contigs.size() << " contigs from " << contig_fname << " were read");
- return contigs;
- }
-
- vector<contig> ReadContigsFromFiles(vector<string> contig_fnames){
- vector<contig> contigs;
- for(auto it = contig_fnames.begin(); it != contig_fnames.end(); it++){
- if(fname_valid(*it)){
- auto contigs_from_file = ReadContigs(*it);
- contigs.insert(contigs.end(), contigs_from_file.begin(), contigs_from_file.end());
- }
- }
- return contigs;
- }
-
- vector<MappingPath<EdgeId> > ConstructMappathsWithoutRC(vector<contig> &contigs){
- vector<MappingPath<EdgeId> > map_paths;
- size_t zero_paths = 0;
- size_t total_length_unmapped = 0;
- for(size_t i = 0; i < contigs.size(); i++){
- map_paths.push_back(seq_mapper_.MapSequence(contigs[i].second));
- if(map_paths[map_paths.size() - 1].size() == 0){
- total_length_unmapped += contigs[i].second.size();
- zero_paths++;
- }
- }
- if(zero_paths != 0)
- INFO(ToString(zero_paths) + " contigs with total length " << total_length_unmapped <<
-					" have mapping paths of zero length");
- return map_paths;
- }
-
- void DefineIndicesOfZeroPaths(vector<MappingPath<EdgeId> > &map_paths){
- for(size_t i = 0; i < map_paths.size(); i++)
- if(map_paths[i].size() == 0)
- ind_zero_paths_.insert(i);
- }
-
- ContigStoragePtr CreateContigStorage(vector<contig> &contigs,
- vector<MappingPath<EdgeId> > &mapping_paths) {
- ContigStoragePtr default_storage(new SimpleContigStorage());
- for(size_t i = 0; i < mapping_paths.size(); i++){
- if(ind_zero_paths_.find(i) == ind_zero_paths_.end()){
- int i1 = int(i * 2), i2 = int(i * 2 + 1);
- default_storage->Add(MappingContigPtr(
- new SimpleMappingContig(contigs[i].first.name,
- contigs[i].first.fname, contigs[i].second,
- mapping_paths[i], i1, i2)));
- }
- }
- return default_storage;
- }
-
- void PrimaryContigsProcessing(ContigStoragePtr storage){
- INFO("Removing repetitive edges in contigs mapping starts");
- SameEdgeDeletionCorrector same_edges_corr(graph_pack_.g);
- same_edges_corr.Correct(storage);
-// INFO(storage->Size() << " contigs will be used");
- INFO("Removing repetitive edges in contigs mapping ends");
-
- INFO("Close gaps in contigs mappings starts")
- CloseGapsCorrector close_gaps_corr(graph_pack_.g);
- close_gaps_corr.Correct(storage);
-// INFO(storage->Size() << " contigs will be used");
- INFO("Close gaps in contigs mappings ends");
-
- INFO("Removing incorrect contigs")
- RemoveUnconnectContigsCorrector del_unconn_corr(graph_pack_.g);
- del_unconn_corr.Correct(storage);
-// INFO(storage->Size() << " contigs will be used");
- }
-
- string name_to_rc_name(string name){
- return name + "_RC";
- }
-
- ContigStoragePtr CreateStorageWithRCContigs(ContigStoragePtr old_storage){
- ContigStoragePtr new_storage(new SimpleContigStorage());
- TRACE("CreateStorageWithRCContigs starts");
- for(size_t i = 0; i < old_storage->Size(); i++){
- auto contig = (*old_storage)[i];
- new_storage->Add(contig);
-
- MappingContigPtr rc_contig = MappingContigPtr(
- new SimpleMappingContig(
- name_to_rc_name(contig->name()),
- contig->src_file(),
- !contig->seq(),
- GetRCToMappingPath(graph_pack_.g, contig->mapping_path(), contig->seq().size()),
- GetRCToPathSeq(graph_pack_.g, contig->path_seq()),
- contig->id() + 1, contig->id()));
- new_storage->Add(rc_contig);
- }
- TRACE("CreateStorageWithRCContigs ends");
- INFO("Addition of RC contigs. " << new_storage->Size() << " contigs will be used");
- return new_storage;
- }
-
- void RemoveRedundantContigs(ContigStoragePtr storage){
- INFO("Redundant contigs remover starts");
- VertexPathIndex path_index(graph_pack_.g);
- IterativeLoopCorrector iter_loop_corr(
- graph_pack_.g,
- graph_pack_.k_value,
- path_index,
- dsp_cfg::get().cc.max_loop_length,
- dsp_cfg::get().cc.min_lcs_size,
- dsp_cfg::get().cc.estimate_tails ?
- bulge_len_hist_.Quantile(dsp_cfg::get().cc.bulge_len_quantile) :
- dsp_cfg::get().pbr.max_bulge_nucls_len);
- iter_loop_corr.Correct(storage);
- INFO("Redundant contigs remover ends");
- correction_result_ = iter_loop_corr.Results();
- }
-
- ContigStoragePtr DefineOverlappingContigs(ContigStoragePtr storage){
- INFO("Overlapping search starts");
- path_index_.Initialize(storage);
- OverlapCorrector over_corr(graph_pack_.g,
- graph_pack_.k_value,
- dsp_cfg::get().cc.min_overlap_size,
- path_index_);
- auto new_storage = over_corr.Correct(storage);
- path_index_.Clear();
- INFO("Overlapping search ends");
- return new_storage;
- }
-
- void WriteContigsToFile(ContigStoragePtr contigs, string filename){
- size_t total_length = 0;
- ofstream out(filename);
- for(size_t i = 0; i < contigs->Size(); i++){
- vector<EdgeId> contig_path = (*contigs)[i]->path_seq();
- TRACE(i << " path: " << SimplePathWithVerticesToString(graph_pack_.g, contig_path));
- Sequence seq = (*contigs)[i]->seq();
- out << ">" << (*contigs)[i]->name() << endl;
- out << seq.str() << endl;
- total_length += seq.size();
- }
-		INFO(contigs->Size() << " contigs with total length " << total_length << " were written to " <<
- filename);
- }
-
- void WritePairedAndUnpairedContigs(ContigStoragePtr storage){
- ContigStoragePtr double_contigs(new SimpleContigStorage());
- ContigStoragePtr single_contigs(new SimpleContigStorage());
- for(size_t i = 0; i < storage->Size(); i++){
- auto contig = (*storage)[i];
- if(contig->AllMappingContigs().size() == 0){
- if(correction_result_.redundancy_map.GetValuesByKey(contig->id()).size() == 0)
- single_contigs->Add(contig);
- else
- double_contigs->Add(contig);
- }
- else
- double_contigs->Add(contig);
- }
- WriteContigsToFile(double_contigs,
- path::append_path(dsp_cfg::get().io.output_dir, "paired_consensus_contigs.fasta").c_str());
- WriteContigsToFile(single_contigs,
- path::append_path(dsp_cfg::get().io.output_dir, "unpaired_consensus_contigs.fasta").c_str());
- }
-
- void WriteAlignedHaplocontigs(){
- string fname = path::append_path(dsp_cfg::get().io.output_dir, "haplocontigs_alignment");
- ofstream out(fname.c_str());
- INFO("Writing haplocontigs alignment to " << fname);
-
- for(size_t i = 0; i < composite_storage_->Size(); i++){
- auto composite_contig = (*composite_storage_)[i];
- out << "Consensus contig: " << composite_contig->name() << endl;
- auto haplocontigs = composite_contig->AllMappingContigs();
- if(haplocontigs.size() == 0) // contig is not composite
- haplocontigs.push_back(composite_contig);
-
- if(haplocontigs.size() > 1){
- out << "\tOverlapped haplocontigs: " << endl;
- for(size_t i = 0; i < haplocontigs.size() - 1; i++)
- out << "\t\t" << haplocontigs[i]->full_name() << "\t" <<
- haplocontigs[i + 1]->full_name() << endl;
- }
-
- out << "\tAligned pairs: " << endl;
- size_t written_pairs = 0;
- for(auto h = haplocontigs.begin(); h != haplocontigs.end(); h++){
- size_t id = (*h)->id();
- auto redundant_contigs = correction_result_.redundancy_map.GetValuesByKey(id);
- for(auto it = redundant_contigs.begin(); it != redundant_contigs.end(); it++){
- out << "\t\t" << (*h)->full_name() << "\t" <<
- default_storage_->GetContigById(*it)->full_name() << endl;
- written_pairs++;
- }
- }
-
- if(written_pairs == 0)
- out << "\t\tNo pairs" << endl;
- }
-
-/* for(auto it = correction_result_.redundancy_map.begin();
- it != correction_result_.redundancy_map.end(); it++){
- auto contig1 = default_storage_->GetContigById(it->first);
- auto set_ids = it->second;
- for(auto set_it = set_ids.begin(); set_it != set_ids.end(); set_it++){
- auto contig2 = default_storage_->GetContigById(*set_it);
- out << contig1->src_file() << ":" << contig1->name() << "\t" <<
- contig2->src_file() << ":" << contig2->name() << endl;
- }
- }*/
-
- }
-
-public:
- ConsensusContigsConstructor(conj_graph_pack &graph_pack,
- BaseHistogram<size_t> &bulge_len_hist) :
- graph_pack_(graph_pack),
- bulge_len_hist_(bulge_len_hist),
- seq_mapper_(graph_pack.g, graph_pack.index,
- graph_pack.kmer_mapper, false),
- path_index_(graph_pack.g),
- correction_result_(),
- default_storage_(),
- composite_storage_() { }
-
- void Run() {
- INFO("Consensus contigs constructor starts");
- auto contigs = ReadContigsFromFiles(GetAllLinesFromFile(dsp_cfg::get().io.haplocontigs));
- INFO("Total: " << contigs.size() << " contigs were read");
- if(contigs.size() == 0)
- return;
-
- vector<MappingPath<EdgeId> > mapping_paths = ConstructMappathsWithoutRC(contigs);
- VERIFY(mapping_paths.size() == contigs.size());
- DefineIndicesOfZeroPaths(mapping_paths);
-
- auto preliminary_storage = CreateContigStorage(contigs, mapping_paths);
-
- TRACE("Preliminary storage:");
- TRACE(preliminary_storage->ToString(graph_pack_.g));
-
- PrimaryContigsProcessing(preliminary_storage);
-
- TRACE("Preliminary storage after 1st processing:");
- TRACE(preliminary_storage->ToString(graph_pack_.g));
-
- auto processed_storage = CreateStorageWithRCContigs(preliminary_storage);
- VERIFY(processed_storage->Size() % 2 == 0);
-
- default_storage_ = processed_storage->Clone();
- RemoveRedundantContigs(processed_storage);
-
- TRACE("Storage after removing redundant contigs:");
- TRACE(processed_storage->ToString(graph_pack_.g));
-
- composite_storage_ = DefineOverlappingContigs(processed_storage);
-
- string consensus_fname(path::append_path(dsp_cfg::get().io.output_dir, "consensus_contigs.fasta").c_str());
- WriteContigsToFile(composite_storage_, consensus_fname);
- WritePairedAndUnpairedContigs(composite_storage_);
-
- WriteAlignedHaplocontigs();
-
- INFO("Consensus contigs constructor ends");
- }
-
- ContigStoragePtr DefaultContigsStorage() { return default_storage_; }
-
- ContigStoragePtr CompositeContigsStorage() { return composite_storage_; }
-
- CorrectionResult RedundancyResult() { return correction_result_; }
-
-private:
- DECL_LOGGER("ConsensusContigsConstructor");
-};
-
-}
diff --git a/src/dipspades/consensus_contigs_constructor/contig_correctors/abstract_contig_corrector.hpp b/src/dipspades/consensus_contigs_constructor/contig_correctors/abstract_contig_corrector.hpp
deleted file mode 100644
index 0420854..0000000
--- a/src/dipspades/consensus_contigs_constructor/contig_correctors/abstract_contig_corrector.hpp
+++ /dev/null
@@ -1,43 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include "../mapping_contigs_storage.hpp"
-#include "../../utils/lcs_utils.hpp"
-#include "../../utils/path_routines.hpp"
-#include "../../utils/path_index.hpp"
-#include "../../utils/bulge_utils.hpp"
-#include "../../utils/redundancy_map.hpp"
-
-using namespace debruijn_graph;
-
-namespace dipspades {
-
-struct CorrectionResult{
- OverlapGraph g;
- RedundancyMap<size_t> redundancy_map;
-};
-
-//--------------------------------------------------------------------------
-class AbstractContigCorrector{
-protected:
- Graph& g_;
-public:
- AbstractContigCorrector(Graph& g) : g_(g) {
-
- }
- virtual ContigStoragePtr Correct(ContigStoragePtr storage) { return storage; }
- virtual MappingContigPtr Correct(MappingContigPtr contig) { return contig; }
- virtual ~AbstractContigCorrector(){}
- virtual CorrectionResult Results(){
- CorrectionResult res;
- return res;
- }
-};
-
-}
diff --git a/src/dipspades/consensus_contigs_constructor/contig_correctors/close_gaps_corrector.hpp b/src/dipspades/consensus_contigs_constructor/contig_correctors/close_gaps_corrector.hpp
deleted file mode 100644
index 23b1be4..0000000
--- a/src/dipspades/consensus_contigs_constructor/contig_correctors/close_gaps_corrector.hpp
+++ /dev/null
@@ -1,154 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include "abstract_contig_corrector.hpp"
-
-using namespace debruijn_graph;
-
-namespace dipspades {
-
-class CloseGapsCorrector : public AbstractContigCorrector{
-
- set<size_t> incorr_contigs;
- size_t num_corr;
-
- size_t connected_length_;
- size_t disconnected_length_;
-
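-	// For each recorded gap position, try to restore connectivity by running a bounded
-	// Dijkstra from the end of the current edge (or, as a fallback, from its start) to
-	// the start of the next edge, and splice the found path into the contig path.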
- vector<EdgeId> ClosePathGap(vector<EdgeId> path, vector<size_t> gap_index){
- vector<EdgeId> new_path;
- size_t current_gap = 0;
- for(size_t i = 0; i < path.size() - 1; i++){
- EdgeId cur_edge = path[i];
- new_path.push_back(cur_edge);
- if(i == gap_index[current_gap]){
- VertexId start = g_.EdgeEnd(cur_edge);
- VertexId end = g_.EdgeStart(path[i + 1]);
- auto dijkstra = DijkstraHelper<Graph>::CreateTargeredBoundedDijkstra(g_,
- end, dsp_cfg::get().pbr.max_bulge_nucls_len); //DijkstraHelper<Graph>::CreateBoundedDijkstra(g_, dsp_cfg::get().pbr.max_bulge_nucls_len);
- dijkstra.Run(start);
- if(dijkstra.DistanceCounted(end)){
- vector<EdgeId> add_path = dijkstra.GetShortestPathTo(end);
- for(auto e = add_path.begin(); e != add_path.end(); e++)
- if(g_.EdgeStart(*e) != g_.EdgeEnd(*e))
- new_path.push_back(*e);
- }
- else{
- // second attempt
- VertexId prev_start = g_.EdgeStart(cur_edge);
- dijkstra.Run(prev_start);
- if(dijkstra.DistanceCounted(end)){
- vector<EdgeId> add_path = dijkstra.GetShortestPathTo(end);
- new_path.erase(new_path.begin() + new_path.size() - 1);
- for(auto e = add_path.begin(); e != add_path.end(); e++)
- if(g_.EdgeStart(*e) != g_.EdgeEnd(*e))
- new_path.push_back(*e);
- }
- }
- current_gap++;
- }
- }
- new_path.push_back(path[path.size() - 1]);
- return new_path;
- }
-
- size_t CountContigsWithGaps(ContigStoragePtr storage) {
- size_t contigs_with_gaps = 0;
- for(size_t i = 0; i < storage->Size(); i++)
- if(!IsPathConnected(g_, (*storage)[i]->path_seq()))
- contigs_with_gaps++;
- return contigs_with_gaps;
- }
-
- void ProcessContigs(ContigStoragePtr storage) {
- double processed_perc = 0.1;
- double step = 0.1;
- for(size_t i = 0; i < storage->Size(); i++) {
- storage->ReplaceContig(Correct((*storage)[i]), i);
- double cur_process_perc = static_cast<double>(i) / static_cast<double>(storage->Size());
- if(cur_process_perc > processed_perc) {
- while(processed_perc + step <= cur_process_perc)
- processed_perc += step;
-				INFO(ToString(processed_perc * 100.0) << "% of contigs were processed");
- processed_perc += step;
- }
- }
-		INFO("100% of contigs were processed");
- }
-
-public:
- CloseGapsCorrector(Graph &g) :
- AbstractContigCorrector(g),
- num_corr(0),
- connected_length_(0),
- disconnected_length_(0) { }
-
- virtual ContigStoragePtr Correct(ContigStoragePtr storage){
-
- INFO(ToString(CountContigsWithGaps(storage)) << " contigs from " <<
- ToString(storage->Size()) << " have gaps before correction");
-
- ProcessContigs(storage);
-
- INFO(ToString(num_corr) << " contigs from " <<
- ToString(storage->Size()) << " with total length " << ToString(connected_length_) + " are correct");
- INFO(ToString(storage->Size() - num_corr) << " contigs from "
- << ToString(storage->Size()) << " with total length " <<
- ToString(disconnected_length_) + " have gaps after correction");
-
- storage->DeleteByIDs(incorr_contigs);
- return storage;
- }
-
- virtual MappingContigPtr Correct(MappingContigPtr contig){
- vector<EdgeId> path = contig->path_seq();
- if(path.size() <= 1){
- num_corr++;
- return contig;
- }
- vector<size_t> gap_indexes;
- for(size_t i = 0; i < path.size() - 1; i++){
- EdgeId e1 = path[i];
- EdgeId e2 = path[i + 1];
- if(!AreEdgesConnected(g_, e1, e2)){
- gap_indexes.push_back(i);
- }
- }
-
- TRACE("Contig " << contig->id() << " has " << gap_indexes.size() << " gaps");
-
- // contig is connected
- if(gap_indexes.size() == 0) {
- num_corr++;
- connected_length_ += GetPathLength(g_, contig->path_seq());
- return contig;
- }
-
- TRACE("Contig path before correction: " << SimplePathWithVerticesToString(g_, contig->path_seq()));
-
- vector<EdgeId> new_path = ClosePathGap(path, gap_indexes);
- if(IsPathConnected(g_, new_path)) {
- TRACE("Gaps were closed");
- TRACE("Contig path after correction: " << SimplePathWithVerticesToString(g_, new_path));
- num_corr++;
- connected_length_ += GetPathLength(g_, new_path);
- return MappingContigPtr(new ReplacedPathMappingContig(contig, new_path));
- }
-
-		TRACE("Contig " << contig->id() << " remains uncorrected!");
- incorr_contigs.insert(contig->id());
- disconnected_length_ += GetPathLength(g_, contig->path_seq());
- return contig;
- }
-
-private:
- DECL_LOGGER("CloseGapsCorrector")
-};
-
-}
diff --git a/src/dipspades/consensus_contigs_constructor/contig_correctors/equal_path_deletion_correction.hpp b/src/dipspades/consensus_contigs_constructor/contig_correctors/equal_path_deletion_correction.hpp
deleted file mode 100644
index 5177c57..0000000
--- a/src/dipspades/consensus_contigs_constructor/contig_correctors/equal_path_deletion_correction.hpp
+++ /dev/null
@@ -1,82 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include "abstract_contig_corrector.hpp"
-
-using namespace debruijn_graph;
-
-namespace dipspades {
-
-class EqualPathDeletionCorrector : public AbstractContigCorrector{
- VertexPathIndex &path_index_;
- CorrectionResult res_;
-
- void InitializeMap(ContigStoragePtr contigs){
- for(size_t i = 0; i < contigs->Size(); i++){
- size_t id = (*contigs)[i]->id();
- res_.redundancy_map.AddNewKey(id);
- }
- }
-
-public:
-
- EqualPathDeletionCorrector(Graph &g, VertexPathIndex &path_index) : AbstractContigCorrector(g),
- path_index_(path_index){ }
-
- ContigStoragePtr Correct(ContigStoragePtr contigs) {
-
- INFO("Computing redundant equal contigs starts");
-
- InitializeMap(contigs);
- set<size_t> ids_for_deletion;
- for(size_t i = 0; i < contigs->Size() - 1; i++){
- size_t id1 = (*contigs)[i]->id();
- size_t rc_id1 = (*contigs)[i]->rc_id();
- if(ids_for_deletion.find(id1) == ids_for_deletion.end() &&
- ids_for_deletion.find(rc_id1) == ids_for_deletion.end()){
- auto path1 = (*contigs)[i]->path_seq();
- auto contigs_for_processing = path_index_.GetPathsIntersectedWith(path1);
- for(auto it = contigs_for_processing.begin(); it != contigs_for_processing.end(); it++){
- size_t j = *it;
- size_t id2 = (*contigs)[j]->id();
- size_t rc_id2 = (*contigs)[j]->rc_id();
- if(ids_for_deletion.find(id2) == ids_for_deletion.end() &&
- ids_for_deletion.find(rc_id2) == ids_for_deletion.end() && j > i){
- auto path2 = (*contigs)[j]->path_seq();
- if(ArePathEqual(path1, path2)){
- size_t id2 = (*contigs)[j]->id();
- ids_for_deletion.insert(id2);
- ids_for_deletion.insert(rc_id2);
- res_.redundancy_map.AddNewPair(id1, id2);
- res_.redundancy_map.AddNewPair(rc_id1, rc_id2);
- }
- }
- }
- }
- }
- RedundancyMapCondenser<size_t> condenser;
- res_.redundancy_map = condenser.Condense(res_.redundancy_map);
- INFO(ToString(ids_for_deletion.size()) + " contigs from " << contigs->Size() << " are redundant");
- contigs->DeleteByIDs(ids_for_deletion);
-
- INFO("Computing redundant equal contigs ends");
-
- return contigs;
- }
-
- MappingContigPtr Correct(MappingContigPtr contig){
- return contig;
- }
-
- CorrectionResult Result(){
- return res_;
- }
-};
-
-}
diff --git a/src/dipspades/consensus_contigs_constructor/contig_correctors/incorrect_contig_remover.hpp b/src/dipspades/consensus_contigs_constructor/contig_correctors/incorrect_contig_remover.hpp
deleted file mode 100644
index e35f2a8..0000000
--- a/src/dipspades/consensus_contigs_constructor/contig_correctors/incorrect_contig_remover.hpp
+++ /dev/null
@@ -1,43 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include "abstract_contig_corrector.hpp"
-
-using namespace debruijn_graph;
-
-namespace dipspades {
-
-class RemoveUnconnectContigsCorrector : public AbstractContigCorrector{
-
-public:
- RemoveUnconnectContigsCorrector(Graph &g) : AbstractContigCorrector(g){ }
-
- ContigStoragePtr Correct(ContigStoragePtr storage) {
- set<size_t> contigs_for_deletion;
- for(size_t i = 0; i < storage->Size(); i++){
- auto contig_path = (*storage)[i]->path_seq();
- TRACE((*storage)[i]->id() << " contig");
- TRACE("Path: " << SimplePathWithVerticesToString(g_, contig_path));
- if(!IsPathConnected(g_, contig_path)){
- contigs_for_deletion.insert((*storage)[i]->id());
- }
- }
- INFO(ToString(contigs_for_deletion.size()) + " contigs from " <<
- storage->Size() << " were deleted");
- storage->DeleteByIDs(contigs_for_deletion);
- return storage;
- }
-
- MappingContigPtr Correct(MappingContigPtr contig){
- return contig;
- }
-
-};
-
-}
diff --git a/src/dipspades/consensus_contigs_constructor/contig_correctors/iterative_redundant_contigs_remover.hpp b/src/dipspades/consensus_contigs_constructor/contig_correctors/iterative_redundant_contigs_remover.hpp
deleted file mode 100644
index 0f46c0a..0000000
--- a/src/dipspades/consensus_contigs_constructor/contig_correctors/iterative_redundant_contigs_remover.hpp
+++ /dev/null
@@ -1,94 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include "redundant_contig_remover.hpp"
-#include "equal_path_deletion_correction.hpp"
-
-using namespace debruijn_graph;
-
-namespace dipspades {
-
-class IterativeLoopCorrector : public AbstractContigCorrector{
-
- size_t k_value_;
-
- VertexPathIndex &index_;
- size_t max_loop_len_;
- size_t min_lcs_length_;
- size_t max_tail_length_;
- CorrectionResult res;
-
-public:
- IterativeLoopCorrector(Graph &g, size_t k_value, VertexPathIndex &index, size_t max_loop_len,
- size_t min_lcs_length, size_t max_tail_length) :
- AbstractContigCorrector(g), k_value_(k_value), index_(index),
- max_loop_len_(max_loop_len), min_lcs_length_(min_lcs_length),
- max_tail_length_(max_tail_length) {
- }
-
- ContigStoragePtr Correct(ContigStoragePtr contigs) {
- {
- INFO("Equal path remover starts");
- index_.Initialize(contigs);
- EqualPathDeletionCorrector equal_path_remover(g_, index_);
- contigs = equal_path_remover.Correct(contigs);
- res.redundancy_map = equal_path_remover.Result().redundancy_map;
- index_.Clear();
- INFO(ToString(contigs->Size()) + " contigs will be used further");
- }
-
- INFO("Iterative loop corrector starts");
- {
-			INFO("Exact-match-only iteration with parameters:");
- INFO("\tMaximal loop length - " + ToString(max_loop_len_));
- INFO("\tMinimal lcs length - " + ToString(min_lcs_length_));
- INFO("\tMaximal tail length - 0");
-
- index_.Initialize(contigs);
- LoopBulgeDeletionCorrector loop_corr(g_, k_value_,
- max_loop_len_, 0, min_lcs_length_, index_);
- contigs = loop_corr.Correct(contigs);
- auto old_map = res.redundancy_map;
- auto new_map = loop_corr.Results().redundancy_map;
- RedundancyMapMerger<size_t> map_merger;
- res.redundancy_map = map_merger.MergeTwoMaps(old_map, new_map);
- index_.Clear();
- INFO(ToString(contigs->Size()) + " contigs will be used further");
- }
-
- {
-			INFO("Tail-allowing match iteration with parameters:");
- INFO("\tMaximal loop length - " + ToString(max_loop_len_));
- INFO("\tMinimal lcs length - " + ToString(min_lcs_length_));
- INFO("\tMaximal tail length - " + ToString(max_tail_length_));
- index_.Initialize(contigs);
- LoopBulgeDeletionCorrector loop_corr(g_, k_value_,
- max_loop_len_, max_tail_length_, min_lcs_length_, index_);
- contigs = loop_corr.Correct(contigs);
- auto old_map = res.redundancy_map;
- auto new_map = loop_corr.Results().redundancy_map;
- RedundancyMapMerger<size_t> map_merger;
- res.redundancy_map = map_merger.MergeTwoMaps(old_map, new_map);
- index_.Clear();
- INFO(ToString(contigs->Size()) + " contigs will be used further");
- }
- INFO("Iterative loop corrector ends");
- return contigs;
- }
-
- MappingContigPtr Correct(MappingContigPtr contig){
- return contig;
- }
-
- CorrectionResult Results(){
- return res;
- }
-};
-
-}
diff --git a/src/dipspades/consensus_contigs_constructor/contig_correctors/overlap_searcher.hpp b/src/dipspades/consensus_contigs_constructor/contig_correctors/overlap_searcher.hpp
deleted file mode 100644
index 7c5d6f1..0000000
--- a/src/dipspades/consensus_contigs_constructor/contig_correctors/overlap_searcher.hpp
+++ /dev/null
@@ -1,541 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include "redundant_contig_remover.hpp"
-
-using namespace debruijn_graph;
-
-namespace dipspades {
-
-inline void OverlapgraphToDot(string dotfname, OverlapGraph & g, ContigStoragePtr stor){
- ofstream dot(dotfname.c_str());
-
-// cout << "Number of vertices - " << g.VerticesCount() << endl;
-// cout << "Number of contigs - " << stor->Size() << endl;
- VERIFY(g.VerticesCount() <= stor->Size());
-
- dot << "digraph Overlaped_paths {" << endl << "node[fontname=<Courier>]" << endl;
-
- auto vertices = g.Vertices();
- for(auto v = vertices.begin(); v != vertices.end(); v++){
- dot << *v << "[label=\"ID = #" << *v << ". " << *v << ", RC_ID = " <<
- stor->GetContigById(*v)->rc_id() << "\"]" << endl;
- }
-
- auto edges = g.Edges();
- for(auto e = edges.begin(); e != edges.end(); e++)
- dot << e->first << "->" << e->second << "[label=\"" << g.GetWeightOf(*e) << "\"]" << endl;
-
- dot << "}";
-}
-
-//--------------------------------------------------------------------------------------------
-
-class OverlappedContigsMap {
- size_t min_lcs_length_;
-public:
- struct OverlappedKey {
- size_t id1;
- size_t id2;
- size_t id1_rc;
- size_t id2_rc;
-
- OverlappedKey(size_t new_id1, size_t new_id2,
- size_t new_id1_rc, size_t new_id2_rc) :
- id1(new_id1),
- id2(new_id2),
- id1_rc(new_id1_rc),
- id2_rc(new_id2_rc) { }
-
- OverlappedKey() :
- id1(), id2(), id1_rc(), id2_rc() { }
-
- string ToString() const {
- stringstream ss;
- ss << "<" << id1 << ", " << id2 << "> <" << id2_rc << ", " << id1_rc << ">";
- return ss.str();
- }
-
- string id() const {
- stringstream ss;
- ss << id1 << "_" << id2 << "_" << id1_rc << "_" << id2_rc;
- return ss.str();
- }
-
- OverlappedKey Reverse1() {
- return OverlappedKey(id1, id2_rc, id1_rc, id2);
- }
-
- OverlappedKey Reverse2() {
- return OverlappedKey(id2_rc, id1, id2, id1_rc);
- }
-
- OverlappedKey Reverse3() {
- return OverlappedKey(id1_rc, id2, id1, id2_rc);
- }
-
- OverlappedKey Reverse4() {
- return OverlappedKey(id2, id1_rc, id2_rc, id1);
- }
- };
-
- struct OverlappedValue {
- Range range_left;
- Range range_right;
- Range range_left_rc;
- Range range_right_rc;
- size_t lcs_length;
-
- OverlappedValue(Range new_range_left, Range new_range_right,
- Range new_range_left_rc, Range new_range_right_rc,
- size_t new_lcs_length) :
- range_left(new_range_left),
- range_right(new_range_right),
- range_left_rc(new_range_left_rc),
- range_right_rc(new_range_right_rc),
- lcs_length(new_lcs_length) { }
-
- OverlappedValue() :
- range_left(),
- range_right(),
- range_left_rc(),
- range_right_rc(),
- lcs_length() { }
-
- string ToString() const {
- stringstream ss;
- ss << "(" << range_left.start_pos << ", " << range_left.end_pos << "), (" <<
- range_right.start_pos << ", " << range_right.end_pos << "): " << lcs_length;
- return ss.str();
- }
- };
-
-private:
- class OverlappedKeyComparator {
- public:
- bool operator()(const OverlappedKey &obj1, const OverlappedKey &obj2) const {
-			return obj1.id() < obj2.id();
-		}
- };
-
- map<OverlappedKey, OverlappedValue, OverlappedKeyComparator> overlap_map_;
- map<pair<size_t, size_t>, pair<Range, Range> > pair_overlap_map_;
-
- void RemoveElement(OverlappedKey key) {
- overlap_map_.erase(key);
- pair_overlap_map_.erase(make_pair(key.id1, key.id2));
- pair_overlap_map_.erase(make_pair(key.id2_rc, key.id1_rc));
- }
-
- void AddElement(OverlappedKey key, OverlappedValue value) {
- overlap_map_[key] = value;
- pair_overlap_map_[make_pair(key.id1, key.id2)] =
- make_pair(value.range_left, value.range_right);
- pair_overlap_map_[make_pair(key.id2_rc, key.id1_rc)] =
- make_pair(value.range_right_rc, value.range_left_rc);
- }
-
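-	// Resolve a symmetric (reverse-oriented) duplicate: add the new overlap unless the
-	// reverse key is already stored with an equal or longer LCS; if the stored reverse
-	// record is weaker, it is replaced by the new one.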
- void ProcessReverseKey(OverlappedKey key, OverlappedValue value,
- OverlappedKey reverse_key) {
- if(overlap_map_.find(reverse_key) == overlap_map_.end())
- AddElement(key, value);
- else
- if(overlap_map_[reverse_key].lcs_length < value.lcs_length) {
- AddElement(key, value);
- RemoveElement(reverse_key);
- }
- }
-
-public:
- OverlappedContigsMap(size_t min_lcs_length) :
- min_lcs_length_(min_lcs_length) { }
-
- void Add(OverlappedKey key, OverlappedValue value) {
- if(value.lcs_length < min_lcs_length_)
- return;
- ProcessReverseKey(key, value, key.Reverse1());
- ProcessReverseKey(key, value, key.Reverse2());
- ProcessReverseKey(key, value, key.Reverse3());
- ProcessReverseKey(key, value, key.Reverse4());
- }
-
- void PrintMap() {
- for(auto it = overlap_map_.begin(); it != overlap_map_.end(); it++) {
- TRACE(it->first.ToString() << " - " << it->second.ToString());
- }
- }
-
- size_t Size() { return overlap_map_.size(); }
-
- typedef map<OverlappedKey, OverlappedValue, OverlappedKeyComparator>::const_iterator overlap_map_iter;
-
- overlap_map_iter begin() const { return overlap_map_.begin(); }
-
- overlap_map_iter end() const { return overlap_map_.end(); }
-
- pair<Range, Range> Ranges(size_t id1, size_t id2) {
- return pair_overlap_map_[make_pair(id1, id2)];
- }
-
-private:
- DECL_LOGGER("OverlappedContigsMap");
-};
-
-inline ostream& operator<<(ostream& os, const OverlappedContigsMap& obj) {
- for(auto it = obj.begin(); it != obj.end(); it++)
- os << it->first.ToString() << " - " << it->second.ToString() << endl;
- return os;
-}
-
-//--------------------------------------------------------------------------------------------
-
-class OverlapCorrector : public LoopBulgeDeletionCorrector{
- size_t k_value_;
-
- struct overlap_res {
- bool correctness;
- size_t size;
-
- overlap_res(bool over_corr, size_t over_size) :
- correctness(over_corr),
- size(over_size) { }
-
- overlap_res() :
- correctness(false),
- size(0) { }
- };
-
- // todo insert check of bulge sides
- overlap_res IsOverlapCorrect(vector<EdgeId> first_path, vector<size_t> first_pos,
- vector<EdgeId> last_path, vector<size_t> last_pos){
-
- VERIFY(first_pos.size() == last_pos.size());
-
- if(first_pos.size() <= 1)
- return overlap_res();
-
-// cout << "Left tail length - " << GetLeftTailLength(last_path, last_pos) << endl;
-// cout << "Right tail length - " << GetRightTailLength(first_path, first_pos) << endl;
-
- if(IsLeftTailCorrect(last_path, last_pos) && IsRightTailCorrect(first_path, first_pos)){
-
- size_t first_start = ConvInd(first_pos[0], first_path.size());
- size_t last_end = ConvInd(last_pos[last_pos.size() - 1], last_path.size());
-
-			// check that the start of the left tail is reachable
- bool is_left_tail_correct = true;
- if(IsLeftTailExist(last_path, last_pos) ){
-
- if(dsp_cfg::get().cc.tails_lie_on_bulges){
- VertexId start1 = g_.EdgeStart(first_path[0]);
- VertexId start2 = g_.EdgeStart(last_path[0]);
-
- auto path_searcher = DijkstraHelper<Graph>::CreateBackwardBoundedDijkstra(g_,
- dsp_cfg::get().pbr.max_bulge_nucls_len);
- path_searcher.Run(start1);
- auto reached_vert1 = path_searcher.ReachedVertices();
-
- path_searcher.Run(start2);
- auto reached_vert2 = path_searcher.ReachedVertices();
-
- for(size_t i = 0; i < first_start; i++){
- VertexId cur_vert = g_.EdgeStart(first_path[i]);
- reached_vert1.push_back(cur_vert);
- }
-
- bool common_vertex_exists = false;
- for(auto v1 = reached_vert1.begin(); v1 != reached_vert1.end(); v1++)
- for(auto v2 = reached_vert2.begin(); v2 != reached_vert2.end(); v2++)
- if(*v1 == *v2){
- common_vertex_exists = true;
- break;
- }
- is_left_tail_correct = common_vertex_exists;
- }
- }
-
- if(!is_left_tail_correct)
- return overlap_res();
-
-			// check that the start of the right tail is reachable
- bool is_right_tail_correct = true;
- if(IsRightTailExist(first_path, first_pos)){
-
- if(dsp_cfg::get().cc.tails_lie_on_bulges){
- size_t first_path_size = first_path.size(),
- last_path_size = last_path.size();
-
- VertexId end1 = g_.EdgeStart(first_path[first_path_size - 1]);
- VertexId end2 = g_.EdgeStart(last_path[last_path_size - 1]);
-
- auto path_searcher = DijkstraHelper<Graph>::CreateBackwardBoundedDijkstra(g_,
- dsp_cfg::get().pbr.max_bulge_nucls_len);
- path_searcher.Run(end1);
- auto reached_vert1 = path_searcher.ReachedVertices();
-
- path_searcher.Run(end2);
- auto reached_vert2 = path_searcher.ReachedVertices();
-
- for(size_t i = last_end; i < last_path.size(); i++){
- VertexId cur_vert = g_.EdgeEnd(last_path[i]);
- reached_vert2.push_back(cur_vert);
- }
-
- bool common_vertex_exists = false;
- for(auto v1 = reached_vert1.begin(); v1 != reached_vert1.end(); v1++)
- for(auto v2 = reached_vert2.begin(); v2 != reached_vert2.end(); v2++)
- if(*v1 == *v2){
- common_vertex_exists = true;
- break;
- }
- is_right_tail_correct = common_vertex_exists;
- }
- }
-
- if(is_right_tail_correct)
- return overlap_res(true, GetLeftTailLength(last_path, last_pos) +
- GetRightTailLength(first_path, first_pos));
- }
- return overlap_res();
-
- }
-
- pair<overlap_res, overlap_res> ArePathsOverlapped(vector<EdgeId> path1, vector<size_t> pos1,
- vector<EdgeId> path2, vector<size_t> pos2){
-
- if(path1.size() == 0 || path2.size() == 0)
- return make_pair(overlap_res(), overlap_res());
-
- VERIFY(pos1.size() == pos2.size());
-
- if(pos1.size() <= 1)
- return make_pair(overlap_res(), overlap_res());
-
- if(!IsLCSCorrect(path1, pos1, path2, pos2))
- return make_pair(overlap_res(), overlap_res());
-
- return make_pair(IsOverlapCorrect(path2, pos2, path1, pos1), IsOverlapCorrect(path1, pos1, path2, pos2));
- }
-
- string get_composite_contig_name(size_t i, size_t length){
- stringstream ss;
- ss << i << "_contig_" << length << "_length";
- return ss.str();
- }
-
- void FillOverlapGraphByMap(OverlappedContigsMap &overlap_map, OverlapGraph &graph) {
- for(auto it = overlap_map.begin(); it != overlap_map.end(); it++) {
- graph.AddNeighVertices(it->first.id1, it->first.id2, it->second.lcs_length);
- graph.AddNeighVertices(it->first.id2_rc, it->first.id1_rc, it->second.lcs_length);
- }
- }
-
-public:
- OverlapCorrector(Graph &g, size_t k_value, size_t min_overlap_length, VertexPathIndex &path_index) :
- LoopBulgeDeletionCorrector(g,
- k_value,
- dsp_cfg::get().cc.max_loop_length,
- dsp_cfg::get().pbr.max_bulge_nucls_len,
- min_overlap_length,
- path_index),
- k_value_(k_value) {}
-
- ContigStoragePtr Correct(ContigStoragePtr contigs) {
-
- INFO("Computing overlaps starts");
-
- OverlappedContigsMap overlap_map(dsp_cfg::get().cc.min_overlap_size);
-
- OverlapGraph og;
- vector<size_t> vertices;
- vector<size_t> id, rc_id;
- for(size_t i = 0; i < contigs->Size(); i++){
- vertices.push_back((*contigs)[i]->id());
- id.push_back((*contigs)[i]->id());
- rc_id.push_back((*contigs)[i]->rc_id());
- }
- og.InitializeVertexSet(vertices, id, rc_id);
-
- vector<vector<VertexId> > seqs;
- for(size_t i = 0; i < contigs->Size(); i++){
- vector<VertexId> seq = GetListOfVertices((*contigs)[i]->path_seq());
- seqs.push_back(seq);
- }
- LCSCalculator<VertexId> lcs_calc;
- set<pair<int, int> > processed_pairs;
-
- for(size_t i = 0; i < contigs->Size(); i++){
- auto path1 = (*contigs)[i]->path_seq();
- size_t id1 = (*contigs)[i]->id();
- size_t rc_id1 = (*contigs)[i]->rc_id();
- auto contigs_for_processing = path_index_.GetPathsIntersectedWith(path1);
- for(auto it = contigs_for_processing.begin(); it != contigs_for_processing.end(); it++){
- size_t j = *it;
- size_t id2 = (*contigs)[j]->id();
- size_t rc_id2 = (*contigs)[j]->rc_id();
- bool need_process = !((i % 2 == 0 && i + 1 == j) || j <= i);
- need_process = need_process && (processed_pairs.find(pair<int, int>(rc_id1, rc_id2)) ==
- processed_pairs.end());
- if(need_process){
- processed_pairs.insert(pair<int, int>(id1, id2));
- auto path2 = (*contigs)[j]->path_seq();
- auto lcs_res = lcs_calc.LCS(seqs[i], seqs[j]);
- vector<size_t> pos1, pos2;
- auto pos_vectors_pair = GetBestPosVectors(lcs_calc, path1, seqs[i], path2, seqs[j], lcs_res);
- pos1 = pos_vectors_pair.first;
- pos2 = pos_vectors_pair.second;
-
- {
- TRACE("--------------------------------");
- size_t id_i = id1, id_j = id2;
- TRACE("Indexes " << i << " " << j );
- TRACE("IDs " << id_i << " " << id_j);
- TRACE("LCS string : " << VerticesVectorToString(g_, lcs_res));
- TRACE("Path1. " << SimplePathWithVerticesToString(g_, path1));
- TRACE("Pos1. " << VectorToString<size_t>(pos1));
- TRACE("Path2. " << SimplePathWithVerticesToString(g_, path2));
- TRACE("Pos2. " << VectorToString<size_t>(pos2));
- }
-
- // Overlapping
- auto overlap_result = ArePathsOverlapped(path1, pos1, path2, pos2);
- bool is_overlaped = overlap_result.first.correctness ||
- overlap_result.second.correctness;
-
- if(is_overlaped){
-
- size_t first_id, last_id;
- vector<EdgeId> first_path, last_path;
- vector<size_t> first_pos, last_pos;
-
- if(overlap_result.first.correctness && overlap_result.second.correctness){
- if(overlap_result.first.size < overlap_result.second.size){
- first_id = id2; last_id = id1;
- }
- else {
- first_id = id1; last_id = id2;
- }
- }
- else{
- if(overlap_result.first.correctness) {
- first_id = id2; last_id = id1;
- }
- else {
- first_id = id1; last_id = id2;
- }
- }
-
- first_path = (first_id == id1) ? path1 : path2;
- last_path = (last_id == id1) ? path1 : path2;
- first_pos = (first_id == id1) ? pos1 : pos2;
- last_pos = (last_id == id1) ? pos1 : pos2;
-
- size_t rc_first_id = contigs->GetContigById(first_id)->rc_id();
- size_t rc_last_id = contigs->GetContigById(last_id)->rc_id();
-
- size_t lcs_len1 = GetLCSLengthByPath(path1, pos1);
- size_t lcs_len2 = GetLCSLengthByPath(path2, pos2);
-
- Range overlap_first(first_pos[0], first_pos[first_pos.size() - 1]);
- Range overlap_last(last_pos[0], last_pos[last_pos.size() - 1]);
-
- Range overlap_first_rc(first_path.size() - overlap_first.end_pos,
- first_path.size() - overlap_first.start_pos);
- Range overlap_last_rc(last_path.size() - overlap_last.end_pos,
- last_path.size() - overlap_last.start_pos);
-
- overlap_map.Add(
- OverlappedContigsMap::OverlappedKey(first_id, last_id, rc_first_id, rc_last_id),
- OverlappedContigsMap::OverlappedValue(overlap_first, overlap_last,
- overlap_first_rc, overlap_last_rc, max<size_t>(lcs_len1, lcs_len2)));
-
- TRACE(first_id << " - " << last_id << ". " << overlap_first.start_pos << " - " <<
- overlap_first.end_pos << ", " << overlap_last.start_pos << " - " <<
- overlap_last.end_pos);
-
- TRACE(rc_last_id << " - " << rc_first_id << ". " << overlap_last_rc.start_pos << " - " <<
- overlap_last_rc.end_pos << ", " << overlap_first_rc.start_pos << " - " <<
- overlap_first_rc.end_pos);
- }
- }
- }
- }
-
- TRACE("Overlapped contigs map. Size - " << ToString(overlap_map.Size()) << endl <<
- overlap_map);
-
- FillOverlapGraphByMap(overlap_map, og);
-
- string fname = dsp_cfg::get().io.output_dir + "default_overlap_graph.dot";
- OverlapgraphToDot(fname, og, contigs);
-
- INFO("Overlap graph with " + ToString(og.Vertices().size()) + " vertices and " +
- ToString(og.Edges().size()) + " edges constructed");
-
- auto og_vertices = og.Vertices();
- auto edges = og.Edges();
-
- SimplifyOverlapGraph(og, 10, 5);
-
- INFO("Simplified overlap graph contains " + ToString(og.Vertices().size()) + " vertices and " +
- ToString(og.Edges().size()) + " edges");
-
- fname = dsp_cfg::get().io.output_dir + "simplified_overlap_graph.dot";
- OverlapgraphToDot(fname, og, contigs);
-
- UniquePathsSearcher ps(og);
- auto paths = ps.FindLongPaths();
-	TRACE(paths.size() << " paths were found in the overlap graph");
-
- ContigStoragePtr new_storage(new SimpleContigStorage());
- size_t i = 1;
- for(auto p = paths.begin(); p != paths.end(); p++){
- VERIFY(p->size() > 0);
- if(p->size() == 1){
- TRACE("Consensus contig " << i << " is simple");
- auto contig = contigs->GetContigById((*p)[0]);
- MappingContigPtr new_rc(new ReplacedNameMappingContig(contig,
- get_composite_contig_name(i, contig->length())));
- new_storage->Add(new_rc);
- }
- else{
- TRACE("Consensus contig " << i << " is composite");
-
- vector<pair<Range, Range> > overlaps;
- vector<MappingContigPtr> mc_vect;
- for(size_t i = 0; i < p->size() - 1; i++)
- overlaps.push_back(overlap_map.Ranges((*p)[i], (*p)[i + 1]));
-
- for(auto id = p->begin(); id != p->end(); id++)
- mc_vect.push_back(contigs->GetContigById(*id));
-
- MappingContigPtr new_mc(new CompositeMappingContig(g_, k_value_,
- mc_vect, overlaps));
- new_mc->ChangeName(get_composite_contig_name(i, new_mc->length()));
- new_storage->Add(new_mc);
- }
- i++;
- }
-
- INFO("Computing overlaps ends");
-
- return new_storage;
- }
-
-private:
- DECL_LOGGER("OverlapCorrector");
-};
-
-}
diff --git a/src/dipspades/consensus_contigs_constructor/contig_correctors/redundant_contig_remover.hpp b/src/dipspades/consensus_contigs_constructor/contig_correctors/redundant_contig_remover.hpp
deleted file mode 100644
index 7a71a10..0000000
--- a/src/dipspades/consensus_contigs_constructor/contig_correctors/redundant_contig_remover.hpp
+++ /dev/null
@@ -1,891 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include "abstract_contig_corrector.hpp"
-
-using namespace debruijn_graph;
-
-namespace dipspades {
-
-class LoopBulgeDeletionCorrector : public AbstractContigCorrector{
-
-protected:
- size_t k_value_;
-
- size_t max_loop_length_;
- size_t max_tail_length_;
- size_t min_lcs_length_;
- VertexPathIndex &path_index_;
-
- ostream &out_;
-
-public:
- vector<VertexId> GetListOfVertices(vector<EdgeId> path){
- return get_list_of_vertices_in_path(g_, path);
- }
-
- size_t ConvInd(size_t ind, size_t size){
- if(ind == size)
- return ind - 1;
- return ind;
- }
-
- bool IsLoopCorrect(vector<EdgeId> path, size_t i1, size_t i2){
- VERIFY(i1 < path.size());
- VERIFY(i2 < path.size());
-
- size_t length = 0;
- for(size_t i = i1; i <= i2; i++)
- length += g_.length(path[i]);
-
- return length <= max_loop_length_;
- }
-
- bool IsRegionLoop(vector<EdgeId> path, size_t i1, size_t i2){
- return g_.EdgeStart(path[i1]) == g_.EdgeEnd(path[i2]);
- }
-
- bool IsPathCorrect(vector<EdgeId> path, vector<size_t> pos){
- return (IsLeftTailCorrect(path, pos) && IsRightTailCorrect(path, pos));
- }
-
- bool IsRegionBulgeSide(vector<EdgeId> path, size_t ind1, size_t ind2){
- return g_.EdgeStart(path[ind1]) != g_.EdgeEnd(path[ind2]);
- }
-
- bool AreRegionsBulge(vector<EdgeId> path1, size_t i_11, size_t i_12,
- vector<EdgeId> path2, size_t i_21, size_t i_22){
- return IsRegionBulge(g_, CutSubpathByRegion(path1, make_pair(i_11, i_12)),
- CutSubpathByRegion(path2, make_pair(i_21, i_22)));
- }
-
- bool AreRegionsDiploidBulge(vector<EdgeId> path1, size_t i_11, size_t i_12,
- vector<EdgeId> path2, size_t i_21, size_t i_22){
-
- TRACE("Bulge: pos1: " << i_11 << " - " << i_12 << ", pos2: " << i_21 << " - " << i_22 );
-
- if(dsp_cfg::get().cc.align_bulge_sides){
- Bulge bulge(g_, k_value_, path1, make_pair(i_11, i_12), path2, make_pair(i_21, i_22));
- return bulge.IsBulgeDiploid(dsp_cfg::get().pbr.rel_bulge_align,
- dsp_cfg::get().pbr.rel_bulge_align);
- }
- return true;
- }
-
- bool IsLCSCorrect(vector<EdgeId> path1, vector<size_t> pos1,
- vector<EdgeId> path2, vector<size_t> pos2){
-
- VERIFY(pos1.size() == pos2.size());
-
- size_t pos_len = pos1.size();
- if(pos_len <= 1)
- return false;
-
- size_t lcs_len = min<size_t>(GetLCSLengthByPath(path1, pos1), GetLCSLengthByPath(path2, pos2));
- size_t path1_len = GetPathLength(g_, path1), path2_len = GetPathLength(g_, path2);
-
- TRACE("LCS length - " << lcs_len);
- TRACE("Path length1 - " << path1_len << ", path length2 - " << path2_len);
-
- if(lcs_len <= min_lcs_length_ &&
- min<size_t>(path1_len, path2_len) > min_lcs_length_){
- return false;
- }
-
- for(size_t i = 0; i < pos_len - 1; i++){
-
- TRACE("Pos1 - " << pos1[i] << ", " << pos1[i + 1]);
- TRACE("Pos2 - " << pos2[i] << ", " << pos2[i + 1]);
- // if the matched positions are not consecutive in at least one path
- if(pos1[i] + 1 != pos1[i + 1] || pos2[i] + 1 != pos2[i + 1]){
-
- TRACE("1st loop checking");
- bool is_1st_loop = false;
- bool is_1st_corr = true;
-
- size_t i_11, i_12;
- if(pos1[i] + 1 != pos1[i + 1]){
- TRACE("Positions are not consecutive");
- // it may be a loop
- i_11 = ConvInd(pos1[i], path1.size());
- i_12 = ConvInd(pos1[i + 1], path1.size()) - 1;
-
- is_1st_loop = IsRegionLoop(path1, i_11, i_12);
- TRACE("Is loop - " << is_1st_loop);
- if(is_1st_loop){
- is_1st_corr = IsLoopCorrect(path1, i_11, i_12);
- }
- else{ // then region is bulge
- VERIFY(IsRegionBulgeSide(path1, i_11, i_12));
- }
- }
- else{
- i_11 = pos1[i];
- i_12 = pos1[i];
- }
-
- TRACE("2nd loop checking");
- bool is_2nd_loop = false;
- bool is_2nd_corr = true;
- size_t i_21, i_22;
- if(pos2[i] + 1 != pos2[i + 1]){
- TRACE("Positions are not consecutive");
- // it may be a loop
- i_21 = ConvInd(pos2[i], path2.size());
- i_22 = ConvInd(pos2[i + 1], path2.size()) - 1;
-
- is_2nd_loop = IsRegionLoop(path2, i_21, i_22);
- TRACE("Is loop - " << is_2nd_loop );
- if(is_2nd_loop){
- is_2nd_corr = IsLoopCorrect(path2, i_21, i_22);
- }
- else{
- VERIFY(IsRegionBulgeSide(path2, i_21, i_22));
- }
- }
- else{
- i_21 = pos2[i];
- i_22 = pos2[i];
- }
-
- if(!is_1st_loop && !is_2nd_loop){
-
- i_12 = (pos1[i + 1] == path1.size()) ? path1.size() - 1 : i_12;
- i_22 = (pos2[i + 1] == path2.size()) ? path2.size() - 1 : i_22;
-
- if(AreRegionsBulge(path1, i_11, i_12, path2, i_21, i_22))
- if(!AreRegionsDiploidBulge(path1, i_11, i_12, path2, i_21, i_22))
- return false;
- }
- else
- if(!is_1st_corr || !is_2nd_corr){
- return false;
- }
- }
- }
-
- return true;
- }
-
- size_t GetLCSLengthByPath(vector<EdgeId> path, vector<size_t> pos){
- if(pos.size() <= 1)
- return 0;
- size_t pos_len = pos.size();
- size_t last_pos = pos[pos_len - 1];
-
- size_t ind_start = ConvInd(pos[0], path.size());
- size_t ind_end = ConvInd(last_pos, path.size());
- if(last_pos != path.size())
- ind_end--;
-
- size_t len = 0;
- for(size_t i = ind_start; i <= ind_end; i++)
- len += g_.length(path[i]);
-
- return len;
- }
-
- size_t GetLeftTailLength(vector<EdgeId> path, vector<size_t> pos){
- if(pos.size() <= 1)
- return 0;
-
- size_t first_pos = ConvInd(pos[0], path.size());
-
- size_t tail_len = 0;
- for(size_t i = 0; i < first_pos; i++)
- tail_len += g_.length(path[i]);
-
- return tail_len;
- }
-
- bool IsLeftTailCorrect(vector<EdgeId> path, vector<size_t> pos){
- if(pos.size() <= 1)
- return false;
-
- size_t tail_len = GetLeftTailLength(path, pos);
-// TRACE("Left tail length - " << tail_len );
- return (tail_len <= max_tail_length_);
- }
-
- size_t GetRightTailLength(vector<EdgeId> path, vector<size_t> pos){
- if(pos.size() <= 1)
- return 0;
-
- size_t last_pos = pos[pos.size() - 1];
-
- size_t tail_len = 0;
- for(size_t i = last_pos; i < path.size(); i++)
- tail_len += g_.length(path[i]);
-
- return tail_len;
- }
-
- bool IsRightTailCorrect(vector<EdgeId> path, vector<size_t> pos){
- if(pos.size() <= 1)
- return false;
-
- size_t tail_len = GetRightTailLength(path, pos);
-// TRACE("Right tail length - " << tail_len );
- return (tail_len <= max_tail_length_);
- }
-
- bool AreLeftTailsCorrect(vector<EdgeId> path1, vector<size_t> pos1,
- vector<EdgeId> path2, vector<size_t> pos2){
-
- VERIFY(pos1.size() == pos2.size());
-
- if(pos1.size() <= 1)
- return false;
-
- size_t tail_length1 = GetLeftTailLength(path1, pos1);
- size_t tail_length2 = GetLeftTailLength(path2, pos2);
-
- if(min<size_t>(tail_length1, tail_length2) > max_tail_length_)
- return false;
-
- VertexId start1 = g_.EdgeStart(path1[0]); //g_.EdgeStart(path1[first_pos1]);
- VertexId start2 = g_.EdgeStart(path2[0]); //g_.EdgeStart(path2[first_pos2]);
-
- bool are_tails_correct = false;
- if(g_.IncomingEdgeCount(start1) == 0 &&
- g_.IncomingEdgeCount(start2) == 0){
- are_tails_correct = true;
- }
- else{
-
- if(dsp_cfg::get().cc.tails_lie_on_bulges){
- // find a vertex v such that both path starts are reachable from v
- auto path_searcher1 = DijkstraHelper<Graph>::CreateBackwardBoundedDijkstra(g_,
- max_tail_length_);
- path_searcher1.Run(start1);
- auto reached_vert1 = path_searcher1.ReachedVertices();
-
- auto path_searcher2 = DijkstraHelper<Graph>::CreateBackwardBoundedDijkstra(g_,
- max_tail_length_);
- path_searcher2.Run(start2);
- auto reached_vert2 = path_searcher2.ReachedVertices();
-
- for(size_t i = 0; i < pos1[0]; i++)
- reached_vert1.push_back(g_.EdgeStart(path1[i]));
-
- for(size_t i = 0; i < pos2[0]; i++)
- reached_vert2.push_back(g_.EdgeStart(path2[i]));
-
- for(auto v1 = reached_vert1.begin(); v1 != reached_vert1.end(); v1++){
- for(auto v2 = reached_vert2.begin(); v2 != reached_vert2.end(); v2++){
- if(*v1 == *v2){
- are_tails_correct = true;
- break;
- }
- }
- if(are_tails_correct)
- break;
- }
- }
- else{
- are_tails_correct = true;
- }
- }
-
- if(!are_tails_correct)
- return false;
-
- if(!dsp_cfg::get().cc.align_bulge_sides)
- return true;
-
- Sequence tail_seq1 = GetSequenceOfPathRegion(g_, k_value_, path1,
- pair<size_t, size_t>(0, pos1[0] - 1));
-
- Sequence tail_seq2 = GetSequenceOfPathRegion(g_, k_value_, path2,
- pair<size_t, size_t>(0, pos2[0] - 1));
-
- Sequence trim_seq1, trim_seq2;
- if(min<size_t>(tail_seq1.size(), tail_seq2.size()) == tail_seq1.size()){
- trim_seq1 = tail_seq1;
- trim_seq2 = tail_seq2.Subseq(tail_seq2.size() - tail_seq1.size(),
- tail_seq2.size());
- }
- else{
- trim_seq1 = tail_seq1.Subseq(tail_seq1.size() - tail_seq2.size(),
- tail_seq1.size());
- trim_seq2 = tail_seq2;
- }
-
- if(trim_seq1.size() > max_tail_length_)
- return false;
-
- return RelAlignmentOfSequences(trim_seq1, trim_seq2) <=
- dsp_cfg::get().pbr.rel_bulge_align;
-
- }
-
- bool AreRightTailsCorrect(vector<EdgeId> path1, vector<size_t> pos1,
- vector<EdgeId> path2, vector<size_t> pos2){
-
- VERIFY(pos1.size() == pos2.size());
-
- if(pos1.size() <= 1)
- return false;
-
- size_t tail_length1 = GetRightTailLength(path1, pos1);
- size_t tail_length2 = GetRightTailLength(path2, pos2);
-
- if(min<size_t>(tail_length1, tail_length2) > max_tail_length_)
- return false;
-
- VertexId end1 = g_.EdgeEnd(path1[path1.size() - 1]);
- VertexId end2 = g_.EdgeEnd(path2[path2.size() - 1]);
-
- bool are_tails_correct = false;
-
- if(g_.OutgoingEdgeCount(end1) == 0 && g_.OutgoingEdgeCount(end2) == 0){
- are_tails_correct = true;
- }
- else{
-
- if(dsp_cfg::get().cc.tails_lie_on_bulges){
- // find a vertex v such that both path ends are reachable from v
- auto path_searcher1 = DijkstraHelper<Graph>::CreateBackwardBoundedDijkstra(g_,
- max_tail_length_);
- path_searcher1.Run(end1);
- auto reached_vert1 = path_searcher1.ReachedVertices();
-
- auto path_searcher2 = DijkstraHelper<Graph>::CreateBackwardBoundedDijkstra(g_,
- max_tail_length_);
- path_searcher2.Run(end2);
- auto reached_vert2 = path_searcher2.ReachedVertices();
-
- for(size_t i = ConvInd(pos1[pos1.size() - 1], path1.size()); i < path1.size(); i++)
- reached_vert1.push_back(g_.EdgeEnd(path1[i]));
-
- for(size_t i = ConvInd(pos2[pos2.size() - 1], path2.size()); i < path2.size(); i++)
- reached_vert2.push_back(g_.EdgeEnd(path2[i]));
-
- for(auto v1 = reached_vert1.begin(); v1 != reached_vert1.end(); v1++){
- for(auto v2 = reached_vert2.begin(); v2 != reached_vert2.end(); v2++){
- if(*v1 == *v2){
- are_tails_correct = true;
- break;
- }
- }
- if(are_tails_correct)
- break;
- }
- }
- else{
- // tail lengths comparison?
- are_tails_correct = true;
- }
- }
-
- if(!are_tails_correct)
- return false;
-
- if(!dsp_cfg::get().cc.align_bulge_sides)
- return true;
-
- Sequence tail_seq1 = GetSequenceOfPathRegion(g_, k_value_, path1,
- pair<size_t,size_t>(pos1[pos1.size() - 1], path1.size() - 1));
-
- Sequence tail_seq2 = GetSequenceOfPathRegion(g_, k_value_, path2,
- pair<size_t,size_t>(pos2[pos2.size() - 1], path2.size() - 1));
-
- Sequence trim_seq1, trim_seq2;
- if(min<size_t>(tail_seq1.size(), tail_seq2.size()) == tail_seq1.size()){
- trim_seq1 = tail_seq1;
- trim_seq2 = tail_seq2.Subseq(0, tail_seq1.size());
- }
- else{
- trim_seq1 = tail_seq1.Subseq(0, tail_seq2.size());
- trim_seq2 = tail_seq2;
- }
-
- if(trim_seq1.size() > max_tail_length_)
- return false;
-
- return (RelAlignmentOfSequences(trim_seq1, trim_seq2) <= dsp_cfg::get().pbr.rel_bulge_align);
- }
-
- bool IsLeftTailExist(vector<EdgeId>, vector<size_t> pos){
- size_t first_index = pos[0]; //ConvInd(pos[0], path.size());
- return (first_index != 0);
- }
-
- bool AreBothLeftTailsExist(vector<EdgeId> path1, vector<size_t> pos1,
- vector<EdgeId> path2, vector<size_t> pos2){
-
- VERIFY(pos1.size() == pos2.size());
- if(pos1.size() == 0)
- return false;
-
- TRACE("Left: " << IsLeftTailExist(path1, pos2) << " " << IsLeftTailExist(path2, pos2) );
- return (IsLeftTailExist(path1, pos1) && IsLeftTailExist(path2, pos2));
- }
-
- bool IsRightTailExist(vector<EdgeId> path, vector<size_t> pos){
- size_t last_index = pos[pos.size() - 1]; //ConvInd(pos[pos.size() - 1], path.size());
- return (last_index != path.size());
- }
-
- bool AreBothRightTailsExist(vector<EdgeId> path1, vector<size_t> pos1,
- vector<EdgeId> path2, vector<size_t> pos2){
- VERIFY(pos1.size() == pos2.size());
- if(pos1.size() == 0)
- return false;
-
- TRACE("Right: " << IsRightTailExist(path1, pos1) << " " << IsRightTailExist(path2, pos2) );
- return IsRightTailExist(path1, pos1) && IsRightTailExist(path2, pos2);
-
- }
-
- // yana todo replace
- vector<VertexId> RearrangementSearch(vector<EdgeId> path1, vector<EdgeId> path2){
-
- vector<VertexId> common_vertices;
- if(path1.size() == 0 || path2.size() == 0)
- return common_vertices;
-
- map<VertexId, int> vertex_count;
-
- set<VertexId> vertices1; vertices1.insert(g_.EdgeStart(path1[0]));
- for(auto e = path1.begin(); e != path1.end(); e++){
- vertices1.insert(g_.EdgeEnd(*e));
- }
-
- set<VertexId> vertices2; vertices2.insert(g_.EdgeStart(path2[0]));
- for(auto e = path2.begin(); e != path2.end(); e++){
- vertices2.insert(g_.EdgeEnd(*e));
- }
-
- for(auto v = vertices1.begin(); v != vertices1.end(); v++)
- vertex_count[*v]++;
-
- for(auto v = vertices2.begin(); v != vertices2.end(); v++)
- vertex_count[*v]++;
-
- for(auto it = vertex_count.begin(); it != vertex_count.end(); it++)
- if(it->second == 2)
- common_vertices.push_back(it->first);
-
-// TRACE("Common vertices: " );
-// PrintVectorOfVertices(cout, g_, common_vertices);
- return common_vertices;
- }
-
- bool ArePathsCorrect(vector<EdgeId> path1, vector<size_t> pos1,
- vector<EdgeId> path2, vector<size_t> pos2){
-
- VERIFY(pos1.size() == pos2.size());
-
- if(AreBothLeftTailsExist(path1, pos1, path2, pos2))
- {
- TRACE("Both left tails exist" );
- bool tail_corr = AreLeftTailsCorrect(path1, pos1, path2, pos2);
- if(!tail_corr){
- TRACE("One of left tails is not correct" );
- return false;
- }
- }
-
- if(AreBothRightTailsExist(path1, pos1, path2, pos2))
- {
- TRACE("Both right tails exist" );
- bool tail_corr = AreRightTailsCorrect(path1, pos1, path2, pos2);
- if(!tail_corr){
- TRACE("One of right tails is not correct" );
- return false;
- }
- }
-
- bool lcs_corr = IsLCSCorrect(path1, pos1, path2, pos2);
-
- if(!lcs_corr){
- TRACE("LCS is not correct" );
- auto common_vert = RearrangementSearch(path1, path2);
- if(common_vert.size() > pos1.size())
- TRACE("Possible rearrangement!");
- return false;
- }
-
- size_t lcs_length1 = GetLCSLengthByPath(path1, pos1),
- lcs_length2 = GetLCSLengthByPath(path2, pos2);
-
- return (min<size_t>(lcs_length1, lcs_length2) > 0);
- }
-
- bool IsPathRedundant(vector<EdgeId> path, vector<size_t> pos){
- if(pos.size() <= 1) return true;
- return IsLeftTailCorrect(path, pos) && IsRightTailCorrect(path, pos);
- }
-
- void CorrectPositionVertor(vector<EdgeId> path, vector<size_t> & pos){
- if(pos.size() <= 1)
- return;
-
- for(size_t i = 0; i < pos.size() - 1; i++){
- if(pos[i] + 1 != pos[i + 1]){
-
- size_t i1 = ConvInd(pos[i], path.size()) + 1;
- size_t i2 = ConvInd(pos[i + 1], path.size()) - 1;
-
- if(IsRegionLoop(path, i1, i2))
- if(!IsLoopCorrect(path, i1, i2)){
- VertexId v;
- if(pos[i + 1] == path.size())
- v = g_.EdgeEnd(path[path.size() - 1]);
- else
- v = g_.EdgeStart(path[pos[i + 1]]);
- for(size_t j = pos[i] + 1; j < pos[i + 1]; j++)
- if(g_.EdgeStart(path[j]) == v){
- pos[i + 1] = j;
- break;
- }
- }
- }
- }
- }
-
- size_t GetNumberOfErrorsFromLCS(vector<EdgeId> path, vector<size_t> pos){
- if(pos.size() <= 1)
- return 0;
-
- size_t error_num = 0;
-
- for(size_t i = 0; i < pos.size() - 1; i++){
- if(pos[i] + 1 != pos[i + 1]){
-
- size_t i1 = ConvInd(pos[i], path.size()) + 1;
- size_t i2 = ConvInd(pos[i + 1], path.size()) - 1;
-
- if(IsRegionLoop(path, i1, i2)){
- if(!IsLoopCorrect(path, i1, i2))
- error_num++;
- }
- }
- }
-
- return error_num;
- }
-
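- // The LCS positions can be anchored to the leftmost or to the rightmost
- // occurrence in each path; pick the combination with fewer incorrect loops
- // and, on a tie, the closest LCS lengths between the two paths.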
- pair<vector<size_t>, vector<size_t> > GetBestPosVectors(LCSCalculator<VertexId> & calc,
- vector<EdgeId> path1, vector<VertexId> vert_path1,
- vector<EdgeId> path2, vector<VertexId> vert_path2,
- vector<VertexId> lcs){
-
- // first path processing
- auto pos_right1 = calc.GetPosVector(vert_path1, lcs);
- auto pos_left1 = calc.GetPosVectorFromLeft(vert_path1, lcs);
-
- bool equal_num_err1 = true;
- vector<size_t> best_vect1;
-
- {
- size_t err_right1 = GetNumberOfErrorsFromLCS(path1, pos_right1);
- size_t err_left1 = GetNumberOfErrorsFromLCS(path1, pos_left1);
- equal_num_err1 = err_left1 == err_right1;
- best_vect1 = (err_left1 < err_right1) ? pos_left1 : pos_right1;
- }
-
- size_t lcs_right1 = GetLCSLengthByPath(path1, pos_right1);
- size_t lcs_left1 = GetLCSLengthByPath(path1, pos_left1);
-
- // second path processing
- auto pos_right2 = calc.GetPosVector(vert_path2, lcs);
- auto pos_left2 = calc.GetPosVectorFromLeft(vert_path2, lcs);
-
- bool equal_num_err2 = true;
- vector<size_t> best_vect2;
-
- {
- size_t err_right2 = GetNumberOfErrorsFromLCS(path2, pos_right2);
- size_t err_left2 = GetNumberOfErrorsFromLCS(path2, pos_left2);
- equal_num_err2 = err_left2 == err_right2;
- best_vect2 = (err_left2 < err_right2) ? pos_left2 : pos_right2;
- }
-
- size_t lcs_right2 = GetLCSLengthByPath(path2, pos_right2);
- size_t lcs_left2 = GetLCSLengthByPath(path2, pos_left2);
-
- if(equal_num_err1 && !equal_num_err2){
-
- size_t best_lcs2 = GetLCSLengthByPath(path2, best_vect2);
-
- if(abs_diff(lcs_right1, best_lcs2) < abs_diff(lcs_left1, best_lcs2))
- return pair<vector<size_t>, vector<size_t> >(pos_right1, best_vect2);
- else
- return pair<vector<size_t>, vector<size_t> >(pos_left1, best_vect2);
- }
-
- if(!equal_num_err1 && equal_num_err2){
- size_t best_lcs1 = GetLCSLengthByPath(path1, best_vect1);
-
- if(abs_diff(lcs_right2, best_lcs1) < abs_diff(lcs_left2, best_lcs1))
- return pair<vector<size_t>, vector<size_t> >(best_vect1, pos_right2);
- else
- return pair<vector<size_t>, vector<size_t> >(best_vect1, pos_left2);
- }
-
- if(equal_num_err1 && equal_num_err2){
-
- // best pair computing
- size_t left_left = abs_diff(lcs_left1, lcs_left2);
- size_t left_right = abs_diff(lcs_left1, lcs_right2);
- size_t right_left = abs_diff(lcs_right1, lcs_left2);
- size_t right_right = abs_diff(lcs_right1, lcs_right2);
-
- size_t min_diff = min<size_t>(min<size_t>(left_left, left_right),
- min<size_t>(right_left, right_right));
-
- if(min_diff == left_left){
- return pair<vector<size_t>, vector<size_t> >(pos_left1, pos_left2);
- }
-
- if(min_diff == left_right){
- return pair<vector<size_t>, vector<size_t> >(pos_left1, pos_right2);
- }
-
- if(min_diff == right_left){
- return pair<vector<size_t>, vector<size_t> >(pos_right1, pos_left2);
- }
-
- if(min_diff == right_right){
- return pair<vector<size_t>, vector<size_t> >(pos_right1, pos_right2);
- }
- }
-
- return pair<vector<size_t>, vector<size_t> >(best_vect1, best_vect2);
- }
-
- vector<size_t> GetBestPosVector(LCSCalculator<VertexId> & calc, vector<EdgeId> path,
- vector<VertexId> vert_path, vector<VertexId> lcs){
-
- auto pos_right = calc.GetPosVector(vert_path, lcs);
- auto pos_left = calc.GetPosVectorFromLeft(vert_path, lcs);
-
- size_t err_right = GetNumberOfErrorsFromLCS(path, pos_right);
- size_t err_left = GetNumberOfErrorsFromLCS(path, pos_left);
-
- if(min<size_t>(err_left, err_right) == err_left)
- return pos_left;
- else
- return pos_right;
- }
-
- void InitializeMap(ContigStoragePtr contigs){
- for(size_t i = 0; i < contigs->Size(); i++){
- size_t id = (*contigs)[i]->id();
- res.redundancy_map.AddNewKey(id);
- }
- }
-
- void AddRedundantContig(ContigStoragePtr contigs, size_t index_red, size_t index_main){
- size_t id_main = (*contigs)[index_main]->id(),
- id_rc_main = (*contigs)[index_main]->rc_id();
- size_t id_red = (*contigs)[index_red]->id(),
- id_rc_red = (*contigs)[index_red]->rc_id();
- redundant_contigs.insert(id_red);
- redundant_contigs.insert(id_rc_red);
- // current contig
- res.redundancy_map.AddNewPair(id_main, id_red);
- res.redundancy_map.AddNewPair(id_rc_main, id_rc_red);
- }
-
- CorrectionResult res;
- set<size_t> redundant_contigs;
-
-public:
- LoopBulgeDeletionCorrector(Graph &g, size_t k_value, size_t max_loop_length,
- size_t max_tail_length, size_t min_lcs_length, VertexPathIndex &path_index,
- ostream &out = cout) : AbstractContigCorrector(g), k_value_(k_value),
- path_index_(path_index), out_(out) {
-
- max_loop_length_ = max_loop_length;
- max_tail_length_ = max_tail_length;
- min_lcs_length_ = min_lcs_length;
- }
-
- virtual ContigStoragePtr Correct(ContigStoragePtr contigs) {
-
- INFO("Computing redundant contigs starts");
-
- redundant_contigs.clear();
-
- InitializeMap(contigs);
-
- LCSCalculator<VertexId> lcs_calc;
-
- vector<vector<VertexId> > seqs;
- for(size_t i = 0; i < contigs->Size(); i++){
- vector<VertexId> seq = GetListOfVertices((*contigs)[i]->path_seq());
- seqs.push_back(seq);
- }
-
- set<size_t> processed_contigs;
- set<size_t> absolutely_redundant;
-
- size_t contigs_number = seqs.size();
- double processed_perc = 0.1;
- double processed_step = 0.1;
-
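- // pairwise comparison: each contig is checked only against contigs whose
- // paths intersect it (taken from the vertex path index); of two matching
- // contigs, the one with longer unaligned tails is marked as redundant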
- for(size_t i = 0; i < seqs.size() - 1; i++){
-
- size_t id_i = (*contigs)[i]->id();
- size_t rc_id_i = (*contigs)[i]->rc_id();
-
- processed_contigs.insert(id_i);
-
- if(processed_contigs.find(rc_id_i) == processed_contigs.end() &&
- absolutely_redundant.find(i) == absolutely_redundant.end()){
-
- vector<EdgeId> path1 = (*contigs)[i]->path_seq();
- set<int> analyzed_contigs;
-
- auto contigs_for_analyze = path_index_.GetPathsIntersectedWith(path1);
- for(auto it = contigs_for_analyze.begin(); it != contigs_for_analyze.end(); it++){
-
- size_t j = *it;
- size_t id_j = (*contigs)[j]->id();
- size_t rc_id_j = (*contigs)[j]->rc_id();
-
- bool need_process = !((i % 2 == 0 && i + 1 == j) || j <= i);
- need_process = need_process &&
- absolutely_redundant.find(j) == absolutely_redundant.end();
- if(need_process){
-
- vector<EdgeId> path2 = (*contigs)[j]->path_seq();
- vector<VertexId> lcs_res = lcs_calc.LCS(seqs[i], seqs[j]);
- vector<size_t> pos1, pos2;
-
- auto pos_vectors_pair = GetBestPosVectors(lcs_calc, path1, seqs[i], path2, seqs[j], lcs_res);
- pos1 = pos_vectors_pair.first;
- pos2 = pos_vectors_pair.second;
-
- {
- TRACE("--------------------------------");
- TRACE("Indexes " << i << " " << j);
- TRACE("IDs " << id_i << " " << id_j);
- TRACE("RC_Ids " << rc_id_i << " " << rc_id_j);
-
- TRACE("Path1. " << SimplePathWithVerticesToString(g_, path1));
- TRACE("Path2. " << SimplePathWithVerticesToString(g_, path2));
-
- TRACE("LCS string: " << VerticesVectorToString(g_, lcs_res));
-
- TRACE("Pos1. " << VectorToString<size_t>(pos1));
- TRACE("Pos2. " << VectorToString<size_t>(pos2));
- }
-
- if(pos1.size() > 1){
-
- bool paths_corr = ArePathsCorrect(path1, pos1, path2, pos2);
-
- {
- TRACE("ArePathsCorrect - " << paths_corr);
- }
-
- if(paths_corr){
-
- size_t first_tail1 = GetLeftTailLength(path1, pos1);
- size_t first_tail2 = GetRightTailLength(path1, pos1);
- size_t first_tails = first_tail1 + first_tail2;
-
- size_t second_tail1 = GetLeftTailLength(path2, pos2);
- size_t second_tail2 = GetRightTailLength(path2, pos2);
- size_t second_tails = second_tail1 + second_tail2;
-
- bool first_path_red = IsPathRedundant(path1, pos1);
- bool second_path_red = IsPathRedundant(path2, pos2);
-
- {
- TRACE("\tFirst tails length - " << first_tails);
- TRACE("\tFirst path is redundant - " << first_path_red);
- TRACE("\tSecond tails length - " << second_tails);
- TRACE("\tSecond path is redundant - " << second_path_red);
- }
-
- if(first_path_red && second_path_red){
- if(first_tails < second_tails){
- TRACE(id_i << " is redundant");
- AddRedundantContig(contigs, i, j);
-
- if(first_tails == 0)
- absolutely_redundant.insert(i);
- }
- else{
- TRACE(id_j << " is redundant");
- AddRedundantContig(contigs, j, i);
-
- if(second_tails == 0)
- absolutely_redundant.insert(j);
- }
- }
- else{
- if(first_path_red && !second_path_red){
- TRACE(id_i << " is redundant");
- AddRedundantContig(contigs, i, j);
-
- if(first_tails == 0)
- absolutely_redundant.insert(i);
-
- }
- else
- if(!first_path_red && second_path_red){
- TRACE(id_j << " is redundant");
- AddRedundantContig(contigs, j, i);
-
- if(second_tails == 0)
- absolutely_redundant.insert(j);
- }
- }
-
- if(absolutely_redundant.find(i) != absolutely_redundant.end())
- break;
- }
- }
- }
- }
- }
-
- double cur_process_perc = static_cast<double>(i) / static_cast<double>(contigs_number);
- if(cur_process_perc > processed_perc) {
- while(processed_perc + processed_step <= cur_process_perc)
- processed_perc += processed_step;
- INFO(ToString(processed_perc * 100.0) << "% contigs were processed");
- processed_perc += processed_step;
- }
- }
- INFO("100% contigs were processed");
-
- RedundancyMapCondenser<size_t> condenser;
- condenser.Condense(res.redundancy_map);
-
- INFO(ToString(redundant_contigs.size()) + " contigs from " + ToString(contigs->Size()) + " are redundant");
-
- contigs->DeleteByIDs(redundant_contigs);
-
- INFO("Computing redundant contigs ends");
-
- return contigs;
- }
-
- MappingContigPtr Correct(MappingContigPtr contig){
- return contig;
- }
-
- CorrectionResult Results(){
- return res;
- }
-
- virtual ~LoopBulgeDeletionCorrector(){}
-
-protected:
- DECL_LOGGER("LoopBulgeDeletionCorrector");
-};
-
-}
diff --git a/src/dipspades/consensus_contigs_constructor/contig_correctors/same_edge_deletion_corrector.hpp b/src/dipspades/consensus_contigs_constructor/contig_correctors/same_edge_deletion_corrector.hpp
deleted file mode 100644
index ccee4bc..0000000
--- a/src/dipspades/consensus_contigs_constructor/contig_correctors/same_edge_deletion_corrector.hpp
+++ /dev/null
@@ -1,71 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include "abstract_contig_corrector.hpp"
-
-using namespace debruijn_graph;
-
-namespace dipspades {
-
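-// Collapses runs of identical consecutive edges in a contig mapping path,
-// merging their mapping ranges into a single widened range.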
-class SameEdgeDeletionCorrector : public AbstractContigCorrector{
-
- MappingRange WideMappingRange(MappingRange old_range, MappingRange new_range){
-
- size_t initial_range_start_pos = min<size_t>(old_range.initial_range.start_pos, new_range.initial_range.start_pos);
- size_t initial_range_end_pos = max<size_t>(old_range.initial_range.end_pos, new_range.initial_range.end_pos);
- size_t mapped_range_start_pos = min<size_t>(old_range.mapped_range.start_pos, new_range.mapped_range.start_pos);
- size_t mapped_range_end_pos = max<size_t>(old_range.mapped_range.end_pos, new_range.mapped_range.end_pos);
-
- Range init(initial_range_start_pos, initial_range_end_pos), mapp(mapped_range_start_pos, mapped_range_end_pos);
- MappingRange res(init, mapp);
- return res;
- }
-
-public:
- SameEdgeDeletionCorrector(Graph &g) : AbstractContigCorrector(g) {
- }
-
- ContigStoragePtr Correct(ContigStoragePtr contigs) {
- for(size_t i = 0; i < contigs->Size(); i++)
- contigs->ReplaceContig(Correct((*contigs)[i]), i);
- TRACE(contigs->Size() << " contigs from " << contigs->Size() << " were corrected");
- return contigs;
- }
-
- MappingContigPtr Correct(MappingContigPtr contig){
- MappingPath<EdgeId> map_path = contig->mapping_path();
-
- if(map_path.size() <= 0)
- return contig;
-
- vector<EdgeId> new_path;
- vector<MappingRange> new_ranges;
- EdgeId cur_edge = map_path[0].first;
- new_path.push_back(cur_edge);
- new_ranges.push_back(map_path[0].second);
-
- for (size_t i = 1; i < map_path.size(); i++) {
- EdgeId e = map_path[i].first;
- if (e != cur_edge) {
- cur_edge = e;
- new_path.push_back(e);
- new_ranges.push_back(map_path[i].second);
- }
- else {
- new_ranges[new_ranges.size() - 1] = WideMappingRange(
- new_ranges[new_ranges.size() - 1], map_path[i].second);
- }
- }
-
- MappingPath<EdgeId> new_map_path(new_path, new_ranges);
- return MappingContigPtr(new ReplacedPathMappingContig(contig, new_map_path));
- }
-};
-
-}
diff --git a/src/dipspades/consensus_contigs_constructor/mapping_contig.hpp b/src/dipspades/consensus_contigs_constructor/mapping_contig.hpp
deleted file mode 100644
index c651400..0000000
--- a/src/dipspades/consensus_contigs_constructor/mapping_contig.hpp
+++ /dev/null
@@ -1,381 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-/*
- * mapping_contig.hpp
- *
- * Created on: 13.11.2012
- * Author: yana
- */
-
-#pragma once
-#include "overlap_graph.hpp"
-#include "../../include/omni/omni_utils.hpp"
-#include "../utils/element_printers.hpp"
-#include <map>
-
-using namespace debruijn_graph;
-
-namespace dipspades {
-
-class Loop{
- vector<EdgeId> list_;
-
-public:
- Loop(vector<EdgeId> list) : list_(list) {}
- const vector<EdgeId> edges() { return list_; }
- size_t Size() { return list_.size(); }
-};
-
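-// Interface for a contig together with its mapping onto the de Bruijn graph:
-// nucleotide sequence, edge path, mapping path and id/rc-id bookkeeping.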
-class MappingContig{
-public:
- MappingContig() { }
- virtual Sequence seq() = 0;
- virtual vector<EdgeId> path_seq() = 0;
- virtual MappingPath<EdgeId> mapping_path() = 0;
-
- virtual string name() = 0;
- virtual string src_file() = 0;
- virtual string full_name() { return src_file() + ":" + name(); }
-
- virtual size_t size() = 0;
- virtual size_t length() = 0;
- virtual size_t id(){ return -1; }
- virtual size_t rc_id() = 0;
- virtual vector<shared_ptr<MappingContig> > AllMappingContigs() = 0;
- virtual void ChangeMappingRange(size_t, MappingRange){ }
- virtual void ChangeName(string new_name) = 0;
-
- virtual string ToString(Graph &graph) = 0;
-
- virtual ~MappingContig(){}
-};
-
-typedef shared_ptr<MappingContig> MappingContigPtr;
-
-class SimpleMappingContig : public MappingContig{
- string name_;
- string src_file_;
- Sequence seq_;
- MappingPath<EdgeId> map_path_;
- vector<EdgeId> edge_path_;
- size_t id_, rc_id_;
-
-public:
- SimpleMappingContig(){ }
-
- SimpleMappingContig(string name, string src_file, Sequence seq,
- MappingPath<EdgeId> map_path, size_t id, size_t rc_id) :
- name_(name),
- src_file_(src_file),
- seq_(seq),
- map_path_(map_path),
- edge_path_(map_path_.simple_path()),
- id_(id),
- rc_id_(rc_id) { }
-
- SimpleMappingContig(string name, string src_file, Sequence seq,
- MappingPath<EdgeId> map_path, vector<EdgeId> edge_path,
- size_t id, size_t rc_id) :
- name_(name),
- src_file_(src_file),
- seq_(seq),
- map_path_(map_path),
- edge_path_(edge_path),
- id_(id),
- rc_id_(rc_id) { }
-
- Sequence seq() { return seq_; }
-
- vector<EdgeId> path_seq() { return edge_path_; }
-
- MappingPath<EdgeId> mapping_path() { return map_path_; }
-
- string name() { return name_; }
-
- string src_file() { return src_file_; }
-
- size_t size() { return edge_path_.size(); }
-
- size_t length() { return seq_.size(); }
-
- size_t id(){ return id_; }
-
- size_t rc_id() { return rc_id_; }
-
- vector<MappingContigPtr> AllMappingContigs(){
- return vector<MappingContigPtr>();
- }
-
- void ChangeMappingRange(size_t index, MappingRange new_range){
- VERIFY(index < map_path_.size());
- vector<EdgeId> new_path = map_path_.simple_path();
- vector<MappingRange> new_ranges;
- for(size_t i = 0; i < map_path_.size(); i++)
- if(i != index)
- new_ranges.push_back(map_path_[i].second);
- else
- new_ranges.push_back(new_range);
- MappingPath<EdgeId> new_map_path(new_path, new_ranges);
- map_path_ = new_map_path;
- }
-
- void ChangeName(string new_name) {
- name_ = new_name;
- }
-
- string ToString(Graph &graph) {
- stringstream ss;
- ss << "Id: " << id_ << ". Seq size: " << seq_.size() <<
- ". Map path: " << MappingPathToString(graph, map_path_);
- return ss.str();
- }
-};
-
-class ReplacedPathMappingContig : public MappingContig{
- MappingContigPtr c_;
- vector<EdgeId> new_path_;
- MappingPath<EdgeId> new_map_path_;
-
-public:
- ReplacedPathMappingContig(MappingContigPtr c, vector<EdgeId> new_path) : c_(c), new_path_(new_path) { }
-
- ReplacedPathMappingContig(MappingContigPtr c, MappingPath<EdgeId> new_map_path) : c_(c), new_map_path_(new_map_path) {
- new_path_ = new_map_path_.simple_path();
- }
-
- Sequence seq() { return c_->seq(); }
-
- vector<EdgeId> path_seq() {
- return new_path_;
- }
-
- MappingPath<EdgeId> mapping_path(){
- if(new_map_path_.size() != 0)
- return new_map_path_;
- return c_->mapping_path();
- }
-
- string name() { return c_->name(); }
-
- string src_file() { return c_->src_file(); }
-
- size_t size() { return new_path_.size(); }
-
- size_t length() { return c_->length(); }
-
- size_t id(){ return c_->id(); }
-
- size_t rc_id() { return c_->rc_id(); }
-
- vector<MappingContigPtr> AllMappingContigs(){
- return vector<MappingContigPtr>();
- }
-
- void ChangeName(string new_name) {
- c_->ChangeName(new_name);
- }
-
- string ToString(Graph &graph) {
- if(new_map_path_.size() == 0)
- return c_-> ToString(graph);
- stringstream ss;
- ss << "Id: " << id() << ". Seq size: " << seq().size() <<
- ". Map path: " << MappingPathToString(graph, new_map_path_);
- return ss.str();
- }
-};
-
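-// Glues several overlapping mapping contigs into one composite contig by
-// concatenating their edge paths according to the stored overlap ranges.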
-class CompositeMappingContig : public MappingContig{
-
- Graph &g_;
- size_t k_value_;
-
- vector<MappingContigPtr> contigs_;
- vector<pair<Range, Range> > overlaps_;
-
- string contig_name_;
-
- Sequence composite_seq;
- vector<EdgeId> composite_path;
- size_t composite_size;
-
- size_t IndexOfEdgeByNumberOfVertex(size_t vertex_index){
- if(vertex_index == 0)
- return 0;
- return vertex_index - 1;
- }
-
-public:
- CompositeMappingContig(Graph &g,
- size_t k_value,
- vector<MappingContigPtr> contigs,
- vector<pair<Range, Range> > overlaps) :
- g_(g),
- k_value_(k_value),
- contigs_(contigs),
- overlaps_(overlaps),
- contig_name_("") {
- VERIFY(contigs.size() > 1);
- VERIFY(contigs.size() == overlaps.size() + 1);
- composite_size = 0;
- }
-
- Sequence seq(){
- if(composite_seq.size() == 0){
- vector<EdgeId> comp_path = path_seq();
- composite_seq = GetSequenceByPath(g_, k_value_, comp_path);
- }
- return composite_seq;
- }
-
- vector<EdgeId> path_seq(){
- if(composite_path.size() == 0){
- if(overlaps_.size() == 0){
- if(contigs_.size() == 0)
- return vector<EdgeId>();
- return contigs_[0]->path_seq();
- }
- else{
- TRACE("New composite contig:");
- TRACE("Path construction of composite contig starts");
-
- TRACE("Ranges: ");
- for(auto it = overlaps_.begin(); it != overlaps_.end(); it++)
- TRACE(it->first.start_pos << " - " << it->first.end_pos << ". " <<
- it->second.start_pos << " - " << it->second.end_pos);
-
- // first path processing
- {
- TRACE("First path processing");
- TRACE("Id - " << contigs_[0]->id());
- vector<EdgeId> first_path = contigs_[0]->path_seq();
- size_t end_ind = min<size_t>(IndexOfEdgeByNumberOfVertex(overlaps_[0].first.end_pos),
- first_path.size() - 1);
- for(size_t i = 0; i <= end_ind; i++)
- composite_path.push_back(first_path[i]);
- }
-
- TRACE("Intermediate paths processing");
- // intermediate paths processing
- for(size_t i = 0; i < overlaps_.size() - 1; i++){
- auto cur_path = contigs_[i + 1]->path_seq();
- TRACE("Id: " << contigs_[i + 1]->id());
- size_t start_ind = min<size_t>(IndexOfEdgeByNumberOfVertex(overlaps_[i].second.end_pos) + 1,
- cur_path.size() - 1);
- size_t end_ind = min<size_t>(IndexOfEdgeByNumberOfVertex(overlaps_[i + 1].first.end_pos),
- cur_path.size() - 1);
- TRACE("Start - " << start_ind << ", end - " << end_ind);
- VERIFY(start_ind < cur_path.size() && end_ind < cur_path.size());
- for(size_t j = start_ind; j <= end_ind; j++)
- composite_path.push_back(cur_path[j]);
- }
-
- {
- // last path processing
- TRACE("Last path processing");
- vector<EdgeId> last_path = contigs_[contigs_.size() - 1]->path_seq();
- TRACE("Id: " << contigs_[contigs_.size() - 1]->id());
- size_t start_ind = IndexOfEdgeByNumberOfVertex(overlaps_[overlaps_.size() - 1].second.end_pos) + 1;
- start_ind = min<size_t>(start_ind, last_path.size() - 1);
- size_t end_ind = last_path.size() - 1;
- TRACE("Start - " << start_ind << ", end - " << end_ind);
- VERIFY(start_ind < last_path.size() && end_ind < last_path.size());
- for(size_t i = start_ind; i <= end_ind; i++)
- composite_path.push_back(last_path[i]);
- }
-
- // deletion of the repeated start-end edge
- TRACE("Deletion of repetitive start-end edge");
- if(composite_path[0] == composite_path[composite_path.size() - 1]){
- composite_path.erase(composite_path.begin() + composite_path.size() - 1);
- TRACE("Deletion done");
- }
-
- TRACE("Path construction of composite contig ends");
- }
- }
- return composite_path;
- }
-
- MappingPath<EdgeId> mapping_path(){ return MappingPath<EdgeId>(); } // todo refactor
-
- string name() { return contig_name_; }
-
- string src_file() { return ""; }
-
- size_t size(){
- return path_seq().size();
- }
-
- size_t length() { return seq().size(); }
-
- size_t id(){ return 0; }
-
- size_t rc_id() { return 0; }
-
- void ChangeName(string new_name) {
- contig_name_ = new_name;
- }
-
- vector<MappingContigPtr> AllMappingContigs(){
- return contigs_;
- }
-
- string ToString(Graph &){
- return "Composite contig";
- }
-
-private:
- DECL_LOGGER("CompositeMappingContig");
-};
-
-class ReplacedNameMappingContig : public MappingContig{
- MappingContigPtr c_;
- string contig_name_;
-
-public:
- ReplacedNameMappingContig(MappingContigPtr c, string contig_name) :
- c_(c),
- contig_name_ (contig_name) { }
-
- Sequence seq() { return c_->seq(); }
-
- vector<EdgeId> path_seq() {
- return c_->path_seq();
- }
-
- MappingPath<EdgeId> mapping_path(){
- return c_->mapping_path();
- }
-
- string name() { return contig_name_; }
-
- string src_file() { return c_->src_file(); }
-
- size_t size() { return c_->size(); }
-
- size_t length() { return c_->length(); }
-
- size_t id(){ return c_->id(); }
-
- size_t rc_id() { return c_->rc_id(); }
-
- vector<MappingContigPtr> AllMappingContigs(){
- return c_->AllMappingContigs();
- }
-
- void ChangeName(string new_name) {
- c_->ChangeName(new_name);
- }
-
- string ToString(Graph &graph) {
- return c_->ToString(graph);
- }
-};
-
-}
diff --git a/src/dipspades/consensus_contigs_constructor/mapping_contigs_storage.hpp b/src/dipspades/consensus_contigs_constructor/mapping_contigs_storage.hpp
deleted file mode 100644
index bf53477..0000000
--- a/src/dipspades/consensus_contigs_constructor/mapping_contigs_storage.hpp
+++ /dev/null
@@ -1,114 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include "mapping_contig.hpp"
-
-using namespace debruijn_graph;
-
-namespace dipspades {
-
-// interface for contig storage
-class ContigStorage{
-public:
- virtual void Add(MappingContigPtr new_contig) = 0;
- virtual size_t Size() = 0;
- virtual MappingContigPtr& operator[](size_t index) = 0;
- virtual void ReplaceContig(MappingContigPtr new_contig, size_t index) = 0;
- virtual void DeleteByIDs(set<size_t> ids) = 0;
- virtual MappingContigPtr GetContigById(size_t id) = 0;
- virtual MappingContigPtr GetRCContigById(size_t id) = 0;
- virtual shared_ptr<ContigStorage> Clone() = 0;
- virtual ~ContigStorage(){}
-
- virtual string ToString(Graph &graph) = 0;
-};
-
-typedef shared_ptr<ContigStorage> ContigStoragePtr;
-
-// simple implementation
-class SimpleContigStorage : public ContigStorage{
- vector<MappingContigPtr> storage_;
-
-public:
- void Add(MappingContigPtr new_contig) {
- storage_.push_back(new_contig);
- }
-
- size_t Size() {
- return storage_.size();
- }
-
- MappingContigPtr& operator[](size_t index){
- VERIFY(index < storage_.size());
- return storage_[index];
- }
-
- void ReplaceContig(MappingContigPtr new_contig, size_t index) {
- VERIFY(index < storage_.size());
- storage_[index] = new_contig;
- }
-
- void DeleteByIDs(set<size_t> ids){
- vector<MappingContigPtr> new_storage;
- for(size_t i = 0; i < storage_.size(); i++)
- if(ids.find(storage_[i]->id()) == ids.end())
- new_storage.push_back(storage_[i]);
- storage_ = new_storage;
- }
-
- MappingContigPtr GetContigById(size_t id){
- for(size_t i = 0; i < storage_.size(); i++)
- if(storage_[i]->id() == id)
- return storage_[i];
- return MappingContigPtr(new SimpleMappingContig());
- }
-
- MappingContigPtr GetRCContigById(size_t id){
- for(size_t i = 0; i < storage_.size(); i++)
- if(storage_[i]->rc_id() == id)
- return storage_[i];
- return MappingContigPtr(new SimpleMappingContig());
- }
-
- ContigStoragePtr Clone(){
- ContigStoragePtr clone_storage(new SimpleContigStorage());
- for(size_t i = 0; i < storage_.size(); i++)
- clone_storage->Add(storage_[i]);
- return clone_storage;
- }
-
- string ToString(Graph &graph) {
- stringstream ss;
- for(auto c = storage_.begin(); c != storage_.end(); c++)
- ss << (*c)->ToString(graph) << endl;
- return ss.str();
- }
-};
-
-//-------------------------------------------------------------------------
-void save_contig_storage(Graph&g, ContigStoragePtr stor, string fname){
-
- ofstream save(fname.c_str());
- for(size_t i = 0; i < stor->Size(); i++){
- save << "#" << i << " contig" << endl;
- auto contig = (*stor)[i];
- save << "id " << contig->id() << endl;
- save << "rc_id " << contig->rc_id() << endl;
-
- auto path = contig->path_seq();
- for(size_t j = 0; j < path.size(); j++){
- save << g.int_id(path[j]) << " ";
- }
- save << endl;
- }
- save.close();
-}
-//-------------------------------------------------------------------------
-
-}
diff --git a/src/dipspades/consensus_contigs_constructor/overlap_graph.hpp b/src/dipspades/consensus_contigs_constructor/overlap_graph.hpp
deleted file mode 100644
index d18e2d9..0000000
--- a/src/dipspades/consensus_contigs_constructor/overlap_graph.hpp
+++ /dev/null
@@ -1,1119 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include <iostream>
-#include <vector>
-#include <map>
-#include <set>
-
-using namespace std;
-
-using namespace debruijn_graph;
-
-namespace dipspades {
-
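-// Directed graph over contig indices: stores weighted overlap edges, the
-// id/rc-id correspondence and per-vertex labels used by the traversals below.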
-class OverlapGraph{
- map<size_t, set<size_t> > in_edges_;
- map<size_t, set<size_t> > out_edges_;
- map<size_t, pair<bool, int> > label;
-
- set<pair<size_t,size_t> > edges_;
-
- map<pair<size_t,size_t>, size_t> weights_;
-
- set<size_t> vertices_;
- map<size_t, pair<size_t, size_t> > map_id_rc_id;
- map<size_t ,size_t> id_ind;
-
- void CheckAndRemoveIsolatedVertex(size_t v){
- if(IncomingVerticesCount(v) == 0 && OutgoingVerticesCount(v) == 0){
- vertices_.erase(v);
- }
- }
-
-public:
- OverlapGraph(){}
- OverlapGraph(vector<size_t> vertices, vector<size_t> id, vector<size_t> rc_id,
- map<size_t, vector<size_t> > in_edges, map<size_t, vector<size_t> > in_weight,
- map<size_t, vector<size_t> > out_edges, map<size_t, vector<size_t> > out_weight) {
-
- InitializeVertexSet(vertices, id, rc_id);
- InitializeIncomingVertices(in_edges, in_weight);
- InitializeOutgoingVertices(out_edges, out_weight);
- }
-
- void Clear(){
- vertices_.clear();
- map_id_rc_id.clear();
- id_ind.clear();
- label.clear();
-
- in_edges_.clear();
- out_edges_.clear();
- weights_.clear();
- edges_.clear();
- }
-
- void InitializeVertexSet(vector<size_t> vertices, vector<size_t> id, vector<size_t> rc_id){
-
- VERIFY(vertices.size() == id.size());
- VERIFY(vertices.size() == rc_id.size());
-
- size_t size = vertices.size();
- for(size_t i = 0; i < size; i++){
-
- auto v = vertices[i];
- vertices_.insert(v);
-
- map_id_rc_id[v] = pair<size_t, size_t>(id[i], rc_id[i]);
- id_ind[id[i]] = v;
-
- label[v] = pair<bool, int>(false, -1);
- }
- }
-
- void InitializeIncomingVertices(map<size_t, vector<size_t> > in_edges,
- map<size_t, vector<size_t> > in_weight){
-
- VERIFY(in_edges.size() == in_weight.size());
-
- auto it_v = in_edges.begin();
- auto it_w = in_weight.begin();
-
- for(size_t i = 0; i < in_edges.size(); i++){
-
- auto v = it_v->first;
- auto w = it_w->first;
-
- VERIFY(v == w);
-
- auto v_vect = it_v->second;
- auto w_vect = it_w->second;
-
- VERIFY(v_vect.size() == w_vect.size());
-
- for(size_t j = 0; j < v_vect.size(); j++){
- AddNeighVertices(v_vect[j], v, w_vect[j]);
- }
-
- it_v++; it_w++;
- }
- }
-
- void InitializeOutgoingVertices(map<size_t, vector<size_t> > out_edges,
- map<size_t, vector<size_t> > out_weight){
-
- VERIFY(out_edges.size() == out_weight.size());
-
- auto it_v = out_edges.begin();
- auto it_w = out_weight.begin();
-
- for(size_t i = 0; i < out_edges.size(); i++){
-
- auto v = it_v->first;
- auto w = it_w->first;
-
- VERIFY(v == w);
-
- auto v_vect = it_v->second;
- auto w_vect = it_w->second;
-
- VERIFY(v_vect.size() == w_vect.size());
-
- for(size_t j = 0; j < v_vect.size(); j++){
- AddNeighVertices(v, v_vect[j], w_vect[j]);
- }
- it_v++; it_w++;
- }
- }
-
- void AddNeighVertices(size_t start, size_t end, size_t weight){
-
- if(vertices_.find(start) == vertices_.end())
- vertices_.insert(start);
- if(vertices_.find(end) == vertices_.end())
- vertices_.insert(end);
-
- if(edges_.find(make_pair(start, end)) == edges_.end())
- edges_.insert(make_pair(start, end));
- weights_[make_pair(start, end)] = weight;
-
- in_edges_[end].insert(start);
- out_edges_[start].insert(end);
- }
-
- vector<size_t> GetVerticesWithoutInEdges(){
- vector<size_t> res;
- for(auto v = vertices_.begin(); v != vertices_.end(); v++)
- if(in_edges_.find(*v) == in_edges_.end() || in_edges_[*v].size() == 0){
- res.push_back(*v);
- break;
- }
- return res;
- }
-
- set<size_t> IncomingVertices(size_t v){
- set<size_t> res;
- if(in_edges_.find(v) != in_edges_.end())
- res = in_edges_[v];
- return res;
- }
-
- size_t IncomingVerticesCount(size_t v){
- if(in_edges_.find(v) != in_edges_.end())
- return in_edges_[v].size();
- return 0;
- }
-
- set<size_t> OutgoingVertices(size_t v){
- set<size_t> res;
- if(out_edges_.find(v) != out_edges_.end())
- res = out_edges_[v];
- return res;
- }
-
- size_t OutgoingVerticesCount(size_t v){
- if(out_edges_.find(v) != out_edges_.end())
- return out_edges_[v].size();
- return 0;
- }
-
- pair<bool, int> GetLabel(size_t v){
- if(label.find(v) != label.end())
- return label[v];
- return make_pair(false, -1);
- }
-
- void SetLabel(size_t v, bool bool_label, int value){
- if(label.find(v) != label.end())
- label[v] = make_pair(bool_label, value);
- }
-
- size_t GetWeightOf(pair<size_t, size_t> edge){
- if(weights_.find(edge) != weights_.end())
- return weights_[edge];
- return 0;
- }
-
- size_t GetWeightOf(size_t start, size_t end){
- return GetWeightOf(pair<size_t, size_t>(start, end));
- }
-
- void DeleteVertex(size_t v){
- vertices_.erase(v);
- auto in_set = in_edges_[v];
- auto out_set = out_edges_[v];
-
- for(auto w = in_set.begin(); w != in_set.end(); w++){
- DeleteEdge(*w, v);
- }
-
- in_edges_.erase(v);
-
- for(auto w = out_set.begin(); w != out_set.end(); w++){
- DeleteEdge(v, *w);
- }
-
- out_edges_.erase(v);
-
- size_t id = IdByInd(v);
- map_id_rc_id.erase(v);
- id_ind.erase(id);
- }
-
- void DeleteEdge(size_t start, size_t end){
- if(out_edges_.find(start) != out_edges_.end())
- out_edges_[start].erase(end);
- if(in_edges_.find(end) != in_edges_.end())
- in_edges_[end].erase(start);
- weights_.erase(pair<size_t, size_t>(start, end));
- edges_.erase(pair<size_t, size_t>(start, end));
-
-// CheckAndRemoveIsolatedVertex(start);
-// CheckAndRemoveIsolatedVertex(end);
- }
-
- set<pair<size_t,size_t> > Edges(){
- auto res = edges_;
- return res;
- }
-
- set<size_t> Vertices(){
- auto res = vertices_;
- return res;
- }
-
- bool IsEdgeExist(size_t start, size_t end){
- if(edges_.find(pair<size_t, size_t>(start, end)) != edges_.end())
- return true;
- return false;
- }
-
- bool IsVertexExist(size_t v){
- return vertices_.find(v) != vertices_.end();
- }
-
- size_t VerticesCount(){
- return vertices_.size();
- }
-
- size_t EdgesCount(){
- return edges_.size();
- }
-
- size_t IdByInd(size_t v){
- if(map_id_rc_id.find(v) != map_id_rc_id.end())
- return map_id_rc_id[v].first;
- return size_t(-1);
- }
-
- size_t RCIdByInd(size_t v){
- if(map_id_rc_id.find(v) != map_id_rc_id.end())
- return map_id_rc_id[v].second;
- return size_t(-1);
- }
-
- size_t IndById(size_t id){
- if(id_ind.find(id) != id_ind.end())
- return id_ind[id];
- return size_t(-1);
- }
-
- size_t IndOfRC(size_t v){
- return IndById(RCIdByInd(v));
- }
-
- bool IsVertexValid(size_t v){
- return (v != size_t(-1));
- }
-
- bool IsVertexIsolated(size_t v){
- return OutgoingVerticesCount(v) == 0 && IncomingVerticesCount(v) == 0;
- }
-
- vector<size_t> GetIsolatedEdges(){
- vector<size_t> res;
- for(auto v = vertices_.begin(); v != vertices_.end(); v++)
- if(IsVertexIsolated(*v))
- res.push_back(*v);
- return res;
- }
-};
-
-//----------------------------------------------------------------------------------------------
-
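-// Strategy interfaces for the overlap-graph traversal: OGD_StartVerticesDefiner
-// picks the initial vertices, OGD_OneVertexProcesser expands a vertex,
-// OGD_NewProcessedVertexDefiner chooses the next vertex to process and
-// OGD_StopCondition decides when the search terminates.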
-class OGD_StartVerticesDefiner{
-protected:
- OverlapGraph &g_;
-public:
- OGD_StartVerticesDefiner(OverlapGraph &g) : g_(g) {
- }
- virtual vector<size_t> GetStartVertices() = 0;
- virtual ~OGD_StartVerticesDefiner() {
- }
-};
-
-class OGD_GetParametrizedStartvertex : public OGD_StartVerticesDefiner{
- size_t start_vertex_;
-public:
- OGD_GetParametrizedStartvertex(OverlapGraph &g, size_t start_vertex) :
- OGD_StartVerticesDefiner(g), start_vertex_(start_vertex){
-
- }
-
- vector<size_t> GetStartVertices(){
- vector<size_t> res;
- res.push_back(start_vertex_);
- return res;
- }
-};
-
-class OGD_GetIsolatedAndVerticesWithoutIncoming : public OGD_StartVerticesDefiner{
-public:
- OGD_GetIsolatedAndVerticesWithoutIncoming(OverlapGraph &g) : OGD_StartVerticesDefiner(g){
-
- }
-
- vector<size_t> GetStartVertices(){
-
-// cout << "OGD_GetIsolatedAndVerticesWithoutIncoming starts" << endl;
-// cout << g_.VerticesCount() << " vertices in OG" << endl;
-
- vector<size_t> res;
-
- if(g_.VerticesCount() == 0)
- return res;
-
- vector<size_t> isolated = g_.GetIsolatedEdges();
- vector<size_t> noincoming = g_.GetVerticesWithoutInEdges();
-
- if(isolated.size() != 0)
- for(auto v = isolated.begin(); v != isolated.end(); v++)
- res.push_back(*v);
-
- if(noincoming.size() != 0)
- res.push_back(*noincoming.begin());
-
- if(res.size() == 0){
- size_t any_vertex = *g_.Vertices().begin();
- res.push_back(any_vertex);
- }
-
- return res;
- }
-};
-
-//----------------------------------------------------------------------------------------------
-class OGD_DirectionDefiner{
-protected:
- OverlapGraph &g_;
-
-public:
- OGD_DirectionDefiner(OverlapGraph &g) : g_(g){
-
- }
-
- virtual set<size_t> GetDirectedVertices(size_t vertex) = 0;
- virtual set<size_t> GetAntidirectedVertices(size_t vertex) = 0;
- virtual ~OGD_DirectionDefiner(){
-
- }
-};
-
-class OGD_OutgoingDirection : public OGD_DirectionDefiner{
-public:
- OGD_OutgoingDirection(OverlapGraph &g) : OGD_DirectionDefiner(g){
-
- }
-
- set<size_t> GetDirectedVertices(size_t vertex){
- return g_.OutgoingVertices(vertex);
- }
-
- set<size_t> GetAntidirectedVertices(size_t vertex){
- return g_.IncomingVertices(vertex);
- }
-};
-//----------------------------------------------------------------------------------------------
-
-class OGD_OneVertexProcesser{
-protected:
- OverlapGraph &g_;
- OGD_DirectionDefiner *direction_definer_;
-public:
- OGD_OneVertexProcesser(OverlapGraph &g, OGD_DirectionDefiner *direction_definer) : g_(g),
- direction_definer_(direction_definer){
- }
- virtual void ProcessVertex(size_t vertex, set<size_t> &visited, set<size_t> &queue,
- map<size_t, vector<size_t> > &paths) = 0;
- virtual ~OGD_OneVertexProcesser(){
-
- }
-};
-
-class OGD_SimpleProcessing : public OGD_OneVertexProcesser{
-public:
- OGD_SimpleProcessing(OverlapGraph &g, OGD_DirectionDefiner *direction_definer) :
- OGD_OneVertexProcesser(g, direction_definer){
-
- }
-
- void ProcessVertex(size_t vertex, set<size_t> &visited, set<size_t> &queue,
- map<size_t, vector<size_t> > &paths){
- if(visited.find(vertex) != visited.end())
- return;
-
- visited.insert(vertex);
- queue.erase(vertex);
-
- auto vert_for_visit = direction_definer_->GetDirectedVertices(vertex);
- for(auto neigh = vert_for_visit.begin(); neigh != vert_for_visit.end(); neigh++)
- if(visited.find(*neigh) == visited.end()){
- paths[*neigh] = paths[vertex];
- paths[*neigh].push_back(*neigh);
-
- queue.insert(*neigh);
- }
- }
-};
-
-class OGD_UniquePathProcessing : public OGD_OneVertexProcesser{
-public:
- OGD_UniquePathProcessing(OverlapGraph &g, OGD_DirectionDefiner *direction_definer) :
- OGD_OneVertexProcesser(g, direction_definer){
-
- }
-
- void ProcessVertex(size_t vertex, set<size_t> &visited, set<size_t> &queue,
- map<size_t, vector<size_t> > &paths){
- if(visited.find(vertex) != visited.end() || vertex == size_t(-1))
- return;
-
-// cout << "Processing of " << vertex << endl;
-
- visited.insert(vertex);
- queue.erase(vertex);
-
- size_t rc_v = g_.IndOfRC(vertex);
- if(g_.IsVertexValid(rc_v)){
- visited.insert(rc_v);
- queue.erase(rc_v);
- }
-
- auto vert_for_visit = direction_definer_->GetDirectedVertices(vertex);
-
- if(vert_for_visit.size() == 1){
- size_t neigh = *vert_for_visit.begin();
-
- if(visited.find(neigh) == visited.end() && paths.find(neigh) == paths.end()){
- paths[neigh] = paths[vertex];
- paths[neigh].push_back(neigh);
- queue.insert(neigh);
- }
- }
- else
- for(auto neigh = vert_for_visit.begin(); neigh != vert_for_visit.end(); neigh++)
- if(visited.find(*neigh) == visited.end() && paths.find(*neigh) == paths.end()){
- paths[*neigh].push_back(*neigh);
- queue.insert(*neigh);
- }
- }
-};
-
-class OGD_AlternativePathProcesser : public OGD_OneVertexProcesser{
- vector<size_t> alter_path_;
- set<size_t> forbidden_vert;
- bool alter_path_is_edge;
-
- size_t path_start, path_end;
-
-public:
- OGD_AlternativePathProcesser(OverlapGraph &g, OGD_DirectionDefiner *direct_definer,
- vector<size_t> alter_path) : OGD_OneVertexProcesser(g, direct_definer), alter_path_(alter_path){
-
- VERIFY(alter_path.size() > 1);
-
- for(auto e = alter_path.begin() + 1; e != alter_path.end() - 1; e++)
- forbidden_vert.insert(*e);
-
- alter_path_is_edge = alter_path_.size() == 2;
-
- path_start = *alter_path.begin(), path_end = *(alter_path.end() - 1);
- }
-
- void ProcessVertex(size_t vertex, set<size_t> &visited, set<size_t> &queue,
- map<size_t, vector<size_t> > &paths){
- if(visited.find(vertex) != visited.end() || vertex == size_t(-1))
- return;
-
- visited.insert(vertex);
- queue.erase(vertex);
-
- auto vert_for_visit = direction_definer_->GetDirectedVertices(vertex);
-
- for(auto neigh = vert_for_visit.begin(); neigh != vert_for_visit.end(); neigh++){
- if(visited.find(*neigh) == visited.end()){
- bool is_not_visit = (vertex == path_start && *neigh == path_end && alter_path_is_edge);
- if(!is_not_visit && forbidden_vert.find(*neigh) == forbidden_vert.end()){
- queue.insert(*neigh);
- paths[*neigh] = paths[vertex];
- paths[*neigh].push_back(*neigh);
- }
- }
- }
- }
-};
-//----------------------------------------------------------------------------------------------
-
-class OGD_NewProcessedVertexDefiner{
-protected:
- OverlapGraph &g_;
-
-public:
- OGD_NewProcessedVertexDefiner(OverlapGraph &g) : g_(g){
-
- }
- virtual size_t GetNewVertex(set<size_t> &visited, set<size_t> &queue,
- map<size_t, vector<size_t> > &paths) = 0;
- virtual ~OGD_NewProcessedVertexDefiner(){
-
- }
-};
-
-class OGD_NewVertexInQueueDefiner : public OGD_NewProcessedVertexDefiner{
-
-public:
- OGD_NewVertexInQueueDefiner(OverlapGraph &g) :
- OGD_NewProcessedVertexDefiner(g){
- }
-
- size_t GetNewVertex(set<size_t> &, set<size_t> &queue,
- map<size_t, vector<size_t> > &){
- if(queue.size() > 0)
- return *queue.begin();
- return size_t(-1);
- }
-};
-
-class OGD_SimpleNewVertexDefiner : public OGD_NewProcessedVertexDefiner{
- OGD_DirectionDefiner *direction_definer_;
-
-public:
- OGD_SimpleNewVertexDefiner(OverlapGraph &g, OGD_DirectionDefiner *direction_definer) :
- OGD_NewProcessedVertexDefiner(g), direction_definer_(direction_definer){
-
- }
-
- size_t GetNewVertex(set<size_t> &visited, set<size_t> &queue,
- map<size_t, vector<size_t> > &paths){
- if(queue.size() > 0)
- return *queue.begin();
- else{
-
- auto vertices = g_.Vertices();
- for(auto v = vertices.begin(); v != vertices.end(); v++){
- if(visited.find(*v) == visited.end()){
- auto in_vertices = direction_definer_->GetAntidirectedVertices(*v);
- bool all_invisited = true;
- for(auto in_v = in_vertices.begin(); in_v != in_vertices.end(); in_v++){
- if(visited.find(*in_v) == visited.end()){
- all_invisited = false;
- break;
- }
- }
-
- if(all_invisited){
- paths[*v].push_back(*v);
- return *v;
- }
- }
- }
-
- // if no vertex without antidirected edges exists,
- // return any unvisited vertex
- for (auto v = vertices.begin(); v != vertices.end(); v++) {
- if (visited.find(*v) == visited.end()) {
- paths[*v].push_back(*v);
- return *v;
- }
- }
- }
- return -1;
- }
-};
-
-//----------------------------------------------------------------------------------------------
-
-class OGD_StopCondition{
-protected:
- OverlapGraph &g_;
-public:
- OGD_StopCondition(OverlapGraph &g) : g_(g){
-
- }
- virtual bool IsStop(set<size_t> &visited, set<size_t> &queue, map<size_t, vector<size_t> > &paths) = 0;
- virtual ~OGD_StopCondition(){
-
- }
-};
-
-class OGD_SearchedVertexIsFound : public OGD_StopCondition{
- size_t searched_vertex_;
-public:
- OGD_SearchedVertexIsFound(OverlapGraph &g, size_t searched_vertex) : OGD_StopCondition(g),
- searched_vertex_(searched_vertex){
-
- }
-
- bool IsStop(set<size_t> &visited, set<size_t> &queue, map<size_t, vector<size_t> > &){
- return (visited.find(searched_vertex_) != visited.end() ||
- visited.size() == g_.VerticesCount() || queue.size() == 0);
- }
-};
-
-class OGD_NoVerticesForVisit : public OGD_StopCondition{
-public:
- OGD_NoVerticesForVisit(OverlapGraph &g) : OGD_StopCondition(g){
-
- }
-
- bool IsStop(set<size_t> &visited, set<size_t> &, map<size_t, vector<size_t> > &){
- return visited.size() == g_.VerticesCount();
- }
-};
-//----------------------------------------------------------------------------------------------
-
-struct OGD_Config{
- OGD_StartVerticesDefiner * start_vert_definer;
- OGD_OneVertexProcesser * vertex_processer;
- OGD_NewProcessedVertexDefiner * new_vert_definer;
- OGD_StopCondition * stop_condition;
-
- OGD_Config(OGD_StartVerticesDefiner * &start_vert_definer,
- OGD_OneVertexProcesser * &vertex_processer,
- OGD_NewProcessedVertexDefiner * &new_vert_definer,
- OGD_StopCondition * &stop_condition){
- this->start_vert_definer = start_vert_definer;
- this->vertex_processer = vertex_processer;
- this->new_vert_definer = new_vert_definer;
- this->stop_condition = stop_condition;
- }
-
- ~OGD_Config(){
- delete new_vert_definer;
- delete start_vert_definer;
- delete stop_condition;
- delete vertex_processer;
- }
-};
-
-OGD_Config CreateConfigForUniquePathsSearch(OverlapGraph &g){
- OGD_StartVerticesDefiner *start_def = new OGD_GetIsolatedAndVerticesWithoutIncoming(g);
- OGD_DirectionDefiner *direct_def = new OGD_OutgoingDirection(g);
- OGD_OneVertexProcesser *vert_proc = new OGD_UniquePathProcessing(g, direct_def);
- OGD_NewProcessedVertexDefiner *new_vert_definer = new OGD_SimpleNewVertexDefiner(g, direct_def);
- OGD_StopCondition *stop_cond = new OGD_NoVerticesForVisit(g);
-
- OGD_Config conf(start_def, vert_proc, new_vert_definer, stop_cond);
-
- return conf;
-}
-
-OGD_Config CreateConfigForDijkstraFromOneVertex(OverlapGraph &g, size_t start_vertex, size_t end_vertex){
- OGD_StartVerticesDefiner *start_def = new OGD_GetParametrizedStartvertex(g, start_vertex);
- OGD_DirectionDefiner * direct_def = new OGD_OutgoingDirection(g);
- OGD_OneVertexProcesser *vert_proc = new OGD_SimpleProcessing(g, direct_def);
- OGD_NewProcessedVertexDefiner * new_vert_definer = new OGD_NewVertexInQueueDefiner(g);
- OGD_StopCondition *stop_cond = new OGD_SearchedVertexIsFound(g, end_vertex);
-
- OGD_Config conf(start_def, vert_proc, new_vert_definer, stop_cond);
- return conf;
-}
-
-OGD_Config CreateConfigForAlternativePathSearch(OverlapGraph &g, vector<size_t> path){
-
- VERIFY(path.size() > 1);
- size_t start_vertex = *(path.begin()), end_vertex = *(path.end() - 1);
-
- OGD_StartVerticesDefiner *start_def = new OGD_GetParametrizedStartvertex(g, start_vertex);
- OGD_DirectionDefiner * direct_def = new OGD_OutgoingDirection(g);
- OGD_OneVertexProcesser *vert_proc = new OGD_AlternativePathProcesser(g, direct_def, path);
- OGD_NewProcessedVertexDefiner * new_vert_definer = new OGD_NewVertexInQueueDefiner(g);
- OGD_StopCondition *stop_cond = new OGD_SearchedVertexIsFound(g, end_vertex);
-
- OGD_Config conf(start_def, vert_proc, new_vert_definer, stop_cond);
- return conf;
-}
-
-class OverlapGraphDijkstra{
- OverlapGraph &g_;
- set<size_t> visited, queue;
- map<size_t, vector<size_t> > paths;
-
- OGD_Config& config_;
-
-public:
- OverlapGraphDijkstra(OverlapGraph &g, OGD_Config &config) : g_(g), config_(config){
-
- }
-
- void Run(){
-
-// cout << "Dijkstra run" << endl;
-// cout << "Start vertices search" << endl;
- auto start_vertices = config_.start_vert_definer->GetStartVertices();
-
-// cout << "Processing of start vertices" << endl;
- for(auto new_start_vertex = start_vertices.begin(); new_start_vertex != start_vertices.end();
- new_start_vertex++){
- if(visited.find(*new_start_vertex) == visited.end()){
- paths[*new_start_vertex].push_back(*new_start_vertex);
- config_.vertex_processer->ProcessVertex(*new_start_vertex, visited, queue, paths);
- }
- }
-
-// cout << "Dijkstra cycle starts" << endl;
- while(!config_.stop_condition->IsStop(visited, queue, paths)){
- size_t current_vertex = config_.new_vert_definer->GetNewVertex(visited, queue, paths);
- config_.vertex_processer->ProcessVertex(current_vertex, visited, queue, paths);
- }
-// cout << "Dijkstra cycle ends" << endl;
- }
-
- map<size_t, vector<size_t> > Paths(){
-// cout << "Paths:" << endl;
-// for(auto it = paths.begin(); it != paths.end(); it++){
-// cout << it->first << ". ";
-// auto path = it->second;
-// for(auto e = path.begin(); e != path.end(); e++)
-// cout << *e << " ";
-// cout << endl;
-// }
- return paths;
- }
-
- const OverlapGraph & GetGraph() { return g_; }
-
- ~OverlapGraphDijkstra(){
- }
-};
-
-//---------------------------------------------------------------------------------
-
-vector<vector<size_t> > DeleteRedundantEndsFromPaths(OverlapGraph &g, vector<vector<size_t> > paths){
-
- if(paths.size() == 0)
- return paths;
-
- vector<size_t> starts, ends;
- vector<bool> is_nes_start, is_nes_end;
- for(auto p = paths.begin(); p != paths.end(); p++){
-
- size_t cur_start = *(p->begin());
- size_t cur_end = *(p->end() - 1);
-
- starts.push_back(cur_start);
- ends.push_back(cur_end);
-
- is_nes_start.push_back(true);
-
- if(g.RCIdByInd(cur_start) == cur_end)
- is_nes_end.push_back(false);
- else
- is_nes_end.push_back(true);
- }
-
- size_t num_paths = paths.size();
- for(size_t i = 0; i < num_paths; i++){
- size_t cur_start = starts[i], cur_end = ends[i];
- for(size_t j = i + 1; j < num_paths; j++){
- size_t neig_start = starts[j], neig_end = ends[j];
- if(g.RCIdByInd(cur_start) == neig_start || g.RCIdByInd(cur_start) == neig_end)
- is_nes_start[j] = false;
-
- if(g.RCIdByInd(cur_end) == neig_start || g.RCIdByInd(cur_end) == neig_end)
- is_nes_end[j] = false;
- }
- }
-
- vector<vector<size_t> > corrected_paths;
- corrected_paths.push_back(paths[0]);
-
- for(size_t i = 1; i < num_paths; i++){
- if(!is_nes_start[i] || !is_nes_end[i]){
- if(paths[i].size() > 1){
- if(paths[i].size() == 2 && !is_nes_start[i] && !is_nes_end[i]){
- }
- else{
- vector<size_t> tmp;
- if(is_nes_start[i])
- tmp.push_back(paths[i][0]);
- for(size_t j = 1; j < paths[i].size() - 1; j++)
- tmp.push_back(paths[i][j]);
- if(is_nes_end[i])
- tmp.push_back(paths[i][paths[i].size() - 1]);
- corrected_paths.push_back(tmp);
- }
- }
- }
- else
- corrected_paths.push_back(paths[i]);
- }
-
- return corrected_paths;
-}
-
-class UniquePathsSearcher{
- OverlapGraph &g_;
-
- map<size_t, vector<size_t> > sh_paths;
-
- vector<vector<size_t> > DefineLongestPathsFromMap(){
- vector<vector<size_t> > res;
- set<size_t> used;
- while(used.size() < sh_paths.size()){
- size_t longest_path_size = 0;
- vector<size_t> longest_path;
- for(auto p = sh_paths.begin(); p != sh_paths.end(); p++){
- if(p->second.size() > longest_path_size && used.find(p->first) == used.end()){
- longest_path = p->second;
- longest_path_size = longest_path.size();
- }
- }
-
- for(auto v = longest_path.begin(); v != longest_path.end(); v++)
- if(sh_paths.find(*v) != sh_paths.end())
- used.insert(*v);
-
- res.push_back(longest_path);
- }
-
- return res;
- }
-
-public:
- UniquePathsSearcher(OverlapGraph &g) : g_(g) {}
-
- vector<vector<size_t> > FindLongPaths(){
-
- OGD_Config conf = CreateConfigForUniquePathsSearch(g_);
- OverlapGraphDijkstra dijkstra(g_, conf);
- dijkstra.Run();
- sh_paths = dijkstra.Paths();
-
- auto long_paths = DefineLongestPathsFromMap();
-
- auto corrected_long_paths = DeleteRedundantEndsFromPaths(g_, long_paths);
-
-// cout << "Long paths" << endl;
-// for(auto p = corrected_long_paths.begin(); p != corrected_long_paths.end(); p++){
-// cout << "New path. ";
-/// for(auto e = p->begin(); e != p->end(); e++)
-// cout << *e << " ";
-// cout << endl;
-// }
-
- return corrected_long_paths;
- }
-};
-
-class OverlapPathSearcher{
- OverlapGraph &g_;
-public:
- OverlapPathSearcher(OverlapGraph &g) : g_(g) {}
-
- vector<size_t> GetPathAlternativeToPath(size_t start, size_t end, vector<size_t> path){
- vector<size_t> res;
-
- VERIFY(path.size() != 0);
- VERIFY(path[0] == start && path[path.size() - 1] == end);
-
- OGD_Config conf = CreateConfigForAlternativePathSearch(g_, path);
- OverlapGraphDijkstra dijkstra(g_, conf);
- dijkstra.Run();
- map<size_t, vector<size_t> > short_paths = dijkstra.Paths();
-
- if(short_paths.find(end) != short_paths.end()){
- res = short_paths[end];
- }
-
- return res;
- }
-
- vector<vector<size_t> > GetAlternativePaths(size_t v1, size_t v2){
- vector<vector<size_t> > paths;
-
-// cout << "Outgoing count - " << g_.OutgoingVerticesCount(v1) << " and incoming - " << g_.IncomingVerticesCount(v2) << endl;
- if(g_.OutgoingVerticesCount(v1) <= 1 || g_.IncomingVerticesCount(v2) <= 1)
- return paths;
-
- OGD_Config conf = CreateConfigForDijkstraFromOneVertex(g_, v1, v2);
- OverlapGraphDijkstra dijkstra(g_, conf);
- dijkstra.Run();
- map<size_t, vector<size_t> > sh_paths = dijkstra.Paths();
-
- if(sh_paths.find(v2) == sh_paths.end()){
-// INFO("Path from " + ToString(v1) + " to " + ToString(v2) + " isn't found");
- return paths;
- }
- else{
- auto fst_path = sh_paths[v2];
- paths.push_back(fst_path);
-
- vector<size_t> snd_path = GetPathAlternativeToPath(v1, v2, fst_path);
- if(snd_path.size() != 0){
- VERIFY(snd_path[0] == v1 && snd_path[snd_path.size() - 1] == v2);
- paths.push_back(snd_path);
- }
- }
-
- return paths;
- }
-};
-
-//---------------------------------------------------------------------------------
-
-void dijkstra_for_overlap_graph_test(){
- OverlapGraph g;
-// g.AddNeighVertices(1, 2, 1);
- g.AddNeighVertices(1, 3, 1);
- g.AddNeighVertices(1, 4, 1);
-
- g.AddNeighVertices(2, 3, 1);
- g.AddNeighVertices(4, 3, 1);
-
- g.AddNeighVertices(3, 4, 1);
-
-// OverlapPathSearcher path_searcher(g);
-// vector<int> path1;
-// path1.push_back(1);
-// path1.push_back(4);
-
-// auto path2 = path_searcher.GetPathAlternativeToPath(1, 4, path1);
-// for(auto v = path2.begin(); v != path2.end(); v++)
-// cout << *v << " ";
-// cout << endl;
-
- UniquePathsSearcher path_searcher2(g);
- auto paths = path_searcher2.FindLongPaths();
-
-// for(auto p = paths.begin(); p != paths.end(); p++){
-// cout << "New path. ";
-// for(auto v = p->begin(); v != p->end(); v++)
-// cout << *v << " ";
-// cout << endl;
-// }
-
-// auto paths_1_3 = path_searcher.GetAlternativePaths(1, 3);
-// for(auto p = paths_1_3.begin(); p != paths_1_3.end(); p++){
-// cout << "New path. ";
-// for(auto v = p->begin(); v != p->end(); v++)
-// cout << *v << " ";
-// cout << endl;
-// }
-}
-
-//---------------------------------------------------------------------------------
-
-class OverlapGraphCorrector{
-public:
- virtual size_t Correct(OverlapGraph & g) = 0;
- virtual ~OverlapGraphCorrector() { }
-};
-
-
-class TipClipperCorrector : public OverlapGraphCorrector{
-public:
- size_t Correct(OverlapGraph & g){
- auto edges = g.Edges();
-
- size_t deleted_edges = 0;
- for(auto e = edges.begin(); e != edges.end(); e++){
- auto start = e->first;
- auto end = e->second;
-
- if(g.IncomingVerticesCount(start) == 0 && g.OutgoingVerticesCount(start) == 1 &&
- g.IncomingVerticesCount(end) > 1 /*&& g.OutgoingVerticesCount(end) > 0*/){
-// cout << "Tip - " << start << " " << end << endl;
- g.DeleteVertex(start);
- deleted_edges++;
- }
- if(g.OutgoingVerticesCount(end) == 0 && g.OutgoingVerticesCount(start) > 1 &&
- /*g.IncomingVerticesCount(start) > 0 &&*/ g.IncomingVerticesCount(end) == 1){
-// cout << "Tip - " << start << " " << end << endl;
- g.DeleteVertex(end);
- deleted_edges++;
- }
- }
-
- return deleted_edges;
- }
-};
-
-class TransitiveReductionCorrector : public OverlapGraphCorrector{
-public:
- size_t Correct(OverlapGraph & g){
- auto edges = g.Edges();
- OverlapPathSearcher ps(g);
-
- size_t res = 0;
-
- for(auto e = edges.begin(); e != edges.end(); e++){
- auto start = e->first;
- auto end = e->second;
-
- if(g.IsEdgeExist(start, end)){
-
- vector<size_t> path; path.push_back(start); path.push_back(end);
- vector<size_t> alt_path = ps.GetPathAlternativeToPath(start, end, path);
-
- if(alt_path.size() > 2){
- g.DeleteEdge(start, end);
- res++;
- }
- }
- }
-
- return res;
- }
-};
-
-
-class BulgeRemoverCorrector : public OverlapGraphCorrector{
-public:
- size_t Correct(OverlapGraph & g){
- auto vertices = g.Vertices();
- OverlapPathSearcher ps(g);
-
- size_t res = 0;
-
- for(auto v = vertices.begin(); v != vertices.end(); v++)
- for(auto w = vertices.begin(); w != vertices.end(); w++)
- if(*v != *w && g.IsVertexExist(*v) && g.IsVertexExist(*w)){
-
- auto paths = ps.GetAlternativePaths(*v, *w);
-
- if(paths.size() > 1){
-
- vector<size_t> path1 = paths[0], path2 = paths[1];
-
- size_t w1 = 0, w2 = 0;
-
- for(size_t i = 0; i < path1.size() - 1; i++){
- w1 += g.GetWeightOf(path1[i], path1[i + 1]);
- }
-
- for(size_t i = 0; i < path2.size() - 1; i++){
- w2 += g.GetWeightOf(path2[i], path2[i + 1]);
- }
-
- vector<size_t> deleted_path;
- if(w1 > w2)
- deleted_path = path2;
- else
- deleted_path = path1;
-
- // deletion of edges along the path chosen for removal
- for(size_t i = 0; i < deleted_path.size() - 1; i++)
- g.DeleteEdge(deleted_path[i], deleted_path[i + 1]);
-
- // deletion of inner vertices of bulge
- for(size_t i = 1; i < deleted_path.size() - 1; i++)
- g.DeleteVertex(deleted_path[i]);
-
- res++;
- }
- }
- return res;
- }
-};
-
-void SimplifyOverlapGraph(OverlapGraph &overlap_graph, size_t tc_num_iter, size_t br_num_iter){
-
- size_t tc_res = 1, tr_res = 1;
- for(size_t i = 0; (i < tc_num_iter && (tc_res > 0 || tr_res > 0)); i++){
- TipClipperCorrector tc_corr;
- tc_res = tc_corr.Correct(overlap_graph);
-
- TransitiveReductionCorrector tr_corr;
- tr_res = tr_corr.Correct(overlap_graph);
-
- INFO(ToString(tc_res) + " tips and " + ToString(tr_res) + " transitive edges were deleted in overlap graph");
- }
-
- INFO("Bulge remover starts");
- BulgeRemoverCorrector br_corr;
- size_t num_bulges = 1;
- for(size_t i = 0; (i < br_num_iter && num_bulges > 0); i++){
- num_bulges = br_corr.Correct(overlap_graph);
- INFO(ToString(num_bulges) + " bulges were deleted in overlap graph");
- }
-}
-
-}
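
For orientation, here is a minimal usage sketch of the strategy-based traversal defined above; it composes the pieces the same way dijkstra_for_overlap_graph_test() and CreateConfigForUniquePathsSearch() do, and assumes only the OverlapGraph interface used in this file:

    // Illustrative sketch, not part of the deleted sources.
    OverlapGraph g;
    g.AddNeighVertices(1, 3, 1);   // overlap edge 1 -> 3 with weight 1
    g.AddNeighVertices(3, 4, 1);   // overlap edge 3 -> 4 with weight 1

    // Bundle the four strategies: start vertices, per-vertex processing,
    // choice of the next vertex and the stop condition.
    OGD_Config conf = CreateConfigForUniquePathsSearch(g);
    OverlapGraphDijkstra dijkstra(g, conf);
    dijkstra.Run();

    // Paths() maps each reached vertex to the vertex sequence leading to it;
    // UniquePathsSearcher keeps the longest non-redundant ones.
    map<size_t, vector<size_t> > paths = dijkstra.Paths();
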
diff --git a/src/dipspades/dipspades.hpp b/src/dipspades/dipspades.hpp
deleted file mode 100644
index 8300731..0000000
--- a/src/dipspades/dipspades.hpp
+++ /dev/null
@@ -1,265 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-
-#include "io/splitting_wrapper.hpp"
-#include "graph_construction.hpp"
-#include "stage.hpp"
-
-#include "dipspades_config.hpp"
-
-#include "polymorphic_bulge_remover/polymorphic_bulge_remover.hpp"
-#include "consensus_contigs_constructor/consensus_contigs_constructor.hpp"
-#include "haplotype_assembly/haplotype_assembler.hpp"
-#include "kmer_gluing/equal_sequence_gluer.hpp"
-
-using namespace debruijn_graph;
-using namespace spades;
-
-namespace dipspades {
-
-void construct_graph_from_contigs(debruijn_graph::conj_graph_pack &graph_pack){
- auto fnames = GetAllLinesFromFile(dsp_cfg::get().io.haplocontigs);
- ReadStreamList<SingleRead> streams;
- for(auto fname = fnames.begin(); fname != fnames.end(); fname++)
- if(fname_valid(*fname)){
- INFO("Addition of contigs from " << *fname);
- streams.push_back(io::SplittingWrap(EasyStream(*fname, true)));
- }
-
- INFO("Construction of the de Bruijn graph with K=" << dsp_cfg::get().bp.K);
- debruijn_config::construction params;
- params.con_mode = debruijn_graph::con_extention;
- params.early_tc.enable = false;
- params.early_tc.length_bound = 10;
- params.keep_perfect_loops = true;
- params.read_buffer_size = dsp_cfg::get().bp.read_buffer_size;
-
- ConstructGraphWithCoverage(params,
- streams,
- graph_pack.g,
- graph_pack.index,
- graph_pack.flanking_cov);
-}
-
-
-class DipSPAdesStorage{
-public:
- BaseHistogram<size_t> bulge_len_histogram;
- ContigStoragePtr default_storage;
- ContigStoragePtr composite_storage;
- CorrectionResult redundancy_map;
-};
-
-
-class DipSPAdes : public CompositeStage<DipSPAdesStorage> {
- DipSPAdesStorage dsp_params_;
-public:
- DipSPAdes() : CompositeStage<DipSPAdesStorage>("dipSPAdes", "dipspades") { }
-
- void load(debruijn_graph::conj_graph_pack&,
- const std::string &,
- const char*) { }
-
- void save(const debruijn_graph::conj_graph_pack&,
- const std::string &,
- const char*) const { }
-
- virtual ~DipSPAdes() { }
-};
-
-class ContigGraphConstructionStage : public DipSPAdes::Phase {
-public:
- ContigGraphConstructionStage() :
- DipSPAdes::Phase("Construction of graph from contigs", "contig_graph_construction") { }
-
- void run(debruijn_graph::conj_graph_pack &graph_pack, const char*) {
- construct_graph_from_contigs(graph_pack);
- }
-
- void load(debruijn_graph::conj_graph_pack& gp,
- const std::string &load_from,
- const char* prefix) {
- std::string p = path::append_path(load_from, prefix == NULL ? id() : prefix);
- INFO("Loading current state from " << p);
- debruijn_graph::graphio::ScanAll(p, gp, false);
-
- }
-
- void save(const debruijn_graph::conj_graph_pack& gp,
- const std::string & save_to,
- const char* prefix) const {
- std::string p = path::append_path(save_to, prefix == NULL ? id() : prefix);
- INFO("Saving current state to " << p);
- debruijn_graph::graphio::PrintAll(p, gp);
- }
-
- virtual ~ContigGraphConstructionStage() { }
-};
-
-class PolymorphicBulgeRemoverStage : public DipSPAdes::Phase {
-public:
- PolymorphicBulgeRemoverStage() :
- DipSPAdes::Phase("Polymorphic bulge remover", "polymorphic_br") { }
-
- void run(debruijn_graph::conj_graph_pack &graph_pack, const char*){
- if(dsp_cfg::get().pbr.enabled){
- PolymorphicBulgeRemover(graph_pack, storage().bulge_len_histogram).Run();
- INFO("Consensus graph was constructed");
- }
- }
-
- void load(debruijn_graph::conj_graph_pack& gp,
- const std::string &load_from,
- const char* prefix) {
- std::string p = path::append_path(load_from, prefix == NULL ? id() : prefix);
- INFO("Loading current state from " << p);
- debruijn_graph::graphio::ScanAll(p, gp, false);
- INFO("Loading histogram of bulge length");
- INFO("loading from " << p + ".hist");
- storage().bulge_len_histogram.LoadFrom(p + ".hist");
- }
-
- void save(const debruijn_graph::conj_graph_pack& gp,
- const std::string & save_to,
- const char* prefix) const {
- std::string p = path::append_path(save_to, prefix == NULL ? id() : prefix);
- INFO("Saving current state to " << p);
- debruijn_graph::graphio::PrintAll(p, gp);
- storage().bulge_len_histogram.SaveToFile(p + ".hist");
- }
-
- virtual ~PolymorphicBulgeRemoverStage() { }
-};
-
-class EqualKmerGluingStage : public DipSPAdes::Phase {
-public:
- EqualKmerGluingStage() :
- DipSPAdes::Phase("Equal k-mer gluing", "kmer_gluer") { }
-
- void run(debruijn_graph::conj_graph_pack &graph_pack, const char*) {
- INFO("Glueing equal kmers starts");
- EqualSequencesGluer<Graph>(graph_pack.g, graph_pack.index).GlueEqualKmers();
- INFO("Glueing equal kmers ends");
- }
-
- void load(debruijn_graph::conj_graph_pack& gp,
- const std::string &load_from,
- const char* prefix) {
- std::string p = path::append_path(load_from, prefix == NULL ? id() : prefix);
- INFO("Loading current state from " << p);
- debruijn_graph::graphio::ScanAll(p, gp, false);
- INFO("Loading histogram of bulge length");
- INFO("loading from " << p + ".hist");
- storage().bulge_len_histogram.LoadFrom(p + ".hist");
- }
-
- void save(const debruijn_graph::conj_graph_pack& gp,
- const std::string & save_to,
- const char* prefix) const {
- std::string p = path::append_path(save_to, prefix == NULL ? id() : prefix);
- INFO("Saving current state to " << p);
- debruijn_graph::graphio::PrintAll(p, gp);
- storage().bulge_len_histogram.SaveToFile(p + ".hist");
- }
-
- virtual ~EqualKmerGluingStage() { }
-};
-
-class ConsensusConstructionStage : public DipSPAdes::Phase {
-public:
- ConsensusConstructionStage() :
- DipSPAdes::Phase("Consensus contigs construction", "consensus_construction") { }
-
- void run(debruijn_graph::conj_graph_pack &graph_pack, const char*){
- if(dsp_cfg::get().cc.enabled){
- ConsensusContigsConstructor consensus_constructor(graph_pack, storage().bulge_len_histogram);
- consensus_constructor.Run();
- storage().composite_storage = consensus_constructor.CompositeContigsStorage();
- storage().default_storage = consensus_constructor.DefaultContigsStorage();
- storage().redundancy_map = consensus_constructor.RedundancyResult();
- }
- }
-
- void load(debruijn_graph::conj_graph_pack& gp,
- const std::string &load_from,
- const char* prefix) {
- std::string p = path::append_path(load_from, prefix == NULL ? id() : prefix);
- INFO("Loading current state from " << p);
- debruijn_graph::graphio::ScanAll(p, gp, false);
- }
-
- void save(const debruijn_graph::conj_graph_pack& gp,
- const std::string & save_to,
- const char* prefix) const {
- std::string p = path::append_path(save_to, prefix == NULL ? id() : prefix);
- INFO("Saving current state to " << p);
- debruijn_graph::graphio::PrintAll(p, gp);
- storage().bulge_len_histogram.SaveToFile(p + ".hist");
- }
-
- virtual ~ConsensusConstructionStage() { }
-};
-
-class HaplotypeAssemblyStage : public DipSPAdes::Phase {
-public:
- HaplotypeAssemblyStage() :
- DipSPAdes::Phase("Haplotype assembly", "haplotype_assembly") { }
-
- void run(debruijn_graph::conj_graph_pack &graph_pack, const char*) {
- if(!storage().composite_storage || !storage().default_storage)
- return;
- if(storage().composite_storage->Size() == 0 || storage().default_storage->Size() == 0)
- return;
- INFO("Diploid graph construction");
- conj_graph_pack double_graph_pack(graph_pack.k_value, dsp_cfg::get().io.tmp_dir,
- dsp_cfg::get().io.num_libraries, "");
- construct_graph_from_contigs(double_graph_pack);
- HaplotypeAssembler(graph_pack, double_graph_pack, storage().default_storage,
- storage().composite_storage, storage().redundancy_map).Run();
- }
-
- void load(debruijn_graph::conj_graph_pack&,
- const std::string &,
- const char*) { }
-
- void save(const debruijn_graph::conj_graph_pack&,
- const std::string &,
- const char*) const { }
-
- virtual ~HaplotypeAssemblyStage() { }
-};
-void run_dipspades() {
- INFO("dipSPAdes started");
-
- debruijn_graph::conj_graph_pack conj_gp(
- dsp_cfg::get().bp.K,
- dsp_cfg::get().io.tmp_dir,
- dsp_cfg::get().io.num_libraries,
- "", // reference genome
- 1); // flanking range
-
- conj_gp.kmer_mapper.Attach();
-
- StageManager DS_Manager ( {dsp_cfg::get().rp.developer_mode,
- dsp_cfg::get().io.saves,
- dsp_cfg::get().io.output_saves} );
- auto ds_phase = new DipSPAdes();
- ds_phase -> add(new ContigGraphConstructionStage()) ->
- add(new PolymorphicBulgeRemoverStage()) ->
- add(new EqualKmerGluingStage()) ->
- add(new ConsensusConstructionStage());
- if(dsp_cfg::get().ha.ha_enabled) {
- ds_phase->add(new HaplotypeAssemblyStage());
- }
-
- DS_Manager.add(ds_phase);
- DS_Manager.run(conj_gp, dsp_cfg::get().rp.entry_point.c_str());
- INFO("dipSPAdes finished");
-}
-
-}
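
The pipeline above is extended by deriving from DipSPAdes::Phase and chaining the stage with add(). A hypothetical extra phase, following the same pattern as ContigGraphConstructionStage (the class and stage names below are illustrative, not part of dipSPAdes):

    class ExtraStatsStage : public DipSPAdes::Phase {
    public:
        ExtraStatsStage() : DipSPAdes::Phase("Extra statistics", "extra_stats") { }

        void run(debruijn_graph::conj_graph_pack &graph_pack, const char*) {
            // Purely illustrative body.
            INFO("Graph built with K = " << graph_pack.k_value);
        }

        // load()/save() would mirror ContigGraphConstructionStage if the stage
        // needs checkpointing; empty stubs are enough otherwise.
        void load(debruijn_graph::conj_graph_pack&, const std::string &, const char*) { }
        void save(const debruijn_graph::conj_graph_pack&, const std::string &, const char*) const { }

        virtual ~ExtraStatsStage() { }
    };

    // Registration inside run_dipspades():
    //   ds_phase->add(new ExtraStatsStage());
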
diff --git a/src/dipspades/dipspades_config.cpp b/src/dipspades/dipspades_config.cpp
deleted file mode 100644
index 96ad099..0000000
--- a/src/dipspades/dipspades_config.cpp
+++ /dev/null
@@ -1,143 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#include "dipspades_config.hpp"
-#include "config_common.hpp"
-#include "utils/files_utils.hpp"
-
-using namespace dipspades;
-
-static std::string MakeLaunchTimeDirName() {
- time_t rawtime;
- struct tm * timeinfo;
- char buffer[80];
-
- time(&rawtime);
- timeinfo = localtime(&rawtime);
-
- strftime(buffer, 80, "%m.%d_%H.%M.%S", timeinfo);
- return std::string(buffer);
-}
-
-void load(dipspades_config::base_params &bp,
- boost::property_tree::ptree const &pt, bool) {
- using config_common::load;
- load(bp.K , pt, "K" );
- load(bp.max_memory , pt, "max_memory" );
- load(bp.max_threads , pt, "max_threads" );
- load(bp.read_buffer_size , pt, "read_buffer_size" );
-}
-
-void load(dipspades_config::io_params &io,
- boost::property_tree::ptree const &pt, bool) {
- using config_common::load;
- load(io.haplocontigs , pt, "haplocontigs" );
- io.num_libraries = GetAllLinesFromFile(io.haplocontigs).size();
-
- load(io.log_filename , pt, "log_filename" );
-
- load(io.output_base , pt, "output_base" );
- if (io.output_base[io.output_base.length() - 1] != '/')
- io.output_base += '/';
-
- load(io.output_dir , pt, "output_dir" );
- if (io.output_dir[io.output_dir.length() - 1] != '/')
- io.output_dir += '/';
-
- load(io.tmp_dir , pt, "tmp_dir" );
- if (io.tmp_dir[io.tmp_dir.length() - 1] != '/')
- io.tmp_dir += '/';
-
- load(io.saves , pt, "saves" );
- if(io.saves[io.saves.length() - 1] != '/')
- io.saves += '/';
-}
-
-void load(dipspades_config::run_params &rp,
- boost::property_tree::ptree const &pt, bool) {
- using config_common::load;
- load(rp.entry_point , pt, "entry_point" );
- load(rp.developer_mode , pt, "developer_mode");
-}
-
-void edit_io_params(bool developer_mode, dipspades_config::io_params &io){
- if(developer_mode){
- io.dataset_name = io.output_dir.substr(0, io.output_dir.length() - 1);
- io.output_dir = io.output_base + io.output_dir + "/";
- io.output_root = io.output_dir;
- io.output_suffix = MakeLaunchTimeDirName() + "/";
- io.output_dir = io.output_root + io.output_suffix;
- io.output_saves = io.output_dir + "saves/";
-// io.load_from = io.output_root + io.load_from;
- if (io.tmp_dir[0] != '/') { // relative path
- io.tmp_dir = io.output_dir + io.tmp_dir;
- }
- return;
- }
-
- // no developer mode
- io.dataset_name = io.output_dir;
- io.output_root = io.output_dir;
- io.output_suffix = "";
- io.output_base = "";
- io.output_saves = io.output_dir;
- io.saves = "";
- if (io.tmp_dir[0] != '/') { // relative path
- io.tmp_dir = io.output_dir + io.tmp_dir;
- }
-}
-
-inline void load(dipspades_config::polymorphic_br &pbr,
- boost::property_tree::ptree const& pt, bool){
- using config_common::load;
- load(pbr.enabled , pt, "enabled" );
- load(pbr.rel_bulge_length , pt, "rel_bulge_length" );
- load(pbr.rel_bulge_align , pt, "rel_bulge_align" );
- load(pbr.paired_vert_abs_threshold , pt, "paired_vert_abs_threshold" );
- load(pbr.paired_vert_rel_threshold , pt, "paired_vert_rel_threshold" );
- load(pbr.max_bulge_nucls_len , pt, "max_bulge_nucls_len" );
- load(pbr.max_neigh_number , pt, "max_neigh_number" );
- load(pbr.num_iters_lbr , pt, "num_iters_lbr" );
-}
-
-inline void load(dipspades_config::consensus_constructor &cc,
- boost::property_tree::ptree const& pt, bool /*complete*/){
- using config_common::load;
- load(cc.enabled , pt, "enabled" );
- load(cc.bulge_len_quantile , pt, "bulge_len_quantile" );
- load(cc.tails_lie_on_bulges , pt, "tails_lie_on_bulges" );
- load(cc.estimate_tails , pt, "estimate_tails" );
- load(cc.align_bulge_sides , pt, "align_bulge_sides" );
- load(cc.min_overlap_size , pt, "min_overlap_size" );
- load(cc.min_lcs_size , pt, "min_lcs_size" );
- load(cc.max_loop_length , pt, "max_loop_length" );
-}
-
-inline void load(dipspades_config::haplotype_assembly &ha,
- boost::property_tree::ptree const& pt, bool /*complete*/){
- using config_common::load;
- load(ha.ha_enabled , pt, "ha_enabled" );
-}
-
-void load(dipspades_config &cfg,
- boost::property_tree::ptree const &pt, bool complete){
- using config_common::load;
- load(cfg.bp , pt, "bp", complete);
- load(cfg.io , pt, "io", complete);
- load(cfg.rp , pt, "rp", complete);
- load(cfg.cc , pt, "cc", complete);
- load(cfg.ha , pt, "ha", complete);
- load(cfg.pbr , pt, "pbr", complete);
-}
-
-void load(dipspades_config &cfg, std::string const &filename) {
- boost::property_tree::ptree pt;
- boost::property_tree::read_info(filename, pt);
- load(cfg, pt, true);
- edit_io_params(cfg.rp.developer_mode, cfg.io);
-
-}
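
The top-level load() above parses the configuration with boost::property_tree::read_info, so the file uses Boost's "INFO" format with nested sections (bp, io, rp, cc, ha, pbr). A minimal, self-contained sketch of that underlying mechanism; the file name and values are examples only:

    #include <boost/property_tree/ptree.hpp>
    #include <boost/property_tree/info_parser.hpp>
    #include <iostream>
    #include <string>

    int main() {
        boost::property_tree::ptree pt;
        // A file containing, e.g.:  bp { K 55 }  io { output_dir out/ }
        boost::property_tree::read_info("dipspades.info", pt);

        // Dotted paths address nested sections.
        size_t k = pt.get<size_t>("bp.K");
        std::string out_dir = pt.get<std::string>("io.output_dir");
        std::cout << k << " " << out_dir << std::endl;
        return 0;
    }
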
diff --git a/src/dipspades/dipspades_config.hpp b/src/dipspades/dipspades_config.hpp
deleted file mode 100644
index aab9b2d..0000000
--- a/src/dipspades/dipspades_config.hpp
+++ /dev/null
@@ -1,82 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include "config_singl.hpp"
-#include <boost/property_tree/ptree_fwd.hpp>
-
-struct dipspades_config {
-
- struct base_params {
- size_t K;
- size_t max_threads;
- size_t max_memory;
- size_t read_buffer_size;
- };
-
- struct io_params {
- std::string haplocontigs;
- size_t num_libraries;
- std::string log_filename;
-
- std::string output_base;
- std::string output_root;
- std::string output_dir;
- std::string tmp_dir;
- std::string output_suffix;
- std::string output_saves;
-
- std::string dataset_name;
-
- std::string saves;
- };
-
- struct run_params {
- std::string entry_point;
- bool developer_mode;
- };
-
- struct polymorphic_br {
- bool enabled;
- double rel_bulge_length;
- double rel_bulge_align;
- size_t paired_vert_abs_threshold;
- double paired_vert_rel_threshold;
- size_t max_bulge_nucls_len;
- size_t max_neigh_number;
- size_t num_iters_lbr;
- size_t num_iters_hbr;
- };
-
- struct consensus_constructor {
- bool enabled;
- double bulge_len_quantile;
- bool tails_lie_on_bulges;
- bool align_bulge_sides;
- bool estimate_tails;
- size_t min_overlap_size;
- size_t min_lcs_size;
- size_t max_loop_length;
- };
-
- struct haplotype_assembly {
- bool ha_enabled;
- };
-
- base_params bp;
- io_params io;
- run_params rp;
- polymorphic_br pbr;
- consensus_constructor cc;
- haplotype_assembly ha;
-};
-
-void load(dipspades_config &cfg, std::string const &filename);
-
-typedef config_common::config<dipspades_config> dsp_cfg;
-
diff --git a/src/dipspades/haplotype_assembly/conservative_regions_searcher.hpp b/src/dipspades/haplotype_assembly/conservative_regions_searcher.hpp
deleted file mode 100644
index aaaed01..0000000
--- a/src/dipspades/haplotype_assembly/conservative_regions_searcher.hpp
+++ /dev/null
@@ -1,174 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include "../../debruijn/sequence_mapper.hpp"
-#include "contig_separation_utils.hpp"
-
-using namespace debruijn_graph;
-
-namespace dipspades {
-
-class ConservativeRegionsSearcher{
- conj_graph_pack & dbl_gp_;
- ContigStoragePtr storage_;
- SignedLabels signed_labels_;
- ConservativeRegionStorage cons_reg_storage_;
-
- NewExtendedSequenceMapper<conj_graph_pack::graph_t, conj_graph_pack::index_t> mapper_;
- map<int, MappingPath<EdgeId> > contig_map_path_;
-
- typedef map<int, vector<int> > diff_labeled_contigs;
- diff_labeled_contigs map_of_diff_contigs_;
-
- MappingPath<EdgeId> GetMappingPath(int contig){
- if(contig_map_path_.find(contig) == contig_map_path_.end()){
- auto seq = storage_->GetContigById(contig)->seq();
- MappingPath<EdgeId> map_path = mapper_.MapSequence(seq);
- contig_map_path_[contig] = map_path;
- }
- return contig_map_path_[contig];
- }
-
- void ComputeDifferentLabeledContigs(){
- for(auto it = signed_labels_.begin(); it != signed_labels_.end(); it++)
- if(it->second == from_different){
- int contig1 = it->first.first, contig2 = it->first.second;
- map_of_diff_contigs_[contig1].push_back(contig2);
- }
- }
-
- vector<EdgeId> GetConservativeEdges(vector<MappingPath<EdgeId> > paths,
- vector<int> labels){
-
- map<EdgeId, set<int> > edge_labels;
- for(size_t i = 0; i < paths.size(); i++){
- MappingPath<EdgeId> path = paths[i];
- int label = labels[i];
-
- for(size_t j = 0; j < path.size(); j++)
- edge_labels[path[j].first].insert(label);
- }
-
-// for(auto it = edge_labels.begin(); it != edge_labels.end(); it++){
-// cout << it->first << ". Labels ";
-// PrintSet<int>(cout, it->second);
-// }
-
- vector<EdgeId> cons_edges;
- for(auto it = edge_labels.begin(); it != edge_labels.end(); it++)
- if(it->second.size() > 1)
- cons_edges.push_back(it->first);
-
- return cons_edges;
- }
-
- vector<int> GatherLabelsForSeparatedContigs(vector<int> separated_contigs){
- vector<int> labels;
- labels.push_back(1);
- for(auto c = separated_contigs.begin(); c != separated_contigs.end(); c++){
- labels.push_back(2);
- }
- return labels;
- }
-
- vector<MappingPath<EdgeId> > GatherMappingPathForContigs(vector<int> contigs){
- vector<MappingPath<EdgeId> > map_paths;
- for(auto c = contigs.begin(); c != contigs.end(); c++)
- map_paths.push_back(GetMappingPath(*c));
- return map_paths;
- }
-
- void FindTwoColoredEdges(){
-
- ComputeDifferentLabeledContigs();
-
- for(auto it = map_of_diff_contigs_.begin(); it != map_of_diff_contigs_.end(); it++){
-
- auto contig = it->first;
- auto separated_contigs = it->second;
-
-// cout << contig << ". Separated set - ";
-// PrintVector<int>(cout, separated_contigs);
-
- auto labels = GatherLabelsForSeparatedContigs(separated_contigs);
-
- // gather all mapping paths
- auto contig_map_path = GetMappingPath(contig);
- vector<MappingPath<EdgeId> > map_paths = GatherMappingPathForContigs(separated_contigs);
- map_paths.insert(map_paths.begin(), contig_map_path);
-
- // find two or more colored edges
- auto cur_cons_edges = GetConservativeEdges(map_paths, labels);
-
- // add them in storage
- for(auto e = cur_cons_edges.begin(); e != cur_cons_edges.end(); e++)
- cons_reg_storage_.AddConservativeRegion(dbl_gp_.g.EdgeNucls(*e));
- }
- }
-
- void WriteConservativeRegionsStorageToFile(string filename, cons_regions_iterator iter_begin,
- cons_regions_iterator iter_end){
- ofstream fout(filename);
- int cnt = 1;
- for(auto it = iter_begin; it != iter_end; it++){
- Sequence curr_seq = *it;
- fout << ">" << cnt << "_conservative_region_length_" << curr_seq.size() << endl;
- fout << curr_seq.str() << endl;
- cnt++;
- }
- }
-
- size_t ComputeSummaryLengthOfRegionInStorage(cons_regions_iterator iter_begin,
- cons_regions_iterator iter_end){
- size_t summary_cons_reg_length = 0;
- for(auto it = iter_begin; it != iter_end; it++){
- summary_cons_reg_length += it->size();
- }
- return summary_cons_reg_length;
- }
-
-public:
- ConservativeRegionsSearcher(conj_graph_pack &dbl_gp, ContigStoragePtr storage,
- SignedLabels signed_labels, ConservativeRegionStorage cons_reg_storage) :
- dbl_gp_(dbl_gp),
- storage_(storage),
- signed_labels_(signed_labels),
- cons_reg_storage_(cons_reg_storage),
- mapper_(dbl_gp_.g, dbl_gp_.index,
- dbl_gp_.kmer_mapper) { }
-
- void Search(){
- FindTwoColoredEdges();
- size_t cons_regions_length = ComputeSummaryLengthOfRegionInStorage(cons_reg_storage_.cons_regions_begin(),
- cons_reg_storage_.cons_regions_end());
- if(cons_regions_length > 0){
- string cons_regions_fname(path::append_path(dsp_cfg::get().io.output_dir,
- "conservative_regions.fasta").c_str());
- WriteConservativeRegionsStorageToFile(cons_regions_fname, cons_reg_storage_.cons_regions_begin(),
- cons_reg_storage_.cons_regions_end());
- INFO("Conservative regions with total length " << cons_regions_length <<
- " written in file " << cons_regions_fname);
- }
-
- size_t poss_cons_regions_length = ComputeSummaryLengthOfRegionInStorage(cons_reg_storage_.poss_cons_regions_begin(),
- cons_reg_storage_.poss_cons_regions_end());
- if(poss_cons_regions_length > 0){
- string poss_cons_regions_fname(path::append_path(dsp_cfg::get().io.output_dir,
- "possibly_conservative_regions.fasta").c_str());
-// INFO("Possibly conservative regions written in file " << poss_cons_regions_fname);
- WriteConservativeRegionsStorageToFile(poss_cons_regions_fname, cons_reg_storage_.poss_cons_regions_begin(),
- cons_reg_storage_.poss_cons_regions_end());
- INFO("Conservative regions with total length " << poss_cons_regions_length <<
- " written in file " << poss_cons_regions_fname);
- }
- }
-};
-
-}
-
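
GetConservativeEdges() above marks an edge as conservative when contigs carrying more than one label (haplotype group) map onto it. The same idea in isolation, as a sketch with plain integers standing in for EdgeId:

    #include <map>
    #include <set>
    #include <vector>

    // Each mapped path is a list of edge ids; labels[i] is the haplotype label of path i.
    std::vector<int> ConservativeEdges(const std::vector<std::vector<int> > &paths,
                                       const std::vector<int> &labels) {
        std::map<int, std::set<int> > edge_labels;
        for (size_t i = 0; i < paths.size(); ++i)
            for (size_t j = 0; j < paths[i].size(); ++j)
                edge_labels[paths[i][j]].insert(labels[i]);

        std::vector<int> result;
        for (std::map<int, std::set<int> >::const_iterator it = edge_labels.begin();
             it != edge_labels.end(); ++it)
            if (it->second.size() > 1)   // covered by more than one haplotype group
                result.push_back(it->first);
        return result;
    }
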
diff --git a/src/dipspades/haplotype_assembly/conservative_regions_storage.hpp b/src/dipspades/haplotype_assembly/conservative_regions_storage.hpp
deleted file mode 100644
index c24065f..0000000
--- a/src/dipspades/haplotype_assembly/conservative_regions_storage.hpp
+++ /dev/null
@@ -1,44 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-namespace dipspades {
-
-typedef vector<Sequence>::iterator cons_regions_iterator;
-
-class ConservativeRegionStorage{
- vector<Sequence> cons_regions_;
- vector<Sequence> poss_cons_regions_;
-
-public:
- void AddConservativeRegion(Sequence seq){
- cons_regions_.push_back(seq);
- }
-
- void AddPossiblyConservativeRegion(Sequence seq){
- poss_cons_regions_.push_back(seq);
- }
-
- cons_regions_iterator cons_regions_begin(){
- return cons_regions_.begin();
- }
-
- cons_regions_iterator cons_regions_end(){
- return cons_regions_.end();
- }
-
- cons_regions_iterator poss_cons_regions_begin(){
- return poss_cons_regions_.begin();
- }
-
- cons_regions_iterator poss_cons_regions_end(){
- return poss_cons_regions_.end();
- }
-};
-
-}
diff --git a/src/dipspades/haplotype_assembly/contig_separation_utils.hpp b/src/dipspades/haplotype_assembly/contig_separation_utils.hpp
deleted file mode 100644
index ef494aa..0000000
--- a/src/dipspades/haplotype_assembly/contig_separation_utils.hpp
+++ /dev/null
@@ -1,515 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include "../utils/range_utils.hpp"
-#include "../utils/path_routines.hpp"
-#include "../utils/bulge_utils.hpp"
-#include "conservative_regions_storage.hpp"
-#include <string>
-
-using namespace debruijn_graph;
-
-namespace dipspades {
-
-enum haplotype {unknown, from_one, from_different};
-
-typedef map<pair<int, int>, haplotype>::iterator signed_label_iter;
-
-class SignedLabels{
-
- map<pair<int, int>, haplotype> contigs_pairs;
-
-public:
- void Add(int contig_id1, int contig_id2, haplotype new_label){
- pair<int, int> proc_pair(contig_id1, contig_id2);
- if(contigs_pairs.find(proc_pair) == contigs_pairs.end())
- contigs_pairs[proc_pair] = new_label;
- else{
- haplotype old_label = contigs_pairs[proc_pair];
- if(old_label < new_label)
- contigs_pairs[proc_pair] = new_label;
- }
- }
-
- haplotype GetHaplotypeByPair(int contig_id1, int contig_id2){
- return contigs_pairs[pair<int, int>(contig_id1, contig_id2)];
- }
-
- signed_label_iter begin(){
- return contigs_pairs.begin();
- }
-
- signed_label_iter end(){
- return contigs_pairs.end();
- }
-
- void MergeWith(SignedLabels new_signed_labels){
- for(auto it = new_signed_labels.begin(); it != new_signed_labels.end(); it++){
- Add(it->first.first, it->first.second, it->second);
- }
- }
-
- size_t Size(){
- return contigs_pairs.size();
- }
-
- string ToString(){
- stringstream ss;
- for(auto it = contigs_pairs.begin(); it != contigs_pairs.end(); it++)
- ss << "Pair " << it->first.first << ", " << it->first.second << " - " << it->second << ". ";
- return ss.str();
- }
-
- void WriteToFile(string fname, ContigStoragePtr contig_storage){
- ofstream out(fname.c_str());
- for(auto it= contigs_pairs.begin(); it != contigs_pairs.end(); it++)
- if(it->second == from_different){
- auto contig1 = contig_storage->GetContigById(it->first.first);
- auto contig2 = contig_storage->GetContigById(it->first.second);
- out << contig1->src_file() << ":" << contig1->name() << "\t" <<
- contig2->src_file() << ":" << contig2->name() << endl;
- }
- out.close();
- }
-};
-
-class ContigLabelAllocator{
- ContigStoragePtr contig_storage_;
-
- Sequence GetSequenceByRange(Sequence seq, pair<size_t, size_t> r){
- return seq.Subseq(r.first, r.second);
- }
-
- bool AreRangesIntersect(MappingRange mapping_range1, MappingRange mapping_range2){
-
- Range mapped_range1 = mapping_range1.mapped_range;
- Range mapped_range2 = mapping_range2.mapped_range;
-
- if(!is_intersection_exist(mapped_range1, mapped_range2))
- return false;
-
- Range intersection = get_intersection_of_ranges(mapped_range1, mapped_range2);
- return intersection.end_pos - intersection.start_pos > 100;
- }
-
- haplotype ComputeLabelForPair(MappingRange mapping_range1, Sequence seq1,
- MappingRange mapping_range2, Sequence seq2){
-
- Range mapped_range1 = mapping_range1.mapped_range;
- Range mapped_range2 = mapping_range2.mapped_range;
-
- VERIFY(is_intersection_exist(mapped_range1, mapped_range2));
-
- TRACE("Mapping range1: " << mapped_range1.start_pos << " " <<
- mapped_range1.end_pos);
- TRACE("Mapping range2: " << mapped_range2.start_pos << " " <<
- mapped_range2.end_pos);
-
- TRACE("Init range1: " << mapping_range1.initial_range.start_pos << " " <<
- mapping_range1.initial_range.end_pos);
- TRACE("Init range2: " << mapping_range2.initial_range.start_pos << " " <<
- mapping_range2.initial_range.end_pos);
-
-
- Range intersection = get_intersection_of_ranges(mapped_range1, mapped_range2);
-
- TRACE("Intersection: " << intersection.start_pos << " " << intersection.end_pos);
-
- auto new_init_pair1 = project_init_range_to_new(mapped_range1, intersection,
- mapping_range1.initial_range);
- auto new_init_pair2 = project_init_range_to_new(mapped_range2, intersection,
- mapping_range2.initial_range);
-
- TRACE("1st projection: " << new_init_pair1.first << " " << new_init_pair1.second);
- TRACE("2nd projection: " << new_init_pair2.first << " " << new_init_pair2.second);
-
- if(!is_range_pair_correct(new_init_pair1) || !is_range_pair_correct(new_init_pair2))
- return unknown;
-
- Sequence subseq1 = GetSequenceByRange(seq1, new_init_pair1);
- Sequence subseq2 = GetSequenceByRange(seq2, new_init_pair2);
-
- double relative_align = RelAlignmentOfSequences(subseq1, subseq2);
-
- TRACE("Seq1 size - " << subseq1.size() << ", seq2 size - " << subseq2.size());
- TRACE("Relative alignment - " << relative_align);
-
- if(fabs(relative_align) < 0.0001)
- return from_one;
- return from_different;
- }
-
-public:
- ContigLabelAllocator(ContigStoragePtr contig_storage) :
- contig_storage_(contig_storage) { }
-
- SignedLabels SignLabelsOnEdge(set<size_t> contigs, EdgeId current_edge){
-
- SignedLabels this_edge_labels;
- vector<int> indexes_of_edge;
- for(auto contig = contigs.begin(); contig != contigs.end(); contig++){
- int index = get_index_of_edge(contig_storage_->GetContigById(*contig)->
- mapping_path().simple_path(), current_edge);
- VERIFY(index != -1);
- indexes_of_edge.push_back(index);
- }
- vector<int> contig_ids(contigs.begin(), contigs.end());
- for(size_t cnt1 = 0; cnt1 < contig_ids.size(); cnt1++) {
- int id1 = contig_ids[cnt1];
- auto seq1 = contig_storage_->GetContigById(id1)->seq();
- auto mapping_path1 = contig_storage_->GetContigById(id1)->mapping_path();
- MappingRange mapping_range1 = mapping_path1[indexes_of_edge[cnt1]].second;
- for(size_t cnt2 = cnt1 + 1; cnt2 < contig_ids.size(); cnt2++) {
- int id2 = contig_ids[cnt2];
- auto seq2 = contig_storage_->GetContigById(id2)->seq();
- auto mapping_path2 = contig_storage_->GetContigById(id2)->mapping_path();
- TRACE("Sign label for " << id1 << " " << id2);
- TRACE("Seq1 size - " << seq1.size() << " , seq2 size - " << seq2.size())
- MappingRange mapping_range2 = mapping_path2[indexes_of_edge[cnt2]].second;
- if(AreRangesIntersect(mapping_range1, mapping_range2)){
- TRACE("Intersection exists");
- haplotype label = ComputeLabelForPair(mapping_range1, seq1, mapping_range2, seq2);
- this_edge_labels.Add(id1, id2, label);
- }
- }
- }
-
- return this_edge_labels;
- }
-
-private:
- DECL_LOGGER("ContigLabelAllocator");
-};
-
-class IndexedPairOfEdges{
- pair<EdgeId, EdgeId> edges;
- pair<size_t, size_t> indexes;
- bool is_correct;
-
-public:
- IndexedPairOfEdges(){
- is_correct = false;
- }
-
- IndexedPairOfEdges(EdgeId edge1, EdgeId edge2, size_t index1, size_t index2){
- edges.first = edge1;
- edges.second = edge2;
-
- indexes.first = index1;
- indexes.second = index2;
-
- is_correct = index1 <= index2;
- }
-
- EdgeId FirstEdge(){
- VERIFY(is_correct);
- return edges.first;
- }
-
- EdgeId SecondEdge(){
- VERIFY(is_correct);
- return edges.second;
- }
-
- size_t FirstIndex(){
- VERIFY(is_correct);
- return indexes.first;
- }
-
- size_t SecondIndex(){
- VERIFY(is_correct);
- return indexes.second;
- }
-
- bool IsNull(){
- return !is_correct;
- }
-};
-
-enum separation_result {not_identified, separated, diploid_repeat, conservative_region};
-
-class SeparationResultInterpretator{
-
- bool IsConservativeRegion(SignedLabels labels){
-
- if(labels.Size() == 0)
- return false;
-
- for(auto it = labels.begin(); it != labels.end(); it++){
- if(it->second == from_different || it->second == unknown)
- return false;
- }
-
- return true;
- }
-
- bool AddNewEdgeIntoBigraph(set<int> &first_part, set<int> &second_part,
- pair<int,int> new_edge){
-
- int vertex1 = new_edge.first, vertex2 = new_edge.second;
-
- if(first_part.find(vertex1) != first_part.end() &&
- first_part.find(vertex2) != first_part.end())
- return false;
-
- if(second_part.find(vertex1) != second_part.end() &&
- second_part.find(vertex2) != second_part.end())
- return false;
-
- if(first_part.find(vertex1) != first_part.end()){
- second_part.insert(vertex2);
- }
- else{
- if(second_part.find(vertex1) != second_part.end())
- first_part.insert(vertex2);
- else{
-
- if(first_part.find(vertex2) != first_part.end())
- second_part.insert(vertex1);
- else{
- first_part.insert(vertex1);
- second_part.insert(vertex2);
- }
- }
- }
- return true;
- }
-
- bool AreSeparatedContigs(SignedLabels labels){
-
- if(labels.Size() == 0)
- return false;
-
- set<int> first_part, second_part;
-
- for(auto it = labels.begin(); it != labels.end(); it++){
- if(it->second == from_different){
- pair<int, int> new_edge = it->first;
- if(!AddNewEdgeIntoBigraph(first_part, second_part, new_edge)){
- TRACE("Edge doesn't added");
- return false;
- }
- }
- }
-
- return true;
- }
-
-public:
- separation_result Interpretate(SignedLabels labels){
-
- if(labels.Size() == 0){
- TRACE("Result unknown");
- return not_identified;
- }
-
- if(IsConservativeRegion(labels)){
- TRACE("Conservative region");
- return conservative_region;
- }
-
- if(AreSeparatedContigs(labels)){
- TRACE("Contigs are separated into two haplotypes");
- return separated;
- }
-
- TRACE("Diploid repeat");
- return diploid_repeat;
- }
-};
-
-class DiploidContigSeparator{
-
- typedef map<EdgeId, set<size_t> > EdgeContigsMap;
-
- Graph &g_;
- ContigStoragePtr default_storage_;
- ContigStoragePtr composite_storage_;
- CorrectionResult res_of_corr_cycle_;
-
- set<size_t> GetInnerContigsOf(size_t composite_contig_index){
- MappingContigPtr contig = (*composite_storage_)[composite_contig_index];
- set<size_t> contigs;
- vector<MappingContigPtr> inner_contigs = contig->AllMappingContigs();
- if(inner_contigs.size() == 0)
- contigs.insert(contig->id());
- else{
- for(auto it = inner_contigs.begin(); it != inner_contigs.end(); it++){
- size_t cur_id = (*it)->id();
- contigs.insert(cur_id);
- auto set_red_conts = res_of_corr_cycle_.redundancy_map.GetValuesByKey(cur_id);
- for(auto c = set_red_conts.begin(); c != set_red_conts.end(); c++)
- contigs.insert(*c);
- }
- }
- return contigs;
- }
-
- set<EdgeId> GetSetOfEdgesFromPath(vector<EdgeId> path){
- set<EdgeId> res;
- for(auto e = path.begin(); e != path.end(); e++){
- res.insert(*e);
- }
- return res;
- }
-
- IndexedPairOfEdges DefineStartAndEndEdges(vector<EdgeId> common_path, MappingContigPtr inner_contig){
- MappingPath<EdgeId> map_path = inner_contig->mapping_path();
- VERIFY(map_path.size() > 0);
- EdgeId first_edge;
- size_t first_ind = size_t(-1);
- bool is_1st_found = false;
- for(size_t i = 0; i < map_path.size(); i++){
- for(size_t j = 0; j < common_path.size(); j++)
- if(map_path[i].first == common_path[j]){
- first_edge = map_path[i].first;
- first_ind = j;
- is_1st_found = true;
- break;
- }
- if(is_1st_found)
- break;
- }
-
- EdgeId last_edge;
- size_t last_ind = size_t(-1);
- bool is_2nd_found = false;
- for(int i = int(map_path.size() - 1); i >= 0; i--){
- for(int j = int(common_path.size()- 1); j >= 0; j--)
- if(map_path[i].first == common_path[j]){
- last_edge = map_path[i].first;
- last_ind = size_t(j);
- is_2nd_found = true;
- break;
- }
- if(is_2nd_found)
- break;
- }
-
- if(first_ind <= last_ind && is_1st_found && is_2nd_found)
- return IndexedPairOfEdges(first_edge, last_edge, first_ind, last_ind);
- else
- return IndexedPairOfEdges();
- }
-
- set<size_t> DeleteSubsetFromSet(set<size_t> set_, set<size_t> subset_){
- for(auto it = subset_.begin(); it != subset_.end(); it++)
- set_.erase(*it);
- return set_;
- }
-
- EdgeContigsMap DefineContigsOnEdges(set<size_t> contigs){
- EdgeContigsMap res;
- for(auto contig = contigs.begin(); contig != contigs.end(); contig++){
- auto map_path = default_storage_->GetContigById(*contig)->mapping_path();
- for(size_t i = 0; i < map_path.size(); i++)
- res[map_path[i].first].insert(*contig);
- }
- return res;
- }
-
- SignedLabels signed_labels_;
- ConservativeRegionStorage cons_regions_stor_;
-
-public:
- DiploidContigSeparator(Graph &g, ContigStoragePtr default_storage,
- ContigStoragePtr composite_storage, CorrectionResult res_of_corr_cycle) :
- g_(g), default_storage_(default_storage), composite_storage_(composite_storage),
- res_of_corr_cycle_(res_of_corr_cycle){
- }
-
- void SeparateContigs(){
-
- SignedLabels signed_labels;
- ContigLabelAllocator label_allocator(default_storage_);
-
- // for each composite contig
- for(size_t i = 0; i < composite_storage_->Size(); i++){
-
- TRACE("New composite contig");
-
- // computing set of inner contigs
- set<size_t> inner_contigs = GetInnerContigsOf(i);
-
- TRACE("Number of contigs - " << inner_contigs.size());
-
- // define which contigs intersect consensus path
- vector<EdgeId> consensus_path = (*composite_storage_)[i]->path_seq();
-
- set<EdgeId> start_edge_edges_set;
- map<size_t, IndexedPairOfEdges> contig_start_end_map;
- set<size_t> contigs_for_deletion;
-
- for(auto c = inner_contigs.begin(); c != inner_contigs.end(); c++){
- MappingContigPtr contig = default_storage_->GetContigById(*c);
- auto edges = DefineStartAndEndEdges(consensus_path, contig);
- if(!edges.IsNull()){
- contig_start_end_map[*c] = edges;
- start_edge_edges_set.insert(edges.FirstEdge());
- start_edge_edges_set.insert(edges.SecondEdge());
- }
- else
- contigs_for_deletion.insert(*c);
- }
-
- inner_contigs = DeleteSubsetFromSet(inner_contigs, contigs_for_deletion);
-
- EdgeContigsMap contigs_on_edge = DefineContigsOnEdges(inner_contigs);
-
- TRACE("Defining labels");
- SeparationResultInterpretator interpret;
- for(auto e = consensus_path.begin(); e != consensus_path.end(); e++){
-
- TRACE("Edge - " << g_.str(*e) << ", start - " << g_.str(g_.EdgeStart(*e)) <<
- ", end - " << g_.str(g_.EdgeEnd(*e)));
- auto contigs_ids_on_edge = contigs_on_edge[*e];
-
- if(contigs_ids_on_edge.size() == 1){
- cons_regions_stor_.AddPossiblyConservativeRegion(g_.EdgeNucls(*e));
- TRACE(g_.int_id(*e) << " - possibly conservative region");
- }
-
- TRACE("Contigs on this edge: " << SetToString<size_t>(contigs_ids_on_edge));
-
- SignedLabels current_signed_labels = label_allocator.SignLabelsOnEdge(contigs_ids_on_edge, *e);
-
- TRACE("Signed labels for this edge");
-// current_signed_labels.Print(cout);
- signed_labels_.MergeWith(current_signed_labels);
-
- TRACE("Interpretation of results");
- auto interpret_res = interpret.Interpretate(current_signed_labels);
- TRACE("------------------------------------------");
-
- if(interpret_res == conservative_region){
- cons_regions_stor_.AddConservativeRegion(g_.EdgeNucls(*e));
- TRACE(g_.int_id(*e) << " - conservative region");
- }
- }
- }
-
- TRACE("Signed labels:");
- TRACE(signed_labels_.ToString());
-
- }
-
- SignedLabels GetSignedLabels(){
- return signed_labels_;
- }
-
- ConservativeRegionStorage GetConservativeRegionStorage(){
- return cons_regions_stor_;
- }
-
-private:
- DECL_LOGGER("DiploidContigSeparator");
-
-};
-
-}
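
AreSeparatedContigs() above is, in effect, an incremental two-coloring test: contigs are vertices, "from_different" pairs are edges, and the contigs can be split into two haplotypes exactly when that graph is bipartite. A self-contained illustration of the same check via standard BFS 2-coloring (a sketch of the idea, not the dipSPAdes implementation):

    #include <map>
    #include <queue>
    #include <utility>
    #include <vector>

    bool IsTwoColorable(const std::vector<std::pair<int, int> > &edges) {
        std::map<int, std::vector<int> > adj;
        for (size_t i = 0; i < edges.size(); ++i) {
            adj[edges[i].first].push_back(edges[i].second);
            adj[edges[i].second].push_back(edges[i].first);
        }
        std::map<int, int> color;  // vertex -> 0 or 1 (the two haplotypes)
        for (std::map<int, std::vector<int> >::const_iterator it = adj.begin(); it != adj.end(); ++it) {
            if (color.count(it->first))
                continue;
            std::queue<int> q;
            q.push(it->first);
            color[it->first] = 0;
            while (!q.empty()) {
                int v = q.front();
                q.pop();
                const std::vector<int> &nb = adj[v];
                for (size_t j = 0; j < nb.size(); ++j) {
                    if (!color.count(nb[j])) {
                        color[nb[j]] = 1 - color[v];
                        q.push(nb[j]);
                    } else if (color[nb[j]] == color[v]) {
                        return false;  // odd cycle: no split into two haplotypes exists
                    }
                }
            }
        }
        return true;
    }
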
diff --git a/src/dipspades/haplotype_assembly/haplotype_assembler.hpp b/src/dipspades/haplotype_assembly/haplotype_assembler.hpp
deleted file mode 100644
index b966050..0000000
--- a/src/dipspades/haplotype_assembly/haplotype_assembler.hpp
+++ /dev/null
@@ -1,59 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include "conservative_regions_searcher.hpp"
-
-namespace dipspades {
-
-class HaplotypeAssembler {
-
- conj_graph_pack &consensus_graph_pack_;
- conj_graph_pack &double_graph_pack_;
- ContigStoragePtr default_storage_;
- ContigStoragePtr composite_storage_;
- CorrectionResult redundancy_map_;
-
-public:
-
- HaplotypeAssembler(conj_graph_pack &consensus_graph_pack,
- conj_graph_pack &double_graph_pack,
- ContigStoragePtr default_storage,
- ContigStoragePtr composite_storage,
- CorrectionResult redundancy_map) :
- consensus_graph_pack_(consensus_graph_pack),
- double_graph_pack_(double_graph_pack),
- default_storage_(default_storage),
- composite_storage_(composite_storage),
- redundancy_map_(redundancy_map) {
- double_graph_pack_.kmer_mapper.Attach();
- }
-
- void Run() {
- INFO("Contigs separation starts");
- DiploidContigSeparator separator(consensus_graph_pack_.g, default_storage_,
- composite_storage_, redundancy_map_);
- INFO("Haplocontigs number: " << default_storage_->Size());
- INFO("Consensus contigs number: " << composite_storage_->Size());
- separator.SeparateContigs();
- SignedLabels signed_labels = separator.GetSignedLabels();
- string hapl_output(path::append_path(dsp_cfg::get().io.output_dir, "haplotype_assembly.out").c_str());
- signed_labels.WriteToFile(hapl_output, default_storage_);
- INFO("Result of haplotype assembly written in file " << hapl_output);
- INFO("Contigs separation ends");
-
- INFO("Conservative regions search starts");
- ConservativeRegionStorage conservative_regions = separator.GetConservativeRegionStorage();
- ConservativeRegionsSearcher cons_regions_searcher(double_graph_pack_, default_storage_,
- signed_labels, conservative_regions);
- cons_regions_searcher.Search();
- INFO("Conservative regions search ends");
- }
-};
-
-}
diff --git a/src/dipspades/kmer_gluing/equal_sequence_gluer.hpp b/src/dipspades/kmer_gluing/equal_sequence_gluer.hpp
deleted file mode 100644
index ff4ca29..0000000
--- a/src/dipspades/kmer_gluing/equal_sequence_gluer.hpp
+++ /dev/null
@@ -1,146 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include "../utils/edge_gluer.hpp"
-
-using namespace debruijn_graph;
-
-namespace dipspades {
-
-template<class Graph>
-class EqualSequencesGluer {
-private:
- typedef typename Graph::EdgeId EdgeId;
- typedef typename Graph::VertexId VertexId;
-
- Graph &graph_;
- conj_graph_pack::index_t &index_;
-
- EdgeId ExtractShortEdge(EdgeId edge, size_t pos) {
- if(pos + 1 < graph_.length(edge)) {
- edge = graph_.SplitEdge(edge, pos + 1).first;
- }
- if(pos > 0) {
- edge = graph_.SplitEdge(edge, pos).second;
- }
- VERIFY(graph_.length(edge) == 1);
- return edge;
- }
-
- bool CheckClose(size_t a, size_t b, size_t diff) const {
- return a <= b + diff && b <= a + diff;
- }
-
- bool ConjugateEdgesCannotBeSplitted(size_t edge_length, size_t pos1, size_t pos2) {
- return CheckClose(edge_length, pos1 + pos2 + 1, 1) && CheckClose(pos1, pos2, 1);
- }
-
- void GlueEqualEdgeParts(EdgeId edge1, size_t pos1, EdgeId edge2, size_t pos2) {
- TRACE("Edge1: " << graph_.int_id(edge1) << ", length: " << graph_.length(edge1) << ", pos: " << pos1);
- TRACE("Edge2: " << graph_.int_id(edge2) << ", length: " << graph_.length(edge2) << ", pos: " << pos2);
- VERIFY(edge1 != edge2 || pos1 != pos2);
- if(edge1 == edge2) {
- if(edge1 == graph_.conjugate(edge2)) {
-			WARN("Equal k-mer gluer faced a difficult situation in graph for edge " << graph_.int_id(edge1)
-					<< ". Equal k-mers were ignored.");
- return;
- }
- if(pos1 > pos2) {
- std::swap(pos1, pos2);
- }
- pair<EdgeId, EdgeId> split_edges = graph_.SplitEdge(edge1, pos2);
- edge1 = split_edges.first;
- edge2 = split_edges.second;
- pos2 = 0;
- } else if(edge1 == graph_.conjugate(edge2)) {
- TRACE("Edges are conjugate pairs");
- if(ConjugateEdgesCannotBeSplitted(graph_.length(edge1), pos1, pos2)) {
- WARN("Equal k-mer gluer faced a difficult situation in graph for edges " << graph_.int_id(edge1) <<
- " and " << graph_.int_id(edge2) << ". Equal k-mers were ignored.");
- return;
- }
- if (pos1 + pos2 == graph_.length(edge1) - 1) {
-				WARN("Equal k-mer gluer faced a difficult situation in graph for edge " << graph_.int_id(edge1)
-						<< ". Equal k-mers were ignored.");
- }
- if(pos1 + pos2 >= graph_.length(edge1) - 1) {
- size_t tmp = pos1;
- pos1 = graph_.length(edge1) - pos2 - 1;
- pos2 = graph_.length(edge1) - tmp - 1;
- }
-			INFO(pos1 << " " << pos2 << " " << graph_.length(edge1));
-			TRACE("Edge1 " << graph_.int_id(edge1) << " will be split");
-			pair<EdgeId, EdgeId> split_edges = graph_.SplitEdge(edge1, pos1 + 1);
-			TRACE("Split pair was created");
- TRACE("New edge1: " << graph_.int_id(split_edges.first) << ", length: " << graph_.length(split_edges.first));
- TRACE("New edge2: " << graph_.int_id(split_edges.second) << ", length: " << graph_.length(split_edges.second));
- edge1 = split_edges.first;
- edge2 = graph_.conjugate(split_edges.second);
-// pos2 -= pos1 + 1;
- }
- EdgeId se1 = ExtractShortEdge(edge1, pos1);
- EdgeId se2 = ExtractShortEdge(edge2, pos2);
- VERIFY(graph_.EdgeNucls(se1) == graph_.EdgeNucls(se2));
- GlueEqualEdges(se1, se2);
- }
-
- void SafelyGlueEdges(EdgeId e1, EdgeId e2){
- // e1 -> e2
- vector<EdgeId> forbidden_edges = {e1, e2};
- EdgeGluer(graph_).MoveEdgesFromVertexToVertex(graph_.EdgeStart(e1),
- graph_.EdgeStart(e2), forbidden_edges);
- EdgeGluer(graph_).MoveEdgesFromVertexToVertex(graph_.EdgeEnd(e1),
- graph_.EdgeEnd(e2), forbidden_edges);
- graph_.GlueEdges(e1, e2);
- }
-
- void GlueEqualEdges(EdgeId edge1, EdgeId edge2) {
- set<VertexId> endVertices = {graph_.EdgeStart(edge1), graph_.EdgeEnd(edge1),
- graph_.EdgeStart(edge2), graph_.EdgeEnd(edge2),
- graph_.conjugate(graph_.EdgeStart(edge1)),
- graph_.conjugate(graph_.EdgeEnd(edge1)),
- graph_.conjugate(graph_.EdgeStart(edge2)),
- graph_.conjugate(graph_.EdgeEnd(edge2))};
- if(endVertices.size() != 8)
- return;
- SafelyGlueEdges(edge1, edge2);
- }
-
-public:
- EqualSequencesGluer(Graph &graph, conj_graph_pack::index_t &index): graph_(graph), index_(index) { }
-
- Sequence get(EdgeId e, size_t pos) const {
- return graph_.EdgeNucls(e).subseq(pos, pos + graph_.k() + 1);
- }
-
- void GlueEqualKmers() {
- size_t cnt = 0;
- for(auto it = graph_.SmartEdgeBegin(); !it.IsEnd(); ++it) {
- Sequence nucls = graph_.EdgeNucls(*it);
- runtime_k::RtSeq kmer = nucls.start<runtime_k::RtSeq>(graph_.k() + 1) >> 'A';
- for(size_t i = graph_.k(); i < graph_.length(*it); i++) {
- kmer = kmer << graph_.EdgeNucls(*it)[i];
- if(!index_.contains(kmer)) {
- continue;
- }
- pair<EdgeId, size_t> pos = index_.get(kmer);
- if(pos.first != *it || pos.second != i - graph_.k()) {
- GlueEqualEdgeParts(pos.first, pos.second, *it, i - graph_.k());
- cnt++;
- break;
- }
- }
- }
-		INFO(cnt << " k-mers glued");
- }
-
-private:
- DECL_LOGGER("EqualSequencesGluer");
-};
-
-}
diff --git a/src/dipspades/main.cpp b/src/dipspades/main.cpp
deleted file mode 100644
index 1a5b79f..0000000
--- a/src/dipspades/main.cpp
+++ /dev/null
@@ -1,120 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-/*
- * Assembler Main
- */
-#include "standard.hpp"
-#include "logger/log_writers.hpp"
-
-#include "segfault_handler.hpp"
-#include "stacktrace.hpp"
-#include "memory_limit.hpp"
-#include "copy_file.hpp"
-#include "perfcounter.hpp"
-#include "runtime_k.hpp"
-
-
-#include "graph_pack.hpp"
-#include "construction.hpp"
-#include "stage.hpp"
-
-#include "dipspades.hpp"
-#include "dipspades_config.hpp"
-
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <unistd.h>
-
-#include "segfault_handler.hpp"
-#include "version.hpp"
-
-void make_dirs(){
- make_dir(dsp_cfg::get().io.output_base);
- make_dir(dsp_cfg::get().io.output_root);
- make_dir(dsp_cfg::get().io.output_dir);
- make_dir(dsp_cfg::get().io.output_saves);
- make_dir(dsp_cfg::get().io.tmp_dir);
-}
-
-void copy_configs(string cfg_filename, string to) {
- using namespace debruijn_graph;
-
- if (!make_dir(to)) {
-		WARN("Could not create directory for config files");
- }
- path::copy_files_by_ext(path::parent_path(cfg_filename), to, ".info", true);
-}
-
-void load_config(string cfg_filename) {
- path::CheckFileExistenceFATAL(cfg_filename);
- dsp_cfg::create_instance(cfg_filename);
-// string path_to_copy = path::append_path(dsp_cfg::get().io.output_dir, "configs");
-// copy_configs(cfg_filename, path_to_copy);
-}
-
-void create_console_logger(string cfg_filename) {
- using namespace logging;
-
- string log_props_file = dsp_cfg::get().io.log_filename;
-
- if (!path::FileExists(log_props_file)){
- log_props_file = path::append_path(path::parent_path(cfg_filename), dsp_cfg::get().io.log_filename);
- }
-
- logger *lg = create_logger(path::FileExists(log_props_file) ? log_props_file : "");
- lg->add_writer(std::make_shared<console_writer>());
- attach_logger(lg);
-}
-
-int main(int /*argc*/, char** argv) {
- perf_counter pc;
- const size_t GB = 1 << 30;
-
- srand(42);
- srandom(42);
-
- segfault_handler sh;
-
- try {
- using namespace debruijn_graph;
- string cfg_filename = argv[1];
- load_config (cfg_filename);
- make_dirs();
- if(dsp_cfg::get().rp.developer_mode)
- copy_configs(cfg_filename, path::append_path(dsp_cfg::get().io.output_dir, "configs"));
- create_console_logger(cfg_filename);
-
- VERIFY(dsp_cfg::get().bp.K >= runtime_k::MIN_K && dsp_cfg::get().bp.K < runtime_k::MAX_K);
- VERIFY(dsp_cfg::get().bp.K % 2 != 0);
-
- limit_memory(dsp_cfg::get().bp.max_memory * GB);
-
- INFO("Starting dipSPAdes, built from " SPADES_GIT_REFSPEC ", git revision " SPADES_GIT_SHA1);
- INFO("Assembling dataset (" << dsp_cfg::get().io.dataset_name << ") with K=" << dsp_cfg::get().bp.K);
- dipspades::run_dipspades();
-// link_output("latest_success");
- } catch (std::bad_alloc const& e) {
- std::cerr << "Not enough memory to run SPAdes. " << e.what() << std::endl;
- return EINTR;
- } catch (std::exception const& e) {
- std::cerr << "Exception caught " << e.what() << std::endl;
- return EINTR;
- } catch (...) {
- std::cerr << "Unknown exception caught " << std::endl;
- return EINTR;
- }
-
- unsigned ms = (unsigned)pc.time_ms();
- unsigned secs = (ms / 1000) % 60;
- unsigned mins = (ms / 1000 / 60) % 60;
- unsigned hours = (ms / 1000 / 60 / 60);
- INFO("Assembling time: " << hours << " hours " << mins << " minutes " << secs << " seconds");
-
- // OK
- return 0;
-}
diff --git a/src/dipspades/polymorphic_bulge_remover/bulge_correction_condition.hpp b/src/dipspades/polymorphic_bulge_remover/bulge_correction_condition.hpp
deleted file mode 100644
index eae7009..0000000
--- a/src/dipspades/polymorphic_bulge_remover/bulge_correction_condition.hpp
+++ /dev/null
@@ -1,128 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include "../utils/bulge_utils.hpp"
-#include "../dipspades_config.hpp"
-
-using namespace debruijn_graph;
-
-namespace dipspades {
-
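-// Interface for conditions that check whether a bulge (or a single bulge side)
-// is safe to glue.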
-class BaseBulgeCorrectionCondition{
-protected:
- Graph &graph_;
-public:
- BaseBulgeCorrectionCondition(Graph &graph) : graph_(graph) { }
- virtual bool IsBulgeCorrect(shared_ptr<BaseBulge> bulge) = 0;
- virtual bool IsPathBulgeSide(vector<EdgeId> path) = 0;
- virtual ~BaseBulgeCorrectionCondition(){ }
-};
-
-class RelatedVerticesCondition : public BaseBulgeCorrectionCondition {
- bool TwoVerticesRelated(VertexId v1, VertexId v2){
- return graph_.RelatedVertices(v1, v2);
- }
-
- bool PathContainsNoRelatedToVertex(vector<EdgeId> path, VertexId vertex,
- bool check_start_vertex = false){
- VERIFY(path.size() != 0);
-
- if(check_start_vertex)
- if(TwoVerticesRelated(graph_.EdgeStart(path[0]), vertex))
- return false;
-
- for(auto e = path.begin(); e != path.end(); e++)
- if(TwoVerticesRelated(vertex, graph_.EdgeEnd(*e)))
- return false;
- return true;
- }
-
- bool PathContainsNoRelatedVertices(vector<EdgeId> path){
- if(!PathContainsNoRelatedToVertex(path, graph_.EdgeStart(path[0])))
- return false;
- for(auto e1 = path.begin(); e1 != path.end(); e1++)
- for(auto e2 = e1 + 1; e2 != path.end(); e2++)
- if(TwoVerticesRelated(graph_.EdgeEnd(*e1), graph_.EdgeEnd(*e2)))
- return false;
- return true;
- }
-
- bool PathsContainNoRelatedVertices(shared_ptr<BaseBulge> bulge){
- auto path1 = bulge->path1();
- auto path2 = bulge->path2();
- for(auto e1 = path1.begin(); e1 != path1.end(); e1++)
- for(auto e2 = path2.begin(); e2 != path2.end(); e2++)
- if((e1 != path1.end() - 1) && (e2 != path2.end() - 1))
- if(TwoVerticesRelated(graph_.EdgeEnd(*e1), graph_.EdgeEnd(*e2)))
- return false;
- return true;
- }
-
-public:
- RelatedVerticesCondition(Graph &graph) : BaseBulgeCorrectionCondition(graph) { }
- bool IsBulgeCorrect(shared_ptr<BaseBulge> bulge){
- if(!PathContainsNoRelatedVertices(bulge->path1()) ||
- !PathContainsNoRelatedVertices(bulge->path2()))
- return false;
- return PathsContainNoRelatedVertices(bulge);
- }
-
- bool IsPathBulgeSide(vector<EdgeId> path){
- return PathContainsNoRelatedVertices(path);
- }
-};
-
-class AdjacencyToAutoRCEdges : public BaseBulgeCorrectionCondition {
-
-public:
- AdjacencyToAutoRCEdges(Graph &graph) : BaseBulgeCorrectionCondition(graph) { }
-
- bool IsBulgeCorrect(shared_ptr<BaseBulge> bulge){
- return IsPathBulgeSide(bulge->path1()) && IsPathBulgeSide(bulge->path2());
- }
-
- bool IsPathBulgeSide(vector<EdgeId> path){
- return !PathAdjacentRelatedEdges(graph_, path);
- }
-};
-
-class DiploidyCondition : public BaseBulgeCorrectionCondition {
- double rel_length_;
- double rel_align_;
-public:
- DiploidyCondition(Graph &graph,
- double rel_length,
- double rel_align) :
- BaseBulgeCorrectionCondition(graph),
- rel_length_(rel_length),
- rel_align_(rel_align) { }
-
- bool IsBulgeCorrect(shared_ptr<BaseBulge> bulge){
- return bulge->IsBulgeDiploid(rel_length_, rel_align_);
- }
-
- bool IsPathBulgeSide(vector<EdgeId>){
- return true;
- }
-};
-
-class CorrectSplitCondition : public BaseBulgeCorrectionCondition {
-public:
- CorrectSplitCondition(Graph &graph) : BaseBulgeCorrectionCondition(graph) { }
-
- bool IsBulgeCorrect(shared_ptr<BaseBulge> bulge){
- return bulge->path1().size() == bulge->path2().size();
- }
-
- bool IsPathBulgeSide(vector<EdgeId>){
- return true;
- }
-};
-
-}
diff --git a/src/dipspades/polymorphic_bulge_remover/bulge_gluer.hpp b/src/dipspades/polymorphic_bulge_remover/bulge_gluer.hpp
deleted file mode 100644
index d7e0f51..0000000
--- a/src/dipspades/polymorphic_bulge_remover/bulge_gluer.hpp
+++ /dev/null
@@ -1,88 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include "glue_direction_definer.hpp"
-#include "gluing_vertices_definer.hpp"
-#include "bulge_splitter.hpp"
-#include "bulge_correction_condition.hpp"
-#include "../utils/edge_gluer.hpp"
-
-using namespace debruijn_graph;
-
-namespace dipspades {
-
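-// Glues the two sides of a complex (multi-edge) bulge: a direction definer
-// chooses which side is glued onto which, gluing vertices are selected, both
-// sides are split into paths of equal size and then glued edge by edge.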
-template<class GlueDirectionDefiner, class GluingVericesDefiner, class BulgeSplitter>
-class ComplexBulgeGluer {
-
- Graph &graph_;
- GlueDirectionDefiner glue_dir_definer_;
- GluingVericesDefiner glue_definer_;
- BulgeSplitter splitter_;
-
- bool IsSplittedBulgeCorrect(shared_ptr<BaseBulge> splitted_bulge){
- return !splitted_bulge->IsEmpty() && CorrectSplitCondition(graph_).IsBulgeCorrect(splitted_bulge) &&
- RelatedVerticesCondition(graph_).IsBulgeCorrect(splitted_bulge);
- }
-
- void GlueSplittedBulge(shared_ptr<BaseBulge> splitted_bulge){
- size_t bulge_edge_len = splitted_bulge->path1().size();
- EdgeGluer edge_gluer(graph_);
- TRACE("Edge gluer starts");
- for(size_t i = 0; i < bulge_edge_len - 1; i++){
- auto edge1 = splitted_bulge->path1()[i];
- auto edge2 = splitted_bulge->path2()[i];
- auto next_edge1 = splitted_bulge->path1()[i + 1];
- TRACE("edge1 - " << graph_.str(edge1) << ", edge2 - " << graph_.str(edge2) <<
- ", next_edge1 - " << graph_.str(next_edge1));
- vector<EdgeId> tmp = {edge1, edge2, next_edge1};
- edge_gluer.MoveEdgesFromVertexToVertex(
- graph_.EdgeEnd(edge1),
- graph_.EdgeEnd(edge2),
- tmp);
- graph_.GlueEdges(edge1, edge2);
- TRACE("Edges were moved");
- }
- graph_.GlueEdges(splitted_bulge->path1()[bulge_edge_len - 1],
- splitted_bulge->path2()[bulge_edge_len - 1]);
- TRACE("Gluing was completed");
- }
-
-public:
- ComplexBulgeGluer(Graph &graph, GlueDirectionDefiner glue_dir_definer,
- GluingVericesDefiner glue_definer, BulgeSplitter splitter) :
- graph_(graph),
- glue_dir_definer_(glue_dir_definer),
- glue_definer_(glue_definer),
- splitter_(splitter) { }
-
- bool GlueBulge(shared_ptr<BaseBulge> bulge){
- auto glue_dir = glue_dir_definer_.Define(bulge);
- TRACE("Gluing direction - " << glue_dir);
- if(glue_dir == undefined)
- return false;
-
- shared_ptr<BaseBulge> directed_bulge(new DirectedBulge(graph_, bulge, glue_dir));
- TRACE("Glue vertices definer starts");
- auto glue_def_res = glue_definer_.Run(directed_bulge);
- TRACE("Bulge splitter starts");
- auto splitted_bulge = splitter_.SplitBulge(directed_bulge, glue_def_res);
-
- if(IsSplittedBulgeCorrect(splitted_bulge)){
- TRACE("Splitted bulge correct");
- GlueSplittedBulge(splitted_bulge);
- return true;
- }
- return false;
- }
-
-private:
- DECL_LOGGER("ComplexBulgeGluer");
-};
-
-}
diff --git a/src/dipspades/polymorphic_bulge_remover/bulge_paths_searcher.hpp b/src/dipspades/polymorphic_bulge_remover/bulge_paths_searcher.hpp
deleted file mode 100644
index 94295a0..0000000
--- a/src/dipspades/polymorphic_bulge_remover/bulge_paths_searcher.hpp
+++ /dev/null
@@ -1,97 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include <vector>
-#include "../../include/omni/dijkstra_tools/dijkstra_helper.hpp"
-#include "../../include/omni/path_processor.hpp"
-#include "../dipspades_config.hpp"
-
-using namespace std;
-using namespace debruijn_graph;
-using namespace omnigraph;
-
-namespace dipspades {
-
-class DijkstraBulgePathsSearcher {
- typedef map<VertexId, vector<EdgeId> > shortest_paths;
-
- Graph &graph_;
- size_t search_depth_;
- size_t max_neigh_number_;
-
-public:
- DijkstraBulgePathsSearcher(Graph &graph,
- size_t search_depth,
- size_t max_neigh_number) :
- graph_(graph),
- search_depth_(search_depth),
- max_neigh_number_(max_neigh_number) {
- TRACE("Search depth - " << search_depth);
- }
-
- vector<VertexId> VerticesReachedFrom(VertexId start_vertex) {
- auto bounded_dijkstra = DijkstraHelper<Graph>::CreateBoundedDijkstra(this->graph_,
- this->search_depth_, this->max_neigh_number_);
- bounded_dijkstra.Run(start_vertex);
- TRACE("Reached vertices size - " << bounded_dijkstra.ReachedVertices());
- return bounded_dijkstra.ReachedVertices();
- }
-
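-	// Returns the shortest path to end_vertex and, for every other incoming edge
-	// of end_vertex whose start vertex was reached by Dijkstra, the shortest path
-	// to that start extended by the incoming edge.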
- vector<vector<EdgeId> > GetAllPathsTo(VertexId start_vertex, VertexId end_vertex) {
- auto bounded_dijkstra = DijkstraHelper<Graph>::CreateBoundedDijkstra(this->graph_,
- this->search_depth_, this->max_neigh_number_);
- bounded_dijkstra.Run(start_vertex);
-
- vector<vector<EdgeId> > alternative_paths;
- auto shortest_path = bounded_dijkstra.GetShortestPathTo(end_vertex);
- alternative_paths.push_back(shortest_path);
- if(shortest_path.size() == 0)
- return alternative_paths;
-
- EdgeId shpath_last_edge = shortest_path[shortest_path.size() - 1];
- for(auto in_edge = this->graph_.IncomingEdges(end_vertex).begin();
- in_edge != this->graph_.IncomingEdges(end_vertex).end(); in_edge++){
- if(shpath_last_edge != *in_edge &&
- bounded_dijkstra.DistanceCounted(graph_.EdgeStart(*in_edge))){
- auto curr_short_path = bounded_dijkstra.GetShortestPathTo(graph_.EdgeStart(*in_edge));
- curr_short_path.push_back(*in_edge);
- alternative_paths.push_back(curr_short_path);
- }
- }
- return alternative_paths;
- }
-
-private:
- DECL_LOGGER("DijkstraBulgePathsSearcher");
-};
-
-class PathProcessorBulgeSearcher {
- Graph &graph_;
- size_t search_depth_;
-public:
- PathProcessorBulgeSearcher(Graph &graph, size_t search_depth) :
- graph_(graph),
- search_depth_(search_depth) { }
-
- vector<VertexId> VerticesReachedFrom(VertexId start_vertex) {
- auto bounded_dijkstra = DijkstraHelper<Graph>::CreateBoundedDijkstra(this->graph_,
- this->search_depth_);
- bounded_dijkstra.Run(start_vertex);
- return bounded_dijkstra.ReachedVertices();
- }
-
- vector<vector<EdgeId> > GetAllPathsTo(VertexId start_vertex, VertexId end_vertex) {
- PathStorageCallback<Graph> callback(this->graph_);
- ProcessPaths(this->graph_, 0, this->search_depth_,
- start_vertex, end_vertex, callback);
- return callback.paths();
- }
-};
-
-}
diff --git a/src/dipspades/polymorphic_bulge_remover/bulge_splitter.hpp b/src/dipspades/polymorphic_bulge_remover/bulge_splitter.hpp
deleted file mode 100644
index 0da9bb1..0000000
--- a/src/dipspades/polymorphic_bulge_remover/bulge_splitter.hpp
+++ /dev/null
@@ -1,497 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include "../utils/bulge_utils.hpp"
-#include "../utils/element_printers.hpp"
-
-#include "gluing_vertices_definer.hpp"
-
-using namespace debruijn_graph;
-using namespace io;
-
-namespace dipspades {
-
-// both values are inclusive:
-// the first index is the index of the first edge of the subpath,
-// the second index is the index of the last edge of the subpath
-typedef pair<size_t, size_t> subpath_range;
-
-class SplitResult {
- Graph &graph_;
- vector<EdgeId> path1_;
- vector<EdgeId> path2_;
-
- bool CheckExtention(vector<EdgeId> &old_path, vector<EdgeId> &new_path){
- if(old_path.size() != 0 && new_path.size() != 0)
- return graph_.EdgeEnd(old_path[old_path.size() - 1]) ==
- graph_.EdgeStart(new_path[0]);
- return true;
- }
-
- void ExtendPath(vector<EdgeId> &old_path, vector<EdgeId> new_path){
- VERIFY(CheckExtention(old_path, new_path));
- old_path.insert(old_path.end(), new_path.begin(), new_path.end());
- }
-
-public:
- SplitResult(Graph &graph, vector<EdgeId> path1, vector<EdgeId> path2) :
- graph_(graph), path1_(path1), path2_(path2) { }
-
- SplitResult(Graph &graph) : graph_(graph) { }
-
- void ExtendPaths(SplitResult new_results) {
- ExtendPath(path1_, new_results.path1());
- ExtendPath(path2_, new_results.path2());
- }
-
- vector<EdgeId> path1() { return path1_; }
-
- vector<EdgeId> path2() { return path2_; }
-
- bool IsEmpty() { return path1_.size() == 0 || path2_.size() == 0; }
-};
-
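-// Splits two bulge subpaths so that both consist of the same number of edges:
-// internal vertices of the two sides are ordered by their relative distance
-// from the subpath start, and the opposite edge is split at the matching
-// relative position (vertices that are already close enough are simply paired).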
-class SubpathsSplitter {
-
- typedef vector<EdgeId> edge_path;
-
- Graph &graph_;
- pair<edge_path, edge_path> old_paths_;
- pair<vector<size_t>, vector<size_t> > part_lens_;
- pair<size_t, size_t> num_splits_;
- pair<edge_path, edge_path > split_paths_;
- pair<vector<size_t>, vector<size_t> > partlen_split_paths_;
- pair<size_t, size_t> spath_lens_;
- pair<subpath_range, subpath_range> ranges_;
-
- enum owner { no_owner, first_path, second_path };
- struct vertex_to_split {
-		double rel_dist_;	// relative distance of the vertex from the start of the subpath
-		owner owner_path_;	// path that owns this vertex
-		size_t edge_ind_;	// index of the edge whose end coincides with this vertex
-
- vertex_to_split(double rel_dist, owner owner_path, size_t edge_ind) :
- rel_dist_(rel_dist), owner_path_(owner_path), edge_ind_(edge_ind) { }
- };
-
- vector<vertex_to_split> vertices_to_split_;
-
- struct split_find_result {
- EdgeId edge_;
- size_t pos_;
- bool correct_;
-
- split_find_result() : edge_(), pos_(), correct_(false) { }
-
- split_find_result(EdgeId edge, size_t pos, bool correct) :
- edge_(edge),
- pos_(pos),
- correct_(correct) { }
-
- void initialize(EdgeId edge, size_t pos, bool correct){
- edge_ = edge;
- pos_ = pos;
- correct_ = correct;
- }
- };
-
- void clear(){
- split_paths_.first.clear();
- split_paths_.second.clear();
- partlen_split_paths_.first.clear();
- partlen_split_paths_.second.clear();
- vertices_to_split_.clear();
- }
-
- edge_path CutSubpath(edge_path path, subpath_range range){
- return edge_path(path.begin() + range.first, path.begin() + range.second + 1);
- }
-
- size_t DefineStartEdgePosition(size_t num_splits, subpath_range range){
- if(num_splits == 0)
- return range.second;
- return range.first;
- }
-
- size_t DefineSpathLenBefore(vector<size_t> &part_len, subpath_range range){
- if(range.first == 0)
- return 0;
- return part_len[range.first - 1];
- }
-
- size_t DefineSubpathLen(vector<size_t> &part_len, subpath_range range){
- return part_len[range.second] - DefineSpathLenBefore(part_len, range);
- }
-
- void InitializeSplitVectors(size_t num_splits, edge_path &split_path, vector<size_t> &split_lens,
- edge_path old_path, subpath_range range){
- if(num_splits == 0){
- split_path = CutSubpath(old_path, range);
- split_lens = CalculatePathPartLens(graph_, split_path);
- return;
- }
- split_path.push_back(old_path[range.first]);
- split_lens.push_back(graph_.length(old_path[range.first]));
- }
-
- size_t IndexFirstOppositeEdge(size_t split_index, set<size_t> &processed_indices){
- for(size_t index = split_index + 1; index < vertices_to_split_.size(); index++)
- if(vertices_to_split_[index].owner_path_ != vertices_to_split_[split_index].owner_path_)
- if(processed_indices.find(index) == processed_indices.end())
- return index;
- return vertices_to_split_.size() - 1;
- }
-
- bool VerticesMergePossible(size_t ind1, size_t ind2){
- VERIFY(vertices_to_split_[ind1].owner_path_ != vertices_to_split_[ind2].owner_path_);
-		// TODO: move this magic constant into the config file
- return fabs(vertices_to_split_[ind1].rel_dist_ - vertices_to_split_[ind2].rel_dist_) < .01;
- }
-
- bool OwnersMatch(size_t ind, owner owner_path){
- return vertices_to_split_[ind].owner_path_ == owner_path;
- }
-
- pair<size_t, size_t> OrderByPaths(size_t ind1, size_t ind2){
- VERIFY(vertices_to_split_[ind1].owner_path_ != vertices_to_split_[ind2].owner_path_);
- if(OwnersMatch(ind1, first_path))
- return pair<size_t, size_t>(ind1, ind2);
- return pair<size_t, size_t>(ind2, ind1);
- }
-
- void PerformMerge(pair<size_t, size_t> indexes){
- size_t edge_ind1 = vertices_to_split_[indexes.first].edge_ind_ + 1;
- split_paths_.first.push_back(old_paths_.first[edge_ind1]);
- partlen_split_paths_.first.push_back(partlen_split_paths_.first[partlen_split_paths_.first.size() - 1] +
- graph_.length(old_paths_.first[edge_ind1]));
-
- size_t edge_ind2 = vertices_to_split_[indexes.second].edge_ind_ + 1;
- split_paths_.second.push_back(old_paths_.second[edge_ind2]);
- partlen_split_paths_.second.push_back(partlen_split_paths_.second[partlen_split_paths_.second.size() - 1] +
- graph_.length(old_paths_.second[edge_ind2]));
- }
-
- EdgeId get_last_edge_by_owner(owner path_owner){
- if(path_owner == first_path)
- return split_paths_.first[split_paths_.first.size() - 1];
- return split_paths_.second[split_paths_.second.size() - 1];
-// size_t path_index = vertices_to_split_[index].edge_ind_;
-// owner path_owner = vertices_to_split_[index].owner_path_;
-// if(path_owner == first_path)
-// return old_paths_.first[path_index];
-// return old_paths_.second[path_index];
- }
-
- split_find_result FindSplitPosition(pair<size_t, size_t> indices, size_t oppos_spaths_len,
- vector<size_t> &oppos_pathlen){
- EdgeId edge_to_split = get_last_edge_by_owner(
- vertices_to_split_[indices.second].owner_path_);
- size_t split_pos = size_t(vertices_to_split_[indices.first].rel_dist_ * double(oppos_spaths_len));
- TRACE("Absolute split position " << split_pos);
- TRACE("oppos_pathlen[oppos_pathlen.size() - 2] - " << oppos_pathlen[oppos_pathlen.size() - 2]);
- if(oppos_pathlen.size() != 1 && split_pos >= oppos_pathlen[oppos_pathlen.size() - 2])
- split_pos -= oppos_pathlen[oppos_pathlen.size() - 2];
-
- if(split_pos == 0) split_pos++;
-
- TRACE("Edge before split - " << graph_.str(edge_to_split) <<
- ", split pos - " << split_pos);
-
- return split_find_result(edge_to_split, split_pos, split_pos < graph_.length(edge_to_split));
- }
-
- void UpdateSplittedPath(edge_path &path, pair<EdgeId, EdgeId> splitted_edges){
- if(path.size() == 0)
- path.push_back(splitted_edges.first);
- else
- path[path.size() - 1] = splitted_edges.first;
- path.push_back(splitted_edges.second);
- }
-
- void UpdatesplittedPartLens(vector<size_t> &part_lens, pair<EdgeId, EdgeId> splitted_edges){
- if(part_lens.size() == 0)
- part_lens.push_back(graph_.length(splitted_edges.first));
- else if(part_lens.size() == 1)
- part_lens[0] = graph_.length(splitted_edges.first);
- else
- part_lens[part_lens.size() - 1] = part_lens[part_lens.size() - 2] +
- graph_.length(splitted_edges.first);
- part_lens.push_back(part_lens[part_lens.size() - 1] + graph_.length(splitted_edges.second));
- }
-
- void SplitOppositeEdge(split_find_result split_res, edge_path &oppos_path,
- vector<size_t> &oppos_partlen){
- if(!split_res.correct_ || graph_.length(split_res.edge_) < split_res.pos_)
- return;
- pair<EdgeId, EdgeId> splitted_edges = graph_.SplitEdge(split_res.edge_, split_res.pos_);
- TRACE("Edges after split - " << graph_.str(splitted_edges.first) << " " <<
- graph_.str(splitted_edges.second));
- UpdateSplittedPath(oppos_path, splitted_edges);
- UpdatesplittedPartLens(oppos_partlen, splitted_edges);
- }
-
-	// in each pair the first element refers to the path being split, the second to the opposite path
- bool PerformSplit(pair<size_t, size_t> indexes,
- pair<edge_path&, edge_path& > split_paths,
- pair<vector<size_t>&, vector<size_t>& > split_partlens,
- pair<edge_path&, edge_path& > default_paths,
- pair<size_t, size_t> spaths_len,
- pair<size_t, size_t> num_splits){
-
- TRACE("New path1 before: " << SimplePathWithVerticesToString(graph_, split_paths.first));
- TRACE("New path2 before: " << SimplePathWithVerticesToString(graph_, split_paths.second));
-
- TRACE("FindEdgeAndSplitPosition");
- split_find_result split_res = FindSplitPosition(indexes, spaths_len.second,
- split_partlens.second);
-
- if(!split_res.correct_){
- TRACE("Split was not performed");
- return false;
- }
-
- TRACE("SplitOppositeEdge");
- SplitOppositeEdge(split_res, split_paths.second, split_partlens.second);
-
-		// update the non-split path
-		TRACE("Update the non-split path");
- if(num_splits.second != 0){
- size_t edge_ind = vertices_to_split_[indexes.first].edge_ind_ + 1;
- split_paths.first.push_back(default_paths.first[edge_ind]);
- split_partlens.first.push_back(split_partlens.first[split_partlens.first.size() - 1] +
- graph_.length(default_paths.first[edge_ind]));
- }
-
- TRACE("New path1 after: " << SimplePathWithVerticesToString(graph_, split_paths.first));
- TRACE("New path2 after: " << SimplePathWithVerticesToString(graph_, split_paths.second));
-
- return true;
- }
-
-	// the function expects that the order in pair_to_order matches (first_path, second_path)
- template<typename T>
- pair<T&, T&> OrderBySplitAndOpposite(size_t split_ind, size_t oppos_ind, pair<T, T> &pair_to_order){
- VERIFY(vertices_to_split_[split_ind].owner_path_ !=
- vertices_to_split_[oppos_ind].owner_path_);
- if(vertices_to_split_[split_ind].owner_path_ == first_path)
- return pair<T&, T&>(pair_to_order.first, pair_to_order.second);
- return pair<T&, T&>(pair_to_order.second, pair_to_order.first);
- }
-
- bool PerformSplitting(subpath_range range1, subpath_range range2){
- TRACE("Vector initialization");
- InitializeSplitVectors(num_splits_.second, split_paths_.first,
- partlen_split_paths_.first, old_paths_.first, range1);
- InitializeSplitVectors(num_splits_.first, split_paths_.second,
- partlen_split_paths_.second, old_paths_.second, range2);
-
- size_t num_done_splits = 0;
- size_t split_index = 1;
-
- TRACE("Splitting cycle starts");
-
- set<size_t> processed_indices;
- while(num_done_splits < num_splits_.first + num_splits_.second){
-			TRACE("Split index - " << split_index << ", owner - " <<
- vertices_to_split_[split_index].owner_path_);
-
- size_t opposite_index = IndexFirstOppositeEdge(split_index, processed_indices);
- TRACE("Opposite index - " << opposite_index << ", owner - " <<
- vertices_to_split_[opposite_index].owner_path_);
-
- if(processed_indices.find(split_index) == processed_indices.end()){
- if(VerticesMergePossible(split_index, opposite_index) &&
- (opposite_index != vertices_to_split_.size() - 2) &&
- (opposite_index != vertices_to_split_.size() - 1)){
-
- TRACE("Merge starts");
- PerformMerge(OrderByPaths(split_index, opposite_index));
- num_done_splits += 2;
- processed_indices.insert(opposite_index);
-
- TRACE("Merge was performed");
- }
- else{
- TRACE("Split starts");
-
- bool split_res = PerformSplit(pair<size_t, size_t>(split_index, opposite_index),
- OrderBySplitAndOpposite<edge_path>(split_index, opposite_index, split_paths_),
- OrderBySplitAndOpposite<vector<size_t> >(split_index, opposite_index, partlen_split_paths_),
- OrderBySplitAndOpposite<edge_path>(split_index, opposite_index, old_paths_),
- OrderBySplitAndOpposite<size_t>(split_index, opposite_index, spath_lens_),
- OrderBySplitAndOpposite<size_t>(split_index, opposite_index, num_splits_));
-
- if(!split_res)
- return false;
-
- num_done_splits++;
- TRACE("Split was performed");
- }
-
- processed_indices.insert(split_index);
- }
-		TRACE("Number of completed splits - " << num_done_splits);
- split_index ++;
- TRACE("-------------------------");
- }
- TRACE("Splitting cycle ends");
- TRACE("-------------------------");
- return true;
- }
-
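-	// Builds a list of the internal vertices of both subpaths, ordered by their
-	// relative distance from the subpath start and bracketed by sentinel entries.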
- void CreateVectorSplitVertices(subpath_range range1, subpath_range range2){
- pair<size_t, size_t> lens_before_spath(DefineSpathLenBefore(part_lens_.first, range1),
- DefineSpathLenBefore(part_lens_.second, range2));
- spath_lens_ = pair<size_t, size_t>(DefineSubpathLen(part_lens_.first, range1),
- DefineSubpathLen(part_lens_.second, range2));
- vertices_to_split_.push_back(vertex_to_split(0, no_owner, 0));
- size_t iter1 = range1.first;
- size_t iter2 = range2.first;
-
- TRACE("Partlens for 1st vector: " << VectorToString<size_t>(part_lens_.first));
- TRACE("Partlens for 2nd vector: " << VectorToString<size_t>(part_lens_.second));
- TRACE("Slens before - " << lens_before_spath.first << " " << lens_before_spath.second);
-
- for(size_t i = 0; i < num_splits_.first + num_splits_.second; i++){
- double rel_dist1 = double(part_lens_.first[iter1] - lens_before_spath.first) /
- double(spath_lens_.first);
- double rel_dist2 = double(part_lens_.second[iter2] - lens_before_spath.second) /
- double(spath_lens_.second);
- if(rel_dist1 < rel_dist2){
- vertices_to_split_.push_back(vertex_to_split(rel_dist1, first_path, iter1));
- iter1++;
- }
- else{
- vertices_to_split_.push_back(vertex_to_split(rel_dist2, second_path, iter2));
- iter2++;
- }
- }
- vertices_to_split_.push_back(vertex_to_split(1.0, second_path, iter2));
- vertices_to_split_.push_back(vertex_to_split(1.0, first_path, iter1));
- }
-
-public:
- SubpathsSplitter(Graph &graph, shared_ptr<BaseBulge> bulge) :
- graph_(graph),
- old_paths_(pair<edge_path, edge_path>(bulge->path1(), bulge->path2())),
- part_lens_(make_pair(CalculatePathPartLens(graph_, old_paths_.first),
- CalculatePathPartLens(graph_, old_paths_.second))),
- num_splits_(),
- split_paths_(),
- partlen_split_paths_() { }
-
- SplitResult SplitSubpaths(subpath_range range1, subpath_range range2) {
- clear();
-
-		ranges_.first = range1;
-		ranges_.second = range2;
-
-		// number of splits on the 1st and the 2nd subpaths
-		num_splits_.first = range1.second - range1.first;
-		num_splits_.second = range2.second - range2.first;
-
- TRACE("Range 1: " << range1.first << " - " << range1.second);
- TRACE("Range 2: " << range2.first << " - " << range2.second);
- TRACE("Num splits 1 - " << num_splits_.first << ", num splits 2 - " << num_splits_.second);
-
- TRACE("Subpath to split1 - " << SimplePathWithVerticesToString(graph_, CutSubpath(old_paths_.first, range1)));
- TRACE("Subpath to split2 - " << SimplePathWithVerticesToString(graph_, CutSubpath(old_paths_.second, range2)));
-
- if(num_splits_.first + num_splits_.second == 0)
- return SplitResult(graph_, CutSubpath(old_paths_.first, range1),
- CutSubpath(old_paths_.second, range2));
-
- CreateVectorSplitVertices(range1, range2);
-
- TRACE("Vertices to split:");
- for(auto it = vertices_to_split_.begin(); it != vertices_to_split_.end(); it++)
- TRACE(it->rel_dist_ << " " << it->owner_path_ << " " << it->edge_ind_ );
-
- TRACE("Auxiliary vectors were created");
-
- if(!PerformSplitting(range1, range2))
- return SplitResult(graph_);
-
-		TRACE("Split subpath1 - " << SimplePathWithVerticesToString(graph_, split_paths_.first));
-		TRACE("Split subpath2 - " << SimplePathWithVerticesToString(graph_, split_paths_.second));
- return SplitResult(graph_, split_paths_.first, split_paths_.second);
- }
-
-private:
- DECL_LOGGER("SubpathSplitter");
-};
-
-class BulgeSplitter {
- Graph &graph_;
-public:
- BulgeSplitter(Graph &graph) : graph_(graph) { }
-
- shared_ptr<BaseBulge> SplitBulge(shared_ptr<BaseBulge> bulge, GluingVericesDefinerResults gluing_def_results) {
- if(bulge->IsSimple()){
- TRACE("Bulge is simple. Splitting was not performed");
- return shared_ptr<BaseBulge>(new Bulge(graph_, graph_.k(), bulge->path1(), bulge->path2()));
- }
-
- SubpathsSplitter spaths_splitter(graph_, bulge);
- if(gluing_def_results.size() == 0){
- // one big split
-			TRACE("No gluing vertices. Split will be performed between start and end vertices");
- auto split_res = spaths_splitter.SplitSubpaths(
- subpath_range(0, bulge->path1().size() - 1),
- subpath_range(0, bulge->path2().size() - 1));
-			TRACE("Bulge was split");
- TRACE("1st new bulge side - " << SimplePathWithVerticesToString(graph_, split_res.path1()));
- TRACE("2nd new bulge side - " << SimplePathWithVerticesToString(graph_, split_res.path2()));
- return shared_ptr<BaseBulge>(new Bulge(graph_, graph_.k(), split_res.path1(), split_res.path2()));
- }
- TRACE(gluing_def_results.size() << " - number of gluing pairs");
- // splitting before first gluing pair
- TRACE("Splitting before first gluing pair");
- auto split_result = spaths_splitter.SplitSubpaths(
- subpath_range(0, gluing_def_results.begin()->first),
- subpath_range(0, gluing_def_results.begin()->second));
-
- if(split_result.IsEmpty())
- return shared_ptr<BaseBulge>(new Bulge(graph_));
-
- // perform all intermediate splittings
- TRACE("All intermediate splittings");
- for(auto iter1 = gluing_def_results.begin(), iter2 = ++gluing_def_results.begin();
- iter2 != gluing_def_results.end(); iter1++, iter2++){
- TRACE("Gluing pairs - (" << iter1->first << " " << iter1->second << ") (" <<
- iter2->first << " " << iter2->second << ")");
- auto new_split_res = spaths_splitter.SplitSubpaths(
- subpath_range(iter1->first + 1, iter2->first),
- subpath_range(iter1->second + 1, iter2->second));
- if(new_split_res.IsEmpty())
- return shared_ptr<BaseBulge>(new Bulge(graph_));
- split_result.ExtendPaths(new_split_res);
- }
-
-		// splitting after the last gluing pair
-		TRACE("Splitting after the last gluing pair");
- auto last_split_res = spaths_splitter.SplitSubpaths(
- subpath_range((--gluing_def_results.end())->first + 1, bulge->path1().size() - 1),
- subpath_range((--gluing_def_results.end())->second + 1, bulge->path2().size() - 1));
- if(last_split_res.IsEmpty())
- return shared_ptr<BaseBulge>(new Bulge(graph_));
- split_result.ExtendPaths(last_split_res);
-
- TRACE("New bulge path1 - " << SimplePathWithVerticesToString(graph_, split_result.path1()));
- TRACE("New bulge path2 - " << SimplePathWithVerticesToString(graph_, split_result.path2()));
- TRACE("Splitting completed");
-
- return shared_ptr<BaseBulge>(new Bulge(graph_, graph_.k(), split_result.path1(), split_result.path2()));
- }
-
-private:
- DECL_LOGGER("BulgeSplitter");
-};
-
-}
diff --git a/src/dipspades/polymorphic_bulge_remover/complex_bulge_remover.hpp b/src/dipspades/polymorphic_bulge_remover/complex_bulge_remover.hpp
deleted file mode 100644
index 9b6d43e..0000000
--- a/src/dipspades/polymorphic_bulge_remover/complex_bulge_remover.hpp
+++ /dev/null
@@ -1,145 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include "../../debruijn/config_struct.hpp"
-#include "../../debruijn/graphio.hpp"
-#include "../../debruijn/construction.hpp"
-
-#include "../utils/path_routines.hpp"
-#include "../utils/element_printers.hpp"
-#include "../utils/histogram.hpp"
-
-#include "bulge_correction_condition.hpp"
-#include "bulge_gluer.hpp"
-#include "diploid_bulge_finder.hpp"
-
-#include "../../include/io/splitting_wrapper.hpp"
-
-#include <stdlib.h>
-#include <memory.h>
-#include <math.h>
-
-namespace dipspades {
-
-bool EdgeExists(Graph &graph_, size_t edge_id){
- for(auto e = graph_.SmartEdgeBegin(); !e.IsEnd(); ++e)
- if(graph_.int_id(*e) == edge_id)
- return true;
- return false;
-}
-
-template<class BulgePathsSearcher, class BulgeGluer>
-class BulgeRemoverAlgorithm{
- typedef vector<vector<EdgeId> > paths;
-protected:
- Graph &graph_;
- BulgeGluer bulge_gluer_;
- BaseHistogram<size_t> &hist_;
- const dipspades_config::polymorphic_br &pbr_config_;
-
- DiploidBulgeFinder bulge_finder_;
- DiploidyCondition dip_bulge_checker_;
- RelatedVerticesCondition rel_bulge_checker_;
-
- bool BulgeExistFrom(VertexId start){
- return graph_.OutgoingEdgeCount(start) > 1;
- }
-
- bool BulgeExistTo(VertexId end){
- return graph_.IncomingEdgeCount(end) > 1;
- }
-
- void FillHistogram(shared_ptr<BaseBulge> bulge){
- hist_.Add(max<size_t>(GetPathLength(graph_, bulge->path1()),
- GetPathLength(graph_, bulge->path2())));
- }
-
- bool FindGlueBulge(paths &bulge_paths){
- TRACE("Bulge finder from " << bulge_paths.size() << " paths starts");
- auto bulge = bulge_finder_.Find(bulge_paths);
- if(bulge->IsEmpty()){
- TRACE("Paths do not form a bulge");
- return false;
- }
- TRACE("Paths form a bulge");
- TRACE("Bulge gluing starts");
- if(!rel_bulge_checker_.IsBulgeCorrect(bulge)/* ||
- !dip_bulge_checker_.IsBulgeCorrect(bulge)*/){
-			TRACE("Bulge did not satisfy the diploidy condition");
- return false;
- }
-
- TRACE("Correct bulge:");
- TRACE("Path1:" << SimplePathWithVerticesToString(graph_, bulge->path1()));
- TRACE("Path2:" << SimplePathWithVerticesToString(graph_, bulge->path2()));
-
- FillHistogram(bulge);
- TRACE("Diploid condition was passed");
- if(!bulge_gluer_.GlueBulge(bulge))
- return false;
-
- TRACE("Bulge gluing ends");
- return true;
- }
-
-public:
- BulgeRemoverAlgorithm(Graph &graph,
- BulgeGluer bulge_gluer,
- BaseHistogram<size_t> &hist,
- const dipspades_config::polymorphic_br &pbr_config) :
- graph_(graph),
- bulge_gluer_(bulge_gluer),
- hist_(hist),
- pbr_config_(pbr_config),
- bulge_finder_(graph, pbr_config.rel_bulge_length, pbr_config.rel_bulge_align),
- dip_bulge_checker_(graph, pbr_config.rel_bulge_length, pbr_config.rel_bulge_align),
- rel_bulge_checker_(graph) { }
-
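-	// For every vertex with more than one outgoing edge, collects reachable
-	// vertices with more than one incoming edge and tries to find and glue a
-	// bulge between each such pair of vertices.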
- size_t Run(){
- size_t num_merged_paths = 0;
- BulgePathsSearcher paths_searcher(graph_,
- max<size_t>(hist_.max(), pbr_config_.max_bulge_nucls_len),
- pbr_config_.max_neigh_number);
- INFO("Maximal length of glued bulge: " << hist_.max());
- TRACE("BulgeRemoverAlgorithm starts");
- for(auto v = graph_.SmartVertexBegin(); !v.IsEnd(); ++v){
- TRACE("Processing vertex " << graph_.str(*v));
- if(BulgeExistFrom(*v)){
- auto reached_vertices = paths_searcher.VerticesReachedFrom(*v);
-			TRACE("Number of neighbors - " << reached_vertices.size());
- for(auto neigh = SmartSetIterator<Graph, VertexId>(graph_,
- reached_vertices.begin(), reached_vertices.end());
- !neigh.IsEnd(); ++neigh){
- if(*neigh != *v && BulgeExistTo(*neigh)){
- TRACE("Bulge can be found");
-					TRACE("Processing neighbor " << graph_.str(*neigh));
- auto bulge_paths = paths_searcher.GetAllPathsTo(*v, *neigh);
-
- TRACE("Bulge paths:");
- for(auto p = bulge_paths.begin(); p != bulge_paths.end(); p++)
- TRACE(SimplePathWithVerticesToString(graph_, *p));
-
- if(FindGlueBulge(bulge_paths)){
- num_merged_paths++;
- TRACE("Bulge was glued");
- break;
- }
- }
- }
- }
- }
- TRACE(num_merged_paths << " bulges were glued");
- return num_merged_paths;
- }
-
-private:
- DECL_LOGGER("PolymorphicBulgeRemover");
-};
-
-}
diff --git a/src/dipspades/polymorphic_bulge_remover/diploid_bulge_finder.hpp b/src/dipspades/polymorphic_bulge_remover/diploid_bulge_finder.hpp
deleted file mode 100644
index 24565b3..0000000
--- a/src/dipspades/polymorphic_bulge_remover/diploid_bulge_finder.hpp
+++ /dev/null
@@ -1,102 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#include "../utils/bulge_utils.hpp"
-#include "bulge_paths_searcher.hpp"
-
-using namespace debruijn_graph;
-
-namespace dipspades {
-
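-// Among a set of alternative paths between two vertices, finds a pair of
-// edge-disjoint paths whose lengths and sequences are similar enough to be
-// treated as the two sides of a diploid bulge.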
-class DiploidBulgeFinder {
- typedef vector<vector<EdgeId> > paths;
-
- Graph &graph_;
- double rel_length_;
- double rel_align_;
-
- bool RelativeLengthGood(double rel_length){
- return rel_length >= rel_length_;
- }
-
- bool RelativeAlignGood(double rel_align){
- return rel_align <= rel_align_;
- }
-
- bool BulgePathsIntersected(vector<EdgeId> &path1, vector<EdgeId> &path2){
- for(auto e1 = path1.begin(); e1 != path1.end(); e1++)
- for(auto e2 = path2.begin(); e2 != path2.end(); e2++)
- if(*e1 == *e2)
- return true;
- return false;
- }
-
- vector<pair<size_t, size_t> > ChoosePairsWithoutIntersection(paths &bulge_paths){
- vector<pair<size_t, size_t> > correct_pairs;
- for(size_t i = 0; i < bulge_paths.size(); i++)
- for(size_t j = i + 1; j < bulge_paths.size(); j++)
- if(!BulgePathsIntersected(bulge_paths[i], bulge_paths[j]))
- correct_pairs.push_back(make_pair(i, j));
- return correct_pairs;
- }
-
- vector<pair<size_t, size_t> > DefineLenSatisfiedPairs(vector<size_t> &lens,
- vector<pair<size_t, size_t> > pairs){
- vector<pair<size_t, size_t> > good_pairs;
- for(auto it = pairs.begin(); it != pairs.end(); it++)
- if(RelativeLengthGood(RelativeLengthEquality(lens[it->first], lens[it->second])))
- good_pairs.push_back(*it);
- return good_pairs;
- }
-
- vector<pair<size_t, size_t> > ChooseSeqSatisfiedPairs(vector<Sequence> &seqs,
- vector<pair<size_t, size_t> > pairs){
- vector<pair<size_t, size_t> > good_pairs;
- for(auto it = pairs.begin(); it != pairs.end(); it++)
- if(RelativeAlignGood(RelAlignmentOfSequences(seqs[it->first], seqs[it->second])))
- good_pairs.push_back(*it);
- return good_pairs;
- }
-
- vector<Sequence> GetSequences(paths &bulge_paths){
- vector<Sequence> seqs;
- for(auto it = bulge_paths.begin(); it != bulge_paths.end(); it++)
- seqs.push_back(GetSequenceByPath(graph_, graph_.k(), *it));
- return seqs;
- }
-
- vector<size_t> GetLengths(paths &bulge_paths){
- vector<size_t> lens;
- for(auto it = bulge_paths.begin(); it != bulge_paths.end(); it++)
- lens.push_back(GetPathLength(graph_, *it));
- return lens;
- }
-
-public:
- DiploidBulgeFinder(Graph &graph, double rel_length, double rel_align) :
- graph_(graph),
- rel_length_(rel_length),
- rel_align_(rel_align) { }
-
- shared_ptr<BaseBulge> Find(paths &bulge_paths){
- if(bulge_paths.size() <= 1)
- return shared_ptr<BaseBulge>(new Bulge(graph_));
-
- auto good_pairs = ChoosePairsWithoutIntersection(bulge_paths);
- vector<Sequence> seqs = GetSequences(bulge_paths);
- vector<size_t> lens = GetLengths(bulge_paths);
- good_pairs = DefineLenSatisfiedPairs(lens, good_pairs);
- good_pairs = ChooseSeqSatisfiedPairs(seqs, good_pairs);
-
- if(good_pairs.size() == 0)
- return shared_ptr<BaseBulge>(new Bulge(graph_));
- return shared_ptr<BaseBulge>(new Bulge(graph_, graph_.k(), bulge_paths[good_pairs[0].first],
- bulge_paths[good_pairs[0].second]));
- }
-};
-
-}
diff --git a/src/dipspades/polymorphic_bulge_remover/glue_direction_definer.hpp b/src/dipspades/polymorphic_bulge_remover/glue_direction_definer.hpp
deleted file mode 100644
index d7d5de7..0000000
--- a/src/dipspades/polymorphic_bulge_remover/glue_direction_definer.hpp
+++ /dev/null
@@ -1,76 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include "../utils/bulge_utils.hpp"
-#include "../utils/path_routines.hpp"
-#include "../dipspades_config.hpp"
-
-using namespace debruijn_graph;
-
-namespace dipspades {
-
-
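-// Defines in which direction the two sides of a bulge are glued (path2 onto
-// path1 or vice versa); 'undefined' means the bulge is skipped.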
-class GluingDirectionDefiner {
-protected:
- Graph &graph_;
-public:
- GluingDirectionDefiner(Graph &graph) : graph_(graph) { }
- virtual glue_direction Define(shared_ptr<BaseBulge>) {
- return undefined;
- }
- virtual ~GluingDirectionDefiner() { }
-};
-
-class RelatedBaseGlueDirectionDefiner : public GluingDirectionDefiner{
-public:
- RelatedBaseGlueDirectionDefiner(Graph &graph) : GluingDirectionDefiner(graph) { }
-
- glue_direction Define(shared_ptr<BaseBulge> bulge){
- bool rel_edges_path1 = PathAdjacentRelatedEdges(this->graph_, bulge->path1());
- bool rel_edges_path2 = PathAdjacentRelatedEdges(this->graph_, bulge->path2());
- if(rel_edges_path1 && rel_edges_path2)
- return undefined;
-
- // if only path2 contains related edges
- // we need gluing path2 to path1
- if(rel_edges_path2)
- return reverse_gluing;
- return direct_gluing;
- }
-};
-
-class CoverageBaseGlueDirectionDefiner : public GluingDirectionDefiner{
-public:
- CoverageBaseGlueDirectionDefiner(Graph &graph) : GluingDirectionDefiner(graph) { }
-
- glue_direction Define(shared_ptr<BaseBulge>){
- // todo implement me
- return direct_gluing;
- }
-};
-
-class CompositeGlueDirectionDefiner : public GluingDirectionDefiner {
- vector<shared_ptr<GluingDirectionDefiner> > &definers_;
-public:
- CompositeGlueDirectionDefiner(Graph &graph,
- vector<shared_ptr<GluingDirectionDefiner> > &definers) :
- GluingDirectionDefiner(graph),
- definers_(definers) { }
-
- glue_direction Define(shared_ptr<BaseBulge> bulge){
- set<glue_direction> directions;
- for(auto it = definers_.begin(); it != definers_.end(); it++)
- directions.insert((*it)->Define(bulge));
- if(directions.size() == 1)
- return *(directions.begin());
- return undefined;
- }
-};
-
-}
diff --git a/src/dipspades/polymorphic_bulge_remover/gluing_vertices_definer.hpp b/src/dipspades/polymorphic_bulge_remover/gluing_vertices_definer.hpp
deleted file mode 100644
index 89005a2..0000000
--- a/src/dipspades/polymorphic_bulge_remover/gluing_vertices_definer.hpp
+++ /dev/null
@@ -1,170 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include "../utils/bulge_utils.hpp"
-#include "../dipspades_config.hpp"
-
-using namespace debruijn_graph;
-
-namespace dipspades {
-
-size_t abs_diff(size_t a, size_t b){
- if(a > b)
- return a - b;
- return b - a;
-}
-
-typedef map<size_t, size_t> BulgeGluingVertices;
-typedef map<size_t, size_t>::iterator BulgeGluingVerticesIter;
-
-class GluingVericesDefinerResults {
- map<size_t, size_t> gluing_pairs_;
-
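-	// A new pair (i, j) is accepted only if it keeps the stored mapping strictly
-	// increasing in both coordinates, i.e. it does not cross any existing pair.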
- bool IsNewPairGood(pair<size_t, size_t> new_pair){
- if(gluing_pairs_.size() == 0)
- return true;
-
- auto upper = gluing_pairs_.upper_bound(new_pair.first);
- auto lower = gluing_pairs_.lower_bound(new_pair.first);
-
-		// the map doesn't contain an element with a greater 1st vertex
-		if(upper == gluing_pairs_.end()){
-			// the map doesn't contain an element with the same 1st vertex
-			if(lower == upper){
-				// go to the previous element (with a smaller 1st vertex);
-				// if its 2nd vertex precedes the 2nd vertex of the new pair, add the new pair,
-				// otherwise do not add it
- lower--;
- return lower->second < new_pair.second;
- }
-			// the map contains an element with the same key,
-			// so adding the new pair is incorrect
- return false;
- }
-		// the map contains an element with a greater 1st vertex
-		// whose 2nd vertex is <= the 2nd vertex of the new pair;
-		// adding the new pair is incorrect
- if(upper->second <= new_pair.second)
- return false;
-		// the map contains an element with a greater 1st vertex
-		// and does not contain an element with the same 1st vertex
- if(lower == upper){
- // if there are no other elements
- // add new pair
- if(lower == gluing_pairs_.begin())
- return true;
-			// other elements exist:
-			// go to the previous element and check that its 2nd vertex
-			// precedes the 2nd vertex of the new pair
- lower--;
- return lower->second < new_pair.second;
- }
- return false;
- }
-
-public:
- void AddNewPair(pair<size_t, size_t> new_pair){
- if(IsNewPairGood(new_pair)){
- TRACE("New pair was inserted: " << new_pair.first << " " << new_pair.second);
- gluing_pairs_.insert(new_pair);
- }
- }
-
- BulgeGluingVerticesIter begin() { return gluing_pairs_.begin(); }
-
- BulgeGluingVerticesIter end() { return gluing_pairs_.end(); }
-
- size_t size() { return gluing_pairs_.size(); }
-
-private:
- DECL_LOGGER("GluingVericesDefinerResults");
-};
-
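-// For each pair of internal vertices (one on each bulge side) compares the
-// relative lengths of the prefixes and suffixes they induce; sufficiently
-// similar pairs become candidate gluing vertices.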
-class GluingVericesDefiner {
- Graph &graph_;
- double rel_length_threshold_;
-
- typedef map<pair<size_t, size_t>, double> PairSubpaths;
- PairSubpaths gluing_candidate_;
-
- double RelativeSimilarityOfLength(size_t len1, size_t len2){
- return double(abs_diff(len1, len2)) / double(min<size_t>(len1, len2));
- }
-
- size_t StartSpathLength(const vector<size_t> &lens, size_t index){
- VERIFY(index < lens.size());
- return lens[index];
- }
-
- size_t EndSpathLength(const vector<size_t> &lens, size_t index){
- VERIFY(index < lens.size());
- return lens[lens.size() - 1] - lens[index];
- }
-
- void ChooseGluingCandidate(shared_ptr<BaseBulge> bulge){
-
- TRACE("Choosing gluing candidates");
- vector<size_t> part_lens1 = CalculatePathPartLens(graph_, bulge->path1());
- vector<size_t> part_lens2 = CalculatePathPartLens(graph_, bulge->path2());
-
- for(size_t i = 0; i < part_lens1.size() - 1; i++)
- for(size_t j = 0; j < part_lens2.size() - 1; j++){
- double rel_len_start_spaths = RelativeSimilarityOfLength(StartSpathLength(part_lens1, i),
- StartSpathLength(part_lens2, j));
- double rel_len_end_spaths = RelativeSimilarityOfLength(EndSpathLength(part_lens1, i),
- EndSpathLength(part_lens2, j));
-
- if(rel_len_start_spaths <= rel_length_threshold_ &&
- rel_len_end_spaths <= rel_length_threshold_){
- TRACE("New gluing candidate - " << i << ", " << j);
- TRACE("rel_len_start_spaths - " << rel_len_start_spaths);
- TRACE("rel_len_end_spaths - " << rel_len_end_spaths);
- gluing_candidate_[make_pair(i,j)] = max<double>(rel_len_start_spaths, rel_len_end_spaths);
- }
- }
- }
-
- pair<size_t, size_t> GetBestPair(){
- double min = 1;
- pair<size_t, size_t> best_res;
-
- for(auto it = gluing_candidate_.begin(); it != gluing_candidate_.end(); it++)
- if(it->second < min){
- best_res = it->first;
- min = it->second;
- }
- return best_res;
- }
-
- GluingVericesDefinerResults ChooseGluingPairs(shared_ptr<BaseBulge> bulge){
- gluing_candidate_.clear();
- ChooseGluingCandidate(bulge);
- GluingVericesDefinerResults gluing_pairs;
- while(gluing_candidate_.size() != 0){
- auto best_pair = GetBestPair();
- gluing_pairs.AddNewPair(best_pair);
- gluing_candidate_.erase(best_pair);
- }
- return gluing_pairs;
- }
-
-public:
- GluingVericesDefiner(Graph &graph, double rel_length_threshold) :
- graph_(graph),
- rel_length_threshold_(rel_length_threshold) { }
-
- GluingVericesDefinerResults Run(shared_ptr<BaseBulge> bulge){
- return ChooseGluingPairs(bulge);
- }
-
-private:
- DECL_LOGGER("GluingVericesDefiner");
-};
-
-}
diff --git a/src/dipspades/polymorphic_bulge_remover/iterative_tails_gluing.hpp b/src/dipspades/polymorphic_bulge_remover/iterative_tails_gluing.hpp
deleted file mode 100644
index a3216bc..0000000
--- a/src/dipspades/polymorphic_bulge_remover/iterative_tails_gluing.hpp
+++ /dev/null
@@ -1,132 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include "../utils/bulge_utils.hpp"
-#include "../dipspades_config.hpp"
-
-using namespace debruijn_graph;
-
-namespace dipspades {
-
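-// Glues dead-end "tail" edges onto sufficiently similar neighbouring edges,
-// repeating the pass until no more tails can be glued.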
-class IterativeTailGluing {
- Graph &graph_;
- double rel_align_;
-
- typedef VertexId::type::edge_const_iterator edge_const_iterator;
- typedef pair<edge_const_iterator, edge_const_iterator> edge_iters;
- typedef boost::optional<EdgeId> OptEdgeId;
-
-
- OptEdgeId GetEdgeForGlue(EdgeId edge, edge_iters iters){
- double best_rel_align = 1;
- OptEdgeId res;
- for(auto it = iters.first; it != iters.second; it++){
- if(edge != *it){
- pair<Sequence, Sequence> seqs;
- if(graph_.length(edge) <= graph_.length(*it)){
- seqs.first = graph_.EdgeNucls(edge);
- seqs.second = graph_.EdgeNucls(*it).Subseq(0, seqs.first.size());
- }
- else{
- seqs.first = graph_.EdgeNucls(*it);
- seqs.second = graph_.EdgeNucls(edge).Subseq(0, seqs.first.size());
- }
- double rel_align = RelAlignmentOfSequences(seqs.first, seqs.second);
- if(rel_align <= rel_align_ && rel_align <= best_rel_align){
- best_rel_align = rel_align;
- res = *it;
- }
- }
- }
- return res;
- }
-
- bool ProcessTail(EdgeId edge, edge_iters iters){
- auto edge_for_glue = GetEdgeForGlue(edge, iters);
- if(edge_for_glue.is_initialized()){
-
- TRACE("Edge for glue " << graph_.str(edge_for_glue.get()));
-			TRACE("Edge lengths: " << graph_.length(edge) << " - " << graph_.length(edge_for_glue.get()));
-
- size_t min_len = min<size_t>(graph_.length(edge), graph_.length(edge_for_glue.get()));
- if(min_len == graph_.length(edge) && min_len == graph_.length(edge_for_glue.get())){
- graph_.GlueEdges(edge, edge_for_glue.get());
- }
- else{
- if(min_len == graph_.length(edge)){
- pair<EdgeId, EdgeId> new_edges = graph_.SplitEdge(edge_for_glue.get(), min_len);
- graph_.GlueEdges(edge, new_edges.first);
- }
- else {
- auto new_edges = graph_.SplitEdge(edge, min_len);
- graph_.GlueEdges(new_edges.first, edge_for_glue.get());
- }
- }
- return true;
- }
- return false;
- }
-
- bool IsTailIncoming(EdgeId edge){
- return graph_.IncomingEdgeCount(graph_.EdgeStart(edge)) == 0 &&
- graph_.OutgoingEdgeCount(graph_.EdgeStart(edge)) == 0;
- }
-
- bool ProcessTail(EdgeId edge){
- if(IsTailIncoming(edge))
- return ProcessTail(edge,
- edge_iters(graph_.IncomingEdges(graph_.EdgeEnd(edge)).begin(),
- graph_.IncomingEdges(graph_.EdgeEnd(edge)).end()));
- return ProcessTail(edge,
- edge_iters(graph_.OutgoingEdges(graph_.EdgeStart(edge)).begin(),
- graph_.OutgoingEdges(graph_.EdgeStart(edge)).end()));
- }
-
- bool EdgeIsTail(EdgeId edge) {
- return (graph_.IncomingEdgeCount(graph_.EdgeStart(edge)) == 0 &&
- graph_.OutgoingEdgeCount(graph_.EdgeStart(edge)) == 1) ||
- (graph_.IncomingEdgeCount(graph_.EdgeEnd(edge)) == 1 &&
- graph_.OutgoingEdgeCount(graph_.EdgeEnd(edge)) == 0);
- }
-
- bool EdgeIsIsolate(EdgeId edge){
- return (graph_.IncomingEdgeCount(graph_.EdgeStart(edge)) == 0 &&
- graph_.OutgoingEdgeCount(graph_.EdgeStart(edge)) == 1) &&
- (graph_.IncomingEdgeCount(graph_.EdgeEnd(edge)) == 1 &&
- graph_.OutgoingEdgeCount(graph_.EdgeEnd(edge)) == 0);
- }
-
- size_t ProcessTails(){
- size_t num_glued_tails = 0;
- for(auto edge = graph_.SmartEdgeBegin(); !edge.IsEnd(); ++edge)
- if(EdgeIsTail(*edge) && !EdgeIsIsolate(*edge)){
- TRACE("Processing edge " << graph_.str(*edge));
- if(ProcessTail(*edge))
- num_glued_tails++;
- }
- return num_glued_tails;
- }
-
-public:
- IterativeTailGluing(Graph &graph, double rel_align) :
- graph_(graph),
- rel_align_(rel_align) { }
-
- size_t IterativeProcessTails(){
- size_t num_glued_tails = 1;
- size_t num_iter = 1;
- while(num_glued_tails > 0){
- num_glued_tails = ProcessTails();
- INFO(num_iter << " iteration : " << num_glued_tails << " tails were glued");
- num_iter++;
- }
- return num_glued_tails;
- }
-};
-
-}
diff --git a/src/dipspades/polymorphic_bulge_remover/polymorphic_bulge_remover.hpp b/src/dipspades/polymorphic_bulge_remover/polymorphic_bulge_remover.hpp
deleted file mode 100644
index f463129..0000000
--- a/src/dipspades/polymorphic_bulge_remover/polymorphic_bulge_remover.hpp
+++ /dev/null
@@ -1,108 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include "simple_bulge_remover.hpp"
-#include "complex_bulge_remover.hpp"
-#include "iterative_tails_gluing.hpp"
-#include "../../debruijn/stats/debruijn_stats.hpp"
-
-#include "omni/visualization/visualization.hpp"
-#include "omni/edges_position_handler.hpp"
-#include "omni/graph_component.hpp"
-
-using namespace debruijn_graph;
-
-namespace dipspades {
-
-class PolymorphicBulgeRemoverHelper {
-public:
- typedef ComplexBulgeGluer<RelatedBaseGlueDirectionDefiner, GluingVericesDefiner, BulgeSplitter> BaseBulgeGluer;
- static BaseBulgeGluer CreateBaseBulgeGluer(Graph &graph, double rel_len_threshold){
- return BaseBulgeGluer(graph, RelatedBaseGlueDirectionDefiner(graph),
- GluingVericesDefiner(graph, rel_len_threshold), BulgeSplitter(graph));
- }
-};
-
-class PolymorphicBulgeRemover {
- conj_graph_pack &graph_pack_;
- BaseHistogram<size_t> &bulge_len_hist_;
-
- typedef BulgeRemoverAlgorithm<DijkstraBulgePathsSearcher,
- PolymorphicBulgeRemoverHelper::BaseBulgeGluer> LightBulgeRemover;
- typedef BulgeRemoverAlgorithm<PathProcessorBulgeSearcher,
- PolymorphicBulgeRemoverHelper::BaseBulgeGluer> HardBulgeRemover;
-
- void RunSimpleBRCycle(){
- INFO("Simple polymorphic bulge remover runs");
- SimpleBulgeRemover spath_br(graph_pack_.g, bulge_len_hist_, dsp_cfg::get().pbr);
- size_t num_glued_bulges = 1;
- for(size_t num_iter = 1; num_glued_bulges > 0; num_iter++){
- num_glued_bulges = spath_br.Run();
- CompressAllVertices(graph_pack_.g, false);
- INFO(ToString(num_iter) + " iteration: " + ToString(num_glued_bulges) + " simple bulges were glued");
- }
- INFO("Simple polymorphic bulge remover ends");
- }
-
- template<class BulgeRemover>
- void BulgeRemoverCycle(string bulge_remover_name, size_t num_iters){
- INFO(bulge_remover_name + " starts");
- INFO("Maximal number of iterations: " << num_iters);
- BulgeRemover br(graph_pack_.g,
- PolymorphicBulgeRemoverHelper::CreateBaseBulgeGluer(graph_pack_.g,
- dsp_cfg::get().pbr.paired_vert_rel_threshold),
- bulge_len_hist_,
- dsp_cfg::get().pbr);
- size_t num_glued_bulges = 1;
- for(size_t i = 0; (i < num_iters) && (num_glued_bulges != 0); i++){
- num_glued_bulges = br.Run();
- CompressAllVertices(graph_pack_.g, false);
- INFO(ToString(i + 1) + " iteration: " + ToString(num_glued_bulges) + " complex bulges were glued");
- }
- INFO(bulge_remover_name + " ends");
- }
-
- void WriteComponents(string component_dir) {
- if(!dsp_cfg::get().rp.developer_mode)
- return;
-
- graph_pack_.EnsureDebugInfo();
- make_dir(dsp_cfg::get().io.output_dir + "components/");
- omnigraph::DefaultLabeler<Graph> labeler(graph_pack_.g, graph_pack_.edge_pos);
- make_dir(dsp_cfg::get().io.output_dir + "components/" + component_dir + "/");
- omnigraph::visualization::WriteComponents(graph_pack_.g,
- dsp_cfg::get().io.output_dir + "components/" + component_dir + "/",
- omnigraph::ReliableSplitter<Graph>(graph_pack_.g),
- omnigraph::visualization::DefaultColorer(graph_pack_.g, Path<EdgeId>(), Path<EdgeId>()),
- labeler);
- }
-
-public:
- PolymorphicBulgeRemover(conj_graph_pack &graph_pack,
- BaseHistogram<size_t> &bulge_len_hist) :
- graph_pack_(graph_pack),
- bulge_len_hist_(bulge_len_hist) { }
-
- void Run(){
- if(!dsp_cfg::get().pbr.enabled)
- return ;
- WriteComponents("before_pbr");
- graph_pack_.kmer_mapper.SetUnsafeMode(true);
- INFO("Polymorphic bulge remover starts");
- RunSimpleBRCycle();
- BulgeRemoverCycle<LightBulgeRemover>("LightBulgeRemover", dsp_cfg::get().pbr.num_iters_lbr);
- INFO("Index refilling");
- graph_pack_.index.Refill();
- INFO("Polymorphic ends remover ends");
- WriteComponents("after_pbr");
- graph_pack_.kmer_mapper.SetUnsafeMode(false);
- }
-};
-
-}
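The two cycles in the removed PolymorphicBulgeRemover share one control pattern: run a remover pass, compress vertices, and stop as soon as a pass glues nothing or the iteration budget is spent. A minimal standalone sketch of that loop, assuming nothing about the graph; RunUntilConverged and the toy pass below are illustrative names, not SPAdes API:

    #include <algorithm>
    #include <cstddef>
    #include <functional>
    #include <iostream>

    // Repeatedly invoke a pass until it reports zero changes or max_iters is hit.
    size_t RunUntilConverged(const std::function<size_t()> &run_pass, size_t max_iters) {
        size_t total = 0;
        for (size_t i = 0; i < max_iters; ++i) {
            size_t changed = run_pass();          // e.g. number of bulges glued in this pass
            total += changed;
            std::cout << (i + 1) << " iteration: " << changed << " bulges glued\n";
            if (changed == 0) break;              // fixed point reached
        }
        return total;
    }

    int main() {
        size_t remaining = 5;
        // Toy pass: "glues" at most two bulges per call until none remain.
        auto pass = [&remaining]() {
            size_t glued = std::min<size_t>(2, remaining);
            remaining -= glued;
            return glued;
        };
        std::cout << "total: " << RunUntilConverged(pass, 10) << "\n";   // total: 5
    }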
diff --git a/src/dipspades/polymorphic_bulge_remover/simple_bulge_remover.hpp b/src/dipspades/polymorphic_bulge_remover/simple_bulge_remover.hpp
deleted file mode 100644
index f70b3df..0000000
--- a/src/dipspades/polymorphic_bulge_remover/simple_bulge_remover.hpp
+++ /dev/null
@@ -1,51 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include "bulge_correction_condition.hpp"
-#include "bulge_gluer.hpp"
-#include "../utils/histogram.hpp"
-
-using namespace debruijn_graph;
-
-namespace dipspades {
-
-class SimpleBulgeRemover{
- Graph &graph_;
- BaseHistogram<size_t> &bulge_len_hist_;
- RelatedVerticesCondition rel_bulge_checker_;
- DiploidyCondition dip_bulge_checker_;
-public:
- SimpleBulgeRemover(Graph &graph,
- BaseHistogram<size_t> &bulge_len_hist,
- const dipspades_config::polymorphic_br &pbr_config) :
- graph_(graph),
- bulge_len_hist_(bulge_len_hist),
- rel_bulge_checker_(graph),
- dip_bulge_checker_(graph, pbr_config.rel_bulge_length, pbr_config.rel_bulge_align) {}
-
- size_t Run(){
- size_t glued_edges_count = 0;
- for(auto e = graph_.SmartEdgeBegin(); !e.IsEnd(); ++e){
- vector<EdgeId> edges = graph_.GetEdgesBetween(graph_.EdgeStart(*e),
- graph_.EdgeEnd(*e));
- if(edges.size() >= 2){
- auto bulge = shared_ptr<BaseBulge>(new Bulge(graph_, graph_.k(), edges[0], edges[1]));
- if(rel_bulge_checker_.IsBulgeCorrect(bulge) &&
- dip_bulge_checker_.IsBulgeCorrect(bulge)){
- bulge_len_hist_.Add(max<size_t>(graph_.length(edges[0]), graph_.length(edges[1])));
- graph_.GlueEdges(edges[0], edges[1]);
- glued_edges_count++;
- }
- }
- }
- return glued_edges_count;
- }
-};
-
-}
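The removed SimpleBulgeRemover scans every edge and treats a pair of parallel edges (same start and same end vertex) as a candidate simple bulge before the relatedness and diploidy checks. The detection step in isolation, on a toy edge list; Edge and ParallelEdgePairs are illustrative stand-ins, not the assembler's types:

    #include <iostream>
    #include <map>
    #include <utility>
    #include <vector>

    struct Edge { int id, from, to; };

    // Group edges by (from, to) and report the first two ids of every parallel group.
    std::vector<std::pair<int, int>> ParallelEdgePairs(const std::vector<Edge> &edges) {
        std::map<std::pair<int, int>, std::vector<int>> by_endpoints;
        for (const Edge &e : edges)
            by_endpoints[{e.from, e.to}].push_back(e.id);
        std::vector<std::pair<int, int>> pairs;
        for (const auto &kv : by_endpoints)
            if (kv.second.size() >= 2)
                pairs.push_back({kv.second[0], kv.second[1]});   // Run() above also uses edges[0] and edges[1]
        return pairs;
    }

    int main() {
        std::vector<Edge> edges = {{1, 0, 1}, {2, 0, 1}, {3, 1, 2}};
        for (const auto &p : ParallelEdgePairs(edges))
            std::cout << p.first << "-" << p.second << "\n";   // prints 1-2
    }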
diff --git a/src/dipspades/utils/bulge_utils.hpp b/src/dipspades/utils/bulge_utils.hpp
deleted file mode 100644
index bbab16f..0000000
--- a/src/dipspades/utils/bulge_utils.hpp
+++ /dev/null
@@ -1,267 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include "path_routines.hpp"
-#include "element_printers.hpp"
-
-using namespace debruijn_graph;
-
-namespace dipspades {
-
-bool IsRegionBulge(Graph &g, vector<EdgeId> path1, vector<EdgeId> path2){
- if(path1.size() == 0 || path2.size() == 0)
- return true;
- if ((g.EdgeStart(path1[0]) != g.EdgeStart(path2[0])) ||
- (g.EdgeEnd(path1[path1.size() - 1]) != g.EdgeEnd(path2[path2.size() - 1])))
- return false;
- return !PathsShareEdge(path1, path2);
-}
-
-size_t AlignmentOfSequencesByParts(Sequence seq1, Sequence seq2){
- size_t max_length = 10000;
- if(min<size_t>(seq1.size(), seq2.size()) > max_length){
- size_t shrink1 = max_length;
- size_t num_full_iter = seq1.size() / shrink1;
-
- size_t summary_dist = 0;
- size_t shrink2 = size_t((double(shrink1) / double(seq1.size())) * double(seq2.size()));
- for(size_t i = 0; i < num_full_iter; i++){
- Sequence cur_seq1 = seq1.Subseq(shrink1 * i, shrink1 * (i + 1));
- Sequence cur_seq2 = seq2.Subseq(shrink2 * i, shrink2 * (i + 1));
- summary_dist += EditDistance(cur_seq1, cur_seq2);
- }
-
- if(seq1.size() % shrink1 != 0){
- Sequence cur_seq1 = seq1.Subseq(shrink1 * num_full_iter, seq1.size());
- Sequence cur_seq2 = seq2.Subseq(shrink2 * num_full_iter, seq2.size());
- summary_dist += EditDistance(cur_seq1, cur_seq2);
- }
-
- return summary_dist;
- }
- return EditDistance(seq1, seq2);
-}
-
-double RelAlignmentOfSequences(Sequence seq1, Sequence seq2){
- return double(AlignmentOfSequencesByParts(seq1, seq2)) / double(min<size_t>(seq1.size(), seq2.size()));
-}
-
-double RelativeLengthEquality(size_t len1, size_t len2){
- return double(min<size_t>(len1, len2)) / double(max<size_t>(len1, len2));
-}
-
-enum glue_direction { direct_gluing, reverse_gluing, undefined };
-
-class BaseBulge{
-protected:
- Graph &graph_;
-public:
- BaseBulge(Graph &graph) : graph_(graph) { }
- BaseBulge(const BaseBulge& bulge) : graph_(bulge.graph_) { }
-
- virtual double relative_length() = 0;
- virtual double relative_align() = 0;
- virtual bool IsBulgeDiploid(double rel_length_threshold, double rel_seq_threshold) = 0;
- virtual vector<EdgeId> path1() = 0; // todo make it const
- virtual vector<EdgeId> path2() = 0; // todo make it const
- virtual Sequence seq1() = 0;
- virtual Sequence seq2() = 0;
- virtual VertexId start_vertex() = 0;
- virtual VertexId end_vertex() = 0;
- virtual bool IsSimple() = 0;
- virtual bool IsEmpty() = 0;
-
- virtual size_t BulgeLength() = 0;
-
- virtual string StrId() = 0;
- virtual string BulgeToString() = 0;
-
- virtual ~BaseBulge() { }
-};
-
-class Bulge : public BaseBulge{
- size_t k_value_;
- vector<EdgeId> path1_;
- vector<EdgeId> path2_;
- Sequence seq1_;
- Sequence seq2_;
- double rel_length_;
- double rel_align_;
-
- void CalculateRelativeLength(size_t length1, size_t length2){
- rel_length_ = double(min<size_t>(length1, length2)) / double(max<size_t>(length1, length2));
- }
-
- void CalculateRelativeAlign(Sequence seq1, Sequence seq2){
- rel_align_ = RelAlignmentOfSequences(seq1, seq2);
- }
-
- string GetPathStr(vector<EdgeId> path) {
- string s1 = "";
- for(auto edge = path.begin(); edge != path.end(); edge++)
- s1 += ToString(graph_.int_id(*edge)) + "-";
- return s1.substr(0, s1.size() - 1);
- }
-
-public:
- Bulge(Graph &graph) : BaseBulge(graph), k_value_(graph.k()), path1_(), path2_(),
- seq1_(), seq2_(), rel_length_(), rel_align_() { }
-
- Bulge(Graph &g, size_t k_value, vector<EdgeId> path1, pair<size_t,size_t> bulge_region1,
- vector<EdgeId> path2, pair<size_t,size_t> bulge_region2) :
- BaseBulge(g), k_value_(k_value),
- path1_(CutSubpathByRegion(path1, bulge_region1)),
- path2_(CutSubpathByRegion(path2, bulge_region2)),
- seq1_(GetSequenceByPath(graph_, k_value_, path1_).str().c_str()), // todo make it lazy
- seq2_(GetSequenceByPath(graph_, k_value_, path2_).str().c_str()), // todo make it lazy
- rel_length_(0), rel_align_(0) {
- VERIFY(IsRegionBulge(graph_, path1_, path2_));
- }
-
- Bulge(Graph &g, size_t k_value, vector<EdgeId> path1, vector<EdgeId> path2) :
- BaseBulge(g), k_value_(k_value), path1_(path1), path2_(path2),
- seq1_(GetSequenceByPath(graph_, k_value_, path1_).str().c_str()),
- seq2_(GetSequenceByPath(graph_, k_value_, path2_).str().c_str()),
- rel_length_(0), rel_align_(0){
- VERIFY(IsRegionBulge(graph_, path1_, path2_));
- }
-
- Bulge(Graph &g, size_t k_value, EdgeId edge1, EdgeId edge2) : BaseBulge(g),
- k_value_(k_value),
- path1_(1, edge1), path2_(1, edge2),
- seq1_(graph_.EdgeNucls(edge1).str().c_str()),
- seq2_(graph_.EdgeNucls(edge2).str().c_str()),
- rel_length_(0), rel_align_(0) {
- VERIFY(IsRegionBulge(graph_, path1_, path2_));
- }
-
- double relative_length(){
- if(rel_length_ == 0){
- size_t length1 = GetPathLength(graph_, path1_);
- size_t length2 = GetPathLength(graph_, path2_);
- CalculateRelativeLength(length1, length2);
- }
- return rel_length_;
- }
-
- double relative_align(){
- if(rel_align_ == 0){
- Sequence seq1 = GetSequenceByPath(graph_, k_value_, path1_);
- Sequence seq2 = GetSequenceByPath(graph_, k_value_, path2_);
- CalculateRelativeAlign(seq1, seq2);
- }
- return rel_align_;
- }
-
- bool IsBulgeDiploid(double rel_length_threshold, double rel_seq_threshold){
- if(relative_length() < rel_length_threshold)
- return false;
-
- return relative_align() <= rel_seq_threshold;
- }
-
- vector<EdgeId> path1(){
- return path1_;
- }
-
- vector<EdgeId> path2(){
- return path2_;
- }
-
- Sequence seq1() { return seq1_; }
-
- Sequence seq2() { return seq2_; }
-
- VertexId start_vertex(){
- return graph_.EdgeStart(path1_[0]);
- }
-
- VertexId end_vertex(){
- return graph_.EdgeEnd(path1_[path1_.size() - 1]);
- }
-
- bool IsSimple() { return path1_.size() == 1 && path2_.size() == 1; }
-
- bool IsEmpty() { return path1_.size() == 0 || path2_.size() == 0; }
-
- string StrId() {
- string s1 = GetPathStr(path1());
- string s2 = GetPathStr(path2());
- return min<string>(s1,s2) + "_" + max<string>(s1,s2);
- }
-
- size_t BulgeLength() {
- return max<size_t>(GetPathLength(graph_, path1()), GetPathLength(graph_, path2()));
- }
-
- string BulgeToString() {
- return "Side1: " + SimplePathWithVerticesToString(graph_, path1()) + "\n" +
- "Side2: " + SimplePathWithVerticesToString(graph_, path2());
- }
-};
-
-class DirectedBulge : public BaseBulge {
- shared_ptr<BaseBulge> bulge_;
- bool glue_direct_;
-public:
- DirectedBulge(Graph &graph, shared_ptr<BaseBulge> bulge, glue_direction glue_direct = direct_gluing) :
- BaseBulge(graph), bulge_(bulge), glue_direct_(glue_direct) { }
-
- double relative_length() { return bulge_->relative_length(); }
-
- double relative_align() { return bulge_->relative_align(); }
-
- bool IsBulgeDiploid(double rel_length_threshold, double rel_seq_threshold) {
- return bulge_->IsBulgeDiploid(rel_length_threshold, rel_seq_threshold);
- }
-
- vector<EdgeId> path1() {
- if(glue_direct_ == direct_gluing)
- return bulge_->path1();
- return bulge_->path2();
- }
-
- vector<EdgeId> path2() {
- if(glue_direct_ == direct_gluing)
- return bulge_->path2();
- return bulge_->path1();
- }
-
- Sequence seq1() {
- if(glue_direct_ == direct_gluing)
- return bulge_->seq1();
- return bulge_->seq2();
- }
-
- Sequence seq2() {
- if(glue_direct_ == direct_gluing)
- return bulge_->seq2();
- return bulge_->seq1();
- }
-
- VertexId start_vertex() {
- return bulge_->start_vertex();
- }
-
- VertexId end_vertex() {
- return bulge_->end_vertex();
- }
-
- bool IsSimple() { return bulge_->IsSimple(); }
-
- bool IsEmpty() { return bulge_->IsEmpty(); }
-
- string StrId() { return bulge_->StrId(); }
-
- size_t BulgeLength() { return bulge_->BulgeLength(); }
-
- string BulgeToString() { return bulge_->BulgeToString(); }
-};
-
-}
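AlignmentOfSequencesByParts in the file removed above keeps alignment of very long sequences tractable by cutting both into proportionally sized chunks and summing per-chunk edit distances. A self-contained sketch of the same idea, with std::string and a textbook Levenshtein routine standing in for the assembler's Sequence and EditDistance; ChunkedDistance is an illustrative name:

    #include <algorithm>
    #include <iostream>
    #include <string>
    #include <vector>

    // Plain O(n*m) edit distance with two rolling rows.
    static size_t EditDistance(const std::string &a, const std::string &b) {
        std::vector<size_t> prev(b.size() + 1), cur(b.size() + 1);
        for (size_t j = 0; j <= b.size(); ++j) prev[j] = j;
        for (size_t i = 1; i <= a.size(); ++i) {
            cur[0] = i;
            for (size_t j = 1; j <= b.size(); ++j) {
                size_t subst = prev[j - 1] + (a[i - 1] == b[j - 1] ? 0 : 1);
                cur[j] = std::min({prev[j] + 1, cur[j - 1] + 1, subst});
            }
            std::swap(prev, cur);
        }
        return prev[b.size()];
    }

    // For long inputs, align chunk-by-chunk; the chunk of s2 is scaled by the length ratio.
    static size_t ChunkedDistance(const std::string &s1, const std::string &s2, size_t chunk) {
        if (std::min(s1.size(), s2.size()) <= chunk)
            return EditDistance(s1, s2);
        size_t full_iters = s1.size() / chunk;
        size_t chunk2 = size_t(double(chunk) / double(s1.size()) * double(s2.size()));
        size_t dist = 0;
        for (size_t i = 0; i < full_iters; ++i)
            dist += EditDistance(s1.substr(chunk * i, chunk), s2.substr(chunk2 * i, chunk2));
        if (chunk * full_iters < s1.size())   // align the leftover tails, if any
            dist += EditDistance(s1.substr(chunk * full_iters), s2.substr(chunk2 * full_iters));
        return dist;
    }

    int main() {
        std::string a(2500, 'A'), b(2500, 'A');
        b[100] = 'C';
        std::cout << ChunkedDistance(a, b, 1000) << "\n";   // prints 1
    }

The relative alignment used by the diploidy checks is then just this distance divided by the shorter length, as in RelAlignmentOfSequences above.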
diff --git a/src/dipspades/utils/dijkstra_utils.hpp b/src/dipspades/utils/dijkstra_utils.hpp
deleted file mode 100644
index 90288aa..0000000
--- a/src/dipspades/utils/dijkstra_utils.hpp
+++ /dev/null
@@ -1,163 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-/*
- * dijkstra_utils.hpp
- *
- * Created on: 23.11.2012
- * Author: yana
- */
-
-#pragma once
-
-#include <iostream>
-#include <map>
-#include <vector>
-
-using namespace std;
-using namespace debruijn_graph;
-
-namespace dipspades {
-
-struct paths_searcher_config{
- size_t max_num_vertices;
- size_t depth_neigh_search;
- size_t max_len_path;
-};
-
-class PathsSearcher{
-protected:
- Graph & g_;
- paths_searcher_config conf_;
-public:
- PathsSearcher(Graph & g) : g_(g) {}
- void Initialize(paths_searcher_config conf){
- conf_ = conf;
- }
-
- virtual map<VertexId, vector<EdgeId> > FindShortestPathsFrom(VertexId v) = 0;
- virtual ~PathsSearcher(){}
-};
-
-class DijkstraSearcher : public PathsSearcher{
-
-public:
- DijkstraSearcher(Graph & g) : PathsSearcher(g) {
- }
-
- map<VertexId, vector<EdgeId> > FindShortestPathsFrom(VertexId v){
- map<VertexId, vector<EdgeId> > short_paths;
-
- multimap<size_t, VertexId> dist_v;
- map<VertexId, size_t> v_dist;
- map<VertexId, size_t> v_depth;
- set<VertexId> visited;
-
- // insertion of the initial vertex
- vector<EdgeId> empty_path;
- dist_v.insert(pair<size_t, VertexId>(0, v));
- v_dist.insert(pair<VertexId, size_t>(v, 0));
- short_paths.insert(pair<VertexId, vector<EdgeId> >(v, empty_path));
- v_depth[v] = 0;
-
- size_t num_visited = 0;
-
- while((visited.size() < conf_.max_num_vertices) && (dist_v.size() != 0)){
-
- VertexId cur_v = dist_v.begin()->second;
- size_t cur_dist = dist_v.begin()->first;
-
- size_t cur_depth;
- if(v_depth.find(cur_v) != v_depth.end())
- cur_depth = v_depth[cur_v];
- else{
- size_t min_depth = 100000;
- bool is_defined = false;
-
- // determine the depth of cur_v from its predecessors
- auto in_edges = g_.IncomingEdges(cur_v);
- for(auto e = in_edges.begin(); e!= in_edges.end(); e++){
- VertexId w = g_.EdgeStart(*e);
- if(v_depth.find(w) != v_depth.end())
- if(min_depth > v_depth[w]){
- min_depth = v_depth[w];
- is_defined = true;
- }
- }
-
- if(is_defined){
- cur_depth = min_depth + 1;
- }
- else{
- cur_depth = 0;
- }
- v_depth[cur_v] = cur_depth;
- }
-
- if((cur_depth <= conf_.depth_neigh_search)){
-
- auto out_edges = g_.OutgoingEdges(cur_v);
-
- for(auto e = out_edges.begin(); e != out_edges.end(); e++){
-
- VertexId cur_neigh = g_.EdgeEnd(*e);
-
- if(visited.find(cur_neigh) == visited.end()){
-
- size_t new_neigh_dist = g_.length(*e) + cur_dist;
-
- bool is_replaced = false;
- if(v_dist.find(cur_neigh) != v_dist.end()){
-
- size_t old_neigh_dist = v_dist[cur_neigh];
-
- if(old_neigh_dist > new_neigh_dist){
- is_replaced = true;
-
- for(auto it = dist_v.find(old_neigh_dist); it != dist_v.end(); it++)
- if(it->second == cur_neigh){
- dist_v.erase(it);
- break;
- }
- }
- }
- else
- is_replaced = true;
-
- if(is_replaced && new_neigh_dist <= conf_.max_len_path){
-
- dist_v.insert(pair<size_t, VertexId>(new_neigh_dist, cur_neigh));
- v_dist[cur_neigh] = new_neigh_dist;
-
- short_paths[cur_neigh] = short_paths[cur_v];
- short_paths[cur_neigh].push_back(*e);
- }
- }
- }
- }
- else{
- break;
- }
-
- num_visited++;
- visited.insert(cur_v);
-
- // erase the visited vertex from both lookup structures
- for(auto it = dist_v.find(cur_dist); it != dist_v.end(); it++){
- if(it->second == cur_v){
- v_dist.erase(it->second);
- dist_v.erase(it);
- break;
- }
- }
- }
-
- return short_paths;
- }
-};
-
-}
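The removed DijkstraSearcher is a bounded Dijkstra: vertices are expanded in order of distance, the search stops after a vertex budget, paths longer than a cap are discarded, and the shortest path to every reached vertex is recorded. A compact sketch over a plain adjacency list; it records vertex sequences rather than EdgeIds, and ShortestPathsFrom is an illustrative name:

    #include <functional>
    #include <iostream>
    #include <limits>
    #include <map>
    #include <queue>
    #include <utility>
    #include <vector>

    struct Edge { int to; size_t len; };

    std::map<int, std::vector<int>> ShortestPathsFrom(const std::vector<std::vector<Edge>> &adj,
                                                      int src, size_t max_len, size_t max_visited) {
        std::vector<size_t> dist(adj.size(), std::numeric_limits<size_t>::max());
        std::map<int, std::vector<int>> paths;        // vertex -> shortest vertex path from src
        using Item = std::pair<size_t, int>;
        std::priority_queue<Item, std::vector<Item>, std::greater<Item>> pq;
        dist[src] = 0;
        paths[src] = {src};
        pq.push({0, src});
        size_t visited = 0;
        while (!pq.empty() && visited < max_visited) {
            auto [d, v] = pq.top();
            pq.pop();
            if (d != dist[v]) continue;               // stale queue entry
            ++visited;
            for (const Edge &e : adj[v]) {
                size_t nd = d + e.len;
                if (nd <= max_len && nd < dist[e.to]) {
                    dist[e.to] = nd;
                    paths[e.to] = paths[v];           // extend the parent's path
                    paths[e.to].push_back(e.to);
                    pq.push({nd, e.to});
                }
            }
        }
        return paths;
    }

    int main() {
        std::vector<std::vector<Edge>> adj(4);
        adj[0] = {{1, 2}, {2, 5}};
        adj[1] = {{2, 1}, {3, 7}};
        adj[2] = {{3, 1}};
        auto paths = ShortestPathsFrom(adj, 0, 100, 100);
        for (int v : paths[3]) std::cout << v << " ";   // prints 0 1 2 3
        std::cout << "\n";
    }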
diff --git a/src/dipspades/utils/edge_gluer.hpp b/src/dipspades/utils/edge_gluer.hpp
deleted file mode 100644
index add6139..0000000
--- a/src/dipspades/utils/edge_gluer.hpp
+++ /dev/null
@@ -1,102 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include "../../include/omni/dijkstra_tools/neighbours_iterator.hpp"
-
-using namespace debruijn_graph;
-
-namespace dipspades {
-
-class EdgeGluer {
- Graph &graph_;
-
- void MoveRelatedEdge(EdgeId edge, VertexId new_start, VertexId new_end){
- EdgeId new_edge = graph_.AddEdge(new_start, new_end, graph_.EdgeNucls(edge));
- TRACE("New edge " << graph_.str(new_edge) << "was added");
- graph_.DeleteEdge(edge);
- }
-
- void MoverUnrelatedEdge(EdgeId edge, VertexId new_start, VertexId new_end){
- EdgeId new_edge = graph_.AddEdge(new_start, new_end, graph_.EdgeNucls(edge));
- TRACE("New edge - " << graph_.str(new_edge) << " old edge - " << graph_.str(edge));
- if(IsEdgeRelated(graph_, new_edge))
- graph_.DeleteEdge(edge);
- else
- graph_.GlueEdges(edge, new_edge);
- }
-
- void StandardEdgeMoving(EdgeId edge, VertexId new_start, VertexId new_end){
- graph_.AddEdge(new_start, new_end, graph_.EdgeNucls(edge));
- graph_.DeleteEdge(edge);
- }
-
-public:
- EdgeGluer(Graph &graph) : graph_(graph) { }
-
- void MoveEdgesFromVertexToVertex(VertexId old_vertex, VertexId new_vertex,
- vector<EdgeId> forbidden_edges){
-
- TRACE("New start - " << graph_.str(new_vertex) << ", old vertex - " << graph_.str(old_vertex));
- TRACE("Incoming edges");
- for(auto in_edges_iter = SmartSetIterator<Graph, EdgeId>(graph_,
- graph_.IncomingEdges(old_vertex).begin(),
- graph_.IncomingEdges(old_vertex).end());
- !in_edges_iter.IsEnd(); ++in_edges_iter){
- if(find(forbidden_edges.begin(), forbidden_edges.end(), *in_edges_iter) ==
- forbidden_edges.end()){
- TRACE("Edge " << graph_.str(*in_edges_iter) << " is not forbidden");
- if(IsEdgeRelated(graph_, *in_edges_iter)){
- TRACE("Edge is related");
- if(IsEdgeLoop(graph_, *in_edges_iter)){
- TRACE("Edge is loop");
- StandardEdgeMoving(*in_edges_iter, new_vertex, new_vertex);
- }
- else{
- TRACE("Edge is adjacent to conjugate");
- StandardEdgeMoving(*in_edges_iter, graph_.conjugate(new_vertex), new_vertex);
- }
- }
- else{
- TRACE("Edge is not related");
- StandardEdgeMoving(*in_edges_iter, graph_.EdgeStart(*in_edges_iter), new_vertex);
- }
- }
- }
-
- TRACE("Outgoing edges");
- for(auto out_edges_iter = SmartSetIterator<Graph, EdgeId>(graph_,
- graph_.OutgoingEdges(old_vertex).begin(),
- graph_.OutgoingEdges(old_vertex).end());
- !out_edges_iter.IsEnd(); ++out_edges_iter){
- if(find(forbidden_edges.begin(), forbidden_edges.end(), *out_edges_iter) ==
- forbidden_edges.end()){
- TRACE("Edge " << graph_.str(*out_edges_iter) << " is not forbidden");
- if(IsEdgeRelated(graph_, *out_edges_iter)){
- TRACE("Edge is related");
- if(IsEdgeLoop(graph_, *out_edges_iter)){
- TRACE("Edge is loop");
- StandardEdgeMoving(*out_edges_iter, new_vertex, new_vertex);
- }
- else{
- TRACE("Edge is adjacent to conjugate");
- StandardEdgeMoving(*out_edges_iter, new_vertex, graph_.conjugate(new_vertex));
- }
- }
- else{
- TRACE("Edge is not related");
- StandardEdgeMoving(*out_edges_iter, new_vertex, graph_.EdgeEnd(*out_edges_iter));
- }
- }
- }
- }
-
-private:
- DECL_LOGGER("EdgeGluer");
-};
-
-}
diff --git a/src/dipspades/utils/element_printers.hpp b/src/dipspades/utils/element_printers.hpp
deleted file mode 100644
index 370d435..0000000
--- a/src/dipspades/utils/element_printers.hpp
+++ /dev/null
@@ -1,108 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include <set>
-
-using namespace std;
-using namespace debruijn_graph;
-
-namespace dipspades {
-
-void PrintSimplePath(ostream &out, Graph &g, vector<EdgeId> path){
- for(auto e = path.begin(); e != path.end(); e++)
- out << g.int_id(*e) << " ";
- out << endl;
-}
-
-void PrintSimplePathWithVertices(ostream &out, Graph &g, vector<EdgeId> &path){
- for(auto e = path.begin(); e != path.end(); e++)
- out << g.int_id(*e) << " (" << g.length(*e) << "), " << g.int_id(g.EdgeStart(*e)) << " - " <<
- g.int_id(g.EdgeEnd(*e)) << ". ";
- out << endl;
-}
-
-string SimplePathWithVerticesToString(const Graph &g, vector<EdgeId> path){
- stringstream ss;
- for(auto e = path.begin(); e != path.end(); e++)
- ss << g.int_id(*e) << " (" << g.length(*e) << "), " << g.int_id(g.EdgeStart(*e)) << " - " <<
- g.int_id(g.EdgeEnd(*e)) << ". ";
- return ss.str();
-}
-
-string MappingPathToString(Graph &g, MappingPath<EdgeId> path){
- stringstream ss;
- for(size_t i = 0; i < path.size(); i++){
- Range init = path[i].second.initial_range, mapp = path[i].second.mapped_range;
- ss << "Edge - " << g.str(path[i].first) << " (" << g.length(path[i].first) << ") . Init range - " << init.start_pos <<
- " - " << init.end_pos << ". Mapp range - " << mapp.start_pos << " - " <<
- mapp.end_pos << ". ";
- }
- return ss.str();
-}
-
-template<class T>
-void PrintSet(ostream &out, set<T> set_elem){
- for(auto e = set_elem.begin(); e != set_elem.end(); e++)
- out << *e << " ";
- out << endl;
-}
-
-template<class T>
-string SetToString(set<T> set_elem){
- stringstream ss;
- for(auto e = set_elem.begin(); e != set_elem.end(); e++)
- ss << *e << " ";
- return ss.str();
-}
-
-template<class T>
-void PrintVector(ostream &out, vector<T> vect_elem){
- for(auto e = vect_elem.begin(); e != vect_elem.end(); e++)
- out << *e << " ";
- out << endl;
-}
-
-template<class T>
-string VectorToString(vector<T> vect_elem){
- stringstream ss;
- for(auto e = vect_elem.begin(); e != vect_elem.end(); e++)
- ss << *e << " ";
- return ss.str();
-}
-
-string VerticesVectorToString(Graph &g, vector<VertexId> vertices){
- stringstream ss;
- for(auto it = vertices.begin(); it != vertices.end(); it++)
- ss << g.str(*it) << " ";
- return ss.str();
-}
-
-void PrintEdgeWithVertices(ostream &out, Graph &g, EdgeId edge){
- out << "Edge - " << g.int_id(edge) << ". Start vertex - " << g.int_id(g.EdgeStart(edge)) <<
- ". End vertex - " << g.int_id(g.EdgeEnd(edge)) << endl;
-}
-
-void PrintEdgeWithLength(ostream &out, Graph &g, EdgeId edge){
- out << "Edge - " << g.int_id(edge) << " with length - " << g.length(edge) << endl;
-}
-
-void PrintVectorOfVertices(ostream &out, Graph &g, vector<VertexId> vect){
- for(auto v = vect.begin(); v != vect.end(); v++)
- out << g.int_id(*v) << " ";
- out << endl;
-}
-
-string MappingRangeToString(MappingRange mr){
- stringstream ss;
- ss << "Init: " << mr.initial_range.start_pos << " " << mr.initial_range.end_pos
- << ". Map: " << mr.mapped_range.start_pos << " " << mr.mapped_range.end_pos << endl;
- return ss.str();
-}
-
-}
diff --git a/src/dipspades/utils/files_utils.cpp b/src/dipspades/utils/files_utils.cpp
deleted file mode 100644
index fa4e98e..0000000
--- a/src/dipspades/utils/files_utils.cpp
+++ /dev/null
@@ -1,48 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#include "files_utils.hpp"
-
-namespace dipspades {
-
-vector<string> GetAllLinesFromFile(string filename){
- ifstream freader(filename.c_str());
- vector<string> lines;
- if(!freader.fail()){
- while(!freader.eof()){
- string new_line;
- getline(freader, new_line);
- if(new_line != "")
- lines.push_back(new_line);
- }
- }
- return lines;
-}
-
-string cut_fname_from_path(string path){
- string res = path;
- for(size_t i = path.size() - 1; i > 0; i--)
- if(path[i] == '/'){
- res = path.substr(i + 1, path.size() - i - 1);
- break;
- }
-
- for(size_t i = res.size() - 1; i > 0 ; i--)
- if(res[i] == '.'){
- res = res.substr(0, i);
- break;
- }
-
- return res;
-}
-
-bool fname_valid(string fname){
- ifstream out(fname.c_str());
- return !out.fail();
-}
-
-}
diff --git a/src/dipspades/utils/histogram.hpp b/src/dipspades/utils/histogram.hpp
deleted file mode 100644
index cad66c5..0000000
--- a/src/dipspades/utils/histogram.hpp
+++ /dev/null
@@ -1,104 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include <map>
-#include <vector>
-
-using namespace std;
-
-namespace dipspades {
-
-template <typename T>
-class BaseHistogram {
-
- map<T, size_t> sorted_elems_;
- vector<T> part_sums_;
- bool invalid_part_sums_;
-
- void CalculatePartSums(){
- part_sums_.clear();
- T prev_elem = 0;
- for(auto it = sorted_elems_.begin(); it != sorted_elems_.end(); it++){
- T key = it->first;
- size_t count = it->second;
- for(size_t i = 0; i < count; i++){
- part_sums_.push_back(key + prev_elem);
- prev_elem += key;
- }
- }
- invalid_part_sums_ = false;
- }
-
- T operator[](size_t idx){
- VERIFY(idx < part_sums_.size());
- if(size() == 0)
- return 0;
- if(idx == 0)
- return part_sums_[0];
- return part_sums_[idx] - part_sums_[idx - 1];
- }
-
-public:
- void Add(T new_elem, size_t count = 1){
- invalid_part_sums_ = true;
- if(sorted_elems_.find(new_elem) == sorted_elems_.end())
- sorted_elems_[new_elem] = count;
- else
- sorted_elems_[new_elem] += count;
- }
-
- T Quantile(double quantile){
- VERIFY(quantile > 0 && quantile <= 1);
- if(invalid_part_sums_)
- CalculatePartSums();
- if(part_sums_.size() == 0)
- return 0;
- T total_sum = part_sums_[part_sums_.size() - 1];
- for(size_t i = 0; i < part_sums_.size(); i++)
- if(double(part_sums_[i]) / double(total_sum) >= quantile)
- return (*this)[i];
-
- return T(0);
- }
-
- size_t size() { return part_sums_.size(); }
-
- T max() {
- CalculatePartSums();
- if(size() == 0)
- return 0;
- return (*this)[size() - 1];
- }
-
- void SaveToFile(string filename) const {
- ofstream out(filename.c_str());
- for(auto it = sorted_elems_.begin(); it != sorted_elems_.end(); it++)
- out << it->first << ' ' << it->second << endl;
- }
-
- void LoadFrom(string filename) {
- ifstream src(filename.c_str());
- VERIFY(!src.fail());
- while(!src.eof()){
- string tmp;
- getline(src, tmp);
- if(tmp != ""){
- stringstream ss;
- ss << tmp;
- T elem;
- size_t count;
- ss >> elem;
- ss >> count;
- Add(elem, count);
- }
- }
- }
-};
-
-}
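BaseHistogram::Quantile above computes a value-weighted quantile: elements are expanded in sorted order, a running sum of the values themselves is kept, and the result is the first element at which the running sum reaches the requested fraction of the total (for length histograms and quantile 0.5 this is the familiar N50-style statistic). A standalone sketch; WeightedQuantile is an illustrative name:

    #include <cstddef>
    #include <iostream>
    #include <map>

    size_t WeightedQuantile(const std::map<size_t, size_t> &hist, double q) {
        size_t total = 0;
        for (const auto &kv : hist) total += kv.first * kv.second;   // sum of all values
        size_t running = 0;
        for (const auto &kv : hist)                                  // keys iterate in sorted order
            for (size_t i = 0; i < kv.second; ++i) {
                running += kv.first;
                if (double(running) / double(total) >= q)
                    return kv.first;
            }
        return 0;
    }

    int main() {
        std::map<size_t, size_t> hist = {{100, 5}, {500, 3}, {2000, 1}};
        std::cout << WeightedQuantile(hist, 0.5) << "\n";   // prints 500
    }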
diff --git a/src/dipspades/utils/lcs_utils.hpp b/src/dipspades/utils/lcs_utils.hpp
deleted file mode 100644
index d2fb695..0000000
--- a/src/dipspades/utils/lcs_utils.hpp
+++ /dev/null
@@ -1,146 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include <vector>
-#include <string>
-#include <memory.h>
-
-using namespace std;
-
-namespace dipspades {
-
-template<class T>
-class LCSCalculator{
-
- int ** mask;
-
- void Initialize(size_t length1, size_t length2){
- mask = new int*[length1 + 1];
- for(size_t i = 0; i < length1 + 1; i++){
- mask[i] = new int[length2 + 1];
- memset(mask[i], 0, sizeof(int) * (length2 + 1));
- }
- }
-
- void Terminate(size_t length){
- for(size_t i = 0; i < length + 1; i++)
- delete[] mask[i];
- delete[] mask;
- }
-
- void RecursiveLCSLenCalc(vector<T> str1, vector<T> str2, size_t i1, size_t i2){
-
-// cout << i1 << " - " << i2 << "; " << str1[i1 - 1] << " - " << str2[i2 - 1] << "; ";
-
- if(str1[i1 - 1] == str2[i2 - 1]){
-// cout << "1st case; ";
- mask[i1][i2] = mask[i1 - 1][i2 - 1] + 1;
- }
- else{
-
-// cout << "2nd case; ";
- int res1 = mask[i1][i2 - 1];
- int res2 = mask[i1 - 1][i2];
-
- mask[i1][i2] = max<int>(res1, res2);
- }
- }
-
- int LCSLengthCalculation(vector<T> str1, vector<T> str2){
-
- for(size_t i = 1; i <= str1.size(); i++)
- for(size_t j = 1; j <= str2.size(); j++){
- RecursiveLCSLenCalc(str1, str2, i, j);
- }
-
- return mask[str1.size()][str2.size()];
- }
-
- vector<T> RecursiveRestoreLCS(vector<T> str1, vector<T> str2,
- size_t i, size_t j){
- vector<T> res;
- if(i == 0 || j == 0){
- return res;
- }
-
- if(str1[i - 1] == str2[j - 1]){
- res = RecursiveRestoreLCS(str1, str2, i - 1, j - 1);
- res.push_back(str1[i - 1]);
- return res;
- }
-
- if(mask[i][j - 1] > mask[i - 1][j])
- return RecursiveRestoreLCS(str1, str2, i, j - 1);
- else
- return RecursiveRestoreLCS(str1, str2, i - 1, j);
- }
-
- vector<T> RestoreLCS(vector<T> string1, vector<T> string2, size_t){
-
-// cout << "LCS string length - " << lcs_length << endl;
- vector<T> lcs = RecursiveRestoreLCS(string1, string2, string1.size(), string2.size());
- return lcs;
- }
-
-public:
-
- vector<T> LCS(vector<T> string1, vector<T> string2){
- vector<T> res;
- if(string1.size() == 0 || string2.size() == 0)
- return res;
-
- Initialize(string1.size(), string2.size());
-
- int lcs_length = LCSLengthCalculation(string1, string2);
- res = RestoreLCS(string1, string2, lcs_length);
- Terminate(string1.size());
-
- return res;
- }
-
- vector<size_t> GetPosVectorFromLeft(vector<T> string, vector<T> lcs){
- vector<size_t> pos;
-
- if(string.size() == 0 || lcs.size() == 0)
- return pos;
-
- int str_ind = 0;
- for(size_t i = 0; i < lcs.size(); i++){
- while(string[str_ind] != lcs[i]){
- str_ind++;
- }
- pos.push_back(str_ind);
- str_ind++;
- }
-
- VERIFY(lcs.size() == pos.size());
-
- return pos;
- }
-
- vector<size_t> GetPosVector(vector<T> string, vector<T> lcs){
- vector<size_t> pos;
- if(string.size() == 0 || lcs.size() == 0)
- return pos;
-
- int lcs_ind = int(lcs.size() - 1);
- int str_size = int(string.size());
- for(int i = str_size - 1; i >= 0 && lcs_ind >= 0; i--)
- if(string[i] == lcs[lcs_ind]){
- pos.insert(pos.begin(), size_t(i));
- lcs_ind--;
- }
-
- VERIFY(pos.size() == lcs.size());
-
- return pos;
- }
-};
-
-}
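The removed LCSCalculator is the standard longest-common-subsequence dynamic program plus a backtrack over the score table. The same computation in compact form, with std::vector<int> playing the role of the templated symbol sequence; LCS here is a free function, not the class above:

    #include <algorithm>
    #include <iostream>
    #include <vector>

    std::vector<int> LCS(const std::vector<int> &a, const std::vector<int> &b) {
        size_t n = a.size(), m = b.size();
        // dp[i][j] = LCS length of a[0..i) and b[0..j)
        std::vector<std::vector<int>> dp(n + 1, std::vector<int>(m + 1, 0));
        for (size_t i = 1; i <= n; ++i)
            for (size_t j = 1; j <= m; ++j)
                dp[i][j] = (a[i - 1] == b[j - 1]) ? dp[i - 1][j - 1] + 1
                                                  : std::max(dp[i - 1][j], dp[i][j - 1]);
        // Walk back from the corner to recover one optimal subsequence.
        std::vector<int> lcs;
        for (size_t i = n, j = m; i > 0 && j > 0; ) {
            if (a[i - 1] == b[j - 1]) { lcs.push_back(a[i - 1]); --i; --j; }
            else if (dp[i - 1][j] >= dp[i][j - 1]) --i;
            else --j;
        }
        std::reverse(lcs.begin(), lcs.end());
        return lcs;
    }

    int main() {
        std::vector<int> a = {1, 3, 4, 1, 2, 5}, b = {3, 1, 4, 2, 2, 5};
        for (int x : LCS(a, b)) std::cout << x << " ";   // prints a length-4 LCS
        std::cout << "\n";
    }

GetPosVectorFromLeft and GetPosVector above then map each LCS symbol back to a position in the original sequences; those are straightforward linear scans once the LCS itself is available.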
diff --git a/src/dipspades/utils/path_index.hpp b/src/dipspades/utils/path_index.hpp
deleted file mode 100644
index 17fc6b8..0000000
--- a/src/dipspades/utils/path_index.hpp
+++ /dev/null
@@ -1,68 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include "../consensus_contigs_constructor/mapping_contigs_storage.hpp"
-
-namespace dipspades {
-
-class VertexPathIndex{
- Graph &g_;
-
- map<VertexId, set<size_t> > index_;
-
- void AddNewPair(VertexId v, size_t path_index){
- index_[v].insert(path_index);
- }
-
- set<size_t> JoinTwoSets(set<size_t> set1, set<size_t> set2){
- for(auto it = set2.begin(); it != set2.end(); it++)
- set1.insert(*it);
- return set1;
- }
-
-public:
- VertexPathIndex(Graph &g) : g_(g) {
- }
-
- void Initialize(ContigStoragePtr storage){
- INFO("Initialization of vertex-paths index starts");
- for(size_t i = 0; i < storage->Size(); i++){
- auto path = (*storage)[i]->path_seq();
- if(path.size() > 0){
- VertexId start_vertex = g_.EdgeStart(path[0]);
- AddNewPair(start_vertex, i);
-
- for(auto e = path.begin(); e != path.end(); e++){
- VertexId v = g_.EdgeEnd(*e);
- AddNewPair(v, i);
- }
- }
- }
- INFO("Initialization of vertex-paths index ends");
- }
-
- void Clear(){
- index_.clear();
- }
-
- set<size_t> GetPathsIntersectedWith(vector<EdgeId> path){
- set<size_t> res;
- if(path.size() == 0)
- return res;
- VertexId start_vertex = g_.EdgeStart(path[0]);
- res = index_[start_vertex];
- for(auto e = path.begin(); e != path.end(); e++){
- VertexId v = g_.EdgeEnd(*e);
- res = JoinTwoSets(res, index_[v]);
- }
- return res;
- }
-};
-
-}
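The removed VertexPathIndex is an inverted index from vertex to the indices of contig paths passing through it; the paths intersecting a query path are then just the union of the per-vertex sets. A toy version keyed by integer vertex ids (ToyVertexPathIndex is an illustrative name):

    #include <iostream>
    #include <map>
    #include <set>
    #include <vector>

    class ToyVertexPathIndex {
        std::map<int, std::set<size_t>> index_;   // vertex -> indices of paths using it
    public:
        void AddPath(size_t path_idx, const std::vector<int> &vertices) {
            for (int v : vertices) index_[v].insert(path_idx);
        }
        std::set<size_t> PathsTouching(const std::vector<int> &vertices) const {
            std::set<size_t> res;
            for (int v : vertices) {
                auto it = index_.find(v);
                if (it != index_.end()) res.insert(it->second.begin(), it->second.end());
            }
            return res;
        }
    };

    int main() {
        ToyVertexPathIndex idx;
        idx.AddPath(0, {1, 2, 3});
        idx.AddPath(1, {3, 4, 5});
        for (size_t p : idx.PathsTouching({3})) std::cout << p << " ";   // prints 0 1
        std::cout << "\n";
    }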
diff --git a/src/dipspades/utils/path_routines.hpp b/src/dipspades/utils/path_routines.hpp
deleted file mode 100644
index 6bab192..0000000
--- a/src/dipspades/utils/path_routines.hpp
+++ /dev/null
@@ -1,286 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-#include "debruijn_graph.hpp"
-#include "standard.hpp"
-#include "graph_pack.hpp"
-
-using namespace debruijn_graph;
-
-namespace dipspades {
-
-bool AreEdgesConnected(Graph &g, EdgeId e1, EdgeId e2){
- return g.EdgeEnd(e1) == g.EdgeStart(e2);
-}
-
-bool IsPathConnected(Graph &g, vector<EdgeId> path){
- if(path.size() <= 1)
- return true;
- for(size_t i = 0; i < path.size() - 1; i++){
- EdgeId e1 = path[i];
- EdgeId e2 = path[i + 1];
- if(!AreEdgesConnected(g, e1, e2)){
- return false;
- }
- }
- return true;
-}
-
-bool PathContainsLoop(Graph &g, vector<EdgeId> p){
- if(p.size() == 0)
- return false;
-
- set<VertexId> pathv;
- pathv.insert(g.EdgeStart(p[0]));
-
- for(auto e = p.begin(); e != p.end(); e++){
- VertexId end = g.EdgeEnd(*e);
- if(pathv.find(end) == pathv.end())
- pathv.insert(end);
- else
- return true;
- }
-
- return false;
-}
-
-vector<VertexId> get_list_of_vertices_in_path(Graph &g, vector<EdgeId> path){
- vector<VertexId> list;
- if(path.size() == 0)
- return list;
-
- for(size_t i = 0; i < path.size(); i++)
- list.push_back(g.EdgeStart(path[i]));
-
- list.push_back(g.EdgeEnd(path[path.size() - 1]));
-
- return list;
-}
-
-bool is_1st_edge_not_later_2nd_edge(vector<EdgeId> path, EdgeId first_edge,
- EdgeId second_edge){
- bool first_found = false;
- for(auto e = path.begin(); e != path.end(); e++){
- if(*e == first_edge)
- first_found = true;
- if(*e == second_edge && !first_found)
- return false;
- if(*e == second_edge && first_found)
- return true;
- }
-
- return false;
-}
-
-bool is_1st_edge_not_later_2nd_edge(vector<EdgeId> path, EdgeId first_edge,
- EdgeId second_edge, int ind1, int ind2){
- bool first_found = false;
- //for(auto e = path.begin(); e != path.end(); e++){
- for(int i = ind1; i <= ind2; i++){
- EdgeId e = path[i];
- if(e == first_edge)
- first_found = true;
- if(e == second_edge && !first_found)
- return false;
- if(e == second_edge && first_found)
- return true;
- }
-
- return false;
-}
-
-int get_index_of_edge(vector<EdgeId> path, EdgeId edge){
- for(size_t i = 0; i < path.size(); i++)
- if(path[i] == edge)
- return int(i);
- return -1;
-}
-
-EdgeId GetEdgeById(conj_graph_pack & gp, size_t id){
- for(auto e = gp.g.SmartEdgeBegin(); !e.IsEnd(); ++e)
- if(gp.g.int_id(*e) == id)
- return *e;
- return EdgeId(0);
-}
-
-bool IsPathRegionCorrect(pair<size_t, size_t> region, size_t path_size){
- return region.first < path_size && region.second < path_size;
-}
-
-size_t GetLengthOfPathRegion(Graph &g, vector<EdgeId> path, pair<size_t, size_t> region){
- VERIFY(IsPathRegionCorrect(region, path.size()));
- size_t region_length = 0;
- for(size_t i = region.first; i <= region.second; i++)
- region_length += g.length(path[i]);
- return region_length;
-}
-
-size_t GetPathLength(Graph &g, vector<EdgeId> path){
- if(path.size() == 0)
- return 0;
- return GetLengthOfPathRegion(g, path, pair<size_t, size_t>(0, path.size() - 1));
-}
-
-Sequence GetSequenceOfPathRegion(Graph &g, size_t k_value, vector<EdgeId> path,
- pair<size_t, size_t> region){
- VERIFY(IsPathRegionCorrect(region, path.size()));
-
- if(region.first > region.second)
- return Sequence();
-
- EdgeId cur_edge = path[region.first];
- Sequence seq = g.EdgeNucls(cur_edge);
-
- for(auto i = region.first + 1; i <= region.second; ++i){
- Sequence next_seq = g.EdgeNucls(path[i]);
- seq = seq + next_seq.Subseq(k_value, next_seq.size());
- }
-
- return seq;
-}
-
-Sequence GetSequenceByPath(Graph &g, size_t k_value, const vector<EdgeId> path){
- if(path.size() == 0)
- return Sequence();
- return GetSequenceOfPathRegion(g, k_value, path, pair<size_t, size_t>(0, path.size() - 1));
-}
-
-Sequence GetSequenceByPath(conj_graph_pack &gp, const vector<EdgeId> path){
- if(path.size() == 0)
- return Sequence();
- return GetSequenceOfPathRegion(gp.g, gp.k_value, path, pair<int, int>(0, path.size() - 1));
-}
-
-vector<EdgeId> GetRCToPathSeq(Graph &g, vector<EdgeId> path){
- vector<EdgeId> rc_path;
- for(auto e = path.begin(); e != path.end(); e++){
- rc_path.insert(rc_path.begin(), g.conjugate(*e));
- }
- return rc_path;
-}
-
-MappingPath<EdgeId> GetRCToMappingPath(Graph &g, MappingPath<EdgeId> map_path, size_t seq_size){
- vector<EdgeId> rc_path_seq;
- vector<MappingRange> rc_map_ranges;
- for(size_t i = 0; i < map_path.size(); i++){
- // computing edges sequence
- EdgeId cur_edge = map_path[i].first;
- rc_path_seq.insert(rc_path_seq.begin(), g.conjugate(cur_edge));
-
- // computing initial ranges
- Range init_range = map_path[i].second.initial_range;
- Range rc_init_range(seq_size - init_range.end_pos, seq_size - init_range.start_pos);
-
- // computing mapped ranges
- size_t edge_length = g.length(cur_edge);
- Range map_range = map_path[i].second.mapped_range;
- Range rc_map_range(edge_length - map_range.end_pos, edge_length - map_range.start_pos);
-
- rc_map_ranges.insert(rc_map_ranges.begin(), MappingRange(rc_init_range, rc_map_range));
- }
-
- return MappingPath<EdgeId>(rc_path_seq, rc_map_ranges);
-}
-
-bool ArePathEqual(vector<EdgeId> path1, vector<EdgeId> path2){
- if(path1.size() != path2.size())
- return false;
-
- for(size_t i = 0; i < path1.size(); i++)
- if(path1[i] != path2[i])
- return false;
-
- return true;
-}
-
-bool PathsShareEdge(vector<EdgeId> path1, vector<EdgeId> path2){
- for(auto it1 = path1.begin(); it1 != path1.end(); it1++)
- for(auto it2 = path2.begin(); it2 != path2.end(); it2++)
- if(*it1 == *it2)
- return true;
- return false;
-}
-
-vector<EdgeId> CutSubpathByRegion(vector<EdgeId> path, pair<size_t, size_t> region){
- VERIFY(IsPathRegionCorrect(region, path.size()));
- vector<EdgeId> subpath;
- for(size_t i = region.first; i <= region.second; i++)
- subpath.push_back(path[i]);
- return subpath;
-}
-
-bool IsEdgeRelated(Graph &g, EdgeId edge){
- return g.RelatedVertices(g.EdgeStart(edge), g.EdgeEnd(edge));
-}
-
-bool IsEdgeLoop(Graph &g, EdgeId edge){
- return g.EdgeStart(edge) == g.EdgeEnd(edge);
-}
-
-bool VertexAdjacentRelatedEdges(Graph &g, VertexId vertex){
- auto in_edges = g.IncomingEdges(vertex);
- for(auto it = in_edges.begin(); it != in_edges.end(); it++)
- if(IsEdgeRelated(g, *it))
- return true;
- auto out_edges = g.OutgoingEdges(vertex);
- for(auto it = out_edges.begin(); it != out_edges.end(); it++)
- if(IsEdgeRelated(g, *it))
- return true;
- return false;
-}
-
-bool PathAdjacentRelatedEdges(Graph &g, vector<EdgeId> path, bool check_start = false,
- bool check_end = false){
- for(auto e = path.begin(); e != path.end() - 1; e++)
- if(VertexAdjacentRelatedEdges(g, g.EdgeEnd(*e)))
- return true;
- if(path.size() != 0)
- if(check_start)
- if(VertexAdjacentRelatedEdges(g, g.EdgeStart(path[0])))
- return true;
- if(check_end)
- if(VertexAdjacentRelatedEdges(g, g.EdgeEnd(path[path.size() - 1])))
- return true;
- return false;
-}
-
-vector<size_t> CalculatePathPartLens(Graph &g, vector<EdgeId> path){
- vector<size_t> lens;
- size_t prev_len = 0;
- for(auto e = path.begin(); e != path.end(); e++){
- lens.push_back(prev_len + g.length(*e));
- prev_len += g.length(*e);
- }
- return lens;
-}
-
-/*
-void detect_loop_length(ostream &out, Graph& g, ContigStorage* stor){
-
- for(size_t i = 0; i < stor->Size(); i++){
- vector<EdgeId> path = (*stor)[i]->PathSeq();
-
- vector<VertexId> vert = get_list_of_vertices_in_path(g, path);
-
- for(size_t j = 0; j < vert.size() - 1; j++){
- for(size_t k = j + 1; k < vert.size(); k++){
- if(vert[j] == vert[k]){
- size_t ind1 = j, ind2 = (k == path.size()) ? (k - 1) : k;
- size_t loop_length = 0;
-
- for(size_t l = ind1; l <= ind2; l++)
- loop_length += g.length(path[l]);
- out << loop_length << endl;
- }
- }
- }
- }
-}
-*/
-
-}
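GetSequenceByPath and GetSequenceOfPathRegion above rely on the de Bruijn property that consecutive edges overlap by k characters, so a path's sequence is the first edge followed by the suffix of every subsequent edge with the k-prefix dropped. The same construction on plain strings; SequenceByPath is an illustrative name:

    #include <iostream>
    #include <string>
    #include <vector>

    std::string SequenceByPath(const std::vector<std::string> &edges, size_t k) {
        if (edges.empty()) return "";
        std::string seq = edges[0];
        for (size_t i = 1; i < edges.size(); ++i)
            seq += edges[i].substr(k);   // drop the k-prefix shared with the previous edge
        return seq;
    }

    int main() {
        // k = 3: each edge starts with the last 3 characters of the previous one
        std::vector<std::string> path = {"ACGTA", "GTACC", "ACCGT"};
        std::cout << SequenceByPath(path, 3) << "\n";   // prints ACGTACCGT
    }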
diff --git a/src/dipspades/utils/range_utils.hpp b/src/dipspades/utils/range_utils.hpp
deleted file mode 100644
index 3e67b09..0000000
--- a/src/dipspades/utils/range_utils.hpp
+++ /dev/null
@@ -1,57 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include <vector>
-#include <map>
-#include <set>
-#include <iostream>
-
-#include "math.h"
-
-using namespace std;
-using namespace io;
-
-namespace dipspades {
-
-bool is_intersection_exist(Range r1, Range r2){
- if(r1.end_pos <= r2.start_pos || r2.end_pos <= r1.start_pos)
- return false;
- else
- return true;
-}
-
-Range get_intersection_of_ranges(Range r1, Range r2){
- VERIFY(is_intersection_exist(r1, r2));
- size_t max_start = max<size_t>(r1.start_pos, r2.start_pos);
- size_t min_end = min<size_t>(r1.end_pos, r2.end_pos);
-
- Range r(max_start, min_end);
- return r;
-}
-
-pair<size_t, size_t> project_init_range_to_new(Range old_map_rg, Range new_map_rg, Range old_init_rg){
-
- size_t start_pos, end_pos;
-
- int shift_start = int(new_map_rg.start_pos) - int(old_map_rg.start_pos);
- double start_coeff = double(shift_start) / double(old_map_rg.end_pos - old_map_rg.start_pos);
- start_pos = old_init_rg.start_pos + int(start_coeff * double(old_init_rg.end_pos - old_init_rg.start_pos));
-
- int shift_end = int(new_map_rg.end_pos) - int(old_map_rg.end_pos);
- double end_coeff = double(shift_end) / double(old_map_rg.end_pos - old_map_rg.start_pos);
- end_pos = old_init_rg.end_pos + int(end_coeff * double(old_init_rg.end_pos - old_init_rg.start_pos));
-
- return pair<size_t, size_t>(start_pos, end_pos);
-}
-
-bool is_range_pair_correct(pair<size_t, size_t> p){
- return p.first < p.second;
-}
-
-}
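The removed range helpers are interval arithmetic on half-open [start, end) ranges. A tiny standalone equivalent of the intersection test and the intersection itself, with a plain struct instead of the assembler's Range:

    #include <algorithm>
    #include <cassert>
    #include <cstddef>
    #include <iostream>

    struct Range { size_t start, end; };

    bool Intersects(Range a, Range b) {
        return !(a.end <= b.start || b.end <= a.start);
    }

    Range Intersection(Range a, Range b) {
        assert(Intersects(a, b));
        return {std::max(a.start, b.start), std::min(a.end, b.end)};
    }

    int main() {
        Range r = Intersection({10, 50}, {30, 80});
        std::cout << r.start << " " << r.end << "\n";   // prints 30 50
    }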
diff --git a/src/dipspades/utils/redundancy_map.hpp b/src/dipspades/utils/redundancy_map.hpp
deleted file mode 100644
index ef01e7c..0000000
--- a/src/dipspades/utils/redundancy_map.hpp
+++ /dev/null
@@ -1,235 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-using namespace std;
-
-namespace dipspades {
-
-template<typename Id>
-class RedundancyMap{
- map<Id, set<Id> > red_map_;
- set<Id> all_ids_;
-
- void ComputeAllIDs(){
- for(auto it = red_map_.begin(); it != red_map_.end(); it++){
- all_ids_.insert(it->first);
- all_ids_.insert(it->second.begin(), it->second.end());
- }
- }
-
-public:
- void AddNewKey(Id key){
- set<Id> empty_set;
- red_map_[key] = empty_set;
-
- all_ids_.insert(key);
- }
-
- void AddNewPair(Id key, Id value){
- red_map_[key].insert(value);
-
- all_ids_.insert(key);
- all_ids_.insert(value);
- }
-
- set<Id> GetValuesByKey(Id key){
- return red_map_[key];
- }
-
- void SetValuesByKey(Id key, set<Id> values){
- red_map_[key].insert(values.begin(), values.end());
- all_ids_.insert(values.begin(), values.end());
- all_ids_.insert(key);
- }
-
- bool ContainsKey(Id key){
- return red_map_.find(key) != red_map_.end();
- }
-
- size_t AllElementsNumber(){
-
- ComputeAllIDs();
-
- return all_ids_.size();
- }
-
- set<Id> AllElements(){
-
- ComputeAllIDs();
-
- return all_ids_;
- }
-
- typename map<Id, set<Id> >::iterator begin(){
- return red_map_.begin();
- }
-
- typename map<Id, set<Id> >::iterator end(){
- return red_map_.end();
- }
-
- void Clear(){
- red_map_.clear();
- all_ids_.clear();
- }
-};
-
-template<typename Id>
-class RedundancyMapCondenser{
-
- RedundancyMap<Id> uncondensed_map_;
- RedundancyMap<Id> condensed_map_;
-
- size_t number_processed_;
- size_t need_processed_;
-
- map<Id, bool> is_processed_;
-
- void ProcessCondensing(){
- bool non_zero_processed = true;
- while(number_processed_ < need_processed_ && non_zero_processed){
- int num_cur_processed = 0;
- for(auto it = condensed_map_.begin(); it != condensed_map_.end(); it++){
- set<Id> cur_set = it->second;
-
- for(auto it_set = cur_set.begin(); it_set != cur_set.end(); it_set++){
- if(!is_processed_[*it_set]){
- set<Id> child_set = uncondensed_map_.GetValuesByKey(*it_set);
- it->second.insert(child_set.begin(), child_set.end());
-
- is_processed_[*it_set] = true;
- num_cur_processed++;
- }
- }
- }
- non_zero_processed = num_cur_processed != 0;
- number_processed_ += num_cur_processed;
- TRACE("Number of processed - " << number_processed_ << ", total number - " << need_processed_);
- }
-
- }
-
- void ClearParams(){
- number_processed_ = 0;
- need_processed_ = 0;
- is_processed_.clear();
- condensed_map_.Clear();
- }
-
-public:
- RedundancyMap<Id> Condense(RedundancyMap<Id> uncondensed_map){
- uncondensed_map_ = uncondensed_map;
- ClearParams();
-
- TRACE("Start condensing");
-
- TRACE("Computing of main keys");
- auto all_ids_ = uncondensed_map_.AllElements();
- map<Id, bool> is_main;
- for(auto it = all_ids_.begin(); it != all_ids_.end(); it++)
- is_main[*it] = true;
-
- for(auto it = uncondensed_map_.begin(); it != uncondensed_map_.end(); it++){
- for(auto it_set = it->second.begin(); it_set != it->second.end(); it_set++){
- is_main[*it_set] = false;
- }
- }
-
- set<Id> main_keys;
- for(auto it = is_main.begin(); it != is_main.end(); it++)
- if(it->second)
- main_keys.insert(it->first);
-
- TRACE("Number of all keys - " << all_ids_.size());
- TRACE("Number of main keys - " << main_keys.size());
- TRACE("Condensing starts");
-
- need_processed_ = all_ids_.size();
- number_processed_ = 0;
-
- for(auto it = all_ids_.begin(); it != all_ids_.end(); it++)
- is_processed_[*it] = false;
-
- for(auto main_key = main_keys.begin(); main_key != main_keys.end(); main_key++){
- condensed_map_.SetValuesByKey(*main_key, uncondensed_map_.GetValuesByKey(*main_key));
- number_processed_++;
- is_processed_[*main_key] = true;
- }
-
- // main processing
- ProcessCondensing();
-
- // processing of ids not reachable from any main key
- while(number_processed_ < need_processed_){
- size_t max_child_setsize = 0;
- Id start_id(0);
- for(auto it = is_processed_.begin(); it != is_processed_.end(); it++){
- if(!it->second && uncondensed_map_.GetValuesByKey(it->first).size() >= max_child_setsize){
- start_id = it->first;
- max_child_setsize = uncondensed_map_.GetValuesByKey(it->first).size();
- }
- }
- auto start_set = uncondensed_map_.GetValuesByKey(start_id);
- for(auto it = start_set.begin(); it != start_set.end(); it++)
- if(!is_processed_[*it])
- condensed_map_.AddNewPair(start_id, *it);
-
- is_processed_[start_id] = true;
- number_processed_++;
- ProcessCondensing();
- }
-
- VERIFY(number_processed_ == need_processed_);
- return condensed_map_;
- }
-};
-
-template<typename Id>
-class RedundancyMapMerger{
-
- bool AreMergeResultsCorrect(RedundancyMap<Id> old_map, RedundancyMap<Id> new_map){
-// cout << "Correctness - " << old_map.AllElementsNumber() << " " << new_map.AllElementsNumber() << endl;
- return old_map.AllElementsNumber() == new_map.AllElementsNumber();
- }
-
-public:
- RedundancyMap<Id> MergeTwoMaps(RedundancyMap<Id> map1, RedundancyMap<Id> map2){
-
- for(auto it_old = map1.begin(); it_old != map1.end(); it_old++){
- Id old_key = it_old->first;
- auto old_set = it_old->second;
-
- if(map2.ContainsKey(old_key)){
- map2.SetValuesByKey(old_key, old_set);
- }
- else{
- bool is_found = false;
-
- for(auto it_new = map2.begin(); it_new != map2.end(); it_new++){
- Id new_key = it_new->first;
- auto new_set = it_new->second;
- if(new_set.find(old_key) != new_set.end()){
- map2.SetValuesByKey(new_key, old_set);
- is_found = true;
- break;
- }
- }
-
- if(!is_found)
- map2.SetValuesByKey(old_key, old_set);
- }
- }
- RedundancyMapCondenser<Id> condenser;
- map2 = condenser.Condense(map2);
- VERIFY(AreMergeResultsCorrect(map1, map2));
- return map2;
- }
-};
-
-}
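RedundancyMapCondenser above computes, for every "main" key (one that never occurs as a value of another key), the transitive closure of its value set; a second sweep then picks up ids unreachable from any main key, which this sketch omits. A compact stack-based version on std::map; Condense here is a free function for illustration, not the class above:

    #include <iostream>
    #include <map>
    #include <set>
    #include <vector>

    std::map<int, std::set<int>> Condense(const std::map<int, std::set<int>> &red) {
        // Main keys are the keys that never appear as anybody's value.
        std::set<int> referenced;
        for (const auto &kv : red) referenced.insert(kv.second.begin(), kv.second.end());
        std::map<int, std::set<int>> condensed;
        for (const auto &kv : red) {
            if (referenced.count(kv.first)) continue;
            std::set<int> closure;
            std::vector<int> stack(kv.second.begin(), kv.second.end());
            while (!stack.empty()) {                        // transitive closure of the value set
                int cur = stack.back();
                stack.pop_back();
                if (!closure.insert(cur).second) continue;  // already expanded
                auto it = red.find(cur);
                if (it != red.end())
                    stack.insert(stack.end(), it->second.begin(), it->second.end());
            }
            condensed[kv.first] = closure;
        }
        return condensed;
    }

    int main() {
        std::map<int, std::set<int>> red = {{1, {2, 3}}, {2, {4}}, {3, {}}, {4, {}}};
        auto condensed = Condense(red);
        for (int x : condensed[1]) std::cout << x << " ";   // prints 2 3 4
        std::cout << "\n";
    }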
diff --git a/src/dipspades/utils/sequence_utils.hpp b/src/dipspades/utils/sequence_utils.hpp
deleted file mode 100644
index 4a2a7bb..0000000
--- a/src/dipspades/utils/sequence_utils.hpp
+++ /dev/null
@@ -1,36 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-using namespace debruijn_graph;
-using namespace std;
-
-#include "bulge_utils.hpp"
-
-namespace dipspades {
-
-double RelativeAlignmentOfSequencesByMinimal(Sequence seq1, Sequence seq2, bool from_start = true){
- Sequence trim_seq1, trim_seq2;
- if(min<size_t>(seq1.size(), seq2.size()) == seq1.size()){
- trim_seq1 = seq1;
- if(from_start)
- trim_seq2 = seq2.Subseq(0, seq1.size());
- else
- trim_seq2 = seq2.Subseq(seq2.size() - seq1.size(), seq2.size());
- }
- else{
- if(from_start)
- trim_seq1 = seq1.Subseq(0, seq2.size());
- else
- trim_seq1 = seq1.Subseq(seq1.size() - seq2.size(), seq1.size());
- trim_seq2 = seq2;
- }
- return RelAlignmentOfSequences(trim_seq1, trim_seq2);
-}
-
-}
diff --git a/src/hammer/CMakeLists.txt b/src/hammer/CMakeLists.txt
deleted file mode 100644
index 7b02ce6..0000000
--- a/src/hammer/CMakeLists.txt
+++ /dev/null
@@ -1,36 +0,0 @@
-############################################################################
-# Copyright (c) 2015 Saint Petersburg State University
-# Copyright (c) 2011-2014 Saint Petersburg Academic University
-# All Rights Reserved
-# See file LICENSE for details.
-############################################################################
-
-project(hammer CXX)
-
-include_directories(${CMAKE_CURRENT_SOURCE_DIR})
-
-add_executable(hammer
- main.cpp
- hammer_tools.cpp
- hamcluster.cpp
- kmer_cluster.cpp
- kmer_data.cpp
- config_struct_hammer.cpp
- read_corrector.cpp
- expander.cpp)
-
-# add_subdirectory(quake_count)
-# add_subdirectory(gen_test_data)
-
-target_link_libraries(hammer input cityhash BamTools yaml-cpp format input ${COMMON_LIBRARIES})
-
-if (SPADES_STATIC_BUILD)
- set_target_properties(hammer PROPERTIES LINK_SEARCH_END_STATIC 1)
-endif()
-
-install(TARGETS hammer
- DESTINATION bin
- COMPONENT runtime)
-install(DIRECTORY "${SPADES_CFG_DIR}/hammer"
- DESTINATION share/spades/configs
- FILES_MATCHING PATTERN "*.info.template")
diff --git a/src/hammer/config_struct_hammer.cpp b/src/hammer/config_struct_hammer.cpp
deleted file mode 100644
index d9df167..0000000
--- a/src/hammer/config_struct_hammer.cpp
+++ /dev/null
@@ -1,86 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-/*
- * config_struct_hammer.cpp
- *
- * Created on: Oct 15, 2011
- * Author: snikolenko
- */
-
-#include "config_struct_hammer.hpp"
-#include "config_common.hpp"
-#include "openmp_wrapper.h"
-
-#include <boost/property_tree/ptree.hpp>
-#include <string>
-
-void load(hammer_config& cfg, const std::string &filename) {
- boost::property_tree::ptree pt;
- boost::property_tree::read_info(filename, pt);
-
- load(cfg, pt);
-}
-
-void load(hammer_config& cfg, boost::property_tree::ptree const& pt) {
- using config_common::load;
- load(cfg.general_do_everything_after_first_iteration, pt, "general_do_everything_after_first_iteration");
- load(cfg.general_hard_memory_limit, pt, "general_hard_memory_limit");
- load(cfg.general_max_nthreads, pt, "general_max_nthreads");
- load(cfg.general_tau, pt, "general_tau");
- load(cfg.general_max_iterations, pt, "general_max_iterations");
- load(cfg.general_debug, pt, "general_debug");
-
- load(cfg.count_do, pt, "count_do");
- load(cfg.count_numfiles, pt, "count_numfiles");
- load(cfg.count_merge_nthreads, pt, "count_merge_nthreads");
- load(cfg.count_split_buffer, pt, "count_split_buffer");
- load(cfg.count_filter_singletons, pt, "count_filter_singletons");
-
- load(cfg.hamming_do, pt, "hamming_do");
- load(cfg.hamming_blocksize_quadratic_threshold, pt, "hamming_blocksize_quadratic_threshold");
-
- load(cfg.bayes_do, pt, "bayes_do");
- load(cfg.bayes_nthreads, pt, "bayes_nthreads");
- load(cfg.bayes_singleton_threshold, pt, "bayes_singleton_threshold");
- load(cfg.bayes_nonsingleton_threshold, pt, "bayes_nonsingleton_threshold");
- load(cfg.bayes_discard_only_singletons, pt, "bayes_discard_only_singletons");
- load(cfg.bayes_debug_output, pt, "bayes_debug_output");
- load(cfg.bayes_use_hamming_dist, pt, "bayes_use_hamming_dist");
- load(cfg.bayes_hammer_mode, pt, "bayes_hammer_mode");
- load(cfg.bayes_write_bad_kmers, pt, "bayes_write_bad_kmers");
- load(cfg.bayes_write_solid_kmers, pt, "bayes_write_solid_kmers");
- load(cfg.bayes_initial_refine, pt, "bayes_initial_refine");
-
- load(cfg.expand_do, pt, "expand_do");
- load(cfg.expand_max_iterations, pt, "expand_max_iterations");
- load(cfg.expand_nthreads, pt, "expand_nthreads");
- load(cfg.expand_write_each_iteration, pt, "expand_write_each_iteration");
- load(cfg.expand_write_kmers_result, pt, "expand_write_kmers_result");
-
- load(cfg.correct_do, pt, "correct_do");
- load(cfg.correct_nthreads, pt, "correct_nthreads");
- load(cfg.correct_threshold, pt, "correct_threshold");
- load(cfg.correct_use_threshold, pt, "correct_use_threshold");
- load(cfg.correct_readbuffer, pt, "correct_readbuffer");
- load(cfg.correct_discard_bad, pt, "correct_discard_bad");
- load(cfg.correct_stats, pt, "correct_stats");
-
- std::string fname;
- load(fname, pt, "dataset");
- cfg.dataset.load(fname);
-
- load(cfg.input_working_dir, pt, "input_working_dir");
- load(cfg.input_trim_quality, pt, "input_trim_quality");
- cfg.input_qvoffset_opt = pt.get_optional<int>("input_qvoffset");
- load(cfg.output_dir, pt, "output_dir");
-
- // Fix number of threads according to OMP capabilities.
- cfg.general_max_nthreads = std::min(cfg.general_max_nthreads, (unsigned)omp_get_max_threads());
- // Inform OpenMP runtime about this :)
- omp_set_num_threads(cfg.general_max_nthreads);
-}
diff --git a/src/hammer/config_struct_hammer.hpp b/src/hammer/config_struct_hammer.hpp
deleted file mode 100644
index f22670a..0000000
--- a/src/hammer/config_struct_hammer.hpp
+++ /dev/null
@@ -1,89 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-/*
- * config_struct_hammer.hpp
- *
- * Created on: Aug 15, 2011
- * Author: snikolenko
- */
-
-#ifndef CONFIG_STRUCT_HAMMER_HPP_
-#define CONFIG_STRUCT_HAMMER_HPP_
-
-#include "config_singl.hpp"
-
-#include "io/library.hpp"
-
-#include <boost/optional.hpp>
-#include <boost/property_tree/ptree_fwd.hpp>
-
-#include <string>
-
-#define CONFIG_FILENAME "/home/snikolenko/algorithmic-biology/assembler/src/hammer/config.inp"
-
-// struct for debruijn project's configuration file
-struct hammer_config {
- io::DataSet<> dataset;
-
- std::string input_working_dir;
- int input_trim_quality;
- boost::optional<int> input_qvoffset_opt;
- int input_qvoffset;
- std::string output_dir;
-
- bool general_do_everything_after_first_iteration;
- int general_hard_memory_limit;
- unsigned general_max_nthreads;
- int general_tau;
- unsigned general_max_iterations;
- bool general_debug;
-
- bool count_do;
- unsigned count_numfiles;
- unsigned count_merge_nthreads;
- size_t count_split_buffer;
- bool count_filter_singletons;
-
- bool hamming_do;
- unsigned hamming_blocksize_quadratic_threshold;
-
- bool bayes_do;
- unsigned bayes_nthreads;
- double bayes_singleton_threshold;
- double bayes_nonsingleton_threshold;
- bool bayes_discard_only_singletons;
- unsigned bayes_debug_output;
- bool bayes_use_hamming_dist;
- bool bayes_hammer_mode;
- bool bayes_write_solid_kmers;
- bool bayes_write_bad_kmers;
- bool bayes_initial_refine;
-
- bool expand_do;
- unsigned expand_max_iterations;
- unsigned expand_nthreads;
- bool expand_write_each_iteration;
- bool expand_write_kmers_result;
-
- bool correct_do;
- bool correct_discard_bad;
- bool correct_use_threshold;
- double correct_threshold;
- unsigned correct_readbuffer;
- unsigned correct_nthreads;
- bool correct_stats;
-};
-
-
-// main debruijn config load function
-void load(hammer_config& cfg, const std::string &filename);
-void load(hammer_config& cfg, boost::property_tree::ptree const& pt);
-
-typedef config_common::config<hammer_config> cfg;
-
-#endif
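
The deleted config loader and struct above follow the usual boost::property_tree pattern: parse the INFO-format config into a ptree, then pull each key into the corresponding struct field. Below is a minimal standalone sketch of that pattern, using plain pt.get<T>() calls and made-up struct/key names rather than the project's config_common::load helpers:

    #include <boost/property_tree/info_parser.hpp>
    #include <boost/property_tree/ptree.hpp>
    #include <boost/optional.hpp>
    #include <string>

    // Toy stand-in for hammer_config; field and key names are illustrative only.
    struct toy_config {
        unsigned max_nthreads;
        int trim_quality;
        boost::optional<int> qvoffset;   // optional key, may be absent in the file
    };

    // Parse an INFO-format config file and fill the struct field by field.
    toy_config load_toy_config(const std::string &filename) {
        boost::property_tree::ptree pt;
        boost::property_tree::read_info(filename, pt);

        toy_config cfg;
        cfg.max_nthreads = pt.get<unsigned>("general_max_nthreads");
        cfg.trim_quality = pt.get<int>("input_trim_quality");
        cfg.qvoffset     = pt.get_optional<int>("input_qvoffset");
        return cfg;
    }
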
diff --git a/src/hammer/expander.cpp b/src/hammer/expander.cpp
deleted file mode 100644
index b6a7a6f..0000000
--- a/src/hammer/expander.cpp
+++ /dev/null
@@ -1,70 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#include "expander.hpp"
-
-#include "config_struct_hammer.hpp"
-#include "globals.hpp"
-#include "kmer_data.hpp"
-#include "valid_kmer_generator.hpp"
-
-#include "io/read.hpp"
-
-#include <vector>
-#include <cstring>
-
-bool Expander::operator()(const Read &r) {
- int trim_quality = cfg::get().input_trim_quality;
-
- // FIXME: Get rid of this
- Read cr = r;
- size_t sz = cr.trimNsAndBadQuality(trim_quality);
-
- if (sz < hammer::K)
- return false;
-
- std::vector<unsigned> covered_by_solid(sz, false);
- std::vector<size_t> kmer_indices(sz, -1ull);
-
- ValidKMerGenerator<hammer::K> gen(cr);
- while (gen.HasMore()) {
- hammer::KMer kmer = gen.kmer();
- size_t idx = data_.checking_seq_idx(kmer);
- if (idx != -1ULL) {
- size_t read_pos = gen.pos() - 1;
-
- kmer_indices[read_pos] = idx;
- if (data_[idx].good()) {
- for (size_t j = read_pos; j < read_pos + hammer::K; ++j)
- covered_by_solid[j] = true;
- }
- }
- gen.Next();
- }
-
- for (size_t j = 0; j < sz; ++j)
- if (!covered_by_solid[j])
- return false;
-
- for (size_t j = 0; j < sz; ++j) {
- if (kmer_indices[j] == -1ull)
- continue;
-
- // FIXME: Do not lock everything
- KMerStat &kmer_data = data_[kmer_indices[j]];
- if (!kmer_data.good()) {
-# pragma omp atomic
- changed_ += 1;
-
- kmer_data.lock();
- kmer_data.mark_good();
- kmer_data.unlock();
- }
- }
-
- return false;
-}
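
The Expander pass above grows the solid k-mer set: a read is trimmed, and if every position of the trimmed read is covered by at least one already-solid k-mer, its remaining k-mers are promoted to solid. A toy version of just the coverage test (plain indices, no concurrent marking; the function name is illustrative):

    #include <cstddef>
    #include <vector>

    // True iff every position 0..len-1 is covered by at least one solid k-mer
    // of length k starting at one of solid_starts.
    bool fully_covered(size_t len, size_t k, const std::vector<size_t> &solid_starts) {
        std::vector<bool> covered(len, false);
        for (size_t s : solid_starts)
            for (size_t j = s; j < s + k && j < len; ++j)
                covered[j] = true;
        for (size_t j = 0; j < len; ++j)
            if (!covered[j])
                return false;
        return true;
    }
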
diff --git a/src/hammer/hamcluster.cpp b/src/hammer/hamcluster.cpp
deleted file mode 100644
index 4db02ab..0000000
--- a/src/hammer/hamcluster.cpp
+++ /dev/null
@@ -1,288 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#include "hamcluster.hpp"
-
-#include "adt/concurrent_dsu.hpp"
-#include "io/mmapped_reader.hpp"
-#include "parallel_radix_sort.hpp"
-
-#include "config_struct_hammer.hpp"
-#include "globals.hpp"
-
-#include <iostream>
-#include <sstream>
-
-class EncoderKMer {
-public:
- inline static size_t extract(const SubKMer &x, unsigned shift, unsigned Base) {
- size_t idx = shift / SubKMer::TBits;
- size_t ishift = shift - idx * SubKMer::TBits;
- return (x.data()[idx] >> ishift) & ((1 << Base) - 1);
- }
-};
-
-struct SubKMerComparator {
- bool operator()(const SubKMer &l, const SubKMer &r) const {
- for (size_t i = 0; i < SubKMer::DataSize ; ++i) {
- if (l.data()[i] != r.data()[i]) {
- return (l.data()[i] < r.data()[i]);
- }
- }
-
- return false;
- }
-};
-
-template<class Op>
-std::pair<size_t, size_t> SubKMerSplitter::split(Op &&op) {
- std::vector<SubKMer> data; std::vector<size_t> blocks;
-
- MMappedReader bifs(bifname_, /* unlink */ true);
- MMappedReader kifs(kifname_, /* unlink */ true);
- size_t icnt = 0, ocnt = 0;
- while (bifs.good()) {
- deserialize(blocks, data, bifs, kifs);
-
- using PairSort = parallel_radix_sort::PairSort<SubKMer, size_t, SubKMer, EncoderKMer>;
- // PairSort::InitAndSort(data.data(), blocks.data(), data.size());
- PairSort::InitAndSort(data.data(), blocks.data(), data.size(), data.size() > 1000*16 ? -1 : 1);
-
- for (auto start = data.begin(), end = data.end(); start != end;) {
- auto chunk_end = std::upper_bound(start + 1, data.end(), *start, SubKMerComparator());
- op(blocks.begin() + (start - data.begin()), chunk_end - start);
- start = chunk_end;
- ocnt += 1;
- }
- icnt += 1;
- }
-
- return std::make_pair(icnt, ocnt);
-}
-
-#if 1
-static bool canMerge(const ConcurrentDSU &uf, size_t x, size_t y) {
- size_t szx = uf.set_size(x), szy = uf.set_size(y);
- const size_t hardthr = 2500;
-
- // Global threshold - no cluster larger than hard threshold
- if (szx + szy > hardthr)
- return false;
-
- // If one of the clusters is moderately large, then attach "almost" singletons
- // only.
- if ((szx > hardthr * 3 / 4 && szy > 50) ||
- (szy > hardthr * 3 / 4 && szx > 50))
- return false;
-
- return true;
-}
-#else
-static bool canMerge(const ConcurrentDSU &uf, size_t x, size_t y) {
- return (uf.set_size(x) + uf.set_size(y)) < 10000;
-}
-#endif
-
-
-static void processBlockQuadratic(ConcurrentDSU &uf,
- const std::vector<size_t>::iterator &block,
- size_t block_size,
- const KMerData &data,
- unsigned tau) {
- for (size_t i = 0; i < block_size; ++i) {
- size_t x = block[i];
- hammer::KMer kmerx = data.kmer(x);
- for (size_t j = i + 1; j < block_size; j++) {
- size_t y = block[j];
- hammer::KMer kmery = data.kmer(y);
- if (!uf.same(x, y) &&
- canMerge(uf, x, y) &&
- hamdistKMer(kmerx, kmery, tau) <= tau) {
- uf.unite(x, y);
- }
- }
- }
-}
-
-void KMerHamClusterer::cluster(const std::string &prefix,
- const KMerData &data,
- ConcurrentDSU &uf) {
- // First pass - split & sort the k-mers
- std::string fname = prefix + ".first", bfname = fname + ".blocks", kfname = fname + ".kmers";
- std::ofstream bfs(bfname, std::ios::out | std::ios::binary);
- std::ofstream kfs(kfname, std::ios::out | std::ios::binary);
- VERIFY(bfs.good()); VERIFY(kfs.good());
-
- INFO("Serializing sub-kmers.");
- for (unsigned i = 0; i < tau_ + 1; ++i) {
- size_t from = (*Globals::subKMerPositions)[i];
- size_t to = (*Globals::subKMerPositions)[i+1];
-
- INFO("Serializing: [" << from << ", " << to << ")");
- serialize(bfs, kfs,
- data, NULL, 0,
- SubKMerPartSerializer(from, to));
- }
- VERIFY(!bfs.fail()); VERIFY(!kfs.fail());
- bfs.close(); kfs.close();
-
- size_t big_blocks1 = 0;
- {
- unsigned block_thr = cfg::get().hamming_blocksize_quadratic_threshold;
-
- INFO("Splitting sub-kmers, pass 1.");
- SubKMerSplitter Splitter(bfname, kfname);
-
- fname = prefix + ".second", bfname = fname + ".blocks", kfname = fname + ".kmers";
- bfs.open(bfname, std::ios::out | std::ios::binary);
- kfs.open(kfname, std::ios::out | std::ios::binary);
- VERIFY(bfs.good()); VERIFY(kfs.good());
-
- std::pair<size_t, size_t> stat =
- Splitter.split([&] (const std::vector<size_t>::iterator &start, size_t sz) {
- if (sz < block_thr) {
- // Merge small blocks.
- processBlockQuadratic(uf, start, sz, data, tau_);
- } else {
- big_blocks1 += 1;
- // Otherwise - dump for next iteration.
- for (unsigned i = 0; i < tau_ + 1; ++i) {
- serialize(bfs, kfs,
- data, &start, sz,
- SubKMerStridedSerializer(i, tau_ + 1));
- }
- }
- });
- INFO("Splitting done."
- " Processed " << stat.first << " blocks."
- " Produced " << stat.second << " blocks.");
-
- // Sanity check - there cannot be more blocks than (tau + 1) times the total
- // k-mer number. And on the first pass we have only tau + 1 input blocks!
- VERIFY(stat.first == tau_ + 1);
- VERIFY(stat.second <= (tau_ + 1) * data.size());
-
- VERIFY(!bfs.fail()); VERIFY(!kfs.fail());
- bfs.close(); kfs.close();
- INFO("Merge done, total " << big_blocks1 << " new blocks generated.");
- }
-
- size_t big_blocks2 = 0;
- {
- INFO("Splitting sub-kmers, pass 2.");
- SubKMerSplitter Splitter(bfname, kfname);
- size_t nblocks = 0;
- std::pair<size_t, size_t> stat =
- Splitter.split([&] (const std::vector<size_t>::iterator &start, size_t sz) {
- if (sz > 50) {
- big_blocks2 += 1;
-#if 0
- for (size_t i = 0; i < block.size(); ++i) {
- std::string s(Globals::blob + data[block[i]], K);
- INFO("" << block[i] << ": " << s);
- }
-#endif
- }
- processBlockQuadratic(uf, start, sz, data, tau_);
- nblocks += 1;
- });
- INFO("Splitting done."
- " Processed " << stat.first << " blocks."
- " Produced " << stat.second << " blocks.");
-
- // Sanity check - there cannot be more blocks than (tau + 1)^2 times the total
- // k-mer number. And there should be (tau + 1) * big_blocks1 input blocks.
- VERIFY(stat.first == (tau_ + 1)*big_blocks1);
- VERIFY(stat.second <= (tau_ + 1) * (tau_ + 1) * data.size());
-
- INFO("Merge done, saw " << big_blocks2 << " big blocks out of " << nblocks << " processed.");
- }
-}
-
-enum {
- UNLOCKED = 0,
- PARTIALLY_LOCKED = 1,
- FULLY_LOCKED = 3
-};
-
-static bool canMerge2(const ConcurrentDSU &uf, size_t kidx, size_t cidx) {
- // If either of indices is fully locked - bail out
- uint64_t kaux = uf.root_aux(kidx), caux = uf.root_aux(cidx);
- if (kaux == FULLY_LOCKED || caux == FULLY_LOCKED)
- return false;
-
- // Otherwise there is a possibility to merge stuff.
- if (0 && (kaux == PARTIALLY_LOCKED || caux == PARTIALLY_LOCKED)) {
- // We cannot merge two partially locked clusters.
- return kaux != caux;
- }
-
- return true;
-}
-
-static void ClusterChunk(size_t start_idx, size_t end_idx, const KMerData &data, ConcurrentDSU &uf) {
- unsigned nthreads = cfg::get().general_max_nthreads;
-
- // INFO("Cluster: " << start_idx << ":" << end_idx);
-# pragma omp parallel num_threads(nthreads)
- {
-# pragma omp for
- for (size_t idx = start_idx; idx < end_idx; ++idx) {
- hammer::KMer kmer = data.kmer(idx);
-
- if (kmer.GetHash() > (!kmer).GetHash())
- continue;
-
- size_t kidx = data.seq_idx(kmer);
- size_t rckidx = -1ULL;
- // INFO("" << kmer << ":" << kidx);
-
- for (size_t k = 0; k < hammer::K; ++k) {
- hammer::KMer candidate = kmer;
- char c = candidate[k];
- for (char nc = 0; nc < 4; ++nc) {
- if (nc == c)
- continue;
- candidate.set(k, nc);
- size_t cidx = data.checking_seq_idx(candidate);
- // INFO("" << candidate << ":" << cidx);
- if (cidx != -1ULL && canMerge2(uf, kidx, cidx)) {
- uf.unite(kidx, cidx);
-
- size_t rccidx = data.seq_idx(!candidate);
- if (rckidx == -1ULL)
- rckidx = data.seq_idx(!kmer);
- uf.unite(rckidx, rccidx);
- }
- }
- }
- }
-# pragma omp barrier
- //INFO("Lock: " << start_idx << ":" << end_idx);
-# pragma omp for
- for (size_t idx = start_idx; idx < end_idx; ++idx) {
- if (uf.set_size(idx) < 2500)
- continue;
-
- if (uf.root_aux(idx) != FULLY_LOCKED)
- uf.set_root_aux(idx, FULLY_LOCKED);
- }
- }
-}
-
-void TauOneKMerHamClusterer::cluster(const std::string &, const KMerData &data, ConcurrentDSU &uf) {
- size_t start_idx = 0;
- while (start_idx < data.size()) {
- size_t end_idx = start_idx + 64*1024;
- if (end_idx > data.size())
- end_idx = data.size();
-
- ClusterChunk(start_idx, end_idx, data, uf);
-
- start_idx = end_idx;
- }
-}
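
The two-pass clustering above rests on a pigeonhole argument: each k-mer is cut into tau + 1 sub-k-mers (at positions i*K/(tau+1)), so two k-mers within Hamming distance tau must agree on at least one sub-k-mer and therefore meet in at least one block. A toy illustration of the idea on plain strings (not the project's Seq/SubKMer types; names are illustrative):

    #include <cassert>
    #include <string>
    #include <vector>

    // Cut a k-mer into tau + 1 contiguous chunks, mirroring the sub-k-mer positions.
    std::vector<std::string> sub_kmers(const std::string &kmer, unsigned tau) {
        std::vector<std::string> parts;
        size_t k = kmer.size();
        for (unsigned i = 0; i <= tau; ++i) {
            size_t from = i * k / (tau + 1);
            size_t to = (i + 1) * k / (tau + 1);
            parts.push_back(kmer.substr(from, to - from));
        }
        return parts;
    }

    // If hamdist(a, b) <= tau, the mismatches cannot hit all tau + 1 chunks,
    // so at least one chunk is identical and a, b land in a common block.
    bool share_sub_kmer(const std::string &a, const std::string &b, unsigned tau) {
        assert(a.size() == b.size());
        std::vector<std::string> pa = sub_kmers(a, tau), pb = sub_kmers(b, tau);
        for (size_t i = 0; i < pa.size(); ++i)
            if (pa[i] == pb[i])
                return true;
        return false;
    }
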
diff --git a/src/hammer/hamcluster.hpp b/src/hammer/hamcluster.hpp
deleted file mode 100644
index d2041af..0000000
--- a/src/hammer/hamcluster.hpp
+++ /dev/null
@@ -1,161 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#ifndef HAMMER_SUBKMER_SORTER_HPP
-#define HAMMER_SUBKMER_SORTER_HPP
-
-#include "kmer_stat.hpp"
-#include "kmer_data.hpp"
-#include "io/mmapped_reader.hpp"
-
-#include "logger/logger.hpp"
-#include "sequence/seq.hpp"
-
-#include <iostream>
-#include <vector>
-
-class ConcurrentDSU;
-
-typedef Seq<(hammer::K + 1) / 2, uint32_t> SubKMer;
-
-template<class Reader>
-inline void binary_read(Reader &is, SubKMer &s) {
- SubKMer::DataType seq_data[SubKMer::DataSize];
-
- is.read((char*)seq_data, sizeof(seq_data));
-
- s = SubKMer(seq_data);
-}
-
-template<class Writer>
-inline Writer &binary_write(Writer &os, const SubKMer &s) {
- SubKMer::DataType seq_data[SubKMer::DataSize];
- s.copy_data(seq_data);
-
- os.write((char*)seq_data, sizeof(seq_data));
-
- return os;
-}
-
-static_assert(sizeof(SubKMer) == 4, "Too big SubKMer");
-
-class SubKMerPartSerializer{
- size_t from_;
- size_t to_;
-
-public:
- SubKMerPartSerializer(size_t from, size_t to)
- :from_(from), to_(to) { VERIFY(to_ - from_ <= hammer::K); }
-
- SubKMer serialize(hammer::KMer k) const {
- SubKMer res;
- for (size_t i = 0; i < to_ - from_; ++i)
- res.set(i, k[from_ + i]);
-
- return res;
- }
-};
-
-class SubKMerStridedSerializer{
- size_t from_;
- size_t stride_;
-
-public:
- SubKMerStridedSerializer(size_t from, size_t stride)
- :from_(from), stride_(stride) { VERIFY(from_ + stride_ <= hammer::K); }
-
- SubKMer serialize(hammer::KMer k) const {
- SubKMer res;
-
- for (size_t i = from_, j = 0; i < hammer::K; i+= stride_, ++j)
- res.set(j, k[i]);
-
- return res;
- }
-};
-
-template<class Writer,
- class SubKMerSerializer>
-void serialize(Writer &blocks, Writer &kmers,
- const KMerData &data,
- const std::vector<size_t>::iterator *block = NULL, size_t sz = 0,
- const SubKMerSerializer &serializer = SubKMerSerializer()) {
- if (sz == 0)
- sz = data.size();
-
- blocks.write((char*)&sz, sizeof(sz));
- if (block) {
- blocks.write((char*)&**block, sz * sizeof((*block)[0]));
- } else {
- for (size_t i = 0, e = sz; i != e; ++i)
- blocks.write((char*)&i, sizeof(i));
- }
-
- for (size_t i = 0, e = sz; i != e; ++i) {
- size_t idx = (block == NULL ? i : (*block)[i]);
- SubKMer s = serializer.serialize(data.kmer(idx));
- binary_write(kmers, s);
- }
-}
-
-class SubKMerSplitter {
- const std::string bifname_, kifname_;
-
- public:
- SubKMerSplitter(const std::string &bifname, const std::string &kifname)
- : bifname_(bifname), kifname_(kifname) {}
-
- template<class Writer>
- void serialize(Writer &os,
- const std::vector<size_t>::iterator &start,
- size_t sz) {
- os.write((char*)&sz, sizeof(sz));
- os.write((char*)&*start, sz * sizeof(*start));
- }
-
- template<class Reader>
- void deserialize(std::vector<size_t> &blocks,
- std::vector<SubKMer> &kmers,
- Reader &bis, Reader &kis) {
- kmers.clear(); blocks.clear();
-
- size_t sz;
- bis.read((char*)&sz, sizeof(sz));
- blocks.resize(sz);
- bis.read((char*)blocks.data(), sz * sizeof(blocks[0]));
-
- kmers.resize(sz);
- for (size_t i = 0, e = sz; i != e; ++i)
- binary_read(kis, kmers[i]);
- }
-
- template<class Op>
- std::pair<size_t, size_t> split(Op &&op);
-};
-
-class KMerHamClusterer {
- unsigned tau_;
-
- public:
- KMerHamClusterer(unsigned tau)
- : tau_(tau) {}
-
- void cluster(const std::string &prefix, const KMerData &data, ConcurrentDSU &uf);
- private:
- DECL_LOGGER("Hamming Clustering");
-};
-
-class TauOneKMerHamClusterer {
- public:
- TauOneKMerHamClusterer() {}
- void cluster(const std::string &prefix, const KMerData &data, ConcurrentDSU &uf);
- private:
- DECL_LOGGER("tau = 1 Hamming Clustering");
-};
-
-
-#endif // HAMMER_SUBKMER_SORTER_HPP
diff --git a/src/hammer/hammer_tools.cpp b/src/hammer/hammer_tools.cpp
deleted file mode 100644
index 2e5e268..0000000
--- a/src/hammer/hammer_tools.cpp
+++ /dev/null
@@ -1,274 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#include "io/ireadstream.hpp"
-#include "valid_kmer_generator.hpp"
-#include "globals.hpp"
-#include "kmer_data.hpp"
-#include "read_corrector.hpp"
-
-#include "io/mmapped_writer.hpp"
-
-#include <iostream>
-#include <fstream>
-#include <iomanip>
-
-#include "config_struct_hammer.hpp"
-#include "hammer_tools.hpp"
-
-using namespace std;
-
-namespace hammer {
-
-void InitializeSubKMerPositions() {
- std::ostringstream log_sstream;
- log_sstream.str("");
- Globals::subKMerPositions = new std::vector<uint32_t>(cfg::get().general_tau + 2);
- for (uint32_t i=0; i < (uint32_t)(cfg::get().general_tau + 1); ++i) {
- Globals::subKMerPositions->at(i) = (i * K / (cfg::get().general_tau + 1) );
- log_sstream << Globals::subKMerPositions->at(i) << " ";
- }
- Globals::subKMerPositions->at(cfg::get().general_tau + 1) = K;
- INFO("Hamming graph threshold tau=" << cfg::get().general_tau << ", k=" << K << ", subkmer positions = [ " << log_sstream.str() << "]" );
-}
-
-std::string getFilename(const string & dirprefix, const string & suffix) {
- std::ostringstream tmp;
- tmp.str(""); tmp << dirprefix.data() << "/" << suffix.data();
- return tmp.str();
-}
-
-string getFilename(const string & dirprefix, unsigned iter_count, const string & suffix ) {
- ostringstream tmp;
- tmp.str(""); tmp << dirprefix.data() << "/" << std::setfill('0') << std::setw(2) << iter_count << "." << suffix.data();
- return tmp.str();
-}
-
-string getReadsFilename(const std::string & dirprefix, const std::string &fname, unsigned iter_no, const std::string & suffix) {
- ostringstream tmp;
- tmp.str("");
-
- tmp << dirprefix.data() << "/" << path::basename(fname) << '.' << std::setfill('0') << std::setw(2) << iter_no << "." << suffix.data();
- return tmp.str();
-}
-
-string getFilename( const string & dirprefix, const string & suffix, int suffix_num ) {
- ostringstream tmp;
- tmp.str(""); tmp << dirprefix.data() << "/" << suffix.data() << "." << suffix_num;
- return tmp.str();
-}
-
-string getFilename( const string & dirprefix, int iter_count, const string & suffix, int suffix_num ) {
- ostringstream tmp;
- tmp.str(""); tmp << dirprefix.data() << "/" << std::setfill('0') << std::setw(2) << iter_count << "." << suffix.data() << "." << suffix_num;
- return tmp.str();
-}
-
-string getFilename( const string & dirprefix, int iter_count, const string & suffix, int suffix_num, const string & suffix2 ) {
- ostringstream tmp;
- tmp.str(""); tmp << dirprefix.data() << "/" << std::setfill('0') << std::setw(2) << iter_count << "." << suffix.data() << "." << suffix_num << "." << suffix2.data();
- return tmp.str();
-}
-
-void CorrectReadsBatch(std::vector<bool> &res,
- std::vector<Read> &reads, size_t buf_size,
- size_t &changedReads, size_t &changedNucleotides, size_t &uncorrectedNucleotides, size_t &totalNucleotides,
- const KMerData &data) {
- unsigned correct_nthreads = min(cfg::get().correct_nthreads, cfg::get().general_max_nthreads);
- bool discard_singletons = cfg::get().bayes_discard_only_singletons;
- bool correct_threshold = cfg::get().correct_use_threshold;
- bool discard_bad = cfg::get().correct_discard_bad;
-
- ReadCorrector corrector(data, cfg::get().correct_stats);
-# pragma omp parallel for shared(reads, res, data) num_threads(correct_nthreads)
- for (size_t i = 0; i < buf_size; ++i) {
- if (reads[i].size() >= K) {
- res[i] =
- corrector.CorrectOneRead(reads[i],
- correct_threshold, discard_singletons, discard_bad);
- } else
- res[i] = false;
- }
-
- changedReads += corrector.changed_reads();
- changedNucleotides += corrector.changed_nucleotides();
- uncorrectedNucleotides += corrector.uncorrected_nucleotides();
- totalNucleotides += corrector.total_nucleotides();
-}
-
-void CorrectReadFile(const KMerData &data,
- size_t &changedReads, size_t &changedNucleotides, size_t &uncorrectedNucleotides, size_t &totalNucleotides,
- const std::string &fname,
- std::ofstream *outf_good, std::ofstream *outf_bad) {
- int qvoffset = cfg::get().input_qvoffset;
- int trim_quality = cfg::get().input_trim_quality;
-
- unsigned correct_nthreads = min(cfg::get().correct_nthreads, cfg::get().general_max_nthreads);
- size_t read_buffer_size = correct_nthreads * cfg::get().correct_readbuffer;
- std::vector<Read> reads(read_buffer_size);
- std::vector<bool> res(read_buffer_size, false);
-
- ireadstream irs(fname, qvoffset);
- VERIFY(irs.is_open());
-
- unsigned buffer_no = 0;
- while (!irs.eof()) {
- size_t buf_size = 0;
- for (; buf_size < read_buffer_size && !irs.eof(); ++buf_size) {
- irs >> reads[buf_size];
- reads[buf_size].trimNsAndBadQuality(trim_quality);
- }
- INFO("Prepared batch " << buffer_no << " of " << buf_size << " reads.");
-
- CorrectReadsBatch(res, reads, buf_size,
- changedReads, changedNucleotides, uncorrectedNucleotides, totalNucleotides,
- data);
-
- INFO("Processed batch " << buffer_no);
- for (size_t i = 0; i < buf_size; ++i) {
- reads[i].print(*(res[i] ? outf_good : outf_bad), qvoffset);
- }
- INFO("Written batch " << buffer_no);
- ++buffer_no;
- }
-}
-
-void CorrectPairedReadFiles(const KMerData &data,
- size_t &changedReads, size_t &changedNucleotides, size_t &uncorrectedNucleotides, size_t &totalNucleotides,
- const std::string &fnamel, const std::string &fnamer,
- ofstream * ofbadl, ofstream * ofcorl, ofstream * ofbadr, ofstream * ofcorr, ofstream * ofunp) {
- int qvoffset = cfg::get().input_qvoffset;
- int trim_quality = cfg::get().input_trim_quality;
-
- unsigned correct_nthreads = min(cfg::get().correct_nthreads, cfg::get().general_max_nthreads);
- size_t read_buffer_size = correct_nthreads * cfg::get().correct_readbuffer;
- std::vector<Read> l(read_buffer_size);
- std::vector<Read> r(read_buffer_size);
- std::vector<bool> left_res(read_buffer_size, false);
- std::vector<bool> right_res(read_buffer_size, false);
-
- unsigned buffer_no = 0;
-
- ireadstream irsl(fnamel, qvoffset), irsr(fnamer, qvoffset);
- VERIFY(irsl.is_open()); VERIFY(irsr.is_open());
-
- while (!irsl.eof() && !irsr.eof()) {
- size_t buf_size = 0;
- for (; buf_size < read_buffer_size && !irsl.eof() && !irsr.eof(); ++buf_size) {
- irsl >> l[buf_size]; irsr >> r[buf_size];
- l[buf_size].trimNsAndBadQuality(trim_quality);
- r[buf_size].trimNsAndBadQuality(trim_quality);
- }
- INFO("Prepared batch " << buffer_no << " of " << buf_size << " reads.");
-
- CorrectReadsBatch(left_res, l, buf_size,
- changedReads, changedNucleotides, uncorrectedNucleotides, totalNucleotides,
- data);
- CorrectReadsBatch(right_res, r, buf_size,
- changedReads, changedNucleotides, uncorrectedNucleotides, totalNucleotides,
- data);
-
- INFO("Processed batch " << buffer_no);
- for (size_t i = 0; i < buf_size; ++i) {
- if (left_res[i] && right_res[i]) {
- l[i].print(*ofcorl, qvoffset);
- r[i].print(*ofcorr, qvoffset);
- } else {
- l[i].print(*(left_res[i] ? ofunp : ofbadl), qvoffset);
- r[i].print(*(right_res[i] ? ofunp : ofbadr), qvoffset);
- }
- }
- INFO("Written batch " << buffer_no);
- ++buffer_no;
- }
-}
-
-std::string getLargestPrefix(const std::string &str1, const std::string &str2) {
- string substr = "";
- for (size_t i = 0; i != str1.size() && i != str2.size(); ++i) {
- if (str1[i] == str2[i])
- substr += str1[i];
- else
- break;
- }
- return substr;
-}
-
-size_t CorrectAllReads() {
- // Correction step: stream the reads from the dataset files and write corrected copies.
- size_t changedReads = 0;
- size_t changedNucleotides = 0;
- size_t uncorrectedNucleotides = 0;
- size_t totalNucleotides = 0;
-
- int correct_nthreads = std::min(cfg::get().correct_nthreads, cfg::get().general_max_nthreads);
-
- INFO("Starting read correction in " << correct_nthreads << " threads.");
-
- const io::DataSet<> &dataset = cfg::get().dataset;
- io::DataSet<> outdataset;
- size_t ilib = 0;
- for (const auto& lib : dataset.libraries()) {
- auto outlib = lib;
- outlib.clear();
-
- size_t iread = 0;
- for (auto I = lib.paired_begin(), E = lib.paired_end(); I != E; ++I, ++iread) {
- INFO("Correcting pair of reads: " << I->first << " and " << I->second);
- std::string usuffix = std::to_string(ilib) + "_" +
- std::to_string(iread) + ".cor.fastq";
-
- std::string unpaired = getLargestPrefix(I->first, I->second) + "_unpaired.fastq";
-
- std::string outcorl = getReadsFilename(cfg::get().output_dir, I->first, Globals::iteration_no, usuffix);
- std::string outcorr = getReadsFilename(cfg::get().output_dir, I->second, Globals::iteration_no, usuffix);
- std::string outcoru = getReadsFilename(cfg::get().output_dir, unpaired, Globals::iteration_no, usuffix);
-
- std::ofstream ofcorl(outcorl.c_str());
- std::ofstream ofbadl(getReadsFilename(cfg::get().output_dir, I->first, Globals::iteration_no, "bad.fastq").c_str(),
- std::ios::out | std::ios::ate);
- std::ofstream ofcorr(outcorr.c_str());
- std::ofstream ofbadr(getReadsFilename(cfg::get().output_dir, I->second, Globals::iteration_no, "bad.fastq").c_str(),
- std::ios::out | std::ios::ate);
- std::ofstream ofunp (outcoru.c_str());
-
- CorrectPairedReadFiles(*Globals::kmer_data,
- changedReads, changedNucleotides, uncorrectedNucleotides, totalNucleotides,
- I->first, I->second,
- &ofbadl, &ofcorl, &ofbadr, &ofcorr, &ofunp);
- outlib.push_back_paired(outcorl, outcorr);
- outlib.push_back_single(outcoru);
- }
-
- for (auto I = lib.single_begin(), E = lib.single_end(); I != E; ++I, ++iread) {
- INFO("Correcting single reads: " << *I);
- std::string usuffix = std::to_string(ilib) + "_" +
- std::to_string(iread) + ".cor.fastq";
-
- std::string outcor = getReadsFilename(cfg::get().output_dir, *I, Globals::iteration_no, usuffix);
- std::ofstream ofgood(outcor.c_str());
- std::ofstream ofbad(getReadsFilename(cfg::get().output_dir, *I, Globals::iteration_no, "bad.fastq").c_str(),
- std::ios::out | std::ios::ate);
-
- CorrectReadFile(*Globals::kmer_data,
- changedReads, changedNucleotides, uncorrectedNucleotides, totalNucleotides,
- *I,
- &ofgood, &ofbad);
- outlib.push_back_single(outcor);
- }
- outdataset.push_back(outlib);
- ilib += 1;
- }
-
- cfg::get_writable().dataset = outdataset;
-
- INFO("Correction done. Changed " << changedNucleotides << " bases in " << changedReads << " reads.");
- INFO("Failed to correct " << uncorrectedNucleotides << " bases out of " << totalNucleotides << ".");
- return changedReads;
-}
-
-};
diff --git a/src/hammer/hammer_tools.hpp b/src/hammer/hammer_tools.hpp
deleted file mode 100644
index d08fa5f..0000000
--- a/src/hammer/hammer_tools.hpp
+++ /dev/null
@@ -1,57 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#ifndef HAMMER_TOOLS_HPP
-#define HAMMER_TOOLS_HPP
-
-#include <unordered_map>
-#include <algorithm>
-#include <stdexcept>
-#include <iomanip>
-#include <fstream>
-#include "io/read.hpp"
-#include "io/ireadstream.hpp"
-#include "sequence/seq.hpp"
-#include "globals.hpp"
-#include "kmer_stat.hpp"
-#include "io/mmapped_reader.hpp"
-
-namespace hammer {
-
-/// initialize subkmer positions and log about it
-void InitializeSubKMerPositions();
-
-/// parallel correction of batch of reads
-void CorrectReadsBatch(std::vector<bool> &res, std::vector<Read> &reads, size_t buf_size,
- size_t &changedReads, size_t &changedNucleotides, size_t &uncorrectedNucleotides, size_t &totalNucleotides,
- const KMerData &data);
-
-/// correct reads in a given file
-void CorrectReadFile(const KMerData &data,
- size_t &changedReads, size_t &changedNucleotides, size_t &uncorrectedNucleotides, size_t &totalNucleotides,
- const std::string &fname,
- std::ofstream *outf_good, std::ofstream *outf_bad);
-
-/// correct reads in a given pair of files
-void CorrectPairedReadFiles(const KMerData &data,
- size_t &changedReads, size_t &changedNucleotides, size_t &uncorrectedNucleotides, size_t &totalNucleotides,
- const std::string &fnamel, const std::string &fnamer,
- std::ofstream * ofbadl, std::ofstream * ofcorl, std::ofstream * ofbadr, std::ofstream * ofcorr, std::ofstream * ofunp);
-/// correct all reads
-size_t CorrectAllReads();
-
-std::string getFilename(const std::string & dirprefix, const std::string & suffix );
-std::string getFilename(const std::string & dirprefix, unsigned iter_count, const std::string & suffix );
-std::string getFilename(const std::string & dirprefix, int iter_count, const std::string & suffix, int suffix_num );
-std::string getFilename(const std::string & dirprefix, int iter_count, const std::string & suffix, int suffix_num, const std::string & suffix2 );
-std::string getFilename(const std::string & dirprefix, const std::string & suffix, int suffix_num );
-std::string getReadsFilename(const std::string & dirprefix, const std::string &fname, unsigned iter_no, const std::string & suffix);
-};
-
-
-
-#endif
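
All of the filename helpers declared above compose the same kind of path: output directory, read-file basename, a zero-padded two-digit iteration number, and a suffix. A minimal sketch of that scheme (taking the basename as a plain string instead of using the project's path::basename; the function name is illustrative):

    #include <iomanip>
    #include <sstream>
    #include <string>

    // e.g. make_reads_filename("corrected", "frag_1.fastq", 3, "cor.fastq")
    //      yields "corrected/frag_1.fastq.03.cor.fastq"
    std::string make_reads_filename(const std::string &dirprefix,
                                    const std::string &basename,
                                    unsigned iter_no,
                                    const std::string &suffix) {
        std::ostringstream tmp;
        tmp << dirprefix << "/" << basename << '.'
            << std::setfill('0') << std::setw(2) << iter_no << '.' << suffix;
        return tmp.str();
    }
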
diff --git a/src/hammer/kmer_cluster.cpp b/src/hammer/kmer_cluster.cpp
deleted file mode 100644
index 83ec886..0000000
--- a/src/hammer/kmer_cluster.cpp
+++ /dev/null
@@ -1,656 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#include "io/ireadstream.hpp"
-#include "hammer_tools.hpp"
-#include "hamcluster.hpp"
-#include "kmer_cluster.hpp"
-#include "config_struct_hammer.hpp"
-
-#include <boost/numeric/ublas/io.hpp>
-#include <boost/numeric/ublas/matrix_proxy.hpp>
-#include <boost/numeric/ublas/symmetric.hpp>
-
-#include <iostream>
-#include <fstream>
-#include <algorithm>
-
-using std::max_element;
-using std::min_element;
-
-namespace numeric = boost::numeric::ublas;
-
-using namespace hammer;
-
-std::string KMerClustering::GetGoodKMersFname() const {
- // FIXME: This is ugly!
- std::ostringstream tmp;
- tmp.str("");
- tmp << workdir_ << "/" << "kmers.solid";
-
- return tmp.str();
-}
-
-std::string KMerClustering::GetBadKMersFname() const {
- // FIXME: This is ugly!
- std::ostringstream tmp;
- tmp.str("");
- tmp << workdir_ << "/" << "kmers.bad";
-
- return tmp.str();
-}
-
-static hammer::ExpandedSeq ConsensusWithMask(const std::vector<hammer::ExpandedKMer> &kmers,
- const std::vector<size_t> &mask, size_t maskVal) {
- size_t block_size = kmers.size();
-
- // consensus of a single string is trivial
- if (block_size == 1)
- return kmers[0].seq();
-
- uint64_t scores[4*K] = {0};
- for (size_t j = 0; j < block_size; ++j) {
- if (mask[j] != maskVal)
- continue;
-
- const ExpandedSeq &kmer = kmers[j].seq();
-
- for (unsigned i = 0; i < K; ++i)
- scores[4*i + kmer[i]] += kmers[j].count();
- }
-
- hammer::ExpandedSeq res;
- for (unsigned i = 0; i < K; ++i)
- res[i] = (char)(std::max_element(scores + 4*i, scores + 4*i + 4) - (scores + 4*i));
-
- return res;
-}
-
-static hammer::ExpandedSeq Consensus(const std::vector<hammer::ExpandedKMer> &kmers) {
- size_t block_size = kmers.size();
-
- // consensus of a single string is trivial
- if (block_size == 1)
- return kmers[0].seq();
-
- uint64_t scores[4*K] = {0};
- for (size_t j = 0; j < block_size; ++j) {
- const ExpandedSeq &kmer = kmers[j].seq();
-
- for (unsigned i = 0; i < K; ++i)
- scores[4*i + kmer[i]] += kmers[j].count();
- }
-
- hammer::ExpandedSeq res;
- for (unsigned i = 0; i < K; ++i)
- res[i] = (char)(std::max_element(scores + 4*i, scores + 4*i + 4) - (scores + 4*i));
-
- return res;
-}
-
-double KMerClustering::ClusterBIC(const std::vector<Center> ¢ers,
- const std::vector<size_t> &indices, const std::vector<hammer::ExpandedKMer> &kmers) const {
- size_t block_size = indices.size();
- size_t clusters = centers.size();
- if (block_size == 0)
- return -std::numeric_limits<double>::infinity();
- assert(centers.size() > 0);
-
- double loglik = 0;
- unsigned total = 0;
- for (size_t i = 0; i < block_size; ++i) {
- loglik += kmers[i].count()*kmers[i].logL(centers[indices[i]].center_);
- total += kmers[i].count();
- }
-
- size_t nparams = (clusters - 1) + clusters*K + 2*clusters*K;
-
- if (cfg::get().bayes_debug_output > 1) {
-# pragma omp critical
- {
- std::cout << " logL: " << loglik << ", clusters: " << clusters << ", nparams: " << nparams << ", N: " << block_size << std::endl;
- }
- }
-
- return loglik - (double)nparams * log((double)total) / 2.0;
-}
-
-
-double KMerClustering::lMeansClustering(unsigned l, const std::vector<hammer::ExpandedKMer> &kmers,
- std::vector<size_t> &indices, std::vector<Center> ¢ers) {
- centers.resize(l); // there are l centers
-
- // if l==1 then clustering is trivial
- if (l == 1) {
- centers[0].center_ = Consensus(kmers);
- centers[0].count_ = kmers.size();
- for (size_t i = 0; i < kmers.size(); ++i)
- indices[i] = 0;
- return ClusterBIC(centers, indices, kmers);
- }
-
- // Provide the initial approximation.
- double totalLikelihood = 0.0;
- if (cfg::get().bayes_initial_refine) {
- // Refine the current approximation
- centers[l-1].center_ = kmers[l-1].seq();
- for (size_t i = 0; i < kmers.size(); ++i) {
- size_t cidx = indices[i];
- unsigned cdist = kmers[i].hamdist(centers[cidx].center_, K);
- unsigned mdist = kmers[i].hamdist(centers[l-1].center_, cdist);
- if (mdist < cdist) {
- indices[i] = l - 1;
- cidx = l - 1;
- }
- totalLikelihood += kmers[i].logL(centers[cidx].center_);
- }
- } else {
- // We assume that kmers are sorted wrt the count.
- for (size_t j = 0; j < l; ++j)
- centers[j].center_ = kmers[j].seq();
-
- for (size_t i = 0; i < kmers.size(); ++i) {
- unsigned mdist = K;
- unsigned cidx = 0;
- for (unsigned j = 0; j < l; ++j) {
- unsigned cdist = kmers[i].hamdist(centers[j].center_, mdist);
- if (cdist < mdist) {
- mdist = cdist;
- cidx = j;
- }
- }
- indices[i] = cidx;
- totalLikelihood += kmers[i].logL(centers[cidx].center_);
- }
- }
-
- if (cfg::get().bayes_debug_output > 1) {
-# pragma omp critical
- {
- std::cout << " centers:\n";
- for (size_t i=0; i < centers.size(); ++i) {
- std::cout << " " << centers[i].center_ << "\n";
- }
- }
- }
-
- // Main loop
- bool changed = true, improved = true;
-
- // auxiliary variables
- std::vector<size_t> dists(l);
- std::vector<double> loglike(l);
- std::vector<bool> changedCenter(l);
-
- while (changed && improved) {
- // fill everything with zeros
- changed = false;
- std::fill(changedCenter.begin(), changedCenter.end(), false);
- for (unsigned j = 0; j < l; ++j)
- centers[j].count_ = 0;
-
- double curlik = 0;
-
- // E step: find which clusters we belong to
- for (size_t i = 0; i < kmers.size(); ++i) {
- size_t newInd = 0;
- if (cfg::get().bayes_use_hamming_dist) {
- for (unsigned j = 0; j < l; ++j)
- dists[j] = kmers[i].hamdist(centers[j].center_);
-
- newInd = std::min_element(dists.begin(), dists.end()) - dists.begin();
- } else {
- for (unsigned j = 0; j < l; ++j)
- loglike[j] = kmers[i].logL(centers[j].center_);
- newInd = std::max_element(loglike.begin(), loglike.end()) - loglike.begin();
- }
-
- curlik += loglike[newInd];
- if (indices[i] != newInd) {
- changed = true;
- changedCenter[indices[i]] = true;
- changedCenter[newInd] = true;
- indices[i] = newInd;
- }
- ++centers[indices[i]].count_;
- }
-
- if (cfg::get().bayes_debug_output > 1) {
-# pragma omp critical
- {
- std::cout << " total likelihood=" << curlik << " as compared to previous " << totalLikelihood << std::endl;
- }
- }
- improved = (curlik > totalLikelihood);
- if (improved)
- totalLikelihood = curlik;
-
- // M step: find new cluster centers
- for (unsigned j=0; j < l; ++j) {
- if (!changedCenter[j])
- continue; // nothing has changed
-
- centers[j].center_ = ConsensusWithMask(kmers, indices, j);
- }
- }
-
- // last M step
- for (unsigned j=0; j < l; ++j)
- centers[j].center_ = ConsensusWithMask(kmers, indices, j);
-
- if (cfg::get().bayes_debug_output > 1) {
-# pragma omp critical
- {
- std::cout << " final centers:\n";
- for (size_t i=0; i < centers.size(); ++i) {
- std::cout << " " << centers[i].center_ << "\n";
- }
- }
- }
-
- return ClusterBIC(centers, indices, kmers);
-}
-
-
-size_t KMerClustering::SubClusterSingle(const std::vector<size_t> & block, std::vector< std::vector<size_t> > & vec) {
- size_t newkmers = 0;
-
- if (cfg::get().bayes_debug_output > 0) {
-# pragma omp critical
- {
- std::cout << " kmers:\n";
- for (size_t i = 0; i < block.size(); i++) {
- std::cout << data_.kmer(block[i]) << '\n';
- }
- }
- }
-
- size_t origBlockSize = block.size();
- if (origBlockSize == 0) return 0;
-
- // Ad-hoc max cluster limit: we only consider those k-mers whose
- // multiplicity is within 10x of the maximum multiplicity.
- size_t maxcls = 0;
- size_t cntthr = std::max(10u, data_[block[0]].count() / 10);
- for (size_t i = 0; i < block.size(); ++i)
- maxcls += (data_[block[i]].count() > cntthr);
- // Another limit: we're interested in good centers only
- size_t maxgcnt = 0;
- for (size_t i = 0; i < block.size(); ++i) {
- float center_quality = 1 - data_[block[i]].total_qual;
- if ((center_quality > cfg::get().bayes_singleton_threshold) ||
- (cfg::get().correct_use_threshold && center_quality > cfg::get().correct_threshold))
- maxgcnt += 1;
- }
-
- maxcls = std::min(maxcls, maxgcnt) + 1;
- if (cfg::get().bayes_debug_output > 0) {
- #pragma omp critical
- {
- std::cout << "\nClustering an interesting block. Maximum # of clusters estimated: " << maxcls << std::endl;
- }
- }
-
- // Prepare the expanded k-mer structure
- std::vector<hammer::ExpandedKMer> kmers;
- for (size_t idx : block)
- kmers.emplace_back(data_.kmer(idx), data_[idx]);
-
- double bestLikelihood = -std::numeric_limits<double>::infinity();
- std::vector<Center> bestCenters;
- std::vector<size_t> indices(origBlockSize);
- std::vector<size_t> bestIndices(origBlockSize);
-
- unsigned max_l = cfg::get().bayes_hammer_mode ? 1 : (unsigned) origBlockSize;
- std::vector<Center> centers;
- for (unsigned l = 1; l <= max_l; ++l) {
- double curLikelihood = lMeansClustering(l, kmers, indices, centers);
- if (cfg::get().bayes_debug_output > 0) {
- #pragma omp critical
- {
- std::cout << " indices: ";
- for (uint32_t i = 0; i < origBlockSize; i++) std::cout << indices[i] << " ";
- std::cout << "\n";
- std::cout << " likelihood with " << l << " clusters is " << curLikelihood << std::endl;
- }
- }
- if (curLikelihood > bestLikelihood) {
- bestLikelihood = curLikelihood;
- bestCenters = centers; bestIndices = indices;
- } else if (l >= maxcls)
- break;
- }
-
- // find if centers are in clusters
- std::vector<size_t> centersInCluster(bestCenters.size(), -1u);
- for (unsigned i = 0; i < origBlockSize; i++) {
- unsigned dist = kmers[i].hamdist(bestCenters[bestIndices[i]].center_);
- if (dist == 0)
- centersInCluster[bestIndices[i]] = i;
- }
-
- if (cfg::get().bayes_debug_output > 0) {
-# pragma omp critical
- {
- std::cout << "Centers: \n";
- for (size_t k=0; k<bestCenters.size(); ++k) {
- std::cout << " " << std::setw(4) << bestCenters[k].count_ << ": ";
- if (centersInCluster[k] != -1u) {
- const KMerStat &kms = data_[block[centersInCluster[k]]];
- std::cout << kms << " " << std::setw(8) << block[centersInCluster[k]] << " ";
- } else {
- std::cout << bestCenters[k].center_;
- }
- std::cout << '\n';
- }
- std::cout << "The entire block:" << std::endl;
- for (uint32_t i = 0; i < origBlockSize; i++) {
- const KMerStat &kms = data_[block[i]];
- std::cout << " " << kms << " " << std::setw(8) << block[i] << " ";
- for (uint32_t j=0; j<K; ++j) std::cout << std::setw(3) << (unsigned)getQual(kms, j) << " "; std::cout << "\n";
- }
- std::cout << std::endl;
- }
- }
-
- // It may happen that the consensus string of one subcluster occurs in other subclusters;
- // we need to check for that.
- bool foundBadCenter = true;
- while (foundBadCenter) {
- foundBadCenter = false;
- for (size_t k=0; k<bestCenters.size(); ++k) {
- if (foundBadCenter) break; // restart if found one bad center
- if (bestCenters[k].count_ == 0) continue;
- if (centersInCluster[k] != -1u) continue;
- for (size_t s = 0; s< bestCenters.size(); ++s) {
- if (s == k || centersInCluster[s] == -1u) continue;
- unsigned dist = hamdist(bestCenters[k].center_, bestCenters[s].center_);
- if (dist == 0) {
- // OK, that's the situation, cluster k should be added to cluster s
- for (uint32_t i = 0; i < origBlockSize; i++) {
- if (indices[i] == k) {
- indices[i] = (unsigned)s;
- bestCenters[s].count_++;
- }
- }
- bestCenters[k].count_ = 0; // it will be skipped now
- foundBadCenter = true;
- break;
- }
- }
- }
- }
-
- if (cfg::get().bayes_debug_output > 0 && origBlockSize > 2) {
- #pragma omp critical
- {
- std::cout << "\nAfter the check we got centers: \n";
- for (size_t k=0; k<bestCenters.size(); ++k) {
- std::cout << " " << bestCenters[k].center_ << " (" << bestCenters[k].count_ << ")";
- if (centersInCluster[k] != -1u) std::cout << block[centersInCluster[k]];
- std::cout << "\n";
- }
- std::cout << std::endl;
- }
- }
-
- for (size_t k = 0; k < bestCenters.size(); ++k) {
- if (bestCenters[k].count_ == 0)
- continue; // superfluous cluster with zero elements
-
- std::vector<size_t> v;
- if (bestCenters[k].count_ == 1) {
- for (size_t i = 0; i < origBlockSize; i++) {
- if (indices[i] == k) {
- v.push_back(block[i]);
- break;
- }
- }
- } else { // there are several kmers in this cluster
- for (size_t i = 0; i < origBlockSize; i++) {
- if (bestIndices[i] == k) {
- if (centersInCluster[k] == i) {
- v.insert(v.begin(), block[i]);
- } else {
- v.push_back(block[i]);
- }
- }
- }
-
- if (centersInCluster[k] == -1u) {
- unsigned new_idx = 0;
- #pragma omp critical
- {
- KMer newkmer(bestCenters[k].center_);
-
- KMerStat kms(0 /* cnt */, 1.0 /* total quality */, NULL /*quality */);
- kms.mark_good();
- new_idx = (unsigned)data_.push_back(newkmer, kms);
- if (data_.kmer(data_.seq_idx(newkmer)) != newkmer)
- newkmers += 1;
- }
- v.insert(v.begin(), new_idx);
- }
- }
- vec.push_back(v);
- }
-
- return newkmers;
-}
-
-static void UpdateErrors(numeric::matrix<uint64_t> &m,
- const KMer k, const KMer kc) {
- for (unsigned i = 0; i < K; ++i) {
- m(kc[i], k[i]) += 1;
- }
-}
-
-size_t KMerClustering::ProcessCluster(const std::vector<size_t> &cur_class,
- numeric::matrix<uint64_t> &errs,
- std::ofstream &ofs, std::ofstream &ofs_bad,
- size_t &gsingl, size_t &tsingl, size_t &tcsingl, size_t &gcsingl,
- size_t &tcls, size_t &gcls, size_t &tkmers, size_t &tncls) {
- size_t newkmers = 0;
-
- // No need for clustering for singletons
- if (cur_class.size() == 1) {
- size_t idx = cur_class[0];
- KMerStat &singl = data_[idx];
- if ((1-singl.total_qual) > cfg::get().bayes_singleton_threshold) {
- singl.mark_good();
- gsingl += 1;
-
- if (ofs.good()) {
-# pragma omp critical
- {
- ofs << " good singleton: " << idx << "\n " << singl << '\n';
- }
- }
- } else {
- if (cfg::get().correct_use_threshold && (1-singl.total_qual) > cfg::get().correct_threshold)
- singl.mark_good();
- else
- singl.mark_bad();
-
- if (ofs_bad.good()) {
-# pragma omp critical
- {
- ofs_bad << " bad singleton: " << idx << "\n " << singl << '\n';
- }
- }
- }
- tsingl += 1;
- return 0;
- }
-
- std::vector<std::vector<size_t> > blocksInPlace;
- if (cfg::get().bayes_debug_output) {
-# pragma omp critical
- {
- std::cout << "process_SIN with size=" << cur_class.size() << std::endl;
- }
- }
- newkmers += SubClusterSingle(cur_class, blocksInPlace);
-
- tncls += 1;
- for (size_t m = 0; m < blocksInPlace.size(); ++m) {
- const std::vector<size_t> ¤tBlock = blocksInPlace[m];
- if (currentBlock.size() == 0)
- continue;
-
- size_t cidx = currentBlock[0];
- KMerStat ¢er = data_[cidx];
- KMer ckmer = data_.kmer(cidx);
- double center_quality = 1 - center.total_qual;
-
- // Computing the overall quality of a cluster.
- double cluster_quality = 1;
- if (currentBlock.size() > 1) {
- for (size_t j = 1; j < currentBlock.size(); ++j)
- cluster_quality *= data_[currentBlock[j]].total_qual;
-
- cluster_quality = 1-cluster_quality;
- }
-
- if (currentBlock.size() == 1)
- tcsingl += 1;
- else
- tcls += 1;
-
- if ((center_quality > cfg::get().bayes_singleton_threshold &&
- cluster_quality > cfg::get().bayes_nonsingleton_threshold) ||
- cfg::get().bayes_hammer_mode) {
- center.mark_good();
-
- if (currentBlock.size() == 1)
- gcsingl += 1;
- else
- gcls += 1;
-
- if (ofs.good()) {
-# pragma omp critical
- {
- ofs << " center of good cluster (" << currentBlock.size() << ", " << cluster_quality << ")" << "\n "
- << center << '\n';
- }
- }
- } else {
- if (cfg::get().correct_use_threshold && center_quality > cfg::get().correct_threshold)
- center.mark_good();
- else
- center.mark_bad();
- if (ofs_bad.good()) {
-# pragma omp critical
- {
- ofs_bad << " center of bad cluster (" << currentBlock.size() << ", " << cluster_quality << ")" << "\n "
- << center << '\n';
- }
- }
- }
-
- tkmers += currentBlock.size();
-
- for (size_t j = 1; j < currentBlock.size(); ++j) {
- size_t eidx = currentBlock[j];
- KMerStat &kms = data_[eidx];
-
- UpdateErrors(errs, data_.kmer(eidx), ckmer);
-
- if (ofs_bad.good()) {
-# pragma omp critical
- {
- ofs_bad << " part of cluster (" << currentBlock.size() << ", " << cluster_quality << ")" << "\n "
- << kms << '\n';
- }
- }
- }
- }
-
- return newkmers;
-}
-
-
-class KMerStatCountComparator {
- const KMerData &data_;
-public:
- KMerStatCountComparator(const KMerData &data)
- : data_(data) {}
- bool operator()(size_t a, size_t b) {
- return data_[a].count() > data_[b].count();
- }
-};
-
-void KMerClustering::process(const std::string &Prefix) {
- size_t newkmers = 0;
- size_t gsingl = 0, tsingl = 0, tcsingl = 0, gcsingl = 0, tcls = 0, gcls = 0, tkmers = 0, tncls = 0;
-
- std::ofstream ofs, ofs_bad;
- if (cfg::get().bayes_write_solid_kmers)
- ofs.open(GetGoodKMersFname());
- if (cfg::get().bayes_write_bad_kmers)
- ofs_bad.open(GetBadKMersFname());
-
- // Open and read index file
- MMappedRecordReader<size_t> findex(Prefix + ".idx", /* unlink */ !debug_, -1ULL);
-
- std::vector<numeric::matrix<uint64_t> > errs(nthreads_, numeric::matrix<double>(4, 4, 0.0));
-
-# pragma omp parallel for shared(ofs, ofs_bad, errs) num_threads(nthreads_) schedule(guided) reduction(+:newkmers, gsingl, tsingl, tcsingl, gcsingl, tcls, gcls, tkmers, tncls)
- for (size_t chunk = 0; chunk < nthreads_ * nthreads_; ++chunk) {
- size_t *current = findex.data() + findex.size() * chunk / nthreads_ / nthreads_;
- size_t *next = findex.data() + findex.size() * (chunk + 1)/ nthreads_ / nthreads_;
- std::ifstream is(Prefix, std::ios::in | std::ios::binary);
-
- // Calculate how much we need to seek
- size_t soff = 0;
- for (size_t *csz = findex.data(); csz != current; ++csz)
- soff += *csz;
-
- // Now seek the stream and start processing
- is.seekg(soff * sizeof(size_t));
-
- for (; current != next; ++current) {
- std::vector<size_t> cluster(*current);
- VERIFY(is.good());
- is.read((char*)&cluster[0], *current * sizeof(cluster[0]));
-
- // The underlying code expects classes to be sorted in decreasing count order.
- std::sort(cluster.begin(), cluster.end(), KMerStatCountComparator(data_));
-
- newkmers += ProcessCluster(cluster,
- errs[omp_get_thread_num()],
- ofs, ofs_bad,
- gsingl, tsingl, tcsingl, gcsingl,
- tcls, gcls, tkmers, tncls);
- }
- }
-
- if (!debug_) {
- int res = unlink(Prefix.c_str());
- VERIFY_MSG(res == 0,
- "unlink(2) failed. Reason: " << strerror(errno) << ". Error code: " << errno);
- }
-
- for (unsigned i = 1; i < nthreads_; ++i)
- errs[0] += errs[i];
-
- numeric::matrix<uint64_t> rowsums = prod(errs[0], numeric::scalar_matrix<double>(4, 1, 1));
- numeric::matrix<double> err(4, 4);
- for (unsigned i = 0; i < 4; ++i)
- for (unsigned j = 0; j < 4; ++j)
- err(i, j) = 1.0 * (double)errs[0](i, j) / (double)rowsums(i, 0);
-
- INFO("Subclustering done. Total " << newkmers << " non-read kmers were generated.");
- INFO("Subclustering statistics:");
- INFO(" Total singleton hamming clusters: " << tsingl << ". Among them " << gsingl << " (" << 100.0 * (double)gsingl / (double)tsingl << "%) are good");
- INFO(" Total singleton subclusters: " << tcsingl << ". Among them " << gcsingl << " (" << 100.0 * (double)gcsingl / (double)tcsingl << "%) are good");
- INFO(" Total non-singleton subcluster centers: " << tcls << ". Among them " << gcls << " (" << 100.0 * (double)gcls / (double)tcls << "%) are good");
- INFO(" Average size of non-trivial subcluster: " << 1.0 * (double)tkmers / (double)tcls << " kmers");
- INFO(" Average number of sub-clusters per non-singleton cluster: " << 1.0 * (double)(tcsingl + tcls) / (double)tncls);
- INFO(" Total solid k-mers: " << gsingl + gcsingl + gcls);
- INFO(" Substitution probabilities: " << err);
-}
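
Two of the building blocks above are easy to check in isolation: the count-weighted consensus (a per-position majority vote across a cluster) and the BIC-style score loglik - nparams * log(total) / 2 that decides how many centers are worth keeping. A compact sketch over 0-3 encoded k-mers (not the project's ExpandedKMer/ExpandedSeq types; names are illustrative):

    #include <algorithm>
    #include <cmath>
    #include <cstdint>
    #include <vector>

    // Count-weighted per-position majority vote over equal-length k-mers,
    // each encoded as values 0..3 per position.
    std::vector<uint8_t> consensus(const std::vector<std::vector<uint8_t>> &kmers,
                                   const std::vector<uint64_t> &counts) {
        size_t k = kmers.front().size();   // assumes a non-empty block
        std::vector<uint8_t> res(k);
        for (size_t i = 0; i < k; ++i) {
            uint64_t scores[4] = {0, 0, 0, 0};
            for (size_t j = 0; j < kmers.size(); ++j)
                scores[kmers[j][i]] += counts[j];
            res[i] = (uint8_t)(std::max_element(scores, scores + 4) - scores);
        }
        return res;
    }

    // BIC-style penalized log-likelihood: larger is better when comparing
    // clusterings with different numbers of centers.
    double cluster_bic(double loglik, size_t nparams, uint64_t total_count) {
        return loglik - (double)nparams * std::log((double)total_count) / 2.0;
    }
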
diff --git a/src/hammer/kmer_data.cpp b/src/hammer/kmer_data.cpp
deleted file mode 100644
index 6375e5e..0000000
--- a/src/hammer/kmer_data.cpp
+++ /dev/null
@@ -1,569 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#include "kmer_data.hpp"
-#include "io/read_processor.hpp"
-#include "valid_kmer_generator.hpp"
-
-#include "io/mmapped_writer.hpp"
-#include "io/ireadstream.hpp"
-#include "io/kmer_iterator.hpp"
-#include "config_struct_hammer.hpp"
-
-#include "file_limit.hpp"
-
-#include <libcxx/sort.hpp>
-
-using namespace hammer;
-
-class BufferFiller;
-
-struct KMerComparator {
- bool operator()(const KMer &l, const KMer &r) const {
- for (size_t i = 0; i < KMer::DataSize ; ++i) {
- if (l.data()[i] != r.data()[i]) {
- return (l.data()[i] < r.data()[i]);
- }
- }
-
- return false;
- }
-};
-
-
-class HammerKMerSplitter : public KMerSplitter<hammer::KMer> {
- typedef std::vector<std::vector<KMer> > KMerBuffer;
-
- void DumpBuffers(size_t num_files, size_t nthreads,
- std::vector<KMerBuffer> &buffers,
- const path::files_t &ostreams) const;
-
- public:
- HammerKMerSplitter(std::string &work_dir)
- : KMerSplitter<hammer::KMer>(work_dir, hammer::K) {}
-
- virtual path::files_t Split(size_t num_files);
-
- friend class BufferFiller;
-};
-
-void HammerKMerSplitter::DumpBuffers(size_t num_files, size_t nthreads,
- std::vector<KMerBuffer> &buffers,
- const path::files_t &ostreams) const {
-# pragma omp parallel for num_threads(nthreads)
- for (unsigned k = 0; k < num_files; ++k) {
- size_t sz = 0;
- for (size_t i = 0; i < nthreads; ++i)
- sz += buffers[i][k].size();
-
- if (!sz)
- continue;
-
- std::vector<KMer> SortBuffer;
- SortBuffer.reserve(sz);
- for (size_t i = 0; i < nthreads; ++i) {
- KMerBuffer &entry = buffers[i];
- SortBuffer.insert(SortBuffer.end(), entry[k].begin(), entry[k].end());
- }
- libcxx::sort(SortBuffer.begin(), SortBuffer.end(), KMerComparator());
- auto it = std::unique(SortBuffer.begin(), SortBuffer.end());
-
-# pragma omp critical
- {
- FILE *f = fopen(ostreams[k].c_str(), "ab");
- VERIFY_MSG(f, "Cannot open temporary file to write");
- fwrite(SortBuffer.data(), sizeof(KMer), it - SortBuffer.begin(), f);
- fclose(f);
- }
- }
-
- for (unsigned i = 0; i < nthreads; ++i) {
- for (unsigned j = 0; j < num_files; ++j) {
- buffers[i][j].clear();
- }
- }
-}
-
-
-class BufferFiller {
- std::vector<HammerKMerSplitter::KMerBuffer> &tmp_entries_;
- unsigned num_files_;
- size_t cell_size_;
- size_t processed_;
- const HammerKMerSplitter &splitter_;
-
- public:
- BufferFiller(std::vector<HammerKMerSplitter::KMerBuffer> &tmp_entries, size_t cell_size, const HammerKMerSplitter &splitter):
- tmp_entries_(tmp_entries), num_files_((unsigned)tmp_entries[0].size()), cell_size_(cell_size), processed_(0), splitter_(splitter) {}
-
- size_t processed() const { return processed_; }
-
- bool operator()(const Read &r) {
- int trim_quality = cfg::get().input_trim_quality;
-
- // FIXME: Get rid of this
- Read cr = r;
- size_t sz = cr.trimNsAndBadQuality(trim_quality);
-
- #pragma omp atomic
- processed_ += 1;
-
- if (sz < hammer::K)
- return false;
-
- HammerKMerSplitter::KMerBuffer &entry = tmp_entries_[omp_get_thread_num()];
- ValidKMerGenerator<hammer::K> gen(cr);
- bool stop = false;
- while (gen.HasMore()) {
- KMer seq = gen.kmer();
- size_t idx = splitter_.GetFileNumForSeq(seq, num_files_);
- entry[idx].push_back(seq);
- stop |= entry[idx].size() > cell_size_;
-
- seq = !seq;
- idx = splitter_.GetFileNumForSeq(seq, num_files_);
- entry[idx].push_back(seq);
- stop |= entry[idx].size() > cell_size_;
-
- gen.Next();
- }
-
- return stop;
- }
-};
-
-path::files_t HammerKMerSplitter::Split(size_t num_files) {
- unsigned nthreads = std::min(cfg::get().count_merge_nthreads, cfg::get().general_max_nthreads);
-
- INFO("Splitting kmer instances into " << num_files << " buckets. This might take a while.");
-
- // Determine the set of output files
- path::files_t out;
- for (unsigned i = 0; i < num_files; ++i)
- out.push_back(GetRawKMersFname(i));
-
- size_t file_limit = num_files + 2*nthreads;
- size_t res = limit_file(file_limit);
- if (res < file_limit) {
- WARN("Failed to setup necessary limit for number of open files. The process might crash later on.");
- WARN("Do 'ulimit -n " << file_limit << "' in the console to overcome the limit");
- }
-
- size_t reads_buffer_size = cfg::get().count_split_buffer;
- if (reads_buffer_size == 0) {
- reads_buffer_size = 536870912ull;
- size_t mem_limit = (size_t)((double)(get_free_memory()) / (nthreads * 3));
- INFO("Memory available for splitting buffers: " << (double)mem_limit / 1024.0 / 1024.0 / 1024.0 << " Gb");
- reads_buffer_size = std::min(reads_buffer_size, mem_limit);
- }
- size_t cell_size = reads_buffer_size / (num_files * sizeof(KMer));
- // Set sane minimum cell size
- if (cell_size < 16384)
- cell_size = 16384;
-
- INFO("Using cell size of " << cell_size);
- std::vector<KMerBuffer> tmp_entries(nthreads);
- for (unsigned i = 0; i < nthreads; ++i) {
- KMerBuffer &entry = tmp_entries[i];
- entry.resize(num_files);
- for (unsigned j = 0; j < num_files; ++j) {
- entry[j].reserve((size_t)(1.1 * (double)cell_size));
- }
- }
-
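- // Progress is reported at roughly powers of two: once the number of
- // processed reads exceeds 2^n the count is printed and n is bumped.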
- size_t n = 15;
- BufferFiller filler(tmp_entries, cell_size, *this);
- const auto& dataset = cfg::get().dataset;
- for (auto I = dataset.reads_begin(), E = dataset.reads_end(); I != E; ++I) {
- INFO("Processing " << *I);
- ireadstream irs(*I, cfg::get().input_qvoffset);
- while (!irs.eof()) {
- hammer::ReadProcessor rp(nthreads);
- rp.Run(irs, filler);
- DumpBuffers(num_files, nthreads, tmp_entries, out);
- VERIFY_MSG(rp.read() == rp.processed(), "Queue unbalanced");
-
- if (filler.processed() >> n) {
- INFO("Processed " << filler.processed() << " reads");
- n += 1;
- }
- }
- }
- INFO("Processed " << filler.processed() << " reads");
-
- return out;
-}
-
-static inline void Merge(KMerStat &lhs, const KMerStat &rhs) {
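- // Counts add up, total_qual values multiply and the per-position quality
- // sums accumulate with saturation (see QualBitSet / NibbleString::operator+=).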
- lhs.set_count(lhs.count() + rhs.count());
- lhs.total_qual *= rhs.total_qual;
- lhs.qual += rhs.qual;
-}
-
-static void PushKMer(KMerData &data,
- KMer kmer, const unsigned char *q, double prob) {
- size_t idx = data.checking_seq_idx(kmer);
- if (idx == -1ULL)
- return;
- KMerStat &kmc = data[idx];
- kmc.lock();
- Merge(kmc,
- KMerStat(1, (float)prob, q));
- kmc.unlock();
-}
-
-static void PushKMerRC(KMerData &data,
- KMer kmer, const unsigned char *q, double prob) {
- unsigned char rcq[K];
-
- // Prepare RC kmer with quality.
- kmer = !kmer;
- for (unsigned i = 0; i < K; ++i)
- rcq[K - i - 1] = q[i];
-
- size_t idx = data.checking_seq_idx(kmer);
- if (idx == -1ULL)
- return;
- KMerStat &kmc = data[idx];
- kmc.lock();
- Merge(kmc,
- KMerStat(1, (float)prob, rcq));
- kmc.unlock();
-}
-
-class KMerDataFiller {
- KMerData &data_;
-
- public:
- KMerDataFiller(KMerData &data)
- : data_(data) {}
-
- bool operator()(const Read &r) {
- int trim_quality = cfg::get().input_trim_quality;
-
- // FIXME: Get rid of this
- Read cr = r;
- size_t sz = cr.trimNsAndBadQuality(trim_quality);
-
- if (sz < hammer::K)
- return false;
-
- ValidKMerGenerator<hammer::K> gen(cr);
- const char *q = cr.getQualityString().data();
- while (gen.HasMore()) {
- KMer kmer = gen.kmer();
- const unsigned char *kq = (const unsigned char*)(q + gen.pos() - 1);
-
- PushKMer(data_, kmer, kq, 1 - gen.correct_probability());
- PushKMerRC(data_, kmer, kq, 1 - gen.correct_probability());
-
- gen.Next();
- }
-
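- // Unlike BufferFiller, nothing is buffered per bucket here, so there is no
- // need to signal a stop.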
- return false;
- }
-};
-
-class KMerMultiplicityCounter {
- KMerData &data_;
- uint64_t *cnt_;
-
- void IncCount(const hammer::KMer &k) {
- size_t idx = data_.seq_idx(k);
- size_t block = idx * 2 / (8 * sizeof(uint64_t)), pos = (idx * 2) % (8 * sizeof(uint64_t));
- size_t mask = 3ull << pos;
-
- if (__sync_fetch_and_or(cnt_ + block, 1ull << pos) & mask)
- __sync_fetch_and_or(cnt_ + block, 2ull << pos);
- }
-
- public:
- KMerMultiplicityCounter(KMerData &data)
- : data_(data) {
- size_t blocks = (2 * data.size()) / (8 * sizeof(uint64_t)) + 1;
- cnt_ = new uint64_t[blocks];
- memset(cnt_, 0, blocks * sizeof(uint64_t));
- }
- ~KMerMultiplicityCounter() { delete[] cnt_; }
-
-
- bool operator()(const Read &r) {
- int trim_quality = cfg::get().input_trim_quality;
-
- // FIXME: Get rid of this
- Read cr = r;
- size_t sz = cr.trimNsAndBadQuality(trim_quality);
-
- if (sz < hammer::K)
- return false;
-
- ValidKMerGenerator<hammer::K> gen(cr);
- while (gen.HasMore()) {
- KMer kmer = gen.kmer();
-
- IncCount(kmer);
- IncCount(!kmer);
-
- gen.Next();
- }
-
- return false;
- }
-
- size_t count(size_t idx) const {
- size_t block = idx * 2 / (8 * sizeof(uint64_t)), pos = idx * 2 % (8 * sizeof(uint64_t));
- return (cnt_[block] >> pos) & 3;
- }
-};
-
-class NonSingletonKMerSplitter : public KMerSplitter<hammer::KMer> {
- typedef std::vector<std::vector<KMer> > KMerBuffer;
-
- std::pair<size_t, size_t>
- FillBufferFromStream(io::raw_kmer_iterator<hammer::KMer> &it,
- KMerBuffer &entry,
- size_t cell_size, size_t num_files) {
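- // Stream k-mers from the final k-mer file, drop the ones the multiplicity
- // counter saw only once and bucket the remaining k-mers for writing.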
- size_t processed = 0, non_singleton = 0;
- for ( ; it.good(); ++it) {
- hammer::KMer seq(hammer::K, *it);
-
- size_t kidx = data_.seq_idx(seq);
- size_t cnt = counter_.count(kidx);
-
- processed += 1;
-
- if (cnt == 1)
- continue;
-
- non_singleton += 1;
-
- size_t idx = this->GetFileNumForSeq(seq, (unsigned)num_files);
- entry[idx].push_back(seq);
-
-
- if (entry[idx].size() > cell_size)
- break;
- }
- return std::make_pair(processed, non_singleton);
- }
-
- void DumpBuffers(size_t num_files, size_t nthreads,
- std::vector<KMerBuffer> &buffers,
- const path::files_t &ostreams) const {
-# pragma omp parallel for num_threads(nthreads)
- for (unsigned k = 0; k < num_files; ++k) {
- size_t sz = 0;
- for (size_t i = 0; i < nthreads; ++i)
- sz += buffers[i][k].size();
-
- if (!sz)
- continue;
-
- std::vector<KMer> SortBuffer;
- SortBuffer.reserve(sz);
- for (size_t i = 0; i < nthreads; ++i) {
- KMerBuffer &entry = buffers[i];
- SortBuffer.insert(SortBuffer.end(), entry[k].begin(), entry[k].end());
- }
- libcxx::sort(SortBuffer.begin(), SortBuffer.end(), KMerComparator());
- auto it = std::unique(SortBuffer.begin(), SortBuffer.end());
-
-# pragma omp critical
- {
- FILE *f = fopen(ostreams[k].c_str(), "ab");
- VERIFY_MSG(f, "Cannot open temporary file to write");
- fwrite(SortBuffer.data(), sizeof(KMer), it - SortBuffer.begin(), f);
- fclose(f);
- }
- }
-
- for (unsigned i = 0; i < nthreads; ++i) {
- for (unsigned j = 0; j < num_files; ++j) {
- buffers[i][j].clear();
- }
- }
- }
-
- public:
- NonSingletonKMerSplitter(std::string &work_dir,
- const std::string &final_kmers,
- const KMerData &data,
- const KMerMultiplicityCounter &counter)
- : KMerSplitter<hammer::KMer>(work_dir, hammer::K), final_kmers_(final_kmers), data_(data), counter_(counter){}
-
- virtual path::files_t Split(size_t num_files) {
- unsigned nthreads = std::min(cfg::get().count_merge_nthreads, cfg::get().general_max_nthreads);
-
- INFO("Splitting kmer instances into " << num_files << " buckets. This might take a while.");
-
- // Determine the set of output files
- path::files_t out;
- for (unsigned i = 0; i < num_files; ++i)
- out.push_back(GetRawKMersFname(i));
-
- size_t file_limit = num_files + 2*nthreads;
- size_t res = limit_file(file_limit);
- if (res < file_limit) {
- WARN("Failed to setup necessary limit for number of open files. The process might crash later on.");
- WARN("Do 'ulimit -n " << file_limit << "' in the console to overcome the limit");
- }
-
- size_t reads_buffer_size = cfg::get().count_split_buffer;
- if (reads_buffer_size == 0) {
- reads_buffer_size = 536870912ull;
- size_t mem_limit = (size_t)((double)(get_free_memory()) / (nthreads * 3));
- INFO("Memory available for splitting buffers: " << (double)mem_limit / 1024.0 / 1024.0 / 1024.0 << " Gb");
- reads_buffer_size = std::min(reads_buffer_size, mem_limit);
- }
- size_t cell_size = reads_buffer_size / (num_files * sizeof(KMer));
- // Set sane minimum cell size
- if (cell_size < 16384)
- cell_size = 16384;
-
- INFO("Using cell size of " << cell_size);
- std::vector<KMerBuffer> tmp_entries(nthreads);
- for (unsigned i = 0; i < nthreads; ++i) {
- KMerBuffer &entry = tmp_entries[i];
- entry.resize(num_files);
- for (unsigned j = 0; j < num_files; ++j) {
- entry[j].reserve((size_t)(1.1 * (double)cell_size));
- }
- }
-
- size_t n = 15;
- size_t total_kmers = 0, non_singletons = 0;
- auto kmers = io::make_kmer_iterator<hammer::KMer>(final_kmers_, hammer::K, nthreads);
- while (std::any_of(kmers.begin(), kmers.end(),
- [](const io::raw_kmer_iterator<hammer::KMer> &it) { return it.good(); })) {
-# pragma omp parallel for num_threads(nthreads) reduction(+ : total_kmers) reduction(+ : non_singletons)
- for (size_t i = 0; i < kmers.size(); ++i) {
- size_t kc, nsc;
- std::tie(kc, nsc) = FillBufferFromStream(kmers[i], tmp_entries[i], cell_size, num_files);
- total_kmers += kc;
- non_singletons += nsc;
- }
-
- DumpBuffers(num_files, nthreads, tmp_entries, out);
- if (total_kmers >> n) {
- INFO("Processed " << total_kmers << " kmers");
- n += 1;
- }
- }
- INFO("Processed " << total_kmers << " kmers");
-
- INFO("Total " << non_singletons << " non-singleton k-mers written");
-
- unlink(final_kmers_.c_str());
-
- return out;
- }
-
- private:
- const std::string final_kmers_;
- const KMerData &data_;
- const KMerMultiplicityCounter &counter_;
-};
-
-void KMerDataCounter::BuildKMerIndex(KMerData &data) {
- // Build the index
- std::string workdir = cfg::get().input_working_dir;
- HammerKMerSplitter splitter(workdir);
- KMerDiskCounter<hammer::KMer> counter(workdir, splitter);
-
- size_t kmers = KMerIndexBuilder<HammerKMerIndex>(workdir, num_files_, omp_get_max_threads()).BuildIndex(data.index_, counter, /* save final */ true);
- std::string final_kmers = counter.GetFinalKMersFname();
- // Optionally perform a filtering step
- if (cfg::get().count_filter_singletons) {
- INFO("Filtering singleton k-mers");
- data.kmers_.set_size(kmers);
- KMerMultiplicityCounter mcounter(data);
-
- const auto& dataset = cfg::get().dataset;
- for (auto I = dataset.reads_begin(), E = dataset.reads_end(); I != E; ++I) {
- INFO("Processing " << *I);
- ireadstream irs(*I, cfg::get().input_qvoffset);
- hammer::ReadProcessor rp(omp_get_max_threads());
- rp.Run(irs, mcounter);
- VERIFY_MSG(rp.read() == rp.processed(), "Queue unbalanced");
- }
-
- size_t singletons = 0;
- for (size_t idx = 0; idx < data.size(); ++idx) {
- size_t cnt = mcounter.count(idx);
- VERIFY(cnt);
- singletons += cnt == 1;
- }
- INFO("There are " << data.size() << " kmers in total. "
- "Among them " << data.size() - singletons << " (" << 100.0 * (double)(data.size() - singletons) / (double)data.size() << "%) are non-singletons.");
-
- NonSingletonKMerSplitter nssplitter(workdir, final_kmers, data, mcounter);
- KMerDiskCounter<hammer::KMer> nscounter(workdir, nssplitter);
- HammerKMerIndex reduced_index;
- kmers = KMerIndexBuilder<HammerKMerIndex>(workdir, num_files_, omp_get_max_threads()).BuildIndex(reduced_index, nscounter, /* save final */ true);
- data.index_.swap(reduced_index);
- final_kmers = nscounter.GetFinalKMersFname();
- }
-
- // Check whether we'll ever have enough memory to run BayesHammer, and bail out early if not
- double needed = 1.25 * (double)kmers * (sizeof(KMerStat) + sizeof(hammer::KMer));
- if (needed > (double) get_memory_limit())
- FATAL_ERROR("The reads contain too many k-mers to fit into available memory. You need approx. "
- << needed / 1024.0 / 1024.0 / 1024.0
- << "GB of free RAM to assemble your dataset");
-
- {
- INFO("Arranging kmers in hash map order");
- data.kmers_.set_size(kmers);
- data.kmers_.set_data(new hammer::KMer::DataType[kmers * hammer::KMer::GetDataSize(hammer::K)]);
-
- unsigned nthreads = std::min(cfg::get().count_merge_nthreads, cfg::get().general_max_nthreads);
- auto kmers_its = io::make_kmer_iterator<hammer::KMer>(final_kmers, hammer::K, 16*nthreads);
-
-# pragma omp parallel for num_threads(nthreads) schedule(guided)
- for (size_t i = 0; i < kmers_its.size(); ++i) {
- auto &kmer_it = kmers_its[i];
- for (; kmer_it.good(); ++kmer_it) {
- size_t kidx = data.index_.seq_idx(hammer::KMer(hammer::K, *kmer_it));
- memcpy(data.kmers_[kidx].data(), *kmer_it, hammer::KMer::TotalBytes);
- }
- }
-
- unlink(counter.GetFinalKMersFname().c_str());
- }
-}
-
-void KMerDataCounter::FillKMerData(KMerData &data) {
- // Now use the index to fill the kmer quality information.
- INFO("Collecting K-mer information, this takes a while.");
- data.data_.resize(data.kmers_.size());
-
- KMerDataFiller filler(data);
- const auto& dataset = cfg::get().dataset;
- for (auto I = dataset.reads_begin(), E = dataset.reads_end(); I != E; ++I) {
- INFO("Processing " << *I);
- ireadstream irs(*I, cfg::get().input_qvoffset);
- hammer::ReadProcessor rp(omp_get_max_threads());
- rp.Run(irs, filler);
- VERIFY_MSG(rp.read() == rp.processed(), "Queue unbalanced");
- }
-
- INFO("Collection done, postprocessing.");
-
- size_t singletons = 0;
- for (size_t i = 0; i < data.size(); ++i) {
- VERIFY(data[i].count());
-
- // Make sure all the kmers are marked as 'Bad' in the beginning
- data[i].mark_bad();
-
- if (data[i].count() == 1)
- singletons += 1;
- }
-
- INFO("There are " << data.size() << " kmers in total. "
- "Among them " << singletons << " (" << 100.0 * (double)singletons / (double)data.size() << "%) are singletons.");
-}
diff --git a/src/hammer/kmer_data.hpp b/src/hammer/kmer_data.hpp
deleted file mode 100644
index 9a20194..0000000
--- a/src/hammer/kmer_data.hpp
+++ /dev/null
@@ -1,141 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#ifndef __HAMMER_KMER_DATA_HPP__
-#define __HAMMER_KMER_DATA_HPP__
-
-#include "kmer_stat.hpp"
-#include "adt/array_vector.hpp"
-#include "mph_index/kmer_index.hpp"
-#include <vector>
-
-typedef KMerIndex<kmer_index_traits<hammer::KMer> > HammerKMerIndex;
-
-class KMerData {
- typedef std::vector<KMerStat> KMerDataStorageType;
- typedef std::vector<hammer::KMer> KMerStorageType;
- typedef kmer_index_traits<hammer::KMer> traits;
-
- public:
- KMerData()
- : kmers_(nullptr, 0, hammer::KMer::GetDataSize(hammer::K)) {}
-
- ~KMerData() { delete[] kmers_.data(); }
-
- size_t size() const { return kmers_.size() + push_back_buffer_.size(); }
-
- void clear() {
- data_.clear();
- push_back_buffer_.clear();
- kmer_push_back_buffer_.clear();
- KMerDataStorageType().swap(data_);
- KMerDataStorageType().swap(push_back_buffer_);
- }
-
- size_t push_back(const hammer::KMer kmer, const KMerStat &k) {
- push_back_buffer_.push_back(k);
- kmer_push_back_buffer_.push_back(kmer);
-
- return data_.size() + push_back_buffer_.size() - 1;
- }
-
- KMerStat& operator[](size_t idx) {
- size_t dsz = data_.size();
- return (idx < dsz ? data_[idx] : push_back_buffer_[idx - dsz]);
- }
- const KMerStat& operator[](size_t idx) const {
- size_t dsz = data_.size();
- return (idx < dsz ? data_[idx] : push_back_buffer_[idx - dsz]);
- }
- hammer::KMer kmer(size_t idx) const {
- if (idx < kmers_.size()) {
- auto it = kmers_.begin() + idx;
- return (traits::raw_create()(hammer::K, *it));
- }
-
- idx -= kmers_.size();
-
- return kmer_push_back_buffer_[idx];
- }
-
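- // The index may map a k-mer it has never seen to an arbitrary slot, so the
- // candidate index is verified by comparing the stored k-mer with the query.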
- size_t checking_seq_idx(hammer::KMer s) const {
- size_t idx = seq_idx(s);
- if (idx >= size())
- return -1ULL;
-
- return (s == kmer(idx) ? idx : -1ULL);
- }
-
- KMerStat& operator[](hammer::KMer s) { return operator[](seq_idx(s)); }
- const KMerStat& operator[](hammer::KMer s) const { return operator[](seq_idx(s)); }
- size_t seq_idx(hammer::KMer s) const { return index_.seq_idx(s); }
-
- template <class Writer>
- void binary_write(Writer &os) {
- size_t sz = data_.size();
- os.write((char*)&sz, sizeof(sz));
- os.write((char*)&data_[0], sz*sizeof(data_[0]));
-
- sz = push_back_buffer_.size();
- os.write((char*)&sz, sizeof(sz));
- os.write((char*)&push_back_buffer_[0], sz*sizeof(push_back_buffer_[0]));
- os.write((char*)&kmer_push_back_buffer_[0], sz*sizeof(kmer_push_back_buffer_[0]));
-
- index_.serialize(os);
- sz = kmers_.size();
- os.write((char*)&sz, sizeof(sz));
- os.write((char*)kmers_.data(), sz * sizeof(hammer::KMer::DataType) * hammer::KMer::GetDataSize(hammer::K));
- }
-
- template <class Reader>
- void binary_read(Reader &is, const std::string &) {
- clear();
-
- size_t sz = 0;
- is.read((char*)&sz, sizeof(sz));
- data_.resize(sz);
- is.read((char*)&data_[0], sz*sizeof(data_[0]));
-
- is.read((char*)&sz, sizeof(sz));
- push_back_buffer_.resize(sz);
- is.read((char*)&push_back_buffer_[0], sz*sizeof(push_back_buffer_[0]));
- kmer_push_back_buffer_.resize(sz);
- is.read((char*)&kmer_push_back_buffer_[0], sz*sizeof(kmer_push_back_buffer_[0]));
-
- index_.deserialize(is);
- is.read((char*)&sz, sizeof(sz));
- kmers_.set_size(sz);
- kmers_.set_data(new hammer::KMer::DataType[sz * hammer::KMer::GetDataSize(hammer::K)]);
- is.read((char*)kmers_.data(), sz * sizeof(hammer::KMer::DataType) * hammer::KMer::GetDataSize(hammer::K));
- }
-
- private:
- array_vector<hammer::KMer::DataType> kmers_;
-
- KMerDataStorageType data_;
- KMerStorageType kmer_push_back_buffer_;
- KMerDataStorageType push_back_buffer_;
- HammerKMerIndex index_;
-
- friend class KMerDataCounter;
-};
-
-class KMerDataCounter {
- unsigned num_files_;
-
- public:
- KMerDataCounter(unsigned num_files) : num_files_(num_files) {}
-
- void BuildKMerIndex(KMerData &data);
- void FillKMerData(KMerData &data);
-
- private:
- DECL_LOGGER("K-mer Counting");
-};
-
-
-#endif
diff --git a/src/hammer/kmer_stat.hpp b/src/hammer/kmer_stat.hpp
deleted file mode 100644
index b45f171..0000000
--- a/src/hammer/kmer_stat.hpp
+++ /dev/null
@@ -1,291 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#ifndef HAMMER_KMERSTAT_HPP_
-#define HAMMER_KMERSTAT_HPP_
-
-#include "verify.hpp"
-
-#include "sequence/seq.hpp"
-
-#include <folly/SmallLocks.h>
-
-#include <functional>
-#include <vector>
-#include <iostream>
-#include <iomanip>
-#include <map>
-#include <string>
-#include <cstdint>
-#include <cmath>
-
-#include <sched.h>
-#include <string.h>
-
-
-namespace hammer {
-const uint32_t K = 21;
-typedef Seq<K> KMer;
-};
-
-class Read;
-struct KMerStat;
-
-static inline unsigned hamdistKMer(const hammer::KMer &x, const hammer::KMer &y,
- unsigned tau = hammer::K) {
- unsigned dist = 0;
- for (unsigned i = 0; i < hammer::K; ++i) {
- if (x[i] != y[i]) {
- ++dist; if (dist > tau) return dist;
- }
- }
- return dist;
-}
-
-template<unsigned N, unsigned bits,
- typename Storage = uint64_t>
-class NibbleString {
- static const unsigned StorageBits = sizeof(Storage) * 8;
- static_assert(bits <= 8, "Too large nibbles");
- static const unsigned K = (bits * N + StorageBits - 1) / StorageBits;
- static const uint64_t MaxValue = (1ull << bits) - 1;
-
- public:
- NibbleString() { storage_.fill(0); }
-
- explicit NibbleString(const uint8_t *data) {
- for (unsigned i = 0; i < N; ++i)
- set(i, data ? data[i] : 0);
- }
-
- void set(size_t n, uint8_t value) {
- // Determine the index of storage element and the offset.
- size_t idx = n * bits / StorageBits, offset = n * bits - idx * StorageBits;
-
- storage_[idx] = (storage_[idx] & ~(MaxValue << offset)) | ((value & MaxValue) << offset);
- // Hard case: the value spans the boundary between storage elements
- if (offset + bits >= StorageBits) {
- size_t rbits = StorageBits - offset;
- uint64_t mask = MaxValue >> rbits;
- uint8_t remaining = uint8_t((value >> rbits) & mask);
-
- storage_[idx + 1] = (storage_[idx + 1] & ~mask) | remaining;
- }
- }
-
- uint8_t operator[](size_t n) const {
- // Determine the index of storage element and the offset.
- size_t idx = n * bits / StorageBits, offset = n * bits - idx * StorageBits;
-
- // Easy case: the value does not cross a storage element boundary
- if (offset + bits < StorageBits) {
- return (storage_[idx] >> offset) & MaxValue;
- }
-
- // Assemble stuff from parts
- size_t rbits = StorageBits - offset;
- uint64_t mask = MaxValue >> rbits;
- return uint8_t((storage_[idx] >> offset) | ((storage_[idx + 1] & mask) << rbits));
- }
-
- NibbleString& operator+=(const uint8_t *data) {
- uint64_t mv = MaxValue;
- for (unsigned i = 0; i < N; ++i)
- set(i, (uint8_t)std::min(mv, (uint64_t)data[i] + operator[](i)));
-
- return *this;
- }
-
- NibbleString& operator+=(const NibbleString &data) {
- uint64_t mv = MaxValue;
- for (unsigned i = 0; i < N; ++i)
- set(i, (uint8_t)std::min(mv, (uint64_t)data[i] + operator[](i)));
-
- return *this;
- }
-
- Storage *data() { return storage_.data(); }
- const Storage *data() const { return storage_.data(); }
-
- private:
- std::array<Storage, K> storage_;
-};
-
-using QualBitSet = NibbleString<hammer::K, 6>;
-
-struct KMerStat {
- KMerStat(uint32_t cnt, float kquality, const unsigned char *quality) : total_qual(kquality), qual(quality) {
- count_with_lock.init(0);
- set_count(cnt);
- mark_bad();
- }
- KMerStat() : total_qual(1.0), qual() {
- count_with_lock.init(0);
- set_count(0);
- mark_bad();
- }
-
- float total_qual;
- folly::PicoSpinLock<uint32_t> count_with_lock;
- QualBitSet qual;
-
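- // set_count() packs the count shifted left by one with the good/bad flag in
- // bit 0; the PicoSpinLock keeps its own lock bit inside the same word.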
- void lock() { count_with_lock.lock(); }
- void unlock() { count_with_lock.unlock(); }
- uint32_t count() const { return count_with_lock.getData() >> 1; }
- void set_count(uint32_t cnt) { count_with_lock.setData((cnt << 1) | good()); }
- bool good() const { return count_with_lock.getData() & 1; }
- void mark_good() {
- uint32_t val = count_with_lock.getData();
- count_with_lock.setData(val | 1);
- }
- void mark_bad() {
- uint32_t val = count_with_lock.getData();
- count_with_lock.setData(val & ~1);
- }
-};
-
-inline
-std::ostream& operator<<(std::ostream &os, const KMerStat &kms) {
- os << /* kms.kmer().str() << */ " (" << std::setw(3) << kms.count() << ", " << std::setprecision(6) << std::setw(8) << (1-kms.total_qual) << ')';
-
- return os;
-}
-
-template<class Writer>
-inline Writer& binary_write(Writer &os, const QualBitSet &qbs) {
- os.write((char*)qbs.data(), sizeof(qbs));
-
- return os;
-}
-
-template<class Reader>
-inline void binary_read(Reader &is, QualBitSet &qbs) {
- is.read((char*)qbs.data(), sizeof(qbs));
-}
-
-template<class Writer>
-inline Writer& binary_write(Writer &os, const KMerStat &k) {
- os.write((char*)&k.count_with_lock, sizeof(k.count_with_lock));
- os.write((char*)&k.total_qual, sizeof(k.total_qual));
- return binary_write(os, k.qual);
-}
-
-template<class Reader>
-inline void binary_read(Reader &is, KMerStat &k) {
- is.read((char*)&k.count_with_lock, sizeof(k.count_with_lock));
- is.read((char*)&k.total_qual, sizeof(k.total_qual));
- binary_read(is, k.qual);
-}
-
-inline unsigned char getQual(const KMerStat & kmc, size_t i) {
- return (unsigned char)kmc.qual[i];
-}
-
-inline double getProb(const KMerStat &kmc, size_t i, bool log);
-inline double getRevProb(const KMerStat &kmc, size_t i, bool log);
-
-namespace hammer {
-typedef std::array<char, hammer::K> ExpandedSeq;
-
-static inline unsigned hamdist(const ExpandedSeq &x, const ExpandedSeq &y,
- unsigned tau = hammer::K) {
- unsigned dist = 0;
- for (unsigned i = 0; i < hammer::K; ++i) {
- if (x[i] != y[i]) {
- ++dist; if (dist > tau) return dist;
- }
- }
- return dist;
-}
-
-class ExpandedKMer {
- public:
- ExpandedKMer(const KMer k, const KMerStat &kmc) {
- for (unsigned i = 0; i < hammer::K; ++i) {
- s_[i] = k[i];
- for (unsigned j = 0; j < 4; ++j)
- lprobs_[4*i + j] = ((char)j != s_[i] ?
- getRevProb(kmc, i, /* log */ true) - log(3) :
- getProb(kmc, i, /* log */ true));
- }
- count_ = kmc.count();
- }
-
- double logL(const ExpandedSeq ¢er) const {
- double res = 0;
- for (unsigned i = 0; i < hammer::K; ++i)
- res += lprobs_[4*i + center[i]];
-
- return res;
- }
-
- double logL(const ExpandedKMer ¢er) const {
- return logL(center.s_);
- }
-
- unsigned hamdist(const ExpandedSeq &k,
- unsigned tau = hammer::K) const {
- unsigned dist = 0;
- for (unsigned i = 0; i < hammer::K; ++i) {
- if (s_[i] != k[i]) {
- ++dist; if (dist > tau) return dist;
- }
- }
-
- return dist;
- }
-
- unsigned hamdist(const ExpandedKMer &k,
- unsigned tau = hammer::K) const {
- return hamdist(k.s_, tau);
- }
-
- double logL(const KMer center) const {
- double res = 0;
- for (unsigned i = 0; i < hammer::K; ++i)
- res += lprobs_[4*i + center[i]];
-
- return res;
- }
-
- unsigned hamdist(const KMer &k,
- unsigned tau = hammer::K) const {
- unsigned dist = 0;
- for (unsigned i = 0; i < hammer::K; ++i) {
- if (s_[i] != k[i]) {
- ++dist; if (dist > tau) return dist;
- }
- }
-
- return dist;
- }
-
- uint32_t count() const {
- return count_;
- }
-
- ExpandedSeq seq() const {
- return s_;
- }
-
- private:
- double lprobs_[4*hammer::K];
- uint32_t count_;
- ExpandedSeq s_;
-};
-
-inline
-std::ostream& operator<<(std::ostream &os, const ExpandedSeq &seq) {
- for (auto s : seq)
- os << nucl(s);
-
- return os;
-}
-
-};
-
-#endif // HAMMER_KMERSTAT_HPP_
diff --git a/src/hammer/main.cpp b/src/hammer/main.cpp
deleted file mode 100644
index 3574989..0000000
--- a/src/hammer/main.cpp
+++ /dev/null
@@ -1,291 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-/*
- * main.cpp
- *
- * Created on: 08.07.2011
- * Author: snikolenko
- */
-
-
-#include "config_struct_hammer.hpp"
-#include "hammer_tools.hpp"
-#include "kmer_cluster.hpp"
-#include "globals.hpp"
-#include "kmer_data.hpp"
-#include "expander.hpp"
-
-#include "adt/concurrent_dsu.hpp"
-#include "segfault_handler.hpp"
-#include "io/read_processor.hpp"
-#include "io/ireadstream.hpp"
-
-#include "memory_limit.hpp"
-
-#include "logger/logger.hpp"
-#include "logger/log_writers.hpp"
-
-#include "version.hpp"
-
-#include <yaml-cpp/yaml.h>
-
-#include <algorithm>
-#include <iostream>
-#include <fstream>
-#include <sstream>
-#include <string>
-#include <vector>
-
-#include <cassert>
-#include <cmath>
-#include <cstdlib>
-
-std::vector<uint32_t> * Globals::subKMerPositions = NULL;
-KMerData *Globals::kmer_data = NULL;
-int Globals::iteration_no = 0;
-
-char Globals::char_offset = 0;
-bool Globals::char_offset_user = true;
-
-double Globals::quality_probs[256] = { 0 };
-double Globals::quality_lprobs[256] = { 0 };
-double Globals::quality_rprobs[256] = { 0 };
-double Globals::quality_lrprobs[256] = { 0 };
-
-struct UfCmp {
- bool operator()(const std::vector<int> &lhs, const std::vector<int> &rhs) {
- return (lhs[0] < rhs[0]);
- }
-};
-
-void create_console_logger() {
- using namespace logging;
-
- logger *lg = create_logger("");
- lg->add_writer(std::make_shared<console_writer>());
- attach_logger(lg);
-}
-
-int main(int argc, char * argv[]) {
- segfault_handler sh;
-
- srand(42);
- srandom(42);
-
- try {
- create_console_logger();
-
- std::string config_file = CONFIG_FILENAME;
- if (argc > 1) config_file = argv[1];
- INFO("Starting BayesHammer, built from " SPADES_GIT_REFSPEC ", git revision " SPADES_GIT_SHA1);
- INFO("Loading config from " << config_file.c_str());
- cfg::create_instance(config_file);
-
- // hard memory limit
- const size_t GB = 1 << 30;
- limit_memory(cfg::get().general_hard_memory_limit * GB);
-
- // determine quality offset if not specified
- if (!cfg::get().input_qvoffset_opt) {
- INFO("Trying to determine PHRED offset");
- int determined_offset = determine_offset(*cfg::get().dataset.reads_begin());
- if (determined_offset < 0) {
- ERROR("Failed to determine offset! Specify it manually and restart, please!");
- return -1;
- } else {
- INFO("Determined value is " << determined_offset);
- cfg::get_writable().input_qvoffset = determined_offset;
- }
- Globals::char_offset_user = false;
- } else {
- cfg::get_writable().input_qvoffset = *cfg::get().input_qvoffset_opt;
- Globals::char_offset_user = true;
- }
- Globals::char_offset = (char)cfg::get().input_qvoffset;
-
- // Pre-cache quality probabilities
- for (unsigned qual = 0; qual < sizeof(Globals::quality_probs) / sizeof(Globals::quality_probs[0]); ++qual) {
- Globals::quality_rprobs[qual] = (qual < 3 ? 0.75 : pow(10.0, -(int)qual / 10.0));
- Globals::quality_probs[qual] = 1 - Globals::quality_rprobs[qual];
- Globals::quality_lprobs[qual] = log(Globals::quality_probs[qual]);
- Globals::quality_lrprobs[qual] = log(Globals::quality_rprobs[qual]);
- }
-
- // initialize subkmer positions
- hammer::InitializeSubKMerPositions();
-
- INFO("Size of aux. kmer data " << sizeof(KMerStat) << " bytes");
-
- int max_iterations = cfg::get().general_max_iterations;
-
- // now we can begin the iterations
- for (Globals::iteration_no = 0; Globals::iteration_no < max_iterations; ++Globals::iteration_no) {
- std::cout << "\n === ITERATION " << Globals::iteration_no << " begins ===" << std::endl;
- bool do_everything = cfg::get().general_do_everything_after_first_iteration && (Globals::iteration_no > 0);
-
- // initialize k-mer structures
- Globals::kmer_data = new KMerData;
-
- // count k-mers
- if (cfg::get().count_do || do_everything) {
- KMerDataCounter(cfg::get().count_numfiles).BuildKMerIndex(*Globals::kmer_data);
-
- if (cfg::get().general_debug) {
- INFO("Debug mode on. Dumping K-mer index");
- std::string fname = hammer::getFilename(cfg::get().input_working_dir, Globals::iteration_no, "kmer.index");
- std::ofstream os(fname.c_str(), std::ios::binary);
- Globals::kmer_data->binary_write(os);
- }
- } else {
- INFO("Reading K-mer index");
- std::string fname = hammer::getFilename(cfg::get().input_working_dir, Globals::iteration_no, "kmer.index");
- std::ifstream is(fname.c_str(), std::ios::binary);
- VERIFY(is.good());
- Globals::kmer_data->binary_read(is, fname);
- }
-
- // Cluster the Hamming graph
- std::vector<std::vector<size_t> > classes;
- if (cfg::get().hamming_do || do_everything) {
- ConcurrentDSU uf(Globals::kmer_data->size());
- std::string ham_prefix = hammer::getFilename(cfg::get().input_working_dir, Globals::iteration_no, "kmers.hamcls");
- INFO("Clustering Hamming graph.");
- if (cfg::get().general_tau > 1) {
- KMerHamClusterer(cfg::get().general_tau).cluster(ham_prefix, *Globals::kmer_data, uf);
- } else {
- TauOneKMerHamClusterer().cluster(ham_prefix, *Globals::kmer_data, uf);
- }
-
- INFO("Extracting clusters");
- size_t num_classes = uf.extract_to_file(hammer::getFilename(cfg::get().input_working_dir, Globals::iteration_no, "kmers.hamming"));
-
-#if 0
- std::sort(classes.begin(), classes.end(), UfCmp());
- for (size_t i = 0; i < classes.size(); ++i) {
- std::cerr << i << ": { ";
- for (size_t j = 0; j < classes[i].size(); ++j)
- std::cerr << classes[i][j] << ", ";
- std::cerr << "}" << std::endl;
- }
-#endif
- INFO("Clustering done. Total clusters: " << num_classes);
- }
-
- if (cfg::get().bayes_do || do_everything) {
- KMerDataCounter(cfg::get().count_numfiles).FillKMerData(*Globals::kmer_data);
-
- INFO("Subclustering Hamming graph");
- unsigned clustering_nthreads = std::min(cfg::get().general_max_nthreads, cfg::get().bayes_nthreads);
- KMerClustering kmc(*Globals::kmer_data, clustering_nthreads,
- cfg::get().input_working_dir, cfg::get().general_debug);
- kmc.process(hammer::getFilename(cfg::get().input_working_dir, Globals::iteration_no, "kmers.hamming"));
- INFO("Finished clustering.");
-
- if (cfg::get().general_debug) {
- INFO("Debug mode on. Dumping K-mer index");
- std::string fname = hammer::getFilename(cfg::get().input_working_dir, Globals::iteration_no, "kmer.index2");
- std::ofstream os(fname.c_str(), std::ios::binary);
- Globals::kmer_data->binary_write(os);
- }
- } else {
- INFO("Reading K-mer index");
- std::string fname = hammer::getFilename(cfg::get().input_working_dir, Globals::iteration_no, "kmer.index2");
- std::ifstream is(fname.c_str(), std::ios::binary);
- VERIFY(is.good());
- Globals::kmer_data->binary_read(is, fname);
- }
-
- // expand the set of solid k-mers
- if (cfg::get().expand_do || do_everything) {
- unsigned expand_nthreads = std::min(cfg::get().general_max_nthreads, cfg::get().expand_nthreads);
- INFO("Starting solid k-mers expansion in " << expand_nthreads << " threads.");
- for (unsigned expand_iter_no = 0; expand_iter_no < cfg::get().expand_max_iterations; ++expand_iter_no) {
- Expander expander(*Globals::kmer_data);
- const io::DataSet<> &dataset = cfg::get().dataset;
- for (auto I = dataset.reads_begin(), E = dataset.reads_end(); I != E; ++I) {
- ireadstream irs(*I, cfg::get().input_qvoffset);
- hammer::ReadProcessor rp(expand_nthreads);
- rp.Run(irs, expander);
- VERIFY_MSG(rp.read() == rp.processed(), "Queue unbalanced");
- }
-
- if (cfg::get().expand_write_each_iteration) {
- std::ofstream oftmp(hammer::getFilename(cfg::get().input_working_dir, Globals::iteration_no, "goodkmers", expand_iter_no).data());
- for (size_t n = 0; n < Globals::kmer_data->size(); ++n) {
- const KMerStat &kmer_data = (*Globals::kmer_data)[n];
- if (kmer_data.good())
- oftmp << Globals::kmer_data->kmer(n).str() << "\n>" << n
- << " cnt=" << kmer_data.count() << " tql=" << (1-kmer_data.total_qual) << "\n";
- }
- }
-
- INFO("Solid k-mers iteration " << expand_iter_no << " produced " << expander.changed() << " new k-mers.");
- if (expander.changed() < 10)
- break;
- }
- INFO("Solid k-mers finalized");
-
- if (cfg::get().general_debug) {
- INFO("Debug mode on. Dumping K-mer index");
- std::string fname = hammer::getFilename(cfg::get().input_working_dir, Globals::iteration_no, "kmer.index3");
- std::ofstream os(fname.c_str(), std::ios::binary);
- Globals::kmer_data->binary_write(os);
- }
- } else {
- INFO("Reading K-mer index");
- std::string fname = hammer::getFilename(cfg::get().input_working_dir, Globals::iteration_no, "kmer.index3");
- std::ifstream is(fname.c_str(), std::ios::binary);
- VERIFY(is.good());
- Globals::kmer_data->binary_read(is, fname);
- }
-
- size_t totalReads = 0;
- // reconstruct and output the reads
- if (cfg::get().correct_do || do_everything) {
- totalReads = hammer::CorrectAllReads();
- }
-
- // prepare the reads for next iteration
- delete Globals::kmer_data;
-
- if (totalReads < 1) {
- INFO("Too few reads have changed in this iteration. Exiting.");
- break;
- }
- // break;
- }
-
- std::string fname = hammer::getFilename(cfg::get().output_dir, "corrected.yaml");
- INFO("Saving corrected dataset description to " << fname);
- cfg::get().dataset.save(fname);
-
- // clean up
- Globals::subKMerPositions->clear();
- delete Globals::subKMerPositions;
-
- INFO("All done. Exiting.");
- } catch (std::bad_alloc const& e) {
- std::cerr << "Not enough memory to run BayesHammer. " << e.what() << std::endl;
- return EINTR;
- } catch (const YAML::Exception &e) {
- std::cerr << "Error reading config file: " << e.what() << std::endl;
- return EINTR;
- } catch (std::exception const& e) {
- std::cerr << "Exception caught " << e.what() << std::endl;
- return EINTR;
- } catch (const std::string& ex) {
- std::cerr << "Exception caught: " << ex << std::endl;
- } catch (const char* s) {
- std::cerr << "Exception caught: " << s << std::endl;
- } catch (...) {
- std::cerr << "Unknown exception caught " << std::endl;
- return EINTR;
- }
-
- return 0;
-}
diff --git a/src/hammer/parallel_radix_sort.hpp b/src/hammer/parallel_radix_sort.hpp
deleted file mode 100644
index 1ed08ae..0000000
--- a/src/hammer/parallel_radix_sort.hpp
+++ /dev/null
@@ -1,592 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-// Copyright 2010, Takuya Akiba
-// All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// * Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-// * Redistributions in binary form must reproduce the above
-// copyright notice, this list of conditions and the following disclaimer
-// in the documentation and/or other materials provided with the
-// distribution.
-// * Neither the name of Takuya Akiba nor the names of its
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-#ifndef PARALLEL_RADIX_SORT_H_
-#define PARALLEL_RADIX_SORT_H_
-
-#include "openmp_wrapper.h"
-
-#include <stdint.h>
-#include <cstring>
-#include <cassert>
-#include <climits>
-#include <algorithm>
-#include <utility>
-
-namespace parallel_radix_sort {
-
-namespace internal {
-// Size of the software-managed output buffer
-const size_t kOutBufferSize = 32;
-
-// The algorithm is implemented in this internal class
-template<typename PlainType, typename UnsignedType, typename Encoder,
- typename ValueManager, int Base>
-class ParallelRadixSortInternal {
-public:
- ParallelRadixSortInternal();
- ~ParallelRadixSortInternal();
-
- void Init(size_t max_elems, int max_threads);
-
- PlainType *Sort(PlainType *data, size_t num_elems, int num_threads,
- ValueManager *value_manager);
-
- static void InitAndSort(PlainType *data, size_t num_elems, int num_threads,
- ValueManager *value_manager);
-private:
- size_t max_elems_;
- int max_threads_;
-
- UnsignedType *tmp_;
- size_t **histo_;
- UnsignedType ***out_buf_;
- size_t **out_buf_n_;
-
- int num_threads_;
- size_t *pos_bgn_, *pos_end_;
- ValueManager *value_manager_;
-
- void DeleteAll();
-
- UnsignedType *SortInternal(UnsignedType *data, size_t num_elems,
- int num_threads, ValueManager *value_manager);
-
- // Compute |pos_bgn_| and |pos_end_| (the range assigned to each thread)
- void ComputeRanges(size_t num_elems);
-
- // First step of each iteration of sorting
- // Compute the histogram of |src| using bits in [b, b + Base)
- void ComputeHistogram(int b, UnsignedType *src);
-
- // Second step of each iteration of sorting
- // Scatter elements of |src| to |dst| using the histogram
- void Scatter(int b, UnsignedType *src, UnsignedType *dst);
-};
-
-template<typename PlainType, typename UnsignedType, typename Encoder,
- typename ValueManager, int Base>
-ParallelRadixSortInternal<PlainType, UnsignedType, Encoder, ValueManager, Base>
-::ParallelRadixSortInternal()
- : max_elems_(0), max_threads_(0), tmp_(NULL), histo_(NULL),
- out_buf_(NULL), out_buf_n_(NULL), pos_bgn_(NULL), pos_end_(NULL) {
- assert(sizeof(PlainType) == sizeof(UnsignedType));
-}
-
-template<typename PlainType, typename UnsignedType, typename Encoder,
- typename ValueManager, int Base>
-ParallelRadixSortInternal
-<PlainType, UnsignedType, Encoder, ValueManager, Base>
-::~ParallelRadixSortInternal() {
- DeleteAll();
-}
-
-template<typename PlainType, typename UnsignedType, typename Encoder,
- typename ValueManager, int Base>
-void ParallelRadixSortInternal
-<PlainType, UnsignedType, Encoder, ValueManager, Base>
-::DeleteAll() {
- delete [] tmp_;
- tmp_ = NULL;
-
- for (int i = 0; i < max_threads_; ++i) delete [] histo_[i];
- delete [] histo_;
- histo_ = NULL;
-
- for (int i = 0; i < max_threads_; ++i) {
- for (size_t j = 0; j < 1 << Base; ++j) {
- delete [] out_buf_[i][j];
- }
- delete [] out_buf_n_[i];
- delete [] out_buf_[i];
- }
- delete [] out_buf_;
- delete [] out_buf_n_;
- out_buf_ = NULL;
- out_buf_n_ = NULL;
-
- delete [] pos_bgn_;
- delete [] pos_end_;
- pos_bgn_ = pos_end_ = NULL;
-
- max_elems_ = 0;
- max_threads_ = 0;
-}
-
-template<typename PlainType, typename UnsignedType, typename Encoder,
- typename ValueManager, int Base>
-void ParallelRadixSortInternal
-<PlainType, UnsignedType, Encoder, ValueManager, Base>
-::Init(size_t max_elems, int max_threads) {
- DeleteAll();
-
- max_elems_ = max_elems;
-
- if (max_threads == -1) {
- max_threads = omp_get_max_threads();
- }
- assert(max_threads >= 1);
- max_threads_ = max_threads;
-
- tmp_ = new UnsignedType[max_elems];
- histo_ = new size_t*[max_threads];
- for (int i = 0; i < max_threads; ++i) {
- histo_[i] = new size_t[1 << Base];
- }
-
- out_buf_ = new UnsignedType**[max_threads];
- out_buf_n_ = new size_t*[max_threads];
- for (int i = 0; i < max_threads; ++i) {
- out_buf_[i] = new UnsignedType*[1 << Base];
- out_buf_n_[i] = new size_t[1 << Base];
- for (size_t j = 0; j < 1 << Base; ++j) {
- out_buf_[i][j] = new UnsignedType[kOutBufferSize];
- }
- }
-
- pos_bgn_ = new size_t[max_threads];
- pos_end_ = new size_t[max_threads];
-}
-
-template<typename PlainType, typename UnsignedType, typename Encoder,
- typename ValueManager, int Base>
-PlainType *ParallelRadixSortInternal
-<PlainType, UnsignedType, Encoder, ValueManager, Base>
-::Sort(PlainType *data, size_t num_elems,
- int num_threads, ValueManager *value_manager) {
- UnsignedType *src = reinterpret_cast<UnsignedType*>(data);
- UnsignedType *res = SortInternal(src, num_elems, num_threads, value_manager);
- return reinterpret_cast<PlainType*>(res);
-}
-
-template<typename PlainType, typename UnsignedType, typename Encoder,
- typename ValueManager, int Base>
-void ParallelRadixSortInternal
-<PlainType, UnsignedType, Encoder, ValueManager, Base>
-::InitAndSort(PlainType *data, size_t num_elems,
- int num_threads, ValueManager *value_manager) {
- ParallelRadixSortInternal prs;
- prs.Init(num_elems, num_threads);
- const PlainType *res = prs.Sort(data, num_elems, num_threads, value_manager);
- if (res != data) {
- for (size_t i = 0; i < num_elems; ++i) data[i] = res[i];
- }
-}
-
-template<typename PlainType, typename UnsignedType, typename Encoder,
- typename ValueManager, int Base>
-UnsignedType *ParallelRadixSortInternal
-<PlainType, UnsignedType, Encoder, ValueManager, Base>
-::SortInternal(UnsignedType *data, size_t num_elems,
- int num_threads, ValueManager *value_manager) {
- assert(num_elems <= max_elems_);
-
- if (num_threads == -1) {
- num_threads = omp_get_max_threads();
- }
- assert(1 <= num_threads && num_threads <= max_threads_);
- num_threads_ = num_threads;
-
- value_manager_ = value_manager;
-
- // Compute |pos_bgn_| and |pos_end_|
- ComputeRanges(num_elems);
-
- // Iterate from lower bits to higher bits
- const unsigned bits = CHAR_BIT * sizeof(UnsignedType);
- UnsignedType *src = data, *dst = tmp_;
- for (unsigned b = 0; b < bits; b += Base) {
- ComputeHistogram(b, src);
- Scatter(b, src, dst);
-
- std::swap(src, dst);
- value_manager->Next();
- }
-
- return src;
-}
-
-template<typename PlainType, typename UnsignedType, typename Encoder,
- typename ValueManager, int Base>
-void ParallelRadixSortInternal
-<PlainType, UnsignedType, Encoder, ValueManager, Base>
-::ComputeRanges(size_t num_elems) {
- pos_bgn_[0] = 0;
- for (int i = 0; i < num_threads_ - 1; ++i) {
- const size_t t = (num_elems - pos_bgn_[i]) / (num_threads_ - i);
- pos_bgn_[i + 1] = pos_end_[i] = pos_bgn_[i] + t;
- }
- pos_end_[num_threads_ - 1] = num_elems;
-}
-
-template<typename PlainType, typename UnsignedType, typename Encoder,
- typename ValueManager, int Base>
-void ParallelRadixSortInternal
-<PlainType, UnsignedType, Encoder, ValueManager, Base>
-::ComputeHistogram(int b, UnsignedType *src) {
- // Compute local histogram
- #ifdef _OPENMP
- #pragma omp parallel num_threads(num_threads_)
- #endif
- {
- const int my_id = omp_get_thread_num();
- const size_t my_bgn = pos_bgn_[my_id];
- const size_t my_end = pos_end_[my_id];
- size_t *my_histo = histo_[my_id];
-
- memset(my_histo, 0, sizeof(size_t) * (1 << Base));
- for (size_t i = my_bgn; i < my_end; ++i) {
- __builtin_prefetch(src + i + 1, 0, 1);
- size_t t = Encoder::extract(src[i], b, Base);
- ++my_histo[t];
- }
- }
-
- // Compute global histogram
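- // Turn the per-thread bucket counts into exclusive prefix sums so that
- // histo_[j][i] becomes the first write position of thread j within bucket i.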
- size_t s = 0;
- for (size_t i = 0; i < 1 << Base; ++i) {
- for (int j = 0; j < num_threads_; ++j) {
- const size_t t = s + histo_[j][i];
- histo_[j][i] = s;
- s = t;
- }
- }
-}
-
-template<typename PlainType, typename UnsignedType, typename Encoder,
- typename ValueManager, int Base>
-void ParallelRadixSortInternal
-<PlainType, UnsignedType, Encoder, ValueManager, Base>
-::Scatter(int b, UnsignedType *src, UnsignedType *dst) {
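- // Elements are staged in small per-bucket buffers and flushed to |dst| in
- // blocks of kOutBufferSize entries to keep the scattered writes cache-friendly.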
- #ifdef _OPENMP
- #pragma omp parallel num_threads(num_threads_)
- #endif
- {
- const int my_id = omp_get_thread_num();
- const size_t my_bgn = pos_bgn_[my_id];
- const size_t my_end = pos_end_[my_id];
- size_t *my_histo = histo_[my_id];
- UnsignedType **my_buf = out_buf_[my_id];
- size_t *my_buf_n = out_buf_n_[my_id];
-
- memset(my_buf_n, 0, sizeof(size_t) * (1 << Base));
- for (size_t i = my_bgn; i < my_end; ++i) {
- __builtin_prefetch(src + i + 1, 0, 1);
-
- size_t t = Encoder::extract(src[i], b, Base);
- my_buf[t][my_buf_n[t]] = src[i];
- value_manager_->Push(my_id, t, my_buf_n[t], i);
- ++my_buf_n[t];
-
- if (my_buf_n[t] == kOutBufferSize) {
- size_t p = my_histo[t];
- for (size_t j = 0; j < kOutBufferSize; ++j) {
- dst[p++] = my_buf[t][j];
- }
- value_manager_->Flush(my_id, t, kOutBufferSize, my_histo[t]);
-
- my_histo[t] += kOutBufferSize;
- my_buf_n[t] = 0;
- }
- }
-
- // Flush everything
- for (size_t i = 0; i < 1 << Base; ++i) {
- size_t p = my_histo[i];
- for (size_t j = 0; j < my_buf_n[i]; ++j) {
- dst[p++] = my_buf[i][j];
- }
- value_manager_->Flush(my_id, i, my_buf_n[i], my_histo[i]);
- }
- }
-}
-} // namespace internal
-
-// Encoders map signed/unsigned integers and floating-point numbers to
-// unsigned integers whose ordering matches the original numeric order
-namespace encoder {
-class EncoderUnsigned {
-public:
- template<typename UnsignedType>
- inline static size_t extract(const UnsignedType &x, unsigned shift, unsigned Base) {
- return (x >> shift) & ((1 << Base) - 1);
- }
-};
-
-class EncoderSigned {
-public:
- template<typename UnsignedType>
- inline static size_t extract(const UnsignedType &x, unsigned shift, unsigned Base) {
- x = x ^ (UnsignedType(1) << (CHAR_BIT * sizeof(UnsignedType) - 1));
- return (x >> shift) & ((1 << Base) - 1);
- }
-};
-
-class EncoderDecimal {
-public:
- template<typename UnsignedType>
- inline static size_t extract(const UnsignedType &x, unsigned shift, unsigned Base) {
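- // Map the floating-point bit pattern to an unsigned integer that preserves
- // the numeric order: negative values get all bits flipped, non-negative ones
- // only the sign bit.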
- static const int bits = CHAR_BIT * sizeof(UnsignedType);
- const UnsignedType a = x >> (bits - 1);
- const UnsignedType b = (-a) | (UnsignedType(1) << (bits - 1));
- x = x ^ b;
- return (x >> shift) & ((1 << Base) - 1);
- }
-};
-} // namespace encoder
-
-// Value managers are used to generalize the sorting algorithm
-// to sorting of keys and sorting of pairs
-namespace value_manager {
-class DummyValueManager {
-public:
- inline void Push(int thread __attribute__((unused)),
- size_t bucket __attribute__((unused)),
- size_t num __attribute__((unused)),
- size_t from_pos __attribute__((unused))) {}
-
- inline void Flush(int thread __attribute__((unused)),
- size_t bucket __attribute__((unused)),
- size_t num __attribute__((unused)),
- size_t to_pos __attribute__((unused))) {}
-
- void Next() {}
-};
-
-template<typename ValueType, int Base> class PairValueManager {
-public:
- PairValueManager()
- : max_elems_(0), max_threads_(0), original_(NULL), tmp_(NULL),
- src_(NULL), dst_(NULL), out_buf_(NULL) {}
-
- ~PairValueManager() {
- DeleteAll();
- }
-
- void Init(size_t max_elems, int max_threads);
-
- void Start(ValueType *original, size_t num_elems, int num_threads) {
- assert(num_elems <= max_elems_);
- assert(num_threads <= max_threads_);
- src_ = original_ = original;
- dst_ = tmp_;
- }
-
- inline void Push(int thread, size_t bucket, size_t num, size_t from_pos) {
- out_buf_[thread][bucket][num] = src_[from_pos];
- }
-
- inline void Flush(int thread, size_t bucket, size_t num, size_t to_pos) {
- for (size_t i = 0; i < num; ++i) {
- dst_[to_pos++] = out_buf_[thread][bucket][i];
- }
- }
-
- void Next() {
- std::swap(src_, dst_);
- }
-
- ValueType *GetResult() {
- return src_;
- }
-private:
- size_t max_elems_;
- int max_threads_;
-
- static const size_t kOutBufferSize = internal::kOutBufferSize;
- ValueType *original_, *tmp_;
- ValueType *src_, *dst_;
- ValueType ***out_buf_;
-
- void DeleteAll();
-};
-
-template<typename ValueType, int Base>
-void PairValueManager<ValueType, Base>
-::Init(size_t max_elems, int max_threads) {
- if (max_threads == -1) {
- max_threads = omp_get_max_threads();
- }
- assert(max_threads >= 1);
-
- DeleteAll();
-
- max_elems_ = max_elems;
- max_threads_ = max_threads;
-
- tmp_ = new ValueType[max_elems];
-
- out_buf_ = new ValueType**[max_threads];
- for (int i = 0; i < max_threads; ++i) {
- out_buf_[i] = new ValueType*[1 << Base];
- for (size_t j = 0; j < 1 << Base; ++j) {
- out_buf_[i][j] = new ValueType[kOutBufferSize];
- }
- }
-}
-
-template<typename ValueType, int Base>
-void PairValueManager<ValueType, Base>
-::DeleteAll() {
- delete [] tmp_;
- tmp_ = NULL;
-
- for (int i = 0; i < max_threads_; ++i) {
- for (size_t j = 0; j < 1 << Base; ++j) {
- delete [] out_buf_[i][j];
- }
- delete [] out_buf_[i];
- }
- delete [] out_buf_;
- out_buf_ = NULL;
-
- max_elems_ = 0;
- max_threads_ = 0;
-}
-} // namespace value_manager
-
-// Frontend class for sorting keys
-template<typename PlainType, typename UnsignedType = PlainType,
- typename Encoder = encoder::EncoderUnsigned, int Base = 8>
-class KeySort {
- typedef value_manager::DummyValueManager DummyValueManager;
- typedef internal::ParallelRadixSortInternal
- <PlainType, UnsignedType, Encoder, DummyValueManager, Base> Internal;
-
-public:
- // In the following functions, when |max_threads| or |num_threads| is -1,
- // the default value given by OpenMP will be used.
- void Init(size_t max_elems, int max_threads = -1) {
- internal_.Init(max_elems, max_threads);
- }
-
- // Notice that the pointer returned by this
- // does not necessarily equal |data|.
- PlainType *Sort(PlainType *data, size_t num_elems, int num_threads = -1) {
- return internal_.Sort(data, num_elems, num_threads, &dummy_value_manager_);
- }
-
- static void InitAndSort(PlainType *data, size_t num_elems, int num_threads = -1) {
- DummyValueManager dvm;
- Internal::InitAndSort(data, num_elems, num_threads, &dvm);
- }
-private:
- Internal internal_;
- DummyValueManager dummy_value_manager_;
-};
-
-// Frontend class for sorting pairs
-template<typename PlainType, typename ValueType,
- typename UnsignedType = PlainType,
- typename Encoder = encoder::EncoderUnsigned,
- int Base = 8>
-class PairSort {
- typedef value_manager::PairValueManager
- <ValueType, Base> ValueManager;
- typedef internal::ParallelRadixSortInternal
- <PlainType, UnsignedType, Encoder, ValueManager, Base> Internal;
-
-public:
- // In the following functions, when |max_threads| or |num_threads| is -1,
- // the default value given by OpenMP will be used.
- void Init(size_t max_elems, int max_threads = -1) {
- internal_.Init(max_elems, max_threads);
- value_manager_.Init(max_elems, max_threads);
- }
-
- // Notice that the pointers returned by this
- // do not necessarily equal |keys| and |vals|.
- std::pair<PlainType*, ValueType*> Sort(PlainType *keys, ValueType *vals,
- size_t num_elems, int num_threads = -1) {
- value_manager_.Start(vals, num_elems, num_threads);
- PlainType *res_keys = internal_.Sort(keys, num_elems, num_threads, &value_manager_);
- ValueType *res_vals = value_manager_.GetResult();
- return std::make_pair(res_keys, res_vals);
- }
-
- static void InitAndSort(PlainType *keys, ValueType *vals,
- size_t num_elems, int num_threads = -1) {
- ValueManager vm;
- vm.Init(num_elems, num_threads);
- vm.Start(vals, num_elems, num_threads);
- Internal::InitAndSort(keys, num_elems, num_threads, &vm);
- ValueType *res_vals = vm.GetResult();
- if (res_vals != vals) {
- for (size_t i = 0; i < num_elems; ++i) {
- vals[i] = res_vals[i];
- }
- }
- }
-private:
- Internal internal_;
- ValueManager value_manager_;
-};
-
-#define TYPE_CASE(plain_type, unsigned_type, encoder_type) \
- template<> class KeySort<plain_type> \
- : public KeySort<plain_type, unsigned_type, \
- encoder::Encoder ## encoder_type> {}; \
- template<typename V> class PairSort<plain_type, V> \
- : public PairSort<plain_type, V, unsigned_type, \
- encoder::Encoder ## encoder_type> {}; \
-
-// Signed integers
-TYPE_CASE(char, unsigned char, Signed);
-TYPE_CASE(short, unsigned short, Signed);
-TYPE_CASE(int, unsigned int, Signed);
-TYPE_CASE(long, unsigned long, Signed);
-TYPE_CASE(long long, unsigned long long, Signed);
-
-// |signed char| and |char| are treated as different types
-TYPE_CASE(signed char, unsigned char, Signed);
-
-// Floating point numbers
-TYPE_CASE(float, uint32_t, Decimal);
-TYPE_CASE(double, uint64_t, Decimal);
-
-#undef TYPE_CASE
-
-template<typename KeyType>
-void SortKeys(KeyType *data, size_t num_elems, int num_threads = -1) {
- KeySort<KeyType>::InitAndSort(data, num_elems, num_threads);
-}
-
-template<typename KeyType, typename ValueType>
-void SortPairs(KeyType *keys, ValueType *vals, size_t num_elems, int num_threads = -1) {
- PairSort<KeyType, ValueType>::InitAndSort(keys, vals, num_elems, num_threads);
-}
-}; // namespace parallel_radix_sort
-
-#endif // PARALLEL_RADIX_SORT_H_
diff --git a/src/hammer/quake_correct/Read.cpp b/src/hammer/quake_correct/Read.cpp
deleted file mode 100644
index ca95286..0000000
--- a/src/hammer/quake_correct/Read.cpp
+++ /dev/null
@@ -1,824 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#include "Read.h"
-#include "bithash.h"
-#include <iostream>
-#include <math.h>
-#include <algorithm>
-#include <set>
-#include <queue>
-
-#define TESTING false
-
-int bithash::k;
-
-////////////////////////////////////////////////////////////
-// corrections_compare
-//
-// Simple class to compare two corrected_read pointers in the
-// priority queue
-////////////////////////////////////////////////////////////
-class corrections_compare {
-public:
- // corrections_compare() {};
- bool operator() (const corrected_read* lhs, const corrected_read* rhs) const {
- //return lhs->likelihood < rhs->likelihood;
- if(lhs->likelihood < rhs->likelihood)
- return true;
- else if(lhs->likelihood > rhs->likelihood)
- return false;
- else
- return lhs->region_edits > rhs->region_edits;
- }
-};
-
-const float Read::trust_spread_t = .1;
-const float Read::correct_min_t = .000001;
-const float Read::learning_min_t = .00001;
-
-////////////////////////////////////////////////////////////
-// Read (constructor)
-//
-// Make shallow copies of sequence and untrusted, and
-// convert quality value string to array of probabilities
-////////////////////////////////////////////////////////////
-Read::Read(const string & h, const unsigned int* s, const string & q, vector<int> & u, const int rl)
- :untrusted(u) {
-
- header = h;
- read_length = rl;
- trim_length = rl;
- seq = new unsigned int[read_length];
- quals = new unsigned int[read_length];
- prob = new float[read_length];
- for(int i = 0; i < read_length; i++) {
- seq[i] = s[i];
- // quality values of 0,1 lead to p < .25
- quals[i] = q[i] - quality_scale;
- if(quals[i] >= max_qual) {
- cerr << "Quality value " << quals[i] << "larger than maximum allowed quality value " << max_qual << ". Increase the variable 'max_qual' in Read.h." << endl;
- exit(EXIT_FAILURE);
- }
- prob[i] = max(.25, 1.0-pow(10.0,-(quals[i]/10.0)));
- }
- trusted_read = 0;
- global_like = 1.0;
-}
-
-Read::~Read() {
- delete[] seq;
- delete[] quals;
- delete[] prob;
- if(trusted_read != 0)
- delete trusted_read;
-}
-
-////////////////////////////////////////////////////////////
-// trim
-//
-// Trim the end of the read the way BWA does it.
-// Removes affected untrusted k-mers.
-// Returns the trimmed read as a string.
-////////////////////////////////////////////////////////////
-string Read::trim(int t) {
- // find trim index
- int phredq;
- int current_trimfunc = 0;
- int max_trimfunc = 0;
- trim_length = read_length; // already set in constructor but ok
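- // BWA-style: walking back from the 3' end, accumulate (t - phred) per base
- // and trim at the position where this running sum is maximal.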
- for(int i = read_length-1; i >= 0; i--) {
- phredq = floor(.5-10*log(1.0 - prob[i])/log(10));
- current_trimfunc += (t - phredq);
- if(current_trimfunc > max_trimfunc) {
- max_trimfunc = current_trimfunc;
- trim_length = i;
- }
- }
-
- // update untrusted
- for(int i = untrusted.size()-1; i >= 0; i--) {
- if(untrusted[i] > trim_length - bithash::k)
- untrusted.pop_back();
- }
-
- vector<correction> no_cors;
- return print_corrected(no_cors);
-}
-
-
-////////////////////////////////////////////////////////////
-// single_correct
-//
-// Find the set of corrections with maximum likelihood
-// that result in all trusted kmers.
-//
-// Assumes a short read, so it is obsolete.
-////////////////////////////////////////////////////////////
-/*
-bool Read::single_correct(bithash *trusted, ofstream & out, double (&ntnt_prob)[4][4], bool learning) {
- if(correct_subset(untrusted, trusted, out, ntnt_prob, learning)) {
- out << header << "\t" << print_seq() << "\t" << print_corrected(trusted_read->corrections) << endl;
- return true;
- } else
- return false;
-}
-*/
-
-////////////////////////////////////////////////////////////
-// correct_cc
-//
-// Find the set of corrections with maximum likelihood that
-// result in all trusted kmers in the region defined by
-// the given untrusted kmer indices.
-//
-// Prints output if the read cannot be corrected,
-// but otherwise abstains.
-//
-// Corrections can be accessed through 'trusted_read'
-//
-// Return codes are:
-// 0: corrected
-// 1: ambiguous
-// 2: full queue or quit early
-// 3: empty queue or empty region
-////////////////////////////////////////////////////////////
-//bool Read::correct_cc(vector<short> region, vector<int> untrusted_subset, bithash *trusted, double (&ntnt_prob)[4][4], double prior_prob[4], bool learning) {
-int Read::correct_cc(vector<short> region, vector<int> untrusted_subset, bithash *trusted, double ntnt_prob[][4][4], double prior_prob[4], bool learning) {
-
- unsigned int max_queue_size = 400000;
-
- /*
- if(header == "@read3") {
- cout << "Untrusted: " << untrusted_subset.size() << endl;
- for(int i = 0; i < untrusted_subset.size(); i++)
- cout << untrusted_subset[i] << " ";
- cout << endl;
- cout << "Region: " << region.size() << endl;
- for(int i = 0; i < region.size(); i++)
- cout << region[i] << " ";
- cout << endl << endl;
- }
- */
-
- ////////////////////////////////////////
- // region
- ////////////////////////////////////////
- // sort by quality
- if(region.size() > 0)
- quality_quicksort(region, 0, region.size()-1);
- else
- // die quietly and try again with bigger region
- return 3;
-
- ////////////////////////////////////////
- // stats
- ////////////////////////////////////////
- unsigned int cpq_adds = 0;
- unsigned int check_count = 0;
- float exp_errors = 0;
- int nt90 = 0;
- int nt99 = 0;
- int non_acgt = 0;
- for(int i = 0; i < region.size(); i++) {
- exp_errors += (1-prob[region[i]]);
- if(prob[region[i]] < .9)
- nt90++;
- if(prob[region[i]] < .99)
- nt99++;
- if(seq[region[i]] >= 4)
- non_acgt++;
- }
-
- ////////////////////////////////////////
- // filter
- ////////////////////////////////////////
- double mylike_t = correct_min_t;
- double myglobal_t = correct_min_t;
- double myspread_t = trust_spread_t;
- if(learning) {
- if(nt99 >= 8 || non_acgt > 1) {
- //out << header << "\t" << print_seq() << "\t." << endl;
- return 2;
- }
- mylike_t = learning_min_t;
- myglobal_t = learning_min_t;
- myspread_t = trust_spread_t / 2.0;
- } else if(nt99 >= 13 || non_acgt > 2) {
- // just quit
- if(TESTING)
- cerr << header << "\t" << print_seq() << "\t." << endl;
- //cerr << header << "\t" << region.size() << "\t" << untrusted_subset.size() << "\t" << nt90 << "\t" << nt99 << "\t" << exp_errors << "\t0\t0\t0\t0" << endl;
- return 2;
-
- } else if(nt99 >= 11) {
- // proceed very cautiously
- if(aggressive)
- mylike_t = .05;
- else
- mylike_t = .1;
-
- } else if(nt99 >= 9) {
- //proceed cautiously
- if(aggressive)
- mylike_t = .001;
- else
- mylike_t = .03;
- }
-
- ////////////////////////////////////////
- // priority queue
- ////////////////////////////////////////
- // data structure for corrected_reads sorted by likelihood
- //priority_queue< corrected_read*, vector<corrected_read*>, corrections_compare > cpq;
- vector<corrected_read*> cpq;
- corrections_compare cpq_comp;
-
- ////////////////////////////////////////
- // initialize
- ////////////////////////////////////////
- corrected_read *cr, *next_cr;
- short edit_i;
- float like;
- bitset<bitsize> bituntrusted;
- for(int i = 0; i < untrusted_subset.size(); i++) {
- if(untrusted_subset[i] >= bitsize) {
- cerr << "These reads must be longer than assumed. Increase the variable 'bitsize' in 'Read.h' to the read length." << endl;
- exit(1);
- } else
- bituntrusted.set(untrusted_subset[i]);
- }
-
- bool cr_added = true; // once an iteration passes w/ no corrected reads added, we can stop
- for(short region_edit = 0; region_edit < region.size() && cr_added; region_edit++) {
- edit_i = region[region_edit];
- cr_added = false;
-
- for(short nt = 0; nt < 4; nt++) {
- if(seq[edit_i] != nt) {
- // P(obs=o|actual=a)*P(actual=a) for Bayes
- if(seq[edit_i] < 4)
- like = (1.0-prob[edit_i]) * ntnt_prob[quals[edit_i]][nt][seq[edit_i]] * prior_prob[nt] / (prob[edit_i] * prior_prob[seq[edit_i]]);
- else
- // non-ACGT
- like = prior_prob[nt] / (1.0/3.0);
-
- // P(actual=a|obs=o)
- //like = (1.0-prob[edit_i]) * ntnt_prob[seq[edit_i]][nt] * / prob[edit_i];
-
- next_cr = new corrected_read(bituntrusted, like, region_edit+1);
- next_cr->corrections.push_back(correction(edit_i, nt));
-
- // add to priority queue
- //cpq.push(next_cr);
- cpq.push_back(next_cr);
- push_heap(cpq.begin(), cpq.end(), cpq_comp);
- cpq_adds++;
- cr_added = true;
- }
- }
- }
-
- ////////////////////////////////////////
- // process corrected reads
- ////////////////////////////////////////
- // initialize likelihood parameters
- trusted_read = 0;
- float trusted_likelihood;
- signed int untrusted_count; // trust me
- bool ambiguous_flag = false;
-
- while(cpq.size() > 0) {
-
- /////////////////////////
- // quit if pq is too big
- /////////////////////////
- if(cpq.size() > max_queue_size) {
- //cout << "queue is too large for " << header << endl;
- if(TESTING)
- cerr << header << "\t" << print_seq() << "\t." << endl;
-
- if(trusted_read != 0) {
- delete trusted_read;
- trusted_read = 0;
- }
- break;
- }
-
- /////////////////////////
- // pop next
- /////////////////////////
- cr = cpq[0];
- pop_heap(cpq.begin(), cpq.end(), cpq_comp);
- cpq.pop_back();
-
- /////////////////////////
- // check likelihood
- /////////////////////////
- if(trusted_read != 0) {
- // if a corrected read exists and this candidate's likelihood is too low, stop searching and keep the existing correction
- if(cr->likelihood < trusted_likelihood*myspread_t) {
- delete cr;
- break;
- }
- } else {
- // if no corrected read exists and the likelihood is too low, stop searching (no correction found)
- if(cr->likelihood < mylike_t || global_like*cr->likelihood < myglobal_t) {
- delete cr;
- break;
- }
- }
-
- /////////////////////////
- // check trust
- /////////////////////////
- // save for later comparison
- untrusted_count = (signed int)cr->untrusted.count();
- if(check_trust(cr, trusted, check_count)) {
- if(trusted_read == 0) {
- // if yes, and first trusted read, save
- trusted_read = cr;
- trusted_likelihood = cr->likelihood;
- } else {
- // if yes, and if trusted read exists
- ambiguous_flag = true;
-
- // output ambiguous corrections for testing
- if(TESTING)
- cerr << header << "\t" << print_seq() << "\t" << print_corrected(trusted_read->corrections) << "\t" << print_corrected(cr->corrections) << endl;
-
- // delete trusted_read, break loop
- delete trusted_read;
- delete cr;
- trusted_read = 0;
- break;
- }
- }
-
- /*
- if(header == "@read3") {
- cout << cr->likelihood << "\t";
- for(int c = 0; c < cr->corrections.size(); c++) {
- cout << " (" << cr->corrections[c].index << "," << cr->corrections[c].to << ")";
- }
- cout << "\t";
- for(int c = 0; c < trim_length-bithash::k+1; c++) {
- if(cr->untrusted[c])
- cout << 1;
- else
- cout << 0;
- }
- cout << endl;
- }
- */
-
- // if the untrusted count sharply increased, just bail; otherwise extend this candidate
- if(((signed int)cr->untrusted.count() - untrusted_count)*3 < bithash::k) {
-
- /////////////////////////
- // add next correction
- /////////////////////////
- bool cr_added = true; // once an iteration passes w/ no corrected reads added, we can stop
- for(short region_edit = cr->region_edits; region_edit < region.size() && cr_added; region_edit++) {
- edit_i = region[region_edit];
- cr_added = false;
-
- // add relatives
- for(short nt = 0; nt < 4; nt++) {
- // if actual edit,
- if(seq[edit_i] != nt) {
- // calculate new likelihood
-
- // P(obs=o|actual=a)*P(actual=a) for Bayes
- if(seq[edit_i] < 4)
- like = cr->likelihood * (1.0-prob[edit_i]) * ntnt_prob[quals[edit_i]][nt][seq[edit_i]] * prior_prob[nt] / (prob[edit_i] * prior_prob[seq[edit_i]]);
- else
- // non-ACGT
- like = cr->likelihood * prior_prob[nt] / (1.0/3.0);
-
- // P(actual=a|obs=o)
- //like = cr->likelihood * (1.0-prob[edit_i]) * ntnt_prob[seq[edit_i]][nt] / prob[edit_i];
-
- // if thresholds ok, add new correction
- if(trusted_read != 0) {
- if(like < trusted_likelihood*myspread_t)
- continue;
- } else {
- // must consider spread or risk missing a case of ambiguity
- if(like < mylike_t*myspread_t || global_like*like < myglobal_t*myspread_t)
- continue;
- }
-
- next_cr = new corrected_read(cr->corrections, cr->untrusted, like, region_edit+1);
- next_cr->corrections.push_back(correction(edit_i, nt));
-
- // add to priority queue
- cpq.push_back(next_cr);
- push_heap(cpq.begin(), cpq.end(), cpq_comp);
- cpq_adds++;
- cr_added = true;
- }
- }
- }
- }
-
- // if not the saved max trusted, delete
- if(trusted_read != cr) {
- delete cr;
- }
- }
-
- // clean up priority queue
- for(int i = 0; i < cpq.size(); i++)
- delete cpq[i];
-
- if(trusted_read != 0) {
- //cerr << header << "\t" << region.size() << "\t" << untrusted_subset.size() << "\t" << nt90 << "\t" << nt99 << "\t" << exp_errors << "\t" << cpq_adds << "\t" << check_count << "\t1\t" << trusted_read->likelihood << endl;
- return 0;
- } else {
- if(TESTING && mylike_t > correct_min_t)
- cerr << header << "\t" << print_seq() << "\t." << endl;
- //cerr << header << "\t" << region.size() << "\t" << untrusted_subset.size() << "\t" << nt90 << "\t" << nt99 << "\t" << exp_errors << "\t" << cpq_adds << "\t" << check_count << "\t0\t0" << endl;
-
- if(ambiguous_flag)
- return 1;
- else if(cpq.size() > max_queue_size)
- return 2;
- else
- return 3;
- }
-}
-
-////////////////////////////////////////////////////////////
-// print_seq
-////////////////////////////////////////////////////////////
-string Read::print_seq() {
- char nts[5] = {'A','C','G','T','N'};
- string sseq;
- for(int i = 0; i < read_length; i++)
- sseq.push_back(nts[seq[i]]);
- return sseq;
-}
-
-////////////////////////////////////////////////////////////
-// print_corrected
-//
-// Print read with corrections and trimming.
-////////////////////////////////////////////////////////////
-string Read::print_corrected(vector<correction> & cor) {
- return print_corrected(cor, trim_length);
-}
-string Read::print_corrected(vector<correction> & cor, int print_nt) {
- char nts[5] = {'A','C','G','T','N'};
- string sseq;
- int correct_i;
- for(int i = 0; i < print_nt; i++) {
- correct_i = -1;
- for(int c = 0; c < cor.size(); c++) {
- if(cor[c].index == i)
- correct_i = c;
- }
- if(correct_i != -1)
- sseq.push_back(nts[cor[correct_i].to]);
- else
- sseq.push_back(nts[seq[i]]);
- }
- return sseq;
-}
-
-
-////////////////////////////////////////////////////////////
-// correct
-//
-// Perform correction by breaking up untrusted kmers
-// into connected components and correcting them
-// independently.
-////////////////////////////////////////////////////////////
-//string Read::correct(bithash *trusted, double (&ntnt_prob)[4][4], double prior_prob[4], bool learning) {
-string Read::correct(bithash *trusted, double ntnt_prob[][4][4], double prior_prob[4], bool learning) {
- ////////////////////////////////////////
- // find connected components
- ////////////////////////////////////////
- vector< vector<int> > cc_untrusted;
-
- // add first
- cc_untrusted.push_back(vector<int>());
- int cc = 0;
- cc_untrusted[cc].push_back(untrusted[0]);
-
- for(int i = 1; i < untrusted.size(); i++) {
- // if kmer from last untrusted doesn't reach next
- if(untrusted[i-1]+bithash::k-1 < untrusted[i]) {
- cc++;
- cc_untrusted.push_back(vector<int>());
- }
- cc_untrusted[cc].push_back(untrusted[i]);
- }
-
- ////////////////////////////////////////
- // process connected components
- ////////////////////////////////////////
- vector<correction> multi_cors;
- vector<short> chop_region;
- vector<short> big_region;
- int chop_correct_code, big_correct_code;
- for(cc = 0; cc < cc_untrusted.size(); cc++) {
- // try chopped error region
- chop_region = error_region_chop(cc_untrusted[cc]);
- chop_correct_code = correct_cc(chop_region, cc_untrusted[cc], trusted, ntnt_prob, prior_prob, learning);
- if(chop_correct_code > 0) {
- // try bigger error region
- big_region = error_region(cc_untrusted[cc]);
- if(chop_region.size() == big_region.size()) {
- // cannot correct, and nothing found so trim to untrusted
- if(chop_correct_code == 1)
- return print_corrected(multi_cors, chop_region.front());
- else
- return print_corrected(multi_cors, cc_untrusted[cc].front());
-
- } else {
- big_correct_code = correct_cc(big_region, cc_untrusted[cc], trusted, ntnt_prob, prior_prob, learning);
-
- if(big_correct_code == 1) {
- // ambiguous
- // cannot correct, but trim to region
- if(chop_correct_code == 1)
- return print_corrected(multi_cors, chop_region.front());
- else
- return print_corrected(multi_cors, big_region.front());
-
- } else if(big_correct_code == 2 || big_correct_code == 3) {
- // cannot correct, and chaotic or nothing found so trim to untrusted
- return print_corrected(multi_cors, cc_untrusted[cc].front());
- }
- }
- }
- // else, corrected!
-
- // corrected
- global_like *= trusted_read->likelihood;
-
- // store
- for(int c = 0; c < trusted_read->corrections.size(); c++)
- multi_cors.push_back(trusted_read->corrections[c]);
- }
-
- // create new trusted read (mostly for learn_errors)
- corrected_read * tmp = trusted_read;
- trusted_read = new corrected_read(multi_cors, tmp->untrusted, global_like, 0);
- delete tmp;
-
- // print read with all corrections
- return print_corrected(multi_cors);
-}
-
-
-////////////////////////////////////////////////////////////
-// error_region
-//
-// Find region of the read to consider for errors based
-// on the pattern of untrusted kmers
-////////////////////////////////////////////////////////////
-vector<short> Read::error_region(vector<int> untrusted_subset) {
- // find intersection, or union
- vector<short> region;
- if(!untrusted_intersect(untrusted_subset, region))
- untrusted_union(untrusted_subset, region);
-
- // if front kmer can reach region, there may be more
- // errors in the front
- short f = region.front();
- short b = region.back();
-
- if(bithash::k-1 >= f) {
- // extend to front
- for(short i = f-1; i >= 0; i--)
- region.push_back(i);
- }
- if(trim_length-bithash::k <= b) {
- // extend to back
- for(short i = b+1; i < trim_length; i++)
- region.push_back(i);
- }
-
- return region;
-}
-
-////////////////////////////////////////////////////////////
-// error_region_chop
-//
-// Find region of the read to consider for errors based
-// on the pattern of untrusted kmers, using trusted kmers
-// to further trim the area.
-////////////////////////////////////////////////////////////
-vector<short> Read::error_region_chop(vector<int> untrusted_subset) {
- // find intersection, or union
- vector<short> region;
- if(!untrusted_intersect(untrusted_subset, region))
- untrusted_union(untrusted_subset, region);
-
- // fix front
- int right_leftkmer = untrusted_subset.front()-1;
- if(right_leftkmer >= 0) {
- // erase all bp in rightmost left kmer
- vector<short> front_chop(region);
- region.clear();
- for(int i = 0; i < front_chop.size(); i++) {
- if(front_chop[i] > right_leftkmer+bithash::k-1)
- region.push_back(front_chop[i]);
- }
-
- // add back 1 base if it's low quality, or lower quality than the next base
- for(int er = 0; er < expand_region; er++) {
- int pre_region = region[0] - (er+1);
- if(pre_region >= 0 && (prob[pre_region] < .99 || prob[pre_region] <= prob[pre_region+1])) {
- vector<short>::iterator it;
- it = region.begin();
- region.insert(it, pre_region);
- }
- }
- } else {
- // extend to front
- for(int i = region[0]-1; i >= 0; i--)
- region.push_back(i);
- }
-
- // fix back
- int left_rightkmer = untrusted_subset.back()+1;
- if(left_rightkmer+bithash::k-1 < trim_length) {
- // erase all bp in leftmost right kmer
- vector<short> back_chop(region);
- region.clear();
- for(int i = 0; i < back_chop.size(); i++) {
- if(back_chop[i] < left_rightkmer)
- region.push_back(back_chop[i]);
- }
-
- // add back 1 base if it's low quality, or lower quality than the next base
- // Two issues with this:
- // 1. I think region could be empty, so there's a bug
- // 2. This won't help for errors in the middle of a read that are missing an untrusted kmer
- // because the region will be empty, and we'll just try the intersection.
- /*
- for(int er = 0; er < expand_region; er++) {
- int post_region = region.back() + (er+1);
- if(post_region < trim_length && (prob[post_region] < .99 || prob[post_region] <= prob[post_region-1])) {
- region.push_back(post_region);
- }
- }
- */
-
- } else {
- // extend to back
- for(int i = region.back()+1; i < trim_length; i++)
- region.push_back(i);
- }
-
- return region;
-}
-
-////////////////////////////////////////////////////////////
-// untrusted_intersect
-//
-// Compute the intersection of the untrusted kmers as
-// [start,end]; return true if it's non-empty and false
-// otherwise
-////////////////////////////////////////////////////////////
-bool Read::untrusted_intersect(vector<int> untrusted_subset, vector<short> & region) {
- int start = 0;
- int end = read_length-1;
-
- int u;
- for(int i = 0; i < untrusted_subset.size(); i++) {
- u = untrusted_subset[i];
-
- // if overlap
- if(start <= u+bithash::k-1 && u <= end) {
- // take intersection
- start = max(start, u);
- end = min(end, u+bithash::k-1);
- } else {
- // intersection is empty
- return false;
- }
- }
-
- // intersection is non-empty
- for(short i = start; i <= end; i++)
- region.push_back(i);
- return true;
-}
-
-////////////////////////////////////////////////////////////
-// untrusted_union
-//
-// Compute the union of the read positions covered by the untrusted kmers
-////////////////////////////////////////////////////////////
-void Read::untrusted_union(vector<int> untrusted_subset, vector<short> & region) {
- short u;
- set<short> region_set;
- for(int i = 0; i < untrusted_subset.size(); i++) {
- u = untrusted_subset[i];
-
- for(short ui = u; ui < u+bithash::k; ui++)
- region_set.insert(ui);
- }
-
- set<short>::iterator it;
- for(it = region_set.begin(); it != region_set.end(); it++)
- region.push_back(*it);
-}
-
-////////////////////////////////////////////////////////////
-// quality_quicksort
-//
-// Sort the indexes from lowest probability of an accurate
-// basecall to highest
-////////////////////////////////////////////////////////////
-void Read::quality_quicksort(vector<short> & indexes, int left, int right) {
- int i = left, j = right;
- short tmp;
- float pivot = prob[indexes[(left + right) / 2]];
-
- /* partition */
- while (i <= j) {
- while (prob[indexes[i]] < pivot)
- i++;
- while (prob[indexes[j]] > pivot)
- j--;
- if (i <= j) {
- tmp = indexes[i];
- indexes[i] = indexes[j];
- indexes[j] = tmp;
- i++;
- j--;
- }
- }
-
- /* recursion */
- if (left < j)
- quality_quicksort(indexes, left, j);
- if (i < right)
- quality_quicksort(indexes, i, right);
-}
-
-////////////////////////////////////////////////////////////
-// check_trust
-//
-// Given a corrected read and data structure holding
-// trusted kmers, update the corrected_read's bitset
-// of untrusted kmers and return true if it's now empty
-////////////////////////////////////////////////////////////
-bool Read::check_trust(corrected_read *cr, bithash *trusted, unsigned int & check_count) {
- // original read HAS errors
- if(cr->corrections.empty())
- return false;
-
- // make corrections to sequence, saving nt's to fix later
- vector<int> seqsave;
- int i;
- for(i = 0; i < cr->corrections.size(); i++) {
- seqsave.push_back(seq[cr->corrections[i].index]);
- seq[cr->corrections[i].index] = cr->corrections[i].to;
- }
-
- int edit = cr->corrections.back().index;
- int kmer_start = max(0, edit-bithash::k+1);
- //int kmer_end = min(edit, read_length-k);
- int kmer_end = min(edit, trim_length-bithash::k);
-
- check_count += (kmer_end - kmer_start + 1);
-
- bool non_acgt = false;
- for(i = kmer_start; i < kmer_end+bithash::k; i++)
- if(seq[i] >=4)
- non_acgt = true;
-
- //non_acgt = true;
- if(non_acgt) {
- // easier to just check kmers one by one
- for(i = kmer_start; i <= kmer_end; i++)
- // check kmer
- cr->untrusted.set(i, !trusted->check(&seq[i]));
-
- } else {
- // check affected kmers
- Seq<kK> kmermap;
- // check first kmer and save map value
- cr->untrusted.set(kmer_start, !trusted->check(&seq[kmer_start], kmermap));
- for(i = kmer_start+1; i <= kmer_end; i++) {
- // check kmer using map value
- cr->untrusted.set(i, !trusted->check(kmermap, seq[i-1], seq[i+bithash::k-1]));
- }
- }
-
- // fix sequence
- for(i = 0; i < cr->corrections.size(); i++)
- seq[cr->corrections[i].index] = seqsave[i];
-
- return(cr->untrusted.none());
-}
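correct_cc() above keeps its candidate corrections in a std::vector maintained in heap order with push_heap/pop_heap rather than in a std::priority_queue, so the leftover candidates stay iterable and can be freed once the search stops. A stripped-down sketch of that pattern (the Candidate type and the values are illustrative, not SPAdes code):

#include <algorithm>
#include <vector>

struct Candidate { double likelihood; };

struct ByLikelihood {
    bool operator()(const Candidate* a, const Candidate* b) const {
        return a->likelihood < b->likelihood;   // yields a max-heap on likelihood
    }
};

int main() {
    std::vector<Candidate*> heap;
    ByLikelihood cmp;

    // push: append, then restore the heap property
    heap.push_back(new Candidate{0.7});
    std::push_heap(heap.begin(), heap.end(), cmp);
    heap.push_back(new Candidate{0.9});
    std::push_heap(heap.begin(), heap.end(), cmp);

    // pop: move the most likely candidate to the back, then remove it
    Candidate* best = heap.front();
    std::pop_heap(heap.begin(), heap.end(), cmp);
    heap.pop_back();
    delete best;

    // unlike std::priority_queue, the remaining elements are still iterable,
    // so they can all be deleted when the search is abandoned
    for (Candidate* c : heap)
        delete c;
    return 0;
}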
diff --git a/src/hammer/quake_correct/bithash.cpp b/src/hammer/quake_correct/bithash.cpp
deleted file mode 100644
index 1764b23..0000000
--- a/src/hammer/quake_correct/bithash.cpp
+++ /dev/null
@@ -1,388 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#include "bithash.h"
-#include "sequence/nucl.hpp"
-#include <iostream>
-#include <fstream>
-#include <cstdlib>
-#include <cassert>
-
-using namespace::std;
-
-bithash::bithash(int _k)
- :bits()
-{
- k = _k;
- assert(_k == kK && "kK and k passed from he programm does not match");
-}
-
-bithash::~bithash() {
-}
-
-////////////////////////////////////////////////////////////
-// add
-//
-// Add a single sequence to the bitmap
-////////////////////////////////////////////////////////////
-void bithash::add(Seq<kK> kmer) {
- bits.insert(kmer);
-}
-
-
-////////////////////////////////////////////////////////////
-// check
-//
-// Check for the presence of a sequence in the tree
-//
-// Can handle N's! Returns false for them.
-////////////////////////////////////////////////////////////
-bool bithash::check(unsigned kmer[]) {
- for(int i = 0; i < k; i++) { // ToDo: if we add constructor which
- // can soft fail if we pass N's in seq
- // we can optimize this code.
- if (!is_dignucl(kmer[i])) {
- return false;
- }
- }
- return bits.count(Seq<kK>(kmer)) != 0;
-}
-
-////////////////////////////////////////////////////////////
-// check
-//
-// Check for the presence of a sequence in the tree.
-// Pass the kmer map value back by reference to be re-used
-//
-// Can't handle N's!
-////////////////////////////////////////////////////////////
-bool bithash::check(unsigned kmer[], Seq<kK> &kmermap) {
- kmermap = Seq<kK>(kmer);
- return bits.count(kmermap) != 0;
-}
-
-////////////////////////////////////////////////////////////
-// check
-//
-// Check for the presence of a sequence in the tree.
-////////////////////////////////////////////////////////////
-bool bithash::check(Seq<kK> kmermap) {
- return bits.count(kmermap) != 0;
-}
-
-////////////////////////////////////////////////////////////
-// check
-//
-// Check for the presence of a sequence in the tree.
-// Pass the kmer map value back by reference to be re-used
-//
-// Can't handle N's!
-////////////////////////////////////////////////////////////
-bool bithash::check(Seq<kK> &kmermap, unsigned last, unsigned next) {
- kmermap = kmermap << next;
- // ToDo we can optimize this if Seq will
- // have << operator
- return bits.count(kmermap) != 0;
-}
-
-////////////////////////////////////////////////////////////
-// hammer_file_load
-//
-// Add kmers from Hammer's FASTA file to the bithash
-////////////////////////////////////////////////////////////
-void bithash::hammer_file_load(istream & hammer_in, unsigned long long atgc[]) {
- string line;
- while(getline(hammer_in, line)) {
- if (line[0] != '>') {
- // add to tree
- string kmer = line.substr(0,k);
- add(binary_kmer(kmer));
-
- // add reverse to tree
- add(binary_rckmer(kmer));
-
- // count gc
- if(atgc != NULL) {
- unsigned int at = count_at(kmer);
- atgc[0] += at;
- atgc[1] += (k-at);
- }
- }
- }
-}
-
-
-////////////////////////////////////////////////////////////
-// meryl_file_load
-//
-// Add kmers from the given meryl counts file to the bithash,
-// keeping those that occur >= "boundary" times
-////////////////////////////////////////////////////////////
-void bithash::meryl_file_load(const char* merf, const double boundary) {
- ifstream mer_in(merf);
- string line;
- double count;
- bool add_kmer = false;
-
- while(getline(mer_in, line)) {
- if(line[0] == '>') {
- // get count
- count = atof(line.substr(1).c_str());
- //cout << count << endl;
-
- // compare to boundary
- if(count >= boundary) {
- add_kmer = true;
- } else {
- add_kmer = false;
- }
-
- } else if(add_kmer) {
- // add to tree
- add(binary_kmer(line));
-
- // add reverse to tree
- add(binary_rckmer(line));
- }
- }
-}
-
-////////////////////////////////////////////////////////////
-// tab_file_load
-//
-// Add kmers from the given tab-separated counts file to the
-// bithash, keeping those that occur >= "boundary" times
-////////////////////////////////////////////////////////////
-void bithash::tab_file_load(istream & mer_in, const double boundary, unsigned long long atgc[]) {
- string line;
- double count;
-
- while(getline(mer_in, line)) {
- if(line[k] != ' ' && line[k] != '\t') {
- cerr << "Kmers are not of expected length " << k << endl;
- exit(EXIT_FAILURE);
- }
-
- // get count
- count = atof(line.substr(k+1).c_str());
- //cout << count << endl;
-
- // compare to boundary
- if(count >= boundary) {
- // add to tree
- add(binary_kmer(line.substr(0,k)));
-
- // add reverse to tree
- add(binary_rckmer(line.substr(0,k)));
-
- // count gc
- if(atgc != NULL) {
- unsigned int at = count_at(line.substr(0,k));
- atgc[0] += at;
- atgc[1] += (k-at);
- }
- }
- }
-}
-
-////////////////////////////////////////////////////////////
-// tab_file_load
-//
-// Add kmers from the given tab-separated counts file to the
-// bithash, using an AT-content-dependent "boundary" cutoff
-////////////////////////////////////////////////////////////
-void bithash::tab_file_load(istream & mer_in, const vector<double> boundary, unsigned long long atgc[]) {
- string line;
- double count;
- int at;
-
- while(getline(mer_in, line)) {
- if(line[k] != '\t') {
- cerr << "Kmers are not of expected length " << k << endl;
- exit(EXIT_FAILURE);
- }
-
- at = count_at(line.substr(0,k));
-
- // get count
- count = atof(line.substr(k+1).c_str());
- //cout << count << endl;
-
- // compare to boundary
- if(count >= boundary[at]) {
- // add to tree
- add(binary_kmer(line.substr(0,k)));
-
- // add reverse to tree
- add(binary_rckmer(line.substr(0,k)));
-
- // count gc
- if(atgc != NULL) {
- unsigned int at = count_at(line.substr(0,k));
- atgc[0] += at;
- atgc[1] += (k-at);
- }
- }
- }
-}
-
-////////////////////////////////////////////////////////////
-// binary_file_output
-//
-// Write bithash to file in binary format
-////////////////////////////////////////////////////////////
-void bithash::binary_file_output(char* outf) {
- /* unsigned long long mysize = (unsigned long long)bits.size() / 8ULL;
- char* buffer = new char[mysize];
- unsigned int flag = 1;
- for(unsigned long long i = 0; i < mysize; i++) {
- unsigned int temp = 0;
- for(unsigned int j = 0; j < 8; j++) { // read 8 bits from the bitset
- temp <<= 1;
- //unsigned int tmp = i*8 + j;
- //cout << tmp << ",";
- if(bits.count(i*8 + j) != 0)
- temp |= flag;
- }
- buffer[i] = (char)temp;
- }
- ofstream ofs(outf, ios::out | ios::binary);
- ofs.write(buffer, mysize);
- ofs.close();*/
-}
-
-////////////////////////////////////////////////////////////
-// binary_file_input
-//
-// Read bithash from file in binary format
-////////////////////////////////////////////////////////////
-/*
-void bithash::binary_file_input(char* inf) {
- ifstream ifs(inf, ios::binary);
-
- // get size of file
- ifs.seekg(0,ifstream::end);
- unsigned long long mysize = ifs.tellg();
- ifs.seekg(0);
-
- // allocate memory for file content
- char* buffer = new char[mysize];
-
- // read content of ifs
- ifs.read (buffer, mysize);
-
- // parse bits
- unsigned int flag = 128;
- unsigned int temp;
- for(unsigned long i = 0; i < mysize; i++) {
- temp = (unsigned int)buffer[i];
- for(unsigned int j = 0; j < 8; j++) {
- if((temp & flag) == flag)
- bits.set(i*8 + j);
- temp <<= 1;
- }
- }
-
- delete[] buffer;
-}
-*/
-
-////////////////////////////////////////////////////////////
-// binary_file_input
-//
-// Read bithash from file in binary format
-////////////////////////////////////////////////////////////
-void bithash::binary_file_input(char* inf, unsigned long long atgc[]) {
- /*unsigned int flag = 128;
- unsigned int temp;
-
- ifstream ifs(inf, ios::binary);
-
- // get size of file
- ifs.seekg(0,ifstream::end);
- unsigned long long mysize = ifs.tellg();
- ifs.seekg(0);
-
- // allocate memory for file content
- unsigned long long buffersize = 134217728; // i.e. 4^15 / 8, 16 MB
- if(mysize < buffersize)
- buffersize = mysize;
- char* buffer = new char[buffersize];
-
- for(unsigned long long b = 0; b < mysize/buffersize; b++) {
-
- // read content of ifs
- ifs.read (buffer, buffersize);
-
- // parse bits
- for(unsigned long long i = 0; i < buffersize; i++) {
- temp = (unsigned int)buffer[i];
- for(int j = 0; j < 8; j++) {
- if((temp & flag) == flag) {
- bits.set((buffersize*b + i)*8 + j);
-
- // count gc
- unsigned int at = count_at((buffersize*b + i)*8 + j);
- atgc[0] += at;
- atgc[1] += (k-at);
- }
- temp <<= 1;
- }
- }
- }
-
- delete[] buffer;*/
-}
-
-////////////////////////////////////////////////////////////
-// count_at
-//
-// Count the A's and T's in the sequence given
-////////////////////////////////////////////////////////////
-int bithash::count_at(string seq) {
- int at = 0;
- for(int i = 0; i < seq.size(); i++)
- if(seq[i] == 'A' || seq[i] == 'T')
- at += 1;
- return at;
-}
-
-int bithash::count_at(Seq<kK> seq) {
- int at = 0;
- for(int i = 0; i < k; i++) {
- if(seq[i] == 0 || seq[i] == 3)
- at++;
- }
- return at;
-}
-
-// Convert string s to its binary k-mer equivalent.
-Seq<kK> bithash::binary_kmer(const string & s) {
- return Seq<kK>(s);
-}
-
-// Convert string s to the binary k-mer of its reverse complement.
-Seq<kK> bithash::binary_rckmer(const string & s) {
- return !Seq<kK>(s); //ToDo: optimize
-}
-
-// Return the binary equivalent of ch.
-unsigned bithash::binary_nt(char ch) {
- switch (tolower (ch)) {
- case 'a' : return 0;
- case 'c' : return 1;
- case 'g' : return 2;
- case 't' : return 3;
- default : return 4; // non-ACGT; matches the 'N' index used elsewhere and avoids falling off the end
- }
-}
-
-
-unsigned int bithash::num_kmers() {
- return (unsigned int)bits.size();
-}
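binary_kmer() and binary_rckmer() above delegate to Seq<kK>, which packs each nucleotide into two bits and can produce the reverse complement of the packed word. A rough stand-alone sketch of the same encoding using a plain 64-bit integer (assumes k <= 32; this is not the Seq implementation itself):

#include <cstdint>
#include <string>

// A=0, C=1, G=2, T=3; anything else is treated as A here for simplicity.
static uint64_t encode_kmer(const std::string& s, int k) {
    uint64_t mer = 0;
    for (int i = 0; i < k; ++i) {
        uint64_t code = 0;
        switch (s[i]) {
            case 'C': code = 1; break;
            case 'G': code = 2; break;
            case 'T': code = 3; break;
            default:  code = 0; break;   // 'A' and non-ACGT
        }
        mer = (mer << 2) | code;
    }
    return mer;
}

// Reverse complement: complement each 2-bit base (x ^ 3) and reverse the order.
static uint64_t revcomp_kmer(uint64_t mer, int k) {
    uint64_t rc = 0;
    for (int i = 0; i < k; ++i) {
        rc = (rc << 2) | ((mer & 3) ^ 3);
        mer >>= 2;
    }
    return rc;
}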
diff --git a/src/hammer/quake_correct/correct.cpp b/src/hammer/quake_correct/correct.cpp
deleted file mode 100644
index d540503..0000000
--- a/src/hammer/quake_correct/correct.cpp
+++ /dev/null
@@ -1,897 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#include "bithash.h"
-#include "Read.h"
-#include "edit.h"
-#include "gzstream.h"
-#include <fstream>
-#include <iostream>
-#include <sstream>
-#include <vector>
-#include <string>
-#include <string.h>
-#include <cstring>
-#include <getopt.h>
-#ifdef _OPENMP
-#include <omp.h>
-#else
-#define omp_set_num_threads(x)
-#define omp_get_max_threads() 1
-#define omp_get_thread_num() 0
-#define omp_get_num_threads() 0
-#endif
-#include <cstdlib>
-#include <iomanip>
-#include <sys/stat.h>
-
-////////////////////////////////////////////////////////////
-// options
-////////////////////////////////////////////////////////////
-const static char* myopts = "r:f:k:m:b:c:a:t:q:l:p:v:zCuh";
-static struct option long_options [] = {
- {"headers", 0, 0, 1000},
- {"log", 0, 0, 1001},
- {0, 0, 0, 0}
-};
-
-// -r, fastq file of reads
-//char* fastqf;
-// -f, file of fastq files of reads
-//char* file_of_fastqf;
-
-// -z, zip output files
-//bool zip_output = false;
-
-// -k, kmer size
-static int k = 0;
-
-// -m, mer counts
-static char* merf = NULL;
-// -b, bithash
-static char* bithashf = NULL;
-// -v, good kmers from Hammer
-static char* hammerf = NULL;
-
-// -c, cutoff between trusted and untrusted mers
-static double cutoff = 0;
-// -a, AT cutoff
-static char* ATcutf = NULL;
-
-// -q
-//int Read::quality_scale;
-// -l
-static int trim_t = 30;
-// -t
-//static int trimq = 3;
-
-// -p, number of threads
-//int threads;
-
-// --headers, Print only normal headers
-static bool orig_headers = false;
-
-// -C, Contrail output
-static bool contrail_out = false;
-// -u, output uncorrected reads
-static bool uncorrected_out = false;
-// --log, output correction log
-static bool out_log = false;
-
-static bool overwrite_temp = true;
-
-// Note: to not trim, set trimq=0 and trim_t>read_length-k
-
-// constants
-#define TESTING false
-static char* nts = (char*)"ACGTN";
-//unsigned int chunks_per_thread = 200;
-
- // to collect stats
-struct stats {
- stats() {
- validated = 0;
- corrected = 0;
- removed = 0;
- trimmed = 0;
- trimmed_only = 0;
- }
- unsigned long long validated;
- unsigned long long corrected;
- unsigned long long removed;
- unsigned long long trimmed;
- unsigned long long trimmed_only;
-};
-
-static void Usage
- (char * command)
-
-// Print to stderr description of options and command line for
-// this program. command is the command that was used to
-// invoke it.
-
-{
- fprintf (stderr,
- "USAGE: correct [options]\n"
- "\n"
- "Correct sequencing errors in fastq file provided with -r\n"
- "and output trusted and corrected reads to\n"
- "<fastq-prefix>.cor.fastq.\n"
- "\n"
- "Options:\n"
- " -r <file>\n"
- " Fastq file of reads\n"
- " -f <file>\n"
- " File containing fastq file names, one per line or\n"
- " two per line for paired end reads.\n"
- " -z\n"
- " Write output files as gzipped.\n"
- " -m <file>\n"
- " File containing kmer counts in format `seq\tcount`.\n"
- " Can be gzipped.\n"
- " -b <file>\n"
- " File containing saved bithash.\n"
- " -c <num>\n"
- " Separate trusted/untrusted kmers at cutoff <num>\n"
- " -a <file>\n"
- " Separate trusted/untrusted kmers as a function of AT\n"
- " content, with cutoffs found in <file>, one per line\n"
- " -p <num>\n"
- " Use <num> openMP threads\n"
- " -l <num>=30\n"
- " Return only reads corrected and/or trimmed to >= <num>\n"
- " bp\n"
- " -q <num>\n"
- " Quality value ascii scale, generally 64 or 33. If not\n"
- " specified, it will guess.\n"
- " -v <file>\n"
- " File with good k-mers from Hammer.\n"
- " -t <num>=3\n"
- " Use BWA trim parameter <num>\n"
- " -u\n"
- " Output errors reads even if they can't be corrected,\n"
- " maintaining paired end reads.\n"
- " --headers\n"
- " Output only the original read headers without\n"
- " correction messages\n"
- " --log\n"
- " Output a log of all corrections into *.log as\n"
- " 'quality position new_nt old_nt'\n"
- "\n");
-
- return;
- }
-
-////////////////////////////////////////////////////////////
-// parse_command_line
-////////////////////////////////////////////////////////////
-static void parse_command_line(int argc, char **argv) {
- bool errflg = false;
- int ch;
- optarg = NULL;
- int option_index = 0;
- char* p;
- k = kK;
- // parse args
- while(!errflg && ((ch = getopt_long(argc, argv, myopts, long_options, &option_index)) != EOF)) {
- //while(!errflg && ((ch = getopt(argc, argv, myopts)) != EOF)) {
- switch(ch) {
- case 'r':
- fastqf = strdup(optarg);
- break;
-
- case 'f':
- file_of_fastqf = strdup(optarg);
- break;
-
- case 'z':
- zip_output = true;
- break;
-
- case 'm':
- merf = strdup(optarg);
- break;
-
- case 'b':
- bithashf = strdup(optarg);
- break;
-
- case 'c':
- cutoff = double(strtod(optarg, &p));
- if(p == optarg || cutoff < 0) {
- fprintf(stderr, "Bad mer cutoff value \"%s\"\n",optarg);
- errflg = true;
- }
- break;
-
- case 'a':
- ATcutf = strdup(optarg);
- break;
-
- case 't':
- trimq = int(strtol(optarg, &p, 10));
- if(p == optarg || trimq < 0) {
- fprintf(stderr, "Bad trim quality value \"%s\"\n",optarg);
- errflg = true;
- }
- break;
-
- case 'l':
- trim_t = int(strtol(optarg, &p, 10));
- if(p == optarg || trim_t < 1) {
- fprintf(stderr, "Bad trim threshold \"%s\"\n",optarg);
- errflg = true;
- }
- break;
-
- case 'q':
- Read::quality_scale = int(strtol(optarg, &p, 10));
- if(p == optarg || Read::quality_scale < -1) {
- fprintf(stderr, "Bad quality value scale \"%s\"\n",optarg);
- errflg = true;
- }
- break;
-
- case 'C':
- contrail_out = true;
- break;
-
- case 'u':
- uncorrected_out = true;
- break;
-
- case 'p':
- threads = int(strtol(optarg, &p, 10));
- if(p == optarg || threads <= 0) {
- fprintf(stderr, "Bad number of threads \"%s\"\n",optarg);
- errflg = true;
- }
- break;
-
- case 1000:
- orig_headers = true;
- break;
-
- case 1001:
- out_log = true;
- break;
-
- case 'v':
- hammerf = strdup(optarg);
- break;
-
- case 'h':
- Usage(argv[0]);
- exit(EXIT_FAILURE);
-
- case '?':
- fprintf (stderr, "Unrecognized option -%c\n", optopt);
-
- default:
- errflg = true;
- }
- }
-
- // for some reason, optind is not advancing properly so this
- // always returns an error
-
- // return errors
- /*
- if(errflg || optind != argc-1) {
- Usage(argv[0]);
- exit(EXIT_FAILURE);
- }
- */
-
- ////////////////////////////////////////
- // correct user input errors
- ////////////////////////////////////////
- if(fastqf == NULL && file_of_fastqf == NULL) {
- cerr << "Must provide a fastq file of reads (-r) or a file containing a list of fastq files of reads (-f)" << endl;
- exit(EXIT_FAILURE);
- }
-
- if(k == 0) {
- cerr << "Must provide kmer size (-k)" << endl;
- exit(EXIT_FAILURE);
- }
-
- if(merf != NULL) {
- if(cutoff == 0 && ATcutf == NULL) {
- cerr << "Must provide a trusted/untrusted kmer cutoff (-c) or a file containing the cutoff as a function of the AT content (-a)" << endl;
- exit(EXIT_FAILURE);
- }
- } else if(bithashf == NULL && hammerf == NULL) {
- cerr << "Must provide a file of kmer counts (-m) or a saved bithash (-b) or solid kmers from Hammer (-v)" << endl;
- exit(EXIT_FAILURE);
- }
-
-}
-
-
-////////////////////////////////////////////////////////////
-// regress_probs
-//
-// Use ntnt_counts to perform nonparametric regression
-// on ntnt_prob across quality values.
-////////////////////////////////////////////////////////////
-void regress_probs(double ntnt_prob[Read::max_qual][4][4], unsigned int ntnt_counts[Read::max_qual][4][4]) {
- double sigma = 2.0;
- double sigma2 = pow(sigma, 2);
-
- // count # occurrences for each (quality=q,actual=a) tuple
- unsigned int actual_counts[Read::max_qual][4] = {0};
- for(int q = 1; q < Read::max_qual; q++)
- for(int i = 0; i < 4; i++)
- for(int j = 0; j < 4; j++)
- actual_counts[q][i] += ntnt_counts[q][i][j];
-
- // regress
- double ntdsum;
- for(int q = 1; q < Read::max_qual; q++) {
- for(int i = 0; i < 4; i++) {
- //ntdsum = 0;
- for(int j = 0; j < 4; j++) {
- double pnum = 0;
- double pden = 0;
- for(int qr = 1; qr < Read::max_qual; qr++) {
- pnum += ntnt_counts[qr][i][j] * exp(-pow((double)(qr - q), 2)/(2*sigma2));
- pden += actual_counts[qr][i] * exp(-pow((double)(qr - q), 2)/(2*sigma2));
- }
- ntnt_prob[q][i][j] = pnum / pden;
- //ntdsum += ntnt_prob[q][i][j];
- }
-
- // re-normalize to sum to 1
- //for(int j = 0; j < 4; j++)
- //ntnt_prob[q][i][j] /= ntdsum;
- }
- }
-}
-
-
-////////////////////////////////////////////////////////////
-// output_model
-//
-// Print the error model to the file error_model.txt
-////////////////////////////////////////////////////////////
-void output_model(double ntnt_prob[Read::max_qual][4][4], unsigned int ntnt_counts[Read::max_qual][4][4], string fqf) {
- string base = split(fqf,'/').back();
-
- int suffix_index = base.rfind(".");
- string prefix;
- if(suffix_index == -1) {
- prefix = base;
- } else {
- prefix = base.substr(0,suffix_index);
- }
-
- string outf = "error_model." + prefix + ".txt";
-
- ofstream mod_out(outf.c_str());
-
- unsigned int ntsum;
- for(int q = 1; q < Read::max_qual; q++) {
- mod_out << "Quality = " << q << endl;
-
- // counts
- mod_out << "\tA\tC\tG\tT" << endl;
- for(int i = 0; i < 4; i++) {
- mod_out << nts[i];
-
- ntsum = 0;
- for(int j = 0; j < 4; j++)
- ntsum += ntnt_counts[q][i][j];
-
- for(int j = 0; j < 4; j++) {
- if(i == j)
- mod_out << "\t-";
- else if(ntsum > 0)
- mod_out << "\t" << ((double)ntnt_counts[q][i][j] / (double)ntsum) << "(" << ntnt_counts[q][i][j] << ")";
- else
- mod_out << "\t0";
- }
- mod_out << endl;
- }
-
- // probs
- mod_out << "\tA\tC\tG\tT" << endl;
- for(int i = 0; i < 4; i++) {
- mod_out << nts[i];
- for(int j = 0; j < 4; j++) {
- if(i == j)
- mod_out << "\t-";
- else
- mod_out << "\t" << ntnt_prob[q][i][j];
- }
- mod_out << endl;
- }
- mod_out << endl;
- }
-}
-
-
-////////////////////////////////////////////////////////////////////////////////
-// output_read
-//
-// Output the given possibly corrected and/or trimmed
-// read according to the given options.
-////////////////////////////////////////////////////////////////////////////////
-static void output_read(ofstream & reads_out, ofstream & corlog_out, int pe_code, string header, string ntseq, string mid, string strqual, string corseq, stats & tstats) {
- if(corseq.size() >= trim_t) {
- // check for changes
- bool corrected = false;
- for(int i = 0; i < corseq.size(); i++) {
- if(corseq[i] != ntseq[i]) {
- // log it
- if(corlog_out.good())
- corlog_out << (strqual[i]-Read::quality_scale) << "\t" << (i+1) << "\t" << corseq[i] << "\t" << ntseq[i] << endl;
- // note it
- corrected = true;
- // set qual to crap
- strqual[i] = (char)(Read::quality_scale+2);
- }
- }
- if(corrected)
- tstats.corrected++;
-
- // update header
- if(!orig_headers) {
- if(corrected)
- header += " correct";
- unsigned int trimlen = ntseq.size()-corseq.size();
- if(trimlen > 0) {
- stringstream trim_inter;
- trim_inter << trimlen;
- header += " trim=" + trim_inter.str();
- tstats.trimmed++;
- if(!corrected)
- tstats.trimmed_only++;
- } else {
- if(!corrected)
- tstats.validated++;
- }
- }
- // print
- if(contrail_out)
- reads_out << header << "\t" << corseq << endl;
- else
- reads_out << header << endl << corseq << endl << mid << endl << strqual.substr(0,corseq.size()) << endl;
- if(TESTING)
- cerr << header << "\t" << ntseq << "\t" << corseq << endl;
- } else {
- tstats.removed++;
- if(uncorrected_out || pe_code > 0) {
- // update header
- header += " error";
-
- //print
- if(contrail_out)
- reads_out << header << "\t" << ntseq << endl;
- else
- reads_out << header << endl << ntseq << endl << mid << endl << strqual << endl;
- }
- if(TESTING)
- cerr << header << "\t" << ntseq << "\t-" << endl; // or . if it's only trimmed?
- }
-}
-
-
-////////////////////////////////////////////////////////////////////////////////
-// correct_reads
-//
-// Correct the reads in the file 'fqf' using the data structure of trusted
-// kmers 'trusted', matrix of nt->nt error rates 'ntnt_prob' and prior nt
-// probabilities 'prior_prob'. 'starts' and 'counts' help openMP parallelize
-// the read processing. If 'pairedend_code' is 0, the reads are not paired;
-// if it's 1, this file is the first of a pair so print all reads and withhold
-// combining; if it's 2, the file is the second of a pair so print all reads
-// and then combine both 1 and 2.
-////////////////////////////////////////////////////////////////////////////////
-static void correct_reads(string fqf, int pe_code, bithash * trusted, vector<streampos> & starts, vector<unsigned long long> & counts, double ntnt_prob[Read::max_qual][4][4], double prior_prob[4]) {
- // output directory
- struct stat st_file_info;
- string path_suffix = split(fqf,'/').back();
- string out_dir("."+path_suffix);
- if(stat(out_dir.c_str(), &st_file_info) == 0) {
- cerr << "Hidden temporary directory " << out_dir << " already exists and will be used" << endl;
- } else {
- if(mkdir(out_dir.c_str(), S_IRWXU) == -1) {
- cerr << "Failed to create hidden temporary directory " << out_dir << endl;
- exit(EXIT_FAILURE);
- }
- }
-
- // collect stats
- stats * thread_stats = new stats[omp_get_max_threads()];
-
- unsigned int chunk = 0;
-#pragma omp parallel //shared(trusted)
- {
- int tid = omp_get_thread_num();
-
- // input
- ifstream reads_in(fqf.c_str());
-
- unsigned int tchunk;
- string header,ntseq,mid,strqual,corseq;
- int trim_length;
- char* nti;
- Read *r;
-
- #pragma omp critical
- tchunk = chunk++;
-
- while(tchunk < starts.size()) {
- reads_in.seekg(starts[tchunk]);
-
- // output
- string toutf(out_dir+"/");
- stringstream tconvert;
- tconvert << tchunk;
- toutf += tconvert.str();
-
- if(overwrite_temp || stat(toutf.c_str(), &st_file_info) == -1) {
- ofstream reads_out(toutf.c_str());
- //cout << toutf << endl;
-
- // output log
- string tlogf = toutf + ".log";
- ofstream corlog_out;
- if(out_log) {
- corlog_out.open(tlogf.c_str());
- }
-
- unsigned long long tcount = 0;
- while(getline(reads_in, header)) {
- //cout << tid << " " << header << endl;
-
- // get sequence
- getline(reads_in, ntseq);
- //cout << ntseq << endl;
-
- // convert ntseq to iseq
- vector<unsigned int> iseq;
- for(int i = 0; i < ntseq.size(); i++) {
- nti = strchr(nts, ntseq[i]);
- iseq.push_back(nti - nts);
- }
-
- // get quality values
- getline(reads_in,mid);
- //cout << mid << endl;
- getline(reads_in,strqual);
- //cout << strqual << endl;
-
- vector<int> untrusted;
-
- if(iseq.size() < trim_t)
- trim_length = 0;
- else {
- for(int i = 0; i < iseq.size()-k+1; i++) {
- if(!trusted->check(&iseq[i])) {
- untrusted.push_back(i);
- }
- }
-
- trim_length = quick_trim(strqual, untrusted);
- //trim_length = iseq.size();
- }
-
- // fix error reads
- if(untrusted.size() > 0) {
- r = new Read(header, &iseq[0], strqual, untrusted, trim_length);
- corseq = r->correct(trusted, ntnt_prob, prior_prob);
-
- // output read w/ trim and corrections
- output_read(reads_out, corlog_out, pe_code, header, ntseq, mid, strqual, corseq, thread_stats[tid]);
-
- delete r;
- } else {
- output_read(reads_out, corlog_out, pe_code, header, ntseq, mid, strqual, ntseq.substr(0,trim_length), thread_stats[tid]);
- // output read as trimmed
- /*
- if(contrail_out)
- reads_out << header << "\t" << ntseq.substr(0,trim_length) << endl;
- else
- reads_out << header << endl << ntseq.substr(0,trim_length) << endl << mid << endl << strqual.substr(0,trim_length) << endl;
- */
- }
-
- if(++tcount == counts[tchunk])
- break;
- }
- reads_out.close();
- }
-
-#pragma omp critical
- tchunk = chunk++;
- }
- reads_in.close();
- }
-
- // combine stats
- for(int i = 1; i < omp_get_max_threads(); i++) {
- thread_stats[0].validated += thread_stats[i].validated;
- thread_stats[0].corrected += thread_stats[i].corrected;
- thread_stats[0].trimmed += thread_stats[i].trimmed;
- thread_stats[0].trimmed_only += thread_stats[i].trimmed_only;
- thread_stats[0].removed += thread_stats[i].removed;
- }
-
- // print stats
- int suffix_index = fqf.rfind(".");
- string outf;
- if(suffix_index == -1) {
- outf = fqf+".stats.txt";
- } else {
- outf = fqf.substr(0,suffix_index+1) + "stats.txt";
- }
- ofstream stats_out(outf.c_str());
- stats_out << "Validated: " << thread_stats[0].validated << endl;
- stats_out << "Corrected: " << thread_stats[0].corrected << endl;
- stats_out << "Trimmed: " << thread_stats[0].trimmed << endl;
- stats_out << "Trimmed only: " << thread_stats[0].trimmed_only << endl;
- stats_out << "Removed: " << thread_stats[0].removed << endl;
- stats_out.close();
-}
-
-
-////////////////////////////////////////////////////////////
-// learn_errors
-//
-// Correct reads using a much stricter filter in order
-// to count the nt->nt errors and learn the error
-// probabilities
-////////////////////////////////////////////////////////////
-//static void learn_errors(string fqf, bithash * trusted, vector<streampos> & starts, vector<unsigned long long> & counts, double (&ntnt_prob)[4][4], double prior_prob[4]) {
-static void learn_errors(string fqf, bithash * trusted, vector<streampos> & starts, vector<unsigned long long> & counts, double ntnt_prob[Read::max_qual][4][4], double prior_prob[4]) {
- unsigned int ntnt_counts[Read::max_qual][4][4] = {0};
- unsigned int samples = 0;
-
- unsigned int chunk = 0;
-#pragma omp parallel //shared(trusted)
- {
- unsigned int tchunk;
- string header,ntseq,strqual,corseq;
- int trim_length;
- char* nti;
- Read *r;
- ifstream reads_in(fqf.c_str());
-
- while(chunk < threads*chunks_per_thread) {
-#pragma omp critical
- tchunk = chunk++;
-
- reads_in.seekg(starts[tchunk]);
-
- unsigned long long tcount = 0;
- while(getline(reads_in, header)) {
- //cout << header << endl;
-
- // get sequence
- getline(reads_in, ntseq);
- //cout << ntseq << endl;
-
- // convert ntseq to iseq
- vector<unsigned int> iseq;
- for(int i = 0; i < ntseq.size(); i++) {
- nti = strchr(nts, ntseq[i]);
- iseq.push_back(nti - nts);
- }
-
- // get quality values
- getline(reads_in,strqual);
- //cout << strqual << endl;
- getline(reads_in,strqual);
- //cout << strqual << endl;
-
- vector<int> untrusted;
-
- if(iseq.size() < trim_t)
- trim_length = 0;
- else {
- for(int i = 0; i < iseq.size()-k+1; i++) {
- if(!trusted->check(&iseq[i])) {
- untrusted.push_back(i);
- }
- }
-
- trim_length = quick_trim(strqual, untrusted);
- }
-
- // fix error reads
- if(untrusted.size() > 0) {
- // correct
- r = new Read(header, &iseq[0], strqual, untrusted, trim_length);
- corseq = r->correct(trusted, ntnt_prob, prior_prob, true);
-
- // if trimmed to long enough
- if(corseq.size() >= trim_t) {
- if(r->trusted_read != 0) { // else no guarantee there was a correction
- for(int c = 0; c < r->trusted_read->corrections.size(); c++) {
- correction cor = r->trusted_read->corrections[c];
- if(iseq[cor.index] < 4) {
- // P(obs=o|actual=a,a!=o) for Bayes
- ntnt_counts[strqual[cor.index]-Read::quality_scale][cor.to][iseq[cor.index]]++;
-
- // P(actual=a|obs=o,a!=o)
- //ntnt_counts[iseq[cor.index]][cor.to]++;
- samples++;
- }
- }
- }
- }
- delete r;
- }
-
- if(++tcount == counts[tchunk] || samples > 200000)
- break;
- }
- }
- reads_in.close();
- }
-
- regress_probs(ntnt_prob, ntnt_counts);
-
- output_model(ntnt_prob, ntnt_counts, fqf);
-}
-
-
-////////////////////////////////////////////////////////////
-// load_AT_cutoffs
-//
-// Load AT cutoffs from file
-////////////////////////////////////////////////////////////
-vector<double> load_AT_cutoffs() {
- vector<double> cutoffs;
- ifstream cut_in(ATcutf);
- string line;
- double cut;
-
- while(getline(cut_in, line)) {
- stringstream ss(stringstream::in | stringstream::out);
- ss << line;
- ss >> cut;
- cutoffs.push_back(cut);
- }
-
- if(cutoffs.size() != (k+1)) {
- cerr << "Must specify " << (k+1) << " AT cutoffs in " << ATcutf << endl;
- exit(EXIT_FAILURE);
- }
-
- return cutoffs;
-}
-
-
-////////////////////////////////////////////////////////////
-// main
-////////////////////////////////////////////////////////////
-int main(int argc, char **argv) {
- parse_command_line(argc, argv);
-
- // prepare AT and GC counts
- unsigned long long atgc[2] = {0};
-
- // make trusted kmer data structure
- bithash *trusted = new bithash(k);
-
- // get good kmers from Hammer
- if (hammerf != NULL) {
- string hammerf_str(hammerf);
- if (hammerf_str.substr(hammerf_str.size()-3) == ".gz") {
- igzstream hammerf_in(hammerf);
- trusted->hammer_file_load(hammerf_in, atgc);
- } else {
- ifstream hammerf_in(hammerf);
- trusted->hammer_file_load(hammerf_in, atgc);
- }
- }
-
- // get kmer counts
- if(merf != NULL) {
- string merf_str(merf);
- if(ATcutf != NULL) {
- if(merf_str.substr(merf_str.size()-3) == ".gz") {
- igzstream mer_in(merf);
- trusted->tab_file_load(mer_in, load_AT_cutoffs(), atgc);
- } else {
- ifstream mer_in(merf);
- trusted->tab_file_load(mer_in, load_AT_cutoffs(), atgc);
- }
- } else {
- if(merf_str.substr(merf_str.size()-3) == ".gz") {
- igzstream mer_in(merf);
- trusted->tab_file_load(mer_in, cutoff, atgc);
- } else {
- ifstream mer_in(merf);
- trusted->tab_file_load(mer_in, cutoff, atgc);
- }
- }
-
- // saved bithash
- } else if(bithashf != NULL) {
- if(strcmp(bithashf,"-") == 0) {
- cerr << "Saved bithash cannot be piped in. Please specify file." << endl;
- exit(EXIT_FAILURE);
- } else
- trusted->binary_file_input(bithashf, atgc);
- }
- cout << trusted->num_kmers() << " trusted kmers" << endl;
-
- double prior_prob[4];
- prior_prob[0] = (double)atgc[0] / (double)(atgc[0]+atgc[1]) / 2.0;
- prior_prob[1] = .5 - prior_prob[0];
- prior_prob[2] = prior_prob[1];
- prior_prob[3] = prior_prob[0];
-
- //cout << "AT: " << atgc[0] << " GC: " << atgc[1] << endl;
- cout << "AT% = " << (2*prior_prob[0]) << endl;
-
- // make list of files
- vector<string> fastqfs;
- vector<int> pairedend_codes;
- parse_fastq(fastqfs, pairedend_codes);
-
- // process each file
- string fqf;
- bool zip;
- for(int f = 0; f < fastqfs.size(); f++) {
- fqf = fastqfs[f];
- cout << fqf << endl;
-
- // unzip
- if(fqf.substr(fqf.size()-3) == ".gz") {
- zip = true;
- unzip_fastq(fqf);
- } else
- zip = false;
-
- // determine quality value scale
- if(Read::quality_scale == -1)
- guess_quality_scale(fqf);
-
- // split file
- vector<streampos> starts;
- vector<unsigned long long> counts;
- chunkify_fastq(fqf, starts, counts);
-
- // learn nt->nt transitions
- double ntnt_prob[Read::max_qual][4][4] = {0};
- for(int q = 0; q < Read::max_qual; q++)
- for(int i = 0; i < 4; i++)
- for(int j = 0; j < 4; j++)
- if(i != j)
- ntnt_prob[q][i][j] = 1.0/3.0;
-
- if(!TESTING)
- learn_errors(fqf, trusted, starts, counts, ntnt_prob, prior_prob);
-
- // correct
- correct_reads(fqf, pairedend_codes[f], trusted, starts, counts, ntnt_prob, prior_prob);
-
- // combine
- if(pairedend_codes[f] == 0) {
- combine_output(fqf, string("cor"), uncorrected_out);
- }
-
- // combine paired end
- if(pairedend_codes[f] == 2) {
- if(!zip) {
- combine_output_paired(fastqfs[f-1], fqf, string("cor"), uncorrected_out);
- } else {
- combine_output_paired(fastqfs[f-1].substr(0,fastqfs[f-1].size()-3), fqf, string("cor"), uncorrected_out);
- }
- }
-
- if(zip)
- zip_fastq(fqf);
- }
-
- return 0;
-}
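regress_probs() in the file above smooths the observed nt->nt substitution counts across quality values with a Gaussian kernel (sigma = 2): for each target quality q, every observation at quality qr is weighted by exp(-(qr-q)^2 / (2*sigma^2)), and the smoothed rate is the weighted substitution count divided by the weighted total. Stripped of the 4x4 bookkeeping, the estimate for a single substitution type looks roughly like this (names are illustrative):

#include <cmath>
#include <cstddef>
#include <vector>

// counts[qr] : times this particular substitution was observed at quality qr
// totals[qr] : times the source nucleotide was observed at quality qr
// Returns the kernel-smoothed substitution rate at quality q.
double smoothed_rate(const std::vector<unsigned>& counts,
                     const std::vector<unsigned>& totals,
                     int q, double sigma = 2.0) {
    double num = 0.0, den = 0.0;
    for (std::size_t qr = 1; qr < counts.size(); ++qr) {
        double d = double(int(qr)) - double(q);
        double w = std::exp(-(d * d) / (2.0 * sigma * sigma));
        num += counts[qr] * w;
        den += totals[qr] * w;
    }
    return den > 0.0 ? num / den : 0.0;
}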
diff --git a/src/hammer/quake_correct/edit.cpp b/src/hammer/quake_correct/edit.cpp
deleted file mode 100644
index 5b2fc09..0000000
--- a/src/hammer/quake_correct/edit.cpp
+++ /dev/null
@@ -1,665 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#include <sys/stat.h>
-#include <fstream>
-#include "omp_wrapper.h"
-#include <iostream>
-#include <sstream>
-#include <cstring>
-#include "gzstream.h"
-#include <vector>
-#include "Read.h"
-
-////////////////////////////////////////////////////////////////////////////////
-// options
-////////////////////////////////////////////////////////////////////////////////
-// -r, fastq file of reads
-char* fastqf = NULL;
-// -f, file of fastq files of reads
-char* file_of_fastqf = NULL;
-
-// -z, zip output files
-bool zip_output = false;
-
-// -q
-int Read::quality_scale = -1;
-// -p, number of threads
-int threads = 4;
-
-// -t
-int trimq = 3;
-
-unsigned int chunks_per_thread = 200;
-
-
-////////////////////////////////////////////////////////////////////////////////
-// split
-//
-// Split on whitespace
-////////////////////////////////////////////////////////////////////////////////
-vector<string> split(string s) {
- vector<string> splits;
- int split_num = 0;
- bool last_space = true;
-
- for(int i = 0; i < s.size(); i++) {
- if(s[i] == ' ' || s[i] == '\t' || s[i] == '\n' || s[i] == '\r') {
- if(!last_space)
- split_num++;
- last_space = true;
- } else {
- if(split_num == splits.size())
- splits.push_back("");
- splits[split_num] += s[i];
- last_space = false;
- }
- }
-
- return splits;
-}
-
-
-////////////////////////////////////////////////////////////////////////////////
-// split
-//
-// Split on the character c, trying to match Python's split method
-////////////////////////////////////////////////////////////////////////////////
-vector<string> split(string s, char c)
-{
- vector<string> splits;
- splits.push_back("");
- int split_num = 0;
-
- for(int i = 0; i < s.size(); i++) {
- if(s[i] == c) {
- split_num++;
- splits.push_back("");
- } else {
- splits[split_num] += s[i];
- }
- }
-
- return splits;
-}
-
-
-////////////////////////////////////////////////////////////////////////////////
-// unzip_fastq
-//
-// Unzip read file and remove ".gz" suffix from 'fqf'.
-////////////////////////////////////////////////////////////////////////////////
-void unzip_fastq(string & fqf) {
- char mycmd[500];
-
- // rename
- string fqf_zip(fqf);
- fqf.erase(fqf.size()-3);
-
- // unzip but leave original file
- strcpy(mycmd, "gunzip -c ");
- strcat(mycmd, fqf_zip.c_str());
- strcat(mycmd, " > ");
- strcat(mycmd, fqf.c_str());
- system(mycmd);
-}
-
-
-////////////////////////////////////////////////////////////////////////////////
-// zip_fastq
-//
-// Clean up after a zipped input: remove the unzipped copy of the read
-// file (re-zipping of the original and corrected files is currently disabled).
-////////////////////////////////////////////////////////////////////////////////
-void zip_fastq(string fqf) {
- char mycmd[100];
-
- // gzip fqf
- //strcpy(mycmd, "gzip ");
- //strcat(mycmd, fqf.c_str());
- //system(mycmd);
-
- // remove unzipped fqf, leaving only zipped
- remove(fqf.c_str());
-
- // determine output file
- /*
- string fqf_str(fqf);
- int suffix_index = fqf_str.rfind(".");
- string prefix = fqf_str.substr(0,suffix_index);
- string suffix = fqf_str.substr(suffix_index, fqf_str.size()-suffix_index);
- string pairf = prefix + string(".cor") + suffix;
- string singlef = prefix + string(".cor.single") + suffix;
-
- // gzip pair
- strcpy(mycmd, "gzip ");
- strcat(mycmd, pairf.c_str());
- system(mycmd);
-
- // gzip single
- struct stat st_file_info;
- if(stat(singlef.c_str(), &st_file_info) == 0) {
- strcpy(mycmd, "gzip ");
- strcat(mycmd, singlef.c_str());
- system(mycmd);
- }
- */
-}
-
-
-////////////////////////////////////////////////////////////////////////////////
-// combine_logs
-//
-// Combine log files that may be in out_dir into a single log file named
-// using fqf.
-////////////////////////////////////////////////////////////////////////////////
-void combine_logs(string fqf, string out_dir) {
- struct stat st_file_info;
- string log1 = out_dir+"/0.log";
- if(stat(log1.c_str(), &st_file_info) == 0) {
- // format log output
- string logf = fqf + ".log";
- ofstream corlog_out(logf.c_str());
-
- // combine
- string line;
- for(int t = 0; t < threads*chunks_per_thread; t++) {
- string tc_file(out_dir+"/");
- stringstream tc_convert;
- tc_convert << t;
- tc_file += tc_convert.str();
- tc_file += ".log";
-
- if(stat(tc_file.c_str(), &st_file_info) == 0) {
- ifstream tc_out(tc_file.c_str());
- while(getline(tc_out, line)) {
- corlog_out << line << endl;
- }
- tc_out.close();
- remove(tc_file.c_str());
- }
- }
- corlog_out.close();
- }
-}
-
-
-////////////////////////////////////////////////////////////////////////////////
-// combine_output_stream
-//
-// Combine output files in 'out_dir' into a single file defined by the given
-// stream, and remove those files along the way.
-////////////////////////////////////////////////////////////////////////////////
-void combine_output_stream(ostream & combine_out, ostream & err_out, string out_dir) {
- string header, seq, mid, qual;
- struct stat st_file_info;
- for(int t = 0; t < threads*chunks_per_thread; t++) {
- string tc_file(out_dir+"/");
- stringstream tc_convert;
- tc_convert << t;
- tc_file += tc_convert.str();
-
- // if file exists, add to single output
- if(stat(tc_file.c_str(), &st_file_info) == 0) {
- ifstream tc_out(tc_file.c_str());
- while(getline(tc_out, header)) {
- getline(tc_out, seq);
- getline(tc_out, mid);
- getline(tc_out, qual);
-
- if(!err_out.good() || header.find("error") == -1)
- combine_out << header << endl << seq << endl << mid << endl << qual << endl;
- else
- err_out << header.substr(0,header.find("error")) << endl << seq << endl << mid << endl << qual << endl;
- }
- tc_out.close();
- remove(tc_file.c_str());
- }
- }
-}
-
-
-////////////////////////////////////////////////////////////////////////////////
-// combine_output
-//
-// Combine output files in 'out_dir' into a single file and remove 'out_dir'
-////////////////////////////////////////////////////////////////////////////////
-void combine_output(string fqf, string mid_ext, bool uncorrected_out) {
- // format output directory
- string path_suffix = split(fqf,'/').back();
- string out_dir("."+path_suffix);
-
- // format output file
- int suffix_index = fqf.rfind(".");
- string prefix, suffix;
- if(suffix_index == -1) {
- prefix = fqf+".";
- suffix = "";
- } else {
- prefix = fqf.substr(0,suffix_index+1);
- suffix = fqf.substr(suffix_index, fqf.size()-suffix_index);
- }
-
- string outf;
- string errf;
- if(zip_output) {
- // zipped
- outf = prefix + mid_ext + suffix + ".gz";
- ogzstream combine_out(outf.c_str());
- ogzstream err_out;
- if(uncorrected_out) {
- errf = prefix + "err" + suffix + ".gz";
- err_out.open(errf.c_str());
- }
- combine_output_stream(combine_out, err_out, out_dir);
- combine_out.close();
- err_out.close();
- } else {
- // normal
- outf = prefix + mid_ext + suffix;
- ofstream combine_out(outf.c_str());
- ofstream err_out;
- if(uncorrected_out) {
- errf = prefix + "err" + suffix;
- err_out.open(errf.c_str());
- }
- combine_output_stream(combine_out, err_out, out_dir);
- combine_out.close();
- err_out.close();
- }
-
- // log
- combine_logs(fqf, out_dir);
-
- // remove output directory
- rmdir(out_dir.c_str());
-}
-
-
-////////////////////////////////////////////////////////////////////////////////
-// combine_output_paired_stream
-////////////////////////////////////////////////////////////////////////////////
-void combine_output_paired_stream(string fqf1, string fqf2, ostream & pair_out1, ostream & single_out1, ostream & single_err_out1, ostream & err_out1, ostream & pair_out2, ostream & single_out2, ostream & single_err_out2, ostream & err_out2) {
- // format output directories
- string path_suffix1 = split(fqf1, '/').back();
- string out_dir1("."+path_suffix1);
- string path_suffix2 = split(fqf2, '/').back();
- string out_dir2("."+path_suffix2);
-
- string header1, seq1, mid1, qual1, header2, seq2, mid2, qual2;
- struct stat st_file_info;
- for(int t = 0; t < threads*chunks_per_thread; t++) {
- // format thread-chunk output files
- string tc_file1(out_dir1+"/");
- stringstream tc_convert1;
- tc_convert1 << t;
- tc_file1 += tc_convert1.str();
-
- string tc_file2(out_dir2+"/");
- stringstream tc_convert2;
- tc_convert2 << t;
- tc_file2 += tc_convert2.str();
-
- // if file exists, both must
- if(stat(tc_file1.c_str(), &st_file_info) == 0) {
- ifstream tc_out1(tc_file1.c_str());
- ifstream tc_out2(tc_file2.c_str());
-
- while(getline(tc_out1, header1)) {
- // get read1
- getline(tc_out1, seq1);
- getline(tc_out1, mid1);
- getline(tc_out1, qual1);
-
- // get read2
- if(!getline(tc_out2, header2)) {
- cerr << "Uneven number of reads in paired end read files " << tc_file1.c_str() << " and " << tc_file2.c_str() << endl;
- exit(EXIT_FAILURE);
- }
- getline(tc_out2, seq2);
- getline(tc_out2, mid2);
- getline(tc_out2, qual2);
-
- if(header1.find("error") == -1) {
- if(header2.find("error") == -1) {
- // no errors
- pair_out1 << header1 << endl << seq1 << endl << mid1 << endl << qual1 << endl;
- pair_out2 << header2 << endl << seq2 << endl << mid2 << endl << qual2 << endl;
- } else {
- // error in 2
- single_out1 << header1 << endl << seq1 << endl << mid1 << endl << qual1 << endl;
- if(single_err_out2.good())
- single_err_out2 << header2.substr(0,header2.find("error")) << endl << seq2 << endl << mid2 << endl << qual2 << endl;
- }
- } else {
- if(header2.find("error") == -1) {
- // error in 1
- if(single_err_out1.good())
- single_err_out1 << header1.substr(0,header1.find("error")) << endl << seq1 << endl << mid1 << endl << qual1 << endl;
- single_out2 << header2 << endl << seq2 << endl << mid2 << endl << qual2 << endl;
- } else {
- // error in 1,2
- if(err_out1.good()) {
- err_out1 << header1.substr(0,header1.find("error")) << endl << seq1 << endl << mid1 << endl << qual1 << endl;
- err_out2 << header2.substr(0,header2.find("error")) << endl << seq2 << endl << mid2 << endl << qual2 << endl;
- }
- }
- }
- }
- tc_out1.close();
- tc_out2.close();
- remove(tc_file1.c_str());
- remove(tc_file2.c_str());
- }
- }
-
- // logs
- combine_logs(fqf1, out_dir1);
- combine_logs(fqf2, out_dir2);
-
- // remove output directory
- rmdir(out_dir1.c_str());
- rmdir(out_dir2.c_str());
-}
-
-
-////////////////////////////////////////////////////////////////////////////////
-// combine_output_paired
-//
-// Combine output files in 'out_dir' into a single file and remove 'out_dir'
-////////////////////////////////////////////////////////////////////////////////
-void combine_output_paired(string fqf1, string fqf2, string mid_ext, bool uncorrected_out) {
- string prefix, suffix;
-
- if(zip_output) {
- // format output pair file1
- int suffix_index = fqf1.rfind(".");
- if(suffix_index == -1) {
- prefix = fqf1+".";
- suffix = "";
- } else {
- prefix = fqf1.substr(0,suffix_index+1);
- suffix = fqf1.substr(suffix_index, fqf1.size()-suffix_index);
- }
- string outf = prefix + mid_ext + suffix + ".gz";
- ogzstream pair_out1(outf.c_str());
-
- // and single file1
- outf = prefix + mid_ext + "_single" + suffix + ".gz";
- ogzstream single_out1(outf.c_str());
-
- // and error file1
- ogzstream single_err_out1;
- ogzstream err_out1;
- if(uncorrected_out) {
- outf = prefix + "err_single" + suffix + ".gz";
- single_err_out1.open(outf.c_str());
- outf = prefix + "err" + suffix + ".gz";
- err_out1.open(outf.c_str());
- }
-
- // format output pair file2
- suffix_index = fqf2.rfind(".");
- if(suffix_index == -1) {
- prefix = fqf2+".";
- suffix = "";
- } else {
- prefix = fqf2.substr(0,suffix_index+1);
- suffix = fqf2.substr(suffix_index, fqf2.size()-suffix_index);
- }
- outf = prefix + mid_ext + suffix + ".gz";
- ogzstream pair_out2(outf.c_str());
-
- // and single file2
- outf = prefix + mid_ext + "_single" + suffix + ".gz";
- ogzstream single_out2(outf.c_str());
-
-    // and error file2
- ogzstream single_err_out2;
- ogzstream err_out2;
- if(uncorrected_out) {
- outf = prefix + "err_single" + suffix + ".gz";
- single_err_out2.open(outf.c_str());
- outf = prefix + "err" + suffix + ".gz";
- err_out2.open(outf.c_str());
- }
-
- combine_output_paired_stream(fqf1, fqf2, pair_out1, single_out1, single_err_out1, err_out1, pair_out2, single_out2, single_err_out2, err_out2);
-
- pair_out1.close();
- pair_out2.close();
- single_out1.close();
- single_out2.close();
-
- } else {
- // format output pair file1
- int suffix_index = fqf1.rfind(".");
- if(suffix_index == -1) {
- prefix = fqf1+".";
- suffix = "";
- } else {
- prefix = fqf1.substr(0,suffix_index+1);
- suffix = fqf1.substr(suffix_index, fqf1.size()-suffix_index);
- }
- string outf = prefix + mid_ext + suffix;
- ofstream pair_out1(outf.c_str());
-
- // and single file1
- outf = prefix + mid_ext + "_single" + suffix;
- ofstream single_out1(outf.c_str());
-
- // and error file1
- ofstream single_err_out1;
- ofstream err_out1;
- if(uncorrected_out) {
- outf = prefix + "err_single" + suffix;
- single_err_out1.open(outf.c_str());
- outf = prefix + "err" + suffix;
- err_out1.open(outf.c_str());
- }
-
- // format output pair file2
- suffix_index = fqf2.rfind(".");
- if(suffix_index == -1) {
-      prefix = fqf2+".";
- suffix = "";
- } else {
- prefix = fqf2.substr(0,suffix_index+1);
- suffix = fqf2.substr(suffix_index, fqf2.size()-suffix_index);
- }
- outf = prefix + mid_ext + suffix;
- ofstream pair_out2(outf.c_str());
-
- // and single file2
- outf = prefix + mid_ext + "_single" + suffix;
- ofstream single_out2(outf.c_str());
-
- // and error file2
- ofstream single_err_out2;
- ofstream err_out2;
- if(uncorrected_out) {
- outf = prefix + "err_single" + suffix;
- single_err_out2.open(outf.c_str());
- outf = prefix + "err" + suffix;
- err_out2.open(outf.c_str());
- }
-
- combine_output_paired_stream(fqf1, fqf2, pair_out1, single_out1, single_err_out1, err_out1, pair_out2, single_out2, single_err_out2, err_out2);
-
- pair_out1.close();
- pair_out2.close();
- single_out1.close();
- single_out2.close();
- }
-}
-
-
-////////////////////////////////////////////////////////////////////////////////
-// chunkify_fastq
-//
-// Determine start points and sequence counts for all
-// chunks to be processed in parallel.
-////////////////////////////////////////////////////////////////////////////////
-void chunkify_fastq(string fqf, vector<streampos> & starts, vector<unsigned long long> & counts) {
- // count number of sequences
- unsigned long long N = 0;
- ifstream reads_in(fqf.c_str());
- string toss;
- while(getline(reads_in, toss))
- N++;
- reads_in.close();
- N /= 4ULL;
-
- if(threads*chunks_per_thread > N) {
- // use 1 thread for everything
- counts.push_back(N);
- starts.push_back(0);
- omp_set_num_threads(1);
-
- } else {
- // determine counts per thread
- unsigned long long sum = 0;
- for(int i = 0; i < threads*chunks_per_thread-1; i++) {
- counts.push_back(N / (threads*chunks_per_thread));
- sum += counts.back();
- }
- counts.push_back(N - sum);
-
- // find start points
- reads_in.open(fqf.c_str());
- starts.push_back(reads_in.tellg());
- unsigned long long s = 0;
- unsigned int t = 0;
- while(getline(reads_in,toss)) {
- // sequence
- getline(reads_in, toss);
- // +
- getline(reads_in, toss);
- // quality
- getline(reads_in, toss);
-
- if(++s == counts[t] && t < counts.size()-1) {
- starts.push_back(reads_in.tellg());
- s = 0;
- t++;
- }
-
-    }
-
-    // set up parallelism, once the chunk boundaries are known
-    omp_set_num_threads(threads);
-  }
-}
-
-
-////////////////////////////////////////////////////////////
-// guess_quality_scale
-//
-// Guess the ASCII scale of the quality values by examining a sample
-// of reads: if any quality value is below 64, assume the ASCII-33
-// scale, otherwise assume ASCII-64.
-//
-// Assumes the file is unzipped.
-////////////////////////////////////////////////////////////
-void guess_quality_scale(string fqf) {
- string header, seq, mid, strqual;
- int reads_to_check = 10000;
- int reads_checked = 0;
- ifstream reads_in(fqf.c_str());
- while(getline(reads_in, header)) {
- getline(reads_in, seq);
- getline(reads_in, mid);
- getline(reads_in, strqual);
-
- for(int i = 0; i < strqual.size(); i++) {
- if(strqual[i] < 64) {
- cerr << "Guessing quality values are on ascii 33 scale" << endl;
- Read::quality_scale = 33;
- reads_in.close();
- return;
- }
- }
-
- if(++reads_checked >= reads_to_check)
- break;
- }
- reads_in.close();
- cerr << "Guessing quality values are on ascii 64 scale" << endl;
- Read::quality_scale = 64;
-}
-
-
-////////////////////////////////////////////////////////////////////////////////
-// parse_fastq
-//
-// Accept a single fastq file from input, or parse a file with names of fastq
-// files. For multiple files, attach a paired-end code to tell the correction
-// method how to handle each file.
-////////////////////////////////////////////////////////////////////////////////
-vector<string> parse_fastq(vector<string> & fastqfs, vector<int> & pairedend_codes) {
- if(file_of_fastqf != NULL) {
- ifstream ff(file_of_fastqf);
- vector<string> next_fastqf;
- string line;
-
- while(getline(ff, line) && line.size() > 0) {
- next_fastqf = split(line);
-
- if(next_fastqf.size() == 1) {
- fastqfs.push_back(next_fastqf[0]);
- pairedend_codes.push_back(0);
-
- } else if(next_fastqf.size() == 2) {
- fastqfs.push_back(next_fastqf[0]);
- fastqfs.push_back(next_fastqf[1]);
- pairedend_codes.push_back(1);
- pairedend_codes.push_back(2);
-
- } else {
- cerr << "File of fastq file names must have a single fastq file per line for single reads or two fastqf files per line separated by a space for paired end reads " << endl;
- exit(EXIT_FAILURE);
- }
- }
-
- } else {
- fastqfs.push_back(string(fastqf));
- pairedend_codes.push_back(0);
- }
-
- return fastqfs;
-}
-
-////////////////////////////////////////////////////////////////////////////////
-// quick_trim
-//
-// Trim the end of the read the way BWA does it.
-// Removes affected untrusted k-mers.
-// Returns the trimmed length.
-////////////////////////////////////////////////////////////////////////////////
-int quick_trim(string strqual, vector<int> & untrusted) {
- // find trim index
- int phredq;
- int current_trimfunc = 0;
- int max_trimfunc = 0;
- int trim_length = strqual.size();
- for(int i = strqual.size()-1; i >= 0; i--) {
- //phredq = floor(.5-10*log(1.0 - prob[i])/log(10));
- phredq = strqual[i] - Read::quality_scale;
- current_trimfunc += (trimq - phredq);
- if(current_trimfunc > max_trimfunc) {
- max_trimfunc = current_trimfunc;
- trim_length = i;
- }
- }
-
- // update untrusted
- for(int i = untrusted.size()-1; i >= 0; i--) {
- if(untrusted[i] > trim_length - bithash::k)
- untrusted.pop_back();
- }
-
- return trim_length;
-}
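quick_trim() above applies the BWA-style 3' trimming rule: starting from the last base it accumulates (trimq - q) and cuts at the position where that running sum is largest, then drops the untrusted k-mers that fall past the cut. A self-contained sketch of the same rule on a plain quality string, assuming ASCII-33 qualities and the default threshold of 3 (bwa_trim_length is an illustrative name, not part of this file):

#include <iostream>
#include <string>

// BWA-style 3' trimming: trim from the index that maximizes the
// suffix sum of (threshold - quality).
int bwa_trim_length(const std::string& qual, int threshold = 3, int offset = 33) {
  int running = 0, best = 0;
  int trim_length = (int)qual.size();
  for (int i = (int)qual.size() - 1; i >= 0; --i) {
    int phred = qual[i] - offset;
    running += threshold - phred;
    if (running > best) {
      best = running;
      trim_length = i;            // bases i..end get trimmed
    }
  }
  return trim_length;
}

int main() {
  // Ten high-quality bases ('I' = Q40) followed by four low-quality ones ('#' = Q2).
  std::string qual = "IIIIIIIIII####";
  std::cout << "keep the first " << bwa_trim_length(qual) << " bases\n";   // prints 10
  return 0;
}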
diff --git a/src/hammer/quake_count/quake_count.cpp b/src/hammer/quake_count/quake_count.cpp
deleted file mode 100644
index 0e35c6b..0000000
--- a/src/hammer/quake_count/quake_count.cpp
+++ /dev/null
@@ -1,241 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-/**
- * @file preproc.cpp
- * @author Alex Davydow
- * @version 1.0
- *
- * @section LICENSE
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation; either version 2 of
- * the License, or (at your option) any later version.
- *
- * @section DESCRIPTION
- *
- * For each k-mer this program calculates the number of occurrences
- * in the reads provided. The reads file is supposed to be in fastq
- * format.
- */
-
-#include "standard.hpp"
-
-#include <stdint.h>
-#include <cstdio>
-#include <cstdlib>
-#include <cassert>
-#include <string>
-#include <set>
-#include <unordered_map>
-#include <vector>
-#include <iostream>
-#include <iomanip>
-#include "io/ireadstream.hpp"
-#include "io/read.hpp"
-#include "sequence/seq.hpp"
-#include "kmer_freq_info.hpp"
-#include "valid_kmer_generator.hpp"
-#define SUPPRESS_UNUSED(X) ((void) (X))
-
-using std::string;
-using std::set;
-using std::vector;
-using std::unordered_map;
-using std::map;
-using std::ofstream;
-using std::ifstream;
-using std::pair;
-
-namespace {
-
-const uint32_t kK = 55;
-typedef Seq<kK> KMer;
-typedef unordered_map<KMer, KMerFreqInfo, KMer::hash> UnorderedMap;
-
-void print_time() {
- time_t rawtime;
- tm * ptm;
- time ( &rawtime );
- ptm = gmtime( &rawtime );
- std::cout << std::setfill('0') << "[ " << std::setw(2) << ptm->tm_hour << ":" << std::setw(2) << ptm->tm_min
- << ":" << std::setw(2) << ptm->tm_sec << " ] ";
-}
-
-#define LOG(a) print_time(); std::cout << a << std::endl
-
-/**
- * @variable Every kStep k-mer will appear in the log.
- */
-const int kStep = 1e5;
-
-struct Options {
- /**
- * @variable An offset for quality in a fastq file.
- */
- uint32_t qvoffset;
- string ifile;
- string ofile;
- uint32_t error_threshold;
- /**
- * @variable How many files will be used when splitting k-mers.
- */
- uint32_t file_number;
- bool q_mers;
- bool valid;
- Options()
- : qvoffset(0),
- ifile(""),
- ofile(""),
- error_threshold(0),
- file_number(3),
- q_mers(false),
- valid(true) {}
-};
-
-void PrintHelp(char *program_name) {
- printf("Usage: %s qvoffset ifile.fastq ofile.[q]cst file_number error_threshold [q]\n",
- program_name);
- printf("Where:\n");
- printf("\tqvoffset\tan offset of fastq quality data\n");
- printf("\tifile.fastq\tan input file with reads in fastq format\n");
- printf("\tofile.[q]cst\ta filename where k-mer statistics will be outputted\n");
- printf("\terror_threshold\tnucliotides with quality lower then threshold will be cut from the ends of reads\n");
- printf("\tfile_number\thow many files will be used when splitting k-mers\n");
- printf("\tq\t\tif you want to count q-mers instead of k-mers.\n");
-}
-
-Options ParseOptions(int argc, char *argv[]) {
- Options ret;
- if (argc != 6 && argc != 7) {
- ret.valid = false;
- } else {
- ret.qvoffset = atoi(argv[1]);
- ret.valid &= (ret.qvoffset >= 0 && ret.qvoffset <= 255);
- ret.ifile = argv[2];
- ret.ofile = argv[3];
- ret.error_threshold = atoi(argv[4]);
- ret.valid &= (ret.error_threshold >= 0 && ret.error_threshold <= 255);
- ret.file_number = atoi(argv[5]);
- if (argc == 7) {
- if (string(argv[6]) == "q") {
- ret.q_mers = true;
- } else {
- ret.valid = false;
- }
- }
- }
- return ret;
-}
-
-/**
- * This function reads the reads from the stream and splits them into
- * k-mers. The k-mers are then written to several files almost
- * uniformly. It is guaranteed that identical k-mers are written to
- * the same file.
- * @param ifs Stream to read the reads from.
- * @param ofiles Files to write the resulting k-mers to. They are
- * written in binary form.
- */
-void SplitToFiles(ireadstream ifs, vector<ofstream *> &ofiles,
- bool q_mers, uint8_t error_threshold) {
- uint32_t file_number = ofiles.size();
- uint64_t read_number = 0;
- while (!ifs.eof()) {
- ++read_number;
- if (read_number % kStep == 0) {
- LOG("Reading read " << read_number << ".");
- }
- Read r;
- ifs >> r;
- KMer::hash hash_function;
- for (ValidKMerGenerator<kK> gen(r, error_threshold); gen.HasMore(); gen.Next()) {
- KMer kmer = gen.kmer();
- if (KMer::less2()(!kmer, kmer)) {
- kmer = !kmer;
- }
- ofstream &cur_file = *ofiles[hash_function(kmer) % file_number];
- KMer::BinWrite(cur_file, kmer);
- if (q_mers) {
- double correct_probability = gen.correct_probability();
- cur_file.write((const char*) &correct_probability, sizeof(correct_probability));
- }
- }
- }
-}
-
-/**
- * This function reads k-mers and calculates the number of occurrences
- * of each of them.
- * @param ifile File with k-mers to process, as written by SplitToFiles.
- * @param ofile Output file. For each unique k-mer there will be a
- * line with the k-mer itself and the number of its occurrences.
- */
-template<typename KMerStatMap>
-void EvalFile(ifstream &ifile, FILE *ofile, bool q_mers) {
- KMerStatMap stat_map;
- char buffer[kK + 1];
- buffer[kK] = 0;
- KMer kmer;
- while (KMer::BinRead(ifile, &kmer)) {
- KMerFreqInfo &info = stat_map[kmer];
- if (q_mers) {
- double correct_probability = -1;
- ifile.read((char *) &correct_probability, sizeof(correct_probability));
-      assert(!ifile.fail());
- info.q_count += correct_probability;
- } else {
- info.count += 1;
- }
- }
- for (typename KMerStatMap::iterator it = stat_map.begin();
- it != stat_map.end(); ++it) {
- fprintf(ofile, "%s ", it->first.str().c_str());
- if (q_mers) {
- fprintf(ofile, "%f\n", it->second.q_count);
- } else {
- fprintf(ofile, "%d\n", it->second.count);
- }
- }
-}
-}
-
-int main(int argc, char *argv[]) {
- Options opts = ParseOptions(argc, argv);
- if (!opts.valid) {
- PrintHelp(argv[0]);
- return 1;
- }
- // BasicConfigurator::configure();
- LOG("Starting preproc: evaluating " << opts.ifile << ".");
- vector<ofstream*> ofiles(opts.file_number);
- for (uint32_t i = 0; i < opts.file_number; ++i) {
- char filename[50];
- snprintf(filename, sizeof(filename), "%u.kmer.part", i);
- ofiles[i] = new ofstream(filename);
- assert(!ofiles[i]->fail() && "Too many files to open");
- }
- SplitToFiles(ireadstream(opts.ifile, opts.qvoffset),
- ofiles, opts.q_mers, opts.error_threshold);
- for (uint32_t i = 0; i < opts.file_number; ++i) {
- delete ofiles[i];
- }
- FILE *ofile = fopen(opts.ofile.c_str(), "w");
- assert(ofile != NULL && "Too many files to open");
- for (uint32_t i = 0; i < opts.file_number; ++i) {
- char ifile_name[50];
- snprintf(ifile_name, sizeof(ifile_name), "%u.kmer.part", i);
- ifstream ifile(ifile_name);
- LOG("Processing " << ifile_name << ".");
- EvalFile<UnorderedMap>(ifile, ofile, opts.q_mers);
- LOG("Processed " << ifile_name << ".");
- }
- fclose(ofile);
- LOG("Preprocessing done. You can find results in " << opts.ofile << ".");
- return 0;
-}
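SplitToFiles() above canonicalizes every k-mer (keeping whichever of the k-mer and its reverse complement compares smaller under less2) and routes it to partition hash(kmer) % file_number, so both orientations of a k-mer always land in the same .kmer.part file and EvalFile() can count each partition independently. A minimal string-based sketch of that partitioning idea, standing in for Seq<kK> and KMer::hash (the names and the lexicographic ordering below are illustrative simplifications):

#include <algorithm>
#include <functional>
#include <iostream>
#include <string>

// Canonical form of a k-mer: the smaller of the k-mer and its reverse
// complement, so both orientations hash to the same partition.
std::string reverse_complement(std::string s) {
  std::reverse(s.begin(), s.end());
  for (char& c : s)
    c = (c == 'A') ? 'T' : (c == 'T') ? 'A' : (c == 'G') ? 'C' : 'G';
  return s;
}

size_t partition_of(const std::string& kmer, size_t file_number) {
  std::string canon = std::min(kmer, reverse_complement(kmer));
  return std::hash<std::string>()(canon) % file_number;
}

int main() {
  // The two orientations of the same 5-mer map to the same partition.
  std::cout << partition_of("ACGTT", 4) << " == " << partition_of("AACGT", 4) << "\n";
  return 0;
}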
diff --git a/src/hammer/quake_count/quake_count_17.cpp b/src/hammer/quake_count/quake_count_17.cpp
deleted file mode 100644
index d5638f9..0000000
--- a/src/hammer/quake_count/quake_count_17.cpp
+++ /dev/null
@@ -1,238 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-/**
- * @file preproc.cpp
- * @author Alex Davydow
- * @version 1.0
- *
- * @section LICENSE
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation; either version 2 of
- * the License, or (at your option) any later version.
- *
- * @section DESCRIPTION
- *
- * For each k-mer this program calculates the number of occurrences
- * in the reads provided. The reads file is supposed to be in fastq
- * format.
- */
-#include "standard.hpp"
-#include <stdint.h>
-#include <cstdio>
-#include <cstdlib>
-#include <cassert>
-#include <string>
-#include <set>
-#include <unordered_map>
-#include <vector>
-#include <iostream>
-#include <iomanip>
-#include "io/ireadstream.hpp"
-#include "io/read.hpp"
-#include "sequence/seq.hpp"
-#include "kmer_freq_info.hpp"
-#include "valid_kmer_generator.hpp"
-#define SUPPRESS_UNUSED(X) ((void) (X))
-
-using std::string;
-using std::set;
-using std::vector;
-using std::unordered_map;
-using std::map;
-using std::ofstream;
-using std::ifstream;
-
-namespace {
-
-const uint32_t kK = 17;
-typedef Seq<kK> KMer;
-typedef unordered_map<KMer, KMerFreqInfo, KMer::hash> UnorderedMap;
-
-void print_time() {
- time_t rawtime;
- tm * ptm;
- time ( &rawtime );
- ptm = gmtime( &rawtime );
- std::cout << std::setfill('0') << "[ " << std::setw(2) << ptm->tm_hour << ":" << std::setw(2) << ptm->tm_min
- << ":" << std::setw(2) << ptm->tm_sec << " ] ";
-}
-
-#define LOG(a) print_time(); std::cout << a << std::endl
-
-/**
- * @variable Every kStep k-mer will appear in the log.
- */
-const int kStep = 1e5;
-
-struct Options {
- /**
- * @variable An offset for quality in a fastq file.
- */
- uint32_t qvoffset;
- string ifile;
- string ofile;
- uint32_t error_threshold;
- /**
- * @variable How many files will be used when splitting k-mers.
- */
- uint32_t file_number;
- bool q_mers;
- bool valid;
- Options()
- : qvoffset(0),
- ifile(""),
- ofile(""),
- error_threshold(0),
- file_number(3),
- q_mers(false),
- valid(true) {}
-};
-
-void PrintHelp(char *program_name) {
- printf("Usage: %s qvoffset ifile.fastq ofile.[q]cst file_number error_threshold [q]\n",
- program_name);
- printf("Where:\n");
- printf("\tqvoffset\tan offset of fastq quality data\n");
- printf("\tifile.fastq\tan input file with reads in fastq format\n");
- printf("\tofile.[q]cst\ta filename where k-mer statistics will be outputted\n");
- printf("\terror_threshold\tnucliotides with quality lower then threshold will be cut from the ends of reads\n");
- printf("\tfile_number\thow many files will be used when splitting k-mers\n");
- printf("\tq\t\tif you want to count q-mers instead of k-mers.\n");
-}
-
-Options ParseOptions(int argc, char *argv[]) {
- Options ret;
- if (argc != 6 && argc != 7) {
- ret.valid = false;
- } else {
- ret.qvoffset = atoi(argv[1]);
- ret.valid &= (ret.qvoffset >= 0 && ret.qvoffset <= 255);
- ret.ifile = argv[2];
- ret.ofile = argv[3];
- ret.error_threshold = atoi(argv[4]);
- ret.valid &= (ret.error_threshold >= 0 && ret.error_threshold <= 255);
- ret.file_number = atoi(argv[5]);
- if (argc == 7) {
- if (string(argv[6]) == "q") {
- ret.q_mers = true;
- } else {
- ret.valid = false;
- }
- }
- }
- return ret;
-}
-
-/**
- * This function reads the reads from the stream and splits them into
- * k-mers. The k-mers are then written to several files almost
- * uniformly. It is guaranteed that identical k-mers are written to
- * the same file.
- * @param ifs Stream to read the reads from.
- * @param ofiles Files to write the resulting k-mers to. They are
- * written in binary form.
- */
-void SplitToFiles(ireadstream ifs, vector<ofstream *> &ofiles,
- bool q_mers, uint8_t error_threshold) {
- uint32_t file_number = ofiles.size();
- uint64_t read_number = 0;
- while (!ifs.eof()) {
- ++read_number;
- if (read_number % kStep == 0) {
- LOG("Reading read " << read_number << ".");
- }
- Read r;
- ifs >> r;
- KMer::hash hash_function;
- for (ValidKMerGenerator<kK> gen(r, error_threshold); gen.HasMore(); gen.Next()) {
- KMer kmer = gen.kmer();
- if (KMer::less2()(!kmer, kmer)) {
- kmer = !kmer;
- }
- ofstream &cur_file = *ofiles[hash_function(kmer) % file_number];
- KMer::BinWrite(cur_file, kmer);
- if (q_mers) {
- double correct_probability = gen.correct_probability();
- cur_file.write((const char*) &correct_probability, sizeof(correct_probability));
- }
- }
- }
-}
-
-/**
- * This function reads k-mers and calculates the number of occurrences
- * of each of them.
- * @param ifile File with k-mers to process, as written by SplitToFiles.
- * @param ofile Output file. For each unique k-mer there will be a
- * line with the k-mer itself and the number of its occurrences.
- */
-template<typename KMerStatMap>
-void EvalFile(ifstream &ifile, FILE *ofile, bool q_mers) {
- KMerStatMap stat_map;
- char buffer[kK + 1];
- buffer[kK] = 0;
- KMer kmer;
- while (KMer::BinRead(ifile, &kmer)) {
- KMerFreqInfo &info = stat_map[kmer];
- if (q_mers) {
- double correct_probability = -1;
- ifile.read((char *) &correct_probability, sizeof(correct_probability));
-      assert(!ifile.fail());
- info.q_count += correct_probability;
- } else {
- info.count += 1;
- }
- }
- for (typename KMerStatMap::iterator it = stat_map.begin();
- it != stat_map.end(); ++it) {
- fprintf(ofile, "%s ", it->first.str().c_str());
- if (q_mers) {
- fprintf(ofile, "%f\n", it->second.q_count);
- } else {
- fprintf(ofile, "%d\n", it->second.count);
- }
- }
-}
-}
-
-int main(int argc, char *argv[]) {
- Options opts = ParseOptions(argc, argv);
- if (!opts.valid) {
- PrintHelp(argv[0]);
- return 1;
- }
- // BasicConfigurator::configure();
- LOG("Starting preproc: evaluating " << opts.ifile << ".");
- vector<ofstream*> ofiles(opts.file_number);
- for (uint32_t i = 0; i < opts.file_number; ++i) {
- char filename[50];
- snprintf(filename, sizeof(filename), "%u.kmer.part", i);
- ofiles[i] = new ofstream(filename);
- assert(!ofiles[i]->fail() && "Too many files to open");
- }
- SplitToFiles(ireadstream(opts.ifile, opts.qvoffset),
- ofiles, opts.q_mers, opts.error_threshold);
- for (uint32_t i = 0; i < opts.file_number; ++i) {
- delete ofiles[i];
- }
- FILE *ofile = fopen(opts.ofile.c_str(), "w");
- assert(ofile != NULL && "Too many files to open");
- for (uint32_t i = 0; i < opts.file_number; ++i) {
- char ifile_name[50];
- snprintf(ifile_name, sizeof(ifile_name), "%u.kmer.part", i);
- ifstream ifile(ifile_name);
- LOG("Processing " << ifile_name << ".");
- EvalFile<UnorderedMap>(ifile, ofile, opts.q_mers);
- LOG("Processed " << ifile_name << ".");
- }
- fclose(ofile);
- LOG("Preprocessing done. You can find results in " << opts.ofile << ".");
- return 0;
-}
diff --git a/src/hammer/quake_count/quake_count_19.cpp b/src/hammer/quake_count/quake_count_19.cpp
deleted file mode 100644
index c17fab6..0000000
--- a/src/hammer/quake_count/quake_count_19.cpp
+++ /dev/null
@@ -1,238 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-/**
- * @file preproc.cpp
- * @author Alex Davydow
- * @version 1.0
- *
- * @section LICENSE
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation; either version 2 of
- * the License, or (at your option) any later version.
- *
- * @section DESCRIPTION
- *
- * For each k-mer this program calculates the number of occurrences
- * in the reads provided. The reads file is supposed to be in fastq
- * format.
- */
-#include "standard.hpp"
-#include <stdint.h>
-#include <cstdio>
-#include <cstdlib>
-#include <cassert>
-#include <string>
-#include <set>
-#include <unordered_map>
-#include <vector>
-#include <iostream>
-#include <iomanip>
-#include "io/ireadstream.hpp"
-#include "io/read.hpp"
-#include "sequence/seq.hpp"
-#include "kmer_freq_info.hpp"
-#include "valid_kmer_generator.hpp"
-#define SUPPRESS_UNUSED(X) ((void) (X))
-
-using std::string;
-using std::set;
-using std::vector;
-using std::unordered_map;
-using std::map;
-using std::ofstream;
-using std::ifstream;
-
-namespace {
-
-const uint32_t kK = 19;
-typedef Seq<kK> KMer;
-typedef unordered_map<KMer, KMerFreqInfo, KMer::hash> UnorderedMap;
-
-void print_time() {
- time_t rawtime;
- tm * ptm;
- time ( &rawtime );
- ptm = gmtime( &rawtime );
- std::cout << std::setfill('0') << "[ " << std::setw(2) << ptm->tm_hour << ":" << std::setw(2) << ptm->tm_min
- << ":" << std::setw(2) << ptm->tm_sec << " ] ";
-}
-
-#define LOG(a) print_time(); std::cout << a << std::endl
-
-/**
- * @variable Every kStep k-mer will appear in the log.
- */
-const int kStep = 1e5;
-
-struct Options {
- /**
- * @variable An offset for quality in a fastq file.
- */
- uint32_t qvoffset;
- string ifile;
- string ofile;
- uint32_t error_threshold;
- /**
- * @variable How many files will be used when splitting k-mers.
- */
- uint32_t file_number;
- bool q_mers;
- bool valid;
- Options()
- : qvoffset(0),
- ifile(""),
- ofile(""),
- error_threshold(0),
- file_number(3),
- q_mers(false),
- valid(true) {}
-};
-
-void PrintHelp(char *program_name) {
- printf("Usage: %s qvoffset ifile.fastq ofile.[q]cst file_number error_threshold [q]\n",
- program_name);
- printf("Where:\n");
- printf("\tqvoffset\tan offset of fastq quality data\n");
- printf("\tifile.fastq\tan input file with reads in fastq format\n");
- printf("\tofile.[q]cst\ta filename where k-mer statistics will be outputted\n");
- printf("\terror_threshold\tnucliotides with quality lower then threshold will be cut from the ends of reads\n");
- printf("\tfile_number\thow many files will be used when splitting k-mers\n");
- printf("\tq\t\tif you want to count q-mers instead of k-mers.\n");
-}
-
-Options ParseOptions(int argc, char *argv[]) {
- Options ret;
- if (argc != 6 && argc != 7) {
- ret.valid = false;
- } else {
- ret.qvoffset = atoi(argv[1]);
- ret.valid &= (ret.qvoffset >= 0 && ret.qvoffset <= 255);
- ret.ifile = argv[2];
- ret.ofile = argv[3];
- ret.error_threshold = atoi(argv[4]);
- ret.valid &= (ret.error_threshold >= 0 && ret.error_threshold <= 255);
- ret.file_number = atoi(argv[5]);
- if (argc == 7) {
- if (string(argv[6]) == "q") {
- ret.q_mers = true;
- } else {
- ret.valid = false;
- }
- }
- }
- return ret;
-}
-
-/**
- * This function reads the reads from the stream and splits them into
- * k-mers. The k-mers are then written to several files almost
- * uniformly. It is guaranteed that identical k-mers are written to
- * the same file.
- * @param ifs Stream to read the reads from.
- * @param ofiles Files to write the resulting k-mers to. They are
- * written in binary form.
- */
-void SplitToFiles(ireadstream ifs, vector<ofstream *> &ofiles,
- bool q_mers, uint8_t error_threshold) {
- uint32_t file_number = ofiles.size();
- uint64_t read_number = 0;
- while (!ifs.eof()) {
- ++read_number;
- if (read_number % kStep == 0) {
- LOG("Reading read " << read_number << ".");
- }
- Read r;
- ifs >> r;
- KMer::hash hash_function;
- for (ValidKMerGenerator<kK> gen(r, error_threshold); gen.HasMore(); gen.Next()) {
- KMer kmer = gen.kmer();
- if (KMer::less2()(!kmer, kmer)) {
- kmer = !kmer;
- }
- ofstream &cur_file = *ofiles[hash_function(kmer) % file_number];
- KMer::BinWrite(cur_file, kmer);
- if (q_mers) {
- double correct_probability = gen.correct_probability();
- cur_file.write((const char*) &correct_probability, sizeof(correct_probability));
- }
- }
- }
-}
-
-/**
- * This function reads k-mers and calculates the number of occurrences
- * of each of them.
- * @param ifile File with k-mers to process, as written by SplitToFiles.
- * @param ofile Output file. For each unique k-mer there will be a
- * line with the k-mer itself and the number of its occurrences.
- */
-template<typename KMerStatMap>
-void EvalFile(ifstream &ifile, FILE *ofile, bool q_mers) {
- KMerStatMap stat_map;
- char buffer[kK + 1];
- buffer[kK] = 0;
- KMer kmer;
- while (KMer::BinRead(ifile, &kmer)) {
- KMerFreqInfo &info = stat_map[kmer];
- if (q_mers) {
- double correct_probability = -1;
- ifile.read((char *) &correct_probability, sizeof(correct_probability));
-      assert(!ifile.fail());
- info.q_count += correct_probability;
- } else {
- info.count += 1;
- }
- }
- for (typename KMerStatMap::iterator it = stat_map.begin();
- it != stat_map.end(); ++it) {
- fprintf(ofile, "%s ", it->first.str().c_str());
- if (q_mers) {
- fprintf(ofile, "%f\n", it->second.q_count);
- } else {
- fprintf(ofile, "%d\n", it->second.count);
- }
- }
-}
-}
-
-int main(int argc, char *argv[]) {
- Options opts = ParseOptions(argc, argv);
- if (!opts.valid) {
- PrintHelp(argv[0]);
- return 1;
- }
- // BasicConfigurator::configure();
- LOG("Starting preproc: evaluating " << opts.ifile << ".");
- vector<ofstream*> ofiles(opts.file_number);
- for (uint32_t i = 0; i < opts.file_number; ++i) {
- char filename[50];
- snprintf(filename, sizeof(filename), "%u.kmer.part", i);
- ofiles[i] = new ofstream(filename);
- assert(!ofiles[i]->fail() && "Too many files to open");
- }
- SplitToFiles(ireadstream(opts.ifile, opts.qvoffset),
- ofiles, opts.q_mers, opts.error_threshold);
- for (uint32_t i = 0; i < opts.file_number; ++i) {
- delete ofiles[i];
- }
- FILE *ofile = fopen(opts.ofile.c_str(), "w");
- assert(ofile != NULL && "Too many files to open");
- for (uint32_t i = 0; i < opts.file_number; ++i) {
- char ifile_name[50];
- snprintf(ifile_name, sizeof(ifile_name), "%u.kmer.part", i);
- ifstream ifile(ifile_name);
- LOG("Processing " << ifile_name << ".");
- EvalFile<UnorderedMap>(ifile, ofile, opts.q_mers);
- LOG("Processed " << ifile_name << ".");
- }
- fclose(ofile);
- LOG("Preprocessing done. You can find results in " << opts.ofile << ".");
- return 0;
-}
diff --git a/src/hammer/quake_count/quake_count_21.cpp b/src/hammer/quake_count/quake_count_21.cpp
deleted file mode 100644
index aec291d..0000000
--- a/src/hammer/quake_count/quake_count_21.cpp
+++ /dev/null
@@ -1,238 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-/**
- * @file preproc.cpp
- * @author Alex Davydow
- * @version 1.0
- *
- * @section LICENSE
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation; either version 2 of
- * the License, or (at your option) any later version.
- *
- * @section DESCRIPTION
- *
- * For each k-mer this program calculates the number of occurrences
- * in the reads provided. The reads file is supposed to be in fastq
- * format.
- */
-#include "standard.hpp"
-#include <stdint.h>
-#include <cstdio>
-#include <cstdlib>
-#include <cassert>
-#include <string>
-#include <set>
-#include <unordered_map>
-#include <vector>
-#include <iostream>
-#include <iomanip>
-#include "io/ireadstream.hpp"
-#include "io/read.hpp"
-#include "sequence/seq.hpp"
-#include "kmer_freq_info.hpp"
-#include "valid_kmer_generator.hpp"
-#define SUPPRESS_UNUSED(X) ((void) (X))
-
-using std::string;
-using std::set;
-using std::vector;
-using std::unordered_map;
-using std::map;
-using std::ofstream;
-using std::ifstream;
-
-namespace {
-
-const uint32_t kK = 21;
-typedef Seq<kK> KMer;
-typedef unordered_map<KMer, KMerFreqInfo, KMer::hash> UnorderedMap;
-
-void print_time() {
- time_t rawtime;
- tm * ptm;
- time ( &rawtime );
- ptm = gmtime( &rawtime );
- std::cout << std::setfill('0') << "[ " << std::setw(2) << ptm->tm_hour << ":" << std::setw(2) << ptm->tm_min
- << ":" << std::setw(2) << ptm->tm_sec << " ] ";
-}
-
-#define LOG(a) print_time(); std::cout << a << std::endl
-
-/**
- * @variable Every kStep k-mer will appear in the log.
- */
-const int kStep = 1e5;
-
-struct Options {
- /**
- * @variable An offset for quality in a fastq file.
- */
- uint32_t qvoffset;
- string ifile;
- string ofile;
- uint32_t error_threshold;
- /**
- * @variable How many files will be used when splitting k-mers.
- */
- uint32_t file_number;
- bool q_mers;
- bool valid;
- Options()
- : qvoffset(0),
- ifile(""),
- ofile(""),
- error_threshold(0),
- file_number(3),
- q_mers(false),
- valid(true) {}
-};
-
-void PrintHelp(char *program_name) {
- printf("Usage: %s qvoffset ifile.fastq ofile.[q]cst file_number error_threshold [q]\n",
- program_name);
- printf("Where:\n");
- printf("\tqvoffset\tan offset of fastq quality data\n");
- printf("\tifile.fastq\tan input file with reads in fastq format\n");
- printf("\tofile.[q]cst\ta filename where k-mer statistics will be outputted\n");
- printf("\terror_threshold\tnucliotides with quality lower then threshold will be cut from the ends of reads\n");
- printf("\tfile_number\thow many files will be used when splitting k-mers\n");
- printf("\tq\t\tif you want to count q-mers instead of k-mers.\n");
-}
-
-Options ParseOptions(int argc, char *argv[]) {
- Options ret;
- if (argc != 6 && argc != 7) {
- ret.valid = false;
- } else {
- ret.qvoffset = atoi(argv[1]);
- ret.valid &= (ret.qvoffset >= 0 && ret.qvoffset <= 255);
- ret.ifile = argv[2];
- ret.ofile = argv[3];
- ret.error_threshold = atoi(argv[4]);
- ret.valid &= (ret.error_threshold >= 0 && ret.error_threshold <= 255);
- ret.file_number = atoi(argv[5]);
- if (argc == 7) {
- if (string(argv[6]) == "q") {
- ret.q_mers = true;
- } else {
- ret.valid = false;
- }
- }
- }
- return ret;
-}
-
-/**
- * This function reads the reads from the stream and splits them into
- * k-mers. The k-mers are then written to several files almost
- * uniformly. It is guaranteed that identical k-mers are written to
- * the same file.
- * @param ifs Stream to read the reads from.
- * @param ofiles Files to write the resulting k-mers to. They are
- * written in binary form.
- */
-void SplitToFiles(ireadstream ifs, vector<ofstream *> &ofiles,
- bool q_mers, uint8_t error_threshold) {
- uint32_t file_number = ofiles.size();
- uint64_t read_number = 0;
- while (!ifs.eof()) {
- ++read_number;
- if (read_number % kStep == 0) {
- LOG("Reading read " << read_number << ".");
- }
- Read r;
- ifs >> r;
- KMer::hash hash_function;
- for (ValidKMerGenerator<kK> gen(r, error_threshold); gen.HasMore(); gen.Next()) {
- KMer kmer = gen.kmer();
- if (KMer::less2()(!kmer, kmer)) {
- kmer = !kmer;
- }
- ofstream &cur_file = *ofiles[hash_function(kmer) % file_number];
- KMer::BinWrite(cur_file, kmer);
- if (q_mers) {
- double correct_probability = gen.correct_probability();
- cur_file.write((const char*) &correct_probability, sizeof(correct_probability));
- }
- }
- }
-}
-
-/**
- * This function reads k-mers and calculates the number of occurrences
- * of each of them.
- * @param ifile File with k-mers to process, as written by SplitToFiles.
- * @param ofile Output file. For each unique k-mer there will be a
- * line with the k-mer itself and the number of its occurrences.
- */
-template<typename KMerStatMap>
-void EvalFile(ifstream &ifile, FILE *ofile, bool q_mers) {
- KMerStatMap stat_map;
- char buffer[kK + 1];
- buffer[kK] = 0;
- KMer kmer;
- while (KMer::BinRead(ifile, &kmer)) {
- KMerFreqInfo &info = stat_map[kmer];
- if (q_mers) {
- double correct_probability = -1;
- ifile.read((char *) &correct_probability, sizeof(correct_probability));
-      assert(!ifile.fail());
- info.q_count += correct_probability;
- } else {
- info.count += 1;
- }
- }
- for (typename KMerStatMap::iterator it = stat_map.begin();
- it != stat_map.end(); ++it) {
- fprintf(ofile, "%s ", it->first.str().c_str());
- if (q_mers) {
- fprintf(ofile, "%f\n", it->second.q_count);
- } else {
- fprintf(ofile, "%d\n", it->second.count);
- }
- }
-}
-}
-
-int main(int argc, char *argv[]) {
- Options opts = ParseOptions(argc, argv);
- if (!opts.valid) {
- PrintHelp(argv[0]);
- return 1;
- }
- // BasicConfigurator::configure();
- LOG("Starting preproc: evaluating " << opts.ifile << ".");
- vector<ofstream*> ofiles(opts.file_number);
- for (uint32_t i = 0; i < opts.file_number; ++i) {
- char filename[50];
- snprintf(filename, sizeof(filename), "%u.kmer.part", i);
- ofiles[i] = new ofstream(filename);
- assert(!ofiles[i]->fail() && "Too many files to open");
- }
- SplitToFiles(ireadstream(opts.ifile, opts.qvoffset),
- ofiles, opts.q_mers, opts.error_threshold);
- for (uint32_t i = 0; i < opts.file_number; ++i) {
- delete ofiles[i];
- }
- FILE *ofile = fopen(opts.ofile.c_str(), "w");
- assert(ofile != NULL && "Too many files to open");
- for (uint32_t i = 0; i < opts.file_number; ++i) {
- char ifile_name[50];
- snprintf(ifile_name, sizeof(ifile_name), "%u.kmer.part", i);
- ifstream ifile(ifile_name);
- LOG("Processing " << ifile_name << ".");
- EvalFile<UnorderedMap>(ifile, ofile, opts.q_mers);
- LOG("Processed " << ifile_name << ".");
- }
- fclose(ofile);
- LOG("Preprocessing done. You can find results in " << opts.ofile << ".");
- return 0;
-}
diff --git a/src/hammer/quake_count/quake_count_25.cpp b/src/hammer/quake_count/quake_count_25.cpp
deleted file mode 100644
index 1390d41..0000000
--- a/src/hammer/quake_count/quake_count_25.cpp
+++ /dev/null
@@ -1,238 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-/**
- * @file preproc.cpp
- * @author Alex Davydow
- * @version 1.0
- *
- * @section LICENSE
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation; either version 2 of
- * the License, or (at your option) any later version.
- *
- * @section DESCRIPTION
- *
- * For each k-mer this program calculates the number of occurrences
- * in the reads provided. The reads file is supposed to be in fastq
- * format.
- */
-#include "standard.hpp"
-#include <stdint.h>
-#include <cstdio>
-#include <cstdlib>
-#include <cassert>
-#include <string>
-#include <set>
-#include <unordered_map>
-#include <vector>
-#include <iostream>
-#include <iomanip>
-#include "io/ireadstream.hpp"
-#include "io/read.hpp"
-#include "sequence/seq.hpp"
-#include "kmer_freq_info.hpp"
-#include "valid_kmer_generator.hpp"
-#define SUPPRESS_UNUSED(X) ((void) (X))
-
-using std::string;
-using std::set;
-using std::vector;
-using std::unordered_map;
-using std::map;
-using std::ofstream;
-using std::ifstream;
-
-namespace {
-
-const uint32_t kK = 25;
-typedef Seq<kK> KMer;
-typedef unordered_map<KMer, KMerFreqInfo, KMer::hash> UnorderedMap;
-
-void print_time() {
- time_t rawtime;
- tm * ptm;
- time ( &rawtime );
- ptm = gmtime( &rawtime );
- std::cout << std::setfill('0') << "[ " << std::setw(2) << ptm->tm_hour << ":" << std::setw(2) << ptm->tm_min
- << ":" << std::setw(2) << ptm->tm_sec << " ] ";
-}
-
-#define LOG(a) print_time(); std::cout << a << std::endl
-
-/**
- * @variable Every kStep k-mer will appear in the log.
- */
-const int kStep = 1e5;
-
-struct Options {
- /**
- * @variable An offset for quality in a fastq file.
- */
- uint32_t qvoffset;
- string ifile;
- string ofile;
- uint32_t error_threshold;
- /**
- * @variable How many files will be used when splitting k-mers.
- */
- uint32_t file_number;
- bool q_mers;
- bool valid;
- Options()
- : qvoffset(0),
- ifile(""),
- ofile(""),
- error_threshold(0),
- file_number(3),
- q_mers(false),
- valid(true) {}
-};
-
-void PrintHelp(char *program_name) {
- printf("Usage: %s qvoffset ifile.fastq ofile.[q]cst file_number error_threshold [q]\n",
- program_name);
- printf("Where:\n");
- printf("\tqvoffset\tan offset of fastq quality data\n");
- printf("\tifile.fastq\tan input file with reads in fastq format\n");
- printf("\tofile.[q]cst\ta filename where k-mer statistics will be outputted\n");
- printf("\terror_threshold\tnucliotides with quality lower then threshold will be cut from the ends of reads\n");
- printf("\tfile_number\thow many files will be used when splitting k-mers\n");
- printf("\tq\t\tif you want to count q-mers instead of k-mers.\n");
-}
-
-Options ParseOptions(int argc, char *argv[]) {
- Options ret;
- if (argc != 6 && argc != 7) {
- ret.valid = false;
- } else {
- ret.qvoffset = atoi(argv[1]);
- ret.valid &= (ret.qvoffset >= 0 && ret.qvoffset <= 255);
- ret.ifile = argv[2];
- ret.ofile = argv[3];
- ret.error_threshold = atoi(argv[4]);
- ret.valid &= (ret.error_threshold >= 0 && ret.error_threshold <= 255);
- ret.file_number = atoi(argv[5]);
- if (argc == 7) {
- if (string(argv[6]) == "q") {
- ret.q_mers = true;
- } else {
- ret.valid = false;
- }
- }
- }
- return ret;
-}
-
-/**
- * This function reads the reads from the stream and splits them into
- * k-mers. The k-mers are then written to several files almost
- * uniformly. It is guaranteed that identical k-mers are written to
- * the same file.
- * @param ifs Stream to read the reads from.
- * @param ofiles Files to write the resulting k-mers to. They are
- * written in binary form.
- */
-void SplitToFiles(ireadstream ifs, vector<ofstream *> &ofiles,
- bool q_mers, uint8_t error_threshold) {
- uint32_t file_number = ofiles.size();
- uint64_t read_number = 0;
- while (!ifs.eof()) {
- ++read_number;
- if (read_number % kStep == 0) {
- LOG("Reading read " << read_number << ".");
- }
- Read r;
- ifs >> r;
- KMer::hash hash_function;
- for (ValidKMerGenerator<kK> gen(r, error_threshold); gen.HasMore(); gen.Next()) {
- KMer kmer = gen.kmer();
- if (KMer::less2()(!kmer, kmer)) {
- kmer = !kmer;
- }
- ofstream &cur_file = *ofiles[hash_function(kmer) % file_number];
- KMer::BinWrite(cur_file, kmer);
- if (q_mers) {
- double correct_probability = gen.correct_probability();
- cur_file.write((const char*) &correct_probability, sizeof(correct_probability));
- }
- }
- }
-}
-
-/**
- * This function reads k-mers and calculates the number of occurrences
- * of each of them.
- * @param ifile File with k-mers to process, as written by SplitToFiles.
- * @param ofile Output file. For each unique k-mer there will be a
- * line with the k-mer itself and the number of its occurrences.
- */
-template<typename KMerStatMap>
-void EvalFile(ifstream &ifile, FILE *ofile, bool q_mers) {
- KMerStatMap stat_map;
- char buffer[kK + 1];
- buffer[kK] = 0;
- KMer kmer;
- while (KMer::BinRead(ifile, &kmer)) {
- KMerFreqInfo &info = stat_map[kmer];
- if (q_mers) {
- double correct_probability = -1;
- ifile.read((char *) &correct_probability, sizeof(correct_probability));
-      assert(!ifile.fail());
- info.q_count += correct_probability;
- } else {
- info.count += 1;
- }
- }
- for (typename KMerStatMap::iterator it = stat_map.begin();
- it != stat_map.end(); ++it) {
- fprintf(ofile, "%s ", it->first.str().c_str());
- if (q_mers) {
- fprintf(ofile, "%f\n", it->second.q_count);
- } else {
- fprintf(ofile, "%d\n", it->second.count);
- }
- }
-}
-}
-
-int main(int argc, char *argv[]) {
- Options opts = ParseOptions(argc, argv);
- if (!opts.valid) {
- PrintHelp(argv[0]);
- return 1;
- }
- // BasicConfigurator::configure();
- LOG("Starting preproc: evaluating " << opts.ifile << ".");
- vector<ofstream*> ofiles(opts.file_number);
- for (uint32_t i = 0; i < opts.file_number; ++i) {
- char filename[50];
- snprintf(filename, sizeof(filename), "%u.kmer.part", i);
- ofiles[i] = new ofstream(filename);
- assert(!ofiles[i]->fail() && "Too many files to open");
- }
- SplitToFiles(ireadstream(opts.ifile, opts.qvoffset),
- ofiles, opts.q_mers, opts.error_threshold);
- for (uint32_t i = 0; i < opts.file_number; ++i) {
- delete ofiles[i];
- }
- FILE *ofile = fopen(opts.ofile.c_str(), "w");
- assert(ofile != NULL && "Too many files to open");
- for (uint32_t i = 0; i < opts.file_number; ++i) {
- char ifile_name[50];
- snprintf(ifile_name, sizeof(ifile_name), "%u.kmer.part", i);
- ifstream ifile(ifile_name);
- LOG("Processing " << ifile_name << ".");
- EvalFile<UnorderedMap>(ifile, ofile, opts.q_mers);
- LOG("Processed " << ifile_name << ".");
- }
- fclose(ofile);
- LOG("Preprocessing done. You can find results in " << opts.ofile << ".");
- return 0;
-}
diff --git a/src/hammer/quake_count/quake_count_29.cpp b/src/hammer/quake_count/quake_count_29.cpp
deleted file mode 100644
index 6d5b6c8..0000000
--- a/src/hammer/quake_count/quake_count_29.cpp
+++ /dev/null
@@ -1,238 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-/**
- * @file preproc.cpp
- * @author Alex Davydow
- * @version 1.0
- *
- * @section LICENSE
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation; either version 2 of
- * the License, or (at your option) any later version.
- *
- * @section DESCRIPTION
- *
- * For each k-mer this program calculates the number of occurrences
- * in the reads provided. The reads file is supposed to be in fastq
- * format.
- */
-#include "standard.hpp"
-#include <stdint.h>
-#include <cstdio>
-#include <cstdlib>
-#include <cassert>
-#include <string>
-#include <set>
-#include <unordered_map>
-#include <vector>
-#include <iostream>
-#include <iomanip>
-#include "io/ireadstream.hpp"
-#include "io/read.hpp"
-#include "sequence/seq.hpp"
-#include "kmer_freq_info.hpp"
-#include "valid_kmer_generator.hpp"
-#define SUPPRESS_UNUSED(X) ((void) (X))
-
-using std::string;
-using std::set;
-using std::vector;
-using std::unordered_map;
-using std::map;
-using std::ofstream;
-using std::ifstream;
-
-namespace {
-
-const uint32_t kK = 29;
-typedef Seq<kK> KMer;
-typedef unordered_map<KMer, KMerFreqInfo, KMer::hash> UnorderedMap;
-
-void print_time() {
- time_t rawtime;
- tm * ptm;
- time ( &rawtime );
- ptm = gmtime( &rawtime );
- std::cout << std::setfill('0') << "[ " << std::setw(2) << ptm->tm_hour << ":" << std::setw(2) << ptm->tm_min
- << ":" << std::setw(2) << ptm->tm_sec << " ] ";
-}
-
-#define LOG(a) print_time(); std::cout << a << std::endl
-
-/**
- * @variable Every kStep k-mer will appear in the log.
- */
-const int kStep = 1e5;
-
-struct Options {
- /**
- * @variable An offset for quality in a fastq file.
- */
- uint32_t qvoffset;
- string ifile;
- string ofile;
- uint32_t error_threshold;
- /**
- * @variable How many files will be used when splitting k-mers.
- */
- uint32_t file_number;
- bool q_mers;
- bool valid;
- Options()
- : qvoffset(0),
- ifile(""),
- ofile(""),
- error_threshold(0),
- file_number(3),
- q_mers(false),
- valid(true) {}
-};
-
-void PrintHelp(char *program_name) {
- printf("Usage: %s qvoffset ifile.fastq ofile.[q]cst file_number error_threshold [q]\n",
- program_name);
- printf("Where:\n");
- printf("\tqvoffset\tan offset of fastq quality data\n");
- printf("\tifile.fastq\tan input file with reads in fastq format\n");
- printf("\tofile.[q]cst\ta filename where k-mer statistics will be outputted\n");
- printf("\terror_threshold\tnucliotides with quality lower then threshold will be cut from the ends of reads\n");
- printf("\tfile_number\thow many files will be used when splitting k-mers\n");
- printf("\tq\t\tif you want to count q-mers instead of k-mers.\n");
-}
-
-Options ParseOptions(int argc, char *argv[]) {
- Options ret;
- if (argc != 6 && argc != 7) {
- ret.valid = false;
- } else {
- ret.qvoffset = atoi(argv[1]);
- ret.valid &= (ret.qvoffset >= 0 && ret.qvoffset <= 255);
- ret.ifile = argv[2];
- ret.ofile = argv[3];
- ret.error_threshold = atoi(argv[4]);
- ret.valid &= (ret.error_threshold >= 0 && ret.error_threshold <= 255);
- ret.file_number = atoi(argv[5]);
- if (argc == 7) {
- if (string(argv[6]) == "q") {
- ret.q_mers = true;
- } else {
- ret.valid = false;
- }
- }
- }
- return ret;
-}
-
-/**
- * This function reads reads from the stream and splits them into
- * k-mers. The k-mers are then distributed almost uniformly across
- * several files. It is guaranteed that identical k-mers are always
- * written to the same file.
- * @param ifs Stream to read reads from.
- * @param ofiles Files to write the resulting k-mers to, one k-mer
- * per record.
- */
-void SplitToFiles(ireadstream ifs, vector<ofstream *> &ofiles,
- bool q_mers, uint8_t error_threshold) {
- uint32_t file_number = ofiles.size();
- uint64_t read_number = 0;
- while (!ifs.eof()) {
- ++read_number;
- if (read_number % kStep == 0) {
- LOG("Reading read " << read_number << ".");
- }
- Read r;
- ifs >> r;
- KMer::hash hash_function;
- for (ValidKMerGenerator<kK> gen(r, error_threshold); gen.HasMore(); gen.Next()) {
- KMer kmer = gen.kmer();
- if (KMer::less2()(!kmer, kmer)) {
- kmer = !kmer;
- }
- ofstream &cur_file = *ofiles[hash_function(kmer) % file_number];
- KMer::BinWrite(cur_file, kmer);
- if (q_mers) {
- double correct_probability = gen.correct_probability();
- cur_file.write((const char*) &correct_probability, sizeof(correct_probability));
- }
- }
- }
-}
-
-/**
- * This function reads k-mers and calculates the number of occurrences
- * of each of them.
- * @param ifile File with k-mers to process, one per record.
- * @param ofile Output file. For each unique k-mer there will be a
- * line with the k-mer itself and the number of its occurrences.
- */
-template<typename KMerStatMap>
-void EvalFile(ifstream &ifile, FILE *ofile, bool q_mers) {
- KMerStatMap stat_map;
- char buffer[kK + 1];
- buffer[kK] = 0;
- KMer kmer;
- while (KMer::BinRead(ifile, &kmer)) {
- KMerFreqInfo &info = stat_map[kmer];
- if (q_mers) {
- double correct_probability = -1;
- ifile.read((char *) &correct_probability, sizeof(correct_probability));
-      assert(!ifile.fail());
- info.q_count += correct_probability;
- } else {
- info.count += 1;
- }
- }
- for (typename KMerStatMap::iterator it = stat_map.begin();
- it != stat_map.end(); ++it) {
- fprintf(ofile, "%s ", it->first.str().c_str());
- if (q_mers) {
- fprintf(ofile, "%f\n", it->second.q_count);
- } else {
- fprintf(ofile, "%d\n", it->second.count);
- }
- }
-}
-}
-
-int main(int argc, char *argv[]) {
- Options opts = ParseOptions(argc, argv);
- if (!opts.valid) {
- PrintHelp(argv[0]);
- return 1;
- }
- // BasicConfigurator::configure();
- LOG("Starting preproc: evaluating " << opts.ifile << ".");
- vector<ofstream*> ofiles(opts.file_number);
- for (uint32_t i = 0; i < opts.file_number; ++i) {
- char filename[50];
- snprintf(filename, sizeof(filename), "%u.kmer.part", i);
- ofiles[i] = new ofstream(filename);
- assert(!ofiles[i]->fail() && "Too many files to open");
- }
- SplitToFiles(ireadstream(opts.ifile, opts.qvoffset),
- ofiles, opts.q_mers, opts.error_threshold);
- for (uint32_t i = 0; i < opts.file_number; ++i) {
- delete ofiles[i];
- }
- FILE *ofile = fopen(opts.ofile.c_str(), "w");
- assert(ofile != NULL && "Too many files to open");
- for (uint32_t i = 0; i < opts.file_number; ++i) {
- char ifile_name[50];
- snprintf(ifile_name, sizeof(ifile_name), "%u.kmer.part", i);
- ifstream ifile(ifile_name);
- LOG("Processing " << ifile_name << ".");
- EvalFile<UnorderedMap>(ifile, ofile, opts.q_mers);
- LOG("Processed " << ifile_name << ".");
- }
- fclose(ofile);
- LOG("Preprocessing done. You can find results in " << opts.ofile << ".");
- return 0;
-}
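The quake_count tools deleted in this commit all share the two-phase scheme shown above: SplitToFiles canonicalizes every k-mer (replacing it with its reverse complement when KMer::less2 ranks that form lower) and routes it to a part file by hash modulo the number of files, which is what guarantees that identical k-mers always land in the same part, and EvalFile then counts each part independently. The following standalone sketch illustrates only that routing invariant; it is not taken from the SPAdes sources and substitutes std::string k-mers, plain lexicographic comparison and std::hash for the Seq<kK> machinery:

    #include <algorithm>
    #include <functional>
    #include <iostream>
    #include <string>
    #include <vector>

    // Reverse complement of a DNA string (assumes only A/C/G/T occur).
    static std::string RevComp(const std::string &s) {
      std::string rc(s.rbegin(), s.rend());
      for (char &c : rc) {
        switch (c) {
          case 'A': c = 'T'; break;
          case 'T': c = 'A'; break;
          case 'G': c = 'C'; break;
          case 'C': c = 'G'; break;
        }
      }
      return rc;
    }

    int main() {
      const size_t file_number = 3;  // number of part files (the tools default to 3)
      std::vector<std::string> kmers = {"ACGT", "TTTT", "ACGT", "AAAA"};
      std::hash<std::string> hash_function;
      for (const std::string &kmer : kmers) {
        // Canonical form: the smaller of the k-mer and its reverse complement,
        // so a k-mer and its reverse complement are always counted together.
        std::string canon = std::min(kmer, RevComp(kmer));
        std::cout << kmer << " -> part " << hash_function(canon) % file_number << "\n";
      }
      return 0;
    }

Within one run, TTTT and AAAA are routed to the same part because they share the canonical form AAAA; the concrete part indices can differ between standard-library implementations, since std::hash is not specified to be stable.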
diff --git a/src/hammer/quake_count/quake_count_33.cpp b/src/hammer/quake_count/quake_count_33.cpp
deleted file mode 100644
index 0987692..0000000
--- a/src/hammer/quake_count/quake_count_33.cpp
+++ /dev/null
@@ -1,239 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-/**
- * @file quake_count_33.cpp
- * @author Alex Davydow
- * @version 1.0
- *
- * @section LICENSE
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation; either version 2 of
- * the License, or (at your option) any later version.
- *
- * @section DESCRIPTION
- *
- * For each k-mer this program calculates the number of occurrences
- * in the reads provided. The reads file is expected to be in FASTQ
- * format.
- */
-#include "standard.hpp"
-#include <stdint.h>
-#include <cstdio>
-#include <cstdlib>
-#include <cassert>
-#include <string>
-#include <set>
-#include <unordered_map>
-#include <vector>
-#include <iostream>
-#include <iomanip>
-#include "io/ireadstream.hpp"
-#include "io/read.hpp"
-#include "sequence/seq.hpp"
-#include "kmer_freq_info.hpp"
-#include "valid_kmer_generator.hpp"
-
-#define SUPPRESS_UNUSED(X) ((void) (X))
-
-using std::string;
-using std::set;
-using std::vector;
-using std::unordered_map;
-using std::map;
-using std::ofstream;
-using std::ifstream;
-
-namespace {
-
-const uint32_t kK = 33;
-typedef Seq<kK> KMer;
-typedef unordered_map<KMer, KMerFreqInfo, KMer::hash> UnorderedMap;
-
-void print_time() {
- time_t rawtime;
- tm * ptm;
- time ( &rawtime );
- ptm = gmtime( &rawtime );
- std::cout << std::setfill('0') << "[ " << std::setw(2) << ptm->tm_hour << ":" << std::setw(2) << ptm->tm_min
- << ":" << std::setw(2) << ptm->tm_sec << " ] ";
-}
-
-#define LOG(a) print_time(); std::cout << a << std::endl
-
-/**
- * @variable Every kStep-th read will be reported in the log.
- */
-const int kStep = 1e5;
-
-struct Options {
- /**
- * @variable An offset for quality in a fastq file.
- */
- uint32_t qvoffset;
- string ifile;
- string ofile;
- uint32_t error_threshold;
- /**
- * @variable How many files will be used when splitting k-mers.
- */
- uint32_t file_number;
- bool q_mers;
- bool valid;
- Options()
- : qvoffset(0),
- ifile(""),
- ofile(""),
- error_threshold(0),
- file_number(3),
- q_mers(false),
- valid(true) {}
-};
-
-void PrintHelp(char *program_name) {
- printf("Usage: %s qvoffset ifile.fastq ofile.[q]cst file_number error_threshold [q]\n",
- program_name);
- printf("Where:\n");
- printf("\tqvoffset\tan offset of fastq quality data\n");
- printf("\tifile.fastq\tan input file with reads in fastq format\n");
- printf("\tofile.[q]cst\ta filename where k-mer statistics will be outputted\n");
- printf("\terror_threshold\tnucliotides with quality lower then threshold will be cut from the ends of reads\n");
- printf("\tfile_number\thow many files will be used when splitting k-mers\n");
- printf("\tq\t\tif you want to count q-mers instead of k-mers.\n");
-}
-
-Options ParseOptions(int argc, char *argv[]) {
- Options ret;
- if (argc != 6 && argc != 7) {
- ret.valid = false;
- } else {
- ret.qvoffset = atoi(argv[1]);
- ret.valid &= (ret.qvoffset >= 0 && ret.qvoffset <= 255);
- ret.ifile = argv[2];
- ret.ofile = argv[3];
- ret.error_threshold = atoi(argv[4]);
- ret.valid &= (ret.error_threshold >= 0 && ret.error_threshold <= 255);
- ret.file_number = atoi(argv[5]);
- if (argc == 7) {
- if (string(argv[6]) == "q") {
- ret.q_mers = true;
- } else {
- ret.valid = false;
- }
- }
- }
- return ret;
-}
-
-/**
- * This function reads reads from the stream and splits them into
- * k-mers. The k-mers are then distributed almost uniformly across
- * several files. It is guaranteed that identical k-mers are always
- * written to the same file.
- * @param ifs Stream to read reads from.
- * @param ofiles Files to write the resulting k-mers to, one k-mer
- * per record.
- */
-void SplitToFiles(ireadstream ifs, vector<ofstream *> &ofiles,
- bool q_mers, uint8_t error_threshold) {
- uint32_t file_number = ofiles.size();
- uint64_t read_number = 0;
- while (!ifs.eof()) {
- ++read_number;
- if (read_number % kStep == 0) {
- LOG("Reading read " << read_number << ".");
- }
- Read r;
- ifs >> r;
- KMer::hash hash_function;
- for (ValidKMerGenerator<kK> gen(r, error_threshold); gen.HasMore(); gen.Next()) {
- KMer kmer = gen.kmer();
- if (KMer::less2()(!kmer, kmer)) {
- kmer = !kmer;
- }
- ofstream &cur_file = *ofiles[hash_function(kmer) % file_number];
- KMer::BinWrite(cur_file, kmer);
- if (q_mers) {
- double correct_probability = gen.correct_probability();
- cur_file.write((const char*) &correct_probability, sizeof(correct_probability));
- }
- }
- }
-}
-
-/**
- * This function reads k-mers and calculates the number of occurrences
- * of each of them.
- * @param ifile File with k-mers to process, one per record.
- * @param ofile Output file. For each unique k-mer there will be a
- * line with the k-mer itself and the number of its occurrences.
- */
-template<typename KMerStatMap>
-void EvalFile(ifstream &ifile, FILE *ofile, bool q_mers) {
- KMerStatMap stat_map;
- char buffer[kK + 1];
- buffer[kK] = 0;
- KMer kmer;
- while (KMer::BinRead(ifile, &kmer)) {
- KMerFreqInfo &info = stat_map[kmer];
- if (q_mers) {
- double correct_probability = -1;
- ifile.read((char *) &correct_probability, sizeof(correct_probability));
-      assert(!ifile.fail());
- info.q_count += correct_probability;
- } else {
- info.count += 1;
- }
- }
- for (typename KMerStatMap::iterator it = stat_map.begin();
- it != stat_map.end(); ++it) {
- fprintf(ofile, "%s ", it->first.str().c_str());
- if (q_mers) {
- fprintf(ofile, "%f\n", it->second.q_count);
- } else {
- fprintf(ofile, "%d\n", it->second.count);
- }
- }
-}
-}
-
-int main(int argc, char *argv[]) {
- Options opts = ParseOptions(argc, argv);
- if (!opts.valid) {
- PrintHelp(argv[0]);
- return 1;
- }
- // BasicConfigurator::configure();
- LOG("Starting preproc: evaluating " << opts.ifile << ".");
- vector<ofstream*> ofiles(opts.file_number);
- for (uint32_t i = 0; i < opts.file_number; ++i) {
- char filename[50];
- snprintf(filename, sizeof(filename), "%u.kmer.part", i);
- ofiles[i] = new ofstream(filename);
- assert(!ofiles[i]->fail() && "Too many files to open");
- }
- SplitToFiles(ireadstream(opts.ifile, opts.qvoffset),
- ofiles, opts.q_mers, opts.error_threshold);
- for (uint32_t i = 0; i < opts.file_number; ++i) {
- delete ofiles[i];
- }
- FILE *ofile = fopen(opts.ofile.c_str(), "w");
- assert(ofile != NULL && "Too many files to open");
- for (uint32_t i = 0; i < opts.file_number; ++i) {
- char ifile_name[50];
- snprintf(ifile_name, sizeof(ifile_name), "%u.kmer.part", i);
- ifstream ifile(ifile_name);
- LOG("Processing " << ifile_name << ".");
- EvalFile<UnorderedMap>(ifile, ofile, opts.q_mers);
- LOG("Processed " << ifile_name << ".");
- }
- fclose(ofile);
- LOG("Preprocessing done. You can find results in " << opts.ofile << ".");
- return 0;
-}
diff --git a/src/hammer/quake_count/quake_count_37.cpp b/src/hammer/quake_count/quake_count_37.cpp
deleted file mode 100644
index b86140d..0000000
--- a/src/hammer/quake_count/quake_count_37.cpp
+++ /dev/null
@@ -1,238 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-/**
- * @file quake_count_37.cpp
- * @author Alex Davydow
- * @version 1.0
- *
- * @section LICENSE
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation; either version 2 of
- * the License, or (at your option) any later version.
- *
- * @section DESCRIPTION
- *
- * For each k-mer this program calculates the number of occurrences
- * in the reads provided. The reads file is expected to be in FASTQ
- * format.
- */
-#include "standard.hpp"
-#include <stdint.h>
-#include <cstdio>
-#include <cstdlib>
-#include <cassert>
-#include <string>
-#include <set>
-#include <unordered_map>
-#include <vector>
-#include <iostream>
-#include <iomanip>
-#include "io/ireadstream.hpp"
-#include "io/read.hpp"
-#include "sequence/seq.hpp"
-#include "kmer_freq_info.hpp"
-#include "valid_kmer_generator.hpp"
-#define SUPPRESS_UNUSED(X) ((void) (X))
-
-using std::string;
-using std::set;
-using std::vector;
-using std::unordered_map;
-using std::map;
-using std::ofstream;
-using std::ifstream;
-
-namespace {
-
-const uint32_t kK = 37;
-typedef Seq<kK> KMer;
-typedef unordered_map<KMer, KMerFreqInfo, KMer::hash> UnorderedMap;
-
-void print_time() {
- time_t rawtime;
- tm * ptm;
- time ( &rawtime );
- ptm = gmtime( &rawtime );
- std::cout << std::setfill('0') << "[ " << std::setw(2) << ptm->tm_hour << ":" << std::setw(2) << ptm->tm_min
- << ":" << std::setw(2) << ptm->tm_sec << " ] ";
-}
-
-#define LOG(a) print_time(); std::cout << a << std::endl
-
-/**
- * @variable Every kStep-th read will be reported in the log.
- */
-const int kStep = 1e5;
-
-struct Options {
- /**
- * @variable An offset for quality in a fastq file.
- */
- uint32_t qvoffset;
- string ifile;
- string ofile;
- uint32_t error_threshold;
- /**
- * @variable How many files will be used when splitting k-mers.
- */
- uint32_t file_number;
- bool q_mers;
- bool valid;
- Options()
- : qvoffset(0),
- ifile(""),
- ofile(""),
- error_threshold(0),
- file_number(3),
- q_mers(false),
- valid(true) {}
-};
-
-void PrintHelp(char *program_name) {
- printf("Usage: %s qvoffset ifile.fastq ofile.[q]cst file_number error_threshold [q]\n",
- program_name);
- printf("Where:\n");
- printf("\tqvoffset\tan offset of fastq quality data\n");
- printf("\tifile.fastq\tan input file with reads in fastq format\n");
- printf("\tofile.[q]cst\ta filename where k-mer statistics will be outputted\n");
- printf("\terror_threshold\tnucliotides with quality lower then threshold will be cut from the ends of reads\n");
- printf("\tfile_number\thow many files will be used when splitting k-mers\n");
- printf("\tq\t\tif you want to count q-mers instead of k-mers.\n");
-}
-
-Options ParseOptions(int argc, char *argv[]) {
- Options ret;
- if (argc != 6 && argc != 7) {
- ret.valid = false;
- } else {
- ret.qvoffset = atoi(argv[1]);
- ret.valid &= (ret.qvoffset >= 0 && ret.qvoffset <= 255);
- ret.ifile = argv[2];
- ret.ofile = argv[3];
- ret.error_threshold = atoi(argv[4]);
- ret.valid &= (ret.error_threshold >= 0 && ret.error_threshold <= 255);
- ret.file_number = atoi(argv[5]);
- if (argc == 7) {
- if (string(argv[6]) == "q") {
- ret.q_mers = true;
- } else {
- ret.valid = false;
- }
- }
- }
- return ret;
-}
-
-/**
- * This function reads reads from the stream and splits them into
- * k-mers. The k-mers are then distributed almost uniformly across
- * several files. It is guaranteed that identical k-mers are always
- * written to the same file.
- * @param ifs Stream to read reads from.
- * @param ofiles Files to write the resulting k-mers to, one k-mer
- * per record.
- */
-void SplitToFiles(ireadstream ifs, vector<ofstream *> &ofiles,
- bool q_mers, uint8_t error_threshold) {
- uint32_t file_number = ofiles.size();
- uint64_t read_number = 0;
- while (!ifs.eof()) {
- ++read_number;
- if (read_number % kStep == 0) {
- LOG("Reading read " << read_number << ".");
- }
- Read r;
- ifs >> r;
- KMer::hash hash_function;
- for (ValidKMerGenerator<kK> gen(r, error_threshold); gen.HasMore(); gen.Next()) {
- KMer kmer = gen.kmer();
- if (KMer::less2()(!kmer, kmer)) {
- kmer = !kmer;
- }
- ofstream &cur_file = *ofiles[hash_function(kmer) % file_number];
- KMer::BinWrite(cur_file, kmer);
- if (q_mers) {
- double correct_probability = gen.correct_probability();
- cur_file.write((const char*) &correct_probability, sizeof(correct_probability));
- }
- }
- }
-}
-
-/**
- * This function reads k-mers and calculates the number of occurrences
- * of each of them.
- * @param ifile File with k-mers to process, one per record.
- * @param ofile Output file. For each unique k-mer there will be a
- * line with the k-mer itself and the number of its occurrences.
- */
-template<typename KMerStatMap>
-void EvalFile(ifstream &ifile, FILE *ofile, bool q_mers) {
- KMerStatMap stat_map;
- char buffer[kK + 1];
- buffer[kK] = 0;
- KMer kmer;
- while (KMer::BinRead(ifile, &kmer)) {
- KMerFreqInfo &info = stat_map[kmer];
- if (q_mers) {
- double correct_probability = -1;
- ifile.read((char *) &correct_probability, sizeof(correct_probability));
-      assert(!ifile.fail());
- info.q_count += correct_probability;
- } else {
- info.count += 1;
- }
- }
- for (typename KMerStatMap::iterator it = stat_map.begin();
- it != stat_map.end(); ++it) {
- fprintf(ofile, "%s ", it->first.str().c_str());
- if (q_mers) {
- fprintf(ofile, "%f\n", it->second.q_count);
- } else {
- fprintf(ofile, "%d\n", it->second.count);
- }
- }
-}
-}
-
-int main(int argc, char *argv[]) {
- Options opts = ParseOptions(argc, argv);
- if (!opts.valid) {
- PrintHelp(argv[0]);
- return 1;
- }
- // BasicConfigurator::configure();
- LOG("Starting preproc: evaluating " << opts.ifile << ".");
- vector<ofstream*> ofiles(opts.file_number);
- for (uint32_t i = 0; i < opts.file_number; ++i) {
- char filename[50];
- snprintf(filename, sizeof(filename), "%u.kmer.part", i);
- ofiles[i] = new ofstream(filename);
- assert(!ofiles[i]->fail() && "Too many files to open");
- }
- SplitToFiles(ireadstream(opts.ifile, opts.qvoffset),
- ofiles, opts.q_mers, opts.error_threshold);
- for (uint32_t i = 0; i < opts.file_number; ++i) {
- delete ofiles[i];
- }
- FILE *ofile = fopen(opts.ofile.c_str(), "w");
- assert(ofile != NULL && "Too many files to open");
- for (uint32_t i = 0; i < opts.file_number; ++i) {
- char ifile_name[50];
- snprintf(ifile_name, sizeof(ifile_name), "%u.kmer.part", i);
- ifstream ifile(ifile_name);
- LOG("Processing " << ifile_name << ".");
- EvalFile<UnorderedMap>(ifile, ofile, opts.q_mers);
- LOG("Processed " << ifile_name << ".");
- }
- fclose(ofile);
- LOG("Preprocessing done. You can find results in " << opts.ofile << ".");
- return 0;
-}
diff --git a/src/hammer/quake_count/quake_count_45.cpp b/src/hammer/quake_count/quake_count_45.cpp
deleted file mode 100644
index 7439d82..0000000
--- a/src/hammer/quake_count/quake_count_45.cpp
+++ /dev/null
@@ -1,238 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-/**
- * @file quake_count_45.cpp
- * @author Alex Davydow
- * @version 1.0
- *
- * @section LICENSE
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation; either version 2 of
- * the License, or (at your option) any later version.
- *
- * @section DESCRIPTION
- *
- * For each k-mer this program calculates the number of occurrences
- * in the reads provided. The reads file is expected to be in FASTQ
- * format.
- */
-#include "standard.hpp"
-#include <stdint.h>
-#include <cstdio>
-#include <cstdlib>
-#include <cassert>
-#include <string>
-#include <set>
-#include <unordered_map>
-#include <vector>
-#include <iostream>
-#include <iomanip>
-#include "io/ireadstream.hpp"
-#include "io/read.hpp"
-#include "sequence/seq.hpp"
-#include "kmer_freq_info.hpp"
-#include "valid_kmer_generator.hpp"
-#define SUPPRESS_UNUSED(X) ((void) (X))
-
-using std::string;
-using std::set;
-using std::vector;
-using std::unordered_map;
-using std::map;
-using std::ofstream;
-using std::ifstream;
-
-namespace {
-
-const uint32_t kK = 45;
-typedef Seq<kK> KMer;
-typedef unordered_map<KMer, KMerFreqInfo, KMer::hash> UnorderedMap;
-
-void print_time() {
- time_t rawtime;
- tm * ptm;
- time ( &rawtime );
- ptm = gmtime( &rawtime );
- std::cout << std::setfill('0') << "[ " << std::setw(2) << ptm->tm_hour << ":" << std::setw(2) << ptm->tm_min
- << ":" << std::setw(2) << ptm->tm_sec << " ] ";
-}
-
-#define LOG(a) print_time(); std::cout << a << std::endl
-
-/**
- * @variable Every kStep-th read will be reported in the log.
- */
-const int kStep = 1e5;
-
-struct Options {
- /**
- * @variable An offset for quality in a fastq file.
- */
- uint32_t qvoffset;
- string ifile;
- string ofile;
- uint32_t error_threshold;
- /**
- * @variable How many files will be used when splitting k-mers.
- */
- uint32_t file_number;
- bool q_mers;
- bool valid;
- Options()
- : qvoffset(0),
- ifile(""),
- ofile(""),
- error_threshold(0),
- file_number(3),
- q_mers(false),
- valid(true) {}
-};
-
-void PrintHelp(char *program_name) {
- printf("Usage: %s qvoffset ifile.fastq ofile.[q]cst file_number error_threshold [q]\n",
- program_name);
- printf("Where:\n");
- printf("\tqvoffset\tan offset of fastq quality data\n");
- printf("\tifile.fastq\tan input file with reads in fastq format\n");
- printf("\tofile.[q]cst\ta filename where k-mer statistics will be outputted\n");
- printf("\terror_threshold\tnucliotides with quality lower then threshold will be cut from the ends of reads\n");
- printf("\tfile_number\thow many files will be used when splitting k-mers\n");
- printf("\tq\t\tif you want to count q-mers instead of k-mers.\n");
-}
-
-Options ParseOptions(int argc, char *argv[]) {
- Options ret;
- if (argc != 6 && argc != 7) {
- ret.valid = false;
- } else {
- ret.qvoffset = atoi(argv[1]);
- ret.valid &= (ret.qvoffset >= 0 && ret.qvoffset <= 255);
- ret.ifile = argv[2];
- ret.ofile = argv[3];
- ret.error_threshold = atoi(argv[4]);
- ret.valid &= (ret.error_threshold >= 0 && ret.error_threshold <= 255);
- ret.file_number = atoi(argv[5]);
- if (argc == 7) {
- if (string(argv[6]) == "q") {
- ret.q_mers = true;
- } else {
- ret.valid = false;
- }
- }
- }
- return ret;
-}
-
-/**
- * This function reads reads from the stream and splits them into
- * k-mers. The k-mers are then distributed almost uniformly across
- * several files. It is guaranteed that identical k-mers are always
- * written to the same file.
- * @param ifs Stream to read reads from.
- * @param ofiles Files to write the resulting k-mers to, one k-mer
- * per record.
- */
-void SplitToFiles(ireadstream ifs, vector<ofstream *> &ofiles,
- bool q_mers, uint8_t error_threshold) {
- uint32_t file_number = ofiles.size();
- uint64_t read_number = 0;
- while (!ifs.eof()) {
- ++read_number;
- if (read_number % kStep == 0) {
- LOG("Reading read " << read_number << ".");
- }
- Read r;
- ifs >> r;
- KMer::hash hash_function;
- for (ValidKMerGenerator<kK> gen(r, error_threshold); gen.HasMore(); gen.Next()) {
- KMer kmer = gen.kmer();
- if (KMer::less2()(!kmer, kmer)) {
- kmer = !kmer;
- }
- ofstream &cur_file = *ofiles[hash_function(kmer) % file_number];
- KMer::BinWrite(cur_file, kmer);
- if (q_mers) {
- double correct_probability = gen.correct_probability();
- cur_file.write((const char*) &correct_probability, sizeof(correct_probability));
- }
- }
- }
-}
-
-/**
- * This function reads k-mers and calculates the number of occurrences
- * of each of them.
- * @param ifile File with k-mers to process, one per record.
- * @param ofile Output file. For each unique k-mer there will be a
- * line with the k-mer itself and the number of its occurrences.
- */
-template<typename KMerStatMap>
-void EvalFile(ifstream &ifile, FILE *ofile, bool q_mers) {
- KMerStatMap stat_map;
- char buffer[kK + 1];
- buffer[kK] = 0;
- KMer kmer;
- while (KMer::BinRead(ifile, &kmer)) {
- KMerFreqInfo &info = stat_map[kmer];
- if (q_mers) {
- double correct_probability = -1;
- ifile.read((char *) &correct_probability, sizeof(correct_probability));
-      assert(!ifile.fail());
- info.q_count += correct_probability;
- } else {
- info.count += 1;
- }
- }
- for (typename KMerStatMap::iterator it = stat_map.begin();
- it != stat_map.end(); ++it) {
- fprintf(ofile, "%s ", it->first.str().c_str());
- if (q_mers) {
- fprintf(ofile, "%f\n", it->second.q_count);
- } else {
- fprintf(ofile, "%d\n", it->second.count);
- }
- }
-}
-}
-
-int main(int argc, char *argv[]) {
- Options opts = ParseOptions(argc, argv);
- if (!opts.valid) {
- PrintHelp(argv[0]);
- return 1;
- }
- // BasicConfigurator::configure();
- LOG("Starting preproc: evaluating " << opts.ifile << ".");
- vector<ofstream*> ofiles(opts.file_number);
- for (uint32_t i = 0; i < opts.file_number; ++i) {
- char filename[50];
- snprintf(filename, sizeof(filename), "%u.kmer.part", i);
- ofiles[i] = new ofstream(filename);
- assert(!ofiles[i]->fail() && "Too many files to open");
- }
- SplitToFiles(ireadstream(opts.ifile, opts.qvoffset),
- ofiles, opts.q_mers, opts.error_threshold);
- for (uint32_t i = 0; i < opts.file_number; ++i) {
- delete ofiles[i];
- }
- FILE *ofile = fopen(opts.ofile.c_str(), "w");
- assert(ofile != NULL && "Too many files to open");
- for (uint32_t i = 0; i < opts.file_number; ++i) {
- char ifile_name[50];
- snprintf(ifile_name, sizeof(ifile_name), "%u.kmer.part", i);
- ifstream ifile(ifile_name);
- LOG("Processing " << ifile_name << ".");
- EvalFile<UnorderedMap>(ifile, ofile, opts.q_mers);
- LOG("Processed " << ifile_name << ".");
- }
- fclose(ofile);
- LOG("Preprocessing done. You can find results in " << opts.ofile << ".");
- return 0;
-}
diff --git a/src/hammer/quake_count/quake_count_55.cpp b/src/hammer/quake_count/quake_count_55.cpp
deleted file mode 100644
index a7af547..0000000
--- a/src/hammer/quake_count/quake_count_55.cpp
+++ /dev/null
@@ -1,240 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-/**
- * @file quake_count_55.cpp
- * @author Alex Davydow
- * @version 1.0
- *
- * @section LICENSE
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation; either version 2 of
- * the License, or (at your option) any later version.
- *
- * @section DESCRIPTION
- *
- * For each k-mer this program calculates the number of occurrences
- * in the reads provided. The reads file is expected to be in FASTQ
- * format.
- */
-#include "standard.hpp"
-#include <stdint.h>
-#include <cstdio>
-#include <cstdlib>
-#include <cassert>
-#include <string>
-#include <set>
-#include <unordered_map>
-#include <vector>
-#include <iostream>
-#include <iomanip>
-#include "io/ireadstream.hpp"
-#include "io/read.hpp"
-#include "sequence/seq.hpp"
-#include "kmer_freq_info.hpp"
-#include "valid_kmer_generator.hpp"
-#define SUPPRESS_UNUSED(X) ((void) (X))
-
-using std::string;
-using std::set;
-using std::vector;
-using std::unordered_map;
-using std::map;
-using std::ofstream;
-using std::ifstream;
-
-namespace {
-
-const uint32_t kK = 55;
-typedef Seq<kK> KMer;
-typedef unordered_map<KMer, KMerFreqInfo, KMer::hash> UnorderedMap;
-
-void print_time() {
- time_t rawtime;
- tm * ptm;
- time ( &rawtime );
- ptm = gmtime( &rawtime );
- std::cout << std::setfill('0') << "[ " << std::setw(2) << ptm->tm_hour << ":" << std::setw(2) << ptm->tm_min
- << ":" << std::setw(2) << ptm->tm_sec << " ] ";
-}
-
-#define LOG(a) print_time(); std::cout << a << std::endl
-
-/**
- * @variable Every kStep-th read will be reported in the log.
- */
-const int kStep = 1e5;
-
-struct Options {
- /**
- * @variable An offset for quality in a fastq file.
- */
- uint32_t qvoffset;
- string ifile;
- string ofile;
- uint32_t error_threshold;
- /**
- * @variable How many files will be used when splitting k-mers.
- */
- uint32_t file_number;
- bool q_mers;
- bool valid;
- Options()
- : qvoffset(0),
- ifile(""),
- ofile(""),
- error_threshold(0),
- file_number(3),
- q_mers(false),
- valid(true) {}
-};
-
-void PrintHelp(char *program_name) {
- printf("Usage: %s qvoffset ifile.fastq ofile.[q]cst file_number error_threshold [q]\n",
- program_name);
- printf("Where:\n");
- printf("\tqvoffset\tan offset of fastq quality data\n");
- printf("\tifile.fastq\tan input file with reads in fastq format\n");
- printf("\tofile.[q]cst\ta filename where k-mer statistics will be outputted\n");
- printf("\terror_threshold\tnucliotides with quality lower then threshold will be cut from the ends of reads\n");
- printf("\tfile_number\thow many files will be used when splitting k-mers\n");
- printf("\tq\t\tif you want to count q-mers instead of k-mers.\n");
-}
-
-Options ParseOptions(int argc, char *argv[]) {
- Options ret;
- if (argc != 6 && argc != 7) {
- ret.valid = false;
- } else {
- ret.qvoffset = atoi(argv[1]);
- ret.valid &= (ret.qvoffset >= 0 && ret.qvoffset <= 255);
- ret.ifile = argv[2];
- ret.ofile = argv[3];
- ret.error_threshold = atoi(argv[4]);
- ret.valid &= (ret.error_threshold >= 0 && ret.error_threshold <= 255);
- ret.file_number = atoi(argv[5]);
- if (argc == 7) {
- if (string(argv[6]) == "q") {
- ret.q_mers = true;
- } else {
- ret.valid = false;
- }
- }
- }
- return ret;
-}
-
-/**
- * This function reads reads from the stream and splits them into
- * k-mers. The k-mers are then distributed almost uniformly across
- * several files. It is guaranteed that identical k-mers are always
- * written to the same file.
- * @param ifs Stream to read reads from.
- * @param ofiles Files to write the resulting k-mers to, one k-mer
- * per record.
- */
-void SplitToFiles(ireadstream ifs, vector<ofstream *> &ofiles,
- bool q_mers, uint8_t error_threshold) {
- uint32_t file_number = ofiles.size();
- uint64_t read_number = 0;
- while (!ifs.eof()) {
- ++read_number;
- if (read_number % kStep == 0) {
- LOG("Reading read " << read_number << ".");
- }
- Read r;
- ifs >> r;
- //cout << r.getSequenceString() << endl;
- KMer::hash hash_function;
- for (ValidKMerGenerator<kK> gen(r, error_threshold); gen.HasMore(); gen.Next()) {
- KMer kmer = gen.kmer();
- //cout << kmer.str() << endl;
- if (KMer::less2()(!kmer, kmer)) {
- kmer = !kmer;
- }
- ofstream &cur_file = *ofiles[hash_function(kmer) % file_number];
- KMer::BinWrite(cur_file, kmer);
- if (q_mers) {
- double correct_probability = gen.correct_probability();
- cur_file.write((const char*) &correct_probability, sizeof(correct_probability));
- }
- }
- }
-}
-
-/**
- * This function reads k-mers and calculates the number of occurrences
- * of each of them.
- * @param ifile File with k-mers to process, one per record.
- * @param ofile Output file. For each unique k-mer there will be a
- * line with the k-mer itself and the number of its occurrences.
- */
-template<typename KMerStatMap>
-void EvalFile(ifstream &ifile, FILE *ofile, bool q_mers) {
- KMerStatMap stat_map;
- char buffer[kK + 1];
- buffer[kK] = 0;
- KMer kmer;
- while (KMer::BinRead(ifile, &kmer)) {
- KMerFreqInfo &info = stat_map[kmer];
- if (q_mers) {
- double correct_probability = -1;
- ifile.read((char *) &correct_probability, sizeof(correct_probability));
-      assert(!ifile.fail());
- info.q_count += correct_probability;
- } else {
- info.count += 1;
- }
- }
- for (typename KMerStatMap::iterator it = stat_map.begin();
- it != stat_map.end(); ++it) {
- fprintf(ofile, "%s ", it->first.str().c_str());
- if (q_mers) {
- fprintf(ofile, "%f\n", it->second.q_count);
- } else {
- fprintf(ofile, "%d\n", it->second.count);
- }
- }
-}
-}
-
-int main(int argc, char *argv[]) {
- Options opts = ParseOptions(argc, argv);
- if (!opts.valid) {
- PrintHelp(argv[0]);
- return 1;
- }
- // BasicConfigurator::configure();
- LOG("Starting preproc: evaluating " << opts.ifile << ".");
- vector<ofstream*> ofiles(opts.file_number);
- for (uint32_t i = 0; i < opts.file_number; ++i) {
- char filename[50];
- snprintf(filename, sizeof(filename), "%u.kmer.part", i);
- ofiles[i] = new ofstream(filename);
- assert(!ofiles[i]->fail() && "Too many files to open");
- }
- SplitToFiles(ireadstream(opts.ifile, opts.qvoffset),
- ofiles, opts.q_mers, opts.error_threshold);
- for (uint32_t i = 0; i < opts.file_number; ++i) {
- delete ofiles[i];
- }
- FILE *ofile = fopen(opts.ofile.c_str(), "w");
- assert(ofile != NULL && "Too many files to open");
- for (uint32_t i = 0; i < opts.file_number; ++i) {
- char ifile_name[50];
- snprintf(ifile_name, sizeof(ifile_name), "%u.kmer.part", i);
- ifstream ifile(ifile_name);
- LOG("Processing " << ifile_name << ".");
- EvalFile<UnorderedMap>(ifile, ofile, opts.q_mers);
- LOG("Processed " << ifile_name << ".");
- }
- fclose(ofile);
- LOG("Preprocessing done. You can find results in " << opts.ofile << ".");
- return 0;
-}
diff --git a/src/hammer/quake_count/quake_count_65.cpp b/src/hammer/quake_count/quake_count_65.cpp
deleted file mode 100644
index d4f83ff..0000000
--- a/src/hammer/quake_count/quake_count_65.cpp
+++ /dev/null
@@ -1,238 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-/**
- * @file quake_count_65.cpp
- * @author Alex Davydow
- * @version 1.0
- *
- * @section LICENSE
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation; either version 2 of
- * the License, or (at your option) any later version.
- *
- * @section DESCRIPTION
- *
- * For each k-mer this program calculates the number of occurrences
- * in the reads provided. The reads file is expected to be in FASTQ
- * format.
- */
-#include "standard.hpp"
-#include <stdint.h>
-#include <cstdio>
-#include <cstdlib>
-#include <cassert>
-#include <string>
-#include <set>
-#include <unordered_map>
-#include <vector>
-#include <iostream>
-#include <iomanip>
-#include "io/ireadstream.hpp"
-#include "io/read.hpp"
-#include "sequence/seq.hpp"
-#include "kmer_freq_info.hpp"
-#include "valid_kmer_generator.hpp"
-#define SUPPRESS_UNUSED(X) ((void) (X))
-
-using std::string;
-using std::set;
-using std::vector;
-using std::unordered_map;
-using std::map;
-using std::ofstream;
-using std::ifstream;
-
-namespace {
-
-const uint32_t kK = 65;
-typedef Seq<kK> KMer;
-typedef unordered_map<KMer, KMerFreqInfo, KMer::hash> UnorderedMap;
-
-void print_time() {
- time_t rawtime;
- tm * ptm;
- time ( &rawtime );
- ptm = gmtime( &rawtime );
- std::cout << std::setfill('0') << "[ " << std::setw(2) << ptm->tm_hour << ":" << std::setw(2) << ptm->tm_min
- << ":" << std::setw(2) << ptm->tm_sec << " ] ";
-}
-
-#define LOG(a) print_time(); std::cout << a << std::endl
-
-/**
- * @variable Every kStep-th read will be reported in the log.
- */
-const int kStep = 1e5;
-
-struct Options {
- /**
- * @variable An offset for quality in a fastq file.
- */
- uint32_t qvoffset;
- string ifile;
- string ofile;
- uint32_t error_threshold;
- /**
- * @variable How many files will be used when splitting k-mers.
- */
- uint32_t file_number;
- bool q_mers;
- bool valid;
- Options()
- : qvoffset(0),
- ifile(""),
- ofile(""),
- error_threshold(0),
- file_number(3),
- q_mers(false),
- valid(true) {}
-};
-
-void PrintHelp(char *program_name) {
- printf("Usage: %s qvoffset ifile.fastq ofile.[q]cst file_number error_threshold [q]\n",
- program_name);
- printf("Where:\n");
- printf("\tqvoffset\tan offset of fastq quality data\n");
- printf("\tifile.fastq\tan input file with reads in fastq format\n");
- printf("\tofile.[q]cst\ta filename where k-mer statistics will be outputted\n");
- printf("\terror_threshold\tnucliotides with quality lower then threshold will be cut from the ends of reads\n");
- printf("\tfile_number\thow many files will be used when splitting k-mers\n");
- printf("\tq\t\tif you want to count q-mers instead of k-mers.\n");
-}
-
-Options ParseOptions(int argc, char *argv[]) {
- Options ret;
- if (argc != 6 && argc != 7) {
- ret.valid = false;
- } else {
- ret.qvoffset = atoi(argv[1]);
- ret.valid &= (ret.qvoffset >= 0 && ret.qvoffset <= 255);
- ret.ifile = argv[2];
- ret.ofile = argv[3];
- ret.error_threshold = atoi(argv[4]);
- ret.valid &= (ret.error_threshold >= 0 && ret.error_threshold <= 255);
- ret.file_number = atoi(argv[5]);
- if (argc == 7) {
- if (string(argv[6]) == "q") {
- ret.q_mers = true;
- } else {
- ret.valid = false;
- }
- }
- }
- return ret;
-}
-
-/**
- * This function reads reads from the stream and splits them into
- * k-mers. The k-mers are then distributed almost uniformly across
- * several files. It is guaranteed that identical k-mers are always
- * written to the same file.
- * @param ifs Stream to read reads from.
- * @param ofiles Files to write the resulting k-mers to, one k-mer
- * per record.
- */
-void SplitToFiles(ireadstream ifs, vector<ofstream *> &ofiles,
- bool q_mers, uint8_t error_threshold) {
- uint32_t file_number = ofiles.size();
- uint64_t read_number = 0;
- while (!ifs.eof()) {
- ++read_number;
- if (read_number % kStep == 0) {
- LOG("Reading read " << read_number << ".");
- }
- Read r;
- ifs >> r;
- KMer::hash hash_function;
- for (ValidKMerGenerator<kK> gen(r, error_threshold); gen.HasMore(); gen.Next()) {
- KMer kmer = gen.kmer();
- if (KMer::less2()(!kmer, kmer)) {
- kmer = !kmer;
- }
- ofstream &cur_file = *ofiles[hash_function(kmer) % file_number];
- KMer::BinWrite(cur_file, kmer);
- if (q_mers) {
- double correct_probability = gen.correct_probability();
- cur_file.write((const char*) &correct_probability, sizeof(correct_probability));
- }
- }
- }
-}
-
-/**
- * This function reads k-mers and calculates the number of occurrences
- * of each of them.
- * @param ifile File with k-mers to process, one per record.
- * @param ofile Output file. For each unique k-mer there will be a
- * line with the k-mer itself and the number of its occurrences.
- */
-template<typename KMerStatMap>
-void EvalFile(ifstream &ifile, FILE *ofile, bool q_mers) {
- KMerStatMap stat_map;
- char buffer[kK + 1];
- buffer[kK] = 0;
- KMer kmer;
- while (KMer::BinRead(ifile, &kmer)) {
- KMerFreqInfo &info = stat_map[kmer];
- if (q_mers) {
- double correct_probability = -1;
- ifile.read((char *) &correct_probability, sizeof(correct_probability));
-      assert(!ifile.fail());
- info.q_count += correct_probability;
- } else {
- info.count += 1;
- }
- }
- for (typename KMerStatMap::iterator it = stat_map.begin();
- it != stat_map.end(); ++it) {
- fprintf(ofile, "%s ", it->first.str().c_str());
- if (q_mers) {
- fprintf(ofile, "%f\n", it->second.q_count);
- } else {
- fprintf(ofile, "%d\n", it->second.count);
- }
- }
-}
-}
-
-int main(int argc, char *argv[]) {
- Options opts = ParseOptions(argc, argv);
- if (!opts.valid) {
- PrintHelp(argv[0]);
- return 1;
- }
- // BasicConfigurator::configure();
- LOG("Starting preproc: evaluating " << opts.ifile << ".");
- vector<ofstream*> ofiles(opts.file_number);
- for (uint32_t i = 0; i < opts.file_number; ++i) {
- char filename[50];
- snprintf(filename, sizeof(filename), "%u.kmer.part", i);
- ofiles[i] = new ofstream(filename);
- assert(!ofiles[i]->fail() && "Too many files to open");
- }
- SplitToFiles(ireadstream(opts.ifile, opts.qvoffset),
- ofiles, opts.q_mers, opts.error_threshold);
- for (uint32_t i = 0; i < opts.file_number; ++i) {
- delete ofiles[i];
- }
- FILE *ofile = fopen(opts.ofile.c_str(), "w");
- assert(ofile != NULL && "Too many files to open");
- for (uint32_t i = 0; i < opts.file_number; ++i) {
- char ifile_name[50];
- snprintf(ifile_name, sizeof(ifile_name), "%u.kmer.part", i);
- ifstream ifile(ifile_name);
- LOG("Processing " << ifile_name << ".");
- EvalFile<UnorderedMap>(ifile, ofile, opts.q_mers);
- LOG("Processed " << ifile_name << ".");
- }
- fclose(ofile);
- LOG("Preprocessing done. You can find results in " << opts.ofile << ".");
- return 0;
-}
diff --git a/src/hammer/quake_count/quake_count_75.cpp b/src/hammer/quake_count/quake_count_75.cpp
deleted file mode 100644
index 5de86f6..0000000
--- a/src/hammer/quake_count/quake_count_75.cpp
+++ /dev/null
@@ -1,238 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-/**
- * @file quake_count_75.cpp
- * @author Alex Davydow
- * @version 1.0
- *
- * @section LICENSE
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation; either version 2 of
- * the License, or (at your option) any later version.
- *
- * @section DESCRIPTION
- *
- * For each k-mer this program calculates the number of occurrences
- * in the reads provided. The reads file is expected to be in FASTQ
- * format.
- */
-#include "standard.hpp"
-#include <stdint.h>
-#include <cstdio>
-#include <cstdlib>
-#include <cassert>
-#include <string>
-#include <set>
-#include <unordered_map>
-#include <vector>
-#include <iostream>
-#include <iomanip>
-#include "io/ireadstream.hpp"
-#include "io/read.hpp"
-#include "sequence/seq.hpp"
-#include "kmer_freq_info.hpp"
-#include "valid_kmer_generator.hpp"
-#define SUPPRESS_UNUSED(X) ((void) (X))
-
-using std::string;
-using std::set;
-using std::vector;
-using std::unordered_map;
-using std::map;
-using std::ofstream;
-using std::ifstream;
-
-namespace {
-
-const uint32_t kK = 75;
-typedef Seq<kK> KMer;
-typedef unordered_map<KMer, KMerFreqInfo, KMer::hash> UnorderedMap;
-
-void print_time() {
- time_t rawtime;
- tm * ptm;
- time ( &rawtime );
- ptm = gmtime( &rawtime );
- std::cout << std::setfill('0') << "[ " << std::setw(2) << ptm->tm_hour << ":" << std::setw(2) << ptm->tm_min
- << ":" << std::setw(2) << ptm->tm_sec << " ] ";
-}
-
-#define LOG(a) print_time(); std::cout << a << std::endl
-
-/**
- * @variable Every kStep-th read will be reported in the log.
- */
-const int kStep = 1e5;
-
-struct Options {
- /**
- * @variable An offset for quality in a fastq file.
- */
- uint32_t qvoffset;
- string ifile;
- string ofile;
- uint32_t error_threshold;
- /**
- * @variable How many files will be used when splitting k-mers.
- */
- uint32_t file_number;
- bool q_mers;
- bool valid;
- Options()
- : qvoffset(0),
- ifile(""),
- ofile(""),
- error_threshold(0),
- file_number(3),
- q_mers(false),
- valid(true) {}
-};
-
-void PrintHelp(char *program_name) {
- printf("Usage: %s qvoffset ifile.fastq ofile.[q]cst file_number error_threshold [q]\n",
- program_name);
- printf("Where:\n");
- printf("\tqvoffset\tan offset of fastq quality data\n");
- printf("\tifile.fastq\tan input file with reads in fastq format\n");
- printf("\tofile.[q]cst\ta filename where k-mer statistics will be outputted\n");
- printf("\terror_threshold\tnucliotides with quality lower then threshold will be cut from the ends of reads\n");
- printf("\tfile_number\thow many files will be used when splitting k-mers\n");
- printf("\tq\t\tif you want to count q-mers instead of k-mers.\n");
-}
-
-Options ParseOptions(int argc, char *argv[]) {
- Options ret;
- if (argc != 6 && argc != 7) {
- ret.valid = false;
- } else {
- ret.qvoffset = atoi(argv[1]);
- ret.valid &= (ret.qvoffset >= 0 && ret.qvoffset <= 255);
- ret.ifile = argv[2];
- ret.ofile = argv[3];
- ret.error_threshold = atoi(argv[4]);
- ret.valid &= (ret.error_threshold >= 0 && ret.error_threshold <= 255);
- ret.file_number = atoi(argv[5]);
- if (argc == 7) {
- if (string(argv[6]) == "q") {
- ret.q_mers = true;
- } else {
- ret.valid = false;
- }
- }
- }
- return ret;
-}
-
-/**
- * This function reads reads from the stream and splits them into
- * k-mers. The k-mers are then distributed almost uniformly across
- * several files. It is guaranteed that identical k-mers are always
- * written to the same file.
- * @param ifs Stream to read reads from.
- * @param ofiles Files to write the resulting k-mers to, one k-mer
- * per record.
- */
-void SplitToFiles(ireadstream ifs, vector<ofstream *> &ofiles,
- bool q_mers, uint8_t error_threshold) {
- uint32_t file_number = ofiles.size();
- uint64_t read_number = 0;
- while (!ifs.eof()) {
- ++read_number;
- if (read_number % kStep == 0) {
- LOG("Reading read " << read_number << ".");
- }
- Read r;
- ifs >> r;
- KMer::hash hash_function;
- for (ValidKMerGenerator<kK> gen(r, error_threshold); gen.HasMore(); gen.Next()) {
- KMer kmer = gen.kmer();
- if (KMer::less2()(!kmer, kmer)) {
- kmer = !kmer;
- }
- ofstream &cur_file = *ofiles[hash_function(kmer) % file_number];
- KMer::BinWrite(cur_file, kmer);
- if (q_mers) {
- double correct_probability = gen.correct_probability();
- cur_file.write((const char*) &correct_probability, sizeof(correct_probability));
- }
- }
- }
-}
-
-/**
- * This function reads k-mers and calculates the number of occurrences
- * of each of them.
- * @param ifile File with k-mers to process, one per record.
- * @param ofile Output file. For each unique k-mer there will be a
- * line with the k-mer itself and the number of its occurrences.
- */
-template<typename KMerStatMap>
-void EvalFile(ifstream &ifile, FILE *ofile, bool q_mers) {
- KMerStatMap stat_map;
- char buffer[kK + 1];
- buffer[kK] = 0;
- KMer kmer;
- while (KMer::BinRead(ifile, &kmer)) {
- KMerFreqInfo &info = stat_map[kmer];
- if (q_mers) {
- double correct_probability = -1;
- ifile.read((char *) &correct_probability, sizeof(correct_probability));
-      assert(!ifile.fail());
- info.q_count += correct_probability;
- } else {
- info.count += 1;
- }
- }
- for (typename KMerStatMap::iterator it = stat_map.begin();
- it != stat_map.end(); ++it) {
- fprintf(ofile, "%s ", it->first.str().c_str());
- if (q_mers) {
- fprintf(ofile, "%f\n", it->second.q_count);
- } else {
- fprintf(ofile, "%d\n", it->second.count);
- }
- }
-}
-}
-
-int main(int argc, char *argv[]) {
- Options opts = ParseOptions(argc, argv);
- if (!opts.valid) {
- PrintHelp(argv[0]);
- return 1;
- }
- // BasicConfigurator::configure();
- LOG("Starting preproc: evaluating " << opts.ifile << ".");
- vector<ofstream*> ofiles(opts.file_number);
- for (uint32_t i = 0; i < opts.file_number; ++i) {
- char filename[50];
- snprintf(filename, sizeof(filename), "%u.kmer.part", i);
- ofiles[i] = new ofstream(filename);
- assert(!ofiles[i]->fail() && "Too many files to open");
- }
- SplitToFiles(ireadstream(opts.ifile, opts.qvoffset),
- ofiles, opts.q_mers, opts.error_threshold);
- for (uint32_t i = 0; i < opts.file_number; ++i) {
- delete ofiles[i];
- }
- FILE *ofile = fopen(opts.ofile.c_str(), "w");
- assert(ofile != NULL && "Too many files to open");
- for (uint32_t i = 0; i < opts.file_number; ++i) {
- char ifile_name[50];
- snprintf(ifile_name, sizeof(ifile_name), "%u.kmer.part", i);
- ifstream ifile(ifile_name);
- LOG("Processing " << ifile_name << ".");
- EvalFile<UnorderedMap>(ifile, ofile, opts.q_mers);
- LOG("Processed " << ifile_name << ".");
- }
- fclose(ofile);
- LOG("Preprocessing done. You can find results in " << opts.ofile << ".");
- return 0;
-}
diff --git a/src/hammer/quake_count/valid_kmer_generator.hpp b/src/hammer/quake_count/valid_kmer_generator.hpp
deleted file mode 100644
index be42726..0000000
--- a/src/hammer/quake_count/valid_kmer_generator.hpp
+++ /dev/null
@@ -1,194 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#ifndef HAMMER_VALIDKMERGENERATOR_HPP_
-#define HAMMER_VALIDKMERGENERATOR_HPP_
-#include <stdint.h>
-#include <cmath>
-#include <string>
-#include <vector>
-#include "io/read.hpp"
-#include "sequence/seq.hpp"
-/**
- * This class is designed to iterate through valid k-mers in read.
- * @example
- * ValidKMerGenerator<2> gen(read, 4);
- * while (gen.HasMore()) {
- * MyTrickyFunction(gen.kmer());
- * gen.Next();
- * }
- * or
- * for (ValidKMerGenerator<2> gen(read, 2); gen.HasMore; gen.Next() {
- * MyTrickyFunction(gen.kmer(), gen.pos(), gen.correct_probability());
- * k}
- * @param kkK k-mer length.
- */
-template<uint32_t kK>
-class ValidKMerGenerator {
- public:
- /**
- * @param read Read to generate k-mers from.
- * @param bad_quality_threshold This class virtually cuts
- * nucleotides with quality lower than the threshold from the ends of the
- * read.
- */
- explicit ValidKMerGenerator(const Read &read,
- uint32_t bad_quality_threshold = 2) :
- bad_quality_threshold_(bad_quality_threshold),
- pos_(-1),
- end_(-1),
- len_(read.getSequenceString().size()),
- has_more_(true),
- correct_probability_(1),
- first(true),
- kmer_(),
- seq_(read.getSequenceString().data()),
- qual_(read.getQualityString().data()) {
- TrimBadQuality();
- Next();
- }
- /**
- * @param seq sequence to generate k-mers from.
- * @param qual quality string
- * @param bad_quality_threshold This class virtually cuts
- * nucleotides with quality lower than the threshold from the ends of the
- * read.
- */
- explicit ValidKMerGenerator(const char *seq, const char *qual,
- size_t len,
- uint32_t bad_quality_threshold = 2) :
- bad_quality_threshold_(bad_quality_threshold),
- pos_(-1),
- end_(-1),
- len_(len),
- has_more_(true),
- correct_probability_(1),
- first(true),
- kmer_(),
- seq_(seq),
- qual_(qual) {
- TrimBadQuality();
- Next();
- }
- /**
- * @result true if Next() succeed while generating new k-mer, false
- * otherwise.
- */
- bool HasMore() const {
- return has_more_;
- }
- /**
- * @result last k-mer generated by Next().
- */
- const Seq<kK>& kmer() const {
- return kmer_;
- }
- /**
- * @result last k-mer position in initial read.
- */
- int pos() const {
- return pos_;
- }
- /**
- * @result probability that last generated k-mer is correct.
- */
- double correct_probability() const {
- return correct_probability_;
- }
- /**
- * This function reads the next k-mer from the read and sets has_more_
- * accordingly. You can access the k-mer that was read with kmer().
- */
- void Next();
- private:
- void TrimBadQuality();
- double Prob(uint8_t qual) {
- if (qual < 3) {
- return 0.25;
- }
- static std::vector<double> prob(255, -1);
- if (prob[qual] < -0.1) {
- prob[qual] = 1 - pow(10.0, - qual / 10.0);
- }
- return prob[qual];
- }
- uint32_t GetQual(uint32_t pos) {
- if (pos >= len_) {
- return 2;
- } else {
- return qual_[pos];
- }
- }
- uint32_t bad_quality_threshold_;
- size_t pos_;
- size_t end_;
- size_t len_;
- bool has_more_;
- double correct_probability_;
- bool first;
- Seq<kK> kmer_;
- const char* seq_;
- const char* qual_;
- // Disallow copy and assign
- ValidKMerGenerator(const ValidKMerGenerator&);
- void operator=(const ValidKMerGenerator&);
-};
-
-template<uint32_t kK>
-void ValidKMerGenerator<kK>::TrimBadQuality() {
- pos_ = 0;
- if (qual_)
- for (; pos_ < len_; ++pos_) {
- if (GetQual(pos_) >= bad_quality_threshold_)
- break;
- }
- end_ = len_;
- if (qual_)
- for (; end_ > pos_; --end_) {
- if (GetQual(end_ - 1) >= bad_quality_threshold_)
- break;
- }
-}
-
-template<uint32_t kK>
-void ValidKMerGenerator<kK>::Next() {
- if (pos_ + kK > end_) {
- has_more_ = false;
- } else if (first || !is_nucl(seq_[pos_ + kK - 1])) {
- // in this case we have to look for new k-mer
- correct_probability_ = 1.0;
- uint32_t start_hypothesis = pos_;
- uint32_t i = pos_;
- for (; i < len_; ++i) {
- if (i == kK + start_hypothesis) {
- break;
- }
- if (qual_)
- correct_probability_ *= Prob(GetQual(i));
- if (!is_nucl(seq_[i])) {
- start_hypothesis = i + 1;
- correct_probability_ = 1.0;
- }
- }
- if (i == kK + start_hypothesis) {
- kmer_ = Seq<kK>(seq_ + start_hypothesis, 0, kK, /* raw */ true);
- pos_ = start_hypothesis + 1;
- } else {
- has_more_ = false;
- }
- } else {
- // good case we can just shift our previous answer
- kmer_ = kmer_ << seq_[pos_ + kK - 1];
- if (qual_) {
- correct_probability_ *= Prob(GetQual(pos_ + kK - 1));
- correct_probability_ /= Prob(GetQual(pos_ - 1));
- }
- ++pos_;
- }
- first = false;
-}
-#endif // HAMMER_VALIDKMERGENERATOR_HPP_
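correct_probability() in the header above follows the usual Phred interpretation: a base of quality q is called correctly with probability 1 - 10^(-q/10) (Prob() additionally clamps qualities below 3 to 0.25), and a k-mer's probability is the product over its k bases. Next() keeps that product up to date incrementally: when the window slides by one position it multiplies in the incoming base and divides out the outgoing one instead of recomputing everything. A small standalone sketch of the sliding update, assuming offset-corrected integer qualities and omitting the low-quality clamp (illustrative only, not code from the sources):

    #include <cmath>
    #include <cstdio>
    #include <vector>

    // Phred quality -> probability that the base call is correct.
    static double BaseCorrectProb(int q) { return 1.0 - std::pow(10.0, -q / 10.0); }

    int main() {
      const size_t k = 4;
      std::vector<int> qual = {30, 20, 35, 25, 40, 15};  // one quality value per base
      // Probability for the first k-mer: product over its k bases.
      double p = 1.0;
      for (size_t i = 0; i < k; ++i) p *= BaseCorrectProb(qual[i]);
      std::printf("k-mer at 0: %f\n", p);
      // Slide the window: multiply in the base that enters, divide out the one that leaves.
      for (size_t pos = 1; pos + k <= qual.size(); ++pos) {
        p *= BaseCorrectProb(qual[pos + k - 1]);
        p /= BaseCorrectProb(qual[pos - 1]);
        std::printf("k-mer at %zu: %f\n", pos, p);
      }
      return 0;
    }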
diff --git a/src/hammer/quake_enhanced/count.cpp b/src/hammer/quake_enhanced/count.cpp
deleted file mode 100644
index 8d6bede..0000000
--- a/src/hammer/quake_enhanced/count.cpp
+++ /dev/null
@@ -1,131 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#include <stdint.h>
-#include <cstdio>
-#include <cstdlib>
-#include <cassert>
-#include <string>
-#include <vector>
-#include <unordered_map>
-#include "io/ireadstream.hpp"
-#include "io/read.hpp"
-#include "sequence/seq.hpp"
-#include "valid_kmer_generator.hpp"
-#include "quake_enhanced/quake.hpp"
-#define SUPPRESS_UNUSED(X) ((void) (X))
-using quake_enhanced::Quake;
-
-using std::string;
-using std::vector;
-using std::unordered_map;
-using io::Reader;
-using io::SingleRead;
-
-struct KMerInfo {
- int count;
- double q_count;
- double freq;
-};
-
-typedef Seq<kK> KMer;
-typedef unordered_map<KMer, KMerInfo, KMer::hash> UnorderedMap;
-
-/**
- * This function reads reads from the stream and splits them into
- * k-mers. The k-mers are then distributed almost uniformly across
- * several files. It is guaranteed that identical k-mers are always
- * written to the same file.
- * @param ifs Stream to read reads from.
- * @param ofiles Files to write the resulting k-mers to, one k-mer
- * per record.
- */
-void Quake::SplitToFiles(ireadstream ifs, vector<ofstream*> &ofiles,
- uint8_t error_threshold) {
- uint32_t file_number = ofiles.size();
- while (!ifs.eof()) {
- Read r;
- ifs >> r;
- KMer::hash hash_function;
- for (ValidKMerGenerator<kK> gen(r, error_threshold); gen.HasMore(); gen.Next()) {
- KMer kmer = gen.kmer();
- if (KMer::less2()(!kmer, kmer)) {
- kmer = !kmer;
- }
- ofstream &cur_file = *ofiles[hash_function(kmer) % file_number];
- KMer::BinWrite(cur_file, kmer);
- double q_count = gen.correct_probability();
- cur_file.write((const char *) &q_count, sizeof(q_count));
- }
- }
-}
-
-/**
- * This function reads k-mers and calculates the number of occurrences
- * of each of them.
- * @param ifile File with k-mers to process, one per record.
- * @param ofile Output file. For each unique k-mer there will be a
- * line with the k-mer itself, its count, its q-count and its frequency estimate.
- */
-void Quake::EvalFile(ifstream &ifile, ofstream &ofile) {
- UnorderedMap stat_map;
- char buffer[kK + 1];
- buffer[kK] = 0;
- KMer kmer;
- while (KMer::BinRead(ifile, &kmer)) {
- KMerInfo &info = stat_map[kmer];
- double q_count = -1;
- ifile.read((char *) &q_count, sizeof(q_count));
-    assert(!ifile.fail());
- double freq = 0;
- // ToDo 0.5 threshold ==>> command line option
- if (q_count > 0.5) {
- freq = 1 / q_count;
- }
- info.q_count += q_count;
- info.count += 1;
- info.freq += freq;
- }
- for (UnorderedMap::iterator it = stat_map.begin();
- it != stat_map.end(); ++it) {
- const KMerInfo &info = it->second;
- AddToHist(info.freq);
- ofile << it->first.str().c_str() << " "
- << info.count << " "
- << info.q_count << " "
- << info.freq << endl;
- }
-}
-
-void Quake::Count(string ifile_name, string ofile_name,
- string hash_file_prefix, uint32_t hash_file_number,
- uint8_t quality_offset, uint8_t quality_threshold) {
- vector<ofstream*> ofiles(hash_file_number);
- for (uint32_t i = 0; i < hash_file_number; ++i) {
- char filename[50];
- snprintf(filename, sizeof(filename), "%s%u.part",
- hash_file_prefix.c_str(), i);
- ofiles[i] = new ofstream(filename);
-    assert(!ofiles[i]->fail() && "Too many files to open");
- }
- SplitToFiles(ireadstream(ifile_name, quality_offset),
- ofiles, quality_threshold);
- for (uint32_t i = 0; i < hash_file_number; ++i) {
- delete ofiles[i];
- }
- ofstream ofile(ofile_name.c_str());
-  assert(!ofile.fail() && "Too many files to open");
- for (uint32_t i = 0; i < hash_file_number; ++i) {
- char ifile_name[50];
- snprintf(ifile_name, sizeof(ifile_name), "%s%u.part",
- hash_file_prefix.c_str(), i);
- ifstream ifile(ifile_name);
- EvalFile(ifile, ofile);
- remove(ifile_name);
- }
- cur_state_ = kRealHistPrepared;
-}
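Quake::EvalFile above accumulates three statistics per unique k-mer: the raw occurrence count, the q-count (the sum of the per-occurrence correctness probabilities), and a coverage estimate that adds 1/q for every occurrence whose probability exceeds the hard-coded 0.5 threshold flagged in the ToDo comment. A compact sketch of that accumulation over in-memory pairs rather than the binary part files; the struct mirrors the KMerInfo fields and the sample values are invented for illustration:

    #include <cstdio>
    #include <map>
    #include <string>
    #include <utility>
    #include <vector>

    struct KMerInfo {
      int count = 0;       // raw number of occurrences
      double q_count = 0;  // sum of per-occurrence correctness probabilities
      double freq = 0;     // sum of 1/q over confident occurrences
    };

    int main() {
      // (k-mer, correctness probability) pairs as SplitToFiles would emit them.
      std::vector<std::pair<std::string, double> > occurrences = {
          {"ACGT", 0.9}, {"ACGT", 0.4}, {"TTTT", 0.95}};
      // std::map keeps the output deterministically ordered; the tool itself
      // uses an unordered_map keyed by the k-mer.
      std::map<std::string, KMerInfo> stat_map;
      for (const auto &occ : occurrences) {
        KMerInfo &info = stat_map[occ.first];
        info.count += 1;
        info.q_count += occ.second;
        if (occ.second > 0.5)  // same 0.5 cut-off as in EvalFile
          info.freq += 1.0 / occ.second;
      }
      for (const auto &kv : stat_map)
        std::printf("%s %d %.3f %.3f\n", kv.first.c_str(), kv.second.count,
                    kv.second.q_count, kv.second.freq);
      return 0;
    }

The printed columns follow the order Quake::EvalFile writes to its output file: k-mer, count, q-count, frequency estimate.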
diff --git a/src/hammer/quake_enhanced/count/count.cpp b/src/hammer/quake_enhanced/count/count.cpp
deleted file mode 100644
index 2ea1a8d..0000000
--- a/src/hammer/quake_enhanced/count/count.cpp
+++ /dev/null
@@ -1,226 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-/**
- * @file count.cpp
- * @author Alex Davydow
- * @version 1.0
- *
- * @section LICENSE
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation; either version 2 of
- * the License, or (at your option) any later version.
- *
- * @section DESCRIPTION
- *
- * For each k-mer this program calculates the number of occurrences in
- * the reads provided. The reads file is supposed to be in fastq
- * format.
- */
-#include <stdint.h>
-#include <cstdio>
-#include <cstdlib>
-#include <cassert>
-#include <string>
-#include <set>
-#include <unordered_map>
-#include <vector>
-#include "logging.hpp"
-#include "io/ireadstream.hpp"
-#include "io/read.hpp"
-#include "sequence/seq.hpp"
-#include "valid_kmer_generator.hpp"
-#define SUPPRESS_UNUSED(X) ((void) (X))
-
-using std::string;
-using std::set;
-using std::vector;
-using std::unordered_map;
-using std::map;
-
-namespace {
-
-DECL_LOGGER("count")
-
-struct KMerInfo {
- int count;
- double q_count;
- double q_inversed_count;
-};
-
-const uint32_t kK = 55;
-typedef Seq<kK> KMer;
-typedef unordered_map<KMer, KMerInfo, KMer::hash> UnorderedMap;
-
-/**
- * @variable Every kStep k-mer will appear in the log.
- */
-const int kStep = 1e5;
-
-struct Options {
- /**
- * @variable An offset for quality in a fastq file.
- */
- uint32_t qvoffset;
- string ifile;
- string ofile;
- uint32_t error_threshold;
- /**
- * @variable How many files will be used when splitting k-mers.
- */
- uint32_t file_number;
- bool valid;
- Options()
- : qvoffset(0),
- ifile(""),
- ofile(""),
- error_threshold(0),
- file_number(3),
- valid(true) {}
-};
-
-void PrintHelp(char *program_name) {
- printf("Usage: %s qvoffset ifile.fastq ofile.[q]cst file_number error_threshold\n",
- program_name);
- printf("Where:\n");
- printf("\tqvoffset\tan offset of fastq quality data\n");
- printf("\tifile.fastq\tan input file with reads in fastq format\n");
- printf("\tofile.[q]cst\ta filename where k-mer statistics will be outputted\n");
- printf("\terror_threshold\tnucliotides with quality lower then threshold will be cut from the ends of reads\n");
- printf("\tfile_number\thow many files will be used when splitting k-mers\n");
-}
-
-Options ParseOptions(int argc, char *argv[]) {
- Options ret;
- if (argc != 6) {
- ret.valid = false;
- } else {
- ret.qvoffset = atoi(argv[1]);
- ret.valid &= (ret.qvoffset >= 0 && ret.qvoffset <= 255);
- ret.ifile = argv[2];
- ret.ofile = argv[3];
- ret.error_threshold = atoi(argv[4]);
- ret.valid &= (ret.error_threshold >= 0 && ret.error_threshold <= 255);
- ret.file_number = atoi(argv[5]);
- }
- return ret;
-}
-
-/**
- * This function reads reads from the stream and splits them into
- * k-mers. Then k-mers are written to several files almost
- * uniformly. It is guaranteed that the same k-mers are written to the
- * same files.
- * @param ifs Stream to read reads from.
- * @param ofiles Files to write the resulting k-mers to. They are written
- * one per line.
- */
-void SplitToFiles(ireadstream ifs, const vector<FILE*> &ofiles,
- uint8_t error_threshold) {
- uint32_t file_number = ofiles.size();
- uint64_t read_number = 0;
- while (!ifs.eof()) {
- ++read_number;
- if (read_number % kStep == 0) {
- INFO("Reading read " << read_number << ".");
- }
- Read r;
- ifs >> r;
- KMer::hash hash_function;
- for (ValidKMerGenerator<kK> gen(r, error_threshold); gen.HasMore(); gen.Next()) {
- KMer kmer = gen.kmer();
- if (KMer::less2()(!kmer, kmer)) {
- kmer = !kmer;
- }
- FILE *cur_file = ofiles[hash_function(kmer) % file_number];
- KMer::BinWrite(cur_file, kmer);
- double correct_probability = gen.correct_probability();
- fwrite(&correct_probability, sizeof(correct_probability), 1, cur_file);
- }
- }
-}
-
-/**
- * This function reads k-mers and calculates the number of occurrences for
- * each of them.
- * @param ifile File with k-mers to process. One per line.
- * @param ofile Output file. For each unique k-mer there will be a
- * line with the k-mer itself and the number of its occurrences.
- */
-void EvalFile(FILE *ifile, FILE *ofile) {
- UnorderedMap stat_map;
- char buffer[kK + 1];
- buffer[kK] = 0;
- KMer kmer;
- while (KMer::BinRead(ifile, &kmer)) {
- KMerInfo &info = stat_map[kmer];
- double correct_probability = -1;
- bool readed =
- fread(&correct_probability, sizeof(correct_probability),
- 1, ifile);
- assert(readed == 1);
- SUPPRESS_UNUSED(readed);
- double inversed_probability = 1 / correct_probability;
- // ToDo 0.5 threshold ==>> command line option
- if (correct_probability < 0.5) {
- inversed_probability = 0;
- }
- info.q_count += correct_probability;
- info.count += 1;
- info.q_inversed_count += inversed_probability;
- }
- for (UnorderedMap::iterator it = stat_map.begin();
- it != stat_map.end(); ++it) {
- const KMerInfo &info = it->second;
- fprintf(ofile, "%s %d %f %f\n", it->first.str().c_str(),
- info.count, info.q_count, info.q_inversed_count);
- }
-}
-
-void run(const Options &opts) {
- INFO("Starting preproc: evaluating "
- << opts.ifile << ".");
- vector<FILE*> ofiles(opts.file_number);
- for (uint32_t i = 0; i < opts.file_number; ++i) {
- char filename[50];
- snprintf(filename, sizeof(filename), "%u.kmer.part", i);
- ofiles[i] = fopen(filename, "wb");
- assert(ofiles[i] != NULL && "Too many files to open");
- }
- SplitToFiles(ireadstream(opts.ifile, opts.qvoffset),
- ofiles, opts.error_threshold);
- for (uint32_t i = 0; i < opts.file_number; ++i) {
- fclose(ofiles[i]);
- }
- FILE *ofile = fopen(opts.ofile.c_str(), "w");
- assert(ofile != NULL && "Too many files to open");
- for (uint32_t i = 0; i < opts.file_number; ++i) {
- char ifile_name[50];
- snprintf(ifile_name, sizeof(ifile_name), "%u.kmer.part", i);
- FILE *ifile = fopen(ifile_name, "rb");
- INFO("Processing " << ifile_name << ".");
- EvalFile(ifile, ofile);
- INFO("Processed " << ifile_name << ".");
- fclose(ifile);
- }
- fclose(ofile);
- INFO("Preprocessing done. You can find results in " <<
- opts.ofile << ".");
-}
-}
-
-int main(int argc, char *argv[]) {
- Options opts = ParseOptions(argc, argv);
- if (!opts.valid) {
- PrintHelp(argv[0]);
- return 1;
- }
- run(opts);
- return 0;
-}
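
The hash-partitioned counting in SplitToFiles() and run() above rests on one invariant: each k-mer is first canonicalized (the smaller of the k-mer and its reverse complement is kept) and then routed by hash modulo the number of part files, so every occurrence of the same k-mer lands in the same part and the parts can be counted independently. A stripped-down sketch of that routing with plain strings standing in for Seq<kK> (ReverseComplement and PartIndex are illustrative helper names, not from the sources):

    #include <cstdint>
    #include <functional>
    #include <string>

    // Reverse complement of an ACGT string; stands in for !kmer above.
    static std::string ReverseComplement(const std::string &s) {
        std::string rc(s.rbegin(), s.rend());
        for (char &c : rc)
            c = (c == 'A') ? 'T' : (c == 'T') ? 'A' : (c == 'C') ? 'G' : 'C';
        return rc;
    }

    // Choose the part file for a k-mer the same way SplitToFiles() does with
    // KMer::less2 and KMer::hash: canonicalize first, then hash.
    static uint32_t PartIndex(std::string kmer, uint32_t file_number) {
        std::string rc = ReverseComplement(kmer);
        if (rc < kmer)
            kmer = rc;                                   // canonical form
        return uint32_t(std::hash<std::string>()(kmer) % file_number);
    }
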
diff --git a/src/hammer/quake_enhanced/filter_trusted_enh/main.cpp b/src/hammer/quake_enhanced/filter_trusted_enh/main.cpp
deleted file mode 100644
index 67ff7ca..0000000
--- a/src/hammer/quake_enhanced/filter_trusted_enh/main.cpp
+++ /dev/null
@@ -1,106 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#include <stdint.h>
-#include <cstdlib>
-#include <cstdio>
-#include <string>
-#include <unordered_map>
-#include "logger/logger.hpp"
-
-using std::string;
-using std::unordered_map;
-
-namespace {
-/**
- * @variable Length of string buffer which will store k-mer.
- */
-const uint32_t kMaxK = 100;
-/**
- * @variable Every kStep k-mer will appear in the log.
- */
-const int kStep = 1e5;
-
-DECL_LOGGER("filter_trusted_enh")
-
-struct Options {
- string ifile;
- string ofile;
- string badfile;
- string limits;
- bool valid;
- Options()
- : ifile(""),
- ofile(""),
- badfile(""),
- limits(""),
- valid(true) {}
-};
-
-void PrintHelp(char *progname) {
- printf("Usage: %s ifile.[q]cst ofile.trust ofile.bad file.limits\n", progname);
- printf("Where:\n");
- printf("\tifile.[q]cst\tfile with k|q-mer statistics\n");
- printf("\tofile.trust\ta filename where filtered data will be outputted\n");
- printf("\tofile.bud\ta filename where filtered garbage will be outputted\n");
- printf("\tfile.limits\tfile with q-value limits for k-mers\n");
-}
-
-Options ParseOptions(int argc, char *argv[]) {
- Options ret;
- if (argc != 5) {
- ret.valid = false;
- } else {
- ret.ifile = argv[1];
- ret.ofile = argv[2];
- ret.badfile = argv[3];
- ret.limits = argv[4];
- }
- return ret;
-}
-}
-
-
-int main(int argc, char *argv[]) {
- Options opts = ParseOptions(argc, argv);
- if (!opts.valid) {
- PrintHelp(argv[0]);
- return 1;
- }
- INFO("Starting filter_trusted: evaluating "
- << opts.ifile << ".");
- FILE *ifile = fopen(opts.ifile.c_str(), "r");
- FILE *ofile = fopen(opts.ofile.c_str(), "w");
- FILE *badfile = fopen(opts.badfile.c_str(), "w");
- FILE *limits_file = fopen(opts.limits.c_str(), "r");
- unordered_map<uint32_t, long double> limits;
- uint32_t x;
- long double limit;
- while (fscanf(limits_file, "%u %Lf", &x, &limit) == 2) {
- limits[x] = limit;
- }
- char kmer[kMaxK + 1]; // +1 for the terminating NUL written by fscanf
- char format[20];
- float freq = -1;
- int count;
- float q_count;
- snprintf(format, sizeof(format), "%%%ds%%d%%f%%f", kMaxK);
- uint64_t read_number = 0;
- while (fscanf(ifile, format, kmer, &count, &q_count, &freq) != EOF) {
- ++read_number;
- if (read_number % kStep == 0) {
- INFO(logger, "Reading k-mer " << read_number << ".");
- }
- if (q_count / count > limits[count]) {
- fprintf(ofile, "%s %d %f %f\n", kmer, count, q_count, freq);
- } else {
- fprintf(badfile, "%s %d %f %f\n", kmer, count, q_count, freq);
- }
- }
- return 0;
-}
diff --git a/src/hammer/quake_enhanced/options.cpp b/src/hammer/quake_enhanced/options.cpp
deleted file mode 100644
index 2c3c233..0000000
--- a/src/hammer/quake_enhanced/options.cpp
+++ /dev/null
@@ -1,206 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#include <iomanip>
-#include <string>
-#include <sstream>
-#include <vector>
-#include "getopt_pp/getopt_pp_standalone.h"
-#include "options.hpp"
-using quake_enhanced::Options;
-using GetOpt::GetOpt_pp;
-using GetOpt::Option;
-using GetOpt::OptionPresent;
-using GetOpt::Include_Environment;
-using std::string;
-using std::vector;
-using std::endl;
-using std::ostringstream;
-using std::setw;
-
-Options::Options(int argc, char **argv) :
- read_file(""),
- corrected_read_file(""),
- help_message(""),
- kmer_count_file("kmer.count"),
- hash_file_prefix("kmer_"),
- hash_file_number(1000),
- quality_offset(33),
- quality_threshold(2),
- hist_file(""),
- trusted_hist_file(""),
- bad_hist_file(""),
- top_threshold(5),
- average_min(0.9),
- limits_file(""),
- bad_threshold(0.1),
- trusted_kmer_file(""),
- bad_kmer_file("") {
- string help_module = "";
- bool need_help;
- vector<string> global_options;
- help_builder << "Usage: " << argv[0] <<
- " --read-file <file> --corrected-read-file <file> --trusted-kmer-file <file>[options]\n";
- GetOpt_pp options(argc, argv, Include_Environment);
- // Help Options
- options >> OptionPresent('h', "help", need_help);
- options >> Option('\0', "help-module", help_module);
- // General Options
- options >> Option('\0', "read-file", read_file, read_file);
- options >> Option('\0', "corrected-read-file",
- corrected_read_file, corrected_read_file);
- // Count Options
- options >> Option('\0', "hash-file-number",
- hash_file_number, hash_file_number);
- options >> Option('\0', "hash-file-prefix",
- hash_file_prefix, hash_file_prefix);
- options >> Option('\0', "quality-offset",
- quality_offset, quality_offset);
- options >> Option('\0', "quality-threshold",
- quality_threshold, quality_threshold);
- options >> Option('\0', "kmer-count-file",
- kmer_count_file, kmer_count_file);
- // PrepareHist Options
- options >> Option('\0', "hist-file",
- hist_file, hist_file);
- options >> Option('\0', "trusted-hist-file",
- trusted_hist_file, trusted_hist_file);
- options >> Option('\0', "bad-hist-file",
- bad_hist_file, bad_hist_file);
- options >> Option('\0', "top-threshold",
- top_threshold, top_threshold);
- options >> Option('\0', "average-min",
- average_min, average_min);
- // PrepareLimits Options
- options >> Option('\0', "limits-file",
- limits_file, limits_file);
- options >> Option('\0', "bad-threshold",
- bad_threshold, bad_threshold);
- // FilterTrusted Options
- options >> Option('\0', "trusted-kmer-file",
- trusted_kmer_file, trusted_kmer_file);
- options >> Option('\0', "bad-kmer-file",
- bad_kmer_file, bad_kmer_file);
- if (need_help || help_module != "") {
- valid = false;
- } else {
- Validate();
- }
- help_builder << std::left << endl;
- if (!valid) {
- help_builder <<
- "General options: \n"
- "--read-file <str> file with reads to correct in one of \n"
- " supported formats: fastq, fasta \n"
- "--corrected-read-file <str> fasta file, where corrected reads will \n"
- " be written \n"
- "--help-module <str> produce a help for a given module, \n"
- " module can be: count, prepare_hist \n"
- " prepare_limits, filter_trusted \n";
-
- if (help_module == "count") {
- help_builder <<
- "Count options: \n"
- "--kmer-count-file <str> file where kmer count info will be \n"
- " written, default kmer.count \n"
- "--hash-file-prefix <str> prefix for hash_file, default: kmer_ \n"
- "--hash-file-number <int(>0)> number of hash_files, default: 1000. \n"
- " Generally the greater this number is, \n"
- " the faster will program work, but there\n"
- " is a risk of running out of file \n"
- " descriptors \n"
- "--quality-offset <int([0..255])> offset of quality values (for fastq \n"
- " files). It's usually 33 or 64, \n"
- " default: 33 \n"
- "--quality-threshold <int([0..255])> nucleotides with quality lower than \n"
- " threshold will be cut from the ends of \n"
- " the read, default: 2 \n";
-
- } else if (help_module == "prepare_hist") {
- help_builder <<
- "PrepareHist options: \n"
- "--hist-file <str> file where k-mer histogram will be \n"
- " written, default \"\" - no histogram \n"
- "--trusted-hist <str> file where trusted k-mer histogram will\n"
- " be written, default \"\" - no histogram\n"
- "--bad-hist <str> file where bad k-mer histogram will be \n"
- " written, default \"\" - no histogram \n"
- "--top-threshold <int(>0)> we will look for maximum which is at \n"
- " least top_threshold times higher than \n"
- " previous, default 5 \n"
- "--average-min <float([0..1])> trying to find Gauss's average we will \n"
- " go to the left and to the right until \n"
- " we rich coverage average_min * max \n";
- } else if (help_module == "prepare_limits") {
- help_builder <<
- "PrepareLimits options: \n"
- "--limits-file <str> file where 1-value limits for every \n"
- " k-value will be written, \n"
- " default \"\" - not to save limits \n"
- "--bad-threshold <float(>0)> k-mer will be considered untrusted if \n"
- " its probability of being bad is at \n"
- " least bad-threshold times greater then \n"
- " probability of being good \n";
- } else if (help_module == "filter_trusted") {
- help_builder <<
- "FilterTrusted options: \n"
- "--trusted-kmer-file <str> file where trusted k-mer will be \n"
- " written \n"
- "--bad--kmer-fil <str> file where trusted k-mer will be \n"
- " written, default \"\" - no file \n";
- }
-
- }
- help_message += help_builder.str();
-}
-
-void Options::Validate() {
- // General Validation
- if (read_file == "") {
- help_builder <<
- "Error: You must provide read_file\n";
- valid = false;
- }
- if (corrected_read_file == "") {
- help_builder <<
- "Error: You must provide corrected_read_file\n";
- valid = false;
- }
- // Count Validation
- if (hash_file_number < 1) {
- help_builder <<
- "Error: hash_file_number can not be lesser than one\n";
- valid = false;
- }
- if (quality_offset < 0 || quality_offset > 255) {
- help_builder <<
- "Error: quality_offset must be in 0..255\n";
- valid = false;
- }
- if (quality_threshold < 0 || quality_threshold > 255) {
- help_builder <<
- "Error: quality_threshold must be in 0..255\n";
- valid = false;
- }
- // PrepareHist Validation
- if (average_min < 0 || average_min > 1) {
- help_builder <<
- "Error: average_min must be in 0..1\n";
- valid = false;
- }
- // PrepareLimits Validation
- if (bad_threshold < 0) {
- help_builder <<
- "Error: bad_threshold must be in 0..*\n";
- valid = false;
- }
- // FilterTrusted Validation
- if (trusted_kmer_file == "") {
- help_builder << "Error: trusted_kmer_file must be provided\n";
- valid = false;
- }
-}
diff --git a/src/hammer/quake_enhanced/test_correction_quality/main.cpp b/src/hammer/quake_enhanced/test_correction_quality/main.cpp
deleted file mode 100644
index e901de7..0000000
--- a/src/hammer/quake_enhanced/test_correction_quality/main.cpp
+++ /dev/null
@@ -1,108 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#include <stdint.h>
-#include <string.h>
-#include <cstdlib>
-#include <cstdio>
-#include <string>
-#include <unordered_set>
-
-using std::unordered_set;
-using std::string;
-
-namespace {
-/**
- * @variable Length of string buffer which will store k-mer.
- */
-const uint32_t kMaxK = 100;
-
-struct Options {
- string genom_file;
- string trust_file;
- string bad_file;
- bool full;
- float threshold;
- bool valid;
- Options()
- : genom_file(""),
- trust_file(""),
- bad_file(""),
- full(false),
- valid(true) {}
-};
-
-void PrintHelp(char *progname) {
- printf("Usage: %s genom.[q]cst ifile.trust ifile.bad [--full]\n", progname);
- printf("Where:\n");
- printf("\tgenom.[q]cst\tfile with k|q-mer statistics from real genom\n");
- printf("\tifile.trust\ta filename where filtered data is\n");
- printf("\tifile.bud\ta filename where filtered garbage is\n");
- printf("\t--full\tpass this option to output all incorrect k-mers with their names to stdout\n");
-}
-
-Options ParseOptions(int argc, char *argv[]) {
- Options ret;
- if (argc < 4 || argc > 5) {
- ret.valid = false;
- } else {
- ret.genom_file = argv[1];
- ret.trust_file = argv[2];
- ret.bad_file = argv[3];
- if (argc == 5 && ( !strcmp(argv[4], "--full") || !strcmp(argv[4], "-f") ) )
- ret.full = true;
- }
- return ret;
-}
-}
-
-
-int main(int argc, char *argv[]) {
- Options opts = ParseOptions(argc, argv);
- if (!opts.valid) {
- PrintHelp(argv[0]);
- return 1;
- }
- FILE *genom_file = fopen(opts.genom_file.c_str(), "r");
- FILE *trust_file = fopen(opts.trust_file.c_str(), "r");
- FILE *bad_file = fopen(opts.bad_file.c_str(), "r");
- char kmer[kMaxK + 1]; // +1 for the terminating NUL written by fscanf
- char format[20];
- float freq = -1;
- int count;
- float q_count;
- snprintf(format, sizeof(format), "%%%ds%%d%%f%%f", kMaxK);
- unordered_set<string> real_kmers;
- while (fscanf(genom_file, format, kmer, &count, &q_count, &freq) != EOF) {
- real_kmers.insert(string(kmer));
- }
- int trusted = 0;
- int trusted_fail = 0;
- int bad = 0;
- int bad_fail = 0;
- while (fscanf(trust_file, format, kmer, &count, &q_count, &freq) != EOF) {
- if (real_kmers.count(string(kmer)) > 0) {
- ++trusted;
- } else {
- ++trusted_fail;
- if ( opts.full ) printf(" %s\t%d\t%f\t%f\n", kmer, count, q_count, freq);
- }
- }
- printf("trusted: %d\n", trusted + trusted_fail);
- printf("erroneous: %d\n", trusted_fail);
- while (fscanf(bad_file, format, kmer, &count, &q_count, &freq) != EOF) {
- if (real_kmers.count(string(kmer)) > 0) {
- ++bad_fail;
- if ( opts.full ) printf(" %s\t%d\t%f\t%f\n", kmer, count, q_count, freq);
- } else {
- ++bad;
- }
- }
- printf("bad: %d\n", bad + bad_fail);
- printf("erroneous: %d\n", bad_fail);
- return 0;
-}
diff --git a/src/hammer/valid_kmer_generator.hpp b/src/hammer/valid_kmer_generator.hpp
deleted file mode 100644
index 9d16b84..0000000
--- a/src/hammer/valid_kmer_generator.hpp
+++ /dev/null
@@ -1,200 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#ifndef HAMMER_VALIDKMERGENERATOR_HPP_
-#define HAMMER_VALIDKMERGENERATOR_HPP_
-
-#include "globals.hpp"
-
-#include "io/read.hpp"
-#include "sequence/seq.hpp"
-
-#include <string>
-#include <vector>
-
-#include <cstdint>
-#include <cmath>
-
-/**
- * This class is designed to iterate through valid k-mers in read.
- * @example
- * ValidKMerGenerator<2> gen(read, 4);
- * while (gen.HasMore()) {
- * MyTrickyFunction(gen.kmer());
- * gen.Next();
- * }
- * or
- * for (ValidKMerGenerator<2> gen(read, 2); gen.HasMore(); gen.Next()) {
- * MyTrickyFunction(gen.kmer(), gen.pos(), gen.correct_probability());
- * }
- * @param kK k-mer length.
- */
-template<uint32_t kK>
-class ValidKMerGenerator {
- public:
- /**
- * @param read Read to generate k-mers from.
- * @param bad_quality_threshold This class virtually cuts
- * nucleotides with quality lower than the threshold from the ends of the
- * read.
- */
- explicit ValidKMerGenerator(const Read &read,
- uint8_t bad_quality_threshold = 2) {
- Reset(read.getSequenceString().data(),
- read.getQualityString().data(),
- read.getSequenceString().size(),
- bad_quality_threshold);
- }
- /**
- * @param seq sequence to generate k-mers from.
- * @param qual quality string
- * @param bad_quality_threshold This class virtually cuts
- * nucleotides with quality lower than the threshold from the ends of the
- * read.
- */
- explicit ValidKMerGenerator(const char *seq, const char *qual,
- size_t len,
- uint8_t bad_quality_threshold = 2) {
- Reset(seq, qual, len, bad_quality_threshold);
- }
-
- ValidKMerGenerator()
- : kmer_(), seq_(0), qual_(0),
- pos_(-1), end_(-1), len_(0),
- correct_probability_(1), bad_quality_threshold_(2),
- has_more_(false), first(true) {}
-
- void Reset(const char *seq, const char *qual,
- size_t len,
- uint8_t bad_quality_threshold = 2) {
- kmer_ = Seq<kK>();
- seq_ = seq;
- qual_ = qual;
- pos_ = -1;
- end_ = -1;
- len_ = len;
- correct_probability_ = 1.0;
- bad_quality_threshold_ = bad_quality_threshold;
- has_more_ = true;
- first = true;
-
- TrimBadQuality();
- Next();
- }
-
- /**
- * @result true if Next() succeed while generating new k-mer, false
- * otherwise.
- */
- bool HasMore() const {
- return has_more_;
- }
- /**
- * @result last k-mer generated by Next().
- */
- const Seq<kK>& kmer() const {
- return kmer_;
- }
- /**
- * @result last k-mer position in initial read.
- */
- size_t pos() const {
- return pos_;
- }
- /**
- * @result probability that last generated k-mer is correct.
- */
- double correct_probability() const {
- return correct_probability_;
- }
- /**
- * This function reads the next k-mer from the read and clears has_more_
- * if it fails. You can access the k-mer read with kmer().
- */
- void Next();
- private:
- void TrimBadQuality();
- double Prob(uint8_t qual) {
- return Globals::quality_probs[qual];
- }
- uint8_t GetQual(uint32_t pos) {
- if (pos >= len_) {
- return 2;
- } else {
- return qual_[pos];
- }
- }
- Seq<kK> kmer_;
- const char* seq_;
- const char* qual_;
- size_t pos_;
- size_t end_;
- size_t len_;
- double correct_probability_;
- uint8_t bad_quality_threshold_;
- bool has_more_;
- bool first;
-
- // Disallow copy and assign
- ValidKMerGenerator(const ValidKMerGenerator&) = delete;
- void operator=(const ValidKMerGenerator&) = delete;
-};
-
-template<uint32_t kK>
-void ValidKMerGenerator<kK>::TrimBadQuality() {
- pos_ = 0;
- if (qual_)
- for (; pos_ < len_; ++pos_) {
- if (GetQual((uint32_t)pos_) >= bad_quality_threshold_)
- break;
- }
- end_ = len_;
- if (qual_)
- for (; end_ > pos_; --end_) {
- if (GetQual((uint32_t)(end_ - 1)) >= bad_quality_threshold_)
- break;
- }
-}
-
-template<uint32_t kK>
-void ValidKMerGenerator<kK>::Next() {
- if (pos_ + kK > end_) {
- has_more_ = false;
- } else if (first || !is_nucl(seq_[pos_ + kK - 1])) {
- // in this case we have to look for new k-mer
- correct_probability_ = 1.0;
- uint32_t start_hypothesis = (uint32_t)pos_;
- uint32_t i = (uint32_t)pos_;
- for (; i < len_; ++i) {
- if (i == kK + start_hypothesis) {
- break;
- }
- if (qual_)
- correct_probability_ *= Prob(GetQual(i));
- if (!is_nucl(seq_[i])) {
- start_hypothesis = i + 1;
- correct_probability_ = 1.0;
- }
- }
- if (i == kK + start_hypothesis) {
- kmer_ = Seq<kK>(seq_ + start_hypothesis, 0, kK, /* raw */ true);
- pos_ = start_hypothesis + 1;
- } else {
- has_more_ = false;
- }
- } else {
- // good case we can just shift our previous answer
- kmer_ = kmer_ << seq_[pos_ + kK - 1];
- if (qual_) {
- correct_probability_ *= Prob(GetQual((uint32_t)pos_ + kK - 1));
- correct_probability_ /= Prob(GetQual((uint32_t)pos_ - 1));
- }
- ++pos_;
- }
- first = false;
-}
-#endif // HAMMER_VALIDKMERGENERATOR_HPP_
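
For orientation, a minimal usage sketch of the generator (illustrative only, not part of the deleted sources; it assumes Globals::quality_probs has been initialized as in the hammer pipeline and that the Read/Seq headers are on the include path; CountValidKmers is a made-up helper name):

    #include <cstddef>
    #include <utility>

    #include "io/read.hpp"
    #include "valid_kmer_generator.hpp"

    // Walk the valid 55-mers of a read, as SplitToFiles() does in the deleted
    // count tools, and return how many there were together with the sum of
    // their correctness probabilities.
    static std::pair<std::size_t, double> CountValidKmers(const Read &read) {
        std::size_t n = 0;
        double prob_sum = 0.0;
        for (ValidKMerGenerator<55> gen(read, /* bad_quality_threshold */ 2);
             gen.HasMore(); gen.Next()) {
            prob_sum += gen.correct_probability();
            ++n;
        }
        return std::make_pair(n, prob_sum);
    }
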
diff --git a/src/include/adt/array_vector.hpp b/src/include/adt/array_vector.hpp
deleted file mode 100644
index 27e883b..0000000
--- a/src/include/adt/array_vector.hpp
+++ /dev/null
@@ -1,625 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#ifndef __ARRAY_VECTOR_HPP__
-#define __ARRAY_VECTOR_HPP__
-
-#include <algorithm>
-#include <memory>
-
-#include <cstdlib>
-#include <cstring>
-#include <cstddef>
-
-template <class _Cp, bool _IsConst> class __array_vector_iterator;
-template <class _Cp> class __array_reference;
-template <class _Cp> class __array_const_reference;
-template <typename ElTy> struct array_equal_to;
-
-template <class _Cp>
-class __array {
- typedef typename _Cp::__storage_type __storage_type;
- typedef typename _Cp::__storage_pointer __storage_pointer;
- typedef typename _Cp::__const_storage_pointer __const_storage_pointer;
- typedef typename _Cp::size_type __size_type;
-
-#if defined(__clang__)
- friend typename _Cp::__self;
-#else
- friend class _Cp::__self;
-#endif
- friend class __array_vector_iterator<_Cp, false>;
- friend class __array_reference<_Cp>;
- friend class __array_const_reference<_Cp>;
-
- __storage_pointer ptr_;
- __size_type size_;
- bool allocated;
-
- public:
- ~__array() {
- if (allocated)
- delete[] ptr_;
- }
-
- size_t size() const {
- return size_;
- }
-
- size_t data_size() const {
- return size_ * sizeof(__storage_type);
- }
- __storage_pointer data() const {
- return ptr_;
- }
-
- __array(const __array &that) {
- size_ = that.size_;
- ptr_ = new __storage_type[size_];
- allocated = true;
- memcpy(ptr_, that.ptr_, data_size());
- }
-
- __array(const __array_reference<_Cp> that) {
- size_ = that.size();
- ptr_ = new __storage_type[size_];
- allocated = true;
- memcpy(ptr_, that.data(), data_size());
- }
-
- __array& operator=(const __array &that) {
- __storage_pointer this_ptr = data(), that_ptr = that.data();
- if (this_ptr != that_ptr)
- memcpy(this_ptr, that_ptr, data_size());
-
- return *this;
- }
-
- __array& operator=(const __array_reference<_Cp> that) {
- __storage_pointer this_ptr = data(), that_ptr = that.data();
- if (this_ptr != that_ptr)
- memcpy(this_ptr, that_ptr, data_size());
-
- return *this;
- }
-
- __array& operator=(__const_storage_pointer that_ptr) {
- __storage_pointer this_ptr = data();
- if (this_ptr != that_ptr)
- memcpy(this_ptr, that_ptr, data_size());
-
- return *this;
- }
-
- bool operator<(const __array &that) const {
- __storage_pointer this_ptr = data(), that_ptr = that.data();
-
- for (size_t i = 0; i < size(); ++i) {
- if (this_ptr[i] != that_ptr[i])
- return this_ptr[i] < that_ptr[i];
- }
-
- return false;
- }
-
- bool operator<(const __array_reference<_Cp> that) const {
- __storage_pointer this_ptr = data(), that_ptr = that.data();
-
- for (size_t i = 0; i < size(); ++i) {
- if (this_ptr[i] != that_ptr[i])
- return this_ptr[i] < that_ptr[i];
- }
-
- return false;
- }
-
- bool operator==(const __array &that) const {
- __storage_pointer this_ptr = data(), that_ptr = that.data();
-
- for (size_t i = 0; i < size(); ++i) {
- if (this_ptr[i] != that_ptr[i])
- return false;
- }
-
- return true;
- }
-
- bool operator==(const __array_reference<_Cp> that) const {
- __storage_pointer this_ptr = data(), that_ptr = that.data();
-
- for (size_t i = 0; i < size(); ++i) {
- if (this_ptr[i] != that_ptr[i])
- return false;
- }
-
- return true;
- }
-
- bool operator!=(const __array &that) const {
- return !operator==(that);
- }
-
- bool operator!=(const __array_reference<_Cp> that) const {
- return !operator==(that);
- }
-
- private:
- __array(__storage_pointer p, __size_type sz) :
- ptr_(p), size_(sz), allocated(false) { }
-};
-
-template <class _Cp>
-class __array_reference {
- typedef typename _Cp::__storage_type __storage_type;
- typedef typename _Cp::__storage_pointer __storage_pointer;
- typedef typename _Cp::__const_storage_pointer __const_storage_pointer;
- typedef typename _Cp::size_type __size_type;
-
-#if defined(__clang__)
- friend typename _Cp::__self;
-#else
- friend class _Cp::__self;
-#endif
- friend class __array_vector_iterator<_Cp, false>;
- friend class __array<_Cp>;
- friend struct array_equal_to<__storage_type>;
-
- __storage_pointer ptr_;
- __size_type size_;
-
- public:
- size_t size() const {
- return size_;
- }
-
- size_t data_size() const {
- return size() * sizeof(__storage_type);
- }
-
- __storage_pointer data() const {
- return ptr_;
- }
-
- __array_reference& operator=(const __array<_Cp> &that) {
- __storage_pointer this_ptr = data(), that_ptr = that.data();
- if (this_ptr != that_ptr)
- memcpy(this_ptr, that_ptr, data_size());
-
- return *this;
- }
-
- __array_reference& operator=(__const_storage_pointer that_ptr) {
- __storage_pointer this_ptr = data();
- if (this_ptr != that_ptr)
- memcpy(this_ptr, that_ptr, data_size());
-
- return *this;
- }
-
- __array_reference& operator=(const __array_reference that) {
- __storage_pointer this_ptr = data(), that_ptr = that.data();
- if (this_ptr != that_ptr)
- memcpy(this_ptr, that_ptr, data_size());
-
- return *this;
- }
-
- bool operator<(const __array<_Cp> &that) const {
- __storage_pointer this_ptr = data(), that_ptr = that.data();
-
- for (size_t i = 0; i < size(); ++i) {
- if (this_ptr[i] != that_ptr[i])
- return this_ptr[i] < that_ptr[i];
- }
-
- return false;
- }
-
- bool operator<(const __array_reference that) const {
- __storage_pointer this_ptr = data(), that_ptr = that.data();
-
- for (size_t i = 0; i < size(); ++i) {
- if (this_ptr[i] != that_ptr[i])
- return this_ptr[i] < that_ptr[i];
- }
-
- return false;
- }
-
- bool operator==(const __array<_Cp> &that) const {
- __storage_pointer this_ptr = data(), that_ptr = that.data();
-
- for (size_t i = 0; i < size(); ++i) {
- if (this_ptr[i] != that_ptr[i])
- return false;
- }
-
- return true;
- }
-
- bool operator==(const __array_reference that) const {
- __storage_pointer this_ptr = data(), that_ptr = that.data();
-
- for (size_t i = 0; i < size(); ++i) {
- if (this_ptr[i] != that_ptr[i])
- return false;
- }
-
- return true;
- }
-
- bool operator!=(const __array_reference that) const {
- return !operator==(that);
- }
-
- bool operator!=(const __array<_Cp> &that) const {
- return !operator==(that);
- }
-
- private:
- __array_reference(__storage_pointer p, __size_type sz) :
- ptr_(p), size_(sz) { }
-};
-
-template <class _Cp>
-class __array_const_reference {
- typedef typename _Cp::__storage_type __storage_type;
- typedef typename _Cp::__storage_pointer __storage_pointer;
- typedef typename _Cp::__const_storage_pointer __const_storage_pointer;
- typedef typename _Cp::size_type __size_type;
-
-#if defined(__clang__)
- friend typename _Cp::__self;
-#else
- friend class _Cp::__self;
-#endif
- friend class __array_vector_iterator<_Cp, true>;
- friend struct array_equal_to<__storage_type>;
-
- __const_storage_pointer ptr_;
- __size_type size_;
-
- public:
- size_t size() const {
- return size_;
- }
-
- size_t data_size() const {
- return size() * sizeof(__storage_type);
- }
-
- __const_storage_pointer data() const {
- return ptr_;
- }
-
- __array_const_reference(const __array_const_reference &that)
- : ptr_(that.ptr_), size_(that.size_) {}
-
- bool operator<(__array_const_reference that) const {
- const __storage_pointer this_ptr = data(), that_ptr = that.data();
-
- for (size_t i = 0; i < size(); ++i) {
- if (this_ptr[i] != that_ptr[i])
- return this_ptr[i] < that_ptr[i];
- }
-
- return false;
- }
-
- bool operator==(__array_const_reference that) const {
- const __storage_pointer this_ptr = data(), that_ptr = that.data();
-
- for (size_t i = 0; i < size(); ++i) {
- if (this_ptr[i] != that_ptr[i])
- return false;
- }
-
- return true;
- }
-
- bool operator==(const __array_reference<_Cp> that) const {
- __const_storage_pointer this_ptr = data();
- const __storage_pointer that_ptr = that.data();
-
- for (size_t i = 0; i < size(); ++i) {
- if (this_ptr[i] != that_ptr[i])
- return false;
- }
-
- return true;
- }
-
- bool operator!=(const __array_const_reference that) const {
- return !operator==(that);
- }
- bool operator!=(const __array_reference<_Cp> that) const {
- return !operator==(that);
- }
-
- private:
- __array_const_reference(__const_storage_pointer p, __size_type sz) :
- ptr_(p), size_(sz) { }
- __array_const_reference& operator=(const __array_const_reference &that);
-};
-
- // This is a hack. Never do this again!
-#ifdef __GLIBCXX__
-namespace std {
-template<typename _Cp>
-struct __are_same<__array_reference<_Cp>, __array<_Cp>&> {
- enum { __value = 1 };
- typedef __true_type __type;
-};
-
-template<typename _Cp>
-struct __are_same<__array<_Cp>&, __array_reference<_Cp> > {
- enum { __value = 1 };
- typedef __true_type __type;
-};
-}
-#endif
-
-template<typename _Cp>
-void swap(__array_reference<_Cp> lhs, __array_reference<_Cp> rhs) {
- std::swap_ranges(lhs.data(), lhs.data() + lhs.size(), rhs.data());
-}
-
-template<typename _Cp>
-void swap(__array<_Cp>& lhs, __array_reference<_Cp> rhs) {
- std::swap_ranges(lhs.data(), lhs.data() + lhs.size(), rhs.data());
-}
-
-template<typename _Cp>
-void swap(__array_reference<_Cp> lhs, __array<_Cp>& rhs) {
- std::swap_ranges(lhs.data(), lhs.data() + lhs.size(), rhs.data());
-}
-
-template<typename _Cp, bool _IsConst>
-class __array_vector_iterator {
- public:
- typedef typename _Cp::difference_type difference_type;
- typedef __array_vector_iterator pointer;
- typedef typename std::conditional<_IsConst, __array_const_reference<_Cp>, __array_reference<_Cp> >::type reference;
- typedef __array<_Cp> value_type;
-
- typedef std::random_access_iterator_tag iterator_category;
-
- private:
- typedef typename _Cp::__storage_type __storage_type;
- typedef typename _Cp::__storage_pointer __storage_pointer;
- typedef typename _Cp::size_type __size_type;
-
-#if defined(__clang__)
- friend typename _Cp::__self;
-#else
- friend class _Cp::__self;
-#endif
-
- __storage_pointer data_;
- __size_type el_sz_;
-
- public:
- __array_vector_iterator(__storage_pointer data, __size_type el_sz)
- : data_(data), el_sz_(el_sz) {}
-
- size_t size() const {
- return el_sz_;
- }
-
- size_t data_size() const {
- return el_sz_ * sizeof(__storage_type);
- }
-
- __storage_pointer data() const {
- return data_;
- }
-
- reference operator*() const {
- return reference(data_, el_sz_);
- }
-
- reference operator[](difference_type n) const {
- return *(*this + n);
- }
-
- __array_vector_iterator& operator++() {
- data_ += el_sz_;
- return *this;
- }
-
- __array_vector_iterator& operator--() {
- data_ -= el_sz_;
- return *this;
- }
-
- __array_vector_iterator operator++(int) {
- __array_vector_iterator res = *this;
- data_ += el_sz_;
- return res;
- }
-
- __array_vector_iterator operator--(int) {
- __array_vector_iterator res = *this;
- data_ -= el_sz_;
- return res;
- }
-
- __array_vector_iterator operator+(const difference_type &n) const {
- return __array_vector_iterator(data_ + n*el_sz_, el_sz_);
- }
-
- __array_vector_iterator& operator+=(const difference_type &n) {
- data_ += n*el_sz_;
- return *this;
- }
-
- __array_vector_iterator operator-(const difference_type &n) const {
- return __array_vector_iterator(data_ - n*el_sz_, el_sz_);
- }
-
- __array_vector_iterator& operator-=(const difference_type &n) {
- data_ -= n*el_sz_;
- return *this;
- }
-
- friend bool operator==(const __array_vector_iterator &r1,
- const __array_vector_iterator &r2) {
- return r1.data_ == r2.data_;
- }
-
- friend bool operator!=(const __array_vector_iterator &r1,
- const __array_vector_iterator &r2) {
- return r1.data_ != r2.data_;
- }
-
- friend bool operator<(const __array_vector_iterator &r1,
- const __array_vector_iterator &r2) {
- return r1.data_ < r2.data_;
- }
-
- friend bool operator<=(const __array_vector_iterator &r1,
- const __array_vector_iterator &r2) {
- return r1.data_ <= r2.data_;
- }
- friend bool operator>(const __array_vector_iterator &r1,
- const __array_vector_iterator &r2) {
- return r1.data_ > r2.data_;
- }
- friend bool operator>=(const __array_vector_iterator &r1,
- const __array_vector_iterator &r2) {
- return r1.data_ >= r2.data_;
- }
-
-
- friend __array_vector_iterator
- operator+(difference_type n,
- const __array_vector_iterator &r2) {
- return r2 + n;
- }
-
- friend difference_type
- operator-(const __array_vector_iterator &r1,
- const __array_vector_iterator &r2) {
- return (r1.data_ - r2.data_) / r1.el_sz_;
- }
-};
-
-template <typename ElTy>
-class array_vector {
- public:
- typedef size_t size_type;
- typedef ptrdiff_t difference_type;
-
- typedef __array_reference<array_vector> reference;
- typedef __array_const_reference<array_vector> const_reference;
- typedef __array<array_vector> value_type;
- typedef __array_vector_iterator<array_vector, false> iterator;
- typedef __array_vector_iterator<array_vector, true> const_iterator;
-
- private:
- typedef ElTy __storage_type;
- typedef array_vector __self;
- typedef __storage_type* __storage_pointer;
- typedef const __storage_type* __const_storage_pointer;
-
- friend class __array<__self>;
- friend class __array_reference<__self>;
- friend class __array_const_reference<__self>;
- friend class __array_vector_iterator<__self, true>;
- friend class __array_vector_iterator<__self, false>;
-
- __storage_pointer data_;
- size_type size_;
- size_type el_sz_;
-
- public:
- array_vector(__storage_pointer data, size_type sz, size_type el_sz)
- : data_(data), size_(sz), el_sz_(el_sz) {}
-
- reference operator[](size_t pos) {
- return reference(data_ + pos * el_sz_, el_sz_);
- }
- const ElTy *operator[](size_t pos) const {
- return data_ + pos * el_sz_;
- }
- iterator begin() {
- return iterator(data_, el_sz_);
- }
- iterator end() {
- return iterator(data_ + size_ * el_sz_, el_sz_);
- }
- const_iterator begin() const {
- return const_iterator(data_, el_sz_);
- }
- const_iterator end() const {
- return const_iterator(data_ + size_ * el_sz_, el_sz_);
- }
- const_iterator cbegin() const {
- return const_iterator(data_, el_sz_);
- }
- const_iterator cend() const {
- return const_iterator(data_ + size_ * el_sz_, el_sz_);
- }
-
- size_t size() const { return size_; }
-
- __storage_pointer data() const { return data_; }
-
- void set_size(size_t size) {
- size_ = size;
- }
- void set_data(__storage_pointer data) {
- data_ = data;
- }
-};
-
-template<typename ElTy>
-struct array_less {
- typedef typename array_vector<ElTy>::value_type value;
- typedef typename array_vector<ElTy>::reference reference;
-
- bool operator()(const value& lhs, const value& rhs) const {
- return lhs < rhs;
- }
- bool operator()(const value& lhs, const reference rhs) const {
- return lhs < rhs;
- }
- bool operator()(const reference lhs, const value& rhs) const {
- return lhs < rhs;
- }
- bool operator()(const reference lhs, const reference rhs) const {
- return lhs < rhs;
- }
-};
-
-template<typename ElTy>
-struct array_equal_to {
- typedef typename array_vector<ElTy>::value_type value;
- typedef typename array_vector<ElTy>::reference reference;
- typedef typename array_vector<ElTy>::const_reference const_reference;
-
- bool operator()(const value& lhs, const value& rhs) const {
- return lhs == rhs;
- }
- bool operator()(const value& lhs, const reference rhs) const {
- return lhs == rhs;
- }
- bool operator()(const reference lhs, const value& rhs) const {
- return lhs == rhs;
- }
- bool operator()(const reference lhs, const ElTy *rhs, size_t sz) const {
- return lhs == reference(rhs, sz);
- }
- bool operator()(const reference lhs, const reference rhs) const {
- return lhs == rhs;
- }
- bool operator()(const ElTy *lhs, size_t sz, const reference rhs) const {
- return const_reference(lhs, sz) == rhs;
- }
-};
-
-#endif
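
A short usage sketch (illustrative only; assumes src/include is on the include path and the buffer contents are made up): array_vector owns no memory, it views a flat buffer as size() records of el_sz words each and hands out proxy references that copy, compare and order element-wise:

    #include <cassert>
    #include <cstddef>
    #include <cstdint>
    #include <vector>

    #include "adt/array_vector.hpp"

    int main() {
        const size_t el_sz = 3;                          // words per record
        std::vector<uint64_t> buf(4 * el_sz, 0);         // backing storage, 4 records
        array_vector<uint64_t> vec(buf.data(), 4, el_sz);

        uint64_t rec[el_sz] = {42, 0, 7};
        vec[0] = rec;                                    // copy el_sz words in
        vec[1] = vec[0];                                 // proxy-to-proxy copy

        assert(vec[0] == vec[1]);
        assert(!array_less<uint64_t>()(vec[0], vec[1])); // equal, so not less
        return 0;
    }
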
diff --git a/src/include/adt/bag.hpp b/src/include/adt/bag.hpp
deleted file mode 100644
index 2f7a8f2..0000000
--- a/src/include/adt/bag.hpp
+++ /dev/null
@@ -1,87 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include "verify.hpp"
-
-template<class T, class hash = std::hash<T>>
-class bag {
- typedef std::unordered_map<T, size_t, hash> Data;
- Data data_;
- size_t size_;
-public:
-
- bag() : size_(0) {
- }
-
- typedef typename Data::const_iterator const_iterator;
-
- void put(const T& t, size_t mult) {
- VERIFY(mult > 0);
- data_[t] += mult;
- size_ += mult;
- }
-
- void put(const T& t) {
- put(t, 1);
- }
-
- bool take(const T& t, size_t mult) {
- VERIFY(mult > 0);
- /*typename map<T, size_t>::iterator*/auto it = data_.find(t);
- if (it == data_.end()) {
- return false;
- } else {
- size_t have = it->second;
- if (have < mult) {
- data_.erase(it->first);
- size_ -= have;
- return false;
- } else if (have == mult) {
- data_.erase(it->first);
- size_ -= have;
- return true;
- } else {
- it->second -= mult;
- size_ -= mult;
- return true;
- }
- }
- }
-
- bool take(const T& t) {
- return take(t, 1);
- }
-
- size_t mult(const T& t) const {
- auto it = data_.find(t);
- if (it == data_.end()) {
- return 0;
- } else {
- return it->second;
- }
- }
-
- void clear() {
- data_.clear();
- size_ = 0;
- }
-
- const_iterator begin() const {
- return data_.begin();
- }
-
- const_iterator end() const {
- return data_.end();
- }
-
- size_t size() const {
- return size_;
- }
-
-};
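
A quick usage sketch of the multiset above (illustrative only; assumes the project's verify.hpp is on the include path): put() adds multiplicity, take() removes it and reports whether the requested amount was available, mult() queries the count of one element and size() the total:

    #include <cassert>
    #include <string>

    #include "adt/bag.hpp"

    int main() {
        bag<std::string> b;
        b.put("ACGT", 3);                    // multiplicity 3
        b.put("ACGT");                       // now 4
        b.put("TTTT");

        assert(b.take("ACGT", 2));           // true: 2 copies removed, 2 left
        assert(b.mult("ACGT") == 2);
        assert(b.mult("CCCC") == 0);         // absent elements report 0
        assert(b.size() == 3);               // 2 + 1
        return 0;
    }
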
diff --git a/src/include/adt/chained_iterator.hpp b/src/include/adt/chained_iterator.hpp
deleted file mode 100644
index 18315d8..0000000
--- a/src/include/adt/chained_iterator.hpp
+++ /dev/null
@@ -1,76 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#ifndef __ADT_CHAINED_ITERATOR_HPP__
-#define __ADT_CHAINED_ITERATOR_HPP__
-
-#include <boost/iterator/iterator_facade.hpp>
-
-#include <iterator>
-#include <vector>
-
-template<class It>
-class chained_iterator :
- public boost::iterator_facade<chained_iterator<It>,
- typename std::iterator_traits<It>::value_type,
- boost::forward_traversal_tag,
- typename std::iterator_traits<It>::value_type> {
- public:
- chained_iterator(It begin, It end) :
- section_(0), current_(begin) {
- join(begin, end);
- }
-
- void join(It begin, It end) {
- begins_.push_back(begin);
- ends_.push_back(end);
- skip_empty();
- }
-
- private:
- friend class boost::iterator_core_access;
-
- bool is_end() const {
- return current_ == ends_[section_];
- }
-
- void skip_empty() {
- while ((section_ + 1) < begins_.size() &&
- current_ == ends_[section_])
- current_ = begins_[++section_];
- }
-
- void increment() {
- skip_empty();
- ++current_;
- skip_empty();
- }
-
- bool equal(const chained_iterator &other) const {
- // Special case: both ends
- bool other_end = other.is_end(), current_end = is_end();
- if (current_end || other_end)
- return other_end == current_end;
-
- // Now, make sure we are comparing the iterators from the same sequences
- // (actually, not, but this would be undefined behavior)
- return (section_ == other.section_ &&
- current_ == other.current_);
- }
-
- typename std::iterator_traits<It>::value_type dereference() const {
- return *current_;
- }
-
- size_t section_;
- It current_;
- std::vector<It> begins_;
- std::vector<It> ends_;
-};
-
-
-#endif
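
A usage sketch (illustrative data; assumes Boost and src/include are available): two sequences are chained into one forward traversal, and the end iterator is spelled as a chain positioned at the final end(), which the equal() special case treats as exhausted:

    #include <iostream>
    #include <vector>

    #include "adt/chained_iterator.hpp"

    int main() {
        std::vector<int> a = {1, 2};
        std::vector<int> b = {3, 4, 5};
        typedef std::vector<int>::iterator It;

        chained_iterator<It> it(a.begin(), a.end());
        it.join(b.begin(), b.end());
        chained_iterator<It> end(b.end(), b.end());

        for (; it != end; ++it)
            std::cout << *it << " ";         // prints: 1 2 3 4 5
        std::cout << std::endl;
        return 0;
    }
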
diff --git a/src/include/adt/concurrent_dsu.hpp b/src/include/adt/concurrent_dsu.hpp
deleted file mode 100644
index ad4cb38..0000000
--- a/src/include/adt/concurrent_dsu.hpp
+++ /dev/null
@@ -1,296 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#ifndef CONCURRENTDSU_HPP_
-#define CONCURRENTDSU_HPP_
-
-#include "io/mmapped_writer.hpp"
-
-#include <cassert>
-#include <cmath>
-#include <cstdlib>
-#include <cstdarg>
-#include <cstdint>
-
-#include <algorithm>
-#include <vector>
-#include <unordered_map>
-#include <atomic>
-
-// Silence bogus gcc warnings
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wconversion"
-
-class ConcurrentDSU {
- struct atomic_set_t {
- uint64_t data : 61;
- uint64_t aux : 2;
- bool root : 1;
- } __attribute__ ((packed));
-
- static_assert(sizeof(atomic_set_t) == 8, "Unexpected size of atomic_set_t");
-
- public:
- ConcurrentDSU(size_t size)
- : data_(size) {
-
- for (size_t i = 0; i < size; i++)
- data_[i] = { .data = 1, .aux = 0, .root = true };
- }
-
- ~ConcurrentDSU() { }
-
- void unite(size_t x, size_t y) {
- uint64_t x_size, y_size;
- uint64_t x_aux, y_aux;
-
- // Step one: update the links
- while (true) {
- x = find_set(x);
- y = find_set(y);
- if (x == y)
- return;
-
- atomic_set_t x_entry = data_[x], y_entry = data_[y];
- // If someone already changed roots => retry
- if (!x_entry.root || !y_entry.root)
- continue;
-
- // We need to link the smallest subtree to the largest
- x_size = x_entry.data, y_size = y_entry.data;
- x_aux = x_entry.aux, y_aux = y_entry.aux;
- if (x_size > y_size || (x_size == y_size && x > y)) {
- std::swap(x, y);
- std::swap(x_size, y_size);
- std::swap(x_aux, y_aux);
- std::swap(x_entry, y_entry);
- }
-
- // Link 'x' to 'y'. If someone already changed 'x' => try again.
- atomic_set_t new_x_entry = { .data = y, .aux = x_aux, .root = false };
- if (!data_[x].compare_exchange_strong(x_entry, new_x_entry))
- continue;
-
- break;
- }
-
- // Step two: update the size. We already linked 'x' to 'y'. Therefore we
- // need to add 'x_size' to whichever value is currently inside 'y'.
- while (true) {
- y = find_set(y);
- atomic_set_t y_entry = data_[y];
- // If someone already changed the roots => retry
- if (!y_entry.root)
- continue;
-
- // Update the size. If someone already changed 'y' => try again.
- atomic_set_t new_y_entry = { .data = x_size + y_entry.data, .aux = y_aux, .root = true };
- if (!data_[y].compare_exchange_strong(y_entry, new_y_entry))
- continue;
-
- break;
- }
- }
-
- size_t set_size(size_t i) const {
- while (true) {
- size_t el = find_set(i);
- atomic_set_t entry = data_[el];
- if (!entry.root)
- continue;
-
- return entry.data;
- }
- }
-
- size_t find_set(size_t x) const {
- // Step one: find the root
- size_t r = x;
- atomic_set_t r_entry = data_[r];
- while (!r_entry.root) {
- r = r_entry.data;
- r_entry = data_[r];
- }
-
- // Step two: traverse the path from 'x' to root trying to update the links
- // Note that the links might change, therefore we stop as soon as we'll
- // end at 'some' root.
- while (x != r) {
- atomic_set_t x_entry = data_[x];
- if (x_entry.root)
- break;
-
- // Try to update parent (may fail, it's ok)
- atomic_set_t new_x_entry = { .data = r, .aux = x_entry.aux, .root = false };
- data_[x].compare_exchange_weak(x_entry, new_x_entry);
- x = x_entry.data;
- }
-
- return x;
- }
-
- bool same(size_t x, size_t y) const {
- while (true) {
- x = find_set(x);
- y = find_set(y);
- if (x == y)
- return true;
- if (data_[x].load().root)
- return false;
- }
- }
-
- size_t num_sets() const {
- size_t count = 0;
- for (const auto& entry : data_) {
- count += entry.load(std::memory_order_relaxed).root;
- }
-
- return count;
- }
-
- bool is_root(size_t x) const {
- return data_[x].load(std::memory_order_relaxed).root;
- }
-
- uint64_t aux(size_t x) const {
- return data_[x].load(std::memory_order_relaxed).aux;
- }
-
- uint64_t root_aux(size_t x) const {
- while (true) {
- x = find_set(x);
- atomic_set_t entry = data_[x];
-
- if (!entry.root)
- continue;
-
- return entry.aux;
- }
- }
-
- void set_aux(size_t x, uint64_t data) {
- while (true) {
- atomic_set_t x_entry = data_[x];
- atomic_set_t new_x_entry = { .data = x_entry.data, .aux = data, .root = x_entry.root };
- if (!data_[x].compare_exchange_strong(x_entry, new_x_entry))
- continue;
-
- break;
- }
- }
-
- void set_root_aux(size_t x, uint64_t data) {
- while (true) {
- x = find_set(x);
- atomic_set_t x_entry = data_[x];
- if (!x_entry.root)
- continue;
-
- atomic_set_t new_x_entry = { .data = x_entry.data, .aux = data, .root = true };
- if (!data_[x].compare_exchange_strong(x_entry, new_x_entry))
- continue;
-
- break;
- }
- }
-
- size_t extract_to_file(const std::string& Prefix) {
- // First, touch all the sets to make them directly connect to the root
-# pragma omp parallel for
- for (size_t x = 0; x < data_.size(); ++x)
- (void) find_set(x);
-
- std::unordered_map<size_t, size_t> sizes;
-
-#if 0
- for (size_t x = 0; x < size; ++x) {
- if (data_[x].parent != x) {
- size_t t = data_[x].parent;
- VERIFY(data_[t].parent == t)
- }
- }
-#endif
-
- // Insert all the root elements into the map
- sizes.reserve(num_sets());
- for (size_t x = 0; x < data_.size(); ++x) {
- if (is_root(x))
- sizes[x] = 0;
- }
-
- // Now, calculate the counts. We can do this in parallel, because we know no
- // insertion can occur.
-# pragma omp parallel for
- for (size_t x = 0; x < data_.size(); ++x) {
- size_t& entry = sizes[parent(x)];
-# pragma omp atomic
- entry += 1;
- }
-
- // Now we know the sizes of each cluster. Go over again and calculate the
- // file-relative (cumulative) offsets.
- size_t off = 0;
- for (size_t x = 0; x < data_.size(); ++x) {
- if (is_root(x)) {
- size_t& entry = sizes[x];
- size_t noff = off + entry;
- entry = off;
- off = noff;
- }
- }
-
- // Write down the entries
- std::vector<size_t> out(off);
- for (size_t x = 0; x < data_.size(); ++x) {
- size_t& entry = sizes[parent(x)];
- out[entry++] = x;
- }
- std::ofstream os(Prefix, std::ios::binary | std::ios::out);
- os.write((char*)&out[0], out.size() * sizeof(out[0]));
- os.close();
-
- // Write down the sizes
- MMappedRecordWriter<size_t> index(Prefix + ".idx");
- index.reserve(sizes.size());
- size_t *idx = index.data();
- for (size_t x = 0, i = 0, sz = 0; x < data_.size(); ++x) {
- if (is_root(x)) {
- idx[i++] = sizes[x] - sz;
- sz = sizes[x];
- }
- }
-
- return sizes.size();
- }
-
- void get_sets(std::vector<std::vector<size_t> > &otherWay) {
- otherWay.resize(data_.size());
- for (size_t i = 0; i < data_.size(); i++) {
- size_t set = find_set(i);
- otherWay[set].push_back(i);
- }
- otherWay.erase(remove_if(otherWay.begin(), otherWay.end(), zero_size),
- otherWay.end());
- }
-
-private:
- size_t parent(size_t x) const {
- atomic_set_t val = data_[x];
- return (val.root ? x : val.data);
- }
-
- static bool zero_size(const std::vector<size_t> & v) {
- return v.size() == 0;
- }
-
- mutable std::vector<std::atomic<atomic_set_t> > data_;
-};
-
-#pragma GCC diagnostic pop
-
-#endif /* CONCURRENTDSU_HPP_ */
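
A single-threaded usage sketch (illustrative only; in SPAdes the structure is normally driven from OpenMP loops, and the header pulls in the project's mmapped_writer.hpp): unite() merges two clusters, same() and set_size() query them, num_sets() counts the remaining roots:

    #include <cassert>

    #include "adt/concurrent_dsu.hpp"

    int main() {
        ConcurrentDSU dsu(5);                // five singleton sets {0}..{4}

        dsu.unite(0, 1);
        dsu.unite(1, 2);                     // now {0,1,2}, {3}, {4}

        assert(dsu.same(0, 2));
        assert(!dsu.same(0, 3));
        assert(dsu.set_size(2) == 3);
        assert(dsu.num_sets() == 3);
        return 0;
    }
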
diff --git a/src/include/adt/filter_iterator.hpp b/src/include/adt/filter_iterator.hpp
deleted file mode 100644
index d06023d..0000000
--- a/src/include/adt/filter_iterator.hpp
+++ /dev/null
@@ -1,49 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#ifndef FILTER_ITERATOR_H_
-#define FILTER_ITERATOR_H_
-
-/**
- * Iterator with some predicate -- iterates only on elements with predicate(item) == true
- */
-template<typename iterator_type, typename predicate_type>
-class filter_iterator {
-public:
- typedef typename iterator_type::value_type value_type;
-
- filter_iterator(const iterator_type& begin, const iterator_type& end, const predicate_type& pred):
- current_(begin), end_(end), pred_(pred)
- {
- while((current_ != end_) && (!pred_(*current_))) // skip leading non-matching elements; same loop as advance() below
- ++current_;
- } // filter_iterator
-
- value_type operator*() const { return *current_; }
- value_type operator->() const { return *current_; }
-
- filter_iterator& operator++() { advance(); return *this; }
-
- bool operator==(const filter_iterator& rhs) const { return current_ == rhs.current_; }
- bool operator!=(const filter_iterator& rhs) const { return !(operator==(rhs)); }
-
-private:
- void advance()
- {
- do
- {
- ++current_;
- }
- while((current_ != end_) && (!pred_(*current_)));
- } // advance
-
- iterator_type current_;
- iterator_type end_;
- predicate_type pred_;
-};
-
-#endif /* FILTER_ITERATOR_H_ */
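
A usage sketch (illustrative data; assumes src/include is on the include path): the wrapper visits only elements for which the predicate holds, and the end iterator is built from the underlying end() twice:

    #include <iostream>
    #include <vector>

    #include "adt/filter_iterator.hpp"

    struct IsEven {
        bool operator()(int x) const { return x % 2 == 0; }
    };

    int main() {
        std::vector<int> v = {1, 2, 3, 4, 5, 6};
        typedef std::vector<int>::const_iterator It;

        filter_iterator<It, IsEven> it(v.begin(), v.end(), IsEven());
        filter_iterator<It, IsEven> end(v.end(), v.end(), IsEven());

        for (; it != end; ++it)
            std::cout << *it << " ";         // prints: 2 4 6
        std::cout << std::endl;
        return 0;
    }
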
diff --git a/src/include/adt/function_traits.hpp b/src/include/adt/function_traits.hpp
deleted file mode 100644
index ebb946e..0000000
--- a/src/include/adt/function_traits.hpp
+++ /dev/null
@@ -1,70 +0,0 @@
-#ifndef __ADT_FUNCTION_TRAITS__
-#define __ADT_FUNCTION_TRAITS__
-
-#pragma once
-
-#include <functional>
-
-namespace adt {
-
-template<class F>
-struct function_traits;
-
-// function pointer
-template<class R, class... Args>
-struct function_traits<R(*)(Args...)> : public function_traits<R(Args...)> {};
-
-// member function pointer
-template<class C, class R, class... Args>
-struct function_traits<R(C::*)(Args...)> : public function_traits<R(C&, Args...)> {};
-
-// const member function pointer
-template<class C, class R, class... Args>
-struct function_traits<R(C::*)(Args...) const> : public function_traits<R(C&, Args...)> {};
-
-// member object pointer
-template<class C, class R>
-struct function_traits<R(C::*)> : public function_traits<R(C&)> {};
-
-template<class R, class... Args>
-struct function_traits<R(Args...)> {
- using return_type = R;
-
- static constexpr std::size_t arity = sizeof...(Args);
-
- template <std::size_t N>
- struct arg {
- static_assert(N < arity, "invalid argument index");
- using type = typename std::tuple_element<N, std::tuple<Args...>>::type;
- };
-};
-
-template<class F>
-struct function_traits<F&> : public function_traits<F> {};
-
-template<class F>
-struct function_traits<F&&> : public function_traits<F> {};
-
-// functors & default implementation
-template<class F>
-struct function_traits {
- private:
- using call_type = function_traits<decltype(&F::operator())>;
-
- public:
- using return_type = typename call_type::return_type;
-
- // Remember to get rid of the implicit object argument
- static constexpr std::size_t arity = call_type::arity - 1;
-
- template <std::size_t N>
- struct arg {
- static_assert(N < arity, "invalid argument index");
- // Remember to get rid of the implicit object argument
- using type = typename call_type::template arg<N+1>::type;
- };
-};
-
-} // namespace adt
-
-#endif // __ADT_FUNCTION_TRAITS__
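
A compile-time usage sketch (illustrative only; assumes src/include is on the include path): the traits expose arity, return type and argument types for function pointers and functors alike, and for a functor or lambda the implicit object argument has already been subtracted:

    #include <type_traits>

    #include "adt/function_traits.hpp"

    double scale(int x, float f) { return x * f; }

    int main() {
        typedef adt::function_traits<decltype(&scale)> F;
        static_assert(F::arity == 2, "two parameters");
        static_assert(std::is_same<F::return_type, double>::value, "returns double");
        static_assert(std::is_same<F::arg<1>::type, float>::value, "arg 1 is float");

        auto lambda = [](const char *) { return 42; };
        typedef adt::function_traits<decltype(lambda)> L;
        static_assert(L::arity == 1, "implicit object argument not counted");
        static_assert(std::is_same<L::return_type, int>::value, "returns int");
        (void) lambda;
        return 0;
    }
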
diff --git a/src/include/adt/iterator_range.hpp b/src/include/adt/iterator_range.hpp
deleted file mode 100644
index e76c61b..0000000
--- a/src/include/adt/iterator_range.hpp
+++ /dev/null
@@ -1,46 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#ifndef __ITERATOR_RANGE_H__
-#define __ITERATOR_RANGE_H__
-
-#include <utility>
-#include <iterator>
-
-namespace adt {
-
-template <typename IteratorT>
-class iterator_range {
- IteratorT begin_iterator, end_iterator;
-
-public:
- template <typename Container>
- iterator_range(Container &&c)
- //TODO: Consider ADL/non-member begin/end calls.
- : begin_iterator(c.begin()), end_iterator(c.end()) {}
- iterator_range(IteratorT begin_iterator, IteratorT end_iterator)
- : begin_iterator(std::move(begin_iterator)),
- end_iterator(std::move(end_iterator)) {}
-
- IteratorT begin() const { return begin_iterator; }
- IteratorT end() const { return end_iterator; }
-};
-
-template <class T> iterator_range<T> make_range(T x, T y) {
- return iterator_range<T>(std::move(x), std::move(y));
-}
-
-template <typename T> iterator_range<T> make_range(std::pair<T, T> p) {
- return iterator_range<T>(std::move(p.first), std::move(p.second));
-}
-
-template<typename T>
-iterator_range<decltype(begin(std::declval<T>()))> drop_begin(T &&t, int n) {
- return make_range(std::next(begin(t), n), end(t));
-}
-}
-
-#endif
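
A minimal usage sketch of adt::iterator_range / drop_begin removed above, again assuming the header is still reachable:

    #include "adt/iterator_range.hpp"
    #include <iostream>
    #include <vector>

    int main() {
        std::vector<int> v = {1, 2, 3, 4, 5};
        // Iterate over everything except the first two elements.
        for (int x : adt::drop_begin(v, 2))
            std::cout << x << ' ';
        std::cout << '\n';          // prints: 3 4 5
        return 0;
    }
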
diff --git a/src/include/adt/kmer_hash_vector.hpp b/src/include/adt/kmer_hash_vector.hpp
deleted file mode 100644
index 345c47f..0000000
--- a/src/include/adt/kmer_hash_vector.hpp
+++ /dev/null
@@ -1,370 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-/*
- * kmer_hash_vector.hpp
- *
- * Created on: Jul 19, 2012
- * Author: alex
- */
-
-#ifndef KMER_HASH_VECTOR_HPP_
-#define KMER_HASH_VECTOR_HPP_
-
-
-#include "runtime_k.hpp"
-#include "kmer_map.hpp"
-
-
-namespace runtime_k {
-
-class IKmerHashVector {
-
-protected:
- static const size_t LOAD_OVERHEAD = 1000;
-
- size_t nthreads_;
-
- size_t cell_size_;
-
-public:
- typedef RtSeq input_value_type;
-
- IKmerHashVector(size_t nthreads)
- : nthreads_ (nthreads)
- , cell_size_ (LOAD_OVERHEAD) {
- }
-
- virtual ~IKmerHashVector() {
-
- }
-
- virtual IKmerHashVector * copy() const = 0;
-
- virtual void clear() = 0;
-
- virtual void clear(size_t i) = 0;
-
- virtual bool is_full() const = 0;
-
- virtual bool is_presisely_full() const = 0;
-
- virtual size_t capacity(size_t i) const = 0;
-
- virtual size_t size(size_t i) const = 0;
-
-
- virtual void insert(const input_value_type& value) = 0;
-
- virtual void reserve(size_t cell_size) = 0;
-
-
- virtual size_t get_k() const = 0;
-
- size_t get_threads_num() const
- {
- return nthreads_;
- }
-
- virtual void dump (KmerMap<int>& destination, size_t bucketNum) = 0;
-};
-
-
-
-class KmerHashVector {
-
-public:
-
- typedef IKmerHashVector base_vector_type;
-
-private:
-
- base_vector_type * data_;
-
-public:
-
- typedef KmerHashVector vector_type;
-
- typedef base_vector_type::input_value_type input_value_type;
-
-
- KmerHashVector(size_t k, size_t nthreads);
-
- KmerHashVector(base_vector_type * vec): data_(vec) {
- }
-
- KmerHashVector(const vector_type& vec) {
- data_ = vec.data_->copy();
- }
-
- vector_type& operator=(const vector_type& vec) {
- if (vec.data_ != data_) {
- delete data_;
- data_ = vec.data_->copy();
- }
-
- return *this;
- }
-
- ~KmerHashVector() {
- delete data_;
- }
-
-
-
- bool is_full() const {
- return data_->is_full();
- }
-
- bool is_presisely_full() const {
- return data_->is_presisely_full();
- }
-
- size_t get_threads_num() const
- {
- return data_->get_threads_num();
- }
-
-
- void insert(const input_value_type& value) {
- data_->insert(value);
- }
-
- void clear() {
- data_->clear();
- }
-
-
- void clear(size_t i) {
- data_->clear(i);
- }
-
- size_t get_k() const {
- return data_->get_k();
- }
-
- size_t capacity(size_t i) const {
- return data_->capacity(i);
- }
-
- void reserve(size_t cell_size) {
- data_->reserve(cell_size);
- }
-
- base_vector_type * get_data() const {
- return data_;
- }
-
- void print_sizes() {
- for (size_t i = 0; i < data_->get_threads_num(); ++i) {
- INFO("Size " << i << ": " << data_->size(i));
- }
- }
-
- void dump (KmerMap<int>& destination, size_t bucketNum) {
- data_->dump(destination, bucketNum);
- }
-};
-
-
-// ================================= VECTOR IMPLEMENTATION =================================
-
-template <size_t size_>
-class KmerHashVectorImpl: public IKmerHashVector {
-
-public:
-
- typedef TypeContainerImpl<size_> type_container;
-
- typedef typename type_container::Kmer Kmer;
-
- typedef typename type_container::vector_type vector_type;
-
- typedef std::vector<vector_type> data_type;
-
- typedef IKmerHashVector base_type;
-
- typedef typename base_type::input_value_type input_value_type;
-
-private:
-
- data_type data_;
-
- size_t k_;
-
-public:
-
- KmerHashVectorImpl(size_t k, size_t nthreads):
- IKmerHashVector(nthreads)
- , data_ (nthreads)
- , k_ (k) {
- }
-
- virtual base_type * copy() const {
- return new KmerHashVectorImpl<size_>(*this);
- }
-
- virtual bool is_full() const {
- return data_[0].size() >= cell_size_;
- }
-
- virtual bool is_presisely_full() const {
- for (size_t i = 0; i < nthreads_; ++i) {
- if (data_[i].size() >= cell_size_)
- return true;
- }
- return false;
- }
-
- virtual void insert(const input_value_type& value) {
- Kmer kmer = type_container::from_sequence(value);
- data_[kmer.GetHash() % nthreads_].push_back(kmer);
- }
-
- virtual void clear() {
- for (size_t i = 0; i < nthreads_; ++i) {
- data_[i].clear();
- }
- }
-
- virtual void clear(size_t i) {
- data_[i].clear();
- }
-
- virtual size_t get_k() const {
- return k_;
- }
-
- virtual size_t capacity(size_t i) const {
- return data_[i].capacity();
- }
-
- virtual size_t size(size_t i) const {
- return data_[i].size();
- }
-
- virtual void reserve(size_t cell_size) {
- cell_size_ = cell_size;
- for (size_t i = 0; i < nthreads_; ++i) {
- data_[i].reserve(cell_size_ + LOAD_OVERHEAD);
- }
- }
-
- const data_type& get_data() const {
- return data_;
- }
-
- virtual void dump (KmerMap<int>& destination, size_t bucketNum) {
- KmerMapImpl<size_, int>& destImpl = dynamic_cast<KmerMapImpl<size_, int>&>(destination.get_data());
-
- for (auto it = data_[bucketNum].begin(), end = data_[bucketNum].end(); it != end; ++it) {
- ++destImpl[*it];
- }
- }
-};
-
-
-// ================================= VECTOR FACTORIES =================================
-// Single factory interface
-class SingleKmerHashVectorFactory {
-
-public:
-
- virtual IKmerHashVector * GetHashVector(size_t k, size_t nthreads) const = 0;
-
- virtual ~SingleKmerHashVectorFactory() {
-
- }
-};
-
-
-// Single factory for specific k and value
-template <size_t ts_>
-class SingleKmerHashVectorFactoryImpl: public SingleKmerHashVectorFactory {
-
-public:
-
- virtual IKmerHashVector * GetHashVector(size_t k, size_t nthreads) const {
- VERIFY_MSG(GET_UPPER_BOUND(k) == GET_K_BY_TS(ts_), k << " -> " << GET_UPPER_BOUND(k) << ", " << ts_ << " -> " << GET_K_BY_TS(ts_));
- //INFO(k << " -> " << GET_UPPER_BOUND(k) << ", " << ts_ << " -> " << GET_K_BY_TS(ts_));
-
- return new KmerHashVectorImpl< GET_K_BY_TS(ts_) >(k, nthreads);
- }
-
-};
-
-//Factory generator
-template<size_t ts_>
-class HashVectorGenerator {
-
-public:
-
- static void GenerateHashVectors(std::vector< SingleKmerHashVectorFactory* > & factories) {
- factories[ts_] = new SingleKmerHashVectorFactoryImpl<ts_>();
- HashVectorGenerator<ts_ - 1> :: GenerateHashVectors (factories);
- }
-};
-
-//Terminating factory generator
-template<>
-class HashVectorGenerator<MIN_TS> {
-
-public:
-
- static void GenerateHashVectors(std::vector< SingleKmerHashVectorFactory* > & factories) {
- factories[MIN_TS] = new SingleKmerHashVectorFactoryImpl<MIN_TS>;
- }
-};
-
-
-//Lazy singleton for factory for every required value
-class KmerHashVectorFactory {
-
-private:
-
- std::vector < SingleKmerHashVectorFactory* > single_factories_;
-
- KmerHashVectorFactory() {
- VERIFY_MSG(MIN_K <= MAX_K, "Invalid K value range");
-
- single_factories_ = std::vector < SingleKmerHashVectorFactory* >(MAX_TS + 1);
- HashVectorGenerator<MAX_TS>::GenerateHashVectors(single_factories_);
- }
-
-public:
-
- static KmerHashVectorFactory& GetInstance() {
- static KmerHashVectorFactory instance;
-
- return instance;
- }
-
- KmerHashVector GetHashVector(size_t k, size_t nthreads) {
- VERIFY_MSG(k >= MIN_K && k <= MAX_K, "K value " + ToString(k) + " is not supported, should be >= " +
- ToString(MIN_K) + " and <= " + ToString(MAX_K));
-
- return KmerHashVector(single_factories_[GET_T_ELEMENTS_NUMBER(k)]->GetHashVector(k, nthreads));
- }
-
- IKmerHashVector * GetRawHashVector(size_t k, size_t nthreads) {
- VERIFY_MSG(k >= MIN_K && k <= MAX_K, "K value " + ToString(k) + " is not supported, should be >= " +
- ToString(MIN_K) + " and <= " + ToString(MAX_K));
-
- return single_factories_[GET_T_ELEMENTS_NUMBER(k)]->GetHashVector(k, nthreads);
- }
-};
-
-KmerHashVector GetHashVector(size_t k, size_t nthreads) {
- return KmerHashVectorFactory::GetInstance().GetHashVector(k, nthreads);
-}
-
-KmerHashVector::KmerHashVector(size_t k, size_t nthreads): data_(KmerHashVectorFactory::GetInstance().GetRawHashVector(k, nthreads)) {
-}
-
-} //namespace runtime_k
-
-#endif /* KMER_HASH_VECTOR_HPP_ */
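
The HashVectorGenerator recursion above fills one factory slot per compile-time index and stops at a specialization for the smallest index. A self-contained sketch of the same pattern, with IFactory/FactoryImpl/Generator as stand-in names:

    #include <cstddef>
    #include <iostream>
    #include <vector>

    struct IFactory {
        virtual int make() const = 0;
        virtual ~IFactory() {}
    };

    template <std::size_t N>
    struct FactoryImpl : IFactory {
        int make() const override { return int(N); }
    };

    template <std::size_t N>
    struct Generator {
        static void Fill(std::vector<IFactory*>& table) {
            table[N] = new FactoryImpl<N>();
            Generator<N - 1>::Fill(table);   // recurse towards the terminating case
        }
    };

    template <>
    struct Generator<0> {
        static void Fill(std::vector<IFactory*>& table) {
            table[0] = new FactoryImpl<0>();
        }
    };

    int main() {
        std::vector<IFactory*> table(4);
        Generator<3>::Fill(table);              // instantiates FactoryImpl<3> .. FactoryImpl<0>
        std::cout << table[2]->make() << '\n';  // prints: 2
        for (IFactory* f : table) delete f;
        return 0;
    }
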
diff --git a/src/include/adt/kmer_map.hpp b/src/include/adt/kmer_map.hpp
deleted file mode 100644
index 1f2a1bd..0000000
--- a/src/include/adt/kmer_map.hpp
+++ /dev/null
@@ -1,942 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-/*
- * kmer_map.hpp
- *
- * Created on: Jul 19, 2012
- * Author: Alexander Opeykin
- */
-
-#ifndef KMER_MAP_HPP_
-#define KMER_MAP_HPP_
-
-
-#include "runtime_k.hpp"
-
-
-namespace runtime_k {
-
-template <typename Value>
-class IKmerMapIterator {
-
-public:
- virtual ~IKmerMapIterator() {
-
- }
-
- typedef pair<const RtSeq, const Value&> value_type;
-
- typedef IKmerMapIterator<Value> iterator_type;
-
-
- virtual iterator_type * operator++() = 0;
-
- virtual iterator_type * operator++(int) = 0;
-
- virtual value_type operator*() = 0;
-
- virtual RtSeq first() = 0;
-
- virtual Value& second() = 0;
-
- virtual bool operator==(iterator_type * iter) const = 0;
-
- virtual bool operator!=(iterator_type * iter) const = 0;
-
- virtual iterator_type * copy() const = 0;
-
- virtual size_t get_k() const = 0;
-
-};
-
-//Iterator pointer wrapper
-template <typename Value>
-class KmerMapIterator {
-
-public:
-
- typedef IKmerMapIterator<Value> base_iterator_type;
-
-private:
-
- base_iterator_type * iter_;
-
-public:
-
- typedef typename base_iterator_type::value_type value_type;
-
- typedef KmerMapIterator<Value> iterator_type;
-
-
- KmerMapIterator(base_iterator_type * iter): iter_(iter) {
-
- }
-
- KmerMapIterator(const iterator_type& iter) {
- iter_ = iter.iter_->copy();
- }
-
- iterator_type& operator=(const iterator_type& iter) {
- if (iter.iter_ != iter_) {
- delete iter_;
- iter_ = iter.iter_->copy();
- }
-
- return *this;
- }
-
- ~KmerMapIterator() {
- delete iter_;
- }
-
-
- iterator_type operator++() {
- return iterator_type(iter_->operator ++());
- }
-
- iterator_type operator++(int) {
- return iterator_type(iter_->operator ++(0));
- }
-
- value_type operator*() {
- return iter_->operator *();
- }
-
- const RtSeq first() {
- return iter_->first();
- }
-
- Value& second() {
- return iter_->second();
- }
-
- bool operator==(const iterator_type& iter) const {
- return iter_->operator ==(iter.iter_);
- }
-
- bool operator!=(const iterator_type& iter) const {
- return iter_->operator !=(iter.iter_);
- }
-
-
- size_t get_k() const {
- return iter_->get_k();
- }
-
- base_iterator_type * get_data() const {
- return iter_;
- }
-};
-
-
-// ================================= MAP CONST ITERATOR INTERFACE =================================
-
-template <typename Value>
-class IKmerConstMapIterator {
-
-public:
-
- typedef pair<const RtSeq, const Value&> value_type;
-
- typedef IKmerConstMapIterator<Value> iterator_type;
-
- virtual ~IKmerConstMapIterator() {
-
- }
-
- virtual iterator_type * operator++() = 0;
-
- virtual iterator_type * operator++(int) = 0;
-
- virtual const value_type operator*() const = 0;
-
- virtual RtSeq first() const = 0;
-
- virtual const Value& second() const = 0;
-
- virtual bool operator==(iterator_type * iter) const = 0;
-
- virtual bool operator!=(iterator_type * iter) const = 0;
-
- virtual iterator_type * copy() const = 0;
-
- virtual size_t get_k() const = 0;
-
-};
-
-
-//Const iterator pointer wrapper
-template <typename Value>
-class KmerConstMapIterator {
-
-public:
-
- typedef IKmerConstMapIterator<Value> base_iterator_type;
-
-private:
-
- base_iterator_type * iter_;
-
-public:
-
- typedef typename base_iterator_type::value_type value_type;
-
- typedef KmerConstMapIterator<Value> iterator_type;
-
-
- KmerConstMapIterator(base_iterator_type * iter): iter_(iter) {
- }
-
- KmerConstMapIterator(const iterator_type& iter) {
- iter_ = iter.iter_->copy();
- }
-
- iterator_type& operator=(const iterator_type& iter) {
- if (iter.iter_ != iter_) {
- delete iter_;
- iter_ = iter.iter_->copy();
- }
-
- return *this;
- }
-
- ~KmerConstMapIterator() {
- delete iter_;
- }
-
-
- iterator_type operator++() {
- return iterator_type(iter_->operator ++());
- }
-
- iterator_type operator++(int) {
- return iterator_type(iter_->operator ++(0));
- }
-
- const value_type operator*() const {
- return iter_->operator *();
- }
-
- RtSeq first() const {
- return iter_->first();
- }
-
- const Value& second() const {
- return iter_->second();
- }
-
- bool operator==(const iterator_type& iter) const {
- return iter_->operator ==(iter.iter_);
- }
-
- bool operator!=(const iterator_type& iter) const {
- return iter_->operator !=(iter.iter_);
- }
-
- size_t get_k() const {
- return iter_->get_k();
- }
-
- base_iterator_type * get_data() const {
- return iter_;
- }
-};
-
-
-// ================================= MAP INTERFACE =================================
-
-template <typename Value>
-class IKmerMap {
-
-public:
-
- typedef RtSeq key_type;
-
- typedef pair<const key_type, Value> value_type;
-
- typedef IKmerMapIterator<Value> iterator_type;
-
- typedef IKmerConstMapIterator<Value> const_iterator_type;
-
- virtual ~IKmerMap() {
-
- }
-
- virtual IKmerMap<Value> * copy() const = 0;
-
- virtual bool empty() const = 0;
-
- virtual size_t size() const = 0;
-
- virtual size_t max_size() const = 0;
-
-
- virtual const_iterator_type * cbegin() const = 0;
-
- virtual iterator_type * begin() = 0;
-
- virtual const_iterator_type * cend() const = 0;
-
- virtual iterator_type * end() = 0;
-
- virtual Value& operator[](const key_type& kmer_seq) = 0;
-
-
- virtual const_iterator_type * cfind(const key_type& kmer_seq) const = 0;
-
- virtual iterator_type * find(const key_type& kmer_seq) = 0;
-
- virtual size_t count(const key_type& kmer_seq) const = 0;
-
-
- virtual pair<iterator_type *, bool> insert(const value_type& val) = 0;
-
- virtual size_t erase(const key_type& kmer_seq) = 0;
-
- //virtual iterator_type * erase(const_iterator_type * iter) = 0;
-
- virtual iterator_type * erase(iterator_type * iter) = 0;
-
- virtual void clear() = 0;
-
-
- virtual size_t bucket_count() const = 0;
-
- virtual size_t max_bucket_count() const = 0;
-
- virtual size_t bucket_size(size_t n) const = 0;
-
- virtual size_t bucket(const RtSeq& kmer_seq) const = 0;
-
- virtual float load_factor() const = 0;
-
- virtual float max_load_factor() const = 0;
-
- virtual void max_load_factor(float z) = 0;
-
- virtual void rehash(size_t n) = 0;
-
- virtual size_t get_k() const = 0;
-};
-
-//Map pointer wrapper
-template <typename Value, typename Seq = RtSeq>
-class KmerMap {
-};
-
-template <typename Value>
-class KmerMap<Value, RtSeq> {
-
-public:
-
- typedef IKmerMap<Value> base_map_type;
-
-private:
-
- base_map_type * data_;
-
-public:
-
- typedef KmerMap<Value> map_type;
-
- typedef typename base_map_type::key_type key_type;
-
- typedef typename base_map_type::value_type value_type;
-
- typedef KmerMapIterator<Value> iterator;
-
- typedef KmerConstMapIterator<Value> const_iterator;
-
- KmerMap(size_t k);
-
- KmerMap(base_map_type * map): data_(map) {
- }
-
- KmerMap(const map_type& map) {
- data_ = map.data_->copy();
- }
-
- map_type& operator=(const map_type& map) {
- if (map.data_ != data_) {
- delete data_;
- data_ = map.data_->copy();
- }
-
- return *this;
- }
-
- ~KmerMap() {
- delete data_;
- }
-
- bool empty() const {
- return data_->empty();
- }
-
- size_t size() const {
- return data_->size();
- }
-
- size_t max_size() const {
- return data_->max_size();
- }
-
- const_iterator begin() const {
- return const_iterator(data_->cbegin());
- }
-
- iterator begin() {
- return iterator(data_->begin());
- }
-
- const_iterator end() const {
- return const_iterator(data_->cend());
- }
-
- iterator end() {
- return iterator(data_->end());
- }
-
- Value& operator[](const RtSeq& kmer_seq) {
- return data_->operator [](kmer_seq);
- }
-
- const_iterator find(const RtSeq& kmer_seq) const {
- return const_iterator(data_->cfind(kmer_seq));
- }
-
- iterator find(const RtSeq& kmer_seq) {
- return iterator(data_->find(kmer_seq));
- }
-
- size_t count(const RtSeq& kmer_seq) const {
- return data_->count(kmer_seq);
- }
-
- pair<iterator, bool> insert(const value_type& val) {
- auto res = data_->insert(val);
- return make_pair(iterator(res.first), res.second);
- }
-
- size_t erase(const RtSeq& kmer_seq) {
- return data_->erase(kmer_seq);
- }
-
-// iterator erase(const const_iterator& iter) {
-// return iterator(data_->erase(iter.get_data()));
-// }
-
- iterator erase(const iterator& iter) {
- return iterator(data_->erase(iter.get_data()));
- }
-
- void clear() {
- data_->clear();
- }
-
- size_t bucket_count() const {
- return data_->bucket_count();
- }
-
- size_t max_bucket_count() const {
- return data_->max_bucket_count();
- }
-
- size_t bucket_size(size_t n) const {
- return data_->bucket_size(n);
- }
-
- size_t bucket(const RtSeq& kmer_seq) const {
- return data_->bucket(kmer_seq);
- }
-
- float load_factor() const {
- return data_->load_factor();
- }
-
- float max_load_factor() const {
- return data_->max_load_factor();
- }
-
- void max_load_factor(float z) {
- data_->max_load_factor(z);
- }
-
- void rehash(size_t n) {
- data_->rehash(n);
- }
-
- size_t get_k() const {
- return data_->get_k();
- }
-
- base_map_type& get_data() {
- return *data_;
- }
-
-
-
-};
-
-
-// ================================= MAP ITERATOR IMPLEMENTATION =================================
-template <size_t size_, typename Value>
-class KmerMapIteratorImpl: public IKmerMapIterator<Value> {
-
- typedef TypeValueContainerImpl<size_, Value> type_container;
-
- typedef typename type_container::map_type map_type;
-
- typedef typename map_type::iterator map_iterator;
-
-
- typedef KmerMapIteratorImpl<size_, Value> iterator_impl;
-
- typedef IKmerMapIterator<Value> base_type;
-
- typedef typename base_type::value_type value_type;
-
-private:
- map_iterator iter_;
-
- size_t k_;
-
-public:
-
- KmerMapIteratorImpl(size_t k, const map_iterator& iter): iter_(iter), k_(k) {
- }
-
- virtual base_type * operator++() {
- return new iterator_impl(k_, ++iter_);
- }
-
- virtual base_type * operator++(int) {
- return new iterator_impl(k_, iter_++);
- }
-
- virtual value_type operator*() {
- return make_pair(type_container::to_sequence(iter_->first, k_), (*iter_).second);
- }
-
-
- virtual RtSeq first() {
- return type_container::to_sequence(iter_->first, k_);
- }
-
- virtual Value& second() {
- return iter_->second;
- }
-
- virtual bool operator==(base_type * iter) const {
- iterator_impl * it = dynamic_cast< iterator_impl * > (iter);
- return iter_ == it->iter_;
- }
-
- virtual bool operator!=(base_type * iter) const {
- return !operator ==(iter);
- }
-
- virtual base_type * copy() const {
- return new iterator_impl(k_, iter_);
- }
-
- virtual size_t get_k() const {
- return k_;
- }
-
- const map_iterator& get_data() const {
- return iter_;
- }
-};
-
-
-// ================================= MAP CONST ITERATOR IMPLEMENTATION =================================
-template <size_t size_, typename Value>
-class KmerConstMapIteratorImpl: public IKmerConstMapIterator<Value> {
-
- typedef TypeValueContainerImpl<size_, Value> type_container;
-
- typedef typename type_container::map_type map_type;
-
- typedef typename map_type::const_iterator map_iterator;
-
-
- typedef KmerConstMapIteratorImpl<size_, Value> iterator_impl;
-
- typedef IKmerConstMapIterator<Value> base_type;
-
- typedef typename base_type::value_type value_type;
-
-
-private:
- map_iterator iter_;
-
- size_t k_;
-
-public:
-
- KmerConstMapIteratorImpl(size_t k, const map_iterator& iter): iter_(iter), k_(k) {
- }
-
- virtual base_type * operator++() {
- return new iterator_impl(k_, ++iter_);
- }
-
- virtual base_type * operator++(int) {
- return new iterator_impl(k_, iter_++);
- }
-
- virtual const value_type operator*() const {
- return make_pair(type_container::to_sequence(iter_->first, k_), iter_->second);
- }
-
-
- virtual RtSeq first() const {
- return type_container::to_sequence(iter_->first, k_);
- }
-
- virtual const Value& second() const {
- return iter_->second;
- }
-
- virtual bool operator==(base_type * iter) const {
- iterator_impl * it = dynamic_cast< iterator_impl * > (iter);
- return iter_ == it->iter_;
- }
-
- virtual bool operator!=(base_type * iter) const {
- return !operator ==(iter);
- }
-
- virtual base_type * copy() const {
- return new iterator_impl(k_, iter_);
- }
-
- virtual size_t get_k() const {
- return k_;
- }
-
- const map_iterator& get_data() const {
- return iter_;
- }
-};
-
-
-// ================================= MAP IMPLEMENTATION =================================
-template <size_t size_, typename Value>
-class KmerMapImpl: public IKmerMap<Value> {
-
-public:
-
- typedef TypeValueContainerImpl<size_, Value> type_container;
-
- typedef typename type_container::map_type map_type;
-
- typedef typename type_container::Kmer Kmer;
-
- typedef IKmerMap<Value> base_type;
-
- typedef typename base_type::key_type key_type;
-
- typedef typename base_type::value_type value_type;
-
-
- typedef KmerMapIteratorImpl<size_, Value> iterator_impl;
-
- typedef typename base_type::iterator_type iterator_type;
-
- typedef KmerConstMapIteratorImpl<size_, Value> const_iterator_impl;
-
- typedef typename base_type::const_iterator_type const_iterator_type;
-
-private:
-
- map_type * data_;
-
- size_t k_;
-
-
-public:
-
- KmerMapImpl(size_t k, size_t n) {
- data_ = new map_type(n);
- k_ = k;
- }
-
- KmerMapImpl(size_t k) {
- data_ = new map_type();
- k_ = k;
- }
-
- KmerMapImpl(const KmerMapImpl& map) {
- data_ = new map_type(*(map.data_));
- k_ = map.k_;
- }
-
- virtual ~KmerMapImpl() {
- delete data_;
- }
-
- virtual base_type * copy() const {
- return new KmerMapImpl<size_, Value>(*this);
- }
-
- virtual bool empty() const {
- return data_->empty();
- }
-
- virtual size_t size() const {
- return data_->size();
- }
-
- virtual size_t max_size() const {
- return data_->max_size();
- }
-
- virtual const_iterator_type * cbegin() const {
- return new const_iterator_impl(k_, data_->begin());
- }
-
- virtual iterator_type * begin() {
- return new iterator_impl(k_, data_->begin());
- }
-
- virtual const_iterator_type * cend() const {
- return new const_iterator_impl(k_, data_->end());
- }
-
- virtual iterator_type * end() {
- return new iterator_impl(k_, data_->end());
- }
-
-
- virtual Value& operator[](const key_type& kmer_seq) {
- return (*data_)[type_container::from_sequence(kmer_seq)];
- }
-
- Value& operator[](const Kmer& kmer) {
- return (*data_)[kmer];
- }
-
-
- virtual const_iterator_type * cfind(const key_type& kmer_seq) const {
- return new const_iterator_impl(k_, data_->find(type_container::from_sequence(kmer_seq)));
- }
-
- virtual iterator_type * find(const key_type& kmer_seq) {
- return new iterator_impl(k_, data_->find(type_container::from_sequence(kmer_seq)));
- }
-
- virtual size_t count(const key_type& kmer_seq) const {
- return data_->count(type_container::from_sequence(kmer_seq));
- }
-
-
- virtual pair<iterator_type *, bool> insert(const value_type& val) {
- auto res = data_->insert(make_pair(type_container::from_sequence(val.first), val.second));
- return make_pair(new iterator_impl(k_, res.first), res.second);
- }
-
- virtual size_t erase(const key_type& kmer_seq) {
- return data_->erase(type_container::from_sequence(kmer_seq));
- }
-
- virtual iterator_type * erase(iterator_type * iter) {
- VERIFY_MSG(iter->get_k() == k_, "Unable to erase by iterator of different k value");
-
- //iterator_impl * it = (iterator_impl *) iter;
- iterator_impl * it = dynamic_cast< iterator_impl * > (iter);
- return new iterator_impl(k_, data_->erase(it->get_data()));
- }
-
-// virtual iterator_type * erase(const_iterator_type * iter) {
-// VERIFY_MSG(iter->get_k() == size_, "Unable to erase by iterator of different k value");
-//
-// //const_iterator_impl * it = (const_iterator_impl *) iter;
-// const_iterator_impl * it = dynamic_cast< const_iterator_impl * > (iter);
-// return new iterator_impl(data_->erase(it->get_data()));
-// }
-
-
- virtual void clear() {
- delete data_;
- data_ = new map_type();
- }
-
- virtual size_t bucket_count() const {
- return data_->bucket_count();
- }
-
- virtual size_t max_bucket_count() const {
- return data_->max_bucket_count();
- }
-
- virtual size_t bucket_size(size_t n) const {
- return data_->bucket_size(n);
- }
-
- virtual size_t bucket(const RtSeq& kmer_seq) const {
- return data_->bucket(type_container::from_sequence(kmer_seq));
- }
-
- virtual float load_factor() const {
- return data_->load_factor();
- }
-
- virtual float max_load_factor() const {
- return data_->max_load_factor();
- }
-
- virtual void max_load_factor(float z) {
- data_->max_load_factor(z);
- }
-
- virtual void rehash(size_t n) {
- data_->rehash(n);
- }
-
- virtual size_t get_k() const {
- return k_;
- }
-
-};
-
-
-// ================================= MAP FACTORIES =================================
-
-// Single factory interface
-template<class Value>
-class SingleKmerMapFactory {
-
-public:
-
- virtual IKmerMap<Value> * GetMap(size_t k, size_t capacity) const = 0;
-
- virtual IKmerMap<Value> * GetMap(size_t k) const = 0;
-
- virtual ~SingleKmerMapFactory() {
-
- }
-
-};
-
-
-// Single factory for specific k and value
-template <size_t ts_, class Value>
-class SingleKmerMapFactoryImpl: public SingleKmerMapFactory<Value> {
-
-public:
-
- virtual IKmerMap<Value> * GetMap(size_t k, size_t capacity) const {
- VERIFY_MSG(GET_UPPER_BOUND(k) == GET_K_BY_TS(ts_), k << " -> " << GET_UPPER_BOUND(k) << ", " << ts_ << " -> " << GET_K_BY_TS(ts_));
-
- return new KmerMapImpl<GET_K_BY_TS(ts_), Value>(k, capacity);
- }
-
- virtual IKmerMap<Value> * GetMap(size_t k) const {
- VERIFY_MSG(GET_UPPER_BOUND(k) == GET_K_BY_TS(ts_), k << " -> " << GET_UPPER_BOUND(k) << ", " << ts_ << " -> " << GET_K_BY_TS(ts_));
-
- return new KmerMapImpl<GET_K_BY_TS(ts_), Value>(k);
- }
-
-};
-
-//Factory generator
-template<size_t ts_, class Value>
-class MapGenerator {
-
-public:
-
- static void GenerateMaps(std::vector< SingleKmerMapFactory<Value>* > & factories) {
- factories.at(ts_) = new SingleKmerMapFactoryImpl<ts_, Value>();
- MapGenerator<ts_ - 1, Value> :: GenerateMaps (factories);
- }
-};
-
-//Terminating factory generator
-template<class Value>
-class MapGenerator<MIN_TS, Value> {
-
-public:
-
- static void GenerateMaps(std::vector< SingleKmerMapFactory<Value>* > & factories) {
- factories.at(MIN_TS) = new SingleKmerMapFactoryImpl<MIN_TS, Value>;
- }
-};
-
-
-//Lazy singleton for factory for every required value
-template<class Value>
-class KmerValueMapFactory {
-
-private:
-
- std::vector < SingleKmerMapFactory<Value>* > single_factories_;
-
- KmerValueMapFactory() {
- VERIFY_MSG(MIN_K <= MAX_K, "Invalid K value range");
-
- single_factories_ = std::vector < SingleKmerMapFactory<Value>* >(MAX_TS + 1);
- MapGenerator<MAX_TS, Value>::GenerateMaps(single_factories_);
- }
-
- ~KmerValueMapFactory() {
- for (auto I = single_factories_.begin(), E = single_factories_.end(); I != E; ++ I)
- delete *I;
- }
-
-public:
-
- static KmerValueMapFactory& GetInstance() {
- static KmerValueMapFactory instance;
- return instance;
- }
-
- KmerMap<Value> GetMap(size_t k, size_t capacity) {
- VERIFY_MSG(k >= MIN_K && k <= MAX_K, "K value " + ToString(k) + " is not supported, should be >= " +
- ToString(MIN_K) + " and <= " + ToString(MAX_K));
-
- return KmerMap<Value>(single_factories_[GET_T_ELEMENTS_NUMBER(k)]->GetMap(k, capacity));
- }
-
- KmerMap<Value> GetMap(size_t k) {
- VERIFY_MSG(k >= MIN_K && k <= MAX_K, "K value " + ToString(k) + " is not supported, should be >= " +
- ToString(MIN_K) + " and <= " + ToString(MAX_K));
-
- return KmerMap<Value>(single_factories_[GET_T_ELEMENTS_NUMBER(k)]->GetMap(k));
- }
-
- IKmerMap<Value> * GetRawMap(size_t k) {
- VERIFY_MSG(k >= MIN_K && k <= MAX_K, "K value " + ToString(k) + " is not supported, should be >= " +
- ToString(MIN_K) + " and <= " + ToString(MAX_K));
-
- return single_factories_[GET_T_ELEMENTS_NUMBER(k)]->GetMap(k);
- }
-};
-
-
-// Main map getter
-template<class Value>
-KmerMap<Value> GetMap(size_t k, size_t capacity) {
- return KmerValueMapFactory<Value>::GetInstance().GetMap(k, capacity);
-}
-
-template<class Value>
-KmerMap<Value> GetMap(size_t k) {
- return KmerValueMapFactory<Value>::GetInstance().GetMap(k);
-}
-
-template<class Value>
-KmerMap<Value>::KmerMap(size_t k): data_(KmerValueMapFactory<Value>::GetInstance().GetRawMap(k)) {
-}
-
-
-} /* namespace runtime_k */
-
-
-#endif /* KMER_MAP_HPP_ */
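
KmerMap above is a value-semantics wrapper around a polymorphic implementation: copies go through the virtual copy() and the destructor deletes the owned pointer. A self-contained sketch of that idiom, with Wrapper/IImpl/SmallImpl as stand-in names:

    #include <cstddef>
    #include <iostream>

    struct IImpl {
        virtual IImpl* copy() const = 0;
        virtual std::size_t size() const = 0;
        virtual ~IImpl() {}
    };

    struct SmallImpl : IImpl {
        std::size_t n;
        explicit SmallImpl(std::size_t n) : n(n) {}
        IImpl* copy() const override { return new SmallImpl(*this); }
        std::size_t size() const override { return n; }
    };

    class Wrapper {
        IImpl* data_;
    public:
        explicit Wrapper(IImpl* impl) : data_(impl) {}
        Wrapper(const Wrapper& other) : data_(other.data_->copy()) {}
        Wrapper& operator=(const Wrapper& other) {
            if (other.data_ != data_) {
                delete data_;
                data_ = other.data_->copy();   // deep copy, as in KmerMap::operator=
            }
            return *this;
        }
        ~Wrapper() { delete data_; }
        std::size_t size() const { return data_->size(); }
    };

    int main() {
        Wrapper a(new SmallImpl(5));
        Wrapper b = a;                  // copy constructor clones the implementation
        std::cout << b.size() << '\n';  // prints: 5
        return 0;
    }
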
diff --git a/src/include/adt/kmer_set.hpp b/src/include/adt/kmer_set.hpp
deleted file mode 100644
index 1ece8d5..0000000
--- a/src/include/adt/kmer_set.hpp
+++ /dev/null
@@ -1,364 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-/*
- * kmer_set.hpp
- *
- * Created on: Jul 19, 2012
- * Author: alex
- */
-
-#ifndef KMER_SET_HPP_
-#define KMER_SET_HPP_
-
-
-#include "runtime_k.hpp"
-#include "kmer_hash_vector.hpp"
-
-
-namespace runtime_k {
-
-
-class IKmerSet {
-
-public:
-
- typedef RtSeq input_value_type;
-
- virtual ~IKmerSet() {
-
- }
-
- virtual IKmerSet * copy() const = 0;
-
- virtual bool empty() const = 0;
-
- virtual size_t size() const = 0;
-
- virtual size_t count(const input_value_type& kmer_seq) const = 0;
-
- virtual bool insert(const input_value_type& val) = 0;
-
- virtual void transfer(IKmerHashVector * vec, size_t thread_num) = 0;
-
- virtual void clear() = 0;
-
- virtual void erase() = 0;
-
- virtual void to_file(const std::string& s) const = 0;
-
- virtual size_t get_k() const = 0;
-
- virtual bool contains(const input_value_type& val) = 0;
-
-};
-
-
-
-class KmerSet {
-
-public:
-
- typedef IKmerSet base_set_type;
-
-private:
-
- base_set_type * data_;
-
-public:
-
- typedef KmerSet set_type;
-
- typedef base_set_type::input_value_type input_value_type;
-
- KmerSet(size_t k);
-
- KmerSet(base_set_type * set): data_(set) {
- }
-
- KmerSet(const set_type& set) {
- data_ = set.data_->copy();
- }
-
- set_type& operator=(const set_type& set) {
- if (set.data_ != data_) {
- delete data_;
- data_ = set.data_->copy();
- }
-
- return *this;
- }
-
- ~KmerSet() {
- delete data_;
- }
-
-
- bool empty() const {
- return data_->empty();
- }
-
- size_t size() const {
- return data_->size();
- }
-
- size_t count(const input_value_type& kmer_seq) const {
- return data_->count(kmer_seq);
- }
-
- bool insert(const input_value_type& val) {
- return data_->insert(val);
- }
-
- void transfer(const KmerHashVector& vec, size_t thread_num) {
- data_->transfer(vec.get_data(), thread_num);
- }
-
- void clear() {
- data_->clear();
- }
-
- void erase() {
- data_->erase();
- }
-
- void to_file(const std::string& s) const {
- data_->to_file(s);
- }
-
- size_t get_k() const {
- return data_->get_k();
- }
-
- base_set_type * get_data() const {
- return data_;
- }
-
- bool contains(const input_value_type& val) {
- return data_->contains(val);
- }
-
-};
-
-
-
-// ================================= SET IMPLEMENTATION =================================
-
-
-
-template <size_t size_>
-class KmerSetImpl: public IKmerSet {
-
-public:
-
- typedef TypeContainerImpl<size_> type_container;
-
- typedef typename type_container::set_type set_type;
-
- typedef IKmerSet base_type;
-
- typedef typename base_type::input_value_type input_value_type;
-
-private:
-
- set_type data_;
-
- size_t k_;
-
-public:
-
- KmerSetImpl(size_t k, size_t n): data_(n), k_(k) {
- }
-
- KmerSetImpl(size_t k): data_(), k_(k) {
- }
-
- virtual base_type * copy() const {
- return new KmerSetImpl<size_>(*this);
- }
-
- virtual bool empty() const {
- return data_.empty();
- }
-
- virtual size_t size() const {
- return data_.size();
- }
-
- virtual size_t count(const input_value_type& kmer_seq) const {
- return data_.count(type_container::from_sequence(kmer_seq));
- }
-
- virtual bool insert(const input_value_type& val) {
- return data_.insert(type_container::from_sequence(val)).second;
- }
-
- virtual void transfer(IKmerHashVector * vec, size_t thread_num) {
- VERIFY_MSG(vec->get_k() == k_, "Unable to transfer vector to set of different k values");
-
- //KmerHashVectorImpl<size_> * vec_impl = (KmerHashVectorImpl<size_> *) vec;
- KmerHashVectorImpl<size_> * vec_impl = dynamic_cast< KmerHashVectorImpl<size_> *>(vec);
- data_.insert(vec_impl->get_data()[thread_num].begin(), vec_impl->get_data()[thread_num].end());
-
-// for (auto iter = vec_impl->get_data()[thread_num].begin(); iter != vec_impl->get_data()[thread_num].end(); ++iter) {
-// data_.insert(*iter);
-// }
- }
-
- virtual void clear() {
- data_.clear();
- }
-
- virtual void erase() {
- data_.erase(data_.begin(), data_.end());
- }
-
- virtual size_t get_k() const {
- return k_;
- }
-
- virtual void to_file(const std::string& s) const {
- ofstream kmeros;
- kmeros.open(s.c_str());
- for (auto iter = data_.begin(); iter != data_.end(); ++iter) {
- kmeros << *iter << std::endl;
- }
- kmeros.close();
- }
-
- virtual bool contains(const input_value_type& val) {
- return (data_.find(type_container::from_sequence(val)) != data_.end());
- }
-
- const set_type& get_data() const {
- return data_;
- }
-
-};
-
-// ================================= SET FACTORIES =================================
-// Single factory interface
-class SingleKmerSetFactory {
-
-public:
-
- virtual IKmerSet * GetSet(size_t k, size_t capacity) const = 0;
-
- virtual IKmerSet * GetSet(size_t k) const = 0;
-
- virtual ~SingleKmerSetFactory() {
-
- }
-
-};
-
-
-// Single factory for specific k and value
-template <size_t ts_>
-class SingleKmerSetFactoryImpl: public SingleKmerSetFactory {
-
-public:
-
- virtual IKmerSet * GetSet(size_t k, size_t capacity) const {
- VERIFY_MSG(GET_UPPER_BOUND(k) == GET_K_BY_TS(ts_), k << " -> " << GET_UPPER_BOUND(k) << ", " << ts_ << " -> " << GET_K_BY_TS(ts_));
-
- return new KmerSetImpl< GET_K_BY_TS(ts_) >(k, capacity);
- }
-
- virtual IKmerSet * GetSet(size_t k) const {
- VERIFY_MSG(GET_UPPER_BOUND(k) == GET_K_BY_TS(ts_), k << " -> " << GET_UPPER_BOUND(k) << ", " << ts_ << " -> " << GET_K_BY_TS(ts_));
-
- return new KmerSetImpl< GET_K_BY_TS(ts_) >(k);
- }
-
-};
-
-//Factory generator
-template<size_t ts_>
-class SetGenerator {
-
-public:
-
- static void GenerateSets(std::vector< SingleKmerSetFactory* > & factories) {
- factories[ts_] = new SingleKmerSetFactoryImpl<ts_>();
- SetGenerator<ts_ - 1> :: GenerateSets (factories);
- }
-};
-
-//Terminating factory generator
-template<>
-class SetGenerator<MIN_TS> {
-
-public:
-
- static void GenerateSets(std::vector< SingleKmerSetFactory* > & factories) {
- factories[MIN_TS] = new SingleKmerSetFactoryImpl<MIN_TS>();
- }
-};
-
-
-//Lazy singleton for factory for every required value
-class KmerSetFactory {
-
-private:
-
- std::vector < SingleKmerSetFactory* > single_factories_;
-
- KmerSetFactory() {
- VERIFY_MSG(MIN_K <= MAX_K, "Invalid K value range");
-
- single_factories_ = std::vector < SingleKmerSetFactory* >(MAX_TS + 1);
- SetGenerator<MAX_TS>::GenerateSets(single_factories_);
- }
-
-public:
-
- static KmerSetFactory& GetInstance() {
- static KmerSetFactory instance;
-
- return instance;
- }
-
- KmerSet GetSet(size_t k, size_t capacity) {
- VERIFY_MSG(k >= MIN_K && k <= MAX_K, "K value " + ToString(k) + " is not supported, should be >= " +
- ToString(MIN_K) + " and <= " + ToString(MAX_K));
-
- return KmerSet(single_factories_[GET_T_ELEMENTS_NUMBER(k)]->GetSet(k, capacity));
- }
-
- KmerSet GetSet(size_t k) {
- VERIFY_MSG(k >= MIN_K && k <= MAX_K, "K value " + ToString(k) + " is not supported, should be >= " +
- ToString(MIN_K) + " and <= " + ToString(MAX_K));
-
- return KmerSet(single_factories_[GET_T_ELEMENTS_NUMBER(k)]->GetSet(k));
- }
-
- IKmerSet * GetRawSet(size_t k) {
- VERIFY_MSG(k >= MIN_K && k <= MAX_K, "K value " + ToString(k) + " is not supported, should be >= " +
- ToString(MIN_K) + " and <= " + ToString(MAX_K));
-
- return single_factories_[GET_T_ELEMENTS_NUMBER(k)]->GetSet(k);
- }
-};
-
-
-// Main set getters
-KmerSet GetSet(size_t k, size_t capacity) {
- return KmerSetFactory::GetInstance().GetSet(k, capacity);
-}
-
-KmerSet GetSet(size_t k) {
- return KmerSetFactory::GetInstance().GetSet(k);
-}
-
-KmerSet::KmerSet(size_t k): data_(KmerSetFactory::GetInstance().GetRawSet(k)) {
-}
-
-} /* namespace runtime_k */
-
-
-#endif /* KMER_SET_HPP_ */
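
KmerSet::transfer above merges one thread's bucket of a KmerHashVector into the set with a range insert, so duplicate k-mers collapse. A self-contained sketch of that step using std::set and plain ints:

    #include <iostream>
    #include <set>
    #include <vector>

    int main() {
        // buckets[t] holds the values collected by thread t.
        std::vector<std::vector<int>> buckets = {{3, 1, 3}, {2, 5}};
        std::set<int> merged;
        for (const std::vector<int>& b : buckets)
            merged.insert(b.begin(), b.end());   // duplicates collapse in the set
        std::cout << merged.size() << '\n';      // prints: 4
        return 0;
    }
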
diff --git a/src/include/adt/kmer_vector.hpp b/src/include/adt/kmer_vector.hpp
deleted file mode 100644
index 7006c7c..0000000
--- a/src/include/adt/kmer_vector.hpp
+++ /dev/null
@@ -1,165 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#ifndef __KMER_VECTOR_HPP__
-#define __KMER_VECTOR_HPP__
-
-#include "array_vector.hpp"
-#include "config.hpp"
-
-#ifdef SPADES_USE_JEMALLOC
-# include <jemalloc/jemalloc.h>
-#endif
-
-template<class Seq>
-class KMerVector {
- private:
- typedef typename Seq::DataType ElTy;
-
- ElTy *realloc() {
-#ifdef SPADES_USE_JEMALLOC
- // First, try to expand in-place
- if (storage_ && sizeof(ElTy) * capacity_ * el_sz_ > 4096 &&
- je_rallocm((void**)&storage_, NULL, sizeof(ElTy) * capacity_ * el_sz_, 0, ALLOCM_NO_MOVE) == ALLOCM_SUCCESS)
- return storage_;
-
- // Failed, do usual malloc / memcpy / free cycle
- ElTy *res = (ElTy*) je_malloc(sizeof(ElTy) * capacity_ * el_sz_);
- if (storage_)
- std::memcpy(res, storage_, size_ * sizeof(ElTy) * el_sz_);
- je_free(storage_);
- storage_ = res;
-#else
- // No JEMalloc, no cookies
- ElTy *res = new ElTy[capacity_ * el_sz_];
- if (storage_)
-        std::memcpy(res, storage_, size_ * sizeof(ElTy) * el_sz_);
-
- delete[] storage_;
- storage_ = res;
-#endif
-
- return storage_;
- }
-
- public:
- typedef typename array_vector<ElTy>::reference reference;
- typedef typename array_vector<ElTy>::value_type value_type;
- typedef typename array_vector<ElTy>::iterator iterator;
- typedef typename array_vector<ElTy>::const_iterator const_iterator;
-
- typedef array_less<ElTy> less2_fast;
- typedef array_equal_to<ElTy> equal_to;
-
- explicit KMerVector(unsigned K, size_t capacity = 1)
- : K_(K), size_(0), capacity_(std::max(capacity, (size_t)1)), el_sz_(Seq::GetDataSize(K)), storage_(NULL), vector_(realloc(), size_, el_sz_) {
- }
-
- KMerVector(KMerVector &&that)
- : K_(that.K_), size_(that.size_), capacity_(that.capacity_), el_sz_(that.el_sz_), storage_(that.storage_), vector_(storage_, size_, el_sz_) {
- that.storage_ = NULL;
- }
-
- KMerVector(const KMerVector &that)
- : K_(that.K_), size_(that.size_), capacity_(that.capacity_), el_sz_(that.el_sz_), storage_(NULL), vector_(realloc(), size_, el_sz_) {
- memcpy(storage_, that.storage_, size_ * sizeof(ElTy) * el_sz_);
- }
-
- ~KMerVector() {
-#ifdef SPADES_USE_JEMALLOC
- je_free(storage_);
-#else
- delete[] storage_;
-#endif
- }
-
- KMerVector &operator=(const KMerVector& that) {
- if (this != &that) {
- K_ = that.K_;
- size_ = that.size_;
- capacity_ = that.capacity_;
- el_sz_ = that.el_sz_;
-
- storage_ = NULL;
- realloc();
- memcpy(storage_, that.storage_, size_ * sizeof(ElTy) * el_sz_);
-
- vector_.set_data(storage_);
- vector_.set_size(size_);
- }
-
- return *this;
- }
-
- void push_back(const ElTy *data) {
- if (capacity_ == size_)
- reserve(capacity_ * 2);
-
- vector_[size_] = data;
- size_ += 1;
- vector_.set_size(size_);
- }
-
- void push_back(const Seq &s) {
- push_back(s.data());
- }
-
- void reserve(size_t amount) {
- if (capacity_ < amount) {
- capacity_ = amount;
- vector_.set_data(realloc());
- }
- }
-
- void clear() {
- size_ = 0;
- vector_.set_size(size_);
- }
-
- iterator begin() {
- return vector_.begin();
- }
- const_iterator begin() const {
- return vector_.begin();
- }
- iterator end() {
- return vector_.end();
- }
- const_iterator end() const {
- return vector_.end();
- }
-
- const ElTy* data() const {
- return storage_;
- }
- size_t size() const {
- return size_;
- }
- size_t el_size() const {
- return el_sz_;
- }
- size_t el_data_size() const {
- return el_sz_ * sizeof(ElTy);
- }
- size_t capacity() const {
- return capacity_;
- }
- const ElTy *operator[](size_t idx) const {
- return vector_[idx];
- }
-
- private:
- unsigned K_;
- size_t size_;
- size_t capacity_;
- size_t el_sz_;
- ElTy *storage_;
- array_vector<ElTy> vector_;
-};
-
-
-#endif /* __KMER_VECTOR_HPP */
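
KMerVector above grows by doubling its capacity and flat-copying its POD storage (through jemalloc when available). A self-contained sketch of the same grow-and-copy strategy without jemalloc:

    #include <cstddef>
    #include <cstring>
    #include <iostream>

    int main() {
        std::size_t size = 0, capacity = 1;
        int* storage = new int[capacity];
        for (int v = 0; v < 10; ++v) {
            if (size == capacity) {                    // mirrors reserve(capacity_ * 2)
                capacity *= 2;
                int* fresh = new int[capacity];
                std::memcpy(fresh, storage, size * sizeof(int));
                delete[] storage;
                storage = fresh;
            }
            storage[size++] = v;
        }
        std::cout << size << ' ' << capacity << '\n';  // prints: 10 16
        delete[] storage;
        return 0;
    }
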
diff --git a/src/include/adt/parallel_seq_vector.hpp b/src/include/adt/parallel_seq_vector.hpp
deleted file mode 100644
index dd2821b..0000000
--- a/src/include/adt/parallel_seq_vector.hpp
+++ /dev/null
@@ -1,110 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include "omni/parallel_unordered_map.hpp"
-#include "openmp_wrapper.h"
-
-#include "runtime_k.hpp"
-#include "kmer_map.hpp"
-#include "kmer_hash_vector.hpp"
-
-class ParallelSeqVector {
-
-public:
- typedef runtime_k::KmerHashVector par_container_t;
-
- typedef runtime_k::KmerMap<int> destination_container_t;
-
- typedef runtime_k::RtSeq Kmer;
-
-private:
-
- size_t k_;
-
- size_t nthreads_;
-
- std::vector<par_container_t> nodes_;
-
-public:
-
- ParallelSeqVector(size_t k, size_t nthreads, size_t cell_size) :
- k_(k),
- nthreads_(nthreads),
- nodes_()
-
- {
- for (size_t i = 0; i < nthreads_; ++i) {
- nodes_.push_back(runtime_k::GetHashVector(k_, nthreads_));
- }
-
- for (size_t i = 0; i < nthreads_; ++i) {
- nodes_[i].reserve(cell_size);
- }
- }
-
-
- void AddEdge(const Kmer &kmer, size_t thread_number) {
- nodes_[thread_number].insert(kmer);
- }
-
- void CountSequence(const Sequence& s, size_t thread_number) {
- if (s.size() < k_)
- return;
-
- Kmer kmer = s.start<Kmer>(k_);
-
- AddEdge(kmer, thread_number);
- for (size_t j = k_; j < s.size(); ++j) {
- kmer <<= s[j];
- AddEdge(kmer, thread_number);
- }
-
- }
-//
-// void MergeMaps(destination_container_t & dest_container, size_t i) {
-// for (size_t j = 0; j < nthreads_; ++j) {
-// dest_container.transfer(nodes_[j], i);
-// }
-// }
-
- void Dump(destination_container_t & bucket, size_t bucket_number) {
- for (size_t i = 0; i < nodes_.size(); ++i) {
- nodes_[i].dump(bucket, bucket_number);
- nodes_[i].clear(bucket_number);
- }
- }
-
-
- size_t SingleBucketCount() const {
- return nodes_[0].capacity(0);
- }
-
- bool IsFull(size_t i) const {
- return nodes_[i].is_full();
- }
-
- void Clear(size_t i) {
- nodes_[i].clear();
- }
-
- void Clear() {
- for (size_t i = 0; i < nthreads_; ++i) {
- nodes_[i].clear();
- }
- }
-
- void print_sizes() {
- for (size_t i = 0; i < nodes_.size(); ++i) {
- INFO("Size " << i << "::: ");
- nodes_[i].print_sizes();
- }
- }
-
-
-};
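
ParallelSeqVector::CountSequence above enumerates all k-mers of a sequence by seeding with the first k symbols and shifting in one symbol at a time. A self-contained sketch of that sliding window, using std::string in place of the internal Sequence/RtSeq types:

    #include <cstddef>
    #include <iostream>
    #include <string>

    int main() {
        const std::string s = "ACGTACG";
        const std::size_t k = 3;
        if (s.size() < k)
            return 0;

        std::string kmer = s.substr(0, k);     // seed with the first k symbols
        std::cout << kmer << '\n';             // ACG
        for (std::size_t j = k; j < s.size(); ++j) {
            kmer.erase(0, 1);                  // shift the window by one symbol
            kmer.push_back(s[j]);
            std::cout << kmer << '\n';         // then CGT, GTA, TAC, ACG
        }
        return 0;
    }
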
diff --git a/src/include/adt/pointer_iterator.hpp b/src/include/adt/pointer_iterator.hpp
deleted file mode 100644
index 9526f4c..0000000
--- a/src/include/adt/pointer_iterator.hpp
+++ /dev/null
@@ -1,172 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#ifndef __HAMMER_POINTER_ITERATOR_HPP__
-#define __HAMMER_POINTER_ITERATOR_HPP__
-
-#include <iterator>
-#include <algorithm>
-#include <iostream>
-#include <stdexcept>
-#include <vector>
-
-template<typename T>
-class pointer_iterator : public std::iterator<std::random_access_iterator_tag, T> {
-protected:
- T *data_;
-
-public:
- typedef std::random_access_iterator_tag iterator_category;
- typedef typename std::iterator<std::random_access_iterator_tag, T>::value_type value_type;
- typedef typename std::iterator<std::random_access_iterator_tag, T>::difference_type difference_type;
- typedef typename std::iterator<std::random_access_iterator_tag, T>::reference reference;
- typedef typename std::iterator<std::random_access_iterator_tag, T>::pointer pointer;
-
- pointer_iterator() : data_(NULL) {}
-
- template<typename T2>
- pointer_iterator(const pointer_iterator<T2> &r) : data_(&(*r)) {}
-
- pointer_iterator(pointer data) : data_(data) {}
-
- template<typename T2>
- pointer_iterator& operator=(const pointer_iterator<T2> &r) {
- data_ = &(*r);
- return *this;
- }
-
- pointer_iterator& operator++() {
- data_ += 1;
- return *this;
- }
-
- pointer_iterator& operator--() {
- data_ -= 1;
- return *this;
- }
-
- pointer_iterator operator++(int) {
- pointer_iterator res = *this;
- data_ += 1;
-
- return res;
- }
-
- pointer_iterator operator--(int) {
- pointer_iterator res = *this;
- data_ -= 1;
-
- return res;
- }
-
- pointer_iterator operator+(const difference_type &n) const {
- return pointer_iterator(data_ + n);
- }
-
- pointer_iterator& operator+=(const difference_type &n) {
- data_ += n; return *this;
- }
-
- pointer_iterator operator-(const difference_type &n) const {
- return pointer_iterator(pointer(data_ - n));
- }
-
- pointer_iterator& operator-=(const difference_type &n) {
- data_ -= n; return *this;
- }
-
- reference operator*() const {
- return *data_;
- }
-
- pointer operator->() const {
- return data_;
- }
-
- reference operator[](const difference_type &n) const {
- return data_[n];
- }
-
- template<typename T2>
- friend bool operator==(const pointer_iterator<T2> &r1,
- const pointer_iterator<T2> &r2);
-
- template<typename T2>
- friend bool operator!=(const pointer_iterator<T2> &r1,
- const pointer_iterator<T2> &r2);
-
- template<typename T2>
- friend bool operator<(const pointer_iterator<T2> &r1,
- const pointer_iterator<T2> &r2);
-
- template<typename T2>
- friend bool operator>(const pointer_iterator<T2> &r1,
- const pointer_iterator<T2> &r2);
-
- template<typename T2>
- friend bool operator<=(const pointer_iterator<T2> &r1,
- const pointer_iterator<T2> &r2);
-
- template<typename T2>
- friend bool operator>=(const pointer_iterator<T2> &r1,
- const pointer_iterator<T2> &r2);
-
- template<typename T2>
- friend typename pointer_iterator<T2>::difference_type
- operator+(const pointer_iterator<T2> &r1,
- const pointer_iterator<T2> &r2);
-
- template<typename T2>
- friend typename pointer_iterator<T2>::difference_type
- operator-(const pointer_iterator<T2> &r1,
- const pointer_iterator<T2> &r2);
-};
-
-template<typename T>
-inline bool operator==(const pointer_iterator<T> &r1,
- const pointer_iterator<T> &r2) {
- return (r1.data_ == r2.data_);
-}
-
-template<typename T>
-inline bool operator!=(const pointer_iterator<T> &r1,
- const pointer_iterator<T> &r2) {
- return (r1.data_ != r2.data_);
-}
-
-template<typename T>
-inline bool operator<(const pointer_iterator<T> &r1,
- const pointer_iterator<T> &r2) {
- return (r1.data_ < r2.data_);
-}
-
-template<typename T>
-inline bool operator>(const pointer_iterator<T> &r1,
- const pointer_iterator<T> &r2) {
- return (r1.data_ > r2.data_);
-}
-
-template<typename T>
-inline bool operator<=(const pointer_iterator<T> &r1,
- const pointer_iterator<T> &r2) {
- return (r1.data_ <= r2.data_);
-}
-
-template<typename T>
-inline bool operator>=(const pointer_iterator<T> &r1,
- const pointer_iterator<T> &r2) {
- return (r1.data_ >= r2.data_);
-}
-
-template<typename T>
-inline typename pointer_iterator<T>::difference_type
-operator-(const pointer_iterator<T> &r1,
- const pointer_iterator<T> &r2) {
- return (r1.data_ - r2.data_);
-}
-
-#endif // __HAMMER_POINTER_ITERATOR_HPP__
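
A minimal usage sketch of the pointer_iterator removed above, assuming the header is still on the include path: it adapts a raw pointer into a random-access iterator usable with standard algorithms.

    #include "adt/pointer_iterator.hpp"
    #include <algorithm>
    #include <iostream>

    int main() {
        int data[] = {4, 1, 3, 2};
        pointer_iterator<int> first(data), last(data + 4);
        std::sort(first, last);                 // random-access iterator, so std::sort works
        for (pointer_iterator<int> it = first; it != last; ++it)
            std::cout << *it << ' ';            // prints: 1 2 3 4
        std::cout << '\n';
        return 0;
    }
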
diff --git a/src/include/adt/queue_iterator.hpp b/src/include/adt/queue_iterator.hpp
deleted file mode 100644
index a125b65..0000000
--- a/src/include/adt/queue_iterator.hpp
+++ /dev/null
@@ -1,143 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#ifndef QUEUE_ITERATOR_HPP_
-#define QUEUE_ITERATOR_HPP_
-
-#include "verify.hpp"
-#include <set>
-
-template<typename T, typename Comparator>
-class erasable_priority_queue {
-private:
- std::set<T, Comparator> storage_;
-public:
-    /*
-     * Be careful! This constructor requires Comparator to have a default constructor even if you
-     * call it with a specified comparator. In that case, just create a default constructor with VERIFY(false) inside it.
-     */
- erasable_priority_queue(const Comparator& comparator = Comparator()) :
- storage_(comparator) {
- }
-
- template<typename InputIterator>
- erasable_priority_queue(InputIterator begin, InputIterator end,
- const Comparator& comparator = Comparator()) :
- storage_(begin, end, comparator) {
- }
-
- void pop() {
- VERIFY(!storage_.empty());
- storage_.erase(storage_.begin());
- }
-
- const T& top() const {
- VERIFY(!storage_.empty());
- return *(storage_.begin());
- }
-
- void push(const T& key) {
- storage_.insert(key);
- }
-
- bool erase(const T& key) {
- bool res = storage_.erase(key) > 0;
- return res;
- }
-
- void clear() {
- storage_.clear();
- }
-
- bool empty() const {
- return storage_.empty();
- }
-
- size_t size() const {
- return storage_.size();
- }
-
- template <class InputIterator>
- void insert ( InputIterator first, InputIterator last ) {
- storage_.insert(first, last);
- }
-
-};
-
-template<typename T, typename Comparator = std::less<T>>
-class DynamicQueueIterator {
-
- bool current_actual_;
- bool current_deleted_;
- T current_;
- erasable_priority_queue<T, Comparator> queue_;
-
-public:
-
- DynamicQueueIterator(const Comparator& comparator = Comparator()) :
- current_actual_(false), current_deleted_(false), queue_(comparator) {
- }
-
- template<typename InputIterator>
- void insert(InputIterator begin, InputIterator end) {
- queue_.insert(begin, end);
- }
-
- void push(const T& to_add) {
- queue_.push(to_add);
- }
-
- void erase(const T& to_remove) {
- if (current_actual_ && to_remove == current_) {
- current_deleted_ = true;
- }
- queue_.erase(to_remove);
- }
-
- void clear() {
- queue_.clear();
- current_actual_ = false;
- current_deleted_ = false;
- }
-
- bool IsEnd() const {
- return queue_.empty();
- }
-
- size_t size() const {
- return queue_.size();
- }
-
- const T& operator*() {
- VERIFY(!queue_.empty());
- if(!current_actual_ || current_deleted_) {
- current_ = queue_.top();
- current_actual_ = true;
- current_deleted_ = false;
- }
- return current_;
- }
-
- void operator++() {
- if (!current_actual_) {
- queue_.pop();
- } else if (!current_deleted_) {
- queue_.erase(current_);
- }
- current_actual_ = false;
- }
-
- //use carefully!
- void ReleaseCurrent() {
- current_actual_ = false;
- }
-
-};
-
-
-#endif /* QUEUE_ITERATOR_HPP_ */
-
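
A minimal usage sketch of DynamicQueueIterator removed above, assuming the header (and the verify.hpp it includes) is still available: elements come out in comparator order, and erase() may drop elements, including the current one, mid-iteration.

    #include "adt/queue_iterator.hpp"
    #include <iostream>

    int main() {
        DynamicQueueIterator<int> it;   // default comparator: std::less<int>
        it.push(3);
        it.push(1);
        it.push(2);
        it.erase(2);                    // drop an element before it is visited
        for (; !it.IsEnd(); ++it)
            std::cout << *it << ' ';    // prints: 1 3
        std::cout << '\n';
        return 0;
    }
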
diff --git a/src/include/adt/small_pod_vector.hpp b/src/include/adt/small_pod_vector.hpp
deleted file mode 100644
index d261174..0000000
--- a/src/include/adt/small_pod_vector.hpp
+++ /dev/null
@@ -1,379 +0,0 @@
-#ifndef __ADT_SMALL_POD_VECTOR__
-#define __ADT_SMALL_POD_VECTOR__
-
-#pragma once
-
-#include <llvm/PointerIntPair.h>
-
-#include <vector>
-#include <type_traits>
-
-namespace adt {
-
-#define LIKELY(EXPR) __builtin_expect((bool)(EXPR), true)
-#define UNLIKELY(EXPR) __builtin_expect((bool)(EXPR), false)
-
-template<class T>
-class SmallPODVector {
- template <typename PT1, typename PT2> class PointerUnionTraits {
- public:
- static inline void *getAsVoidPointer(void *P) { return P; }
- static inline void *getFromVoidPointer(void *P) { return P; }
- enum {
- PT1BitsAv = (int)(llvm::PointerLikeTypeTraits<PT1>::NumLowBitsAvailable),
- PT2BitsAv = (int)(llvm::PointerLikeTypeTraits<PT2>::NumLowBitsAvailable),
- NumLowBitsAvailable = PT1BitsAv < PT2BitsAv ? PT1BitsAv : PT2BitsAv
- };
- };
-
- static const unsigned SmallSizeIntBits = 3;
- static const unsigned MaxSmall = (1 << SmallSizeIntBits) - 1;
-
- typedef typename std::vector<T> vector_type;
-
- typedef llvm::PointerIntPair<void *, SmallSizeIntBits, size_t,
- PointerUnionTraits<T*, vector_type*> > container_type;
-
- typedef SmallPODVector<T> self;
- container_type data_;
-
-public:
- typedef size_t size_type;
- typedef ptrdiff_t difference_type;
- typedef T value_type;
- typedef T* iterator;
- typedef const T* const_iterator;
-
- typedef std::reverse_iterator<const_iterator> const_reverse_iterator;
- typedef std::reverse_iterator<iterator> reverse_iterator;
-
- typedef T& reference;
- typedef const T& const_reference;
- typedef T* pointer;
- typedef const T* const_pointer;
-
-// workaround missing "is_trivially_copyable" in g++ < 5.0
-#if __GNUG__ && __GNUC__ < 5
-#define IS_TRIVIALLY_COPYABLE(T) __has_trivial_copy(T)
-#else
-#define IS_TRIVIALLY_COPYABLE(T) std::is_trivially_copyable<T>::value
-#endif
-
- static_assert(IS_TRIVIALLY_COPYABLE(value_type), "Value type for SmallPODVector should be trivially copyable");
-
-#undef IS_TRIVIALLY_COPYABLE
-
-private:
- vector_type* vector() const {
- return (data_.getInt() == 0 ? static_cast<vector_type*>(data_.getPointer()) : nullptr);
- }
-
- void impl_resize(size_type N) {
- void *data = data_.getPointer(), *new_data = data;
- size_t sz = data_.getInt(), new_sz = N;
-
- if (UNLIKELY(sz == 0 && data != nullptr)) { // vector case
- vector_type *v = static_cast<vector_type*>(data);
- if (N > MaxSmall) {
- v->resize(N);
- new_data = v;
- new_sz = 0;
- } else { // We have to turn vector into array
- if (N) {
- new_data = malloc(N * sizeof(T));
- new_sz = N;
- memcpy(new_data, v->data(), N * sizeof(T));
- } else {
- new_data = nullptr;
- new_sz = 0;
- }
- delete v;
- }
- } else if (UNLIKELY(N > MaxSmall)) {
- // Ok, we have to grow too much - allocate new vector
- vector_type *new_vector = new vector_type((T*)data, (T*)data + sz);
- new_vector->resize(N);
- if (data)
- free(data);
- new_data = new_vector;
- new_sz = 0;
- } else {
- // Otherwise, simply change the size of the allocated space
- if (N) {
- new_data = realloc(data, N * sizeof(T));
- new_sz = N;
- } else {
- free(data);
- new_data = nullptr;
- new_sz = 0;
- }
- }
-
- data_.setPointer(new_data);
- data_.setInt(new_sz);
- }
-
-public:
- SmallPODVector<T>() = default;
- SmallPODVector<T>(size_type size, const T &value = T()) {
- this->assign(size, value);
- }
-
- SmallPODVector<T>(const self &that) {
- assign(that.begin(), that.end());
- }
-
- const self& operator=(const self& that) {
- // Avoid self-assignment.
- if (this == &that) return *this;
- assign(that.begin(), that.end());
- return *this;
- }
-
- SmallPODVector<T>(self &&that) {
- data_ = that.data_;
- that.data_.setPointer(nullptr);
- that.data_.setInt(0);
- }
-
- const self& operator=(const self&& that) {
- // Avoid self-assignment.
- if (this == &that) return *this;
-
- this->impl_resize(0);
- data_ = that.data_;
- that.data_.setPointer(nullptr);
- that.data_.setInt(0);
-
- return *this;
- }
-
- ~SmallPODVector<T>() {
- this->impl_resize(0);
- }
-
- __attribute__((always_inline))
- bool empty() const {
- return data_.getInt() == 0 && data_.getPointer() == nullptr;
- }
-
- __attribute__((always_inline))
- size_type size() const {
- const auto v = vector();
- if (UNLIKELY(v != nullptr))
- return v->size();
-
- return data_.getInt();
- }
-
- __attribute__((always_inline))
- pointer data() {
- const auto v = vector();
- if (UNLIKELY(v != nullptr))
- return v->data();
-
- return pointer(data_.getPointer());
- }
-
- __attribute__((always_inline))
- const_pointer cdata() const {
- const auto v = vector();
- if (UNLIKELY(v != nullptr))
- return v->data();
-
- return const_pointer(data_.getPointer());
- }
-
- size_type max_size() const { return size_type(-1) / sizeof(T); }
- size_t capacity() const {
- const auto v = vector();
- if (UNLIKELY(v != nullptr))
- return v->capacity();
-
- return data_.getInt();
- }
-
- // forward iterator creation methods.
- __attribute__((always_inline))
- iterator begin() {
- return (iterator)(data());
- }
- __attribute__((always_inline))
- const_iterator begin() const {
- return (const_iterator)(cdata());
- }
- __attribute__((always_inline))
- const_iterator cbegin() const {
- return (const_iterator)(cdata());
- }
- __attribute__((always_inline))
- iterator end() {
- return (iterator)(data() + size());
- }
- __attribute__((always_inline))
- const_iterator end() const {
- return (const_iterator)(cdata() + size());
- }
- __attribute__((always_inline))
- const_iterator cend() const {
- return (const_iterator)(cdata() + size());
- }
-
- // reverse iterator creation methods.
- reverse_iterator rbegin() { return reverse_iterator(end()); }
- const_reverse_iterator rbegin() const{ return const_reverse_iterator(end()); }
- reverse_iterator rend() { return reverse_iterator(begin()); }
- const_reverse_iterator rend() const { return const_reverse_iterator(begin());}
-
- __attribute__((always_inline))
- reference operator[](size_type idx) {
- assert(idx < size());
- return begin()[idx];
- }
- __attribute__((always_inline))
- const_reference operator[](size_type idx) const {
- assert(idx < size());
- return begin()[idx];
- }
-
- reference front() {
- assert(!empty());
- return begin()[0];
- }
- const_reference front() const {
- assert(!empty());
- return begin()[0];
- }
-
- reference back() {
- assert(!empty());
- return end()[-1];
- }
- const_reference back() const {
- assert(!empty());
- return end()[-1];
- }
-
- void push_back(const T &value) {
- const auto v = vector();
- if (UNLIKELY(v != nullptr)) {
- v->push_back(value);
- return;
- }
-
- this->impl_resize(this->size() + 1);
- memcpy(this->end() - 1, &value, sizeof(T));
- }
-
- void pop_back() {
- // This will reallocate to array, if necessary.
- this->impl_resize(this->size() - 1);
- }
-
- T pop_back_val() {
- T res = ::std::move(this->back());
- this->pop_back();
- return res;
- }
-
- void clear() {
- this->impl_resize(0);
- }
-
-    void resize(size_type count) {
-        size_type old_size = this->size();
-        this->impl_resize(count);
-        if (count > old_size)
-            std::uninitialized_fill(this->begin() + old_size, this->end(), T());
-    }
-
-    void resize(size_type count, const T &value) {
-        size_type old_size = this->size();
-        this->impl_resize(count);
-        if (count > old_size)
-            std::uninitialized_fill(this->begin() + old_size, this->end(), value);
-    }
-
- void reserve(size_type count) {
- if (auto v = vector()) {
- v->reserve(count);
- }
- }
-
- void assign(size_type count, const T &value) {
- this->impl_resize(count);
- std::uninitialized_fill(this->begin(), this->end(), value);
- }
-
- template<class InputIt>
- void assign(InputIt first, InputIt last) {
- this->impl_resize(last - first);
- std::uninitialized_copy(first, last, this->begin());
- }
-
- iterator erase(const_iterator pos) {
- size_type idx = pos - this->begin();
- std::copy(iterator(pos + 1), this->end(), iterator(pos));
- this->impl_resize(this->size() - 1); // This might invalidate iterators
-
- return this->begin() + idx;
- }
-
- iterator erase(const_iterator first, const_iterator last) {
- difference_type idx = first - this->begin();
- std::copy(iterator(last), this->end(), iterator(first));
- this->impl_resize(this->size() - (last - first)); // This might invalidate iterators
-
- return this->begin() + idx;
- }
-
- iterator insert(const_iterator pos, const T &value) {
- if (pos == this->end()) {
- this->push_back(value);
- return this->end() - 1;
- }
-
- difference_type idx = pos - this->begin();
- size_type sz = this->size();
-
- this->impl_resize(sz + 1); // This might invalidate iterators
-
- iterator it = this->begin() + idx;
- std::copy_backward(it, this->end() - 1, this->end());
-
- // If we just moved the element we're inserting, be sure to update the
- // reference.
- const T *vptr = &value;
- if (it <= vptr && vptr < this->end())
- ++vptr;
-
- *it = *vptr;
-
- return it;
- }
-
- template <typename... ArgTypes> void emplace_back(ArgTypes &&... args) {
- value_type tmp(std::forward<ArgTypes>(args)...);
- push_back(std::move(tmp));
- }
-
- template <typename... ArgTypes> iterator emplace(const_iterator pos, ArgTypes &&... args) {
- value_type tmp(std::forward<ArgTypes>(args)...);
- return insert(pos, std::move(tmp));
- }
-
- bool operator==(const self &rhs) const {
- if (this->size() != rhs.size()) return false;
- return std::equal(this->begin(), this->end(), rhs.begin());
- }
- bool operator!=(const self &rhs) const {
- return !(*this == rhs);
- }
- bool operator<(const self &rhs) const {
- return std::lexicographical_compare(this->begin(), this->end(),
- rhs.begin(), rhs.end());
- }
-};
-
-#undef LIKELY
-#undef UNLIKELY
-
-}
-
-#endif // __ADT_SMALL_POD_VECTOR__
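
A minimal usage sketch for the container above; the adt namespace and header path follow how it is included elsewhere in this tree, and the rest of the names are illustrative:

    #include "adt/small_pod_vector.hpp"
    #include <cstdint>

    void small_pod_vector_example() {
        adt::SmallPODVector<uint32_t> v;   // trivially copyable element types only, enforced by the static_assert
        v.push_back(1);
        v.push_back(2);
        v.resize(100);                     // large sizes spill into a heap-backed std::vector
        uint32_t first = v[0];             // iterators and element access are plain pointers
        v.erase(v.begin());
        (void)first;
    }
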
diff --git a/src/include/config_common.hpp b/src/include/config_common.hpp
deleted file mode 100755
index 53d9864..0000000
--- a/src/include/config_common.hpp
+++ /dev/null
@@ -1,199 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-/*
- * config_common.hpp
- *
- * Created on: Aug 13, 2011
- * Author: Alexey.Gurevich
- */
-
-#pragma once
-
-#include "simple_tools.hpp"
-#include "path_helper.hpp"
-#include "verify.hpp"
-
-// todo: undo dirty fix
-
-#include <boost/property_tree/ptree.hpp>
-#include <boost/property_tree/info_parser.hpp>
-#include <boost/lexical_cast.hpp>
-#include <boost/algorithm/string.hpp>
-
-#include <string>
-#include <vector>
-#include <iostream>
-#include <fstream>
-#include <map>
-
-namespace config_common {
-// for enable_if/disable_if
-namespace details {
-template<class T, class S>
-struct is_equal_type {
- static const bool value = false;
-};
-
-template<class T>
-struct is_equal_type<T, T> {
- static const bool value = true;
-};
-}
-
-template<class T>
-typename boost::enable_if_c<
- details::is_equal_type<T, std::string>::value ||
- boost::is_arithmetic<T>::value>::type load(T& value,
- boost::property_tree::ptree const& pt, std::string const& key,
- bool complete) {
- if (complete || pt.find(key) != pt.not_found())
- value = pt.get<T>(key);
-}
-
-template<class T>
-typename boost::disable_if_c<
- details::is_equal_type<T, std::string>::value ||
- boost::is_arithmetic<T>::value>::type load(T& value,
- boost::property_tree::ptree const& pt, std::string const& key,
- bool complete) {
- if (complete || pt.find(key) != pt.not_found())
- load(value, pt.get_child(key), complete);
-}
-
-template<class T>
-void load_items(std::vector<T>& vec, boost::property_tree::ptree const& pt,
- std::string const& key, bool complete) {
- std::string vector_key = key + std::string(".count");
- if (complete || pt.find(vector_key) != pt.not_found()) {
- size_t count = pt.get<size_t>(vector_key);
-
- for (size_t i = 0; i != count; ++i) {
- T t;
- load(t, pt.get_child(fmt::format("{:s}.item_{:d}", key, i)),
- complete);
- vec.push_back(t);
- }
- }
-}
-
-void inline split(std::vector<std::string>& vec, std::string const& space_separated_list) {
- std::istringstream iss(space_separated_list);
- while (iss) {
- std::string value;
- iss >> value;
- if (value.length()) {
- vec.push_back(value);
- }
- }
-}
-
-void inline load_split(std::vector<std::string>& vec, boost::property_tree::ptree const& pt, std::string const& key) {
- boost::optional<std::string> values = pt.get_optional<std::string>(key);
- if (values) {
- split(vec, *values);
- }
-}
-
-template<class T>
-void inline load(std::vector<T>& vec, boost::property_tree::ptree const& pt, std::string const& key, bool /*complete*/) {
- boost::optional<T> value = pt.get_optional<T>(key);
- if (value) {
- vec.push_back(*value);
- return;
- }
- for (size_t i = 1;; i++) {
- value = pt.get_optional<std::string>(key + "#" + ToString(i));
- if (value) {
- vec.push_back(*value);
- continue;
- }
- value = pt.get_optional<std::string>(key + "." + ToString(i));
- if (value) {
- vec.push_back(*value);
- continue;
- }
- if (i > 0) {
- return;
- }
- }
-}
-
-template<class T>
-void load(T& value, boost::property_tree::ptree const& pt, std::string const& key) {
- load(value, pt, key, true);
-}
-
-template<class T>
-void load(T& value, boost::property_tree::ptree const& pt, const char* key) {
- load(value, pt, std::string(key), true);
-}
-
-template<class T>
-void load(T& value, boost::property_tree::ptree const& pt) {
- load(value, pt, true);
-}
-}
-
-template<class T>
-inline void load_param(const std::string& filename, const std::string& key,
- boost::optional<T>& value) {
- boost::property_tree::ptree pt;
- boost::property_tree::read_info(filename, pt);
- value = pt.get_optional<T>(key);
-}
-
-template<class T>
-inline void write_param(const std::string& filename, const std::string& key,
- const boost::optional<T>& value) {
- if (value) {
- std::ofstream params_stream(filename.c_str(), std::ios_base::app);
-        params_stream << key << "\t" << *value << std::endl;
- }
-}
-
-template<class T>
-inline void write_param(const std::string& filename, const std::string& key,
- const T &value) {
- std::ofstream params_stream(filename.c_str(), std::ios_base::app);
- params_stream << key << "\t" << value << std::endl;
-}
-
-template<class K, class V>
-inline void load_param_map(const std::string& filename, const std::string& key,
- std::map<K, V>& value) {
- boost::property_tree::ptree pt;
- boost::property_tree::read_info(filename, pt);
- boost::optional<std::string> as_str = pt.get_optional<std::string>(key);
- if (as_str) {
- std::vector<std::string> key_value_pairs;
- boost::split(key_value_pairs, *as_str, boost::is_any_of(";"));
- for (auto it = key_value_pairs.begin(); it != key_value_pairs.end();
- ++it) {
- std::vector<std::string> key_value;
- boost::split(key_value, *it, boost::is_any_of(" "));
- VERIFY(key_value.size() == 2);
-            value[boost::lexical_cast<K>(key_value[0])] =
-                    boost::lexical_cast<V>(key_value[1]);
- }
- }
-}
-
-template<class K, class V>
-inline void write_param_map(const std::string& filename, const std::string& key,
- const std::map<K, V>& value) {
- if (value.size() > 0) {
- std::ofstream params_stream(filename.c_str(), std::ios_base::app);
- params_stream << key << "\t\"";
- std::string delim = "";
- for (auto it = value.begin(); it != value.end(); ++it) {
- params_stream << delim << it->first << " " << it->second;
- delim = ";";
- }
- params_stream << "\"" << std::endl;
- }
-}
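
A sketch of the intended use of the loaders above, with a hypothetical settings struct; the key names and file name are illustrative:

    #include <boost/property_tree/ptree.hpp>
    #include <boost/property_tree/info_parser.hpp>
    #include <string>

    struct my_params {                 // hypothetical settings block
        size_t k;
        std::string output_dir;
    };

    // A free load() like this is what the generic loader dispatches to for
    // non-arithmetic types via pt.get_child(key).
    inline void load(my_params &p, boost::property_tree::ptree const &pt, bool complete) {
        using config_common::load;
        load(p.k, pt, "k", complete);
        load(p.output_dir, pt, "output_dir", complete);
    }

    inline my_params read_params(const std::string &filename) {
        boost::property_tree::ptree pt;
        boost::property_tree::read_info(filename, pt);
        my_params p;
        load(p, pt, /*complete*/ true);   // complete == true: every key must be present
        return p;
    }
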
diff --git a/src/include/config_singl.hpp b/src/include/config_singl.hpp
deleted file mode 100644
index 2dcced7..0000000
--- a/src/include/config_singl.hpp
+++ /dev/null
@@ -1,55 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#ifndef __CONFIG_SINGL_HPP__
-#define __CONFIG_SINGL_HPP__
-
-#include "verify.hpp"
-
-#include <string>
-
-namespace config_common {
-
-// config singleton-wrap
-template<class Config>
-struct config {
- static std::string dirnameOf(const std::string& fname) {
- size_t pos = fname.find_last_of("\\/");
- return (std::string::npos == pos) ? "" : fname.substr(0, pos);
- }
-
- static void create_instance(std::string const& filename) {
- load(inner_cfg(), filename);
- is_initialized() = true;
- }
-
- static Config const& get() {
- VERIFY_MSG(is_initialized(), "Config not initialized");
- return inner_cfg();
- }
-
- static Config& get_writable() {
- VERIFY_MSG(is_initialized(), "Config not initialized");
- return inner_cfg();
- }
-
- private:
- static Config& inner_cfg() {
- static Config config;
- return config;
- }
-
- static bool& is_initialized() {
- static bool is_initialized = false;
- return is_initialized;
- }
-};
-
-}
-
-
-#endif // __CONFIG_SINGL_HPP__
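
Typical use of the wrapper above; the config type and its loader are illustrative, the real ones are defined elsewhere in SPAdes:

    // Hypothetical application-wide settings; the wrapper only requires a free
    // load(Config&, std::string const&) to be visible.
    struct app_config { size_t K; };
    void load(app_config &cfg, std::string const &filename);   // defined elsewhere

    typedef config_common::config<app_config> cfg;

    void init_and_use(const std::string &path) {
        cfg::create_instance(path);        // loads once; later calls to get() are cheap
        size_t K = cfg::get().K;           // read-only access, VERIFYs initialization
        cfg::get_writable().K = K + 1;     // mutable access for stages that adjust settings
    }
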
diff --git a/src/include/copy_file.hpp b/src/include/copy_file.hpp
deleted file mode 100644
index 862c2b0..0000000
--- a/src/include/copy_file.hpp
+++ /dev/null
@@ -1,18 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#include "path_helper.hpp"
-#include <string>
-
-namespace path {
-
-path::files_t files_by_prefix(std::string const& path);
-void copy_files_by_prefix(path::files_t const& files, std::string const& to_folder);
-void link_files_by_prefix(path::files_t const& files, std::string const& to_folder);
-void copy_files_by_ext(std::string const& from_folder, std::string const& to_folder, std::string const& ext, bool recursive);
-
-}
diff --git a/src/include/cpp_utils.hpp b/src/include/cpp_utils.hpp
deleted file mode 100644
index 32771a9..0000000
--- a/src/include/cpp_utils.hpp
+++ /dev/null
@@ -1,41 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-/*
- * cpp_utils.hpp
- *
- * Created on: Nov 14, 2011
- * Author: valery
- */
-
-#pragma once
-
-namespace utils
-{
-
-// arrays
-template <class T, size_t N>
-size_t array_size(T (&/*arr*/)[N]) {
- return N;
-}
-
-template <class T, size_t N>
-T* array_end(T (&arr)[N]) {
- return &arr[N];
-}
-
-template <size_t EXPECTED_SIZE, class T, size_t N>
-void check_array_size(T (&/*arr*/)[N]) {
- static_assert(EXPECTED_SIZE == N, "Unexpected array size");
-}
-
-template <class T>
-T identity_function(const T& t) {
- return t;
-}
-
-} // namespace utils
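
A small sketch of the array helpers above:

    #include <algorithm>

    void array_helpers_example() {
        int primes[] = {2, 3, 5, 7};
        size_t n = utils::array_size(primes);          // 4, deduced from the array type
        std::sort(primes, utils::array_end(primes));   // array_end() is one past the last element
        utils::check_array_size<4>(primes);            // static_asserts the expected length
        (void)n;
    }
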
diff --git a/src/include/de/conj_iterator.hpp b/src/include/de/conj_iterator.hpp
deleted file mode 100644
index 50c49a5..0000000
--- a/src/include/de/conj_iterator.hpp
+++ /dev/null
@@ -1,140 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include <boost/iterator/iterator_facade.hpp>
-
-namespace omnigraph {
-
-namespace de {
-
-/**
- * @brief Proxy for containers which are essentially split into two halves: the straight and the conjugate one
- * (either of which can be empty).
- * @param C the underlying container type
- */
-template<typename C>
-class ConjProxy {
-public:
- typedef C Container;
-
- /**
-     * @brief Iterator for this split container.
-     * It automatically switches to the conjugate half after finishing the straight one.
- */
- class Iterator :
- public boost::iterator_facade<Iterator, typename Container::const_reference, boost::bidirectional_traversal_tag> {
- public:
- typedef typename Container::const_iterator InnerIterator;
-
- Iterator(InnerIterator start_iter, InnerIterator stop_iter, InnerIterator jump_iter, bool conj)
- : iter_(start_iter), stop_iter_(stop_iter), jump_iter_(jump_iter), conj_(conj) { }
-
- private:
- friend class boost::iterator_core_access;
-
- /**
- * @brief Increments the iterator.
- * @detail The underlying iterator is incremented; when it reaches the `stop` position,
- * it jumps to the `jump` position which is on the conjugate half.
- */
- void increment() {
- ++iter_;
- if (!conj_ && iter_ == stop_iter_) {
- conj_ = true;
- iter_ = jump_iter_;
- }
- }
-
- void decrement() {
- if (conj_ && iter_ == jump_iter_) {
- conj_ = false;
- iter_ = stop_iter_;
- }
- --iter_;
- }
-
- bool equal(const Iterator &other) const {
- return conj_ == other.conj_ && iter_ == other.iter_;
- }
-
- typename C::const_reference dereference() const {
- return *iter_;
- }
-
- public:
- /**
- * @brief Returns the container const_iterator to the current element.
- */
- InnerIterator Iter() const {
- return iter_;
- }
-
- /**
-         * @brief Returns whether the iterator is on the conjugate half.
- */
- bool Conj() const {
- return conj_;
- }
-
- private:
- InnerIterator iter_, //the current position
- stop_iter_, //when to stop and jump (typically `end` of the straight half)
- jump_iter_; //where to jump (typically `begin` of the conjugate half)
- bool conj_;
- };
-
- ConjProxy(const Container &cont, const Container &conj_cont) :
- cont_(cont),
- conj_cont_(conj_cont) { }
-
- /**
- * @brief Iteration always starts from the beginning of the leftmost non-empty half.
-     * If both halves are empty, the result is effectively equal to `end`.
- */
- Iterator begin() const {
- auto conj = cont_.empty();
- auto start = conj ? conj_cont_.begin() : cont_.begin();
- return Iterator(start, cont_.end(), conj_cont_.begin(), conj);
- }
-
- /**
-     * @brief The raw iterator should end right after the jump, i.e. at the beginning
- * of the conjugate half.
- */
- Iterator conj_begin() const {
- return Iterator(conj_cont_.begin(), cont_.end(), conj_cont_.begin(), true);
- }
-
- /**
- * @brief Full iterator ends on the end of the conjugate half.
- */
- Iterator end() const {
- return Iterator(conj_cont_.end(), cont_.end(), conj_cont_.begin(), true);
- }
-
- /**
- * @brief Returns the total size of both halves.
- */
- size_t size() const {
- return cont_.size() + conj_cont_.size();
- }
-
- /**
-     * @brief Returns whether both halves are empty.
- */
- bool empty() const {
- return cont_.empty() && conj_cont_.empty();
- }
-
-private:
- const Container &cont_, &conj_cont_;
-};
-
-}
-
-}
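
A minimal sketch of iterating a pair of halves through the proxy above:

    #include <vector>

    void conj_proxy_example() {
        std::vector<int> straight = {1, 2};
        std::vector<int> conjugate = {3};

        omnigraph::de::ConjProxy<std::vector<int>> both(straight, conjugate);
        for (auto it = both.begin(); it != both.end(); ++it) {
            int value = *it;             // visits 1, 2, then jumps and visits 3
            bool on_conj = it.Conj();    // false for the first two, true for the last
            (void)value; (void)on_conj;
        }
    }
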
diff --git a/src/include/de/data_divider.hpp b/src/include/de/data_divider.hpp
deleted file mode 100644
index bd33b93..0000000
--- a/src/include/de/data_divider.hpp
+++ /dev/null
@@ -1,140 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-/*
- * data_divider.hpp
- *
- * Created on: Aug 16, 2011
- * Author: alexeyka
- */
-
-
-#ifndef DATA_DIVIDER_HPP_
-#define DATA_DIVIDER_HPP_
-
-#include <iostream>
-#include <math.h>
-#include "verify.hpp"
-#include <vector>
-#include <utility>
-#include <cstdlib>
-#include <cstdio>
-#include "index_point.hpp"
-#include "omni/omni_utils.hpp"
-
-namespace omnigraph {
-
-namespace de {
-
-template<class EdgeId>
-class DataDivider {
- typedef pair<size_t, size_t> Interval;
- typedef vector<PairInfo<EdgeId> > PairInfos;
- typedef pair<EdgeId, EdgeId> EdgePair;
- typedef vector<Point> PointArray;
- typedef std::function<double(int)> WeightFunction;
-
- // double LeftDerivative(int index, vector<int> x, vector<int> y) {
- // return outf[dist - min_value_ + 1][0] - outf[dist - min][0];
- // }
- //
- // double RightDerivative(index, std::vector<int> x, std::vector<int> y) {
- // return outf[dist - min_value_][0] - outf[dist - min - 1][0];
- // }
- //
- // double MiddleDerivative(int index, std::vector<int> x, std::vector<int> y) {
- // return 0.5f * (outf[dist - min_value_ + 1][0] - outf[dist - min - 1][0]);
- // }
-
- public:
- DataDivider(size_t threshold, const PointArray& points) :
- threshold_(threshold), points_(points)
- {
- }
-
- vector<Interval> DivideData() {
- VERIFY(points_.size() > 0);
- vector<Interval> answer;
- min_value_ = rounded_d(points_.front());
- max_value_ = rounded_d(points_.back());
- size_t begin = 0;
- for (size_t i = 0; i < points_.size() - 1; ++i) {
-            if (IsANewCluster(i)) {
- answer.push_back(make_pair(begin, i + 1));
- begin = i + 1;
- }
- }
- answer.push_back(make_pair(begin, points_.size()));
-
- return answer;
- }
-
- vector<Interval> DivideAndSmoothData(const EdgePair& ep,
- PairInfos& new_data,
- WeightFunction weight_f)
- {
- VERIFY(points_.size() > 0);
- vector<Interval> answer;
-
- TRACE("Data");
- //Print();
- const Point& point = points_.front();
- min_value_ = rounded_d(point);
- max_value_ = rounded_d(points_.back());
- size_t begin = 0;
- for (size_t i = 0; i < points_.size(); ++i) {
- if (i == points_.size() - 1 || IsANewCluster(i)) {
- int low_val = rounded_d(points_[begin]);
- int high_val = rounded_d(points_[i]);
- size_t new_begin = new_data.size();
- VERIFY(low_val <= high_val);
- for (int j = low_val; j <= high_val; ++j) {
- double val = 0.;
- for (size_t k = begin; k <= i; ++k) {
- val += points_[k].weight * weight_f(j - rounded_d(points_[k]));
- }
- new_data.push_back(PairInfo<EdgeId>(ep.first, ep.second, j, val, 0.));
- }
- size_t new_end = new_data.size();
- answer.push_back(make_pair(new_begin, new_end));
-
- begin = i + 1;
- }
- }
- //answer.push_back(make_pair(beginc, new_data.size()));
- TRACE("New_data ");
- Print();
-
- return answer;
- }
-
- private:
- int min_value_;
- int max_value_;
- size_t threshold_;
- PointArray points_;
-
- void Print() const {
- for (size_t i = 0; i < points_.size(); ++i) {
- TRACE(points_[i].d << " " << points_[i].weight);
- }
- }
-
- bool IsANewCluster(size_t index) {
- VERIFY(index < points_.size() - 1);
- return (math::gr(abs(points_[index + 1].d - points_[index].d), (DEDistance)threshold_));
- }
-
- DECL_LOGGER("DataDivider");
-};
-
-}
-
-
-}
-
-#endif /* DATA_DIVIDER_HPP_ */
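
A sketch of the clustering performed by DivideData(), using a placeholder edge id type since only the points matter for this call:

    #include <vector>

    void data_divider_example() {
        using namespace omnigraph::de;

        // Points sorted by distance; a gap larger than the threshold starts a new cluster.
        std::vector<Point> points = {
            Point(98, 1.0, 0.0), Point(100, 1.0, 0.0), Point(101, 1.0, 0.0),
            Point(250, 1.0, 0.0), Point(251, 1.0, 0.0)
        };

        DataDivider<int> divider(/*threshold*/ 10, points);
        auto intervals = divider.DivideData();
        // intervals == {{0, 3}, {3, 5}}: half-open index ranges into points
    }
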
diff --git a/src/include/de/distance_estimation.hpp b/src/include/de/distance_estimation.hpp
deleted file mode 100644
index 00b7190..0000000
--- a/src/include/de/distance_estimation.hpp
+++ /dev/null
@@ -1,311 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#ifndef DISTANCE_ESTIMATION_HPP_
-#define DISTANCE_ESTIMATION_HPP_
-
-#include "xmath.h"
-#include "openmp_wrapper.h"
-
-#include "paired_info.hpp"
-#include "omni/omni_utils.hpp"
-#include "omni/path_processor.hpp"
-
-namespace omnigraph {
-
-namespace de {
-
-//todo move to some more common place
-template<class Graph>
-class GraphDistanceFinder {
- typedef typename Graph::EdgeId EdgeId;
- typedef typename Graph::VertexId VertexId;
- typedef std::vector<EdgeId> Path;
- typedef std::vector<size_t> GraphLengths;
- typedef std::map<EdgeId, GraphLengths> LengthMap;
-
- public:
- GraphDistanceFinder(const Graph& graph, size_t insert_size, size_t read_length, size_t delta) :
- graph_(graph), insert_size_(insert_size), gap_((int) (insert_size - 2 * read_length)),
- delta_((double) delta)
- {}
-
- std::vector<size_t> GetGraphDistancesLengths(EdgeId e1, EdgeId e2) const {
- LengthMap m;
- m.insert({e2, { } });
-
- FillGraphDistancesLengths(e1, m);
-
- return m[e2];
- }
-
- // finds all distances from a current edge to a set of edges
- void FillGraphDistancesLengths(EdgeId e1, LengthMap& second_edges) const {
- vector<VertexId> end_points;
- vector<size_t> path_lower_bounds;
- for (const auto& entry : second_edges) {
- EdgeId second_edge = entry.first;
- end_points.push_back(graph_.EdgeStart(second_edge));
- path_lower_bounds.push_back(PairInfoPathLengthLowerBound(graph_.k(), graph_.length(e1),
- graph_.length(second_edge), gap_, delta_));
- TRACE("Bounds for paths are " << path_lower_bounds.back());
- }
-
- size_t path_upper_bound = PairInfoPathLengthUpperBound(graph_.k(), insert_size_, delta_);
-
- DistancesLengthsCallback<Graph> callback(graph_);
-
- PathProcessor<Graph> paths_proc(graph_, graph_.EdgeEnd(e1), path_upper_bound);
-
- for (size_t i = 0; i < end_points.size(); ++i) {
- //FIXME should max dist also depend on the point?
- paths_proc.Process(end_points[i], path_lower_bounds[i], path_upper_bound, callback);
- }
-
- vector<GraphLengths> result;
-
- size_t i = 0;
- for (auto& entry : second_edges) {
- GraphLengths lengths = callback.distances(i++);
- for (size_t j = 0; j < lengths.size(); ++j) {
- lengths[j] += graph_.length(e1);
- TRACE("Resulting distance set # " << i <<
- " edge " << graph_.int_id(entry.first) << " #" << j << " length " << lengths[j]);
- }
-
- if (e1 == entry.first)
- lengths.push_back(0);
-
- std::sort(lengths.begin(), lengths.end());
- entry.second = lengths;
- }
- }
-
- private:
- DECL_LOGGER("GraphDistanceFinder");
-
- const Graph& graph_;
- const size_t insert_size_;
- const int gap_;
- const double delta_;
-};
-
-template<class Graph>
-class AbstractDistanceEstimator {
- protected:
- typedef UnclusteredPairedInfoIndexT<Graph> InPairedIndex;
- typedef PairedInfoIndexT<Graph> OutPairedIndex;
- typedef typename InPairedIndex::FullHistProxy InHistogram;
- typedef typename OutPairedIndex::Histogram OutHistogram;
-
- public:
- AbstractDistanceEstimator(const Graph& graph,
- const InPairedIndex& index,
- const GraphDistanceFinder<Graph>& distance_finder,
- size_t linkage_distance = 0)
- : graph_(graph), index_(index),
- distance_finder_(distance_finder), linkage_distance_(linkage_distance)
- {}
-
- virtual void Estimate(PairedInfoIndexT<Graph>& result, size_t nthreads) const = 0;
-
- virtual ~AbstractDistanceEstimator() {}
-
- protected:
- typedef typename Graph::EdgeId EdgeId;
- typedef pair<EdgeId, EdgeId> EdgePair;
- typedef vector<pair<int, double> > EstimHist;
- typedef vector<size_t> GraphLengths;
- typedef std::map<EdgeId, GraphLengths> LengthMap;
-
- const Graph& graph() const { return graph_; }
-
- const InPairedIndex& index() const { return index_; }
-
- void FillGraphDistancesLengths(EdgeId e1, LengthMap& second_edges) const {
- distance_finder_.FillGraphDistancesLengths(e1, second_edges);
- }
-
- OutHistogram ClusterResult(EdgePair /*ep*/, const EstimHist& estimated) const {
- OutHistogram result;
- for (size_t i = 0; i < estimated.size(); ++i) {
- size_t left = i;
- double weight = estimated[i].second;
- while (i + 1 < estimated.size() &&
- (estimated[i + 1].first - estimated[i].first) <= (int) linkage_distance_) {
- ++i;
- weight += estimated[i].second;
- }
- double center = (estimated[left].first + estimated[i].first) * 0.5;
- double var = (estimated[i].first - estimated[left].first) * 0.5;
- result.insert(Point(center, weight, var));
- }
- return result;
- }
-
- void AddToResult(const OutHistogram& clustered, EdgePair ep, PairedInfoBuffer<Graph>& result) const {
- result.AddMany(ep.first, ep.second, clustered);
- }
-
-private:
- const Graph& graph_;
- const InPairedIndex& index_;
- const GraphDistanceFinder<Graph>& distance_finder_;
- const size_t linkage_distance_;
-
- virtual const string Name() const = 0;
-};
-
-template<class Graph>
-class DistanceEstimator: public AbstractDistanceEstimator<Graph> {
- typedef AbstractDistanceEstimator<Graph> base;
- typedef typename Graph::EdgeId EdgeId;
- typedef vector<size_t> GraphLengths;
- typedef vector<pair<int, double> > EstimHist;
- typedef pair<EdgeId, EdgeId> EdgePair;
-
- protected:
- typedef typename base::InPairedIndex InPairedIndex;
- typedef typename base::OutPairedIndex OutPairedIndex;
- typedef typename base::InHistogram InHistogram;
- typedef typename base::OutHistogram OutHistogram;
-
- public:
- DistanceEstimator(const Graph& graph,
- const InPairedIndex& index,
- const GraphDistanceFinder<Graph>& distance_finder,
- size_t linkage_distance, size_t max_distance)
- : base(graph, index, distance_finder, linkage_distance), max_distance_(max_distance)
- {}
- virtual ~DistanceEstimator() {}
-
- void Init() const {
- INFO("Using " << this->Name() << " distance estimator");
- }
-
- virtual void Estimate(OutPairedIndex& result, size_t nthreads) const {
- this->Init();
- const auto& index = this->index();
-
- DEBUG("Collecting edge infos");
- std::vector<EdgeId> edges;
- for (auto it = this->graph().ConstEdgeBegin(); !it.IsEnd(); ++it)
- edges.push_back(*it);
-
- DEBUG("Processing");
- PairedInfoBuffersT<Graph> buffer(this->graph(), nthreads);
-# pragma omp parallel for num_threads(nthreads) schedule(guided, 10)
- for (size_t i = 0; i < edges.size(); ++i) {
- EdgeId edge = edges[i];
- ProcessEdge(edge, index, buffer[omp_get_thread_num()]);
- }
-
- for (size_t i = 0; i < nthreads; ++i) {
- result.Merge(buffer[i]);
- buffer[i].Clear();
- }
- }
-
- protected:
- const size_t max_distance_;
-
- virtual EstimHist EstimateEdgePairDistances(EdgePair ep,
- const InHistogram& histogram,
- const GraphLengths& raw_forward) const {
- using std::abs;
- using namespace math;
- EdgeId e1 = ep.first, e2 = ep.second;
- size_t first_len = this->graph().length(e1), second_len = this->graph().length(e2);
- int minD = rounded_d(histogram.min()), maxD = rounded_d(histogram.max());
-
- TRACE("Bounds are " << minD << " " << maxD);
- EstimHist result;
- vector<int> forward;
- forward.reserve(raw_forward.size());
- for (auto raw_length : raw_forward) {
- int length = int(raw_length);
- if (minD - int(max_distance_) <= length && length <= maxD + int(max_distance_))
- forward.push_back(length);
- }
- if (forward.size() == 0)
- return result;
-
- size_t cur_dist = 0;
- vector<double> weights(forward.size(), 0.);
- for (auto point : histogram) {
- if (ls(2. * point.d + second_len, first_len))
- continue;
- while (cur_dist + 1 < forward.size() && forward[cur_dist + 1] < point.d)
- ++cur_dist;
-
- if (cur_dist + 1 < forward.size() &&
- ls(forward[cur_dist + 1] - point.d, point.d - forward[cur_dist])) {
- ++cur_dist;
-
- if (le(abs(forward[cur_dist] - point.d), (DEDistance)max_distance_))
- weights[cur_dist] += point.weight;
- } else if (cur_dist + 1 < forward.size() &&
- eq(forward[cur_dist + 1] - point.d, point.d - forward[cur_dist])) {
- if (le(abs(forward[cur_dist] - point.d), (DEDistance)max_distance_))
- weights[cur_dist] += point.weight * 0.5;
- ++cur_dist;
- if (le(abs(forward[cur_dist] - point.d), (DEDistance)max_distance_))
- weights[cur_dist] += point.weight * 0.5;
- } else {
- if (le(abs(forward[cur_dist] - point.d), (DEDistance)max_distance_))
- weights[cur_dist] += point.weight;
- }
- }
-
- for (size_t i = 0; i < forward.size(); ++i)
- if (ge(weights[i], 0.))
- result.push_back(make_pair(forward[i], weights[i]));
-
- VERIFY(result.size() == forward.size());
- return result;
- }
-
- private:
- virtual void ProcessEdge(EdgeId e1,
- const InPairedIndex& pi,
- PairedInfoBuffer<Graph>& result) const {
- typename base::LengthMap second_edges;
- auto inner_map = pi.RawGet(e1);
- for (auto i : inner_map)
- second_edges[i.first];
-
- this->FillGraphDistancesLengths(e1, second_edges);
-
- for (const auto& entry: second_edges) {
- EdgeId e2 = entry.first;
- EdgePair ep(e1, e2);
-
- VERIFY(ep <= pi.ConjugatePair(ep));
-
- const GraphLengths& forward = entry.second;
- TRACE("Edge pair is " << this->graph().int_id(ep.first)
- << " " << this->graph().int_id(ep.second));
- auto hist = pi.Get(e1, e2);
- const EstimHist& estimated = this->EstimateEdgePairDistances(ep, hist, forward);
- OutHistogram res = this->ClusterResult(ep, estimated);
- this->AddToResult(res, ep, result);
- }
- }
-
- virtual const string Name() const {
- static const string my_name = "SIMPLE";
- return my_name;
- }
-
- DECL_LOGGER("DistanceEstimator");
-};
-
-}
-
-}
-
-#endif /* DISTANCE_ESTIMATION_HPP_ */
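
A wiring sketch for the estimator above; the graph and the unclustered paired index are assumed to come from the surrounding pipeline, and the numeric parameters are illustrative:

    template<class Graph>
    void estimate_distances(const Graph &graph,
                            const omnigraph::de::UnclusteredPairedInfoIndexT<Graph> &paired_index,
                            omnigraph::de::PairedInfoIndexT<Graph> &clustered_index) {
        using namespace omnigraph::de;

        GraphDistanceFinder<Graph> dist_finder(graph, /*insert_size*/ 250,
                                               /*read_length*/ 100, /*delta*/ 10);
        DistanceEstimator<Graph> estimator(graph, paired_index, dist_finder,
                                           /*linkage_distance*/ 10, /*max_distance*/ 50);
        estimator.Estimate(clustered_index, /*nthreads*/ 16);
    }
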
diff --git a/src/include/de/extensive_distance_estimation.hpp b/src/include/de/extensive_distance_estimation.hpp
deleted file mode 100644
index 7c6573e..0000000
--- a/src/include/de/extensive_distance_estimation.hpp
+++ /dev/null
@@ -1,211 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#ifndef EXTENSIVE_DISTANCE_ESTIMATION_HPP_
-#define EXTENSIVE_DISTANCE_ESTIMATION_HPP_
-
-#include "xmath.h"
-#include "paired_info.hpp"
-#include "omni/omni_utils.hpp"
-#include "distance_estimation.hpp"
-#include "weighted_distance_estimation.hpp"
-
-#include <algorithm>
-
-// No variation support in the original data
-
-namespace omnigraph {
-
-namespace de {
-
-template<class Graph>
-class ExtensiveDistanceEstimator: public WeightedDistanceEstimator<Graph> {
- protected:
- typedef WeightedDistanceEstimator<Graph> base;
- typedef typename base::InPairedIndex InPairedIndex;
- typedef typename base::OutPairedIndex OutPairedIndex;
- typedef typename base::InHistogram InHistogram;
- typedef typename base::OutHistogram OutHistogram;
-
- typedef typename InPairedIndex::Histogram TempHistogram;
-
- public:
- ExtensiveDistanceEstimator(const Graph &graph,
- const InPairedIndex& histogram,
- const GraphDistanceFinder<Graph>& distance_finder, std::function<double(int)> weight_f,
- size_t linkage_distance, size_t max_distance) :
- base(graph, histogram, distance_finder, weight_f, linkage_distance, max_distance)
- {}
-
- virtual ~ExtensiveDistanceEstimator() { }
-
- protected:
- typedef typename Graph::EdgeId EdgeId;
- typedef vector<PairInfo<EdgeId> > PairInfos;
- typedef vector<pair<int, double> > EstimHist;
- typedef vector<size_t> GraphLengths;
-
- void ExtendInfoLeft(EdgeId e1, EdgeId e2, TempHistogram& data, size_t max_shift) const {
- ExtendLeftDFS(e1, e2, data, 0, max_shift);
- }
-
- void ExtendInfoRight(EdgeId e1, EdgeId e2, TempHistogram& data, size_t max_shift) const {
- ExtendRightDFS(e1, e2, data, 0, max_shift);
- }
-
- private:
- typedef typename Graph::VertexId VertexId;
- typedef pair<EdgeId, EdgeId> EdgePair;
-
- virtual void ProcessEdge(EdgeId e1,
- const InPairedIndex& pi,
- PairedInfoBuffer<Graph>& result) const override {
- auto inner_map = pi.RawGet(e1);
- typename base::LengthMap second_edges;
- for (auto i : inner_map)
- second_edges[i.first];
-
- this->FillGraphDistancesLengths(e1, second_edges);
-
- for (const auto& entry: second_edges) {
- EdgeId e2 = entry.first;
- EdgePair ep(e1, e2);
-
- const GraphLengths& forward = entry.second;
- TempHistogram hist = pi.Get(e1, e2).Unwrap();
- DEBUG("Extending paired information");
- double weight_0 = WeightSum(hist);
- DEBUG("Extend left");
- ExtendInfoLeft(e1, e2, hist, 1000);
- DEBUG("Extend right");
- ExtendInfoRight(e1, e2, hist, 1000);
- DEBUG("Weight increased " << (WeightSum(hist) - weight_0));
- const EstimHist& estimated = this->EstimateEdgePairDistances(ep, hist, forward);
- OutHistogram res = this->ClusterResult(ep, estimated);
- this->AddToResult(res, ep, result);
- }
- }
-
- double WeightSum(const InHistogram& hist) const {
- double answer = 0.;
- for (const auto& p : hist) {
- answer += p.weight;
- }
- return answer;
- }
-
- bool IsSorted(const InHistogram& hist) const {
- if (hist.size() == 0)
- return true;
-
- auto prev = hist.begin()->d;
- for (auto p : hist) {
- if (math::gr(prev, p.d))
- return false;
-
- prev = p.d;
- }
- return true;
- }
-
- void MergeInto(const TempHistogram& what, TempHistogram& where, int shift) const {
- // assuming they are sorted already
- if (what.size() == 0)
- return;
-
- if (where.size() == 0) {
- for (auto to_be_added : what) {
- to_be_added.d += shift;
- where.insert(to_be_added);
- }
-
- VERIFY(IsSorted(where));
- return;
- }
-
- // Check, whether two histograms intersect. If not, we can just merge them
- // straightforwardly.
- if (math::ls(where.rbegin()->d, what.begin()->d + shift) ||
- math::gr(where.begin()->d, what.rbegin()->d + shift)) {
- for (auto to_be_added : what) {
- to_be_added.d += shift;
- where.insert(to_be_added);
- }
- } else {
- for (auto to_be_added : what) {
- to_be_added.d += shift;
- auto low_bound = std::lower_bound(where.begin(), where.end(), to_be_added);
-                if (low_bound != where.end() && to_be_added == *low_bound) {
- to_be_added.weight += low_bound->weight;
- where.erase(to_be_added);
- where.insert(to_be_added);
- } else
- where.insert(low_bound, to_be_added);
- }
- }
- VERIFY(IsSorted(where));
- }
-
- TempHistogram FilterPositive(const typename InPairedIndex::FullHistProxy& hist, size_t first_len, size_t second_len) const {
- // assuming it is sorted
- TempHistogram answer;
- for (auto point : hist) {
- if (math::ge(2. * point.d + (double) second_len, (double) first_len))
- answer.insert(point);
- }
- return answer;
- }
-
- // left edge being extended to the left, shift is negative always
- void ExtendLeftDFS(EdgeId current, const EdgeId& last, TempHistogram& data, int shift, size_t max_shift) const {
- VertexId start = this->graph().EdgeStart(current);
- if (current == last)
- return;
- if (this->graph().OutgoingEdgeCount(start) > 1)
- return;
-
- for (EdgeId next : this->graph().IncomingEdges(start)) {
- auto hist = this->index().Get(next, last);
- if (-shift < (int) max_shift)
- ExtendLeftDFS(next, last, data, shift - (int) this->graph().length(next), max_shift);
- auto filtered_infos = FilterPositive(hist, this->graph().length(next), this->graph().length(last));
- if (filtered_infos.size() > 0)
- MergeInto(filtered_infos, data, shift - (int) this->graph().length(next));
- }
- }
-
- // right edge being extended to the right, shift is negative always
- void ExtendRightDFS(const EdgeId& first, EdgeId current, TempHistogram& data, int shift, size_t max_shift) const {
- VertexId end = this->graph().EdgeEnd(current);
- if (current == first)
- return;
- if (this->graph().IncomingEdgeCount(end) > 1)
- return;
-
- for (EdgeId next : this->graph().OutgoingEdges(end)) {
- auto hist = this->index().Get(first, next);
- if (-shift < (int) max_shift)
- ExtendRightDFS(first, next, data, shift - (int) this->graph().length(current), max_shift);
-
- auto filtered_infos = FilterPositive(hist, this->graph().length(first), this->graph().length(next));
- if (filtered_infos.size() > 0)
- MergeInto(filtered_infos, data, shift - (int) this->graph().length(current));
- }
- }
-
- const string Name() const override {
- static const string my_name = "EXTENSIVE";
- return my_name;
- }
-
- DECL_LOGGER("ExtensiveDistanceEstimator")
-};
-
-}
-
-}
-#endif
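
The extensive estimator is wired like the simple one, with an extra weight function over the deviation from a candidate distance; a flat weight is the simplest choice, and all names besides the classes above are illustrative:

    template<class Graph>
    void estimate_distances_extensively(const Graph &graph,
                                        const omnigraph::de::UnclusteredPairedInfoIndexT<Graph> &paired_index,
                                        omnigraph::de::PairedInfoIndexT<Graph> &clustered_index) {
        using namespace omnigraph::de;

        GraphDistanceFinder<Graph> dist_finder(graph, 250, 100, 10);
        auto weight_f = [](int /*deviation*/) { return 1.0; };
        ExtensiveDistanceEstimator<Graph> estimator(graph, paired_index, dist_finder, weight_f,
                                                    /*linkage_distance*/ 10, /*max_distance*/ 50);
        estimator.Estimate(clustered_index, /*nthreads*/ 16);
    }
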
diff --git a/src/include/de/index_point.hpp b/src/include/de/index_point.hpp
deleted file mode 100644
index c5dba96..0000000
--- a/src/include/de/index_point.hpp
+++ /dev/null
@@ -1,455 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include <btree/btree_set.h>
-#include "adt/flat_set.hpp"
-#include "adt/small_pod_vector.hpp"
-
-namespace omnigraph {
-
-namespace de {
-
-// Define several storage-only POD types which can be
-// implicitly converted to / from double.
-
-class DEDistance {
-public:
- DEDistance() = default;
- DEDistance(int d)
- : d_((float)d) {}
- DEDistance(double d)
- : d_((float)d) {}
- DEDistance(size_t d)
- : d_((float)d) {}
- operator float() const { return d_; }
- DEDistance operator+= (double d) {
- d_ += (float)d;
- return *this;
- }
- DEDistance operator*= (double d) {
- d_ *= (float)d;
- return *this;
- }
-private:
- float d_;
-};
-
-class DEWeight {
-public:
- DEWeight() = default;
- DEWeight(double d)
- : d_((float)d) {}
- operator float() const { return d_; }
- DEWeight operator+= (double d) {
- d_ += (float)d;
- return *this;
- }
- DEWeight operator*= (double d) {
- d_ *= (float)d;
- return *this;
- }
-private:
- float d_;
-};
-
-struct __attribute__((aligned(8))) RawPoint {
- DEDistance d;
- mutable DEWeight weight;
-
- RawPoint()
- : RawPoint(0, 0) { }
-
-
- RawPoint(DEDistance distance, DEWeight weight)
- : d(distance), weight(weight) {}
-
- RawPoint(DEDistance distance, DEWeight weight, DEDistance)
- : d(distance), weight(weight) {}
-
-
- const RawPoint operator+=(const RawPoint &rhs) const {
- weight += rhs.weight;
- return *this;
- }
-
- std::string str() const {
- stringstream ss;
- ss << "Point: " << " distance = " << this->d
- << ", weight = " << this->weight;
- return ss.str();
- }
-
- bool operator<(const RawPoint& rhs) const {
- return math::ls(this->d, rhs.d);
- }
-
- bool operator==(const RawPoint& rhs) const {
- return math::eq(this->d, rhs.d);
- }
-
- bool operator!=(const RawPoint& rhs) const {
- return !(operator==(rhs));
- }
-
- RawPoint operator-() const {
- return RawPoint(-d, weight);
- }
-
- RawPoint operator+(const RawPoint &rhs) const {
- return RawPoint(d, rhs.weight + weight);
- }
-
- DEWeight variation() const {
- return 0;
- }
-
- RawPoint Conjugate(size_t l1, size_t l2) const
- {
- return RawPoint(d + DEDistance(l2) - DEDistance(l1), weight);
- }
-};
-
-struct Point : public RawPoint {
- DEDistance var;
-
- Point()
- : Point(0, 0, 0) { }
-
- Point(DEDistance distance, DEWeight weight, DEDistance variance)
- : RawPoint(distance, weight), var(variance) {}
-
- Point(const RawPoint &rhs)
- : RawPoint(rhs), var(0.0) {}
-
- bool operator<(const Point& rhs) const {
- return math::ls(this->d, rhs.d);
- }
-
- bool operator==(const Point& rhs) const {
- return math::eq(this->d, rhs.d);
- }
-
- bool operator!=(const Point& rhs) const {
- return !(operator==(rhs));
- }
-
- Point operator-() const {
- return Point(-d, weight, var);
- }
-
- Point operator+(const Point &rhs) const {
- auto weight_rhs = rhs.weight;
- // counting new bounds in the case, when we are merging pair infos with var != 0
- auto left_bound = std::min(d - var, rhs.d - rhs.var);
- auto right_bound = std::max(d + var, rhs.d + rhs.var);
- auto new_dist = (left_bound + right_bound) * 0.5f;
- auto new_weight = weight + weight_rhs;
- auto new_variance = (right_bound - left_bound) * 0.5f;
-
- return Point(new_dist, new_weight, new_variance);
- }
-
- DEDistance variation() const {
- return var;
- }
-
- Point Conjugate(size_t l1, size_t l2) const
- {
- return Point(d + DEDistance(l2) - DEDistance(l1), weight, var);
- }
-};
-
-inline int rounded_d(const RawPoint& p) {
- return math::round_to_zero(p.d);
-}
-
-inline std::ostream& operator<<(std::ostream& os, const Point &point) {
- return os << point.str();
-}
-
-inline std::ostream& operator<<(std::ostream& os, const RawPoint &point) {
- return os << point.str();
-}
-
-template<class Point>
-class Histogram {
- typedef Histogram<Point> self_type;
- typedef typename std::less<Point> key_compare;
- typedef typename std::allocator<Point> allocator_type;
- typedef typename adt::flat_set<Point, key_compare, adt::SmallPODVector> Tree;
-
- public:
- typedef typename Tree::key_type key_type;
- typedef typename Tree::value_type value_type;
- typedef typename Tree::pointer pointer;
- typedef typename Tree::const_pointer const_pointer;
- typedef typename Tree::reference reference;
- typedef typename Tree::const_reference const_reference;
- typedef typename Tree::size_type size_type;
- typedef typename Tree::difference_type difference_type;
- typedef typename Tree::iterator iterator;
- typedef typename Tree::const_iterator const_iterator;
- typedef typename Tree::reverse_iterator reverse_iterator;
- typedef typename Tree::const_reverse_iterator const_reverse_iterator;
-
- enum {
- kValueSize = sizeof(Point)
- };
-
- public:
- // Default constructor.
- Histogram() = default;
-
- // Copy constructor.
- Histogram(const self_type &x)
- : tree_(x.tree_) {}
-
- template <class InputIterator>
- Histogram(InputIterator b, InputIterator e) {
- insert(b, e);
- }
-
- // Iterator routines.
- iterator begin() { return tree_.begin(); }
- const_iterator begin() const { return tree_.begin(); }
- iterator end() { return tree_.end(); }
- const_iterator end() const { return tree_.end(); }
- reverse_iterator rbegin() { return tree_.rbegin(); }
- const_reverse_iterator rbegin() const { return tree_.rbegin(); }
- reverse_iterator rend() { return tree_.rend(); }
- const_reverse_iterator rend() const { return tree_.rend(); }
-
- // Lookup routines.
- iterator lower_bound(const key_type &key) { return tree_.lower_bound(key); }
- const_iterator lower_bound(const key_type &key) const { return tree_.lower_bound(key); }
- iterator upper_bound(const key_type &key) { return tree_.upper_bound(key); }
- const_iterator upper_bound(const key_type &key) const { return tree_.upper_bound(key); }
- std::pair<iterator,iterator> equal_range(const key_type &key) { return tree_.equal_range(key); }
- std::pair<const_iterator,const_iterator> equal_range(const key_type &key) const { return tree_.equal_range(key); }
-
- // Utility routines.
- void clear() { tree_.clear(); }
- void swap(self_type &x) { tree_.swap(x.tree_); }
-
- // Size routines.
- size_type size() const { return tree_.size(); }
- size_type max_size() const { return tree_.max_size(); }
- bool empty() const { return tree_.empty(); }
- size_type bytes_used() const { return tree_.bytes_used(); }
-
- // Lookup routines.
- iterator find(const key_type &key) { return tree_.find(key); }
- const_iterator find(const key_type &key) const { return tree_.find(key); }
- size_type count(const key_type &key) const { return tree_.count(key); }
-
- // Insertion routines.
- std::pair<iterator,bool> insert(const value_type &x) { return tree_.insert(x); }
- iterator insert(iterator position, const value_type &x) { return tree_.insert(position, x); }
- template <typename InputIterator>
- void insert(InputIterator b, InputIterator e) { tree_.insert(b, e); }
-
- // Deletion routines.
- size_type erase(const key_type &key) { return tree_.erase(key); }
-    // Erase the specified iterator from the underlying set. The iterator must be valid
-    // (i.e. not equal to end()). Returns an iterator pointing to the element after
-    // the one that was erased (or end() if none exists).
- iterator erase(const iterator &iter) { return tree_.erase(iter); }
- void erase(const iterator &first, const iterator &last) { tree_.erase(first, last); }
-
- bool operator==(const self_type& x) const {
- if (size() != x.size())
- return false;
-
- for (const_iterator i = begin(), xi = x.begin(); i != end(); ++i, ++xi)
- if (*i != *xi)
- return false;
-
- return true;
- }
-
- bool operator!=(const self_type& other) const {
- return !operator==(other);
- }
-
- protected:
- Tree tree_;
-
- private:
- // This is template voodoo which creates function overload depending on
- // whether Point has const operator+= or not.
- template<class>
- struct true_helper : std::true_type {};
- template<class T = Point>
- static auto test_can_merge(int) -> true_helper<decltype(std::declval<const T>().operator+=(std::declval<const T>()))>;
- template<class>
- static auto test_can_merge(long) -> std::false_type;
- template<class T = Point>
- struct can_merge : decltype(test_can_merge<T>(0)) {};
-
- public:
- // This function overload is enabled only when Point has const operator+= (e.g. RawPoint)
- // and therefore we can update it inplace.
- template<class OtherHist, class U = Point>
- typename std::enable_if<can_merge<U>::value, size_t>::type
- merge(const OtherHist &other) {
- size_t added = 0;
- for (const auto& new_point : other) {
- // First, try to insert a point
- const auto& result = insert(new_point);
- if (!result.second) {
- // We already having something there. Try to merge stuff in.
- *result.first += new_point;
- } else
- added += 1;
- }
-
- return added;
- }
-
- // Otherwise this overload is used, which removes the point from set,
- // updates it and re-inserts back.
- template<class OtherHist, class U = Point>
- typename std::enable_if<!can_merge<U>::value, size_t>::type
- merge(const OtherHist &other) {
- size_t added = 0;
- for (const auto& new_point : other) {
- // First, try to insert a point
- auto result = insert(new_point);
- if (!result.second) {
- Point updated = *result.first + new_point;
- auto after_removed = erase(result.first);
- insert(after_removed, updated);
- } else
- added += 1;
- }
-
- return added;
- }
-};
-
-template <typename T>
-inline std::ostream& operator<<(std::ostream &os, const Histogram<T> &b) {
-    for (const auto &point : b)
-        os << point << " ";
-    return os;
-}
-
-typedef Histogram<RawPoint> RawHistogram;
-typedef Histogram<Point> HistogramWithWeight;
-
-inline bool ClustersIntersect(Point p1, Point p2) {
- return math::le(p1.d, p2.d + p1.var + p2.var) &&
- math::le(p2.d, p1.d + p1.var + p2.var);
-}
-
-// tuple of a pair of edges @first, @second, and a @point
-template<typename EdgeId>
-struct PairInfo {
- EdgeId first;
- EdgeId second;
- Point point;
-
- PairInfo()
- : first(), second(), point() {}
-
-
- PairInfo(const PairInfo& pair_info)
- : first(pair_info.first), second(pair_info.second), point(pair_info.point) {}
-
- PairInfo(EdgeId first, EdgeId second, DEDistance d, DEWeight weight, DEDistance var)
- : first(first), second(second), point(d, weight, var) {}
-
- PairInfo(EdgeId first, EdgeId second, Point point)
- : first(first), second(second), point(point) {}
-
- // Two paired infos are considered equal
- // if they coincide in all parameters except for weight and variance.
- bool operator==(const PairInfo& rhs) const {
- const PairInfo &lhs = *this;
- return lhs.first == rhs.first && lhs.second == rhs.second && lhs.point == rhs.point;
- }
-
- bool operator!=(const PairInfo& rhs) const {
- return !(*this == rhs);
- }
-
- bool operator<(const PairInfo<EdgeId>& rhs) const {
- const PairInfo<EdgeId>& lhs = *this;
- return lhs.first == rhs.first ?
- (lhs.second == rhs.second ? lhs.point < rhs.point : lhs.second < rhs.second)
- : lhs.first < rhs.first;
- }
-
- double d() const { return point.d; }
- double weight() const { return point.weight; }
- double var() const { return point.var; }
-};
-
-template<typename EdgeId>
-ostream& operator<<(ostream& os, const PairInfo<EdgeId>& info) {
-    return os << "PairInfo: first = " << info.first << ", second = " << info.second
-              << ", Point: " << info.point;
-}
-
-template<typename EdgeId>
-const PairInfo<EdgeId> MinPairInfo(EdgeId id) {
- return PairInfo<EdgeId>(id, EdgeId(typename EdgeId::pointer_type(1)),
- -10000000000, 0., 0.);
-}
-
-template<typename EdgeId>
-const PairInfo<EdgeId> MaxPairInfo(EdgeId id) {
- return PairInfo<EdgeId>(id, EdgeId(typename EdgeId::pointer_type(-1)),
- 10000000000, 0., 0.);
-}
-
-template<typename EdgeId>
-const PairInfo<EdgeId> MinPairInfo(EdgeId e1, EdgeId e2) {
- PairInfo<EdgeId> info = MinPairInfo(e1);
- info.second = e2;
- return info;
-}
-
-template<typename EdgeId>
-const PairInfo<EdgeId> MaxPairInfo(EdgeId e1, EdgeId e2) {
- PairInfo<EdgeId> info = MaxPairInfo(e1);
- info.second = e2;
- return info;
-}
-
-/**
- * Returns the approximate distance between occurrences of the edges in the genome,
- * rounded to the nearest integer. In case of a tie the value closest to 0 is chosen,
- * so the distance is rounded the same way as the opposite one.
- * TODO: check that the above is true.
- */
-template<typename EdgeId>
-inline int rounded_d(PairInfo<EdgeId> const& pi) {
- return math::round_to_zero(pi.d());
-}
-
-template<typename EdgeId>
-inline PairInfo<EdgeId> BackwardInfo(const PairInfo<EdgeId>& pi) {
- return PairInfo<EdgeId>(pi.second, pi.first, -pi.point);
-}
-
-template<typename EdgeId>
-inline bool IsSymmetric(PairInfo<EdgeId> const& pi) {
- return pi.first == pi.second && math::eq(pi.d(), 0.);
-}
-
-}
-
-}
-
-namespace std {
-template<>
-class numeric_limits<omnigraph::de::DEDistance> : public numeric_limits<float> {};
-template<>
-class numeric_limits<omnigraph::de::DEWeight> : public numeric_limits<float> {};
-}
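
A small sketch of how the histograms above merge points; the names come from this header and the values are illustrative:

    void histogram_merge_example() {
        using namespace omnigraph::de;

        RawHistogram h;
        h.insert(RawPoint(100, 2.0));
        h.insert(RawPoint(105, 1.0));

        RawHistogram other;
        other.insert(RawPoint(100, 3.0));

        // Points compare equal by distance only, so the weight at d == 100 is
        // accumulated in place and nothing new is added.
        size_t added = h.merge(other);    // added == 0, weight at d == 100 becomes 5.0
        int d = rounded_d(*h.begin());    // 100, rounded toward zero
        (void)added; (void)d;
    }
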
diff --git a/src/include/de/insert_size_refiner.hpp b/src/include/de/insert_size_refiner.hpp
deleted file mode 100644
index a6630b5..0000000
--- a/src/include/de/insert_size_refiner.hpp
+++ /dev/null
@@ -1,166 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include "cpp_utils.hpp"
-#include "stats/debruijn_stats.hpp"
-//#include "sequence_mapper.hpp"
-
-namespace omnigraph {
-
-typedef std::map<int, size_t> HistType;
-
-inline double get_median(const HistType &hist) {
- double S = 0;
- for (auto iter = hist.begin(); iter != hist.end(); ++iter)
- S += (double) iter->second;
-
- double sum = S;
- for (auto iter = hist.begin(); iter != hist.end(); ++iter) {
- sum -= (double) iter->second;
- if (sum <= S / 2) {
- return iter->first;
- }
- }
- assert(false);
- return -1;
-}
-
-inline double get_mad(const HistType &hist, double median) { // median absolute deviation
- std::map<int, size_t> hist2;
- for (auto iter = hist.begin(); iter != hist.end(); ++iter) {
- int x = abs(iter->first - math::round_to_zero(median));
-        hist2[x] += iter->second;
- }
- return get_median(hist2);
-}
-
-inline void hist_crop(const HistType &hist, double low, double high, HistType& res) {
- for (auto iter = hist.begin(); iter != hist.end(); ++iter) {
- if (iter->first >= low && iter->first <= high) {
- DEBUG("Cropped histogram " << iter->first << " " << iter->second);
- res.insert(*iter);
- }
- }
-}
-
-inline
-std::pair<double, double> GetISInterval(double quantile,
- const HistType &is_hist) {
- // First, obtain the sum of the values
- double S = 0;
- for (auto iter : is_hist)
- S += (double) iter.second;
-
- double lval = S * (1 - quantile) / 2, rval = S * (1 + quantile) / 2;
- double is_min, is_max;
-
- // Now, find the quantiles
- double cS = 0;
- is_min = is_hist.begin()->first;
- is_max = is_hist.rbegin()->first;
- for (auto iter : is_hist) {
- if (cS <= lval)
- is_min = iter.first;
- else if (cS <= rval)
- is_max = iter.first;
- cS += (double) iter.second;
- }
-
- return std::make_pair(is_min, is_max);
-}
-
-inline void find_median(const HistType& hist, double& median, double& mad, HistType& cropped_hist) {
- DEBUG("Counting median and MAD");
- median = get_median(hist);
- mad = get_mad(hist, median);
- double low = median - 5. * 1.4826 * mad;
- double high = median + 5. * 1.4826 * mad;
- omnigraph::hist_crop(hist, low, high, cropped_hist);
- median = get_median(cropped_hist);
- mad = get_mad(cropped_hist, median);
-}
-
-// Moved from the insert size counter.
-// The factor 1.4826 is approximately 1 / Phi^{-1}(3/4): it rescales the MAD so that it
-// estimates the standard deviation of normally distributed data, making
-// median +/- 5 * 1.4826 * MAD roughly a five-sigma window.
-inline void find_mean(const HistType& hist, double& mean, double& delta, std::map<size_t, size_t>& percentiles) {
- double median = get_median(hist);
- double mad = get_mad(hist, median);
- double low = median - 5. * 1.4826 * mad;
- double high = median + 5. * 1.4826 * mad;
-
- DEBUG("Median IS: " << median);
- DEBUG("MAD: " << mad);
- DEBUG("Thresholds set to: [" << low << ", " << high << "]");
-
- size_t n = 0;
- double sum = 0.;
- double sum2 = 0.;
- DEBUG("Counting average");
- for (auto iter = hist.begin(); iter != hist.end(); ++iter) {
- if (iter->first < low || iter->first > high) {
- continue;
- }
- n += iter->second;
- sum += (double) iter->second * 1. * (double) iter->first;
- sum2 += (double)iter->second * 1. * (double)iter->first * (double)iter->first;
- }
- mean = sum / (double) n;
- delta = sqrt(sum2 / (double) n - mean * mean);
-
- low = mean - 5 * delta;
- high = mean + 5 * delta;
-
- DEBUG("Mean IS: " << mean);
- DEBUG("sd: " << delta);
- DEBUG("Thresholds set to: [" << low << ", " << high << "]");
-
- n = 0;
- sum = 0.;
- sum2 = 0.;
- for (auto iter = hist.begin(); iter != hist.end(); ++iter) {
- if (iter->first < low || iter->first > high) {
- continue;
- }
- n += iter->second;
- sum += (double) iter->second * 1. * (double) iter->first;
- sum2 += (double) iter->second * 1. * (double) iter->first * (double) iter->first;
- }
- mean = sum / (double) n;
- delta = sqrt(sum2 / (double) n - mean * mean);
-
- DEBUG("Mean IS: " << mean);
- DEBUG("sd: " << delta);
-
- size_t m = 0;
-
- DEBUG("Counting percentiles");
- //todo optimize
- size_t q[19];
- for (size_t i = 1; i < 20; ++i) {
- q[i - 1] = 5 * i;
- }
- for (auto iter = hist.begin(); iter != hist.end(); ++iter) {
- if (iter->first < low || iter->first > high) {
- continue;
- }
- size_t mm = m + iter->second;
- for (size_t i = 0; i < utils::array_size(q); i++) {
- size_t scaled_q_i((size_t) ((double) q[i] / 100. * (double) n));
- if (m < scaled_q_i && mm >= scaled_q_i) {
- percentiles[q[i]] = (size_t) iter->first;
- }
- }
- m = mm;
- }
-}
-
-
-
-
-}
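
A sketch of the robust insert-size statistics above on a tiny hand-made histogram; the counts are illustrative:

    void insert_size_stats_example() {
        omnigraph::HistType hist;      // insert size -> number of read pairs
        hist[248] = 100;
        hist[249] = 200;
        hist[250] = 300;
        hist[251] = 250;
        hist[252] = 150;
        hist[800] = 3;                 // far-off bin, e.g. chimeric pairs

        double median = 0.0, mad = 0.0;
        omnigraph::HistType cropped;
        omnigraph::find_median(hist, median, mad, cropped);
        // The bin at 800 lies outside median +/- 5 * 1.4826 * MAD and is cropped
        // before the final median and MAD are recomputed.
    }
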
diff --git a/src/include/de/pair_info_filters.hpp b/src/include/de/pair_info_filters.hpp
deleted file mode 100644
index fc1567c..0000000
--- a/src/include/de/pair_info_filters.hpp
+++ /dev/null
@@ -1,271 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#ifndef PAIR_INFO_FILTERS_HPP_
-#define PAIR_INFO_FILTERS_HPP_
-
-#include "paired_info_helpers.hpp"
-
-namespace omnigraph {
-
-namespace de {
-
-template<class Graph>
-class AbstractPairInfoChecker{
-private:
- typedef typename Graph::VertexId VertexId;
- typedef typename Graph::EdgeId EdgeId;
- typedef PairInfo<EdgeId> PairInfoT;
-
-protected:
- const Graph& graph_;
-
-public:
- AbstractPairInfoChecker(const Graph &graph) : graph_(graph) { }
-
- virtual bool Check(const PairInfoT&) {
- return true;
- }
-
- virtual bool Check(EdgeId, EdgeId) {
- return true;
- }
-
- virtual ~AbstractPairInfoChecker() { }
-};
-
-template<class Graph>
-class PairInfoWeightChecker : public AbstractPairInfoChecker<Graph>{
- private:
- typedef typename Graph::EdgeId EdgeId;
- typedef PairInfo<EdgeId> PairInfoT;
- double weight_threshold_;
-
- public:
- PairInfoWeightChecker(const Graph& graph, double weight_threshold) :
- AbstractPairInfoChecker<Graph>(graph), weight_threshold_(weight_threshold) {
- }
-
- bool Check(const PairInfoT& info) {
- return math::ge(info.weight(), weight_threshold_);
- }
-};
-
-template<class Graph>
-class PairInfoWeightCheckerWithCoverage: public AbstractPairInfoChecker<Graph> {
- private:
- typedef typename Graph::EdgeId EdgeId;
- typedef PairInfo<EdgeId> PairInfoT;
- double weight_threshold_;
-
- public:
- PairInfoWeightCheckerWithCoverage(const Graph& graph, double weight_threshold) :
- AbstractPairInfoChecker<Graph>(graph), weight_threshold_(weight_threshold){
- }
-
- bool Check(const PairInfoT& info) {
- double info_weight = info.weight();
- return math::ge(info_weight, weight_threshold_)
- || (math::ge(info_weight, 0.1 * this->graph_.coverage(info.first)))
- || (math::ge(info_weight, 0.1 * this->graph_.coverage(info.second)));
- }
-};
-
-template <class Graph>
-class AmbiguousPairInfoChecker : public AbstractPairInfoChecker<Graph> {
-
- typedef typename Graph::EdgeId EdgeId;
- typedef typename Graph::VertexId VertexId;
- typedef PairInfo<EdgeId> PairInfoT;
- typedef boost::optional<EdgeId> OptEdgeId;
-
- AbstractPairInfoChecker<Graph> &standard_filter_;
- const PairedInfoIndexT<Graph>& index_;
-
- double haplom_threshold_;
- double relative_length_threshold_;
- double relative_seq_threshold_;
-
- bool IsEdgeOneHaplome(EdgeId edge){
- return this->graph_.coverage(edge) < 1.5 * haplom_threshold_;
- }
-
- bool IsPairInfoGood(EdgeId edge1, EdgeId edge2){
- return index_.Get(edge1, edge2).size() <= 1;
- }
-
- bool EdgesAreFromSimpleBulgeWithAmbPI(const PairInfoT& info){
- EdgeId edge1 = info.first;
- EdgeId edge2 = info.second;
- // edge is auto reverse complementary
- TRACE("Check for auto reverse complementary");
- if(this->graph_.conjugate(edge1) == info.second)
- return false;
- TRACE("Done");
-
- TRACE("Check for coverage 1x haplome for edge from pair info");
- if(!IsEdgeOneHaplome(edge1) || !IsEdgeOneHaplome(edge2))
- return false;
- TRACE("Done");
-
- // first edge is not side of simple bulge
- TRACE("Check for bulge side for the 1st edge");
- OptEdgeId edge1_alt = GetOtherSideOfSimpleBulge(edge1);
- if(!edge1_alt.is_initialized())
- return false;
- TRACE("Done");
-
- // second edge is not side of simple bulge
- TRACE("Check for bulge side for the 2nd edge");
- OptEdgeId edge2_alt = GetOtherSideOfSimpleBulge(edge2);
- if(!edge2_alt.is_initialized())
- return false;
- TRACE("Done");
-
- TRACE("Check for coverage 1x haplome for edge from alternative bulge sides");
- if(!IsEdgeOneHaplome(edge1_alt.get()) || !IsEdgeOneHaplome(edge2_alt.get()))
- return false;
- TRACE("Done");
-
- TRACE("Check for multiplicity of pair info");
- if(!(IsPairInfoGood(edge1, edge2_alt.get()) &&
- IsPairInfoGood(edge1_alt.get(), edge2) &&
- IsPairInfoGood(edge1_alt.get(), edge2_alt.get())))
- return false;
- TRACE("Done");
-
- return true;
- }
-
- double GetPairInfoWeight(EdgeId edge1, EdgeId edge2){
- auto hist = index_.Get(edge1, edge2);
- return (hist.size() == 1) ? float(hist.begin()->weight) : 0.0f;
- }
-
- bool InnerCheck(const PairInfoT& info){
-
- EdgeId edge1 = info.first;
- EdgeId edge2 = info.second;
-
- // get second edges of simple bulge
- OptEdgeId opt_edge1_alt = GetOtherSideOfSimpleBulge(edge1);
- VERIFY(opt_edge1_alt.is_initialized());
- EdgeId edge1_alt = opt_edge1_alt.get();
-
- OptEdgeId opt_edge2_alt = GetOtherSideOfSimpleBulge(edge2);
- VERIFY(opt_edge2_alt.is_initialized());
- EdgeId edge2_alt = opt_edge2_alt.get();
-
- double direct_weight = GetPairInfoWeight(edge1, edge2) +
- GetPairInfoWeight(edge1_alt, edge2_alt);
-
- double reverse_weight = GetPairInfoWeight(edge1, edge2_alt) +
- GetPairInfoWeight(edge1_alt, edge2);
-
- TRACE("Direct_weight " << direct_weight << ", reverse_weight " << reverse_weight);
- return direct_weight > reverse_weight;
- }
-
-public:
- AmbiguousPairInfoChecker(const Graph& graph, const PairedInfoIndexT<Graph>& index,
- AbstractPairInfoChecker<Graph> &standard_filter, double haplom_threshold,
- double relative_length_threshold, double relative_seq_threshold) :
- AbstractPairInfoChecker<Graph>(graph),
- standard_filter_(standard_filter),
- index_(index),
- haplom_threshold_(haplom_threshold),
- relative_length_threshold_(relative_length_threshold),
- relative_seq_threshold_(relative_seq_threshold) { }
-
- bool Check(const PairInfoT& info) {
- TRACE(this->graph_.int_id(info.first) << " " << this->graph_.int_id(info.second));
- if(EdgesAreFromSimpleBulgeWithAmbPI(info)){
- TRACE("Forward directed edges form a simple bulge");
- return InnerCheck(info);
- }
-
- if(EdgesAreFromSimpleBulgeWithAmbPI(BackwardInfo(info))){
- TRACE("Backward directed edges form a simple bulge");
- return InnerCheck(BackwardInfo(info));
- }
-
- TRACE("Edges do not form a bulge. Applying default checker");
- return standard_filter_.Check(info);
- }
-
-private:
- OptEdgeId GetOtherSideOfSimpleBulge(EdgeId edge){
- auto edges = this->graph_.GetEdgesBetween(this->graph_.EdgeStart(edge),
- this->graph_.EdgeEnd(edge));
- TRACE("Number alternative edges - " << edges.size());
- if(edges.size() == 1)
- return OptEdgeId();
-
- size_t edge_length = this->graph_.length(edge);
- Sequence edge_seq = this->graph_.EdgeNucls(edge);
- for(auto it_edge = edges.begin(); it_edge != edges.end(); it_edge++)
- if(*it_edge != edge){
- size_t it_edge_length = this->graph_.length(*it_edge);
- Sequence it_edge_seq = this->graph_.EdgeNucls(*it_edge);
- double length_ratio = double(min<size_t>(edge_length, it_edge_length)) /
- double(max<size_t>(edge_length, it_edge_length));
- if(length_ratio >= relative_length_threshold_){
- // size_t edit_dist = EditDistance(edge_seq, it_edge_seq);
- // double seq_ratio = edit_dist / min<size_t> (edge_seq.size(), it_edge_seq.size());
- return *it_edge;
- }
- }
- return OptEdgeId();
- }
-};
-
-template<class Graph>
-class PairInfoFilter{
-private:
- typedef typename Graph::VertexId VertexId;
- typedef typename Graph::EdgeId EdgeId;
- typedef PairInfo<EdgeId> PairInfoT;
-
-protected:
- AbstractPairInfoChecker<Graph> &pair_info_checker_;
-
-public:
- PairInfoFilter(AbstractPairInfoChecker<Graph> &pair_info_checker) :
- pair_info_checker_(pair_info_checker)
- {}
-
- void Filter(PairedInfoIndexT<Graph>& index) {
- INFO("Start filtering; index size: " << index.size());
- //We can't filter while traversing, because Remove may invalidate iterators
- //So let's save edge pairs first
- using EdgePair = std::pair<EdgeId, EdgeId>;
- std::vector<EdgePair> pairs;
- for (auto i = pair_begin(index); i != pair_end(index); ++i)
- if (pair_info_checker_.Check(i.first(), i.second()))
- pairs.push_back({i.first(), i.second()});
-
- //TODO: implement fast removing of the whole set of points
- for (const auto& pair : pairs) {
- //Same thing with invalidation
- HistogramWithWeight hist;
- for (auto point : index[pair])
- if (!pair_info_checker_.Check(PairInfoT(pair.first, pair.second, point)))
- hist.insert(point);
- //index.RemoveMany(pair_hist.first.first, pair_hist.first.second, pair_hist.second);
- for (const auto& point : hist)
- index.Remove(pair.first, pair.second, point);
- }
-
- INFO("Done filtering");
- }
-};
-
-}
-
-}
-
-#endif /* PAIR_INFO_FILTERS_HPP_ */
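
The classes above compose in two steps: a concrete checker is built against the graph, and PairInfoFilter drives the collect-then-remove pass implemented in Filter(). A hedged usage sketch, assuming this header and a populated index are available; the function name, include path, and threshold value are illustrative only, not SPAdes API:

    // Illustrative wiring only; assumes the classes from pair_info_filters.hpp above.
    // The include path is an assumption about how src/include is exposed.
    #include "de/pair_info_filters.hpp"

    template<class Graph>
    void FilterWeakPairInfo(const Graph &g,
                            omnigraph::de::PairedInfoIndexT<Graph> &index,
                            double weight_threshold) {
        using namespace omnigraph::de;
        // Drop points whose weight is below the threshold and below 10% of edge coverage.
        PairInfoWeightCheckerWithCoverage<Graph> checker(g, weight_threshold);
        PairInfoFilter<Graph> filter(checker);
        filter.Filter(index);  // collects the failing points first, then removes them
    }
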
diff --git a/src/include/de/paired_info.hpp b/src/include/de/paired_info.hpp
deleted file mode 100644
index 20c78a8..0000000
--- a/src/include/de/paired_info.hpp
+++ /dev/null
@@ -1,863 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include "conj_iterator.hpp"
-#include "index_point.hpp"
-
-#include <adt/iterator_range.hpp>
-
-#include <btree/safe_btree_map.h>
-#include <sparsehash/sparse_hash_map>
-
-
-namespace omnigraph {
-
-namespace de {
-
-/**
- * @brief Paired reads info storage. Arranged as a map of maps of info points.
- * @param G graph type
- * @param H histogram type (Container is the map-like container, parameterized by key and value type)
- */
-template<typename G, typename H, template<typename, typename> class Container>
-class PairedIndex {
-
-public:
- typedef G Graph;
- typedef H Histogram;
- typedef typename Graph::EdgeId EdgeId;
- typedef std::pair<EdgeId, EdgeId> EdgePair;
- typedef typename Histogram::value_type Point;
-
- typedef Container<EdgeId, Histogram> InnerMap;
- typedef Container<EdgeId, InnerMap> StorageMap;
-
- //--Data access types--
-
- typedef typename StorageMap::const_iterator ImplIterator;
-
- /**
- * @brief Smart proxy set representing a composite histogram of points between two edges.
-     * @param full When true, represents the whole histogram (consisting of both directly added points
- * and "restored" conjugates).
- * When false, proxifies only the added points.
- * @detail You can work with the proxy just like with any constant set.
- * The only major difference is that it returns all consisting points by value,
-     * The only major difference is that it returns all contained points by value,
-     * because some of them don't exist in the underlying sets and are
-     * restored from the conjugate info on-the-fly.
- template<bool full = true>
- class HistProxy {
-
- public:
- /**
- * @brief Iterator over a proxy set of points.
- * @warning Generally, the proxy is unordered even if the set is ordered.
- * If you require that, convert it into a flat histogram with Unwrap().
- * @param full When true, traverses both straight and conjugate points,
- * and automatically recalculates the distance for latter.
-         * and automatically recalculates the distance for the latter.
- */
- class Iterator: public boost::iterator_facade<Iterator, Point, boost::bidirectional_traversal_tag, Point> {
-
- typedef typename ConjProxy<Histogram>::Iterator InnerIterator;
-
- public:
- Iterator(InnerIterator iter, float offset)
- : iter_(iter)
- , offset_(offset)
- {}
-
- private:
- friend class boost::iterator_core_access;
-
- Point dereference() const {
- Point result = *iter_;
- if (iter_.Conj())
- result.d += offset_;
- return result;
- }
-
- void increment() {
- ++iter_;
- }
-
- void decrement() {
- --iter_;
- }
-
- inline bool equal(const Iterator &other) const {
- return iter_ == other.iter_;
- }
-
- InnerIterator iter_; //current position
- float offset_; //offset to be added for conjugate distance
- };
-
- HistProxy(const Histogram& hist, const Histogram& conj_hist, float offset = 0)
- : hist_(hist, conj_hist)
- , offset_(offset)
- {}
-
- /**
-         * @brief Returns an empty histogram (effectively a Null object pattern).
- */
- static const Histogram& empty_hist() {
- static Histogram res;
- return res;
- }
-
- /**
-         * @brief Wraps an ordinary histogram (for implicit conversions)
- */
- HistProxy(const Histogram& hist, float offset = 0)
- : hist_(hist, HistProxy::empty_hist())
- , offset_(offset)
- {}
-
- Iterator begin() const {
- return Iterator(hist_.begin(), offset_);
- }
-
- Iterator end() const {
- //auto i = full ? hist_.end() : hist_.conj_begin();
- //return Iterator(i, offset_);
- return Iterator(hist_.end(), offset_);
- }
-
- /**
- * @brief Finds the point with the minimal distance.
- * @todo Simplify
- */
- Point min() const {
- //Our histograms are ordered, so the minimum is `begin` of either
- //straight or conjugate half, but we should beware of emptiness.
- VERIFY(!empty());
- auto i1 = begin();
- if (full) {
- auto i2 = Iterator(hist_.conj_begin(), offset_);
- if (i1 == i2 || i2 == end())
- return *i1;
- return std::min(*i1, *i2);
- } else {
- return *i1;
- }
- }
-
- /**
- * @brief Finds the point with the maximal distance.
- * @todo Simplify
- */
- Point max() const {
- //Our histograms are ordered, so the maximum is `rbegin` of either
- //straight or conjugate half, but we should beware of emptiness.
- VERIFY(!empty());
- auto i1 = end();
- if (full) {
- auto i2 = Iterator(hist_.conj_begin(), offset_);
- if (i1 == i2 || i2 == begin())
- return *--i1;
- return std::max(*--i1, *--i2);
- } else {
- return *--i1;
- }
- }
-
- /**
- * @brief Returns the copy of all points in a simple histogram.
- */
- Histogram Unwrap() const {
- return Histogram(begin(), end());
- }
-
- size_t size() const {
- return hist_.size();
- }
-
- bool empty() const {
- return hist_.empty();
- }
-
- private:
- const ConjProxy<Histogram> hist_;
- float offset_;
- };
-
- /**
- * @brief Type synonym for full histogram proxies (with added and conjugated points)
- */
- typedef HistProxy<true> FullHistProxy;
- /**
- * @brief Type synonym for raw histogram proxies (only with directly added points)
- */
- typedef HistProxy<false> RawHistProxy;
-
- typedef typename HistProxy<true>::Iterator HistIterator;
- typedef typename HistProxy<false>::Iterator RawHistIterator;
-
- //---- Traversing edge neighbours ----
-
- template<bool full = true>
- using EdgeHist = std::pair<EdgeId, HistProxy<full>>;
-
- /**
- * @brief A proxy map representing neighbourhood of an edge,
- * where `Key` is the graph edge ID and `Value` is the proxy histogram.
-     * @param full When true, represents all neighbours (consisting of both directly added data
- * and "restored" conjugates).
- * When false, proxifies only the added edges.
- * @detail You can work with the proxy just like with any constant map.
-     * The only major difference is that it returns all contained pairs by value,
-     * because some of them don't exist in the underlying sets and are
- * restored from the conjugate info on-the-fly.
- */
- template<bool full = true>
- class EdgeProxy {
- public:
-
- /**
- * @brief Iterator over a proxy map.
- * @param full When true, traverses both straight and conjugate pairs,
-         * and automatically recalculates the distance for the latter.
- * When false, traverses only the added points and skips the rest.
- */
- class Iterator: public boost::iterator_facade<Iterator, EdgeHist<full>, boost::forward_traversal_tag, EdgeHist<full>> {
-
- typedef typename ConjProxy<InnerMap>::Iterator InnerIterator;
-
- void Skip() {
- if (full) { //For a full iterator, skip possibly repeated edges
- while (!iter_.Conj() &&
- index_.GetImpl(index_.graph().conjugate(edge_), index_.graph().conjugate(iter_->first)).size())
- ++iter_;
-
- } else { //For a raw iterator, skip conjugate pairs
- while (!iter_.Conj() && iter_->first < edge_)
- ++iter_;
- }
- }
-
- public:
- Iterator(const PairedIndex &index, InnerIterator iter, EdgeId edge)
- : index_ (index)
- , iter_(iter)
- , edge_(edge)
- {
- Skip();
- }
-
- void increment() {
- ++iter_;
- Skip();
- }
-
- void operator=(const Iterator &other) {
- //TODO: is this risky without an assertion?
- //We shouldn't reassign iterators from one index onto another
- iter_ = other.iter_;
- edge_ = other.edge_;
- }
-
- private:
- friend class boost::iterator_core_access;
-
- bool equal(const Iterator &other) const {
- return iter_ == other.iter_;
- }
-
- EdgeHist<full> dereference() const {
- EdgeId e2 = iter_->first;
- if (full) {
- float offset = index_.CalcOffset(edge_, e2);
- EdgePair conj = index_.ConjugatePair(edge_, e2);
- if (iter_.Conj()) {
- return std::make_pair(conj.first,
- HistProxy<full>(index_.GetImpl(edge_, conj.first),
- index_.GetImpl(e2, conj.second),
- offset));
- } else {
- return std::make_pair(e2, HistProxy<full>(iter_->second, index_.GetImpl(conj), offset));
- }
- } else {
- return std::make_pair(e2, HistProxy<full>(iter_->second));
- }
- }
-
- private:
- const PairedIndex &index_;
- InnerIterator iter_;
- EdgeId edge_;
- };
-
- EdgeProxy(const PairedIndex &index, const InnerMap& map, const InnerMap& conj_map, EdgeId edge)
- : index_(index), map_(map, conj_map), edge_(edge)
- {}
-
- Iterator begin() const {
- return Iterator(index_, map_.begin(), edge_);
- }
-
- Iterator end() const {
- auto i = full ? map_.end() : map_.conj_begin();
- return Iterator(index_, i, edge_);
- }
-
- HistProxy<full> operator[](EdgeId e2) const {
- //TODO: optimize
- EdgeId e1 = edge_;
- auto offset = index_.CalcOffset(e1, e2);
- if (full) {
- const auto& hist = index_.GetImpl(edge_, e2);
- const auto& conj_hist = index_.GetImpl(index_.ConjugatePair(edge_, e2));
- return HistProxy<full>(hist, conj_hist, offset);
- } else {
- if (index_.SwapConj(e1, e2))
- return HistProxy<full>(HistProxy<full>::empty_hist(), index_.GetImpl(e1, e2), offset);
- else
- return HistProxy<full>(index_.GetImpl(e1, e2));
- }
- }
-
- inline bool empty() const {
- return map_.empty();
- }
-
- private:
- const PairedIndex& index_;
- const ConjProxy<InnerMap> map_;
- EdgeId edge_;
- };
-
- /*template<> HistProxy<true> EdgeProxy<true>::operator[](EdgeId e2) const {
- return index_.Get(edge_, e2);
- }
-
- template<> HistProxy<false> EdgeProxy<false>::operator[](EdgeId e2) const {
- return index_.RawGet(edge_, e2);
- }*/
-
- typedef typename EdgeProxy<true>::Iterator EdgeIterator;
- typedef typename EdgeProxy<false>::Iterator RawEdgeIterator;
-
- //--Constructor--
-
- PairedIndex(const Graph &graph)
- : size_(0), graph_(graph)
- {}
-
- //--Inserting--
-public:
- /**
- * @brief Returns a conjugate pair for two edges.
- */
- inline EdgePair ConjugatePair(EdgeId e1, EdgeId e2) const {
- return std::make_pair(graph_.conjugate(e2), graph_.conjugate(e1));
- }
- /**
- * @brief Returns a conjugate pair for a pair of edges.
- */
- inline EdgePair ConjugatePair(EdgePair ep) const {
- return ConjugatePair(ep.first, ep.second);
- }
-
- bool SwapConj(EdgeId &e1, EdgeId &e2) const {
- EdgePair ep = {e1, e2}, ep_conj = ConjugatePair(ep);
- if (ep > ep_conj) {
- e1 = ep_conj.first;
- e2 = ep_conj.second;
- return true;
- }
- return false;
- }
-
-private:
- bool SwapConj(EdgeId &e1, EdgeId &e2, Point &p) const {
- if (SwapConj(e1, e2)) {
- p.d += CalcOffset(e1, e2);
- return true;
- }
- return false;
- }
-
- float CalcOffset(EdgeId e1, EdgeId e2) const {
- return float(graph_.length(e1)) - float(graph_.length(e2));
- }
-
-public:
- /**
- * @brief Adds a point between two edges to the index,
- * merging weights if there's already one with the same distance.
- */
- void Add(EdgeId e1, EdgeId e2, Point point) {
- SwapConj(e1, e2, point);
- InsertOrMerge(e1, e2, point);
- }
-
- /**
- * @brief Adds a whole set of points between two edges to the index.
- */
- template<typename TH>
- void AddMany(EdgeId e1, EdgeId e2, const TH& hist) {
- float offset = SwapConj(e1, e2) ? CalcOffset(e1, e2) : 0.0f;
- for (auto point : hist) {
- point.d += offset;
- InsertOrMerge(e1, e2, point);
- }
- }
-
-private:
-
- void InsertOrMerge(EdgeId e1, EdgeId e2,
- const Point &sp) {
- auto& straight = storage_[e1][e2];
- auto si = straight.find(sp);
- auto rp = -sp;
- if (si != straight.end()) {
- MergeData(straight, si, sp);
- if (!IsSymmetric(e1, e2, sp)) {
- auto& reversed = storage_[e2][e1];
- auto ri = reversed.find(rp);
- MergeData(reversed, ri, rp);
- }
- } else {
- InsertPoint(straight, sp);
- if (!IsSymmetric(e1, e2, sp)) {
- auto &reversed = storage_[e2][e1];
- InsertPoint(reversed, rp);
- }
- }
- }
-
- //Would be faster, but unstable for hash_map due to the iterator invalidation
- /*void InsertOrMerge(Histogram& straight, Histogram& reversed,
- const Point &sp) {
- auto si = straight.find(sp);
- auto rp = -sp;
- if (si != straight.end()) {
- MergeData(straight, si, sp);
- auto ri = reversed.find(rp);
- MergeData(reversed, ri, rp);
- }
- else {
- InsertPoint(reversed, rp);
- InsertPoint(straight, sp);
- //if (!IsSymmetric(e1, e2, point)) TODO
-
- }
- }*/
-
- static bool IsSymmetric(EdgeId e1, EdgeId e2, Point point) {
- return (e1 == e2) && math::eq(point.d, 0.f);
- }
-
- // modifying the histogram
- inline void InsertPoint(Histogram& histogram, Point point) {
- histogram.insert(point);
- ++size_;
- }
-
- void MergeData(Histogram& hist, typename Histogram::iterator to_update, const Point& to_merge) {
- //We can't just modify the existing point, because if variation is non-zero,
- //resulting distance will differ
- auto to_add = *to_update + to_merge;
- auto after_removed = hist.erase(to_update);
- hist.insert(after_removed, to_add);
- }
-
-public:
- /**
- * @brief Adds a lot of info from another index, using fast merging strategy.
- * Should be used instead of point-by-point index merge.
- */
- template<class Index>
- void Merge(const Index& index_to_add) {
- auto& base_index = storage_;
- for (auto AddI = index_to_add.data_begin(); AddI != index_to_add.data_end(); ++AddI) {
- EdgeId e1_to_add = AddI->first;
- const auto& map_to_add = AddI->second;
- InnerMap& map_already_exists = base_index[e1_to_add];
- MergeInnerMaps(map_to_add, map_already_exists);
- }
- }
-
-private:
- template<class OtherMap>
- void MergeInnerMaps(const OtherMap& map_to_add,
- InnerMap& map) {
- for (const auto& to_add : map_to_add) {
- Histogram &hist_exists = map[to_add.first];
- size_ += hist_exists.merge(to_add.second);
- }
- }
-
-public:
- //--Data deleting methods--
-
- /**
- * @brief Removes the specific entry from the index.
-     * @warning Don't use it on an unclustered index, because hashmaps require set_deleted_item
- * @return The number of deleted entries (0 if there wasn't such entry)
- */
- size_t Remove(EdgeId e1, EdgeId e2, Point point) {
- auto res = RemoveImpl(e1, e2, point);
- auto conj = ConjugatePair(e1, e2);
- point.d += CalcOffset(e2, e1);
- res += RemoveImpl(conj.first, conj.second, point);
- return res;
- }
-
- /**
- * @brief Removes the whole histogram from the index.
-     * @warning Don't use it on an unclustered index, because hashmaps require set_deleted_item
- * @return The number of deleted entries
- */
- size_t Remove(EdgeId e1, EdgeId e2) {
- SwapConj(e1, e2);
- auto res = RemoveAll(e1, e2);
- if (e1 != e2)
- res += RemoveAll(e2, e1);
- return res;
- }
-
-private:
-
- size_t RemoveImpl(EdgeId e1, EdgeId e2, Point point) {
- auto res = RemoveSingle(e1, e2, point);
- if (!IsSymmetric(e1, e2, point))
- res += RemoveSingle(e2, e1, -point);
- return res;
- }
-
-    //TODO: remove duplicated code
- size_t RemoveSingle(EdgeId e1, EdgeId e2, Point point) {
- auto i1 = storage_.find(e1);
- if (i1 != storage_.end()) {
- auto& map = i1->second;
- auto i2 = map.find(e2);
- if (i2 != map.end()) {
- Histogram& hist = i2->second;
- if (hist.erase(point)) {
- --size_;
- if (hist.empty()) {
- map.erase(e2);
- if (map.empty())
- storage_.erase(e1);
- }
- return 1;
- }
- return 0;
- }
- }
- return 0;
- }
-
- size_t RemoveAll(EdgeId e1, EdgeId e2) {
- auto i1 = storage_.find(e1);
- if (i1 != storage_.end()) {
- auto& map = i1->second;
- auto i2 = map.find(e2);
- if (i2 != map.end()) {
- Histogram& hist = i2->second;
- size_t size_decrease = hist.size();
- map.erase(i2);
- size_ -= size_decrease;
- if (map.empty())
- storage_.erase(i1);
- return size_decrease;
- }
- }
- return 0;
- }
-
-public:
-
- /**
- * @brief Removes all neighbourhood of an edge (all edges referring to it, and their histograms)
- * @warning Currently doesn't check the conjugate info (should it?), so it may actually
- * skip some data.
- * @return The number of deleted entries
- */
- size_t Remove(EdgeId edge) {
- InnerMap &inner_map = storage_[edge];
- for (auto iter = inner_map.begin(); iter != inner_map.end(); ++iter) {
- EdgeId e2 = iter->first;
- if (edge != e2) {
- this->Remove(e2, edge);
- }
- }
- size_t size_of_removed = inner_map.size();
- storage_.erase(edge);
- size_ -= size_of_removed;
- return size_of_removed;
- }
-
- // --Accessing--
-
- /**
- * @brief Underlying raw implementation data (for custom iterator helpers).
- */
- ImplIterator data_begin() const {
- return storage_.begin();
- }
-
- /**
- * @brief Underlying raw implementation data (for custom iterator helpers).
- */
- ImplIterator data_end() const {
- return storage_.end();
- }
-
- adt::iterator_range<ImplIterator> data() const {
- return adt::make_range(data_begin(), data_end());
- }
-
- /**
- * @brief Returns a full proxy map to the neighbourhood of some edge.
- */
- EdgeProxy<> Get(EdgeId id) const {
- return EdgeProxy<>(*this, GetImpl(id), GetImpl(graph_.conjugate(id)), id);
- }
-
- /**
- * @brief Returns a raw proxy map to neighboring edges
- * @detail You should use it when you don't care for backward
- * and conjugate info, or don't want to process them twice.
- */
- EdgeProxy<false> RawGet(EdgeId id) const {
- return EdgeProxy<false>(*this, GetImpl(id), empty_map_, id);
- }
-
- /**
- * @brief Operator alias of Get(id).
- */
- EdgeProxy<> operator[](EdgeId id) const {
- return Get(id);
- }
-
-private:
- //When there is no such edge, returns a fake empty map for safety
- const InnerMap& GetImpl(EdgeId e1) const {
- auto i = storage_.find(e1);
- if (i == storage_.end())
- return empty_map_;
- return i->second;
- }
-
- //When there is no such histogram, returns a fake empty histogram for safety
- const Histogram& GetImpl(EdgeId e1, EdgeId e2) const {
- auto i = storage_.find(e1);
- if (i != storage_.end()) {
- auto j = i->second.find(e2);
- if (j != i->second.end())
- return j->second;
- }
- return HistProxy<true>::empty_hist();
- }
-
- inline const Histogram& GetImpl(EdgePair e) const {
- return GetImpl(e.first, e.second);
- }
-
-public:
-
- /**
- * @brief Returns a full histogram proxy for all points between two edges.
- */
- HistProxy<> Get(EdgeId e1, EdgeId e2) const {
- auto offset = CalcOffset(e1, e2);
- return HistProxy<>(GetImpl(e1, e2), GetImpl(ConjugatePair(e1, e2)), offset);
- }
-
- /**
- * @brief Operator alias of Get(e1, e2).
- */
- inline HistProxy<> operator[](EdgePair p) const {
- return Get(p.first, p.second);
- }
-
- /**
- * @brief Returns a raw histogram proxy for only straight points between two edges.
- */
- HistProxy<false> RawGet(EdgeId e1, EdgeId e2) const {
- if (SwapConj(e1, e2))
- return HistProxy<false>(HistProxy<false>::empty_hist(), GetImpl(e1, e2), CalcOffset(e2, e1));
- else
- return HistProxy<false>(GetImpl(e1, e2), HistProxy<false>::empty_hist(), 0);
- }
-
- /**
-     * @brief Checks if an edge (or its conjugate twin) is contained in the index.
- */
- bool contains(EdgeId edge) const {
- return storage_.count(edge) + storage_.count(graph_.conjugate(edge)) > 0;
- }
-
- /**
-     * @brief Checks if there is a histogram for two edges (or their conjugate pair).
- */
- bool contains(EdgeId e1, EdgeId e2) const {
- auto conj = ConjugatePair(e1, e2);
- auto i1 = storage_.find(e1);
- if (i1 != storage_.end() && i1->second.count(e2))
- return true;
- auto i2 = storage_.find(conj.first);
- if (i2 != storage_.end() && i2->second.count(conj.second))
- return true;
- return false;
- }
-
- // --Miscellaneous--
-
- /**
- * Returns the graph the index is based on. Needed for custom iterators.
- */
- const Graph &graph() const { return graph_; }
-
- /**
- * @brief Inits the index with graph data. Used in clustered indexes.
- */
- void Init() {
- for (auto it = graph_.ConstEdgeBegin(); !it.IsEnd(); ++it)
- Add(*it, *it, Point());
- }
-
- /**
- * @brief Clears the whole index. Used in merging.
- */
- void Clear() {
- storage_.clear();
- size_ = 0;
- }
-
- /**
- * @brief Returns the physical index size (total count of all edge pairs)
- * @warning (not really total, doesn't include the conjugate info)
- */
- size_t size() const { return size_; }
-
-private:
- size_t size_;
- const Graph& graph_;
- StorageMap storage_;
- InnerMap empty_map_; //null object
-};
-
-//Aliases for common graphs
-template<typename K, typename V>
-using safe_btree_map = btree::safe_btree_map<K, V>; //Two-parameters wrapper
-template<typename Graph>
-using PairedInfoIndexT = PairedIndex<Graph, HistogramWithWeight, safe_btree_map>;
-
-template<typename K, typename V>
-using sparse_hash_map = google::sparse_hash_map<K, V>; //Two-parameters wrapper
-template<typename Graph>
-using UnclusteredPairedInfoIndexT = PairedIndex<Graph, RawHistogram, sparse_hash_map>;
-
-/**
- * @brief A collection of paired indexes which can be manipulated as one.
- * Used as a convenient wrapper in parallel index processing.
- */
-template<class Index>
-class PairedIndices {
- typedef std::vector<Index> Storage;
- Storage data_;
-
-public:
- PairedIndices() {}
-
- PairedIndices(const typename Index::Graph& graph, size_t lib_num) {
- data_.reserve(lib_num);
- for (size_t i = 0; i < lib_num; ++i)
- data_.emplace_back(graph);
- }
-
- /**
- * @brief Inits all indexes.
- */
- void Init() { for (auto& it : data_) it.Init(); }
-
- /**
- * @brief Clears all indexes.
- */
- void Clear() { for (auto& it : data_) it.Clear(); }
-
- Index& operator[](size_t i) { return data_[i]; }
-
- const Index& operator[](size_t i) const { return data_[i]; }
-
- size_t size() const { return data_.size(); }
-
- typename Storage::iterator begin() { return data_.begin(); }
- typename Storage::iterator end() { return data_.end(); }
-
- typename Storage::const_iterator begin() const { return data_.begin(); }
- typename Storage::const_iterator end() const { return data_.end(); }
-};
-
-template<class Graph>
-using PairedInfoIndicesT = PairedIndices<PairedInfoIndexT<Graph>>;
-
-template<class Graph>
-using UnclusteredPairedInfoIndicesT = PairedIndices<UnclusteredPairedInfoIndexT<Graph>>;
-
-template<typename K, typename V>
-using unordered_map = std::unordered_map<K, V>; //Two-parameters wrapper
-template<class Graph>
-using PairedInfoBuffer = PairedIndex<Graph, RawHistogram, unordered_map>;
-
-template<class Graph>
-using PairedInfoBuffersT = PairedIndices<PairedInfoBuffer<Graph>>;
-
-/*
-//Debug
-template<typename T>
-std::ostream& operator<<(std::ostream& str, const PairedInfoBuffer<T>& pi) {
- str << "--- PI of size " << pi.size() << "---\n";
-
- for (auto i = pi.data_begin(); i != pi.data_end(); ++i) {
- auto e1 = i->first;
- str << e1 << " has: \n";
-
- for (auto j = i->second.begin(); j != i->second.end(); ++j) {
- str << "- " << j->first << ": ";
- for (auto p : j->second)
- str << p << ", ";
- str << std::endl;
- }
- }
-
- str << "-------\n";
- return str;
-}
-
-//Debug
-template<typename T>
-std::ostream& operator<<(std::ostream& str, const PairedInfoIndexT<T>& pi) {
- str << "--- PI of size " << pi.size() << "---\n";
-
- for (auto i = pi.data_begin(); i != pi.data_end(); ++i) {
- auto e1 = i->first;
- str << e1 << " has: \n";
-
- for (auto j = i->second.begin(); j != i->second.end(); ++j) {
- str << "- " << j->first << ": ";
- for (auto p : j->second)
- str << p << ", ";
- str << std::endl;
- }
- }
-
- str << "-------\n";
- return str;
-}
-*/
-
-}
-
-}
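
ConjugatePair() maps (e1, e2) to (conjugate(e2), conjugate(e1)), and SwapConj() keeps whichever of the pair and its conjugate compares smaller, so only one of the two symmetric entries is stored explicitly and the proxies restore the other half on the fly. A toy standalone illustration where edge IDs are plain ints and conjugate(e) is taken to be e ^ 1 (an assumption made purely for this sketch, not the SPAdes edge-ID convention):

    #include <algorithm>
    #include <iostream>
    #include <utility>

    // Toy model: edges are ints, conjugate(e) == (e ^ 1). Illustration only.
    using EdgePair = std::pair<int, int>;

    EdgePair conjugate_pair(EdgePair ep) {
        return {ep.second ^ 1, ep.first ^ 1};  // (conj(e2), conj(e1)), as in ConjugatePair()
    }

    // Mirrors SwapConj(): store the lexicographically smaller of {ep, conj(ep)}.
    EdgePair canonical(EdgePair ep) {
        return std::min(ep, conjugate_pair(ep));
    }

    int main() {
        EdgePair ep{7, 2};
        std::cout << canonical(ep).first << " " << canonical(ep).second << "\n";  // prints: 3 6
    }
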
diff --git a/src/include/de/paired_info_helpers.hpp b/src/include/de/paired_info_helpers.hpp
deleted file mode 100644
index 9223ad8..0000000
--- a/src/include/de/paired_info_helpers.hpp
+++ /dev/null
@@ -1,149 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include "paired_info.hpp"
-#include "boost/optional.hpp"
-
-namespace omnigraph {
-
-namespace de {
-
-template<typename Index, bool full>
-class EdgePairIterator :
- public boost::iterator_facade<EdgePairIterator<Index, full>,
- typename Index::FullHistProxy,
- boost::forward_traversal_tag,
- typename Index::FullHistProxy>
-{
- typedef typename ConjProxy<typename Index::StorageMap>::Iterator OuterIterator;
- typedef boost::optional<typename Index::InnerMap::const_iterator> InnerIterator;
-
-protected:
-    //They're not intended to be constructed explicitly, only via begin/end.
- EdgePairIterator(const Index& index, OuterIterator i)
- : index_(index), i_(i)
- {
- StartOver();
- }
-
- bool FakePair() {
- auto ep = std::make_pair(i_->first, (*j_)->first);
- return ep > index_.ConjugatePair(ep);
- }
-
- inline void Skip() { //For a raw iterator, skip conjugate pairs
- while (!full && j_ && FakePair()) {
- IncImpl();
- }
- }
-
- void IncImpl() {
- ++(*j_);
- if (j_ == i_->second.end()) { //Traversed all neighbours, jump to the next edge
- ++i_;
- StartOver();
- }
- }
-
-public:
- void increment() {
- IncImpl();
- Skip();
- }
-
-private:
- void StartOver() {
- if (i_.Iter() == index_.data_end()) {
- j_.reset();
- } else {
- j_ = i_->second.begin();
- Skip();
- }
- }
-
-public:
-
- typename Index::FullHistProxy dereference() const {
- return index_.Get(first(), second()); //TODO: optimize
- }
-
- bool equal(const EdgePairIterator &other) const {
- return (j_ == other.j_) && (i_.Conj() == other.i_.Conj());
- }
-
- typename Index::EdgeId first() const {
- if (i_.Conj())
- return index_.graph().conjugate((*j_)->first);
- return i_->first;
- }
-
- typename Index::EdgeId second() const {
- if (i_.Conj())
- return index_.graph().conjugate(i_->first);
- return (*j_)->first;
- }
-
- static EdgePairIterator begin(const Index& index) {
- auto i = OuterIterator(index.data_begin(), index.data_end(), index.data_begin(), !index.size());
- return EdgePairIterator(index, i);
- }
-
- static EdgePairIterator end(const Index& index) {
- auto stop = full ? index.data_end() : index.data_begin();
- auto i = OuterIterator(stop, index.data_end(), index.data_begin(), true);
- return EdgePairIterator(index, i);
- }
-
-private:
- const Index &index_;
- OuterIterator i_;
- InnerIterator j_;
-};
-
-template<typename Storage>
-inline EdgePairIterator<Storage, true> pair_begin(const Storage &s) {
- return EdgePairIterator<Storage, true>::begin(s);
-}
-
-template<typename Storage>
-inline EdgePairIterator<Storage, true> pair_end(const Storage &s) {
- return EdgePairIterator<Storage, true>::end(s);
-}
-
-template<typename Storage>
-inline EdgePairIterator<Storage, false> raw_pair_begin(const Storage &s) {
- return EdgePairIterator<Storage, false>::begin(s);
-}
-
-template<typename Storage>
-inline EdgePairIterator<Storage, false> raw_pair_end(const Storage &s) {
- return EdgePairIterator<Storage, false>::end(s);
-}
-
-//Small wrapper for range-based loops
-//Usage: for (auto i in PairsOf(index))
-/*template <typename Storage>
-class PairsOf {
-public:
- EdgePairIterator<Storage> begin() const{
- return pair_begin(storage_);
- }
-
- EdgePairIterator<Storage> end() const{
- return pair_begin(storage_);
- }
-
- PairsOf(const Storage& storage)
- : storage_(storage) {}
-private:
- const Storage& storage_;
-};*/
-
-}
-
-}
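
pair_begin()/pair_end() (and their raw_ counterparts) are the intended way to walk every stored edge pair, as PairInfoFilter::Filter() does in pair_info_filters.hpp above. A minimal hedged sketch, assuming this header and a populated index:

    // Sketch only; assumes paired_info_helpers.hpp above and a populated index.
    template<class Graph>
    size_t CountStoredPairs(const omnigraph::de::PairedInfoIndexT<Graph> &index) {
        size_t n = 0;
        for (auto i = omnigraph::de::pair_begin(index); i != omnigraph::de::pair_end(index); ++i)
            ++n;  // i.first()/i.second() give the edge pair, *i the histogram proxy
        return n;
    }
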
diff --git a/src/include/de/peak_finder.hpp b/src/include/de/peak_finder.hpp
deleted file mode 100644
index 4f55614..0000000
--- a/src/include/de/peak_finder.hpp
+++ /dev/null
@@ -1,386 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-/*
- * PeakFinder.hpp
- *
- * Created on: Aug 15, 2011
- * Author: alexeyka
- */
-
-#ifndef PEAKFINDER_HPP_
-#define PEAKFINDER_HPP_
-
-#include "verify.hpp"
-#include "data_divider.hpp"
-#include "paired_info.hpp"
-#include "omni/omni_utils.hpp"
-#include <stdio.h>
-#include <stdlib.h>
-#include <time.h>
-#include <complex>
-#include <cmath>
-
-namespace omnigraph{
-
-namespace de {
-
-template <class EdgeId>
-class PeakFinder {
-
- typedef std::complex<double> complex_t;
-
- public:
- PeakFinder(const vector<PairInfo<EdgeId> >& data,
- size_t begin,
- size_t end,
- size_t /*range*/,
- size_t delta,
- double percentage,
- double der_thr) :
- delta_(delta),
- percentage_(percentage),
- der_thr_(der_thr)
- {
- for (size_t i = begin; i < end; ++i) {
- x_.push_back(rounded_d(data[i]));
- y_.push_back(data[i].weight());
- }
- Init();
- }
-
- double weight() const {
- return weight_;
- }
-
- double GetNormalizedWeight() const {
- return weight_;
- }
-
- void PrintStats(string host) const {
- for (size_t i = 0; i < data_len_; ++i)
- DEBUG(host << (x_left_ + (int) i) << " " << hist_[i]);
- }
-
- void FFTSmoothing(double cutoff) {
- VERIFY(data_len_ > 0);
- if (data_len_ == 1) {
- hist_[0] = y_[0];
- return;
- }
- InitBaseline();
- SubtractBaseline();
- FFTForward(hist_);
- size_t Ncrit = (size_t) (cutoff);
-
- // cutting off - standard parabolic filter
- for (size_t i = 0; i < data_len_ && i < Ncrit; ++i)
- hist_[i] *= 1. - ((double) i * (double) i * 1.) / (double) (Ncrit * Ncrit);
-
- for (size_t i = Ncrit; i < hist_.size(); ++i)
- hist_[i] = 0.;
-
- FFTBackward(hist_);
- AddBaseline();
- }
-
- bool IsPeak(int dist, size_t range) const {
- return IsLocalMaximum(dist, range);
- }
-
- bool IsPeak(int dist) const {
- return IsLocalMaximum(dist, 10);
- }
-
-    // looking for local maxima in the smoothed histogram
- vector<pair<int, double> > ListPeaks(/*int delta = 3*/) const {
- TRACE("Smoothed data");
- //size_t index_max = 0;
- //for (size_t i = 0; i < data_len_; ++i) {
- //TRACE(x_left_ + (int) i << " " << hist_[i]);
- //if (hist_[i].real() > hist_[index_max].real())
- //index_max = i;
- //}
- //vector<pair<int, double> > result;
- //result.push_back(make_pair(x_left_ + index_max, hist_[index_max].real()));
- //return result;
- DEBUG("Listing peaks");
- map<int, double> peaks_;
- //another data_len_
- size_t data_len_ = (size_t) (x_right_ - x_left_);
- vector<bool> was;
- srand((unsigned) time(NULL));
- for (size_t i = 0; i < data_len_; ++i)
- was.push_back(false);
-
- size_t iteration = 0;
- for (size_t l = 0; l < data_len_; ++l) {
- //for (size_t k = 0; k < 4; ++k) {
- //size_t v = std::rand() % data_len_;
- size_t v = l;
- if (was[v])
- continue;
-
- was[v] = true;
- int index = (int) v + x_left_;
- while (index < (x_right_ - 1) && index > x_left_ && iteration < 5) {
- // if @index is local maximum, then leave it
- double right_derivative = RightDerivative(index);
- double left_derivative = LeftDerivative(index);
-
- if (math::gr(right_derivative, 0.) && math::gr(right_derivative, -left_derivative)) {
- index++;
- if ((iteration & 1) == 0)
- ++iteration;
- }
- else if (math::le(left_derivative, 0.)) {
- index--;
- if ((iteration & 1) == 1)
- ++iteration;
- }
- else
- break;
- }
-
- TRACE("FOUND " << index);
-
- //double right_derivative = RightDerivative(index);
- //double left_derivative = LeftDerivative(index);
-
- if (index < 0)
- continue;
-
- //if (index >= x_right_ - delta || index < x_left_ + delta)
- //continue;
-
- TRACE("Is in range");
-
- if (IsPeak(index, 5)) {
- TRACE("Is local maximum " << index);
- double weight_ = 0.;
- int left_bound = (x_left_ > (index - 20) ? x_left_ : (index - 20));
- int right_bound = (x_right_ < (index + 1 + 20) ? x_right_ : (index + 1 + 20));
- for (int i = left_bound; i < right_bound; ++i)
- weight_ += hist_[i - x_left_].real();
- TRACE("WEIGHT counted");
- pair<int, double> tmp_pair = make_pair(index, 100. * weight_);
- if (!peaks_.count(index)) {
- TRACE("Peaks size " << peaks_.size() << ", inserting " << tmp_pair);
- peaks_.insert(tmp_pair);
- } else {
- TRACE("NON UNIQUE");
- }
- }
- }
- TRACE("FINISHED " << peaks_.size());
- vector<pair<int, double> > peaks;
- for (auto iter = peaks_.begin(); iter != peaks_.end(); ++iter) {
- const pair<int, double>& tmp_pair = *iter;
- TRACE("next peak " << tmp_pair);
- peaks.push_back(tmp_pair);
- //for (int i = -10; i <= 10; ++i) {
- //peaks.push_back(make_pair(tmp_pair.first + i, tmp_pair.second / 21.));
- //}
- }
- return peaks;
- }
-
- vector<complex_t> getIn() const {
- return hist_;
- }
-
- vector<complex_t> getOut() const {
- return hist_;
- }
-
-private:
- double x1, x2, y1, y2;
- size_t delta_;
- double percentage_;
- double der_thr_;
- double weight_;
- vector<int> x_;
- vector<double> y_;
- size_t data_size_, data_len_;
- int x_left_, x_right_;
- vector<complex_t> hist_;
-
- size_t Rev(size_t num, size_t lg_n) {
- size_t res = 0;
- for (size_t i = 0; i < lg_n; ++i)
- if (num & (1 << i))
- res |= 1 << (lg_n - 1 - i);
- return res;
- }
-
- void FFT(vector<complex_t>& vect, bool invert) {
- size_t n = vect.size();
- size_t lg_n = 0;
- while ( (1u << lg_n) < n)
- ++lg_n;
-
- while (n < (1u << lg_n)) {
- vect.push_back(0.);
- ++n;
- }
-
- for (size_t i = 0; i < n; ++i)
- if (i < Rev(i, lg_n))
- swap(vect[i], vect[Rev(i, lg_n)]);
-
- for (size_t len = 2; len < 1 + n; len <<= 1) {
- double ang = 2 * M_PI / (double) len * (invert ? -1 : 1);
- complex_t wlen(cos(ang), sin(ang));
- for (size_t i = 0; i < n; i += len) {
- complex_t w(1.);
- for (size_t j = 0; j < (len >> 1); ++j) {
- complex_t u = vect[i + j];
- complex_t v = vect[i + j + (len >> 1)] * w;
- vect[i + j] = u + v;
- vect[i + j + (len >> 1)] = u - v;
- w *= wlen;
- }
- }
- }
-
- if (invert)
- for (size_t i = 0; i < n; ++i)
- vect[i] /= (double) n;
- }
-
-
- void FFTForward(vector<complex_t>& vect) {
- FFT(vect, false);
- }
-
- void FFTBackward(vector<complex_t>& vect) {
- FFT(vect, true);
- }
-
- void ExtendLinear(vector<complex_t>& hist) {
- size_t ind = 0;
- weight_ = 0.;
- for (size_t i = 0; i < data_len_; ++i) {
- if (ind == data_size_ - 1)
- hist.push_back((double) x_right_);
- else {
- VERIFY(x_[ind + 1] > x_[ind]);
- hist.push_back(((double) (i + x_left_ - x_[ind]) *
- y_[ind + 1] + y_[ind] *
- (double) (x_[ind + 1] - i - x_left_)) /
- (double) (1 * (x_[ind + 1] - x_[ind])));
- }
- weight_ += hist[i].real(); // filling the array on the fly
-
- if (ind < data_size_ && ((int) i == x_[ind + 1] - x_left_))
- ++ind;
- }
-
- }
-
-
- void InitBaseline() {
- size_t Np = (size_t) ((double) data_len_ * percentage_);
-        if (Np == 0) Np++; // Np must be non-zero
-
- double mean_beg = 0.;
- double mean_end = 0.;
- for (size_t i = 0; i < Np; ++i) {
- mean_beg += hist_[i].real();
- mean_end += hist_[data_len_ - i - 1].real();
- }
- mean_beg /= 1. * (double) Np;
- mean_end /= 1. * (double) Np;
-
- // two points defining the line
- x1 = (double) Np / 2.;
- x2 = (double) data_len_ - (double) Np / 2.;
- y1 = mean_beg;
- y2 = mean_end;
- }
-
- void SubtractBaseline() {
- // subtracting a baseline
- // it's being constructed like this: the first point is (Np/2; mean of the first percentage of data),
-        // the second point is (data_len_ - Np/2; mean of the last percentage of data)
- for (size_t i = 0; i < data_len_; ++i) {
- hist_[i] -= (y1 + (y2 - y1) * ((double) i - x1) / (x2 - x1));
- }
- }
-
- void AddBaseline() {
- for (size_t i = 0; i < data_len_; ++i) {
- hist_[i] += (y1 + (y2 - y1) * ((double) i - x1) / (x2 - x1));
- }
- }
-
- void Init() {
- data_size_ = x_.size();
- x_left_ = x_[0];
- x_right_ = x_[data_size_ - 1] + 1;
- data_len_ = x_right_ - x_left_;
- ExtendLinear(hist_);
- }
-
- bool IsInRange(int peak) const {
- return peak < x_right_ && peak >= x_left_;
- }
-
- double LeftDerivative(int dist) const {
- VERIFY(dist > x_left_);
- return hist_[dist - x_left_].real() - hist_[dist - x_left_ - 1].real();
- }
-
- double RightDerivative(int dist) const {
- VERIFY(dist < x_right_ - 1);
- return hist_[dist - x_left_ + 1].real() - hist_[dist - x_left_].real();
- }
-
- double MiddleDerivative(int dist) const {
- VERIFY(dist > x_left_ && dist < x_right_ - 1);
- return .5 * (hist_[dist - x_left_ + 1].real() - hist_[dist - x_left_ - 1].real());
- }
-
- double Derivative(int dist) const {
- if (dist == x_right_ - 1)
- return LeftDerivative(dist);
- else if (dist == x_left_)
- return RightDerivative(dist);
- else
- return MiddleDerivative(dist);
- }
-
- bool IsLocalMaximum(int peak, size_t range, int left_bound, int right_bound, size_t delta) const {
-
- DEBUG("Is local maximum : peak " << peak << " range " << range
- << " bounds " << left_bound << " " << right_bound << " delta " << delta);
- int index_max = peak;
- TRACE("Looking for the maximum");
- for (int j = left_bound; j < right_bound; ++j)
- if (math::ls(hist_[index_max - x_left_].real(), hist_[j - x_left_].real())) {
- index_max = j;
- }// else if (j < i && hist_[index_max - x_left_][0] == hist_[j - x_left][0] ) index_max = j;
- TRACE("Maximum is " << index_max);
-
- if ((size_t)abs(index_max - peak) <= delta)
- return true;
-
- return false;
- }
-
- bool IsLocalMaximum(int peak, size_t range) const {
- return IsLocalMaximum(peak, range, x_left_, x_right_, delta_);
- }
-
- DECL_LOGGER("PeakFinder");
-};
-
-}
-
-}
-
-#endif /* PEAKFINDER_HPP_ */
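
FFTSmoothing() above is a classic low-pass smoothing: forward FFT, attenuate coefficient i by 1 - i^2/Ncrit^2, zero everything from Ncrit on, inverse FFT. A standalone sketch of just the frequency-domain weighting (the choice of Ncrit is left to the caller, as in the original):

    #include <complex>
    #include <cstddef>
    #include <vector>

    // Frequency-domain weights of the parabolic low-pass filter used in FFTSmoothing().
    void parabolic_filter(std::vector<std::complex<double>> &spectrum, std::size_t Ncrit) {
        for (std::size_t i = 0; i < spectrum.size(); ++i) {
            if (i < Ncrit)
                spectrum[i] *= 1.0 - double(i) * double(i) / double(Ncrit * Ncrit);
            else
                spectrum[i] = 0.0;  // hard cutoff above Ncrit
        }
    }
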
diff --git a/src/include/de/smoothing_distance_estimation.hpp b/src/include/de/smoothing_distance_estimation.hpp
deleted file mode 100644
index c3c1bf8..0000000
--- a/src/include/de/smoothing_distance_estimation.hpp
+++ /dev/null
@@ -1,221 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#ifndef SMOOTHING_DISTANCE_ESTIMATION_HPP_
-#define SMOOTHING_DISTANCE_ESTIMATION_HPP_
-
-#include "paired_info.hpp"
-#include "omni/omni_utils.hpp"
-#include "data_divider.hpp"
-#include "peak_finder.hpp"
-#include "extensive_distance_estimation.hpp"
-
-namespace omnigraph {
-
-namespace de {
-
-template<class Graph>
-class SmoothingDistanceEstimator: public ExtensiveDistanceEstimator<Graph> {
- //FIXME configure
- static const size_t OVERLAP_TOLERANCE = 1000;
-protected:
- typedef ExtensiveDistanceEstimator<Graph> base;
- typedef typename base::InPairedIndex InPairedIndex;
- typedef typename base::OutPairedIndex OutPairedIndex;
- typedef typename base::InHistogram InHistogram;
- typedef typename base::OutHistogram OutHistogram;
- typedef typename InPairedIndex::Histogram TempHistogram;
-
- public:
- SmoothingDistanceEstimator(const Graph& graph,
- const InPairedIndex& histogram,
- const GraphDistanceFinder<Graph>& dist_finder,
- std::function<double(int)> weight_f,
- size_t linkage_distance, size_t max_distance, size_t threshold,
- double range_coeff, double delta_coeff,
- size_t cutoff,
- size_t min_peak_points,
- double inv_density,
- double percentage,
- double derivative_threshold,
- bool only_scaffolding = false) :
- base(graph, histogram, dist_finder, weight_f, linkage_distance, max_distance),
- threshold_(threshold),
- range_coeff_(range_coeff),
- delta_coeff_(delta_coeff),
- cutoff_((int) cutoff),
- min_peak_points_(min_peak_points),
- inv_density_(inv_density),
- percentage_(percentage),
- deriv_thr(derivative_threshold),
- only_scaffolding_(only_scaffolding),
- gap_distances(0)
- {}
-
- virtual ~SmoothingDistanceEstimator(){}
-
-protected:
- typedef typename Graph::EdgeId EdgeId;
- typedef pair<EdgeId, EdgeId> EdgePair;
- typedef vector<pair<int, double> > EstimHist;
- typedef vector<PairInfo<EdgeId> > PairInfos;
- typedef vector<size_t> GraphLengths;
-
- EstimHist EstimateEdgePairDistances(EdgePair /*ep*/,
- const InHistogram& /*raw_data*/,
- const vector<size_t>& /*forward*/) const override {
- VERIFY_MSG(false, "Sorry, the SMOOOOTHING estimator is not available anymore." <<
- "SPAdes is going to terminate");
-
- return EstimHist();
- }
-
-private:
- typedef pair<size_t, size_t> Interval;
-
- size_t threshold_;
- double range_coeff_;
- double delta_coeff_;
- int cutoff_;
- size_t min_peak_points_;
- double inv_density_;
- double percentage_;
- double deriv_thr;
- bool only_scaffolding_;
- mutable size_t gap_distances;
-
- EstimHist FindEdgePairDistances(EdgePair ep,
- const InHistogram& raw_hist) const {
- size_t first_len = this->graph().length(ep.first);
- size_t second_len = this->graph().length(ep.second);
- TRACE("Lengths are " << first_len << " " << second_len);
- TempHistogram data;
- for (auto I = raw_hist.begin(), E = raw_hist.end(); I != E; ++I) {
- Point p = *I;
- if (math::ge(2 * (long) rounded_d(p) + (long) second_len, (long) first_len))
- if ((long) rounded_d(p) + (long) OVERLAP_TOLERANCE >= (long) first_len)
- data.insert(p);
- }
- EstimHist result;
- double picture_weight = 0.;
- for (auto I = data.begin(), E = data.end(); I != E; ++I)
- picture_weight += I->weight;
- if (math::ls(picture_weight, 3.))
- return result;
-
- DataDivider<EdgeId> data_divider(threshold_,
- vector<Point>(data.begin(), data.end()));
-
- PairInfos infos;
- infos.reserve(data.size());
- const vector<Interval>& clusters =
- data_divider.DivideAndSmoothData(ep, infos, this->weight_f_);
- DEBUG("Seeking for distances");
- TRACE("size " << infos.size());
-
- for (size_t i = 0; i < clusters.size(); ++i) {
- size_t begin = clusters[i].first;
- size_t end = clusters[i].second;
- TRACE("begin " << begin << " at " << rounded_d(infos[begin])
- << ", " << " end " << end << " at " << rounded_d(infos[end - 1]));
- size_t data_length = rounded_d(infos[end - 1]) - rounded_d(infos[begin]) + 1;
- TRACE("data length " << data_length);
- if (end - begin > min_peak_points_) {
- size_t range = (size_t) math::round((double) data_length * range_coeff_);
- size_t delta = (size_t) math::round((double) data_length * delta_coeff_);
- PeakFinder<EdgeId> peakfinder(infos, begin, end, range, delta, percentage_, deriv_thr);
- DEBUG("Processing window : " << rounded_d(infos[begin])
- << " " << rounded_d(infos[end - 1]));
- peakfinder.FFTSmoothing(cutoff_);
- TRACE("Listing peaks");
- const EstimHist& peaks = peakfinder.ListPeaks();
- //for (auto iter = peaks.begin(); iter != peaks.end(); ++iter) {
- //TRACE("PEAKS " << iter->first << " " << iter->second);
- //}
- if (peaks.size() == 0)
- continue;
- size_t index_of_max_weight = 0;
- for (size_t i = 0; i < peaks.size(); ++i)
- if (math::ls(peaks[index_of_max_weight].second, peaks[i].second))
- index_of_max_weight = i;
- result.push_back(peaks[index_of_max_weight]);
- }
- }
-
- if (result.size() == 0)
- return result;
- size_t index_of_max_weight = 0;
- for (size_t i = 0; i < result.size(); ++i)
- if (math::ls(result[index_of_max_weight].second, result[i].second))
- index_of_max_weight = i;
-
- EstimHist new_result;
- for (size_t i = 0; i < result.size(); ++i)
- if (result[i].second > .5 * result[index_of_max_weight].second)
- new_result.push_back(result[i]);
- return new_result;
- }
-
- void ProcessEdge(EdgeId e1,
- const InPairedIndex& pi,
- PairedInfoBuffer<Graph>& result) const override {
- typename base::LengthMap second_edges;
- auto inner_map = pi.RawGet(e1);
- for (auto I : inner_map)
- second_edges[I.first];
-
- this->FillGraphDistancesLengths(e1, second_edges);
-
- for (const auto& entry: second_edges) {
- EdgeId e2 = entry.first;
- EdgePair ep(e1, e2);
-
- VERIFY(ep <= pi.ConjugatePair(ep));
-
- TRACE("Processing edge pair " << this->graph().int_id(e1)
- << " " << this->graph().int_id(e2));
- const GraphLengths& forward = entry.second;
-
- TempHistogram hist = pi.Get(e1, e2).Unwrap();
- EstimHist estimated;
- //DEBUG("Extending paired information");
- //DEBUG("Extend left");
- //this->base::ExtendInfoLeft(e1, e2, hist, 1000);
- DEBUG("Extend right");
- this->ExtendInfoRight(e1, e2, hist, 1000);
- if (forward.size() == 0) {
- estimated = FindEdgePairDistances(ep, hist);
- ++gap_distances;
- } else if (forward.size() > 0 && (!only_scaffolding_)) {
- estimated = this->base::EstimateEdgePairDistances(ep, hist, forward);
- }
- DEBUG(gap_distances << " distances between gap edge pairs have been found");
- OutHistogram res = this->ClusterResult(ep, estimated);
- this->AddToResult(res, ep, result);
- }
- }
-
- bool IsTipTip(EdgeId e1, EdgeId e2) const {
- return (this->graph().OutgoingEdgeCount(this->graph().EdgeEnd(e1)) == 0 &&
- this->graph().IncomingEdgeCount(this->graph().EdgeEnd(e1)) == 1 &&
- this->graph().IncomingEdgeCount(this->graph().EdgeStart(e2)) == 0 &&
- this->graph().OutgoingEdgeCount(this->graph().EdgeStart(e2)) == 1);
- }
-
- const string Name() const override {
- static const string my_name = "SMOOTHING";
- return my_name;
- }
-
- DECL_LOGGER("SmoothingDistanceEstimator")
-};
-
-}
-
-}
-
-#endif /* SMOOTHING_DISTANCE_ESTIMATION_HPP_ */
diff --git a/src/include/de/weighted_distance_estimation.hpp b/src/include/de/weighted_distance_estimation.hpp
deleted file mode 100644
index eecb6bc..0000000
--- a/src/include/de/weighted_distance_estimation.hpp
+++ /dev/null
@@ -1,115 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#ifndef WEIGHTED_DISTANCE_ESTIMATION_HPP_
-#define WEIGHTED_DISTANCE_ESTIMATION_HPP_
-
-#include "xmath.h"
-#include "paired_info.hpp"
-#include "omni/omni_utils.hpp"
-#include "distance_estimation.hpp"
-
-namespace omnigraph {
-
-namespace de {
-
-template<class Graph>
-class WeightedDistanceEstimator: public DistanceEstimator<Graph> {
- protected:
- typedef DistanceEstimator<Graph> base;
- typedef typename base::InPairedIndex InPairedIndex;
- typedef typename base::OutPairedIndex OutPairedIndex;
- typedef typename base::InHistogram InHistogram;
- typedef typename base::OutHistogram OutHistogram;
-
- public:
- WeightedDistanceEstimator(const Graph &graph,
- const InPairedIndex& histogram,
- const GraphDistanceFinder<Graph>& distance_finder, std::function<double(int)> weight_f,
- size_t linkage_distance, size_t max_distance) :
- base(graph, histogram, distance_finder, linkage_distance, max_distance), weight_f_(weight_f)
- {}
-
- virtual ~WeightedDistanceEstimator() {}
-
- protected:
- typedef typename Graph::EdgeId EdgeId;
-
- typedef vector<pair<int, double> > EstimHist;
- typedef pair<EdgeId, EdgeId> EdgePair;
- typedef vector<size_t> GraphLengths;
-
- std::function<double(int)> weight_f_;
-
- virtual EstimHist EstimateEdgePairDistances(EdgePair ep,
- const InHistogram& histogram,
- const GraphLengths& raw_forward) const override {
- using std::abs;
- using namespace math;
- TRACE("Estimating with weight function");
- size_t first_len = this->graph().length(ep.first);
- size_t second_len = this->graph().length(ep.second);
-
- EstimHist result;
- int maxD = rounded_d(histogram.max()), minD = rounded_d(histogram.min());
- vector<int> forward;
- for (auto length : raw_forward) {
- if (minD - (int) this->max_distance_ <= length && length <= maxD + (int) this->max_distance_) {
- forward.push_back(length);
- }
- }
- if (forward.size() == 0)
- return result;
-
- DEDistance max_dist = this->max_distance_;
- size_t cur_dist = 0;
- vector<double> weights(forward.size());
- for (auto iter = histogram.begin(); iter != histogram.end(); ++iter) {
- Point point = *iter;
- if (ls(2. * point.d + (double) second_len, (double) first_len))
- continue;
- while (cur_dist + 1 < forward.size() && (double) forward[cur_dist + 1] < point.d) {
- ++cur_dist;
- }
- if (cur_dist + 1 < forward.size() && ls((double) forward[cur_dist + 1] - point.d,
- point.d - (double) forward[cur_dist])) {
- ++cur_dist;
- if (le(abs(forward[cur_dist] - point.d), max_dist))
- weights[cur_dist] += point.weight * weight_f_(forward[cur_dist] - rounded_d(point));
- }
- else if (cur_dist + 1 < forward.size() && eq(forward[cur_dist + 1] - point.d,
- point.d - forward[cur_dist])) {
- if (le(abs(forward[cur_dist] - point.d), max_dist))
- weights[cur_dist] += point.weight * 0.5 * weight_f_(forward[cur_dist] - rounded_d(point));
-
- ++cur_dist;
-
- if (le(abs(forward[cur_dist] - point.d), max_dist))
- weights[cur_dist] += point.weight * 0.5 * weight_f_(forward[cur_dist] - rounded_d(point));
- } else
- if (le(abs(forward[cur_dist] - point.d), max_dist))
- weights[cur_dist] += point.weight * weight_f_(forward[cur_dist] - rounded_d(point));
- }
-
- for (size_t i = 0; i < forward.size(); ++i)
- if (gr(weights[i], 0.))
- result.push_back(make_pair(forward[i], weights[i]));
-
- return result;
- }
-
- const string Name() const override {
- static const string my_name = "WEIGHTED";
- return my_name;
- }
-
-};
-
-}
-
-}
-#endif
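
The core of EstimateEdgePairDistances() above is a weighted vote: each histogram point is assigned to the nearest admissible candidate distance and contributes point.weight times a weight function of the residual (split 50/50 on exact ties). A simplified standalone sketch of that idea; the exponential weight function here is a hypothetical stand-in, not the one SPAdes configures:

    #include <cmath>
    #include <map>
    #include <utility>
    #include <vector>

    // Toy nearest-candidate voting; ties and the max-distance cap are omitted for brevity.
    std::map<int, double> weighted_votes(const std::vector<std::pair<double, double>> &points,  // (distance, weight)
                                         const std::vector<int> &candidates) {
        std::map<int, double> votes;
        if (candidates.empty())
            return votes;
        auto weight_f = [](double residual) { return std::exp(-std::abs(residual) / 10.0); };
        for (const auto &p : points) {
            int best = candidates.front();
            for (int c : candidates)
                if (std::abs(c - p.first) < std::abs(best - p.first)) best = c;
            votes[best] += p.second * weight_f(best - p.first);
        }
        return votes;
    }
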
diff --git a/src/include/file_limit.hpp b/src/include/file_limit.hpp
deleted file mode 100644
index 234ae14..0000000
--- a/src/include/file_limit.hpp
+++ /dev/null
@@ -1,33 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include <sys/types.h>
-#include <sys/time.h>
-#include <sys/resource.h>
-
-#include "verify.hpp"
-
-inline rlim_t limit_file(size_t limit) {
- struct rlimit rl;
-
- int res = getrlimit(RLIMIT_NOFILE, &rl);
- VERIFY_MSG(res == 0,
- "getrlimit(2) call failed, errno = " << errno);
-
- // We cannot go beyond the hard limit, and we might not have enough
- // privileges to increase it
- limit = std::max<size_t>(limit, rl.rlim_cur);
- rl.rlim_cur = std::min<size_t>(limit, rl.rlim_max);
- res = setrlimit(RLIMIT_NOFILE, &rl);
- VERIFY_MSG(res == 0,
- "setrlimit(2) call failed, errno = " << errno);
- INFO("Open file limit set to " << rl.rlim_cur);
-
- return rl.rlim_cur;
-}
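
limit_file() raises the soft RLIMIT_NOFILE towards the requested value while clamping it to the hard limit. A standalone sketch of the same getrlimit/setrlimit pattern, without the SPAdes VERIFY/INFO macros:

    #include <algorithm>
    #include <cstdio>
    #include <sys/resource.h>

    // Raise the soft open-file limit towards 'limit' without exceeding the hard limit.
    // Returns the soft limit in effect afterwards, or 0 if either call failed.
    static rlim_t raise_open_file_limit(rlim_t limit) {
        struct rlimit rl;
        if (getrlimit(RLIMIT_NOFILE, &rl) != 0)
            return 0;
        rl.rlim_cur = std::min<rlim_t>(std::max<rlim_t>(limit, rl.rlim_cur), rl.rlim_max);
        if (setrlimit(RLIMIT_NOFILE, &rl) != 0)
            return 0;
        return rl.rlim_cur;
    }

    int main() {
        std::printf("open file limit now %llu\n",
                    (unsigned long long) raise_open_file_limit(4096));
        return 0;
    }
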
diff --git a/src/include/func.hpp b/src/include/func.hpp
deleted file mode 100644
index 6420d70..0000000
--- a/src/include/func.hpp
+++ /dev/null
@@ -1,69 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include <functional>
-
-namespace func {
-
-//to use with std::function-s
-template<class T>
-void Compose(T t, std::function<void(T)> f1,
- std::function<void(T)> f2) {
- if (f1)
- f1(t);
- if (f2)
- f2(t);
-}
-
-template<class T>
-std::function<void(T)> Composition(std::function<void(T)> f1,
- std::function<void(T)> f2) {
- return std::bind(func::Compose<T>, std::placeholders::_1, f1, f2);
-}
-
-template<class A, class B>
-class Func {
-public:
- typedef std::function<B(A)> function_t;
-
- virtual B Apply(A a) const = 0;
-
- virtual ~Func() {
- }
-};
-
-template<class T>
-class AndOperator;
-
-template<class T>
-class OrOperator;
-
-template<class T>
-class NotOperator;
-
-template<class T>
-class Predicate: public Func<T, bool> {
-public:
- typedef T checked_type;
-
- bool Apply(T t) const {
- return Check(t);
- }
-
- virtual bool Check(T t) const = 0;
-
- bool operator()(T t) const { return Check(t); }
-
-
- virtual ~Predicate() {
- }
-};
-
-
-}
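
Composition() above just chains two callbacks over the same argument, skipping empty std::functions. An equivalent lambda-based sketch with a small usage example (standard C++ only):

    #include <functional>
    #include <iostream>

    // Build a callback that invokes f1 and then f2 on the same value,
    // ignoring whichever of them is empty.
    template <class T>
    std::function<void(T)> Compose(std::function<void(T)> f1, std::function<void(T)> f2) {
        return [f1, f2](T t) {
            if (f1) f1(t);
            if (f2) f2(t);
        };
    }

    int main() {
        auto log_value  = [](int x) { std::cout << "value: "  << x     << "\n"; };
        auto log_square = [](int x) { std::cout << "square: " << x * x << "\n"; };
        std::function<void(int)> both = Compose<int>(log_value, log_square);
        both(7);  // prints "value: 7" and then "square: 49"
        return 0;
    }
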
diff --git a/src/include/graph_print_utils.hpp b/src/include/graph_print_utils.hpp
deleted file mode 100755
index a10fcec..0000000
--- a/src/include/graph_print_utils.hpp
+++ /dev/null
@@ -1,328 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#ifndef GRAPH_PRINTER_HPP_
-#define GRAPH_PRINTER_HPP_
-
-#include "standard_base.hpp"
-
-namespace gvis {
-
-template<class VertexId>
-struct BaseVertex {
- VertexId id_;
- string label_;
- string href_;
- string fill_color_;
- BaseVertex(VertexId id, string label, string reference, string fill_color) :id_(id), label_(label), href_(reference), fill_color_(fill_color) {
- }
-};
-
-template<class VertexId>
-struct BaseEdge {
- VertexId from;
- VertexId to;
- string label;
- string color;
- BaseEdge(VertexId _from, VertexId _to, string _label, string _color) {
- from = _from;
- to = _to;
- label = _label;
- color = _color;
- }
-};
-
-class StreamRecorder {
-private:
- ostream &os_;
-protected:
- virtual ostream &os() {
- return os_;
- }
-public:
- StreamRecorder(ostream &os) : os_(os) {
- }
-
- virtual ~StreamRecorder() {
- }
-};
-
-template<class Vertex, class Edge>
-class GraphRecorder {
-public:
- virtual void recordVertex(Vertex vertex) = 0;
-
- virtual void recordEdge(Edge edge) = 0;
-
- virtual inline void startGraphRecord(const string &name) = 0;
-
- virtual inline void endGraphRecord() = 0;
-
- virtual ~GraphRecorder(){
- }
-};
-
-template<class VertexId>
-class SingleGraphRecorder : public GraphRecorder<BaseVertex<VertexId>, BaseEdge<VertexId>> {
-protected:
- typedef BaseVertex<VertexId> Vertex;
- typedef BaseEdge<VertexId> Edge;
-};
-
-template<class VertexId>
-class PairedGraphRecorder : public GraphRecorder<pair<BaseVertex<VertexId>, BaseVertex<VertexId>>, BaseEdge<pair<VertexId, VertexId>>> {
-protected:
- typedef pair<BaseVertex<VertexId>, BaseVertex<VertexId>> Vertex;
- typedef BaseEdge<pair<VertexId, VertexId>> Edge;
-};
-
-template<class VertexId>
-class DotGraphRecorder : public StreamRecorder {
-public:
- DotGraphRecorder(ostream &os) : StreamRecorder(os) {
- }
-
-protected:
- template<class vid>
- void recordVertexId(vid id) {
- this->os() << "vertex_" << id;
- }
-
- string IdToStr(VertexId u) {
- stringstream ss;
- ss << u;
- return ss.str();
- }
-
- string constructNodeId(VertexId v) {
- return constructNodePairId(v, v);
- }
-
- inline void recordParameter(ostream &os, const string &name, const string &value) {
- os << name << "=" << "<" << value << "> ";
- }
-
- inline void recordParameter(const string &name, const string &value) {
- recordParameter(this->os(), name, value);
- }
-
- inline void recordParameterInQuotes(ostream &os, const string &name, const string &value) {
- os << name << "=" << "\"" << value << "\" ";
- }
-
- inline void recordParameterInQuotes(const string &name, const string &value) {
- recordParameterInQuotes(this->os(), name, value);
- }
-
- inline double getColorParameter(int l, int r, double perc) {
- return l * perc + r * (1 - perc);
- }
-
- inline string getColor(int currentLength, int approximateLength) {
- currentLength %= approximateLength;
- int points[8][3] = {{0, 0, 1}, {0, 1, 1}, {1, 1, 1}, {0, 1, 0}, {1, 1, 0}, {1, 0, 1}, {0, 0, 1}};
- stringstream ss;
- int bound = approximateLength / 6;
- int num = currentLength / bound;
- double perc = (currentLength % bound) * 1. / bound;
- for(int i = 0; i < 3; i++) {
- ss << getColorParameter(points[num][i], points[num + 1][i], perc);
- if(i != 2)
- ss << ",";
- }
- return ss.str();
- }
-
-};
-
-
-template<class SingleVertexId>
-class DotSingleGraphRecorder: public SingleGraphRecorder<SingleVertexId>, public DotGraphRecorder<SingleVertexId> {
-private:
- typedef BaseVertex<SingleVertexId> Vertex;
- typedef BaseEdge<SingleVertexId> Edge;
-
-public:
- DotSingleGraphRecorder(ostream &os) : DotGraphRecorder<SingleVertexId>(os) {
- }
-
- void recordVertex(Vertex vertex) {
- this->recordVertexId(vertex.id_);
- this->os() << "[";
- this->recordParameterInQuotes("label", vertex.label_);
- this->os() << ",";
- this->recordParameter("style", "filled");
- this->os() << ",";
- this->recordParameter("color", "black");
- this->os() << ",";
- if(vertex.href_ != "") {
- this->recordParameterInQuotes("href", vertex.href_);
- this->os() << ",";
- }
- this->recordParameter("fillcolor", vertex.fill_color_);
- this->os() << "]" << endl;
- }
-
- void recordEdge(Edge edge) {
- this->recordVertexId(edge.from);
- this->os() << "->";
- this->recordVertexId(edge.to);
- this->os() << "[";
- this->recordParameterInQuotes("label", edge.label);
- this->os() << ",";
- this->recordParameter("color", edge.color);
- this->os() << "]" << endl;
- }
-
- inline void startGraphRecord(const string &name) {
- this->os() << "digraph " << name << " {" << endl;
- this->os() << "node" << "[";
- this->recordParameter("fontname", "Courier");
- this->recordParameter("penwidth", "1.8");
- this->os() << "]" << endl;
- }
-
- inline void endGraphRecord() {
- this->os() << "}" << endl;
- }
-};
-
-template<class SingleVertexId>
-class DotPairedGraphRecorder: public PairedGraphRecorder<SingleVertexId>, public DotGraphRecorder<SingleVertexId> {
-private:
- typedef BaseVertex<SingleVertexId> SingleVertex;
- typedef BaseEdge<SingleVertexId> SingleEdge;
- typedef typename PairedGraphRecorder<SingleVertexId>::Vertex Vertex;
- typedef typename PairedGraphRecorder<SingleVertexId>::Edge Edge;
-
-
- string constructNodePairId(SingleVertexId u, SingleVertexId v) {
- stringstream ss;
- string u_str = this->IdToStr(u);
- string v_str = this->IdToStr(v);
- if (u == v)
- ss << u;
- else if (u_str > v_str)
- ss << v_str << "_" << u_str;
- else
- ss << u_str << "_" << v_str;
- return ss.str();
- }
-
- inline string constructPortCell(const string &port, string href, const string &color) {
- stringstream ss;
- ss << "<TD BORDER=\"0\" PORT = \"port_" << port << "\" ";
- this->recordParameterInQuotes(ss, "color", color);
- this->recordParameterInQuotes(ss, "bgcolor", color);
- if(href != "") {
- ss <<"href=\"" << href << "\"";
- }
- ss << "></TD>";
- return ss.str();
- }
-
- inline string constructLabelCell(const string &label, const string &href, const string &color) {
- stringstream ss;
- ss << "<TD BORDER=\"0\" ";
- this->recordParameterInQuotes(ss, "color", color);
- this->recordParameterInQuotes(ss, "bgcolor", color);
- if(href != "") {
- ss <<"href=\"" << href << "\"";
- }
- ss << ">"
- << label << "</TD>";
- return ss.str();
- }
-
- string constructComplexNodeId(string pairId, SingleVertexId v) {
- stringstream ss;
- ss << pairId << ":port_" << v;
- return ss.str();
- }
-
- string constructTableEntry(SingleVertex v/*, const string &label, const string &href*/) {
- stringstream ss;
- ss << "<TR>";
- ss << constructPortCell(ToString(v.id_) + "_in", v.href_, v.fill_color_);
- ss << constructLabelCell(v.label_, v.href_, v.fill_color_);
- ss << constructPortCell(ToString(v.id_) + "_out", v.href_, v.fill_color_);
- ss << "</TR>\n";
- return ss.str();
- }
-
- string constructReverseTableEntry(SingleVertex v/*, const string &label, const string &href*/) {
- stringstream ss;
- ss << "<TR>";
- ss << constructPortCell(ToString(v.id_) + "_out", v.href_, v.fill_color_);
- ss << constructLabelCell(v.label_, v.href_, v.fill_color_);
- ss << constructPortCell(ToString(v.id_) + "_in", v.href_, v.fill_color_);
- ss << "</TR>\n";
- return ss.str();
- }
-
- string constructComplexNodeLabel(Vertex v) {
- return "<TABLE BORDER=\"1\" CELLSPACING=\"0\" >\n" + constructTableEntry(v.first)
- + constructReverseTableEntry(v.second) + "</TABLE>";
- }
-
- string constructVertexInPairId(SingleVertexId v, SingleVertexId rc) {
- return constructComplexNodeId(constructNodePairId(v, rc), v);
- }
-
-
-public:
- DotPairedGraphRecorder(ostream &os) : DotGraphRecorder<SingleVertexId>(os) {
- }
-
- void recordPairedVertexId(SingleVertexId id1, SingleVertexId id2) {
- this->os() << "vertex_" << constructNodePairId(id1, id2);
- }
-
- void recordVertex(Vertex vertex) {
- string pairLabel = constructComplexNodeLabel(vertex);
- recordPairedVertexId(vertex.first.id_, vertex.second.id_);
- this->os() << "[";
- this->recordParameter("label", pairLabel);
- this->os() << ",";
- this->recordParameter("color", "black");
- this->os() << ",";
- this->recordParameter("URL", "/vertex/" + std::to_string(vertex.first.id_) + ".svg");
- this->os() << "]" << endl;
- }
-
- void recordEdge(Edge edge) {
- this->recordVertexId(constructVertexInPairId(edge.from.first, edge.from.second));
- this->os() << "_out";
- this->os() << "->";
- this->recordVertexId(constructVertexInPairId(edge.to.first, edge.to.second));
- this->os() << "_in";
- this->os() << "[";
- this->recordParameterInQuotes("label", edge.label);
- this->os() << ",";
- this->recordParameter("color", edge.color);
- this->os() << "]" << endl;
- }
-
- inline void startGraphRecord(const string &name) {
- this->os() << "digraph " << name << " {" << endl;
- this->os() << "node" << "[";
- this->recordParameter("fontname", "Courier");
- this->os() << ",";
- this->recordParameter("penwidth", "1.8");
- this->os() << ",";
- this->recordParameter("shape", "plaintext");
- this->os() << "]" << endl;
- }
-
- inline void endGraphRecord() {
- this->os() << "}" << endl;
- }
-};
-
-
-}
-#endif //GRAPH_PRINTER_HPP_//
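
DotSingleGraphRecorder above emits a plain Graphviz digraph through its start/record/end calls. Assuming this header (and its standard_base.hpp dependency) is on the include path, a usage sketch that writes a two-node graph to stdout could look like this:

    #include <iostream>
    #include "graph_print_utils.hpp"   // path within this source tree; adjust as needed

    int main() {
        gvis::DotSingleGraphRecorder<size_t> rec(std::cout);
        rec.startGraphRecord("toy");
        // BaseVertex(id, label, href, fill_color); an empty href means "no link".
        rec.recordVertex(gvis::BaseVertex<size_t>(1, "start", "", "white"));
        rec.recordVertex(gvis::BaseVertex<size_t>(2, "end", "", "grey"));
        // BaseEdge(from, to, label, color).
        rec.recordEdge(gvis::BaseEdge<size_t>(1, 2, "len=42", "black"));
        rec.endGraphRecord();   // closes the digraph block
        return 0;
    }
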
diff --git a/src/include/io/bam_parser.hpp b/src/include/io/bam_parser.hpp
deleted file mode 100644
index 7f81bb4..0000000
--- a/src/include/io/bam_parser.hpp
+++ /dev/null
@@ -1,67 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#ifndef COMMON_IO_BAMPARSER_HPP
-#define COMMON_IO_BAMPARSER_HPP
-
-#include "io/single_read.hpp"
-#include "io/parser.hpp"
-#include "sequence/quality.hpp"
-#include "sequence/nucl.hpp"
-#include "verify.hpp"
-
-#include "bamtools/api/BamReader.h"
-
-#include <string>
-
-namespace io {
-
-class BAMParser: public Parser {
-public:
- BAMParser(const std::string& filename, OffsetType offset_type = PhredOffset)
- : Parser(filename, offset_type) {
- open();
- }
-
- ~BAMParser() {
- close();
- }
-
- BAMParser& operator>>(SingleRead& read) {
- if (!is_open_ || eof_)
- return *this;
-
- read = SingleRead(seq_.Name, seq_.QueryBases, seq_.Qualities, offset_type_);
- eof_ = (false == reader_.GetNextAlignment(seq_));
-
- return *this;
- }
-
- void close() {
- reader_.Close();
- is_open_ = false;
- eof_ = true;
- }
-
-private:
- BamTools::BamReader reader_;
- BamTools::BamAlignment seq_;
-
- void open() {
- reader_.Open(filename_);
- is_open_ = true;
-
- eof_ = (false == reader_.GetNextAlignment(seq_));
- }
-
- BAMParser(const BAMParser& parser);
- void operator=(const BAMParser& parser);
-};
-
-}
-
-#endif /* COMMON_IO_BAMPARSER_HPP */
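
BAMParser is a thin adapter over the BamTools read loop visible in open() and operator>>. Stripped of the SPAdes stream interface, the underlying pattern is just (file name hypothetical):

    #include <iostream>
    #include "bamtools/api/BamReader.h"

    int main() {
        BamTools::BamReader reader;
        if (!reader.Open("reads.bam"))          // hypothetical input file
            return 1;
        BamTools::BamAlignment aln;
        while (reader.GetNextAlignment(aln)) {
            // Name, QueryBases and Qualities are the fields BAMParser copies into a SingleRead.
            std::cout << aln.Name << "\t" << aln.QueryBases.size() << " bp\n";
        }
        reader.Close();
        return 0;
    }
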
diff --git a/src/include/io/bam_reader.hpp b/src/include/io/bam_reader.hpp
deleted file mode 100644
index a56427c..0000000
--- a/src/include/io/bam_reader.hpp
+++ /dev/null
@@ -1,105 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-//todo rename to reader
-#pragma once
-
-#include "ireader.hpp"
-#include "single_read.hpp"
-
-#include <bamtools/api/BamReader.h>
-
-namespace io {
-class BamRead : public BamTools::BamAlignment {
- public:
- BamRead() {}
-
- BamRead(const BamTools::BamAlignment &other)
- : BamTools::BamAlignment(other) {}
-
- const std::string& name() const {
- return Name;
- }
-
- size_t size() const {
- return Length;
- }
-
- size_t nucl_count() const {
- return size();
- }
-
- const std::string& GetSequenceString() const {
- return QueryBases;
- }
-
- std::string GetPhredQualityString() const {
- return Qualities;
- }
-
- operator io::SingleRead() {
- // not including quality is intentional:
- // during read correction bases might be inserted/deleted,
- // and base qualities for them are not calculated
- return io::SingleRead(name(), GetSequenceString());
- }
-
- char operator[](size_t i) const {
- VERIFY(is_nucl(QueryBases[i]));
- return dignucl(QueryBases[i]);
- }
-};
-
-class UnmappedBamStream: public ReadStream<BamRead> {
- public:
- UnmappedBamStream(const std::string &filename)
- : filename_(filename) {
- open();
- }
-
- virtual ~UnmappedBamStream() {}
-
- bool is_open() { return is_open_; }
- bool eof() { return eof_; }
- UnmappedBamStream& operator>>(BamRead& read) {
- if (!is_open_ || eof_)
- return *this;
-
- read = seq_;
- eof_ = (false == reader_.GetNextAlignment(seq_));
-
- return *this;
- }
-
- void close() {
- reader_.Close();
- is_open_ = false;
- eof_ = true;
- }
-
- void reset() {
- close();
- open();
- }
-
- ReadStreamStat get_stat() const { return ReadStreamStat(); }
-
- private:
- BamTools::BamReader reader_;
- BamTools::BamAlignment seq_;
- std::string filename_;
- bool is_open_;
- bool eof_;
-
- void open() {
- reader_.Open(filename_);
- is_open_ = true;
-
- eof_ = (false == reader_.GetNextAlignment(seq_));
- }
-
-};
-}
diff --git a/src/include/io/binary_converter.hpp b/src/include/io/binary_converter.hpp
deleted file mode 100644
index 240dcbb..0000000
--- a/src/include/io/binary_converter.hpp
+++ /dev/null
@@ -1,295 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-/*
- * binary_io.hpp
- *
- * Created on: Apr 12, 2012
- * Author: andrey
- */
-
-#ifndef BINARY_IO_HPP_
-#define BINARY_IO_HPP_
-
-#include <fstream>
-
-#include "verify.hpp"
-#include "ireader.hpp"
-#include "single_read.hpp"
-#include "paired_read.hpp"
-#include "library.hpp"
-
-namespace io {
-
-template<class Read>
-class ReadBinaryWriter {
-
-public:
-
- ReadBinaryWriter(LibraryOrientation /*orientation*/ = LibraryOrientation::Undefined) {
- }
-
- bool Write(std::ostream& file, const Read& r) const {
- return r.BinWrite(file);
- }
-};
-
-template<>
-class ReadBinaryWriter<PairedRead> {
-
-private:
-
- bool rc1_;
-
- bool rc2_;
-
-public:
-
- ReadBinaryWriter(LibraryOrientation orientation) {
- switch (orientation) {
- case LibraryOrientation::FF: {
- rc1_ = false;
- rc2_ = false;
- break;
- }
- case LibraryOrientation::RR: {
- rc1_ = true;
- rc2_ = true;
- break;
- }
- case LibraryOrientation::FR: {
- rc1_ = false;
- rc2_ = true;
- break;
- }
- case LibraryOrientation::RF: {
- rc1_ = true;
- rc2_ = false;
- break;
- }
- default: {
- rc1_ = false;
- rc2_ = false;
- break;
- }
- }
-
- }
-
- bool Write(std::ostream& file, const PairedRead& r) const {
- return r.BinWrite(file, rc1_, rc2_);
- }
-};
-
-
-class BinaryWriter {
-
-private:
- const std::string file_name_prefix_;
-
- size_t file_num_;
-
- std::vector<std::ofstream*> file_ds_;
-
- size_t buf_size_;
-
- template<class Read>
- void FlushBuffer(const std::vector<Read>& buffer, const ReadBinaryWriter<Read>& read_writer, std::ostream& file, size_t from, size_t to) {
- for (size_t i = from; i < to; ++i) {
- read_writer.Write(file, buffer[i]);
- }
- }
-
- template<class Read>
- void FlushBuffer(const std::vector<Read>& buffer, const ReadBinaryWriter<Read>& read_writer, std::ostream& file) {
- FlushBuffer(buffer, read_writer, file, 0, buffer.size());
- }
-
- template<class Read>
- ReadStreamStat ToBinary(io::ReadStream<Read>& stream, size_t buf_size,
- LibraryOrientation orientation) {
-
- ReadBinaryWriter<Read> read_writer(orientation);
- size_t buffer_reads = buf_size / (sizeof (Read) * 4);
- size_t reads_to_flush = buffer_reads * file_num_;
-
- std::vector< std::vector<Read> > buf(file_num_, std::vector<Read>(buffer_reads) );
- std::vector< ReadStreamStat > read_stats(file_num_);
- std::vector< size_t > current_buf_sizes(file_num_, 0);
- size_t read_count = 0;
-
- for (size_t i = 0; i < file_num_; ++i) {
- file_ds_[i]->seekp(0);
- read_stats[i].write(*file_ds_[i]);
- }
-
- size_t buf_index;
- while (!stream.eof()) {
- buf_index = read_count % file_num_;
-
- Read& r = buf[buf_index][current_buf_sizes[buf_index]];
- stream >> r;
- read_stats[buf_index].increase(r);
-
- ++current_buf_sizes[buf_index];
- VERBOSE_POWER(++read_count, " reads processed");
-
- if (read_count % reads_to_flush == 0) {
- for (size_t i = 0; i < file_num_; ++i) {
- FlushBuffer(buf[i], read_writer, *file_ds_[i]);
- current_buf_sizes[i] = 0;
- }
- }
- }
-
- ReadStreamStat result;
- for (size_t i = 0; i < file_num_; ++i) {
- buf[i].resize(current_buf_sizes[i]);
- FlushBuffer(buf[i], read_writer, *file_ds_[i]);
-
- file_ds_[i]->seekp(0);
- read_stats[i].write(*file_ds_[i]);
- result.merge(read_stats[i]);
- }
-
- INFO(read_count << " reads written");
- return result;
- }
-
-
- template<class Read>
- ReadStreamStat ToBinaryForThread(io::ReadStream<Read>& stream, size_t buf_size,
- size_t thread_num, LibraryOrientation orientation) {
-
- ReadBinaryWriter<Read> read_writer(orientation);
- size_t buffer_reads = buf_size / (sizeof (Read) * 4);
- std::vector<Read> buf(buffer_reads);
-
- ReadStreamStat stat;
- file_ds_[thread_num]->seekp(0);
- stat.write(*file_ds_[thread_num]);
-
- size_t current = 0;
-
- while (!stream.eof()) {
- Read& r = buf[current];
- stream >> r;
- stat.increase(r);
- ++current;
-
- if (stat.read_count_ % buffer_reads == 0) {
- FlushBuffer(buf, read_writer, *file_ds_[thread_num]);
- current = 0;
- }
- }
-
- buf.resize(current);
- FlushBuffer(buf, read_writer, *file_ds_[thread_num]);
-
- file_ds_[thread_num]->seekp(0);
- stat.write(*file_ds_[thread_num]);
-
- return stat;
- }
-
-
-public:
-
- BinaryWriter(const std::string& file_name_prefix, size_t file_num,
- size_t buf_size):
- file_name_prefix_(file_name_prefix), file_num_(file_num),
- file_ds_(), buf_size_(buf_size) {
-
- std::string fname;
- for (size_t i = 0; i < file_num_; ++i) {
- fname = file_name_prefix_ + "_" + ToString(i) + ".seq";
- file_ds_.push_back(new std::ofstream(fname, std::ios_base::binary));
- }
- }
-
- ~BinaryWriter() {
- for (size_t i = 0; i < file_num_; ++i) {
- if (file_ds_[i]->is_open()) {
- file_ds_[i]->close();
- }
- delete file_ds_[i];
- }
- }
-
-
- ReadStreamStat ToBinary(io::ReadStream<io::SingleReadSeq>& stream) {
- return ToBinary(stream, buf_size_ / file_num_, LibraryOrientation::Undefined);
- }
-
- ReadStreamStat ToBinary(io::ReadStream<io::SingleRead>& stream) {
- return ToBinary(stream, buf_size_ / file_num_, LibraryOrientation::Undefined);
- }
-
- ReadStreamStat ToBinary(io::ReadStream<io::PairedReadSeq>& stream) {
- return ToBinary(stream, buf_size_ / (2 * file_num_), LibraryOrientation::Undefined);
- }
-
- ReadStreamStat ToBinary(io::ReadStream<io::PairedRead>& stream, LibraryOrientation orientation) {
- return ToBinary(stream, buf_size_ / (2 * file_num_), orientation);
- }
-
- ReadStreamStat ToBinaryForThread(io::ReadStream<io::SingleReadSeq>& stream, size_t thread_num) {
- return ToBinaryForThread(stream, buf_size_ / file_num_, thread_num, LibraryOrientation::Undefined);
- }
-
- ReadStreamStat ToBinaryForThread(io::ReadStream<io::SingleRead>& stream, size_t thread_num) {
- return ToBinaryForThread(stream, buf_size_ / file_num_, thread_num, LibraryOrientation::Undefined);
- }
-
- ReadStreamStat ToBinaryForThread(io::ReadStream<io::PairedReadSeq>& stream, size_t thread_num) {
- return ToBinaryForThread(stream, buf_size_ / (2 * file_num_), thread_num, LibraryOrientation::Undefined);
- }
-
- ReadStreamStat ToBinaryForThread(io::ReadStream<io::PairedRead>& stream, size_t thread_num, LibraryOrientation orientation) {
- return ToBinaryForThread(stream, buf_size_ / (2 * file_num_), thread_num, orientation);
- }
-
-// template<class Read>
-// void WriteReads(std::vector<Read>& data) {
-// size_t chunk_size = data.size() / file_num_;
-// size_t last_chunk_size = chunk_size + data.size() % file_num_;
-//
-// for (size_t i = 0; i < file_num_ - 1; ++i) {
-// file_ds_[i]->write((const char *) &chunk_size, sizeof(chunk_size));
-// }
-// file_ds_.back()->write((const char *) &last_chunk_size, sizeof(last_chunk_size));
-//
-// size_t start_pos = 0;
-// for (size_t i = 0; i < file_num_ - 1; ++i, start_pos += chunk_size) {
-// FlushBuffer(data, *file_ds_[i], start_pos, start_pos + chunk_size);
-// }
-// FlushBuffer(data, file_ds_.back(), start_pos, data.size());
-// }
-//
-// template<class Read>
-// void WriteSeparatedReads(std::vector< std::vector<Read> >& data) {
-// if (data.size() != file_num_) {
-// WARN("Cannot write reads, number of vectors is not equal to thread number");
-// return;
-// }
-//
-// for (size_t i = 0; i < file_num_; ++i) {
-// size_t size = data[i].size();
-// file_ds_[i]->write((const char *) &size, sizeof(size));
-// }
-//
-// for (size_t i = 0; i < file_num_; ++i) {
-// FlushBuffer(data[i], *file_ds_[i]);
-// }
-// }
-};
-
-
-}
-
-
-#endif /* BINARY_IO_HPP_ */
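
BinaryWriter round-robins reads into file_num files named "<prefix>_<i>.seq", each headed by a ReadStreamStat that is rewritten once the real counts are known. A usage sketch, assuming FileReadStream (defined further down in this header set) as the input and hypothetical paths:

    #include "io/binary_converter.hpp"
    #include "io/file_reader.hpp"

    int main() {
        // Split reads.fastq into 4 binary chunks, using a ~1 GB conversion buffer
        // shared across the chunks. The output directory must already exist.
        io::FileReadStream reads("reads.fastq");           // hypothetical input
        io::BinaryWriter writer("out/reads", 4, 1 << 30);  // prefix, file_num, buf_size
        io::ReadStreamStat stat = writer.ToBinary(reads);
        (void) stat;  // stat.read_count_ describes what was written
        return 0;
    }
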
diff --git a/src/include/io/binary_streams.hpp b/src/include/io/binary_streams.hpp
deleted file mode 100644
index ab801c1..0000000
--- a/src/include/io/binary_streams.hpp
+++ /dev/null
@@ -1,357 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include <fstream>
-
-#include "verify.hpp"
-#include "ireader.hpp"
-#include "single_read.hpp"
-#include "paired_read.hpp"
-
-namespace io {
-
-// == Deprecated classes ==
-// Use FileReadStream and InsertSizeModifyingWrapper instead
-
-class BinaryFileSingleStream: public PredictableReadStream<SingleReadSeq> {
-private:
- std::ifstream stream_;
- ReadStreamStat read_stat_;
- size_t current_;
-
-public:
-
- BinaryFileSingleStream(const std::string& file_name_prefix, size_t file_num) {
- std::string fname;
- fname = file_name_prefix + "_" + ToString(file_num) + ".seq";
- stream_.open(fname.c_str(), std::ios_base::binary | std::ios_base::in);
-
- reset();
- }
-
- virtual bool is_open() {
- return stream_.is_open();
- }
-
- virtual bool eof() {
- return current_ == read_stat_.read_count_;
- }
-
- virtual BinaryFileSingleStream& operator>>(SingleReadSeq& read) {
- read.BinRead(stream_);
- VERIFY(current_ < read_stat_.read_count_);
-
- ++current_;
- return *this;
- }
-
- virtual void close() {
- current_ = 0;
- stream_.close();
- }
-
- virtual void reset() {
- stream_.clear();
- stream_.seekg(0);
- VERIFY(stream_.good());
- read_stat_.read(stream_);
- current_ = 0;
- }
-
- virtual size_t size() const {
- return read_stat_.read_count_;
- }
-
- virtual ReadStreamStat get_stat() const {
- return read_stat_;
- }
-
-};
-
-class BinaryFilePairedStream: public PredictableReadStream<PairedReadSeq> {
-
-private:
- std::ifstream stream_;
-
- size_t insert_size_;
-
- ReadStreamStat read_stat_;
-
- size_t current_;
-
-
-public:
-
- BinaryFilePairedStream(const std::string& file_name_prefix, size_t file_num, size_t insert_size): stream_(), insert_size_(insert_size) {
- std::string fname;
- fname = file_name_prefix + "_" + ToString(file_num) + ".seq";
- stream_.open(fname.c_str(), std::ios_base::binary | std::ios_base::in);
-
- reset();
- }
-
- virtual bool is_open() {
- return stream_.is_open();
- }
-
- virtual bool eof() {
- return current_ >= read_stat_.read_count_;
- }
-
- virtual BinaryFilePairedStream& operator>>(PairedReadSeq& read) {
- read.BinRead(stream_, insert_size_);
- VERIFY(current_ < read_stat_.read_count_);
-
- ++current_;
- return *this;
- }
-
- virtual void close() {
- current_ = 0;
- stream_.close();
- }
-
-
- virtual void reset() {
- stream_.clear();
- stream_.seekg(0);
- VERIFY(stream_.good());
- read_stat_.read(stream_);
- current_ = 0;
- }
-
- virtual size_t size() const {
- return read_stat_.read_count_;
- }
-
- ReadStreamStat get_stat() const {
- ReadStreamStat stat = read_stat_;
- stat.read_count_ *= 2;
- return stat;
- }
-};
-
-
-//template <class Read>
-//class FileReadStream: public io::PredictableIReader<Read> {
-//
-//private:
-// std::ifstream stream_;
-//
-// ReadStat read_stat_;
-//
-// size_t current_;
-//
-//public:
-//
-// FileReadStream(const std::string& file_name_prefix, size_t file_num) {
-// std::string fname;
-// fname = file_name_prefix + "_" + ToString(file_num) + ".seq";
-// stream_.open(fname.c_str(), std::ios_base::binary | std::ios_base::in);
-//
-// reset();
-// }
-//
-// virtual ~FileReadStream() {
-// if (stream_.is_open()) {
-// stream_.close();
-// }
-// }
-//
-// virtual bool is_open() {
-// return stream_.is_open();
-// }
-//
-// virtual bool eof() {
-// return current_ == read_stat_.read_count_;
-// }
-//
-// virtual FileReadStream& operator>>(Read& read) {
-// read.BinRead(stream_);
-// VERIFY(current_ < read_stat_.read_count_);
-//
-// ++current_;
-// return *this;
-// }
-//
-// virtual void close() {
-// current_ = 0;
-// stream_.close();
-// }
-//
-// virtual void reset() {
-// stream_.clear();
-// stream_.seekg(0);
-// VERIFY(stream_.good());
-// read_stat_.read(stream_);
-// current_ = 0;
-// }
-//
-// virtual size_t size() const {
-// return read_stat_.read_count_;
-// }
-//
-// virtual ReadStat get_stat() const {
-// return read_stat_;
-// }
-//};
-
-//template <class Read>
-//class ReadBufferedStream: public io::PredictableIReader<Read> {
-//
-//private:
-// std::vector<Read> * data_;
-//
-// ReadStat read_stat_;
-//
-// size_t current_;
-//
-//public:
-//
-// ReadBufferedStream(io::PredictableIReader<Read>& stream) {
-// read_stat_ = stream.get_stat();
-// data_ = new std::vector<Read>(read_stat_.read_count_);
-//
-// size_t i = 0;
-// while (!stream.eof()) {
-// stream >> (*data_)[i++];
-// }
-//
-// reset();
-// }
-//
-// virtual ~ReadBufferedStream() {
-// delete data_;
-// }
-//
-// virtual bool is_open() {
-// return true;
-// }
-//
-// virtual bool eof() {
-// return current_ == read_stat_.read_count_;
-// }
-//
-// virtual ReadBufferedStream& operator>>(Read& read) {
-// read = (*data_)[current_];
-// VERIFY(current_ < read_stat_.read_count_);
-//
-// ++current_;
-// return *this;
-// }
-//
-// virtual void close() {
-// current_ = 0;
-// }
-//
-// virtual void reset() {
-// current_ = 0;
-// }
-//
-// virtual size_t size() const {
-// return read_stat_.read_count_;
-// }
-//
-// virtual ReadStat get_stat() const {
-// return read_stat_;
-// }
-//};
-
-//class SeqSingleReadStreamWrapper: public Reader<SingleReadSeq> {
-//
-//private:
-// io::IReader<io::PairedReadSeq>& stream_;
-//
-// PairedReadSeq current_read_;
-//
-// bool is_read_;
-//
-//public:
-//
-// SeqSingleReadStreamWrapper(io::IReader<io::PairedReadSeq>& stream): stream_(stream), current_read_(), is_read_(false) {
-// }
-//
-// virtual ~SeqSingleReadStreamWrapper() {}
-//
-// virtual bool is_open() {
-// return stream_.is_open();
-// }
-//
-// virtual bool eof() {
-// return stream_.eof() && !is_read_;
-// }
-//
-// virtual SeqSingleReadStreamWrapper& operator>>(io::SingleReadSeq& read) {
-// if (!is_read_) {
-// stream_ >> current_read_;
-// read = current_read_.first();
-// } else {
-// read = current_read_.second();
-// }
-// is_read_ = !is_read_;
-// return *this;
-// }
-//
-// virtual void close() {
-// stream_.close();
-// }
-//
-// virtual void reset() {
-// stream_.reset();
-// is_read_ = false;
-// }
-//
-// virtual ReadStat get_stat() const {
-// return stream_.get_stat();
-// }
-//};
-
-//class InsertSizeModifyingWrapper: public io::IReader<io::PairedReadSeq> {
-//
-//private:
-// io::IReader<io::PairedReadSeq>& stream_;
-//
-// size_t insert_size_;
-//
-//public:
-//
-// InsertSizeModifyingWrapper(io::IReader<io::PairedReadSeq>& stream, size_t insert_size): stream_(stream), insert_size_(insert_size) {
-// }
-//
-// virtual ~InsertSizeModifyingWrapper() {
-// }
-//
-// virtual bool is_open() {
-// return stream_.is_open();
-// }
-//
-// virtual bool eof() {
-// return stream_.eof();
-// }
-//
-// virtual InsertSizeModifyingWrapper& operator>>(io::PairedReadSeq& read) {
-// stream_ >> read;
-// read.inc_insert_size(insert_size_);
-// return *this;
-// }
-//
-// virtual void close() {
-// stream_.close();
-// }
-//
-// virtual void reset() {
-// stream_.reset();
-// }
-//
-// virtual ReadStat get_stat() const {
-// return stream_.get_stat();
-// }
-//};
-
-}
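
Reading such a chunk back is symmetric: BinaryFileSingleStream opens "<prefix>_<n>.seq", restores the ReadStreamStat header and streams SingleReadSeq records until read_count_ is reached. A sketch using the hypothetical prefix from the previous example:

    #include <iostream>
    #include "io/binary_streams.hpp"

    int main() {
        io::BinaryFileSingleStream stream("out/reads", 0);  // chunk 0 written by BinaryWriter
        io::SingleReadSeq read;
        size_t n = 0;
        while (!stream.eof()) {
            stream >> read;
            ++n;
        }
        std::cout << n << " binary reads re-read\n";
        return 0;
    }
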
diff --git a/src/include/io/careful_filtering_reader_wrapper.hpp b/src/include/io/careful_filtering_reader_wrapper.hpp
deleted file mode 100644
index a182d31..0000000
--- a/src/include/io/careful_filtering_reader_wrapper.hpp
+++ /dev/null
@@ -1,183 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-#pragma once
-//todo rename file
-#include "io/delegating_reader_wrapper.hpp"
-#include "library.hpp"
-
-namespace io {
-
-const size_t none = -1ul;
-
-inline std::pair<size_t, size_t> LongestValidCoords(const SingleRead& r) {
- size_t best_len = 0;
- size_t best_pos = none;
- size_t pos = none;
- std::string seq = r.GetSequenceString();
- for (size_t i = 0; i <= seq.size(); ++i) {
- if (i < seq.size() && is_nucl(seq[i])) {
- if (pos == none) {
- pos = i;
- }
- } else {
- if (pos != none) {
- size_t len = i - pos;
- if (len > best_len) {
- best_len = len;
- best_pos = pos;
- }
- }
- pos = none;
- }
- }
- if (best_len == 0) {
- return std::make_pair(0, 0);
- }
- return std::make_pair(best_pos, best_pos + best_len);
-}
-
-inline SingleRead LongestValid(const SingleRead& r,
- bool /*use_orientation*/ = false,
- LibraryOrientation /*orientation*/ = LibraryOrientation::FR) {
-
- std::pair<size_t, size_t> p = LongestValidCoords(r);
- return r.Substr(p.first, p.second);
-}
-
-inline PairedRead LongestValid(const PairedRead& r,
- bool use_orientation = false,
- LibraryOrientation orientation = LibraryOrientation::FR) {
- std::pair<size_t, size_t> c1 = LongestValidCoords(r.first());
- std::pair<size_t, size_t> c2 = LongestValidCoords(r.second());
- size_t len1 = c1.second - c1.first;
- size_t len2 = c2.second - c2.first;
- if (len1 == 0 || len2 == 0) {
- return PairedRead();
- }
- if (len1 == r.first().size() && len2 == r.second().size()) {
- return r;
- }
-
- size_t is;
- if (!use_orientation) {
- is = r.insert_size() - c1.first - r.second().size() + c2.second;
- }
- else {
- switch (orientation) {
- case LibraryOrientation::FF: {
- is = r.insert_size() - c1.first - r.second().size() + c2.second;
- break;
- }
- case LibraryOrientation::RR: {
- is = r.insert_size() - r.first().size() + c1.second - c2.first;
- break;
- }
- case LibraryOrientation::FR: {
- is = r.insert_size() - c1.first - c2.first;
- break;
- }
- case LibraryOrientation::RF: {
- is = r.insert_size() - r.first().size() + c1.second - r.second().size() + c2.second;
- break;
- }
- default: {
- is = r.insert_size() - c1.first - r.second().size() + c2.second;
- break;
- }
- }
- }
-
- return PairedRead(r.first().Substr(c1.first, c1.second), r.second().Substr(c2.first, c2.second), is);
-}
-
-
-//todo rewrite without eof
-template<typename ReadType>
-class CarefulFilteringWrapper : public DelegatingWrapper<ReadType> {
- typedef DelegatingWrapper<ReadType> base;
-public:
- /*
- * Constructor.
- *
- * @param reader_ptr Pointer to any other read stream to wrap.
- */
- CarefulFilteringWrapper(typename base::ReadStreamPtrT reader_ptr,
- bool use_orientation = false,
- LibraryOrientation orientation = LibraryOrientation::Undefined) :
- base(reader_ptr),
- eof_(false),
- use_orientation_(use_orientation),
- orientation_(orientation) {
- StepForward();
- }
-
- /* virtual */ bool eof() {
- return eof_;
- }
-
- /*
- * Read SingleRead from stream.
- *
- * @param read The SingleRead that will store read data.
- *
- * @return Reference to this stream.
- */
- /* virtual */ CarefulFilteringWrapper& operator>>(ReadType& read) {
- read = next_read_;
- StepForward();
- return *this;
- }
-
- /* virtual */
- void reset() {
- base::reset();
- eof_ = false;
- StepForward();
- }
-
-private:
- bool eof_;
- bool use_orientation_;
- LibraryOrientation orientation_;
- ReadType next_read_;
-
- /*
- * Read next valid read in the stream.
- */
- void StepForward() {
- while (!base::eof()) {
- base::operator >>(next_read_);
- next_read_ = LongestValid(next_read_, use_orientation_, orientation_);
- if (next_read_.IsValid()) {
- return;
- }
- }
- eof_ = true;
- }
-};
-
-template<class ReadType>
-std::shared_ptr<ReadStream<ReadType>> CarefulFilteringWrap(std::shared_ptr<ReadStream<ReadType>> reader_ptr,
- bool use_orientation = false,
- LibraryOrientation orientation = LibraryOrientation::Undefined) {
- //return reader_ptr = make_shared<CarefulFilteringWrapper<ReadType>>(reader_ptr, false, LibraryOrientation::Undefined);
- return std::shared_ptr<CarefulFilteringWrapper<ReadType> >(
- new CarefulFilteringWrapper<ReadType>(reader_ptr, use_orientation, orientation));
-}
-
-template<class ReadType>
-ReadStreamList<ReadType> CarefulFilteringWrap(const ReadStreamList<ReadType>& readers,
- bool use_orientation = false,
- LibraryOrientation orientation = LibraryOrientation::Undefined) {
- ReadStreamList<ReadType> answer;
- for (size_t i = 0; i < readers.size(); ++i) {
- answer.push_back(CarefulFilteringWrap<ReadType>(readers.ptr_at(i), use_orientation, orientation));
- }
- return answer;
-}
-
-}
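
LongestValidCoords() keeps the longest run of valid nucleotides in a read, and CarefulFilteringWrapper then substitutes each read by that run (adjusting the insert size for paired reads). The core scan, rewritten standalone over a plain character string:

    #include <cstdio>
    #include <string>
    #include <utility>

    // Return [begin, end) of the longest run of A/C/G/T characters.
    static std::pair<size_t, size_t> longest_valid(const std::string& seq) {
        auto is_nucl = [](char c) {
            return c == 'A' || c == 'C' || c == 'G' || c == 'T';
        };
        size_t best_pos = 0, best_len = 0, pos = std::string::npos;
        for (size_t i = 0; i <= seq.size(); ++i) {
            if (i < seq.size() && is_nucl(seq[i])) {
                if (pos == std::string::npos) pos = i;          // run starts here
            } else if (pos != std::string::npos) {
                if (i - pos > best_len) { best_len = i - pos; best_pos = pos; }
                pos = std::string::npos;                        // run ends here
            }
        }
        return {best_pos, best_pos + best_len};
    }

    int main() {
        auto p = longest_valid("ACGTNNNACGTACGTNAC");
        std::printf("keep [%zu, %zu)\n", p.first, p.second);   // prints "keep [7, 15)"
        return 0;
    }
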
diff --git a/src/include/io/converting_reader_wrapper.hpp b/src/include/io/converting_reader_wrapper.hpp
deleted file mode 100644
index 5fea9a2..0000000
--- a/src/include/io/converting_reader_wrapper.hpp
+++ /dev/null
@@ -1,120 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include "read_stream_vector.hpp"
-
-namespace io {
-
-/**
- * SquashingWrapper is the class-wrapper that reads SingleReads
- * from Reader<PairedRead> (first and second single reads in a pair
- * one by one).
- */
-template <class PairedReadType>
-class SquashingWrapper : public ReadStream<typename PairedReadType::SingleReadT> {
- typedef typename PairedReadType::SingleReadT SingleReadT;
- typedef std::shared_ptr<ReadStream<PairedReadType>> PairedReaderPtrT;
- public:
-
- explicit SquashingWrapper(PairedReaderPtrT reader)
- : reader_(reader), pairedread_(), index_(0) {
- }
-
- /*
- * Check whether the stream is opened.
- *
- * @return true if the stream is opened and false otherwise.
- */
- /* virtual */ bool is_open() {
- return reader_->is_open();
- }
-
- /*
- * Check whether we've reached the end of stream.
- *
- * @return true if the end of the stream is reached and false
- * otherwise.
- */
- /* virtual */ bool eof() {
- return (index_ == 0) && (reader_->eof());
- }
-
- /*
- * Read SingleRead from stream (which is actually the part of
- * PairedRead from stream).
- *
- * @param singleread The SingleRead that will store read data.
- *
- * @return Reference to this stream.
- */
- /* virtual */ SquashingWrapper& operator>>(
- SingleReadT& singleread) {
- if (index_ == 0) {
- (*reader_) >> pairedread_;
- }
- singleread = pairedread_[index_];
- index_ = 1 - index_;
- return (*this);
- }
-
- /*
- * Close the stream.
- */
- /* virtual */ void close() {
- reader_->close();
- }
-
- /*
- * Close the stream and open it again.
- */
- /* virtual */ void reset() {
- index_ = 0;
- reader_->reset();
- }
-
- ReadStreamStat get_stat() const {
- return reader_->get_stat();
- }
-
- private:
- /*
- * @variable Internal stream reader.
- */
- PairedReaderPtrT reader_;
- /*
- * @variable Element that stores the last read PairedRead from
- * stream.
- */
- PairedReadType pairedread_;
- /*
- * @variable Index of current part of PairedRead.
- */
- size_t index_;
-
-};
-
-template<class PairedReadType>
-std::shared_ptr<ReadStream<typename PairedReadType::SingleReadT>> SquashingWrap(std::shared_ptr<ReadStream<PairedReadType>> reader_ptr) {
- return std::make_shared<SquashingWrapper<PairedReadType>>(reader_ptr);
-}
-
-template<class PairedReadType>
-ReadStreamList<typename PairedReadType::SingleReadT> SquashingWrap(ReadStreamList<PairedReadType>& readers) {
- ReadStreamList<typename PairedReadType::SingleReadT> answer;
- for (size_t i = 0; i < readers.size(); ++i) {
- answer.push_back(SquashingWrap<PairedReadType>(readers.ptr_at(i)));
- }
- return answer;
-}
-
-//template<class ReaderPtrType>
-//std::shared_ptr<Reader<typename ReaderPtrType::element_type::ReadT::SingleReadT>> SquashingWrap(ReaderPtrType reader_ptr) {
-// return std::make_shared<SquashingWrapper<typename ReaderPtrType::element_type::ReadT>>(reader_ptr);
-//}
-}
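
SquashingWrap turns one paired stream into a single-read stream that yields the first and then the second mate of every pair. A usage sketch, assuming PairedRead::SingleReadT is SingleRead and using hypothetical file names:

    #include "io/io_helper.hpp"

    int main() {
        // Paired stream over two FASTQ files, then an interleaved single-read view of it.
        io::PairedStreamPtr paired =
                io::PairedEasyStream("left.fastq", "right.fastq",
                                     /*followed_by_rc*/ false, /*insert_size*/ 300);
        auto singles = io::SquashingWrap<io::PairedRead>(paired);
        io::SingleRead r;
        size_t n = 0;
        while (!singles->eof()) {
            (*singles) >> r;
            ++n;   // counts both mates of every pair
        }
        return 0;
    }
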
diff --git a/src/include/io/delegating_reader_wrapper.hpp b/src/include/io/delegating_reader_wrapper.hpp
deleted file mode 100644
index 163e215..0000000
--- a/src/include/io/delegating_reader_wrapper.hpp
+++ /dev/null
@@ -1,64 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include "ireader.hpp"
-
-namespace io {
-
-//todo rename file
-template<typename ReadType>
-class DelegatingWrapper: public ReadStream<ReadType> {
-public:
- typedef std::shared_ptr<ReadStream<ReadType>> ReadStreamPtrT;
-
- explicit DelegatingWrapper(ReadStreamPtrT reader) : reader_(reader) {}
-
-
- /* virtual */ bool is_open() {
- return reader_->is_open();
- }
-
- /* virtual */ bool eof() {
- return reader_->eof();
- }
-
- /* virtual */ DelegatingWrapper& operator>>(ReadType& read) {
- (*reader_) >> read;
- return *this;
- }
-
- /* virtual */
- void close() {
- reader_->close();
- }
-
- /*
- * Close the stream and open it again.
- */
- /* virtual */
- void reset() {
- reader_->reset();
- }
-
- /* virtual */
- ReadStreamStat get_stat() const {
- return reader_->get_stat();
- }
-
-protected:
- ReadStream<ReadType>& reader() {
- return *reader_;
- }
-
-private:
- ReadStreamPtrT reader_;
-
-};
-
-}
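
DelegatingWrapper is the base for the stream decorators in this directory: derive from it, let it forward everything, and override only what changes. A minimal derived wrapper that counts the reads passing through (hypothetical, not part of SPAdes) could look like:

    #include "io/delegating_reader_wrapper.hpp"

    namespace io {

    // Pass-through wrapper that counts how many reads were pulled through it.
    template <typename ReadType>
    class CountingWrapper : public DelegatingWrapper<ReadType> {
        typedef DelegatingWrapper<ReadType> base;
    public:
        explicit CountingWrapper(typename base::ReadStreamPtrT reader)
                : base(reader), count_(0) {}

        CountingWrapper& operator>>(ReadType& read) {
            base::operator>>(read);
            ++count_;
            return *this;
        }

        size_t count() const { return count_; }

    private:
        size_t count_;
    };

    }

Wrapping an existing stream (for example a FileReadStream) then behaves exactly like the original stream while also exposing count().
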
diff --git a/src/include/io/easy_reader.hpp b/src/include/io/easy_reader.hpp
deleted file mode 100644
index 87508a9..0000000
--- a/src/include/io/easy_reader.hpp
+++ /dev/null
@@ -1,122 +0,0 @@
-////***************************************************************************
-////* Copyright (c) 2011-2014 Saint-Petersburg Academic University
-////* All Rights Reserved
-////* See file LICENSE for details.
-////****************************************************************************
-//
-//#pragma once
-//
-//#include "ireader.hpp"
-//#include "paired_readers.hpp"
-//#include "delegating_reader_wrapper.hpp"
-//#include "splitting_wrapper.hpp"
-//#include "rc_reader_wrapper.hpp"
-//#include "filtering_reader_wrapper.hpp"
-//#include "careful_filtering_reader_wrapper.hpp"
-//#include "single_read.hpp"
-//#include "io_helper.hpp"
-//
-//#include <memory>
-//
-//namespace io {
-//
-//////todo refactor, and maybe merge them once again
-////class EasyReader: public DelegatingReaderWrapper<SingleRead> {
-//// explicit EasyReader(const EasyReader& reader);
-//// void operator=(const EasyReader& reader);
-////
-//// Reader raw_reader_;
-////// FilteringReaderWrapper<ReadType> filtered_reader_;
-//// CarefulFilteringReaderWrapper<SingleRead> filtered_reader_;
-//// RCReaderWrapper<SingleRead> rc_reader_;
-////
-////public:
-//// explicit EasyReader(const string& filename,
-//// bool followed_by_rc, OffsetType offset_type = PhredOffset) :
-//// raw_reader_(filename, offset_type), filtered_reader_(raw_reader_), rc_reader_(
-//// filtered_reader_) {
-//// if (followed_by_rc) {
-//// Init(rc_reader_);
-//// } else {
-//// Init(filtered_reader_);
-//// }
-//// }
-////
-//// /*
-//// * Default destructor.
-//// */
-//// /* virtual */
-//// ~EasyReader() {
-//// }
-////
-////};
-////
-//////todo refactor, and maybe merge them once again
-////class EasySplittingReader: public DelegatingReaderWrapper<io::SingleRead> {
-//// explicit EasySplittingReader(const EasySplittingReader& reader);
-//// void operator=(const EasySplittingReader& reader);
-////
-//// Reader raw_reader_;
-////// FilteringReaderWrapper<ReadType> filtered_reader_;
-//// SplittingWrapper splitting_reader_;
-//// RCReaderWrapper<io::SingleRead> rc_reader_;
-////
-////public:
-//// explicit EasySplittingReader(const io::SingleRead::FilenameType& filename,
-//// bool followed_by_rc, OffsetType offset_type = PhredOffset) :
-//// raw_reader_(filename, offset_type), splitting_reader_(raw_reader_), rc_reader_(
-//// splitting_reader_) {
-//// if (followed_by_rc) {
-//// Init(rc_reader_);
-//// } else {
-//// Init(splitting_reader_);
-//// }
-//// }
-////
-//// /*
-//// * Default destructor.
-//// */
-//// /* virtual */
-//// ~EasySplittingReader() {
-//// }
-////
-////};
-//
-////class PairedEasyReader: public DelegatingReaderWrapper<io::PairedRead> {
-//// std::unique_ptr<IReader<io::PairedRead>> raw_reader_;
-//// CarefulFilteringReaderWrapper<io::PairedRead> filtered_reader_;
-//// RCReaderWrapper<io::PairedRead> rc_reader_;
-////
-////public:
-//// PairedEasyReader(const io::PairedRead::FilenamesType& filenames,
-//// bool followed_by_rc, size_t insert_size, bool change_read_order =
-//// false, bool use_orientation = true, LibraryOrientation orientation = LibraryOrientation::FR,
-//// OffsetType offset_type = PhredOffset) :
-//// raw_reader_(
-//// new SeparateReader(filenames, insert_size,
-//// change_read_order, use_orientation, orientation, offset_type)), filtered_reader_(
-//// *raw_reader_), rc_reader_(filtered_reader_) {
-//// if (followed_by_rc) {
-//// Init(rc_reader_);
-//// } else {
-//// Init(filtered_reader_);
-//// }
-//// }
-////
-//// PairedEasyReader(const std::string& filename, bool followed_by_rc,
-//// size_t insert_size, bool change_read_order = false,
-//// bool use_orientation = true, LibraryOrientation orientation = LibraryOrientation::FR,
-//// OffsetType offset_type = PhredOffset) :
-//// raw_reader_(
-//// new MixedReader(filename, insert_size, change_read_order,
-//// use_orientation, orientation, offset_type)), filtered_reader_(
-//// *raw_reader_), rc_reader_(filtered_reader_) {
-//// if (followed_by_rc) {
-//// Init(rc_reader_);
-//// } else {
-//// Init(filtered_reader_);
-//// }
-//// }
-////};
-//
-//}
diff --git a/src/include/io/fasta_fastq_gz_parser.hpp b/src/include/io/fasta_fastq_gz_parser.hpp
deleted file mode 100644
index a7fc8db..0000000
--- a/src/include/io/fasta_fastq_gz_parser.hpp
+++ /dev/null
@@ -1,165 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-/**
- * @file fasta_fastq_gz_parser.hpp
- * @author Mariya Fomkina
- * @version 1.0
- *
- * @section LICENSE
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation; either version 2 of
- * the License, or (at your option) any later version.
- *
- * @section DESCRIPTION
- *
- * FastaFastqGzParser is the parser stream that reads data from .fastq.gz
- * files.
- */
-
-#ifndef COMMON_IO_FASTAFASTQGZPARSER_HPP
-#define COMMON_IO_FASTAFASTQGZPARSER_HPP
-
-#include <zlib.h>
-#include <string>
-#include "kseq/kseq.h"
-#include "verify.hpp"
-#include "io/single_read.hpp"
-#include "io/parser.hpp"
-#include "sequence/quality.hpp"
-#include "sequence/nucl.hpp"
-
-namespace io {
-
-namespace fastafastqgz {
-// STEP 1: declare the type of file handler and the read() function
-KSEQ_INIT(gzFile, gzread)
-}
-
-class FastaFastqGzParser: public Parser {
-public:
- /*
- * Default constructor.
- *
- * @param filename The name of the file to be opened.
- * @param offset The offset of the read quality.
- */
- FastaFastqGzParser(const std::string& filename, OffsetType offset_type =
- PhredOffset) :
- Parser(filename, offset_type), fp_(), seq_(NULL) {
- open();
- }
-
- /*
- * Default destructor.
- */
- /* virtual */
- ~FastaFastqGzParser() {
- close();
- }
-
- /*
- * Read SingleRead from stream.
- *
- * @param read The SingleRead that will store read data.
- *
- * @return Reference to this stream.
- */
- /* virtual */
- FastaFastqGzParser& operator>>(SingleRead& read) {
- if (!is_open_ || eof_) {
- return *this;
- }
- //todo offset_type_ should be used in future
- if (seq_->qual.s) {
- read = SingleRead(seq_->name.s, seq_->seq.s, seq_->qual.s, offset_type_);
- } else {
- read = SingleRead(seq_->name.s, seq_->seq.s);
-// size_t len = strlen(seq_->seq.s);
-// char* qual = (char*) malloc(len + 1);
-// char q = '\2' + 64;
-// for (size_t i = 0; i < len; ++i) {
-// qual[i] = q;
-// }
-// qual[len] = '\0';
-// read.SetAll(seq_->name.s, seq_->seq.s, qual, SolexaOffset);
-// free(qual);
- }
- ReadAhead();
- return *this;
- }
-
- /*
- * Close the stream.
- */
- /* virtual */
- void close() {
- if (is_open_) {
- // STEP 5: destroy seq
- fastafastqgz::kseq_destroy(seq_);
- // STEP 6: close the file handler
- gzclose(fp_);
- is_open_ = false;
- eof_ = true;
- }
- }
-
-private:
- /*
- * @variable File that is associated with gzipped data file.
- */
- gzFile fp_;
- /*
- * @variable Data element that stores last SingleRead got from
- * stream.
- */
- fastafastqgz::kseq_t* seq_;
-
- /*
- * Open a stream.
- */
- /* virtual */
- void open() {
- // STEP 2: open the file handler
- fp_ = gzopen(filename_.c_str(), "r");
- if (!fp_) {
- is_open_ = false;
- return;
- }
- // STEP 3: initialize seq
- seq_ = fastafastqgz::kseq_init(fp_);
- eof_ = false;
- is_open_ = true;
- ReadAhead();
- }
-
- /*
- * Read next SingleRead from file.
- */
- void ReadAhead() {
- VERIFY(is_open_);
- VERIFY(!eof_);
- if (fastafastqgz::kseq_read(seq_) < 0) {
- eof_ = true;
- }
- }
-
- /*
- * Hidden copy constructor.
- */
- FastaFastqGzParser(const FastaFastqGzParser& parser);
- /*
- * Hidden assign operator.
- */
- void operator=(const FastaFastqGzParser& parser);
-};
-
-}
-
-#endif /* COMMON_IO_FASTAFASTQGZPARSER_HPP */
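
FastaFastqGzParser delegates the actual parsing to klib's kseq over a gzFile; the STEP comments above follow the canonical kseq loop. That loop on its own, with a hypothetical input file:

    #include <cstdio>
    #include <zlib.h>
    #include "kseq/kseq.h"       // bundled kseq, path as used in this tree

    KSEQ_INIT(gzFile, gzread)    // instantiate kseq over a gzipped file handle

    int main() {
        gzFile fp = gzopen("reads.fastq.gz", "r");
        if (!fp)
            return 1;
        kseq_t* seq = kseq_init(fp);
        while (kseq_read(seq) >= 0) {
            // qual.s is non-NULL only for FASTQ records, exactly the check used above.
            std::printf("%s\t%zu bp\t%s\n", seq->name.s, seq->seq.l,
                        seq->qual.s ? "with quality" : "no quality");
        }
        kseq_destroy(seq);
        gzclose(fp);
        return 0;
    }
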
diff --git a/src/include/io/file_reader.hpp b/src/include/io/file_reader.hpp
deleted file mode 100644
index 39b980e..0000000
--- a/src/include/io/file_reader.hpp
+++ /dev/null
@@ -1,129 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-/**
-
- * FileReadStream is the basic stream that reads SingleReads from a single
- * file through a Parser object. Its paired counterpart reads data from two
- * input files and builds PairedReads from them using the distance
- * information.
- */
-
-#pragma once
-
-#include "ireader.hpp"
-#include "single_read.hpp"
-#include "parser.hpp"
-#include "path_helper.hpp"
-
-namespace io {
-
-class FileReadStream : public ReadStream<SingleRead> {
- public:
- /*
- * Default constructor.
- *
- * @param filename The name of the file to be opened.
- * @param offset_type The offset type of the read quality
- * (PhredOffset by default).
- */
- explicit FileReadStream(const std::string& filename,
- OffsetType offset_type = PhredOffset)
- : filename_(filename), offset_type_(offset_type), parser_(NULL) {
- path::CheckFileExistenceFATAL(filename_);
- parser_ = SelectParser(filename_, offset_type_);
- }
-
- /*
- * Default destructor.
- */
- /* virtual */ ~FileReadStream() {
- close();
- delete parser_;
- }
-
- /*
- * Check whether the stream is opened.
- *
- * @return true if the stream is opened and false otherwise.
- */
- /* virtual */ bool is_open() {
- if (parser_ != NULL) {
- return parser_->is_open();
- } else {
- return false;
- }
- }
-
- /*
- * Check whether we've reached the end of stream.
- *
- * @return true if the end of stream is reached and false
- * otherwise.
- */
- /* virtual */ bool eof() {
- if (parser_ != NULL) {
- return parser_->eof();
- } else {
- return true;
- }
- }
-
- /*
- * Read SingleRead from stream.
- *
- * @param singleread The SingleRead that will store read data.
- *
- * @return Reference to this stream.
- */
- /* virtual */ FileReadStream& operator>>(SingleRead& singleread) {
- if (parser_ != NULL) {
- (*parser_) >> singleread;
- }
- return *this;
- }
-
- /*
- * Close the stream.
- */
- /* virtual */ void close() {
- if (parser_ != NULL) {
- parser_->close();
- }
- }
-
- /*
- * Close the stream and open it again.
- */
- /* virtual */ void reset() {
- if (parser_ != NULL) {
- parser_->reset();
- }
- }
-
- ReadStreamStat get_stat() const {
- return ReadStreamStat();
- }
-
- private:
- /*
- * @variable The name of the file which stream reads from.
- */
- std::string filename_;
- /*
- * @variable Quality offset type.
- */
- OffsetType offset_type_;
- /*
- * @variable Internal stream that reads from file.
- */
- Parser* parser_;
-
-};
-
-}
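
FileReadStream selects a Parser from the file name and otherwise just forwards to it. Typical use of the interface shown above (input name hypothetical; the constructor aborts if the file does not exist):

    #include <iostream>
    #include "io/file_reader.hpp"

    int main() {
        io::FileReadStream stream("reads.fastq");   // PhredOffset by default
        if (!stream.is_open()) {
            std::cerr << "could not open input\n";
            return 1;
        }
        io::SingleRead read;
        size_t reads = 0, bases = 0;
        while (!stream.eof()) {
            stream >> read;
            ++reads;
            bases += read.size();
        }
        std::cout << reads << " reads, " << bases << " bases\n";
        return 0;
    }
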
diff --git a/src/include/io/filtering_reader_wrapper.hpp b/src/include/io/filtering_reader_wrapper.hpp
deleted file mode 100644
index a2c0c88..0000000
--- a/src/include/io/filtering_reader_wrapper.hpp
+++ /dev/null
@@ -1,148 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-/**
- * @file filtering_reader_wrapper.hpp
- * @author Sergey Nurk
- * @version 1.0
- *
- * @section LICENSE
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation; either version 2 of
- * the License, or (at your option) any later version.
- *
- * @section DESCRIPTION
- *
- * FilteringReaderWrapper is the class-wrapper that gets only valid
- * reads.
- */
-
-#ifndef COMMON_IO_FILTERINGREADERWRAPPER_HPP_
-#define COMMON_IO_FILTERINGREADERWRAPPER_HPP_
-
-#include "io/ireader.hpp"
-
-namespace io {
-
-template<typename ReadType>
-class FilteringReaderWrapper: public IReader<ReadType> {
-public:
- /*
- * Default constructor.
- *
- * @param reader Reference to any other reader (child of IReader).
- */
- explicit FilteringReaderWrapper(IReader<ReadType>& reader) :
- reader_(reader), eof_(false) {
- StepForward();
- }
-
- /*
- * Default destructor.
- */
- /* virtual */ ~FilteringReaderWrapper() {
- close();
- }
-
- /*
- * Check whether the stream is opened.
- *
- * @return true if the stream is opened and false otherwise.
- */
- /* virtual */ bool is_open() {
- return reader_.is_open();
- }
-
- /*
- * Check whether we've reached the end of stream.
- *
- * @return true if the end of stream is reached and false
- * otherwise.
- */
- /* virtual */ bool eof() {
- return eof_;
- }
-
- /*
- * Read SingleRead or PairedRead from stream (according to ReadType).
- *
- * @param read The SingleRead or PairedRead that will store read
- * data.
- *
- * @return Reference to this stream.
- */
- /* virtual */ FilteringReaderWrapper& operator>>(ReadType& read) {
- read = next_read_;
- StepForward();
- return *this;
- }
-
- /*
- * Close the stream.
- */
- /* virtual */
- void close() {
- reader_.close();
- }
-
- /*
- * Close the stream and open it again.
- */
- /* virtual */
- void reset() {
- reader_.reset();
- eof_ = false;
- StepForward();
- }
-
- ReadStat get_stat() const {
- return reader_.get_stat();
- }
-
-private:
- /*
- * @variable Internal stream readers.
- */
- IReader<ReadType>& reader_;
- /*
- * @variable Flag that shows whether the end of stream reached.
- */
- bool eof_;
- /*
- * @variable Next read to be outputted by stream.
- */
- ReadType next_read_;
-
- /*
- * Read next valid read in the stream.
- */
- void StepForward() {
- while (!reader_.eof()) {
- reader_ >> next_read_;
- if (next_read_.IsValid()) {
- return;
- }
- }
- eof_ = true;
- }
-
- /*
- * Hidden copy constructor.
- */
- explicit FilteringReaderWrapper(
- const FilteringReaderWrapper<ReadType>& reader);
- /*
- * Hidden assign operator.
- */
- void operator=(const FilteringReaderWrapper<ReadType>& reader);
-};
-
-}
-
-#endif /* COMMON_IO_FILTERINGREADERWRAPPER_HPP_ */
diff --git a/src/include/io/io_helper.hpp b/src/include/io/io_helper.hpp
deleted file mode 100644
index 687a79a..0000000
--- a/src/include/io/io_helper.hpp
+++ /dev/null
@@ -1,118 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include "read_stream_vector.hpp"
-#include "single_read.hpp"
-#include "paired_read.hpp"
-#include "file_reader.hpp"
-#include "paired_readers.hpp"
-#include "binary_streams.hpp"
-#include "multifile_reader.hpp"
-#include "converting_reader_wrapper.hpp"
-#include "careful_filtering_reader_wrapper.hpp"
-#include "rc_reader_wrapper.hpp"
-
-namespace io {
- typedef ReadStream<SingleRead> SingleStream;
- typedef std::shared_ptr<SingleStream> SingleStreamPtr;
- typedef ReadStreamList<SingleRead> SingleStreams;
-
- typedef ReadStream<PairedRead> PairedStream;
- typedef std::shared_ptr<PairedStream> PairedStreamPtr;
- typedef ReadStreamList<PairedRead> PairedStreams;
-
- typedef ReadStream<SingleReadSeq> BinarySingleStream;
- typedef std::shared_ptr<BinarySingleStream> BinarySingleStreamPtr;
- typedef ReadStreamList<SingleReadSeq> BinarySingleStreams;
-
- typedef ReadStream<PairedReadSeq> BinaryPairedStream;
- typedef std::shared_ptr<BinaryPairedStream> BinaryPairedStreamPtr;
- typedef ReadStreamList<PairedReadSeq> BinaryPairedStreams;
-
- //old
-// typedef io::IReader<io::SingleReadSeq> SequenceSingleReadStream;
-// typedef io::IReader<io::PairedReadSeq> SequencePairedReadStream;
-// typedef io::MultifileReader<io::PairedRead> MultiPairedStream;
-// typedef io::MultifileReader<io::SingleRead> MultiSingleStream;
-
- inline BinarySingleStreams apply_single_wrappers(bool followed_by_rc,
- BinarySingleStreams& single_readers,
- BinaryPairedStreams* paired_readers = 0) {
- VERIFY(single_readers.size() != 0);
- BinarySingleStreams readers = single_readers;
-
- if (paired_readers != 0) {
- VERIFY(single_readers.size() == paired_readers->size());
- BinarySingleStreams squashed_paired = SquashingWrap<PairedReadSeq>(*paired_readers);
- readers = WrapPairsInMultifiles<SingleReadSeq>(squashed_paired, readers);
- }
-
- if (followed_by_rc) {
- readers = RCWrap<SingleReadSeq>(readers);
- }
- return readers;
- }
-
- //todo make deprecated
- inline BinaryPairedStreams apply_paired_wrappers(bool followed_by_rc,
- BinaryPairedStreams& readers) {
- VERIFY(readers.size() != 0);
- if (followed_by_rc) {
- return RCWrap<PairedReadSeq>(readers);
- } else {
- return readers;
- }
- }
-
- inline SingleStreamPtr EasyStream(const std::string& filename, bool followed_by_rc,
- bool handle_Ns = true, OffsetType offset_type = PhredOffset) {
- SingleStreamPtr reader = make_shared<FileReadStream>(filename, offset_type);
- if (handle_Ns) {
- reader = CarefulFilteringWrap<SingleRead>(reader);
- }
- if (followed_by_rc) {
- reader = RCWrap<SingleRead>(reader);
- }
- return reader;
- }
-
- inline PairedStreamPtr WrapPairedStream(PairedStreamPtr reader,
- bool followed_by_rc,
- bool use_orientation = false,
- LibraryOrientation orientation = LibraryOrientation::Undefined) {
- PairedStreamPtr answer = reader;
- answer = CarefulFilteringWrap<PairedRead>(answer, use_orientation, orientation);
- if (followed_by_rc) {
- answer = RCWrap<PairedRead>(answer);
- }
- return answer;
-
- }
-
- inline PairedStreamPtr PairedEasyStream(const std::string& filename1, const std::string& filename2,
- bool followed_by_rc, size_t insert_size, bool change_read_order = false,
- bool use_orientation = true, LibraryOrientation orientation = LibraryOrientation::FR,
- OffsetType offset_type = PhredOffset) {
- PairedStreamPtr reader = make_shared<SeparatePairedReadStream>(filename1, filename2, insert_size,
- change_read_order, use_orientation,
- orientation, offset_type);
- //Use orientation for IS calculation if it's not done by changer
- return WrapPairedStream(reader, followed_by_rc, !use_orientation, orientation);
- }
-
- inline PairedStreamPtr PairedEasyStream(const std::string& filename, bool followed_by_rc,
- size_t insert_size, bool change_read_order = false,
- bool use_orientation = true, LibraryOrientation orientation = LibraryOrientation::FR,
- OffsetType offset_type = PhredOffset) {
- PairedStreamPtr reader = make_shared<InterleavingPairedReadStream>(filename, insert_size, change_read_order,
- use_orientation, orientation, offset_type);
- //Use orientation for IS calculation if it's not done by changer
- return WrapPairedStream(reader, followed_by_rc, !use_orientation, orientation);
- }
-}
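
A short sketch of how the removed io_helper factories were typically used; the file
names and insert size below are placeholders.

    void StreamExample() {
        // Single-end reads with the reverse-complement stream appended.
        io::SingleStreamPtr single = io::EasyStream("reads.fastq", /*followed_by_rc=*/true);
        io::SingleRead sr;
        while (!single->eof())
            *single >> sr;       // process each read

        // Paired-end library, FR orientation, nominal insert size 250.
        io::PairedStreamPtr paired = io::PairedEasyStream("left.fastq", "right.fastq",
                                                          /*followed_by_rc=*/false,
                                                          /*insert_size=*/250);
        io::PairedRead pr;
        while (!paired->eof())
            *paired >> pr;       // process each pair
    }
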
diff --git a/src/include/io/ireader.hpp b/src/include/io/ireader.hpp
deleted file mode 100644
index ddb55ad..0000000
--- a/src/include/io/ireader.hpp
+++ /dev/null
@@ -1,116 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-//todo rename to reader
-#pragma once
-
-#include <boost/noncopyable.hpp>
-
-namespace io {
-
-struct ReadStreamStat {
- size_t read_count_;
- size_t max_len_;
- uint64_t total_len_;
-
-
- ReadStreamStat(): read_count_(0), max_len_(0), total_len_(0) { }
-
- void write(std::ostream& stream) const {
- stream.write((const char *) &read_count_, sizeof(read_count_));
- stream.write((const char *) &max_len_, sizeof(max_len_));
- stream.write((const char *) &total_len_, sizeof(total_len_));
- }
-
- void read(std::istream& stream) {
- stream.read((char *) &read_count_, sizeof(read_count_));
- stream.read((char *) &max_len_, sizeof(max_len_));
- stream.read((char *) &total_len_, sizeof(total_len_));
- }
-
- template<class Read>
- void increase(const Read& read) {
- size_t len = read.size();
-
- ++read_count_;
- if (max_len_ < len) {
- max_len_ = len;
- }
- total_len_ += read.nucl_count();
- }
-
- void merge(const ReadStreamStat& stat) {
- read_count_ += stat.read_count_;
- if (max_len_ < stat.max_len_) {
- max_len_ = stat.max_len_;
- }
- total_len_ += stat.total_len_;
- }
-
- bool valid() const {
- return read_count_ != 0;
- }
-
-};
-
-/**
- * ReadStream is the interface for all readers and reader wrappers.
- */
-template<typename ReadType>
-class ReadStream: boost::noncopyable {
- public:
- typedef ReadType ReadT;
-
- /*
- * Default destructor.
- */
- virtual ~ReadStream() {}
-
- /*
- * Check whether the stream is opened.
- *
- * @return true if the stream is opened and false otherwise.
- */
- virtual bool is_open() = 0;
-
- /*
- * Check whether we've reached the end of stream.
- *
- * @return true if the end of the stream is reached and false
- * otherwise.
- */
- virtual bool eof() = 0;
-
- /*
- * Read SingleRead or PairedRead from stream (according to ReadType).
- *
- * @param read The SingleRead or PairedRead that will store read data.
- *
- * @return Reference to this stream.
- */
- virtual ReadStream& operator>>(ReadType& read) = 0;
-
- /*
- * Close the stream.
- */
- virtual void close() = 0;
-
- /*
- * Close the stream and open it again.
- */
- virtual void reset() = 0;
-
- virtual ReadStreamStat get_stat() const = 0;
-
-};
-
-template<class Read>
-class PredictableReadStream: public ReadStream<Read> {
-public:
- virtual size_t size() const = 0;
-};
-
-}
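
For illustration, a trivial implementation of the ReadStream interface above (an
always-empty stream); this class is a sketch and not part of the original sources.

    class EmptyStream : public io::ReadStream<io::SingleRead> {
    public:
        bool is_open() { return true; }
        bool eof() { return true; }
        EmptyStream& operator>>(io::SingleRead& /*read*/) { return *this; }
        void close() {}
        void reset() {}
        io::ReadStreamStat get_stat() const { return io::ReadStreamStat(); }
    };
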
diff --git a/src/include/io/ireadstream.hpp b/src/include/io/ireadstream.hpp
deleted file mode 100644
index 329d35c..0000000
--- a/src/include/io/ireadstream.hpp
+++ /dev/null
@@ -1,168 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-/*
- * ifastqstream.hpp
- *
- * Created on: 03.03.2011
- * Author: vyahhi
- */
-
-#ifndef IREADSTREAM_HPP_
-#define IREADSTREAM_HPP_
-
-#include "kseq/kseq.h"
-#include <zlib.h>
-#include "verify.hpp"
-#include "read.hpp"
-#include "sequence/nucl.hpp"
-
-// STEP 1: declare the type of file handler and the read() function
-KSEQ_INIT(gzFile, gzread)
-
-/*
- * Read name, seq and qual strings from FASTQ data (one by one)
- */
-//fixme deprecated!!! remove usages!
-class ireadstream {
-
-public:
- typedef Read ReadT;
-
- ireadstream(const std::string& filename) : offset_(Read::PHRED_OFFSET) {
- filename_ = filename;
- is_open_ = open(filename);
- }
-
- ireadstream(const std::string& filename, int offset) : offset_(offset) {
- filename_ = filename;
- is_open_ = open(filename);
- }
-
- virtual ~ireadstream() {
- close();
- }
-
- bool is_open() const {
- return is_open_;
- }
-
- bool eof() const {
- return eof_;
- }
-
- static std::vector<Read>* readAll(std::string filename, int cnt = -1) {
- ireadstream irs(filename);
- VERIFY(irs.is_open());
- std::vector<Read>* res = new std::vector<Read>();
- Read r;
- while (cnt-- && irs.is_open() && !irs.eof()) {
- irs >> r;
- if (!r.isValid()) {
- cnt++;
- continue;
- }
- res->push_back(r);
- }
- irs.close();
- return res;
- }
-
- static void readAllNoValidation(std::vector<Read>* res, std::string filename, uint64_t * totalsize, int qvoffset = Read::PHRED_OFFSET, int trim_quality = -1, int cnt = -1) {
- ireadstream irs(filename, qvoffset);
- VERIFY(irs.is_open());
- *totalsize = 0;
- Read r;
- while (cnt-- && irs.is_open() && !irs.eof()) {
- irs >> r;
- size_t read_size = r.trimNsAndBadQuality(trim_quality);
- res->push_back(r);
- *totalsize += read_size;
- }
- irs.close();
- }
-
- ireadstream& operator>>(Read &r) {
- VERIFY(is_open());
- VERIFY(!eof());
- if (!is_open() || eof()) {
- return *this;
- }
- r.setName(seq_->name.s);
- if (seq_->qual.s) {
- r.setQuality(seq_->qual.s, offset_);
- }
- r.setSequence(seq_->seq.s);
- read_ahead(); // make actual read for the next result
- return *this;
- }
-
- void close() {
- if (is_open()) {
- kseq_destroy(seq_); // STEP 5: destroy seq
- gzclose(fp_); // STEP 6: close the file handler
- is_open_ = false;
- }
- }
-
- void reset() {
- close();
- open(filename_);
- }
-
-private:
- std::string filename_;
- gzFile fp_;
- kseq_t* seq_;
- bool is_open_;
- bool eof_;
- int offset_;
- /*
- * Open the file with FASTQ reads;
- * return true if the file was opened, false otherwise.
- */
- bool open(std::string filename) {
- fp_ = gzopen(filename.c_str(), "r"); // STEP 2: open the file handler
- if (!fp_) {
- return false;
- }
- is_open_ = true;
- seq_ = kseq_init(fp_); // STEP 3: initialize seq
- eof_ = false;
- read_ahead();
- return true;
- }
-
- void read_ahead() {
- VERIFY(is_open());
- VERIFY(!eof());
- if (kseq_read(seq_) < 0) {
- eof_ = true;
- }
- }
-};
-
-//return -1 if failed to determine offset
-inline int determine_offset(const std::string& filename) {
- ireadstream stream(filename, 0);
- size_t count = 0;
- Read r;
- while (!stream.eof() && count++ < 10000) {
- stream >> r;
- std::string q_str = r.getQualityString();
- for (size_t i = 0; i < q_str.size(); ++i) {
- int q_val = q_str[i];
- if (q_val < 59)
- return 33;
- if (q_val > 74)
- return 64;
- }
- }
- return -1;
-}
-
-#endif /* IREADSTREAM_HPP_ */
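
A usage sketch for the deprecated ireadstream above; the FASTQ path is a placeholder
and the offset handling mirrors determine_offset's return values.

    void ReadFastqExample() {
        std::string fastq = "reads.fastq.gz";        // plain or gzipped FASTQ (placeholder)
        int offset = determine_offset(fastq);        // 33, 64, or -1 if unknown
        ireadstream stream(fastq, offset == -1 ? Read::PHRED_OFFSET : offset);
        Read r;
        while (stream.is_open() && !stream.eof()) {
            stream >> r;
            // ... process r ...
        }
    }
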
diff --git a/src/include/io/is_corrupting_wrapper.hpp b/src/include/io/is_corrupting_wrapper.hpp
deleted file mode 100644
index b754526..0000000
--- a/src/include/io/is_corrupting_wrapper.hpp
+++ /dev/null
@@ -1,33 +0,0 @@
-////***************************************************************************
-////* Copyright (c) 2011-2014 Saint-Petersburg Academic University
-////* All Rights Reserved
-////* See file LICENSE for details.
-////****************************************************************************
-// todo remove!!!
-//#ifndef IS_CORRUPTING_WRAPPER_HPP_
-//#define IS_CORRUPTING_WRAPPER_HPP_
-//
-//namespace io {
-//
-//class ISCorruptingWrapper: public DelegatingReaderWrapper<PairedRead> {
-//private:
-// const size_t is_;
-//public:
-// typedef PairedRead ReadType;
-//
-// explicit ISCorruptingWrapper(IReader<ReadType>& reader, size_t is) :
-// DelegatingReaderWrapper<PairedRead>(reader), is_(is) {
-// }
-//
-// /* virtual */
-// ISCorruptingWrapper& operator>>(ReadType& read) {
-// (this->reader()) >> read;
-// read = PairedRead(read.first(), read.second(), is_);
-// return *this;
-// }
-//
-//};
-//
-//}
-//
-//#endif /* IS_CORRUPTING_WRAPPER_HPP_ */
diff --git a/src/include/io/kmer_iterator.hpp b/src/include/io/kmer_iterator.hpp
deleted file mode 100644
index 4ece433..0000000
--- a/src/include/io/kmer_iterator.hpp
+++ /dev/null
@@ -1,54 +0,0 @@
-#ifndef __IO_KMER_ITERATOR_HPP__
-#define __IO_KMER_ITERATOR_HPP__
-
-#include "io/mmapped_reader.hpp"
-#include <string>
-
-namespace io {
-
-template<class Seq>
-using raw_kmer_iterator = MMappedFileRecordArrayIterator<typename Seq::DataType>;
-
-template<class Seq>
-raw_kmer_iterator<Seq> make_kmer_iterator(const std::string &FileName,
- unsigned K) {
- return raw_kmer_iterator<Seq>(FileName, Seq::GetDataSize(K));
-}
-
-template<class Seq>
-std::vector<raw_kmer_iterator<Seq>> make_kmer_iterator(const std::string &FileName,
- size_t K, size_t amount) {
- std::vector<raw_kmer_iterator<Seq>> res;
- if (amount == 1) {
- res.emplace_back(FileName, Seq::GetDataSize(K));
- return res;
- }
-
- // Determine the file size
- struct stat buf;
- VERIFY_MSG(stat(FileName.c_str(), &buf) != -1,
- "stat(2) failed. Reason: " << strerror(errno) << ". Error code: " << errno);
- size_t file_size = buf.st_size;
-
- // Now start creating the iterators keeping in mind, that offset should be
- // multiple of page size.
- size_t chunk = round_up(file_size / amount, getpagesize() * Seq::GetDataSize(K) * sizeof(typename Seq::DataType));
- size_t offset = 0;
- if (chunk > file_size)
- chunk = file_size;
-
- while (offset < file_size) {
- res.emplace_back(FileName, Seq::GetDataSize(K),
- offset,
- offset + chunk > file_size ? file_size - offset : chunk);
- offset += chunk;
- }
-
- return res;
-}
-
-
-
-};
-
-#endif
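
A sketch of splitting a binary k-mer file into page-aligned chunks for parallel
processing. Seq stands for whatever packed k-mer sequence type is in use, and the
path, K and chunk count are placeholders.

    template<class Seq>
    void ProcessKMers(const std::string &path, unsigned K, size_t chunks) {
        std::vector<io::raw_kmer_iterator<Seq>> its =
            io::make_kmer_iterator<Seq>(path, K, chunks);
        for (size_t i = 0; i < its.size(); ++i)
            for (auto &it = its[i]; it.good(); ++it) {
                const typename Seq::DataType *raw = *it;  // one packed k-mer record
                (void) raw;                               // ... process it ...
            }
    }
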
diff --git a/src/include/io/library.hpp b/src/include/io/library.hpp
deleted file mode 100644
index 3387d90..0000000
--- a/src/include/io/library.hpp
+++ /dev/null
@@ -1,392 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#ifndef __IO_LIBRARY_HPP__
-#define __IO_LIBRARY_HPP__
-
-#include "adt/chained_iterator.hpp"
-#include "adt/iterator_range.hpp"
-
-#include <boost/iterator/iterator_facade.hpp>
-#include <yaml-cpp/yaml.h>
-
-#include <string>
-#include <vector>
-#include <utility>
-#include <iostream>
-#include <fstream>
-#include <functional>
-#include <algorithm>
-#include "path_helper.hpp"
-
-namespace io {
-
-class DataSetReader;
-
-enum class LibraryType {
- SingleReads,
- PairedEnd,
- MatePairs,
- HQMatePairs,
- PacBioReads,
- SangerReads,
- NanoporeReads,
- TrustedContigs,
- UntrustedContigs,
- PathExtendContigs
-};
-
-static std::vector<LibraryType> LibraryPriotity = {LibraryType::SingleReads, LibraryType::SangerReads, LibraryType::PacBioReads, LibraryType::NanoporeReads,
- LibraryType::PairedEnd, LibraryType::HQMatePairs, LibraryType::MatePairs, LibraryType::TrustedContigs,
- LibraryType::PathExtendContigs, LibraryType::UntrustedContigs};
-
-enum class LibraryOrientation {
- FR,
- FF,
- RF,
- RR,
- Undefined
-};
-
-struct update_relative_filename : public std::binary_function<std::string, std::string, std::string> {
- std::string operator() (const std::string &filename, const std::string &input_dir) const {
- if (filename[0] == '/')
- return filename;
- return input_dir + filename;
- }
-};
-
-class SequencingLibraryBase {
- public:
- class paired_reads_iterator :
- public boost::iterator_facade<paired_reads_iterator,
- std::pair<std::string, std::string>,
- boost::forward_traversal_tag,
- std::pair<std::string, std::string> > {
-
- typedef std::vector<std::string>::const_iterator inner_iterator;
-
- public:
- paired_reads_iterator(inner_iterator left, inner_iterator right)
- : left_(left), right_(right){}
-
- private:
- friend class boost::iterator_core_access;
-
- void increment() { ++left_; ++right_; }
- bool equal(const paired_reads_iterator &other) const {
- return this->left_ == other.left_ && this->right_ == other.right_;
- }
- std::pair<std::string, std::string> dereference() const {
- return std::make_pair(*left_, *right_);
- }
-
- inner_iterator left_;
- inner_iterator right_;
- };
-
- typedef chained_iterator<std::vector<std::string>::const_iterator> single_reads_iterator;
-
- SequencingLibraryBase()
- : type_(LibraryType::PairedEnd), orientation_(LibraryOrientation::FR) {}
-
- void load(const YAML::Node &node);
-
- LibraryType type() const { return type_; }
- void set_type(LibraryType type) { type_ = type; }
- LibraryOrientation orientation() const { return orientation_; }
- void set_orientation(LibraryOrientation orientation) { orientation_ = orientation; }
-
- void clear() {
- left_paired_reads_.clear();
- right_paired_reads_.clear();
- single_reads_.clear();
- }
-
- void update_relative_reads_filenames(const std::string &input_dir) {
- std::transform(left_paired_reads_.begin(), left_paired_reads_.end(), left_paired_reads_.begin(), std::bind2nd(update_relative_filename(), input_dir));
- std::transform(right_paired_reads_.begin(), right_paired_reads_.end(), right_paired_reads_.begin(), std::bind2nd(update_relative_filename(), input_dir));
- std::transform(single_reads_.begin(), single_reads_.end(), single_reads_.begin(), std::bind2nd(update_relative_filename(), input_dir));
- }
-
- void push_back_single(const std::string &reads) {
- single_reads_.push_back(reads);
- }
-
- void push_back_paired(const std::string &left, const std::string &right) {
- left_paired_reads_.push_back(left);
- right_paired_reads_.push_back(right);
- }
-
- paired_reads_iterator paired_begin() const {
- return paired_reads_iterator(left_paired_reads_.begin(), right_paired_reads_.begin());
- }
- paired_reads_iterator paired_end() const {
- return paired_reads_iterator(left_paired_reads_.end(), right_paired_reads_.end());
- }
-
- adt::iterator_range<paired_reads_iterator> paired_reads() const {
- return adt::make_range(paired_begin(), paired_end());
- }
-
- single_reads_iterator reads_begin() const {
- // NOTE: We have a contract with single_end here. Single reads always go last!
- single_reads_iterator res(left_paired_reads_.begin(), left_paired_reads_.end());
- res.join(right_paired_reads_.begin(), right_paired_reads_.end());
- res.join(single_reads_.begin(), single_reads_.end());
-
- return res;
- }
- single_reads_iterator reads_end() const {
- // NOTE: Do not forget about the contract with single_begin here!
- return single_reads_iterator(single_reads_.end(), single_reads_.end());
- }
-
- adt::iterator_range<single_reads_iterator> reads() const {
- return adt::make_range(reads_begin(), reads_end());
- }
-
- single_reads_iterator single_begin() const {
- return single_reads_iterator(single_reads_.begin(), single_reads_.end());
- }
- single_reads_iterator single_end() const {
- // NOTE: Do not forget about the contract with single_begin here!
- return single_reads_iterator(single_reads_.end(), single_reads_.end());
- }
-
- adt::iterator_range<single_reads_iterator> single_reads() const {
- return adt::make_range(single_begin(), single_end());
- }
-
- bool is_graph_contructable() const {
- return (type_ == io::LibraryType::PairedEnd ||
- type_ == io::LibraryType::SingleReads ||
- type_ == io::LibraryType::HQMatePairs);
- }
-
- bool is_bwa_alignable() const {
- return type_ == io::LibraryType::MatePairs;
- }
-
- bool is_mismatch_correctable() const {
- return is_graph_contructable();
- }
-
- bool is_binary_covertable() {
- return is_graph_contructable() || is_mismatch_correctable() || is_paired();
- }
-
- bool is_paired() const {
- return (type_ == io::LibraryType::PairedEnd ||
- type_ == io::LibraryType::MatePairs||
- type_ == io::LibraryType::HQMatePairs);
- }
-
- bool is_repeat_resolvable() const {
- return (type_ == io::LibraryType::PairedEnd ||
- type_ == io::LibraryType::HQMatePairs ||
- type_ == io::LibraryType::MatePairs ||
- type_ == io::LibraryType::PacBioReads ||
- type_ == io::LibraryType::SangerReads ||
- type_ == io::LibraryType::NanoporeReads ||
- type_ == io::LibraryType::TrustedContigs ||
- type_ == io::LibraryType::UntrustedContigs ||
- type_ == io::LibraryType::PathExtendContigs);
- }
-
- static bool IsContigLib(const io::LibraryType& type) {
- static std::set<io::LibraryType> contig_lib_types{io::LibraryType::TrustedContigs,
- io::LibraryType::UntrustedContigs, io::LibraryType::PathExtendContigs};
- return contig_lib_types.count(type);
- }
-
- bool is_contig_lib() const {
- return IsContigLib(type_);
- }
-
- bool is_pacbio_alignable() const {
- return (type_ == io::LibraryType::PacBioReads ||
- type_ == io::LibraryType::SangerReads ||
- type_ == io::LibraryType::NanoporeReads ||
- //comment out the next line to switch the alignment method for trusted contigs
- type_ == io::LibraryType::TrustedContigs ||
- type_ == io::LibraryType::UntrustedContigs);
- }
-
- private:
- LibraryType type_;
- LibraryOrientation orientation_;
-
- std::vector<std::string> left_paired_reads_;
- std::vector<std::string> right_paired_reads_;
- std::vector<std::string> single_reads_;
-};
-
-struct NoData {};
-
-template<class Data = NoData>
-class SequencingLibrary: public SequencingLibraryBase {
- public:
- const Data& data() const {
- return data_;
- }
- Data& data() {
- return data_;
- }
-
- private:
- Data data_;
-};
-
- // Just a convenient wrapper to "unwrap" the iterators over libraries.
-template<class Data = NoData>
-class DataSet {
- typedef SequencingLibrary<Data> Library;
- typedef std::vector<Library> LibraryStorage;
-
- public:
- typedef typename LibraryStorage::iterator iterator;
- typedef typename LibraryStorage::const_iterator const_iterator;
- typedef chained_iterator<typename Library::single_reads_iterator> single_reads_iterator;
- typedef chained_iterator<typename Library::paired_reads_iterator> paired_reads_iterator;
-
- DataSet() {}
- explicit DataSet(const std::string &path) { load(path); }
- DataSet(const YAML::Node &node) { load(node); }
-
- void load(const std::string &filename) {
- YAML::Node config = YAML::LoadFile(filename);
- std::string input_dir = path::parent_path(filename);
- if (input_dir[input_dir.length() - 1] != '/')
- input_dir += '/';
-
- load(config);
- for (auto it = libraries_.begin(); it != libraries_.end(); ++it) {
- it->update_relative_reads_filenames(input_dir);
- }
- }
-
- void save(const std::string &filename) const {
- std::ofstream ofs(filename.c_str());
- ofs << YAML::Node(*this);
- }
-
- void load(const YAML::Node &node) {
- clear();
- for (YAML::const_iterator it = node.begin(); it != node.end(); ++it) {
- libraries_.push_back(it->as<Library>());
- }
- }
-
- void clear() { libraries_.clear(); }
- void push_back(const Library &lib) {
- libraries_.push_back(lib);
- }
- Library& operator[](size_t n) { return libraries_[n]; }
- const Library& operator[](size_t n) const { return libraries_[n]; }
- size_t lib_count() const { return libraries_.size(); }
-
- iterator library_begin() { return libraries_.begin(); }
- const_iterator library_begin() const { return libraries_.begin(); }
- iterator begin() { return libraries_.begin(); }
- const_iterator begin() const { return libraries_.begin(); }
-
- iterator library_end() { return libraries_.end(); }
- const_iterator library_end() const { return libraries_.end(); }
- iterator end() { return libraries_.end(); }
- const_iterator end() const { return libraries_.end(); }
-
- adt::iterator_range<iterator> libraries() {
- return adt::make_range(library_begin(), library_end());
- }
- adt::iterator_range<const_iterator> libraries() const {
- return adt::make_range(library_begin(), library_end());
- }
-
- single_reads_iterator reads_begin() const {
- auto it = libraries_.begin();
- single_reads_iterator res(it->reads_begin(), it->reads_end());
- ++it;
- for (auto end = libraries_.end(); it != end; ++it)
- res.join(it->reads_begin(), it->reads_end());
-
- return res;
- }
- single_reads_iterator reads_end() const {
- return single_reads_iterator(libraries_.back().reads_end(), libraries_.back().reads_end());
- }
- adt::iterator_range<single_reads_iterator> reads() {
- return adt::make_range(reads_begin(), reads_end());
- }
-
- single_reads_iterator single_begin() const {
- auto it = libraries_.begin();
- single_reads_iterator res(it->single_begin(), it->single_end());
- ++it;
- for (auto end = libraries_.end(); it != end; ++it)
- res.join(it->single_begin(), it->single_end());
-
- return res;
- }
- single_reads_iterator single_end() const {
- return single_reads_iterator(libraries_.back().single_end(), libraries_.back().single_end());
- }
- adt::iterator_range<single_reads_iterator> single_reads() {
- return adt::make_range(single_begin(), single_end());
- }
-
- paired_reads_iterator paired_begin() const {
- auto it = libraries_.begin();
- paired_reads_iterator res(it->paired_begin(), it->paired_end());
- ++it;
- for (auto end = libraries_.end(); it != end; ++it)
- res.join(it->paired_begin(), it->paired_end());
-
- return res;
- }
- paired_reads_iterator paired_end() const {
- return paired_reads_iterator(libraries_.back().paired_end(), libraries_.back().paired_end());
- }
-
- adt::iterator_range<paired_reads_iterator> paired_reads() const {
- return adt::make_range(paired_begin(), paired_end());
- }
-
- private:
- LibraryStorage libraries_;
-};
-
-}
-
-namespace YAML {
-template<>
-struct convert<io::SequencingLibraryBase > {
- static Node encode(const io::SequencingLibraryBase& rhs);
- static bool decode(const Node& node, io::SequencingLibraryBase& rhs);
-};
-
-template<>
-struct convert<io::SequencingLibrary<> > {
- static Node encode(const io::SequencingLibrary<>& rhs);
- static bool decode(const Node& node, io::SequencingLibrary<>& rhs);
-};
-
-template<class Data>
-struct convert<io::DataSet<Data> > {
- static Node encode(const io::DataSet<Data>& rhs) {
- Node node;
-
- for (auto it = rhs.library_begin(), et = rhs.library_end(); it != et; ++it)
- node.push_back(*it);
-
- return node;
- }
-};
-}
-
-
-#endif // __IO_LIBRARY_HPP__
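
A sketch of loading a dataset description and walking its libraries; the YAML file
name is a placeholder.

    void ListPairedFiles(const std::string &yaml) {
        io::DataSet<> dataset(yaml);                 // e.g. "dataset.yaml"
        for (const auto &lib : dataset.libraries()) {
            if (!lib.is_paired())
                continue;
            for (const auto &files : lib.paired_reads())
                std::cout << files.first << ' ' << files.second << '\n';
        }
    }
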
diff --git a/src/include/io/mmapped_reader.hpp b/src/include/io/mmapped_reader.hpp
deleted file mode 100644
index f5ee92b..0000000
--- a/src/include/io/mmapped_reader.hpp
+++ /dev/null
@@ -1,360 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#ifndef HAMMER_MMAPPED_READER_HPP
-#define HAMMER_MMAPPED_READER_HPP
-
-#include "adt/pointer_iterator.hpp"
-#include "adt/array_vector.hpp"
-
-#include "verify.hpp"
-
-#include <boost/iterator/iterator_facade.hpp>
-
-#include <fcntl.h>
-#include <unistd.h>
-#include <sys/mman.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-
-#include <cstring>
-#include <cerrno>
-
-#include <string>
-#include <algorithm>
-
-class MMappedReader {
- int StreamFile;
- bool Unlink;
- std::string FileName;
-
- void remap() {
- VERIFY(BlockSize != FileSize);
-
- if (MappedRegion)
- munmap(MappedRegion, BlockSize);
-
- BlockOffset += BlockSize;
-
- if (BlockOffset + BlockSize > FileSize)
- BlockSize = FileSize - BlockOffset;
-
- // We intentionally do not add PROT_WRITE here - remapping combined with write
- // access is pretty error-prone.
- if (BlockSize)
- MappedRegion =
- (uint8_t*)mmap(NULL, BlockSize,
- PROT_READ, MAP_FILE | MAP_PRIVATE,
- StreamFile, InitialOffset + BlockOffset);
- else
- MappedRegion = NULL;
- VERIFY_MSG((intptr_t)MappedRegion != -1L,
- "mmap(2) failed. Reason: " << strerror(errno) << ". Error code: " << errno);
- }
-
- void read_internal(void *buf, size_t amount) {
- memcpy(buf, MappedRegion + BytesRead - BlockOffset, amount);
- BytesRead += amount;
- }
-
- protected:
- uint8_t* MappedRegion;
- size_t FileSize, BlockOffset, BytesRead, BlockSize;
- off_t InitialOffset;
-
- public:
- MMappedReader()
- : StreamFile(-1), Unlink(false), FileName(""), MappedRegion(0), FileSize(0), BlockOffset(0), BytesRead(0), BlockSize(0), InitialOffset(0)
- {}
-
- MMappedReader(const std::string &filename, bool unlink = false,
- size_t blocksize = 64*1024*1024, off_t off = 0, size_t sz = 0)
- : Unlink(unlink), FileName(filename), BlockSize(blocksize) {
- struct stat buf;
-
- InitialOffset = off;
- FileSize = (sz ? sz : (stat(FileName.c_str(), &buf) != 0 ? 0 : buf.st_size - InitialOffset));
-
- StreamFile = open(FileName.c_str(), O_RDONLY);
- VERIFY_MSG(StreamFile != -1,
- "open(2) failed. Reason: " << strerror(errno) << ". Error code: " << errno << ". File: " << FileName);
-
- if (BlockSize != -1ULL) {
- size_t PageSize = getpagesize();
- BlockSize = BlockSize / PageSize * PageSize;
- } else
- BlockSize = FileSize;
-
- if (BlockSize) {
- MappedRegion =
- (uint8_t*)mmap(NULL, BlockSize, PROT_READ | PROT_WRITE, MAP_FILE | MAP_PRIVATE,
- StreamFile, InitialOffset);
- VERIFY_MSG((intptr_t)MappedRegion != -1L,
- "mmap(2) failed. Reason: " << strerror(errno) << ". Error code: " << errno);
- } else
- MappedRegion = NULL;
-
- BlockOffset = BytesRead = 0;
- }
-
- MMappedReader(MMappedReader &&other) {
- // First, copy out the stuff
- MappedRegion = other.MappedRegion;
- FileSize = other.FileSize;
- BlockOffset = other.BlockOffset;
- BytesRead = other.BytesRead;
- BlockSize = other.BlockSize;
- FileName = std::move(other.FileName);
- Unlink = other.Unlink;
- StreamFile = other.StreamFile;
- InitialOffset = other.InitialOffset;
-
- // Now, zero out the fields inside other, so its dtor won't do anything crazy
- other.StreamFile = -1;
- other.Unlink = false;
- other.MappedRegion = 0;
- }
-
-  MMappedReader& operator=(MMappedReader &&other) {
-    if (this != &other) {
-      // Swap state with other; our old resources are released when other is destroyed.
-      std::swap(MappedRegion, other.MappedRegion); std::swap(FileSize, other.FileSize);
-      std::swap(BlockOffset, other.BlockOffset);   std::swap(BytesRead, other.BytesRead);
-      std::swap(BlockSize, other.BlockSize);       std::swap(FileName, other.FileName);
-      std::swap(Unlink, other.Unlink);             std::swap(StreamFile, other.StreamFile);
-      std::swap(InitialOffset, other.InitialOffset);
-    }
-    return *this;
-  }
-
- virtual ~MMappedReader() {
- if (StreamFile != -1)
- close(StreamFile);
- if (MappedRegion)
- munmap(MappedRegion, BlockSize);
-
- if (Unlink) {
- int res = unlink(FileName.c_str());
- VERIFY_MSG(res == 0,
- "unlink(2) failed. Reason: " << strerror(errno) << ". Error code: " << errno);
- }
- }
-
- void read(void* buf, size_t amount) {
- if (BytesRead + amount < BlockOffset + BlockSize) {
- // Easy case, no remap is necessary
- read_internal(buf, amount);
- return;
- }
-
- // Hard case - remapping is necessary. First - finish the current block.
- size_t ToRead = BlockSize - (BytesRead - BlockOffset);
- uint8_t *cbuf = (uint8_t*)buf;
-
- read_internal(cbuf, ToRead);
- amount -= ToRead; cbuf += ToRead;
-
- // Next, read as many full BlockSize blocks as possible.
- while (amount >= BlockSize) {
- remap();
- read_internal(cbuf, BlockSize);
- amount -= BlockSize; cbuf += BlockSize;
- }
-
- // Finally, remap and read remaining.
- remap();
- read_internal(cbuf, amount);
- }
-
- void* skip(size_t amount) {
- // Easy case, no remapping is needed
- if (BytesRead + amount <= BlockOffset + BlockSize) {
- void* out = MappedRegion + BytesRead - BlockOffset;
- BytesRead += amount;
-
- return out;
- }
-
- // Make sure data does not cross the block boundary
- VERIFY(BytesRead == BlockOffset + BlockSize);
-
- // Now, remap and read from the beginning of the block
- remap();
-
- return skip(amount);
- }
-
- bool good() const {
- return BytesRead < FileSize;
- }
-
- size_t size() const { return FileSize; }
- size_t data_size() const { return FileSize; }
-
- void* data() const { return MappedRegion; }
-};
-
-template<typename T>
-class MMappedRecordReader : public MMappedReader {
- public:
- typedef pointer_iterator<T> iterator;
- typedef const pointer_iterator<T> const_iterator;
-
- MMappedRecordReader(const std::string &FileName, bool unlink = true,
- size_t blocksize = 64*1024*1024 / (sizeof(T) * (unsigned)getpagesize()) * (sizeof(T) * (unsigned)getpagesize()),
- off_t off = 0, size_t sz = 0):
- MMappedReader(FileName, unlink, blocksize, off, sz) {
- VERIFY(FileSize % sizeof(T) == 0);
- }
-
- void read(T* el, size_t amount) {
- MMappedReader::read(el, amount * sizeof(T));
- }
-
- size_t size() const { return FileSize / sizeof(T); }
- size_t data_size() const { return FileSize; }
- T* data() { return (T*)MappedRegion; }
- const T* data() const { return (const T*)MappedRegion; }
- T& operator[](size_t idx) { return data()[idx]; }
- const T& operator[](size_t idx) const { return data()[idx]; }
-
- iterator begin() { return iterator(data()); }
- const_iterator begin() const { return const_iterator(data()); }
- iterator end() { return iterator(data()+ size()); }
- const_iterator end() const { return const_iterator(data() + size()); }
-};
-
-template<class T>
-class MMappedFileRecordIterator :
- public boost::iterator_facade<MMappedFileRecordIterator<T>,
- const T,
- std::input_iterator_tag> {
- public:
- // Default ctor, used to implement "end" iterator
- MMappedFileRecordIterator() : good_(false) {}
- MMappedFileRecordIterator(const std::string &FileName)
- : reader_(FileName, false), good_(true) {
- reader_.read(&value_, sizeof(value_));
- }
- MMappedFileRecordIterator(MMappedRecordReader<T> &&reader)
- : reader_(std::move(reader)), good_(true) {
- reader_.read(&value_, sizeof(value_));
- }
- bool good() const {
- return good_;
- }
-
- private:
- friend class boost::iterator_core_access;
-
- void increment() {
- good_ = reader_.good();
- if (good_)
- reader_.read(&value_, sizeof(value_));
- }
-  bool equal(const MMappedFileRecordIterator &other) const {
-    // Iterators are equal iff:
-    //   1) they are both not good (at the end of the stream),
-    //   or
-    //   2) they share the same mapped region
- return ((!reader_.good() && !other.reader_.good()) ||
- reader_.data() == other.reader_.data());
- }
-  const T& dereference() const { return value_; }
-
- T value_;
- MMappedRecordReader<T> reader_;
- bool good_;
-};
-
-template<typename T>
-class MMappedRecordArrayReader : public MMappedReader {
- size_t elcnt_;
-
- public:
- typedef typename array_vector<T>::iterator iterator;
- typedef typename array_vector<T>::const_iterator const_iterator;
-
- MMappedRecordArrayReader(const std::string &FileName,
- size_t elcnt = 1,
- bool unlink = true,
- off_t off = 0, size_t sz = 0):
- MMappedReader(FileName, unlink, -1ULL, off, sz), elcnt_(elcnt) {
- VERIFY(FileSize % (sizeof(T) * elcnt_) == 0);
- }
-
- void read(T* el, size_t amount) {
- MMappedReader::read(el, amount * sizeof(T) * elcnt_);
- }
-
- size_t size() const { return FileSize / sizeof(T) / elcnt_; }
- size_t data_size() const { return FileSize; }
- size_t elcnt() const { return elcnt_; }
- T* data() { return (T*)MappedRegion; }
- const T* data() const { return (const T*)MappedRegion; }
- T& operator[](size_t idx) { return data()[idx*elcnt_]; }
- const T& operator[](size_t idx) const { return data()[idx*elcnt_]; }
-
- iterator begin() { return iterator(data(), /* size */ elcnt_); }
-  const_iterator begin() const { return const_iterator(data(), /* size */ elcnt_); }
-  const_iterator cbegin() const { return const_iterator(data(), /* size */ elcnt_); }
- iterator end() { return iterator(data() + size()*elcnt_, elcnt_); }
- const_iterator end() const { return const_iterator(data() + size()*elcnt_, elcnt_); }
- const_iterator cend() const { return const_iterator(data() + size()*elcnt_, elcnt_); }
-};
-
-static inline size_t round_up(size_t value, size_t boundary) {
- return (value + boundary - 1) / boundary * boundary;
-}
-
-template<class T>
-class MMappedFileRecordArrayIterator :
- public boost::iterator_facade<MMappedFileRecordArrayIterator<T>,
- const T*,
- std::input_iterator_tag,
- const T*> {
- public:
- // Default ctor, used to implement "end" iterator
- MMappedFileRecordArrayIterator(): value_(NULL), array_size_(0), reader_(), good_(false) {}
- MMappedFileRecordArrayIterator(const std::string &FileName,
- size_t elcnt,
- off_t offset = 0, size_t filesize = 0)
- : value_(NULL),
- array_size_(sizeof(T) * elcnt),
- reader_(FileName, false,
- round_up(filesize > 0 ? std::min(size_t(64 * 1024 * 1024), filesize) : 64 * 1024 * 1024, array_size_ * (unsigned)getpagesize()),
- offset, filesize),
- good_(false) {
- increment();
- }
- MMappedFileRecordArrayIterator(MMappedRecordReader<T> &&reader, size_t elcnt)
- : value_(NULL), array_size_(sizeof(T) * elcnt), reader_(std::move(reader)), good_(false) {
- increment();
- }
- MMappedFileRecordArrayIterator(const MMappedFileRecordArrayIterator&) = delete;
-
- MMappedFileRecordArrayIterator(MMappedFileRecordArrayIterator&& other)
- : value_(other.value_), array_size_(other.array_size_),
- reader_(std::move(other.reader_)), good_(other.good_) {}
-
- bool good() const { return good_; }
- const MMappedRecordReader<T>& reader() const { return reader_; }
-
- private:
- friend class boost::iterator_core_access;
-
- void increment() {
- good_ = reader_.good();
- value_ = (good_ ? (T*)reader_.skip(array_size_) : NULL);
- }
- bool equal(const MMappedFileRecordArrayIterator &other) const {
- return value_ == other.value_;
- }
- const T* dereference() const { return value_; }
-
- T* value_;
- size_t array_size_;
- MMappedRecordReader<T> reader_;
- bool good_;
-};
-
-#endif // HAMMER_MMAPPED_READER_HPP
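
A sketch of reading a flat binary file of fixed-size records through the removed
MMappedRecordReader; the record type and file name are placeholders.

    // Sum all uint32_t records of a binary file.
    uint64_t SumRecords(const std::string &path) {
        MMappedRecordReader<uint32_t> in(path, /*unlink=*/false);
        std::vector<uint32_t> buf(in.size());
        in.read(buf.data(), buf.size());   // read() remaps blocks under the hood
        uint64_t sum = 0;
        for (size_t i = 0; i < buf.size(); ++i)
            sum += buf[i];
        return sum;
    }
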
diff --git a/src/include/io/mmapped_writer.hpp b/src/include/io/mmapped_writer.hpp
deleted file mode 100644
index 90873fd..0000000
--- a/src/include/io/mmapped_writer.hpp
+++ /dev/null
@@ -1,171 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#ifndef HAMMER_MMAPPED_WRITER_HPP
-#define HAMMER_MMAPPED_WRITER_HPP
-
-#include "adt/pointer_iterator.hpp"
-#include "adt/array_vector.hpp"
-
-#include <string>
-
-#include <fcntl.h>
-#include <unistd.h>
-#include <sys/mman.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <strings.h>
-
-class MMappedWriter {
- int StreamFile;
- MMappedWriter(const MMappedWriter &) = delete;
- protected:
- uint8_t* MappedRegion;
- size_t BytesWritten, BytesReserved, FileOffset, BufOffset;
- public:
-  MMappedWriter() : StreamFile(-1), MappedRegion(NULL), BytesWritten(0), BytesReserved(0), FileOffset(0), BufOffset(0) {}
- MMappedWriter(const std::string &FileName) {
- open(FileName);
- }
-
- void open(const std::string &FileName) {
- StreamFile = ::open(FileName.c_str(), O_RDWR | O_CREAT | O_TRUNC, (mode_t)0660);
- VERIFY_MSG(StreamFile != -1,
- "open(2) failed. Reason: " << strerror(errno) << ". Error code: " << errno);
-
- FileOffset = BytesWritten = 0;
- MappedRegion = NULL;
- }
-
- virtual ~MMappedWriter() {
- if (MappedRegion)
- munmap(MappedRegion, BytesReserved);
- close(StreamFile);
- }
-
- void write(void* buf, size_t amount) {
- memcpy(MappedRegion + BufOffset + BytesWritten, buf, amount);
- BytesWritten += amount;
- }
-
- bool good() const {
- return BytesWritten < BytesReserved;
- }
-
- void reserve(size_t amount) {
- if (MappedRegion) {
- munmap(MappedRegion, BytesReserved);
- FileOffset += BytesWritten;
- MappedRegion = NULL;
- }
-
- if (amount == 0)
- return;
-
- int res = (int)lseek(StreamFile, amount-1, SEEK_CUR);
- VERIFY_MSG(res != -1,
- "lseek(2) failed. Reason: " << strerror(errno) << ". Error code: " << errno);
- res = (int)::write(StreamFile, "", 1);
- VERIFY_MSG(res != -1,
- "write(2) failed. Reason: " << strerror(errno) << ". Error code: " << errno);
-
- // FileOffset must be aligned to a page boundary, so adjust the mapping offset and size accordingly.
- int PageSize = getpagesize();
- size_t FileOffsetAligned = FileOffset / PageSize * PageSize;
- size_t Residual = FileOffset - FileOffsetAligned;
-
- BytesReserved = amount + Residual;
- BytesWritten = 0; BufOffset = Residual;
- MappedRegion =
- (uint8_t*)mmap(NULL, BytesReserved,
- PROT_READ | PROT_WRITE, MAP_FILE | MAP_SHARED,
- StreamFile, FileOffsetAligned);
- VERIFY_MSG((intptr_t)MappedRegion != -1L,
- "mmap(2) failed. Reason: " << strerror(errno) << ". Error code: " << errno);
- }
-
- size_t size() const { return BytesReserved; }
-};
-
-template<typename T>
-class MMappedRecordWriter : public MMappedWriter {
- public:
- typedef pointer_iterator<T> iterator;
- typedef const pointer_iterator<T> const_iterator;
-
- MMappedRecordWriter() = default;
- MMappedRecordWriter(const std::string &FileName):
- MMappedWriter(FileName) {
- }
-
- void write(const T* el, size_t amount) {
- MMappedWriter::write((void*)el, amount * sizeof(T));
- }
-
- void reserve(size_t amount) {
- MMappedWriter::reserve(amount * sizeof(T));
- }
-
- void resize(size_t amount) {
- MMappedWriter::reserve(amount * sizeof(T));
- }
-
- size_t size() const { return BytesReserved / sizeof(T); }
- T* data() { return (T*)MappedRegion; }
- const T* data() const { return (const T*)MappedRegion; }
- T& operator[](size_t idx) { return data()[idx]; }
- const T& operator[](size_t idx) const { return data()[idx]; }
-
- iterator begin() { return iterator(data()); }
- const_iterator begin() const { return const_iterator(data()); }
- iterator end() { return iterator(data()+ size()); }
- const_iterator end() const { return const_iterator(data() + size()); }
-};
-
-template<typename T>
-class MMappedRecordArrayWriter : public MMappedWriter {
- size_t elcnt_;
- public:
- typedef typename array_vector<T>::iterator iterator;
- typedef typename array_vector<T>::const_iterator const_iterator;
-
- MMappedRecordArrayWriter() = default;
- MMappedRecordArrayWriter(const std::string &FileName,
- size_t elcnt = 1):
- MMappedWriter(FileName), elcnt_(elcnt) {}
-
- void open(const std::string &FileName,
- size_t elcnt = 1) {
- elcnt_ = elcnt;
- MMappedWriter::open(FileName);
- }
-
- void write(const T* el, size_t amount) {
- MMappedWriter::write((void*)el, amount * sizeof(T) * elcnt_);
- }
-
- void reserve(size_t amount) {
- MMappedWriter::reserve(amount * sizeof(T) * elcnt_);
- }
-
- void resize(size_t amount) {
- MMappedWriter::reserve(amount * sizeof(T) * elcnt_);
- }
-
- size_t size() const { return BytesReserved / sizeof(T) / elcnt_; }
- T* data() { return (T*)MappedRegion; }
- const T* data() const { return (const T*)MappedRegion; }
- T& operator[](size_t idx) { return data()[idx*elcnt_]; }
- const T& operator[](size_t idx) const { return data()[idx*elcnt_]; }
-
- iterator begin() { return iterator(data(), elcnt_); }
- const_iterator begin() const { return const_iterator(data(), elcnt_); }
- iterator end() { return iterator(data() + size()*elcnt_, elcnt_); }
- const_iterator end() const { return const_iterator(data() + size()*elcnt_, elcnt_); }
-};
-
-#endif // HAMMER_MMAPPED_WRITER_HPP
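
A sketch of writing fixed-size records with the removed MMappedRecordWriter; note
that reserve() must be called before write(). The path and record type are placeholders.

    void DumpRecords(const std::string &path, const std::vector<uint32_t> &records) {
        if (records.empty())
            return;
        MMappedRecordWriter<uint32_t> out(path);
        out.reserve(records.size());               // grows the file and maps the region
        out.write(records.data(), records.size());
    }
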
diff --git a/src/include/io/modifying_reader_wrapper.hpp b/src/include/io/modifying_reader_wrapper.hpp
deleted file mode 100644
index 2c0f1db..0000000
--- a/src/include/io/modifying_reader_wrapper.hpp
+++ /dev/null
@@ -1,113 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include "verify.hpp"
-#include "io/delegating_reader_wrapper.hpp"
-
-#include <memory>
-
-namespace io {
-
-class SequenceModifier {
-public:
- virtual ~SequenceModifier() {}
-
- SingleRead Modify(const SingleRead& read) {
- return SingleRead(read.name(), Modify(read.sequence()).str());
- }
-
- SingleReadSeq Modify(const SingleReadSeq& read) {
- return SingleReadSeq(Modify(read.sequence()));
- }
-
- virtual Sequence Modify(const Sequence& s) = 0;
-};
-
-class TrivialModifier : public SequenceModifier {
-public:
-
- virtual Sequence Modify(const Sequence& s) {
- return s;
- }
-};
-
-/**
- * Attention!!! this class clears quality!!!
- */
-template<class ReadType>
-class ModifyingWrapper;
-
-template<>
-class ModifyingWrapper<SingleRead>: public DelegatingWrapper<SingleRead> {
- typedef DelegatingWrapper<SingleRead> base;
- std::shared_ptr<SequenceModifier> modifier_;
-
-public:
- ModifyingWrapper(base::ReadStreamPtrT reader, std::shared_ptr<SequenceModifier> modifier) :
- base(reader), modifier_(modifier) {}
-
- ModifyingWrapper& operator>>(SingleRead& read) {
- this->reader() >> read;
- read = modifier_->Modify(read);
- return *this;
- }
-};
-
-template<>
-class ModifyingWrapper<PairedRead>: public DelegatingWrapper<PairedRead> {
- typedef DelegatingWrapper<PairedRead> base;
- std::shared_ptr<SequenceModifier> modifier_;
-
-public:
- ModifyingWrapper(base::ReadStreamPtrT reader, std::shared_ptr<SequenceModifier> modifier) :
- base(reader), modifier_(modifier) {}
-
- ModifyingWrapper& operator>>(PairedRead& read) {
- this->reader() >> read;
- read = PairedRead(modifier_->Modify(read.first()),
- modifier_->Modify(read.second()),
- read.insert_size());
- return *this;
- }
-};
-
-template<>
-class ModifyingWrapper<SingleReadSeq>: public DelegatingWrapper<SingleReadSeq> {
- typedef DelegatingWrapper<SingleReadSeq> base;
- std::shared_ptr<SequenceModifier> modifier_;
-
-public:
- ModifyingWrapper(base::ReadStreamPtrT reader, std::shared_ptr<SequenceModifier> modifier) :
- base(reader), modifier_(modifier) {}
-
- ModifyingWrapper& operator>>(SingleReadSeq& read) {
- this->reader() >> read;
- read = modifier_->Modify(read.sequence());
- return *this;
- }
-};
-
-template<>
-class ModifyingWrapper<PairedReadSeq>: public DelegatingWrapper<PairedReadSeq> {
- typedef DelegatingWrapper<PairedReadSeq> base;
- std::shared_ptr<SequenceModifier> modifier_;
-
-public:
- ModifyingWrapper(base::ReadStreamPtrT reader, std::shared_ptr<SequenceModifier> modifier) :
- base(reader), modifier_(modifier) {}
-
- ModifyingWrapper& operator>>(PairedReadSeq& read) {
- this->reader() >> read;
- read = PairedReadSeq(modifier_->Modify(read.first().sequence())
- , SingleReadSeq(modifier_->Modify(read.second())), read.insert_size());
- return *this;
- }
-};
-
-}
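
For illustration, a custom SequenceModifier plugged into the removed wrapper. This
sketch assumes that Sequence provides operator! (reverse complement), as used
elsewhere in the codebase, and that the wrapped pointer type matches SingleStreamPtr
(e.g. obtained from io::EasyStream); neither class nor function is part of the
original sources.

    class RCModifier : public io::SequenceModifier {
    public:
        virtual Sequence Modify(const Sequence &s) {
            return !s;   // assumed: operator! yields the reverse complement
        }
    };

    void RCStreamExample(io::SingleStreamPtr reader) {
        // The wrapper behaves like any other SingleRead stream but yields
        // reverse-complemented sequences; note that it drops quality values.
        io::ModifyingWrapper<io::SingleRead> rc_stream(reader, std::make_shared<RCModifier>());
        io::SingleRead read;
        rc_stream >> read;   // first modified read
    }
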
diff --git a/src/include/io/mpmc_bounded.hpp b/src/include/io/mpmc_bounded.hpp
deleted file mode 100644
index 3e4b5c9..0000000
--- a/src/include/io/mpmc_bounded.hpp
+++ /dev/null
@@ -1,149 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-/* Multi-consumer/multi-producer bounded queue
-
- Copyright (c) 2011 Dmitry Vyukov. All rights reserved.
-
- Redistribution and use in source and binary forms, with or without modification, are
- permitted provided that the following conditions are met:
-
- 1. Redistributions of source code must retain the above copyright notice, this list of
- conditions and the following disclaimer.
-
- 2. Redistributions in binary form must reproduce the above copyright notice, this list
- of conditions and the following disclaimer in the documentation and/or other materials
- provided with the distribution.
-
- THIS SOFTWARE IS PROVIDED BY DMITRY VYUKOV "AS IS" AND ANY EXPRESS OR IMPLIED
- WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL DMITRY VYUKOV OR
- CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
- ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
- NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
- ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#include <ciso646>
-#if __GNUC__ > 4 || (__GNUC__ >= 4 && __GNUC_MINOR__ >= 5) || _LIBCPP_VERSION
-#include <atomic>
-#else
-#include <cstdatomic>
-#endif
-
-#include <cstring>
-
-template<typename T>
-class mpmc_bounded_queue {
- public:
- mpmc_bounded_queue(size_t buffer_size)
- : buffer_(new cell_t [buffer_size])
- , buffer_mask_(buffer_size - 1) {
- assert((buffer_size >= 2) && ((buffer_size & (buffer_size - 1)) == 0));
- for (size_t i = 0; i != buffer_size; i += 1)
- buffer_[i].sequence_.store(i, std::memory_order_relaxed);
- enqueue_pos_.store(0, std::memory_order_relaxed);
- dequeue_pos_.store(0, std::memory_order_relaxed);
- closed_.store(false, std::memory_order_relaxed);
- }
-
- ~mpmc_bounded_queue() {
- delete [] buffer_;
- }
-
- bool is_closed() const {
- return closed_.load(std::memory_order_relaxed);
- }
-
- void close() {
- closed_.store(true, std::memory_order_release);
- }
-
- bool enqueue(T const& data) {
- if (is_closed())
- return false;
-
- cell_t* cell;
- size_t pos = enqueue_pos_.load(std::memory_order_relaxed);
- for (;;) {
- cell = &buffer_[pos & buffer_mask_];
- size_t seq = cell->sequence_.load(std::memory_order_acquire);
- intptr_t dif = (intptr_t)seq - (intptr_t)pos;
- if (dif == 0) {
- if (enqueue_pos_.compare_exchange_weak(pos, pos + 1, std::memory_order_relaxed))
- break;
- } else if (dif < 0)
- return false;
- else
- pos = enqueue_pos_.load(std::memory_order_relaxed);
- }
-
- cell->data_ = data;
- cell->sequence_.store(pos + 1, std::memory_order_release);
-
- return true;
- }
-
- bool dequeue(T& data) {
- cell_t* cell;
- size_t pos = dequeue_pos_.load(std::memory_order_relaxed);
- for (;;) {
- cell = &buffer_[pos & buffer_mask_];
- size_t seq = cell->sequence_.load(std::memory_order_acquire);
- intptr_t dif = (intptr_t)seq - (intptr_t)(pos + 1);
- if (dif == 0) {
- if (dequeue_pos_.compare_exchange_weak(pos, pos + 1, std::memory_order_relaxed))
- break;
- } else if (dif < 0)
- return false;
- else
- pos = dequeue_pos_.load(std::memory_order_relaxed);
- }
-
- data = cell->data_;
- cell->sequence_.store(pos + buffer_mask_ + 1, std::memory_order_release);
-
- return true;
- }
-
- bool wait_dequeue(T& data) {
- bool res = false;
- do {
- res = dequeue(data);
- if (!res)
- usleep(1);
- } while (!res && !is_closed());
-
- return res;
- }
-
- private:
- struct cell_t {
- std::atomic<size_t> sequence_;
- T data_;
- };
-
- static size_t const cacheline_size = 64;
- typedef char cacheline_pad_t [cacheline_size];
-
- cacheline_pad_t pad0_;
- cell_t* const buffer_;
- size_t const buffer_mask_;
- cacheline_pad_t pad1_;
- std::atomic<size_t> enqueue_pos_;
- cacheline_pad_t pad2_;
- std::atomic<size_t> dequeue_pos_;
- cacheline_pad_t pad3_;
- std::atomic<bool> closed_;
- cacheline_pad_t pad4_;
-
- mpmc_bounded_queue(mpmc_bounded_queue const&);
- void operator = (mpmc_bounded_queue const&);
-};
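
A sketch of the producer/consumer protocol for the bounded queue above; the capacity
must be a power of two, and a single producer is assumed here.

    mpmc_bounded_queue<int> queue(1024);

    void Producer() {
        for (int i = 0; i < 1000; ++i)
            while (!queue.enqueue(i))
                usleep(1);       // queue full: wait for a consumer to free a slot
        queue.close();           // signals consumers that no more items will arrive
    }

    void Consumer() {            // may run in several threads
        int value;
        while (queue.wait_dequeue(value)) {
            // ... process value; returns false once the queue is closed and drained
        }
    }
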
diff --git a/src/include/io/multifile_reader.hpp b/src/include/io/multifile_reader.hpp
deleted file mode 100644
index 7e37cc1..0000000
--- a/src/include/io/multifile_reader.hpp
+++ /dev/null
@@ -1,99 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include "read_stream_vector.hpp"
-#include <vector>
-
-namespace io {
-
-/**
- * MultifileStream is a stream that reads data from a number of underlying
- * streams given in the constructor.
- */
-template<typename ReadType>
-class MultifileStream: public ReadStream<ReadType> {
- typedef ReadStream<ReadType> StreamT;
- typedef std::shared_ptr<StreamT> ReadStreamPtrT;
-public:
- MultifileStream(const ReadStreamList<ReadType>& readers) :
- readers_(readers), current_reader_index_(0) {
- }
-
- MultifileStream(ReadStreamPtrT reader_1, ReadStreamPtrT reader_2) :
- current_reader_index_(0) {
- VERIFY(reader_1->is_open() && reader_2->is_open());
- readers_.push_back(reader_1);
- readers_.push_back(reader_2);
- }
-
- /* virtual */
- bool is_open() {
- return (readers_.size() > 0) && readers_[0].is_open();
- }
-
- /* virtual */
- bool eof() {
- while ((current_reader_index_ < readers_.size()) && readers_[current_reader_index_].eof()) {
- ++current_reader_index_;
- }
- return current_reader_index_ == readers_.size();
- }
-
- /* virtual */
- MultifileStream& operator>>(ReadType& read) {
- if (!eof()) {
- readers_[current_reader_index_] >> read;
- }
- return (*this);
- }
-
- /* virtual */
- void close() {
- readers_.close();
- }
-
- /* virtual */
- void reset() {
- readers_.reset();
- current_reader_index_ = 0;
- }
-
- /* virtual */
- ReadStreamStat get_stat() const {
- return readers_.get_stat();
- }
-
-private:
- ReadStreamList<ReadType> readers_;
- size_t current_reader_index_;
-};
-
-template<class ReadType>
-std::shared_ptr<ReadStream<ReadType>> MultifileWrap(std::shared_ptr<ReadStream<ReadType>> reader_1,
- std::shared_ptr<ReadStream<ReadType>> reader_2) {
- return std::make_shared<MultifileStream<ReadType>>(reader_1, reader_2);
-}
-
-template<class ReadType>
-std::shared_ptr<ReadStream<ReadType>> MultifileWrap(const ReadStreamList<ReadType>& readers) {
- return std::make_shared<MultifileStream<ReadType>>(readers);
-}
-
-template<class ReadType>
-ReadStreamList<ReadType> WrapPairsInMultifiles(ReadStreamList<ReadType> readers_1,
- ReadStreamList<ReadType> readers_2) {
- VERIFY(readers_1.size() == readers_2.size());
- ReadStreamList<ReadType> answer;
- for (size_t i = 0; i < readers_1.size(); ++i) {
- answer.push_back(MultifileWrap<ReadType>(readers_1.ptr_at(i), readers_2.ptr_at(i)));
- }
- return answer;
-}
-
-}
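
A sketch of concatenating two single-read streams with the removed helpers; left and
right are assumed to be open SingleStreamPtr instances (e.g. built with io::EasyStream).

    void ConcatExample(io::SingleStreamPtr left, io::SingleStreamPtr right) {
        // Reads come from 'left' first, then from 'right'.
        io::SingleStreamPtr merged = io::MultifileWrap<io::SingleRead>(left, right);
        io::SingleRead read;
        while (!merged->eof())
            *merged >> read;   // process each read
    }
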
diff --git a/src/include/io/orientation.hpp b/src/include/io/orientation.hpp
deleted file mode 100644
index 7a230e5..0000000
--- a/src/include/io/orientation.hpp
+++ /dev/null
@@ -1,93 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include "library.hpp"
-
-namespace io {
-
-template<typename ReadType>
-class OrientationChanger {
-
-public:
-
- virtual ReadType Perform(const ReadType& r) const = 0;
-
- virtual ~OrientationChanger() {
- }
-};
-
-template<typename ReadType>
-class IdeticalChanger : public OrientationChanger<ReadType> {
-
-public:
-
- virtual ReadType Perform(const ReadType& r) const {
- return r;
- }
-};
-
-template<typename ReadType>
-class ReverseSecondChanger : public OrientationChanger<ReadType> {
-
-public:
-
- virtual ReadType Perform(const ReadType& r) const {
- return ReadType(r.first(), !r.second(), r.insert_size());
- }
-};
-
-template<typename ReadType>
-class ReverseFirstChanger : public OrientationChanger<ReadType> {
-
-public:
-
- virtual ReadType Perform(const ReadType& r) const {
- return ReadType(!r.first(), r.second(), r.insert_size());
- }
-};
-
-template<typename ReadType>
-class ReverseChanger : public OrientationChanger<ReadType> {
-
-public:
-
- virtual ReadType Perform(const ReadType& r) const {
- return ReadType(!r.first(), !r.second(), r.insert_size());
- }
-};
-
-template<typename ReadType>
-std::unique_ptr<OrientationChanger<ReadType>> GetOrientationChanger(LibraryOrientation orientation) {
- OrientationChanger<ReadType> * result;
- switch (orientation) {
- case LibraryOrientation::FF: {
- result = new IdeticalChanger<ReadType>();
- break;
- }
- case LibraryOrientation::RR: {
- result = new ReverseChanger<ReadType>();
- break;
- }
- case LibraryOrientation::FR: {
- result = new ReverseSecondChanger<ReadType>();
- break;
- }
- case LibraryOrientation::RF: {
- result = new ReverseFirstChanger<ReadType>();
- break;
- }
- default: {
- result = new IdeticalChanger<ReadType>();
- break;
- }
- }
- return std::unique_ptr<OrientationChanger<ReadType>>(result);
-}
-
-}
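
A sketch of normalizing a paired read to a single orientation with the removed
changer; raw is a placeholder PairedRead coming from an FR library.

    void OrientExample(const io::PairedRead &raw) {
        std::unique_ptr<io::OrientationChanger<io::PairedRead>> changer =
            io::GetOrientationChanger<io::PairedRead>(io::LibraryOrientation::FR);
        io::PairedRead oriented = changer->Perform(raw);   // second mate reverse-complemented
        (void) oriented;
    }
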
diff --git a/src/include/io/osequencestream.hpp b/src/include/io/osequencestream.hpp
deleted file mode 100644
index 17d0e68..0000000
--- a/src/include/io/osequencestream.hpp
+++ /dev/null
@@ -1,367 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-/*
- * oreadstream.hpp
- *
- * Created on: 23.06.2011
- * Author: vyahhi
- */
-
-#pragma once
-
-#include <fstream>
-#include <string>
-#include <vector>
-#include "single_read.hpp"
-#include "paired_read.hpp"
-
-namespace io {
-
-inline std::string MakeContigId(int number, size_t length, const std::string& prefix = "NODE") {
- return prefix + "_" + ToString(number) + "_length_" + ToString(length);
-}
-
-inline std::string MakeContigId(int number, size_t length, double coverage, const std::string& prefix = "NODE") {
- return prefix + "_" + ToString(number) + "_length_" + ToString(length) + "_cov_" + ToString(coverage);
-}
-
-inline std::string MakeContigId(int number, size_t length, double coverage, size_t id, const std::string& prefix = "NODE") {
- return prefix + "_" + ToString(number) + "_length_" + ToString(length) + "_cov_" + ToString(coverage) + "_ID_" + ToString(id);
-}
-
-class osequencestream {
-protected:
- std::ofstream ofstream_;
-
- int id_;
-
- void write_str(const std::string& s) {
- size_t cur = 0;
- while (cur < s.size()) {
- ofstream_ << s.substr(cur, 60) << std::endl;
- cur += 60;
- }
- }
-
- virtual void write_header(const std::string& s) {
- // Velvet format: NODE_1_length_24705_cov_358.255249
- ofstream_ << ">" << MakeContigId(id_++, s.size()) << std::endl;
- }
-
-public:
- osequencestream(const std::string& filename): id_(1) {
- ofstream_.open(filename.c_str());
- }
-
- virtual ~osequencestream() {
- ofstream_.close();
- }
-
- virtual osequencestream& operator<<(const std::string& s) {
- write_header(s);
- write_str(s);
- return *this;
- }
-
- virtual osequencestream& operator<<(const Sequence& seq) {
- std::string s = seq.str();
- return operator <<(s);
- }
-
- /**
-     * Uses a different way of making headers.
-     * Does not increase the id counter; do not mix with the other output methods!
- */
- virtual osequencestream& operator<<(const SingleRead& read) {
- ofstream_ << ">" << read.name() << std::endl;
- size_t cur = 0;
- std::string s = read.GetSequenceString();
- while (cur < s.size()) {
- ofstream_ << s.substr(cur, 60) << std::endl;
- cur += 60;
- }
- return *this;
- }
-};
-
-class PairedOutputSequenceStream {
-protected:
- std::ofstream ofstreaml_;
- std::ofstream ofstreamr_;
-
- static void write(const SingleRead& read, std::ofstream& stream) {
- stream << ">" << read.name() << std::endl;
- size_t cur = 0;
- std::string s = read.GetSequenceString();
- while (cur < s.size()) {
- stream << s.substr(cur, 60) << std::endl;
- cur += 60;
- }
- }
-
-public:
- PairedOutputSequenceStream(const std::string& filename1, const std::string &filename2) {
- ofstreaml_.open(filename1);
- ofstreamr_.open(filename2);
- }
-
- virtual ~PairedOutputSequenceStream() {
- ofstreaml_.close();
- ofstreamr_.close();
- }
-
- PairedOutputSequenceStream& operator<<(const PairedRead& read) {
- write(read.first(), ofstreaml_);
- write(read.second(), ofstreamr_);
- return *this;
- }
-};
-
-
-class osequencestream_cov: public osequencestream {
-protected:
- double coverage_;
-
- virtual void write_header(const std::string& s) {
- // Velvet format: NODE_1_length_24705_cov_358.255249
- ofstream_ << ">" << MakeContigId(id_++, s.size(), coverage_) << std::endl;
- }
-
-
-public:
- osequencestream_cov(const std::string& filename)
- : osequencestream(filename), coverage_(0.) { }
-
- virtual ~osequencestream_cov() {
- ofstream_.close();
- }
-
- osequencestream_cov& operator<<(double coverage) {
- coverage_ = coverage;
- return *this;
- }
-
- osequencestream_cov& operator<<(const std::string& s) {
- write_header(s);
- write_str(s);
- return *this;
- }
-
- osequencestream_cov& operator<<(const Sequence& seq) {
- std::string s = seq.str();
- return operator <<(s);
- }
-
-};
-
-
-class osequencestream_simple: public osequencestream {
-protected:
- std::string header_;
-
- double cov_;
-
- virtual void write_header(const std::string& /*s*/) {
- ofstream_ << ">" << header_ << std::endl;
- }
-
-public:
- osequencestream_simple(const std::string& filename)
- : osequencestream(filename), header_("") { }
-
- virtual ~osequencestream_simple() {
- ofstream_.close();
- }
-
- void set_header(const std::string &header) {
- header_ = header;
- }
-
- osequencestream_simple& operator<<(const std::string& s) {
- write_header(s);
- write_str(s);
- return *this;
- }
-
- osequencestream_simple& operator<<(const Sequence& seq) {
- std::string s = seq.str();
- return operator <<(s);
- }
-
-};
-
-class osequencestream_with_id: public osequencestream {
-protected:
- size_t uid_;
-
- double cov_;
-
- virtual void write_header(const std::string& s) {
- ofstream_ << ">" << GetId(s) << std::endl;
- id_++;
- }
-
-public:
- osequencestream_with_id(const std::string& filename)
- : osequencestream(filename), uid_(0), cov_(0.0) { }
-
- virtual ~osequencestream_with_id() {
- ofstream_.close();
- }
-
- std::string GetId(const std::string& s) const {
- return MakeContigId(id_, s.size(), cov_, uid_);
- }
-
- void setCoverage(double c) {
- cov_ = c;
- }
-
- void setID(size_t uid) {
- uid_ = uid;
- }
-
- osequencestream_with_id& operator<<(const std::string& s) {
- write_header(s);
- write_str(s);
- return *this;
- }
-
- osequencestream_with_id& operator<<(double coverage) {
- cov_ = coverage;
- return *this;
- }
-
- osequencestream_with_id& operator<<(const Sequence& seq) {
- std::string s = seq.str();
- return operator <<(s);
- }
-
-};
-
-class osequencestream_with_manual_node_id: public osequencestream_with_id {
- bool is_id_set_;
- virtual void write_header(const std::string& s) {
-        //for manual NODE ID setting, osequencestream needs to check that the node ID has really been set manually
- if (!is_id_set_) {
- WARN ("NODE ID is not set manually, setting to 0");
- id_ = 0;
- }
- ofstream_ << ">" << MakeContigId(id_, s.size(), cov_, uid_) << std::endl;
- is_id_set_ = false;
- }
-
-public:
-//unfortunately, constructor inheritance is only supported since g++ 4.8
- osequencestream_with_manual_node_id(const std::string& filename): osequencestream_with_id(filename) {
- is_id_set_ = false;
- }
-
- void setNodeID(int id) {
- id_ = id;
- is_id_set_ = true;
- }
-
- osequencestream_with_manual_node_id& operator<<(const std::string& s) {
- write_header(s);
- write_str(s);
- return *this;
- }
-
- osequencestream_with_manual_node_id& operator<<(const Sequence& seq) {
- std::string s = seq.str();
- return operator <<(s);
- }
-
-
-};
-
-
-class osequencestream_with_data_for_scaffold: public osequencestream_with_id {
-protected:
- std::ofstream scstream_;
-
- virtual void write_header(const std::string& s) {
- scstream_ << id_ << "\tNODE_" << id_ << "\t" << s.size() << "\t" << (int) round(cov_) << std::endl;
- ofstream_ << ">" << MakeContigId(id_++, s.size(), cov_, uid_) << std::endl;
- }
-
-public:
- osequencestream_with_data_for_scaffold(const std::string& filename): osequencestream_with_id(filename) {
- id_ = 1;
- std::string sc_filename = filename + ".info";
- scstream_.open(sc_filename.c_str());
- }
-
- virtual ~osequencestream_with_data_for_scaffold() {
- ofstream_.close();
- scstream_.close();
- }
-
- osequencestream_with_data_for_scaffold& operator<<(const std::string& s) {
- write_header(s);
- write_str(s);
- return *this;
- }
-
- osequencestream_with_data_for_scaffold& operator<<(const Sequence& seq) {
- std::string s = seq.str();
- return operator <<(s);
- }
-};
-
-class osequencestream_for_fastg: public osequencestream_with_id {
-protected:
- std::string header_;
-
- virtual void write_header(const std::string& s) {
- ofstream_ << ">" << s;
- }
-
-public:
- osequencestream_for_fastg(const std::string& filename):
- osequencestream_with_id(filename) {
- id_ = 1;
- }
-
- virtual ~osequencestream_for_fastg() {
- ofstream_.close();
- }
-
- void set_header(const std::string& h) {
- header_= h;
- }
-
- osequencestream_for_fastg& operator<<(const std::set<std::string>& s) {
- write_header(header_);
- if (s.size() > 0) {
- auto iter = s.begin();
- ofstream_ << ":" << *iter;
- ++iter;
- while (iter != s.end()) {
- ofstream_ << "," << *iter;
- ++iter;
- }
- }
- ofstream_ << ";" << std::endl;
- return *this;
- }
-
- osequencestream_for_fastg& operator<<(const std::string& s) {
- write_str(s);
- return *this;
- }
-
- osequencestream_for_fastg& operator<<(const Sequence& seq) {
- std::string s = seq.str();
- return operator <<(s);
- }
-
-};
-
-}
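The writers above emit FASTA with Velvet-style NODE_<id>_length_<len>_cov_<cov> headers and 60-column line wrapping. A sketch of how the coverage-aware variant is driven (illustrative, not taken from the tree):

    #include "osequencestream.hpp"

    // Coverage is latched by operator<<(double) and used when the next header is written.
    void WriteContig(io::osequencestream_cov& out, const Sequence& contig, double coverage) {
        out << coverage;   // remember coverage for the following record
        out << contig;     // writes the header and the 60-column wrapped sequence
    }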
diff --git a/src/include/io/paired_read.hpp b/src/include/io/paired_read.hpp
deleted file mode 100644
index fd3fbd0..0000000
--- a/src/include/io/paired_read.hpp
+++ /dev/null
@@ -1,186 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include "single_read.hpp"
-
-#include <string>
-#include <utility>
-
-namespace io {
-
-
-class PairedRead {
- public:
- typedef SingleRead SingleReadT;
- typedef int16_t size_type;
-
- PairedRead() : first_(), second_(), insert_size_(0) {}
-
- PairedRead(const SingleRead& first,
- const SingleRead& second,
- size_t insert_size)
- : first_(first), second_(second), insert_size_(insert_size) {}
-
- const SingleRead& first() const {
- return first_;
- }
-
- const SingleRead& second() const {
- return second_;
- }
-
- size_t insert_size() const {
- return insert_size_;
- }
-
- size_t distance() const {
- return insert_size_ - second_.size();
- }
-
- size_t gap() const {
- return insert_size_ - first_.size() - second_.size();
- }
-
- size_t size() const {
- return std::max(first_.size(), second_.size());
- }
-
- size_t nucl_count() const {
- return first_.size() + second_.size();
- }
-
- bool IsValid() const {
- return first_.IsValid() && second_.IsValid();
- }
-
- const SingleRead& operator[] (size_t i) const {
- if (i == 0) {
- return first_;
- } else if (i == 1) {
- return second_;
- }
- VERIFY(false);
- return first_;
- }
-
- const PairedRead operator!() const {
- return PairedRead(!second_, !first_, insert_size_);
- }
-
- bool operator==(const PairedRead& pairedread) const {
- return first_ == pairedread.first_ &&
- second_ == pairedread.second_ &&
- insert_size_ == pairedread.insert_size_;
- }
-
- bool BinWrite(std::ostream& file, bool rc1 = false, bool rc2 = false) const {
- first_.BinWrite(file, rc1);
- second_.BinWrite(file, rc2);
-
- return !file.fail();
- }
-
- void print_size() const {
- first_.print_size();
- second_.print_size();
- }
-
- private:
- SingleRead first_;
- SingleRead second_;
- size_t insert_size_;
-
-};
-
-inline std::ostream& operator<<(std::ostream& os, const PairedRead& read) {
-    os << "Paired read first=" << read.first() << " second=" << read.second() << std::endl;
- return os;
-}
-
-class PairedReadSeq {
- public:
- typedef SingleReadSeq SingleReadT;
- private:
- SingleReadSeq first_;
- SingleReadSeq second_;
- size_t insert_size_;
-
- public:
- PairedReadSeq() : first_(), second_(), insert_size_(0) {}
-
- bool BinRead(std::istream& file, size_t is = 0) {
- first_.BinRead(file);
- second_.BinRead(file);
-
- insert_size_ = is - (size_t) first_.GetLeftOffset() - (size_t) second_.GetRightOffset();
- return !file.fail();
- }
-
- bool BinWrite(std::ostream& file, bool rc1 = false, bool rc2 = false) const {
- first_.BinWrite(file, rc1);
- second_.BinWrite(file, rc2);
-
- return !file.fail();
- }
-
- const SingleReadSeq& first() const {
- return first_;
- }
-
- const SingleReadSeq& second() const {
- return second_;
- }
-
- size_t insert_size() const {
- return insert_size_;
- }
-
- size_t distance() const {
- return insert_size_ - second_.size();
- }
-
- size_t gap() const {
- return insert_size_ - first_.size() - second_.size();
- }
-
- size_t size() const {
- return std::max(first_.size(), second_.size());
- }
-
- size_t nucl_count() const {
- return first_.size() + second_.size();
- }
-
- PairedReadSeq(const SingleReadSeq& first,
- const SingleReadSeq& second,
- size_t insert_size)
- : first_(first), second_(second), insert_size_(insert_size) {}
-
- const SingleReadSeq& operator[] (size_t i) const {
- if (i == 0) {
- return first_;
- } else if (i == 1) {
- return second_;
- }
- VERIFY(false);
- return first_;
- }
-
- const PairedReadSeq operator!() const {
- return PairedReadSeq(!second_, !first_, insert_size_);
- }
-
-};
-
-inline std::ostream& operator<<(std::ostream& os, const PairedReadSeq& read) {
- os << "Paired read first=" << read.first() << " second=" << read.second() << std::endl;
- return os;
-}
-
-}
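PairedRead keeps the two mates together with the library insert size; distance() and gap() are derived from it. A hypothetical helper, shown only to illustrate the accessors:

    #include "paired_read.hpp"

    // gap() is insert_size - |first| - |second|; since it returns size_t,
    // overlapping mates wrap around and must be checked by the caller.
    size_t InnerGap(const io::SingleRead& left, const io::SingleRead& right, size_t insert_size) {
        io::PairedRead pair(left, right, insert_size);
        return pair.gap();
    }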
diff --git a/src/include/io/paired_readers.hpp b/src/include/io/paired_readers.hpp
deleted file mode 100644
index 6d6d730..0000000
--- a/src/include/io/paired_readers.hpp
+++ /dev/null
@@ -1,251 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include <string>
-#include "ireader.hpp"
-#include "paired_read.hpp"
-#include "orientation.hpp"
-
-namespace io {
-
-class SeparatePairedReadStream : public ReadStream<PairedRead> {
- public:
- /*
-   * Constructor.
-   *
-   * @param filename1 Name of the file with the left (first) reads.
-   * @param filename2 Name of the file with the right (second) reads.
-   * @param insert_size Expected insert size of the library.
-   * @param offset_type Quality offset type of the reads.
- */
- explicit SeparatePairedReadStream(const std::string& filename1, const std::string& filename2,
- size_t insert_size, bool change_order = false,
- bool use_orientation = true, LibraryOrientation orientation = LibraryOrientation::FR,
- OffsetType offset_type = PhredOffset)
- : insert_size_(insert_size),
- change_order_(change_order),
- use_orientation_(use_orientation),
- changer_(GetOrientationChanger<PairedRead>(orientation)),
- offset_type_(offset_type),
- first_(new FileReadStream(filename1, offset_type_)),
- second_(new FileReadStream(filename2, offset_type_)),
- filename1_(filename1),
- filename2_(filename2){}
-
- /*
- * Check whether the stream is opened.
- *
-   * @return true if the stream is opened and false otherwise.
- */
- /* virtual */ bool is_open() {
- return first_->is_open() && second_->is_open();
- }
-
- /*
- * Check whether we've reached the end of stream.
- *
- * @return true if the end of stream is reached and false
- * otherwise.
- */
- /* virtual */ bool eof() {
-
- if (first_->eof() != second_->eof()) {
- if (first_->eof()) {
- ERROR("The number of right read-pairs is larger than the number of left read-pairs");
- } else {
- ERROR("The number of left read-pairs is larger than the number of right read-pairs");
- }
- FATAL_ERROR("Unequal number of read-pairs detected in the following files: " << filename1_ << " " << filename2_ << "");
- }
- return first_->eof();
- }
-
- /*
- * Read PairedRead from stream.
- *
- * @param pairedread The PairedRead that will store read data.
- *
- * @return Reference to this stream.
- */
- /* virtual */ SeparatePairedReadStream& operator>>(PairedRead& pairedread) {
- SingleRead sr1, sr2;
- (*first_) >> sr1;
- (*second_) >> sr2;
-
- if (use_orientation_) {
- pairedread = changer_->Perform(PairedRead(sr1, sr2, insert_size_));
- }
- else {
- pairedread = PairedRead(sr1, sr2, insert_size_);
- }
-
- if (change_order_) {
- pairedread = PairedRead(pairedread.second(), pairedread.first(), insert_size_);
- }
-
- return *this;
- }
-
- /*
- * Close the stream.
- */
- /* virtual */ void close() {
- first_->close();
- second_->close();
- }
-
- /*
- * Close the stream and open it again.
- */
- /* virtual */ void reset() {
- first_->reset();
- second_->reset();
- }
-
- ReadStreamStat get_stat() const {
- return ReadStreamStat();
- }
-
- private:
-
- size_t insert_size_;
-
- bool change_order_;
-
- bool use_orientation_;
-
- std::unique_ptr<OrientationChanger<PairedRead>> changer_;
-
- /*
- * @variable Quality offset type.
- */
- OffsetType offset_type_;
-
- /*
- * @variable The first stream (reads from first file).
- */
- std::unique_ptr<ReadStream<SingleRead>> first_;
- /*
- * @variable The second stream (reads from second file).
- */
- std::unique_ptr<ReadStream<SingleRead>> second_;
-
- //Only for providing information about error for users
- std::string filename1_;
- std::string filename2_;
-};
-
-class InterleavingPairedReadStream : public ReadStream<PairedRead> {
- public:
- /*
-   * Constructor.
-   *
-   * @param filename Name of the single interleaved reads file.
-   * @param insert_size Expected insert size of the library.
-   * @param offset_type Quality offset type of the reads.
- */
- explicit InterleavingPairedReadStream(const std::string& filename, size_t insert_size, bool change_order = false,
- bool use_orientation = true, LibraryOrientation orientation = LibraryOrientation::FR,
- OffsetType offset_type = PhredOffset)
- : filename_(filename), insert_size_(insert_size),
- change_order_(change_order),
- use_orientation_(use_orientation),
- changer_(GetOrientationChanger<PairedRead>(orientation)),
- offset_type_(offset_type),
- single_(new FileReadStream(filename_, offset_type_)) {}
-
- /*
- * Check whether the stream is opened.
- *
-   * @return true if the stream is opened and false otherwise.
- */
- /* virtual */ bool is_open() {
- return single_->is_open();
- }
-
- /*
- * Check whether we've reached the end of stream.
- *
- * @return true if the end of stream is reached and false
- * otherwise.
- */
- /* virtual */ bool eof() {
- return single_->eof();
- }
-
- /*
- * Read PairedRead from stream.
- *
- * @param pairedread The PairedRead that will store read data.
- *
- * @return Reference to this stream.
- */
- /* virtual */ InterleavingPairedReadStream& operator>>(PairedRead& pairedread) {
- SingleRead sr1, sr2;
- (*single_) >> sr1;
- (*single_) >> sr2;
-
- if (use_orientation_) {
- pairedread = changer_->Perform(PairedRead(sr1, sr2, insert_size_));
- }
- else {
- pairedread = PairedRead(sr1, sr2, insert_size_);
- }
-
- if (change_order_) {
- pairedread = PairedRead(pairedread.second(), pairedread.first(), insert_size_);
- }
-
- return *this;
- }
-
- /*
- * Close the stream.
- */
- /* virtual */ void close() {
- single_->close();
- }
-
- /*
- * Close the stream and open it again.
- */
- /* virtual */ void reset() {
- single_->reset();
- }
-
- ReadStreamStat get_stat() const {
- return ReadStreamStat();
- }
-
- private:
- /*
-   * @variable The name of the file which the stream reads from.
- */
- std::string filename_;
-
- size_t insert_size_;
-
- bool change_order_;
-
- bool use_orientation_;
-
- std::unique_ptr<OrientationChanger<PairedRead>> changer_;
-
- /*
- * @variable Quality offset type.
- */
- OffsetType offset_type_;
-
- /*
- * @variable The single read stream.
- */
- std::unique_ptr<ReadStream<SingleRead>> single_;
-
-};
-}
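SeparatePairedReadStream pairs the i-th reads of the two files and raises a fatal error if the files run out of sync. A typical read loop might look like the following sketch (CountPairs is a hypothetical helper):

    #include "paired_read.hpp"
    #include "paired_readers.hpp"

    // Count the read pairs of a split FR library stored as left/right files.
    size_t CountPairs(const std::string& left, const std::string& right, size_t insert_size) {
        io::SeparatePairedReadStream stream(left, right, insert_size);
        io::PairedRead pair;
        size_t n = 0;
        while (!stream.eof()) {
            stream >> pair;
            ++n;
        }
        return n;
    }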
diff --git a/src/include/io/parser.hpp b/src/include/io/parser.hpp
deleted file mode 100644
index 865f681..0000000
--- a/src/include/io/parser.hpp
+++ /dev/null
@@ -1,145 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-/**
- * @file parser.hpp
- * @author Mariya Fomkina
- * @version 1.0
- *
- * @section LICENSE
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation; either version 2 of
- * the License, or (at your option) any later version.
- *
- * @section DESCRIPTION
- *
- * Parser is the parent class for all streams that read data from
- * different file types (fastq, fasta, sam etc).
- */
-
-#ifndef COMMON_IO_PARSER_HPP
-#define COMMON_IO_PARSER_HPP
-
-#include <string>
-#include "io/single_read.hpp"
-
-namespace io {
-
-class Parser {
- public:
- /*
- * Default constructor.
- *
- * @param filename The name of the file to be opened.
- * @param offset The offset of the read quality.
- */
- Parser(const std::string& filename,
- OffsetType offset_type = PhredOffset)
- : filename_(filename), offset_type_(offset_type),
- is_open_(false), eof_(true) {}
-
- /*
- * Default destructor.
- */
- virtual ~Parser() {}
-
- /*
- * Check whether the stream is opened.
- *
-   * @return true if the stream is opened and false otherwise.
- */
- virtual bool is_open() const {
- return is_open_;
- }
-
- /*
- * Check whether we've reached the end of stream.
- *
- * @return true if the end of stream is reached and false
- * otherwise.
- */
- virtual bool eof() const {
- return eof_;
- }
-
- /*
- * Read SingleRead from stream.
- *
- * @param read The SingleRead that will store read data.
- *
- * @return Reference to this stream.
- */
- virtual Parser& operator>>(SingleRead& read) = 0;
-
- /*
- * Close the stream.
- */
- virtual void close() = 0;
-
- /*
- * Close the stream and open it again.
- */
- void reset() {
- close();
- open();
- }
-
- protected:
- /*
-   * @variable The name of the file which the stream reads from.
- */
- std::string filename_;
- /*
- * @variable Quality offset type.
- */
- OffsetType offset_type_;
- /*
- * @variable Flag that shows whether the stream is opened.
- */
- bool is_open_;
- /*
- * @variable Flag that shows whether the end of the stream is
- * reached.
- */
- bool eof_;
-
- private:
- /*
- * Open a stream.
- */
- virtual void open() = 0;
-};
-
-/*
- * Get extension from filename.
- *
- * @param filename The name of the file to read from.
- *
- * @return File extension (e.g. "fastq", "fastq.gz").
- */
-std::string GetExtension(const std::string& filename);
-
-/*
- * Select parser type according to file extension.
- *
- * @param filename The name of the file to be opened.
- * @param offset The offset of the read quality.
-
- * @return Pointer to a new parser object constructed for the given
- * filename and offset.
- */
-Parser* SelectParser(const std::string& filename,
- OffsetType offset_type = PhredOffset);
-
-//todo delete???
-void first_fun(int);
-
-}
-
-#endif /* COMMON_IO_PARSER_HPP */
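SelectParser picks the concrete parser from the file extension (fasta/fastq, possibly gzipped). The intended call pattern, sketched with a hypothetical CountReads helper:

    #include "io/parser.hpp"
    #include <memory>

    // Count the reads in a FASTA/FASTQ(.gz) file, letting SelectParser choose the format.
    size_t CountReads(const std::string& filename) {
        std::unique_ptr<io::Parser> parser(io::SelectParser(filename));
        if (!parser || !parser->is_open())
            return 0;
        io::SingleRead read;
        size_t n = 0;
        while (!parser->eof()) {
            (*parser) >> read;
            ++n;
        }
        parser->close();
        return n;
    }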
diff --git a/src/include/io/rc_reader_wrapper.hpp b/src/include/io/rc_reader_wrapper.hpp
deleted file mode 100644
index 5a8448d..0000000
--- a/src/include/io/rc_reader_wrapper.hpp
+++ /dev/null
@@ -1,137 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include <boost/noncopyable.hpp>
-
-#include "read_stream_vector.hpp"
-#include "delegating_reader_wrapper.hpp"
-#include "orientation.hpp"
-
-namespace io {
-
-/**
- * RCWrapper is a wrapper class that yields each read from the given
- * reader followed by its reverse complement (one by one).
- */
-template<typename ReadType>
-class RCWrapper: public DelegatingWrapper<ReadType> {
- typedef DelegatingWrapper<ReadType> base;
-public:
- explicit RCWrapper(typename base::ReadStreamPtrT reader) :
- base(reader), rc_read_(), was_rc_(true) {
- }
-
- /* virtual */
- bool eof() {
- return was_rc_ && base::eof();
- }
-
- /* virtual */
- RCWrapper& operator>>(ReadType& read) {
- if (was_rc_) {
- base::operator >>(read);
- rc_read_ = read;
- } else {
- read = !rc_read_;
- }
- was_rc_ = !was_rc_;
- return (*this);
- }
-
- /* virtual */
- void reset() {
- was_rc_ = true;
- base::reset();
- }
-
- /* virtual */
- ReadStreamStat get_stat() const {
- ReadStreamStat stat = base::get_stat();
- stat.merge(stat);
- return stat;
- }
-
-private:
- ReadType rc_read_;
- bool was_rc_;
-};
-
-template<class ReadType>
-std::shared_ptr<ReadStream<ReadType>> RCWrap(std::shared_ptr<ReadStream<ReadType>> reader_ptr) {
- return std::make_shared<RCWrapper<ReadType>>(reader_ptr);
-}
-
-template<class ReadType>
-ReadStreamList<ReadType> RCWrap(ReadStreamList<ReadType>& readers) {
- ReadStreamList<ReadType> answer;
- for (size_t i = 0; i < readers.size(); ++i) {
- answer.push_back(RCWrap<ReadType>(readers.ptr_at(i)));
- }
- return answer;
-}
-
-template<typename ReadType>
-class OrientationChangingWrapper: public DelegatingWrapper<ReadType> {
- typedef DelegatingWrapper<ReadType> base;
- typedef std::unique_ptr<OrientationChanger<ReadType>> ChangerPtrT;
-public:
-
- OrientationChangingWrapper(typename base::ReaderStreamPtrT reader,
- LibraryOrientation orientation) :
- base(reader), changer_(GetOrientationChanger<ReadType>(orientation)) {
- }
-
- /*virtual*/
- OrientationChangingWrapper& operator>>(ReadType& read) {
- base::operator >>(read);
- read = changer_->Perform(read);
- return (*this);
- }
-
-private:
- ChangerPtrT changer_;
- bool delete_reader_;
-};
-
-template<typename ReadType>
-class RCRemovingWrapper: public DelegatingWrapper<ReadType> {
- typedef DelegatingWrapper<ReadType> base;
-public:
-
- explicit RCRemovingWrapper(typename base::ReadStreamPtrT reader) : base(reader) {
- }
-
- /*virtual*/
- RCRemovingWrapper& operator>>(ReadType& read) {
- base::operator>>(read);
-
- VERIFY(!this->eof());
- ReadType skip;
- base::operator>>(skip);
-
- return *this;
- }
-
-};
-
-template<class ReadType>
-std::shared_ptr<ReadStream<ReadType>> UnRCWrap(std::shared_ptr<ReadStream<ReadType>> reader_ptr) {
- return std::make_shared<RCRemovingWrapper<ReadType>>(reader_ptr);
-}
-
-template<class ReadType>
-ReadStreamList<ReadType> UnRCWrap(ReadStreamList<ReadType>& readers) {
- ReadStreamList<ReadType> answer;
- for (size_t i = 0; i < readers.size(); ++i) {
- answer.push_back(UnRCWrap<ReadType>(readers.ptr_at(i)));
- }
- return answer;
-}
-
-}
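RCWrap doubles a stream: every read is followed by its reverse complement, so downstream consumers see both strands. A one-line wrapper over a whole stream list (illustrative only):

    #include "rc_reader_wrapper.hpp"

    // Each stream in the list now yields twice the reads: original, then reverse complement.
    io::ReadStreamList<io::SingleRead> WithRC(io::ReadStreamList<io::SingleRead>& streams) {
        return io::RCWrap<io::SingleRead>(streams);
    }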
diff --git a/src/include/io/read.hpp b/src/include/io/read.hpp
deleted file mode 100644
index a7a0dbe..0000000
--- a/src/include/io/read.hpp
+++ /dev/null
@@ -1,231 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-/*
- * read.hpp
- *
- * Created on: 29.03.2011
- * Author: vyahhi
- */
-
-#ifndef READ_HPP_
-#define READ_HPP_
-
-#include <string>
-#include <iostream>
-#include <fstream>
-#include "verify.hpp"
-#include "sequence/quality.hpp"
-#include "sequence/sequence.hpp"
-#include "sequence/nucl.hpp"
-#include "sequence/sequence_tools.hpp"
-#include "simple_tools.hpp"
-
-//fixme deprecated!!! used in hammer!
-class Read {
-public:
- static const int PHRED_OFFSET = 33;
-
- bool isValid() const {
- return valid_;
- }
-
- Sequence getSequence() const {
- VERIFY(valid_);
- return Sequence(seq_);
- }
-
- Sequence getSubSequence(size_t start, size_t length) const __attribute__ ((deprecated)) {
- VERIFY(length > 0 && start + length <= seq_.size());
- return Sequence(seq_.substr(start, length));
- }
-
- Quality getQuality() const {
- VERIFY(valid_);
- return Quality(qual_);
- }
-
- const std::string& getSequenceString() const {
- return seq_;
- }
-
- const std::string& getQualityString() const {
- return qual_;
- }
-
- std::string getPhredQualityString(int offset = PHRED_OFFSET) const {
- std::string res = qual_;
- for (size_t i = 0; i < res.size(); ++i) {
- res[i] = (char)(res[i] + offset);
- }
- return res;
- }
-
- const std::string& getName() const {
- return name_;
- }
-
- size_t size() const {
- return seq_.size();
- }
-
- char operator[](size_t i) const {
- VERIFY(is_nucl(seq_[i]));
- return dignucl(seq_[i]);
- }
-
- /**
- * trim read
- * @param ltrim first good base
- * @param rtrim last good base
- * @return whether there is anything left
- */
- bool trimLeftRight(int ltrim, int rtrim) {
- if (ltrim >= (int)seq_.size() || rtrim < 0 || rtrim < ltrim ) {
- seq_ = ""; qual_ = ""; valid_ = false; return 0;
- }
- bool donesomething = false;
- if (ltrim > 0) {
- ltrim_ += ltrim;
- seq_.erase(0, ltrim);
- qual_.erase(0, ltrim);
- donesomething = true;
- }
- if (rtrim-ltrim+1 < (int)seq_.size() && rtrim < (int)seq_.size()-ltrim-1) {
- rtrim_ -= ((int)seq_.size()-(rtrim-ltrim+1));
- seq_.erase(rtrim-ltrim+1, std::string::npos);
- qual_.erase(rtrim-ltrim+1, std::string::npos);
- donesomething = true;
- }
- if (donesomething) valid_ = updateValid();
- return true;
- }
-
- size_t trimNsAndBadQuality(int threshold) {
- int start = 0;
- for (; start < (int)seq_.size(); ++start) {
- if (seq_[start] != 'N' && (int)qual_[start] > threshold) break;
- }
- int end = 0;
- for (end = (int)seq_.size()-1; end > -1; --end) {
- if (seq_[end] != 'N' && (int)qual_[end] > threshold) break;
- }
- if (!trimLeftRight(start, end)) return 0;
- else return seq_.size();
- }
-
- /**
- * @param k k as in k-mer
- * @param start start point
- * @return the first starting point of a valid k-mer >=start; return -1 if no such place exists
- */
- size_t firstValidKmer(size_t start, size_t k) const __attribute__ ((deprecated)) {
- size_t curHypothesis = start;
- size_t i = start;
- for (; i < seq_.size(); ++i) {
- if (i >= k + curHypothesis)
- return curHypothesis;
- if (!is_nucl(seq_[i])) {
- curHypothesis = i + 1;
- }
- }
- if (i >= k + curHypothesis) {
- return curHypothesis;
- }
- return -1ULL;
- }
-
- void setSequence(const char* s, bool preserve_trimming = false) {
- seq_ = s;
- if (!preserve_trimming) {
- ltrim_ = 0; rtrim_ = initial_size_ = (int)seq_.size();
- }
- valid_ = updateValid();
- }
- void setQuality(const char* s, int offset = PHRED_OFFSET) {
- qual_ = s;
- for (size_t i = 0; i < qual_.size(); ++i) {
- qual_[i] = (char)(qual_[i] - offset);
- }
- }
- void setName(const char* s) {
- name_ = s;
- }
-
- Read()
- : valid_(false), ltrim_(0), rtrim_(0), initial_size_(0) {
- ;
- }
-
- Read(const std::string &name, const std::string &seq, const std::string &qual) :
- name_(name), seq_(seq), qual_(qual) { // for test only!
- ltrim_ = 0; initial_size_ = rtrim_ = (int)seq_.size();
- valid_ = updateValid();
- }
-
- int ltrim() const { return ltrim_; }
- void set_ltrim(unsigned val) { ltrim_ = val; };
- int rtrim() const { return rtrim_; }
- int initial_size() const { return initial_size_; }
-
-private:
- std::string name_;
- std::string seq_;
- std::string qual_;
- bool valid_;
- int ltrim_;
- int rtrim_;
- int initial_size_;
- friend class ireadstream;
- friend uint32_t TrimBadQuality(Read*, int);
- bool updateValid() const {
- if (seq_.size() == 0) {
- return false;
- }
- for (size_t i = 0; i < seq_.size(); ++i) {
- if (!is_nucl(seq_[i])) {
- return false;
- }
- }
- return true;
- }
-
-public:
- Read operator!() const {
- std::string newName;
- if (name_ == "" || name_[0] != '!') {
- newName = '!' + name_;
- } else {
- newName = name_.substr(1, name_.length());
- }
- return Read(newName, ReverseComplement(seq_), Reverse(qual_));
- }
-
- void print(std::ostream & outf, int offset) const {
- outf << "@" << name_.c_str() << "\n";
- for (int i=0; i < ltrim_; ++i) outf << "N";
- outf << seq_.c_str();
- for (int i=0; i < initial_size_ - rtrim_; ++i) outf << "N";
- outf << "\n" << "+" << name_.c_str();
- if (ltrim_ > 0) outf << " ltrim=" << ltrim_;
- if (rtrim_ < initial_size_)
- outf << " rtrim=" << (initial_size_ - rtrim_);
- outf << "\n";
- char badq = (char)( offset + 2 );
- for (int i=0; i < ltrim_; ++i) outf << badq;
- outf << getPhredQualityString( offset ).c_str();
- for (int i=0; i < initial_size_ - rtrim_; ++i) outf << badq;
- outf << "\n";
- }
-};
-
-// todo: put this to *.cpp
-//ostream& operator<<(ostream& os, const Read& read) {
-// return os << read.getSequenceString();
-//}
-
-#endif /* READ_HPP_ */
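The legacy Read class (kept for hammer, as its own comment notes) decodes qualities on assignment and tracks trimming offsets. A hypothetical snippet showing the setter and validity API:

    #include "read.hpp"

    // Qualities are stored with the Phred offset already subtracted; reads with
    // non-ACGT symbols are flagged invalid.
    bool IsUsable(const std::string& name, const std::string& seq, const std::string& phred33_qual) {
        Read r;
        r.setName(name.c_str());
        r.setSequence(seq.c_str());
        r.setQuality(phred33_qual.c_str());  // subtracts PHRED_OFFSET in place
        return r.isValid();
    }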
diff --git a/src/include/io/read_processor.hpp b/src/include/io/read_processor.hpp
deleted file mode 100644
index f397610..0000000
--- a/src/include/io/read_processor.hpp
+++ /dev/null
@@ -1,200 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#ifndef __HAMMER_READ_PROCESSOR_HPP__
-#define __HAMMER_READ_PROCESSOR_HPP__
-
-#include "io/mpmc_bounded.hpp"
-
-#include "openmp_wrapper.h"
-
-#pragma GCC diagnostic push
-#ifdef __clang__
-#pragma clang diagnostic ignored "-Wunused-private-field"
-#endif
-namespace hammer {
-class ReadProcessor {
- static size_t const cacheline_size = 64;
- typedef char cacheline_pad_t [cacheline_size];
-
- unsigned nthreads_;
- cacheline_pad_t pad0;
- size_t read_;
- cacheline_pad_t pad1;
- size_t processed_;
- cacheline_pad_t pad2;
-
-private:
- template<class Reader, class Op>
- bool RunSingle(Reader &irs, Op &op) {
- while (!irs.eof()) {
- typename Reader::ReadT r;
- irs >> r;
- read_ += 1;
-
- processed_ += 1;
- if (op(r))
- return true;
- }
-
- return false;
- }
-
- template<class Reader, class Op, class Writer>
- void RunSingle(Reader &irs, Op &op, Writer &writer) {
- while (!irs.eof()) {
- typename Reader::ReadT r;
- irs >> r;
- read_ += 1;
-
- auto res = op(r);
- processed_ += 1;
-
- if (res)
- writer << *res;
- }
- }
-
-public:
- ReadProcessor(unsigned nthreads)
- : nthreads_(nthreads), read_(0), processed_(0) {}
-
- size_t read() const { return read_; }
- size_t processed() const { return processed_; }
-
- template<class Reader, class Op>
- bool Run(Reader &irs, Op &op) {
- if (nthreads_ < 2)
- return RunSingle(irs, op);
-
- // Round nthreads to next power of two
- unsigned bufsize = nthreads_ - 1;
- bufsize = (bufsize >> 1) | bufsize;
- bufsize = (bufsize >> 2) | bufsize;
- bufsize = (bufsize >> 4) | bufsize;
- bufsize = (bufsize >> 8) | bufsize;
- bufsize = (bufsize >> 16) | bufsize;
- bufsize += 1;
-
- mpmc_bounded_queue<typename Reader::ReadT> in_queue(2*bufsize);
-
- bool stop = false;
-# pragma omp parallel shared(in_queue, irs, op, stop) num_threads(nthreads_)
- {
-# pragma omp master
- {
- while (!irs.eof()) {
- typename Reader::ReadT r;
- irs >> r;
-# pragma omp atomic
- read_ += 1;
-
- while (!in_queue.enqueue(r))
- sched_yield();
-
-# pragma omp flush (stop)
- if (stop)
- break;
- }
-
- in_queue.close();
- }
-
- while (1) {
- typename Reader::ReadT r;
-
- if (!in_queue.wait_dequeue(r))
- break;
-
-# pragma omp atomic
- processed_ += 1;
-
- bool res = op(r);
- if (res) {
-# pragma omp atomic
- stop |= res;
- }
- }
- }
-
-# pragma omp flush(stop)
- return stop;
- }
-
- template<class Reader, class Op, class Writer>
- void Run(Reader &irs, Op &op, Writer &writer) {
- if (nthreads_ < 2) {
- RunSingle(irs, op, writer);
- return;
- }
-
- // Round nthreads to next power of two
- unsigned bufsize = nthreads_ - 1;
- bufsize = (bufsize >> 1) | bufsize;
- bufsize = (bufsize >> 2) | bufsize;
- bufsize = (bufsize >> 4) | bufsize;
- bufsize = (bufsize >> 8) | bufsize;
- bufsize = (bufsize >> 16) | bufsize;
- bufsize += 1;
-
- mpmc_bounded_queue<typename Reader::ReadT> in_queue(bufsize), out_queue(2*bufsize);
-# pragma omp parallel shared(in_queue, out_queue, irs, op, writer) num_threads(nthreads_)
- {
-# pragma omp master
- {
- while (!irs.eof()) {
- typename Reader::ReadT r;
- irs >> r;
-
- // First, try to provide read to the queue. If it's full, never mind.
- bool status = in_queue.enqueue(r);
-
- // Flush down the output queue
- typename Reader::ReadT outr;
- while (out_queue.dequeue(outr))
- writer << outr;
-
- // If the input queue was originally full, wait until we can insert
- // the read once again.
- if (!status)
- while (!in_queue.enqueue(r))
- sched_yield();
- }
-
- in_queue.close();
-
- // Flush down the output queue while in master threads.
- typename Reader::ReadT outr;
- while (out_queue.dequeue(outr))
- writer << outr;
- }
-
- while (1) {
- typename Reader::ReadT r;
-
- if (!in_queue.wait_dequeue(r))
- break;
-
- auto res = op(r);
- if (res)
- while (!out_queue.enqueue(*res))
- sched_yield();
- }
- }
-
- // Flush down the output queue
- typename Reader::ReadT outr;
- while (out_queue.dequeue(outr))
- writer << outr;
- }
-};
-
-#pragma GCC diagnostic pop
-
-}
-
-#endif // __HAMMER_READ_PROCESSOR_HPP__
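hammer::ReadProcessor reads on the master thread, hands reads to workers through a bounded MPMC queue and applies a user functor; a functor returning true stops the whole run. A sketch with a hypothetical counting functor (it must be thread safe, since it is invoked concurrently):

    #include "io/read_processor.hpp"
    #include <atomic>

    template<class Reader>
    size_t CountShortReads(Reader& reader, unsigned nthreads, size_t min_len) {
        std::atomic<size_t> short_reads(0);
        auto op = [&](const typename Reader::ReadT& r) -> bool {
            if (r.size() < min_len)
                ++short_reads;
            return false;              // never request early termination
        };
        hammer::ReadProcessor rp(nthreads);
        rp.Run(reader, op);            // op runs concurrently on nthreads threads
        return short_reads;
    }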
diff --git a/src/include/io/read_stream_vector.hpp b/src/include/io/read_stream_vector.hpp
deleted file mode 100644
index 5639a4c..0000000
--- a/src/include/io/read_stream_vector.hpp
+++ /dev/null
@@ -1,182 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-#include "ireader.hpp"
-#include <vector>
-
-namespace io {
-//todo rename file
-
-//todo check destroy_readers logic and usages
-template <class ReadType>
-class ReadStreamList {
-public:
- typedef ReadType ReadT;
- typedef ReadStream<ReadType> ReaderT;
- typedef std::shared_ptr<ReaderT> ReaderPtrT;
-
- private:
- std::vector<ReaderPtrT> readers_;
-
- public:
-
- explicit ReadStreamList(const std::vector<ReaderPtrT>& readers): readers_(readers) {
- }
-
- ReadStreamList() {
- }
-
- explicit ReadStreamList(ReaderT* reader_ptr): readers_(1, ReaderPtrT(reader_ptr)) {
- }
-
- explicit ReadStreamList(ReaderPtrT reader_ptr): readers_(1, reader_ptr) {
- }
-
- explicit ReadStreamList(size_t size): readers_(size) {
- }
-
-// std::vector<Reader*>& get() {
-// destroy_readers_ = false;
-// return streams_;
-// }
-
- //todo use boost iterator facade
- class iterator: public std::iterator<std::input_iterator_tag, ReaderT> {
- typedef typename std::vector<ReaderPtrT>::iterator vec_it;
- vec_it it_;
- public:
-
- iterator(vec_it it) : it_(it) {
- }
-
- void operator++ () {
- ++it_;
- }
-
- bool operator== (const iterator& that) {
- return it_ == that.it_;
- }
-
- bool operator!= (const iterator& that) {
- return it_ != that.it_;
- }
-
- ReaderT& operator*() {
- return *(*it_);
- }
- };
-
-// class const_iterator: public std::iterator<std::input_iterator_tag, Reader> {
-// typedef typename std::vector<Reader*>::iterator vec_it;
-// vec_it it_;
-// public:
-//
-// const_iterator(vec_it it) : it_(it) {
-// }
-//
-// void operator++ () {
-// ++it_;
-// }
-//
-// bool operator== (const const_iterator& that) {
-// return it_ == that.it_;
-// }
-//
-// bool operator!= (const const_iterator& that) {
-// return it_ != that.it_;
-// }
-//
-// ReaderT& operator*() {
-// return *(*it_);
-// }
-// };
-
- ReaderT& operator[](size_t i) {
- return *readers_.at(i);
- }
-
- ReaderPtrT& ptr_at(size_t i) {
- return readers_.at(i);
- }
-
- ReaderT& back() {
- return *readers_.back();
- }
-
- size_t size() const {
- return readers_.size();
- }
-
- bool eof() const {
- for (size_t i = 0; i < readers_.size(); ++i) {
- if (!readers_[i]->eof()) {
- return false;
- }
- }
- return true;
- }
-
- iterator begin() {
- return iterator(readers_.begin());
- }
-
- iterator end() {
- return iterator(readers_.end());
- }
-
-// const_iterator begin() const {
-// return iterator(streams_.begin());
-// }
-//
-// const_iterator end() const {
-// return iterator(streams_.end());
-// }
-
- void push_back(ReaderT* reader_ptr) {
- readers_.push_back(ReaderPtrT(reader_ptr));
- }
-
- void push_back(ReaderPtrT reader_ptr) {
- readers_.push_back(reader_ptr);
- }
-
- void reset() {
- for (size_t i = 0; i < readers_.size(); ++i) {
- readers_[i]->reset();
- }
- }
-
- void close() {
- for (size_t i = 0; i < readers_.size(); ++i) {
- readers_[i]->close();
- }
- }
-
- void clear() {
- readers_.clear();
- }
-
- ReadStreamStat get_stat() const {
- ReadStreamStat stat;
- for (size_t i = 0; i < readers_.size(); ++i) {
- stat.merge(readers_[i]->get_stat());
- }
- return stat;
- }
-
-// void release() {
-// destroy_readers_ = false;
-// }
-
-// const std::vector< Reader * >& get() const {
-// return streams_;
-// }
-
-};
-
-}
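ReadStreamList bundles several read streams behind one interface. A small illustrative helper (not from the tree) that drains every stream and merges the statistics:

    #include "read_stream_vector.hpp"

    template<class ReadType>
    io::ReadStreamStat DrainAll(io::ReadStreamList<ReadType>& streams) {
        ReadType read;
        for (auto& stream : streams) {   // iterator dereferences to ReadStream<ReadType>&
            while (!stream.eof())
                stream >> read;
        }
        return streams.get_stat();
    }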
diff --git a/src/include/io/sam/sam_reader.hpp b/src/include/io/sam/sam_reader.hpp
deleted file mode 100644
index 63d262b..0000000
--- a/src/include/io/sam/sam_reader.hpp
+++ /dev/null
@@ -1,49 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-#pragma once
-
-#include "read.hpp"
-
-#include "logger/log_writers.hpp"
-
-#include <samtools/sam.h>
-#include <samtools/bam.h>
-
-#include <string>
-
-namespace sam_reader {
-
-class MappedSamStream {
-public:
- MappedSamStream(const std::string &filename)
- : filename_(filename) {
- open();
- }
-
- virtual ~MappedSamStream() {
- }
-
- bool is_open() const;
- bool eof() const;
- MappedSamStream& operator >>(SingleSamRead& read);
- MappedSamStream& operator >>(PairedSamRead& read);
- const char* get_contig_name(int i) const;
- void close();
- void reset();
-
-private:
- samfile_t *reader_;
- bam1_t *seq_ = bam_init1();
- std::string filename_;
- bool is_open_;
- bool eof_;
-
- void open();
-};
-
-}
-;
diff --git a/src/include/io/sequence_reader.hpp b/src/include/io/sequence_reader.hpp
deleted file mode 100644
index fb68873..0000000
--- a/src/include/io/sequence_reader.hpp
+++ /dev/null
@@ -1,77 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include "io/ireader.hpp"
-#include "io/single_read.hpp"
-
-namespace io {
-
-//todo merge with VectorReader
-template <class ReadType>
-class SequenceReadStream : public ReadStream<ReadType> {
- public:
- explicit SequenceReadStream(const Sequence &sequence, const std::string &name = "")
- : sequence_(sequence),
- name_(name),
- opened_(true),
- eof_(false) {
- }
-
- virtual ~SequenceReadStream() {
- }
-
- virtual bool is_open() {
- return opened_;
- }
-
- virtual bool eof() {
- return eof_;
- }
-
- virtual void close() {
- opened_ = false;
- }
-
- void reset() {
- eof_ = false;
- opened_ = true;
- }
-
- ReadStreamStat get_stat() const {
- return ReadStreamStat();
- }
-
- SequenceReadStream& operator>>(ReadType &read);
-
- private:
- Sequence sequence_;
- std::string name_;
- bool opened_;
- bool eof_;
-};
-
-template <>
-SequenceReadStream<SingleRead> &SequenceReadStream<SingleRead>::operator>>(SingleRead &read) {
- if (!eof_) {
- read = SingleRead(name_, sequence_.str());
- eof_ = true;
- }
- return *this;
-}
-
-template <>
-SequenceReadStream<SingleReadSeq> &SequenceReadStream<SingleReadSeq>::operator>>(SingleReadSeq &read) {
- if (!eof_) {
- read = SingleReadSeq(sequence_);
- eof_ = true;
- }
- return *this;
-}
-
-}
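SequenceReadStream adapts one in-memory Sequence to the ReadStream interface, which is convenient when stream-based code has to consume a reference contig. Illustrative only:

    #include "io/sequence_reader.hpp"

    // Push an in-memory sequence through the stream interface and get it back as a read.
    io::SingleRead AsSingleRead(const Sequence& seq, const std::string& name) {
        io::SequenceReadStream<io::SingleRead> stream(seq, name);
        io::SingleRead read;
        stream >> read;
        return read;
    }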
diff --git a/src/include/io/single_read.hpp b/src/include/io/single_read.hpp
deleted file mode 100644
index 287b733..0000000
--- a/src/include/io/single_read.hpp
+++ /dev/null
@@ -1,331 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include "verify.hpp"
-#include "sequence/quality.hpp"
-#include "sequence/sequence.hpp"
-#include "sequence/nucl.hpp"
-#include "sequence/sequence_tools.hpp"
-#include "simple_tools.hpp"
-
-#include <string>
-
-namespace io {
-
-/*
- * This enumeration lists the supported quality offset types.
- * UnknownOffset is equal to "offset = 0".
- * PhredOffset is equal to "offset = 33".
- * SolexaOffset is equal to "offset = 64".
- */
-enum OffsetType {
- UnknownOffset = 0,
- PhredOffset = 33,
- SolexaOffset = 64
-};
-
-//todo extract code about offset from here
-
-typedef uint16_t SequenceOffsetT;
-
-
-class SingleRead {
- public:
-
- static std::string EmptyQuality(const std::string& seq) {
- return std::string(seq.size(), (char) 33);
- }
-
- static const int BAD_QUALITY_THRESHOLD = 2;
-
- SingleRead() :
- name_(""), seq_(""), qual_(""), left_offset_(0), right_offset_(0), valid_(false) {
- DEBUG(name_ << " created");
- }
-
- SingleRead(const std::string& name, const std::string& seq,
- const std::string& qual, OffsetType offset,
- SequenceOffsetT left_offset = 0, SequenceOffsetT right_offset = 0) :
- name_(name), seq_(seq), qual_(qual), left_offset_(left_offset), right_offset_(right_offset) {
- Init();
- DEBUG(name_ << " created");
- for (size_t i = 0; i < qual_.size(); ++i) {
- qual_[i] = (char)(qual_[i] - offset);
- }
- }
-
- SingleRead(const std::string& name, const std::string& seq,
- const std::string& qual,
- SequenceOffsetT left_offset = 0, SequenceOffsetT right_offset = 0) :
- name_(name), seq_(seq), qual_(qual), left_offset_(left_offset), right_offset_(right_offset) {
- DEBUG(name_ << " created");
- Init();
- }
-
- SingleRead(const std::string& name, const std::string& seq,
- SequenceOffsetT left_offset = 0, SequenceOffsetT right_offset = 0) :
- name_(name), seq_(seq), qual_(EmptyQuality(seq_)), left_offset_(left_offset), right_offset_(right_offset) {
- DEBUG(name_ << " created");
- Init();
- }
-
- bool IsValid() const {
- return valid_;
- }
-
- Sequence sequence(bool rc = false) const {
- VERIFY(valid_);
- return Sequence(seq_, rc);
- }
-
- Quality quality() const {
- VERIFY(valid_);
- return Quality(qual_);
- }
-
- const std::string& name() const {
- return name_;
- }
-
- size_t size() const {
- return seq_.size();
- }
-
- size_t nucl_count() const {
- return size();
- }
-
- const std::string& GetSequenceString() const {
- return seq_;
- }
-
- const std::string& GetQualityString() const {
- return qual_;
- }
-
- std::string GetPhredQualityString() const {
- int offset = PhredOffset;
- std::string res = qual_;
- for (size_t i = 0; i < res.size(); ++i) {
- res[i] = (char)(res[i] + offset);
- }
- return res;
- }
-
- /*
-   * Return the ith nucleotide of the SingleRead sequence in numeric form
- * (0, 1, 2 or 3).
- *
- * @param i Nucleotide index.
- * @return Nucleotide on ith position of SingleRead sequence.
- */
- char operator[](size_t i) const {
- VERIFY(is_nucl(seq_[i]));
- return dignucl(seq_[i]);
- }
-
- SingleRead operator!() const {
- std::string new_name;
- if (name_.length() >= 3 && name_.substr(name_.length() - 3) == "_RC") {
- new_name = name_.substr(0, name_.length() - 3);
- } else {
- new_name = name_ + "_RC";
- }
- // TODO make naming nicer
- // if (name_ == "" || name_[0] != '!') {
- // new_name = '!' + name_;
- // } else {
- // new_name = name_.substr(1, name_.length());
- // }
- return SingleRead(new_name, ReverseComplement(seq_), Reverse(qual_), right_offset_, left_offset_);
- }
-
- SingleRead SubstrStrict(size_t from, size_t to) const {
- size_t len = to - from;
- // return SingleRead(name_, seq_.substr(from, len), qual_.substr(from, len));
- // TODO remove naming?
- std::string new_name;
- if (name_.length() >= 3 && name_.substr(name_.length() - 3) == "_RC") {
- new_name = name_.substr(0, name_.length() - 3) + "_SUBSTR(" + ToString(size() - to) + "," + ToString(size() - from) + ")" + "_RC";
- } else {
- new_name = name_ + "_SUBSTR(" + ToString(from) + "," + ToString(to) + ")";
- }
- return SingleRead(new_name, seq_.substr(from, len), qual_.substr(from, len),
- SequenceOffsetT(from + (size_t) left_offset_), SequenceOffsetT(size() - to + (size_t) right_offset_));
- }
-
- SingleRead Substr(size_t from, size_t to) const {
- size_t len = to - from;
- if (len == size()) {
- return *this;
- }
- if (len == 0) {
- return SingleRead();
- }
- return SubstrStrict(from, to);
- }
-
- bool operator==(const SingleRead& singleread) const {
- return seq_ == singleread.seq_;
- }
-
- void ChangeName(const std::string& new_name) {
- name_ = new_name;
- }
-
- static bool IsValid(const std::string& seq) {
- for (size_t i = 0; i < seq.size(); ++i) {
- if (!is_nucl(seq[i])) {
- return false;
- }
- }
- return true;
- }
-
- SequenceOffsetT GetLeftOffset() const {
- return left_offset_;
- }
-
- SequenceOffsetT GetRightOffset() const {
- return right_offset_;
- }
-
- bool BinWrite(std::ostream& file, bool rc = false) const {
- sequence(rc).BinWrite(file);
- if (rc) {
- file.write((const char *) &right_offset_, sizeof(right_offset_));
- file.write((const char *) &left_offset_, sizeof(left_offset_));
- } else {
- file.write((const char *) &left_offset_, sizeof(left_offset_));
- file.write((const char *) &right_offset_, sizeof(right_offset_));
- }
- return !file.fail();
- }
-
-
- void print_size() const {
- std::cerr << size() << std::endl;
- }
-
-
- private:
- /*
- * @variable The name of SingleRead in input file.
- */
- std::string name_;
- /*
- * @variable The sequence of nucleotides.
- */
- std::string seq_;
- /*
- * @variable The quality of SingleRead.
- */
- std::string qual_;
- /*
- * @variable The flag of SingleRead correctness.
- */
-
- //Left and right offsets with respect to original sequence
- SequenceOffsetT left_offset_;
-
- SequenceOffsetT right_offset_;
-
- bool valid_;
-
- void Init() {
- VERIFY(seq_.size() == qual_.size());
- valid_ = SingleRead::IsValid(seq_);
- }
-
-};
-
-inline std::ostream& operator<<(std::ostream& os, const SingleRead& read) {
- os << "Single read name=" << read.name() << " sequence=" << read.GetSequenceString() << std::endl;
- return os;
-}
-
-class SingleReadSeq {
-
- public:
- SingleReadSeq(const Sequence& s,
- SequenceOffsetT left_offset = 0, SequenceOffsetT right_offset = 0):
- seq_(s), left_offset_(left_offset), right_offset_(right_offset) {
- }
-
- SingleReadSeq(): seq_(), left_offset_(0), right_offset_(0) {
- }
-
- bool BinRead(std::istream& file) {
- seq_.BinRead(file);
- file.read((char*) &left_offset_, sizeof(left_offset_));
- file.read((char*) &right_offset_, sizeof(right_offset_));
- return !file.fail();
- }
-
- bool BinWrite(std::ostream& file, bool rc = false) const {
- if (rc)
- (!seq_).BinWrite(file);
- else
- seq_.BinWrite(file);
- if (rc) {
- file.write((const char *) &right_offset_, sizeof(right_offset_));
- file.write((const char *) &left_offset_, sizeof(left_offset_));
- } else {
- file.write((const char *) &left_offset_, sizeof(left_offset_));
- file.write((const char *) &right_offset_, sizeof(right_offset_));
- }
- return !file.fail();
- }
-
- // SingleReadSeq(std::istream& file): seq_(file, true) {
- // }
-
- bool operator==(const SingleReadSeq& singleread) const {
- return seq_ == singleread.seq_;
- }
-
- const Sequence sequence() const {
- return seq_;
- }
-
- size_t size() const {
- return seq_.size();
- }
-
- size_t nucl_count() const {
- return size();
- }
-
- SingleReadSeq operator!() const {
- return SingleReadSeq(!seq_);
- }
-
- SequenceOffsetT GetLeftOffset() const {
- return left_offset_;
- }
-
- SequenceOffsetT GetRightOffset() const {
- return right_offset_;
- }
-
- private:
- Sequence seq_;
-
- //Left and right offsets with respect to original sequence
- SequenceOffsetT left_offset_;
-
- SequenceOffsetT right_offset_;
-};
-
-inline std::ostream& operator<<(std::ostream& os, const SingleReadSeq& read) {
- os << "Single read sequence=" << read.sequence() << std::endl;
- return os;
-}
-
-}
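SingleRead stores the name, bases and offset-corrected qualities together and remembers how much was clipped from each end. A short sketch of Substr and reverse-complement naming (hypothetical helper):

    #include "single_read.hpp"

    // Take the first len bases (the name gets a _SUBSTR marker unless the whole
    // read is requested) and return their reverse complement (name gets _RC).
    io::SingleRead PrefixRC(const io::SingleRead& read, size_t len) {
        io::SingleRead prefix = read.Substr(0, len);
        return !prefix;
    }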
diff --git a/src/include/io/splitting_wrapper.hpp b/src/include/io/splitting_wrapper.hpp
deleted file mode 100644
index 8f934fa..0000000
--- a/src/include/io/splitting_wrapper.hpp
+++ /dev/null
@@ -1,75 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-#include "single_read.hpp"
-#include "delegating_reader_wrapper.hpp"
-
-namespace io {
-
-class SplittingWrapper: public DelegatingWrapper<SingleRead> {
- typedef DelegatingWrapper<SingleRead> base;
-private:
- std::vector<SingleRead> buffer_;
- size_t buffer_position_;
-
- void FillBuffer(SingleRead& tmp_read) {
- buffer_.clear();
- for(size_t i = 0; i < tmp_read.size(); i++) {
- size_t j = i;
- while(j < tmp_read.size() && is_nucl(tmp_read.GetSequenceString()[j])) {
- j++;
- }
- if(j > i) {
- buffer_.push_back(tmp_read.Substr(i, j));
- i = j - 1;
- }
- }
- buffer_position_ = 0;
- }
-
- bool Skip() {
- while(!this->reader().eof() && buffer_position_ == buffer_.size()) {
- SingleRead tmp_read;
- this->reader() >> tmp_read;
- FillBuffer(tmp_read);
- }
- return buffer_position_ != buffer_.size();
- }
-
-public:
-
- explicit SplittingWrapper(base::ReadStreamPtrT reader) :
- base(reader), buffer_position_(0) {
- }
-
- /* virtual */
- SplittingWrapper& operator>>(SingleRead& read) {
- Skip();
- read = buffer_[buffer_position_];
- buffer_position_++;
- return *this;
- }
-
- //todo fix needed!!! seems that eof can't be called multiple times in a row!!!
- /* virtual */ bool eof() {
- return !Skip();
- }
-};
-
-inline std::shared_ptr<ReadStream<SingleRead>> SplittingWrap(std::shared_ptr<ReadStream<SingleRead>> reader_ptr) {
- return std::make_shared<SplittingWrapper>(reader_ptr);
-}
-
-inline ReadStreamList<SingleRead> SplittingWrap(ReadStreamList<SingleRead>& readers) {
- ReadStreamList<SingleRead> answer;
- for (size_t i = 0; i < readers.size(); ++i) {
- answer.push_back(SplittingWrap(readers.ptr_at(i)));
- }
- return answer;
-}
-}
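SplittingWrapper replaces each read containing non-ACGT symbols by its maximal valid fragments. Wiring it in is a one-liner; the helper name below is made up:

    #include "splitting_wrapper.hpp"
    #include <memory>

    std::shared_ptr<io::ReadStream<io::SingleRead>>
    SplitOnNonACGT(std::shared_ptr<io::ReadStream<io::SingleRead>> reader) {
        return io::SplittingWrap(reader);   // downstream consumers never see Ns
    }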
diff --git a/src/include/io/vector_reader.hpp b/src/include/io/vector_reader.hpp
deleted file mode 100644
index 181db45..0000000
--- a/src/include/io/vector_reader.hpp
+++ /dev/null
@@ -1,60 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-namespace io {
-
-/**
- * Use vector<T> as input-stream with operator>>(T& t)
- */
-template <typename T>
-class VectorReadStream : public ReadStream<T> {
- std::vector<T> data_;
- size_t pos_;
- bool closed_;
-public:
- VectorReadStream(const std::vector<T>& data) : data_(data), pos_(0), closed_(false) {
-
- }
-
- VectorReadStream(const T& item) : data_({item}), pos_(0), closed_(false) {
-
- }
-
- virtual bool eof() /*const */{
- return pos_ == data_.size();
- }
-
- VectorReadStream<T>& operator>>(T& t) {
- VERIFY(!eof());
- t = data_[pos_++];
- return *this;
- }
-
- void close() {
- closed_ = true;
- }
-
- virtual bool is_open() /*const */{
- return !closed_;
- }
-
- void reset() {
- pos_ = 0;
- }
-
- ReadStreamStat get_stat() const {
- //todo
- ReadStreamStat stat;
- stat.read_count_ = data_.size();
-
- return stat;
- }
-
-};
-
-}
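VectorReadStream turns a plain std::vector of reads into a ReadStream, e.g. for unit tests that should not touch the filesystem. A hypothetical fixture (vector_reader.hpp expects ireader.hpp and the read type to be included first):

    #include "ireader.hpp"
    #include "single_read.hpp"
    #include "vector_reader.hpp"
    #include <memory>
    #include <vector>

    std::shared_ptr<io::ReadStream<io::SingleRead>> MakeTestStream() {
        std::vector<io::SingleRead> reads = {
            io::SingleRead("r1", "ACGT"),
            io::SingleRead("r2", "GGCC")
        };
        return std::make_shared<io::VectorReadStream<io::SingleRead>>(reads);
    }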
diff --git a/src/include/io/wrapper_collection.hpp b/src/include/io/wrapper_collection.hpp
deleted file mode 100644
index 4a059cf..0000000
--- a/src/include/io/wrapper_collection.hpp
+++ /dev/null
@@ -1,115 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include "single_read.hpp"
-#include "delegating_reader_wrapper.hpp"
-
-namespace io {
-
-//todo refactor!!!
-class IdSettingReaderWrapper: public DelegatingWrapper<SingleRead> {
- typedef DelegatingWrapper<SingleRead> base;
- size_t next_id_;
-public:
- IdSettingReaderWrapper(base::ReadStreamPtrT reader, size_t start_id = 0) :
- base(reader), next_id_(start_id) {
- }
-
- /* virtual */
- IdSettingReaderWrapper& operator>>(SingleRead& read) {
- this->reader() >> read;
- read.ChangeName(ToString(next_id_++));
- return *this;
- }
-};
-
-class PrefixAddingReaderWrapper: public DelegatingWrapper<SingleRead> {
- typedef DelegatingWrapper<SingleRead> base;
- std::string prefix_;
-public:
- PrefixAddingReaderWrapper(base::ReadStreamPtrT reader,
- const std::string& prefix) :
- base(reader), prefix_(prefix) {
- }
-
- /* virtual */
- PrefixAddingReaderWrapper& operator>>(SingleRead& read) {
- this->reader() >> read;
- read.ChangeName(prefix_ + read.name());
- return *this;
- }
-};
-
-//fixme currently leads to long stretches of ACGTACGT...
-class FixingWrapper: public DelegatingWrapper<SingleRead> {
- typedef DelegatingWrapper<SingleRead> base;
-
- io::SingleRead MakeValid(const io::SingleRead& read) const {
- std::string str = read.GetSequenceString();
- for (size_t i = 0; i < str.length(); ++i) {
- if (!is_nucl(str[i]))
- str[i] = nucl(char(i % 4));
- }
- return io::SingleRead(read.name(), str);
- }
-
-public:
- FixingWrapper(base::ReadStreamPtrT reader) :
- base(reader) {
- }
-
- /* virtual */
- FixingWrapper& operator>>(SingleRead& read) {
- this->reader() >> read;
- if (!read.IsValid()) {
- TRACE("Read " << read.name() << " was invalid. Fixing");
- read = MakeValid(read);
- VERIFY(read.IsValid());
- }
- return *this;
- }
-
-private:
- DECL_LOGGER("FixingWrapper");
-};
-
-class NonNuclCollapsingWrapper: public DelegatingWrapper<SingleRead> {
- typedef DelegatingWrapper<SingleRead> base;
-
- io::SingleRead MakeValid(const io::SingleRead& read) const {
- std::string str = read.GetSequenceString();
- std::stringstream ss;
- for (size_t i = 0; i < read.size(); ++i) {
- if (is_nucl(str[i]))
- ss << str[i];
- }
- return io::SingleRead(read.name(), ss.str());
- }
-
-public:
- NonNuclCollapsingWrapper(base::ReadStreamPtrT reader) :
- base(reader) {
- }
-
- /* virtual */
- NonNuclCollapsingWrapper& operator>>(SingleRead& read) {
- this->reader() >> read;
- if (!read.IsValid()) {
- TRACE("Read " << read.name() << " was invalid. Collapsing non-nucls");
- read = MakeValid(read);
- VERIFY(read.IsValid());
- }
- return *this;
- }
-
-private:
- DECL_LOGGER("NonNuclCollapsingWrapper");
-};
-
-}
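
For reference, a self-contained sketch of the character-level policies the two wrappers above apply. The is_nucl/nucl helpers are replaced by explicit ACGT checks, and the 0..3 -> A,C,G,T mapping of nucl() is an assumption:

    #include <cstddef>
    #include <string>

    // FixingWrapper policy: replace each non-ACGT character with a nucleotide chosen
    // by its position, which is what produces the ACGTACGT... stretches the fixme mentions.
    inline std::string fix_policy(std::string s) {
        static const char acgt[4] = {'A', 'C', 'G', 'T'};
        for (size_t i = 0; i < s.size(); ++i)
            if (s[i] != 'A' && s[i] != 'C' && s[i] != 'G' && s[i] != 'T')
                s[i] = acgt[i % 4];
        return s;
    }

    // NonNuclCollapsingWrapper policy: drop non-ACGT characters entirely.
    inline std::string collapse_policy(const std::string& s) {
        std::string out;
        for (char c : s)
            if (c == 'A' || c == 'C' || c == 'G' || c == 'T')
                out += c;
        return out;
    }
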
diff --git a/src/include/levenshtein.hpp b/src/include/levenshtein.hpp
deleted file mode 100644
index bdf8b46..0000000
--- a/src/include/levenshtein.hpp
+++ /dev/null
@@ -1,238 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-#include <string>
-#include <vector>
-#include "simple_tools.hpp"
-/*
- * Slightly modified copy-paste from http://www.merriampark.com/ldcpp.htm
- */
-inline size_t edit_distance(const std::string& source, const std::string& target) {
-
- // Step 1
-
- const size_t n = source.length();
- const size_t m = target.length();
- if (n == 0) {
- return m;
- }
- if (m == 0) {
- return n;
- }
-
- // Good form to declare a TYPEDEF
-
- typedef std::vector< std::vector<size_t> > Tmatrix;
-
- Tmatrix matrix(n+1);
-
- // Size the vectors in the 2nd dimension. Unfortunately C++ doesn't
- // allow allocating the 2nd dimension at declaration of a vector of vectors.
-
- for (size_t i = 0; i <= n; i++) {
- matrix[i].resize(m+1);
- }
-
- // Step 2
-
- for (size_t i = 0; i <= n; i++) {
- matrix[i][0]=i;
- }
-
- for (size_t j = 0; j <= m; j++) {
- matrix[0][j]=j;
- }
-
- // Step 3
-
- for (size_t i = 1; i <= n; i++) {
-
- const char s_i = source[i-1];
-
- // Step 4
-
- for (size_t j = 1; j <= m; j++) {
-
- const char t_j = target[j-1];
-
- // Step 5
-
- size_t cost;
- if (s_i == t_j) {
- cost = 0;
- }
- else {
- cost = 1;
- }
-
- // Step 6
-
- const size_t above = matrix[i-1][j];
- const size_t left = matrix[i][j-1];
- const size_t diag = matrix[i-1][j-1];
- size_t cell = std::min( above + 1, std::min(left + 1, diag + cost));
-
- // Step 6A: Cover transposition, in addition to deletion,
- // insertion and substitution. This step is taken from:
- // Berghel, Hal ; Roach, David : "An Extension of Ukkonen's
- // Enhanced Dynamic Programming ASM Algorithm"
- // (http://www.acm.org/~hlb/publications/asm/asm.html)
-
- if (i>2 && j>2) {
- size_t trans=matrix[i-2][j-2]+1;
- if (source[i-2]!=t_j) trans++;
- if (s_i!=target[j-2]) trans++;
- if (cell>trans) cell=trans;
- }
-
- matrix[i][j]=cell;
- }
- }
-
- // Step 7
-
- return matrix[n][m];
-}
-
-inline std::pair<std::pair<int, int>, std::string> best_edit_distance_cigar(const std::string& source, const std::string& target) {
-
- // Step 1
-
- const size_t n = source.length();
- const size_t m = target.length();
-// if (n == 0) {
-// return m;
-// }
-// if (m == 0) {
-// return n;
-// }
-
- // Good form to declare a TYPEDEF
-
- typedef std::vector< std::vector<int> > Tmatrix;
-
- Tmatrix matrix(n+1);
-
- // Size the vectors in the 2nd dimension. Unfortunately C++ doesn't
- // allow allocating the 2nd dimension at declaration of a vector of vectors.
-
- for (size_t i = 0; i <= n; i++) {
- matrix[i].resize(m+1);
- }
-
- // Step 2
-
- for (size_t i = 0; i <= n; i++) {
- matrix[i][0]=(int)i;
- }
-
- for (size_t j = 0; j <= m; j++) {
- matrix[0][j]=0; //free inserts in front
- }
-
- // Step 3
-
- for (size_t i = 1; i <= n; i++) {
-
- const char s_i = source[i-1];
-
- // Step 4
-
- for (size_t j = 1; j <= m; j++) {
-
- const char t_j = target[j-1];
-
- // Step 5
-
- int cost;
- if (s_i == t_j) {
- cost = 0;
- }
- else {
- cost = 1;
- }
-
- // Step 6
-
- const int above = matrix[i-1][j];
- const int left = matrix[i][j-1];
- const int diag = matrix[i-1][j-1];
- int cell = std::min( above + 1, std::min(left + 1, diag + cost));
-
- // Step 6A: Cover transposition, in addition to deletion,
- // insertion and substitution. This step is taken from:
- // Berghel, Hal ; Roach, David : "An Extension of Ukkonen's
- // Enhanced Dynamic Programming ASM Algorithm"
- // (http://www.acm.org/~hlb/publications/asm/asm.html)
-
-// if (i>2 && j>2) {
-// int trans=matrix[i-2][j-2]+1;
-// if (source[i-2]!=t_j) trans++;
-// if (s_i!=target[j-2]) trans++;
-// if (cell>trans) cell=trans;
-// }
-
- matrix[i][j]=cell;
- }
- }
-
- // Step 7
- int min = matrix[n][m];
- size_t min_m = m;
-
- for (size_t j = 0; j <= m; j++) {
- if (min > matrix[n][j]) {
- min = matrix[n][j];
- min_m = j;
- }
- }
-
-// INFO("min = "<<min<< " min_m = "<< min_m);
- std::string res ="";
- char last_operation = 0;
- int cnt_last_operation = 0;
- size_t cur_pos_i = n;
- size_t cur_pos_j = min_m;
- char cur_operation = 0;
-
-
-// if (min > 0) {
-// for (int i = 0; i <= n; i++) {
-// INFO(ToString(matrix[i]));
-// }
-// }
-
- while ((cur_pos_i > 0)&&(cur_pos_j > 0)){
- if (matrix[cur_pos_i-1][cur_pos_j] < matrix[cur_pos_i][cur_pos_j]) {
- cur_operation = 'I';
- cur_pos_i--;
- }
- else {
- if (matrix[cur_pos_i][cur_pos_j-1] < matrix[cur_pos_i][cur_pos_j]) {
- cur_operation = 'D';
- cur_pos_j--;
- }
- else {
- cur_operation = 'M';
- cur_pos_i--;
- cur_pos_j--;
- }
- }
- if (cur_operation != last_operation){
- if (last_operation != 0)
- res = ToString(cnt_last_operation)+last_operation+res;
- last_operation = cur_operation;
- cnt_last_operation = 1;
- }
- else {
- cnt_last_operation++;
- }
- }
- res = ToString(cnt_last_operation)+last_operation+res;
- return std::make_pair(std::make_pair(cur_pos_j, min_m), res);
-}
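
A few self-contained sanity checks for edit_distance() above; the transposition extension does not change these particular values:

    #include <cassert>
    #include <string>

    inline void edit_distance_example() {
        assert(edit_distance("kitten", "sitting") == 3);   // k->s, e->i, insert g
        assert(edit_distance("", "acgt") == 4);            // empty source: distance equals target length
        assert(edit_distance("acgt", "acgt") == 0);        // identical strings
    }
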
diff --git a/src/include/log.hpp b/src/include/log.hpp
deleted file mode 100755
index 192ab46..0000000
--- a/src/include/log.hpp
+++ /dev/null
@@ -1,33 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-/*
- * Compile time log(n,base) function for use in templates
- *
- * Created on: 02.03.2011
- * Author: vyahhi
- */
-
-#ifndef LOG_HPP_
-#define LOG_HPP_
-
-template <size_t N, size_t base = 2>
-struct log_ {
- const static size_t value = 1 + log_<N/base, base>::value;
-};
-
-template <size_t base>
-struct log_<1, base> {
- const static size_t value = 0;
-};
-
-template <size_t base>
-struct log_<0, base> {
- const static size_t value = 0;
-};
-
-#endif /* LOG_HPP_ */
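
A short compile-time usage sketch of the log_ template above; the values follow directly from the recursion (note that both log_<1> and log_<0> evaluate to 0):

    // C++11 static_assert; integer division floors the result.
    static_assert(log_<1>::value == 0, "log2(1) == 0");
    static_assert(log_<2>::value == 1, "log2(2) == 1");
    static_assert(log_<8>::value == 3, "log2(8) == 3");
    static_assert(log_<9>::value == 3, "floor(log2(9)) == 3");
    static_assert(log_<27, 3>::value == 3, "log3(27) == 3");
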
diff --git a/src/include/logger/log_writers.hpp b/src/include/logger/log_writers.hpp
deleted file mode 100644
index 8a0e25b..0000000
--- a/src/include/logger/log_writers.hpp
+++ /dev/null
@@ -1,38 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-#include "path_helper.hpp"
-#include "logger.hpp"
-
-#include <iostream>
-
-#include "config.hpp"
-
-#include <iostream>
-
-namespace logging {
-
-struct console_writer : public writer {
-#ifdef SPADES_USE_JEMALLOC
- void write_msg(double time, size_t cmem, size_t max_rss, level l, const char* file, size_t line_num, const char* source, const char* msg) {
- std::cout << fmt::format("{:14s} {:>5s} / {:<5s} {:6.6s} {:24.24s} ({:26.26s}:{:4d}) {:s}",
- human_readable_time(time), human_readable_memory(cmem), human_readable_memory(max_rss), logging::level_name(l),
- source, path::filename(file), int(line_num), msg)
- << std::endl;
- }
-#else
- void write_msg(double time, size_t max_rss, level l, const char* file, size_t line_num, const char* source, const char* msg) {
- std::cout << fmt::format("{:14s} {:^5s} {:6.6s} {:24.24s} ({:26.26s}:{:4d}) {:s}",
- human_readable_time(time), human_readable_memory(max_rss), logging::level_name(l),
- source, path::filename(file), int(line_num), msg)
- << std::endl;
- }
-#endif
-};
-
-} // logging
diff --git a/src/include/logger/logger.hpp b/src/include/logger/logger.hpp
deleted file mode 100644
index 56b5cbb..0000000
--- a/src/include/logger/logger.hpp
+++ /dev/null
@@ -1,149 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-#include "perfcounter.hpp"
-
-#include <vector>
-#include <unordered_map>
-#include <string>
-#include <sstream>
-#include <memory>
-
-#include "config.hpp"
-
-namespace logging
-{
-
-/////////////////////////////////////////////////////
-enum level
-{
- L_TRACE,
- L_DEBUG,
- L_INFO,
- L_WARN,
- L_ERROR
-};
-
-inline std::string level_name(level l)
-{
- static std::string names [] =
- {
- "TRACE",
- "DEBUG",
- "INFO" ,
- "WARN" ,
- "ERROR"
- };
-
- return names[l];
-}
-
-
-/////////////////////////////////////////////////////
-struct writer
-{
-#ifdef SPADES_USE_JEMALLOC
- virtual void write_msg(double time_in_sec, size_t cmem, size_t max_rss, level l, const char* file, size_t line_num, const char* source, const char* msg) = 0;
-#else
- virtual void write_msg(double time_in_sec, size_t max_rss, level l, const char* file, size_t line_num, const char* source, const char* msg) = 0;
-#endif
- virtual ~writer(){}
-};
-
-typedef std::shared_ptr<writer> writer_ptr;
-
-/////////////////////////////////////////////////////
-struct properties
-{
- /* Reads logger properties from a file.
- *
- * The file should contain lines like those below.
- * Use a leading # for comments.
- * The file may contain a line describing the default behavior; if no 'default' entry is found, the default level is set to INFO.
- * Valid levels: TRACE, DEBUG, INFO, WARN, ERROR
- *
- * default=INFO
- * AbraCaDabra=TRACE
- * #BubaZuba=WARN
- * HariKrishna=INFO
- *
- */
-
- properties(std::string filename = "", level default_level = L_INFO);
- properties(level default_level = L_INFO);
-
- std::unordered_map<std::string, level> levels;
- level def_level;
- bool all_default;
-};
-
-////////////////////////////////////////////////////
-struct logger
-{
- logger(properties const& props);
-
- //
- bool need_log(level desired_level, const char* source) const;
- void log(level desired_level, const char* file, size_t line_num, const char* source, const char* msg);
-
- //
- void add_writer(writer_ptr ptr);
-
-private:
- properties props_ ;
- std::vector<writer_ptr> writers_;
- perf_counter timer_ ;
-};
-
-std::shared_ptr<logger>& __logger();
-logger* create_logger(std::string filename = "", level default_level = L_INFO);
-
-void attach_logger(logger *lg);
-void detach_logger();
-
-} // logging
-
-inline const char* __scope_source_name() {
- return " General ";
-}
-
-#define DECL_LOGGER(source) \
- static const char* __scope_source_name() { \
- return source; \
- }
-
-#define LOG_MSG(l, msg) \
- do { \
- std::shared_ptr<logging::logger> &__lg__ = logging::__logger(); \
- if (__lg__.get() == NULL) \
- break; \
- \
- if (__lg__->need_log((l), __scope_source_name())) { \
- std::stringstream __logger__str__; \
- __logger__str__ << msg; /* don't use brackets here! */ \
- __lg__->log((l), __FILE__, __LINE__, __scope_source_name(), __logger__str__.str().c_str()); \
- } \
- } while(0);
-
-#ifdef SPADES_DEBUG_LOGGING
-# define DEBUG(message) LOG_MSG(logging::L_DEBUG, message)
-# define TRACE(message) LOG_MSG(logging::L_TRACE, message)
-#else
-# define DEBUG(message) /* No trace */
-# define TRACE(message) /* No trace */
-#endif
-#define INFO(message) LOG_MSG(logging::L_INFO , message)
-#define VERBOSE_T(n, T, message) {size_t n_copy = (n); if (n_copy % (T) == 0 && n_copy > 0) INFO(n_copy << message)}
-#define VERBOSE(n, message) VERBOSE_T((n), 10000, message)
-#define VERBOSE_POWER_T(n, T, message) {size_t n_copy = (n); if ((n_copy & (n_copy - 1)) == 0 && (n_copy > T)) INFO(n_copy << message)}
-#define VERBOSE_POWER(n, message) VERBOSE_POWER_T((n), 10000, message)
-#define VERBOSE_POWER_T2(n, T, message) {size_t n_copy = (n); if ((n_copy & (n_copy - 1)) == 0 && (n_copy > T)) INFO(message)}
-#define VERBOSE_POWER2(n, message) VERBOSE_POWER_T2((n), 10000, message)
-#define WARN(message) LOG_MSG(logging::L_WARN, message)
-#define ERROR(message) LOG_MSG(logging::L_ERROR, message)
-#define FATAL_ERROR(message) {ERROR(message); exit(-1);}
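
A minimal usage sketch of the logging facade above, assuming both logger.hpp and the console_writer from log_writers.hpp are included; the stage name and messages are made up:

    #include <cstddef>
    #include <memory>

    // A class-scoped logger source plus the INFO/WARN macros defined above.
    class ExampleStage {
    public:
        void Run(size_t n) {
            INFO("Processing " << n << " items");
            if (n == 0)
                WARN("Nothing to do");
        }
    private:
        DECL_LOGGER("ExampleStage");
    };

    // Typical setup at program start: create a logger and attach a console writer.
    inline void setup_logging_example() {
        logging::logger* lg = logging::create_logger("", logging::L_INFO);
        lg->add_writer(std::make_shared<logging::console_writer>());
        logging::attach_logger(lg);
    }
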
diff --git a/src/include/memory_limit.hpp b/src/include/memory_limit.hpp
deleted file mode 100644
index dea38ae..0000000
--- a/src/include/memory_limit.hpp
+++ /dev/null
@@ -1,91 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#if __DARWIN || __DARWIN_UNIX03
-#include <mach/task.h>
-#include <mach/mach.h>
-#else
-#include <sys/resource.h>
-#endif
-
-#include <sys/time.h>
-#include <sys/resource.h>
-
-#include "config.hpp"
-
-#ifdef SPADES_USE_JEMALLOC
-# include <jemalloc/jemalloc.h>
-#endif
-
-inline void limit_memory(size_t limit) {
- rlimit rl;
- if (sizeof(rlim_t) < 8) {
- INFO("Can't limit virtual memory because of 32-bit system");
- return;
- }
-
- int res = getrlimit(RLIMIT_AS, &rl);
- VERIFY_MSG(res == 0,
- "getrlimit(2) call failed, errno = " << errno);
-
- // We cannot go beyond hard limit and we might not have enough privileges to
- // increase the hard limit
- rl.rlim_cur = std::min<size_t>(limit, rl.rlim_max);
- res = setrlimit(RLIMIT_AS, &rl);
- VERIFY_MSG(res == 0,
- "setrlimit(2) call failed, errno = " << errno);
- INFO("Memory limit set to " << (1.0 * (double)rl.rlim_cur / 1024 / 1024 / 1024) << " Gb");
-}
-
-inline size_t get_memory_limit() {
- rlimit rl;
- int res = getrlimit(RLIMIT_AS, &rl);
- VERIFY_MSG(res == 0,
- "getrlimit(2) call failed, errno = " << errno);
-
- return rl.rlim_cur;
-}
-
-#if __DARWIN || __DARWIN_UNIX03
-inline size_t get_max_rss() {
- struct task_basic_info t_info;
- mach_msg_type_number_t t_info_count = TASK_BASIC_INFO_COUNT;
-
- if (KERN_SUCCESS !=
- task_info(mach_task_self(),
- TASK_BASIC_INFO, (task_info_t)&t_info, &t_info_count))
- return -1U;
-
- return t_info.resident_size / 1024;
-}
-#else
-inline size_t get_max_rss() {
- rusage ru;
- getrusage(RUSAGE_SELF, &ru);
-
- return ru.ru_maxrss;
-}
-#endif
-
-inline size_t get_used_memory() {
-#ifdef SPADES_USE_JEMALLOC
- const size_t *cmem = 0;
- size_t clen = sizeof(cmem);
-
- je_mallctl("stats.cactive", &cmem, &clen, NULL, 0);
- return *cmem;
-#else
- return get_max_rss();
-#endif
-}
-
-
-inline size_t get_free_memory() {
- return get_memory_limit() - get_used_memory();
-}
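
A small hedged example of combining the helpers above; the 16 GB figure is arbitrary and INFO comes from the logging macros:

    // Cap the virtual address space at roughly 16 GB (clamped to the hard limit),
    // then report the effective limit.
    inline void memory_limit_example() {
        limit_memory(size_t(16) * 1024 * 1024 * 1024);
        INFO("Effective memory limit: " << get_memory_limit() << " bytes");
    }
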
diff --git a/src/include/mph_index/bitpair_vector.hpp b/src/include/mph_index/bitpair_vector.hpp
deleted file mode 100644
index 2fa8b26..0000000
--- a/src/include/mph_index/bitpair_vector.hpp
+++ /dev/null
@@ -1,103 +0,0 @@
-#pragma once
-
-#include "common.hpp"
-
-namespace emphf {
-
- class bitpair_vector {
- public:
-
- bitpair_vector()
- : m_size(0)
- {}
-
- bitpair_vector(uint64_t n)
- : m_size(0)
- {
- resize(n);
- }
-
- void resize(uint64_t n)
- {
- // can only grow, for now
- assert(n >= size());
- m_size = n;
- m_bits.resize((m_size + 31) / 32);
- }
-
- size_t size() const
- {
- return m_size;
- }
-
- size_t mem_size() const {
- return m_bits.size() * sizeof(m_bits[0]);
- }
-
- uint64_t operator[](uint64_t pos) const
- {
- return (m_bits[pos / 32] >> ((pos % 32) * 2)) % 4;
- }
-
- void set(uint64_t pos, uint64_t val)
- {
- assert(val < 4);
- uint64_t word_pos = pos / 32;
- uint64_t word_offset = (pos % 32) * 2;
- m_bits[word_pos] &= ~(3ULL << word_offset);
- m_bits[word_pos] |= val << word_offset;
- }
-
- uint64_t range_nonzeros(uint64_t begin, uint64_t end) const
- {
- assert(begin <= end);
- assert(end <= size());
-
- uint64_t word_begin = begin / 32;
- uint64_t offset_begin = (begin % 32) * 2;
- uint64_t word_end = end / 32;
- uint64_t offset_end = (end % 32) * 2;
- uint64_t r = 0;
-
- uint64_t word = (m_bits[word_begin] >> offset_begin) << offset_begin;
- for (uint64_t w = word_begin; w < word_end; ++w) {
- r += nonzero_pairs(word);
- word = m_bits[w + 1];
- }
-
- uint64_t mask = (uint64_t(1) << offset_end) - 1;
- r += nonzero_pairs(word & mask);
-
- return r;
- }
-
- void swap(bitpair_vector& other)
- {
- std::swap(m_size, other.m_size);
- m_bits.swap(other.m_bits);
- }
-
- void save(std::ostream& os) const
- {
- os.write(reinterpret_cast<char const*>(&m_size), sizeof(m_size));
- os.write(reinterpret_cast<char const*>(m_bits.data()), (std::streamsize)(sizeof(m_bits[0]) * m_bits.size()));
- }
-
- void load(std::istream& is)
- {
- is.read(reinterpret_cast<char*>(&m_size), sizeof(m_size));
- m_bits.resize((m_size + 31) / 32);
- is.read(reinterpret_cast<char*>(m_bits.data()), (std::streamsize)(sizeof(m_bits[0]) * m_bits.size()));
- }
-
- std::vector<uint64_t> const& data() const
- {
- return m_bits;
- }
-
- protected:
- std::vector<uint64_t> m_bits;
- uint64_t m_size;
- };
-
-}
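
A short usage sketch of the 2-bit-per-entry vector above: values are restricted to 0..3 and 32 of them are packed into each 64-bit word:

    #include <cassert>

    inline void bitpair_vector_example() {
        emphf::bitpair_vector bv(100);   // 4 words; std::vector zero-initializes them
        bv.set(0, 3);
        bv.set(63, 2);                   // slot 63 lives in the second word
        assert(bv[0] == 3 && bv[63] == 2);
        assert(bv.size() == 100);
        assert(bv.range_nonzeros(0, 100) == 2);
    }
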
diff --git a/src/include/mph_index/common.hpp b/src/include/mph_index/common.hpp
deleted file mode 100644
index 132fd64..0000000
--- a/src/include/mph_index/common.hpp
+++ /dev/null
@@ -1,66 +0,0 @@
-#pragma once
-
-#include <cstdint>
-#include <iterator>
-#include <memory>
-#include <cassert>
-
-#include "emphf_config.hpp"
-
-namespace emphf {
-
- template <typename Iterator>
- struct iter_range
- {
- iter_range(Iterator b, Iterator e)
- : m_begin(b)
- , m_end(e)
- {}
-
- Iterator begin() const
- { return m_begin; }
-
- Iterator end() const
- { return m_end; }
-
- Iterator m_begin, m_end;
- };
-
- typedef std::pair<uint8_t const*, uint8_t const*> byte_range_t;
-
- struct identity_adaptor
- {
- byte_range_t operator()(byte_range_t s) const
- {
- return s;
- }
- };
-
- template <typename Iterator>
- iter_range<Iterator> range(Iterator begin, Iterator end)
- {
- return iter_range<Iterator>(begin, end);
- }
-
- inline uint64_t nonzero_pairs(uint64_t x)
- {
- static const uint64_t ones_step_4 = 0x1111111111111111ULL;
- x = (x | (x >> 1)) & (0x5 * ones_step_4);
-
-#if EMPHF_USE_POPCOUNT
- return (uint64_t)__builtin_popcountll(x);
-#else
- static const uint64_t ones_step_8 = 0x0101010101010101ULL;
- x = (x & 3 * ones_step_4) + ((x >> 2) & 3 * ones_step_4);
- x = (x + (x >> 4)) & 0x0f * ones_step_8;
- return (x * ones_step_8) >> 56;
-#endif
- }
-
- inline uint64_t msb(uint64_t x)
- {
- assert(x);
- return 63 - __builtin_clzll(x);
- }
-
-}
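
A tiny check of the bit tricks above: nonzero_pairs() counts how many of the 32 two-bit fields of a word are nonzero, and msb() returns the index of the highest set bit:

    #include <cassert>
    #include <cstdint>

    inline void bit_tricks_example() {
        uint64_t w = (3ULL << 0) | (1ULL << 4) | (2ULL << 6);   // fields from bit 0: 3, 0, 1, 2
        assert(emphf::nonzero_pairs(w) == 3);
        assert(emphf::nonzero_pairs(0) == 0);
        assert(emphf::msb(1) == 0);
        assert(emphf::msb(1ULL << 40) == 40);
    }
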
diff --git a/src/include/mph_index/hypergraph_sorter_seq.hpp b/src/include/mph_index/hypergraph_sorter_seq.hpp
deleted file mode 100644
index b80ea39..0000000
--- a/src/include/mph_index/hypergraph_sorter_seq.hpp
+++ /dev/null
@@ -1,130 +0,0 @@
-#pragma once
-
-#include <cassert>
-#include <cstdint>
-#include <tuple>
-#include <cmath>
-#include <vector>
-#include <iterator>
-#include <algorithm>
-#include <stdexcept>
-
-#include "common.hpp"
-#include "hypergraph.hpp"
-
-#include "logger/logger.hpp"
-
-namespace emphf {
-
- template <typename HypergraphType>
- class hypergraph_sorter_seq {
- public:
- typedef HypergraphType hg;
- typedef typename hg::node_t node_t;
- typedef typename hg::hyperedge hyperedge;
- typedef typename hg::xored_adj_list xored_adj_list;
-
- hypergraph_sorter_seq()
- {}
-
- template <typename Range, typename EdgeGenerator>
- bool try_generate_and_sort(Range const& input_range,
- EdgeGenerator const& edge_gen,
- size_t n,
- size_t hash_domain,
- bool verbose = true)
- {
- using std::get;
- std::vector<xored_adj_list> adj_lists;
-
- size_t m = hash_domain * 3;
-
- // do all the allocations upfront
- m_peeling_order.clear();
- m_peeling_order.reserve(n);
- adj_lists.resize(m);
-
- // generate edges
- if (verbose) {
- //logger() << "Generating hyperedges and populating adjacency lists"
- // << std::endl;
- }
-
- for (auto const& val: input_range) {
- auto edge = edge_gen(val);
- // canonical by construction
- assert(orientation(edge) == 0);
-
- adj_lists[edge.v0].add_edge(edge);
-
- std::swap(edge.v0, edge.v1);
- adj_lists[edge.v0].add_edge(edge);
-
- std::swap(edge.v0, edge.v2);
- adj_lists[edge.v0].add_edge(edge);
- }
-
- // peel
- if (verbose) {
- // logger() << "Peeling" << std::endl;
- }
-
- auto visit = [&](node_t v0) {
- if (adj_lists[v0].degree == 1) {
- auto edge = adj_lists[v0].edge_from(v0);
- m_peeling_order.push_back(edge);
-
- edge = canonicalize_edge(edge);
- adj_lists[edge.v0].delete_edge(edge);
-
- std::swap(edge.v0, edge.v1);
- adj_lists[edge.v0].delete_edge(edge);
-
- std::swap(edge.v0, edge.v2);
- adj_lists[edge.v0].delete_edge(edge);
- }
- };
-
- size_t queue_position = 0;
- for (node_t v0 = 0; v0 < m; ++v0) {
- visit(v0);
-
- while (queue_position < m_peeling_order.size()) {
- auto const& cur_edge = m_peeling_order[queue_position];
-
- visit(cur_edge.v1);
- visit(cur_edge.v2);
- queue_position += 1;
- }
- }
-
- if (m_peeling_order.size() < n) {
- if (verbose) {
- // logger() << "Hypergraph is not peelable: "
- // << (n - m_peeling_order.size()) << " edges remaining"
- // << std::endl;
- }
- return false;
- }
-
- assert(m_peeling_order.size() == n);
-
- return true;
- }
-
- typedef typename std::vector<hyperedge>::const_reverse_iterator
- peeling_iterator;
-
- std::pair<peeling_iterator, peeling_iterator>
- get_peeling_order() const
- {
- return std::make_pair(m_peeling_order.crbegin(),
- m_peeling_order.crend());
- }
-
- private:
-
- size_t m_hash_domain;
- std::vector<hyperedge> m_peeling_order;
- };
-}
diff --git a/src/include/mph_index/kmer_index.hpp b/src/include/mph_index/kmer_index.hpp
deleted file mode 100644
index 105443a..0000000
--- a/src/include/mph_index/kmer_index.hpp
+++ /dev/null
@@ -1,530 +0,0 @@
-#pragma once
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#include "io/mmapped_reader.hpp"
-#include "io/mmapped_writer.hpp"
-#include "adt/pointer_iterator.hpp"
-
-#include "mphf.hpp"
-#include "base_hash.hpp"
-#include "hypergraph.hpp"
-#include "hypergraph_sorter_seq.hpp"
-
-#include "openmp_wrapper.h"
-
-#include "logger/logger.hpp"
-#include "path_helper.hpp"
-
-#include "memory_limit.hpp"
-
-#include <libcxx/sort.hpp>
-
-#include <algorithm>
-#ifdef USE_GLIBCXX_PARALLEL
-#include <parallel/algorithm>
-#endif
-#include <fstream>
-#include <vector>
-#include <cmath>
-
-#include "config.hpp"
-
-#ifdef SPADES_USE_JEMALLOC
-# include <jemalloc/jemalloc.h>
-#endif
-
-template<class Index>
-class KMerIndexBuilder;
-
-template<class Seq>
-struct kmer_index_traits {
- typedef Seq SeqType;
- typedef MMappedRecordArrayReader<typename Seq::DataType> RawKMerStorage;
- typedef MMappedRecordArrayReader<typename Seq::DataType> FinalKMerStorage;
- typedef typename RawKMerStorage::iterator raw_data_iterator;
- typedef typename RawKMerStorage::const_iterator raw_data_const_iterator;
- typedef typename RawKMerStorage::iterator::value_type KMerRawData;
- typedef typename RawKMerStorage::iterator::reference KMerRawReference;
- typedef typename RawKMerStorage::const_iterator::reference KMerRawConstReference;
-
- struct raw_equal_to {
- bool operator()(const Seq &lhs, const KMerRawReference rhs) {
- return (array_equal_to<typename Seq::DataType>()(lhs.data(), lhs.data_size(), rhs));
- }
- };
-
- struct raw_create {
- Seq operator()(unsigned K, const KMerRawReference kmer) {
- return Seq(K, kmer.data());
- }
- Seq operator()(unsigned K, const KMerRawConstReference kmer) {
- return Seq(K, kmer.data());
- }
- };
-
- struct hash_function {
- uint64_t operator()(const Seq &k) const{
- return typename Seq::hash()(k);
- }
- uint64_t operator()(const KMerRawReference k) const {
- return typename Seq::hash()(k.data(), k.size());
- }
- };
-
- struct KMerRawReferenceAdaptor {
- emphf::byte_range_t operator()(const KMerRawReference k) const {
- const uint8_t * data = (const uint8_t*)k.data();
- return std::make_pair(data, data + k.data_size());
- }
- };
-
- struct KMerSeqAdaptor {
- emphf::byte_range_t operator()(const Seq &k) const {
- const uint8_t * data = (const uint8_t*)k.data();
- return std::make_pair(data, data + k.data_size() * sizeof(typename Seq::DataType));
- }
- };
-
- template<class Writer>
- static void raw_serialize(Writer &writer, RawKMerStorage *data) {
- size_t sz = data->data_size(), elcnt = data->elcnt();
- unsigned PageSize = getpagesize();
- writer.write((char*)&sz, sizeof(sz));
- writer.write((char*)&elcnt, sizeof(elcnt));
- // Make sure data is aligned to the page boundary
- size_t cpos = writer.tellp();
- size_t pos = (cpos + PageSize - 1 + sizeof(size_t)) / PageSize * PageSize;
- size_t off = pos - writer.tellp();
- writer.write((char*)&off, sizeof(off));
- writer.seekp(pos);
- writer.write((char*)data->data(), data->data_size());
- }
-
- template<class Reader>
- static RawKMerStorage *raw_deserialize(Reader &reader, const std::string &FileName) {
- size_t sz, off, elcnt;
- reader.read((char*)&sz, sizeof(sz));
- reader.read((char*)&elcnt, sizeof(elcnt));
- reader.read((char*)&off, sizeof(off));
- off -= sizeof(off);
- off += reader.tellg();
-
- return new RawKMerStorage(FileName, elcnt, false, off, sz);
- }
-
-};
-
-template<class traits>
-class KMerIndex {
- public:
- typedef traits kmer_index_traits;
- typedef typename traits::SeqType KMerSeq;
- typedef typename traits::hash_function hash_function;
- typedef typename traits::KMerRawData KMerRawData;
- typedef typename traits::KMerRawReference KMerRawReference;
- typedef size_t IdxType;
-
- private:
- using KMerDataIndex = emphf::mphf<emphf::city_hasher>;
- typedef KMerIndex __self;
-
- public:
- KMerIndex(): index_(NULL), num_buckets_(0), size_(0) {}
-
- KMerIndex(const KMerIndex&) = delete;
- KMerIndex& operator=(const KMerIndex&) = delete;
-
- ~KMerIndex() { clear(); }
-
- void clear() {
- num_buckets_ = 0;
- bucket_starts_.clear();
-
- delete[] index_;
- index_ = NULL;
- }
-
- size_t mem_size() {
- size_t sz = 0;
- for (size_t i = 0; i < num_buckets_; ++i)
- sz += index_[i].mem_size();
-
- return sz;
- }
-
- void count_size() {
- if (index_ == NULL)
- return;
- size_ = 0;
- for (size_t i = 0; i < num_buckets_; i++)
- size_ += index_[i].size();
- }
-
- size_t size() const {
- return size_;
- }
-
- size_t seq_idx(const KMerSeq &s) const {
- size_t bucket = seq_bucket(s);
-
- return bucket_starts_[bucket] +
- index_[bucket].lookup(s, typename traits::KMerSeqAdaptor());
- }
-
- size_t raw_seq_idx(const KMerRawReference data) const {
- size_t bucket = raw_seq_bucket(data);
-
- return bucket_starts_[bucket] +
- index_[bucket].lookup(data, typename traits::KMerRawReferenceAdaptor());
- }
-
- template<class Writer>
- void serialize(Writer &os) const {
- os.write((char*)&num_buckets_, sizeof(num_buckets_));
- for (size_t i = 0; i < num_buckets_; ++i)
- index_[i].save(os);
- os.write((char*)&bucket_starts_[0], (num_buckets_ + 1) * sizeof(bucket_starts_[0]));
- }
-
- template<class Reader>
- void deserialize(Reader &is) {
- clear();
-
- is.read((char*)&num_buckets_, sizeof(num_buckets_));
-
- index_ = new KMerDataIndex[num_buckets_];
- for (size_t i = 0; i < num_buckets_; ++i)
- index_[i].load(is);
-
- bucket_starts_.resize(num_buckets_ + 1);
- is.read((char*)&bucket_starts_[0], (num_buckets_ + 1) * sizeof(bucket_starts_[0]));
- count_size();
- }
-
- void swap(KMerIndex<traits> &other) {
- std::swap(index_, other.index_);
- std::swap(num_buckets_, other.num_buckets_);
- std::swap(size_, other.size_);
- std::swap(bucket_starts_, other.bucket_starts_);
- }
-
- private:
- KMerDataIndex *index_;
-
- size_t num_buckets_;
- std::vector<size_t> bucket_starts_;
- size_t size_;
-
- size_t seq_bucket(const KMerSeq &s) const {
- return hash_function()(s) % num_buckets_;
- }
- size_t raw_seq_bucket(const KMerRawReference data) const {
- return hash_function()(data) % num_buckets_;
- }
-
- friend class KMerIndexBuilder<__self>;
-};
-
-template<class Seq>
-class KMerSplitter {
- public:
- typedef typename Seq::hash hash_function;
-
- KMerSplitter(const std::string &work_dir, unsigned K, uint32_t seed = 0)
- : work_dir_(work_dir), K_(K), seed_(seed) {}
-
- virtual ~KMerSplitter() {}
-
- virtual path::files_t Split(size_t num_files) = 0;
-
- unsigned K() const { return K_; }
-
- protected:
- const std::string &work_dir_;
- hash_function hash_;
- unsigned K_;
- uint32_t seed_;
-
- std::string GetRawKMersFname(unsigned suffix) const {
- return path::append_path(work_dir_, "kmers.raw." + std::to_string(suffix));
- }
-
- unsigned GetFileNumForSeq(const Seq &s, unsigned total) const {
- return (unsigned)(hash_(s, seed_) % total);
- }
-
- DECL_LOGGER("K-mer Splitting");
-};
-
-template<class Seq, class traits = kmer_index_traits<Seq> >
-class KMerCounter {
- public:
- typedef typename traits::raw_data_iterator iterator;
- typedef typename traits::raw_data_const_iterator const_iterator;
- typedef typename traits::RawKMerStorage RawKMerStorage;
- typedef typename traits::FinalKMerStorage FinalKMerStorage;
-
- virtual size_t KMerSize() const = 0;
-
- virtual size_t Count(unsigned num_buckets, unsigned num_threads) = 0;
- virtual size_t CountAll(unsigned num_buckets, unsigned num_threads, bool merge = true) = 0;
- virtual void MergeBuckets(unsigned num_buckets) = 0;
-
- virtual void OpenBucket(size_t idx, bool unlink = true) = 0;
- virtual void ReleaseBucket(size_t idx) = 0;
- virtual RawKMerStorage* TransferBucket(size_t idx) = 0;
- virtual FinalKMerStorage* GetFinalKMers() = 0;
-
- virtual iterator bucket_begin(size_t idx) = 0;
- virtual iterator bucket_end(size_t idx) = 0;
-
- virtual ~KMerCounter() {}
-
-protected:
- DECL_LOGGER("K-mer Counting");
-};
-
-template<class Seq, class traits = kmer_index_traits<Seq> >
-class KMerDiskCounter : public KMerCounter<Seq> {
- typedef KMerCounter<Seq, traits> __super;
-public:
- KMerDiskCounter(const std::string &work_dir, KMerSplitter<Seq> &splitter)
- : work_dir_(work_dir), splitter_(splitter) {
- std::string prefix = path::append_path(work_dir, "kmers_XXXXXX");
- char *tempprefix = strcpy(new char[prefix.length() + 1], prefix.c_str());
- VERIFY_MSG(-1 != (fd_ = ::mkstemp(tempprefix)), "Cannot create temporary file");
- kmer_prefix_ = tempprefix;
- delete[] tempprefix;
- }
-
- ~KMerDiskCounter() {
- for (size_t i = 0; i < buckets_.size(); ++i)
- ReleaseBucket(i);
-
- ::close(fd_);
- ::unlink(kmer_prefix_.c_str());
- }
-
- size_t KMerSize() const {
- return Seq::GetDataSize(splitter_.K()) * sizeof(typename Seq::DataType);
- }
-
- void OpenBucket(size_t idx, bool unlink = true) {
- unsigned K = splitter_.K();
-
- buckets_[idx] = new MMappedRecordArrayReader<typename Seq::DataType>(GetMergedKMersFname((unsigned)idx), Seq::GetDataSize(K), unlink);
- }
-
- void ReleaseBucket(size_t idx) {
- delete buckets_[idx];
- buckets_[idx] = NULL;
- }
-
- MMappedRecordArrayReader<typename Seq::DataType>* TransferBucket(size_t idx) {
- MMappedRecordArrayReader<typename Seq::DataType> *res = buckets_[idx];
- buckets_[idx] = NULL;
-
- return res;
- }
-
- typename __super::iterator bucket_begin(size_t idx) {
- return buckets_[idx]->begin();
- }
- typename __super::iterator bucket_end(size_t idx) {
- return buckets_[idx]->end();
- }
-
- size_t Count(unsigned num_buckets, unsigned num_threads) {
- unsigned K = splitter_.K();
-
- // Split k-mers into buckets.
- path::files_t raw_kmers = splitter_.Split(num_buckets * num_threads);
-
- INFO("Starting k-mer counting.");
- size_t kmers = 0;
-# pragma omp parallel for shared(raw_kmers) num_threads(num_threads) schedule(dynamic) reduction(+:kmers)
- for (unsigned iFile = 0; iFile < raw_kmers.size(); ++iFile) {
- kmers += MergeKMers(raw_kmers[iFile], GetUniqueKMersFname(iFile), K);
- }
- INFO("K-mer counting done. There are " << kmers << " kmers in total. ");
-
- INFO("Merging temporary buckets.");
- for (unsigned i = 0; i < num_buckets; ++i) {
- std::string ofname = GetMergedKMersFname(i);
- std::ofstream ofs(ofname.c_str(), std::ios::out | std::ios::binary);
- for (unsigned j = 0; j < num_threads; ++j) {
- MMappedRecordArrayReader<typename Seq::DataType> ins(GetUniqueKMersFname(i + j * num_buckets), Seq::GetDataSize(K), /* unlink */ true);
- ofs.write((const char*)ins.data(), ins.data_size());
- }
- }
-
- buckets_.resize(num_buckets);
-
- return kmers;
- }
-
- void MergeBuckets(unsigned num_buckets) {
- unsigned K = splitter_.K();
-
- INFO("Merging final buckets.");
- for (unsigned i = 0; i < num_buckets; ++i)
- VERIFY(buckets_[i] == NULL);
-
- buckets_.clear();
-
- MMappedRecordArrayWriter<typename Seq::DataType> os(GetFinalKMersFname(), Seq::GetDataSize(K));
- std::string ofname = GetFinalKMersFname();
- std::ofstream ofs(ofname.c_str(), std::ios::out | std::ios::binary);
- for (unsigned j = 0; j < num_buckets; ++j) {
- MMappedRecordArrayReader<typename Seq::DataType> ins(GetMergedKMersFname(j), Seq::GetDataSize(K), /* unlink */ true);
- ofs.write((const char*)ins.data(), ins.data_size());
- }
- ofs.close();
- }
-
- size_t CountAll(unsigned num_buckets, unsigned num_threads, bool merge = true) {
- size_t kmers = Count(num_buckets, num_threads);
- if (merge)
- MergeBuckets(num_buckets);
-
- return kmers;
- }
-
- typename __super::FinalKMerStorage *GetFinalKMers() {
- unsigned K = splitter_.K();
- return new MMappedRecordArrayReader<typename Seq::DataType>(GetFinalKMersFname(), Seq::GetDataSize(K), /* unlink */ true);
- }
-
- std::string GetMergedKMersFname(unsigned suffix) const {
- return kmer_prefix_ + ".merged." + std::to_string(suffix);
- }
-
- std::string GetFinalKMersFname() const {
- return kmer_prefix_ + ".final";
- }
-
-private:
- std::string work_dir_;
- KMerSplitter<Seq> &splitter_;
- int fd_;
- std::string kmer_prefix_;
-
- std::vector<MMappedRecordArrayReader<typename Seq::DataType>*> buckets_;
-
- std::string GetUniqueKMersFname(unsigned suffix) const {
- return kmer_prefix_ + ".unique." + std::to_string(suffix);
- }
-
- size_t MergeKMers(const std::string &ifname, const std::string &ofname,
- unsigned K) {
- MMappedRecordArrayReader<typename Seq::DataType> ins(ifname, Seq::GetDataSize(K), /* unlink */ true);
-
- // Sort the stuff
- libcxx::sort(ins.begin(), ins.end(), array_less<typename Seq::DataType>());
-
- // FIXME: Use something like parallel version of unique_copy but with explicit
- // resizing.
- auto it = std::unique(ins.begin(), ins.end(), array_equal_to<typename Seq::DataType>());
-
- MMappedRecordArrayWriter<typename Seq::DataType> os(ofname, Seq::GetDataSize(K));
- os.resize(it - ins.begin());
- std::copy(ins.begin(), it, os.begin());
-
- return it - ins.begin();
- }
-};
-
-template<class Index>
-class KMerIndexBuilder {
- typedef typename Index::KMerSeq Seq;
- typedef typename Index::kmer_index_traits kmer_index_traits;
-
- std::string work_dir_;
- unsigned num_buckets_;
- unsigned num_threads_;
-
- public:
- KMerIndexBuilder(const std::string &workdir,
- unsigned num_buckets, unsigned num_threads)
- : work_dir_(workdir), num_buckets_(num_buckets), num_threads_(num_threads) {}
- size_t BuildIndex(Index &out, KMerCounter<Seq> &counter,
- bool save_final = false);
-
- unsigned num_buckets() const { return num_buckets_; }
-
- private:
-
- DECL_LOGGER("K-mer Index Building");
-};
-
-template<class Index>
-size_t KMerIndexBuilder<Index>::BuildIndex(Index &index, KMerCounter<Seq> &counter,
- bool save_final) {
- index.clear();
-
- INFO("Building kmer index ");
-
- // First, count the unique k-mers
- size_t kmers = counter.Count(num_buckets_, num_threads_);
-
- index.num_buckets_ = num_buckets_;
- index.bucket_starts_.resize(num_buckets_ + 1);
- index.index_ = new typename KMerIndex<kmer_index_traits>::KMerDataIndex[num_buckets_];
-
- INFO("Building perfect hash indices");
-
- // Index building requires up to 40 bytes per k-mer. Limit number of threads depending on the memory limit.
- unsigned num_threads = num_threads_;
-# ifdef SPADES_USE_JEMALLOC
- const size_t *cmem = 0;
- size_t clen = sizeof(cmem);
-
- je_mallctl("stats.cactive", &cmem, &clen, NULL, 0);
- size_t bucket_size = (36 * kmers + kmers * counter.KMerSize()) / num_buckets_;
- num_threads = std::min<unsigned>((unsigned) ((get_memory_limit() - *cmem) / bucket_size), num_threads);
- if (num_threads < 1)
- num_threads = 1;
- if (num_threads < num_threads_)
- WARN("Number of threads was limited down to " << num_threads << " in order to fit the memory limits during the index construction");
-# endif
-
-# pragma omp parallel for shared(index) num_threads(num_threads)
- for (unsigned iFile = 0; iFile < num_buckets_; ++iFile) {
- typename KMerIndex<kmer_index_traits>::KMerDataIndex &data_index = index.index_[iFile];
- counter.OpenBucket(iFile, !save_final);
- size_t sz = counter.bucket_end(iFile) - counter.bucket_begin(iFile);
- index.bucket_starts_[iFile + 1] = sz;
- typename kmer_index_traits::KMerRawReferenceAdaptor adaptor;
- size_t max_nodes = (size_t(std::ceil(double(sz) * 1.23)) + 2) / 3 * 3;
- if (max_nodes >= uint64_t(1) << 32) {
- emphf::hypergraph_sorter_seq<emphf::hypergraph<uint64_t> > sorter;
- typename KMerIndex<kmer_index_traits>::KMerDataIndex(sorter,
- sz, emphf::range(counter.bucket_begin(iFile), counter.bucket_end(iFile)),
- adaptor).swap(data_index);
- } else {
- emphf::hypergraph_sorter_seq<emphf::hypergraph<uint32_t> > sorter;
- typename KMerIndex<kmer_index_traits>::KMerDataIndex(sorter,
- sz, emphf::range(counter.bucket_begin(iFile), counter.bucket_end(iFile)),
- adaptor).swap(data_index);
- }
-
- counter.ReleaseBucket(iFile);
- }
-
- // Finally, record the sizes of buckets.
- for (unsigned iFile = 1; iFile < num_buckets_; ++iFile)
- index.bucket_starts_[iFile] += index.bucket_starts_[iFile - 1];
-
- if (save_final)
- counter.MergeBuckets(num_buckets_);
-
- double bits_per_kmer = 8.0 * (double)index.mem_size() / (double)kmers;
- INFO("Index built. Total " << index.mem_size() << " bytes occupied (" << bits_per_kmer << " bits per kmer).");
- index.count_size();
- return kmers;
-}
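
A heavily hedged sketch of how the classes above are typically driven together; the Seq type, the concrete KMerSplitter instance and the bucket/thread counts are all placeholders:

    #include <string>

    // Split reads into k-mer files, count distinct k-mers on disk, then build the
    // perfect-hash index mapping each k-mer to an id in [0, kmers).
    template<class Seq>
    size_t build_kmer_index_example(const std::string& workdir,
                                    KMerSplitter<Seq>& splitter,
                                    KMerIndex<kmer_index_traits<Seq>>& index) {
        KMerDiskCounter<Seq> counter(workdir, splitter);
        KMerIndexBuilder<KMerIndex<kmer_index_traits<Seq>>> builder(workdir,
                                                                    /*num_buckets*/ 16,
                                                                    /*num_threads*/ 4);
        return builder.BuildIndex(index, counter, /*save_final*/ false);
    }
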
diff --git a/src/include/mph_index/mphf.hpp b/src/include/mph_index/mphf.hpp
deleted file mode 100644
index 56834c4..0000000
--- a/src/include/mph_index/mphf.hpp
+++ /dev/null
@@ -1,136 +0,0 @@
-#pragma once
-
-#include <random>
-
-#include "bitpair_vector.hpp"
-#include "ranked_bitpair_vector.hpp"
-
-#include "logger/logger.hpp"
-
-namespace emphf {
-
- template <typename BaseHasher>
- class mphf {
- public:
- mphf()
- {}
-
- template <typename HypergraphSorter, typename Range, typename Adaptor>
- mphf(HypergraphSorter& sorter, size_t n,
- Range const& input_range, Adaptor adaptor,
- double gamma = 1.23)
- : m_n(n)
- , m_hash_domain(std::max((size_t(std::ceil(double(m_n) * gamma)) + 2) / 3, size_t(2)))
- {
- typedef typename HypergraphSorter::node_t node_t;
- typedef typename HypergraphSorter::hyperedge hyperedge;
- typedef decltype(*std::begin(input_range)) value_type;
-
- size_t nodes_domain = m_hash_domain * 3;
-
- if (nodes_domain >= std::numeric_limits<node_t>::max()) {
- throw std::invalid_argument("Too many nodes for node_t");
- }
-
- auto edge_gen = [&](value_type s) {
- using std::get;
- auto hashes = m_hasher(adaptor(s));
- return hyperedge((node_t)(get<0>(hashes) % m_hash_domain),
- (node_t)(m_hash_domain +
- (get<1>(hashes) % m_hash_domain)),
- (node_t)(2 * m_hash_domain +
- (get<2>(hashes) % m_hash_domain)));
- };
-
- std::mt19937_64 rng(37); // deterministic seed
-
- for (size_t trial = 0; ; ++trial) {
- //logger() << "Hypergraph generation: trial " << trial << std::endl;
-
- m_hasher = BaseHasher::generate(rng);
- if (sorter.try_generate_and_sort(input_range, edge_gen,
- m_n, m_hash_domain)) break;
- }
-
- auto peeling_order = sorter.get_peeling_order();
- bitpair_vector bv(nodes_domain);
-
- //logger() << "Assigning values" << std::endl;
-
- for (auto edge = peeling_order.first;
- edge != peeling_order.second;
- ++edge) {
-
- uint64_t target = orientation(*edge);
- uint64_t assigned = bv[edge->v1] + bv[edge->v2];
-
- // "assigned values" must be nonzeros to be ranked, so
- // if the result is 0 we assign 3
- bv.set(edge->v0, ((target - assigned + 9) % 3) ?: 3);
- }
-
- m_bv.build(std::move(bv));
- }
-
- uint64_t size() const
- {
- return m_n;
- }
-
- size_t mem_size() const {
- return m_bv.mem_size();
- }
-
- BaseHasher const& base_hasher() const
- {
- return m_hasher;
- }
-
- template <typename T, typename Adaptor>
- uint64_t lookup(T val, Adaptor adaptor)
- {
- using std::get;
- auto hashes = m_hasher(adaptor(val));
- uint64_t nodes[3] = {get<0>(hashes) % m_hash_domain,
- m_hash_domain + (get<1>(hashes) % m_hash_domain),
- 2 * m_hash_domain + (get<2>(hashes) % m_hash_domain)};
-
- uint64_t hidx = (m_bv[nodes[0]] + m_bv[nodes[1]] + m_bv[nodes[2]]) % 3;
- return m_bv.rank(nodes[hidx]);
- }
-
- void swap(mphf& other)
- {
- std::swap(m_n, other.m_n);
- std::swap(m_hash_domain, other.m_hash_domain);
- m_hasher.swap(other.m_hasher);
- m_bv.swap(other.m_bv);
- }
-
- void save(std::ostream& os) const
- {
- os.write(reinterpret_cast<char const*>(&m_n), sizeof(m_n));
- os.write(reinterpret_cast<char const*>(&m_hash_domain),
- sizeof(m_hash_domain));
- m_hasher.save(os);
- m_bv.save(os);
- }
-
- void load(std::istream& is)
- {
- is.read(reinterpret_cast<char*>(&m_n), sizeof(m_n));
- is.read(reinterpret_cast<char*>(&m_hash_domain),
- sizeof(m_hash_domain));
- m_hasher.load(is);
- m_bv.load(is);
- }
-
-
- private:
-
- uint64_t m_n;
- uint64_t m_hash_domain;
- BaseHasher m_hasher;
- ranked_bitpair_vector m_bv;
- };
-}
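
For reference, a small check of the value-assignment rule used in the constructor above: the value v stored at the peeled vertex v0 is chosen so that (v + bv[v1] + bv[v2]) % 3 equals the edge orientation, with 0 encoded as 3 so that rank() counts the entry as assigned. In the real construction the orientation is in 0..2 and the sum of the two already-assigned endpoints is at most 6:

    #include <cassert>
    #include <cstdint>

    inline void mphf_assignment_rule_example() {
        uint64_t cases[][2] = {{1, 2}, {2, 2}, {0, 5}};      // {target, assigned}
        for (auto& c : cases) {
            uint64_t target = c[0], assigned = c[1];
            uint64_t v = (target - assigned + 9) % 3;
            if (v == 0) v = 3;                               // the "?: 3" shorthand above
            assert((v + assigned) % 3 == target);
        }
    }
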
diff --git a/src/include/omni/action_handlers.hpp b/src/include/omni/action_handlers.hpp
deleted file mode 100644
index 573a44e..0000000
--- a/src/include/omni/action_handlers.hpp
+++ /dev/null
@@ -1,345 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#ifndef __OMNI_ACTION_HANDLERS_HPP__
-#define __OMNI_ACTION_HANDLERS_HPP__
-
-#include "logger/logger.hpp"
-
-#include <boost/noncopyable.hpp>
-#include <string>
-#include <vector>
-
-namespace omnigraph {
-
-using std::vector;
-/**
- * ActionHandler is the base listening class for graph events. All structures and information storages
- * that are meant to stay synchronized with the graph should use it. To make a handler listen to graph
- * events, one should add it to the graph's listeners.
- * Normally a structure extends ActionHandler and overrides several handling methods; in its constructor
- * it adds itself to the graph's handler list and in its destructor it removes itself from that list.
- * All events are divided into two levels: low-level events and high-level events.
- * Low-level events are addition/deletion of vertices/edges. These events should be triggered only after
- * the high-level events, once all data has been transferred and the graph structure is consistent.
- * High-level events should be used to keep external data synchronized with the graph and keep internal
- * data consistent. The current high-level events are merge, glue and split; this list may be extended in the near future.
- */
-template<typename VertexId, typename EdgeId>
-class ActionHandler : private boost::noncopyable {
- const std::string handler_name_;
- private:
- bool attached_;
- public:
- /**
- * Create an action handler with the given name. With this name one can find out what type of handler it is.
- */
- ActionHandler(const std::string& name)
- : handler_name_(name), attached_(true) {
- }
-
- virtual ~ActionHandler() {
- TRACE("~ActionHandler " << handler_name_);
- }
-
- /**
- * Returns the name of this handler.
- */
- const std::string& name() const {
- return handler_name_;
- }
-
- /**
- * Low-level event triggered when a vertex is added to the graph.
- * @param v new vertex
- */
- virtual void HandleAdd(VertexId /*v*/) {}
-
- /**
- * Low-level event triggered when an edge is added to the graph.
- * @param e new edge
- */
- virtual void HandleAdd(EdgeId /*e*/) {}
-
- /**
- * Low-level event triggered when a vertex is deleted from the graph.
- * @param v vertex to delete
- */
- virtual void HandleDelete(VertexId /*v*/) {}
-
- /**
- * Low-level event triggered when an edge is deleted from the graph.
- * @param e edge to delete
- */
- virtual void HandleDelete(EdgeId /*e*/) {}
-
- /**
- * High-level event triggered when a merge operation is performed on the graph, i.e. when a path of
- * edges whose inner vertices all have exactly one incoming and one outgoing edge is replaced with a
- * single edge. Since this is a high-level operation, neither the creation event of the new edge nor
- * the deletion events of the old edges have been triggered yet when this event fires.
- * @param old_edges path of edges to be replaced with a single edge
- * @param new_edge new edge added as a replacement of the path
- */
- virtual void HandleMerge(const vector<EdgeId>& /*old_edges*/, EdgeId /*new_edge*/) {}
-
- /**
- * High-level event triggered when a glue operation is performed on the graph, i.e. when an edge is
- * completely replaced with another edge. This operation is widely used in bulge removal, where an
- * alternative path is glued to the main path. Since this is a high-level operation, the deletion event
- * of the old edge has not been triggered yet when this event fires.
- * @param new_edge result of the glue
- * @param edge1 edge to be glued to edge2
- * @param edge2 edge that edge1 should be glued with
- */
- virtual void HandleGlue(EdgeId /*new_edge*/, EdgeId /*edge1*/, EdgeId /*edge2*/) {}
-
- /**
- * High-level event triggered when a split operation is performed on the graph, i.e. when an edge is
- * split into several shorter edges. Split is the reverse of the merge operation.
- * Since this is a high-level operation, neither the deletion event of the old edge nor the creation
- * events of the new edges have been triggered yet when this event fires.
- * @param old_edge edge to be split
- * @param new_edge_1 first edge resulting from the split
- * @param new_edge_2 second edge resulting from the split
- */
- virtual void HandleSplit(EdgeId /*old_edge*/, EdgeId /*new_edge_1*/,
- EdgeId /*new_edge_2*/) {}
-
- /**
- * Every thread-safe descendant should override this method to enable correct concurrent graph processing.
- */
- virtual bool IsThreadSafe() const {
- return false;
- }
-
- bool IsAttached() const {
- return attached_;
- }
-
- void Attach() {
- VERIFY(!attached_);
- attached_ = true;
- }
-
- void Detach() {
- VERIFY(attached_);
- attached_ = false;
- }
-};
-
-template<class Graph>
-class GraphActionHandler : public ActionHandler<typename Graph::VertexId,
- typename Graph::EdgeId> {
- typedef ActionHandler<typename Graph::VertexId, typename Graph::EdgeId> base;
-
- const Graph& g_;
-
- protected:
- const Graph& g() const {
- return g_;
- }
-
- public:
- GraphActionHandler(const Graph& g, const std::string& name)
- : base(name),
- g_(g) {
- TRACE("Adding new action handler: " << this->name());
- g_.AddActionHandler(this);
- }
-
- GraphActionHandler(const GraphActionHandler<Graph> &other)
- : base(other.name()),
- g_(other.g_) {
- TRACE("Adding new action handler: " << this->name());
- g_.AddActionHandler(this);
- }
-
- virtual ~GraphActionHandler() {
- TRACE("Removing action handler: " << this->name());
- if(this->IsAttached())
- this->Detach();
- g_.RemoveActionHandler(this);
- }
-};
-
-/**
- * The HandlerApplier structure was introduced to support various types of graphs and make the handler
- * machinery more flexible. If a certain graph implementation requires a special handler-triggering
- * scheme, one can store an extension of HandlerApplier in the graph and trigger the HandlerApplier
- * methods instead of the GraphHandler methods.
- * HandlerApplier contains one method for each graph event, defining the exact way that event should
- * be triggered.
- */
-template<typename VertexId, typename EdgeId>
-class HandlerApplier {
- typedef ActionHandler<VertexId, EdgeId> Handler;
- public:
-
- virtual void
- ApplyAdd(Handler& handler, VertexId v) const = 0;
-
- virtual void
- ApplyAdd(Handler& handler, EdgeId e) const = 0;
-
- virtual void
- ApplyDelete(Handler& handler, VertexId v) const = 0;
-
- virtual void
- ApplyDelete(Handler& handler, EdgeId e) const = 0;
-
- virtual void ApplyMerge(Handler& handler, vector<EdgeId> old_edges,
- EdgeId new_edge) const = 0;
-
- virtual void ApplyGlue(Handler& handler, EdgeId new_edge, EdgeId edge1,
- EdgeId edge2) const = 0;
-
- virtual void ApplySplit(Handler& handler, EdgeId old_edge,
- EdgeId new_edge_1, EdgeId new_edge2) const = 0;
-
- virtual ~HandlerApplier() {
- }
-};
-
-/**
- * SimpleHandlerApplier is a simple implementation of HandlerApplier with no special filtering.
- */
-template<class Graph>
-class SimpleHandlerApplier : public HandlerApplier<typename Graph::VertexId,
- typename Graph::EdgeId> {
- public:
- typedef typename Graph::VertexId VertexId;
- typedef typename Graph::EdgeId EdgeId;
- typedef ActionHandler<VertexId, EdgeId> Handler;
-
- virtual void ApplyAdd(Handler& handler, VertexId v) const {
- handler.HandleAdd(v);
- }
-
- virtual void ApplyAdd(Handler& handler, EdgeId e) const {
- handler.HandleAdd(e);
- }
-
- virtual void ApplyDelete(Handler& handler, VertexId v) const {
- handler.HandleDelete(v);
- }
-
- virtual void ApplyDelete(Handler& handler, EdgeId e) const {
- handler.HandleDelete(e);
- }
-
- virtual void ApplyMerge(Handler& handler, vector<EdgeId> old_edges,
- EdgeId new_edge) const {
- handler.HandleMerge(old_edges, new_edge);
- }
-
- virtual void ApplyGlue(Handler& handler, EdgeId new_edge, EdgeId edge1,
- EdgeId edge2) const {
- handler.HandleGlue(new_edge, edge1, edge2);
- }
-
- virtual void ApplySplit(Handler& handler, EdgeId old_edge, EdgeId new_edge1,
- EdgeId new_edge2) const {
- handler.HandleSplit(old_edge, new_edge1, new_edge2);
- }
-
-};
-
-/**
- * PairedHandlerApplier is an implementation of HandlerApplier for graphs that synchronize actions
- * performed on vertices/edges with their reverse-complement analogues. Thus, although the corresponding
- * method is called only once, the event should be triggered twice: once for the parameters the method
- * was called with and once for their reverse-complement counterparts. Assertions were also added for bad cases.
- */
-template<class Graph>
-class PairedHandlerApplier : public HandlerApplier<typename Graph::VertexId,
- typename Graph::EdgeId> {
- private:
- Graph &graph_;
- public:
- typedef typename Graph::VertexId VertexId;
- typedef typename Graph::EdgeId EdgeId;
- typedef ActionHandler<VertexId, EdgeId> Handler;
-
- PairedHandlerApplier(Graph &graph)
- : graph_(graph) {
- }
-
- virtual void ApplyAdd(Handler& handler, VertexId v) const {
- VertexId rcv = graph_.conjugate(v);
- handler.HandleAdd(v);
- if (v != rcv) {
- handler.HandleAdd(rcv);
- }
- }
-
- virtual void ApplyAdd(Handler& handler, EdgeId e) const {
- EdgeId rce = graph_.conjugate(e);
- handler.HandleAdd(e);
- if (e != rce) {
- handler.HandleAdd(rce);
- }
- }
-
- virtual void ApplyDelete(Handler& handler, VertexId v) const {
- VertexId rcv = graph_.conjugate(v);
- handler.HandleDelete(v);
- if (v != rcv) {
- handler.HandleDelete(rcv);
- }
- }
-
- virtual void ApplyDelete(Handler& handler, EdgeId e) const {
- EdgeId rce = graph_.conjugate(e);
- handler.HandleDelete(e);
- if (e != rce) {
- handler.HandleDelete(rce);
- }
- }
-
- virtual void ApplyMerge(Handler& handler, vector<EdgeId> old_edges,
- EdgeId new_edge) const {
- EdgeId rce = graph_.conjugate(new_edge);
- handler.HandleMerge(old_edges, new_edge);
- if (new_edge != rce) {
- vector<EdgeId> rc_old_edges;
- for (int i = (int) old_edges.size() - 1; i >= 0; i--) {
- rc_old_edges.push_back(graph_.conjugate(old_edges[i]));
- }
- handler.HandleMerge(rc_old_edges, rce);
- }
- }
-
- virtual void ApplyGlue(Handler& handler, EdgeId new_edge, EdgeId edge1,
- EdgeId edge2) const {
- EdgeId rc_edge1 = graph_.conjugate(edge1);
- EdgeId rc_edge2 = graph_.conjugate(edge2);
- VERIFY(edge1 != edge2);
- VERIFY(edge2 != rc_edge2);
- handler.HandleGlue(new_edge, edge1, edge2);
- if (edge1 != rc_edge1) {
- handler.HandleGlue(graph_.conjugate(new_edge), rc_edge1, rc_edge2);
- }
- }
-
- virtual void ApplySplit(Handler& handler, EdgeId old_edge,
- EdgeId new_edge_1, EdgeId new_edge2) const {
- EdgeId rce = graph_.conjugate(old_edge);
- //VERIFY(old_edge != rce);
- handler.HandleSplit(old_edge, new_edge_1, new_edge2);
- if (old_edge != rce) {
- handler.HandleSplit(rce, graph_.conjugate(new_edge2),
- graph_.conjugate(new_edge_1));
- }
- }
-
- private:
- DECL_LOGGER("PairedHandlerApplier")
-};
-
-};
-
-#endif
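
A hedged sketch of the usage pattern the comments above describe: a structure keeps itself synchronized with the graph by extending GraphActionHandler and overriding the events it cares about. EdgeCountWatcher is a made-up name:

    // Hypothetical handler that keeps a running count of edges in the graph; the
    // base constructor registers it with the graph and the destructor detaches it.
    template<class Graph>
    class EdgeCountWatcher : public omnigraph::GraphActionHandler<Graph> {
        typedef omnigraph::GraphActionHandler<Graph> base;
        typedef typename Graph::EdgeId EdgeId;
        size_t edge_count_;
    public:
        EdgeCountWatcher(const Graph& g)
            : base(g, "EdgeCountWatcher"), edge_count_(0) {}

        virtual void HandleAdd(EdgeId /*e*/) { ++edge_count_; }
        virtual void HandleDelete(EdgeId /*e*/) { --edge_count_; }

        size_t edge_count() const { return edge_count_; }
    };
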
diff --git a/src/include/omni/basic_edge_conditions.hpp b/src/include/omni/basic_edge_conditions.hpp
deleted file mode 100644
index e19890f..0000000
--- a/src/include/omni/basic_edge_conditions.hpp
+++ /dev/null
@@ -1,268 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include "func.hpp"
-#include "pred.hpp"
-#include "omni_utils.hpp"
-namespace omnigraph {
-
-using namespace func;
-
-template<class Graph>
-class EdgeCondition : public Predicate<typename Graph::EdgeId> {
- typedef typename Graph::EdgeId EdgeId;
-
- const Graph& g_;
- protected:
-
- EdgeCondition(const Graph& g)
- : g_(g) {
- }
-
- const Graph& g() const {
- return g_;
- }
-
-};
-
-template<class Graph>
-class IsolatedEdgeCondition : public EdgeCondition<Graph> {
- typedef typename Graph::EdgeId EdgeId;
- typedef typename Graph::VertexId VertexId;
- typedef EdgeCondition<Graph> base;
-
- bool IsTerminalVertex(VertexId v) const {
- return this->g().IncomingEdgeCount(v) + this->g().OutgoingEdgeCount(v) == 1;
- }
-
-public:
- IsolatedEdgeCondition(const Graph& g) : base(g) {
- }
-
- bool Check(EdgeId e) const {
- return IsTerminalVertex(this->g().EdgeStart(e)) && IsTerminalVertex(this->g().EdgeEnd(e));
- }
-
-};
-
-template<class Graph>
-inline bool HasAlternatives(const Graph& g, typename Graph::EdgeId e) {
- return g.OutgoingEdgeCount(g.EdgeStart(e)) > 1
- && g.IncomingEdgeCount(g.EdgeEnd(e)) > 1;
-}
-
-
-template<class Graph>
-class AlternativesPresenceCondition : public EdgeCondition<Graph> {
- typedef typename Graph::EdgeId EdgeId;
- typedef typename Graph::VertexId VertexId;
- typedef EdgeCondition<Graph> base;
-
- public:
-
- AlternativesPresenceCondition(const Graph& g)
- : base(g) {
-
- }
-
- bool Check(EdgeId e) const {
- return HasAlternatives(this->g(), e);
- }
-
-};
-
-template<class Graph>
-pred::TypedPredicate<typename Graph::EdgeId> AddAlternativesPresenceCondition(const Graph& g,
- pred::TypedPredicate<typename Graph::EdgeId> condition) {
- return pred::And(AlternativesPresenceCondition<Graph>(g), condition);
-}
-
-template<class Graph>
-class CoverageUpperBound : public EdgeCondition<Graph> {
- typedef typename Graph::EdgeId EdgeId;
- typedef EdgeCondition<Graph> base;
- const double max_coverage_;
-
- public:
-
- CoverageUpperBound(const Graph& g, double max_coverage)
- : base(g),
- max_coverage_(max_coverage) {
- }
-
- bool Check(EdgeId e) const {
- return math::le(this->g().coverage(e), max_coverage_);
- }
-
-};
-
-template<class Graph>
-class LengthUpperBound : public EdgeCondition<Graph> {
- typedef typename Graph::EdgeId EdgeId;
- typedef EdgeCondition<Graph> base;
-
- const size_t max_length_;
-
- public:
-
- LengthUpperBound(const Graph& g, size_t max_length)
- : base(g),
- max_length_(max_length) {
- }
-
- bool Check(EdgeId e) const {
- return this->g().length(e) <= max_length_;
- }
-
-};
-
-template<class Graph, class PathFinder>
-class PathLengthLowerBound : public EdgeCondition<Graph> {
- typedef typename Graph::EdgeId EdgeId;
- typedef typename Graph::VertexId VertexId;
- typedef EdgeCondition<Graph> base;
-
- PathFinder path_finder_;
- size_t min_length_;
-
- ForwardDirection<Graph> forward_;
- BackwardDirection<Graph> backward_;
-
- size_t CumulativePathLength(EdgeId e, const AbstractDirection<Graph>& direction) const {
- return CumulativeLength(this->g(), path_finder_(e, direction));
- }
-
- public:
- PathLengthLowerBound(const Graph& g, const PathFinder& path_finder,
- size_t min_length)
- : base(g),
- path_finder_(path_finder),
- min_length_(min_length),
- forward_(g),
- backward_(g) {
-
- }
-
- bool Check(EdgeId e) const {
- size_t forward = CumulativePathLength(e, forward_);
- size_t backward = CumulativePathLength(e, backward_);
- //check that the path was trivial in at least one of the directions
- VERIFY(forward == this->g().length(e) || backward == this->g().length(e));
- return std::max(forward, backward) >= min_length_;
- }
-};
-
-template<class Graph, class PathFinder>
-PathLengthLowerBound<Graph, PathFinder>
-MakePathLengthLowerBound(const Graph& g, const PathFinder& path_finder, size_t min_length) {
- return PathLengthLowerBound<Graph, PathFinder>(g, path_finder, min_length);
-}
-
-template<class Graph>
-class UniquenessPlausabilityCondition : public EdgeCondition<Graph> {
- typedef typename Graph::EdgeId EdgeId;
- typedef typename Graph::VertexId VertexId;
- typedef EdgeCondition<Graph> base;
-
- virtual bool CheckUniqueness(EdgeId e, bool forward) const = 0;
-
- virtual bool CheckPlausibility(EdgeId e, bool forward) const = 0;
-
- bool SingleUnique(const vector<EdgeId>& edges, bool forward) const {
- return edges.size() == 1 && CheckUniqueness(*edges.begin(), forward);
- }
-
- bool ExistPlausible(EdgeId init_e, const vector<EdgeId>& edges,
- bool forward) const {
- for (EdgeId e : edges) {
- if (e == init_e)
- continue;
- if (CheckPlausibility(e, forward)) {
- return true;
- }
- }
- return false;
- }
-
- bool Check(EdgeId e, const AbstractDirection<Graph>& direction) const {
- return SingleUnique(direction.IncomingEdges(direction.EdgeStart(e)),
- !direction.IsForward())
- && ExistPlausible(
- e, direction.OutgoingEdges(direction.EdgeStart(e)),
- direction.IsForward());
- }
-
- public:
-
- UniquenessPlausabilityCondition(const Graph& g)
- : base(g) {
-
- }
-
- bool Check(EdgeId e) const {
- return Check(e, ForwardDirection<Graph>(this->g()))
- || Check(e, BackwardDirection<Graph>(this->g()));
- }
-
-};
-
-template<class Graph>
-class PredicateUniquenessPlausabilityCondition :
- public UniquenessPlausabilityCondition<Graph> {
- typedef typename Graph::EdgeId EdgeId;
- typedef typename Graph::VertexId VertexId;
- typedef pred::TypedPredicate<EdgeId> EdgePredicate;
- typedef UniquenessPlausabilityCondition<Graph> base;
-
- EdgePredicate uniqueness_condition_;
- EdgePredicate plausiblity_condition_;
-
- bool CheckUniqueness(EdgeId e, bool) const {
- return uniqueness_condition_(e);
- }
-
- bool CheckPlausibility(EdgeId e, bool) const {
- return plausiblity_condition_(e);
- }
-
- public:
-
- PredicateUniquenessPlausabilityCondition(
- const Graph& g, EdgePredicate uniqueness_condition,
- EdgePredicate plausiblity_condition)
- : base(g),
- uniqueness_condition_(uniqueness_condition),
- plausiblity_condition_(plausiblity_condition) {
- }
-
-};
-
-template<class Graph>
-class DefaultUniquenessPlausabilityCondition :
- public PredicateUniquenessPlausabilityCondition<Graph> {
- typedef typename Graph::EdgeId EdgeId;
- typedef typename Graph::VertexId VertexId;
- typedef pred::TypedPredicate<EdgeId> EdgePredicate;
- typedef PredicateUniquenessPlausabilityCondition<Graph> base;
-
- public:
-
- DefaultUniquenessPlausabilityCondition(const Graph& g,
- size_t uniqueness_length,
- size_t plausibility_length)
- : base(g,
- MakePathLengthLowerBound(g,
- UniquePathFinder<Graph>(g), uniqueness_length),
- MakePathLengthLowerBound(g,
- PlausiblePathFinder<Graph>(g, 2 * plausibility_length), plausibility_length)) {
- }
-
-};
-
-}
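
The conditions defined in the file above are small predicates over Graph::EdgeId that are meant to be combined, e.g. AddAlternativesPresenceCondition wraps a caller-supplied condition with pred::And so that an edge is only considered when alternative paths exist. Below is a minimal, self-contained sketch of that composition pattern; the EdgeCondition alias, the And helper and the toy per-edge arrays are illustrative stand-ins for the omnigraph/pred types, not the real API:

    #include <cstddef>
    #include <functional>
    #include <iostream>

    // Illustrative stand-ins; the real code uses Graph::EdgeId and pred::TypedPredicate.
    using EdgeId = int;
    using EdgeCondition = std::function<bool(EdgeId)>;

    // Combine two conditions the way pred::And combines predicates: both must hold.
    EdgeCondition And(EdgeCondition a, EdgeCondition b) {
        return [a, b](EdgeId e) { return a(e) && b(e); };
    }

    int main() {
        // Toy per-edge data indexed by EdgeId (stand-in for Graph::length/coverage).
        const std::size_t length[]   = {50, 250, 80};
        const double      coverage[] = {3.5, 12.0, 40.0};

        // Analogues of LengthUpperBound and CoverageUpperBound.
        EdgeCondition short_enough = [&](EdgeId e) { return length[e] <= 150; };
        EdgeCondition low_covered  = [&](EdgeId e) { return coverage[e] <= 10.0; };

        // Composition in the spirit of AddAlternativesPresenceCondition.
        EdgeCondition candidate = And(short_enough, low_covered);

        for (EdgeId e = 0; e < 3; ++e)
            std::cout << "edge " << e << (candidate(e) ? ": passes\n" : ": filtered\n");
    }

Only edge 0 passes both bounds; the other two are filtered by length or coverage, mirroring how the real conditions prune non-candidate edges before the heavier checks run.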
diff --git a/src/include/omni/bulge_remover.hpp b/src/include/omni/bulge_remover.hpp
deleted file mode 100644
index 545c70d..0000000
--- a/src/include/omni/bulge_remover.hpp
+++ /dev/null
@@ -1,781 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-/*
- * bulge_remover.hpp
- *
- * Created on: Apr 13, 2011
- * Author: sergey
- */
-
-#pragma once
-
-#include <cmath>
-#include <stack>
-#include "standard_base.hpp"
-#include "omni_utils.hpp"
-#include "graph_component.hpp"
-#include "xmath.h"
-#include "sequence/sequence_tools.hpp"
-#include "path_processor.hpp"
-#include "graph_processing_algorithm.hpp"
-
-namespace omnigraph {
-
-template<class Graph>
-struct SimplePathCondition {
- typedef typename Graph::EdgeId EdgeId;
- const Graph& g_;
-
- SimplePathCondition(const Graph& g) :
- g_(g) {
-
- }
-
- bool operator()(EdgeId edge, const vector<EdgeId>& path) const {
- if (edge == g_.conjugate(edge))
- return false;
- for (size_t i = 0; i < path.size(); ++i)
- if (edge == path[i] || edge == g_.conjugate(path[i]))
- return false;
- for (size_t i = 0; i < path.size(); ++i) {
- if (path[i] == g_.conjugate(path[i])) {
- return false;
- }
- for (size_t j = i + 1; j < path.size(); ++j)
- if (path[i] == path[j] || path[i] == g_.conjugate(path[j]))
- return false;
- }
- return true;
- }
-};
-
-template<class Graph>
-bool TrivialCondition(typename Graph::EdgeId,
- const vector<typename Graph::EdgeId>& path) {
- for (size_t i = 0; i < path.size(); ++i)
- for (size_t j = i + 1; j < path.size(); ++j)
- if (path[i] == path[j])
- return false;
- return true;
-}
-
-template<class Graph>
-class MostCoveredSimpleAlternativePathChooser: public PathProcessor<Graph>::Callback {
- typedef typename Graph::EdgeId EdgeId;
- typedef typename Graph::VertexId VertexId;
-
- const Graph& g_;
- EdgeId forbidden_edge_;
-
- double max_coverage_;
- vector<EdgeId> most_covered_path_;
-
-public:
-
- MostCoveredSimpleAlternativePathChooser(const Graph& g, EdgeId edge) :
- g_(g), forbidden_edge_(edge), max_coverage_(-1.0) {
-
- }
-
- void HandleReversedPath(const vector<EdgeId>& reversed_path) override {
- vector<EdgeId> path = this->ReversePath(reversed_path);
- double path_cov = AvgCoverage(g_, path);
- for (size_t i = 0; i < path.size(); i++) {
- if (path[i] == forbidden_edge_)
- return;
- }
- if (path_cov > max_coverage_ && SimplePathCondition<Graph>(g_)(forbidden_edge_, path)) {
- max_coverage_ = path_cov;
- most_covered_path_ = path;
- }
- }
-
- double max_coverage() {
- return max_coverage_;
- }
-
- const vector<EdgeId>& most_covered_path() {
- return most_covered_path_;
- }
-};
-
-inline size_t CountMaxDifference(size_t absolute_diff, size_t length, double relative_diff) {
- return std::max((size_t) std::floor(relative_diff * (double) length), absolute_diff);
-}
-
-template<class Graph>
-class BulgeGluer {
- typedef typename Graph::EdgeId EdgeId;
- typedef typename Graph::VertexId VertexId;
- typedef std::function<void(EdgeId edge, const vector<EdgeId>& path)> BulgeCallbackF;
- Graph& g_;
- BulgeCallbackF opt_callback_;
- std::function<void(EdgeId)> removal_handler_;
-
- void InnerProcessBulge(EdgeId edge, const vector<EdgeId>& path) {
-
- EnsureEndsPositionAligner aligner(CumulativeLength(g_, path),
- g_.length(edge));
- double prefix_length = 0.;
- vector<size_t> bulge_prefix_lengths;
-
- for (EdgeId e : path) {
- prefix_length += (double) g_.length(e);
- bulge_prefix_lengths.push_back(aligner.GetPosition((size_t) prefix_length));
- }
-
- EdgeId edge_to_split = edge;
- size_t prev_length = 0;
-
- TRACE("Process bulge " << path.size() << " edges");
-
- //fixme remove after checking results
- bool flag = false;
- VERIFY(bulge_prefix_lengths.back() == g_.length(edge));
-
- for (size_t i = 0; i < path.size(); ++i) {
- if (bulge_prefix_lengths[i] > prev_length) {
- if (bulge_prefix_lengths[i] - prev_length
- != g_.length(edge_to_split)) {
-
- TRACE("SplitEdge " << g_.str(edge_to_split));
- TRACE(
- "Start: " << g_.str(g_.EdgeStart(edge_to_split)));
- TRACE(
- "End: " << g_.str(g_.EdgeEnd(edge_to_split)));
-
- pair<EdgeId, EdgeId> split_result = g_.SplitEdge(
- edge_to_split,
- bulge_prefix_lengths[i] - prev_length);
-
- edge_to_split = split_result.second;
-
- TRACE("GlueEdges " << g_.str(split_result.first));
- flag = true;
- g_.GlueEdges(split_result.first, path[i]);
-
- } else {
- TRACE("GlueEdges " << g_.str(edge_to_split));
- flag = true;
- g_.GlueEdges(edge_to_split, path[i]);
- }
- }
- prev_length = bulge_prefix_lengths[i];
- }
- VERIFY(flag);
- }
-
-public:
-
- BulgeGluer(Graph& g, BulgeCallbackF opt_callback = 0,
- std::function<void(EdgeId)> removal_handler = 0) :
- g_(g),
- opt_callback_(opt_callback),
- removal_handler_(removal_handler) {
-
- }
-
- void operator()(EdgeId edge, const vector<EdgeId>& path) {
- if (opt_callback_)
- opt_callback_(edge, path);
-
- if (removal_handler_)
- removal_handler_(edge);
-
- VertexId start = g_.EdgeStart(edge);
- VertexId end = g_.EdgeEnd(edge);
-
- TRACE("Projecting edge " << g_.str(edge));
- InnerProcessBulge(edge, path);
-
- TRACE("Compressing start vertex " << g_.str(start));
- g_.CompressVertex(start);
-
- TRACE("Compressing end vertex " << g_.str(end));
- g_.CompressVertex(end);
- }
-
-};
-
-template<class Graph>
-class AlternativesAnalyzer {
- typedef typename Graph::EdgeId EdgeId;
- typedef typename Graph::VertexId VertexId;
- const Graph& g_;
- double max_coverage_;
- size_t max_length_;
- double max_relative_coverage_;
- size_t max_delta_;
- double max_relative_delta_;
- size_t max_edge_cnt_;
-
- static vector<EdgeId> EmptyPath() {
- static vector<EdgeId> vec = {};
- return vec;
- }
-
- /**
- * Checks that the alternative path is simple (no self-conjugate, repeated or mutually conjugate edges; contains neither e nor conjugate(e))
- * and that its average coverage * max_relative_coverage_ is at least g.coverage(e)
- */
- bool BulgeCondition(EdgeId e, const vector<EdgeId>& path,
- double path_coverage) const {
- return math::ge(path_coverage * max_relative_coverage_,
- g_.coverage(e)) && SimplePathCondition<Graph>(g_)(e, path);
- }
-
-public:
- AlternativesAnalyzer(const Graph& g, double max_coverage, size_t max_length,
- double max_relative_coverage, size_t max_delta,
- double max_relative_delta, size_t max_edge_cnt) :
- g_(g),
- max_coverage_(max_coverage),
- max_length_(max_length),
- max_relative_coverage_(max_relative_coverage),
- max_delta_(max_delta),
- max_relative_delta_(max_relative_delta),
- max_edge_cnt_(max_edge_cnt) {
- DEBUG("Created alternatives analyzer max_length=" << max_length
- << " max_coverage=" << max_coverage
- << " max_relative_coverage=" << max_relative_coverage
- << " max_delta=" << max_delta
- << " max_relative_delta=" << max_relative_delta);
- }
-
- vector<EdgeId> operator()(EdgeId e) const {
- if (g_.length(e) > max_length_ || math::gr(g_.coverage(e), max_coverage_)) {
- return EmptyPath();
- }
-
- size_t kplus_one_mer_coverage = (size_t) math::round((double) g_.length(e) * g_.coverage(e));
- TRACE("Processing edge " << g_.str(e) << " and coverage " << kplus_one_mer_coverage);
-
- size_t delta = CountMaxDifference(max_delta_, g_.length(e), max_relative_delta_);
-
- MostCoveredSimpleAlternativePathChooser<Graph> path_chooser(g_, e);
-
- VertexId start = g_.EdgeStart(e);
- TRACE("Start " << g_.str(start));
- VertexId end = g_.EdgeEnd(e);
- TRACE("End " << g_.str(end));
-
- ProcessPaths(g_, (g_.length(e) > delta) ? g_.length(e) - delta : 0,
- g_.length(e) + delta, start, end, path_chooser, max_edge_cnt_);
-
- const vector<EdgeId>& path = path_chooser.most_covered_path();
- if (!path.empty()) {
- VERIFY(g_.EdgeStart(path[0]) == start);
- VERIFY(g_.EdgeEnd(path.back()) == end);
- }
-
- double path_coverage = path_chooser.max_coverage();
- if (math::gr(path_coverage, 0.)) {
- TRACE("Best path with coverage " << path_coverage << " is " << PrintPath(g_, path));
-
- if (BulgeCondition(e, path, path_coverage)) {
- TRACE("Satisfied condition");
- return path;
- } else {
- TRACE("Didn't satisfy condition");
- return EmptyPath();
- }
- } else {
- TRACE("Didn't find alternative");
- return EmptyPath();
- }
- }
-
- double max_coverage() const {
- return max_coverage_;
- }
-
- size_t max_length() const {
- return max_length_;
- }
-
-private:
- DECL_LOGGER("AlternativesAnalyzer");
-};
-
-template<class Graph>
-pred::TypedPredicate<typename Graph::EdgeId>
-NecessaryBulgeCondition(const Graph& g, size_t max_length, double max_coverage) {
- return AddAlternativesPresenceCondition(g,
- pred::And(LengthUpperBound<Graph>(g, max_length),
- CoverageUpperBound<Graph>(g, max_coverage)));
-}
-
-/**
- * This class removes simple bulges from the given graph with the following algorithm: it iterates through all edges
- * of the graph and checks, for each edge, whether it is likely to be a simple bulge;
- * if the edge is judged to be one, it is projected onto its alternative path and removed.
- */
-//template<class Graph>
-//class OldBulgeRemover: public EdgeProcessingAlgorithm<Graph> {
-// typedef EdgeProcessingAlgorithm<Graph> base;
-// typedef typename Graph::EdgeId EdgeId;
-// typedef typename Graph::VertexId VertexId;
-//
-//protected:
-//
-// /*virtual*/
-// bool ProcessEdge(EdgeId e) {
-// TRACE("Considering edge " << this->g().str(e)
-// << " of length " << this->g().length(e)
-// << " and avg coverage " << this->g().coverage(e));
-//
-// if (!HasAlternatives(this->g(), e)) {
-// TRACE("Not possible bulge edge");
-// return false;
-// }
-//
-// for (const auto& analyzer : alternatives_analyzers_) {
-// vector<EdgeId> alternative = analyzer(e);
-// if (!alternative.empty()) {
-// gluer_(e, alternative);
-// return true;
-// }
-// }
-// return false;
-// }
-//
-//public:
-//
-// typedef std::function<void(EdgeId edge, const vector<EdgeId>& path)> BulgeCallbackF;
-//
-//// BulgeRemover(Graph& g, double max_coverage, size_t max_length,
-//// double max_relative_coverage, size_t max_delta,
-//// double max_relative_delta,
-//// size_t max_edge_cnt,
-//// BulgeCallbackF opt_callback = 0,
-//// std::function<void(EdgeId)> removal_handler = 0) :
-//// base(g, true),
-//// gluer_(g, opt_callback, removal_handler) {
-//// DEBUG("Launching br max_length=" << max_length
-//// << " max_coverage=" << max_coverage
-//// << " max_relative_coverage=" << max_relative_coverage
-//// << " max_delta=" << max_delta
-//// << " max_relative_delta=" << max_relative_delta
-//// << " max_number_edges=" << max_edge_cnt);
-//// alternatives_analyzers_.push_back(
-//// AlternativesAnalyzer<Graph>(g, max_coverage,
-//// max_length, max_relative_coverage,
-//// max_delta, max_relative_delta, max_edge_cnt));
-//// }
-//
-// OldBulgeRemover(Graph& g,
-// const std::vector<AlternativesAnalyzer<Graph>>& alternatives_analyzers,
-// BulgeCallbackF opt_callback = 0,
-// std::function<void(EdgeId)> removal_handler = 0) :
-// base(g, true),
-// alternatives_analyzers_(alternatives_analyzers),
-// gluer_(g, opt_callback, removal_handler) {
-// }
-//
-//private:
-// std::vector<AlternativesAnalyzer<Graph>> alternatives_analyzers_;
-// BulgeGluer<Graph> gluer_;
-//private:
-// DECL_LOGGER("BulgeRemover")
-//};
-
-template<class Graph>
-inline double AbsoluteMaxCoverage(const std::vector<AlternativesAnalyzer<Graph>>& alternatives_analyzers) {
- double ans = -1.;
- for (const auto& analyzer : alternatives_analyzers) {
- ans = std::max(ans, analyzer.max_coverage());
- }
- return ans;
-}
-
-//fixme maybe switch on parallel finder?
-template<class Graph, class InterestingElementFinder>
-class BulgeRemover: public PersistentProcessingAlgorithm<Graph,
- typename Graph::EdgeId,
- InterestingElementFinder,
- CoverageComparator<Graph>> {
- typedef typename Graph::EdgeId EdgeId;
- typedef typename Graph::VertexId VertexId;
- typedef PersistentProcessingAlgorithm<Graph, EdgeId,
- InterestingElementFinder, CoverageComparator<Graph>> base;
-
-protected:
-
- /*virtual*/
- bool Process(EdgeId e) {
- TRACE("Considering edge " << this->g().str(e)
- << " of length " << this->g().length(e)
- << " and avg coverage " << this->g().coverage(e));
-
- if (!HasAlternatives(this->g(), e)) {
- TRACE("Not possible bulge edge");
- return false;
- }
-
- vector<EdgeId> alternative = alternatives_analyzer_(e);
- if (!alternative.empty()) {
- gluer_(e, alternative);
- return true;
- }
- return false;
- }
-
-public:
-
- typedef std::function<void(EdgeId edge, const vector<EdgeId>& path)> BulgeCallbackF;
-
-// BulgeRemover(Graph& g, double max_coverage, size_t max_length,
-// double max_relative_coverage, size_t max_delta,
-// double max_relative_delta,
-// size_t max_edge_cnt,
-// BulgeCallbackF opt_callback = 0,
-// std::function<void(EdgeId)> removal_handler = 0) :
-// base(g, true),
-// gluer_(g, opt_callback, removal_handler) {
-// DEBUG("Launching br max_length=" << max_length
-// << " max_coverage=" << max_coverage
-// << " max_relative_coverage=" << max_relative_coverage
-// << " max_delta=" << max_delta
-// << " max_relative_delta=" << max_relative_delta
-// << " max_number_edges=" << max_edge_cnt);
-// alternatives_analyzers_.push_back(
-// AlternativesAnalyzer<Graph>(g, max_coverage,
-// max_length, max_relative_coverage,
-// max_delta, max_relative_delta, max_edge_cnt));
-// }
-
- BulgeRemover(Graph& g, const InterestingElementFinder& interesting_finder,
- const AlternativesAnalyzer<Graph>& alternatives_analyzer,
- BulgeCallbackF opt_callback = 0,
- std::function<void(EdgeId)> removal_handler = 0,
- bool track_changes = true) :
- base(g,
- interesting_finder,
- /*canonical_only*/true,
- CoverageComparator<Graph>(g),
- track_changes),
- alternatives_analyzer_(alternatives_analyzer),
- gluer_(g, opt_callback, removal_handler) {
- }
-
-private:
- AlternativesAnalyzer<Graph> alternatives_analyzer_;
- BulgeGluer<Graph> gluer_;
-private:
- DECL_LOGGER("BulgeRemover")
-};
-
-template<class Graph, class InterestingElementFinder>
-class ParallelBulgeRemover : public PersistentAlgorithmBase<Graph> {
- typedef typename Graph::EdgeId EdgeId;
- typedef typename Graph::VertexId VertexId;
- typedef SmartSetIterator<Graph, EdgeId, CoverageComparator<Graph>> SmartEdgeSet;
-
- size_t buff_size_;
- double buff_cov_diff_;
- double buff_cov_rel_diff_;
- AlternativesAnalyzer<Graph> alternatives_analyzer_;
- BulgeGluer<Graph> gluer_;
- InterestingElementFinder interesting_edge_finder_;
- //todo remove
- bool tracking_;
-
- size_t curr_iteration_;
-
- SmartEdgeSet it_;
-
- static vector<EdgeId> EmptyPath() {
- static vector<EdgeId> vec = {};
- return vec;
- }
-
- struct BulgeInfo : private boost::noncopyable {
- size_t id;
- EdgeId e;
- std::vector<EdgeId> alternative;
-
- BulgeInfo() :
- id(-1ul) {
- }
-
- BulgeInfo(size_t id_, EdgeId e_, std::vector<EdgeId> alternative_) :
- id(id_), e(e_), alternative(std::move(alternative_)) {
-
- }
-
- BulgeInfo(BulgeInfo&& that) {
- *this = std::move(that);
- }
-
- BulgeInfo& operator= (BulgeInfo&& that) {
- id = that.id;
- e = that.e;
- alternative = std::move(that.alternative);
- return *this;
- }
-
-// BulgeInfo(size_t id_, EdgeId e_, std::vector<EdgeId>&& alternative_) :
-// id(id_), e(e_), alternative(std::move(alternative_)) {
-//
-// }
-//
- bool operator< (const BulgeInfo& that) const {
-// VERIFY_MSG(id != that.id, "Ooops " << id);
- return id < that.id;
- }
-
- std::string str(const Graph& g) const {
- std::stringstream ss;
- ss << "BulgeInfo " << id
- << " e: " << g.str(e)
- << " path: " << PrintPath(g, alternative);
- return ss.str();
- }
-
- };
-
- bool CheckInteracting(const BulgeInfo& info, const std::unordered_set<EdgeId>& involved_edges) const {
- if (involved_edges.count(info.e))
- return true;
- for (EdgeId e : info.alternative)
- if (involved_edges.count(e))
- return true;
- return false;
- }
-
- void AccountEdge(EdgeId e, std::unordered_set<EdgeId>& involved_edges) const {
- TRACE("Pushing edge " << this->g().str(e));
- involved_edges.insert(e);
- EdgeId conj = this->g().conjugate(e);
- TRACE("Pushing edge " << this->g().str(conj));
- involved_edges.insert(conj);
- }
-
- void AccountEdges(const BulgeInfo& info, std::unordered_set<EdgeId>& involved_edges) const {
- AccountEdge(info.e, involved_edges);
- for (EdgeId e : info.alternative) {
- AccountEdge(e, involved_edges);
- }
- }
-
- //false if time to stop
- bool FillEdgeBuffer(vector<EdgeId>& buffer, pred::TypedPredicate<EdgeId> proceed_condition) {
- VERIFY(buffer.empty());
- DEBUG("Filling edge buffer of size " << buff_size_);
- perf_counter perf;
- double low_cov = 0.;
- double cov_diff = 0.;
- while (!it_.IsEnd() && buffer.size() < buff_size_) {
- EdgeId e = *it_;
- TRACE("Current edge " << this->g().str(e));
- if (!proceed_condition(e)) {
- TRACE("Stop condition was reached.");
- //need to release last element of the iterator to make it replaceable by new elements
- it_.ReleaseCurrent();
- return false;
- }
-
- double cov = this->g().coverage(e);
- if (buffer.empty()) {
- low_cov = cov;
- cov_diff = max(buff_cov_diff_, buff_cov_rel_diff_ * low_cov);
- } else {
- if (math::gr(cov, low_cov + cov_diff)) {
- //need to release last element of the iterator to make it replaceable by new elements
- it_.ReleaseCurrent();
- return true;
- }
- }
- TRACE("Potential bulge edge");
- buffer.push_back(e);
- ++it_;
- }
-
- DEBUG("Filled in " << perf.time() << " seconds");
- if (buffer.size() == buff_size_) {
- TRACE("Buffer filled");
- return true;
- } else {
- TRACE("No more edges in iterator");
- return false;
- }
- }
-
- std::vector<std::vector<BulgeInfo>> FindBulges(const std::vector<EdgeId> edge_buffer) const {
- DEBUG("Looking for bulges (in parallel). Edge buffer size " << edge_buffer.size());
- perf_counter perf;
- std::vector<std::vector<BulgeInfo>> bulge_buffers(omp_get_max_threads());
- size_t n = edge_buffer.size();
- //order is in agreement with coverage
- #pragma omp parallel for schedule(guided)
- for (size_t i = 0; i < n; ++i) {
- EdgeId e = edge_buffer[i];
- auto alternative = alternatives_analyzer_(e);
- if (!alternative.empty()) {
- bulge_buffers[omp_get_thread_num()].push_back(BulgeInfo(i, e, std::move(alternative)));
- }
- }
- DEBUG("Bulges found in " << perf.time() << " seconds");
- return bulge_buffers;
- }
-
- std::vector<BulgeInfo> MergeBuffers(std::vector<std::vector<BulgeInfo>>&& buffers) const {
- DEBUG("Merging bulge buffers");
- perf_counter perf;
-
- std::vector<BulgeInfo> merged_bulges;
- for (auto& bulge_buffer : buffers) {
- std::copy(std::make_move_iterator(bulge_buffer.begin()),
- std::make_move_iterator(bulge_buffer.end()),
- std::back_inserter(merged_bulges));
- }
-
- DEBUG("Sorting");
- //order is in agreement with coverage
- std::sort(merged_bulges.begin(), merged_bulges.end());
- DEBUG("Total bulges " << merged_bulges.size());
- DEBUG("Buffers merged in " << perf.time() << " seconds");
- return merged_bulges;
- }
-
- SmartEdgeSet RetainIndependentBulges(std::vector<BulgeInfo>& bulges) const {
- DEBUG("Looking for independent bulges");
- size_t total_cnt = bulges.size();
- perf_counter perf;
-
- std::vector<BulgeInfo> filtered;
- filtered.reserve(bulges.size());
- //fixme switch to involved vertices to bring fully parallel glueing closer
- std::unordered_set<EdgeId> involved_edges;
- SmartEdgeSet interacting_edges(this->g(), false, CoverageComparator<Graph>(this->g()));
-
- for (BulgeInfo& info : bulges) {
- TRACE("Analyzing interactions of " << info.str(this->g()));
- if (CheckInteracting(info, involved_edges)) {
- TRACE("Interacting");
- interacting_edges.push(info.e);
- } else {
- TRACE("Independent");
- AccountEdges(info, involved_edges);
- filtered.push_back(std::move(info));
- }
- }
- bulges = std::move(filtered);
-
- DEBUG("Independent bulges identified in " << perf.time() << " seconds");
- DEBUG("Independent cnt " << bulges.size());
- DEBUG("Interacting cnt " << interacting_edges.size());
- VERIFY(bulges.size() + interacting_edges.size() == total_cnt);
-
- return interacting_edges;
- }
-
- bool ProcessBulges(const std::vector<BulgeInfo>& independent_bulges, SmartEdgeSet&& interacting_edges) {
- DEBUG("Processing bulges");
- perf_counter perf;
-
- bool triggered = false;
-
- for (const BulgeInfo& info : independent_bulges) {
- TRACE("Processing bulge " << info.str(this->g()));
- triggered = true;
- gluer_(info.e, info.alternative);
- }
-
- DEBUG("Independent bulges glued in " << perf.time() << " seconds");
- perf.reset();
-
- DEBUG("Processing remaining interacting bulges " << interacting_edges.size());
- //usual br strategy
- for (; !interacting_edges.IsEnd(); ++interacting_edges) {
- EdgeId e = *interacting_edges;
- TRACE("Processing edge " << this->g().str(e));
- std::vector<EdgeId> alternative = alternatives_analyzer_(e);
- if (!alternative.empty()) {
- gluer_(e, alternative);
- triggered = true;
- }
- }
- DEBUG("Interacting edges processed in " << perf.time() << " seconds");
- return triggered;
- }
-
-public:
-
- typedef std::function<void(EdgeId edge, const vector<EdgeId>& path)> BulgeCallbackF;
-
- ParallelBulgeRemover(Graph& g, const InterestingElementFinder& interesting_edge_finder,
- size_t buff_size, double buff_cov_diff,
- double buff_cov_rel_diff, const AlternativesAnalyzer<Graph>& alternatives_analyzer,
- BulgeCallbackF opt_callback = 0,
- std::function<void(EdgeId)> removal_handler = 0,
- bool track_changes = true) :
- PersistentAlgorithmBase<Graph>(g),
- buff_size_(buff_size),
- buff_cov_diff_(buff_cov_diff),
- buff_cov_rel_diff_(buff_cov_rel_diff),
- alternatives_analyzer_(alternatives_analyzer),
- gluer_(g, opt_callback, removal_handler),
- interesting_edge_finder_(interesting_edge_finder),
- tracking_(track_changes),
- curr_iteration_(0),
- it_(g, true, CoverageComparator<Graph>(g), true) {
- VERIFY(buff_size_ > 0);
- it_.Detach();
- }
-
- bool Run(bool force_primary_launch = false) override {
- bool primary_launch = force_primary_launch ? true : curr_iteration_ == 0;
- //todo remove if not needed;
- //potentially can vary coverage threshold in coordination with ec threshold
- auto proceed_condition = pred::AlwaysTrue<EdgeId>();
-
- if (!it_.IsAttached()) {
- it_.Attach();
- }
- if (primary_launch) {
- it_.clear();
- TRACE("Primary launch.");
- TRACE("Start search for interesting edges");
- interesting_edge_finder_.Run(it_);
- TRACE(it_.size() << " interesting edges to process");
- } else {
- VERIFY(tracking_);
- TRACE(it_.size() << " edges to process");
- }
-
- bool triggered = false;
- bool proceed = true;
- while (proceed) {
- std::vector<EdgeId> edge_buffer;
- edge_buffer.reserve(buff_size_);
- proceed = FillEdgeBuffer(edge_buffer, proceed_condition);
-
- std::vector<BulgeInfo> bulges = MergeBuffers(FindBulges(edge_buffer));
-
- auto interacting_edges = RetainIndependentBulges(bulges);
-
- bool inner_triggered = ProcessBulges(bulges, std::move(interacting_edges));
- proceed |= inner_triggered;
- triggered |= inner_triggered;
- }
-
- TRACE("Finished processing. Triggered = " << triggered);
- if (!tracking_)
- it_.Detach();
-
- curr_iteration_++;
-
- return triggered;
- }
-
-private:
- DECL_LOGGER("ParallelBulgeRemover")
-};
-
-}
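
The parallel remover in the file above works in rounds: FillEdgeBuffer collects a coverage-ordered batch of candidate edges, FindBulges searches for alternative paths in parallel, MergeBuffers restores coverage order, and RetainIndependentBulges keeps a candidate for parallel gluing only if none of its edges were already claimed by an earlier candidate, deferring the rest to a sequential pass. A stripped-down sketch of that partitioning step follows, with plain ints standing in for EdgeId and no conjugate-edge handling (illustrative only, not the upstream data structures):

    #include <iostream>
    #include <unordered_set>
    #include <vector>

    // Illustrative stand-in for BulgeInfo: an edge plus its alternative path.
    struct Bulge {
        int edge;
        std::vector<int> alternative;
    };

    int main() {
        // Candidates are assumed to be sorted by priority (coverage order upstream).
        std::vector<Bulge> candidates = {
            {1, {2, 3}}, {4, {5, 6}}, {7, {2, 8}} /* shares edge 2 with the first */
        };

        std::unordered_set<int> involved;   // edges claimed by accepted bulges
        std::vector<Bulge> independent;     // safe to glue in parallel
        std::vector<Bulge> interacting;     // must be re-examined sequentially

        for (const Bulge& b : candidates) {
            bool clashes = involved.count(b.edge) > 0;
            for (int e : b.alternative)
                clashes = clashes || involved.count(e) > 0;

            if (clashes) {
                interacting.push_back(b);
            } else {
                involved.insert(b.edge);
                involved.insert(b.alternative.begin(), b.alternative.end());
                independent.push_back(b);
            }
        }

        std::cout << "independent: " << independent.size()
                  << ", interacting: " << interacting.size() << "\n";   // prints 2, 1
    }

The third candidate reuses edge 2, so it falls into the interacting set and would be retried sequentially, just as ProcessBulges does with its SmartEdgeSet of deferred edges.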
diff --git a/src/include/omni/complex_bulge_remover.hpp b/src/include/omni/complex_bulge_remover.hpp
deleted file mode 100644
index 1704f73..0000000
--- a/src/include/omni/complex_bulge_remover.hpp
+++ /dev/null
@@ -1,1162 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include <cmath>
-#include <stack>
-#include <queue>
-#include "adt/concurrent_dsu.hpp"
-#include "standard_base.hpp"
-#include "omni_utils.hpp"
-#include "graph_component.hpp"
-#include "xmath.h"
-#include "sequence/sequence_tools.hpp"
-#include "path_processor.hpp"
-#include "omni/visualization/visualization.hpp"
-
-
-namespace omnigraph {
-
-namespace complex_br {
-
-template<class Graph>
-class LocalizedComponent: public GraphActionHandler<Graph> /*: public GraphComponent<Graph>*/{
- typedef GraphActionHandler<Graph> base;
- typedef typename Graph::VertexId VertexId;
- typedef typename Graph::EdgeId EdgeId;
-
- const Graph& g_;
- VertexId start_vertex_;
- set<VertexId> end_vertices_;
- //usage of inclusive-inclusive range!!!
- map<VertexId, Range> vertex_depth_;
- multimap<size_t, VertexId> height_2_vertices_;
- size_t diff_threshold_;
-
- bool AllEdgeOut(VertexId v) const {
- for (EdgeId e : g_.OutgoingEdges(v)) {
- if (contains(g_.EdgeEnd(e)))
- return false;
- }
- return true;
- }
-
- bool AllEdgeIn(VertexId v) const {
- for (EdgeId e : g_.OutgoingEdges(v)) {
- if (!contains(g_.EdgeEnd(e)))
- return false;
- }
- return true;
- }
-
- size_t Average(Range r) const {
- return r.start_pos;
- }
-
-public:
-
-// template <class It>
- LocalizedComponent(const Graph& g, //It begin, It end,
- VertexId start_vertex/*, const vector<VertexId>& end_vertices*/) :
- base(g, "br_component"), g_(g), start_vertex_(start_vertex) {
- end_vertices_.insert(start_vertex);
- vertex_depth_.insert(make_pair(start_vertex_, Range(0, 0)));
- height_2_vertices_.insert(make_pair(0, start_vertex));
- }
-
- const Graph& g() const {
- return g_;
- }
-
- bool IsEndVertex(VertexId v) const {
- for (EdgeId e : g_.OutgoingEdges(v)) {
- if (contains(g_.EdgeEnd(e)))
- return false;
- }
- return true;
- }
-
- void AddVertex(VertexId v, Range dist_range) {
-// VERIFY(CheckCloseNeighbour(v));
-// Range r = NeighbourDistanceRange(v);
- DEBUG("Adding vertex " << g_.str(v) << " to the component");
- vertex_depth_.insert(make_pair(v, dist_range));
- height_2_vertices_.insert(make_pair(Average(dist_range), v));
- DEBUG(
- "Range " << dist_range << " Average height " << Average(dist_range));
- for (EdgeId e : g_.IncomingEdges(v)) {
- end_vertices_.erase(g_.EdgeStart(e));
- }
- if (IsEndVertex(v)) {
- end_vertices_.insert(v);
- }
- }
-
- //todo what if path processor will fail inside
- size_t TotalPathCount() const {
- size_t answer = 0;
- for (VertexId end_v : end_vertices_) {
- PathStorageCallback<Graph> path_storage(g_);
- Range r = vertex_depth_.find(end_v)->second;
- ProcessPaths(g_, r.start_pos, r.end_pos, start_vertex_, end_v, path_storage);
- answer += path_storage.size();
- }
- return answer;
- }
-
- bool CheckCompleteness() const {
- for (VertexId v : key_set(vertex_depth_)) {
- if (v == start_vertex_)
- continue;
- if (!AllEdgeIn(v) && !AllEdgeOut(v))
- return false;
- }
- return true;
- }
-
- bool NeedsProjection() const {
- DEBUG("Checking if component needs projection");
- size_t tot_path_count = TotalPathCount();
- bool answer = tot_path_count > end_vertices_.size();
-// more robust to path processor failure this way VERIFY(tot_path_count >= end_vertices_.size());
- if (answer) {
- DEBUG("Needs projection");
- } else {
- DEBUG("Doesn't need projection");
- }
- return answer;
- }
-
- bool contains(VertexId v) const {
- return vertex_depth_.count(v) > 0;
- }
-
- bool contains(EdgeId e) const {
- return contains(g_.EdgeStart(e)) && contains(g_.EdgeEnd(e));
- }
-
- Range distance_range(VertexId v) const {
- VERIFY(contains(v));
- return vertex_depth_.find(v)->second;
- }
-
- size_t avg_distance(VertexId v) const {
- VERIFY(contains(v));
- return Average(vertex_depth_.find(v)->second);
- }
-
- set<size_t> avg_distances() const {
- set<size_t> distances;
- for (VertexId v : key_set(vertex_depth_)) {
- distances.insert(avg_distance(v));
- }
- return distances;
- }
-
- VertexId start_vertex() const {
- return start_vertex_;
- }
-
- const set<VertexId>& end_vertices() const {
- return end_vertices_;
- }
-
- bool CheckCloseNeighbour(VertexId v) const {
- DEBUG("Check if vertex " << g_.str(v) << " can be processed");
- for (EdgeId e : g_.IncomingEdges(v)) {
- if (!contains(g_.EdgeStart(e))) {
- DEBUG(
- "Blocked by unprocessed or external vertex " << g_.int_id(g_.EdgeStart(e)) << " that starts edge " << g_.int_id(e));
- DEBUG("Check fail");
- return false;
- }
- }
- DEBUG("Check ok");
- return true;
- }
-
- GraphComponent<Graph> AsGraphComponent() const {
- set<VertexId> vertices = key_set(vertex_depth_);
- return GraphComponent<Graph>(g_, vertices.begin(), vertices.end());
- }
-
- bool ContainsConjugateVertices() const {
- set<VertexId> conjugate_vertices;
- for (VertexId v : key_set(vertex_depth_)) {
- if (conjugate_vertices.count(v) == 0) {
- conjugate_vertices.insert(g_.conjugate(v));
- } else {
- return true;
- }
- }
- return false;
- }
-
- virtual void HandleDelete(VertexId v) {
- VERIFY(end_vertices_.count(v) == 0);
- if (contains(v)) {
- DEBUG("Deleting vertex " << g_.str(v) << " from the component");
- size_t depth = avg_distance(v);
- vertex_depth_.erase(v);
- for (auto it = height_2_vertices_.lower_bound(depth);
- it != height_2_vertices_.upper_bound(depth); ++it) {
- if (it->second == v) {
- height_2_vertices_.erase(it);
- return;
- }
- }
- VERIFY(false);
- }
-
- }
-
- virtual void HandleDelete(EdgeId /*e*/) {
- //empty for now
- }
-
- virtual void HandleMerge(const vector<EdgeId>& /*old_edges*/, EdgeId /*new_edge*/) {
- VERIFY(false);
- }
-
- virtual void HandleGlue(EdgeId /*new_edge*/, EdgeId /*edge1*/, EdgeId /*edge2*/) {
- //empty for now
- }
-
- virtual void HandleSplit(EdgeId old_edge, EdgeId new_edge_1, EdgeId /*new_edge_2*/) {
- VERIFY(old_edge != g_.conjugate(old_edge));
- VertexId start = g_.EdgeStart(old_edge);
- VertexId end = g_.EdgeEnd(old_edge);
- if (contains(start)) {
- VERIFY(vertex_depth_.count(end) > 0);
- VERIFY(avg_distance(end) > avg_distance(start));
- VertexId new_vertex = g_.EdgeEnd(new_edge_1);
- Range new_vertex_depth(distance_range(start));
- new_vertex_depth.shift((int) g_.length(new_edge_1));
- //todo do better later (needs to be synched with splitting strategy)
-// + (vertex_depth_[end] - vertex_depth_[start])
-// * g_.length(new_edge_1) / g_.length(old_edge);
- DEBUG(
- "Inserting vertex " << g_.str(new_vertex) << " to component during split");
- vertex_depth_.insert(make_pair(new_vertex, new_vertex_depth));
- height_2_vertices_.insert(
- make_pair(Average(new_vertex_depth), new_vertex));
- }
- }
-
- const multimap<size_t, VertexId>& height_2_vertices() const {
- return height_2_vertices_;
- }
-
- const set<VertexId> vertices_on_height(size_t height) const {
- set<VertexId> answer;
- for (auto it = height_2_vertices_.lower_bound(height);
- it != height_2_vertices_.upper_bound(height); ++it) {
- answer.insert(it->second);
- }
- return answer;
- }
-
-private:
- DECL_LOGGER("LocalizedComponent")
- ;
-};
-
-template<class Graph>
-class SkeletonTree: public GraphActionHandler<Graph> {
- typedef GraphActionHandler<Graph> base;
- typedef typename Graph::VertexId VertexId;
- typedef typename Graph::EdgeId EdgeId;
-
-public:
-
- const set<EdgeId>& edges() const {
- return edges_;
- }
-
- const set<VertexId>& vertices() const {
- return vertices_;
- }
-
- bool Contains(EdgeId e) const {
-// VertexId start = br_comp_.g().EdgeStart(e);
-// if (next_edges_.count(start) > 0) {
-// const vector<EdgeId> edges = next_edges_.find(start)->second;
-// return find(e, next_edges_.lower_bound(start), next_edges_.upper_bound(start)) != edges.end();
-// }
-// return false;
- return edges_.count(e) > 0;
- }
-
- bool Contains(VertexId v) const {
-// return next_edges_.count(v) > 0;
- return vertices_.count(v) > 0;
- }
-
- virtual void HandleDelete(VertexId v) {
- //verify v not in the tree
- VERIFY(!Contains(v));
- }
-
- virtual void HandleDelete(EdgeId e) {
- //verify e not in the tree
- DEBUG("Trying to delete " << br_comp_.g().str(e));
- VERIFY(!Contains(e));
- }
-
- virtual void HandleMerge(const vector<EdgeId>& old_edges, EdgeId /*new_edge*/) {
- //verify false
- for (EdgeId e : old_edges) {
- VERIFY(!Contains(e));
- }
- }
-
- virtual void HandleGlue(EdgeId new_edge, EdgeId edge1, EdgeId edge2) {
-// verify edge2 in tree
-// put new_edge instead of edge2
- DEBUG("Glueing " << br_comp_.g().str(new_edge) << " " << br_comp_.g().str(edge1) << " " << br_comp_.g().str(edge2));
- if (Contains(edge2)) {
- DEBUG("Erasing from tree: " << br_comp_.g().str(edge2));
- DEBUG("Inserting to tree: " << br_comp_.g().str(new_edge));
- edges_.erase(edge2);
- edges_.insert(new_edge);
- }
- }
-
- virtual void HandleSplit(EdgeId old_edge, EdgeId new_edge_1,
- EdgeId new_edge_2) {
- VERIFY(old_edge != br_comp_.g().conjugate(old_edge));
- if (Contains(old_edge)) {
- edges_.erase(old_edge);
- vertices_.insert(br_comp_.g().EdgeEnd(new_edge_1));
- edges_.insert(new_edge_1);
- edges_.insert(new_edge_2);
- }
- }
-
- SkeletonTree(const LocalizedComponent<Graph>& br_comp,
- const set<EdgeId>& edges) :
- base(br_comp.g(), "br_tree"), br_comp_(br_comp), edges_(edges) {
- DEBUG("Tree edges " << br_comp.g().str(edges));
- for (EdgeId e : edges_) {
- vertices_.insert(br_comp_.g().EdgeStart(e));
- vertices_.insert(br_comp_.g().EdgeEnd(e));
- }
- }
-
-private:
- const LocalizedComponent<Graph>& br_comp_;
- set<EdgeId> edges_;
- set<VertexId> vertices_;
-
-private:
- DECL_LOGGER("SkeletonTree")
- ;
-};
-
-typedef size_t mask;
-typedef mask mixed_color_t;
-typedef unsigned primitive_color_t;
-
-template<class Graph>
-class ComponentColoring: public GraphActionHandler<Graph> {
- typedef GraphActionHandler<Graph> base;
- typedef typename Graph::VertexId VertexId;
- typedef typename Graph::EdgeId EdgeId;
-
-public:
-
- size_t CountPrimitiveColors(mixed_color_t color) const {
- size_t cnt = 0;
- for (size_t shift = 0; shift < color_cnt_; ++shift) {
- mixed_color_t prim_color = 1 << shift;
- if ((prim_color & color) != 0) {
- cnt++;
- }
- }
- VERIFY(cnt > 0);
- return cnt;
- }
-
- primitive_color_t GetAnyPrimitiveColor(mixed_color_t color) const {
- for (size_t shift = 0; shift < color_cnt_; ++shift) {
- if ((1 << shift & color) != 0) {
- return primitive_color_t(shift);
- }
- }
- VERIFY(false);
- return 0;
- }
-
- bool IsSubset(mixed_color_t super_set, mixed_color_t sub_set) const {
- return (super_set | sub_set) == super_set;
- }
-
-private:
-
- const LocalizedComponent<Graph>& comp_;
- const size_t color_cnt_;
- map<VertexId, mixed_color_t> vertex_colors_;
-
- mixed_color_t CountVertexColor(VertexId v) const {
- mixed_color_t answer = mixed_color_t(0);
- for (EdgeId e : comp_.g().OutgoingEdges(v)) {
- answer |= color(e);
- }
- return answer;
- }
-
- void CountAndSetVertexColor(VertexId v) {
- vertex_colors_.insert(make_pair(v, CountVertexColor(v)));
- }
-
- void ColorComponent() {
- DEBUG("Coloring component");
- size_t cnt = 0;
- for (VertexId v : comp_.end_vertices()) {
- mixed_color_t color = 1 << cnt;
- DEBUG("Coloring exit " << comp_.g().str(v));
- vertex_colors_.insert(make_pair(v, color));
- cnt++;
- }
- for (auto it = comp_.height_2_vertices().rbegin();
- it != comp_.height_2_vertices().rend(); ++it) {
- if (vertex_colors_.count(it->second) == 0) {
- DEBUG("Coloring vertex " << comp_.g().str(it->second));
- CountAndSetVertexColor(it->second);
- }
- }
- DEBUG("Component colored");
- }
-
-public:
-
- ComponentColoring(const LocalizedComponent<Graph>& comp) :
- base(comp.g(), "br_comp_coloring"), comp_(comp), color_cnt_(
- comp_.end_vertices().size()) {
- VERIFY(comp.end_vertices().size() <= sizeof(size_t) * 8);
- ColorComponent();
- }
-
- mixed_color_t color(VertexId v) const {
- auto it = vertex_colors_.find(v);
- if (it == vertex_colors_.end()) {
- DEBUG("No color for vertex " << comp_.g().str(v));
- DEBUG(
- "Incoming edges " << comp_.g().str(comp_.g().IncomingEdges(v)));
- DEBUG(
- "Outgoing edges " << comp_.g().str(comp_.g().OutgoingEdges(v)));
- }
- VERIFY(it != vertex_colors_.end());
- return it->second;
- }
-
- mixed_color_t color(EdgeId e) const {
- return color(comp_.g().EdgeEnd(e));
- }
-
- virtual void HandleDelete(VertexId v) {
- vertex_colors_.erase(v);
- }
-
- virtual void HandleMerge(const vector<EdgeId>& /*old_edges*/, EdgeId /*new_edge*/) {
- VERIFY(false);
- }
-
- virtual void HandleGlue(EdgeId /*new_edge*/, EdgeId edge1, EdgeId edge2) {
- if (comp_.contains(edge1)) {
- VERIFY(comp_.contains(edge2));
- VERIFY(IsSubset(color(edge2), color(edge1)));
- }
- }
-
- virtual void HandleSplit(EdgeId old_edge, EdgeId new_edge_1,
- EdgeId /*new_edge_2*/) {
- VERIFY(old_edge != comp_.g().conjugate(old_edge));
- if (comp_.contains(old_edge)) {
- CountAndSetVertexColor(comp_.g().EdgeEnd(new_edge_1));
- }
- }
-
-private:
- DECL_LOGGER("ComponentColoring")
- ;
-};
-
-template<class Graph>
-class SkeletonTreeFinder {
-
- typedef typename Graph::EdgeId EdgeId;
- typedef typename Graph::VertexId VertexId;
- typedef ConcurrentDSU color_partition_ds_t;
-
- const LocalizedComponent<Graph>& component_;
- const ComponentColoring<Graph>& coloring_;
-
- vector<size_t> level_heights_;
-
- int current_level_;
- color_partition_ds_t current_color_partition_;
-
- set<VertexId> good_vertices_;
- set<EdgeId> good_edges_;
- map<VertexId, vector<EdgeId>> next_edges_;
- map<VertexId, size_t> subtree_coverage_;
-
- bool ConsistentWithPartition(mixed_color_t color) const {
- return current_color_partition_.set_size(
- GetCorrespondingDisjointSet(color))
- == coloring_.CountPrimitiveColors(color);
- }
-
- bool IsGoodEdge(EdgeId e) const {
-// VertexId start = component_.g().EdgeStart(e);
- VertexId end = component_.g().EdgeEnd(e);
- //check if end is good
- if (good_vertices_.count(end) == 0)
- return false;
-
-// is subcase of next case
-// //check if end is from previous level
-// if (component_.avg_distance(end) == level_heights_[current_level_+1])
-// return true;
-
- //check if end color is consistent with partition
- //on level before the start
- return ConsistentWithPartition(coloring_.color(end));
- }
-
- vector<EdgeId> GoodOutgoingEdges(VertexId v) const {
- vector<EdgeId> answer;
- for (EdgeId e : component_.g().OutgoingEdges(v)) {
- if (IsGoodEdge(e)) {
- DEBUG("Edge " << component_.g().str(e) << " is classified as good");
- answer.push_back(e);
- } else {
- DEBUG("Edge " << component_.g().str(e) << " is classified as NOT good");
- }
- }
- return answer;
- }
-
- vector<EdgeId> GoodOutgoingEdges(const vector<VertexId>& vertices) const {
- vector<EdgeId> answer;
- for (VertexId v : vertices) {
- if (component_.end_vertices().count(v) == 0) {
- push_back_all(answer, GoodOutgoingEdges(v));
- }
- }
- return answer;
- }
-
- set<EdgeId> VectorAsSet(const vector<EdgeId>& edges) const {
- return set<EdgeId>(edges.begin(), edges.end());
- }
-
- template<class T>
- vector<T> SetAsVector(const set<T>& edges) const {
- return vector<T>(edges.begin(), edges.end());
- }
-
- primitive_color_t GetCorrespondingDisjointSet(mixed_color_t color) const {
- return (primitive_color_t) current_color_partition_.find_set(
- coloring_.GetAnyPrimitiveColor(color));
- }
-
- void UpdateColorPartitionWithVertex(VertexId v) {
- VERIFY(component_.g().OutgoingEdgeCount(v) > 0);
- primitive_color_t ds = GetCorrespondingDisjointSet(
- coloring_.color(*(component_.g().OutgoingEdges(v).begin())));
- for (EdgeId e : component_.g().OutgoingEdges(v)) {
- current_color_partition_.unite(ds,
- GetCorrespondingDisjointSet(coloring_.color(e)));
- }
- }
-
- bool IsGoodVertex(VertexId v) const {
- if (!ConsistentWithPartition(coloring_.color(v)))
- return false;
- mixed_color_t union_color_of_good_children = mixed_color_t(0);
- for (EdgeId e : component_.g().OutgoingEdges(v)) {
- if (good_edges_.count(e) > 0) {
- union_color_of_good_children |= coloring_.color(e);
- }
- }
- return coloring_.color(v) == union_color_of_good_children;
- }
-
- void Init() {
- current_level_ = (int) level_heights_.size() - 1;
- size_t end_cnt = 0;
- for (VertexId v : component_.end_vertices()) {
- good_vertices_.insert(v);
- subtree_coverage_[v] = 0;
- end_cnt++;
- }
- }
-
- size_t absolute_coverage(EdgeId e) {
- return (size_t) (component_.g().coverage(e) * (double) component_.g().length(e));
- }
-
- void UpdateNextEdgesAndCoverage(VertexId v) {
- map<mixed_color_t, size_t> best_subtrees_coverage;
- map<mixed_color_t, EdgeId> best_alternatives;
- for (EdgeId e : component_.g().OutgoingEdges(v)) {
- if (good_edges_.count(e) > 0) {
- VertexId end = component_.g().EdgeEnd(e);
- mixed_color_t color = coloring_.color(e);
- VERIFY(subtree_coverage_.count(end) > 0);
- if (subtree_coverage_[end] + absolute_coverage(e)
- >= best_subtrees_coverage[color]) {
- best_subtrees_coverage[color] = subtree_coverage_[end]
- + absolute_coverage(e);
- best_alternatives[color] = e;
- }
- }
- }
- size_t coverage = 0;
- for (size_t cov : value_set(best_subtrees_coverage)) {
- coverage += cov;
- }
- next_edges_[v] = SetAsVector<EdgeId>(value_set(best_alternatives));
- subtree_coverage_[v] = coverage;
- }
-
-public:
- SkeletonTreeFinder(const LocalizedComponent<Graph>& component,
- const ComponentColoring<Graph>& coloring) :
- component_(component),
- coloring_(coloring),
- level_heights_(SetAsVector<size_t>(component_.avg_distances())),
- current_level_((int) level_heights_.size() - 1),
- current_color_partition_(component_.end_vertices().size()) {
-
- Init();
- }
-
- const set<EdgeId> GetTreeEdges() const {
- set<EdgeId> answer;
- std::queue<VertexId> vertex_queue;
- vertex_queue.push(component_.start_vertex());
- while (!vertex_queue.empty()) {
- VertexId v = vertex_queue.front();
- vertex_queue.pop();
- if (next_edges_.count(v) == 0)
- continue;
- for (EdgeId e : next_edges_.find(v)->second) {
- answer.insert(e);
- vertex_queue.push(component_.g().EdgeEnd(e));
- }
- }
- return answer;
- }
-
- const map<VertexId, vector<EdgeId>>& GetTree() const {
- return next_edges_;
- }
-
- bool FindTree() {
- DEBUG("Looking for tree");
- while (current_level_ >= 0) {
- size_t height = level_heights_[current_level_];
- DEBUG("Processing level " << current_level_ << " on height " << height);
- set<VertexId> level_vertices = component_.vertices_on_height(
- height);
- VERIFY(!level_vertices.empty());
-
- //looking for good edges
- insert_all(good_edges_,
- GoodOutgoingEdges(
- vector<VertexId>(level_vertices.begin(),
- level_vertices.end())));
-
-
-
- //counting colors and color partitions
- for (VertexId v : level_vertices) {
- if (component_.end_vertices().count(v) == 0) {
- UpdateColorPartitionWithVertex(v);
- if (IsGoodVertex(v)) {
- DEBUG("Vertex " << component_.g().str(v) << " is classified as good");
- good_vertices_.insert(v);
- UpdateNextEdgesAndCoverage(v);
- } else {
- DEBUG("Vertex " << component_.g().str(v) << " is classified as NOT good");
- }
- }
- }
- current_level_--;
- }
- if (good_vertices_.count(component_.start_vertex()) > 0) {
- DEBUG("Looking for tree was successful");
- return true;
- } else {
- DEBUG("Looking for tree failed");
- return false;
- }
- }
-
-private:
- DECL_LOGGER("SkeletonTreeFinder")
- ;
-};
-
-template<class Graph>
-void PrintComponent(const LocalizedComponent<Graph>& component,
- const SkeletonTree<Graph>& tree, const string& file_name) {
- typedef typename Graph::EdgeId EdgeId;
- const set<EdgeId> tree_edges = tree.edges();
- shared_ptr<omnigraph::visualization::ElementColorer<typename Graph::EdgeId>> edge_colorer = make_shared<omnigraph::visualization::MapColorer<EdgeId>>(
- tree_edges.begin(), tree_edges.end(),"green", ""
- );
- visualization::WriteComponentSinksSources(component.AsGraphComponent(), file_name,
- omnigraph::visualization::DefaultColorer(component.g(), edge_colorer),
- *StrGraphLabelerInstance(component.g()));
-}
-
-template<class Graph>
-void PrintComponent(const LocalizedComponent<Graph>& component,
- const string& file_name) {
- visualization::WriteComponent(component.AsGraphComponent(), file_name,
- omnigraph::visualization::DefaultColorer(component.g()),
- *StrGraphLabelerInstance(component.g()));
-}
-
-
-
-template<class Graph>
-class ComponentProjector {
- typedef typename Graph::EdgeId EdgeId;
- typedef typename Graph::VertexId VertexId;
-
- Graph& g_;
- const LocalizedComponent<Graph>& component_;
- const ComponentColoring<Graph>& coloring_;
- const SkeletonTree<Graph>& tree_;
-
-// DEBUG("Result: edges " << g_.str(split_res.first) << " " << g_.str(split_res.second));
-// DEBUG("New vertex" << g_.str(inner_v) << " ");
-
- bool SplitComponent() {
- DEBUG("Splitting component");
- set<size_t> level_heights(component_.avg_distances());
- DEBUG("Level heights " << ToString<size_t>(level_heights));
-
- GraphComponent<Graph> gc = component_.AsGraphComponent();
-
- for (auto it = gc.e_begin(); it != gc.e_end(); ++it) {
- VertexId start_v = g_.EdgeStart(*it);
- VertexId end_v = g_.EdgeEnd(*it);
- size_t start_dist = component_.avg_distance(start_v);
- size_t end_dist = component_.avg_distance(end_v);
- DEBUG(
- "Processing edge " << g_.str(*it) << " avg_start " << start_dist << " avg_end " << end_dist);
- set<size_t> dist_to_split(level_heights.lower_bound(start_dist),
- level_heights.upper_bound(end_dist));
- DEBUG("Distances to split " << ToString<size_t>(dist_to_split));
-
- size_t offset = start_dist;
- EdgeId e = *it;
- for (auto split_it = dist_to_split.begin();
- split_it != dist_to_split.end(); ++split_it) {
- size_t curr = *split_it;
- if (curr == start_dist || curr == end_dist)
- continue;
- DEBUG("Splitting on " << curr);
- size_t pos = curr - offset;
- if(pos >= g_.length(e)) {
- return false;
- }
- DEBUG("Splitting edge " << g_.str(e) << " on position " << pos);
- pair<EdgeId, EdgeId> split_res = g_.SplitEdge(e, pos);
- //checks accordance
- VertexId inner_v = g_.EdgeEnd(split_res.first);
- VERIFY(component_.avg_distance(inner_v) == curr);
- e = split_res.second;
- offset = curr;
- }
- }
- DEBUG("Component split");
- return true;
- }
-
- EdgeId CorrespondingTreeEdge(EdgeId e) const {
- DEBUG("Getting height of vertex " << g_.str(g_.EdgeStart(e)));
- size_t start_height = component_.avg_distance(g_.EdgeStart(e));
- DEBUG("Done");
- mixed_color_t color = coloring_.color(e);
- DEBUG("Getting height of vertex " << g_.str(g_.EdgeEnd(e)));
- size_t end_height = component_.avg_distance(g_.EdgeEnd(e));
- DEBUG("Done");
- for (VertexId v : component_.vertices_on_height(start_height)) {
- if (component_.end_vertices().count(v) == 0) {
- for (EdgeId e : g_.OutgoingEdges(v)) {
- VERIFY(
- component_.avg_distance(g_.EdgeEnd(e)) == end_height);
- if (tree_.Contains(e)
- && coloring_.IsSubset(coloring_.color(e), color)) {
- return e;
- }
- }
- }
- }
- VERIFY(false);
- return EdgeId(NULL);
- }
-
-public:
-
- bool ProjectComponent() {
- if(!SplitComponent()) {
- DEBUG("Component can't be split");
- return false;
- }
-
- DEBUG("Projecting split component");
- GraphComponent<Graph> gc = component_.AsGraphComponent();
-
- for (auto it = SmartSetIterator<Graph, EdgeId>(g_, gc.e_begin(),
- gc.e_end()); !it.IsEnd(); ++it) {
- DEBUG("Trying to project edge " << g_.str(*it));
- EdgeId target = CorrespondingTreeEdge(*it);
- DEBUG("Target found " << g_.str(target));
- if (target != *it) {
- DEBUG(
- "Glueing " << g_.str(*it) << " to target " << g_.str(target));
- g_.GlueEdges(*it, target);
- DEBUG("Glued");
- }
- DEBUG("Edge processed");
- }
- DEBUG("Component projected");
- return true;
- }
-
- ComponentProjector(Graph& g, const LocalizedComponent<Graph>& component,
- const ComponentColoring<Graph>& coloring,
- const SkeletonTree<Graph>& tree) :
- g_(g), component_(component), coloring_(coloring), tree_(tree) {
-
- }
-
-private:
- DECL_LOGGER("ComponentProjector")
- ;
-};
-
-template<class Graph>
-class LocalizedComponentFinder {
- typedef typename Graph::VertexId VertexId;
- typedef typename Graph::EdgeId EdgeId;
-
- static const size_t exit_bound = 32;
- static const size_t inf = -1ul;
-
- Graph& g_;
- size_t max_length_;
- size_t length_diff_threshold_;
-
- LocalizedComponent<Graph> comp_;
-
- map<VertexId, Range> dominated_;
- set<VertexId> interfering_;
-
- std::string ToString(EdgeId e) const {
- std::stringstream ss;
- ss << g_.str(e)
- << " start: "
- << g_.str(g_.EdgeStart(e))
- << " end: "
- << g_.str(g_.EdgeEnd(e));
- return ss.str();
- }
-
- bool CheckCompleteness() const {
- if (interfering_.size() == 0) {
- VERIFY(comp_.CheckCompleteness());
- return true;
- }
- return false;
- }
-
- //false if new interfering vertex is not dominated
- //can be slightly modified in new algorithm
- bool ProcessLocality(VertexId processing_v) {
- vector<VertexId> processed_neighb;
- vector<VertexId> unprocessed_neighb;
- for (EdgeId e : g_.OutgoingEdges(processing_v)) {
- VertexId v = g_.EdgeEnd(e);
- if (!comp_.contains(v)) {
- unprocessed_neighb.push_back(v);
- } else {
- processed_neighb.push_back(v);
- }
- }
- if (!processed_neighb.empty()) {
- for (VertexId v : unprocessed_neighb) {
- if (dominated_.count(v) > 0) {
- interfering_.insert(v);
- } else {
- return false;
- }
- }
- }
- return true;
- }
-
- bool AddVertexWithBackwardPaths(VertexId v) {
- DEBUG("Adding vertex with backward paths");
- std::queue<VertexId> q;
- q.push(v);
- while (!q.empty()) {
- VertexId next_v = q.front();
- q.pop();
- if (!ProcessLocality(next_v)) {
- return false;
- }
- if (!comp_.contains(next_v)) {
- VERIFY(dominated_.count(v) > 0);
- comp_.AddVertex(next_v, dominated_.find(next_v)->second);
- for (EdgeId e : g_.IncomingEdges(next_v)) {
- q.push(g_.EdgeStart(e));
- }
- }
- }
- return true;
- }
-
- boost::optional<VertexId> ClosestNeigbour() const {
- size_t min_dist = inf;
- boost::optional<VertexId> answer = boost::none;
- for (auto it = dominated_.begin(); it != dominated_.end(); ++it) {
- if (!comp_.contains(it->first) && it->second.start_pos < min_dist) {
- min_dist = it->second.start_pos;
- answer = boost::optional<VertexId>(it->first);
- }
- }
- return answer;
- }
-
- bool ProcessInterferingVertex(VertexId v) {
- interfering_.erase(v);
- return AddVertexWithBackwardPaths(v);
- }
-
- bool CheckPathLengths() const {
- VERIFY(CheckCompleteness());
- for (VertexId v : comp_.end_vertices()) {
- if (comp_.distance_range(v).size() > length_diff_threshold_)
- return false;
- }
- return true;
- }
-
- bool CheckPositiveHeightDiff() const {
- DEBUG("Checking for positive height diff of each edge");
- GraphComponent<Graph> gc = comp_.AsGraphComponent();
- for (auto it = gc.e_begin(); it != gc.e_end(); ++it) {
- size_t start_height = comp_.avg_distance(g_.EdgeStart(*it));
- size_t end_height = comp_.avg_distance(g_.EdgeEnd(*it));
- //VERIFY(end_height >= start_height);
- if (end_height <= start_height) {
- DEBUG("Check failed for edge " << g_.str(*it) << " start_height " << start_height << " end_height " << end_height);
- return false;
- }
- }
- return true;
- }
-
- bool CloseComponent() {
- while (!interfering_.empty()) {
- VertexId v = *interfering_.begin();
- DEBUG("Processing interfering vertex " << g_.str(v));
- if (!ProcessInterferingVertex(v)) {
- DEBUG("Vertex processing failed");
- return false;
- }
- }
- return true;
- }
-
-public:
- LocalizedComponentFinder(Graph& g, size_t max_length,
- size_t length_diff_threshold, VertexId start_v) :
- g_(g), max_length_(max_length), length_diff_threshold_(
- length_diff_threshold), comp_(g, start_v) {
- DEBUG(
- "Component finder from vertex " << g_.str(comp_.start_vertex()) << " created");
- DominatedSetFinder<Graph> dominated_set_finder(g_, start_v, max_length);
- dominated_set_finder.FillDominated();
- dominated_ = dominated_set_finder.dominated();
-// ProcessStartVertex();
- }
-
- bool ProceedFurther() {
- DEBUG("Processing further");
-
- DEBUG("Choosing closest vertex");
- do {
- optional<VertexId> next_v = ClosestNeigbour();
-
- if (next_v) {
- DEBUG(
- "Vertex " << g_.str(*next_v) << " was chosen as closest neighbour");
- interfering_.insert(*next_v);
- DEBUG("Trying to construct closure");
- if (!CloseComponent()) {
- DEBUG("Failed to close component");
- return false;
- } else {
- DEBUG("Component closed");
- }
- } else {
- DEBUG("No more vertices can be added");
- return false;
- }
- } while (!comp_.NeedsProjection());
-
- if (!CheckPathLengths()) {
- DEBUG("Path lengths check failed");
- return false;
- }
- if (!CheckPositiveHeightDiff()) {
- DEBUG("Check for positive height diff of each edge failed");
- return false;
- }
- if (comp_.ContainsConjugateVertices()) {
- DEBUG("Found component contains conjugate vertices");
- return false;
- }
- if (comp_.end_vertices().size() > exit_bound) {
- DEBUG("Too many exits:" << comp_.end_vertices().size());
- return false;
- }
- GraphComponent<Graph> gc = comp_.AsGraphComponent();
- DEBUG("Found component candidate. Vertices: " << g_.str(gc.vertices()));
- return true;
- }
-
- const LocalizedComponent<Graph>& component() {
- return comp_;
- }
-
-private:
- DECL_LOGGER("LocalizedComponentFinder")
- ;
-};
-
-template<class Graph>
-class ComplexBulgeRemover {
- typedef typename Graph::VertexId VertexId;
- typedef typename Graph::EdgeId EdgeId;
-
- Graph& g_;
- size_t max_length_;
- size_t length_diff_;
-
- string pics_folder_;
-
- bool ProcessComponent(LocalizedComponent<Graph>& component,
- size_t candidate_cnt) {
- DEBUG("Processing component");
- ComponentColoring<Graph> coloring(component);
- SkeletonTreeFinder<Graph> tree_finder(component, coloring);
- DEBUG("Looking for a tree");
- if (tree_finder.FindTree()) {
- DEBUG("Tree found");
-
- SkeletonTree<Graph> tree(component, tree_finder.GetTreeEdges());
-
- if (!pics_folder_.empty()) {
- PrintComponent(component, tree,
- pics_folder_ + "success/"
- + ToString(g_.int_id(component.start_vertex()))
- + "_" + ToString(candidate_cnt) + ".dot");
- }
-
- ComponentProjector<Graph> projector(g_, component, coloring, tree);
- if(!projector.ProjectComponent()) {
- DEBUG("Component can't be projected");
- return false;
- }
- DEBUG(
- "Successfully processed component candidate " << candidate_cnt << " start_v " << g_.str(component.start_vertex()));
- return true;
- } else {
- DEBUG(
- "Failed to find skeleton tree for candidate " << candidate_cnt << " start_v " << g_.str(component.start_vertex()));
- if (!pics_folder_.empty()) {
- //todo check if we rewrite all of the previous pics!
- PrintComponent(component,
- pics_folder_ + "fail/"
- + ToString(g_.int_id(component.start_vertex())) //+ "_" + ToString(candidate_cnt)
- + ".dot");
- }
- return false;
- }
- }
-
-public:
- ComplexBulgeRemover(Graph& g, size_t max_length, size_t length_diff,
- const string& pics_folder = "") :
- g_(g), max_length_(max_length), length_diff_(length_diff), pics_folder_(
- pics_folder) {
- }
-
- bool Run() {
- size_t cnt = 0;
- DEBUG("Complex bulge remover started");
- if (!pics_folder_.empty()) {
-// remove_dir(pics_folder_);
- make_dir(pics_folder_);
- make_dir(pics_folder_ + "success/");
- make_dir(pics_folder_ + "fail/");
- }
- bool something_done_flag = false;
- for (auto it = g_.SmartVertexBegin(); !it.IsEnd(); ++it) {
- DEBUG("Processing vertex " << g_.str(*it));
- size_t candidate_cnt = 0;
- vector<VertexId> vertices_to_post_process;
- { //important scope!!!
- LocalizedComponentFinder<Graph> comp_finder(g_, max_length_,
- length_diff_, *it);
- while (comp_finder.ProceedFurther()) {
- candidate_cnt++;
- DEBUG(
- "Found component candidate " << candidate_cnt << " start_v " << g_.str(*it));
- LocalizedComponent<Graph> component =
- comp_finder.component();
- if (ProcessComponent(component, candidate_cnt)) {
- something_done_flag = true;
- cnt++;
- GraphComponent<Graph> gc = component.AsGraphComponent();
- vertices_to_post_process.insert(
- vertices_to_post_process.end(), gc.v_begin(),
- gc.v_end());
- break;
- }
- }
- }
- for (VertexId v : vertices_to_post_process) {
- it.HandleAdd(v);
- g_.CompressVertex(v);
- }
- }
- DEBUG("Complex bulge remover finished");
- DEBUG("Bulges processed " << cnt);
- return something_done_flag;
- }
-
-private:
- DECL_LOGGER("ComplexBulgeRemover")
- ;
-};
-
-}
-
-}
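For orientation, here is a minimal usage sketch of the ComplexBulgeRemover defined above. This is an illustrative assumption, not code from the SPAdes pipeline: the graph variable g and the two length parameters are placeholders, and the namespace qualification is omitted because the enclosing namespaces are not fully shown here.

// Hypothetical driver; relies only on the constructor and Run() shown above.
template<class Graph>
bool RemoveComplexBulges(Graph& g) {
    size_t max_length = 1000;   // assumed bound on path length inside a component
    size_t length_diff = 50;    // assumed tolerated difference between path lengths
    ComplexBulgeRemover<Graph> remover(g, max_length, length_diff /*, pics_folder */);
    return remover.Run();       // true if at least one component was projected
}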
diff --git a/src/include/omni/complex_tip_clipper.hpp b/src/include/omni/complex_tip_clipper.hpp
deleted file mode 100644
index 5b4f92a..0000000
--- a/src/include/omni/complex_tip_clipper.hpp
+++ /dev/null
@@ -1,116 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include <limits>
-
-#include "omni_utils.hpp"
-#include "omni/visualization/visualization.hpp"
-
-
-namespace omnigraph{
-
-
-template<class Graph>
-class ComplexTipClipper {
- typedef typename Graph::VertexId VertexId;
- typedef typename Graph::EdgeId EdgeId;
-
- Graph& g_;
- size_t max_length_;
- string pics_folder_;
- std::function<void(const set<EdgeId>&)> removal_handler_;
-    const size_t edge_length_threshold = 100;
-
-    bool CheckEdgeLengths(const GraphComponent<Graph>& component) const {
-        for(auto e : component.edges()) {
-            if(g_.length(e) > edge_length_threshold) {
- return false;
- }
- }
- return true;
- }
-
-
- bool CheckSize(const GraphComponent<Graph> & component) const {
- return (component.vertices().size() > 1);
- }
-
- void RemoveComplexTip(GraphComponent<Graph>& component) {
- ComponentRemover<Graph> remover(g_, removal_handler_);
- remover.DeleteComponent(component.edges().begin(), component.edges().end());
- }
-
-
- bool CheckPathLengths(const map<VertexId, Range>& ranges) const {
- for(auto r : ranges) {
- if(r.second.start_pos > max_length_) {
- return false;
- }
- }
- return true;
- }
-
-public:
- ComplexTipClipper(Graph& g, size_t max_length, const string& pics_folder = "", std::function<void(const set<EdgeId>&)> removal_handler = 0) :
- g_(g), max_length_(max_length), pics_folder_(pics_folder), removal_handler_(removal_handler)
- { }
-
- bool Run() {
- size_t cnt = 0;
- INFO("Complex tip clipper started");
- if (!pics_folder_.empty()) {
- make_dir(pics_folder_);
- }
-
- bool something_done_flag = false;
- for (auto it = g_.SmartVertexBegin(); !it.IsEnd(); ++it) {
- if(g_.IncomingEdgeCount(*it) != 0) {
- continue;
- }
- DEBUG("Processing vertex " << g_.str(*it));
-
- DominatedSetFinder<Graph> dom_finder(g_, *it, max_length_ * 2);
- dom_finder.FillDominated();
- auto component = dom_finder.AsGraphComponent();
-
-        if(!CheckEdgeLengths(component)) {
- DEBUG("Tip contains too long edges");
- continue;
- }
-
- if(!CheckSize(component)) {
- DEBUG("Component doesn't meet size requirements");
- continue;
- }
- auto dominated = dom_finder.dominated();
- if(!CheckPathLengths(dominated)) {
- DEBUG("Tip contains too long paths");
- continue;
- }
-
- if (!pics_folder_.empty()) {
- visualization::WriteComponentSinksSources(component,
- pics_folder_
- + ToString(g_.int_id(*it)) //+ "_" + ToString(candidate_cnt)
- + ".dot");
- }
-
- something_done_flag = true;
- cnt++;
- RemoveComplexTip(component);
- }
- CompressAllVertices(g_);
- INFO("Complex tip clipper finished");
- INFO("Tips processed " << cnt);
- return something_done_flag;
- }
-private:
- DECL_LOGGER("ComplexTipClipper")
-};
-
-}
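Analogously, a hedged sketch of how the ComplexTipClipper above is driven; the graph g, the length bound and the removal handler are placeholders invented for illustration:

// Hypothetical driver; uses only the constructor and Run() defined above.
template<class Graph>
bool ClipComplexTips(Graph& g) {
    typedef typename Graph::EdgeId EdgeId;
    auto handler = [](const std::set<EdgeId>& removed_edges) {
        // e.g. record the removed edges; intentionally left empty in this sketch
        (void) removed_edges;
    };
    omnigraph::ComplexTipClipper<Graph> clipper(g, /*max_length*/ 500, /*pics_folder*/ "", handler);
    return clipper.Run();
}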
diff --git a/src/include/omni/concurrent_algo/bulge_remover_factory.hpp b/src/include/omni/concurrent_algo/bulge_remover_factory.hpp
deleted file mode 100644
index f145149..0000000
--- a/src/include/omni/concurrent_algo/bulge_remover_factory.hpp
+++ /dev/null
@@ -1,100 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-/*
- * bulge_remover_factory.hpp
- *
- * Created on: Sep 15, 2012
- * Author: alex
- */
-
-#ifndef BULGE_REMOVER_FACTORY_HPP_
-#define BULGE_REMOVER_FACTORY_HPP_
-
-#include "omni/sequential_algorihtm_factory.hpp"
-#include "omni/concurrent_graph_component.hpp"
-#include "omni/sequential_algorihtm_factory.hpp"
-#include "omni/bulge_remover.hpp"
-#include "kmer_mapper_logger.hpp"
-
-namespace debruijn {
-
-
-template <class Graph>
-class BulgeRemoverFactory
- : public omnigraph::SequentialAlgorihtmFactory<omnigraph::ConcurrentGraphComponent<Graph>, typename Graph::EdgeId> {
-
- typedef omnigraph::ConcurrentGraphComponent<Graph> Component;
- typedef typename Component::EdgeId EdgeId;
- typedef SequentialAlgorihtmFactory<Component, EdgeId> Base;
- typedef typename Base::AlgorithmPtr AlgorithmPtr;
- typedef typename omnigraph::BulgeRemover<Component>::BulgeCallbackF BulgeCallbackF;
- typedef KmerMapperLogger<Graph> Logger;
-
-
-public:
-
- BulgeRemoverFactory(
- size_t max_length,
- double max_coverage,
- double max_relative_coverage,
- double max_delta,
- double max_relative_delta,
- BulgeCallbackF bulge_condition,
- BulgeCallbackF opt_callback = 0,
- std::function<void(EdgeId)> removal_handler = 0)
- : max_length_(max_length),
- max_coverage_(max_coverage),
- max_relative_coverage_(max_relative_coverage),
- max_delta_(max_delta),
- max_relative_delta_(max_relative_delta),
- bulge_condition_(bulge_condition),
- opt_callback_(opt_callback),
- removal_handler_(removal_handler) {
-
-        // TODO: KmerMapper handling here
- }
-
- virtual AlgorithmPtr CreateAlgorithm(Component& graph) {
- AlgorithmPtr ptr(
- new BulgeRemover<Component>(
- graph, max_length_, max_coverage_, max_relative_coverage_,
- max_delta_, max_relative_delta_, bulge_condition_,
- opt_callback_, removal_handler_));
-
- return ptr;
- }
-
- ~BulgeRemoverFactory() {
- while (!loggers_.empty()) {
- delete loggers_.back();
- loggers_.pop_back();
- }
- }
-
- const vector<Logger*>& loggers() const {
- return loggers_;
- }
-
-
-private:
-
- size_t max_length_;
- double max_coverage_;
- double max_relative_coverage_;
- double max_delta_;
- double max_relative_delta_;
- BulgeCallbackF bulge_condition_;
- BulgeCallbackF opt_callback_;
- std::function<void(EdgeId)> removal_handler_;
- vector<Logger*> loggers_;
-
-};
-
-} // namespace debruijn
-
-#endif /* BULGE_REMOVER_FACTORY_HPP_ */
diff --git a/src/include/omni/concurrent_algo/component_algorithm_runner.hpp b/src/include/omni/concurrent_algo/component_algorithm_runner.hpp
deleted file mode 100644
index eb17d9f..0000000
--- a/src/include/omni/concurrent_algo/component_algorithm_runner.hpp
+++ /dev/null
@@ -1,130 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-
-/*
- * component_edge_algorithm.hpp
- *
- * Created on: Sep 7, 2012
- * Author: Alexander Opeykin (alexander.opeykin at gmail.com)
- */
-
-
-#ifndef COMPONENT_ALGORITHM_RUNNER_HPP_
-#define COMPONENT_ALGORITHM_RUNNER_HPP_
-
-#include <memory>
-
-#include "concurrent_graph_component.hpp"
-#include "sequential_algorithm.hpp"
-
-namespace omnigraph {
-
-//Deprecated! Use SmartSetIterator instead!
-template<class Graph, typename ElementId, typename Comparator = std::less<
- ElementId> >
-class SmartSet: public GraphActionHandler<Graph> {
-public:
- typedef typename set<ElementId, Comparator>::iterator iterator;
- typedef typename set<ElementId, Comparator>::const_iterator const_iterator;
-private:
- set<ElementId, Comparator> inner_set_;
- const bool add_new_;
-
-public:
- SmartSet(const Graph &graph, Comparator comparator = Comparator(),
- bool add_new = true) :
- GraphActionHandler<Graph>(graph, "SmartSet"), inner_set_(
- comparator), add_new_(add_new) {
- }
-
- template<class Iter>
- SmartSet(Iter begin, Iter end, const Graph &graph, Comparator comparator =
- Comparator(), bool add_new = true) :
- GraphActionHandler<Graph>(graph, "SmartSet"), inner_set_(begin, end,
- comparator), add_new_(add_new) {
- }
-
- virtual ~SmartSet() {
- }
-
- virtual void HandleAdd(ElementId v) {
- if (add_new_)
- inner_set_.insert(v);
- }
-
- virtual void HandleDelete(ElementId v) {
- inner_set_.erase(v);
- }
-
- iterator begin() {
- return inner_set_.begin();
- }
-
- iterator end() {
- return inner_set_.end();
- }
-
- const_iterator begin() const {
- return inner_set_.begin();
- }
-
- const_iterator end() const {
- return inner_set_.end();
- }
-
- pair<iterator, bool> insert(const ElementId& elem) {
- return inner_set_.insert(elem);
- }
-
- const set<ElementId, Comparator> &inner_set() {
- return inner_set_;
- }
-};
-
-template <class Graph, class Argument>
-class ComponentAlgorithmRunner {
-
-public:
- typedef ConcurrentGraphComponent<Graph> Component;
- typedef std::shared_ptr<SequentialAlgorithm<Argument>> AlgorithmPtr;
-
-
- ComponentAlgorithmRunner(Component& component, AlgorithmPtr algorithm)
- : component_(component),
- algorithm_(algorithm),
- not_processed_arguments_(component, std::less<Argument>(), false) {
- }
-
- template <class JavaStyleIterator>
- void Run(JavaStyleIterator& it) {
- algorithm_->Preprocessing();
-
- for (; !it.IsEnd(); ++it) {
- if (!algorithm_->ProcessNext(*it)) {
- not_processed_arguments_.insert(*it);
- }
- }
-
- algorithm_->Postprocessing();
- }
-
- void GetNotProcessedArguments(vector<Argument>& arguments) {
- arguments.insert(arguments.end(), not_processed_arguments_.begin(), not_processed_arguments_.end());
- }
-
-private:
- Component& component_;
- AlgorithmPtr algorithm_;
- SmartSet<Component, Argument> not_processed_arguments_;
-};
-
-} // namespace omnigraph
-
-
-
-#endif /* COMPONENT_ALGORITHM_RUNNER_HPP_ */
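To make the runner's contract concrete, a hedged sketch of one worker's flow; the component and algorithm are assumed to come from the surrounding concurrent_algo machinery, and the helper itself is hypothetical:

// Hypothetical helper: run an algorithm over one component and collect the
// edges it could not process locally for a later pass over the whole graph.
template<class Graph>
std::vector<typename Graph::EdgeId> RunOnComponent(
        omnigraph::ConcurrentGraphComponent<Graph>& component,
        std::shared_ptr<omnigraph::SequentialAlgorithm<typename Graph::EdgeId>> algorithm) {
    typedef typename Graph::EdgeId EdgeId;
    omnigraph::ComponentAlgorithmRunner<Graph, EdgeId> runner(component, algorithm);
    auto it = component.SmartEdgeBegin();      // Java-style iterator, as Run() expects
    runner.Run(it);
    std::vector<EdgeId> leftovers;
    runner.GetNotProcessedArguments(leftovers);
    return leftovers;
}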
diff --git a/src/include/omni/concurrent_algo/concurrent_conjugate_graph_component.hpp b/src/include/omni/concurrent_algo/concurrent_conjugate_graph_component.hpp
deleted file mode 100644
index 96a040f..0000000
--- a/src/include/omni/concurrent_algo/concurrent_conjugate_graph_component.hpp
+++ /dev/null
@@ -1,121 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-
-/*
- * concurrent_conjugate_graph_component.hpp
- *
- * Created on: Aug 20, 2012
- * Author: Alexander Opeykin (alexander.opeykin at gmail.com)
- */
-
-
-#ifndef CONCURRENT_CONJUGATE_GRAPH_COMPONENT_HPP_
-#define CONCURRENT_CONJUGATE_GRAPH_COMPONENT_HPP_
-
-#include "concurrent_graph_component.hpp"
-#include "omni_utils.hpp"
-
-namespace omnigraph {
-
-template <typename Graph>
-class ConcurrentConjugateGraphComponent : public ConcurrentGraphComponent<Graph> {
-
-public:
- typedef ConcurrentGraphComponent<Graph> base;
- typedef typename Graph::VertexId VertexId;
- typedef typename Graph::EdgeId EdgeId;
- typedef typename Graph::VertexData VertexData;
-
- template<class InputVertexIterator>
- ConcurrentConjugateGraphComponent(
- Graph& graph,
- const restricted::PeriodicIdDistributor& id_distributor,
- InputVertexIterator verticesBegin,
- InputVertexIterator verticesEnd)
- : base(
- graph,
- new PairedHandlerApplier<ConcurrentConjugateGraphComponent>(*this),
- id_distributor,
- verticesBegin,
- verticesEnd) {
- }
-
- VertexId conjugate(VertexId vertex) const {
- return this->graph_.conjugate(vertex);
- }
-
- EdgeId conjugate(EdgeId edge) const {
- return this->graph_.conjugate(edge);
- }
-
- virtual bool IsInternalSafe(const VertexId& vertex) const {
- return this->IsInternal(vertex) && this->IsInternal(conjugate(vertex));
- }
-
- virtual bool IsInternalSafe(const EdgeId& edge) const {
- return
- IsInternalSafe(this->EdgeStart(edge)) &&
- IsInternalSafe(this->EdgeEnd(edge));
- }
-
- virtual bool IsInComponentSafe(const EdgeId& edge) const {
- return
- this->IsInComponent(edge) &&
- this->IsInComponent(this->graph_.conjugate(edge));
- }
-
- virtual ~ConcurrentConjugateGraphComponent() {
- }
-
-
-protected:
-
- virtual void AddVertexToComponent(VertexId vertex) {
- this->vertices_.insert(vertex);
- this->temporary_vertices_.insert(vertex);
-
- this->vertices_.insert(GetConjugateWithoutChecks(vertex));
- this->temporary_vertices_.insert(GetConjugateWithoutChecks(vertex));
- }
-
- virtual VertexId HiddenAddVertex(const VertexData &data) {
- VertexId vertex = this->CreateVertex(data);
- AddVertexToComponent(vertex);
- return vertex;
- }
-
- virtual void HiddenDeleteVertex(VertexId vertex) {
-// VERIFY(IsInternalSafe(vertex));
-
- VertexId conjugate_vertex = conjugate(vertex);
-
- this->vertices_.erase(vertex);
- this->vertices_.erase(conjugate_vertex);
-
- if (this->temporary_vertices_.find(vertex) != this->temporary_vertices_.end()) {
- this->temporary_vertices_.erase(vertex);
- this->temporary_vertices_.erase(conjugate_vertex);
-
- this->DestroyVertex(vertex); // conjugate will be deleted too
- } else {
- this->deleted_vertices_.push_back(vertex);
- }
- }
-
- VertexId GetConjugateWithoutChecks(VertexId vertex) const {
- return this->graph_.conjugate(vertex);
- }
-
- EdgeId GetConjugateWithoutChecks(EdgeId edge) const {
- return this->graph_.conjugate(edge);
- }
-};
-
-} //namespace omnigraph
-
-#endif /* CONCURRENT_CONJUGATE_GRAPH_COMPONENT_HPP_ */
diff --git a/src/include/omni/concurrent_algo/concurrent_edge_algorithm.hpp b/src/include/omni/concurrent_algo/concurrent_edge_algorithm.hpp
deleted file mode 100644
index 69721a5..0000000
--- a/src/include/omni/concurrent_algo/concurrent_edge_algorithm.hpp
+++ /dev/null
@@ -1,193 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-
-/*
- * concurrent_edge_algorithm.hpp
- *
- * Created on: Sep 7, 2012
- * Author: Alexander Opeykin (alexander.opeykin at gmail.com)
- */
-
-
-#ifndef CONCURRENT_EDGE_ALGORITHM_HPP_
-#define CONCURRENT_EDGE_ALGORITHM_HPP_
-
-#include "order_and_law.hpp"
-#include "devisible_tree.hpp"
-#include "omni_tools.hpp"
-#include "sequential_algorihtm_factory.hpp"
-#include "concurrent_graph_component.hpp"
-#include "concurrent_conjugate_graph_component.hpp"
-#include "conjugate_vertex_glued_graph.hpp"
-#include "component_algorithm_runner.hpp"
-#include "perfcounter.hpp"
-
-#include <memory>
-
-namespace omnigraph {
-
-template <class Graph>
-class ConcurrentEdgeAlgorithm {
-
-public:
-
- typedef typename Graph::VertexId VertexId;
- typedef typename Graph::EdgeId EdgeId;
-
- typedef ConjugateVertexGluedGraph<Graph> GluedVertexGraph;
- typedef ConcurrentConjugateGraphComponent<Graph> ConjugateComponent;
-
- typedef ConcurrentGraphComponent<Graph> Component;
- typedef std::shared_ptr<Component> ComponentPtr;
-
- typedef SequentialAlgorihtmFactory<Component, EdgeId> Factory;
- typedef std::shared_ptr<Factory> FactoryPtr;
-
- typedef ComponentAlgorithmRunner<Graph, EdgeId> Runner;
- typedef std::shared_ptr<Runner> RunnerPtr;
-
- ConcurrentEdgeAlgorithm(const size_t nthreads, Graph& graph, FactoryPtr factory)
- : nthreads_(nthreads), graph_(graph), factory_(factory) {
-
- TRACE("Run in " << nthreads_ << " threads")
-
- GluedVertexGraph glued_vertex_graph (graph);
- DevisibleTree<GluedVertexGraph> tree (glued_vertex_graph);
-
- const size_t component_size = tree.GetSize() / nthreads;
-
- for (size_t thread = 0; thread < nthreads_; ++thread) {
- vector<VertexId> vertices;
- if (thread == nthreads_ - 1) {
- tree.SeparateVertices(vertices, tree.GetSize());
- } else {
- tree.SeparateVertices(vertices, component_size);
- }
-
- size_t actual_size = vertices.size();
- for (size_t i = 0; i < actual_size; ++i) {
- vertices.push_back(graph.conjugate(vertices[i]));
- }
-
- ComponentPtr ptr (
- new ConjugateComponent(
- graph,
- restricted::PeriodicIdDistributor(graph.GetGraphIdDistributor(),
- graph.GetGraphIdDistributor()->GetId(),
- nthreads
- ),
- vertices.begin(),
- vertices.end()
- )
- );
-
- components_.push_back(ptr);
- }
-
- for (size_t i = 0; i < nthreads_; ++i) {
- RunnerPtr ptr (new Runner(*components_[i], factory->CreateAlgorithm(*components_[i])));
- runners_.push_back(ptr);
- }
- }
-
-    // Comparator defines the edge processing order; by default the order does not matter.
- template <class Comparator = std::less<EdgeId>>
- void Run(Comparator comparator = std::less<EdgeId>()) {
-
- if (nthreads_ > 1) {
- VERIFY(graph_.AllHandlersThreadSafe());
- }
-
- vector<EdgeId> not_processed_edges_with_duplicates;
-
- #pragma omp parallel for num_threads(nthreads_)
- for (size_t i = 0; i < nthreads_; ++i) {
- auto it = components_[i]->SmartEdgeBegin(comparator);
- runners_[i]->Run(it);
- }
-
- for (size_t i = 0; i < nthreads_; ++i) {
- components_[i]->Synchronize();
- }
-
-        restricted::PeriodicIdDistributor id_distributor(graph_.GetGraphIdDistributor(),
- graph_.GetGraphIdDistributor()->GetId(), 1);
-
- ConjugateComponent all_graph_component(
- graph_, id_distributor, graph_.begin(), graph_.end());
-
- for (size_t i = 0; i < nthreads_; ++i) {
- components_[i]->GetEdgesGoingOutOfComponent(not_processed_edges_with_duplicates);
- }
-
- for (size_t i = 0; i < nthreads_; ++i) {
- runners_[i]->GetNotProcessedArguments(not_processed_edges_with_duplicates);
- }
-
- Runner border_runner(all_graph_component, factory_->CreateAlgorithm(all_graph_component));
-
- auto border_edge_iterator = all_graph_component.SmartEdgeBegin(
- comparator, ¬_processed_edges_with_duplicates);
-
- border_runner.Run(border_edge_iterator);
-
-        // TODO: for debugging only; remove.
-        vector<EdgeId> border_not_processed_edges; // sanity-check vector; should remain empty.
-        border_runner.GetNotProcessedArguments(border_not_processed_edges);
-        if (border_not_processed_edges.size() != 0) {
-            INFO("WARNING: there are " << border_not_processed_edges.size() << " edges not processed in parallel");
- }
-// VERIFY(border_not_processed_edges.size() == 0);
-
- all_graph_component.Synchronize();
- }
-
-private:
- const size_t nthreads_;
- Graph& graph_;
- FactoryPtr factory_;
- vector<ComponentPtr> components_;
- vector<RunnerPtr> runners_;
-};
-
-
-template <class Graph, class Algorithm>
-class SequentialEdgeAlgorithm {
-private:
- Graph &graph_;
- Algorithm &algorithm_;
-
-public:
-
- typedef typename Graph::VertexId VertexId;
- typedef typename Graph::EdgeId EdgeId;
-
- SequentialEdgeAlgorithm(Graph& graph, Algorithm &algorithm)
- : graph_(graph), algorithm_(algorithm) {
- }
-
-    // Comparator defines the edge processing order; by default the order does not matter.
- template <class Comparator = std::less<EdgeId>>
- void Run(Comparator comparator = std::less<EdgeId>()) {
-
- algorithm_.Preprocessing();
-
- for (auto it = graph_.SmartEdgeBegin(comparator); !it.IsEnd(); ++it) {
- algorithm_.ProcessNext(*it);
- }
-
- algorithm_.Postprocessing();
- }
-
-private:
- DECL_LOGGER("ConcurrentEdgeAlgorithm")
-};
-} // namespace omnigraph
-
-
-#endif /* CONCURRENT_EDGE_ALGORITHM_HPP_ */
diff --git a/src/include/omni/concurrent_algo/concurrent_graph_component.hpp b/src/include/omni/concurrent_algo/concurrent_graph_component.hpp
deleted file mode 100644
index 2075066..0000000
--- a/src/include/omni/concurrent_algo/concurrent_graph_component.hpp
+++ /dev/null
@@ -1,472 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-
-/*
- * concurrent_graph_component.hpp
- *
- * Created on: Aug 10, 2012
- * Author: Alexander Opeykin (alexander.opeykin at gmail.com)
- */
-
-
-#ifndef CONCURRENT_GRAPH_COMPONENT_HPP_
-#define CONCURRENT_GRAPH_COMPONENT_HPP_
-
-
-#include "standard_base.hpp"
-#include "order_and_law.hpp"
-#include "abstract_editable_graph.hpp"
-
-
-namespace omnigraph {
-
-
-template <typename Graph>
-class ConcurrentGraphComponent
- : public AbstractEditableGraph<
- typename Graph::VertexId,
- typename Graph::EdgeId,
- typename Graph::DataMaster,
- typename unordered_set<typename Graph::VertexId>::const_iterator > {
-
-public:
-
- typedef typename Graph::VertexId VertexId;
- typedef typename Graph::EdgeId EdgeId;
- typedef typename Graph::EdgeData EdgeData;
- typedef typename Graph::VertexData VertexData;
- typedef typename unordered_set<VertexId>::const_iterator VertexIterator;
- typedef typename Graph::DataMaster DataMaster;
- typedef AbstractEditableGraph<VertexId, EdgeId, DataMaster, VertexIterator> base;
- typedef typename base::edge_const_iterator edge_const_iterator;
-
- template<class InputVertexIterator>
- ConcurrentGraphComponent(
- Graph& graph,
- HandlerApplier<VertexId, EdgeId>* applier,
- const restricted::PeriodicIdDistributor& id_distributor,
- InputVertexIterator verticesBegin,
- InputVertexIterator verticesEnd)
- : base(applier, graph.master()),
- graph_(graph), vertices_(verticesBegin, verticesEnd),
- edge_id_distributor_(id_distributor) {
-
- for (const VertexId& vertex : vertices_) {
- if (!IsInComponent(graph_.OutgoingEdges(vertex)) ||
- !IsInComponent(graph_.IncomingEdges(vertex))) {
-
- border_vertices_.insert(vertex);
- }
- }
- }
-
- edge_const_iterator out_begin(VertexId v) const {
- VERIFY_MSG(false, "Not implemented here");
- return edge_const_iterator();
- }
-
- edge_const_iterator out_end(VertexId v) const {
- VERIFY_MSG(false, "Not implemented here");
- return edge_const_iterator();
- }
-
- edge_const_iterator in_begin(VertexId v) const {
- VERIFY_MSG(false, "Not implemented here");
- return edge_const_iterator();
- }
-
- edge_const_iterator in_end(VertexId v) const {
- VERIFY_MSG(false, "Not implemented here");
- return edge_const_iterator();
- }
-
- edge_const_iterator in_begin() const {
- VERIFY_MSG(false, "Not implemented here");
- return edge_const_iterator();
- }
-
- edge_const_iterator in_end() const {
- VERIFY_MSG(false, "Not implemented here");
- return edge_const_iterator();
- }
-
- edge_const_iterator out_begin() const {
- VERIFY_MSG(false, "Not implemented here");
- return edge_const_iterator();
- }
-
- edge_const_iterator out_end() const {
- VERIFY_MSG(false, "Not implemented here");
- return edge_const_iterator();
- }
-
- virtual const EdgeData& data(EdgeId edge) const {
- return graph_.data(edge);
- }
-
- virtual const VertexData& data(VertexId vertex) const {
- return graph_.data(vertex);
- }
-
- virtual size_t OutgoingEdgeCount(VertexId vertex) const {
- if (IsAtBorder(vertex)) {
- return OutgoingEdges(vertex).size();
- } else {
- return graph_.OutgoingEdgeCount(vertex);
- }
- }
-
- virtual size_t IncomingEdgeCount(VertexId vertex) const {
- if (IsAtBorder(vertex)) {
- return IncomingEdges(vertex).size();
- } else {
- return graph_.IncomingEdgeCount(vertex);
- }
- }
-
- virtual const vector<EdgeId> OutgoingEdges(VertexId vertex) const {
- if (IsInComponent(vertex)) {
- if (IsAtBorder(vertex)) {
- return GetEdgesFromComponent(graph_.OutgoingEdges(vertex));
- } else {
- return graph_.OutgoingEdges(vertex);
- }
- } else {
- TRACE("Invalidate component action on OutgoingEdges for " << str(vertex));
- return vector<EdgeId>();
- }
- }
-
- virtual const vector<EdgeId> IncomingEdges(VertexId vertex) const {
- if (IsInComponent(vertex)) {
- if (IsAtBorder(vertex)) {
- return GetEdgesFromComponent(graph_.IncomingEdges(vertex));
- } else {
- return graph_.IncomingEdges(vertex);
- }
- } else {
- TRACE("Invalidate component action on IncomingEdges for " << str(vertex));
- return vector<EdgeId>();
- }
- }
-
- virtual vector<EdgeId> GetEdgesBetween(VertexId vertex1, VertexId vertex2) const {
- if (IsInComponent(vertex1) && IsInComponent(vertex2)) {
- return graph_.GetEdgesBetween(vertex1, vertex2);
- } else {
- TRACE("Invalidate component action on GetEdgesBetween for "
- << str(vertex1) << " and " << str(vertex2));
- return vector<EdgeId>();
- }
- }
-
- virtual VertexId EdgeStart(EdgeId edge) const {
- return graph_.EdgeStart(edge);
- }
-
- virtual VertexId EdgeEnd(EdgeId edge) const {
- return graph_.EdgeEnd(edge);
- }
-
- virtual bool RelatedVertices(VertexId vertex1, VertexId vertex2) const {
- return graph_.RelatedVertices(vertex1, vertex2);
- }
-
- virtual bool CanCompressVertex(const VertexId& vertex) const {
- return graph_.CanCompressVertex(vertex);
- }
-
- size_t k() const {
- return graph_.k();
- }
-
- double coverage(EdgeId edge) const {
- return graph_.coverage(edge);
- }
-
- size_t length(EdgeId edge) const {
- return graph_.length(edge);
- }
-
- size_t length(VertexId vertex) const {
- return graph_.length(vertex);
- }
-
- const Sequence& EdgeNucls(EdgeId edge) const {
- return graph_.EdgeNucls(edge);
- }
-
-
- virtual std::string str(const EdgeId edge) const {
- return graph_.str(edge);
- }
-
- virtual std::string str(const VertexId vertex) const {
- return graph_.str(vertex);
- }
-
- virtual size_t int_id(VertexId vertex) const {
- return graph_.int_id(vertex);
- }
-
- virtual size_t int_id(EdgeId edge) const {
- return graph_.int_id(edge);
- }
-
- template<typename Comparator>
- SmartEdgeIterator<ConcurrentGraphComponent, Comparator> SmartEdgeBegin(
- const Comparator& comparator, vector<EdgeId>* edges = 0) const {
-
- return SmartEdgeIterator<ConcurrentGraphComponent, Comparator>(*this, comparator, edges);
- }
-
- SmartEdgeIterator<ConcurrentGraphComponent> SmartEdgeBegin(vector<EdgeId>* edges = 0) const {
- return SmartEdgeIterator<ConcurrentGraphComponent, std::less<EdgeId>>(
- *this, std::less<EdgeId>(), edges);
- }
-
- virtual VertexIterator begin() const {
- return vertices_.begin();
- }
-
- virtual VertexIterator end() const {
- return vertices_.end();
- }
-
- virtual ~ConcurrentGraphComponent() {
-        // Failing here means that an algorithm run on this component
-        // created a non-temporary vertex (and did not delete it).
- VERIFY(temporary_vertices_.size() == 0);
- }
-
- void GetEdgesGoingOutOfComponent(vector<EdgeId>& output) {
- for (const VertexId& vertex : vertices_) {
- for (const EdgeId& edge : graph_.OutgoingEdges(vertex)) {
- if (!IsInComponent(edge)) {
- output.push_back(edge);
- }
- }
- }
- }
-
- void CompressVertex(VertexId vertex) {
- if (IsInternalSafe(vertex)) {
- base::CompressVertex(vertex);
- } else {
- vertices_to_compress_.insert(vertex);
- }
- }
-
-
-// Self methods
- bool IsInComponent(const VertexId& vertex) const {
- return vertices_.find(vertex) != vertices_.end();
- }
-
- bool IsInComponent(const EdgeId& edge) const {
- return IsInComponent(graph_.EdgeStart(edge)) && IsInComponent(graph_.EdgeEnd(edge));
- }
-
- bool IsInComponent(const std::vector<VertexId>& vertices) const {
- for (const VertexId& vertex : vertices) {
- if (!IsInComponent(vertex)) {
- return false;
- }
- }
-
- return true;
- }
-
- bool IsInComponent(const std::vector<EdgeId>& edges) const {
- for (const EdgeId& edge : edges) {
- if (!IsInComponent(edge)) {
- return false;
- }
- }
-
- return true;
- }
-
- bool IsAtBorder(const VertexId& vertex) const {
- return border_vertices_.find(vertex) != border_vertices_.end();
- }
-
- bool IsInternal(const VertexId& vertex) const {
- return IsInComponent(vertex) && !IsAtBorder(vertex);
- }
-
- virtual bool IsInternalSafe(const VertexId& vertex) const = 0;
-
- virtual bool IsInComponentSafe(const EdgeId& edge) const = 0;
-
- virtual bool IsInternalSafe(const EdgeId& edge) const = 0;
-
- virtual bool IsInternalSafe(const vector<EdgeId>& path) const {
- for (EdgeId edge : path) {
- if (!IsInternalSafe(edge)) {
- return false;
- }
- }
-
- return true;
- }
-
- virtual bool IsInComponentSafe(const vector<EdgeId>& path) const {
- for (EdgeId edge : path) {
- if (!IsInComponentSafe(edge)) {
- return false;
- }
- }
-
- return true;
- }
-
- void Synchronize() {
- TRACE("Start synchronize");
- edge_id_distributor_.Synchronize();
-
- for (VertexId vertex : deleted_vertices_) {
- graph_.HiddenDeleteVertex(vertex);
- }
-
- for (VertexId vertex : vertices_to_compress_) {
- if (graph_.CanCompressVertex(vertex)) {
- base::CompressVertex(vertex);
- }
- }
-
- deleted_vertices_.resize(0);
- TRACE("Finish synchronize");
- }
-
-
-protected:
-
- virtual void AddVertexToComponent(VertexId vertex) = 0;
-
- virtual bool AdditionalCompressCondition(VertexId vertex) const {
- return graph_.AdditionalCompressCondition(vertex);
- }
-
- virtual EdgeId HiddenAddEdge(VertexId vertex1, VertexId vertex2, const EdgeData &data) {
- return HiddenAddEdge(vertex1, vertex2, data, &edge_id_distributor_);
- }
-
- virtual EdgeId HiddenAddEdge(VertexId vertex1, VertexId vertex2,
- const EdgeData &data, restricted::IdDistributor * id_distributor) {
- //VERIFY(IsInComponent(vertex1));
- //VERIFY(IsInComponent(vertex2));
- return graph_.HiddenAddEdge(vertex1, vertex2, data, id_distributor);
- }
-
- virtual void HiddenDeleteEdge(EdgeId edge) {
- //VERIFY(IsInComponent(edge));
- graph_.HiddenDeleteEdge(edge);
- }
-
- virtual vector<EdgeId> CorrectMergePath(const vector<EdgeId>& path) {
-// VERIFY(IsInComponent(path)); // TODO: debug only??
- vector<EdgeId> corrected_path = graph_.CorrectMergePath(path);
-// VERIFY(IsInComponent(corrected_path)); // TODO: are always from same component??
- return corrected_path;
- }
-
- virtual vector<EdgeId> EdgesToDelete(const vector<EdgeId> &path) {
- vector<EdgeId> edges_to_delete = graph_.EdgesToDelete(path);
- //VERIFY(IsInComponent(edges_to_delete));
- return edges_to_delete;
- }
-
- virtual vector<VertexId> VerticesToDelete(const vector<EdgeId> &path) {
- vector<VertexId> vertices_to_delete = graph_.VerticesToDelete(path);
- //VERIFY(IsInComponent(vertices_to_delete));
- return vertices_to_delete;
- }
-
- virtual VertexId CreateVertex(const VertexData &data) {
- return graph_.CreateVertex(data);
- }
-
- virtual void DestroyVertex(VertexId vertex) {
- graph_.DestroyVertex(vertex);
- }
-
-protected:
- // observable graph methods.
- virtual void FireAddVertex(VertexId vertex) {
- base::FireAddVertex(vertex);
- graph_.FireAddVertex(vertex);
- }
-
- virtual void FireAddEdge(EdgeId edge) {
- base::FireAddEdge(edge);
- graph_.FireAddEdge(edge);
- }
-
- virtual void FireDeleteVertex(VertexId vertex) {
- base::FireDeleteVertex(vertex);
- graph_.FireDeleteVertex(vertex);
- }
-
- virtual void FireDeleteEdge(EdgeId edge) {
- base::FireDeleteEdge(edge);
- graph_.FireDeleteEdge(edge);
- }
-
- virtual void FireMerge(vector<EdgeId> oldEdges, EdgeId newEdge) {
- base::FireMerge(oldEdges, newEdge);
- graph_.FireMerge(oldEdges, newEdge);
- }
-
- virtual void FireGlue(EdgeId new_edge, EdgeId edge1, EdgeId edge2) {
- base::FireGlue(new_edge, edge1, edge2);
- graph_.FireGlue(new_edge, edge1, edge2);
- }
-
- virtual void FireSplit(EdgeId edge, EdgeId newEdge1, EdgeId newEdge2) {
- base::FireSplit(edge, newEdge1, newEdge2);
- graph_.FireSplit(edge, newEdge1, newEdge2);
- }
-
-    // Returns edges that start inside the component. WARNING: border edges are included as well.
- const vector<EdgeId> GetEdgesFromComponent(const std::vector<EdgeId>& edges) const {
- vector<EdgeId> edges_from_component;
- edges_from_component.reserve(edges.size());
-
- for (const EdgeId& edge : edges) {
- if (IsInComponent(graph_.EdgeStart(edge))) {
- edges_from_component.push_back(edge);
- }
- }
-
- return edges_from_component;
- }
-
-protected:
-
- Graph& graph_;
-
- unordered_set<VertexId> vertices_;
- unordered_set<VertexId> border_vertices_;
-
- vector<VertexId> deleted_vertices_;
- unordered_set<VertexId> temporary_vertices_;
- unordered_set<VertexId> vertices_to_compress_;
-
- restricted::PeriodicIdDistributor edge_id_distributor_;
-
-private:
- DECL_LOGGER("ConcurrentGraphComponent");
-};
-
-
-} //namespace omnigraph
-
-
-
-#endif /* CONCURRENT_GRAPH_COMPONENT_HPP_ */
diff --git a/src/include/omni/concurrent_algo/conjugate_vertex_glued_graph.hpp b/src/include/omni/concurrent_algo/conjugate_vertex_glued_graph.hpp
deleted file mode 100644
index 59b8a12..0000000
--- a/src/include/omni/concurrent_algo/conjugate_vertex_glued_graph.hpp
+++ /dev/null
@@ -1,124 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-
-/*
- * conjugate_vertex_glued_graph.hpp
- *
- * Created on: Sep 3, 2012
- * Author: Alexander Opeykin (alexander.opeykin at gmail.com)
- */
-
-
-#ifndef CONJUGATE_VERTEX_GLUED_GRAPH_HPP_
-#define CONJUGATE_VERTEX_GLUED_GRAPH_HPP_
-
-#include "omni_utils.hpp"
-#include "standard_base.hpp"
-
-
-namespace omnigraph {
-
-/*
- * This class is used as a graph wrapper for the DevisibleTree class.
- * It decorates a couple of methods to hide the differences between
- * a vertex and its conjugate.
- */
-
-template <class Graph>
-class ConjugateVertexGluedGraph {
-public:
-
- typedef typename Graph::VertexId VertexId;
- typedef typename Graph::EdgeId EdgeId;
- typedef typename set<VertexId>::iterator iterator;
- typedef typename set<VertexId>::const_iterator const_iterator;
-
-
- ConjugateVertexGluedGraph(Graph& graph)
- : graph_(graph) {
-
- for (const VertexId& vertex : graph_) {
- vertices_.insert(GetMinWithConjugate(vertex));
- }
- }
-
- VertexId GetMinWithConjugate(const VertexId& vertex) const{
- VertexId conjugate = graph_.conjugate(vertex);
-
- return vertex < conjugate ? vertex : conjugate;
- }
-
- const_iterator begin() const {
- return vertices_.begin();
- }
-
- const_iterator end() const {
- return vertices_.end();
- }
-
- VertexId EdgeStart(const EdgeId& edge) const {
- return GetMinWithConjugate(graph_.EdgeStart(edge));
- }
-
- VertexId EdgeEnd(const EdgeId& edge) const {
- return GetMinWithConjugate(graph_.EdgeEnd(edge));
- }
-
- SmartEdgeIterator<Graph> SmartEdgeBegin() const {
- return SmartEdgeIterator<Graph>(graph_);
- }
-
- template<typename Comparator>
- SmartEdgeIterator<Graph, Comparator> SmartEdgeBegin(
- const Comparator& comparator) const {
- return SmartEdgeIterator<Graph, Comparator>(graph_, comparator);
- }
-
-
- const vector<EdgeId> OutgoingEdges(VertexId vertex) const {
- return JoinVectors(
- graph_.OutgoingEdges(vertex),
- graph_.OutgoingEdges(graph_.conjugate(vertex)));
- }
-
- const vector<EdgeId> IncomingEdges(VertexId vertex) const {
- return JoinVectors(
- graph_.IncomingEdges(vertex),
- graph_.IncomingEdges(graph_.conjugate(vertex)));
- }
-
- string str(VertexId vertex) const {
- return graph_.str(vertex);
- }
-
- string str(EdgeId edge) const {
- return graph_.str(edge);
- }
-
- size_t length(EdgeId edge) const {
- return graph_.length(edge);
- }
-
-
-private:
- const vector<EdgeId> JoinVectors(const vector<EdgeId>& edges1, const vector<EdgeId>& edges2) const {
- vector<EdgeId> result;
- result.insert(result.end(), edges1.begin(), edges1.end());
- result.insert(result.end(), edges2.begin(), edges2.end());
- return result;
- }
-
-
-private:
- Graph& graph_;
- set<VertexId> vertices_;
-};
-
-} // namespace omnigraph
-
-#endif /* CONJUGATE_VERTEX_GLUED_GRAPH_HPP_ */
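A short hedged sketch of what the gluing means in practice: each vertex and its conjugate share one representative, so iteration and edge endpoints are expressed in terms of that representative. The graph g and the helper below are placeholders:

// Hypothetical sketch; counts one representative per {vertex, conjugate} pair.
template<class Graph>
size_t CountGluedVertices(Graph& g) {
    omnigraph::ConjugateVertexGluedGraph<Graph> glued(g);
    size_t representatives = 0;
    for (auto it = glued.begin(); it != glued.end(); ++it) {
        ++representatives;
    }
    return representatives;   // roughly half of the vertex count of g
}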
diff --git a/src/include/omni/concurrent_algo/devisible_tree.hpp b/src/include/omni/concurrent_algo/devisible_tree.hpp
deleted file mode 100644
index 17c5904..0000000
--- a/src/include/omni/concurrent_algo/devisible_tree.hpp
+++ /dev/null
@@ -1,320 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-
-/*
- * devisible_tree.hpp
- *
- * Created on: Aug 23, 2012
- * Author: Alexander Opeykin (alexander.opeykin at gmail.com)
- */
-
-
-#ifndef DEVISIBLE_TREE_HPP_
-#define DEVISIBLE_TREE_HPP_
-
-
-#include <boost/pending/disjoint_sets.hpp>
-#include <boost/property_map/property_map.hpp>
-#include <queue>
-
-#include "standard_base.hpp"
-#include "omni_utils.hpp"
-
-
-namespace omnigraph {
-
-template <class Value>
-class TreeNode {
-
- struct NodeComparator {
- bool operator() (TreeNode* node1, TreeNode* node2) {
- return node1->GetSize() < node2->GetSize();
- }
- };
-
- struct SizePred {
- const size_t size;
- SizePred(size_t size) : size(size) { }
- bool operator()(TreeNode* node) {
- return node->GetSize() >= size;
- }
- };
-
-public:
- TreeNode() : subtree_size_(0) { }
-
- void AddChild(TreeNode& node) {
- children_.push_back(&node);
- }
-
- void UpdateSubtreeSize() {
- for (TreeNode* node : children_) {
- subtree_size_ += node->GetSize();
- }
- }
-
- virtual size_t GetSize() const {
- return subtree_size_;
- }
-
- TreeNode* GetSubtreeWithSize(const size_t size) {
- // find big enough child.
- auto it = std::find_if(children_.begin(), children_.end(), SizePred(size));
- if (it == children_.end()) {
- return 0;
- }
-
- TreeNode* node = (*it)->GetSubtreeWithSize(size);
- if (node == 0) {
- node = *it;
- children_.erase(it);
- }
-
- subtree_size_ -= node->GetSize();
- return node;
- }
-
- virtual void CollectValue(vector<Value>& output) {
- }
-
- virtual void CollectNodes(std::queue<TreeNode<Value>*>& nodes) {
- for (TreeNode* node : children_) {
- nodes.push(node);
- }
- subtree_size_ = 0;
- children_.clear();
- }
-
- virtual ~TreeNode() { }
-
-private:
- list<TreeNode *> children_;
- size_t subtree_size_;
-};
-
-
-template <class Value>
-class TreeNodeWithValue : public TreeNode<Value> {
-
-public:
- TreeNodeWithValue(const Value& value) : value_(value) {
- }
-
- virtual ~TreeNodeWithValue() { }
-
- virtual void CollectValue(vector<Value>& output) {
- output.push_back(value_);
- }
-
- virtual size_t GetSize() const {
- return TreeNode<Value>::GetSize() + 1;
- }
-
- Value GetValue() const {
- return value_;
- }
-
-
-private:
- Value value_;
-};
-
-
-template <class Graph>
-class DevisibleTree {
-
- typedef typename Graph::VertexId VertexId;
- typedef typename Graph::EdgeId EdgeId;
- typedef TreeNodeWithValue<VertexId> Node;
- typedef TreeNode<VertexId> RootNode;
-
-
-public:
-
- DevisibleTree(Graph & graph) : graph_(graph) {
- typedef unordered_map<VertexId, int> RankMap;
- typedef unordered_map<VertexId, VertexId> ParentMap;
-
- typedef boost::associative_property_map<RankMap> BoostRankMap;
- typedef boost::associative_property_map<ParentMap> BoostParentMap;
-
- RankMap rank_map;
- ParentMap parent_map;
-
- BoostRankMap boost_rank_map(rank_map);
- BoostParentMap boost_parent_map(parent_map);
-
- boost::disjoint_sets<BoostRankMap, BoostParentMap> dset(boost_rank_map, boost_parent_map);
-
- for (const VertexId& vertex : graph_) {
- dset.make_set(vertex);
- }
-
- for (const VertexId& vertex : graph_) {
- nodes_.push_back(Node(vertex));
- index_[vertex] = nodes_.size() - 1;
- }
-
- TRACE("Creating tree of size:" << nodes_.size());
-
-
-// build trees
-// for (auto it = graph_.SmartEdgeBegin(LengthComparator<Graph>(graph_)); !it.IsEnd(); ++it) {
- for (VertexId vertex : graph) {
- for (EdgeId edge : graph.OutgoingEdges(vertex)) {
- VertexId start = graph_.EdgeStart(edge);
- VertexId end = graph_.EdgeEnd(edge);
-
- VertexId start_root = dset.find_set(start);
- VertexId end_root = dset.find_set(end);
-
- if (start_root != end_root) {
- dset.link(start_root, end_root);
- edges_.insert(edge);
- }
- }
- }
-
- TRACE("Node quantity: " << nodes_.size());
- TRACE("Edges for tree: " << edges_.size());
-
- unordered_set<VertexId> forest_roots;
-
- for (VertexId vertex : graph_) {
- forest_roots.insert(dset.find_set(vertex));
- }
-
- for (VertexId vertex : forest_roots) {
- Node& node = GetNode(vertex);
- TRACE("Adding " << vertex);
- CreateTree(node, edges_);
- root_.AddChild(node);
- }
- root_.UpdateSubtreeSize();
-
- }
-
- void SeparateVertices(vector<VertexId>& output, size_t size) {
- TreeNode<VertexId>* node = root_.GetSubtreeWithSize(min(size, GetSize()));
- if (node == 0) {
- node = &root_;
- }
- output.reserve(node->GetSize());
- CollectValues(node, output);
- }
-
- void CollectValues(TreeNode<VertexId>* root, vector<VertexId>& output) {
- std::queue<TreeNode<VertexId>*> nodes;
- nodes.push(root);
-
- while (nodes.size() > 0) {
- TreeNode<VertexId>* node = nodes.front();
- nodes.pop();
- node->CollectNodes(nodes);
- node->CollectValue(output);
- }
- }
-
- size_t GetSize() const {
- return root_.GetSize();
- }
-
-
-private:
-
- const vector<EdgeId> GetEdges(VertexId vertex) const {
- vector<EdgeId> result;
- vector<EdgeId> incoming = graph_.IncomingEdges(vertex);
- vector<EdgeId> outgoing = graph_.OutgoingEdges(vertex);
- result.insert(result.end(), outgoing.begin(), outgoing.end());
- result.insert(result.end(), incoming.begin(), incoming.end());
- return result;
- }
-
- const vector<EdgeId> Filter(const vector<EdgeId>& vertex_edges, unordered_set<EdgeId>& edges) const {
- vector<EdgeId> result;
- for (EdgeId edge : vertex_edges) {
- auto it = edges.find(edge);
- if (it != edges.end()) {
- TRACE("Edge " << edge << " went through the filter");
- result.push_back(edge);
- edges.erase(it);
- }
- }
- return result;
- }
-
- VertexId GetSecond(VertexId first, EdgeId edge) const {
- VertexId start = graph_.EdgeStart(edge);
- VertexId end = graph_.EdgeEnd(edge);
- return (first == start) ? end : start;
- }
-
-
- const vector<VertexId> GetNeighbours(VertexId vertex, unordered_set<EdgeId>& edges) const {
- const vector<EdgeId> vertex_tree_edges = Filter(GetEdges(vertex), edges);
- vector<VertexId> neighbours;
- for (EdgeId edge : vertex_tree_edges) {
- neighbours.push_back(GetSecond(vertex, edge));
- }
- return neighbours;
- }
-
- enum {
- white,
- grey
- };
- void CreateTree(Node& root, unordered_set<EdgeId>& tree_edges) {
- typedef pair<Node*, bool> stack_elem;
- stack<stack_elem> nodes;
- nodes.push(stack_elem(&root, white));
-
- while(!nodes.empty()) {
- stack_elem elem = nodes.top();
- nodes.pop();
- Node* node = elem.first;
-
- if (elem.second == white) {
- nodes.push(stack_elem(node, grey));
- const vector<VertexId> neighbours = GetNeighbours(node->GetValue(), tree_edges);
- TRACE("Tree has " << tree_edges.size() << " edges");
- for (VertexId neighbour : neighbours) {
- TRACE("Adding " << neighbour);
- Node& child = GetNode(neighbour);
- node->AddChild(child);
- nodes.push(stack_elem(&child, white));
- }
- } else {
- node->UpdateSubtreeSize();
- }
- }
- }
-
- Node& GetNode(const VertexId& vertex) {
- auto it = index_.find(vertex);
- VERIFY(it != index_.end());
-
- return nodes_[it->second];
- }
-
-
-private:
- unordered_map<VertexId, size_t> index_;
- unordered_set<EdgeId> edges_;
- vector<Node> nodes_;
- Graph& graph_;
- RootNode root_;
-
-private:
- DECL_LOGGER("DevisibleTree");
-};
-
-} // namespace omnigraph
-
-
-#endif /* DEVISIBLE_TREE_HPP_ */
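A hedged sketch of how the tree is meant to be used for splitting work, mirroring the loop in the ConcurrentEdgeAlgorithm constructor shown earlier; the helper, graph and thread count are placeholders:

// Hypothetical helper: split the vertices of a graph into roughly equal chunks.
template<class Graph>
std::vector<std::vector<typename Graph::VertexId>> SplitVertices(Graph& g, size_t nthreads) {
    typedef typename Graph::VertexId VertexId;
    omnigraph::DevisibleTree<Graph> tree(g);
    const size_t chunk_size = tree.GetSize() / nthreads;
    std::vector<std::vector<VertexId>> chunks(nthreads);
    for (size_t t = 0; t < nthreads; ++t) {
        // the last chunk absorbs the remainder, as in ConcurrentEdgeAlgorithm
        tree.SeparateVertices(chunks[t], t + 1 == nthreads ? tree.GetSize() : chunk_size);
    }
    return chunks;
}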
diff --git a/src/include/omni/concurrent_algo/sequential_algorihtm_factory.hpp b/src/include/omni/concurrent_algo/sequential_algorihtm_factory.hpp
deleted file mode 100644
index 7b0e2db..0000000
--- a/src/include/omni/concurrent_algo/sequential_algorihtm_factory.hpp
+++ /dev/null
@@ -1,38 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-
-/*
- * sequential_algorihtm_factory.hpp
- *
- * Created on: Sep 7, 2012
- * Author: Alexander Opeykin (alexander.opeykin at gmail.com)
- */
-
-
-#ifndef SEQUENTIAL_ALGORIHTM_FACTORY_HPP_
-#define SEQUENTIAL_ALGORIHTM_FACTORY_HPP_
-
-#include "sequential_algorithm.hpp"
-
-#include <memory>
-
-namespace omnigraph {
-
-template <class Graph, class Argument>
-class SequentialAlgorihtmFactory {
-
-public:
- typedef std::shared_ptr<SequentialAlgorithm<Argument>> AlgorithmPtr;
-
- virtual AlgorithmPtr CreateAlgorithm(Graph& graph) = 0;
- virtual ~SequentialAlgorihtmFactory() { }
-};
-
-} // namespace omnigraph
-
-#endif /* SEQUENTIAL_ALGORIHTM_FACTORY_HPP_ */
diff --git a/src/include/omni/concurrent_algo/sequential_algorithm.hpp b/src/include/omni/concurrent_algo/sequential_algorithm.hpp
deleted file mode 100644
index 14a7e6a..0000000
--- a/src/include/omni/concurrent_algo/sequential_algorithm.hpp
+++ /dev/null
@@ -1,37 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-
-/*
- * sequential_algorithm.hpp
- *
- * Created on: Sep 7, 2012
- * Author: Alexander Opeykin (alexander.opeykin at gmail.com)
- */
-
-/*
- * Interface for algorithms that can be processed part by part.
- */
-
-#ifndef SEQUENTIAL_ALGORITHM_HPP_
-#define SEQUENTIAL_ALGORITHM_HPP_
-
-namespace omnigraph {
-
-template <class T>
-class SequentialAlgorithm {
-
-public:
- virtual ~SequentialAlgorithm() { }
-
- virtual void Preprocessing() { }
- virtual void Postprocessing() { }
- virtual bool ProcessNext(const T& arg) = 0;
-};
-
-} //namespace omnigraph
-#endif /* SEQUENTIAL_ALGORITHM_HPP_ */
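Since this interface is what the concurrent machinery consumes, a minimal hedged example of an implementation; the edge-counting logic is invented purely for illustration:

// Hypothetical SequentialAlgorithm: "processes" every edge by counting it and
// always succeeds, so nothing is deferred to the border pass.
template<class Graph>
class EdgeCountingAlgorithm : public omnigraph::SequentialAlgorithm<typename Graph::EdgeId> {
public:
    virtual void Preprocessing() { count_ = 0; }
    virtual bool ProcessNext(const typename Graph::EdgeId& /*edge*/) {
        ++count_;
        return true;   // returning false would mark the edge as not processed
    }
    virtual void Postprocessing() { /* e.g. report count_ */ }
private:
    size_t count_ = 0;
};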
diff --git a/src/include/omni/coverage.hpp b/src/include/omni/coverage.hpp
deleted file mode 100644
index 6d8a872..0000000
--- a/src/include/omni/coverage.hpp
+++ /dev/null
@@ -1,342 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-/*
- * coverage.hpp
- *
- * Created on: Jun 21, 2011
- * Author: sergey
- */
-
-#pragma once
-
-#include "logger/logger.hpp"
-#include <iostream>
-#include <vector>
-#include <algorithm>
-#include "../xmath.h"
-namespace omnigraph {
-
-using std::vector;
-//todo save/load absolute coverage
-template<class Graph>
-class CoverageIndex : public GraphActionHandler<Graph> {
- typedef typename Graph::VertexId VertexId;
- typedef typename Graph::EdgeId EdgeId;
- //typedef unordered_map<EdgeId, int> map_type;
-
- Graph& g_;
-// map_type storage_;
-
-// size_t KPlusOneMerCoverage(EdgeId edge) const {
-// return (size_t) math::round(coverage(edge) * (double) this->g().length(edge));
-// }
-
-// template<class ReadThreader>
-// Path<EdgeId> ProcessSequence(const ReadThreader& threader,
-// const Sequence& sequence) const {
-// return threader.MapSequence(sequence);
-// }
-
-// void AddPathsToGraph(const Path<EdgeId>& path) {
-//
-// if (path.sequence().size() == 0)
-// return;
-//
-// const vector<EdgeId>& edges_list = path.sequence();
-//
-// for (auto it = edges_list.cbegin(); it != edges_list.cend(); ++it) {
-// IncCoverage(*it, this->g().length(*it));
-// }
-// IncCoverage(edges_list[0], -int(path.start_pos()));
-// EdgeId last = edges_list[edges_list.size() - 1];
-// IncCoverage(last, int(path.end_pos()) - int(this->g().length(last)));
-// }
-
-// void IncCoverageInMap(EdgeId edge, int toAdd, map_type& map) {
-// //VERIFY(toAdd >= 0);
-// map[edge] += toAdd;
-// VERIFY(map[edge] >= 0);
-// }
-//
-// void AddPathsToMap(const Path<EdgeId>& path, map_type& map) {
-//
-// if (path.sequence().size() == 0)
-// return;
-//
-// const vector<EdgeId>& edges_list = path.sequence();
-//
-// for (auto it = edges_list.cbegin(); it != edges_list.cend(); ++it) {
-// IncCoverageInMap(*it, this->g().length(*it), map);
-// }
-// IncCoverageInMap(edges_list[0], -int(path.start_pos()), map);
-// EdgeId last = edges_list[edges_list.size() - 1];
-// IncCoverageInMap(last,
-// int(path.end_pos()) - int(this->g().length(last)),
-// map);
-// }
-
- public:
- CoverageIndex(Graph &g)
- : GraphActionHandler<Graph>(g, "CoverageIndex"), g_(g) {
- }
-
- virtual ~CoverageIndex() {
- }
-
- /**
-     * Sets coverage in NON-averaged (raw) units.
- */
- void SetRawCoverage(EdgeId e, unsigned cov) {
- g_.data(e).set_raw_coverage(cov);
- }
-
- void IncRawCoverage(EdgeId e, unsigned count) {
- g_.data(e).inc_raw_coverage((int)count);
- }
-
- void SetAvgCoverage(EdgeId e, double cov) {
- g_.data(e).set_raw_coverage((int) math::round(cov * (double) this->g().length(e)));
- }
-
- /**
- * Returns average coverage of the edge
- */
- double coverage(EdgeId edge) const {
- return (double) RawCoverage(edge) / (double) this->g().length(edge);
- }
-
- unsigned RawCoverage(EdgeId edge) const {
- return g_.data(edge).raw_coverage();
- }
-// /**
-// * Returns average coverage of the edge
-// */
-// double operator[](EdgeId e) const {
-// return coverage(e);
-// }
-
-// /**
-// * Method increases coverage value
-// */
-// void IncCoverage(EdgeId edge, int to_add) {
-// edge->IncCoverage(to_add);
-// VERIFY(edge->GetRawCoverage() >= 0);
-// }
-//
-// /**
-// * Method increases coverage value by 1
-// */
-// void IncCoverage(EdgeId edge) {
-// IncCoverage(edge, 1);
-// }
-
-// template<class ReadThreader, class Read>
-// void Fill(io::IReader<Read>& stream, const ReadThreader& threader) {
-//
-// INFO("Processing reads (takes a while)");
-// size_t counter = 0;
-// stream.reset();
-//
-// while (!stream.eof()) {
-// Read r;
-// stream >> r;
-// Path<EdgeId> path = ProcessSequence(threader, r.sequence());
-// AddPathsToGraph(path);
-//
-// VERBOSE_POWER(++counter, " reads processed");
-// }
-//
-// INFO("DeBruijn graph coverage counted, reads used: " << counter);
-// }
-//
-// template<class ReadThreader, class Read>
-// void FillParallel(io::ReadStreamVector<io::IReader<Read> >& streams,
-// const ReadThreader& threader, size_t buffer_size) {
-//
-// INFO("Processing reads (takes a while)");
-// perf_counter pc;
-// size_t counter = 0;
-//
-// size_t nthreads = streams.size();
-// size_t buf_size = buffer_size
-// / (nthreads * (sizeof(Path<EdgeId> ) + 32));
-//
-//#pragma omp parallel num_threads(nthreads)
-// {
-//#pragma omp for reduction(+ : counter)
-// for (size_t i = 0; i < nthreads; ++i) {
-//
-// Read r;
-// io::IReader<Read>& stream = streams[i];
-// stream.reset();
-// std::vector<Path<EdgeId> > buffer(buf_size);
-//
-// size_t j = 0;
-// while (!stream.eof()) {
-// stream >> r;
-// ++counter;
-// buffer[j++] = ProcessSequence(threader, r.sequence());
-//
-// if (j == buf_size) {
-// j = 0;
-//
-//#pragma omp critical
-// {
-// for (size_t l = 0; l < buf_size; ++l) {
-// AddPathsToGraph(buffer[l]);
-// }
-// }
-// }
-// }
-//
-//#pragma omp critical
-// {
-// for (size_t l = 0; l < j; ++l) {
-// AddPathsToGraph(buffer[l]);
-// }
-// }
-// }
-//
-// }
-//
-// INFO("DeBruijn graph coverage counted, reads used: " << counter);
-//
-// INFO("Elapsed time: " << pc.time_ms());
-// }
-//
-// template<class ReadThreader, class Read>
-// void FillFastParallel(
-// io::ReadStreamVector<io::IReader<Read> >& streams,
-// const ReadThreader& threader) {
-//
-// INFO("Processing reads (takes a while)");
-// perf_counter pc;
-// size_t counter = 0;
-//
-// size_t nthreads = streams.size();
-////
-// std::vector<map_type*> maps(nthreads);
-//// maps[0] = &storage_;
-//
-// for (size_t i = 0; i < nthreads; ++i) {
-// maps[i] = new map_type();
-// }
-//
-//#pragma omp parallel num_threads(nthreads)
-// {
-//#pragma omp for reduction(+ : counter)
-// for (size_t i = 0; i < nthreads; ++i) {
-//
-// Read r;
-// io::IReader<Read>& stream = streams[i];
-// stream.reset();
-// Path<EdgeId> path;
-//
-// while (!stream.eof()) {
-// stream >> r;
-// ++counter;
-// path = ProcessSequence(threader, r.sequence());
-//
-// AddPathsToMap(path, *maps[i]);
-// }
-// }
-// }
-//
-// INFO("Merging maps");
-// for (size_t i = 0; i < nthreads; ++i) {
-// for (auto it = maps[i]->begin(); it != maps[i]->end(); ++it) {
-// it->first->IncCoverage(it->second);
-// }
-// delete maps[i];
-// }
-//
-// INFO("DeBruijn graph coverage counted, reads used: " << counter);
-//
-// INFO("Elapsed time: " << pc.time_ms());
-// }
-
-// template<class Index>
-// void FillFromIndex(Index& index) {
-// for (auto I = index.value_cbegin(), E = index.value_cend();
-// I != E; ++I) {
-// const auto& edge_info = *I;
-// VERIFY(edge_info.offset != -1u);
-// VERIFY(edge_info.edge_id.get() != NULL);
-// IncRawCoverage(edge_info.edge_id, edge_info.count);
-// }
-//
-// DEBUG("Coverage counted");
-// }
-
- virtual void HandleDelete(EdgeId edge) {
- SetRawCoverage(edge, 0);
- }
-
- virtual void HandleMerge(const vector<EdgeId>& old_edges, EdgeId new_edge) {
- unsigned coverage = 0;
- for (auto it = old_edges.begin(); it != old_edges.end(); ++it) {
- coverage += RawCoverage(*it);
- }
- SetRawCoverage(new_edge, coverage);
- }
-
- virtual void HandleGlue(EdgeId new_edge, EdgeId edge1, EdgeId edge2) {
- SetRawCoverage(new_edge, RawCoverage(edge1) + RawCoverage(edge2));
- }
-
- virtual void HandleSplit(EdgeId old_edge, EdgeId new_edge1, EdgeId new_edge2) {
-// size_t length1 = this->g().length(newEdge1);
-// size_t length = this->g().length(oldEdge);
-// size_t coverage = KPlusOneMerCoverage(oldEdge);
-// size_t coverage1 = coverage * length1 / length;
-// if (coverage1 == 0)
-// coverage1 = 1;
-// size_t coverage2 = coverage - coverage1;
-// if (coverage2 == 0)
-// coverage2 = 1;
-// SetCoverage(newEdge1, coverage1);
-// SetCoverage(newEdge2, coverage2);
- double avg_cov = coverage(old_edge);
- if (old_edge == g_.conjugate(old_edge)) {
- int raw1 = std::max(1, (int) math::round(avg_cov * (double) this->g().length(new_edge1)));
- SetRawCoverage(new_edge1, raw1);
- SetRawCoverage(g_.conjugate(new_edge1), raw1);
- SetRawCoverage(new_edge2, std::max(1, (int) math::round(avg_cov * (double) this->g().length(new_edge2))));
- } else {
- SetRawCoverage(new_edge1, std::max(1, (int) math::round(avg_cov * (double) this->g().length(new_edge1))));
- SetRawCoverage(new_edge2, std::max(1, (int) math::round(avg_cov * (double) this->g().length(new_edge2))));
- }
- }
-
- void Save(EdgeId e, std::ostream& out) const {
- out << fmt::format("{:.6f}", coverage(e));
- }
-
- void Load(EdgeId e, std::istream& in) {
- double cov;
- in >> cov;
- SetAvgCoverage(e, cov);
- }
-
- /*
-     * Thread-safe as long as different threads process different edges.
- */
- bool IsThreadSafe() const {
- return true;
- }
-};
-
-//todo discuss with Anton
-template<class Graph>
-class AbstractFlankingCoverage {
-public:
- virtual double GetInCov(typename Graph::EdgeId edge) const = 0;
- virtual double GetOutCov(typename Graph::EdgeId edge) const = 0;
-};
-
-}
diff --git a/src/include/omni/dijkstra_tools/dijkstra_algorithm.hpp b/src/include/omni/dijkstra_tools/dijkstra_algorithm.hpp
deleted file mode 100644
index e66f0ec..0000000
--- a/src/include/omni/dijkstra_tools/dijkstra_algorithm.hpp
+++ /dev/null
@@ -1,288 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-#pragma once
-
-#include "simple_tools.hpp"
-#include "dijkstra_settings.hpp"
-
-#include <queue>
-#include <vector>
-#include <set>
-#include <map>
-
-namespace omnigraph {
-
-template<typename Graph, typename distance_t = size_t>
-struct element_t{
- typedef typename Graph::VertexId VertexId;
- typedef typename Graph::EdgeId EdgeId;
-
- distance_t distance;
- VertexId curr_vertex;
- VertexId prev_vertex;
- EdgeId edge_between;
-
- element_t(distance_t new_distance, VertexId new_cur_vertex, VertexId new_prev_vertex,
- EdgeId new_edge_between) : distance(new_distance), curr_vertex(new_cur_vertex),
- prev_vertex(new_prev_vertex), edge_between(new_edge_between) { }
-};
-
-template<typename T>
-class ReverseDistanceComparator {
-public:
- ReverseDistanceComparator() {
- }
-
- bool operator()(T obj1, T obj2){
- if(obj1.distance != obj2.distance)
- return obj2.distance < obj1.distance;
- if(obj2.curr_vertex != obj1.curr_vertex)
- return obj2.curr_vertex < obj1.curr_vertex;
- if(obj2.prev_vertex != obj1.prev_vertex)
- return obj2.prev_vertex < obj1.prev_vertex;
- return obj2.edge_between < obj1.edge_between;
- }
-};
-
-template<class Graph, class DijkstraSettings, typename distance_t = size_t>
-class Dijkstra {
- typedef typename Graph::VertexId VertexId;
- typedef typename Graph::EdgeId EdgeId;
- typedef distance_t DistanceType;
-
- typedef std::map<VertexId, distance_t> distances_map;
- typedef typename distances_map::const_iterator distances_map_ci;
- typedef typename std::priority_queue<element_t<Graph, distance_t>, std::vector<element_t<Graph, distance_t>>,
- ReverseDistanceComparator<element_t<Graph, distance_t>>> queue_t;
-
- // constructor parameters
- const Graph& graph_;
- DijkstraSettings settings_;
- const size_t max_vertex_number_;
-
- // changeable parameters
- bool finished_;
- size_t vertex_number_;
- bool vertex_limit_exceeded_;
-
- // accumulative structures
- distances_map distances_;
- std::set<VertexId> processed_vertices_;
- std::map<VertexId, pair<VertexId, EdgeId>> prev_vert_map_;
-
- void Init(VertexId start, queue_t &queue) {
- vertex_number_ = 0;
- distances_.clear();
- processed_vertices_.clear();
- prev_vert_map_.clear();
- set_finished(false);
- settings_.Init(start);
- queue.push(element_t<Graph, distance_t>(0, start, VertexId(0), EdgeId(0)));
- prev_vert_map_[start] = std::pair<VertexId, EdgeId>(VertexId(0), EdgeId(0));
- }
-
- void set_finished(bool state) {
- finished_ = state;
- }
-
- bool CheckPutVertex(VertexId vertex, EdgeId edge, distance_t length) const {
- return settings_.CheckPutVertex(vertex, edge, length);
- }
-
- bool CheckProcessVertex(VertexId vertex, distance_t distance) {
- ++vertex_number_;
- if (vertex_number_ > max_vertex_number_) {
- vertex_limit_exceeded_ = true;
- return false;
- }
- return (vertex_number_ < max_vertex_number_) && settings_.CheckProcessVertex(vertex, distance);
- }
-
- distance_t GetLength(EdgeId edge) const {
- return settings_.GetLength(edge);
- }
-
- void AddNeighboursToQueue(VertexId cur_vertex, distance_t cur_dist, queue_t& queue) {
- auto neigh_iterator = settings_.GetIterator(cur_vertex);
- while (neigh_iterator.HasNext()) {
- TRACE("Checking new neighbour of vertex " << graph_.str(cur_vertex) << " started");
- auto cur_pair = neigh_iterator.Next();
- if (!DistanceCounted(cur_pair.vertex)) {
- TRACE("Adding new entry to queue");
- distance_t new_dist = GetLength(cur_pair.edge) + cur_dist;
- TRACE("Entry: vertex " << graph_.str(cur_vertex) << " distance " << new_dist);
- if (CheckPutVertex(cur_pair.vertex, cur_pair.edge, new_dist)) {
- TRACE("CheckPutVertex returned true and new entry is added");
- queue.push(element_t<Graph, distance_t>(new_dist, cur_pair.vertex,
- cur_vertex, cur_pair.edge));
- }
- }
- TRACE("Checking new neighbour of vertex " << graph_.str(cur_vertex) << " finished");
- }
- TRACE("All neighbours of vertex " << graph_.str(cur_vertex) << " processed");
- }
-
-public:
- Dijkstra(const Graph &graph, DijkstraSettings settings, size_t max_vertex_number = size_t(-1)) :
- graph_(graph),
- settings_(settings),
- max_vertex_number_(max_vertex_number),
- finished_(false),
- vertex_number_(0),
- vertex_limit_exceeded_(false) {}
-
- Dijkstra(Dijkstra&& /*other*/) = default;
-
- Dijkstra& operator=(Dijkstra&& /*other*/) = default;
-
- Dijkstra(const Dijkstra& /*other*/) = delete;
-
- Dijkstra& operator=(const Dijkstra& /*other*/) = delete;
-
- bool finished() const {
- return finished_;
- }
-
- bool DistanceCounted(VertexId vertex) const {
- return distances_.find(vertex) != distances_.end();
- }
-
- distance_t GetDistance(VertexId vertex) const {
- VERIFY(DistanceCounted(vertex));
- return distances_.find(vertex)->second;
- }
-
- std::pair<distances_map_ci, distances_map_ci> GetDistances() const {
- distances_map_ci begin = distances_.begin();
- distances_map_ci end = distances_.end();
- return make_pair(begin, end);
- }
-
- void Run(VertexId start) {
- TRACE("Starting dijkstra run from vertex " << graph_.str(start));
- queue_t queue;
- Init(start, queue);
- TRACE("Priority queue initialized. Starting search");
-
- while (!queue.empty() && !finished()) {
- TRACE("Dijkstra iteration started");
- const element_t<Graph, distance_t>& next = queue.top();
- distance_t distance = next.distance;
- VertexId vertex = next.curr_vertex;
-
- prev_vert_map_[vertex] = std::pair<VertexId, EdgeId>(next.prev_vertex, next.edge_between);
- queue.pop();
- TRACE("Vertex " << graph_.str(vertex) << " with distance " << distance << " fetched from queue");
-
- if (DistanceCounted(vertex)) {
- TRACE("Distance to vertex " << graph_.str(vertex) << " already counted. Proceeding to next queue entry.");
- continue;
- }
- distances_.insert(make_pair(vertex, distance));
-
- TRACE("Vertex " << graph_.str(vertex) << " is found to be at distance "
- << distance << " from vertex " << graph_.str(start));
- if (!CheckProcessVertex(vertex, distance)) {
- TRACE("Check for processing vertex failed. Proceeding to the next queue entry.");
- continue;
- }
- processed_vertices_.insert(vertex);
- AddNeighboursToQueue(vertex, distance, queue);
- }
- set_finished(true);
- TRACE("Finished dijkstra run from vertex " << graph_.str(start));
- }
-
- std::vector<EdgeId> GetShortestPathTo(VertexId vertex) {
- std::vector<EdgeId> path;
- if (prev_vert_map_.find(vertex) == prev_vert_map_.end())
- return path;
-
- VertexId curr_vertex = vertex;
- VertexId prev_vertex = get(prev_vert_map_, vertex).first;
- EdgeId edge = get(prev_vert_map_, curr_vertex).second;
-
- while (prev_vertex != VertexId(0)) {
- if (graph_.EdgeStart(edge) == prev_vertex)
- path.insert(path.begin(), edge);
- else
- path.push_back(edge);
- curr_vertex = prev_vertex;
- const auto& prev_v_e = get(prev_vert_map_, curr_vertex);
- prev_vertex = prev_v_e.first;
- edge = prev_v_e.second;
- }
- return path;
- }
-
- vector<VertexId> ReachedVertices() const {
- vector<VertexId> result;
- for (auto it = distances_.begin(); it != distances_.end(); ++it) {
- result.push_back(it->first);
- }
- return result;
- }
-
- const set<VertexId>& ProcessedVertices() const {
- return processed_vertices_;
- }
-
- bool VertexLimitExceeded() const {
- return vertex_limit_exceeded_;
- }
-
-private:
- DECL_LOGGER("Dijkstra");
-};
-
-template<class Graph>
-class DistanceCounter {
- typedef typename Graph::VertexId VertexId;
- typedef typename Graph::EdgeId EdgeId;
- typedef ComposedDijkstraSettings<Graph,
- LengthCalculator<Graph>,
- VertexProcessChecker<Graph>,
- VertexPutChecker<Graph>,
- ForwardNeighbourIteratorFactory<Graph>> BaseDijkstraSettings;
-
-public:
- DistanceCounter(const Graph& graph) :
- graph_(graph),
- dijkstra_(graph, BaseDijkstraSettings(
-            LengthCalculator<Graph>(graph),
- VertexProcessChecker<Graph>(),
- VertexPutChecker<Graph>(),
-            ForwardNeighbourIteratorFactory<Graph>(graph))),
- ready_(false) {
- }
-
- bool IsReachable(VertexId from, VertexId to) {
- EnsureFrom(from);
- return dijkstra_.DistanceCounted(to);
- }
-
- size_t Distance(VertexId from, VertexId to) {
- EnsureFrom(from);
- return dijkstra_.GetDistance(to);
- }
-
-private:
- void EnsureFrom(VertexId from) {
- if (!ready_ || prev_ != from) {
-      dijkstra_.Run(from);
- ready_ = true;
- prev_ = from;
- }
- }
-
- const Graph& graph_;
- Dijkstra<Graph, BaseDijkstraSettings> dijkstra_;
- VertexId prev_;
- bool ready_;
-};
-
-}
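For reference, the Dijkstra template above is driven by composing the four policy objects declared further down in dijkstra_settings.hpp. A minimal sketch of a run with plain forward settings, assuming a concrete Graph type `g` modelling the omnigraph interface and two of its vertices `from` and `to` (the typedef name `Settings` and the unused locals are illustrative only, not part of the sources):

    typedef omnigraph::ComposedDijkstraSettings<Graph,
            omnigraph::LengthCalculator<Graph>,
            omnigraph::VertexProcessChecker<Graph>,
            omnigraph::VertexPutChecker<Graph>,
            omnigraph::ForwardNeighbourIteratorFactory<Graph>> Settings;

    Settings settings(omnigraph::LengthCalculator<Graph>(g),
                      omnigraph::VertexProcessChecker<Graph>(),
                      omnigraph::VertexPutChecker<Graph>(),
                      omnigraph::ForwardNeighbourIteratorFactory<Graph>(g));
    omnigraph::Dijkstra<Graph, Settings> dijkstra(g, settings);

    dijkstra.Run(from);                              // single-source search from `from`
    if (dijkstra.DistanceCounted(to)) {
        size_t dist = dijkstra.GetDistance(to);      // distance under the configured LengthCalculator
        auto path = dijkstra.GetShortestPathTo(to);  // edges of one shortest path towards `to`
        // ... use dist and path ...
    }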
diff --git a/src/include/omni/dijkstra_tools/dijkstra_helper.hpp b/src/include/omni/dijkstra_tools/dijkstra_helper.hpp
deleted file mode 100644
index 01505b7..0000000
--- a/src/include/omni/dijkstra_tools/dijkstra_helper.hpp
+++ /dev/null
@@ -1,163 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-#include "dijkstra_algorithm.hpp"
-
-namespace omnigraph {
-
-template<class Graph>
-class DijkstraHelper {
- typedef typename Graph::VertexId VertexId;
- typedef typename Graph::EdgeId EdgeId;
-public:
- typedef Dijkstra<Graph, ComposedDijkstraSettings<Graph,
- LengthCalculator<Graph>,
- VertexProcessChecker<Graph>,
- VertexPutChecker<Graph>,
- UnorientedNeighbourIteratorFactory<Graph> > > UnorientedDijkstra;
-
- //------------------------------
-
- typedef Dijkstra<Graph, ComposedDijkstraSettings<Graph,
- LengthCalculator<Graph>,
- VertexProcessChecker<Graph>,
- VertexPutChecker<Graph>,
- BackwardNeighbourIteratorFactory<Graph> > > BackwardDijkstra;
-
- //------------------------------
- // bounded dijkstra
- //------------------------------
- typedef ComposedDijkstraSettings<Graph,
- LengthCalculator<Graph>,
- BoundProcessChecker<Graph>,
- BoundPutChecker<Graph>,
- ForwardNeighbourIteratorFactory<Graph> > BoundedDijkstraSettings;
-
- typedef Dijkstra<Graph, BoundedDijkstraSettings> BoundedDijkstra;
-
- static BoundedDijkstra CreateBoundedDijkstra(const Graph &graph, size_t length_bound,
- size_t max_vertex_number = -1ul){
- return BoundedDijkstra(graph, BoundedDijkstraSettings(
- LengthCalculator<Graph>(graph),
- BoundProcessChecker<Graph>(length_bound),
- BoundPutChecker<Graph>(length_bound),
- ForwardNeighbourIteratorFactory<Graph>(graph)),
- max_vertex_number);
- }
-
- //------------------------------
- // bounded backward dijkstra
- //------------------------------
-
- typedef ComposedDijkstraSettings<Graph,
- LengthCalculator<Graph>,
- BoundProcessChecker<Graph>,
- BoundPutChecker<Graph>,
- BackwardNeighbourIteratorFactory<Graph> > BackwardBoundedDijkstraSettings;
-
- typedef Dijkstra<Graph, BackwardBoundedDijkstraSettings> BackwardBoundedDijkstra;
-
- static BackwardBoundedDijkstra CreateBackwardBoundedDijkstra(const Graph &graph,
- size_t bound, size_t max_vertex_number = size_t(-1)){
- return BackwardBoundedDijkstra(graph, BackwardBoundedDijkstraSettings(
- LengthCalculator<Graph>(graph),
- BoundProcessChecker<Graph>(bound),
- BoundPutChecker<Graph>(bound),
- BackwardNeighbourIteratorFactory<Graph>(graph)), max_vertex_number);
- }
-
- //------------------------------
-
- typedef Dijkstra<Graph, ComposedDijkstraSettings<Graph,
- LengthCalculator<Graph>,
- VertexProcessChecker<Graph>,
- EdgeComponentPutChecker<Graph>,
- UnorientedNeighbourIteratorFactory<Graph> > > ComponentFinder;
- //------------------------------
-
- typedef Dijkstra<Graph, ComposedDijkstraSettings<Graph,
- ComponentLenCalculator<Graph>,
- BoundProcessChecker<Graph>,
- VertexPutChecker<Graph>,
- UnorientedNeighbourIteratorFactory<Graph> > > NeighbourhoodFinder;
- //------------------------------
-
- typedef Dijkstra<Graph, ComposedDijkstraSettings<Graph,
- LengthCalculator<Graph>,
- VertexProcessChecker<Graph>,
- SubgraphPutChecker<Graph>,
- UnorientedNeighbourIteratorFactory<Graph> > > SubgraphDijkstra;
-
- typedef ComposedDijkstraSettings<Graph,
- PathIgnoringLengthCalculator<Graph>,
- BoundProcessChecker<Graph>,
- BoundPutChecker<Graph>,
- ForwardNeighbourIteratorFactory<Graph> > PathIgnoringDijkstraSettings;
-
-
- //------------------------------
- // short edge dijkstra settings
- //------------------------------
- typedef ComposedDijkstraSettings<Graph,
- BoundedEdgeLenCalculator<Graph>,
- ZeroLengthProcessChecker<Graph>,
- VertexPutChecker<Graph>,
- UnorientedNeighbourIteratorFactory<Graph> > ShortEdgeDijkstraSettings;
-
- typedef Dijkstra<Graph, ShortEdgeDijkstraSettings> ShortEdgeDijkstra;
-
- static ShortEdgeDijkstra CreateShortEdgeDijkstra(const Graph &graph, size_t edge_length_bound,
- size_t max_vertex_number = size_t(-1)){
- return ShortEdgeDijkstra(graph, ShortEdgeDijkstraSettings(
- BoundedEdgeLenCalculator<Graph>(graph, edge_length_bound),
- ZeroLengthProcessChecker<Graph>(),
- VertexPutChecker<Graph>(),
- UnorientedNeighbourIteratorFactory<Graph>(graph)),
- max_vertex_number);
- }
-
- //------------------------------
- // counting dijkstra
- //------------------------------
- typedef CountingDijkstraSettings<Graph,
- UnorientedNeighbourIteratorFactory<Graph> > UnorientCountingDijkstraSettings;
-
- typedef Dijkstra<Graph, UnorientCountingDijkstraSettings> CountingDijkstra;
-
- static CountingDijkstra CreateCountingDijkstra(const Graph &graph, size_t max_size,
- size_t edge_length_bound, size_t max_vertex_number = size_t(-1)){
- return CountingDijkstra(graph, UnorientCountingDijkstraSettings(graph,
- UnorientedNeighbourIteratorFactory<Graph>(graph),
- max_size, edge_length_bound), max_vertex_number);
- }
-
-
- //------------------------------
- // targeted bounded dijkstra
- //------------------------------
-
- typedef ComposedDijkstraSettings<Graph,
- LengthCalculator<Graph>,
- BoundedVertexTargetedProcessChecker<Graph>,
- BoundPutChecker<Graph>,
- ForwardNeighbourIteratorFactory<Graph> > TargeredBoundedDijkstraSettings;
-
- typedef Dijkstra<Graph, TargeredBoundedDijkstraSettings> TargeredBoundedDijkstra;
-
- static TargeredBoundedDijkstra CreateTargeredBoundedDijkstra(const Graph &graph,
- VertexId target_vertex, size_t bound, size_t max_vertex_number = size_t(-1)){
- return TargeredBoundedDijkstra(graph,
- TargeredBoundedDijkstraSettings(LengthCalculator<Graph>(graph),
- BoundedVertexTargetedProcessChecker<Graph>(target_vertex, bound),
- BoundPutChecker<Graph>(bound),
- ForwardNeighbourIteratorFactory<Graph>(graph)),
- max_vertex_number);
- }
-};
-
-}
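These factory methods are the usual entry point rather than hand-built settings; for example, the ThornCondition check in erroneous_connection_remover.hpp below uses CreateBoundedDijkstra in exactly this way. A hedged sketch of a bounded forward search, where the graph `g`, the start vertex `v` and the 1000 bp bound are assumptions of the example:

    auto dijkstra = omnigraph::DijkstraHelper<Graph>::CreateBoundedDijkstra(g, /*length_bound*/ 1000);
    dijkstra.Run(v);
    for (auto u : dijkstra.ReachedVertices()) {
        size_t d = dijkstra.GetDistance(u);   // distance from v, limited by the length bound
        // ... use u and d ...
    }
    if (dijkstra.VertexLimitExceeded()) {
        // the optional max_vertex_number cap truncated the search
    }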
diff --git a/src/include/omni/dijkstra_tools/dijkstra_settings.hpp b/src/include/omni/dijkstra_tools/dijkstra_settings.hpp
deleted file mode 100644
index 38897b9..0000000
--- a/src/include/omni/dijkstra_tools/dijkstra_settings.hpp
+++ /dev/null
@@ -1,117 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include "length_calculator.hpp"
-#include "vertex_process_checker.hpp"
-#include "vertex_put_checker.hpp"
-#include "neighbours_iterator.hpp"
-
-namespace omnigraph {
-
-template<class Graph,
- class LengthCalculator,
- class VertexProcessChecker,
- class VertexPutChecker,
- class NeighbourIteratorFactory,
- typename distance_t = size_t>
-class ComposedDijkstraSettings {
- typedef typename Graph::VertexId VertexId;
- typedef typename Graph::EdgeId EdgeId;
-
- LengthCalculator len_calc_;
- VertexProcessChecker vert_proc_checker_;
- VertexPutChecker vert_put_checker_;
- NeighbourIteratorFactory neigh_iter_factory_;
-
-public:
- typedef LengthCalculator LC;
- typedef VertexProcessChecker VPrC;
- typedef VertexPutChecker VPuC;
- typedef NeighbourIteratorFactory NIF;
-
- ComposedDijkstraSettings(LengthCalculator len_calc,
- VertexProcessChecker vert_proc_checker,
- VertexPutChecker vert_put_checker,
- NeighbourIteratorFactory neigh_iter_factory) :
- len_calc_(len_calc),
- vert_proc_checker_(vert_proc_checker),
- vert_put_checker_(vert_put_checker),
- neigh_iter_factory_(neigh_iter_factory) { }
-
- void Init(VertexId /*vertex*/){
- }
-
- distance_t GetLength(EdgeId edge) const{
- return len_calc_.GetLength(edge);
- }
-
- bool CheckProcessVertex(VertexId vertex, distance_t distance){
- return vert_proc_checker_.Check(vertex, distance);
- }
-
- bool CheckPutVertex(VertexId vertex, EdgeId edge, distance_t length) const{
- return vert_put_checker_.Check(vertex, edge, length);
- }
-
- typename NeighbourIteratorFactory::NeighbourIterator GetIterator(VertexId vertex) {
- return neigh_iter_factory_.CreateIterator(vertex);
- }
-};
-
-template<class Graph, class NeighbourIteratorFactory, typename distance_t = size_t>
-class CountingDijkstraSettings {
- typedef typename Graph::VertexId VertexId;
- typedef typename Graph::EdgeId EdgeId;
-
- const Graph &graph_;
-
- NeighbourIteratorFactory neigh_iter_factory_;
- static const distance_t inf = 100000000;
- const size_t max_size_;
- const size_t edge_length_bound_;
- mutable size_t current_;
-
-public:
- CountingDijkstraSettings(const Graph &graph,
- NeighbourIteratorFactory neigh_iter_factory,
- size_t max_size, size_t edge_length_bound) :
- graph_(graph),
- neigh_iter_factory_(neigh_iter_factory),
- max_size_(max_size),
- edge_length_bound_(edge_length_bound),
- current_(0) { }
-
- void Init(VertexId /*vertex*/){
- current_ = 0;
- }
-
- distance_t GetLength(EdgeId edge) const{
- if (graph_.length(edge) <= edge_length_bound_)
- return graph_.length(edge);
- return inf;
- }
-
- bool CheckProcessVertex(VertexId , distance_t ){
- return current_ < max_size_;
- }
-
- bool CheckPutVertex(VertexId , EdgeId edge, distance_t ) const{
- if (current_ < max_size_)
- ++current_;
- if (current_ < max_size_ && GetLength(edge) < inf)
- return true;
- return false;
- }
-
- typename NeighbourIteratorFactory::NeighbourIterator GetIterator(VertexId vertex) {
- return neigh_iter_factory_.CreateIterator(vertex);
- }
-};
-
-}
diff --git a/src/include/omni/dijkstra_tools/length_calculator.hpp b/src/include/omni/dijkstra_tools/length_calculator.hpp
deleted file mode 100644
index 36f5ae5..0000000
--- a/src/include/omni/dijkstra_tools/length_calculator.hpp
+++ /dev/null
@@ -1,112 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-#include "standard_base.hpp"
-
-namespace omnigraph {
-
-template<class Graph, typename distance_t = size_t>
-class LengthCalculator {
- typedef typename Graph::VertexId VertexId;
- typedef typename Graph::EdgeId EdgeId;
-protected:
- const Graph &graph_;
-public:
- LengthCalculator(const Graph &graph) : graph_(graph) { }
- virtual distance_t GetLength(EdgeId edge) const{
- return distance_t(graph_.length(edge));
- }
- virtual ~LengthCalculator() { }
-};
-
-template<class Graph, typename distance_t = size_t>
-class ComponentLenCalculator : public LengthCalculator<Graph, distance_t> {
- typedef typename Graph::VertexId VertexId;
- typedef typename Graph::EdgeId EdgeId;
- set<EdgeId> &component_;
-public:
- ComponentLenCalculator(const Graph &graph, set<EdgeId> &component) :
- LengthCalculator<Graph, distance_t>(graph), component_(component) { }
-
- distance_t GetLength(EdgeId edge) const{
- if (component_.count(edge) != 0)
- return 0;
- return this->graph_.length(edge);
- }
-};
-
-template<class Graph, typename distance_t = size_t>
-class BoundedEdgeLenCalculator : public LengthCalculator<Graph, distance_t> {
- typedef typename Graph::VertexId VertexId;
- typedef typename Graph::EdgeId EdgeId;
- distance_t bound_;
-public:
- BoundedEdgeLenCalculator(const Graph &graph, distance_t bound) :
- LengthCalculator<Graph, distance_t>(graph), bound_(bound) { }
-
- distance_t GetLength(EdgeId edge) const{
- if(this->graph_.length(edge) <= bound_)
- return 0;
- return 1;
- }
-};
-
-template<class Graph, typename distance_t = size_t>
-class AlongPathLengthCalculator : public LengthCalculator<Graph, distance_t> {
- typedef LengthCalculator<Graph, distance_t> base;
- typedef typename Graph::VertexId VertexId;
- typedef typename Graph::EdgeId EdgeId;
- set<VertexId> vertex_path_;
- distance_t bound_;
-
- set<VertexId> CollectVertices(vector<EdgeId> &edge_path){
- set<VertexId> result;
- for(auto e = edge_path.begin(); e != edge_path.end(); e++){
- result.insert(this->graph_.EdgeStart(*e));
- result.insert(this->graph_.EdgeEnd(*e));
- }
- return result;
- }
-
-public:
- AlongPathLengthCalculator(const Graph &graph, vector<EdgeId> &edge_path, distance_t bound) :
- LengthCalculator<Graph, distance_t>(graph),
- vertex_path_(CollectVertices(edge_path)),
- bound_(bound) { }
-
- distance_t GetLength(EdgeId edge) const{
- if (vertex_path_.count(this->graph_.EdgeStart(edge))
- && vertex_path_.count(this->graph_.EdgeEnd(edge)))
- return min(int(base::GetLength(edge)), 200);
- return base::GetLength(edge);
- }
-};
-
-template<class Graph, typename distance_t = size_t>
-class PathIgnoringLengthCalculator : public LengthCalculator<Graph, distance_t> {
- typedef LengthCalculator<Graph, distance_t> base;
- typedef typename Graph::VertexId VertexId;
- typedef typename Graph::EdgeId EdgeId;
- set<EdgeId> path_;
- distance_t bound_;
-
-public:
- PathIgnoringLengthCalculator(const Graph &graph, const vector<EdgeId> &edge_path) :
- LengthCalculator<Graph, distance_t>(graph), path_(edge_path.begin(), edge_path.end())
- { }
-
- distance_t GetLength(EdgeId edge) const {
- if (path_.find(edge) != path_.end()) {
- return 0;
- }
- return base::GetLength(edge);
- }
-};
-
-
-}
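These calculators are the extension point that redefines what "length" means for a given search. A purely hypothetical example (not present in the sources) that caps the contribution of any single edge, usable wherever the built-in calculators are plugged into ComposedDijkstraSettings:

    template<class Graph, typename distance_t = size_t>
    class CappedLengthCalculator : public omnigraph::LengthCalculator<Graph, distance_t> {
        typedef typename Graph::EdgeId EdgeId;
        distance_t cap_;
    public:
        CappedLengthCalculator(const Graph &graph, distance_t cap)
                : omnigraph::LengthCalculator<Graph, distance_t>(graph), cap_(cap) { }

        distance_t GetLength(EdgeId edge) const override {
            // never let a single edge contribute more than cap_ to a path length
            distance_t len = distance_t(this->graph_.length(edge));
            return len < cap_ ? len : cap_;
        }
    };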
diff --git a/src/include/omni/dijkstra_tools/neighbours_iterator.hpp b/src/include/omni/dijkstra_tools/neighbours_iterator.hpp
deleted file mode 100644
index 8c34823..0000000
--- a/src/include/omni/dijkstra_tools/neighbours_iterator.hpp
+++ /dev/null
@@ -1,164 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-namespace omnigraph {
-
-template<class Graph>
-struct vertex_neighbour {
-protected:
- typedef typename Graph::VertexId VertexId;
- typedef typename Graph::EdgeId EdgeId;
-public:
- VertexId vertex;
- EdgeId edge;
-
- vertex_neighbour(VertexId new_vertex, EdgeId new_edge) :
- vertex(new_vertex),
- edge(new_edge) { }
-};
-
-template<class Graph>
-class NeighbourIterator {
- typedef typename Graph::VertexId VertexId;
- typedef typename Graph::EdgeId EdgeId;
-protected:
- const Graph &graph_;
- VertexId vertex_;
-public:
- NeighbourIterator(const Graph &graph, VertexId vertex) :
- graph_(graph),
- vertex_(vertex) { }
-
- virtual bool HasNext() = 0;
- virtual vertex_neighbour<Graph> Next() = 0;
- virtual ~NeighbourIterator() { }
-};
-
-template<class Graph>
-class ForwardNeighbourIterator : public NeighbourIterator<Graph>{
- typedef typename Graph::VertexId VertexId;
- typedef typename Graph::EdgeId EdgeId;
- typedef typename VertexId::type::edge_const_iterator edge_const_iterator;
-
- pair<edge_const_iterator, edge_const_iterator> out_edges_;
-public:
- ForwardNeighbourIterator(const Graph &graph, VertexId vertex) :
- NeighbourIterator<Graph>(graph, vertex),
- out_edges_(make_pair(graph.OutgoingEdges(vertex).begin(),
- graph.OutgoingEdges(vertex).end())) { }
-
- bool HasNext(){
- return out_edges_.first != out_edges_.second;
- }
-
- vertex_neighbour<Graph> Next() {
- vertex_neighbour<Graph> res(this->graph_.EdgeEnd(*out_edges_.first), *out_edges_.first);
- out_edges_.first++;
- return res;
- }
-};
-
-template<class Graph>
-class BackwardNeighbourIterator : public NeighbourIterator<Graph>{
- typedef typename Graph::VertexId VertexId;
- typedef typename Graph::EdgeId EdgeId;
- typedef typename VertexId::type::edge_const_iterator edge_const_iterator;
-
- pair<edge_const_iterator, edge_const_iterator> in_edges_;
-public:
- BackwardNeighbourIterator(const Graph &graph, VertexId vertex) :
- NeighbourIterator<Graph>(graph, vertex),
- in_edges_(make_pair(graph.IncomingEdges(vertex).begin(),
- graph.IncomingEdges(vertex).end())) { }
-
- bool HasNext(){
- return in_edges_.first != in_edges_.second;
- }
-
- vertex_neighbour<Graph> Next() {
- vertex_neighbour<Graph> res(this->graph_.EdgeStart(*in_edges_.first), *in_edges_.first);
- in_edges_.first++;
- return res;
- }
-};
-
-template<class Graph>
-class UnorientedNeighbourIterator : public NeighbourIterator<Graph>{
- typedef typename Graph::VertexId VertexId;
- typedef typename Graph::EdgeId EdgeId;
- typedef typename VertexId::type::edge_const_iterator edge_const_iterator;
-
- pair<edge_const_iterator, edge_const_iterator> in_edges_;
- pair<edge_const_iterator, edge_const_iterator> out_edges_;
-public:
- UnorientedNeighbourIterator(const Graph &graph, VertexId vertex) :
- NeighbourIterator<Graph>(graph, vertex),
- in_edges_(make_pair(graph.IncomingEdges(vertex).begin(),
- graph.IncomingEdges(vertex).end())),
- out_edges_(make_pair(graph.OutgoingEdges(vertex).begin(),
- graph.OutgoingEdges(vertex).end())) { }
-
-  bool HasNext(){
-    // neighbours remain while either the outgoing or the incoming range is unexhausted
-    return in_edges_.first != in_edges_.second || out_edges_.first != out_edges_.second;
-  }
-
- // first all outgoing edges are visited
- // then all incoming
- vertex_neighbour<Graph> Next() {
- if(out_edges_.first != out_edges_.second){
- vertex_neighbour<Graph> res(this->graph_.EdgeEnd(*out_edges_.first), *out_edges_.first);
- out_edges_.first++;
- return res;
- }
- vertex_neighbour<Graph> res(this->graph_.EdgeStart(*in_edges_.first), *in_edges_.first);
- in_edges_.first++;
- return res;
- }
-};
-
-template<class Graph>
-class ForwardNeighbourIteratorFactory {
- typedef typename Graph::VertexId VertexId;
- typedef typename Graph::EdgeId EdgeId;
- const Graph &graph_;
-public:
- typedef ForwardNeighbourIterator<Graph> NeighbourIterator;
- ForwardNeighbourIteratorFactory(const Graph &graph) : graph_(graph) { }
- NeighbourIterator CreateIterator(VertexId vertex){
- return NeighbourIterator(graph_, vertex);
- }
-};
-
-template<class Graph>
-class BackwardNeighbourIteratorFactory {
- typedef typename Graph::VertexId VertexId;
- typedef typename Graph::EdgeId EdgeId;
- const Graph &graph_;
-public:
- typedef BackwardNeighbourIterator<Graph> NeighbourIterator;
- BackwardNeighbourIteratorFactory(const Graph &graph) : graph_(graph) { }
- NeighbourIterator CreateIterator(VertexId vertex){
- return NeighbourIterator(graph_, vertex);
- }
-};
-
-template<class Graph>
-class UnorientedNeighbourIteratorFactory {
- typedef typename Graph::VertexId VertexId;
- typedef typename Graph::EdgeId EdgeId;
- const Graph &graph_;
-public:
- typedef UnorientedNeighbourIterator<Graph> NeighbourIterator;
- UnorientedNeighbourIteratorFactory(const Graph &graph) : graph_(graph) { }
- NeighbourIterator CreateIterator(VertexId vertex){
- return NeighbourIterator(graph_, vertex);
- }
-};
-
-}
diff --git a/src/include/omni/dijkstra_tools/vertex_process_checker.hpp b/src/include/omni/dijkstra_tools/vertex_process_checker.hpp
deleted file mode 100644
index 4cddc98..0000000
--- a/src/include/omni/dijkstra_tools/vertex_process_checker.hpp
+++ /dev/null
@@ -1,72 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-namespace omnigraph {
-
-template<class Graph, typename distance_t = size_t>
-class VertexProcessChecker {
- typedef typename Graph::VertexId VertexId;
- typedef typename Graph::EdgeId EdgeId;
-public:
- VertexProcessChecker() {}
- virtual bool Check(VertexId, distance_t) { return true; }
- virtual ~VertexProcessChecker() {}
-};
-
-template<class Graph, typename distance_t = size_t>
-class BoundProcessChecker : public VertexProcessChecker<Graph, distance_t> {
- typedef typename Graph::VertexId VertexId;
- typedef typename Graph::EdgeId EdgeId;
- const distance_t distance_bound_;
-public:
- BoundProcessChecker(distance_t distance_bound) :
- distance_bound_(distance_bound) {}
-
- bool Check(VertexId, distance_t distance) override {
- return distance <= distance_bound_;
- }
-};
-
-template<class Graph, typename distance_t = size_t>
-class ZeroLengthProcessChecker : public VertexProcessChecker<Graph, distance_t> {
- typedef typename Graph::VertexId VertexId;
- typedef typename Graph::EdgeId EdgeId;
-public:
- ZeroLengthProcessChecker() {}
-
- bool Check(VertexId, distance_t distance) override {
- return distance == 0;
- }
-};
-
-template<class Graph, typename distance_t = size_t>
-class BoundedVertexTargetedProcessChecker : public BoundProcessChecker<Graph, distance_t> {
- typedef BoundProcessChecker<Graph, distance_t> base;
- typedef typename Graph::VertexId VertexId;
- typedef typename Graph::EdgeId EdgeId;
-
- VertexId target_vertex_;
- bool target_reached_;
-public:
- BoundedVertexTargetedProcessChecker(VertexId target_vertex, size_t bound) :
- base(bound),
- target_vertex_(target_vertex),
- target_reached_(false) { }
-
- bool Check(VertexId vertex, distance_t distance) override {
- if (vertex == target_vertex_)
- target_reached_ = true;
- if (target_reached_)
- return false;
- else
- return base::Check(vertex, distance);
- }
-};
-
-}
diff --git a/src/include/omni/dijkstra_tools/vertex_put_checker.hpp b/src/include/omni/dijkstra_tools/vertex_put_checker.hpp
deleted file mode 100644
index c02ddfc..0000000
--- a/src/include/omni/dijkstra_tools/vertex_put_checker.hpp
+++ /dev/null
@@ -1,63 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-namespace omnigraph {
-
-template<class Graph, typename distance_t = size_t>
-class VertexPutChecker {
- typedef typename Graph::VertexId VertexId;
- typedef typename Graph::EdgeId EdgeId;
-public:
- VertexPutChecker() { }
- virtual bool Check(VertexId, EdgeId, distance_t) const{ return true; }
- virtual ~VertexPutChecker() { }
-};
-
-template<class Graph, typename distance_t = size_t>
-class EdgeComponentPutChecker : public VertexPutChecker<Graph, distance_t> {
- typedef typename Graph::VertexId VertexId;
- typedef typename Graph::EdgeId EdgeId;
-
- set<EdgeId> &edges_;
-public:
- EdgeComponentPutChecker(set<EdgeId> &edges) : VertexPutChecker<Graph, distance_t>(), edges_(edges) { }
- bool Check(VertexId, EdgeId edge, distance_t) const{
- return edges_.count(edge) != 0;
- }
-};
-
-template<class Graph, typename distance_t = size_t>
-class SubgraphPutChecker : public VertexPutChecker<Graph, distance_t> {
- typedef typename Graph::VertexId VertexId;
- typedef typename Graph::EdgeId EdgeId;
-
- const set<VertexId> &subgraph_;
-public:
- SubgraphPutChecker(const set<VertexId>& subgraph) : VertexPutChecker<Graph, distance_t>(),
- subgraph_(subgraph) { }
- bool Check(VertexId vertex, EdgeId, distance_t) const{
- return subgraph_.count(vertex) != 0;
- }
-};
-
-template<class Graph, typename distance_t = size_t>
-class BoundPutChecker : public VertexPutChecker<Graph, distance_t> {
- typedef typename Graph::VertexId VertexId;
- typedef typename Graph::EdgeId EdgeId;
-
- const distance_t bound_;
-public:
- BoundPutChecker(distance_t bound) : VertexPutChecker<Graph, distance_t>(),
- bound_(bound) { }
- bool Check(VertexId, EdgeId, distance_t length) const{
- return length <= bound_;
- }
-};
-
-}
diff --git a/src/include/omni/edge_labels_handler.hpp b/src/include/omni/edge_labels_handler.hpp
deleted file mode 100644
index b6c638c..0000000
--- a/src/include/omni/edge_labels_handler.hpp
+++ /dev/null
@@ -1,222 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-/*
- * Maintains a labeling of new_graph edges by sequences of edges of the
- * unresolved graph (old_graph) across graph transformations, together with
- * the reverse mapping from old edges to the new edges containing them.
- *
- * Created on: Aug 5, 2011
- * Author: undead
- */
-
-#ifndef EDGE_LABELS_HANDLER_HPP_
-#define EDGE_LABELS_HANDLER_HPP_
-
-//#include "utils.hpp"
-#include "visualization/graph_labeler.hpp"
-#include "simple_tools.hpp"
-#include <unordered_map>
-#include <map>
-
-using namespace omnigraph;
-
-namespace omnigraph {
-using std::map;
-
-//todo ask Shurik to remove new_graph_
-template<class Graph>
-class EdgeLabelHandler : public GraphActionHandler<Graph> {
- typedef typename Graph::VertexId VertexId;
- typedef typename Graph::EdgeId EdgeId;
- private:
- Graph &new_graph_;
- Graph &old_graph_;
- //From new edge to sequence of old
- public:
- map<EdgeId, vector<EdgeId> > edge_labels;
- //From old edge to set of new ones, containing it.
- map<EdgeId, set<EdgeId> > edge_inclusions;
- public:
- //TODO: integrate this to resolver, remove "from_resolve" parameter
- EdgeLabelHandler(Graph &new_graph, Graph &old_graph,
- const std::map<EdgeId, EdgeId>& from_resolve)
- : GraphActionHandler<Graph>(new_graph, "EdgePositionHandler"),
- new_graph_(new_graph),
- old_graph_(old_graph) {
- // printing from resolve
- FillLabels(from_resolve);
- /* for(auto iter = from_resolve.begin(); iter != from_resolve.end(); ++iter) {
- if (edge_inclusions.find(iter->second) == edge_inclusions.end()){
- set<EdgeId> tmp;
- edge_inclusions.insert(make_pair(iter->second, tmp));
- }
- edge_inclusions[iter->second].insert(iter->first);
-
- if (edge_labels.find(iter->first) == edge_labels.end()) {
- set<EdgeId> tmp;
- edge_labels.insert(make_pair(iter->first, tmp));
- }
- edge_labels[iter->second].push_back(iter->second);
- }
- */}
- EdgeLabelHandler(Graph &new_graph, Graph &old_graph)
- : GraphActionHandler<Graph>(new_graph, "EdgePositionHandler"),
- new_graph_(new_graph),
- old_graph_(old_graph) {
- }
- void FillLabels(const map<EdgeId, EdgeId>& from_resolve) {
- for (auto iter = from_resolve.begin(); iter != from_resolve.end();
- ++iter) {
- if (edge_inclusions.find(iter->second) == edge_inclusions.end()) {
- set<EdgeId> tmp;
- edge_inclusions.insert(make_pair(iter->second, tmp));
- }
- edge_inclusions.find(iter->second)->second.insert(iter->first);
-
- if (edge_labels.find(iter->first) == edge_labels.end()) {
- vector<EdgeId> tmp;
- edge_labels.insert(make_pair(iter->first, tmp));
- }
- edge_labels[iter->first].push_back(iter->second);
- }
- }
-
- virtual ~EdgeLabelHandler() {
- }
-
- virtual void HandleGlue(EdgeId new_edge, EdgeId edge1, EdgeId edge2) {
- TRACE("Handle glue");
- if (edge_labels[edge1] != edge_labels[edge2])
-      WARN("gluing edges with different label sequences is not supported at this step; EdgeLabelHandler may produce inconsistent labels");
- vector<EdgeId> tmp;
-    // Transfer all labels of both glued edges to the new edge first and erase
-    // the old entries only after each loop: erasing inside the loop would make
-    // the edge_labels[...] lookup in the loop condition return a fresh empty
-    // vector and silently stop the transfer after the first label.
-    for (size_t i = 0; i < edge_labels[edge1].size(); i++) {
-      edge_inclusions.find(edge_labels[edge1][i])->second.insert(new_edge);
-      edge_inclusions.find(edge_labels[edge1][i])->second.erase(edge1);
-      tmp.push_back(edge_labels[edge1][i]);
-    }
-    edge_labels.erase(edge1);
-    for (size_t i = 0; i < edge_labels[edge2].size(); i++) {
-      edge_inclusions.find(edge_labels[edge2][i])->second.insert(new_edge);
-      edge_inclusions.find(edge_labels[edge2][i])->second.erase(edge2);
-    }
-    edge_labels.erase(edge2);
-
- edge_labels.insert(make_pair(new_edge, tmp));
-
- }
-
- virtual void HandleSplit(EdgeId /*oldEdge*/, EdgeId /*newEdge1*/, EdgeId /*newEdge2*/) {
- WARN("EdgesLabelHandler does not support splits");
- }
-
- virtual void HandleMerge(const vector<EdgeId>& oldEdges, EdgeId newEdge) {
- TRACE("HandleMerge by edge labels handler");
- size_t n = oldEdges.size();
- vector<EdgeId> tmp;
- for (size_t j = 0; j < n; j++) {
- TRACE( "Edge " << oldEdges[j] << " was labeled by " << edge_labels[oldEdges[j]]);
- for (size_t i = 0; i < edge_labels[oldEdges[j]].size(); i++) {
- edge_inclusions[edge_labels[oldEdges[j]][i]].insert(newEdge);
- edge_inclusions[edge_labels[oldEdges[j]][i]].erase(oldEdges[j]);
- tmp.push_back(edge_labels[oldEdges[j]][i]);
- }
- edge_labels.erase(oldEdges[j]);
- }
- if (edge_labels.find(newEdge) != edge_labels.end()) {
- DEBUG("Unexpected finding of new edge labels");
- };
- edge_labels[newEdge] = tmp;
-
- }
-
- /*
- virtual void HandleAdd(VertexId v) {
- AddVertexIntId(v);
- }
- virtual void HandleDelete(VertexId v) {
- ClearVertexId(v);
- }
- */
- virtual void HandleAdd(EdgeId e) {
- TRACE("Add edge " << e);
-
- }
- virtual void HandleDelete(EdgeId e) {
- for (size_t i = 0; i < edge_labels[e].size(); i++) {
- edge_inclusions[edge_labels[e][i]].erase(e);
- }
- edge_labels.erase(e);
- }
-
- std::string str(EdgeId edgeId) const {
- std::stringstream ss;
-
- auto it = edge_labels.find(edgeId);
- if (it != edge_labels.end()) {
- TRACE("Number of labels " << it->second.size());
- for (auto label_it = it->second.begin(), end = it->second.end();
- label_it != end; ++label_it) {
- ss << this->g().str(*label_it) << "\\n";
- }
- }
- return ss.str();
- }
- vector<pair<EdgeId, size_t> > resolvedPositions(EdgeId old_edge, size_t position_on_edge) {
- vector<pair<EdgeId, size_t> > res;
- for (auto it = edge_inclusions[old_edge].begin(); it!= edge_inclusions[old_edge].end(); it++) {
- EdgeId cur_edge = *it;
- size_t cur_shift = 0;
- for(size_t i = 0; i < edge_labels[cur_edge].size(); i++) {
- if (edge_labels[cur_edge][i] == old_edge) {
- res.push_back(make_pair(cur_edge, cur_shift + position_on_edge));
- }
- cur_shift += old_graph_.length(edge_labels[cur_edge][i]);
- }
- }
- return res;
- }
-
-};
-
-template<class Graph>
-class EdgesLabelsGraphLabeler : public GraphLabeler<Graph> {
-
- protected:
- typedef GraphLabeler<Graph> super;
- typedef typename super::EdgeId EdgeId;
- typedef typename super::VertexId VertexId;
- Graph& g_;
- public:
- EdgeLabelHandler<Graph>& EdgesLabels;
-
- EdgesLabelsGraphLabeler(Graph& g, EdgeLabelHandler<Graph>& EdgesLab)
- : g_(g),
- EdgesLabels(EdgesLab) {
- }
-
- virtual std::string label(VertexId vertexId) const {
- return g_.str(vertexId);
- }
-
- virtual std::string label(EdgeId edgeId) const {
- return EdgesLabels.str(edgeId) + ": " + g_.str(edgeId);
- }
- virtual ~EdgesLabelsGraphLabeler() {
- TRACE("~EdgesPosGraphLabeler");
- }
-
-}
-;
-}
-
-#endif /* EDGE_LABELS_HANDLER_HPP_ */
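A rough usage sketch of the handler above, with every name in it assumed rather than taken from the sources: two graphs `new_graph` and `old_graph`, a resolver map `from_resolve` from new edges to the old edges they came from, an old edge `e` being located in the resolved graph, and <iostream> available:

    omnigraph::EdgeLabelHandler<Graph> labels(new_graph, old_graph, from_resolve);
    // ... transformations of new_graph keep `labels` up to date via the handler callbacks ...
    for (const auto &p : labels.resolvedPositions(e, /*position_on_edge*/ 0)) {
        // p.first: a new_graph edge whose label sequence contains e
        // p.second: position_on_edge shifted by the lengths of the preceding labels
        std::cout << new_graph.str(p.first) << " @ " << p.second << std::endl;
    }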
diff --git a/src/include/omni/edges_position_handler.hpp b/src/include/omni/edges_position_handler.hpp
deleted file mode 100644
index df98943..0000000
--- a/src/include/omni/edges_position_handler.hpp
+++ /dev/null
@@ -1,208 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-/*
- * edges_position_handler.hpp
- *
- * Created on: 22.07.2011
- *
- */
-
-#ifndef EDGES_POSITION_HANDLER_HPP_
-#define EDGES_POSITION_HANDLER_HPP_
-
-//#include "utils.hpp"
-#include "visualization/graph_labeler.hpp"
-#include "simple_tools.hpp"
-#include "omni_utils.hpp"
-#include "mapping_path.hpp"
-#include "action_handlers.hpp"
-
-namespace omnigraph {
-
-struct EdgePosition {
- string contigId;
- MappingRange mr;
- EdgePosition(string _contigId, MappingRange _mr) : contigId(_contigId), mr(_mr) {
- }
-
- EdgePosition() {
- }
-};
-
-inline ostream& operator <<(ostream& os, const EdgePosition& ep) {
- return os << ep.contigId << " " << ep.mr;
-}
-
-template<class Graph>
-class EdgesPositionHandler: public GraphActionHandler<Graph> {
- typedef typename Graph::VertexId VertexId;
- typedef typename Graph::EdgeId EdgeId;
-
- size_t max_mapping_gap_;
- size_t max_gap_diff_;
- map<EdgeId, map<string, set<MappingRange>>> edges_positions_;
- //TODO extract set<MappingRange> as a storage class
-
- MappingRange EraseAndExtract(set<MappingRange> &ranges, set<MappingRange>::iterator &position, const MappingRange &new_pos) {
- auto &old_pos = *position;
- if(old_pos.IntersectLeftOf(new_pos) || old_pos.StrictlyContinuesWith(new_pos, max_mapping_gap_, max_gap_diff_)) {
- ranges.erase(position);
- return old_pos.Merge(new_pos);
- } else if(new_pos.IntersectLeftOf(old_pos) || new_pos.StrictlyContinuesWith(old_pos, max_mapping_gap_, max_gap_diff_)) {
- ranges.erase(position);
- return new_pos.Merge(old_pos);
- } else {
- return new_pos;
- }
- }
-
-public:
- MappingRange EraseAndExtract(set<MappingRange> &ranges, MappingRange new_pos) {
- auto it = ranges.lower_bound(new_pos);
- if(it != ranges.end()) {
- new_pos = EraseAndExtract(ranges, it, new_pos);
- it = ranges.lower_bound(new_pos);
- }
- if(it != ranges.begin()) {
- new_pos = EraseAndExtract(ranges, --it, new_pos);
- }
- return new_pos;
- }
-
- set<MappingRange> GetEdgePositions(EdgeId edge, string contig_id) const {
- VERIFY(this->IsAttached());
- auto edge_it = edges_positions_.find(edge);
- if(edge_it == edges_positions_.end())
- return set<MappingRange>();
- const auto& positions = edge_it->second;
- auto it = positions.find(contig_id);
- if(it == positions.end())
- return set<MappingRange>();
- else
- return it->second;
- }
-
- vector<EdgePosition> GetEdgePositions(EdgeId edge) const {
- VERIFY(this->IsAttached());
- auto edge_it = edges_positions_.find(edge);
- if(edge_it == edges_positions_.end())
- return vector<EdgePosition>();
- vector<EdgePosition> result;
- for(auto it = edge_it->second.begin(); it != edge_it->second.end(); ++it) {
- for(auto pos_it = it->second.begin(); pos_it != it->second.end(); ++pos_it) {
- result.push_back(EdgePosition(it->first, *pos_it));
- }
- }
- return result;
- }
-
- void AddEdgePosition(EdgeId edge, string contig_id, size_t start, size_t end, size_t m_start, size_t m_end) {
- VERIFY(this->IsAttached());
- AddEdgePosition(edge, contig_id, MappingRange(start, end, m_start, m_end));
- }
-
- void AddEdgePosition(EdgeId edge, string contig_id, MappingRange new_pos) {
- VERIFY(this->IsAttached());
- if(new_pos.empty())
- return;
- set<MappingRange> &new_set = edges_positions_[edge][contig_id];
- new_pos = EraseAndExtract(new_set, new_pos);
- new_set.insert(new_pos);
- }
-
- void AddAndShiftEdgePositions(EdgeId edge, const map<string, set<MappingRange>> &contig_map, int shift = 0) {
- VERIFY(this->IsAttached());
- for(auto contig_it = contig_map.begin(); contig_it != contig_map.end(); ++contig_it) {
- for(auto it = contig_it->second.begin(); it != contig_it->second.end(); ++it) {
- AddEdgePosition(edge, contig_it->first, it->Shift(shift).Fit(this->g().length(edge)));
- }
- }
- }
-
- template<typename Iter>
- void AddEdgePositions(EdgeId edge, Iter begin, Iter end) {
- VERIFY(this->IsAttached());
- for(auto it = begin; it != end; ++it) {
- AddEdgePosition(edge, it->contigId, it->mr);
- }
- }
-
- std::string str(EdgeId edge) const {
- VERIFY(this->IsAttached());
- std::stringstream ss;
- vector<EdgePosition> positions = GetEdgePositions(edge);
- for (auto pos_it = positions.begin(), end = positions.end(); pos_it != end; ++pos_it) {
- ss << "(" << pos_it->contigId << ": " << pos_it->mr << ")\\n";
- }
- return ss.str();
- }
-
- /**
- * @param max_mapping_gap - maximal difference in positions of
- * original sequence for two mapping ranges to be merged.
- * @param max_gap_diff - maximal difference between gaps in initial and mapped ranges for
- * mapping ranges to be merged
- */
- EdgesPositionHandler(const Graph &g, size_t max_mapping_gap, size_t max_gap_diff = 0) :
- GraphActionHandler<Graph>(g, "EdgePositionHandler"),
- max_mapping_gap_(max_mapping_gap),
- max_gap_diff_(max_gap_diff) {
- }
-
- virtual ~EdgesPositionHandler() {
- TRACE("~EdgePositionHandler ok");
- }
-
- virtual void HandleGlue(EdgeId new_edge, EdgeId edge1, EdgeId edge2) {
-// TRACE("Handle glue ");
- auto positions1 = GetEdgePositions(edge1);
- auto positions2 = GetEdgePositions(edge2);
- AddEdgePositions(new_edge, positions1.begin(), positions1.end());
- AddEdgePositions(new_edge, positions2.begin(), positions2.end());
- }
-
- virtual void HandleSplit(EdgeId oldEdge, EdgeId newEdge1, EdgeId newEdge2) {
- if (oldEdge == this->g().conjugate(oldEdge)) {
- WARN("EdgesPositionHandler does not support self-conjugate splits");
- return;
- }
- if (edges_positions_.count(oldEdge) != 0) {
- auto contig_map = edges_positions_[oldEdge];
- AddAndShiftEdgePositions(newEdge1, contig_map, 0);
- AddAndShiftEdgePositions(newEdge2, contig_map, -int(this->g().length(newEdge1)));
- }
- }
-
- virtual void HandleMerge(const vector<EdgeId>& oldEdges, EdgeId newEdge) {
- int shift = 0;
- for(auto it = oldEdges.begin(); it != oldEdges.end(); ++it) {
- if (edges_positions_.count(*it) != 0) {
- AddAndShiftEdgePositions(newEdge, edges_positions_[*it], shift);
- }
- shift += int(this->g().length(*it));
- }
- }
-
- virtual void HandleAdd(EdgeId /*e*/) {
- }
-
- virtual void HandleDelete(EdgeId e) {
- edges_positions_.erase(e);
- }
-
- void clear() {
- edges_positions_.clear();
- }
-
-private:
- DECL_LOGGER("EdgesPositionHandler");
-};
-
-}
-
-#endif /* EDGES_POSITION_HANDLER_HPP_ */
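As a hypothetical sketch of how this handler is fed and queried (the graph `g`, edge `e`, contig name and coordinates are invented for illustration; <iostream> assumed):

    // The handler is a GraphActionHandler, so recorded positions stay
    // consistent through subsequent merges, splits and glues on g.
    omnigraph::EdgesPositionHandler<Graph> positions(g, /*max_mapping_gap*/ 10);
    // record a mapping between a range of "contig_1" and a range of edge e
    positions.AddEdgePosition(e, "contig_1", /*start*/ 0, /*end*/ 1000, /*m_start*/ 0, /*m_end*/ 1000);
    for (const auto &pos : positions.GetEdgePositions(e)) {
        std::cout << pos << std::endl;   // the operator<< above prints "<contig id> <mapping range>"
    }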
diff --git a/src/include/omni/erroneous_connection_remover.hpp b/src/include/omni/erroneous_connection_remover.hpp
deleted file mode 100644
index 28a4dc1..0000000
--- a/src/include/omni/erroneous_connection_remover.hpp
+++ /dev/null
@@ -1,381 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-/*
- * erroneous_connection_remover.hpp
- *
- * Created on: May 31, 2011
- * Author: sergey
- */
-
-#pragma once
-
-#include "graph_processing_algorithm.hpp"
-#include "basic_edge_conditions.hpp"
-#include "omni_tools.hpp"
-#include "omni_utils.hpp"
-#include "func.hpp"
-#include "xmath.h"
-#include "dijkstra_tools/dijkstra_helper.hpp"
-
-namespace omnigraph {
-
-template<class Graph>
-pred::TypedPredicate<typename Graph::EdgeId>
-NecessaryECCondition(const Graph& g, size_t max_length, double max_coverage) {
- return AddAlternativesPresenceCondition(g, pred::And(LengthUpperBound<Graph>(g, max_length),
- CoverageUpperBound<Graph>(g, max_coverage)));
-}
-
-template<class Graph>
-bool RemoveErroneousEdgesInCoverageOrder(Graph &g,
- pred::TypedPredicate<typename Graph::EdgeId> removal_condition,
- double max_coverage,
- std::function<void(typename Graph::EdgeId)> removal_handler) {
- omnigraph::EdgeRemovingAlgorithm<Graph> erroneous_edge_remover(g,
- AddAlternativesPresenceCondition(g, removal_condition),
- removal_handler);
-
- return erroneous_edge_remover.Run(CoverageComparator<Graph>(g),
- CoverageUpperBound<Graph>(g, max_coverage));
-}
-
-template<class Graph>
-bool RemoveErroneousEdgesInLengthOrder(Graph &g,
- pred::TypedPredicate<typename Graph::EdgeId> removal_condition,
- size_t max_length,
- std::function<void(typename Graph::EdgeId)> removal_handler) {
- omnigraph::EdgeRemovingAlgorithm<Graph> erroneous_edge_remover(g,
- AddAlternativesPresenceCondition(g, removal_condition),
- removal_handler);
-
- return erroneous_edge_remover.Run(LengthComparator<Graph>(g),
- LengthUpperBound<Graph>(g, max_length));
-}
-
-template<class Graph>
-class SelfConjugateCondition : public EdgeCondition<Graph> {
- typedef typename Graph::EdgeId EdgeId;
- typedef typename Graph::VertexId VertexId;
- typedef EdgeCondition<Graph> base;
-
- public:
-
- SelfConjugateCondition(const Graph& g)
- : base(g) {
- }
-
- bool Check(EdgeId e) const {
- return e == this->g().conjugate(e);
- }
-
- private:
- DECL_LOGGER("SelfConjugateCondition");
-};
-
-//coverage comparator
-//template<class Graph>
-//class RelativeCoverageCondition : public EdgeCondition<Graph> {
-// typedef typename Graph::EdgeId EdgeId;
-// typedef typename Graph::VertexId VertexId;
-// typedef EdgeCondition<Graph> base;
-//
-// double min_coverage_gap_;
-//
-// bool StrongNeighbourCondition(EdgeId neighbour_edge,
-// EdgeId possible_ec) const {
-// return neighbour_edge == possible_ec
-// || math::gr(this->g().coverage(neighbour_edge),
-// this->g().coverage(possible_ec) * min_coverage_gap_);
-//// || this->g().length(neighbour_edge)
-//// >= neighbour_length_threshold_;
-// }
-//
-// bool CheckAdjacent(const vector<EdgeId>& edges, EdgeId possible_ec) const {
-// FOREACH (EdgeId e, edges) {
-// if (!StrongNeighbourCondition(e, possible_ec))
-// return false;
-// }
-// return true;
-// }
-//
-// public:
-//
-// RelativeCoverageCondition(const Graph& g, double min_coverage_gap)
-// : base(g),
-// min_coverage_gap_(min_coverage_gap) {
-//
-// }
-//
-// bool Check(EdgeId e) const {
-// const Graph& g = this->g();
-// return CheckAdjacent(g.IncidentEdges(g.EdgeStart(e)), e)
-// && CheckAdjacent(g.IncidentEdges(g.EdgeEnd(e)), e);
-// }
-//
-// private:
-// DECL_LOGGER("RelativeCoverageCondition")
-// ;
-//
-//};
-
-//todo refactor
-template<class Graph>
-class ThornCondition : public EdgeCondition<Graph> {
- typedef typename Graph::EdgeId EdgeId;
- typedef typename Graph::VertexId VertexId;
- typedef EdgeCondition<Graph> base;
-
- size_t uniqueness_length_;
- size_t dijkstra_depth_;
-
- bool Unique(const vector<EdgeId>& edges, bool forward) const {
- return edges.size() == 1 && CheckUniqueness(*edges.begin(), forward);
- }
-
- bool CheckUnique(EdgeId e) const {
- TRACE("Checking conditions for edge start");
- return Unique(vector<EdgeId>(this->g().in_begin(this->g().EdgeStart(e)), this->g().in_end(this->g().EdgeStart(e))), false)
- || Unique(vector<EdgeId>(this->g().out_begin(this->g().EdgeEnd(e)), this->g().out_end(this->g().EdgeEnd(e))), true);
- }
-
- bool CheckThorn(EdgeId e) const {
- if (this->g().EdgeStart(e) == this->g().EdgeEnd(e))
- return false;
- if (this->g().RelatedVertices(this->g().EdgeStart(e),
- this->g().EdgeEnd(e))) {
- return true;
- }
- if (this->g().OutgoingEdgeCount(this->g().EdgeStart(e)) != 2)
- return false;
- if (this->g().IncomingEdgeCount(this->g().EdgeStart(e)) != 1)
- return false;
- if (this->g().OutgoingEdgeCount(this->g().EdgeEnd(e)) != 1)
- return false;
- if (this->g().IncomingEdgeCount(this->g().EdgeEnd(e)) != 2)
- return false;
-
- auto dij = DijkstraHelper<Graph>::CreateBoundedDijkstra(this->g(), dijkstra_depth_);
- dij.Run(this->g().EdgeStart(e));
- vector<VertexId> reached = dij.ReachedVertices();
- for (auto it = reached.begin(); it != reached.end(); ++it) {
- if (*it != this->g().EdgeEnd(e)
- && this->g().RelatedVertices(*it, this->g().EdgeEnd(e))) {
- return true;
- }
- }
- return false;
- }
-
- template<class EdgeContainer>
- bool CheckAlternativeCoverage(const EdgeContainer& edges, EdgeId base) const {
- for (EdgeId e: edges) {
- if (e != base && this->g().length(e) < 400
- && this->g().coverage(e) < 15 * this->g().coverage(base)) {
- return false;
- }
- }
- return true;
- }
-
- bool CheckCoverageAround(EdgeId e) const {
- return CheckAlternativeCoverage(
- this->g().IncidentEdges(this->g().EdgeStart(e)), e)
- && CheckAlternativeCoverage(
- this->g().IncidentEdges(this->g().EdgeEnd(e)), e);
- }
-
- bool CheckUniqueness(EdgeId e, bool /*forward*/) const {
- return this->g().length(e) >= uniqueness_length_;
- }
-
- public:
-
- ThornCondition(Graph& g, size_t uniqueness_length, size_t dijkstra_depth)
- : base(g),
- uniqueness_length_(uniqueness_length),
- dijkstra_depth_(dijkstra_depth) {
- }
-
- bool Check(EdgeId e) const {
- bool tmp = (CheckUnique(e) || CheckCoverageAround(e));
- if (tmp)
- tmp &= CheckThorn(e);
- return tmp;
- }
-
- private:
- DECL_LOGGER("ThornCondition")
- ;
-
-};
-
-template<class Graph>
-class MultiplicityCountingCondition : public UniquenessPlausabilityCondition<Graph> {
- typedef typename Graph::EdgeId EdgeId;
- typedef typename Graph::VertexId VertexId;
- typedef pred::TypedPredicate<EdgeId> EdgePredicate;
- typedef UniquenessPlausabilityCondition<Graph> base;
-
- MultiplicityCounter<Graph> multiplicity_counter_;
- EdgePredicate plausiblity_condition_;
-
-public:
- bool CheckUniqueness(EdgeId e, bool forward) const {
- TRACE( "Checking " << this->g().int_id(e) << " for uniqueness in " << (forward ? "forward" : "backward") << " direction");
- VertexId start =
- forward ? this->g().EdgeEnd(e) : this->g().EdgeStart(e);
- bool result = multiplicity_counter_.count(e, start) <= 1;
- TRACE( "Edge " << this->g().int_id(e) << " is" << (result ? "" : " not") << " unique");
- return result;
- }
-
- bool CheckPlausibility(EdgeId e, bool) const {
- return plausiblity_condition_(e);
- }
-
- MultiplicityCountingCondition(const Graph& g, size_t uniqueness_length,
- EdgePredicate plausiblity_condition)
- :
- //todo why 8???
- base(g),
- multiplicity_counter_(g, uniqueness_length, 8),
- plausiblity_condition_(plausiblity_condition) {
-
- }
-
- private:
-
- DECL_LOGGER("MultiplicityCountingCondition")
- ;
-};
-
-template<class Graph>
-class HiddenECRemover: public EdgeProcessingAlgorithm<Graph> {
- typedef EdgeProcessingAlgorithm<Graph> base;
- typedef typename Graph::EdgeId EdgeId;
- typedef typename Graph::VertexId VertexId;
-private:
- size_t uniqueness_length_;
- double unreliability_threshold_;
- double ec_threshold_;
- double relative_threshold_;
- const AbstractFlankingCoverage<Graph> &flanking_coverage_;
- EdgeRemover<Graph> edge_remover_;
- MultiplicityCountingCondition<Graph> condition_;
-private:
- void RemoveHiddenEC(EdgeId edge) {
- if (this->g().length(edge) <= this->g().k() || (edge == this->g().conjugate(edge) && this->g().length(edge) <= 2 * this->g().k()))
- edge_remover_.DeleteEdge(edge);
- else {
- auto split_result = this->g().SplitEdge(edge, this->g().k());
- edge_remover_.DeleteEdge(split_result.first);
- }
- }
-
- void RemoveHiddenECWithNoCompression(EdgeId edge) {
- if (this->g().length(edge) <= this->g().k() || (edge == this->g().conjugate(edge) && this->g().length(edge) <= 2 * this->g().k())) {
- edge_remover_.DeleteEdgeWithNoCompression(edge);
- } else {
- auto split_result = this->g().SplitEdge(edge, this->g().k());
- edge_remover_.DeleteEdgeWithNoCompression(split_result.first);
- }
- }
-
- void DisconnectEdges(VertexId v) {
- while(!this->g().IsDeadEnd(v)) {
- RemoveHiddenECWithNoCompression(*(this->g().out_begin(v)));
- }
- }
-
- bool FindHiddenEC(VertexId v) {
- vector<EdgeId> edges(this->g().out_begin(v), this->g().out_end(v));
- if(flanking_coverage_.GetInCov(edges[0]) > flanking_coverage_.GetInCov(edges[1])) {
- auto tmp = edges[0];
- edges[0] = edges[1];
- edges[1] = tmp;
- }
-// cout << flanking_coverage_.GetInCov(edges[0]) << " " << flanking_coverage_.GetInCov(edges[1]) << endl;
- if(flanking_coverage_.GetInCov(edges[1]) < unreliability_threshold_) {
- DisconnectEdges(v);
-// cout << "disconnected" << endl;
- return true;
- }
- if(flanking_coverage_.GetInCov(edges[0]) * relative_threshold_ < flanking_coverage_.GetInCov(edges[1]) && flanking_coverage_.GetInCov(edges[0]) < ec_threshold_) {
- RemoveHiddenEC(edges[0]);
-// cout << "success" << endl;
- return true;
- }
- return false;
- }
-
- bool CheckSuspicious(VertexId v) {
- if (this->g().IncomingEdgeCount(v) != 1 || this->g().OutgoingEdgeCount(v) != 2) {
- return false;
- }
- vector<EdgeId> edges(this->g().out_begin(v), this->g().out_end(v));
- return (edges.size() == 2 && this->g().conjugate(edges[0]) == edges[1] && condition_.CheckUniqueness(this->g().GetUniqueIncomingEdge(v), false)) || this->g().length(this->g().GetUniqueIncomingEdge(v)) >= uniqueness_length_;
- }
-
- bool ProcessEdge(EdgeId e) {
- VertexId v = this->g().EdgeEnd(e);
- if(CheckSuspicious(v)) {
-// cout << "client: " << this->g().int_id(v) << endl;
- return FindHiddenEC(v);
- }
- return false;
- }
-
-public:
- HiddenECRemover(Graph& g, size_t uniqueness_length,
- const AbstractFlankingCoverage<Graph> &flanking_coverage,
- double unreliability_threshold, double ec_threshold,
- double relative_threshold,
- std::function<void(EdgeId)> removal_handler = 0)
- : base(g), uniqueness_length_(uniqueness_length),
- unreliability_threshold_(unreliability_threshold * ec_threshold), ec_threshold_(ec_threshold),
- relative_threshold_(relative_threshold), flanking_coverage_(flanking_coverage),
- edge_remover_(g, removal_handler),
- condition_(g, uniqueness_length, pred::AlwaysTrue<EdgeId>()) {
- }
-
-private:
- DECL_LOGGER("HiddenECRemover");
-};
-
-template<class Graph>
-class SelfConjugateDisruptor: public EdgeProcessingAlgorithm<Graph> {
- typedef EdgeProcessingAlgorithm<Graph> base;
- typedef typename Graph::EdgeId EdgeId;
- typedef typename Graph::VertexId VertexId;
- EdgeRemover<Graph> edge_remover_;
-protected:
-
- bool ProcessEdge(EdgeId e) override {
- if (e == this->g().conjugate(e)) {
- TRACE("Disrupting self-conjugate edge " << this->g().str(e));
- EdgeId to_del = e;
- size_t len = this->g().length(e);
- if (len > 1) {
- to_del = this->g().SplitEdge(e, len / 2).second;
- }
- edge_remover_.DeleteEdge(to_del);
- return true;
- }
- return false;
- }
-
-public:
- SelfConjugateDisruptor(Graph& g,
- std::function<void(EdgeId)> removal_handler = 0)
- : base(g, true), edge_remover_(g, removal_handler) {
- }
-
-private:
- DECL_LOGGER("SelfConjugateDisruptor");
-};
-}
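
SelfConjugateDisruptor, removed above, breaks a self-conjugate edge by splitting it at half of its length and deleting one of the resulting halves. A minimal standalone sketch of that splitting rule, with a toy ToyEdge type standing in for the real omnigraph edge (all names are illustrative assumptions, not the deleted API):

#include <cassert>
#include <string>
#include <utility>

// Toy stand-in for an assembly-graph edge: just its nucleotide sequence.
struct ToyEdge {
    std::string nucls;
    size_t length() const { return nucls.size(); }
};

// Split an edge at position pos, mimicking Graph::SplitEdge(e, len / 2).
static std::pair<ToyEdge, ToyEdge> SplitEdge(const ToyEdge &e, size_t pos) {
    return { ToyEdge{e.nucls.substr(0, pos)}, ToyEdge{e.nucls.substr(pos)} };
}

// Disrupt a self-conjugate edge: if it is longer than one position,
// split it in the middle and keep only the first half (the second half
// is what SelfConjugateDisruptor hands to edge_remover_ for deletion).
static ToyEdge DisruptSelfConjugate(const ToyEdge &e) {
    size_t len = e.length();
    if (len <= 1)
        return ToyEdge{""};           // the whole edge would be removed
    auto halves = SplitEdge(e, len / 2);
    return halves.first;              // halves.second is deleted
}

int main() {
    ToyEdge e{"ACGTACGT"};
    ToyEdge kept = DisruptSelfConjugate(e);
    assert(kept.length() == 4);       // 8 / 2 positions survive
    return 0;
}
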
diff --git a/src/include/omni/graph_component.hpp b/src/include/omni/graph_component.hpp
deleted file mode 100644
index 613b6e4..0000000
--- a/src/include/omni/graph_component.hpp
+++ /dev/null
@@ -1,198 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-
-namespace omnigraph {
-
-//todo make handler!!!
-template<class Graph>
-class GraphComponent {
- typedef typename Graph::VertexId VertexId;
- typedef typename Graph::EdgeId EdgeId;
- typedef typename std::set<VertexId>::const_iterator vertex_iterator;
- typedef typename std::set<EdgeId>::const_iterator edge_iterator;
- const Graph& graph_;
- std::set<VertexId> vertices_;
- std::set<EdgeId> edges_;
- std::set<VertexId> sinks_;
- std::set<VertexId> sources_;
- string name_;
-
-
- template<class VertexIt>
- void FillVertices(VertexIt begin, VertexIt end) {
- for (auto it = begin; it != end; ++it) {
- vertices_.insert(*it);
- }
- }
-
- template<class VertexIt>
- void FillVertices(VertexIt begin, VertexIt end, bool add_conjugate) {
- for (auto it = begin; it != end; ++it) {
- vertices_.insert(*it);
- if (add_conjugate)
- vertices_.insert(graph_.conjugate(*it));
- }
- }
-
- void FillEdges() {
- for (auto v_it = vertices_.begin(); v_it != vertices_.end(); ++v_it) {
- TRACE("working with vertex " << graph_.str(*v_it));
- for (EdgeId e : graph_.OutgoingEdges(*v_it)) {
- VertexId edge_end = graph_.EdgeEnd(e);
- TRACE(graph_.coverage(e) << " " << graph_.length(e));
- if (vertices_.count(edge_end) > 0) {
- edges_.insert(e);
- TRACE("Edge added");
- }
- }
- }
- }
-
- template<class VertexIt>
- void Fill(VertexIt begin, VertexIt end) {
- FillVertices(begin, end);
- FillEdges();
- FindSinksAndSources();
- }
-
- template<class VertexIt>
- void Fill(VertexIt begin, VertexIt end, bool add_conjugate) {
- FillVertices(begin, end, add_conjugate);
- FillEdges();
- FindSinksAndSources();
- }
-
- void FindSinksAndSources() {
- for(auto v : vertices_) {
- for(auto e : graph_.IncomingEdges(v)) {
- if(!contains(e) && !(contains(graph_.EdgeStart(e)))) {
- sources_.insert(v);
- break;
- }
- }
-
- for(auto e : graph_.OutgoingEdges(v)) {
- if(!contains(e) && !(contains(graph_.EdgeEnd(e)))) {
- sinks_.insert(v);
- break;
- }
- }
- }
- }
-
-public:
- template<class VertexIt>
- GraphComponent(const Graph &g, VertexIt begin, VertexIt end, const string &name = "") :
- graph_(g), name_(name) {
- Fill(begin, end);
- }
-
- //todo refactor and get rid of hack
- template<class VertexIt>
- GraphComponent(const Graph &g, VertexIt begin, VertexIt end,
- bool add_conjugate, const string &name = "") : graph_(g), name_(name) {
- Fill(begin, end, add_conjugate);
- }
-
- //Full graph component
- GraphComponent(const Graph &g, bool fill = true, const string &name = "") : graph_(g), name_(name) {
- if(fill) {
- Fill(g.begin(), g.end());
- }
- }
-
- //may be used for conjugate closure
- GraphComponent(const GraphComponent& component, bool add_conjugate, const string &name = "") : graph_(component.graph_), name_(name)
-// vertices_(component.vertices_.begin(), component.vertices_.end()),
-// edges_(component.edges_.begin(), component.edges_.end())
- {
- Fill(component.v_begin(), component.v_end(), add_conjugate);
- }
-
- GraphComponent<Graph> &operator=(const GraphComponent<Graph> &that) {
- VERIFY(&this->graph_ == &that.graph_);
- this->vertices_ = that.vertices_;
- this->edges_ = that.edges_;
- this->name_ = that.name_;
- return *this;
- }
-
- const Graph& g() const {
- return graph_;
- }
-
- string name() const {
- return name_;
- }
-
- size_t v_size() const {
- return vertices_.size();
- }
-
- size_t e_size() const {
- return edges_.size();
- }
-
- bool contains(EdgeId e) const {
- return edges_.count(e) > 0;
- }
-
- bool contains(VertexId v) const {
- return vertices_.count(v) > 0;
- }
-
- edge_iterator e_begin() const {
- return edges_.begin();
- }
- edge_iterator e_end() const {
- return edges_.end();
- }
-
- const std::set<EdgeId>& edges() const {
- return edges_;
- }
-
- const std::set<VertexId>& vertices() const{
- return vertices_;
- }
-
- vertex_iterator v_begin() const {
- return vertices_.begin();
- }
- vertex_iterator v_end() const {
- return vertices_.end();
- }
-
- const std::set<VertexId>& sinks() const {
- return sinks_;
- }
-
- const std::set<VertexId>& sources() const {
- return sources_;
- }
-
- bool IsBorder(VertexId v) const {
- if(vertices_.count(v) == 0)
- return false;
- for (EdgeId e : graph_.IncidentEdges(v)) {
- if (vertices_.count(graph_.EdgeStart(e)) == 0
- || vertices_.count(graph_.EdgeEnd(e)) == 0) {
- return true;
- }
- }
- return false;
- }
-
-};
-
-}
-
-
-
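
GraphComponent::FindSinksAndSources, removed above, marks a vertex as a source when some incoming edge enters the component from outside, and as a sink when some outgoing edge leaves it. The same classification can be sketched on a plain adjacency list; the DiGraph alias and integer vertices below are assumptions for illustration, not the deleted header's types:

#include <iostream>
#include <map>
#include <set>
#include <vector>

using Vertex = int;
// Toy directed graph: vertex -> list of successor vertices.
using DiGraph = std::map<Vertex, std::vector<Vertex>>;

// Classify border vertices of a vertex subset, in the spirit of
// GraphComponent::FindSinksAndSources: a source receives an edge from
// outside the subset, a sink sends an edge out of it.
static void FindSinksAndSources(const DiGraph &g, const std::set<Vertex> &component,
                                std::set<Vertex> &sources, std::set<Vertex> &sinks) {
    for (const auto &entry : g) {
        Vertex u = entry.first;
        for (Vertex v : entry.second) {
            if (component.count(v) && !component.count(u))
                sources.insert(v);   // edge u -> v enters the component
            if (component.count(u) && !component.count(v))
                sinks.insert(u);     // edge u -> v leaves the component
        }
    }
}

int main() {
    DiGraph g = {{1, {2}}, {2, {3}}, {3, {4}}, {4, {}}};
    std::set<Vertex> component = {2, 3};
    std::set<Vertex> sources, sinks;
    FindSinksAndSources(g, component, sources, sinks);
    std::cout << "sources: " << sources.size()        // 1 (vertex 2)
              << ", sinks: " << sinks.size() << "\n"; // 1 (vertex 3)
    return 0;
}
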
diff --git a/src/include/omni/graph_core.hpp b/src/include/omni/graph_core.hpp
deleted file mode 100644
index dbf7149..0000000
--- a/src/include/omni/graph_core.hpp
+++ /dev/null
@@ -1,620 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include <vector>
-#include <set>
-#include "verify.hpp"
-#include "logger/logger.hpp"
-#include "order_and_law.hpp"
-#include <boost/iterator/iterator_facade.hpp>
-#include "../simple_tools.hpp"
-
-namespace omnigraph {
-
-using std::vector;
-template<class DataMaster>
-class GraphCore;
-
-template<class DataMaster>
-class ConstructionHelper;
-
-template<class T>
-class PairedElementManipulationHelper;
-
-template<class DataMaster>
-class PairedVertex;
-
-template<class DataMaster>
-class PairedEdge;
-
-template<class DataMaster>
-class PairedEdge {
- private:
- typedef typename DataMaster::EdgeData EdgeData;
- typedef restricted::pure_pointer<PairedEdge<DataMaster>> EdgeId;
- typedef restricted::pure_pointer<PairedVertex<DataMaster>> VertexId;
- friend class GraphCore<DataMaster>;
- friend class ConstructionHelper<DataMaster>;
- friend class PairedElementManipulationHelper<EdgeId>;
- //todo unfriend
- friend class PairedVertex<DataMaster>;
- VertexId end_;
- EdgeData data_;
- EdgeId conjugate_;
-
- PairedEdge(VertexId end, const EdgeData &data)
- : end_(end),
- data_(data) {
- }
-
- EdgeData &data() {
- return data_;
- }
-
- void set_data(const EdgeData &data) {
- data_ = data;
- }
-
- VertexId end() const {
- return end_;
- }
-
- VertexId start() const {
- return conjugate_->end()->conjugate();
- }
-
- void set_conjugate(EdgeId conjugate) {
- conjugate_ = conjugate;
- }
-
- void SetEndVertex(VertexId end) {
- end_ = end;
- }
-
-public:
- EdgeId conjugate() const {
- return conjugate_;
- }
-
- size_t length(size_t k) const {
- return data_.size() - k;
- }
-};
-
-template<class DataMaster>
-class PairedVertex {
-private:
- typedef typename DataMaster::VertexData VertexData;
- typedef restricted::pure_pointer<PairedEdge<DataMaster>> EdgeId;
- typedef restricted::pure_pointer<PairedVertex<DataMaster>> VertexId;
- typedef typename std::vector<EdgeId>::const_iterator edge_raw_iterator;
-
- class conjugate_iterator : public boost::iterator_facade<conjugate_iterator,
- EdgeId, boost::forward_traversal_tag, EdgeId> {
- public:
- explicit conjugate_iterator(edge_raw_iterator it,
- bool conjugate = false)
- : it_(it),
- conjugate_(conjugate) {
- }
-
- //todo do we need it?
- conjugate_iterator()
- : conjugate_(false) {
- }
-
- private:
- friend class boost::iterator_core_access;
-
- void increment() {
- it_++;
- }
-
- bool equal(const conjugate_iterator &other) const {
- return other.it_ == it_ && other.conjugate_ == conjugate_;
- }
-
- EdgeId dereference() const {
- return (conjugate_ ? (*it_)->conjugate() : *it_);
- }
-
- edge_raw_iterator it_;
- bool conjugate_;
- };
-
-public:
- typedef conjugate_iterator edge_const_iterator;
-
-private:
- friend class GraphCore<DataMaster>;
- friend class ConstructionHelper<DataMaster>;
- friend class PairedEdge<DataMaster>;
- friend class PairedElementManipulationHelper<VertexId>;
- friend class conjugate_iterator;
-
- std::vector<EdgeId> outgoing_edges_;
-
- VertexId conjugate_;
-
- VertexData data_;
-
- bool IsMinimal() const {
- return conjugate_->conjugate_ <= conjugate_;
- }
-
- VertexId conjugate() const {
- return conjugate_;
- }
-
- void set_conjugate(VertexId conjugate) {
- conjugate_ = conjugate;
- }
-
- size_t OutgoingEdgeCount() const {
- return outgoing_edges_.size();
- }
-
- edge_const_iterator out_begin() const {
- return edge_const_iterator(outgoing_edges_.cbegin(), false);
- }
-
- edge_const_iterator out_end() const {
- return edge_const_iterator(outgoing_edges_.cend(), false);
- }
-
- size_t IncomingEdgeCount() const {
- return conjugate_->OutgoingEdgeCount();
- }
-
- size_t IncomingEdgesCount() const {
- return conjugate_->OutgoingEdgeCount();
- }
-
- edge_const_iterator in_begin() const {
- return edge_const_iterator(conjugate_->outgoing_edges_.cbegin(), true);
- }
-
- edge_const_iterator in_end() const {
- return edge_const_iterator(conjugate_->outgoing_edges_.cend(), true);
- }
-
- PairedVertex(VertexData data)
- : data_(data) {
- }
-
- VertexData &data() {
- return data_;
- }
-
- void set_data(VertexData data) {
- data_ = data;
- }
-
- const std::vector<EdgeId> OutgoingEdgesTo(VertexId v) const {
- vector<EdgeId> result;
- for (auto it = outgoing_edges_.begin(); it != outgoing_edges_.end(); ++it) {
- if ((*it)->end() == v) {
- result.push_back(*it);
- }
- }
- return result;
- }
-
- void AddOutgoingEdge(EdgeId e) {
- outgoing_edges_.insert(std::upper_bound(outgoing_edges_.begin(), outgoing_edges_.end(), e), e);
- //outgoing_edges_.push_back(e);
- }
-
- bool RemoveOutgoingEdge(const EdgeId e) {
- auto it = std::find(outgoing_edges_.begin(), outgoing_edges_.end(), e);
- if (it == outgoing_edges_.end())
- return false;
-
- outgoing_edges_.erase(it);
- return true;
- }
-
- ~PairedVertex() {
- VERIFY(outgoing_edges_.size() == 0);
- }
-};
-
-template<class DataMaster>
-class GraphCore: private boost::noncopyable {
-public:
- typedef DataMaster DataMasterT;
- typedef typename DataMasterT::VertexData VertexData;
- typedef typename DataMasterT::EdgeData EdgeData;
- typedef restricted::pure_pointer<PairedEdge<DataMaster>> EdgeId;
- typedef restricted::pure_pointer<PairedVertex<DataMaster>> VertexId;
- typedef typename std::set<VertexId>::const_iterator VertexIt;
- typedef typename PairedVertex<DataMaster>::edge_const_iterator edge_const_iterator;
-
-private:
- restricted::LocalIdDistributor id_distributor_;
- DataMaster master_;
- std::set<VertexId> vertices_;
-
- friend class ConstructionHelper<DataMaster>;
-public:
- VertexIt begin() const {
- return vertices_.begin();
- }
-
- VertexIt end() const {
- return vertices_.end();
- }
-
- const std::set<VertexId>& vertices() const {
- return vertices_;
- }
-
- size_t size() const {
- return vertices_.size();
- }
-
- edge_const_iterator out_begin(VertexId v) const {
- return v->out_begin();
- }
-
- edge_const_iterator out_end(VertexId v) const {
- return v->out_end();
- }
-
- edge_const_iterator in_begin(VertexId v) const {
- return v->in_begin();
- }
-
- edge_const_iterator in_end(VertexId v) const {
- return v->in_end();
- }
-
-private:
- void DeleteVertexFromGraph(VertexId vertex) {
- this->vertices_.erase(vertex);
- this->vertices_.erase(conjugate(vertex));
- }
-
- void DestroyVertex(VertexId vertex) {
- VertexId conjugate = vertex->conjugate();
- delete vertex.get();
- delete conjugate.get();
- }
-
- bool AdditionalCompressCondition(VertexId v) const {
- return !(EdgeEnd(GetUniqueOutgoingEdge(v)) == conjugate(v) && EdgeStart(GetUniqueIncomingEdge(v)) == conjugate(v));
- }
-
-protected:
-
- VertexId CreateVertex(const VertexData& data1, const VertexData& data2, restricted::IdDistributor& id_distributor) {
- VertexId vertex1(new PairedVertex<DataMaster>(data1), id_distributor);
- VertexId vertex2(new PairedVertex<DataMaster>(data2), id_distributor);
- vertex1->set_conjugate(vertex2);
- vertex2->set_conjugate(vertex1);
- return vertex1;
- }
-
- VertexId CreateVertex(const VertexData &data, restricted::IdDistributor &id_distributor) {
- return CreateVertex(data, master_.conjugate(data), id_distributor);
- }
-
- VertexId CreateVertex(const VertexData &data) {
- return CreateVertex(data, id_distributor_);
- }
-
- void AddVertexToGraph(VertexId vertex) {
- vertices_.insert(vertex);
- vertices_.insert(conjugate(vertex));
- }
-
- VertexId HiddenAddVertex(const VertexData& data, restricted::IdDistributor& id_distributor) {
- VertexId vertex = CreateVertex(data, id_distributor);
- AddVertexToGraph(vertex);
- return vertex;
- }
-
- VertexId HiddenAddVertex(const VertexData& data) {
- return HiddenAddVertex(data, id_distributor_);
- }
-
- void HiddenDeleteVertex(VertexId vertex) {
- DeleteVertexFromGraph(vertex);
- DestroyVertex(vertex);
- }
-
- /////////////////////////low-level ops (move to helper?!)
-
- ////what with this method?
- EdgeId AddSingleEdge(VertexId v1, VertexId v2, const EdgeData &data,
- restricted::IdDistributor &idDistributor) {
- EdgeId newEdge(new PairedEdge<DataMaster>(v2, data), idDistributor);
- if (v1 != VertexId(0))
- v1->AddOutgoingEdge(newEdge);
- return newEdge;
- }
-
- EdgeId HiddenAddEdge(const EdgeData& data, restricted::IdDistributor& id_distributor) {
- EdgeId result = AddSingleEdge(VertexId(0), VertexId(0), data, id_distributor);
- if (this->master().isSelfConjugate(data)) {
- result->set_conjugate(result);
- return result;
- }
- EdgeId rcEdge = AddSingleEdge(VertexId(0), VertexId(0), this->master().conjugate(data), id_distributor);
- result->set_conjugate(rcEdge);
- rcEdge->set_conjugate(result);
- return result;
- }
-
- EdgeId HiddenAddEdge(const EdgeData &data) {
- return HiddenAddEdge(data, id_distributor_);
- }
-
- EdgeId HiddenAddEdge(VertexId v1, VertexId v2, const EdgeData& data, restricted::IdDistributor& id_distributor) {
- // todo was suppressed for concurrent execution reasons (see concurrent_graph_component.hpp)
- // VERIFY(this->vertices_.find(v1) != this->vertices_.end() && this->vertices_.find(v2) != this->vertices_.end());
- EdgeId result = AddSingleEdge(v1, v2, data, id_distributor);
- if (this->master().isSelfConjugate(data) && (v1 == conjugate(v2))) {
- // todo why was it removed???
- // Because of some split issues: when a self-conjugate edge is split, armageddon happens
- // VERIFY(v1 == conjugate(v2));
- // VERIFY(v1 == conjugate(v2));
- result->set_conjugate(result);
- return result;
- }
- EdgeId rcEdge = AddSingleEdge(v2->conjugate(), v1->conjugate(), this->master().conjugate(data), id_distributor);
- result->set_conjugate(rcEdge);
- rcEdge->set_conjugate(result);
- return result;
- }
-
- EdgeId HiddenAddEdge(VertexId v1, VertexId v2, const EdgeData &data) {
- return HiddenAddEdge(v1, v2, data, id_distributor_);
- }
-
- void HiddenDeleteEdge(EdgeId edge) {
- DEBUG("Hidden delete edge " << edge.int_id());
- EdgeId rcEdge = conjugate(edge);
- VertexId rcStart = conjugate(edge->end());
- VertexId start = conjugate(rcEdge->end());
- start->RemoveOutgoingEdge(edge);
- rcStart->RemoveOutgoingEdge(rcEdge);
- if (edge != rcEdge) {
- delete rcEdge.get();
- }
- delete edge.get();
- }
-
- void HiddenDeletePath(const std::vector<EdgeId>& edgesToDelete, const std::vector<VertexId>& verticesToDelete) {
- for (auto it = edgesToDelete.begin(); it != edgesToDelete.end(); ++it)
- HiddenDeleteEdge(*it);
- for (auto it = verticesToDelete.begin(); it != verticesToDelete.end(); ++it)
- HiddenDeleteVertex(*it);
- }
-
-public:
-
- GraphCore(const DataMaster& master) : master_(master) {
- }
-
- virtual ~GraphCore() {
- VERIFY(size() == 0);
- }
-
- class IteratorContainer {
- public:
- typedef edge_const_iterator const_iterator;
- private:
- const_iterator begin_;
- const_iterator end_;
- public:
- IteratorContainer(const_iterator begin, const_iterator end) :
- begin_(begin), end_(end) {
-
- }
-
- const_iterator begin() const {
- return begin_;
- }
-
- const_iterator end() const {
- return end_;
- }
- };
-
- restricted::LocalIdDistributor &GetGraphIdDistributor() {
- return id_distributor_;
- }
-
- const restricted::LocalIdDistributor &GetGraphIdDistributor() const {
- return id_distributor_;
- }
-
- size_t int_id(EdgeId edge) const {
- return edge.int_id();
- }
-
- size_t int_id(VertexId vertex) const {
- return vertex.int_id();
- }
-
- const DataMaster& master() const {
- return master_;
- }
-
- const EdgeData& data(EdgeId edge) const {
- return edge->data();
- }
-
- const VertexData& data(VertexId v) const {
- return v->data();
- }
-
- EdgeData& data(EdgeId edge) {
- return edge->data();
- }
-
- VertexData& data(VertexId v) {
- return v->data();
- }
-
- size_t OutgoingEdgeCount(VertexId v) const {
- return v->OutgoingEdgeCount();
- }
-
- IteratorContainer OutgoingEdges(VertexId v) const {
- //INFO("Outgoing");
- return IteratorContainer(out_begin(v), out_end(v));
- }
-
- size_t IncomingEdgeCount(VertexId v) const {
- return v->IncomingEdgeCount();
- }
-
- IteratorContainer IncomingEdges(VertexId v) const {
- return IteratorContainer(in_begin(v), in_end(v));
- }
-
- std::vector<EdgeId> GetEdgesBetween(VertexId v, VertexId u) const {
- return v->OutgoingEdgesTo(u);
- }
-
- bool RelatedVertices(VertexId v1, VertexId v2) const {
- return v1 == v2 || v1 == conjugate(v2);
- }
-
- ////////////////////////edge information
- VertexId EdgeStart(EdgeId edge) const {
- return edge->start();
- }
-
- VertexId EdgeEnd(EdgeId edge) const {
- //INFO("Edge end");
- return edge->end();
- }
-
- VertexId conjugate(VertexId v) const {
- return v->conjugate();
- }
-
- EdgeId conjugate(EdgeId edge) const {
- return edge->conjugate();
- }
-
- size_t length(const EdgeId edge) const {
- return master_.length(data(edge));
- }
-
- size_t length(const VertexId v) const {
- return master_.length(data(v));
- }
-
- //////////////////////shortcut methods
-
- std::vector<EdgeId> IncidentEdges(VertexId v) const {
- vector<EdgeId> answer;
- push_back_all(answer, IncomingEdges(v));
- push_back_all(answer, OutgoingEdges(v));
- return answer;
- }
-
- EdgeId GetUniqueOutgoingEdge(VertexId v) const {
- VERIFY(CheckUniqueOutgoingEdge(v));
- return *out_begin(v);
- }
-
- bool CheckUniqueIncomingEdge(VertexId v) const {
- return IncomingEdgeCount(v) == 1;
- }
-
- EdgeId GetUniqueIncomingEdge(VertexId v) const {
- VERIFY(CheckUniqueIncomingEdge(v));
- return *in_begin(v);
- }
-
- bool CheckUniqueOutgoingEdge(VertexId v) const {
- return OutgoingEdgeCount(v) == 1;
- }
-
- bool IsDeadEnd(VertexId v) const {
- return OutgoingEdgeCount(v) == 0;
- }
-
- bool IsDeadStart(VertexId v) const {
- return IncomingEdgeCount(v) == 0;
- }
-
- bool CanCompressVertex(VertexId v) const {
- // TRACE("Compress vertex check: ");
- // TRACE("Outgoing check: " << (OutgoingEdgeCount(v) == 1));
- // TRACE("Outgoing check: " << (CheckUniqueOutgoingEdge(v)));
- // TRACE("Incoming check: " << (IncomingEdgeCount(v) == 1));
- // TRACE("Incoming check: " << (CheckUniqueIncomingEdge(v) == 1));
- // if((OutgoingEdgeCount(v) == 1) && (IncomingEdgeCount(v) == 1)) {
- // TRACE("Loop check: " << (GetUniqueOutgoingEdge(v) != GetUniqueIncomingEdge(v)));
- // TRACE("Additional check: " << AdditionalCompressCondition(v));
- // }
- return OutgoingEdgeCount(v) == 1 && IncomingEdgeCount(v) == 1 &&
- GetUniqueOutgoingEdge(v) != GetUniqueIncomingEdge(v) &&
- AdditionalCompressCondition(v);
- }
-
- //////////////////////printing
- std::string str(const EdgeId e) const {
-// return master_.str(data(edge));
- std::stringstream ss;
- ss << int_id(e) << " (" << length(e) << ")";
- return ss.str();
- }
-
- std::string str(const VertexId v) const {
-// return master_.str(data(v));
- return ToString(int_id(v));
- }
-
- std::string detailed_str(const VertexId v) const {
- std::stringstream ss;
- ss << str(v) << ";";
- ss << "Incoming edges" << str(IncomingEdges(v)) << "; ";
- ss << "Outgoing edges" << str(OutgoingEdges(v)) << ";";
- return ss.str();
- }
-
- std::string detailed_str(const std::vector<EdgeId>& path) const {
- std::stringstream ss;
- ss << "Path: ";
- ss << "Vertex " << detailed_str(EdgeStart(path[0])) << " | ";
- for (auto it = path.begin(); it != path.end(); ++it) {
- EdgeId e = *it;
- ss << "Edge " << str(e) << " | ";
- ss << "Vertex " << detailed_str(EdgeEnd(e)) << " | ";
- }
- return ss.str();
- }
-
- template<class Container>
- std::string str(const Container& container) const {
- return str(container.begin(), container.end());
- }
-
- template<class It>
- std::string str(It begin, It end) const {
- std::stringstream ss;
- std::string delim = "";
- for (auto it = begin; it != end; ++it) {
- ss << delim << str(*it);
- delim = ", ";
- }
- return ss.str();
- }
-
-private:
- DECL_LOGGER("GraphCore");
-};
-
-}
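
In the deleted graph_core.hpp, a PairedEdge stores only its end vertex and a pointer to its conjugate edge; the start vertex is recovered as conjugate_->end()->conjugate(). A small self-contained sketch of that invariant with raw pointers and toy structs (ToyEdge/ToyVertex are assumptions, not the deleted classes):

#include <cassert>

struct ToyVertex;

struct ToyEdge {
    ToyVertex *end = nullptr;
    ToyEdge *conjugate = nullptr;
};

struct ToyVertex {
    ToyVertex *conjugate = nullptr;
};

// The start of an edge is derived exactly as in PairedEdge::start():
// take the conjugate edge, look at its end vertex, then take that
// vertex's conjugate.
static ToyVertex *Start(const ToyEdge *e) {
    return e->conjugate->end->conjugate;
}

int main() {
    // Vertices v and w together with their conjugates v' and w'.
    ToyVertex v, v_rc, w, w_rc;
    v.conjugate = &v_rc;  v_rc.conjugate = &v;
    w.conjugate = &w_rc;  w_rc.conjugate = &w;

    // Edge e: v -> w and its conjugate e': w' -> v'.
    ToyEdge e, e_rc;
    e.end = &w;       e.conjugate = &e_rc;
    e_rc.end = &v_rc; e_rc.conjugate = &e;

    assert(Start(&e) == &v);        // conjugate(e) ends at v', conjugate(v') is v
    assert(Start(&e_rc) == &w_rc);
    return 0;
}
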
diff --git a/src/include/omni/graph_iterators.hpp b/src/include/omni/graph_iterators.hpp
deleted file mode 100644
index dab55f6..0000000
--- a/src/include/omni/graph_iterators.hpp
+++ /dev/null
@@ -1,446 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include "adt/queue_iterator.hpp"
-#include "io/read_processor.hpp"
-#include "pred.hpp"
-#include "action_handlers.hpp"
-#include "simple_tools.hpp"
-#include <boost/iterator/iterator_facade.hpp>
-
-namespace omnigraph {
-
-/**
- * SmartIterator is able to iterate through a collection whose content can change during
- * iteration. As a GraphActionHandler, SmartIterator keeps the collection contents in sync with
- * the way the graph is changed. The order of iteration can be defined by specifying a Comparator.
- */
-template<class Graph, typename ElementId, typename Comparator = std::less<ElementId>>
-class SmartIterator : public GraphActionHandler<Graph> {
- typedef GraphActionHandler<Graph> base;
- DynamicQueueIterator<ElementId, Comparator> inner_it_;
- bool add_new_;
- bool canonical_only_;
- //todo think of checking it in HandleAdd
- pred::TypedPredicate<ElementId> add_condition_;
-
-protected:
-
- void push(const ElementId& el) {
- if ((!canonical_only_ || el <= this->g().conjugate(el)) &&
- add_condition_(el)) {
- inner_it_.push(el);
- }
- }
-
- template<typename InputIterator>
- void insert(InputIterator begin, InputIterator end) {
- for (auto it = begin; it != end; ++it) {
- push(*it);
- }
- }
-
- void erase(const ElementId& el) {
- if (!canonical_only_ || el <= this->g().conjugate(el)) {
- inner_it_.erase(el);
- }
- }
-
- void clear() {
- inner_it_.clear();
- }
-
- SmartIterator(const Graph &g, const std::string &name, bool add_new,
- const Comparator& comparator, bool canonical_only,
- pred::TypedPredicate<ElementId> add_condition = pred::AlwaysTrue<ElementId>())
- : base(g, name),
- inner_it_(comparator),
- add_new_(add_new),
- canonical_only_(canonical_only),
- add_condition_(add_condition) {
- }
-
-public:
-
- bool canonical_only() const {
- return canonical_only_;
- }
-
- bool IsEnd() const {
- return inner_it_.IsEnd();
- }
-
- size_t size() const {
- return inner_it_.size();
- }
-
- ElementId operator*() {
- return *inner_it_;
- }
-
- void operator++() {
- ++inner_it_;
- }
-
- void HandleAdd(ElementId v) override {
- if (add_new_)
- push(v);
- }
-
- void HandleDelete(ElementId v) override {
- erase(v);
- }
-
- //use carefully!
- void ReleaseCurrent() {
- inner_it_.ReleaseCurrent();
- }
-
-};
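
The point of SmartIterator is that elements may be deleted from the underlying collection while iteration is in progress: deletions are routed through erase(), so the iterator never dereferences a removed element. A simplified, self-contained sketch of that pattern with a std::set as the queue (the deleted code uses DynamicQueueIterator and graph action handlers; SmartIntIterator below is purely illustrative and ignores several corner cases):

#include <iostream>
#include <set>

// A queue of elements that tolerates erasure during iteration:
// the "current" element is always *queue.begin(), and erase() plays
// the role of HandleDelete by removing an element before it is visited.
// (Simplified: assumes the current element itself is not erased mid-visit.)
class SmartIntIterator {
    std::set<int> queue_;
public:
    void push(int x) { queue_.insert(x); }
    void erase(int x) { queue_.erase(x); }
    bool IsEnd() const { return queue_.empty(); }
    int operator*() const { return *queue_.begin(); }
    void operator++() { queue_.erase(queue_.begin()); }
};

int main() {
    SmartIntIterator it;
    for (int i = 1; i <= 5; ++i) it.push(i);

    for (; !it.IsEnd(); ++it) {
        int x = *it;
        std::cout << "visiting " << x << "\n";   // prints 1, 3, 5
        // Simulate a graph modification that deletes element x + 1:
        // it will simply never be visited.
        it.erase(x + 1);
    }
    return 0;
}
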
-
-/**
- * SmartIterator acts both as a QueueIterator and a GraphActionHandler. As a QueueIterator it is
- * able to iterate through a collection whose content can change during iteration, and as a
- * GraphActionHandler it keeps the collection contents in sync with the way the graph is changed.
- * The order of iteration can be defined by specifying a Comparator.
- */
-template<class Graph, typename ElementId,
- typename Comparator = std::less<ElementId>>
-class SmartSetIterator : public SmartIterator<Graph, ElementId, Comparator> {
- typedef SmartIterator<Graph, ElementId, Comparator> base;
-
-public:
- SmartSetIterator(const Graph &g,
- bool add_new = false,
- const Comparator& comparator = Comparator(),
- bool canonical_only = false,
- pred::TypedPredicate<ElementId> add_condition = pred::AlwaysTrue<ElementId>())
- : base(g, "SmartSet " + ToString(this), add_new, comparator, canonical_only, add_condition) {
- }
-
- template<class Iterator>
- SmartSetIterator(const Graph &g, Iterator begin, Iterator end,
- bool add_new = false,
- const Comparator& comparator = Comparator(),
- bool canonical_only = false,
- pred::TypedPredicate<ElementId> add_condition = pred::AlwaysTrue<ElementId>())
- : SmartSetIterator(g, add_new, comparator, canonical_only, add_condition) {
- insert(begin, end);
- }
-
- template<typename InputIterator>
- void insert(InputIterator begin, InputIterator end) {
- base::insert(begin, end);
- }
-
- void push(const ElementId& el) {
- base::push(el);
- }
-
- void clear() {
- base::clear();
- }
-};
-
-/**
- * SmartVertexIterator iterates through the vertices of a graph. It listens to AddVertex/DeleteVertex
- * graph events and edits the set of vertices to iterate through accordingly. Note: high-level event
- * handlers are triggered before low-level event handlers such as HandleAdd/HandleDelete. Thus, if the
- * Comparator relies on a structure that is also updated by handlers, make sure that all information
- * is updated in the high-level event handlers.
- */
-template<class Graph, typename Comparator = std::less<typename Graph::VertexId> >
-class SmartVertexIterator : public SmartIterator<Graph,
- typename Graph::VertexId, Comparator> {
- public:
- typedef typename Graph::VertexId VertexId;
-
- static size_t get_id() {
- static size_t id = 0;
- return id++;
- }
-
- public:
- SmartVertexIterator(const Graph &g, const Comparator& comparator =
- Comparator(), bool canonical_only = false)
- : SmartIterator<Graph, VertexId, Comparator>(
- g, "SmartVertexIterator " + ToString(get_id()), true,
- comparator, canonical_only) {
- this->insert(g.begin(), g.end());
- }
-
-};
-
-//todo return verifies when they can be switched off
-template<class Graph>
-class GraphEdgeIterator : public boost::iterator_facade<GraphEdgeIterator<Graph>
- , typename Graph::EdgeId, boost::forward_traversal_tag
- , typename Graph::EdgeId> {
- typedef typename Graph::EdgeId EdgeId;
- typedef typename Graph::VertexIt const_vertex_iterator;
- typedef typename Graph::edge_const_iterator const_edge_iterator;
-
- const Graph& g_;
- const_vertex_iterator v_it_;
- const_edge_iterator e_it_;
- bool canonical_only_;
-
-public:
-
- GraphEdgeIterator(const Graph& g, const_vertex_iterator v_it, bool canonical_only = false)
- : g_(g),
- v_it_(v_it),
- canonical_only_(canonical_only) {
- if (v_it_ != g_.end()) {
- e_it_ = g_.out_begin(*v_it_);
- Skip();
- }
- }
-
-private:
-
- bool Canonical(EdgeId e) const {
- return e <= g_.conjugate(e);
- }
-
- friend class boost::iterator_core_access;
-
- void Skip() {
- //VERIFY(v_it_ != g_.end());
- while (true) {
- if (e_it_ == g_.out_end(*v_it_)) {
- v_it_++;
- if (v_it_ == g_.end())
- return;
- e_it_ = g_.out_begin(*v_it_);
- } else {
- if (!canonical_only_ || Canonical(*e_it_))
- return;
- else
- e_it_++;
- }
- }
- }
-
- void increment() {
- if (v_it_ == g_.end())
- return;
- e_it_++;
- Skip();
- }
-
- bool equal(const GraphEdgeIterator &other) const {
- if (other.v_it_ != v_it_)
- return false;
- if (v_it_ != g_.end() && other.e_it_ != e_it_)
- return false;
- if (other.canonical_only_ != canonical_only_)
- return false;
- return true;
- }
-
- EdgeId dereference() const {
- //VERIFY(v_it_ != g_.end());
- return *e_it_;
- }
-
-};
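
GraphEdgeIterator walks the outgoing edges of every vertex and, when canonical_only is set, keeps an edge e only if e <= conjugate(e), so each edge/conjugate pair is reported once. The filter in isolation, assuming a toy convention that pairs edge id 2k with 2k+1 (this pairing is an assumption of the sketch, not how the deleted code identifies conjugates):

#include <iostream>
#include <vector>

// Assume edge ids come in conjugate pairs (0,1), (2,3), (4,5), ...
static int Conjugate(int e) { return e ^ 1; }

// Keep only canonical edges, i.e. those with e <= Conjugate(e),
// mirroring GraphEdgeIterator::Canonical().
static std::vector<int> CanonicalOnly(const std::vector<int> &edges) {
    std::vector<int> result;
    for (int e : edges)
        if (e <= Conjugate(e))
            result.push_back(e);
    return result;
}

int main() {
    std::vector<int> all_edges = {0, 1, 2, 3, 4, 5};
    for (int e : CanonicalOnly(all_edges))
        std::cout << e << " ";      // prints: 0 2 4
    std::cout << "\n";
    return 0;
}
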
-
-template<class Graph>
-class ConstEdgeIterator {
- typedef typename Graph::EdgeId EdgeId;
- GraphEdgeIterator<Graph> begin_, end_;
-
- public:
- ConstEdgeIterator(const Graph &g, bool canonical_only = false)
- : begin_(g, g.begin(), canonical_only), end_(g, g.end(), canonical_only) {
- }
-
- bool IsEnd() const {
- return begin_ == end_;
- }
-
- EdgeId operator*() const {
- return *begin_;
- }
-
- const ConstEdgeIterator& operator++() {
- begin_++;
- return *this;
- }
-};
-
-/**
- * SmartEdgeIterator iterates through the edges of a graph. It listens to AddEdge/DeleteEdge
- * graph events and edits the set of edges to iterate through accordingly. Note: high-level event
- * handlers are triggered before low-level event handlers such as HandleAdd/HandleDelete. Thus, if the
- * Comparator relies on a structure that is also updated by handlers, make sure that all information
- * is updated in the high-level event handlers.
- */
-template<class Graph, typename Comparator = std::less<typename Graph::EdgeId> >
-class SmartEdgeIterator : public SmartIterator<Graph, typename Graph::EdgeId, Comparator> {
- typedef GraphEdgeIterator<Graph> EdgeIt;
- public:
- typedef typename Graph::EdgeId EdgeId;
-
- static size_t get_id() {
- static size_t id = 0;
- return id++;
- }
-
- public:
- SmartEdgeIterator(const Graph &g, Comparator comparator = Comparator(),
- bool canonical_only = false)
- : SmartIterator<Graph, EdgeId, Comparator>(
- g, "SmartEdgeIterator " + ToString(get_id()), true,
- comparator, canonical_only) {
- this->insert(EdgeIt(g, g.begin()), EdgeIt(g, g.end()));
-
-// for (auto it = graph.begin(); it != graph.end(); ++it) {
-// //todo: this solution doesn't work with parallel simplification
-// this->insert(graph.out_begin(*it), graph.out_end(*it));
-// //this does
-// //auto out = graph.OutgoingEdges(*it);
-// //this->base::insert(out.begin(), out.end());
-// }
- }
-};
-
-//todo move out
-template<class Graph>
-class ParallelEdgeProcessor {
- class ConstEdgeIteratorWrapper {
- public:
- typedef typename Graph::EdgeId ReadT;
-
- ConstEdgeIteratorWrapper(const Graph &g)
- : it_(g) {}
-
- bool eof() const { return it_.IsEnd(); }
-
- ConstEdgeIteratorWrapper& operator>>(typename Graph::EdgeId &val) {
- val = *it_;
- ++it_;
- return *this;
- }
-
- private:
- ConstEdgeIterator<Graph> it_;
- };
-
- public:
- ParallelEdgeProcessor(const Graph &g, unsigned nthreads)
- : rp_(nthreads), it_(g) {}
-
- template <class Processor>
- bool Run(Processor &op) { return rp_.Run(it_, op); }
-
- bool IsEnd() const { return it_.eof(); }
- size_t processed() const { return rp_.processed(); }
-
- private:
- hammer::ReadProcessor rp_;
- ConstEdgeIteratorWrapper it_;
-};
-
-//todo move out
-template<class Graph, class ElementId>
-class IterationHelper {
-};
-
-template<class Graph>
-class IterationHelper<Graph, typename Graph::VertexId> {
- const Graph& g_;
-public:
- typedef typename Graph::VertexId VertexId;
- typedef typename Graph::VertexIt const_vertex_iterator;
-
- IterationHelper(const Graph& g)
- : g_(g) {
- }
-
- const_vertex_iterator begin() const {
- return g_.begin();
- }
-
- const_vertex_iterator end() const {
- return g_.end();
- }
-
- std::vector<const_vertex_iterator> Chunks(size_t chunk_cnt) const {
- VERIFY(chunk_cnt > 0);
- if (chunk_cnt == 1) {
- return {begin(), end()};
- }
-
- //trying to split vertices into equal chunks, leftovers put into first chunk
- vector<const_vertex_iterator> answer;
- size_t vertex_cnt = g_.size();
- size_t chunk_size = vertex_cnt / chunk_cnt;
- auto it = g_.begin();
- answer.push_back(it);
- for (size_t i = 0; i + chunk_cnt * chunk_size < vertex_cnt; ++i) {
- it++;
- }
- if (chunk_size > 0) {
- size_t i = 0;
- do {
- ++it;
- if (++i % chunk_size == 0)
- answer.push_back(it);
- } while (it != g_.end());
-
- VERIFY(i == chunk_cnt * chunk_size);
- } else {
- VERIFY(it == g_.end());
- answer.push_back(it);
- }
- VERIFY(answer.back() == g_.end());
- return answer;
- }
-
-};
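
IterationHelper::Chunks splits the vertices into chunk_cnt pieces of roughly equal size, with the leftovers absorbed by the first chunk, and returns the boundary iterators. The same bookkeeping expressed with indices instead of set iterators (ChunkBoundaries is an illustrative simplification, not the deleted code):

#include <cassert>
#include <vector>

// Split n elements into chunk_cnt chunks of size n / chunk_cnt each;
// the n % chunk_cnt leftover elements are absorbed by the first chunk.
// Returns chunk_cnt + 1 boundary positions (0 .. n).
static std::vector<size_t> ChunkBoundaries(size_t n, size_t chunk_cnt) {
    std::vector<size_t> bounds;
    size_t chunk_size = n / chunk_cnt;
    size_t pos = 0;
    bounds.push_back(pos);
    pos += chunk_size + n % chunk_cnt;   // first chunk takes the leftovers
    bounds.push_back(pos);
    for (size_t i = 1; i < chunk_cnt; ++i) {
        pos += chunk_size;
        bounds.push_back(pos);
    }
    return bounds;
}

int main() {
    // 10 vertices into 3 chunks: sizes 4, 3, 3.
    std::vector<size_t> b = ChunkBoundaries(10, 3);
    assert(b.size() == 4);
    assert(b[0] == 0 && b[1] == 4 && b[2] == 7 && b[3] == 10);
    return 0;
}
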
-
-//todo move out
-template<class Graph>
-class IterationHelper<Graph, typename Graph::EdgeId> {
- typedef typename Graph::VertexId VertexId;
-
- const Graph& g_;
-public:
- typedef typename Graph::EdgeId EdgeId;
- typedef GraphEdgeIterator<Graph> const_edge_iterator;
-
- IterationHelper(const Graph& g)
- : g_(g) {
- }
-
- const_edge_iterator begin() const {
- return const_edge_iterator(g_, g_.begin());
- }
-
- const_edge_iterator end() const {
- return const_edge_iterator(g_, g_.end());
- }
-
- std::vector<omnigraph::GraphEdgeIterator<Graph>> Chunks(size_t chunk_cnt) const {
- if (chunk_cnt == 1) {
- return {begin(), end()};
- }
-
- vector<omnigraph::GraphEdgeIterator<Graph>> answer;
-
- for (auto v_it : IterationHelper<Graph, VertexId>(g_).Chunks(chunk_cnt)) {
- answer.push_back(omnigraph::GraphEdgeIterator<Graph>(g_, v_it));
- }
- return answer;
- }
-};
-
-}
diff --git a/src/include/omni/graph_processing_algorithm.hpp b/src/include/omni/graph_processing_algorithm.hpp
deleted file mode 100644
index 4c2c52c..0000000
--- a/src/include/omni/graph_processing_algorithm.hpp
+++ /dev/null
@@ -1,259 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include "graph_iterators.hpp"
-#include "graph_component.hpp"
-#include "coverage.hpp"
-#include "pred.hpp"
-#include "logger/logger.hpp"
-
-namespace omnigraph {
-
-template<class Graph>
-using HandlerF = std::function<void(typename Graph::EdgeId)>;
-
-template<class Graph>
-class EdgeProcessingAlgorithm {
- typedef typename Graph::EdgeId EdgeId;
- typedef pred::TypedPredicate<EdgeId> ProceedConditionT;
-
- Graph& g_;
- bool conjugate_symmetry_;
- protected:
-
- Graph& g() {
- return g_;
- }
-
- const Graph& g() const {
- return g_;
- }
-
- virtual bool ProcessEdge(EdgeId e) = 0;
-
- public:
- EdgeProcessingAlgorithm(Graph& g,
- bool conjugate_symmetry = false)
- : g_(g), conjugate_symmetry_(conjugate_symmetry) {
-
- }
-
- virtual ~EdgeProcessingAlgorithm() {
- }
-
-// bool conjugate_symmetry() const {
-// return conjugate_symmetry_;
-// }
-
- template<class Comparator = std::less<EdgeId>>
- bool Run(const Comparator& comp = Comparator(), ProceedConditionT proceed_condition = pred::AlwaysTrue<EdgeId>()) {
- bool triggered = false;
- for (auto it = g_.SmartEdgeBegin(comp, conjugate_symmetry_); !it.IsEnd(); ++it) {
- EdgeId e = *it;
- TRACE("Current edge " << g_.str(e));
- if (!proceed_condition(e)) {
- TRACE("Stop condition was reached.");
- break;
- }
-
- TRACE("Processing edge " << this->g().str(e));
- triggered |= ProcessEdge(e);
- };
- return triggered;
- }
-
- private:
- DECL_LOGGER("EdgeProcessingAlgorithm");
-};
-
-template<class Graph>
-class CountingCallback {
- typedef typename Graph::EdgeId EdgeId;
- bool report_on_destruction_;
- std::atomic<size_t> cnt_;
-
-public:
- CountingCallback(bool report_on_destruction = false) :
- report_on_destruction_(report_on_destruction), cnt_(0) {
- }
-
- ~CountingCallback() {
- if (report_on_destruction_)
- Report();
- }
-
- void HandleDelete(EdgeId /*e*/) {
- cnt_++;
- }
-
- void Report() {
- TRACE(cnt_ << " edges were removed.")
- cnt_ = 0;
- }
-
-private:
- DECL_LOGGER("CountingCallback");
-};
-
-template<class Graph>
-std::function<void(typename Graph::EdgeId)> AddCountingCallback(CountingCallback<Graph>& cnt_callback, std::function<void(typename Graph::EdgeId)> handler) {
- std::function<void(typename Graph::EdgeId)> cnt_handler = std::bind(&CountingCallback<Graph>::HandleDelete, std::ref(cnt_callback), std::placeholders::_1);
- return func::Composition<typename Graph::EdgeId>(handler, cnt_handler);
-}
-template<class Graph>
-void RemoveIsolatedOrCompress(Graph& g, typename Graph::VertexId v) {
- if (g.IsDeadStart(v) && g.IsDeadEnd(v)) {
- g.DeleteVertex(v);
- } else {
- g.CompressVertex(v);
- }
-}
-
-template<class Graph>
-class EdgeRemover {
- typedef typename Graph::EdgeId EdgeId;
- typedef typename Graph::VertexId VertexId;
- typedef std::function<void(EdgeId)> HandlerF;
-
- Graph& g_;
- HandlerF removal_handler_;
-
- public:
- EdgeRemover(Graph& g, HandlerF removal_handler = nullptr)
- : g_(g),
- removal_handler_(removal_handler) {
- }
-
- void DeleteEdge(EdgeId e) {
- VertexId start = g_.EdgeStart(e);
- VertexId end = g_.EdgeEnd(e);
- DeleteEdgeWithNoCompression(e);
- // NOTE: e here is already dead!
- TRACE("Compressing locality");
- if (!g_.RelatedVertices(start, end)) {
- TRACE("Vertices not related");
- TRACE("Processing end");
- RemoveIsolatedOrCompress(g_, end);
- TRACE("End processed");
- }
- TRACE("Processing start");
- RemoveIsolatedOrCompress(g_, start);
- TRACE("Start processed");
- }
-
- void DeleteEdgeWithNoCompression(EdgeId e) {
- TRACE("Deletion of edge " << g_.str(e));
- TRACE("Start " << g_.str(g_.EdgeStart(e)));
- TRACE("End " << g_.str(g_.EdgeEnd(e)));
- if (removal_handler_) {
- TRACE("Calling handler");
- removal_handler_(e);
- }
- TRACE("Deleting edge");
- g_.DeleteEdge(e);
- }
-
- private:
- DECL_LOGGER("EdgeRemover");
-};
-
-template<class Graph>
-class EdgeRemovingAlgorithm : public EdgeProcessingAlgorithm<Graph> {
- typedef EdgeProcessingAlgorithm<Graph> base;
- typedef typename Graph::EdgeId EdgeId;
-
- pred::TypedPredicate<EdgeId> remove_condition_;
- EdgeRemover<Graph> edge_remover_;
-
- protected:
- bool ProcessEdge(EdgeId e) {
- TRACE("Checking edge " << this->g().str(e) << " for the removal condition");
- if (remove_condition_(e)) {
- TRACE("Check passed, removing");
- edge_remover_.DeleteEdge(e);
- return true;
- }
- TRACE("Check not passed");
- return false;
- }
-
- public:
- EdgeRemovingAlgorithm(Graph& g,
- pred::TypedPredicate<EdgeId> remove_condition,
- std::function<void (EdgeId)> removal_handler = boost::none,
- bool conjugate_symmetry = false)
- : base(g, conjugate_symmetry),
- remove_condition_(remove_condition),
- edge_remover_(g, removal_handler) {}
-
- private:
- DECL_LOGGER("EdgeRemovingAlgorithm");
-};
-
-//todo rewrite with SmartSetIterator
-template<class Graph>
-class ComponentRemover {
- public:
- typedef typename Graph::EdgeId EdgeId;
- typedef typename Graph::VertexId VertexId;
- typedef std::function<void(const set<EdgeId>&)> HandlerF;
-
- private:
- Graph& g_;
- HandlerF removal_handler_;
-
- template<class ElemType>
- void InsertIfNotConjugate(set<ElemType>& elems, ElemType elem) {
- if (elems.count(g_.conjugate(elem)) == 0) {
- elems.insert(elem);
- }
- }
-
- public:
- ComponentRemover(Graph& g, HandlerF removal_handler = 0)
- : g_(g),
- removal_handler_(removal_handler) {
- }
-
- template<class EdgeIt>
- void DeleteComponent(EdgeIt begin, EdgeIt end, bool alter_vertices = true) {
- set<EdgeId> edges;
- set<VertexId> vertices;
-
- //cleaning conjugates and gathering vertices
- for (EdgeIt it = begin; it != end; ++it) {
- EdgeId e = *it;
- InsertIfNotConjugate(edges, e);
- InsertIfNotConjugate(vertices, g_.EdgeStart(e));
- InsertIfNotConjugate(vertices, g_.EdgeEnd(e));
- }
-
- if (removal_handler_) {
- removal_handler_(edges);
- }
-
- for (EdgeId e: edges) {
- g_.DeleteEdge(e);
- }
-
- if (alter_vertices) {
- for (VertexId v: vertices) {
- RemoveIsolatedOrCompress(g_, v);
- }
- }
- }
-
- template<class Container>
- void DeleteComponent(const Container& container, bool alter_vertices = true) {
- DeleteComponent(container.begin(), container.end(), alter_vertices);
- }
-
-};
-
-}
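
ComponentRemover::InsertIfNotConjugate ensures each edge/conjugate pair is deleted only once: an element is inserted only if its conjugate has not been collected yet. The same deduplication on integer ids, again under the toy 2k/2k+1 conjugate pairing assumed earlier:

#include <cassert>
#include <set>

static int Conjugate(int e) { return e ^ 1; }

// Insert elem only if its conjugate has not been collected yet,
// mirroring ComponentRemover::InsertIfNotConjugate.
static void InsertIfNotConjugate(std::set<int> &elems, int elem) {
    if (elems.count(Conjugate(elem)) == 0)
        elems.insert(elem);
}

int main() {
    std::set<int> edges;
    int to_delete[] = {4, 5, 2};     // 4 and 5 are a conjugate pair
    for (int e : to_delete)
        InsertIfNotConjugate(edges, e);
    assert(edges.size() == 2);       // only {4, 2} survive
    assert(edges.count(5) == 0);
    return 0;
}
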
diff --git a/src/include/omni/id_track_handler.hpp b/src/include/omni/id_track_handler.hpp
deleted file mode 100644
index f2486db..0000000
--- a/src/include/omni/id_track_handler.hpp
+++ /dev/null
@@ -1,110 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include <unordered_map>
-//#include "utils.hpp"
-#include "visualization/graph_labeler.hpp"
-#include "simple_tools.hpp"
-#include "action_handlers.hpp"
-using namespace omnigraph;
-
-namespace omnigraph {
-template<class Graph>
-class GraphElementFinder : public GraphActionHandler<Graph> {
-private:
- typedef typename Graph::VertexId VertexId;
- typedef typename Graph::EdgeId EdgeId;
- unordered_map<size_t, VertexId> id2vertex_;
- unordered_map<size_t, EdgeId> id2edge_;
-
-public:
- GraphElementFinder(const Graph &graph) : GraphActionHandler<Graph>(graph, "Graph element finder") {
- }
-
- virtual ~GraphElementFinder() {
- }
-
- virtual void HandleAdd(EdgeId e) {
-#pragma omp critical
- {
- id2edge_[e.int_id()] = e;
- }
- }
-
- virtual void HandleAdd(VertexId v) {
-#pragma omp critical
- {
- id2vertex_[v.int_id()] = v;
- }
- }
-
- virtual void HandleDelete(EdgeId e) {
- id2edge_[e.int_id()] = e;
- }
-
- virtual void HandleDelete(VertexId v) {
- id2vertex_[v.int_id()] = v;
- }
-
- VertexId ReturnVertexId(size_t id) const {
- auto it = id2vertex_.find(id);
- if(it == id2vertex_.end())
- return VertexId();
- else
- return it->second;
- }
-
- EdgeId ReturnEdgeId(size_t id) const {
- auto it = id2edge_.find(id);
- if(it == id2edge_.end())
- return EdgeId();
- else
- return it->second;
- }
-
- void Init() {
- for(auto it = this->g().begin(); it != this->g().end(); ++it) {
- HandleAdd(*it);
- for(auto eit = this->g().OutgoingEdges(*it).begin(); eit != this->g().OutgoingEdges(*it).end(); ++eit) {
- HandleAdd(*eit);
- }
- }
- }
-};
-
-template<class VertexId, class EdgeId>
-class BaseIdTrackHandler {
-public:
- BaseIdTrackHandler() {
- }
-
- size_t ReturnIntId(EdgeId e) const {
- return e.int_id();
- }
-
- size_t ReturnIntId(VertexId v) const {
- return v.int_id();
- }
-};
-
-template<class Graph>
-class IdTrackHandler : public BaseIdTrackHandler<typename Graph::VertexId, typename Graph::EdgeId> {
-private:
- typedef typename Graph::EdgeId EdgeId;
- typedef typename Graph::VertexId VertexId;
- const Graph &graph_;
-public:
- IdTrackHandler(const Graph& g) : graph_(g) {
- }
-
- ~IdTrackHandler() {
- }
-};
-
-}
diff --git a/src/include/omni/loop_killer.hpp b/src/include/omni/loop_killer.hpp
deleted file mode 100644
index 76bc8d8..0000000
--- a/src/include/omni/loop_killer.hpp
+++ /dev/null
@@ -1,218 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include "splitters.hpp"
-namespace omnigraph {
-
-template<class Graph>
-class AbstractLoopKiller {
-public:
- typedef typename Graph::VertexId VertexId;
- typedef typename Graph::EdgeId EdgeId;
-private:
- Graph &graph_;
-
- VertexId FindStart(set<VertexId> component_set) {
- VertexId result;
- for(auto it = component_set.begin(); it != component_set.end(); ++it) {
- vector<EdgeId> incoming = graph_.IncomingEdges(*it);
- for(auto eit = incoming.begin(); eit != incoming.end(); ++eit) {
- if(component_set.count(graph_.EdgeStart(*eit)) == 0) {
- if(result != VertexId()) {
- return VertexId();
- }
- result = *it;
- }
- }
- }
- return result;
- }
-
- VertexId FindFinish(set<VertexId> component_set) {
- VertexId result;
- for(auto it = component_set.begin(); it != component_set.end(); ++it) {
- for (auto I = graph_.out_begin(*it), E = graph_.out_end(*it); I != E; ++I) {
- if (component_set.count(graph_.EdgeEnd(*I)) == 0) {
- if (result != VertexId()) {
- return VertexId();
- }
- result = *it;
- }
- }
- }
- return result;
- }
-
-protected:
- const size_t splitting_edge_length_;
- const size_t max_component_size_;
-
- Graph &g() {
- return graph_;
- }
-
-public:
-
- AbstractLoopKiller(Graph &graph, size_t splitting_edge_length,
- size_t max_component_size) :
- graph_(graph), splitting_edge_length_(splitting_edge_length), max_component_size_(
- max_component_size) {
- }
-
- virtual ~AbstractLoopKiller() {
- }
-
-
-
- void KillAllLoops() {
- shared_ptr<GraphSplitter<Graph>> splitter_ptr = LongEdgesExclusiveSplitter<Graph>(graph_, splitting_edge_length_);
- GraphSplitter<Graph> &splitter = *splitter_ptr;
- while(splitter.HasNext()) {
- set<VertexId> component_set = splitter.Next().vertices();
- if(component_set.size() > max_component_size_)
- continue;
- VertexId start = FindStart(component_set);
- VertexId finish = FindFinish(component_set);
- if(start == VertexId() || finish == VertexId()) {
- continue;
- }
- KillLoop(start, finish, component_set);
- }
- CompressAllVertices(graph_);
- }
-
- virtual void KillLoop(VertexId start, VertexId finish, const set<VertexId> &component) = 0;
-};
-
-template<class Graph>
-class SimpleLoopKiller : public AbstractLoopKiller<Graph> {
-
-public:
- typedef typename Graph::VertexId VertexId;
- typedef typename Graph::EdgeId EdgeId;
-
-private:
- vector<EdgeId> FindPath(VertexId start, VertexId finish, const set<VertexId> &component) {
- set<VertexId> was;
- vector<EdgeId> rr = FindPath(start, finish, was, component);
- return vector<EdgeId>(rr.rbegin(), rr.rend());
- }
-
- vector<EdgeId> FindPath(VertexId start, VertexId finish, set<VertexId> &was, const set<VertexId> &component) {
- was.insert(start);
- if (start == finish)
- return {};
- for (auto I = this->g().out_begin(start), E = this->g().out_end(start); I != E; ++I) {
- EdgeId edge = *I;
- VertexId next = this->g().EdgeEnd(edge);
- if (next == finish) {
- return { edge };
- }
- if (was.count(next) == 0 && component.count(next) != 0) {
- vector<EdgeId> result = FindPath(next, finish, was, component);
- if (result.size() > 0) {
- result.push_back(edge);
- return result;
- }
- }
- }
- return {};
- }
-
- bool CheckNotMuchRemoved(const set<EdgeId> &edges, const set<VertexId> &component) {
- size_t sum = 0;
- for (auto it = component.begin(); it != component.end(); ++it) {
- for (auto I = this->g().out_begin(*it), E = this->g().out_end(*it); I != E; ++I) {
- EdgeId edge = *I;
- if (component.count(this->g().EdgeEnd(edge)) == 1 && edges.count(edge) == 0 ) {
- if (this->g().length(edge) > 500) {
- return false;
- }
- sum += this->g().length(edge);
- }
- }
- }
-// if(sum <= 3000) {
-// cout << sum << endl;
-// }
- return sum <= 3000;
- }
-
- void RemoveExtraEdges(const set<EdgeId> &edges, const set<VertexId> &component) {
- vector<VertexId> comp(component.begin(), component.end());
- vector<EdgeId> to_delete;
- for (auto it = comp.begin(); it != comp.end(); ++it) {
- for (auto I = this->g().out_begin(*it), E = this->g().out_end(*it); I != E; ++I) {
- EdgeId edge = *I;
- if (component.count(this->g().EdgeEnd(edge)) == 1 && edges.count(edge) == 0) {
- to_delete.push_back(edge);
- }
- }
- }
-
- SmartSetIterator<Graph, EdgeId> s(this->g(), to_delete.begin(), to_delete.end());
- while (!s.IsEnd()) {
- this->g().DeleteEdge(*s);
- ++s;
- }
- }
-
- void RemoveIsolatedVertices(set<VertexId> component) {
- SmartSetIterator<Graph, VertexId> s(this->g(), component.begin(), component.end());
- while (!s.IsEnd()) {
- if (this->g().IsDeadStart(*s) && this->g().IsDeadEnd(*s)) {
- this->g().DeleteVertex(*s);
- }
- ++s;
- }
- }
-
- bool CheckStrong(const set<VertexId> &component) {
- VertexId v = *(component.begin());
- for(auto it = component.begin(); it != component.end(); ++it) {
- if(v != *it && (FindPath(v, *it, component).size() == 0 || FindPath(*it, v, component).size() == 0)) {
- return false;
- }
- }
- return true;
- }
-
-public:
- SimpleLoopKiller(Graph &graph, size_t splitting_edge_length, size_t max_component_size) :
- AbstractLoopKiller<Graph>(graph, splitting_edge_length, max_component_size) {
- }
-
- virtual void KillLoop(VertexId start, VertexId finish, const set<VertexId> &component) {
- vector<EdgeId> path = FindPath(start, finish, component);
- set<EdgeId> edges(path.begin(), path.end());
- if(path.size() > 0 || start == finish) {
-// if(start != finish || component.size() > 2)
- if(/*!CheckStrong(component) || */!CheckNotMuchRemoved(edges, component)) {
- return;
- }
-/*
- cout << this->g().int_id(start) << " " << this->g().int_id(finish) << endl;
- cout << this->g().VertexNucls(start) << endl;
-
- for(auto it = component.begin(); it != component.end(); ++it) {
- vector<EdgeId> outgoing = this->g().OutgoingEdges(*it);
- for(auto eit = outgoing.begin(); eit != outgoing.end(); ++eit) {
- if(component.count(this->g().EdgeEnd(*eit)) == 1) {
- cout << this->g().int_id(*it) << " -> " << this->g().int_id(this->g().EdgeEnd(*eit)) << " : " << this->g().length(*eit) << " : " << edges.count(*eit) << " : " << this->g().int_id(*eit) << endl;
- }
- }
- }
-*/
- RemoveExtraEdges(edges, component);
- RemoveIsolatedVertices(component);
- }
- }
-
-};
-}
diff --git a/src/include/omni/loop_resolver.hpp b/src/include/omni/loop_resolver.hpp
deleted file mode 100644
index 931ef23..0000000
--- a/src/include/omni/loop_resolver.hpp
+++ /dev/null
@@ -1,75 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#ifndef LOOP_RESOLVER_H
-#define LOOP_RESOLVER_H
-
-#include <math.h>
-
-namespace omnigraph{
-
-template<class Graph>
-class LoopResolver{
- typedef typename Graph::EdgeId EdgeId;
- typedef typename Graph::VertexId VertexId;
-
- Graph &graph_;
- double allowCoverageVatiation_;
-public:
- LoopResolver(Graph& graph, double allowCoverageVatiation)
- :graph_(graph),allowCoverageVatiation_(allowCoverageVatiation){}
- void ResolveLoops()
- {
- for(auto iter = graph_.SmartEdgeBegin(); !iter.IsEnd(); ++iter)
- {
-// graph_.DeleteEdge(*iter);
-
- if(graph_.EdgeStart(*iter) == graph_.EdgeEnd(*iter))
- {
- INFO("LOOP RE:"<< graph_.length(*iter));
- VertexId loopNode = graph_.EdgeStart(*iter);
- // if the loop appears on a *simple* path.
- if((graph_.OutgoingEdgeCount(loopNode) == 2) && (graph_.IncomingEdgeCount(loopNode)==2) )
- {
- vector<EdgeId> inComingEdges = graph_.IncomingEdges(loopNode);
- EdgeId beforeLoopEdge = (inComingEdges[0] == *iter)? inComingEdges[1]: inComingEdges[0];
- vector<EdgeId> outGoingEdges = graph_.OutgoingEdges(loopNode);
- EdgeId afterLoopEdge = (outGoingEdges[0] == *iter) ? outGoingEdges[1]: outGoingEdges[0];
- double loopCoverage = graph_.coverage(*iter);
- double beforeLoopCoverage = graph_.coverage(beforeLoopEdge);
- double afterLoopCoverage = graph_.coverage(afterLoopEdge);
-
- double variance = (afterLoopCoverage + beforeLoopCoverage)/2.0 - loopCoverage ;
-
- INFO("LOOP RE D:"<<variance <<":"<< loopCoverage);
- if( fabs( variance) <= allowCoverageVatiation_*loopCoverage)
- {
- INFO("LOOP RESO D:"<<variance <<":"<< loopCoverage);
- VertexId addedVertex = graph_.AddVertex();
- VertexId afterLoopEdgeEndVertex = graph_.EdgeEnd(afterLoopEdge);
- EdgeId addedAfterLoopEdge = graph_.AddEdge( addedVertex, afterLoopEdgeEndVertex, graph_.EdgeNucls(afterLoopEdge));
- graph_.coverage_index().SetCoverage(addedAfterLoopEdge, graph_.coverage(afterLoopEdge) * graph_.length(afterLoopEdge) );
- graph_.DeleteEdge(afterLoopEdge);
-
- EdgeId transformedLoopEdge = graph_.AddEdge(loopNode, addedVertex, graph_.EdgeNucls(*iter));
- graph_.coverage_index().SetCoverage(transformedLoopEdge, graph_.coverage(*iter) * graph_.length(*iter));
- graph_.DeleteEdge(*iter);
-
- }
-
- }
- }
- }
- }
-
-
-};
-
-}
-
-#endif /* end of include guard: LOOP_RESOLVER_H */
-
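
LoopResolver only resolves a self-loop when its coverage agrees with the flanking edges: it computes variance = (before + after) / 2 - loop and requires fabs(variance) <= allowedVariation * loopCoverage. That acceptance test in isolation (the numeric values below are arbitrary assumptions):

#include <cassert>
#include <cmath>

// Coverage consistency test used when deciding whether to resolve a loop:
// the loop coverage must deviate from the mean of the flanking coverages
// by at most allowed_variation * loop_coverage.
static bool LoopCoverageConsistent(double before, double after, double loop,
                                   double allowed_variation) {
    double variance = (after + before) / 2.0 - loop;
    return std::fabs(variance) <= allowed_variation * loop;
}

int main() {
    // Flanks at ~30x, loop at 28x: accepted with a 20% allowed variation.
    assert(LoopCoverageConsistent(30.0, 32.0, 28.0, 0.2));
    // Loop at 10x next to ~30x flanks: rejected.
    assert(!LoopCoverageConsistent(30.0, 32.0, 10.0, 0.2));
    return 0;
}
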
diff --git a/src/include/omni/mapping_path.hpp b/src/include/omni/mapping_path.hpp
deleted file mode 100644
index ce73466..0000000
--- a/src/include/omni/mapping_path.hpp
+++ /dev/null
@@ -1,227 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include "range.hpp"
-
-namespace omnigraph {
-
-/**
- * This class is a representation of how a certain sequence is mapped to the genome. Needs further adjustment.
- */
-template<typename ElementId>
-class Path {
- std::vector<ElementId> sequence_;
- size_t start_pos_;
- size_t end_pos_;
- public:
- typedef typename vector<ElementId>::const_iterator iterator;
-
- Path(const vector<ElementId>& sequence, size_t start_pos, size_t end_pos)
- : sequence_(sequence), start_pos_(start_pos), end_pos_( end_pos) {
- }
-
- Path()
- : sequence_(),
- start_pos_(-1ul),
- end_pos_(-1ul) {
- }
-
- size_t start_pos() const { return start_pos_; }
- size_t end_pos() const { return end_pos_; }
-
- size_t size() const { return sequence_.size(); }
-
- const std::vector<ElementId>& sequence() const { return sequence_; }
- ElementId operator[](size_t index) const { return sequence_[index]; }
-
- iterator begin() const { return sequence_.begin(); }
- iterator end() const { return sequence_.end(); }
-};
-
-struct MappingRange {
-// on genome/contig/whatever
- Range initial_range;
-//on edge
- Range mapped_range;
-
- MappingRange() {
- }
-
- MappingRange(Range initial_range, Range mapped_range)
- : initial_range(initial_range), mapped_range(mapped_range) {}
-
- MappingRange(size_t i_start, size_t i_end, size_t m_start, size_t m_end)
- : initial_range(i_start, i_end), mapped_range(m_start, m_end) {}
-
- MappingRange Merge(const MappingRange &other) const {
- return MappingRange(initial_range.Merge(other.initial_range), mapped_range.Merge(other.mapped_range));
- }
-
- MappingRange ShiftInitial(int shift) const {
- MappingRange result(*this);
- result.initial_range.shift(shift);
- return result;
- }
-
- MappingRange Shift(int shift) const {
- VERIFY(initial_range.end_pos >= initial_range.start_pos);
- if(empty())
- return MappingRange();
- MappingRange result(*this);
- if(int(result.mapped_range.end_pos) <= -shift)
- return MappingRange();
- result.mapped_range.end_pos += shift;
- if(int(result.mapped_range.start_pos) <= -shift) {
- result.initial_range.start_pos -= result.mapped_range.start_pos + shift;
- if(result.initial_range.start_pos >= result.initial_range.end_pos)
- result.initial_range.start_pos = result.initial_range.end_pos - 1;
- result.mapped_range.start_pos = 0;
- } else {
- result.mapped_range.start_pos += shift;
- }
- return result;
- }
-
- MappingRange Fit(size_t length) const {
- VERIFY(initial_range.end_pos >= initial_range.start_pos);
- if(empty())
- return MappingRange();
- MappingRange result(*this);
- if(result.mapped_range.start_pos >= length)
- return MappingRange();
- if(result.mapped_range.end_pos >= length) {
- if(result.initial_range.end_pos + length < result.mapped_range.end_pos)
- return MappingRange();
- result.initial_range.end_pos -= result.mapped_range.end_pos - length;
- result.mapped_range.end_pos = length;
- }
- return result;
- }
-
- bool empty() const {
- return initial_range.empty() || mapped_range.empty();
- }
-
- bool operator<(const MappingRange &other) const {
- if(this->initial_range != other.initial_range)
- return this->initial_range < other.initial_range;
- return this->mapped_range < other.mapped_range;
- }
- MappingRange operator = (const MappingRange & other) {
- initial_range = other.initial_range;
- mapped_range = other.mapped_range;
- return *this;
- }
-
- bool Intersect(const MappingRange &other) {
- return initial_range.Intersect(other.initial_range) && mapped_range.Intersect(other.mapped_range);
- }
-
- bool IntersectLeftOf(const MappingRange &other) const {
- return initial_range.IntersectLeftOf(other.initial_range) && mapped_range.IntersectLeftOf(other.mapped_range);
- }
-
- bool StrictlyContinuesWith(const MappingRange &other, size_t max_gap, size_t gap_diff = 0) const {
- return this->initial_range.end_pos <= other.initial_range.start_pos
- && this->mapped_range.end_pos <= other.mapped_range.start_pos
- && other.initial_range.start_pos - this->initial_range.end_pos
- <= other.mapped_range.start_pos - this->mapped_range.end_pos + gap_diff
- && other.mapped_range.start_pos - this->mapped_range.end_pos
- <= other.initial_range.start_pos - this->initial_range.end_pos + gap_diff
- && other.initial_range.start_pos - this->initial_range.end_pos <= max_gap;
- }
-
- bool operator==(const MappingRange &that) const {
-        return initial_range == that.initial_range && mapped_range == that.mapped_range;
- }
-
- bool operator!=(const MappingRange &that) const {
- return !(*this == that);
- }
-
-};
-
-inline std::ostream& operator<<(std::ostream& os, const MappingRange& map_range) {
- os << map_range.initial_range << " --> " << map_range.mapped_range;
- return os;
-}
-
-template<typename ElementId>
-class MappingPath {
- public:
- MappingPath() {}
-
- MappingPath(const std::vector<ElementId>& edges,
- const std::vector<MappingRange> range_mappings)
- : edges_(edges),
- range_mappings_(range_mappings) {}
-
- size_t size() const { return edges_.size(); }
-
- std::pair<const ElementId, const MappingRange> operator[](size_t idx) const {
- return std::make_pair(edges_[idx], range_mappings_[idx]);
- }
-
- std::pair<const ElementId, const MappingRange> front() const {
- return std::make_pair(edges_.front(), range_mappings_.front());
- }
-
- std::pair<const ElementId, const MappingRange> back() const {
- return std::make_pair(edges_.back(), range_mappings_.back());
- }
-
- size_t start_pos() const {
- return range_mappings_.front().mapped_range.start_pos;
- }
-
- size_t end_pos() const {
- return range_mappings_.back().mapped_range.end_pos;
- }
-
- Path<ElementId> path() const {
- if (edges_.size() != 0)
- return Path<ElementId>(edges_,
- range_mappings_[0].mapped_range.start_pos,
- range_mappings_[range_mappings_.size() - 1].mapped_range.end_pos);
- else
- return Path<ElementId>();
- }
-
- const std::vector<ElementId>& simple_path() const {
- return edges_;
- }
-
- void join(const MappingPath<ElementId>& that, int pos_shift = 0) {
- for (size_t i = 0; i < that.size(); ++i) {
- edges_.push_back(that.edges_[i]);
- range_mappings_.push_back(that.range_mappings_[i].ShiftInitial(pos_shift));
- }
- }
-
- void push_back(ElementId id, MappingRange range) {
- edges_.push_back(id);
- range_mappings_.push_back(range);
- }
-
- private:
- std::vector<ElementId> edges_;
- std::vector<MappingRange> range_mappings_;
-};
-
-template <typename ElementId>
-inline std::ostream& operator<<(std::ostream& os, const MappingPath<ElementId>& mp) {
- os << "MappingPath ( ";
- for(size_t i = 0; i < mp.size(); i++) {
- os << mp[i] << " ";
- }
- os << " )";
- return os;
-}
-
-}
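
The MappingRange/MappingPath machinery above tracks every alignment twice: initial_range lives in genome/contig coordinates and mapped_range in edge coordinates, and StrictlyContinuesWith only accepts a successor whose gap is consistent on both axes. The following self-contained sketch mirrors that gap check with a simplified SimpleRange stand-in; it is an illustration of the idea, not the project's Range/MappingRange API (range.hpp is not shown in this diff).

// Stand-ins for Range and MappingRange, reduced to what the gap check needs.
#include <cstddef>
#include <iostream>

struct SimpleRange {
    std::size_t start_pos, end_pos;
};

struct SimpleMapping {
    SimpleRange initial;  // on genome/contig
    SimpleRange mapped;   // on edge
};

// Same idea as StrictlyContinuesWith: b must start after a ends in both
// coordinate systems, the two gaps must agree within gap_diff, and the
// genomic gap must not exceed max_gap.
bool strictly_continues(const SimpleMapping& a, const SimpleMapping& b,
                        std::size_t max_gap, std::size_t gap_diff = 0) {
    if (a.initial.end_pos > b.initial.start_pos) return false;
    if (a.mapped.end_pos > b.mapped.start_pos) return false;
    std::size_t genome_gap = b.initial.start_pos - a.initial.end_pos;
    std::size_t edge_gap = b.mapped.start_pos - a.mapped.end_pos;
    return genome_gap <= edge_gap + gap_diff &&
           edge_gap <= genome_gap + gap_diff &&
           genome_gap <= max_gap;
}

int main() {
    SimpleMapping a{{100, 150}, {0, 50}};
    SimpleMapping b{{155, 200}, {55, 100}};  // 5 bp gap on both coordinates
    std::cout << std::boolalpha << strictly_continues(a, b, /*max_gap=*/10) << "\n";  // true
    return 0;
}
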
diff --git a/src/include/omni/mf_ec_remover.hpp b/src/include/omni/mf_ec_remover.hpp
deleted file mode 100644
index ead646d..0000000
--- a/src/include/omni/mf_ec_remover.hpp
+++ /dev/null
@@ -1,508 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include "graph_processing_algorithm.hpp"
-
-namespace omnigraph {
-
-using std::set;
-using std::map;
-using std::vector;
-using std::pair;
-using std::queue;
-
-template<class Graph>
-class FlowGraph {
-public:
- typedef size_t FlowVertexId;
- typedef pair<FlowVertexId, FlowVertexId> FlowEdgeId;
-
-private:
- typedef typename Graph::VertexId OuterVertexId;
- map<OuterVertexId, FlowVertexId> vertex_mapping_;
- map<FlowVertexId, map<FlowVertexId, int>> capacities_;
- set<FlowVertexId> vertices_;
- size_t vertex_number_;
- FlowVertexId source_;
- FlowVertexId sink_;
-
- FlowVertexId AddVertex() {
- vertices_.insert(vertex_number_);
- capacities_[vertex_number_];
- vertex_number_++;
- return vertex_number_ - 1;
- }
-
- void PushFlow(FlowEdgeId edge, int capacity) {
- VERIFY(capacities_[EdgeStart(edge)][EdgeEnd(edge)] >= capacity);
- capacities_[EdgeStart(edge)][EdgeEnd(edge)] -= capacity;
- capacities_[EdgeEnd(edge)][EdgeStart(edge)] += capacity;
- }
-
- void AddEdge(FlowVertexId first, FlowVertexId second, int capacity = 10000) {
-        capacities_[first][second] += capacity; // operator[] creates an entry with default values when the key is not present yet
- capacities_[second][first] += 0;
- }
-
-public:
- FlowGraph() :
- vertex_number_(0), source_(AddVertex()), sink_(AddVertex()) {
- }
-
- FlowVertexId GetCorrespondingVertex(OuterVertexId v) const {
- return vertex_mapping_.find(v)->second;
- }
-
- bool HasCorrespondingVertex(OuterVertexId v) const {
-        return vertex_mapping_.find(v) != vertex_mapping_.end();
- }
-
- FlowVertexId AddVertex(OuterVertexId vertex) {
- FlowVertexId new_vertex = AddVertex();
- vertex_mapping_[vertex] = new_vertex;
- return new_vertex;
- }
-
- void AddEdge(OuterVertexId outer_first, OuterVertexId outer_second,
- int capacity = 10000) {
- VERIFY(
- vertex_mapping_.find(outer_first) != vertex_mapping_.end()
- && vertex_mapping_.find(outer_second)
- != vertex_mapping_.end());
- FlowVertexId first = vertex_mapping_[outer_first];
- FlowVertexId second = vertex_mapping_[outer_second];
- AddEdge(first, second, capacity);
- }
-
- void AddSource(OuterVertexId vertex, int capacity) {
- AddEdge(source_, GetCorrespondingVertex(vertex), capacity);
- }
-
- void AddSink(OuterVertexId vertex, int capacity) {
- AddEdge(GetCorrespondingVertex(vertex), sink_, capacity);
- }
-
- FlowVertexId Source() const {
- return source_;
- }
-
- FlowVertexId Sink() const {
- return sink_;
- }
-
- bool Connected(FlowVertexId start, FlowVertexId end) const {
- return capacities_.find(start) != capacities_.end()
- && capacities_.find(start)->second.find(end)
- != capacities_.find(start)->second.end()
- && capacities_.find(start)->second.find(end)->second > 0;
- }
-
- vector<FlowEdgeId> OutgoingEdges(FlowVertexId v) const {
- vector<FlowEdgeId> result;
- const map<FlowVertexId, int> &outgoing = capacities_.find(v)->second;
- for (auto it = outgoing.begin(); it != outgoing.end(); ++it) {
- if (it->second > 0) {
- result.push_back(make_pair(v, it->first));
- }
- }
- return result;
- }
-
- vector<FlowEdgeId> IncomingEdges(FlowVertexId v) const {
- vector<FlowEdgeId> result;
- const map<FlowVertexId, int> &outgoing = capacities_.find(v)->second;
- for (auto it = outgoing.begin(); it != outgoing.end(); ++it) {
- if (Connected(it->first, v)) {
- result.push_back(make_pair(it->first, v));
- }
- }
- return result;
- }
-
- size_t OutgoingEdgesCount(FlowVertexId v) const {
- return OutgoingEdges(v).size();
- }
-
- size_t IncomingEdgesCount(FlowVertexId v) const {
- return IncomingEdges(v).size();
- }
-
- FlowVertexId EdgeStart(FlowEdgeId edge) const {
- return edge.first;
- }
-
- FlowVertexId EdgeEnd(FlowEdgeId edge) const {
- return edge.second;
- }
-
- set<FlowVertexId>::iterator begin() const {
- return vertices_.begin();
- }
-
- set<FlowVertexId>::iterator end() const {
- return vertices_.end();
- }
-
- int GetCapacity(FlowVertexId first, FlowVertexId second) const {
- auto it1 = capacities_.find(first);
- if (it1 == capacities_.end())
- return 0;
- auto it2 = it1->second.find(second);
- if (it2 == it1->second.end())
- return 0;
- return it2->second;
- }
-
- void PushFlow(vector<FlowVertexId> path, int capacity) {
- size_t n = path.size();
- VERIFY(path[0] == source_ && path[n - 1] == sink_);
- for (size_t i = 0; i + 1 < n; i++) {
- PushFlow(make_pair(path[i], path[i + 1]), capacity);
- }
- }
-
-// void Print() const {
-// for(auto it = vertex_mapping_.begin(); it != vertex_mapping_.end(); ++it) {
-// TRACE(it->first << " " << it->second);
-// }
-// for(auto it = vertices_.begin(); it != vertices_.end();) {
-// auto out = OutgoingEdges(*it);
-// for(auto it1 = out.begin(); it1 != out.end(); ++it1) {
-// TRACE("edge " << (*it1) << " " << GetCapacity(*it, it1->second));
-// }
-// ++it;
-// if(it == vertices_.end())
-// break;
-// }
-// }
-};
-
-template<class Graph>
-class BFS {
-private:
- const Graph &graph_;
- typedef typename Graph::FlowVertexId FlowVertexId;
- typedef typename Graph::FlowEdgeId FlowEdgeId;
-
- vector<FlowVertexId> RestoreAnswer(FlowVertexId start, FlowVertexId end,
- const map<FlowVertexId, FlowVertexId> &prev) {
- vector<FlowVertexId> result;
- result.push_back(end);
- FlowVertexId current = end;
- while (current != start) {
- current = prev.find(current)->second;
- result.push_back(current);
- }
- return vector<FlowVertexId>(result.rbegin(), result.rend());
- }
-
-public:
- BFS(const Graph &graph) :
- graph_(graph) {
- }
-
- vector<FlowVertexId> Go(FlowVertexId start, FlowVertexId finish) {
- queue<FlowVertexId> q;
- q.push(start);
- map<FlowVertexId, FlowVertexId> prev;
- prev[start] = start;
- while (!q.empty()) {
- FlowVertexId current = q.front();
- q.pop();
- vector<FlowEdgeId> outgoing = graph_.OutgoingEdges(current);
- for (auto it = outgoing.begin(); it != outgoing.end(); ++it) {
- if (prev.find(it->second) == prev.end()) {
- q.push(it->second);
- prev[it->second] = current;
- }
- if (it->second == finish) {
- return RestoreAnswer(start, finish, prev);
- }
- }
- }
- return vector<FlowVertexId>();
- }
-};
-
-template<class Graph>
-class MaxFlowFinder {
-private:
- FlowGraph<Graph> &graph_;
- typedef typename FlowGraph<Graph>::FlowVertexId FlowVertexId;
- typedef typename FlowGraph<Graph>::FlowEdgeId FlowEdgeId;
-
- int MinCapacity(vector<FlowVertexId> path) {
- VERIFY(path.size() >= 2);
- int result = graph_.GetCapacity(path[0], path[1]);
- for (size_t i = 1; i + 1 < path.size(); i++) {
- result = std::min(result, graph_.GetCapacity(path[i], path[i + 1]));
- }
- return result;
- }
-
-public:
- MaxFlowFinder(FlowGraph<Graph> &graph) :
- graph_(graph) {
- }
-
- void Find() {
- BFS<FlowGraph<Graph> > bfs(graph_);
- while (true) {
- vector<FlowVertexId> path = bfs.Go(graph_.Source(), graph_.Sink());
- if (path.size() == 0)
- break;
- int capacity = MinCapacity(path);
- VERIFY(capacity > 0);
- graph_.PushFlow(path, capacity);
-// graph_.Print();
- }
- }
-};
-
-template<class Graph>
-class TopSorter {
-private:
- typedef typename Graph::FlowVertexId FlowVertexId;
- typedef typename Graph::FlowEdgeId FlowEdgeId;
- const Graph &graph_;
-
- void Find(FlowVertexId v, vector<FlowVertexId> &result, set<FlowVertexId> &visited) {
- visited.insert(v);
- vector<FlowEdgeId> outgoing = graph_.OutgoingEdges(v);
- for (auto it = outgoing.begin(); it != outgoing.end(); ++it) {
- FlowVertexId next = graph_.EdgeEnd(*it);
- if (visited.count(next) == 0) {
- Find(next, result, visited);
- }
- }
- result.push_back(v);
- }
-
-public:
- TopSorter(const Graph &graph) :
- graph_(graph) {
- }
-
- vector<FlowVertexId> Sort() {
- vector<FlowVertexId> result;
- set<FlowVertexId> visited;
- for (auto it = graph_.begin(); it != graph_.end(); ++it) {
- if (visited.count(*it) == 0) {
- Find(*it, result, visited);
- }
- }
- return result;
- }
-};
-
-template<class Graph>
-class ReverseDFSComponentFinder {
-private:
- typedef typename Graph::FlowVertexId FlowVertexId;
- typedef typename Graph::FlowEdgeId FlowEdgeId;
-
- const Graph &graph_;
-
- void Find(FlowVertexId v, map<FlowVertexId, size_t> &result, size_t cc) {
- result[v] = cc;
- vector<FlowEdgeId> incoming = graph_.IncomingEdges(v);
- for (auto it = incoming.begin(); it != incoming.end(); ++it) {
- FlowVertexId next = graph_.EdgeStart(*it);
- if (result.count(next) == 0) {
- Find(next, result, cc);
- }
- }
- }
-public:
- ReverseDFSComponentFinder(const Graph &graph) :
- graph_(graph) {
- }
-
- map<FlowVertexId, size_t> Find(const vector<FlowVertexId> &order) {
- size_t cc = 0;
- map<FlowVertexId, size_t> result;
- for (auto it = order.rbegin(); it != order.rend(); ++it) {
- if (result.count(*it) == 0) {
- Find(*it, result, cc);
- cc++;
- }
- }
- return result;
- }
-};
-
-template<class Graph>
-class StroglyConnectedComponentFinder {
-private:
- typedef typename Graph::FlowVertexId FlowVertexId;
- const Graph &graph_;
- bool ready_;
-public:
- StroglyConnectedComponentFinder(const Graph &graph) :
- graph_(graph), ready_(false) {
- }
-
- map<FlowVertexId, size_t> ColourComponents() {
- map<FlowVertexId, size_t> result;
- vector<FlowVertexId> order = TopSorter<Graph>(graph_).Sort();
- return ReverseDFSComponentFinder<Graph>(graph_).Find(order);
- }
-};
-
-template<class Graph>
-class MaxFlowECRemover {
-private:
- typedef typename Graph::EdgeId EdgeId;
- typedef typename Graph::VertexId VertexId;
- Graph& g_;
- size_t max_length_;
- size_t uniqueness_length_;
- size_t plausibility_length_;
- ComponentRemover<Graph> component_remover_;
-
- bool IsTerminal(VertexId vertex) {
- return g_.OutgoingEdgeCount(vertex)
- + g_.IncomingEdgeCount(vertex) == 1;
- }
-
- bool IsTip(EdgeId edge) {
- VertexId start = g_.EdgeStart(edge);
- VertexId end = g_.EdgeEnd(edge);
- return IsTerminal(start) || IsTerminal(end);
- }
-
-
- bool IsSuspicious(EdgeId edge) {
- return g_.length(edge) <= max_length_ && !IsTip(edge);
- }
-
- set<EdgeId> CollectUnusedEdges(set<VertexId> component, FlowGraph<Graph> fg,
- const map<typename FlowGraph<Graph>::FlowVertexId, size_t> &colouring) {
- set<EdgeId> result;
- for (auto it_start = component.begin(); it_start != component.end();
- ++it_start) {
- VertexId start = *it_start;
- auto outgoing = g_.OutgoingEdges(start);
- for (auto it_edge = outgoing.begin(); it_edge != outgoing.end();
- ++it_edge) {
- EdgeId edge = *it_edge;
- VertexId end = g_.EdgeEnd(edge);
- if (component.count(end) == 1 && IsSuspicious(edge)
- && colouring.find(fg.GetCorrespondingVertex(start))->second
- != colouring.find(
- fg.GetCorrespondingVertex(end))->second) {
- result.insert(edge);
- }
- }
- }
- return result;
- }
-
- bool CheckCompleteFlow(FlowGraph<Graph> &fg) {
- return fg.OutgoingEdges(fg.Source()).size() == 0
- && fg.IncomingEdges(fg.Sink()).size() == 0;
- }
-
- bool IsPlausible(EdgeId edge) {
- return g_.length(edge) >= plausibility_length_ && !IsTip(edge);
- }
-
- bool IsUnique(EdgeId edge) {
- return g_.length(edge) >= uniqueness_length_;
- }
-
- bool IsInnerShortEdge(set<VertexId> component, EdgeId edge) {
- return !IsUnique(edge) && component.count(g_.EdgeStart(edge)) == 1
- && component.count(g_.EdgeEnd(edge)) == 1;
- }
-
- void ProcessShortEdge(FlowGraph<Graph> &fg, set<VertexId> component,
- EdgeId edge) {
- if (IsInnerShortEdge(component, edge)) {
- fg.AddEdge(g_.EdgeStart(edge), g_.EdgeEnd(edge));
- }
- }
-
- void ProcessSource(FlowGraph<Graph> &fg, set<VertexId> /*component*/,
- EdgeId edge) {
- if (IsPlausible(edge) || IsUnique(edge)) {
- fg.AddSource(g_.EdgeEnd(edge), 1);
- }
- }
-
- void ProcessSink(FlowGraph<Graph> &fg, set<VertexId> /*component*/,
- EdgeId edge) {
- if (IsPlausible(edge) || IsUnique(edge)) {
- fg.AddSink(g_.EdgeStart(edge), 1);
- }
- }
-
- void ConstructFlowGraph(FlowGraph<Graph> &fg, set<VertexId> component) {
- for (auto it = component.begin(); it != component.end(); ++it) {
- fg.AddVertex(*it);
- }
- for (auto it = component.begin(); it != component.end(); ++it) {
- VertexId vertex = *it;
- auto outgoing = g_.OutgoingEdges(vertex);
- for (auto it_edge = outgoing.begin(); it_edge != outgoing.end();
- ++it_edge) {
- EdgeId edge = *it_edge;
- ProcessShortEdge(fg, component, edge);
- ProcessSink(fg, component, edge);
- }
- auto incoming = g_.IncomingEdges(vertex);
- for (auto it_edge = incoming.begin(); it_edge != incoming.end();
- ++it_edge) {
- EdgeId edge = *it_edge;
- ProcessSource(fg, component, edge);
- }
- }
- }
-
-public:
- MaxFlowECRemover(Graph& g, size_t max_length, size_t uniqueness_length,
- size_t plausibility_length, std::function<void (EdgeId)>
- /*fixme ignored, fix after merge with relative coverage branch!!! removal_handler*/) :
- g_(g), max_length_(max_length), uniqueness_length_(
- uniqueness_length), plausibility_length_(
- plausibility_length), component_remover_(g, (std::function<void (set<EdgeId>)>) 0) {
- VERIFY(uniqueness_length >= plausibility_length);
- VERIFY(plausibility_length > max_length);
- }
-
- bool Process() {
- for (shared_ptr<GraphSplitter<Graph>> splitter_ptr = LongEdgesExclusiveSplitter<Graph>(g_,
- uniqueness_length_); splitter_ptr->HasNext();) {
- set<VertexId> component = splitter_ptr->Next().vertices();
- FlowGraph<Graph> fg;
- ConstructFlowGraph(fg, component);
-// fg.Print();
- MaxFlowFinder<Graph> mf_finder(fg);
- mf_finder.Find();
- if (!CheckCompleteFlow(fg)) {
-                TRACE("Suspicious component! No edge deletion!");
- continue;
- }
- StroglyConnectedComponentFinder<FlowGraph<Graph>> component_finder(
- fg);
- map<typename FlowGraph<Graph>::FlowVertexId, size_t> colouring =
- component_finder.ColourComponents();
- set<EdgeId> to_remove = CollectUnusedEdges(component, fg,
- colouring);
- component_remover_.DeleteComponent(to_remove.begin(), to_remove.end(), false);
- }
- CompressAllVertices(g_);
- CleanGraph(g_);
-
- return false;
- }
-private:
- DECL_LOGGER("MaxFlowECRemover");
-};
-}
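
MaxFlowFinder above is the classic augmenting-path loop: BFS finds a source-to-sink path through edges with positive residual capacity, PushFlow subtracts the bottleneck along the path and adds it to the reverse edges, and the loop ends when BFS finds nothing, i.e. the Edmonds-Karp variant of Ford-Fulkerson. The sketch below reproduces that loop on a bare capacity map shaped like FlowGraph's capacities_; the vertex numbering and the toy graph are illustrative only.

#include <algorithm>
#include <cstddef>
#include <iostream>
#include <map>
#include <queue>
#include <vector>

using Vertex = std::size_t;
using CapacityMap = std::map<Vertex, std::map<Vertex, int>>;

// BFS over edges with remaining capacity; returns a source->sink path, or an
// empty vector when no augmenting path is left.
std::vector<Vertex> FindPath(const CapacityMap& cap, Vertex s, Vertex t) {
    std::map<Vertex, Vertex> prev;
    std::queue<Vertex> q;
    q.push(s);
    prev[s] = s;
    while (!q.empty()) {
        Vertex v = q.front(); q.pop();
        auto it = cap.find(v);
        if (it == cap.end()) continue;
        for (const auto& [u, c] : it->second) {
            if (c <= 0 || prev.count(u)) continue;
            prev[u] = v;
            if (u == t) {
                std::vector<Vertex> path{t};
                while (path.back() != s) path.push_back(prev[path.back()]);
                return {path.rbegin(), path.rend()};
            }
            q.push(u);
        }
    }
    return {};
}

int MaxFlow(CapacityMap cap, Vertex s, Vertex t) {
    int flow = 0;
    for (auto path = FindPath(cap, s, t); !path.empty(); path = FindPath(cap, s, t)) {
        int bottleneck = cap[path[0]][path[1]];
        for (std::size_t i = 1; i + 1 < path.size(); ++i)
            bottleneck = std::min(bottleneck, cap[path[i]][path[i + 1]]);
        for (std::size_t i = 0; i + 1 < path.size(); ++i) {
            cap[path[i]][path[i + 1]] -= bottleneck;  // forward residual
            cap[path[i + 1]][path[i]] += bottleneck;  // backward residual
        }
        flow += bottleneck;
    }
    return flow;
}

int main() {
    CapacityMap cap;
    cap[0][1] = 2; cap[0][2] = 2;  // vertex 0 plays the role of the source
    cap[1][3] = 1; cap[2][3] = 2;  // vertex 3 plays the role of the sink
    std::cout << MaxFlow(cap, 0, 3) << "\n";  // prints 3
    return 0;
}

Using BFS rather than DFS keeps augmenting paths shortest-first, which is what bounds the number of iterations in the Edmonds-Karp analysis.
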
diff --git a/src/include/omni/observable_graph.hpp b/src/include/omni/observable_graph.hpp
deleted file mode 100644
index 1189362..0000000
--- a/src/include/omni/observable_graph.hpp
+++ /dev/null
@@ -1,497 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include <vector>
-#include <set>
-#include <cstring>
-#include "logger/logger.hpp"
-#include "graph_core.hpp"
-#include "graph_iterators.hpp"
-
-namespace omnigraph {
-
-template<class DataMaster>
-class ObservableGraph: public GraphCore<DataMaster> {
-public:
- typedef GraphCore<DataMaster> base;
- typedef typename base::DataMasterT DataMasterT;
- typedef typename base::VertexData VertexData;
- typedef typename base::EdgeData EdgeData;
- typedef typename base::EdgeId EdgeId;
- typedef typename base::VertexId VertexId;
- typedef typename base::VertexIt VertexIt;
- typedef typename base::edge_const_iterator edge_const_iterator;
-
- typedef HandlerApplier<VertexId, EdgeId> Applier;
- typedef SmartVertexIterator<ObservableGraph> SmartVertexIt;
- typedef SmartEdgeIterator<ObservableGraph> SmartEdgeIt;
- typedef ConstEdgeIterator<ObservableGraph> ConstEdgeIt;
- typedef ActionHandler<VertexId, EdgeId> Handler;
-
-private:
- //todo switch to smart iterators
- mutable std::vector<Handler*> action_handler_list_;
- const HandlerApplier<VertexId, EdgeId> *applier_;
-
-public:
-//todo move to graph core
- typedef ConstructionHelper<DataMaster> HelperT;
-
- HelperT GetConstructionHelper() {
-// TODO: fix everything and restore this check
-// VERIFY(this->VerifyAllDetached());
- return HelperT(*this);
- }
-
- const Applier& GetHandlerApplier() const {
- return *applier_;
- }
-
- void AddActionHandler(Handler* action_handler) const;
-
- bool RemoveActionHandler(const Handler* action_handler) const;
-
- bool AllHandlersThreadSafe() const;
-
- // TODO: for debug. remove.
- void PrintHandlersNames() const;
-
- //todo make Fire* protected once again with helper friend class
- void FireAddVertex(VertexId v) const;
-
- void FireAddEdge(EdgeId e) const;
-
- void FireDeleteVertex(VertexId v) const;
-
- void FireDeleteEdge(EdgeId e) const;
-
- void FireMerge(std::vector<EdgeId> old_edges, EdgeId new_edge) const;
-
- void FireGlue(EdgeId new_edge, EdgeId edge1, EdgeId edge2) const;
-
- void FireSplit(EdgeId edge, EdgeId new_edge1, EdgeId new_edge2) const;
-
- bool VerifyAllDetached();
-
- //smart iterators
- template<typename Comparator>
- SmartVertexIterator<ObservableGraph, Comparator> SmartVertexBegin(
- const Comparator& comparator, bool canonical_only = false) const {
- return SmartVertexIterator<ObservableGraph, Comparator>(*this,
- comparator, canonical_only);
- }
-
- SmartVertexIterator<ObservableGraph> SmartVertexBegin(bool canonical_only = false) const {
- return SmartVertexIterator<ObservableGraph>(*this, std::less<VertexId>(), canonical_only);
- }
-
- template<typename Comparator>
- SmartEdgeIterator<ObservableGraph, Comparator> SmartEdgeBegin(
- const Comparator& comparator, bool canonical_only = false) const {
- return SmartEdgeIterator<ObservableGraph, Comparator>(*this, comparator, canonical_only);
- }
-
- SmartEdgeIterator<ObservableGraph> SmartEdgeBegin(bool canonical_only = false) const {
- return SmartEdgeIterator<ObservableGraph>(*this, std::less<EdgeId>(), canonical_only);
- }
-
- ConstEdgeIterator<ObservableGraph> ConstEdgeBegin(bool canonical_only = false) const {
- return ConstEdgeIterator<ObservableGraph>(*this, canonical_only);
- }
-
- void FireDeletePath(const std::vector<EdgeId>& edges_to_delete, const std::vector<VertexId>& vertices_to_delete) const;
-
- ObservableGraph(const DataMaster& master) :
- base(master), applier_(new PairedHandlerApplier<ObservableGraph>(*this)) {
- }
-
- virtual ~ObservableGraph();
-
- /////////////////////////graph operations
- //adding/removing vertices and edges
- VertexId AddVertex(const VertexData& data) {
- return AddVertex(data, GetGraphIdDistributor());
- }
-
- VertexId AddVertex(const VertexData& data, restricted::IdDistributor& id_distributor);
-
- void DeleteVertex(VertexId v);
-
- void ForceDeleteVertex(VertexId v);
-
- using base::GetGraphIdDistributor;
- using base::conjugate;
-
- EdgeId AddEdge(const EdgeData &data) {
- return AddEdge(data, GetGraphIdDistributor());
- }
-
- EdgeId AddEdge(const EdgeData& data, restricted::IdDistributor& id_distributor);
-
- EdgeId AddEdge(VertexId v1, VertexId v2, const EdgeData &data) {
- return AddEdge(v1, v2, data, GetGraphIdDistributor());
- }
-
- EdgeId AddEdge(VertexId v1, VertexId v2, const EdgeData& data, restricted::IdDistributor& id_distributor);
-
- void DeleteEdge(EdgeId e);
-
- void DeleteAllOutgoing(VertexId v);
-
- void DeleteAllIncoming(VertexId v);
-
- void CompressVertex(VertexId v);
-
- EdgeId UnsafeCompressVertex(VertexId v);
-
- std::vector<EdgeId> EdgesToDelete(const std::vector<EdgeId>& path) const;
-
- std::vector<VertexId> VerticesToDelete(const std::vector<EdgeId>& path) const;
-
- std::vector<EdgeId> CorrectMergePath(const std::vector<EdgeId>& path) const;
-
- EdgeId MergePath(const std::vector<EdgeId>& path, bool safe_merging = true);
-
- std::pair<EdgeId, EdgeId> SplitEdge(EdgeId edge, size_t position);
-
- EdgeId GlueEdges(EdgeId edge1, EdgeId edge2);
-
-private:
- DECL_LOGGER("ObservableGraph")
-};
-
-template<class DataMaster>
-typename ObservableGraph<DataMaster>::VertexId ObservableGraph<DataMaster>::AddVertex(const VertexData& data, restricted::IdDistributor& id_distributor) {
- VertexId v = base::HiddenAddVertex(data, id_distributor);
- FireAddVertex(v);
- return v;
-}
-
-template<class DataMaster>
-void ObservableGraph<DataMaster>::DeleteVertex(VertexId v) {
- VERIFY(base::IsDeadEnd(v) && base::IsDeadStart(v));
- VERIFY(v != VertexId(NULL));
- FireDeleteVertex(v);
- base::HiddenDeleteVertex(v);
-}
-
-template<class DataMaster>
-void ObservableGraph<DataMaster>::ForceDeleteVertex(VertexId v) {
- DeleteAllOutgoing(v);
- DeleteAllIncoming(v);
- DeleteVertex(v);
-}
-
-template<class DataMaster>
-typename ObservableGraph<DataMaster>::EdgeId ObservableGraph<DataMaster>::AddEdge(VertexId v1, VertexId v2, const EdgeData& data, restricted::IdDistributor& id_distributor) {
- EdgeId e = base::HiddenAddEdge(v1, v2, data, id_distributor);
- FireAddEdge(e);
- return e;
-}
-
-template<class DataMaster>
-typename ObservableGraph<DataMaster>::EdgeId ObservableGraph<DataMaster>::AddEdge(const EdgeData& data, restricted::IdDistributor& id_distributor) {
- EdgeId e = base::HiddenAddEdge(data, id_distributor);
- FireAddEdge(e);
- return e;
-}
-
-template<class DataMaster>
-void ObservableGraph<DataMaster>::DeleteEdge(EdgeId e) {
- FireDeleteEdge(e);
- base::HiddenDeleteEdge(e);
-}
-
-template<class DataMaster>
-void ObservableGraph<DataMaster>::DeleteAllOutgoing(VertexId v) {
- while (base::OutgoingEdgeCount(v) > 0) {
- EdgeId edge = *base::out_begin(v);
- DeleteEdge(edge);
- }
-}
-
-template<class DataMaster>
-void ObservableGraph<DataMaster>::DeleteAllIncoming(VertexId v) {
- while (base::IncomingEdgeCount(v) > 0) {
- EdgeId edge = *base::in_begin(v);
- DeleteEdge(edge);
- }
-}
-
-template<class DataMaster>
-void ObservableGraph<DataMaster>::CompressVertex(VertexId v) {
- //VERIFY(CanCompressVertex(v));
- if (base::CanCompressVertex(v)) {
- UnsafeCompressVertex(v);
- } else {
- TRACE("Vertex " << base::str(v) << " can't be compressed");
- }
-}
-
-template<class DataMaster>
-typename ObservableGraph<DataMaster>::EdgeId ObservableGraph<DataMaster>::UnsafeCompressVertex(VertexId v) {
- VERIFY(base::CanCompressVertex(v));
- std::vector<EdgeId> edges_to_merge;
- edges_to_merge.push_back(base::GetUniqueIncomingEdge(v));
- edges_to_merge.push_back(base::GetUniqueOutgoingEdge(v));
- return MergePath(edges_to_merge);
-}
-
-template<class DataMaster>
-std::vector<typename ObservableGraph<DataMaster>::EdgeId> ObservableGraph<DataMaster>::EdgesToDelete(const std::vector<EdgeId>& path) const {
- std::set<EdgeId> edgesToDelete;
- edgesToDelete.insert(path[0]);
- for (size_t i = 0; i + 1 < path.size(); i++) {
- EdgeId e = path[i + 1];
- if (edgesToDelete.find(base::conjugate(e)) == edgesToDelete.end())
- edgesToDelete.insert(e);
- }
- return std::vector<EdgeId>(edgesToDelete.begin(), edgesToDelete.end());
-}
-
-template<class DataMaster>
-vector<typename ObservableGraph<DataMaster>::VertexId> ObservableGraph<DataMaster>::VerticesToDelete(const vector<EdgeId>& path) const {
- std::set<VertexId> verticesToDelete;
- for (size_t i = 0; i + 1 < path.size(); i++) {
- EdgeId e = path[i + 1];
- VertexId v = base::EdgeStart(e);
- if (verticesToDelete.find(base::conjugate(v)) == verticesToDelete.end())
- verticesToDelete.insert(v);
- }
- return vector<VertexId>(verticesToDelete.begin(), verticesToDelete.end());
-}
-
-template<class DataMaster>
-void ObservableGraph<DataMaster>::AddActionHandler(Handler* action_handler) const {
-#pragma omp critical(action_handler_list_modification)
- {
- TRACE("Action handler " << action_handler->name() << " added");
- if (find(action_handler_list_.begin(), action_handler_list_.end(), action_handler) != action_handler_list_.end()) {
- VERIFY_MSG(false, "Action handler " << action_handler->name() << " has already been added");
- } else {
- action_handler_list_.push_back(action_handler);
- }
- }
-}
-
-template<class DataMaster>
-bool ObservableGraph<DataMaster>::RemoveActionHandler(const Handler* action_handler) const {
- bool result = false;
-#pragma omp critical(action_handler_list_modification)
- {
- auto it = std::find(action_handler_list_.begin(), action_handler_list_.end(), action_handler);
- if (it != action_handler_list_.end()) {
- action_handler_list_.erase(it);
- TRACE("Action handler " << action_handler->name() << " removed");
- result = true;
- } else {
- TRACE("Action handler " << action_handler->name() << " wasn't found among graph action handlers");
- }
- }
- return result;
-}
-
-template<class DataMaster>
-bool ObservableGraph<DataMaster>::AllHandlersThreadSafe() const {
- for (Handler* handler : action_handler_list_) {
- if (handler->IsAttached() && !handler->IsThreadSafe()) {
- return false;
- }
- }
- return true;
-}
-
-template<class DataMaster>
-void ObservableGraph<DataMaster>::PrintHandlersNames() const {
- for (Handler* handler : action_handler_list_) {
- std::cout << handler->name() << " attached=" << handler->IsAttached() << std::endl;
- }
-}
-
-template<class DataMaster>
-void ObservableGraph<DataMaster>::FireAddVertex(VertexId v) const {
- for (Handler* handler_ptr : action_handler_list_) {
- if (handler_ptr->IsAttached()) {
- TRACE("FireAddVertex to handler " << handler_ptr->name());
- applier_->ApplyAdd(*handler_ptr, v);
- }
- }
-}
-
-template<class DataMaster>
-void ObservableGraph<DataMaster>::FireAddEdge(EdgeId e) const {
- for (Handler* handler_ptr : action_handler_list_) {
- if (handler_ptr->IsAttached()) {
- TRACE("FireAddEdge to handler " << handler_ptr->name());
- applier_->ApplyAdd(*handler_ptr, e);
- }
- }
-}
-
-template<class DataMaster>
-void ObservableGraph<DataMaster>::FireDeleteVertex(VertexId v) const {
- for (auto it = action_handler_list_.rbegin(); it != action_handler_list_.rend(); ++it) {
- if ((*it)->IsAttached()) {
- applier_->ApplyDelete(**it, v);
- }
- }
-}
-
-template<class DataMaster>
-void ObservableGraph<DataMaster>::FireDeleteEdge(EdgeId e) const {
- for (auto it = action_handler_list_.rbegin(); it != action_handler_list_.rend(); ++it) {
- if ((*it)->IsAttached()) {
- applier_->ApplyDelete(**it, e);
- }
- };
-}
-
-template<class DataMaster>
-void ObservableGraph<DataMaster>::FireMerge(vector<EdgeId> old_edges, EdgeId new_edge) const {
- for (Handler* handler_ptr : action_handler_list_) {
- if (handler_ptr->IsAttached()) {
- applier_->ApplyMerge(*handler_ptr, old_edges, new_edge);
- }
- }
-}
-
-template<class DataMaster>
-void ObservableGraph<DataMaster>::FireGlue(EdgeId new_edge, EdgeId edge1, EdgeId edge2) const {
- for (Handler* handler_ptr : action_handler_list_) {
- if (handler_ptr->IsAttached()) {
- applier_->ApplyGlue(*handler_ptr, new_edge, edge1, edge2);
- }
- };
-}
-
-template<class DataMaster>
-void ObservableGraph<DataMaster>::FireSplit(EdgeId edge, EdgeId new_edge1, EdgeId new_edge2) const {
- for (Handler* handler_ptr : action_handler_list_) {
- if (handler_ptr->IsAttached()) {
- applier_->ApplySplit(*handler_ptr, edge, new_edge1, new_edge2);
- }
- }
-}
-
-template<class DataMaster>
-bool ObservableGraph<DataMaster>::VerifyAllDetached() {
- for (Handler* handler_ptr : action_handler_list_) {
- if (handler_ptr->IsAttached()) {
- return false;
- }
- }
- return true;
-}
-
-template<class DataMaster>
-void ObservableGraph<DataMaster>::FireDeletePath(const vector<EdgeId>& edgesToDelete, const vector<VertexId>& verticesToDelete) const {
- for (auto it = edgesToDelete.begin(); it != edgesToDelete.end(); ++it)
- FireDeleteEdge(*it);
- for (auto it = verticesToDelete.begin(); it != verticesToDelete.end(); ++it)
- FireDeleteVertex(*it);
-}
-
-template<class DataMaster>
-ObservableGraph<DataMaster>::~ObservableGraph<DataMaster>() {
- while (base::size() > 0) {
- ForceDeleteVertex(*base::begin());
- }
-}
-
-template<class DataMaster>
-vector<typename ObservableGraph<DataMaster>::EdgeId> ObservableGraph<DataMaster>::CorrectMergePath(const vector<EdgeId>& path) const {
- for (size_t i = 0; i < path.size(); i++) {
- if (path[i] == base::conjugate(path[i])) {
- vector<EdgeId> result;
- if (i < path.size() - 1 - i) {
- for (size_t j = 0; j < path.size(); j++)
- result.push_back(base::conjugate(path[path.size() - 1 - j]));
- i = path.size() - 1 - i;
- } else {
- result = path;
- }
- size_t size = 2 * i + 1;
- for (size_t j = result.size(); j < size; j++) {
- result.push_back(base::conjugate(result[size - 1 - j]));
- }
- return result;
- }
- }
- return path;
-}
-
-template<class DataMaster>
-typename ObservableGraph<DataMaster>::EdgeId ObservableGraph<DataMaster>::MergePath(const vector<EdgeId>& path, bool safe_merging) {
- VERIFY(!path.empty());
- for (size_t i = 0; i < path.size(); i++)
- for (size_t j = i + 1; j < path.size(); j++) {
- VERIFY(path[i] != path[j]);
- }
- if (path.size() == 1) {
- TRACE(
- "Path of single edge " << base::str(*(path.begin())) << ". Nothing to merge.");
- };
-    // cerr << "Merging " << PrintDetailedPath(path) << endl;
- // cerr << "Conjugate " << PrintConjugatePath(path) << endl;
- vector<EdgeId> corrected_path = CorrectMergePath(path);
- VertexId v1 = base::EdgeStart(corrected_path[0]);
- VertexId v2 = base::EdgeEnd(corrected_path[corrected_path.size() - 1]);
- vector<const EdgeData*> to_merge;
- for (auto it = corrected_path.begin(); it != corrected_path.end(); ++it) {
- to_merge.push_back(&(base::data(*it)));
- }
- EdgeId new_edge = base::HiddenAddEdge(v1, v2, base::master().MergeData(to_merge, safe_merging));
- FireMerge(corrected_path, new_edge);
- vector<EdgeId> edges_to_delete = EdgesToDelete(corrected_path);
- vector<VertexId> vertices_to_delete = VerticesToDelete(corrected_path);
- FireDeletePath(edges_to_delete, vertices_to_delete);
- FireAddEdge(new_edge);
- base::HiddenDeletePath(edges_to_delete, vertices_to_delete);
- return new_edge;
-}
-
-template<class DataMaster>
-std::pair<typename ObservableGraph<DataMaster>::EdgeId, typename ObservableGraph<DataMaster>::EdgeId> ObservableGraph<DataMaster>::SplitEdge(EdgeId edge, size_t position) {
- bool sc_flag = (edge == conjugate(edge));
- VERIFY_MSG(position > 0 && position < (sc_flag ? base::length(edge) / 2 + 1 : base::length(edge)),
- "Edge length is " << base::length(edge) << " but split pos was " << position);
- std::pair<VertexData, std::pair<EdgeData, EdgeData> > newData = base::master().SplitData(base::data(edge), position, sc_flag);
- VertexId splitVertex = base::HiddenAddVertex(newData.first);
- EdgeId new_edge1 = base::HiddenAddEdge(base::EdgeStart(edge), splitVertex, newData.second.first);
- EdgeId new_edge2 = base::HiddenAddEdge(splitVertex, sc_flag ? conjugate(splitVertex) : base::EdgeEnd(edge), newData.second.second);
- VERIFY(!sc_flag || new_edge2 == conjugate(new_edge2))
- FireSplit(edge, new_edge1, new_edge2);
- FireDeleteEdge(edge);
- FireAddVertex(splitVertex);
- FireAddEdge(new_edge1);
- FireAddEdge(new_edge2);
- base::HiddenDeleteEdge(edge);
- return make_pair(new_edge1, new_edge2);
-}
-
-template<class DataMaster>
-typename ObservableGraph<DataMaster>::EdgeId ObservableGraph<DataMaster>::GlueEdges(EdgeId edge1, EdgeId edge2) {
- EdgeId new_edge = base::HiddenAddEdge(base::EdgeStart(edge2), base::EdgeEnd(edge2), base::master().GlueData(base::data(edge1), base::data(edge2)));
- FireGlue(new_edge, edge1, edge2);
- FireDeleteEdge(edge1);
- FireDeleteEdge(edge2);
- FireAddEdge(new_edge);
- VertexId start = base::EdgeStart(edge1);
- VertexId end = base::EdgeEnd(edge1);
- base::HiddenDeleteEdge(edge1);
- base::HiddenDeleteEdge(edge2);
- if (base::IsDeadStart(start) && base::IsDeadEnd(start)) {
- DeleteVertex(start);
- }
- if (base::IsDeadStart(end) && base::IsDeadEnd(end)) {
- DeleteVertex(end);
- }
- return new_edge;
-}
-}
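
ObservableGraph above is an observer pattern: the graph owns a list of attached ActionHandlers, deletions are announced before the element disappears (FireDeleteEdge before HiddenDeleteEdge) and additions after the element exists (HiddenAddEdge before FireAddEdge), so indices and other helper structures stay consistent without the graph knowing about them. A minimal standalone sketch of that scheme, with illustrative names that are not part of the SPAdes API, looks like this:

#include <algorithm>
#include <iostream>
#include <string>
#include <vector>

// Interface every observer implements; only one event kind is shown.
struct EdgeEventHandler {
    virtual ~EdgeEventHandler() = default;
    virtual std::string name() const = 0;
    virtual void HandleDelete(int edge_id) = 0;
};

class TinyObservableGraph {
    std::vector<EdgeEventHandler*> handlers_;
public:
    void AddHandler(EdgeEventHandler* h) { handlers_.push_back(h); }
    void RemoveHandler(EdgeEventHandler* h) {
        handlers_.erase(std::remove(handlers_.begin(), handlers_.end(), h),
                        handlers_.end());
    }
    // Notify first, modify afterwards -- the same order as DeleteEdge above,
    // which fires the event before the hidden deletion so handlers can still
    // inspect the edge they are being told about.
    void DeleteEdge(int edge_id) {
        for (EdgeEventHandler* h : handlers_)
            h->HandleDelete(edge_id);
        // ... the actual removal of the edge would happen here ...
    }
};

struct LoggingHandler : EdgeEventHandler {
    std::string name() const override { return "logger"; }
    void HandleDelete(int edge_id) override {
        std::cout << name() << ": edge " << edge_id << " deleted\n";
    }
};

int main() {
    TinyObservableGraph g;
    LoggingHandler logger;
    g.AddHandler(&logger);
    g.DeleteEdge(42);  // prints "logger: edge 42 deleted"
    g.RemoveHandler(&logger);
    return 0;
}
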
diff --git a/src/include/omni/omni_tools.hpp b/src/include/omni/omni_tools.hpp
deleted file mode 100644
index 47d8a0d..0000000
--- a/src/include/omni/omni_tools.hpp
+++ /dev/null
@@ -1,411 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#ifndef OMNI_TOOLS_HPP_
-#define OMNI_TOOLS_HPP_
-
-#include "omni_utils.hpp"
-#include "simple_tools.hpp"
-
-#include "path_helper.hpp"
-#include "basic_edge_conditions.hpp"
-#include "graph_processing_algorithm.hpp"
-#include "parallel_processing.hpp"
-
-#ifdef USE_GLIBCXX_PARALLEL
-#include <parallel/algorithm>
-#endif
-
-namespace omnigraph {
-
-template<class Graph>
-class VertexCondition : public Predicate<typename Graph::VertexId> {
- typedef typename Graph::VertexId VertexId;
- const Graph& g_;
- protected:
-
- VertexCondition(const Graph& g)
- : g_(g) {
- }
-
- const Graph& g() const {
- return g_;
- }
-
-};
-
-template<class Graph>
-class CompressCondition : public VertexCondition<Graph> {
- typedef typename Graph::VertexId VertexId;
-
-public:
- CompressCondition(const Graph& g) :
- VertexCondition<Graph>(g) {
- }
-
- bool Check(VertexId v) const override {
- return this->g().CanCompressVertex(v);
- }
-};
-
-/**
- * Compressor compresses vertices with a unique incoming and a unique outgoing edge in linear time,
- * while naive one-by-one compression has quadratic complexity (see the sketch after this file's diff).
- */
-template<class Graph>
-class Compressor : public PersistentProcessingAlgorithm<Graph,
- typename Graph::VertexId,
- ParallelInterestingElementFinder<Graph, typename Graph::VertexId>> {
- typedef typename Graph::EdgeId EdgeId;
- typedef typename Graph::VertexId VertexId;
- typedef PersistentProcessingAlgorithm<Graph,
- VertexId, ParallelInterestingElementFinder<Graph, VertexId>> base;
- typedef CompressCondition<Graph> ConditionT;
-
- Graph &graph_;
- ConditionT compress_condition_;
- bool safe_merging_;
-
- bool GoUniqueWayForward(EdgeId &e) {
- VertexId u = graph_.EdgeEnd(e);
- if (!graph_.CheckUniqueOutgoingEdge(u)
- || !graph_.CheckUniqueIncomingEdge(u)) {
- return false;
- }
- e = graph_.GetUniqueOutgoingEdge(u);
- return true;
- }
-
- bool GoUniqueWayBackward(EdgeId &e) {
- VertexId u = graph_.EdgeStart(e);
- if (!graph_.CheckUniqueOutgoingEdge(u)
- || !graph_.CheckUniqueIncomingEdge(u)) {
- return false;
- }
- e = graph_.GetUniqueIncomingEdge(u);
- return true;
- }
-
-//do not use without checks:)
- EdgeId CompressWithoutChecks(VertexId v) {
-
- EdgeId e = graph_.GetUniqueOutgoingEdge(v);
- EdgeId start_edge = e;
- while (GoUniqueWayBackward(e) && e != start_edge
- && !graph_.RelatedVertices(graph_.EdgeStart(e),
- graph_.EdgeEnd(e))) {
- }
- vector<EdgeId> mergeList;
- // e = graph_.conjugate(e);
- start_edge = e;
- do {
- mergeList.push_back(e);
- } while (GoUniqueWayForward(e) && e != start_edge
- && !graph_.RelatedVertices(graph_.EdgeStart(e),
- graph_.EdgeEnd(e)));
- EdgeId new_edge = graph_.MergePath(mergeList, safe_merging_);
- TRACE("Vertex compressed and is now part of edge "
- << graph_.str(new_edge));
- return new_edge;
-
- }
-
-// //todo use graph method!
-// bool CanCompressVertex(VertexId v) const {
-// if (!graph_.CheckUniqueOutgoingEdge(v)
-// || !graph_.CheckUniqueIncomingEdge(v)) {
-// TRACE(
-// "Vertex "
-// << graph_.str(v)
-// << " judged NOT compressible. Proceeding to the next vertex");
-// TRACE("Processing vertex " << graph_.str(v) << " finished");
-// return false;
-// }
-// return true;
-// }
-public:
- Compressor(Graph &graph, size_t chunk_cnt = 1, bool safe_merging = true) :
- base(graph,
- ParallelInterestingElementFinder<Graph, VertexId>(graph,
- ConditionT(graph), chunk_cnt),
- /*canonical only*/true),
- graph_(graph),
- compress_condition_(graph),
- safe_merging_(safe_merging) {
- }
-
- /**
-     * Compresses the longest possible path containing the given vertex.
-     * @param v vertex to be compressed as part of a path
-     * @return true if the vertex could be compressed, false otherwise
- */
- bool CompressVertex(VertexId v) {
- TRACE("Processing vertex " << graph_.str(v) << " started");
- if (! compress_condition_.Check(v)) {
- return false;
- }
- TRACE("Vertex " << graph_.str(v) << " judged compressible");
- CompressWithoutChecks(v);
- return true;
- }
-
- EdgeId CompressVertexEdgeId(VertexId v) {
- TRACE("Processing vertex " << graph_.str(v) << " started");
- if (! compress_condition_.Check(v)) {
- return EdgeId(0);
- }
- TRACE("Vertex " << graph_.str(v) << " judged compressible");
- return CompressWithoutChecks(v);
- }
-
-// bool IsOfInterest(VertexId v) const {
-// return CanCompressVertex(v);
-// }
-
-protected:
- bool Process(VertexId v) override {
- if (compress_condition_.Check(v)) {
- CompressWithoutChecks(v);
- return true;
- } else {
- return false;
- }
- }
-
-private:
- DECL_LOGGER("Compressor")
-};
-
-/**
- * Method compresses all vertices which can be compressed.
- */
-template<class Graph>
-bool CompressAllVertices(Graph& g, bool safe_merging = true, size_t chunk_cnt = 1) {
- Compressor<Graph> compressor(g, chunk_cnt, safe_merging);
- return compressor.Run();
-}
-
-template<class Graph>
-class IsolatedVertexCondition : public VertexCondition<Graph> {
- typedef typename Graph::VertexId VertexId;
-
-public:
- IsolatedVertexCondition(const Graph& g) :
- VertexCondition<Graph>(g) {
- }
-
- bool Check(VertexId v) const override {
- return this->g().IsDeadStart(v) && this->g().IsDeadEnd(v);
- }
-};
-
-
-template<class Graph>
-class Cleaner : public PersistentProcessingAlgorithm<Graph,
- typename Graph::VertexId,
- ParallelInterestingElementFinder<Graph, typename Graph::VertexId>> {
- typedef typename Graph::EdgeId EdgeId;
- typedef typename Graph::VertexId VertexId;
- typedef PersistentProcessingAlgorithm<Graph,
- VertexId, ParallelInterestingElementFinder<Graph, VertexId>> base;
- typedef IsolatedVertexCondition<Graph> ConditionT;
-
- Graph& g_;
- ConditionT isolated_condition_;
-
-public:
- Cleaner(Graph& g, size_t chunk_cnt = 1) :
- base(g,
- ParallelInterestingElementFinder<Graph, VertexId>(g,
- ConditionT(g), chunk_cnt),
- /*canonical only*/true),
- g_(g), isolated_condition_(g) {
- }
-
-protected:
-
- bool Process(VertexId v) {
- if (isolated_condition_.Check(v)) {
- g_.DeleteVertex(v);
- return true;
- } else {
- return false;
- }
- }
-
-// void Clean() {
-// for (auto iter = graph_.SmartVertexBegin(); !iter.IsEnd(); ++iter) {
-// if (graph_.IsDeadStart(*iter) && graph_.IsDeadEnd(*iter)) {
-// graph_.DeleteVertex(*iter);
-// }
-// }
-// }
-
-};
-
-/**
- * Method removes isolated vertices from the graph.
- */
-template<class Graph>
-bool CleanGraph(Graph& g, size_t chunk_cnt = 1) {
- Cleaner<Graph> cleaner(g, chunk_cnt);
- return cleaner.Run();
-}
-
-template<class Graph>
-class AvgCovereageCounter {
-private:
- const Graph &graph_;
- const size_t min_length_;
-public:
- AvgCovereageCounter(const Graph &graph, size_t min_length = 0) :
- graph_(graph), min_length_(min_length) {
- }
-
- double Count() const {
- double cov = 0;
- size_t length = 0;
- for (auto it = graph_.ConstEdgeBegin(); !it.IsEnd(); ++it) {
- if (graph_.length(*it) >= min_length_) {
- cov += graph_.coverage(*it) * (double) graph_.length(*it);
- length += graph_.length(*it);
- }
- }
- if (length == 0)
- return 0.;
- return cov / (double) length;
- }
-};
-
-template<class Graph>
-class ErroneousConnectionThresholdFinder {
-private:
- typedef typename Graph::VertexId VertexId;
- typedef typename Graph::EdgeId EdgeId;
- const Graph &graph_;
- size_t backet_width_;
-
- bool IsInteresting(EdgeId e) const {
- if (graph_.length(e) > graph_.k() + 1)
- return false;
-
- if (graph_.OutgoingEdgeCount(graph_.EdgeStart(e)) < 2 ||
- graph_.IncomingEdgeCount(graph_.EdgeEnd(e)) < 2)
- return false;
-
- std::vector<EdgeId> v1;
- push_back_all(v1, graph_.OutgoingEdges(graph_.EdgeStart(e)));
- std::vector<EdgeId> v2;
- push_back_all(v2, graph_.IncomingEdges(graph_.EdgeEnd(e)));
-        bool eq = (v1.size() == 2 && v2.size() == 2) && ((v1[0] == v2[0] && v1[1] == v2[1]) || (v1[0] == v2[1] && v1[1] == v2[0]));
- return !eq;
- }
-
- double weight(size_t value, const map<size_t, size_t> &histogram,
- size_t backet_width) const {
- double result = 0;
- for (size_t i = 0; i < backet_width && value + i < histogram.size(); i++) {
- result += (double) (getValue(value + i, histogram) * std::min(i + 1, backet_width - i));
- }
- return result;
- }
-
- double Median(double thr = 500.0) const {
- vector<double> coverages;
- for (auto it = graph_.ConstEdgeBegin(); !it.IsEnd(); ++it) {
- if (graph_.length(*it) > thr)
- coverages.push_back(graph_.coverage(*it));
- }
-
- auto middle_it = coverages.begin() + coverages.size() / 2;
-#ifdef USE_GLIBCXX_PARALLEL
- __gnu_parallel::nth_element(coverages.begin(), middle_it, coverages.end());
-#else
- std::nth_element(coverages.begin(), middle_it, coverages.end());
-#endif
- return coverages[coverages.size() / 2];
- }
-
- size_t getValue(size_t arg, const map<size_t, size_t> &ssmap) const {
- auto it = ssmap.find(arg);
- if (it == ssmap.end())
- return 0;
- else
- return it->second;
- }
-
-public:
- ErroneousConnectionThresholdFinder(const Graph &graph, size_t backet_width = 0) :
- graph_(graph), backet_width_(backet_width) {
- }
-
- double AvgCoverage() const {
- double cov = 0;
- double length = 0;
- for (auto it = graph_.ConstEdgeBegin(); !it.IsEnd(); ++it) {
- cov += graph_.coverage(*it) * (double) graph_.length(*it);
- length += (double) graph_.length(*it);
- }
- return cov / length;
- }
-
- std::map<size_t, size_t> ConstructHistogram() const {
- std::map<size_t, size_t> result;
- for (auto it = graph_.ConstEdgeBegin(); !it.IsEnd(); ++it) {
- if (IsInteresting(*it))
- result[(size_t)graph_.coverage(*it)]++;
- }
- return result;
- }
-
- double FindThreshold(const map<size_t, size_t> &histogram) const {
- size_t backet_width = backet_width_;
- if (backet_width == 0) {
- backet_width = (size_t)(0.3 * AvgCovereageCounter<Graph>(graph_).Count() + 5);
- }
- size_t size = 0;
- if (histogram.size() != 0)
- size = histogram.rbegin()->first + 1;
- INFO("Bucket size: " << backet_width);
- size_t cnt = 0;
- for (size_t i = 1; i + backet_width < size; i++) {
- if (weight(i, histogram, backet_width) > weight(i - 1, histogram, backet_width))
- cnt++;
-
- if (i > backet_width &&
- weight(i - backet_width, histogram, backet_width) >
- weight(i - backet_width - 1, histogram, backet_width)) {
- cnt--;
- }
- if (2 * cnt >= backet_width)
- return (double) i;
-
- }
- INFO("Proper threshold was not found. Threshold set to 0.1 of average coverage");
- return 0.1 * AvgCovereageCounter<Graph>(graph_).Count();
- }
-
- double FindThreshold() const {
- INFO("Finding threshold started");
- std::map<size_t, size_t> histogram = ConstructHistogram(/*weights*/);
- for (size_t i = 0; i < histogram.size(); i++) {
- TRACE(i << " " << histogram[i]);
- }
- double result = FindThreshold(histogram);
- INFO("Average edge coverage: " << AvgCoverage());
- INFO("Graph threshold: " << result);
- result = std::max(AvgCoverage(), result);
- INFO("Threshold finding finished. Threshold is set to " << result);
- return result;
- }
-private:
- DECL_LOGGER("ThresholdFinder");
-};
-
-}
-
-#endif /* OMNI_TOOLS_HPP_ */
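
The linear-time claim in the Compressor comment above comes down to one trick: starting from any compressible vertex, GoUniqueWayBackward walks to the beginning of the unbranching chain and the merge then sweeps the whole chain forward once, so each vertex is handled a bounded number of times instead of re-merging overlapping pairs. A toy version of that walk, on a plain map-based chain rather than the SPAdes graph classes, might look like this:

#include <iostream>
#include <map>
#include <vector>

// Each vertex knows at most one unique predecessor/successor; -1 means
// "no unique neighbour" (a branching point or a chain end).
struct ChainGraph {
    std::map<int, int> next, prev;
    int Next(int v) const { auto it = next.find(v); return it == next.end() ? -1 : it->second; }
    int Prev(int v) const { auto it = prev.find(v); return it == prev.end() ? -1 : it->second; }
};

// Collect the maximal chain containing v: walk back to the start, then forward.
std::vector<int> CompressChain(const ChainGraph& g, int v) {
    int start = v;
    while (g.Prev(start) != -1 && g.Prev(start) != v)  // second test guards against cycles
        start = g.Prev(start);
    std::vector<int> chain{start};
    for (int cur = g.Next(start); cur != -1 && cur != start; cur = g.Next(cur))
        chain.push_back(cur);
    return chain;  // a real compressor would now merge these into a single edge
}

int main() {
    ChainGraph g;
    g.next = {{1, 2}, {2, 3}, {3, 4}};
    g.prev = {{2, 1}, {3, 2}, {4, 3}};
    for (int v : CompressChain(g, 3))  // starting mid-chain still yields 1 2 3 4
        std::cout << v << " ";
    std::cout << "\n";
    return 0;
}
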
diff --git a/src/include/omni/omni_utils.hpp b/src/include/omni/omni_utils.hpp
deleted file mode 100644
index 8962961..0000000
--- a/src/include/omni/omni_utils.hpp
+++ /dev/null
@@ -1,586 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#ifndef OMNI_UTILS_HPP_
-#define OMNI_UTILS_HPP_
-
-#include "standard_base.hpp"
-#include "simple_tools.hpp"
-#include "xmath.h"
-#include "graph_component.hpp"
-
-#include "omni/action_handlers.hpp"
-#include "omni/graph_iterators.hpp"
-#include "omni/mapping_path.hpp"
-
-#include <queue>
-
-#include <cmath>
-#include <ctime>
-
-namespace omnigraph {
-
-template<class Graph>
-struct CoverageComparator {
- private:
- typedef typename Graph::EdgeId EdgeId;
- typedef typename Graph::VertexId VertexId;
- const Graph& graph_;
- public:
- CoverageComparator(const Graph &graph)
- : graph_(graph) {
- }
-
- /**
- * Standard comparator function as used in collections.
- */
- bool operator()(EdgeId edge1, EdgeId edge2) const {
- if (math::eq(graph_.coverage(edge1), graph_.coverage(edge2))) {
- return edge1 < edge2;
- }
- return math::ls(graph_.coverage(edge1), graph_.coverage(edge2));
- }
-};
-
-/**
- * This class defines which edge is more likely to be a tip. Here we simply assume that shorter edges
- * are more likely to be tips than longer ones.
- */
-template<class Graph>
-struct LengthComparator {
- private:
- typedef typename Graph::EdgeId EdgeId;
- typedef typename Graph::VertexId VertexId;
- const Graph& graph_;
- public:
- /**
-     * TipComparator should never be created with the default constructor, but it is necessary in order for
-     * the code to compile.
- */
- // TipComparator() {
- // VERIFY(false);
- // }
- /**
-     * Construct a LengthComparator for the given graph
- * @param graph graph for which comparator is created
- */
- LengthComparator(const Graph &graph)
- : graph_(graph) {
- }
-
- /**
- * Standard comparator function as used in collections.
- */
- bool operator()(EdgeId edge1, EdgeId edge2) const {
- if (graph_.length(edge1) == graph_.length(edge2)) {
- return edge1 < edge2;
- }
- return graph_.length(edge1) < graph_.length(edge2);
- }
-};
-
-template<class Graph>
-size_t CumulativeLength(const Graph& g,
- const std::vector<typename Graph::EdgeId>& path) {
- size_t s = 0;
- for (auto it = path.begin(); it != path.end(); ++it)
- s += g.length(*it);
-
- return s;
-}
-
-template<class Graph>
-double AvgCoverage(const Graph& g,
- const std::vector<typename Graph::EdgeId>& path) {
- double unnormalized_coverage = 0;
- size_t path_length = 0;
- for (auto edge : path) {
- size_t length = g.length(edge);
- path_length += length;
- unnormalized_coverage += g.coverage(edge) * (double) length;
- }
- return unnormalized_coverage / (double) path_length;
-}
-
-template<class Graph>
-class AbstractDirection {
- private:
- typedef typename Graph::EdgeId EdgeId;
- typedef typename Graph::VertexId VertexId;
-
- const Graph& graph_;
-
- protected:
- const Graph &graph() const {
- return graph_;
- }
-
- public:
- AbstractDirection(const Graph& graph)
- : graph_(graph) {}
-
- virtual ~AbstractDirection() {}
-
- virtual const std::vector<EdgeId> OutgoingEdges(VertexId v) const = 0;
- virtual const std::vector<EdgeId> IncomingEdges(VertexId v) const = 0;
-
- virtual size_t OutgoingEdgeCount(VertexId v) const = 0;
- virtual size_t IncomingEdgeCount(VertexId v) const = 0;
-
- virtual VertexId EdgeStart(EdgeId edge) const = 0;
- virtual VertexId EdgeEnd(EdgeId edge) const = 0;
-
- bool CheckUniqueOutgoingEdge(VertexId v) const {
- return OutgoingEdgeCount(v) == 1;
- }
-
- EdgeId GetUniqueOutgoingEdge(VertexId v) const {
- return OutgoingEdges(v)[0];
- }
-
- bool CheckUniqueIncomingEdge(VertexId v) const {
- return IncomingEdgeCount(v) == 1;
- }
-
- EdgeId GetUniqueIncomingEdge(VertexId v) const {
- return IncomingEdges(v)[0];
- }
-
- virtual bool IsForward() const = 0;
-};
-
-template<class Graph>
-class ForwardDirection : public AbstractDirection<Graph> {
- private:
- typedef typename Graph::EdgeId EdgeId;
- typedef typename Graph::VertexId VertexId;
- public:
- ForwardDirection(const Graph &graph)
- : AbstractDirection<Graph>(graph) {
- }
-
- virtual const std::vector<EdgeId> OutgoingEdges(VertexId v) const {
- return std::vector<EdgeId>(this->graph().out_begin(v), this->graph().out_end(v));
- }
-
- virtual const std::vector<EdgeId> IncomingEdges(VertexId v) const {
- return std::vector<EdgeId>(this->graph().in_begin(v), this->graph().in_end(v));
- }
-
- virtual size_t OutgoingEdgeCount(VertexId v) const {
- return this->graph().OutgoingEdgeCount(v);
- }
-
- virtual size_t IncomingEdgeCount(VertexId v) const {
- return this->graph().IncomingEdgeCount(v);
- }
-
- virtual VertexId EdgeStart(EdgeId edge) const {
- return this->graph().EdgeStart(edge);
- }
-
- virtual VertexId EdgeEnd(EdgeId edge) const {
- return this->graph().EdgeEnd(edge);
- }
-
- bool IsForward() const {
- return true;
- }
-};
-
-template<class Graph>
-class BackwardDirection : public AbstractDirection<Graph> {
- private:
- typedef typename Graph::EdgeId EdgeId;
- typedef typename Graph::VertexId VertexId;
- public:
- BackwardDirection(const Graph &graph)
- : AbstractDirection<Graph>(graph) {
- }
-
- virtual const std::vector<EdgeId> OutgoingEdges(VertexId v) const {
- return std::vector<EdgeId>(this->graph().in_begin(v), this->graph().in_end(v));
- }
-
- virtual const std::vector<EdgeId> IncomingEdges(VertexId v) const {
- return std::vector<EdgeId>(this->graph().out_begin(v), this->graph().out_end(v));
- }
-
- virtual size_t OutgoingEdgeCount(VertexId v) const {
- return this->graph().IncomingEdgeCount(v);
- }
-
- virtual size_t IncomingEdgeCount(VertexId v) const {
- return this->graph().OutgoingEdgeCount(v);
- }
-
- virtual VertexId EdgeStart(EdgeId edge) const {
- return this->graph().EdgeEnd(edge);
- }
-
- virtual VertexId EdgeEnd(EdgeId edge) const {
- return this->graph().EdgeStart(edge);
- }
-
- bool IsForward() const {
- return false;
- }
-
-};
-
-template<class Graph>
-class UniquePathFinder {
- typedef typename Graph::EdgeId EdgeId;
- typedef typename Graph::VertexId VertexId;
-
- const Graph& graph_;
- public:
- //todo use length bound if needed
- UniquePathFinder(const Graph& graph, size_t /*length_bound*/ =
- std::numeric_limits<size_t>::max())
- : graph_(graph) {}
-
- std::vector<EdgeId> operator()(EdgeId e,
- const AbstractDirection<Graph> &direction) const {
- std::vector<EdgeId> answer;
- EdgeId curr = e;
- answer.push_back(curr);
- std::set<EdgeId> was;
- while (direction.CheckUniqueOutgoingEdge(direction.EdgeEnd(curr))) {
- curr = direction.GetUniqueOutgoingEdge(direction.EdgeEnd(curr));
- if (was.count(curr) > 0)
- break;
- was.insert(curr);
- answer.push_back(curr);
- }
- return answer;
- }
-
- std::vector<EdgeId> UniquePathForward(EdgeId e) const {
- return this->operator()(e, ForwardDirection<Graph>(graph_));
- }
-
- std::vector<EdgeId> UniquePathBackward(EdgeId e) const {
- auto tmp = this->operator()(e, BackwardDirection<Graph>(graph_));
- return std::vector<EdgeId>(tmp.rbegin(), tmp.rend());
- }
-
-};
-
-template<class Graph>
-class TrivialPathFinder {
- typedef typename Graph::EdgeId EdgeId;
- typedef typename Graph::VertexId VertexId;
-
- public:
- TrivialPathFinder(const Graph&, size_t = 0) {}
-
- std::vector<EdgeId> operator()(EdgeId e, const AbstractDirection<Graph> &) const {
- return {e};
- }
-
-};
-
-template<class Graph>
-class PlausiblePathFinder {
- typedef typename Graph::EdgeId EdgeId;
- typedef typename Graph::VertexId VertexId;
-
- //todo remove graph_ field???
- const Graph& graph_;
- const size_t length_bound_;
-
- class DFS {
- private:
- const Graph &graph_;
- const AbstractDirection<Graph> &direction_;
- const size_t length_bound_;
-
- std::pair<size_t, EdgeId> find(EdgeId edge, size_t length) {
- length += graph_.length(edge);
- VertexId cross = direction_.EdgeEnd(edge);
- auto result = make_pair(length, edge);
- if (length < length_bound_
- && direction_.CheckUniqueIncomingEdge(cross)) {
- std::vector<EdgeId> outgoing = direction_.OutgoingEdges(cross);
- for (auto it = outgoing.begin(); it != outgoing.end(); ++it) {
- auto candidate = find(*it, length);
- if (candidate.first > result.first)
- result = candidate;
- }
- }
- return result;
- }
-
- std::vector<EdgeId> RestoreAnswer(EdgeId start, EdgeId end) {
- std::vector<EdgeId> result;
- while (end != start) {
- result.push_back(end);
- end = direction_.GetUniqueIncomingEdge(direction_.EdgeStart(end));
- }
- result.push_back(start);
- return std::vector<EdgeId>(result.rbegin(), result.rend());
- }
-
- public:
- DFS(const Graph &graph, const AbstractDirection<Graph> &direction,
- size_t length_bound)
- : graph_(graph),
- direction_(direction),
- length_bound_(length_bound) {
- }
-
- std::vector<EdgeId> find(EdgeId edge) {
- return RestoreAnswer(edge, find(edge, 0).second);
- }
- };
-
- public:
- PlausiblePathFinder(const Graph& graph, size_t length_bound)
- : graph_(graph),
- length_bound_(length_bound) {}
-
- std::vector<EdgeId> operator()(EdgeId e,
- const AbstractDirection<Graph> &direction) const {
- return DFS(graph_, direction, length_bound_).find(e);
- }
-
-};
-
-template<class Graph>
-class MultiplicityCounter {
- private:
- typedef typename Graph::VertexId VertexId;
- typedef typename Graph::EdgeId EdgeId;
- const Graph &graph_;
- size_t uniqueness_length_;
- size_t max_depth_;
-
- bool search(VertexId a, VertexId start, EdgeId e, size_t depth,
- std::set<VertexId> &was, pair<size_t, size_t> &result) const {
- if (depth > max_depth_)
- return false;
- if (was.count(a) == 1)
- return true;
- was.insert(a);
- if (graph_.OutgoingEdgeCount(a) == 0
- || graph_.IncomingEdgeCount(a) == 0)
- return false;
- for (auto I = graph_.out_begin(a), E = graph_.out_end(a); I != E; ++I) {
- if (*I == e) {
- if (a != start) {
- return false;
- }
- } else {
- if (graph_.length(*I) >= uniqueness_length_) {
- result.second++;
- } else {
- if (!search(graph_.EdgeEnd(*I), start, e,
- depth + 1 /*graph_.length(*it)*/, was, result))
- return false;
- }
- }
- }
- for (EdgeId in_e : graph_.IncomingEdges(a)) {
- if (in_e == e) {
- if (a != start) {
- return false;
- }
- } else {
- if (graph_.length(in_e) >= uniqueness_length_) {
- result.first++;
- } else {
- if (!search(graph_.EdgeStart(in_e), start, e,
- depth + 1 /*graph_.length(*it)*/, was, result))
- return false;
- }
- }
- }
- return true;
- }
-
- public:
- MultiplicityCounter(const Graph &graph, size_t uniqueness_length,
- size_t max_depth)
- : graph_(graph),
- uniqueness_length_(uniqueness_length),
- max_depth_(max_depth) {
- }
-
- size_t count(EdgeId e, VertexId start) const {
- std::pair<size_t, size_t> result;
- std::set<VertexId> was;
- bool valid = search(start, start, e, 0, was, result);
- if (!valid) {
- return (size_t) (-1);
- }
- if (graph_.EdgeStart(e) == start) {
- if (result.first < result.second) {
- return (size_t) (-1);
- }
- return result.first - result.second;
- } else {
- if (result.first > result.second) {
- return (size_t) (-1);
- }
- return result.second - result.first;
- }
- }
-};
-
-template<class Graph>
-class DominatedSetFinder {
- typedef typename Graph::VertexId VertexId;
- typedef typename Graph::EdgeId EdgeId;
-
- const Graph& g_;
- VertexId start_vertex_;
- size_t max_length_;
- size_t max_count_;
-
- size_t cnt_;
- std::map<VertexId, Range> dominated_;
-
- bool CheckCanBeProcessed(VertexId v) const {
- DEBUG("Check if vertex " << g_.str(v) << " is a dominated close neighbour");
- for (EdgeId e : g_.IncomingEdges(v)) {
- if (dominated_.count(g_.EdgeStart(e)) == 0) {
- DEBUG( "Blocked by external vertex " << g_.int_id(g_.EdgeStart(e)) << " that starts edge " << g_.int_id(e));
- DEBUG("Check fail");
- return false;
- }
- }
- DEBUG("Check ok");
- return true;
- }
-
- void UpdateCanBeProcessed(VertexId v,
- std::queue<VertexId>& can_be_processed) const {
- DEBUG("Updating can be processed");
- for (EdgeId e : g_.OutgoingEdges(v)) {
- DEBUG("Considering edge " << ToString(e));
- VertexId neighbour_v = g_.EdgeEnd(e);
- if (CheckCanBeProcessed(neighbour_v)) {
- can_be_processed.push(neighbour_v);
- }
- }
- }
-
- Range NeighbourDistanceRange(VertexId v, bool dominated_only = true) const {
- DEBUG("Counting distance range for vertex " << g_.str(v));
- size_t min = numeric_limits<size_t>::max();
- size_t max = 0;
- VERIFY(g_.IncomingEdgeCount(v) > 0);
- VERIFY(!dominated_only || CheckCanBeProcessed(v));
- for (EdgeId e : g_.IncomingEdges(v)) {
- //in case of dominated_only == false
- if (dominated_.count(g_.EdgeStart(e)) == 0)
- continue;
- Range range = dominated_.find(g_.EdgeStart(e))->second;
- range.shift((int) g_.length(e));
- DEBUG("Edge " << g_.str(e) << " provide distance range " << range);
- if (range.start_pos < min)
- min = range.start_pos;
- if (range.end_pos > max)
- max = range.end_pos;
- }
- VERIFY((max > 0) && (min < numeric_limits<size_t>::max()) && (min <= max));
- Range answer(min, max);
- DEBUG("Range " << answer);
- return answer;
- }
-
- bool CheckNoEdgeToStart(VertexId v) {
- for (EdgeId e : g_.OutgoingEdges(v)) {
- if (g_.EdgeEnd(e) == start_vertex_) {
- return false;
- }
- }
- return true;
- }
-
- public:
- DominatedSetFinder(const Graph& g, VertexId v, size_t max_length = -1ul,
- size_t max_count = -1ul)
- : g_(g),
- start_vertex_(v),
- max_length_(max_length),
- max_count_(max_count),
- cnt_(0) {
-
- }
-
- //true if no thresholds exceeded
- bool FillDominated() {
- DEBUG("Adding starting vertex " << g_.str(start_vertex_) << " to dominated set");
- dominated_.insert(make_pair(start_vertex_, Range(0, 0)));
- cnt_++;
- std::queue<VertexId> can_be_processed;
- UpdateCanBeProcessed(start_vertex_, can_be_processed);
- while (!can_be_processed.empty()) {
- if (++cnt_ > max_count_) {
- return false;
- }
- VertexId v = can_be_processed.front();
- can_be_processed.pop();
- Range r = NeighbourDistanceRange(v);
- if (r.start_pos > max_length_) {
- return false;
- }
- //Currently dominated vertices cannot have an edge to the start vertex
- if (CheckNoEdgeToStart(v)) {
- DEBUG("Adding vertex " << g_.str(v) << " to dominated set");
- dominated_.insert(make_pair(v, r));
- UpdateCanBeProcessed(v, can_be_processed);
- }
- }
- return true;
- }
-
- const map<VertexId, Range>& dominated() const {
- return dominated_;
- }
-
- GraphComponent<Graph> AsGraphComponent() const {
- set<VertexId> vertices = key_set(dominated_);
- return GraphComponent<Graph>(g_, vertices.begin(), vertices.end());
- }
-
- //of little meaning if FillDominated returned false
- const map<VertexId, Range> CountBorder() const {
- map<VertexId, Range> border;
- for (VertexId v : key_set(dominated_)) {
- for (EdgeId e : g_.OutgoingEdges(v)) {
- VertexId e_end = g_.EdgeEnd(e);
- if (dominated_.count(e_end) == 0) {
- border[e_end] = NeighbourDistanceRange(e_end, false);
- }
- }
- }
- return border;
- }
-
-};
-
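The FillDominated() loop above grows the dominated set breadth-first: a vertex is accepted only once all of its predecessors are already dominated, and every acceptance re-enqueues the vertex's successors for another check. A stripped-down sketch of that loop, without the distance ranges and the max_length/max_count limits (the integer vertex ids and the small DAG are invented for the example):

    #include <cstdio>
    #include <map>
    #include <queue>
    #include <set>
    #include <vector>

    int main() {
        // DAG rooted at 0; vertex 4 also has the outside predecessor 5.
        std::map<int, std::vector<int>> succs = {{0, {1, 2}}, {1, {3}}, {2, {3}}, {3, {4}}, {5, {4}}};
        std::map<int, std::vector<int>> preds = {{1, {0}}, {2, {0}}, {3, {1, 2}}, {4, {3, 5}}, {5, {}}};

        std::set<int> dominated{0};
        std::queue<int> to_check;
        for (int v : succs[0]) to_check.push(v);

        while (!to_check.empty()) {
            int v = to_check.front(); to_check.pop();
            if (dominated.count(v)) continue;
            bool all_preds_dominated = true;
            for (int p : preds[v])
                if (!dominated.count(p)) all_preds_dominated = false;
            if (!all_preds_dominated) continue;       // blocked by an external vertex
            dominated.insert(v);
            for (int s : succs[v]) to_check.push(s);  // re-check the neighbourhood
        }

        // Prints: 0 1 2 3  (vertex 4 stays out because of predecessor 5).
        for (int v : dominated) std::printf("%d ", v);
        std::printf("\n");
    }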
-inline size_t PairInfoPathLengthUpperBound(size_t k, size_t insert_size,
- double delta) {
- double answer = 0. + (double) insert_size + delta - (double) k - 2.;
- VERIFY(math::gr(answer, 0.));
- return (size_t)std::floor(answer);
-}
-
-inline size_t PairInfoPathLengthLowerBound(size_t k, size_t l1, size_t l2,
- int gap, double delta) {
- double answer = 0. + (double) gap + (double) k + 2. - (double) l1 - (double) l2 - delta;
- return math::gr(answer, 0.) ? (size_t)std::floor(answer) : 0;
-}
-
-}
-#endif /* OMNI_UTILS_HPP_ */
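For reference, the two helpers directly above convert read-pair library parameters into bounds on the graph path length that pair info can support. A minimal standalone sketch of the same arithmetic with made-up parameter values and the VERIFY check dropped (not part of the patch itself):

    #include <cmath>
    #include <cstddef>
    #include <iostream>

    // Same formulas as PairInfoPathLengthUpperBound/LowerBound above,
    // restated without the project's headers so the sketch compiles alone.
    static std::size_t path_len_upper_bound(std::size_t k, std::size_t insert_size, double delta) {
        double answer = (double) insert_size + delta - (double) k - 2.;
        return (std::size_t) std::floor(answer);
    }

    static std::size_t path_len_lower_bound(std::size_t k, std::size_t l1, std::size_t l2,
                                            int gap, double delta) {
        double answer = (double) gap + (double) k + 2. - (double) l1 - (double) l2 - delta;
        return answer > 0. ? (std::size_t) std::floor(answer) : 0;
    }

    int main() {
        // k = 55, insert size 250, delta 10: paths longer than 203 cannot be
        // supported by pair info from such a library.
        std::cout << path_len_upper_bound(55, 250, 10.) << "\n";           // 203
        // Two 100 bp reads with an estimated gap of 220 give a lower bound of 67.
        std::cout << path_len_lower_bound(55, 100, 100, 220, 10.) << "\n"; // 67
    }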
diff --git a/src/include/omni/order_and_law.hpp b/src/include/omni/order_and_law.hpp
deleted file mode 100644
index a8c7532..0000000
--- a/src/include/omni/order_and_law.hpp
+++ /dev/null
@@ -1,645 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include <boost/utility.hpp>
-
-#include <ostream>
-#include <unordered_set>
-#include <unordered_map>
-#include <stacktrace.hpp>
-#include <algorithm>
-#include <map>
-
-#include "openmp_wrapper.h"
-#include "folly/PackedSyncPtr.h"
-
-
-namespace restricted
-{
-
-//todo discuss with Anton
-static const uint16_t MAX_THREAD_CNT = 128;
-
-class IdDistributor {
-public:
- virtual size_t GetId() = 0;
- virtual ~IdDistributor() {
- }
-};
-
-template<class Iter>
-class ListIdDistributor: public IdDistributor {
- friend class IdSegmentStorage;
-private:
- Iter left_;
- Iter right_;
- size_t shift_;
- size_t max_;
- ListIdDistributor(Iter left, Iter right, size_t shift = 0, size_t max = size_t(-1)) : left_(left), right_(right), shift_(shift), max_(max) {
- }
-
-public:
- bool valid() {
- return left_ < right_;
- }
-
- size_t GetId() {
- size_t result = *(left_);
- VERIFY(result < max_);
- ++left_;
- return shift_ + result;
- }
-};
-
-class SegmentIterator {
-private:
- size_t value_;
-public:
- SegmentIterator(size_t value) : value_(value) {
- }
-
- size_t operator*() const {
- return value_;
- }
-
- void operator++() {
- value_++;
- }
-
- void operator++(int) {
- ++value_;
- }
-
- bool operator==(const SegmentIterator &that) const {
- return value_ == that.value_;
- }
-
- bool operator!=(const SegmentIterator &that) const {
- return value_ != that.value_;
- }
-};
-
-class IdSegmentStorage {
- friend class LocalIdDistributor;
-public:
- ListIdDistributor<SegmentIterator> GetSegmentIdDistributor(size_t left, size_t right) {
- VERIFY(left < right);
- VERIFY(right <= size_);
- return ListIdDistributor<SegmentIterator>(SegmentIterator(left), SegmentIterator(right), min_value_, size_);
- }
-
- template<class Iter>
- ListIdDistributor<Iter> GetSegmentIdDistributor(Iter left, Iter right) {
- VERIFY(left < right);
- return ListIdDistributor<Iter>(left, right, min_value_, size_);
- }
-
- IdSegmentStorage() : min_value_(0), size_(0) { }
-private:
- IdSegmentStorage(size_t min_value, size_t size) : min_value_(min_value), size_(size) { }
-
- size_t min_value_;
- size_t size_;
-};
-
-// Id distributor for pure_pointer. Singleton.
-class LocalIdDistributor : public IdDistributor, boost::noncopyable {
- friend class PeriodicIdDistributor;
- static const size_t INITIAL_MAX_INT_ID = 2;
-public:
- size_t GetId() {
- return max_int_id_++;
- }
-
- IdSegmentStorage Reserve(size_t size) {
- max_int_id_ += size;
- return IdSegmentStorage(max_int_id_ - size, size);
- }
-
- IdSegmentStorage ReserveUpTo(size_t max) {
- VERIFY(max_int_id_ == INITIAL_MAX_INT_ID);
- max_int_id_ = max;
- return IdSegmentStorage(0, max);
- }
-
-// static GlobalIdDistributor &GetInstance() {
-// static GlobalIdDistributor instance(INITIAL_MAX_INT_ID);
-// return instance;
-// }
-
- size_t GetMax() const {
- return max_int_id_;
- }
-
- LocalIdDistributor(size_t min_id_value = INITIAL_MAX_INT_ID) : max_int_id_(min_id_value) { }
-
-private:
- size_t max_int_id_;
-};
-
-/* Id distributor used for concurrent algorithms.
- * Each thread uses its own PeriodicIdDistributor with a period equal to
- * the number of threads. After a thread's job is done, a Synchronize call is
- * required to advance the id counter of the underlying LocalIdDistributor.
-*/
-class PeriodicIdDistributor : public IdDistributor {
-
-public:
- PeriodicIdDistributor(LocalIdDistributor &id_distributor, size_t first_id, size_t period)
- : id_distributor_(id_distributor), cur_id_(first_id), period_(period) {
- }
-
- virtual size_t GetId() {
- size_t id = cur_id_;
- cur_id_ += period_;
-
- return id;
- }
-
- void Synchronize() const {
- size_t& global_max_id = id_distributor_.max_int_id_;
- global_max_id = std::max(cur_id_, global_max_id);
- }
-
-private:
- LocalIdDistributor &id_distributor_;
- size_t cur_id_;
- size_t period_;
-};
-
-template<class PurePtrT>
-class PurePtrLock;
-
-template<class PurePtrT>
-class PurePtrMarker;
-
-//todo maybe make it extend folly::PackedSyncPtr<T>?
-template<class T>
-struct pure_pointer {
- typedef T type;
- typedef T* pointer_type;
-
- explicit pure_pointer()
- : int_id_(0) {
- ptr_.init(pointer_type(0), MAX_THREAD_CNT);
- }
-
- explicit pure_pointer(T *ptr)
- : int_id_(size_t(ptr)) {
- ptr_.init(ptr, MAX_THREAD_CNT);
- VERIFY(int_id_ < 2);
- }
-
- explicit pure_pointer(T *ptr, IdDistributor &idDistributor)
- : int_id_(generate_id(ptr, idDistributor)) {
- ptr_.init(ptr, MAX_THREAD_CNT);
- }
-
-// lock_pointer_type& get_lockable() {
-// return ptr_;
-// }
-
- T *get() const {
- return ptr_.get();
- }
-
- T& operator*() const {
- return *ptr_;
- }
-
- T* operator->() const {
- return ptr_.get();
- }
-
- bool operator==(const pure_pointer &rhs) const {
- if (int_id_ == rhs.int_id_) {
- VERIFY(ptr_.get() == rhs.ptr_.get());
- return true;
- }
- return false;
- }
-
- bool operator!=(const pure_pointer &rhs) const {
- return !operator ==(rhs);
- }
-
- bool operator<(const pure_pointer &rhs) const {
- return this->int_id_ < rhs.int_id_;
- }
-
- bool operator<=(const pure_pointer &rhs) const {
- return *this < rhs || *this == rhs;
- }
-
- size_t hash() const {
- return this->int_id_;
- }
-
- size_t int_id() const {
- return int_id_;
- }
-
-private:
- friend class PurePtrLock<pure_pointer<T>>;
- friend class PurePtrMarker<pure_pointer<T>>;
-
- typedef folly::PackedSyncPtr<T> lock_pointer_type;
-
- static size_t generate_id(T *ptr, IdDistributor &idDistributor) {
- if (ptr == 0 || ptr == (T*) 1 || ptr == (T*) (-1)) {
- return size_t(ptr);
- }
-
- return idDistributor.GetId();
- }
-
- lock_pointer_type ptr_;
-
- size_t int_id_;
-};
-
-template<class LockT>
-class ReEnteringLock {
- LockT& lock_;
- bool reentered_;
-
- uint16_t locking_thread() const {
- //don't need barrier here (as folly documentation says)
- return lock_.extra();
- }
-
- uint16_t current_thread() const {
- return uint16_t(omp_get_thread_num());
- }
-
- void Lock() {
- lock_.lock();
- lock_.setExtra(current_thread());
- }
-
- void Unlock() {
- lock_.setExtra(MAX_THREAD_CNT);
- lock_.unlock();
- }
-
-public:
- ReEnteringLock(LockT& lock) :
- lock_(lock),
- reentered_(false) {
- if (locking_thread() == current_thread()) {
- reentered_ = true;
- } else {
- Lock();
- }
- }
-
- ~ReEnteringLock() {
- if (!reentered_) {
- Unlock();
- }
- }
-};
-
-/**
- * Lock that uses a pure ptr as a target.
- * Be careful NOT to pass a COPY of the pure ptr you want to use as the locked object!
- */
-template<class PurePtrT>
-class PurePtrLock {
- ReEnteringLock<typename PurePtrT::lock_pointer_type> inner_lock_;
-
-public:
- PurePtrLock(PurePtrT& pure_ptr) :
- inner_lock_(pure_ptr.ptr_)
- {
- }
-
-};
-
-/**
- * Way to "mark" a pure pointer without using additional memory.
- * Marking/unmarking operations are atomic.
- * Be careful NOT to pass a COPY of the pure ptr you want to mark!
- * Do not use together with PurePtrLock: they use the same space for storing data.
- */
-template<class PurePtrT>
-class PurePtrMarker {
- typedef typename PurePtrT::lock_pointer_type LockWithData;
-
- void ChangeMark(PurePtrT& pure_ptr, uint16_t new_mark) const {
- LockWithData& lock_with_data = pure_ptr.ptr_;
- lock_with_data.lock();
- lock_with_data.setExtra(new_mark);
- lock_with_data.unlock();
- }
-
-public:
-
- void mark(PurePtrT& pure_ptr) const {
- ChangeMark(pure_ptr, 0);
- }
-
- void unmark(PurePtrT& pure_ptr) const {
- ChangeMark(pure_ptr, MAX_THREAD_CNT);
- }
-
- bool is_marked(const PurePtrT& pure_ptr) const {
- uint16_t curr_mark = pure_ptr.ptr_.extra();
- VERIFY(curr_mark == 0 || curr_mark == MAX_THREAD_CNT);
- return curr_mark == 0;
- }
-
-};
-
-//template<class T>
-//struct Comparator
-//{
-// typedef pure_pointer<T> pointer_type_t;
-//
-// bool operator()(pointer_type_t const& a, pointer_type_t const& b) const {
-// return a.get() < b.get();
-// }
-//};
-
-template<class T>
-struct Hash
-{
- typedef pure_pointer<T> pointer_type_t;
- std::hash<T*> inner_hash_;
-
- size_t operator()(pointer_type_t const& a) const {
- return inner_hash_(a.get());
- }
-};
-
-template<class It>
-struct iterator_wrapper
-{
- typedef typename It::value_type value_type;
- typedef typename It::difference_type difference_type;
- typedef typename It::reference reference;
- typedef typename It::pointer pointer;
-
- explicit iterator_wrapper(It it) : it_(it) {}
-
- reference operator* () const { return it_.operator* (); }
- pointer operator-> () const { return it_.operator->(); }
-
- bool operator==(const iterator_wrapper &rhs) const { return it_ == rhs.it_; }
- bool operator!=(const iterator_wrapper &rhs) const { return it_ != rhs.it_; }
-
-private:
- It it_;
-};
-
-template<class T>
-struct set
-{
- typedef Hash<typename T::type> hash_t;
- typedef std::unordered_set<T, hash_t> base_set_t;
- typedef typename base_set_t::value_type value_type;
-
- typedef iterator_wrapper<typename base_set_t::iterator > iterator;
- typedef iterator_wrapper<typename base_set_t::const_iterator > const_iterator;
-
-public:
- set() : base_set_(10, hash_t())
- {
- }
-
- template<class It>
- set(It begin, It end) : base_set_(begin, end, 10, hash_t())
- {
- }
-
- const_iterator begin() const { return const_iterator(base_set_.begin()); }
- const_iterator end () const { return const_iterator(base_set_.end ()); }
-
- iterator begin() { return iterator(base_set_.begin()); }
- iterator end () { return iterator(base_set_.end ()); }
-
- const_iterator find (const T &key) const{ return const_iterator(base_set_.find(key)); }
- iterator find (const T &key) { return iterator(base_set_.find(key)); }
-
- size_t count(T const& item) const { return base_set_.count(item); }
-
- std::pair<iterator, bool> insert(value_type const& item)
- {
- const std::pair<iterator, bool>& ret = base_set_.insert(item);
- return make_pair(iterator(ret.first), ret.second);
- }
-
- template<class It>
- void insert(It first, It last) { base_set_.insert(first, last); }
-
- size_t erase (const T& x) { return base_set_.erase(x); }
- void clear () { base_set_.clear(); }
- size_t size () const { return base_set_.size(); }
-
- bool operator==(const set &rhs) const
- {
- if(this->size() != rhs.size())
- return false;
-
- for(auto i = base_set_.begin(), j = rhs.base_set_.begin();
- i != base_set_.end() && j != rhs.base_set_.end();
- ++i, ++j)
- {
- if(*i != *j)
- return false;
- }
-
- return true;
- }
-
- bool operator!=(const set &rhs) const
- {
- return !(*this == rhs);
- }
-
- template<class Comparator>
- void Copy(std::set<T, Comparator> &container) const {
- container.insert(base_set_.begin(), base_set_.end());
- }
-
-private:
- base_set_t base_set_;
-};
-
-
-template<class Key, class Value>
-struct map
-{
- typedef Hash<typename Key::type> hash_t;
- typedef std::unordered_map<Key, Value, hash_t> base_map_t;
- typedef typename base_map_t::value_type value_type;
-
- typedef iterator_wrapper<typename base_map_t::iterator > iterator;
- typedef iterator_wrapper<typename base_map_t::const_iterator> const_iterator;
-
-public:
- map()
- : base_map_(10, hash_t())
- {
- }
-
- template<class It>
- map(It begin, It end)
- : base_map_(begin, end, 10, hash_t())
- {
- }
-
- const_iterator begin() const { return const_iterator(base_map_.begin()); }
- const_iterator end () const { return const_iterator(base_map_.end ()); }
-
- iterator begin() { return iterator(base_map_.begin()); }
- iterator end () { return iterator(base_map_.end ()); }
-
- const_iterator find (const Key &key) const
- {
- return const_iterator(base_map_.find(key));
- }
- iterator find (const Key &key) { return iterator(base_map_.find(key)); }
-
- size_t count(Key const& item) const { return base_map_.count(item); }
-
- Value& operator[](Key const& x) { return base_map_[x]; }
-
- std::pair<iterator, bool> insert(value_type const& value)
- {
- std::pair<iterator, bool> ret = base_map_.insert(value);
- return make_pair(iterator(ret.first), ret.second);
- }
-
- template<class It>
- void insert(It first, It last) { base_map_.insert(first, last); }
- size_t erase (Key const& x) { return base_map_.erase(x); }
- void clear () { base_map_.clear(); }
-
- size_t size () const { return base_map_.size(); }
-
- bool operator==(const map &rhs) const
- {
- if(size() != rhs.size())
- return false;
-
- for(auto i = base_map_.begin(), j = rhs.base_map_.begin();
- i != base_map_.end() && j != rhs.base_map_.end();
- ++i, ++j)
- {
- if(*i != *j)
- return false;
- }
-
- return true;
- }
-
- bool operator!=(const map& rhs) const
- {
- return !(*this == rhs);
- }
-
- template<class Comparator>
- void Copy(std::map<Key, Value, Comparator> &container) const {
- container.insert(base_map_.begin(), base_map_.end());
- }
-
-private:
- base_map_t base_map_;
-};
-
-template<class T>
-std::ostream &operator<<(std::ostream &stream, const pure_pointer<T>& pointer)
-{
- stream << pointer.int_id();
- return stream;
-}
-
-} // namespace restricted
-
-namespace std
-{
-template<class T>
-struct hash<restricted::pure_pointer<T>> {
- size_t operator()(const restricted::pure_pointer<T>& pointer) const {
- return pointer.hash();
- }
-};
-}
-
-template<class T, class Comparator>
-class PairComparator {
-private:
- Comparator comparator_;
-public:
- PairComparator(Comparator comparator) : comparator_(comparator) {
- }
-
- bool operator()(std::pair<T, T> a, std::pair<T, T> b) const {
- return a.first == b.first ? comparator_(a.second, b.second) : comparator_(a.first, b.first);
- }
-};
-
-//
-//template<typename T, class Comparator>
-//class MixedComparator {
-//private:
-// Comparator c1_;
-// Comparator c2_;
-//public:
-// MixedComparator(const Comparator &c1, const Comparator &c2) : c1_(c1), c2_(c2) {
-// }
-//
-// bool operator()(const T &a, const T &b) const {
-// if(c1_.IsAFAKE(a) || c1_.IsAFAKE(b)) {
-// if(c1_.IsAFAKEMin(a))
-// return !c1_.IsAFAKEMin(b);
-// if(c1_.IsAFAKEMax(b))
-// return c1_.IsAFAKEMax(a);
-// return false;
-// }
-// if(c1_.IsValidId(a) && c1_.IsValidId(b))
-// return c1_(a, b);
-// if(c1_.IsValidId(a))
-// return true;
-// if(c1_.IsValidId(b))
-// return false;
-// if(c2_.IsValidId(a) && c2_.IsValidId(b)) {
-// return c2_(a, b);
-// }
-// VERIFY(false);
-// return false;
-// }
-//
-// bool IsValidId(T element) {
-// return c1_.IsValid(element) || c2_.IsValid(element);
-// }
-//};
-
-template<class Container, class Comparator>
-class ContainerComparator {
-private:
- Comparator comparator_;
-public:
- ContainerComparator(const Comparator &comparator) : comparator_(comparator) {
- }
-
- bool operator()(const Container &a, const Container &b) const {
- for(auto ita = a.begin(), itb = b.begin(); ita != a.end() && itb != b.end(); ++ita, ++itb) {
- if(*ita != *itb)
- return comparator_(*ita, *itb);
- }
- if(a.size() < b.size()) {
- return true;
- }
- return false;
- }
-
-};
-
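The id machinery removed above lets threads mint ids without locking: each PeriodicIdDistributor strides through the id space with a period equal to the thread count, and only Synchronize() touches the shared counter. A compact standalone sketch of that scheme (SharedCounter and PeriodicIds are names invented for this illustration, not classes from the header):

    #include <algorithm>
    #include <cstddef>
    #include <iostream>
    #include <vector>

    struct SharedCounter { std::size_t max_id = 2; };  // plays the LocalIdDistributor role

    struct PeriodicIds {
        SharedCounter& shared;
        std::size_t cur, period;
        std::size_t next() { std::size_t id = cur; cur += period; return id; }
        void synchronize() { shared.max_id = std::max(shared.max_id, cur); }
    };

    int main() {
        SharedCounter counter;
        const std::size_t nthreads = 4;
        std::vector<PeriodicIds> per_thread;
        for (std::size_t t = 0; t < nthreads; ++t)
            per_thread.push_back({counter, counter.max_id + t, nthreads});

        // "Thread" 1 creates three objects; its ids can never collide with
        // ids handed out by the other threads.
        for (int i = 0; i < 3; ++i)
            std::cout << per_thread[1].next() << " ";                    // 3 7 11
        per_thread[1].synchronize();
        std::cout << "\nmax id after sync: " << counter.max_id << "\n";  // 15
    }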
diff --git a/src/include/omni/parallel_processing.hpp b/src/include/omni/parallel_processing.hpp
deleted file mode 100644
index a38f7d4..0000000
--- a/src/include/omni/parallel_processing.hpp
+++ /dev/null
@@ -1,289 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include "logger/logger.hpp"
-#include "graph_iterators.hpp"
-#include "graph_processing_algorithm.hpp"
-
-namespace omnigraph {
-
-template<class ItVec, class SmartIt, class Predicate>
-void FillInterestingFromChunkIterators(const ItVec& chunk_iterators,
- SmartIt& smart_it,
- const Predicate& predicate) {
- VERIFY(chunk_iterators.size() > 1);
- typedef typename Predicate::checked_type ElementType;
- std::vector<std::vector<ElementType>> of_interest(omp_get_max_threads());
-
- #pragma omp parallel for schedule(guided)
- for (size_t i = 0; i < chunk_iterators.size() - 1; ++i) {
- for (auto it = chunk_iterators[i], end = chunk_iterators[i + 1]; it != end; ++it) {
- ElementType t = *it;
- if (predicate(t)) {
- of_interest[omp_get_thread_num()].push_back(t);
- }
- }
- }
-
- for (auto& chunk : of_interest) {
- smart_it.insert(chunk.begin(), chunk.end());
- chunk.clear();
- }
-}
-
-template<class Graph, class ElementId = typename Graph::EdgeId>
-class TrivialInterestingElementFinder {
-public:
-
- TrivialInterestingElementFinder() {
- }
-
- template<class SmartIt>
- bool Run(SmartIt& /*it*/) const {
- return false;
- }
-};
-
-template<class Graph, class ElementId = typename Graph::EdgeId>
-class SimpleInterestingElementFinder {
- typedef GraphEdgeIterator<Graph> EdgeIt;
-
- const Graph& g_;
- pred::TypedPredicate<ElementId> condition_;
-public:
-
- SimpleInterestingElementFinder(const Graph& g,
- pred::TypedPredicate<ElementId> condition = pred::AlwaysTrue<ElementId>())
- : g_(g), condition_(condition) {}
-
- template<class SmartIt>
- bool Run(SmartIt& interest) const {
- for (EdgeIt it = EdgeIt(g_, g_.begin()), end = EdgeIt(g_, g_.end()); it != end; ++it) {
- if (condition_(*it)) {
- interest.push(*it);
- }
- }
- return false;
- }
-};
-
-template<class Graph, class ElementId = typename Graph::EdgeId>
-class ParallelInterestingElementFinder {
- typedef GraphEdgeIterator<Graph> EdgeIt;
-
- const Graph& g_;
- pred::TypedPredicate<ElementId> condition_;
- const size_t chunk_cnt_;
-public:
-
- ParallelInterestingElementFinder(const Graph& g,
- pred::TypedPredicate<ElementId> condition,
- size_t chunk_cnt)
- : g_(g), condition_(condition), chunk_cnt_(chunk_cnt) {}
-
- template<class SmartIt>
- bool Run(SmartIt& it) const {
- TRACE("Looking for interesting elements");
- TRACE("Splitting graph into " << chunk_cnt_ << " chunks");
- FillInterestingFromChunkIterators(IterationHelper<Graph, ElementId>(g_).Chunks(chunk_cnt_), it, condition_);
- TRACE("Found " << it.size() << " interesting elements");
- return false;
- }
-private:
- DECL_LOGGER("ParallelInterestingElementFinder");
-};
-
-template<class Graph>
-class PersistentAlgorithmBase {
- Graph& g_;
-protected:
-
- PersistentAlgorithmBase(Graph& g) : g_(g) {}
-
- Graph& g() { return g_; }
- const Graph& g() const { return g_; }
-public:
- virtual ~PersistentAlgorithmBase() {}
- virtual bool Run(bool force_primary_launch = false) = 0;
-};
-
-//todo use add_condition in it_
-template<class Graph, class ElementId, class InterestingElementFinder,
- class Comparator = std::less<ElementId>>
-class PersistentProcessingAlgorithm : public PersistentAlgorithmBase<Graph> {
- InterestingElementFinder interest_el_finder_;
-
- SmartSetIterator<Graph, ElementId, Comparator> it_;
- //todo remove
- bool tracking_;
- size_t total_iteration_estimate_;
-
- size_t curr_iteration_;
-
-protected:
-
- virtual bool Process(ElementId el) = 0;
- virtual bool Proceed(ElementId /*el*/) const { return true; }
-
- virtual void PrepareIteration(size_t /*it_cnt*/, size_t /*total_it_estimate*/) {}
-
-public:
-
- PersistentProcessingAlgorithm(Graph& g,
- const InterestingElementFinder& interest_el_finder,
- bool canonical_only = false,
- const Comparator& comp = Comparator(),
- bool track_changes = true,
- size_t total_iteration_estimate = -1ul) :
- PersistentAlgorithmBase<Graph>(g),
- interest_el_finder_(interest_el_finder),
- it_(g, true, comp, canonical_only),
- tracking_(track_changes),
- total_iteration_estimate_(total_iteration_estimate),
- curr_iteration_(0) {
- it_.Detach();
- }
-
- bool Run(bool force_primary_launch = false) {
- bool primary_launch = !tracking_ || (curr_iteration_ == 0) || force_primary_launch ;
- if (!it_.IsAttached()) {
- it_.Attach();
- }
- if (primary_launch) {
- it_.clear();
- TRACE("Primary launch.");
- TRACE("Start preprocessing");
- interest_el_finder_.Run(it_);
- TRACE(it_.size() << " edges to process after preprocessing");
- } else {
- TRACE(it_.size() << " edges to process");
- VERIFY(tracking_);
- }
-
- if (curr_iteration_ >= total_iteration_estimate_) {
- PrepareIteration(total_iteration_estimate_ - 1, total_iteration_estimate_);
- } else {
- PrepareIteration(curr_iteration_, total_iteration_estimate_);
- }
-
- bool triggered = false;
- TRACE("Start processing");
- for (; !it_.IsEnd(); ++it_) {
- ElementId el = *it_;
- if (!Proceed(el)) {
- TRACE("Proceed condition turned false on element " << this->g().str(el));
- it_.ReleaseCurrent();
- break;
- }
- TRACE("Processing edge " << this->g().str(el));
- triggered |= Process(el);
- }
- TRACE("Finished processing. Triggered = " << triggered);
- if (!tracking_)
- it_.Detach();
-
- curr_iteration_++;
- return triggered;
- }
-
-};
-
-template<class Graph, class InterestingEdgeFinder,
- class Comparator = std::less<typename Graph::EdgeId>>
-class PersistentEdgeRemovingAlgorithm : public PersistentProcessingAlgorithm<Graph,
- typename Graph::EdgeId,
- InterestingEdgeFinder, Comparator> {
- typedef typename Graph::EdgeId EdgeId;
- typedef PersistentProcessingAlgorithm<Graph, EdgeId, InterestingEdgeFinder, Comparator> base;
- EdgeRemover<Graph> edge_remover_;
-public:
- PersistentEdgeRemovingAlgorithm(Graph& g,
- const InterestingEdgeFinder& interest_edge_finder,
- std::function<void(EdgeId)> removal_handler = boost::none,
- bool canonical_only = false,
- const Comparator& comp = Comparator(),
- bool track_changes = true,
- size_t total_iteration_estimate = -1ul)
- : base(g, interest_edge_finder,
- canonical_only, comp, track_changes,
- total_iteration_estimate),
- edge_remover_(g, removal_handler) {
-
- }
-
-protected:
-
- virtual bool ShouldRemove(EdgeId e) const = 0;
-
- bool Process(EdgeId e) override {
- TRACE("Checking edge " << this->g().str(e) << " for the removal condition");
- if (ShouldRemove(e)) {
- TRACE("Check passed, removing");
- edge_remover_.DeleteEdge(e);
- return true;
- }
- TRACE("Check not passed");
- return false;
- }
-
-};
-
-template<class Graph, class InterestingEdgeFinder,
- class Comparator = std::less<typename Graph::EdgeId>>
-class ConditionEdgeRemovingAlgorithm : public PersistentEdgeRemovingAlgorithm<Graph,
- InterestingEdgeFinder, Comparator> {
- typedef typename Graph::EdgeId EdgeId;
- typedef PersistentEdgeRemovingAlgorithm<Graph, InterestingEdgeFinder, Comparator> base;
- pred::TypedPredicate<EdgeId> remove_condition_;
-protected:
-
- bool ShouldRemove(EdgeId e) const override {
- return remove_condition_(e);
- }
-
-public:
- ConditionEdgeRemovingAlgorithm(Graph& g,
- const InterestingEdgeFinder& interest_edge_finder,
- pred::TypedPredicate<EdgeId> remove_condition,
- std::function<void(EdgeId)> removal_handler = boost::none,
- bool canonical_only = false,
- const Comparator& comp = Comparator(),
- bool track_changes = true)
- : base(g, interest_edge_finder,
- removal_handler,
- canonical_only, comp, track_changes),
- remove_condition_(remove_condition) {
-
- }
-};
-
-template<class Graph, class Comparator = std::less<typename Graph::EdgeId>>
-class ParallelEdgeRemovingAlgorithm : public ConditionEdgeRemovingAlgorithm<Graph,
- ParallelInterestingElementFinder<Graph>, Comparator> {
- typedef ConditionEdgeRemovingAlgorithm<Graph,
- ParallelInterestingElementFinder<Graph>, Comparator> base;
- typedef typename Graph::EdgeId EdgeId;
-
-public:
- ParallelEdgeRemovingAlgorithm(Graph& g,
- pred::TypedPredicate<EdgeId> remove_condition,
- size_t chunk_cnt,
- std::function<void(EdgeId)> removal_handler = boost::none,
- bool canonical_only = false,
- const Comparator& comp = Comparator(),
- bool track_changes = true)
- : base(g,
- ParallelInterestingElementFinder<Graph>(g, remove_condition, chunk_cnt),
- remove_condition, removal_handler,
- canonical_only, comp, track_changes) {
- }
-
-};
-
-}
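FillInterestingFromChunkIterators above uses a common lock-free collection pattern: every OpenMP thread appends its matches to a private bucket, and the buckets are merged into the final container serially afterwards. A self-contained sketch of the same pattern over a plain vector (build with -fopenmp; the predicate and the container stand in for the graph chunk iterators):

    #include <omp.h>
    #include <cstdio>
    #include <vector>

    int main() {
        std::vector<int> elements(1000);
        for (int i = 0; i < 1000; ++i) elements[i] = i;
        auto predicate = [](int x) { return x % 97 == 0; };

        // Phase 1: each thread writes only to its own bucket -- no locking needed.
        std::vector<std::vector<int>> of_interest(omp_get_max_threads());
        #pragma omp parallel for schedule(guided)
        for (int i = 0; i < (int) elements.size(); ++i)
            if (predicate(elements[i]))
                of_interest[omp_get_thread_num()].push_back(elements[i]);

        // Phase 2: single-threaded merge, mirroring the insertion into the
        // smart iterator in the original function.
        std::vector<int> interesting;
        for (auto& bucket : of_interest)
            interesting.insert(interesting.end(), bucket.begin(), bucket.end());

        std::printf("found %zu interesting elements\n", interesting.size());  // 11
    }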
diff --git a/src/include/omni/path_processor.hpp b/src/include/omni/path_processor.hpp
deleted file mode 100644
index dbeb20d..0000000
--- a/src/include/omni/path_processor.hpp
+++ /dev/null
@@ -1,441 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include "standard_base.hpp"
-#include "adt/bag.hpp"
-#include "dijkstra_tools/dijkstra_helper.hpp"
-
-namespace omnigraph {
-
-template<class Graph>
-const string PrintPath(const Graph& g, const vector<typename Graph::EdgeId>& edges) {
- string delim = "";
- std::stringstream ss;
- for (size_t i = 0; i < edges.size(); ++i) {
- ss << delim << g.str(edges[i]);
- delim = " -> ";
- }
- return ss.str();
-}
-
-
-template<class Graph>
-class PathProcessor {
-
- typedef typename Graph::EdgeId EdgeId;
- typedef typename Graph::VertexId VertexId;
- typedef vector<EdgeId> Path;
- typedef typename DijkstraHelper<Graph>::BoundedDijkstra DijkstraT;
-public:
- class Callback {
-
- public:
- virtual ~Callback() {
- }
-
- virtual void Flush() {
- }
-
- virtual void HandleReversedPath(const vector<EdgeId>& reversed_path) = 0;
-
-
- protected:
- Path ReversePath(const Path& path) const {
- Path result;
- for (auto I = path.rbegin(), E = path.rend(); I != E; ++I)
- result.push_back(*I);
- return result;
- }
- };
-
-private:
-
- class Traversal {
- const PathProcessor& outer_;
- VertexId end_;
- size_t min_len_;
- size_t max_len_;
- Callback& callback_;
- size_t edge_depth_bound_;
-
- size_t curr_len_;
- size_t curr_depth_;
- size_t call_cnt_;
- Path reversed_edge_path_;
- bag<VertexId> vertex_cnts_;
-
- const Graph& g_;
- const DijkstraT& dijkstra_;
-
- void Push(EdgeId e, VertexId start_v) {
- TRACE("Pushing edge " << g_.str(e));
- curr_len_ += g_.length(e);
- curr_depth_++;
- reversed_edge_path_.push_back(e);
- vertex_cnts_.put(start_v);
- }
-
- void Pop() {
- VERIFY(!reversed_edge_path_.empty());
- EdgeId e = reversed_edge_path_.back();
- size_t len = g_.length(e);
- VERIFY(curr_len_ >= len);
-
- TRACE("Popping edge " << g_.str(e));
- vertex_cnts_.take(g_.EdgeStart(e));
- reversed_edge_path_.pop_back();
- curr_len_ -= len;
- curr_depth_--;
- }
-
- bool CanGo(EdgeId e, VertexId start_v) {
- if (!dijkstra_.DistanceCounted(start_v))
- return false;
- if (dijkstra_.GetDistance(start_v) + g_.length(e) + curr_len_ > max_len_)
- return false;
- if (curr_depth_ >= edge_depth_bound_)
- return false;
- if (vertex_cnts_.mult(start_v) >= PathProcessor::MAX_VERTEX_USAGE)
- return false;
- return true;
- }
-
- bool Go(VertexId v, const size_t min_len) {
- TRACE("Got to vertex " << g_.str(v));
- if (++call_cnt_ >= PathProcessor::MAX_CALL_CNT) {
- TRACE("Maximal count " << MAX_CALL_CNT << " of recursive calls was exceeded!");
- return true;
- }
-
- if (v == outer_.start_ && curr_len_ >= min_len) {
- //TRACE("New path found: " << PrintPath(g_, path_));
- callback_.HandleReversedPath(reversed_edge_path_);
- }
-
- TRACE("Iterating through incoming edges of vertex " << g_.int_id(v));
- //TODO: doesn't work with parallel simplification
- vector<EdgeId> incoming;
- incoming.reserve(4);
- std::copy_if(g_.in_begin(v), g_.in_end(v), std::back_inserter(incoming), [&] (EdgeId e) {
- return dijkstra_.DistanceCounted(g_.EdgeStart(e));
- });
-
- std::sort(incoming.begin(), incoming.end(), [&] (EdgeId e1, EdgeId e2) {
- return dijkstra_.GetDistance(g_.EdgeStart(e1)) < dijkstra_.GetDistance(g_.EdgeStart(e2));
- });
-
- for (EdgeId e : incoming) {
- VertexId start_v = g_.EdgeStart(e);
- if (CanGo(e, start_v)) {
- Push(e, start_v);
- bool exceeded_limits = Go(start_v, min_len);
- Pop();
- if (exceeded_limits)
- return true;
- }
- }
- return false;
- }
-
- public:
- Traversal(const PathProcessor& outer, VertexId end,
- size_t min_len, size_t max_len,
- Callback& callback, size_t edge_depth_bound) :
- outer_(outer), end_(end),
- min_len_(min_len), max_len_(max_len),
- callback_(callback),
- edge_depth_bound_(edge_depth_bound),
- curr_len_(0), curr_depth_(0), call_cnt_(0),
- g_(outer.g_),
- dijkstra_(outer.dijkstra_) {
- reversed_edge_path_.reserve(PathProcessor::MAX_CALL_CNT);
- vertex_cnts_.put(end_);
- }
-
- //returns true iff limits were exceeded
- bool Go() {
- bool code = Go(end_, min_len_);
- VERIFY(curr_len_ == 0);
- VERIFY(curr_depth_ == 0);
- vertex_cnts_.take(end_);
- VERIFY(vertex_cnts_.size() == 0);
- return code;
- }
- };
-
- friend class Traversal;
-
-public:
-
- PathProcessor(const Graph& g, VertexId start, size_t length_bound) :
- g_(g),
- start_(start),
- dijkstra_(DijkstraHelper<Graph>::CreateBoundedDijkstra(g, length_bound, MAX_DIJKSTRA_VERTICES)) {
- TRACE("Dijkstra launched");
- dijkstra_.Run(start);
- TRACE("Dijkstra finished");
- }
-
- // dfs from the end vertex
- // error codes: 3 = both problems, 2 = Dijkstra vertex limit exceeded, 1 = DFS limits exceeded, 0 = okay
- int Process(VertexId end, size_t min_len, size_t max_len, Callback& callback, size_t edge_depth_bound = -1ul) const {
- TRACE("Process launched");
- int error_code = 0;
-
- if (dijkstra_.VertexLimitExceeded()) {
- TRACE("dijkstra : vertex limit exceeded");
- error_code = 2;
- }
-
- TRACE("Start vertex is " << g_.str(start_));
- TRACE("Bounds are " << min_len << " " << max_len);
- TRACE("End vertex " << g_.str(end));
-
- Traversal traversal(*this, end, min_len, max_len, callback, edge_depth_bound);
- error_code |= int(traversal.Go());
-
- callback.Flush();
- TRACE("Process finished with error code " << error_code);
- return error_code;
- }
-
-private:
- static const size_t MAX_CALL_CNT = 3000;
- static const size_t MAX_DIJKSTRA_VERTICES = 3000;
- static const size_t MAX_VERTEX_USAGE = 5;
-
- const Graph& g_;
- VertexId start_;
- DijkstraT dijkstra_;
-
- DECL_LOGGER("PathProcessor")
-};
-
-template<class Graph>
-int ProcessPaths(const Graph& g, size_t min_len, size_t max_len,
- typename Graph::VertexId start, typename Graph::VertexId end,
- typename PathProcessor<Graph>::Callback& callback, size_t max_edge_cnt = -1ul) {
- PathProcessor<Graph> processor(g, start, max_len);
- return processor.Process(end, min_len, max_len, callback, max_edge_cnt);
-}
-
-template<class Graph>
-class CompositeCallback: public PathProcessor<Graph>::Callback {
- typedef typename Graph::EdgeId EdgeId;
- typedef vector<EdgeId> Path;
-
-public:
- void AddProcessor(typename PathProcessor<Graph>::Callback& processor) {
- processors_.push_back(&processor);
- }
-
- void Flush() override {
- for (auto it = processors_.begin(); it != processors_.end(); ++it) {
- (*it)->Flush();
- }
- }
-
- void HandleReversedPath(const Path& path) override {
- for (auto it = processors_.begin(); it != processors_.end(); ++it) {
- (*it)->HandleReversedPath(path);
- }
- }
-
-private:
- vector<typename PathProcessor<Graph>::Callback*> processors_;
-};
-
-template<class Graph, class Comparator>
-class BestPathStorage: public PathProcessor<Graph>::Callback {
- typedef typename Graph::EdgeId EdgeId;
- typedef vector<EdgeId> Path;
-public:
- BestPathStorage(const Graph& g, Comparator comparator) :
- g_(g), cnt_(0), comparator_(comparator) {
- }
-
- void HandleReversedPath(const vector<EdgeId>& path) override {
- cnt_++;
- if(best_path_.size() == 0 || comparator_(path, best_path_))
- best_path_ = path;
- }
-
- vector<EdgeId> BestPath() const {
- return best_path_;
- }
-
- size_t size() const {
- return cnt_;
- }
-
-private:
- const Graph& g_;
- size_t cnt_;
- Comparator comparator_;
- Path best_path_;
-};
-
-
-template<class Graph>
-class PathStorageCallback: public PathProcessor<Graph>::Callback {
- typedef typename Graph::EdgeId EdgeId;
- typedef vector<EdgeId> Path;
-
-public:
- PathStorageCallback(const Graph& g) :
- g_(g) {
- }
-
- void Flush() override {
- all_paths_.push_back(cur_paths_);
- cur_paths_.clear();
- }
-
- void HandleReversedPath(const vector<EdgeId>& path) override {
- cur_paths_.push_back(this->ReversePath(path));
- }
-
- size_t size(size_t k = 0) const {
- return all_paths_[k].size();
- }
-
- const vector<Path>& paths(size_t k = 0) const {
- return all_paths_[k];
- }
-
-private:
- const Graph& g_;
- vector<vector<Path>> all_paths_;
- vector<Path> cur_paths_;
-};
-
-template<class Graph>
-class NonEmptyPathCounter: public PathProcessor<Graph>::Callback {
- typedef typename Graph::EdgeId EdgeId;
- typedef vector<EdgeId> Path;
-
-public:
- NonEmptyPathCounter(const Graph& g) :
- g_(g), count_(0) {
- }
-
- void Flush() override {
- all_paths_.push_back(cur_paths_);
- counts_.push_back(count_);
- cur_paths_.clear();
- }
-
- void HandleReversedPath(const Path& path) override {
- if (path.size() > 0) {
- ++count_;
- cur_paths_.push_back(this->ReversePath(path));
- }
- }
-
- size_t count(size_t k = 0) const {
- return counts_[k];
- }
-
- const vector<Path>& paths(size_t k = 0) const {
- return all_paths_[k];
- }
-
-private:
- const Graph& g_;
- vector<size_t> counts_;
- size_t count_;
- vector<vector<Path> > all_paths_;
- vector<Path> cur_paths_;
-};
-
-template<class Graph>
-class VertexLabelerCallback: public PathProcessor<Graph>::Callback {
- typedef typename Graph::EdgeId EdgeId;
- typedef typename Graph::VertexId VertexId;
- typedef vector<EdgeId> Path;
-
-public:
- VertexLabelerCallback(const Graph& g) :
- g_(g), count_(0) {
- }
-
- void Flush() override {
- all_vertices_.push_back(vertices_);
- vertices_.clear();
- counts_.push_back(count_);
- }
-
- void HandleReversedPath(const Path& path) override {
- for (auto it = path.rbegin(); it != path.rend(); ++it) {
- if (path.size() > 0) {
- vertices_.insert(g_.EdgeStart(*it));
- vertices_.insert(g_.EdgeEnd(*it));
- ++count_;
- }
- }
- }
-
- const set<VertexId>& vertices(size_t k = 0) const {
- return all_vertices_[k];
- }
-
- size_t count(size_t k = 0) const {
- return counts_[k];
- }
-
-private:
- const Graph& g_;
- vector<size_t> counts_;
- vector<set<VertexId>> all_vertices_;
- size_t count_;
- set<VertexId> vertices_;
-};
-
-template<class Graph>
-class DistancesLengthsCallback: public PathProcessor<Graph>::Callback {
- typedef typename Graph::EdgeId EdgeId;
- typedef vector<EdgeId> Path;
-
-public:
- DistancesLengthsCallback(const Graph& g) :
- g_(g) {
- }
-
- void Flush() override {
- all_distances_.push_back(distances_);
- distances_.clear();
- }
-
- void HandleReversedPath(const Path& path) override {
- size_t path_length = PathLength(path);
- distances_.insert(path_length);
- }
-
- vector<size_t> distances(size_t k = 0) const {
- VERIFY(k < all_distances_.size());
- const set<size_t>& tmp = all_distances_[k];
- return vector<size_t>(tmp.begin(), tmp.end());
- }
-
-private:
- size_t PathLength(const Path& path) const {
- size_t res = 0;
- for (auto I = path.begin(); I != path.end(); ++I)
- res += g_.length(*I);
- return res;
- }
-
- const Graph& g_;
- set<size_t> distances_;
- vector<set<size_t>> all_distances_;
-
- DECL_LOGGER("DistancesLengthsCallback");
-};
-
-}
diff --git a/src/include/omni/range.hpp b/src/include/omni/range.hpp
deleted file mode 100644
index a321eb0..0000000
--- a/src/include/omni/range.hpp
+++ /dev/null
@@ -1,92 +0,0 @@
-#pragma once
-
-#include "verify.hpp"
-
-namespace omnigraph {
-
-struct Range {
-private:
- bool inside(size_t left, size_t right, size_t point) const {
- return left <= point && point <= right;
- }
-
-public:
- //inclusive
- size_t start_pos;
- //exclusive
- size_t end_pos;
-
- size_t size() const {
- VERIFY(end_pos >= start_pos);
- return end_pos - start_pos;
- }
-
- void shift(int shift) {
- VERIFY(shift > 0 || size_t(-shift) <= start_pos);
- start_pos += shift;
- end_pos += shift;
- }
-
- Range(): start_pos(0), end_pos(0) {
- VERIFY(end_pos >= start_pos);
- }
-
- Range(size_t start_pos, size_t end_pos)
- : start_pos(start_pos),
- end_pos(end_pos) {
- VERIFY(end_pos >= start_pos);
- }
-
- bool operator<(const Range &other) const {
- if (start_pos != other.start_pos)
- return start_pos < other.start_pos;
- return end_pos < other.end_pos;
- }
-
- bool contains(const Range& that) const {
- return start_pos <= that.start_pos && end_pos >= that.end_pos;
- }
-
- Range Merge(const Range &other) const {
- return Range(this->start_pos, other.end_pos);
- }
-
- Range Invert(size_t base_length) const {
- VERIFY(base_length >= end_pos);
- return Range(base_length - end_pos, base_length - start_pos);
- }
-
- Range& operator=(const Range& other) {
- start_pos = other.start_pos;
- end_pos = other.end_pos;
- return *this;
- }
-
- bool empty() const {
- return start_pos == end_pos;
- }
-
- bool Intersect(const Range &other) const {
- return inside(start_pos, end_pos, other.start_pos) || inside(start_pos, end_pos, other.end_pos) ||
- inside(other.start_pos, other.end_pos, start_pos);
- }
-
- bool IntersectLeftOf(const Range &other) const {
- return inside(start_pos, end_pos, other.start_pos) && inside(other.start_pos, other.end_pos, end_pos);
- }
-
- bool operator==(const Range &that) const {
- return start_pos == that.start_pos && end_pos == that.end_pos;
- }
-
- bool operator!=(const Range &that) const {
- return !(*this == that);
- }
-};
-
-inline std::ostream& operator<<(std::ostream& os, const Range& range) {
- os << "[" << (range.start_pos + 1) << " - " << range.end_pos << "]";
- return os;
-}
-
-}
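Range above is used throughout the graph code with an inclusive start_pos and an exclusive end_pos. A trimmed, assert-based sketch of those conventions (VERIFY checks and the stream operator dropped; not the full struct):

    #include <cassert>
    #include <cstddef>

    struct Range {
        std::size_t start_pos, end_pos;                  // inclusive / exclusive
        std::size_t size() const { return end_pos - start_pos; }
        bool contains(const Range& that) const {
            return start_pos <= that.start_pos && end_pos >= that.end_pos;
        }
        void shift(int s) { start_pos += s; end_pos += s; }
        Range invert(std::size_t base_length) const {    // mirror onto the reverse strand
            return Range{base_length - end_pos, base_length - start_pos};
        }
    };

    int main() {
        Range r{10, 20};
        assert(r.size() == 10);
        assert(r.contains(Range{12, 18}));
        r.shift(5);                                      // now [15, 25)
        Range rc = r.invert(100);                        // [75, 85) on a sequence of length 100
        assert(rc.start_pos == 75 && rc.end_pos == 85);
        return 0;
    }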
diff --git a/src/include/omni/relative_coverage_remover.hpp b/src/include/omni/relative_coverage_remover.hpp
deleted file mode 100644
index 6ebd70b..0000000
--- a/src/include/omni/relative_coverage_remover.hpp
+++ /dev/null
@@ -1,674 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include "standard_base.hpp"
-#include "graph_component.hpp"
-#include "omni/visualization/graph_colorer.hpp"
-#include "graph_processing_algorithm.hpp"
-
-namespace omnigraph {
-
-namespace simplification {
-
-template<class EdgeContainer>
-void SingleEdgeAdapter(
- const EdgeContainer& edges,
- std::function<void(typename EdgeContainer::value_type)> single_edge_handler_f) {
- for (auto e : edges) {
- single_edge_handler_f(e);
- }
-}
-
-namespace relative_coverage {
-
-template<class Graph>
-class Component {
- typedef typename Graph::EdgeId EdgeId;
- typedef typename Graph::VertexId VertexId;
-
- const Graph& g_;
- set<EdgeId> edges_;
- set<VertexId> inner_vertices_;
- set<VertexId> border_;
- set<VertexId> terminating_vertices_;
- //maybe use something more sophisticated in future
- size_t cumm_length_;
- bool contains_deadends_;
-
- //if edge start == edge end == v, returns v
- VertexId OppositeEnd(EdgeId e, VertexId v) const {
- VERIFY(g_.EdgeStart(e) == v
- || g_.EdgeEnd(e) == v);
-// VERIFY(remover_.g.EdgeStart(e) != remover_.g.EdgeEnd(e));
- if (g_.EdgeStart(e) == v) {
- return g_.EdgeEnd(e);
- } else {
- return g_.EdgeStart(e);
- }
- }
-
- void RemoveFromBorder(VertexId v) {
- size_t cnt = border_.erase(v);
- VERIFY(cnt);
- }
-
-public:
-
- Component(const Graph& g, EdgeId e) : g_(g), cumm_length_(0), contains_deadends_(false) {
- edges_.insert(e);
- cumm_length_ += g_.length(e);
- border_.insert(g.EdgeStart(e));
- border_.insert(g.EdgeEnd(e));
- }
-
- void MakeInner(VertexId v) {
- VERIFY(border_.count(v) > 0);
- if (g_.IsDeadEnd(v) || g_.IsDeadStart(v)) {
- contains_deadends_ = true;
- }
- inner_vertices_.insert(v);
- for (EdgeId e : g_.IncidentEdges(v)) {
- //seems to correctly handle loops
- if (edges_.count(e) == 0) {
- edges_.insert(e);
- cumm_length_ += g_.length(e);
- VertexId other_end = OppositeEnd(e, v);
- if (inner_vertices_.count(other_end) == 0) {
- border_.insert(other_end);
- }
- }
- }
- RemoveFromBorder(v);
- }
-
- void TerminateOnVertex(VertexId v) {
- terminating_vertices_.insert(v);
- RemoveFromBorder(v);
- }
-
- VertexId NextBorderVertex() const {
- return *border_.begin();
- }
-
- bool IsBorderEmpty() const {
- return border_.empty();
- }
-
- const set<EdgeId>& edges() const {
- return edges_;
- }
-
- bool contains(EdgeId e) const {
- return edges_.count(e) > 0;
- }
-
- const set<VertexId>& terminating_vertices() const {
- return terminating_vertices_;
- }
-
- set<EdgeId> terminating_edges() const {
- set<EdgeId> answer;
- for (VertexId v : terminating_vertices()) {
- for (EdgeId e : g_.IncidentEdges(v)) {
- if (contains(e)) {
- answer.insert(e);
- }
- }
- }
- return answer;
- }
-
- //terminating edges, going into the component
- set<EdgeId> terminating_in_edges() const {
- set<EdgeId> answer;
- for (VertexId v : terminating_vertices()) {
- for (EdgeId e : g_.OutgoingEdges(v)) {
- if (contains(e)) {
- answer.insert(e);
- }
- }
- }
- return answer;
- }
-
- //terminating edges, going out of the component
- set<EdgeId> terminating_out_edges() const {
- set<EdgeId> answer;
- for (VertexId v : terminating_vertices()) {
- for (EdgeId e : g_.IncomingEdges(v)) {
- if (contains(e)) {
- answer.insert(e);
- }
- }
- }
- return answer;
- }
-
- const Graph& g() const {
- return g_;
- }
-
- size_t inner_vertex_cnt() const {
- return inner_vertices_.size();
- }
-
- size_t length() const {
- return cumm_length_;
- }
-
- bool contains_deadends() const {
- return contains_deadends_;
- }
-};
-
-template<class Graph>
-class RelativeCoverageHelper {
- typedef typename Graph::EdgeId EdgeId;
- typedef typename Graph::VertexId VertexId;
- typedef std::function<double(EdgeId, VertexId)> LocalCoverageFT;
-
- const Graph& g_;
- LocalCoverageFT local_coverage_f_;
- double min_coverage_gap_;
-
-public:
- RelativeCoverageHelper(const Graph& g, LocalCoverageFT local_coverage_f,
- double min_coverage_gap)
- : g_(g),
- local_coverage_f_(local_coverage_f),
- min_coverage_gap_(min_coverage_gap) {
- VERIFY(math::gr(min_coverage_gap, 1.));
- }
-
- double LocalCoverage(EdgeId e, VertexId v) const {
- DEBUG("Local coverage of edge " << g_.str(e) << " around vertex " << g_.str(v) << " was " << local_coverage_f_(e, v));
- return local_coverage_f_(e, v);
- }
-
- template<class EdgeContainer>
- double MaxLocalCoverage(const EdgeContainer& edges, VertexId v) const {
- double answer = 0.0;
- for (EdgeId e : edges) {
- answer = max(answer, LocalCoverage(e, v));
- }
- return answer;
- }
-
- template<class EdgeContainer>
- bool CheckAnyHighlyCovered(const EdgeContainer& edges, VertexId v,
- double base_coverage) const {
- return math::gr(MaxLocalCoverage(edges, v),
- base_coverage * min_coverage_gap_);
- }
-
- double RelativeCoverageToReport(VertexId v, double base_coverage) const {
- return std::min(MaxLocalCoverage(g_.OutgoingEdges(v), v),
- MaxLocalCoverage(g_.IncomingEdges(v), v))
- / base_coverage;
- }
-
-private:
- DECL_LOGGER("RelativeCoverageHelper");
-};
-
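RelativeCoverageHelper above declares an edge relatively low-covered at a junction when some neighbouring edge is covered at least min_coverage_gap times deeper, and the disconnector below requires that on both sides of the vertex. A toy sketch of that check with plain double coverages (function and variable names are invented for the example):

    #include <algorithm>
    #include <cstdio>
    #include <vector>

    static bool any_highly_covered(const std::vector<double>& neighbour_cov,
                                   double base_cov, double min_coverage_gap) {
        double max_cov = 0.0;
        for (double c : neighbour_cov) max_cov = std::max(max_cov, c);
        return max_cov > base_cov * min_coverage_gap;    // strict coverage gap
    }

    int main() {
        double edge_cov = 3.0, min_coverage_gap = 5.0;
        std::vector<double> incoming{40.0, 2.5}, outgoing{35.0};

        // Both sides of the junction carry an edge covered more than 5x deeper
        // than 3.0, so the 3.0x edge would be flagged by the real code.
        bool flagged = any_highly_covered(incoming, edge_cov, min_coverage_gap) &&
                       any_highly_covered(outgoing, edge_cov, min_coverage_gap);
        std::printf("flagged: %s\n", flagged ? "yes" : "no");   // yes
    }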
-template<class Graph>
-class LongestPathFinder {
- typedef typename Graph::EdgeId EdgeId;
- typedef typename Graph::VertexId VertexId;
- const Component<Graph>& component_;
- const Graph& g_;
- map<VertexId, int> max_distance_;
- vector<VertexId> vertex_stack_;
- bool cycle_detected_;
-
- //distance is changed!
- bool TryGetMaxDistance(VertexId v, int& distance) {
- if (max_distance_.count(v) > 0) {
- distance = max_distance_[v];
- return true;
- }
-
- //minus infinity for incoming tips
- distance = std::numeric_limits<int>::min();
- for (EdgeId e : g_.IncomingEdges(v)) {
- VertexId start = g_.EdgeStart(e);
- if (component_.contains(e)) {
- if (max_distance_.count(start) == 0) {
- if (std::find(vertex_stack_.begin(), vertex_stack_.end(), start) != vertex_stack_.end()) {
- cycle_detected_ = true;
- }
- vertex_stack_.push_back(start);
- return false;
- } else {
- distance = std::max(distance, max_distance_[start] + int(g_.length(e)));
- }
- }
- }
- //todo think...
- //currently whole length of zig-zag path
- //through several terminal vertices is counted
- if (component_.terminating_vertices().count(v) > 0) {
- distance = std::max(distance, 0);
- }
- return true;
- }
-
- void ProcessVertex(VertexId init_v) {
- vertex_stack_.push_back(init_v);
- while (!vertex_stack_.empty()) {
- if (cycle_detected_)
- return;
-
- VertexId v = vertex_stack_.back();
- int max_dist = 0;
- if (TryGetMaxDistance(v, max_dist)) {
- max_distance_[v] = max_dist;
- vertex_stack_.pop_back();
- }
- }
- }
-
-public:
- LongestPathFinder(const Component<Graph>& component)
- : component_(component), g_(component.g()), cycle_detected_(false) {
- }
-
- //-1u if component contains a cycle or no path between terminating vertices
- size_t Find() {
- int answer = 0;
- for (VertexId v : component_.terminating_vertices()) {
- ProcessVertex(v);
- if (cycle_detected_)
- return -1u;
- VERIFY(max_distance_.count(v) > 0);
- answer = std::max(answer, get(max_distance_, v));
- }
- VERIFY(answer >= 0);
- if (answer == 0)
- return -1u;
- return size_t(answer);
- }
-};
-
-template<class Graph>
-class ComponentChecker {
- typedef typename Graph::EdgeId EdgeId;
- typedef typename Graph::VertexId VertexId;
-
- const Graph& g_;
- size_t vertex_count_limit_;
- size_t length_bound_;
- size_t tip_allowing_length_bound_;
- size_t longest_connecting_path_bound_;
- double max_coverage_;
-
- bool CoverageCheck(const Component<Graph>& component) const {
- for (EdgeId e : component.edges()) {
- if (math::gr(g_.coverage(e), max_coverage_)) {
- TRACE("Too high coverage! Component contains highly covered edge " << g_.str(e)
- << " of coverage " << g_.coverage(e) << " while threshold was " << max_coverage_);
- return false;
- }
- }
- return true;
- }
-
-public:
- ComponentChecker(const Graph& g, size_t vertex_count_limit, size_t length_bound,
- size_t tip_allowing_length_bound,
- size_t longest_connecting_path_bound,
- double max_coverage)
- : g_(g), vertex_count_limit_(vertex_count_limit),
- length_bound_(length_bound),
- tip_allowing_length_bound_(tip_allowing_length_bound),
- longest_connecting_path_bound_(longest_connecting_path_bound),
- max_coverage_(max_coverage) {
- }
-
- bool SizeCheck(const Component<Graph>& component) const {
- if (component.inner_vertex_cnt() > vertex_count_limit_) {
- TRACE("Too many vertices : " << component.inner_vertex_cnt() << " ! More than " << vertex_count_limit_);
- return false;
- }
- return true;
- }
-
- bool FullCheck(const Component<Graph>& component) const {
- TRACE("Performing full check of the component");
- size_t longest_connecting_path = LongestPathFinder<Graph>(component).Find();
- if (longest_connecting_path != -1u) {
- if (longest_connecting_path >= longest_connecting_path_bound_) {
- TRACE("Length of longest path: " << longest_connecting_path << "; threshold: " << longest_connecting_path_bound_);
- return false;
- }
- } else {
- TRACE("Failed to find longest connecting path (check for cycles)");
- }
- if (!component.contains_deadends()
- && component.length() > length_bound_) {
- TRACE("Too long component of length " << component.length() << "! Longer than length bound " << length_bound_);
- return false;
- } else if (component.length() > tip_allowing_length_bound_) {
- TRACE("Too long component of length " << component.length() << "! Longer than tip allowing length bound " << tip_allowing_length_bound_);
- return false;
- }
-
- return SizeCheck(component) && CoverageCheck(component);
- }
-
-private:
- DECL_LOGGER("RelativelyLowCoveredComponentChecker");
-};
-
-//Removes last (k+1)-mer of graph edge
-template<class Graph>
-class EdgeDisconnector {
- typedef typename Graph::EdgeId EdgeId;
- Graph& g_;
- EdgeRemover<Graph> edge_remover_;
-
-public:
- EdgeDisconnector(Graph& g,
- HandlerF<Graph> removal_handler = nullptr):
- g_(g), edge_remover_(g, removal_handler) {
- }
-
- EdgeId operator()(EdgeId e) {
- VERIFY(g_.length(e) > 1);
- pair<EdgeId, EdgeId> split_res = g_.SplitEdge(e, 1);
- edge_remover_.DeleteEdge(split_res.first);
- return split_res.first;
- }
-};
-
-//todo make parallel
-template<class Graph>
-class RelativeCoverageDisconnector: public EdgeProcessingAlgorithm<Graph> {
- typedef typename Graph::EdgeId EdgeId;
- typedef typename Graph::VertexId VertexId;
- typedef std::function<double(EdgeId, VertexId)> LocalCoverageFT;
- typedef EdgeProcessingAlgorithm<Graph> base;
-
- const RelativeCoverageHelper<Graph> rel_helper_;
- EdgeDisconnector<Graph> disconnector_;
- size_t cnt_;
-public:
- RelativeCoverageDisconnector(Graph& g,
- LocalCoverageFT local_coverage_f, double diff_mult) :
- base(g, false),
- rel_helper_(g, local_coverage_f, diff_mult),
- disconnector_(g),
- cnt_(0) {
- }
-
- ~RelativeCoverageDisconnector() {
- DEBUG("Disconnected edge cnt " << cnt_);
- }
-
-protected:
- bool ProcessEdge(EdgeId edge) {
- DEBUG("Processing edge " << this->g().int_id(edge));
- VertexId v = this->g().EdgeStart(edge);
- double coverage_edge_around_v = rel_helper_.LocalCoverage(edge, v);
- DEBUG("Local flanking coverage - " << coverage_edge_around_v);
- DEBUG("Max local coverage incoming - " << rel_helper_.MaxLocalCoverage(this->g().IncomingEdges(v), v));
- DEBUG("Max local coverage outgoing - " << rel_helper_.MaxLocalCoverage(this->g().OutgoingEdges(v), v));
- if (this->g().length(edge) > 1 &&
- rel_helper_.CheckAnyHighlyCovered(this->g().IncomingEdges(v), v, coverage_edge_around_v) &&
- rel_helper_.CheckAnyHighlyCovered(this->g().OutgoingEdges(v), v, coverage_edge_around_v)) {
- DEBUG("Disconnecting");
- disconnector_(edge);
- cnt_++;
- return true;
- } else {
- DEBUG("No need to disconnect");
- return false;
- }
- }
-
-private:
-
- DECL_LOGGER("RelativeCoverageDisconnector");
-};
-
-template<class Graph>
-class ComponentSearcher {
- typedef typename Graph::EdgeId EdgeId;
- typedef typename Graph::VertexId VertexId;
-
- const Graph& g_;
- const RelativeCoverageHelper<Graph>& rel_helper_;
- const ComponentChecker<Graph>& checker_;
- Component<Graph> component_;
-
-public:
- ComponentSearcher(const Graph& g,
- const RelativeCoverageHelper<Graph>& rel_helper,
- const ComponentChecker<Graph>& checker,
- EdgeId first_edge)
- : g_(g), rel_helper_(rel_helper), checker_(checker),
- component_(g_, first_edge) {
- }
-
- bool FindComponent() {
- while (!component_.IsBorderEmpty()) {
- if (!checker_.SizeCheck(component_))
- return false;
-
- VertexId v = component_.NextBorderVertex();
-
- TRACE("Checking if vertex " << g_.str(v) << " is terminating.");
- //checking if there is a sufficient coverage gap
- if (!IsTerminateVertex(v)) {
- TRACE("Not terminating, adding neighbourhood");
- component_.MakeInner(v);
- if (component_.terminating_vertices().count(v) > 0) {
- TRACE("Terminating vertex classified as non-terminating");
- return false;
- }
- } else {
- TRACE("Terminating");
- component_.TerminateOnVertex(v);
- }
- }
-
- return checker_.FullCheck(component_);
- }
-
- const Component<Graph>& component() const {
- return component_;
- }
-
-private:
-
- bool IsTerminateVertex(VertexId v) const {
- double base_coverage = rel_helper_.MaxLocalCoverage(
- RetainEdgesFromComponent(g_.IncidentEdges(v)), v);
- return CheckAnyFilteredHighlyCovered(g_.OutgoingEdges(v),
- v, base_coverage)
- && CheckAnyFilteredHighlyCovered(
- g_.IncomingEdges(v), v, base_coverage);
- }
-
- template<class EdgeContainer>
- bool CheckAnyFilteredHighlyCovered(const EdgeContainer& edges,
- VertexId v,
- double base_coverage) const {
- return rel_helper_.CheckAnyHighlyCovered(
- FilterEdgesFromComponent(edges), v, base_coverage);
- }
-
- template<class EdgeContainer>
- vector<EdgeId> FilterEdgesFromComponent(
- const EdgeContainer& edges) const {
- vector<EdgeId> answer;
- for (EdgeId e : edges) {
- if (!component_.contains(e)) {
- answer.push_back(e);
- }
- }
- return answer;
- }
-
- template<class EdgeContainer>
- vector<EdgeId> RetainEdgesFromComponent(
- const EdgeContainer& edges) const {
- vector<EdgeId> answer;
- for (EdgeId e : edges) {
- if (component_.contains(e)) {
- answer.push_back(e);
- }
- }
- return answer;
- }
-
- DECL_LOGGER("RelativelyLowCoveredComponentSearcher")
- ;
-};
-
-//currently works with conjugate graphs only (ProcessEdge inspects only the start vertex and relies on conjugate symmetry)
-template<class Graph>
-class RelativeCoverageComponentRemover : public EdgeProcessingAlgorithm<Graph> {
- typedef EdgeProcessingAlgorithm<Graph> base;
- typedef typename Graph::EdgeId EdgeId;
- typedef typename Graph::VertexId VertexId;
- typedef std::function<double(EdgeId, VertexId)> LocalCoverageFT;
- typedef typename ComponentRemover<Graph>::HandlerF HandlerF;
- typedef pred::TypedPredicate<EdgeId> ProceedConditionT;
-
- RelativeCoverageHelper<Graph> rel_helper_;
- size_t length_bound_;
- size_t tip_allowing_length_bound_;
- size_t longest_connecting_path_bound_;
- double max_coverage_;
- //bound on the number of inner vertices
- size_t vertex_count_limit_;
- std::string vis_dir_;
- ComponentRemover<Graph> component_remover_;
-
- size_t fail_cnt_;
- size_t succ_cnt_;
-
- void VisualizeNontrivialComponent(const set<typename Graph::EdgeId>& edges, bool success) {
- auto colorer = omnigraph::visualization::DefaultColorer(this->g());
- auto edge_colorer = make_shared<visualization::CompositeEdgeColorer<Graph>>("black");
- edge_colorer->AddColorer(colorer);
- edge_colorer->AddColorer(make_shared<visualization::SetColorer<Graph>>(this->g(), edges, "green"));
- // shared_ptr<visualization::GraphColorer<Graph>>
- auto resulting_colorer = make_shared<visualization::CompositeGraphColorer<Graph>>(colorer, edge_colorer);
-
- StrGraphLabeler<Graph> str_labeler(this->g());
- CoverageGraphLabeler<Graph> cov_labler(this->g());
- CompositeLabeler<Graph> labeler(str_labeler, cov_labler);
-
- if (edges.size() > 1) {
- set<typename Graph::VertexId> vertices;
- for (auto e : edges) {
- vertices.insert(this->g().EdgeStart(e));
- vertices.insert(this->g().EdgeEnd(e));
- }
-
-
- auto filename = success ? vis_dir_ + "/success/" + ToString(succ_cnt_++) : vis_dir_ + "/fail/" + ToString(fail_cnt_++);
- visualization::WriteComponent(
- ComponentCloser<Graph>(this->g(), 0).CloseComponent(GraphComponent<Graph>(this->g(), vertices.begin(), vertices.end())),
- filename + ".dot", colorer, labeler);
- }
- }
-
-public:
- RelativeCoverageComponentRemover(
- Graph& g, LocalCoverageFT local_coverage_f,
- double min_coverage_gap,
- size_t length_bound,
- size_t tip_allowing_length_bound,
- size_t longest_connecting_path_bound,
- double max_coverage = std::numeric_limits<double>::max(),
- HandlerF handler_function = 0, size_t vertex_count_limit = 10,
- std::string vis_dir = "")
- : base(g),
- rel_helper_(g, local_coverage_f, min_coverage_gap),
- length_bound_(length_bound),
- tip_allowing_length_bound_(tip_allowing_length_bound),
- longest_connecting_path_bound_(longest_connecting_path_bound),
- max_coverage_(max_coverage),
- vertex_count_limit_(vertex_count_limit),
- vis_dir_(vis_dir),
- component_remover_(g, handler_function),
- fail_cnt_(0),
- succ_cnt_(0) {
- VERIFY(math::gr(min_coverage_gap, 1.));
- VERIFY(tip_allowing_length_bound >= length_bound);
- TRACE("Coverage gap " << min_coverage_gap);
- if (!vis_dir_.empty()) {
- path::make_dirs(vis_dir_);
- path::make_dirs(vis_dir_ + "/success/");
- path::make_dirs(vis_dir_ + "/fail/");
- }
- }
-
-protected:
-
- bool ProcessEdge(EdgeId e) {
- TRACE("Processing edge " << this->g().str(e));
-
- //here we use that the graph is conjugate!
- VertexId v = this->g().EdgeStart(e);
- if (this->g().IsDeadEnd(v) && this->g().IsDeadStart(v)) {
- TRACE("Isolated");
- return false;
- }
- if (this->g().IsDeadEnd(v) || this->g().IsDeadStart(v)) {
- TRACE("Tip");
- return false;
- }
-
- double local_cov = rel_helper_.LocalCoverage(e, v);
-
- TRACE("Local coverage around start " << this->g().str(v) << " is " << local_cov);
-
- //since min_coverage_gap_ > 1, we don't need to think about e here
- TRACE("Checking presence of highly covered edges around start")
- if (rel_helper_.CheckAnyHighlyCovered(this->g().OutgoingEdges(v), v, local_cov)
- && rel_helper_.CheckAnyHighlyCovered(this->g().IncomingEdges(v), v,
- local_cov)) {
- TRACE("Looking for component");
- ComponentChecker<Graph> checker(this->g(), vertex_count_limit_, length_bound_,
- tip_allowing_length_bound_,
- longest_connecting_path_bound_, max_coverage_);
- //case of e being loop is handled implicitly!
- ComponentSearcher<Graph> component_searcher(
- this->g(), rel_helper_, checker, e);
- if (component_searcher.FindComponent()) {
- TRACE("Deleting component");
- const Component<Graph>& component = component_searcher.component();
- component_remover_.DeleteComponent(component.edges());
- return true;
- } else {
- TRACE("Failed to find component");
- if (!vis_dir_.empty()) {
- TRACE("Outputting image");
- VisualizeNontrivialComponent(component_searcher.component().edges(), false);
- }
- }
- } else {
- TRACE("No highly covered edges around");
- }
-
- return false;
- }
-
-private:
- DECL_LOGGER("RelativeCoverageComponentRemover");
-};
-
-}
-}
-
-}
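The relative-coverage code removed above boils down to one predicate: an edge (or a whole component) is a removal candidate only if some neighbouring edge is at least min_coverage_gap times better covered than it. A minimal self-contained sketch of that check, using plain doubles and a hypothetical AnyHighlyCovered helper rather than the SPAdes graph types:

#include <algorithm>
#include <cassert>
#include <vector>

// Sketch of the relative-coverage criterion: true if any neighbouring edge is
// covered at least min_coverage_gap times better than the edge under test.
bool AnyHighlyCovered(const std::vector<double>& neighbour_coverages,
                      double base_coverage, double min_coverage_gap) {
    return std::any_of(neighbour_coverages.begin(), neighbour_coverages.end(),
                       [&](double c) { return c >= base_coverage * min_coverage_gap; });
}

int main() {
    // Edge covered 3x, neighbours covered 2.5x and 40x, required gap 5x:
    assert(AnyHighlyCovered({2.5, 40.0}, 3.0, 5.0));   // 40 >= 3 * 5, so it is a candidate
    assert(!AnyHighlyCovered({2.5, 4.0}, 3.0, 5.0));   // no neighbour clears the 15x bar
    return 0;
}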
diff --git a/src/include/omni/splitters.hpp b/src/include/omni/splitters.hpp
deleted file mode 100644
index 2eaaed2..0000000
--- a/src/include/omni/splitters.hpp
+++ /dev/null
@@ -1,921 +0,0 @@
-#pragma once
-
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#include "standard_base.hpp"
-#include "graph_component.hpp"
-#include "dijkstra_tools/dijkstra_helper.hpp"
-#include "component_filters.hpp"
-
-namespace omnigraph {
-
-
-template<typename Element>
-class JSIterator {
-public:
-
- virtual Element Next() = 0;
-
- virtual bool HasNext() = 0;
-
- virtual ~JSIterator() {
- }
-};
-
-template<class Graph>
-class GraphSplitter : public JSIterator<GraphComponent<Graph>>{
-private:
- const Graph& graph_;
-public:
- GraphSplitter(const Graph& graph)
- : graph_(graph) {
- }
-
- const Graph& graph() const {
- return graph_;
- }
-};
-
-template<class Graph>
-class PrecountedComponentSplitter : public GraphSplitter<Graph> {
- bool HasNext_;
- GraphComponent<Graph> component_;
-public:
-
-    template<class It>
-    PrecountedComponentSplitter(const Graph &graph, It begin, It end)
-            : GraphSplitter<Graph>(graph), HasNext_(true),
-              component_(graph, begin, end) {
-    }
-
-    //no template parameter is needed here: the component is passed in directly
-    PrecountedComponentSplitter(GraphComponent<Graph> component)
-            : GraphSplitter<Graph>(component.g()), HasNext_(true),
-              component_(component) {
-    }
-
- GraphComponent<Graph> Next() {
- HasNext_ = false;
- return component_;
- }
-
-// virtual bool CheckPutVertex(VertexId /*vertex*/, EdgeId edge, size_t /*length*/) const {
-// return edges_.count(edge) != 0;
-// }
- bool HasNext() {
- return HasNext_;
- }
-};
-
-template<typename Element>
-class RelaxingIterator : public JSIterator<Element> {
-public:
- template<typename It>
- void Relax(It begin, It end) {
- Relax(vector<Element>(begin, end));
- }
-
-// virtual bool CheckProcessVertex(VertexId /*vertex*/, size_t distance) {
-// return distance <= bound_;
-// }
- virtual void Relax(const vector<Element> &v) = 0;
-
- virtual void Relax(Element) = 0;
-
- virtual ~RelaxingIterator() {
- }
-};
-
-template<class Collection>
-class CollectionIterator : public RelaxingIterator<typename Collection::value_type> {
-private:
- typedef typename Collection::value_type Element;
- typedef typename Collection::const_iterator Iter;
- shared_ptr<Collection> storage_;
- Iter current_;
- const Iter end_;
- set<Element> relaxed_;
-public:
- CollectionIterator(const Collection &collection)
- : current_(collection.begin()), end_(collection.end()) {
- }
-
-// virtual bool CheckPutVertex(VertexId vertex, EdgeId /*edge*/, size_t /*length*/) const {
-// return subgraph_.count(vertex) != 0;
-// }
- CollectionIterator(shared_ptr<Collection> collection)
- : storage_(collection), current_(collection->begin()), end_(collection->end()) {
- }
-
- CollectionIterator(Iter begin, Iter end)
- : current_(begin), end_(end) {
- }
-
- Element Next() {
-        if(!HasNext()) { //note: HasNext() has a side effect here, it advances current_ past relaxed elements
-            //fixme: use VERIFY_MSG instead
-            VERIFY(HasNext());
-        }
- Element next = *current_;
- ++current_;
- return next;
- }
-
-//public:
-// ErrorComponentSplitter(const Graph &graph, const set<EdgeId> &black_edges) :
-// base(graph), black_edges_(black_edges), iterator_(
-// graph.SmartEdgeBegin()) {
-// TRACE("ErrorComponentSplitter created and SmartIterator initialized");
-// }
-//
-// virtual ~ErrorComponentSplitter() {
-// }
-//
-// vector<VertexId> FindComponent(VertexId start_vertex) {
-// ComponentFinder<Graph> cf(this->graph(), black_edges_);
-// cf.run(start_vertex);
-// return cf.ReachedVertices();
-// }
-//
-// vector<VertexId> FindNeighbourhood(VertexId start, size_t bound) {
-// NeighbourhoodFinder<Graph> nf(this->graph(), black_edges_, bound);
-// nf.run(start);
-// return nf.ReachedVertices();
-// }
-//
-// size_t FindDiameter(const vector<VertexId> &component) {
-// set < VertexId > component_set(component.begin(), component.end());
-// size_t result = 0;
-// VertexId current = *(component.begin());
-// for (size_t i = 0; i < 4; i++) {
-// pair<VertexId, size_t> next = GetFarthest(current, component_set);
-// current = next.first;
-// result = next.second;
-// }
-// return result;
-// }
-//
-// pair<VertexId, size_t> GetFarthest(VertexId v,
-// const set<VertexId> &component) {
-// SubgraphDijkstra<Graph> sd(this->graph(), component);
-// sd.run(v);
-// pair<VertexId, size_t> result(v, 0);
-// auto bounds = sd.GetDistances();
-// for (auto it = bounds.first; it != bounds.second; ++it) {
-// if (it->second > result.second) {
-// result = *it;
-// }
-// }
-// return result;
-// }
-//
-// virtual vector<VertexId> NextComponent() {
-// TRACE("Construction of next component started");
-// if (Finished()) {
-// VERIFY(false);
-// return vector<VertexId>();
-// }
-// EdgeId next = *iterator_;
-// ++iterator_;
-// vector < VertexId > component = FindComponent(
-// this->graph().EdgeEnd(next));
-// TRACE("Error edges component constructed. It contains "
-// << component.size() << " vertices");
-// size_t component_size = FindDiameter(component);
-// TRACE("Diameter of component is " << component_size);
-// vector < VertexId > neighbourhood = FindNeighbourhood(
-// this->graph().EdgeEnd(next), (size_t) math::round(1.5 * (double) component_size));
-// TRACE("Error edges component neighborhood constructed. It contains "
-// << neighbourhood.size() << " vertices");
-// visited_.insert(component.begin(), component.end());
-// return neighbourhood;
-// }
-//
-// virtual bool Finished() {
-// while (!iterator_.IsEnd()) {
-// if (black_edges_.find(*iterator_) != black_edges_.end()
-// && visited_.find(this->graph().EdgeEnd(*iterator_))
-// == visited_.end()) {
-// return false;
-// }
-// ++iterator_;
-// }
-// return true;
-// }
- bool HasNext() {
- while(current_ != end_ && relaxed_.count(*current_) == 1) {
- ++current_;
- }
- return current_ != end_;
- }
-
- void Relax(Element e) {
- relaxed_.insert(e);
- }
-
-//template<class Graph>
-//class ShortEdgeComponentNeighbourhoodFinder: public UnorientedDijkstra<Graph> {
-//private:
-// typedef UnorientedDijkstra<Graph> base;
-//protected:
-// typedef typename base::VertexId VertexId;
-// typedef typename base::EdgeId EdgeId;
-// typedef typename base::DistanceType distance_t;
-//private:
-// distance_t bound_;
-//public:
-// ShortEdgeComponentNeighbourhoodFinder(const Graph &graph, distance_t bound) :
-// UnorientedDijkstra<Graph>(graph), bound_(bound) {
-// }
-//
-// virtual bool CheckProcessVertexVertexId (VertexId /*vertex*/, distance_t distance) {
-// return distance == 0;
-// }
-//
-// virtual distance_t GetLength(EdgeId edge) const {
-// if (this->graph().length(edge) <= bound_)
-// return 0;
-// else
-// return 1;
-// }
- void Relax(const vector<Element> &v) {
- for (auto it = v.begin(); it != v.end(); ++it)
- Relax(*it);
- }
-
- virtual ~CollectionIterator() {
- }
-};
-
-template<class Graph>
-class PathIterator : public RelaxingIterator<typename Graph::VertexId> {
-private:
- typedef typename Graph::VertexId VertexId;
- typedef typename Graph::EdgeId EdgeId;
- const Graph &graph_;
- vector<VertexId> path_;
- size_t current_;
-
- static vector<VertexId> ExtractVertices(const Graph &graph, const vector<EdgeId> &path) {
- vector<VertexId> result;
- for(size_t i = 0; i < path.size(); i++) {
- if(i == 0 || path[i] != path[i - 1]) {
- result.push_back(graph.EdgeStart(path[i]));
- result.push_back(graph.EdgeEnd(path[i]));
- }
- }
- return result;
- }
-
-public:
- PathIterator(const Graph &graph, const vector<EdgeId> &path)
- : graph_(graph), path_(ExtractVertices(graph, path)), current_(0) {
- }
-
- VertexId Next() {
- if(!HasNext()) {
- VERIFY(HasNext());
- }
- VertexId next = path_[current_];
- Relax(next);
- return next;
- }
-
- bool HasNext() {
- return current_ < path_.size();
- }
-
-    void Relax(const vector<VertexId> &v) {
-        set<VertexId> toRelax(v.begin(), v.end());
-        //guard against running past the end of the path
-        while(current_ < path_.size() && toRelax.count(path_[current_]) == 1)
-            current_++;
-    }
-
-//public:
-// CountingDijkstra(const Graph &graph, size_t max_size,
-// size_t edge_length_bound) :
-// base(graph), max_size_(max_size), edge_length_bound_(
-// edge_length_bound), current_(0) {
-// }
-//
-// virtual bool CheckPutVertex(VertexId /*vertex*/, EdgeId edge,
-// distance_t /*length*/) const {
-// if (current_ < max_size_) {
-// ++current_;
-// }
-// if (current_ < max_size_ && GetLength(edge) < inf) {
-// return true;
-// }
-// return false;
-// }
-//
-// virtual bool CheckProcessVertex(VertexId /*vertex*/, distance_t /*distance*/) {
-// return current_ < max_size_;
-// }
-//
-// virtual void init(VertexId /*start*/) {
-// current_ = 0;
-// }
-//
-// virtual size_t GetLength(EdgeId edge) const {
-// if (this->graph().length(edge) <= edge_length_bound_)
-// //todo change back
-//// return 1;
-// return this->graph().length(edge);
-// else
-// return inf;
-// }
- void Relax(VertexId e) {
- Relax(vector<VertexId>({e}));
- }
-};
-
-template<class Graph>
-class AbstractNeighbourhoodFinder {
-private:
- const Graph &graph_;
-public:
- AbstractNeighbourhoodFinder(const Graph &graph) : graph_(graph) {
- }
-
- const Graph &graph() const {
- return graph_;
- }
-
- virtual GraphComponent<Graph> Find(typename Graph::VertexId v) = 0;
-
- virtual vector<typename Graph::VertexId> InnerVertices(const GraphComponent<Graph> &component) = 0;
-
- virtual ~AbstractNeighbourhoodFinder() {
- }
-};
-
-template<class Graph, typename distance_t = size_t>
-class ComponentCloser {
-private:
- typedef typename Graph::VertexId VertexId;
- typedef typename Graph::EdgeId EdgeId;
-
- const Graph &graph_;
- size_t edge_length_bound_;
-
-public:
- ComponentCloser(const Graph &graph, size_t edge_length_bound)
- : graph_(graph),
- edge_length_bound_(edge_length_bound) {
- }
-
- void CloseComponent(set<VertexId> &component) const {
- set<VertexId> additional_vertices;
- for (auto it = component.begin(); it != component.end(); ++it) {
- for (EdgeId e : graph_.OutgoingEdges(*it)) {
- if (graph_.length(e) >= edge_length_bound_) {
- additional_vertices.insert(graph_.EdgeEnd(e));
- }
- }
- for (EdgeId e : graph_.IncomingEdges(*it)) {
- if (graph_.length(e) >= edge_length_bound_) {
- additional_vertices.insert(graph_.EdgeStart(e));
- }
- }
- }
- component.insert(additional_vertices.begin(),
- additional_vertices.end());
- }
-
- GraphComponent<Graph> CloseComponent(const GraphComponent<Graph>& component) const {
- set<VertexId> vertices(component.v_begin(), component.v_end());
- CloseComponent(vertices);
- return GraphComponent<Graph>(graph_, vertices.begin(), vertices.end());
- }
-};
-
-//Finds the neighbourhood of a set of vertices. Vertices connected by an edge longer than edge_length_bound (500 by default) are not treated as adjacent.
-template<class Graph>
-class ReliableNeighbourhoodFinder : public AbstractNeighbourhoodFinder<Graph> {
-private:
- typedef typename Graph::VertexId VertexId;
- typedef typename Graph::EdgeId EdgeId;
-
- set<VertexId> FindNeighbours(const set<VertexId> &s) {
- set<VertexId> result(s.begin(), s.end());
- for (VertexId v : result) {
- for (EdgeId e : this->graph().IncidentEdges(v)) {
- if(this->graph().length(e) <= edge_length_bound_) {
- result.insert(this->graph().EdgeEnd(e));
- result.insert(this->graph().EdgeStart(e));
- }
- }
- }
- return result;
- }
-
- set<VertexId> FindNeighbours(const set<VertexId> &s, size_t eps) {
- set<VertexId> result = s;
- for(size_t i = 0; i < eps; i++) {
- result = FindNeighbours(result);
- }
- return result;
- }
-
- set<VertexId> FindBorder(const GraphComponent<Graph> component) {
- set<VertexId> result;
- for(auto it = component.vertices().begin(); it != component.vertices().end(); ++it) {
- if(component.IsBorder(*it)) {
- result.insert(*it);
- }
- }
- return result;
- }
-
-public:
- static const size_t DEFAULT_EDGE_LENGTH_BOUND = 500;
- static const size_t DEFAULT_MAX_SIZE = 100;
-
- const size_t edge_length_bound_;
- const size_t max_size_;
-
- ReliableNeighbourhoodFinder(const Graph &graph, size_t edge_length_bound =
- DEFAULT_EDGE_LENGTH_BOUND,
- size_t max_size = DEFAULT_MAX_SIZE)
- : AbstractNeighbourhoodFinder<Graph>(graph),
- edge_length_bound_(edge_length_bound),
- max_size_(max_size) {
- }
-
- GraphComponent<Graph> Find(typename Graph::VertexId v) {
- auto cd = DijkstraHelper<Graph>::CreateCountingDijkstra(this->graph(), max_size_,
- edge_length_bound_);
- cd.Run(v);
- vector<VertexId> result_vector = cd.ReachedVertices();
- set<VertexId> result(result_vector.begin(), result_vector.end());
- ComponentCloser<Graph> cc(this->graph(), edge_length_bound_);
- cc.CloseComponent(result);
- return GraphComponent<Graph>(this->graph(), result.begin(),
- result.end());
- }
-
- vector<VertexId> InnerVertices(const GraphComponent<Graph> &component) {
- set<VertexId> border = FindNeighbours(FindBorder(component), 2);
- std::vector<VertexId> result;
- std::set_difference(component.vertices().begin(), component.vertices().end(), border.begin(), border.end(), std::inserter(result, result.end()));
- return vector<VertexId>(result.begin(), result.end());
- }
-};
-
-template<class Graph>
-class PathNeighbourhoodFinder : public AbstractNeighbourhoodFinder<Graph> {
- typedef typename Graph::EdgeId EdgeId;
- typedef typename Graph::VertexId VertexId;
-
- VertexId OtherEnd(EdgeId e, VertexId v) const {
- if (this->graph().EdgeStart(e) == v)
- return this->graph().EdgeEnd(e);
- else
- return this->graph().EdgeStart(e);
- }
-
- bool Go(VertexId v, size_t curr_depth, set<VertexId>& grey, set<VertexId>& black) const {
- //allows single vertex to be visited many times with different depth values
- TRACE("Came to vertex " << this->graph().str(v) << " on depth " << curr_depth);
- if (curr_depth >= max_depth_) {
- TRACE("Too deep");
- return true;
- }
- if (grey.size() >= max_size_) {
- TRACE("Too many vertices");
- return false;
- }
-
- TRACE("Started processing of vertex " << this->graph().str(v));
- grey.insert(v);
-
- TRACE("Sorting incident edges");
- vector<EdgeId> incident_path;
- vector<EdgeId> incident_non_path;
- for (EdgeId e : this->graph().IncidentEdges(v)) {
- if (path_edges_.count(e) != 0) {
- /*condition not to go backward*/
- if (this->graph().EdgeStart(e) == v) {
- incident_path.push_back(e);
- }
- } else {
- incident_non_path.push_back(e);
- }
- }
-
- for (EdgeId e : incident_non_path) {
- if (this->graph().length(e) > edge_length_bound_) {
- TRACE("Edge " << this->graph().str(e) << " is too long");
- continue;
- }
- TRACE("Going along edge " << this->graph().str(e));
- if (!Go(OtherEnd(e, v), curr_depth + 1, grey, black))
- return false;
- }
-
- TRACE("End processing of vertex " << this->graph().str(v));
- black.insert(v);
-
- for (EdgeId e : incident_path) {
- if (grey.count(OtherEnd(e, v)) != 0)
- continue;
- TRACE("Going along next path edge " << this->graph().str(e));
- if (!Go(OtherEnd(e, v), 0, grey, black))
- return false;
- }
-
- return true;
- }
-
-public:
- static const size_t DEFAULT_EDGE_LENGTH_BOUND = 500;
- static const size_t DEFAULT_MAX_DEPTH = 2;
- static const size_t DEFAULT_MAX_SIZE = 20;
-
- set<EdgeId> path_edges_;
- const size_t edge_length_bound_;
- const size_t max_size_;
- const size_t max_depth_;
-
- set<VertexId> last_inner_;
-
- PathNeighbourhoodFinder(const Graph &graph, const vector<EdgeId>& path, size_t edge_length_bound = DEFAULT_EDGE_LENGTH_BOUND,
- size_t max_size = DEFAULT_MAX_SIZE, size_t max_depth = DEFAULT_MAX_DEPTH)
- : AbstractNeighbourhoodFinder<Graph>(graph),
- path_edges_(path.begin(), path.end()),
- edge_length_bound_(edge_length_bound),
- max_size_(max_size),
- max_depth_(max_depth) {
- }
-
-
- GraphComponent<Graph> Find(VertexId v) {
- TRACE("Starting from vertex " << this->graph().str(v));
- last_inner_.clear();
- set<VertexId> grey;
- set<VertexId> black;
- Go(v, 0, grey, black);
- last_inner_ = black;
- last_inner_.insert(v);
- ComponentCloser<Graph>(this->graph(), 0).CloseComponent(grey);
- return GraphComponent<Graph>(this->graph(), grey.begin(), grey.end());
- }
-
- vector<VertexId> InnerVertices(const GraphComponent<Graph> &/*component*/) {
- return vector<VertexId>(last_inner_.begin(), last_inner_.end());
- }
-private:
- DECL_LOGGER("PathNeighbourhoodFinder");
-};
-
-//todo delete and think if we really need hierarchy
-template<class Graph>
-class ShortEdgeComponentFinder : public AbstractNeighbourhoodFinder<Graph> {
-private:
- typedef typename Graph::VertexId VertexId;
- typedef typename Graph::EdgeId EdgeId;
-public:
- static const size_t DEFAULT_EDGE_LENGTH_BOUND = 100;
-
- const size_t edge_length_bound_;
-
- ShortEdgeComponentFinder(const Graph &graph, size_t edge_length_bound = DEFAULT_EDGE_LENGTH_BOUND)
- : AbstractNeighbourhoodFinder<Graph>(graph),
- edge_length_bound_(edge_length_bound) {
- }
-
- GraphComponent<Graph> Find(VertexId v) {
- auto cd = DijkstraHelper<Graph>::CreateShortEdgeDijkstra(this->graph(), edge_length_bound_);
- cd.Run(v);
- set<VertexId> result = cd.ProcessedVertices();
- return GraphComponent<Graph>(this->graph(), result.begin(),
- result.end());
- }
-
- vector<VertexId> InnerVertices(const GraphComponent<Graph> &component) {
- return vector<VertexId>(component.v_begin(), component.v_end());
- }
-};
-
-template<class Graph>
-class FilteringSplitterWrapper : public GraphSplitter<Graph> {
-private:
- typedef typename Graph::VertexId VertexId;
- typedef typename Graph::EdgeId EdgeId;
-
- shared_ptr<GraphSplitter<Graph>> inner_splitter_;
- shared_ptr<GraphComponentFilter<Graph>> checker_;
- boost::optional<GraphComponent<Graph>> next_;
-public:
- FilteringSplitterWrapper(
- shared_ptr<GraphSplitter<Graph>> inner_splitter,
- shared_ptr<GraphComponentFilter<Graph>> checker)
- : GraphSplitter<Graph>(inner_splitter->graph()), inner_splitter_(inner_splitter),
- checker_(checker) {
- }
-
- GraphComponent<Graph> Next() {
- if (!HasNext()) {
- VERIFY(false);
- return omnigraph::GraphComponent<Graph>(this->graph());
- }
- GraphComponent<Graph> result = next_.get();
- next_ = boost::optional<GraphComponent<Graph>>();
- return result;
- }
-
- bool HasNext() {
- while (!next_ && inner_splitter_->HasNext()) {
- GraphComponent<Graph> ne = inner_splitter_->Next();
- if (checker_->Check(ne)) {
- next_ = ne;
- }
- }
- return next_;
- }
-private:
- DECL_LOGGER("FilteringSplitterWrapper");
-};
-
-//TODO split combined component into several.
-template<class Graph>
-class CollectingSplitterWrapper : public GraphSplitter<Graph> {
-private:
- typedef typename Graph::VertexId VertexId;
- typedef typename Graph::EdgeId EdgeId;
-
- shared_ptr<GraphSplitter<Graph>> inner_splitter_;
- shared_ptr<GraphComponentFilter<Graph>> checker_;
- boost::optional<GraphComponent<Graph>> next_;
- set<VertexId> filtered_;
-public:
- CollectingSplitterWrapper(
- shared_ptr<GraphSplitter<Graph>> inner_splitter,
- shared_ptr<GraphComponentFilter<Graph>> checker)
- : GraphSplitter<Graph>(inner_splitter->graph()), inner_splitter_(inner_splitter),
- checker_(checker) {
- }
-
- GraphComponent<Graph> Next() {
- if (!HasNext()) {
- VERIFY(false);
- return omnigraph::GraphComponent<Graph>(this->graph());
- } else {
- if(next_) {
- GraphComponent<Graph> result = next_.get();
- next_ = boost::optional<GraphComponent<Graph>>();
- return result;
- } else {
- GraphComponent<Graph> result(this->graph(), filtered_.begin(), filtered_.end(), false, "filtered");
- filtered_.clear();
- return result;
- }
- }
- }
-
- bool HasNext() {
- while (!next_ && inner_splitter_->HasNext()) {
- GraphComponent<Graph> ne = inner_splitter_->Next();
- if (checker_->Check(ne)) {
- next_ = ne;
- } else {
- filtered_.insert(ne.v_begin(), ne.v_end());
- }
- }
- return next_ || !filtered_.empty();
- }
-private:
- DECL_LOGGER("FilteringSplitterWrapper");
-};
-
-template<class Graph>
-class CondensingSplitterWrapper : public GraphSplitter<Graph> {
-private:
- typedef typename Graph::VertexId VertexId;
- typedef typename Graph::EdgeId EdgeId;
-
- shared_ptr<GraphSplitter<Graph>> inner_splitter_;
- shared_ptr<GraphComponentFilter<Graph>> checker_;
- boost::optional<GraphComponent<Graph>> next_;
-
- string CutName(const string &name, size_t max_length) {
- VERIFY(max_length >= 7);
- size_t length = name.size();
- if (length <= max_length)
- return name;
- else {
- return name.substr(0, (max_length - 5) / 2) + "....." + name.substr(length - (max_length - 5) / 2, (max_length - 5) / 2);
- }
- }
-
- GraphComponent<Graph> ConstructComponent() {
- GraphComponent<Graph> next = inner_splitter_->Next();
- if (checker_->Check(next)) {
- return next;
- }
- set<VertexId> vertices(next.v_begin(), next.v_end());
- string name = next.name();
- for(size_t i = 0; i < 10 && inner_splitter_->HasNext(); i++) {
- next = inner_splitter_->Next();
- if (checker_->Check(next)) {
- next_ = next;
- break;
- } else {
- vertices.insert(next.v_begin(), next.v_end());
- if (next.name() != "") {
- name += ";";
- name += next.name();
- }
- }
- }
- return GraphComponent<Graph>(this->graph(), vertices.begin(), vertices.end(), CutName(name, 60));
- }
-
-public:
- CondensingSplitterWrapper(
- shared_ptr<GraphSplitter<Graph>> inner_splitter,
- shared_ptr<GraphComponentFilter<Graph>> checker)
- : GraphSplitter<Graph>(inner_splitter->graph()), inner_splitter_(inner_splitter),
- checker_(checker) {
- }
-
- GraphComponent<Graph> Next() {
- if (!HasNext()) {
- VERIFY(false);
- return omnigraph::GraphComponent<Graph>(this->graph());
- }
- if(next_) {
- GraphComponent<Graph> result = next_.get();
- next_ = boost::optional<GraphComponent<Graph>>();
- return result;
- } else {
- return ConstructComponent();
- }
- }
-
-    bool HasNext() {
-        return next_ || inner_splitter_->HasNext();
-    }
-private:
- DECL_LOGGER("FilteringSplitterWrapper");
-};
-
-template<class Graph>
-class NeighbourhoodFindingSplitter : public GraphSplitter<Graph> {
-private:
- typedef typename Graph::VertexId VertexId;
- typedef typename Graph::EdgeId EdgeId;
- shared_ptr<RelaxingIterator<VertexId>> inner_iterator_;
- shared_ptr<AbstractNeighbourhoodFinder<Graph>> neighbourhood_finder_;
-
-public:
- NeighbourhoodFindingSplitter(
- const Graph& graph,
- shared_ptr<RelaxingIterator<VertexId>> inner_iterator,
- shared_ptr<AbstractNeighbourhoodFinder<Graph>> neighbourhood_finder)
- : GraphSplitter<Graph>(graph),
- inner_iterator_(inner_iterator),
- neighbourhood_finder_(neighbourhood_finder) {
- }
-
- NeighbourhoodFindingSplitter(
- const Graph& graph,
- shared_ptr<RelaxingIterator<VertexId>> inner_iterator)
- : GraphSplitter<Graph>(graph),
- inner_iterator_(inner_iterator),
- neighbourhood_finder_(
- make_shared<ReliableNeighbourhoodFinder<Graph>>(graph)) {
- }
-
- NeighbourhoodFindingSplitter(const Graph& graph)
- : GraphSplitter<Graph>(graph),
- inner_iterator_(
- make_shared<CollectionIterator<set<VertexId>>>(graph.begin(), graph.end())),
- neighbourhood_finder_(make_shared<ReliableNeighbourhoodFinder<Graph>>(graph)) {
- }
-
- GraphComponent<Graph> Next() {
- VertexId next_vertex = inner_iterator_->Next();
- GraphComponent<Graph> result = neighbourhood_finder_->Find(next_vertex);
- vector<VertexId> to_relax = neighbourhood_finder_->InnerVertices(result);
- to_relax.push_back(next_vertex);
- inner_iterator_->Relax(to_relax);
- return result;
- }
-
- bool HasNext() {
- return inner_iterator_->HasNext();
- }
-};
-
-template<class Graph>
-shared_ptr<GraphSplitter<Graph>> ReliableSplitter(const Graph &graph,
- size_t edge_length_bound = ReliableNeighbourhoodFinder<Graph>::DEFAULT_EDGE_LENGTH_BOUND,
- size_t max_size = ReliableNeighbourhoodFinder<Graph>::DEFAULT_MAX_SIZE) {
- typedef typename Graph::VertexId VertexId;
- shared_ptr<RelaxingIterator<VertexId>> inner_iterator = make_shared<CollectionIterator<set<VertexId>>>(graph.begin(), graph.end());
- shared_ptr<AbstractNeighbourhoodFinder<Graph>> nf = make_shared<ReliableNeighbourhoodFinder<Graph>>(graph, edge_length_bound, max_size);
- return make_shared<NeighbourhoodFindingSplitter<Graph>>(graph,
- inner_iterator, nf);
-}
-
-template<class Graph>
-shared_ptr<GraphSplitter<Graph>> ConnectedSplitter(const Graph &graph,
- size_t edge_length_bound = 1000000,
- size_t max_size = 1000000) {
- typedef typename Graph::VertexId VertexId;
- shared_ptr<RelaxingIterator<VertexId>> inner_iterator = make_shared<CollectionIterator<set<VertexId>>>(graph.begin(), graph.end());
- shared_ptr<AbstractNeighbourhoodFinder<Graph>> nf = make_shared<ReliableNeighbourhoodFinder<Graph>>(graph, edge_length_bound, max_size);
- return make_shared<NeighbourhoodFindingSplitter<Graph>>(graph,
- inner_iterator, nf);
-}
-
-template<class Graph>
-shared_ptr<GraphSplitter<Graph>> ReliableSplitterAlongPath(
- const Graph &graph, const vector<typename Graph::EdgeId>& path, size_t edge_length_bound = PathNeighbourhoodFinder<Graph>::DEFAULT_EDGE_LENGTH_BOUND,
- size_t max_size = PathNeighbourhoodFinder<Graph>::DEFAULT_MAX_SIZE,
- size_t max_depth = PathNeighbourhoodFinder<Graph>::DEFAULT_MAX_DEPTH) {
- typedef typename Graph::VertexId VertexId;
- shared_ptr<RelaxingIterator<VertexId>> inner_iterator = make_shared<
- PathIterator<Graph>>(graph, path);
- shared_ptr<AbstractNeighbourhoodFinder<Graph>> nf = make_shared<PathNeighbourhoodFinder<Graph>>(graph, path,
- edge_length_bound, max_size, max_depth);
-
- return make_shared<NeighbourhoodFindingSplitter<Graph>>(graph,
- inner_iterator, nf);
-}
-
-template<class Graph>
-shared_ptr<GraphSplitter<Graph>> LongEdgesExclusiveSplitter(
- const Graph &graph, size_t bound =
- ReliableNeighbourhoodFinder<Graph>::DEFAULT_EDGE_LENGTH_BOUND) {
- typedef typename Graph::VertexId VertexId;
- shared_ptr<RelaxingIterator<VertexId>> inner_iterator = make_shared<
- CollectionIterator<set<VertexId>>>(graph.begin(), graph.end());
- shared_ptr<AbstractNeighbourhoodFinder<Graph>> nf = make_shared<
- ShortEdgeComponentFinder<Graph>>(graph, bound);
- return make_shared<NeighbourhoodFindingSplitter<Graph>>(graph,
- inner_iterator, nf);
-}
-
-template<class Graph, typename Collection>
-shared_ptr<GraphSplitter<Graph>> StandardSplitter(
- const Graph &graph, const Collection &collection, size_t max_size = ReliableNeighbourhoodFinder<Graph>::DEFAULT_MAX_SIZE,
- size_t edge_length_bound = ReliableNeighbourhoodFinder<Graph>::DEFAULT_EDGE_LENGTH_BOUND) {
- typedef typename Graph::VertexId VertexId;
- shared_ptr<RelaxingIterator<VertexId>> inner_iterator = make_shared<CollectionIterator<Collection>>(collection);
- shared_ptr<AbstractNeighbourhoodFinder<Graph>> nf = make_shared<
- ReliableNeighbourhoodFinder<Graph>>(graph, edge_length_bound,
- max_size);
- return make_shared<NeighbourhoodFindingSplitter<Graph>>(graph, inner_iterator, nf);
-}
-
-template<class Graph, typename Collection>
-shared_ptr<GraphSplitter<Graph>> StandardSplitter(
- const Graph &graph, shared_ptr<Collection> collection, size_t max_size = ReliableNeighbourhoodFinder<Graph>::DEFAULT_MAX_SIZE,
- size_t edge_length_bound = ReliableNeighbourhoodFinder<Graph>::DEFAULT_EDGE_LENGTH_BOUND) {
- typedef typename Graph::VertexId VertexId;
- shared_ptr<RelaxingIterator<VertexId>> inner_iterator = make_shared<CollectionIterator<Collection>>(collection);
- shared_ptr<AbstractNeighbourhoodFinder<Graph>> nf = make_shared<
- ReliableNeighbourhoodFinder<Graph>>(graph, edge_length_bound,
- max_size);
- return make_shared<NeighbourhoodFindingSplitter<Graph>>(graph, inner_iterator, nf);
-}
-
-template<class Graph>
-shared_ptr<GraphSplitter<Graph>> WholeGraphSplitter(
-        const Graph &graph, size_t max_size,
-        size_t edge_length_bound) {
-    //delegate to ReliableSplitter, which already seeds from every vertex of the graph
-    return ReliableSplitter<Graph>(graph, edge_length_bound, max_size);
-}
-
-template<class Graph>
-GraphComponent<Graph> VertexNeighborhood(
- const Graph &graph, typename Graph::VertexId vertex, size_t max_size = ReliableNeighbourhoodFinder<Graph>::DEFAULT_MAX_SIZE,
- size_t edge_length_bound = ReliableNeighbourhoodFinder<Graph>::DEFAULT_EDGE_LENGTH_BOUND) {
- vector<typename Graph::VertexId> vv = {vertex};
- shared_ptr<vector<typename Graph::VertexId>> sh_vv = make_shared<vector<typename Graph::VertexId>>(vv);
- return StandardSplitter<Graph>(graph, sh_vv, max_size, edge_length_bound)->Next();
-}
-
-//TODO: add a method that is guaranteed to draw a picture containing a given set of edges; perhaps refactor this into plain drawing instead of splitting.
-template<class Graph>
-GraphComponent<Graph> EdgeNeighborhood(
- const Graph &graph, typename Graph::EdgeId edge, size_t max_size = ReliableNeighbourhoodFinder<Graph>::DEFAULT_MAX_SIZE,
- size_t edge_length_bound = ReliableNeighbourhoodFinder<Graph>::DEFAULT_EDGE_LENGTH_BOUND) {
- vector<typename Graph::VertexId> vv = {graph.EdgeStart(edge)};
- shared_ptr<vector<typename Graph::VertexId>> sh_vv = make_shared<vector<typename Graph::VertexId>>(vv);
- return StandardSplitter<Graph>(graph, sh_vv, max_size, edge_length_bound)->Next();
-}
-
-}
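The splitters deleted above all follow the same pattern: a RelaxingIterator hands out seed vertices, a neighbourhood finder grows a GraphComponent around each seed, and the inner vertices of that component are then "relaxed" so they are not used as seeds again. The following toy sketch reproduces just the relaxing-iterator idea over plain integers; RelaxingIntIterator is an illustrative stand-in, not one of the SPAdes classes:

#include <cassert>
#include <set>
#include <utility>
#include <vector>

// Toy analogue of CollectionIterator: yields elements in order, skipping any
// element that has already been Relax()-ed by the caller.
class RelaxingIntIterator {
    std::vector<int> elems_;
    std::set<int> relaxed_;
    size_t current_ = 0;
public:
    explicit RelaxingIntIterator(std::vector<int> elems) : elems_(std::move(elems)) {}

    bool HasNext() {
        while (current_ < elems_.size() && relaxed_.count(elems_[current_]) > 0)
            ++current_;
        return current_ < elems_.size();
    }

    int Next() {
        assert(HasNext());
        return elems_[current_++];
    }

    void Relax(int e) { relaxed_.insert(e); }
};

int main() {
    RelaxingIntIterator it({1, 2, 3, 4});
    assert(it.Next() == 1);
    it.Relax(2);    // pretend 2 and 3 ended up inside the first component
    it.Relax(3);
    assert(it.Next() == 4);
    assert(!it.HasNext());
    return 0;
}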
diff --git a/src/include/omni/tip_clipper.hpp b/src/include/omni/tip_clipper.hpp
deleted file mode 100644
index 21ab8d4..0000000
--- a/src/include/omni/tip_clipper.hpp
+++ /dev/null
@@ -1,177 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-/*
- * tip_clipper.hpp
- *
- * Created on: Mar 25, 2011
- * Author: sergey
- */
-
-#pragma once
-
-#include <set>
-
-#include "omni_utils.hpp"
-#include "xmath.h"
-#include "func.hpp"
-#include "basic_edge_conditions.hpp"
-#include "graph_processing_algorithm.hpp"
-
-namespace omnigraph {
-
-template<class Graph>
-class RelativeCoverageTipCondition: public EdgeCondition<Graph> {
- typedef typename Graph::EdgeId EdgeId;
- typedef typename Graph::VertexId VertexId;
- typedef EdgeCondition<Graph> base;
-
- const double max_relative_coverage_;
-
- template<class IteratorType>
- double MaxCompetitorCoverage(EdgeId tip, IteratorType begin, IteratorType end) const {
- double result = 0;
- for (auto it = begin; it != end; ++it) {
- if (*it != tip)
- result = std::max(result, this->g().coverage(*it));
- }
- return result;
- }
-
- double MaxCompetitorCoverage(EdgeId tip) const {
- const Graph &g = this->g();
- VertexId start = g.EdgeStart(tip), end = g.EdgeEnd(tip);
- auto out = g.OutgoingEdges(start);
- auto in = g.IncomingEdges(end);
- return std::max(
- MaxCompetitorCoverage(tip, out.begin(), out.end()),
- MaxCompetitorCoverage(tip, in.begin(), in.end()));
-// return std::max(
-// MaxCompetitorCoverage(tip, g.out_begin(start),
-// g.out_end(start)),
-// MaxCompetitorCoverage(tip, g.in_begin(end), g.in_end(end)));
- }
-
-public:
-
- RelativeCoverageTipCondition(const Graph& g, double max_relative_coverage) :
- base(g), max_relative_coverage_(max_relative_coverage) {
- }
-
- bool Check(EdgeId e) const override {
- //+1 is a trick to deal with edges of 0 coverage from iterative run
- double max_coverage = MaxCompetitorCoverage(e) + 1;
- return math::le(this->g().coverage(e),
- max_relative_coverage_ * max_coverage);
- }
-};
-
-template<class Graph>
-class TipCondition : public EdgeCondition<Graph> {
- typedef EdgeCondition<Graph> base;
-
- typedef typename Graph::EdgeId EdgeId;
- typedef typename Graph::VertexId VertexId;
-
-    /**
-     * Checks whether the given vertex topologically looks like the end of a tip.
-     * @param v vertex to be checked
-     * @return true if the vertex is judged to be a tip end, false otherwise.
-     */
- bool IsTip(VertexId v) const {
- return this->g().IncomingEdgeCount(v) + this->g().OutgoingEdgeCount(v) == 1;
- }
-
-public:
- TipCondition(const Graph& g) : base(g) {
- }
-
-    /**
-     * Checks whether the given edge topologically looks like a tip.
-     * @param e edge to be checked
-     * @return true if the edge is judged to be a tip, false otherwise.
-     */
- bool Check(EdgeId e) const override {
- return (IsTip(this->g().EdgeEnd(e)) || IsTip(this->g().EdgeStart(e)))
- && (this->g().OutgoingEdgeCount(this->g().EdgeStart(e))
- + this->g().IncomingEdgeCount(this->g().EdgeEnd(e)) > 2);
- }
-
-};
-
-
-template<class Graph>
-class MismatchTipCondition : public EdgeCondition<Graph> {
- typedef EdgeCondition<Graph> base;
- typedef typename Graph::EdgeId EdgeId;
- typedef typename Graph::VertexId VertexId;
-
- size_t max_diff_;
-
- size_t Hamming(EdgeId edge1, EdgeId edge2) const {
- size_t len = std::min(this->g().length(edge1), this->g().length(edge2));
- size_t cnt = 0;
- Sequence seq1 = this->g().EdgeNucls(edge1);
- Sequence seq2 = this->g().EdgeNucls(edge2);
- for(size_t i = 0; i < len; i++) {
- if(seq1[i] != seq2[i])
- cnt++;
- }
- return cnt;
- }
-
- bool InnerCheck(EdgeId e) const {
- size_t len = this->g().length(e);
- for (auto alt : this->g().OutgoingEdges(this->g().EdgeStart(e))) {
- if (e != alt && len < this->g().length(alt) && Hamming(e, alt) <= max_diff_) {
- return true;
- }
- }
- return false;
- }
-
-public:
- MismatchTipCondition(const Graph& g, size_t max_diff) :
- base(g), max_diff_(max_diff) {
- }
-
- bool Check(EdgeId e) const override {
- return InnerCheck(e) || InnerCheck(this->g().conjugate(e));
- }
-
-};
-
-template<class Graph>
-pred::TypedPredicate<typename Graph::EdgeId> AddTipCondition(const Graph& g,
- pred::TypedPredicate<typename Graph::EdgeId> condition) {
- return pred::And(TipCondition<Graph>(g), condition);
-}
-
-template<class Graph>
-pred::TypedPredicate<typename Graph::EdgeId>
-NecessaryTipCondition(const Graph& g, size_t max_length, double max_coverage) {
- return AddTipCondition(g, pred::And(LengthUpperBound<Graph>(g, max_length),
- CoverageUpperBound<Graph>(g, max_coverage)));
-}
-
-//template<class Graph>
-//bool ClipTips(
-// Graph& g,
-// size_t max_length,
-// shared_ptr<Predicate<typename Graph::EdgeId>> condition
-// = make_shared<func::AlwaysTrue<typename Graph::EdgeId>>(),
-// std::function<void(typename Graph::EdgeId)> removal_handler = 0) {
-//
-// omnigraph::EdgeRemovingAlgorithm<Graph> tc(g,
-// AddTipCondition(g, condition),
-// removal_handler);
-//
-// return tc.Run(LengthComparator<Graph>(g),
-// make_shared<LengthUpperBound<Graph>>(g, max_length));
-//}
-
-} // namespace omnigraph
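For context on the tip conditions deleted above: TipCondition is purely topological (one endpoint of the edge is a dead end and there is at least one competing edge), while MismatchTipCondition additionally requires the tip to differ from a longer alternative edge by at most max_diff mismatches over their common prefix. A standalone sketch of that mismatch count on plain strings, using a hypothetical HammingPrefix helper instead of the SPAdes Sequence type:

#include <algorithm>
#include <cassert>
#include <string>

// Hamming distance over the common prefix of two sequences, mirroring the
// comparison performed by MismatchTipCondition::Hamming.
size_t HammingPrefix(const std::string& a, const std::string& b) {
    size_t len = std::min(a.size(), b.size());
    size_t cnt = 0;
    for (size_t i = 0; i < len; ++i)
        if (a[i] != b[i])
            ++cnt;
    return cnt;
}

int main() {
    // A short tip differing from a longer alternative by a single mismatch
    // passes the check for any max_diff >= 1.
    assert(HammingPrefix("ACGTAC", "ACGAACGT") == 1);
    assert(HammingPrefix("ACGT", "ACGT") == 0);
    return 0;
}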
diff --git a/src/include/omni/visualization/graph_colorer.hpp b/src/include/omni/visualization/graph_colorer.hpp
deleted file mode 100644
index 93e30bc..0000000
--- a/src/include/omni/visualization/graph_colorer.hpp
+++ /dev/null
@@ -1,340 +0,0 @@
-#pragma once
-
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#include "omni/omni_utils.hpp"
-#include "omni/graph_component.hpp"
-#include "omni/visualization/printing_parameter_storage.hpp"
-//#include "edges_position_handler.hpp"
-
-namespace omnigraph {
-namespace visualization {
-
-template<typename ElementId>
-class ElementColorer : public virtual ParameterStorage<ElementId, string> {
-public:
- template<typename Iter>
- set<ElementId> ColoredWith(Iter begin, Iter end, const string &color) {
- set<ElementId> result;
- for(Iter it = begin; it != end; ++it) {
- if(this->GetValue(*it) == color)
- result.insert(*it);
- }
- return result;
- }
-};
-
-//TODO remove all default color parameters!
-
-template<typename ElementId>
-class MapColorer : public ElementColorer<ElementId>, public MapParameterStorage<ElementId, string> {
-public:
- MapColorer(const string &default_color) : MapParameterStorage<ElementId, string>(default_color) {
- }
-
- MapColorer(const map<ElementId, string> &color_map) : MapParameterStorage<ElementId, string>(color_map) {
- }
-
- MapColorer(const map<ElementId, string> &color_map, const string& default_color) : MapParameterStorage<ElementId, string>(color_map, default_color) {
- }
-
- template<class It>
- MapColorer(It begin, It end, const string& color, const string& default_color) : MapParameterStorage<ElementId, string>(begin, end, color, default_color) {
- }
-
- virtual ~MapColorer() {
- }
-};
-
-template<typename ElementId>
-class FixedColorer: public MapColorer<ElementId> {
-public:
- FixedColorer(const string& default_color): MapColorer<ElementId>(default_color) {
- }
-};
-
-template<class Graph>
-class SetColorer : public MapColorer<typename Graph::EdgeId> {
-private:
- typedef typename Graph::VertexId VertexId;
- typedef typename Graph::EdgeId EdgeId;
- const Graph &graph_;
-
- template<class It>
- map<EdgeId, string> ConstructColorMap(It begin, It end, const string &color) {
- map<EdgeId, string> result;
- for (auto it = begin; it != end; ++it) {
- result[*it] = color;
- }
- return result;
- }
-
-public:
- template<class It>
- SetColorer(const Graph &graph, It begin, It end, const string &color) :
- MapColorer<typename Graph::EdgeId>(ConstructColorMap(begin, end, color), "black"), graph_(graph) {
- }
-
- template<class Collection>
- SetColorer(const Graph &graph, const Collection& c, const string &color) :
- MapColorer<typename Graph::EdgeId>(ConstructColorMap(c.begin(), c.end(), color), "black"), graph_(graph) {
- }
-
-};
-//
-//template<class Graph>
-//class PositionsEdgeColorer: public ElementColorer<typename Graph::EdgeId> {
-//private:
-// typedef typename Graph::VertexId VertexId;
-// typedef typename Graph::EdgeId EdgeId;
-// const Graph &graph_;
-// EdgesPositionHandler<Graph> &positions_;
-//public:
-// PositionsEdgeColorer(const Graph &graph, EdgesPositionHandler<Graph> &positions):
-// graph_(graph), positions_(positions) {
-// }
-// string GetValue(EdgeId element) const {
-// std::vector<EdgeId> path;
-// path.push_back(element);
-// if (positions_.GetEdgePositions(element).size() == 0) return "black";
-// else {
-// if (positions_.IsConsistentWithGenome(path)) return "green";
-// else return "orange";
-// }
-// }
-//};
-
-
-template<class Graph>
-class CompositeEdgeColorer: public ElementColorer<typename Graph::EdgeId> {
-private:
- typedef typename Graph::VertexId VertexId;
- typedef typename Graph::EdgeId EdgeId;
- string default_color_;
- vector<shared_ptr<ElementColorer<typename Graph::EdgeId>>> colorers_;
-
- vector<string> CollectColors(EdgeId edge) const {
- vector<string> result = {default_color_};
- for(auto it = colorers_.begin(); it != colorers_.end(); ++it) {
- string next_color = (*it)->GetValue(edge);
- if(std::find(result.begin(), result.end(), next_color) == result.end())
- result.push_back(next_color);
- }
- return result;
- }
-
- string ConstructColorString(const vector<string> &colors) const {
- if(colors.size() == 1)
- return default_color_;
- string result = "";
- for(size_t i = 1; i < colors.size(); i++)
- result += ":" + colors[i];
- return result.substr(1, result.size());
- }
-
-public:
- CompositeEdgeColorer(const string &default_color): default_color_(default_color) {
- }
-
- CompositeEdgeColorer(shared_ptr<ElementColorer<typename Graph::EdgeId>> colorer, const string &default_color): default_color_(default_color) {
- AddColorer(colorer);
- }
-
- CompositeEdgeColorer(shared_ptr<ElementColorer<typename Graph::EdgeId>> colorer1, shared_ptr<ElementColorer<typename Graph::EdgeId>> colorer2,
- const string &default_color): default_color_(default_color) {
- AddColorer(colorer1);
- AddColorer(colorer2);
- }
-
- void AddColorer(shared_ptr<ElementColorer<typename Graph::EdgeId>> colorer) {
- colorers_.push_back(colorer);
- }
-
- string GetValue(EdgeId edge) const {
- return ConstructColorString(CollectColors(edge));
- }
-};
-
-template<class Graph>
-class GraphColorer : public ElementColorer<typename Graph::VertexId>, public ElementColorer<typename Graph::EdgeId>{
-public:
- string GetValue(typename Graph::VertexId) const = 0;
- string GetValue(typename Graph::EdgeId) const = 0;
-
- template<typename Iter>
- set<typename Iter::value_type> ColoredWith(Iter begin, Iter end, const string &color) {
- return ElementColorer<typename Iter::value_type>::ColoredWith(begin, end, color);
- }
-};
-
-template<class Graph>
-class DelegatingGraphColorer : public GraphColorer<Graph> {
-private:
- const GraphColorer<Graph> &inner_colorer_;
-public:
- DelegatingGraphColorer(const GraphColorer<Graph> &inner_colorer) : inner_colorer_(inner_colorer) {
- }
-
- string GetValue(typename Graph::VertexId v) const {
- return inner_colorer_.GetValue(v);
- }
- string GetValue(typename Graph::EdgeId e) const {
- return inner_colorer_.GetValue(e);
- }
-};
-
-template<typename Graph>
-class BorderDecorator : public GraphColorer<Graph> {
-private:
- typedef typename Graph::VertexId VertexId;
- typedef typename Graph::EdgeId EdgeId;
- const GraphComponent<Graph> &component_;
-// const shared_ptr<const ElementColorer<typename Graph::VertexId>> vertex_colorer_ptr_;
-// const shared_ptr<const ElementColorer<typename Graph::EdgeId>> edge_colorer_ptr_;
- const ElementColorer<typename Graph::VertexId> &vertex_colorer_;
- const ElementColorer<typename Graph::EdgeId> &edge_colorer_;
- const string border_color_;
-public:
-// BorderDecorator(const GraphComponent<Graph> &component,
-// const shared_ptr<const GraphColorer<Graph>> colorer,
-// const string &border_color) :
-// component_(component), vertex_colorer_ptr_(colorer), edge_colorer_ptr_(
-// colorer), vertex_colorer_(*colorer), edge_colorer_(
-// *colorer), border_color_(border_color) {
-// }
-
- BorderDecorator(const GraphComponent<Graph> &component,
- const GraphColorer<Graph> &colorer, const string &border_color = "yellow") :
- component_(component), vertex_colorer_(colorer), edge_colorer_(colorer), border_color_(border_color) {
- }
-
- string GetValue(VertexId v) const {
- if(component_.IsBorder(v)) {
- return border_color_;
- } else {
- return vertex_colorer_.GetValue(v);
- }
- }
-
- string GetValue(EdgeId e) const {
- return edge_colorer_.GetValue(e);
- }
-
- static shared_ptr<BorderDecorator<Graph>> GetInstance(const GraphComponent<Graph> &component,
- const GraphColorer<Graph> &colorer, const string &border_color = "yellow") {
- return make_shared<BorderDecorator<Graph>>(component, colorer, border_color);
- }
-};
-
-
-template<typename Graph>
-class SinkSourceDecorator : public GraphColorer<Graph> {
-private:
- typedef typename Graph::VertexId VertexId;
- typedef typename Graph::EdgeId EdgeId;
- const GraphComponent<Graph> &component_;
-// const shared_ptr<const ElementColorer<typename Graph::VertexId>> vertex_colorer_ptr_;
-// const shared_ptr<const ElementColorer<typename Graph::EdgeId>> edge_colorer_ptr_;
- const ElementColorer<typename Graph::VertexId> &vertex_colorer_;
- const ElementColorer<typename Graph::EdgeId> &edge_colorer_;
- const string sink_color_;
- const string source_color_;
- const string sinksource_color_;
-public:
-
- SinkSourceDecorator(const GraphComponent<Graph> &component,
- const GraphColorer<Graph> &colorer, const string &sink_color = "red", const string &source_color = "orange", const string &sinksource_color = "green") :
- component_(component), vertex_colorer_(colorer), edge_colorer_(colorer), sink_color_(sink_color), source_color_(source_color), sinksource_color_(sinksource_color) {
- }
-
- string GetValue(VertexId v) const {
- if(component_.sinks().count(v) && !component_.sources().count(v)) {
- return sink_color_;
- }
- if(component_.sources().count(v) && !component_.sinks().count(v))
- {
- return source_color_;
- }
- if(component_.sources().count(v) && component_.sinks().count(v))
- {
- return sinksource_color_;
- }
-
- return vertex_colorer_.GetValue(v);
- }
-
- string GetValue(EdgeId e) const {
- return edge_colorer_.GetValue(e);
- }
-
- static shared_ptr<SinkSourceDecorator<Graph>> GetInstance(const GraphComponent<Graph> &component,
- const GraphColorer<Graph> &colorer, const string &sink_color = "red", const string &source_color = "orange") {
- return make_shared<SinkSourceDecorator<Graph>>(component, colorer, sink_color, source_color);
- }
-};
-
-template<class Graph>
-class CompositeGraphColorer: public GraphColorer<Graph> {
-private:
- typedef typename Graph::VertexId VertexId;
- typedef typename Graph::EdgeId EdgeId;
-
- const shared_ptr<ElementColorer<VertexId>> vertex_colorer_;
- const shared_ptr<ElementColorer<EdgeId>> edge_colorer_;
-public:
- CompositeGraphColorer(shared_ptr<ElementColorer<VertexId>> vertex_colorer
- , shared_ptr<ElementColorer<EdgeId>> edge_colorer) :
- vertex_colorer_(vertex_colorer),
- edge_colorer_(edge_colorer) {
- }
-
-// explicit CompositeGraphColorer(shared_ptr<ElementColorer<EdgeId>> edge_colorer = make_shared<FixedColorer<EdgeId>>("black")) :
-// vertex_colorer_(shared_ptr<ElementColorer<VertexId>>(new FixedColorer<VertexId>("white"))),
-// edge_colorer_(edge_colorer) {
-// }
-
- string GetValue(VertexId v) const {
- return vertex_colorer_->GetValue(v);
- }
-
- string GetValue(EdgeId e) const {
- return edge_colorer_->GetValue(e);
- }
-
-};
-
-
-
-// edge_colorer management is passed here
-//TODO check all usages
-template <class Graph>
-shared_ptr<GraphColorer<Graph>> DefaultColorer(const Graph& /*g*/,
- shared_ptr<ElementColorer<typename Graph::EdgeId>> edge_colorer) {
- return shared_ptr<GraphColorer<Graph>>(new CompositeGraphColorer<Graph>(make_shared<FixedColorer<typename Graph::VertexId>>("white"), edge_colorer));
-}
-
-template <class Graph>
-shared_ptr<GraphColorer<Graph>> DefaultColorer(const Graph& g,
- const Path<typename Graph::EdgeId>& path1,
- const Path<typename Graph::EdgeId>& path2) {
- shared_ptr<ElementColorer<typename Graph::EdgeId>> edge_colorer =
- make_shared<CompositeEdgeColorer<Graph>>(
- make_shared<SetColorer<Graph>>(g, path1.sequence(), "red"),
- make_shared<SetColorer<Graph>>(g, path2.sequence(), "blue"), "black");
- return DefaultColorer(g, edge_colorer);
-}
-
-template<class Graph>
-shared_ptr<GraphColorer<Graph>> DefaultColorer(const Graph& /*g*/) {
- return shared_ptr<GraphColorer<Graph>>(new CompositeGraphColorer<Graph>(
- make_shared<FixedColorer<typename Graph::VertexId>>("white"),
- make_shared<FixedColorer<typename Graph::EdgeId>>("black")));
-}
-
-}
-}
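The colorers deleted above ultimately just emit Graphviz color attribute strings; CompositeEdgeColorer collects one color per registered sub-colorer and joins the non-default ones with ':' so Graphviz renders a multi-colored edge. A minimal sketch of that joining step, with JoinColors as an illustrative stand-in rather than the SPAdes class method:

#include <cassert>
#include <string>
#include <vector>

// Join the collected colors with ':' (Graphviz multi-color syntax); fall back
// to the default color when nothing besides the default was collected.
std::string JoinColors(const std::vector<std::string>& extra_colors,
                       const std::string& default_color) {
    if (extra_colors.empty())
        return default_color;
    std::string result;
    for (size_t i = 0; i < extra_colors.size(); ++i) {
        if (i > 0)
            result += ":";
        result += extra_colors[i];
    }
    return result;
}

int main() {
    assert(JoinColors({}, "black") == "black");
    assert(JoinColors({"green", "red"}, "black") == "green:red");
    return 0;
}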
diff --git a/src/include/omni/visualization/graph_labeler.hpp b/src/include/omni/visualization/graph_labeler.hpp
deleted file mode 100644
index 5239885..0000000
--- a/src/include/omni/visualization/graph_labeler.hpp
+++ /dev/null
@@ -1,304 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#ifndef GRAPH_LABELER_HPP_
-#define GRAPH_LABELER_HPP_
-
-#include "simple_tools.hpp"
-#include "standard_base.hpp"
-#include "omni/edges_position_handler.hpp"
-
-namespace omnigraph {
-
-/**
- * (Interface)
- * Provides string labels for vertices and edges of some graph.
- * Used with GraphPrinter to visualize graphs.
- */
-template<class Graph>
-class GraphLabeler {
-public:
- typedef typename Graph::EdgeId EdgeId;
- typedef typename Graph::VertexId VertexId;
-
- virtual ~GraphLabeler() {
- }
-
- virtual string label(VertexId v) const = 0;
-
- virtual string label(EdgeId e) const = 0;
-
-};
-
-//template<class Graph>
-//class MapGraphLabeler {
-// typedef typename Graph::EdgeId EdgeId;
-// typedef typename Graph::VertexId VertexId;
-// map<EdgeId, string> edge_map_;
-// map<VertexId, string> vertex_map_;
-//
-//public:
-//
-// string label(VertexId v) const {
-// auto it = vertex_map_.find(v);
-// if (it == vertex_map_.end())
-// return "";
-// else
-// return it->second;
-// }
-//
-// string label(EdgeId e) const {
-// auto it = edge_map_.find(e);
-// if (it == edge_map_.end())
-// return "";
-// else
-// return it->second;
-// }
-//
-//};
-
-template<class Graph>
-class AbstractGraphLabeler: public GraphLabeler<Graph> {
- typedef typename Graph::VertexId VertexId;
- typedef typename Graph::EdgeId EdgeId;
- const Graph& g_;
-protected:
- AbstractGraphLabeler(const Graph& g): g_(g) {
-
- }
-
- const Graph& graph() const {
- return g_;
- }
-
-public:
- /*virtual*/ std::string label(VertexId /*v*/) const {
- return "";
- }
-
- /*virtual*/ std::string label(EdgeId /*e*/) const {
- return "";
- }
-
-};
-
-/**
- * Trivial implementation of GraphLabeler.
- * All labels are "".
- */
-template<class Graph>
-class EmptyGraphLabeler : public GraphLabeler<Graph> {
- typedef GraphLabeler<Graph> base;
- typedef typename Graph::EdgeId EdgeId;
- typedef typename Graph::VertexId VertexId;
-public:
- EmptyGraphLabeler() {}
-
- std::string label(VertexId /*v*/) const {
- return "";
- }
-
- std::string label(EdgeId /*e*/) const {
- return "";
- }
-};
-
-/**
- * Implementation of GraphLabeler for Graphs that have methods
- * str(VertexId) and str(EdgeId), such as AbstractGraph.
- */
-template<class Graph>
-class StrGraphLabeler : public AbstractGraphLabeler<Graph> {
- typedef AbstractGraphLabeler<Graph> base;
- typedef typename Graph::EdgeId EdgeId;
- typedef typename Graph::VertexId VertexId;
-public:
- StrGraphLabeler(const Graph& g) : base(g) {}
-
- /*virtual*/ std::string label(VertexId v) const {
- return this->graph().str(v);
- }
-
- /*virtual*/ std::string label(EdgeId e) const {
- return this->graph().str(e);
- }
-
- /*virtual*/ ~StrGraphLabeler() {
-
- }
-};
-
-template <class Graph>
-shared_ptr<GraphLabeler<Graph>> StrGraphLabelerInstance(const Graph& g) {
- return make_shared<StrGraphLabeler<Graph>>(g);
-}
-
-template<class Graph>
-class LengthIdGraphLabeler : public StrGraphLabeler<Graph> {
- typedef StrGraphLabeler<Graph> base;
- typedef typename Graph::EdgeId EdgeId;
- typedef typename Graph::VertexId VertexId;
-public:
- LengthIdGraphLabeler(const Graph& g) : base(g) {}
-
- /*virtual*/ std::string label(EdgeId e) const {
- std::stringstream ss;
- ss << this->graph().length(e) << " (id: " << this->graph().int_id(e) << ")";
- return ss.str();
- }
-
-};
-
-template<class Graph>
-class LengthGraphLabeler : public StrGraphLabeler<Graph> {
- typedef StrGraphLabeler<Graph> base;
- typedef typename Graph::EdgeId EdgeId;
- typedef typename Graph::VertexId VertexId;
-public:
- LengthGraphLabeler(const Graph& g) : base(g) {}
-
- /*virtual*/ std::string label(EdgeId e) const {
- return ToString(this->graph().length(e));
- }
-
-};
-
-template<class Graph>
-class CoverageGraphLabeler : public AbstractGraphLabeler<Graph> {
- typedef AbstractGraphLabeler<Graph> base;
- typedef typename Graph::EdgeId EdgeId;
- typedef typename Graph::VertexId VertexId;
-public:
- CoverageGraphLabeler(const Graph& g) : base(g) {}
-
- std::string label(EdgeId e) const {
- double coverage = this->graph().coverage(e);
- return " {Cov:" + ToString(coverage) + "}";
- }
-};
-
-template<class Graph>
-class CompositeLabeler : public GraphLabeler<Graph> {
-private:
- typedef typename Graph::EdgeId EdgeId;
- typedef typename Graph::VertexId VertexId;
- vector<GraphLabeler<Graph>*> list_;
-
- template<typename ElementId>
- string ConstructLabel(ElementId id) const {
- vector<string> to_print;
- for(size_t i = 0; i < list_.size(); i++) {
- string next = list_[i]->label(id);
- if(next.size() != 0) {
- to_print.push_back(next);
- }
- }
- string result = "";
- for(size_t i = 0; i < to_print.size(); i++) {
- result += to_print[i];
- if(i + 1 < to_print.size())
- result += "\\n";
- }
- return result;
- }
-
-public:
- CompositeLabeler() {
- }
-
- CompositeLabeler(GraphLabeler<Graph> &labeler1, GraphLabeler<Graph> &labeler2, GraphLabeler<Graph> &labeler3, GraphLabeler<Graph> &labeler4) {
- AddLabeler(labeler1);
- AddLabeler(labeler2);
- AddLabeler(labeler3);
- AddLabeler(labeler4);
- }
-
- CompositeLabeler(GraphLabeler<Graph> &labeler1, GraphLabeler<Graph> &labeler2, GraphLabeler<Graph> &labeler3) {
- AddLabeler(labeler1);
- AddLabeler(labeler2);
- AddLabeler(labeler3);
- }
-
- CompositeLabeler(GraphLabeler<Graph> &labeler1, GraphLabeler<Graph> &labeler2) {
- AddLabeler(labeler1);
- AddLabeler(labeler2);
- }
-
- virtual ~CompositeLabeler() {
- }
-
- void AddLabeler(GraphLabeler<Graph> &labeler) {
- list_.push_back(&labeler);
- }
-
- virtual string label(VertexId vertexId) const {
- return ConstructLabel<VertexId>(vertexId);
- }
-
- virtual string label(EdgeId edgeId) const {
- return ConstructLabel<EdgeId>(edgeId);
- }
-};
-
-template<class Graph>
-class EdgePosGraphLabeler: public AbstractGraphLabeler<Graph> {
- typedef typename Graph::EdgeId EdgeId;
- typedef typename Graph::VertexId VertexId;
-public:
- const EdgesPositionHandler<Graph>& edge_pos_;
-
- EdgePosGraphLabeler(const Graph& g, const EdgesPositionHandler<Graph>& edge_pos) :
- AbstractGraphLabeler<Graph>(g), edge_pos_(edge_pos) {
- }
-
- virtual std::string label(EdgeId edgeId) const {
- return "Positions: " + edge_pos_.str(edgeId);
- }
-
- virtual ~EdgePosGraphLabeler() {
-// TRACE("~EdgePosGraphLabeler");
- }
-private:
- DECL_LOGGER("EdgePosGraphLabeler")
-};
-
-template<class Graph>
-class DefaultLabeler: public GraphLabeler<Graph> {
-private:
- const Graph& g_;
- const EdgesPositionHandler<Graph> &edges_positions_;
-protected:
- typedef GraphLabeler<Graph> super;
- typedef typename super::EdgeId EdgeId;
- typedef typename super::VertexId VertexId;
-public:
-
- DefaultLabeler(const Graph &g, const EdgesPositionHandler<Graph> &position_handler) :
- g_(g), edges_positions_(position_handler) {
- }
-
- virtual std::string label(VertexId vertexId) const {
- return ToString(vertexId.int_id());
- }
-
- virtual std::string label(EdgeId edgeId) const {
- std::string ret_label;
- ret_label += "Id " + g_.str(edgeId) + "\\n";
- ret_label += "Positions:\\n"+ edges_positions_.str(edgeId);
- size_t len = g_.length(edgeId);
- double cov = g_.coverage(edgeId);
- ret_label += "Len(cov): " + ToString(len) + "(" + ToString(cov) + ")";
- return ret_label;
- }
-
- virtual ~DefaultLabeler() {
- }
-};
-
-}
-
-#endif /* GRAPH_LABELER_HPP_ */
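
For reference, a minimal sketch of how the labelers above were typically combined.
This is a sketch only: the Graph type, the DescribeEdge name and the omitted
namespace qualification are illustrative assumptions, not code from this repository.

    #include "omni/visualization/graph_labeler.hpp"  // the header removed above
    #include <string>

    // Graph is any type providing length(EdgeId), coverage(EdgeId), int_id(EdgeId)
    // and str(EdgeId), as required by the labelers; namespace qualifiers omitted.
    template<class Graph>
    std::string DescribeEdge(const Graph& g, typename Graph::EdgeId e) {
        LengthIdGraphLabeler<Graph> len_id(g);   // e.g. "1234 (id: 567)"
        CoverageGraphLabeler<Graph> cov(g);      // e.g. " {Cov:12.5}"
        CompositeLabeler<Graph> both(len_id, cov);
        return both.label(e);                    // non-empty parts joined with "\n"
    }
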
diff --git a/src/include/omni/visualization/graph_printer.hpp b/src/include/omni/visualization/graph_printer.hpp
deleted file mode 100644
index 82587c2..0000000
--- a/src/include/omni/visualization/graph_printer.hpp
+++ /dev/null
@@ -1,176 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include "standard_base.hpp"
-#include "graph_print_utils.hpp"
-#include "graph_labeler.hpp"
-#include "graph_colorer.hpp"
-#include "vertex_linker.hpp"
-
-namespace omnigraph {
-namespace visualization {
-
-template<class Graph>
-class GraphPrinter {
-private:
- typedef typename Graph::VertexId VertexId;
- typedef typename Graph::EdgeId EdgeId;
-// ostream& os_;
- const Graph &graph_;
-protected:
- const GraphLabeler<Graph> &labeler_;
- const GraphColorer<Graph> &colorer_;
- const VertexLinker<Graph> &linker_;
-
-protected:
-// ostream& os() {
-// return os_;
-// }
-
-
- const Graph &graph() {
- return graph_;
- }
-
- template<class GvisVertexId>
- gvis::BaseVertex<GvisVertexId> CreateBaseVertex(GvisVertexId id, VertexId v) {
- return gvis::BaseVertex<GvisVertexId>(id, labeler_.label(v), linker_.GetValue(v), colorer_.GetValue(v));
- }
-
- template<class GvisVertexId>
- gvis::BaseEdge<GvisVertexId> CreateBaseEdge(GvisVertexId from, GvisVertexId to, EdgeId e){
- return gvis::BaseEdge<GvisVertexId>(from, to, this->labeler_.label(e), this->colorer_.GetValue(e));
- }
-
- virtual void ManageDrawn(VertexId v, set<VertexId> &visited) {
- visited.insert(v);
- }
-
-public:
- GraphPrinter(const Graph &graph, /*ostream &os,*/
- const GraphLabeler<Graph> &labeler,
- const GraphColorer<Graph> &colorer,
- const VertexLinker<Graph> &linker) :
- /*os_(os), */graph_(graph), labeler_(labeler), colorer_(colorer), linker_(
- linker) {
- }
-
- virtual void open() = 0;
-
- virtual void close() = 0;
-
- virtual void AddVertex(VertexId v1) = 0;
-
- template<class iter>
- void AddVertices(iter vbegin, iter vend) {
- set<VertexId> drawn;
- for(;vbegin != vend; ++vbegin) {
- if(drawn.count(*vbegin) == 0) {
- AddVertex(*vbegin);
- ManageDrawn(*vbegin, drawn);
- }
- }
- }
-
- virtual void AddEdge(EdgeId e) = 0;
-
- template<class iter>
- void AddEdges(iter ebegin, iter eend) {
- for(;ebegin != eend; ++ebegin) {
- AddEdge(*ebegin);
- }
- }
-
- virtual ~GraphPrinter() {
- }
-};
-
-template<typename Graph>
-class SingleGraphPrinter : public GraphPrinter<Graph> {
-private:
- typedef typename Graph::VertexId VertexId;
- typedef typename Graph::EdgeId EdgeId;
-
- gvis::DotSingleGraphRecorder<size_t> recorder_;
-
-public:
- SingleGraphPrinter(const Graph &graph, ostream &os,
- const GraphLabeler<Graph> &labeler,
- const GraphColorer<Graph> &colorer,
- const VertexLinker<Graph> &linker) : GraphPrinter<Graph>(/*os_, */graph, labeler, colorer, linker), recorder_(os){
- }
-
- void open() {
- recorder_.startGraphRecord("graph_picture");
- }
-
- void close() {
- recorder_.endGraphRecord();
- }
-
- void AddVertex(VertexId v) {
- recorder_.recordVertex(this->CreateBaseVertex((size_t)this->graph().int_id(v), v));
- }
-
- void AddEdge(EdgeId edge) {
- recorder_.recordEdge(this->CreateBaseEdge((size_t)this->graph().int_id(this->graph().EdgeStart(edge)), (size_t)this->graph().int_id(this->graph().EdgeEnd(edge)), edge));
- }
-};
-
-template<typename Graph>
-class PairedGraphPrinter : public GraphPrinter<Graph> {
-private:
- typedef typename Graph::VertexId VertexId;
- typedef typename Graph::EdgeId EdgeId;
-
- gvis::DotPairedGraphRecorder<size_t> recorder_;
-
- pair<gvis::BaseVertex<size_t>, gvis::BaseVertex<size_t>> CreateDoubleVertex(VertexId v) {
- gvis::BaseVertex<size_t> u1 = this->CreateBaseVertex((size_t)this->graph().int_id(v), v);
- gvis::BaseVertex<size_t> u2 = this->CreateBaseVertex((size_t)this->graph().int_id(this->graph().conjugate(v)), this->graph().conjugate(v));
- return make_pair(u1, u2);
- }
-
- pair<size_t, size_t> CreateDoubleVertexId(VertexId v) {
- return make_pair(this->graph().int_id(v), this->graph().int_id(this->graph().conjugate(v)));
- }
-protected:
- /*virtual */void ManageDrawn(VertexId v, set<VertexId> &visited) {
- visited.insert(v);
- visited.insert(this->graph().conjugate(v));
- }
-
-public:
- PairedGraphPrinter(const Graph &graph, ostream &os,
- const GraphLabeler<Graph> &labeler,
- const GraphColorer<Graph> &colorer,
- const VertexLinker<Graph> &linker) : GraphPrinter<Graph>(/*os_, */graph, labeler, colorer, linker), recorder_(os) {
- }
-
- void open() {
- recorder_.startGraphRecord("graph_picture");
- }
-
- void close() {
- recorder_.endGraphRecord();
- }
-
- void AddVertex(VertexId v) {
- recorder_.recordVertex(CreateDoubleVertex(v));
- }
-
- void AddEdge(EdgeId edge) {
- auto vid1 = CreateDoubleVertexId(this->graph().EdgeStart(edge));
- auto vid2 = CreateDoubleVertexId(this->graph().EdgeEnd(edge));
- recorder_.recordEdge(gvis::BaseEdge<pair<size_t, size_t>>(vid1, vid2, this->labeler_.label(edge), this->colorer_.GetValue(edge)));
- }
-};
-
-}
-}
diff --git a/src/include/omni/visualization/printing_parameter_storage.hpp b/src/include/omni/visualization/printing_parameter_storage.hpp
deleted file mode 100644
index 46d0a45..0000000
--- a/src/include/omni/visualization/printing_parameter_storage.hpp
+++ /dev/null
@@ -1,81 +0,0 @@
-#pragma once
-
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-
-#include "standard_base.hpp"
-#include "omni/graph_component.hpp"
-namespace omnigraph {
-namespace visualization {
-
-template<typename ElementId, typename Value>
-class ParameterStorage {
-public:
- virtual Value GetValue(ElementId element) const = 0;
-
- virtual ~ParameterStorage() {
- }
-};
-
-template<typename ElementId, typename Value>
-class MapParameterStorage : public virtual ParameterStorage<ElementId, Value> {
-private:
- template<class It>
- static map<ElementId, string> ConstructMap(It begin, It end, const string& color) {
- map<ElementId, string> result;
- for (auto it = begin; it != end; ++it) {
- result.insert(make_pair(*it, color));
- }
- return result;
- }
-
-protected:
- map<ElementId, Value> storage_;
-private:
- boost::optional<Value> default_value_;
-public:
- MapParameterStorage(const string &default_value) : default_value_(default_value) {
- }
-
- MapParameterStorage(map<ElementId, Value> storage, Value default_value) : storage_(storage), default_value_(default_value) {
- }
-
- MapParameterStorage(map<ElementId, Value> storage) : storage_(storage) {
- }
-
- template<class It>
- MapParameterStorage(It begin, It end, const Value& value, const string& default_value) : storage_(ConstructMap(begin, end, value)), default_value_(default_value) {
- }
-
-
- Value GetValue(ElementId element) const {
- auto it = storage_.find(element);
- if (it == storage_.end()) {
- VERIFY(default_value_);
- return default_value_.get();
- }
- return it->second;
- }
-};
-
-template<typename ElementId, typename Value>
-class DecoratorParameterStorage : public virtual ParameterStorage<ElementId, Value> {
-private:
- ParameterStorage<ElementId, Value> inner_storage_;
-public:
- DecoratorParameterStorage(ParameterStorage<ElementId, Value> inner_storage) : inner_storage_(inner_storage) {
- }
-
- Value GetInnerValue(ElementId element) {
- return inner_storage_.GetValue(element);
- }
-};
-
-}
-}
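
A small usage sketch of MapParameterStorage as defined above; the EdgeColor
function, the integer ids and the color strings are illustrative assumptions.

    #include "omni/visualization/printing_parameter_storage.hpp"  // removed above
    #include <map>
    #include <string>

    // Per-element values with a fallback: unmapped ids yield the default,
    // and GetValue() VERIFYs that a default was supplied.
    std::string EdgeColor(int edge_id) {
        std::map<int, std::string> special = {{1, "red"}, {2, "blue"}};
        omnigraph::visualization::MapParameterStorage<int, std::string>
                colors(special, std::string("black"));
        return colors.GetValue(edge_id);  // "red"/"blue" for 1/2, "black" otherwise
    }
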
diff --git a/src/include/omni/visualization/vertex_linker.hpp b/src/include/omni/visualization/vertex_linker.hpp
deleted file mode 100644
index 2e7efbf..0000000
--- a/src/include/omni/visualization/vertex_linker.hpp
+++ /dev/null
@@ -1,41 +0,0 @@
-#pragma once
-
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#include "standard_base.hpp"
-#include "printing_parameter_storage.hpp"
-
-namespace omnigraph {
-namespace visualization {
-
-template<class Graph>
-class VertexLinker : public virtual ParameterStorage<typename Graph::VertexId, string> {
-};
-
-template<class Graph>
-class MapVertexLinker : public VertexLinker<Graph>, public MapParameterStorage<typename Graph::VertexId, string> {
-public:
- MapVertexLinker() : MapParameterStorage<typename Graph::VertexId, string>("") {
- }
-
- MapVertexLinker(const map<typename Graph::VertexId, string> &link_map) : MapParameterStorage<typename Graph::VertexId, string>(link_map, "") {
- }
-
- virtual ~MapVertexLinker() {
- }
-};
-
-template<class Graph>
-class EmptyGraphLinker : public MapVertexLinker<Graph> {
-public:
- EmptyGraphLinker() {
- }
-};
-
-}
-}
diff --git a/src/include/omni/visualization/visualization_utils.hpp b/src/include/omni/visualization/visualization_utils.hpp
deleted file mode 100644
index 3642e2a..0000000
--- a/src/include/omni/visualization/visualization_utils.hpp
+++ /dev/null
@@ -1,210 +0,0 @@
-#pragma once
-
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-
-#include "graph_printer.hpp"
-#include "omni/omni_utils.hpp"
-#include "omni/dijkstra_tools/dijkstra_helper.hpp"
-#include "omni/splitters.hpp"
-#include "omni/graph_component.hpp"
-#include "visualizers.hpp"
-#include "vertex_linker.hpp"
-
-namespace omnigraph {
-namespace visualization {
-
-
-template<class Graph>
-void WriteComponents(const Graph& g,
- const string& folder_name,
- shared_ptr<GraphSplitter<Graph>> inner_splitter,
- shared_ptr<GraphColorer<Graph>> colorer,
- const GraphLabeler<Graph> &labeler) {
- EmptyGraphLinker<Graph> linker;
-// shared_ptr<GraphComponentFilter<Graph>> checker = make_shared<ComponentSizeFilter<Graph>>(g, 1500, 2, 300);
- auto filter = make_shared<omnigraph::SmallComponentFilter<Graph>>(g, 3);
- shared_ptr<GraphSplitter<Graph>> splitter = make_shared<omnigraph::CollectingSplitterWrapper<Graph>>(inner_splitter, filter);
- omnigraph::visualization::SplittingGraphVisualizer<Graph>(g, labeler, *colorer, linker).SplitAndVisualize(*splitter, folder_name);
-}
-
-template<class Graph>
-void DrawComponentsOfShortEdges(const Graph& g, size_t min_length, size_t sinks, size_t sources)
-{
- vector<typename Graph::EdgeId> short_edges;
- std::string pics_folder_ = cfg::get().output_dir + ToString(min_length) + "_" + ToString(sinks) + "_" + ToString(sources) + "_"+ "pics_polymorphic/";
- make_dir(pics_folder_);
- INFO("Writing pics with components consisting of short edges to " + pics_folder_);
- shared_ptr<GraphSplitter<Graph>> splitter = LongEdgesExclusiveSplitter<Graph>(g, min_length);
- while (splitter->HasNext()) {
- GraphComponent<Graph> component = splitter->Next();
- if(component.v_size() > 3 && component.sinks().size() == sinks && component.sources().size() == sources)
- {
- bool fail = false;
- for(auto v : component.sources()) {
- if(component.g().IncomingEdgeCount(v) != 1) {
- fail = true;
- }
- }
- for(auto v : component.sinks()) {
- if(component.g().OutgoingEdgeCount(v) != 1) {
- fail = true;
- }
- }
-
- if(fail)
- {
- continue;
- }
-
- StrGraphLabeler<Graph> labeler(component.g());
- CoverageGraphLabeler<Graph> labeler2(component.g());
- CompositeLabeler<Graph> compositeLabeler(labeler, labeler2);
- WriteComponentSinksSources(component, pics_folder_ + ToString(g.int_id(*component.vertices().begin()))
- + ".dot", visualization::DefaultColorer(g),
- compositeLabeler);
- INFO("Component is written to " + ToString(g.int_id(*component.vertices().begin())) + ".dot");
-
- // PrintComponent(component,
-// pics_folder_ + "ShortComponents/"
-// + ToString(gp.g.int_id(component.vertices_[0]))
-// + ".dot");
- }
- }
-}
-
-
-template<class Graph>
-void WriteSizeLimitedComponents(const Graph& g,
- const string& folder_name,
- shared_ptr<GraphSplitter<Graph>> inner_splitter,
- shared_ptr<GraphColorer<Graph>> colorer,
- const GraphLabeler<Graph> &labeler, int min_component_size, int max_component_size, size_t max_components) {
- EmptyGraphLinker<Graph> linker;
-
- auto filter = make_shared<omnigraph::ComponentSizeFilter<Graph>>(g, 1000000000, (size_t) min_component_size, (size_t) max_component_size);
- shared_ptr<GraphSplitter<Graph>> splitter = make_shared<omnigraph::CollectingSplitterWrapper<Graph>>(inner_splitter, filter);
- omnigraph::visualization::SplittingGraphVisualizer<Graph>(g, labeler, *colorer, linker, false, max_components).SplitAndVisualize(*splitter, folder_name);
-}
-
-template<class Graph>
-void WriteComponent(const GraphComponent<Graph>& gc,
- const string& file_name, shared_ptr<GraphColorer<Graph>> colorer,
- const GraphLabeler<Graph> &labeler) {
- EmptyGraphLinker<Graph> linker;
- BorderDecorator<Graph> component_colorer(gc, *colorer, "yellow");
- ofstream os;
- os.open(file_name);
- omnigraph::visualization::ComponentVisualizer<Graph>(gc.g(), true).Visualize(gc, os, labeler, component_colorer, linker);
- os.close();
-}
-
-template<class Graph>
-void WriteComponentSinksSources(const GraphComponent<Graph>& gc,
- const string& file_name, shared_ptr<GraphColorer<Graph>> colorer,
- const GraphLabeler<Graph> &labeler) {
- EmptyGraphLinker<Graph> linker;
- SinkSourceDecorator<Graph> component_colorer(gc, *colorer);
- ofstream os;
- os.open(file_name);
- omnigraph::visualization::ComponentVisualizer<Graph>(gc.g(), true).Visualize(gc, os, labeler, component_colorer, linker);
- os.close();
-}
-
-template<class Graph>
-void WriteComponentSinksSources(const GraphComponent<Graph>& gc,
- const string& file_name) {
-
- StrGraphLabeler<Graph> labeler(gc.g());
- CoverageGraphLabeler<Graph> labeler2(gc.g());
- CompositeLabeler<Graph> compositeLabeler(labeler, labeler2);
- EmptyGraphLinker<Graph> linker;
- WriteComponentSinksSources(gc, file_name, DefaultColorer(gc.g()),
- compositeLabeler);
-}
-
-template<class Graph>
-void WriteSimpleComponent(const GraphComponent<Graph>& gc,
- const string& file_name, shared_ptr<GraphColorer<Graph>> colorer,
- const GraphLabeler<Graph> &labeler) {
- EmptyGraphLinker<Graph> linker;
- ofstream os;
- os.open(file_name);
- omnigraph::visualization::ComponentVisualizer<Graph>(gc.g(), false).Visualize(gc, os, labeler, *colorer, linker);
- os.close();
-}
-
-template<class Graph>
-void WriteComponentsAlongPath(const Graph& g, vector<typename Graph::EdgeId> path,
- const string& prefix_path, shared_ptr<GraphColorer<Graph>> colorer,
- const GraphLabeler<Graph> &labeler, bool color_path = true) {
- auto edge_colorer = make_shared<CompositeEdgeColorer<Graph>>("black");
- edge_colorer->AddColorer(colorer);
- if (color_path) {
- edge_colorer->AddColorer(make_shared<SetColorer<Graph>>(g, path, "green"));
- }
- shared_ptr<GraphColorer<Graph>> resulting_colorer = make_shared<CompositeGraphColorer<Graph>>(colorer, edge_colorer);
- shared_ptr<GraphSplitter<Graph>> rs = ReliableSplitterAlongPath<Graph>(g, path);
- auto filter = make_shared<omnigraph::SmallComponentFilter<Graph>>(g, 3);
- shared_ptr<GraphSplitter<Graph>> splitter = make_shared<omnigraph::CondensingSplitterWrapper<Graph>>(rs, filter);
- WriteComponents<Graph>(g, prefix_path, splitter, resulting_colorer, labeler);
-}
-
-template<class Graph>
-class LocalityPrintingRH {
- typedef typename Graph::EdgeId EdgeId;
- typedef typename Graph::VertexId VertexId;
- const Graph& g_;
- const GraphLabeler<Graph>& labeler_;
- std::shared_ptr<visualization::GraphColorer<Graph>> colorer_;
- const string output_folder_;
-public:
- LocalityPrintingRH(const Graph& g
- , const GraphLabeler<Graph>& labeler
- , std::shared_ptr<visualization::GraphColorer<Graph>> colorer
- , const string& output_folder) :
- g_(g),
- labeler_(labeler),
- colorer_(colorer),
- output_folder_(output_folder) {
- path::make_dirs(output_folder_);
- }
-
- void HandleDelete(EdgeId e, const string& add_label = "") {
- //todo magic constant
-// map<EdgeId, string> empty_coloring;
- auto edge_colorer = make_shared<visualization::CompositeEdgeColorer<Graph>>("black");
- edge_colorer->AddColorer(colorer_);
- edge_colorer->AddColorer(make_shared<visualization::SetColorer<Graph>>(g_, vector<EdgeId>(1, e), "green"));
- shared_ptr<visualization::GraphColorer<Graph>> resulting_colorer = make_shared<visualization::CompositeGraphColorer<Graph>>(colorer_, edge_colorer);
-
- string fn = output_folder_ + "edge_" + ToString(g_.int_id(e)) + add_label + ".dot";
- omnigraph::visualization::WriteComponent(omnigraph::EdgeNeighborhood<Graph>(g_, e, 50, 250)
- , fn
- , resulting_colorer, labeler_);
- }
-
-private:
- DECL_LOGGER("LocalityPrintingRH")
- ;
-};
-
-//static void WriteFilteredComponents(const Graph& g,
-// const string& folder_name,
-// shared_ptr<GraphComponentFilter<Graph>> filter,
-// shared_ptr<GraphSplitter<Graph>> splitter,
-// shared_ptr<GraphColorer<Graph>> colorer,
-// const GraphLabeler<Graph> &labeler) {
-// EmptyGraphLinker<Graph> linker;
-//// shared_ptr<GraphComponentFilter<Graph>> checker = make_shared<ComponentSizeFilter<Graph>>(g, 1500, 2, 300);
-// omnigraph::FilteringSplitterWrapper<Graph> filtered_splitter(splitter, filter);
-// omnigraph::visualization::SplittingGraphVisualizer<Graph>(g, labeler, *colorer, linker).SplitAndVisualize(filtered_splitter, folder_name);
-//}
-
-}
-}
diff --git a/src/include/omni/visualization/visualizers.hpp b/src/include/omni/visualization/visualizers.hpp
deleted file mode 100644
index 401a7b0..0000000
--- a/src/include/omni/visualization/visualizers.hpp
+++ /dev/null
@@ -1,171 +0,0 @@
-#pragma once
-
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-
-#include "standard_base.hpp"
-#include "graph_printer.hpp"
-namespace omnigraph {
-namespace visualization {
-
-//DECL_LOGGER("omg.gvis")
-
-template<class Graph>
-class ComponentVisualizer {
- const Graph& graph_;
- const bool paired_;
-
-private:
- void Visualize(const GraphComponent<Graph>& component, GraphPrinter<Graph> &printer) {
- printer.open();
- printer.AddVertices(component.vertices().begin(), component.vertices().end());
- for (auto e_it = component.e_begin(); e_it != component.e_end();
- ++e_it) {
- printer.AddEdge(*e_it);
- }
- printer.close();
- }
-
-public:
- ComponentVisualizer(const Graph& graph, bool paired = true) :
- graph_(graph), paired_(paired) {
- }
-
- void Visualize(const GraphComponent<Graph>& component, ostream &os,
- const GraphLabeler<Graph> &labeler,
- const GraphColorer<Graph> &colorer,
- const VertexLinker<Graph> &linker) {
- if(paired_) {
- PairedGraphPrinter<Graph> printer(graph_, os, labeler, colorer, linker);
- Visualize(component, printer);
- } else {
- SingleGraphPrinter<Graph> printer(graph_, os, labeler, colorer, linker);
- Visualize(component, printer);
- }
- }
-
- void Visualize(ostream &os,
- const GraphLabeler<Graph> &labeler,
- const GraphColorer<Graph> &colorer,
- const VertexLinker<Graph> &linker) {
- GraphComponent<Graph> component(graph_, graph_.begin(), graph_.end(), false);
- Visualize(component, os, labeler, colorer, linker);
- }
-};
-
-
-template<class Graph>
-class ComponentNameGenerator {
-public:
- virtual string ComponentName(const GraphComponent<Graph>& component) = 0;
-
- virtual ~ComponentNameGenerator() {
- }
-};
-
-template<class Graph>
-class SimpleCountingComponentNameGenerator: public ComponentNameGenerator<Graph> {
-private:
- string name_;
- string extension_;
- size_t cnt_;
-public:
- SimpleCountingComponentNameGenerator(string name, string extension): name_(name), extension_(extension), cnt_(0) {
- }
-
- string ComponentName(const GraphComponent<Graph>& component) {
- cnt_++;
- stringstream ss;
- ss << name_ << "_" << cnt_;
- if(component.name().size() > 0)
- ss << "_" << component.name();
- ss << "." << extension_;
- return ss.str();
- }
-};
-
-template<class Graph>
-class CountingSizeComponentNameGenerator: public ComponentNameGenerator<Graph> {
-private:
- string name_;
- string extension_;
- size_t cnt_;
-public:
- CountingSizeComponentNameGenerator(string name, string extension): name_(name), extension_(extension), cnt_(0) {
- }
-
- string ComponentName(const GraphComponent<Graph>& component) {
- cnt_++;
- stringstream ss;
- ss << name_ << "_" << cnt_;
- if(component.name().size() > 0)
- ss << "_" << component.name();
- ss << "_size_" << component.size();
- ss << "." << extension_;
-
- return ss.str();
- }
-};
-
-
-template<class Graph>
-class SplittingGraphVisualizer {
-private:
- const Graph& graph_;
- const GraphLabeler<Graph> &labeler_;
- const GraphColorer<Graph> &colorer_;
- const VertexLinker<Graph> &linker_;
- const bool paired_;
- const size_t max_component_number_;
- static const size_t DEFAULT_MAX_COMPONENT_NUMBER = 500;
-
- string ComponentFileName(size_t cnt, const string &folder, const GraphComponent<Graph>& component) {
- stringstream ss;
- ss << folder << cnt;
- if(component.name().size() > 0)
- ss << "graph_" << component.name();
- ss << ".dot";
- return ss.str();
- }
-
-public:
- SplittingGraphVisualizer(const Graph& graph,
- const GraphLabeler<Graph> &labeler,
- const GraphColorer<Graph> &colorer,
- const VertexLinker<Graph> &linker,
- bool paired = true,
- size_t max_component_number = DEFAULT_MAX_COMPONENT_NUMBER) :
- graph_(graph), labeler_(labeler), colorer_(colorer), linker_(linker), paired_(paired), max_component_number_(max_component_number) {
- }
-
- size_t SplitAndVisualize(GraphSplitter<Graph> &splitter, const string &folder) {
- INFO("Writing components to folder " << folder);
- ComponentVisualizer<Graph> visualizer(graph_, paired_);
- size_t cnt = 0;
- while(splitter.HasNext()) {
- if(cnt > max_component_number_) {
- INFO("The number of graph components exceeded " << max_component_number_ << ". Aborting current visualization.");
- break;
- }
- cnt++;
- GraphComponent<Graph> component = splitter.Next();
- BorderDecorator<Graph> border_colorer(component, colorer_, "yellow");
- ofstream os(ComponentFileName(cnt, folder, component));
- visualizer.Visualize(component, os, labeler_, border_colorer, linker_);
- os.close();
- }
- return cnt;
- }
-
-private:
- DECL_LOGGER("SplittingGraphVisualizer");
-};
-
-}
-}
-
diff --git a/src/include/path_helper.hpp b/src/include/path_helper.hpp
deleted file mode 100644
index efd028e..0000000
--- a/src/include/path_helper.hpp
+++ /dev/null
@@ -1,74 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <dirent.h>
-#include <unistd.h>
-
-#include <string>
-#include <vector>
-#include "logger/logger.hpp"
-#include "verify.hpp"
-
-namespace path {
-//todo review and make names consistent!
-
-typedef std::vector<std::string> files_t;
-
-bool make_dir(std::string const& folder);
-std::string make_temp_dir(std::string const& prefix, std::string const& suffix);
-void remove_dir(std::string const& folder);
-bool is_regular_file(std::string const& path);
-std::string append_path(std::string const& prefix, std::string const& suffix);
-std::string current_dir();
-
-//todo why non-const argument?!
-void make_full_path(std::string& path);
-std::string filename(std::string const& path);
-std::string basename(std::string const& path);
-std::string extension(std::string const& path);
-std::string parent_path(std::string const& path);
-bool check_existence(std::string const& path);
-void remove_if_exists(std::string const& path);
-
-//todo move to cpp and reduce code duplication!!!
-/**
- * Checks if file exists.
- * Analogs: http://www.techbytes.ca/techbyte103.html , http://www.gamedev.net/topic/211918-determining-if-a-file-exists-c/
- */
-inline bool FileExists(std::string filename) {
- struct stat st_buf;
- return stat(filename.c_str(), &st_buf) == 0 && S_ISREG(st_buf.st_mode);
-}
-
-/**
- * Exit(1) if the file doesn't exist; writes a FATAL log message.
- */
-inline void CheckFileExistenceFATAL(std::string filename) {
- if(!FileExists(filename))
- FATAL_ERROR("File " << filename << " doesn't exist or can't be read!");
-}
-
-inline void make_dirs(const std::string& path) {
- VERIFY(!path.empty());
-
- size_t slash_pos = 0;
- while ((slash_pos = path.find_first_of('/', slash_pos + 1)) != std::string::npos) {
- make_dir(path.substr(0, slash_pos));
- }
- if (path[path.size() - 1] != '/') {
- make_dir(path);
- }
-}
-
-// doesn't support symlinks
-std::string resolve(std::string const& path);
-std::string make_relative_path(std::string p, std::string base = current_dir());
-}
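
A short sketch of the path helpers declared above; PrepareRun and its arguments
are illustrative, and the function bodies live in the matching path_helper.cpp.

    #include "path_helper.hpp"  // header removed above
    #include <string>

    void PrepareRun(const std::string& out_dir, const std::string& reads) {
        // make_dirs() creates every missing level of the path in turn.
        path::make_dirs(out_dir + "/pics/components");
        // CheckFileExistenceFATAL() logs a FATAL error and exits if the file
        // cannot be stat()'ed as a regular file.
        path::CheckFileExistenceFATAL(reads);
    }
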
diff --git a/src/include/perfcounter.hpp b/src/include/perfcounter.hpp
deleted file mode 100644
index 0f47fee..0000000
--- a/src/include/perfcounter.hpp
+++ /dev/null
@@ -1,123 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-#include <sys/time.h>
-#include <string>
-#include <cppformat/format.h>
-
-struct perf_counter
-{
- perf_counter()
- {
- reset();
- }
-
- double time() const
- {
- struct timeval now;
- gettimeofday(&now, NULL);
-
- return (double)(now.tv_sec - time_.tv_sec) + (double)(now.tv_usec - time_.tv_usec) * 1e-6;
- }
-
- double time_ms() const
- {
- return time() * 1e3;
- }
-
- void reset()
- {
- gettimeofday(&time_, NULL);
- }
-
-private:
- struct timeval time_;
-};
-
-
-inline std::string human_readable_time(double time_in_sec)
-{
-// assert(time_in_sec > 0);
-
- size_t msec = size_t(time_in_sec * 1000) % 1000;
- size_t sec = size_t(time_in_sec);
- size_t hours = sec / 3600;
- size_t mins = (sec / 60) % 60;
- sec %= 60;
-
- return fmt::format("{:3d}:{:02d}:{:02d}.{:03d}", hours, mins, sec, msec);
-}
-
-inline std::string human_readable_memory(size_t max_rss) {
- if (max_rss < 1024 * 1024) {
- return fmt::format("{:d}M", (max_rss / 1024));
- } else {
- return fmt::format("{:d}G", (max_rss / (1024 * 1024)));
- }
-}
-
-struct avg_perf_counter
-{
- avg_perf_counter(/*const string& name*/)// : name_(name)
- {
- reset();
- }
-
-// ~avg_perf_counter() {
-// cout << "Time in counter " << name_ << ": " << human_readable_time(time()) << endl;
-// }
-
- int start(int ret = 0)
- {
- p_cnt_.reset();
- return ret;
- }
-
- int stop(int ret = 0)
- {
- counter_++;
- whole_time_ += p_cnt_.time();
- return ret;
- }
- double time() const
- {
- return whole_time_;
- }
- size_t counts()
- {
- return counter_;
- }
- double time_ms() const
- {
- return time() * 1e3;
- }
-
- double avg_time() const
- {
- return counter_ > 0 ? whole_time_/(double)counter_ : 0.;
- }
-
- double avg_time_ms() const
- {
- return avg_time() * 1e3;
- }
-
- void reset()
- {
- p_cnt_.reset();
- whole_time_ = 0;
- counter_ = 0;
- }
-
-private:
- const std::string name_;
- perf_counter p_cnt_;
- double whole_time_;
- size_t counter_;
-
-};
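
A minimal sketch of the counters above; TimedStage is an illustrative name and
the work inside the loop is elided.

    #include "perfcounter.hpp"  // header removed above
    #include <iostream>

    void TimedStage(size_t n) {
        perf_counter total;          // starts ticking at construction
        avg_perf_counter per_iter;   // accumulates start()/stop() intervals
        for (size_t i = 0; i < n; ++i) {
            per_iter.start();
            // ... stage body ...
            per_iter.stop();
        }
        std::cout << "stage took " << human_readable_time(total.time())
                  << ", avg iteration " << per_iter.avg_time_ms() << " ms" << std::endl;
    }
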
diff --git a/src/include/pred.hpp b/src/include/pred.hpp
deleted file mode 100644
index af85372..0000000
--- a/src/include/pred.hpp
+++ /dev/null
@@ -1,165 +0,0 @@
-#ifndef __ADT_PRED_HPP__
-#define __ADT_PRED_HPP__
-
-#pragma once
-
-#include "adt/function_traits.hpp"
-
-#include <memory>
-#include <functional>
-
-namespace pred {
-
-template<typename T>
-class TypedPredicate {
- public:
- typedef T checked_type;
-
- template<typename P>
- TypedPredicate(P p)
- : self_(std::make_shared<TypedPredicateModel<P> >(std::move(p))) {}
-
- bool operator()(T x) const {
- return self_->operator()(x);
- }
-
- private:
- struct TypedPredicateConcept {
- virtual ~TypedPredicateConcept() {};
- virtual bool operator()(T x) const = 0;
- };
-
- template<class P>
- struct TypedPredicateModel : TypedPredicateConcept {
- TypedPredicateModel(P p)
- : data_(std::move(p)) {}
-
- virtual bool operator()(T x) const override {
- return data_(x);
- }
-
- P data_;
- };
-
- std::shared_ptr<const TypedPredicateConcept> self_;
-};
-
-template<typename T>
-class AlwaysTrueOperator {
- public:
- typedef T checked_type;
-
- bool operator()(T) const {
- return true;
- }
-};
-
-template<typename T>
-class AlwaysFalseOperator {
- typedef T checked_type;
-
- public:
- bool operator()(T) const {
- return false;
- }
-};
-
-template<typename T>
-class AndOperator {
- public:
- typedef T checked_type;
-
- AndOperator(TypedPredicate<T> lhs, TypedPredicate<T> rhs)
- : lhs_(std::move(lhs)),
- rhs_(std::move(rhs)) { }
-
- bool operator()(T x) const {
- return lhs_(x) && rhs_(x);
- }
-
- private:
- const TypedPredicate<T> lhs_, rhs_;
-};
-
-template<typename T>
-class OrOperator {
- public:
- typedef T checked_type;
-
- OrOperator(TypedPredicate<T> lhs, TypedPredicate<T> rhs)
- : lhs_(std::move(lhs)), rhs_(std::move(rhs)) { }
-
- bool operator()(T x) const {
- return lhs_(x) || rhs_(x);
- }
-
- private:
- const TypedPredicate<T> lhs_, rhs_;
-};
-
-template<typename T>
-class NotOperator {
- public:
- typedef T checked_type;
-
- NotOperator(const TypedPredicate<T> p)
- : p_(std::move(p)) {}
-
- bool operator()(T x) const {
- return !p_(x);
- }
-
- private:
- const TypedPredicate<T> p_;
-};
-
-template<class P,
- bool = adt::function_traits<P>::arity == 1 &&
- std::is_same<typename adt::function_traits<P>::return_type, bool>::value>
-struct is_predicate : public std::true_type {};
-
-template<class P>
-struct is_predicate<P, false> : public std::false_type {};
-
-template<class TP1, class TP2,
- typename _T1 = typename adt::function_traits<TP1>::template arg<0>::type,
- typename _T2 = typename adt::function_traits<TP2>::template arg<0>::type,
- typename =
- typename std::enable_if<std::is_same<_T1, _T2>::value &&
- is_predicate<TP1>::value && is_predicate<TP2>::value
- >::type>
-TypedPredicate<_T1> And(TP1 lhs, TP2 rhs) {
- return AndOperator<_T1>(lhs, rhs);
-}
-
-template<class TP1, class TP2,
- typename _T1 = typename adt::function_traits<TP1>::template arg<0>::type,
- typename _T2 = typename adt::function_traits<TP2>::template arg<0>::type,
- typename =
- typename std::enable_if<std::is_same<_T1, _T2>::value &&
- is_predicate<TP1>::value && is_predicate<TP2>::value
- >::type>
-TypedPredicate<_T1> Or(TP1 lhs, TP2 rhs) {
- return OrOperator<_T1>(lhs, rhs);
-}
-
-template<class TP,
- typename _T = typename adt::function_traits<TP>::template arg<0>::type,
- typename =
- typename std::enable_if<is_predicate<TP>::value>::type>
-TypedPredicate<_T> Not(TP p) {
- return NotOperator<_T>(p);
-}
-
-template<class T>
-TypedPredicate<T> AlwaysTrue() {
- return AlwaysTrueOperator<T>();
-}
-template<class T>
-TypedPredicate<T> AlwaysFalse() {
- return AlwaysFalseOperator<T>();
-}
-
-} // namespace pred
-
-#endif // __ADT_PRED_HPP__
diff --git a/src/include/runtime_k.hpp b/src/include/runtime_k.hpp
deleted file mode 100644
index f70bcb6..0000000
--- a/src/include/runtime_k.hpp
+++ /dev/null
@@ -1,87 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-/*
- * runtime_map.hpp
- *
- * Created on: Jun 21, 2012
- * Author: andrey
- */
-
-#ifndef RUNTIME_K_HPP_
-#define RUNTIME_K_HPP_
-
-#include "sequence/sequence.hpp"
-#include "sequence/seq.hpp"
-#include "sequence/simple_seq.hpp"
-#include "sequence/rtseq.hpp"
-
-
-#include <unordered_map>
-#include <unordered_set>
-#include <vector>
-
-#include "k_range.hpp"
-
-namespace runtime_k {
-
-#define T_SIZE sizeof(seq_element_type)
-
-#define GET_T_ELEMENTS_NUMBER(value) ((value - 1) / (T_SIZE << 2) + 1)
-
-#define GET_K_BY_TS(value) (value * (T_SIZE << 2))
-
-#define GET_UPPER_BOUND(value) GET_K_BY_TS( GET_T_ELEMENTS_NUMBER(value) )
-
-
-const size_t UPPER_BOUND = GET_UPPER_BOUND(MAX_K); //((MAX_K - 1) / (sizeof(seq_element_type) << 2) + 1) * (sizeof(seq_element_type) << 2);
-
-const size_t MAX_TS = GET_T_ELEMENTS_NUMBER(MAX_K);
-
-const size_t MIN_TS = GET_T_ELEMENTS_NUMBER(MIN_K);
-
-
-typedef RuntimeSeq<UPPER_BOUND> RtSeq;
-
-
-//Basic types and sequence <---> kmer functions
-template <size_t size_>
-class TypeContainerImpl {
-public:
- typedef SimpleSeq<size_> Kmer;
-
- typedef unordered_set<Kmer, typename Kmer::hash, typename Kmer::equal_to> set_type;
-
- typedef std::vector<Kmer> vector_type;
-
- static Kmer from_sequence(const RtSeq& seq) {
- return seq.get_sseq<size_>();
- }
-
- static RtSeq to_sequence(const Kmer& kmer, size_t k = size_) {
- return RtSeq(kmer, k);
- }
-};
-
-
-template <size_t size_, typename Value>
-class TypeValueContainerImpl: public TypeContainerImpl<size_> {
-
-public:
- typedef TypeContainerImpl<size_> base;
-
- typedef typename base::Kmer Kmer;
-
- typedef typename base::set_type set_type;
-
- typedef unordered_map<Kmer, Value, typename Kmer::hash, typename Kmer::equal_to> map_type;
-
-};
-
-} /* namespace runtime_k */
-
-#endif /* RUNTIME_K_HPP_ */
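
A sketch of the run-time k-mer type defined above; CanonicalKmer is an
illustrative helper and assumes k <= MAX_K and a plain ACGT input of length at
least k.

    #include "runtime_k.hpp"  // header removed above
    #include <string>

    std::string CanonicalKmer(const std::string& read, size_t k) {
        runtime_k::RtSeq kmer(k, read);              // packs the first k characters, 2 bits each
        runtime_k::RtSeq rc = !kmer;                 // reverse complement
        return (kmer.IsMinimal() ? kmer : rc).str(); // the smaller of kmer and its RC
    }
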
diff --git a/src/include/segfault_handler.hpp b/src/include/segfault_handler.hpp
deleted file mode 100644
index f413bf2..0000000
--- a/src/include/segfault_handler.hpp
+++ /dev/null
@@ -1,56 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-
-#pragma once
-
-#include "stacktrace.hpp"
-
-#include <signal.h>
-
-struct segfault_handler : boost::noncopyable {
- typedef std::function<void ()> callback_t;
- typedef void (*seg_handler_t)(int);
-
- segfault_handler(callback_t const& cb = 0) {
- if (callback() != 0)
- throw std::runtime_error("failed to initialize segfault_handler, it has been already initialized");
-
- callback() = cb;
- old_func_ = signal(SIGSEGV, &segfault_handler::handler);
- }
-
- ~segfault_handler() {
- callback() = 0;
- signal(SIGSEGV, old_func_);
- }
-
- private:
- static callback_t& callback() {
- static callback_t cb = 0;
- return cb;
- }
-
- static void handler(int signum) {
- if (signum == SIGSEGV) {
- std::cerr << "The program was terminated by segmentation fault" << std::endl;
- print_stacktrace();
-
- if (callback())
- callback()();
- }
-
- //TEST!!
- exit(1);
-
- signal(signum, SIG_DFL);
- kill (getpid(), signum);
- }
-
- private:
- seg_handler_t old_func_;
-};
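
A sketch of how the handler above was meant to be installed; the callback body
is an illustrative placeholder.

    #include "segfault_handler.hpp"  // header removed above
    #include <iostream>

    int main() {
        // Installs a SIGSEGV handler for the lifetime of this object; the optional
        // callback runs after the stacktrace has been printed.
        segfault_handler handler([] { std::cerr << "flushing logs before exit" << std::endl; });
        // ... program body ...
        return 0;
    }
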
diff --git a/src/include/sequence/nucl.hpp b/src/include/sequence/nucl.hpp
deleted file mode 100755
index 60a9af8..0000000
--- a/src/include/sequence/nucl.hpp
+++ /dev/null
@@ -1,123 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-/**
- * @file nucl.hpp
- * @author vyahhi
- * @version 1.0
- *
- * @section LICENSE
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation; either version 2 of
- * the License, or (at your option) any later version.
- *
- * @section DESCRIPTION
- *
- * Simple operations and checks for nucleotide-letters
- *
- */
-
-
-#ifndef NUCL_HPP_
-#define NUCL_HPP_
-
-#include "verify.hpp"
-#include <iostream>
-
-const char dignucl_map['T' + 1] = {
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3};
-
-const bool isnucl_map[256] = {
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1};
-
-const char nucl_map[4] = {'A', 'C', 'G', 'T'};
-
-const char nucl_complement_map['T' + 1] = {
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 'T', 0, 'G', 0, 0, 0, 'C', 0, 0, 0, 0, 0, 0, 'N', 0, 0, 0, 0, 0, 'A'};
-
-/**
- * ACGT -> true
- * @param char c
- * @return true if c is 'A', 'C', 'G' or 'T'.
- */
-inline bool is_nucl(char c) { // is ACGT
- return isnucl_map[(unsigned)c];
-}
-
-/**
- * 0123 -> true
- * @param char c
- * @return true if c is 0, 1, 2 or 3.
- */
-inline bool is_dignucl(char c) { // is 0123
- return (c < 4);
-}
-
-/**
- * 0123 -> 3210
- * @param char c
- * @return c ^ 3
- */
-inline char complement(char c) {
- // VERIFY(is_dignucl(c));
- return c ^ 3;
-}
-
-/**
- * ACGT -> TGCA
- * @param char c is 'A', 'C', 'G', 'T' or 'N'
- * @return complement symbol, i.e. 'A' => 'T', 'C' => 'G', 'G' => 'C', 'T' => 'A', 'N' => 'N'
- */
-
-struct nucl_complement_functor { // still unused
- inline char operator() (char c) const {
- char cc = nucl_complement_map[(unsigned)c];
- return cc ? cc : 'N';
- }
-};
-
-inline char nucl_complement(char c){
- // TODO: deal with 'N' case
- //VERIFY(is_nucl(c));
- char cc = nucl_complement_map[(unsigned)c];
- return cc ? cc : 'N';
-}
-
-/**
- * 0123 -> ACGT
- * @param char c is 0, 1, 2 or 3
- * @return 0 => 'A', 1 => 'C', 2 => 'G', 3 => 'T'
- */
-inline char nucl(char c) {
- return nucl_map[(unsigned)c];
-}
-
-/**
- * ACGT -> 0123
- * @param char c is 'A', 'C', 'G' or 'T'
- * @return A => 0, C => 1, G => 2, T => 3
- */
-
-/*
-struct dignucl : public unary_function<int,bool> {
- bool operator()(signed char c) const {
- return dignucl_map[c];
- }
-};*/
-
-inline char dignucl(char c) {
- // VERIFY(is_nucl(c));
- return dignucl_map[(unsigned)c];
-}
-
-
-#endif /* NUCL_HPP_ */
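
A small sketch built on the lookup helpers above; ReverseComplement is an
illustrative helper and the input is assumed to be uppercase ACGT/N.

    #include "nucl.hpp"  // header removed above
    #include <string>

    inline std::string ReverseComplement(const std::string& s) {
        std::string rc(s.rbegin(), s.rend());
        for (size_t i = 0; i < rc.size(); ++i)
            rc[i] = is_nucl(rc[i]) ? nucl_complement(rc[i]) : 'N';
        return rc;
    }
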
diff --git a/src/include/sequence/quality.hpp b/src/include/sequence/quality.hpp
deleted file mode 100755
index 9de743a..0000000
--- a/src/include/sequence/quality.hpp
+++ /dev/null
@@ -1,39 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-/*
- * qual.hpp
- *
- * Created on: 03.03.2011
- * Author: vyahhi
- */
-
-#ifndef QUAL_HPP_
-#define QUAL_HPP_
-
-#include <string>
-//todo really strange class
-class Quality {
-public:
-
- Quality(const std::string &s) : qual_(s) {
- }
-
- int operator[](size_t i) const {
- return qual_[i];
- }
-
- std::string str() const { // copying (defensive)!
- return qual_;
- }
-
-private:
- std::string qual_;
- //friend class ireadstream;
-};
-
-#endif /* QUAL_HPP_ */
diff --git a/src/include/sequence/rtseq.hpp b/src/include/sequence/rtseq.hpp
deleted file mode 100644
index 2dbe934..0000000
--- a/src/include/sequence/rtseq.hpp
+++ /dev/null
@@ -1,724 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-/*
- * rtseq.hpp
- *
- * Created on: Jun 28, 2012
- * Author: andrey
- */
-
-#ifndef RTSEQ_HPP_
-#define RTSEQ_HPP_
-
-#include <string>
-#include "verify.hpp"
-#include <array>
-#include <algorithm>
-#include "sequence/nucl.hpp"
-#include "log.hpp"
-#include "seq_common.hpp"
-#include "seq.hpp"
-#include "simple_seq.hpp"
-
-#include <cstring>
-#include <iostream>
-
-template<size_t max_size_, typename T = seq_element_type>
-class RuntimeSeq {
- public:
- /**
- * @variable Number of bits in type T (e.g. 8 for char)
- * @example 8 for char, 16 for a two-byte T
- */
- const static size_t TBits = sizeof(T) << 3;
-
- /**
- * @variable Number of nucleotides that can be stored in one type T (e.g. 4 for char)
- * TNucl MUST be a power of two
- * @example 4: 8/2 = 4 or 16/2 = 8
- */
- const static size_t TNucl = TBits >> 1;
-
- /**
- * @variable Number of bits in TNucl (e.g. 2 for char). Useful for shifts instead of divisions.
- */
- const static size_t TNuclBits = log_<TNucl, 2>::value;
-
- const static size_t Iterations = log_<TBits, 2>::value;
-
- static const std::array<T, Iterations> ConstructLeftMasks() {
- std::array<T, Iterations> result;
- for(size_t i = 0; i < Iterations; i++) {
- size_t shift = 1 << i;
- T mask = T(T(1) << shift) - T(1);
- result[i] = T(mask << shift);
- for(size_t j = 0; j < i; j++) {
- result[j] += T(result[j] << shift);
- }
- }
- return result;
- }
-
- static const std::array<T, Iterations> ConstructRightMasks() {
- std::array<T, Iterations> result(ConstructLeftMasks());
- for(size_t i = 0; i < Iterations; i++) {
- result[i] = T(~result[i]);
- }
- return result;
- }
-
-
-
- RuntimeSeq<max_size_, T> FastRC() const {
- const static std::array<T, Iterations> LeftMasks(ConstructLeftMasks());
- const static std::array<T, Iterations> RightMasks(ConstructRightMasks());
- const static size_t LogTSize = log_<sizeof(T), 2>::value + 3;
-
- RuntimeSeq<max_size_, T> res(this->size());
-
- const size_t bit_size = size_ << 1;
- const size_t extra = bit_size & ((1 << LogTSize) - 1);
- const size_t to_extra = TBits - extra;
- const size_t filled = bit_size >> LogTSize;
- size_t real_length = filled;
- if(extra == 0) {
- for(size_t i = 0, j = filled - 1; i < filled; i++,j--) {
- res.data_[i] = data_[j];
- }
- } else {
- for(size_t i = 0, j = filled; i < filled && j > 0; i++,j--) {
- res.data_[i] = (data_[j] << to_extra) + (data_[j - 1] >> extra);
- }
- res.data_[filled] = (data_[0] << to_extra);
- real_length++;
- }
-
- for(size_t i = 0; i < real_length; i++) {
- res.data_[i] = res.data_[i] ^ T(-1);
- for(size_t it = 1; it < Iterations; it++) {
- size_t shift = 1 << it;
- res.data_[i] = T((res.data_[i] & LeftMasks[it]) >> shift) ^ T((res.data_[i] & RightMasks[it]) << shift);
- }
- }
-
- if(extra != 0) {
- res.data_[real_length - 1] = (res.data_[real_length - 1] & ((T(1) << extra) - 1));
- }
- return res;
- }
-
- /**
- * @variable Number of Ts required to store the whole sequence.
- */
- const static size_t DataSize = (max_size_ + TNucl - 1) >> TNuclBits;
-
- /**
- * @variable Number of meaningful bytes in which the seq is stored
- */
- const static size_t TotalBytes = sizeof(T) * DataSize;
-
- typedef T DataType;
-
- static size_t GetDataSize(size_t size) {
- return (size + TNucl - 1) >> TNuclBits;
- }
-
- private:
- /**
- * @variable Just some prime number used in the kmer hash function
- */
- const static size_t PrimeNum = 239;
-
-
- // number of nucleotides in the last data_ bucket
- static size_t NuclsRemain(size_t size) {
- return size & (TNucl - 1);
- }
-
- // useful mask to fill the last element of the data_ array
- static size_t MaskForLastBucket(size_t size) {
- size_t nr = NuclsRemain(size);
- return nr != 0 ? (((T) 1) << (nr << 1) ) - 1 : -1ul;
- }
-
-
- /**
- * @variable Inner representation of sequence: array of Ts with length = DataSize.
- *
- * @invariant Invariant: all nucleotides >= size_ are 'A's (useful for comparison)
- */
- std::array<T, DataSize> data_;
-
- size_t size_;
-
- /**
- * Initialize data_ array of this object with C-string
- *
- * @param s C-string (ACGT chars only), strlen(s) = size_
- */
- void init(const char* s) {
- T data = 0;
- size_t cnt = 0;
- size_t cur = 0;
- for (size_t pos = 0; pos < size_; ++pos, ++s) { // unsafe!
- // VERIFY(is_nucl(*s)); // for performance
- data = data | ((T) dignucl(*s) << cnt);
- cnt += 2;
- if (cnt == TBits) {
- this->data_[cur++] = data;
- cnt = 0;
- data = 0;
- }
- }
- if (cnt != 0) {
- this->data_[cur++] = data;
- }
-
- for (; cur < DataSize; ++cur)
- this->data_[cur] = 0;
-
- VERIFY(*s == 0); // a C-string always ends with 0
- }
-
- /**
- * Sets i-th symbol of Seq with 0123-char
- */
- inline void set(const size_t i, char c) {
- data_[i >> TNuclBits] = (data_[i >> TNuclBits] & ~((T) 3 << ((i & (TNucl - 1)) << 1))) | ((T) c << ((i & (TNucl - 1)) << 1));
- }
-
- // Template voodoo to calculate the length of the string regardless of whether it is std::string or const char*
- template<class S>
- size_t size(const S& t,
- typename std::enable_if<std::is_class<S>::value, T>::type* = 0) {
- return t.size();
- }
- template<class S>
- size_t size(const S& t,
- typename std::enable_if<std::is_same<S, const char*>::value, T>::type* = 0) {
- return strlen(t);
- }
-
-
-
- public:
-
- const static size_t max_size = max_size_;
-
- RuntimeSeq() : size_(0) {
- std::fill(data_.begin(), data_.end(), 0);
- }
-
- /**
- * Default constructor, fills Seq with A's
- */
-
- explicit RuntimeSeq(size_t k): size_(k) {
- VERIFY(k <= max_size_);
- //VERIFY((T)(-1) >= (T)0);//be sure to use unsigned types
- std::fill(data_.begin(), data_.end(), 0);
- }
-
- RuntimeSeq(size_t k, const char* s): size_(k) {
- VERIFY(k <= max_size_);
- //VERIFY((T)(-1) >= (T)0);//be sure to use unsigned types
- init(s);
- }
-
-
- explicit RuntimeSeq(size_t k, const T* data_array): size_(k) {
- VERIFY(k <= max_size_);
- std::fill(data_.begin(), data_.end(), 0);
-
- size_t data_size = GetDataSize(size_);
- memcpy(data_.data(), data_array, data_size * sizeof(T));
-
- if (NuclsRemain(size_)) {
- data_[data_size - 1] = data_[data_size - 1] & MaskForLastBucket(size_);
- }
- }
-
- explicit RuntimeSeq(size_t k, T* data_array): size_(k) {
- VERIFY(k <= max_size_);
- std::fill(data_.begin(), data_.end(), 0);
-
- size_t data_size = GetDataSize(size_);
- memcpy(data_.data(), data_array, data_size * sizeof(T));
-
- if (NuclsRemain(size_)) {
- data_[data_size - 1] = data_[data_size - 1] & MaskForLastBucket(size_);
- }
- }
-
- template<size_t size2_, typename T2 = T>
- explicit RuntimeSeq(const Seq<size2_, T2>& seq, bool): size_(size2_) {
- VERIFY(size_ <= max_size_);
- std::fill(data_.begin(), data_.end(), 0);
- seq.copy_data(data_.data());
- }
-
- template<size_t size2_, typename T2 = T>
- explicit RuntimeSeq(const SimpleSeq<size2_, T2>& seq, size_t k): size_(k) {
- VERIFY(size_ <= max_size_);
- VERIFY(size2_ <= max_size_);
- std::fill(data_.begin(), data_.end(), 0);
- seq.copy_data(data_.data());
- }
-
-
- /**
- * Ultimate constructor from ACGT0123-string.
- *
- * @param s Any object with operator[], which returns 0123 chars
- * @param offset Offset at which this sequence starts
- * @param number_to_read Number of nucleotides we want to fetch from this string
- * @warning assuming that s is a correct string, filled with ACGT _OR_ 0123
- * no init method, filling right here
- */
- template<typename S>
- explicit RuntimeSeq(size_t k, const S &s, size_t offset = 0): size_(k) {
- VERIFY(size_ <= max_size_);
- //TRACE("New Constructor for seq " << s[0] << " is first symbol");
- VERIFY(size_ == 0 || is_dignucl(s[0]) || is_nucl(s[0]));
- VERIFY(offset + size_ <= this->size(s));
-
- // which symbols does our string contain : 0123 or ACGT?
- bool digit_str = size_ == 0 || is_dignucl(s[0]);
-
- // data -- one temporary variable corresponding to the i-th array element
- // and some counters
- T data = 0;
- size_t cnt = 0;
- size_t cur = 0;
-
- for (size_t i = 0; i < size_; ++i) {
- //VERIFY(is_dignucl(s[i]) || is_nucl(s[i]));
-
- // we fill everything with zeros (As) by default.
- char c = (char) (digit_str ? s[offset + i] : dignucl(s[offset + i]));
-
- data = data | (T(c) << cnt);
- cnt += 2;
-
- if (cnt == TBits) {
- this->data_[cur++] = data;
- cnt = 0;
- data = 0;
- }
- }
-
- if (cnt != 0) {
- this->data_[cur++] = data;
- }
-
- for (; cur < DataSize; ++cur)
- this->data_[cur] = 0;
- }
-
- /**
- * Reads sequence from the file (in the same format as BinWrite writes it)
- * and returns false if an error occurred, true otherwise.
- */
- bool BinRead(std::istream& file) {
- file.read((char *) data_.data(), sizeof(T) * GetDataSize(size_));
- return !file.fail();
- }
-
- /**
- * Writes sequence to the file (in the same format as BinRead reads it)
- * and returns false if an error occurred, true otherwise.
- */
- bool BinWrite(std::ostream& file) const {
- file.write((const char *) data_.data(), sizeof(T) * GetDataSize(size_));
- return !file.fail();
- }
-
- /**
- * Reads sequence from the file (in the same format as BinWrite writes it)
- * and returns false if an error occurred, true otherwise.
- */
- static bool BinRead(std::istream& file, RuntimeSeq<max_size_, T> *seq) {
- return seq->BinRead(file);
- }
-
- /**
- * Writes sequence to the file (in the same format as BinRead reads it)
- * and returns false if error occured, true otherwise.
- */
- static bool BinWrite(std::ostream& file, const RuntimeSeq<max_size_, T> &seq) {
- return seq.BinWrite(file);
- }
-
-
- /**
- * Get i-th symbol of Seq.
- *
- * @param i Index of the symbol (0 <= i < size_)
- * @return 0123-char on position i
- */
- char operator[](const size_t i) const {
- VERIFY(i < size_);
- return (data_[i >> TNuclBits] >> ((i & (TNucl - 1)) << 1)) & 3;
- }
-
- /**
- * Reverse complement.
- *
- * @return Reverse complement Seq.
- */
- RuntimeSeq<max_size_, T> operator!() const {
-// RuntimeSeq<max_size_, T> res(*this);
-// for (size_t i = 0; i < (size_ >> 1); ++i) {
-// auto front = complement(res[i]);
-// auto end = complement(res[size_ - 1 - i]);
-// res.set(i, end);
-// res.set(size_ - 1 - i, front);
-// }
-// if ((size_ & 1) == 1) {
-// res.set(size_ >> 1, complement(res[size_ >> 1]));
-// }
- return FastRC();
-// return res;
- }
-
- /**
- * Is the kmer minimal among this and !this.
- *
- * @return True if kmer < !kmer and false otherwise.
- */
- bool IsMinimal() const {
- for (size_t i = 0; (i << 1) + 1 <= size_; ++i) {
- auto front = this->operator[](i);
- auto end = complement(this->operator[](size_ - 1 - i));
- if(front != end)
- return front < end;
- }
- return true;
- }
-
- /**
- * Shift left
- *
- * @param c New 0123 char which should be added to the right.
- * @return Shifted (to the left) sequence with 'c' char on the right.
- */
- RuntimeSeq<max_size_, T> operator<<(char c) const {
- if (is_nucl(c)) {
- c = dignucl(c);
- }
-
- RuntimeSeq<max_size_, T> res(*this);
- std::array<T, DataSize>& data = res.data_;
-
- size_t data_size = GetDataSize(size_);
-
- if (data_size != 0) { // unless empty sequence
- T rm = data[data_size - 1] & 3;
- T lastnuclshift_ = ((size_ + TNucl - 1) & (TNucl - 1)) << 1;
- data[data_size - 1] = (data[data_size - 1] >> 2) | ((T) c << lastnuclshift_);
-
- if (data_size >= 2) { // if we have at least 2 elements in data
- for (int i = (int) data_size - 2; i >= 0; --i){
- T new_rm = data[i] & 3;
- data[i] = (data[i] >> 2) | (rm << (TBits - 2)); // we need & here because if we shift a negative value, it fills with ones :(
- rm = new_rm;
- }
- }
- }
- return res;
- }
-
- void operator <<=(char c) {
- if (is_nucl(c)) {
- c = dignucl(c);
- }
-
- size_t data_size = GetDataSize(size_);
-
- if (data_size == 0) {
- return;
- }
-
- for (size_t i = 0; i < data_size - 1; ++i) {
- data_[i] = (data_[i] >> 2) | (((T) data_[i + 1] & 3) << (TBits - 2));
- }
-
- T lastnuclshift_ = ((size_ + TNucl - 1) & (TNucl - 1)) << 1;
- data_[data_size - 1] = (data_[data_size - 1] >> 2) | ((T) c << lastnuclshift_);
- }
-
-//todo naming convention violation!
- RuntimeSeq<max_size_, T> pushBack(char c) const {
- //VERIFY(size_ + 1 <= max_size_);
-
- if (is_nucl(c)) {
- c = dignucl(c);
- }
- //VERIFY(is_dignucl(c));
- RuntimeSeq<max_size_, T> s(size_ + 1);
- copy(this->data_.begin(), this->data_.end(), s.data_.begin());
-
- size_t data_size = GetDataSize(size_ + 1);
-
- s.data_[data_size - 1] |= ((T) c << ((size_ & (TNucl - 1)) << 1));
-
- return s; //was: Seq<size_ + 1, T>(str() + nucl(c));
- }
-
-
-//todo naming convention violation!
- void pushBackThis(char c) {
- VERIFY(size_ + 1 <= max_size_);
-
- if (is_nucl(c)) {
- c = dignucl(c);
- }
-
- size_ += 1;
- size_t data_size = GetDataSize(size_);
-
- data_[data_size - 1] |= ((T) c << (((size_ - 1) & (TNucl - 1)) << 1));
- }
-
- // /**
- // * @todo optimize!!!
- // */
- // RuntimeSeq<max_size_, T> pushFront(char c) const {
- // VERIFY(size_ + 1 < max_size_);
- // if (is_nucl(c)) {
- // c = dignucl(c);
- // }
- // VERIFY(is_dignucl(c));
- // return RuntimeSeq<max_size_, T> (size_ + 1, nucl(c) + str());
- // }
-
- //todo naming convention violation!
- RuntimeSeq<max_size_, T> pushFront(char c) const {
- VERIFY(size_ + 1 <= max_size_);
- if (is_nucl(c)) {
- c = dignucl(c);
- }
- VERIFY(is_dignucl(c));
- RuntimeSeq<max_size_, T> res(size_ + 1);
-
- size_t data_size = GetDataSize(size_ + 1);
-
- T rm = c;
- for (size_t i = 0; i < data_size; ++i) {
- T new_rm = (data_[i] >> (TBits - 2)) & 3;
- res.data_[i] = (data_[i] << 2) | rm;
- rm = new_rm;
- }
-
- return res;
- }
-
-//todo naming convention violation!
- void pushFrontThis(char c) {
- VERIFY(size_ + 1 <= max_size_);
-
- if (is_nucl(c)) {
- c = dignucl(c);
- }
-
- size_ += 1;
- size_t data_size = GetDataSize(size_);
-
- T rm = c;
- for (size_t i = 0; i < data_size; ++i) {
- T new_rm = (data_[i] >> (TBits - 2)) & 3;
- data_[i] = (data_[i] << 2) | rm;
- rm = new_rm;
- }
- }
-
- /**
- * Shift right
- *
- * @param c New 0123 char which should be added to the left.
- * @return Shifted (to the right) sequence with 'c' char on the left.
- */
- RuntimeSeq<max_size_, T> operator>>(char c) const {
- if (is_nucl(c)) {
- c = dignucl(c);
- }
- VERIFY(is_dignucl(c));
-
- RuntimeSeq<max_size_, T> res(*this);
- size_t data_size = GetDataSize(size_);
-
- T rm = c;
- for (size_t i = 0; i < data_size; ++i) {
- T new_rm = (res.data_[i] >> (TBits - 2)) & 3;
- res.data_[i] = (res.data_[i] << 2) | rm;
- rm = new_rm;
- }
-
- res.data_[data_size - 1] &= MaskForLastBucket(size_);
-
- return res;
- }
-
- //todo remove code duplication!
- void operator>>=(char c) {
- if (is_nucl(c)) {
- c = dignucl(c);
- }
- VERIFY(is_dignucl(c));
-
- size_t data_size = GetDataSize(size_);
-
- T rm = (T)c;
- for (size_t i = 0; i < data_size; ++i) {
- T new_rm = (data_[i] >> (TBits - 2)) & 3;
- data_[i] = (data_[i] << 2) | rm;
- rm = new_rm;
- }
-
- data_[data_size - 1] &= MaskForLastBucket(size_);
- }
-
- bool operator==(const RuntimeSeq<max_size_, T>& s) const {
- VERIFY(size_ == s.size_);
- // INFO(this->full_str());
- // INFO(s.full_str());
- return 0 == memcmp(data_.data(), s.data_.data(), sizeof(T) * DataSize);
- }
-
- /**
- * @see operator ==()
- */
-
-
-
- bool operator!=(const RuntimeSeq<max_size_, T>& s) const {
- return !operator==(s);
- }
-
- /**
- * String representation of this Seq
- *
- * @return ACGT-string of length size_
- * @see nucl()
- */
- std::string str() const {
- std::string res(size_, '-');
- for (size_t i = 0; i < size_; ++i) {
- res[i] = nucl(operator[](i));
- }
- return res;
- }
- std::string err() const {
- return "";
- }
-
-
- std::string full_str() const {
- std::string res(max_size, '-');
- for (size_t i = 0; i < max_size; ++i) {
- res[i] = nucl(operator[](i));
- }
- return res;
- }
-
- size_t size() const {
- return size_;
- }
-
- size_t data_size() const {
- return GetDataSize(size_);
- }
-
- const T * data() const {
- return data_.data();
- }
-
- template<size_t size2_, typename T2 = T>
- Seq<size2_, T2> get_seq() const {
- VERIFY(size2_ == size_);
- return Seq<size2_, T2>((T2*) data_.data());
- }
-
- template<size_t size2_, typename T2 = T>
- SimpleSeq<size2_, T2> get_sseq() const {
- VERIFY(size2_ <= max_size_);
- return SimpleSeq<size2_, T2>((T2*) data_.data());
- }
-
- void copy_data(void * dst) const {
- memcpy(dst, (const void *) data_.data(), GetDataSize(size_) * sizeof(T));
- }
-
- char last() const {
- return operator[](size_ - 1);
- }
-
- char first() const {
- return operator[](0);
- }
-
- static size_t GetHash(const DataType *data, size_t sz, uint32_t seed = 0) {
- return CityHash64WithSeed((const char*)data, sz * sizeof(DataType), 0x9E3779B9 ^ seed);
- }
-
- size_t GetHash(unsigned seed = 0) const {
- return GetHash(data_.data(), GetDataSize(size_), seed);
- }
-
- struct hash {
- size_t operator()(const RuntimeSeq<max_size_, T>& seq, uint32_t seed = 0) const {
- return seq.GetHash(seed);
- }
-
- size_t operator()(const DataType *data, size_t sz, unsigned seed = 0) {
- return GetHash(data, sz, seed);
- }
- };
-
- struct less2 {
- int operator()(const RuntimeSeq<max_size_, T> &l, const RuntimeSeq<max_size_, T> &r) const {
- for (size_t i = 0; i < l.size(); ++i) {
- if (l[i] != r[i]) {
- return (l[i] < r[i]);
- }
- }
- return l.size() < r.size();
- }
- };
-
- /**
- * Denotes some (weird) order on k-mers. Works fast.
- */
- struct less2_fast {
- bool operator()(const RuntimeSeq<max_size_, T> &l, const RuntimeSeq<max_size_, T> &r) const {
- return 0 > memcmp(l.data(), r.data(), sizeof(T) * l.data_size());
- }
- };
-
-};
-
-template<size_t max_size_, typename T = seq_element_type>
-bool operator<(const RuntimeSeq<max_size_, T> &l, const RuntimeSeq<max_size_, T> &r) {
- for (size_t i = 0; i < l.size(); ++i) {
- if (l[i] != r[i]) {
- return (l[i] < r[i]);
- }
- }
-
- return l.size() < r.size();
-}
-
-
-template<size_t max_size_, typename T>
-std::ostream& operator<<(std::ostream& os, RuntimeSeq<max_size_, T> seq) {
- os << seq.str();
- return os;
-}
-
-
-#endif /* RTSEQ_HPP_ */
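
For reference, the IsMinimal() member above implements the canonical k-mer test: a k-mer is kept only if it is lexicographically no larger than its reverse complement. Below is a minimal standalone sketch of the same rule, operating on a plain std::string of 0123 codes; it is not code from the tree and assumes the usual A=0, C=1, G=2, T=3 coding from nucl.hpp.

    #include <cassert>
    #include <string>

    // Mirrors RuntimeSeq::IsMinimal(): walk inwards from both ends, comparing the
    // k-mer with its reverse complement; the lexicographically smaller one wins.
    static int complement2bit(int c) { return 3 - c; }   // A<->T, C<->G in 0123 coding

    static bool IsCanonical(const std::string &kmer01) { // kmer01 holds 0..3 codes
        size_t n = kmer01.size();
        for (size_t i = 0; 2 * i + 1 <= n; ++i) {
            int front = kmer01[i];
            int end = complement2bit(kmer01[n - 1 - i]);
            if (front != end) return front < end;
        }
        return true;                                     // k-mer equals its own reverse complement
    }

    int main() {
        std::string acg = {0, 1, 2};   // "ACG"; its reverse complement "CGT" = {1, 2, 3} is larger
        assert(IsCanonical(acg));
        return 0;
    }
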
diff --git a/src/include/sequence/seq.hpp b/src/include/sequence/seq.hpp
deleted file mode 100755
index 848430f..0000000
--- a/src/include/sequence/seq.hpp
+++ /dev/null
@@ -1,525 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-/**
- * @file seq.hpp
- * @author vyahhi
- * @version 1.0
- *
- * @section LICENSE
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation; either version 2 of
- * the License, or (at your option) any later version.
- *
- * @section DESCRIPTION
- *
- * Immutable ACGT-sequence with compile-time size.
- * It compresses the sequence into an array of Ts (default: char).
- */
-
-#ifndef SEQ_HPP_
-#define SEQ_HPP_
-
-#include <string>
-#include <array>
-#include <algorithm>
-#include <cstring>
-#include <iostream>
-
-#include <city/city.h>
-
-#include "verify.hpp"
-#include "sequence/nucl.hpp"
-#include "log.hpp"
-#include "seq_common.hpp"
-
-
-/**
- * @param size_ maximal number of nucleotides
- * @param T type used for storage
- */
-template<size_t size_, typename T = seq_element_type>
-class Seq {
- public:
- /**
- * @variable Number of bits in type T (e.g. 8 for char)
- * @example 8 for char, 16 for a two-byte T
- */
- const static size_t TBits = sizeof(T) << 3;
-
- /**
- * @variable Number of nucleotides that can be stored in one type T (e.g. 4 for char)
- * TNucl MUST be a power of two
- * @example 4: 8/2 = 4 or 16/2 = 8
- */
- const static size_t TNucl = TBits >> 1;
-
- /**
- * @variable Number of bits in TNucl (e.g. 2 for char). Useful for shifts instead of divisions.
- */
- const static size_t TNuclBits = log_<TNucl, 2>::value;
-
- /**
- * @variable Number of Ts required to store the whole sequence.
- */
- const static size_t DataSize = (size_ + TNucl - 1) >> TNuclBits;
-
- typedef T DataType;
-
- /**
- * @variable Number of meaningful bytes in which the seq is stored
- */
- const static size_t TotalBytes = sizeof(T) * DataSize;
-
- static size_t GetDataSize(size_t size) {
- VERIFY(size == size_);
- return (size_ + TNucl - 1) >> TNuclBits;
- }
-
- private:
- /**
- * @variable Just some prime number used in the hash function of the kmer
- */
- const static size_t PrimeNum = 239;
-
- // number of nucleotides in the last data_ bucket
- const static size_t NuclsRemain = size_ & (TNucl - 1);
-
- // useful mask to fill the last element of the data_ array
- const static size_t MaskForLastBucket = (((T) 1) << (NuclsRemain << 1) ) - 1;
-
-
- /**
- * @variable Inner representation of sequence: array of Ts with length = DataSize.
- *
- * @invariant Invariant: all nucleotides >= size_ are 'A's (useful for comparison)
- */
- std::array<T, DataSize> data_;
-
- friend class Seq<size_ - 1, T> ;
-
- /**
- * Initialize data_ array of this object with C-string
- *
- * @param s C-string (ACGT chars only), strlen(s) = size_
- */
- void init(const char* s) {
- T data = 0;
- size_t cnt = 0;
- int cur = 0;
- for (size_t pos = 0; pos != size_; ++pos, ++s) { // unsafe!
- // VERIFY(is_nucl(*s)); // for performance
- data = data | (T)((T) dignucl(*s) << cnt);
- cnt += 2;
- if (cnt == TBits) {
- this->data_[cur++] = data;
- cnt = 0;
- data = 0;
- }
- }
- if (cnt != 0) {
- this->data_[cur++] = data;
- }
- VERIFY(*s == 0); // C-string always ends on 0
- }
-
- // Template voodoo to calculate the length of the string regardless of whether it is std::string or const char*
- template<class S>
- size_t size(const S& t,
- typename std::enable_if<std::is_class<S>::value, T>::type* = 0) {
- return t.size();
- }
- template<class S>
- size_t size(const S& t,
- typename std::enable_if<std::is_same<S, const char*>::value, T>::type* = 0) {
- return strlen(t);
- }
-
- public:
- /**
- * Default constructor, fills Seq with A's
- */
- Seq() {
- std::fill(data_.begin(), data_.end(), 0);
- }
-
- Seq(const char* s) {
- init(s);
- }
-
- explicit Seq(T * data_array) {
- memcpy(data_.data(), data_array, TotalBytes);
- }
- explicit Seq(unsigned, const T * data_array) {
- memcpy(data_.data(), data_array, TotalBytes);
- }
-
-
- /**
- * Ultimate constructor from ACGT0123-string.
- *
- * @param s Any object with operator[] that returns ACGT or 0123 chars
- * @param offset Offset at which this sequence starts within s
- * @param number_to_read Number of nucleotides to fetch from s
- * @param raw If true, skip the check that offset + number_to_read fits into s
- * @warning s is assumed to be a correct string filled with ACGT _OR_ 0123;
- * there is no separate init method, the filling happens right here
- */
- template<typename S>
- explicit Seq(const S &s, size_t offset = 0, size_t number_to_read = size_,
- bool raw = false) {
- if (this->size(s) == 0) {
- return;
- }
- VERIFY(offset < this->size(s));
- VERIFY(is_dignucl(s[offset]) || is_nucl(s[offset]));
- if (!raw)
- VERIFY(offset + number_to_read <= this->size(s));
-
- // which symbols does our string contain : 0123 or ACGT?
- bool digit_str = is_dignucl(s[offset]);
-
- // data -- one temporary variable corresponding to the i-th array element
- // and some counters
- T data = 0;
- size_t cnt = 0;
- size_t cur = 0;
-
- for (size_t i = 0; i < number_to_read; ++i) {
- //VERIFY(is_dignucl(s[i]) || is_nucl(s[i]));
-
- // we fill everything with zeros (As) by default.
- char c = digit_str ? s[offset + i] : (char)dignucl(s[offset + i]);
-
- data = data | (T(c) << cnt);
- cnt += 2;
-
- if (cnt == TBits) {
- this->data_[cur++] = data;
- cnt = 0;
- data = 0;
- }
- }
-
- if (cnt != 0) {
- this->data_[cur++] = data;
- }
-
- for (; cur != DataSize; ++cur)
- this->data_[cur] = 0;
- }
-
-
- /**
- * Get i-th symbol of Seq.
- *
- * @param i Index of the symbol (0 <= i < size_)
- * @return 0123-char on position i
- */
- char operator[](const size_t i) const {
- return (data_[i >> TNuclBits] >> ((i & (TNucl - 1)) << 1)) & 3;
- }
-
- /**
- * Reverse complement.
- *
- * @return Reverse complement Seq.
- */
- Seq<size_, T> operator!() const {
- Seq<size_, T> res(*this);
- for (size_t i = 0; i < (size_ >> 1); ++i) {
- T front = complement(res[i]);
- T end = complement(res[size_ - 1 - i]);
- res.set(i, (char)end);
- res.set(size_ - 1 - i, (char)front);
- }
- if ((size_ & 1) == 1) {
- res.set(size_ >> 1, complement(res[size_ >> 1]));
- }
- // can be made without complement calls, but with xor on all bytes afterwards.
- return res;
- }
-
- /**
- * Shift left
- *
- * @param c New 0123 char which should be added to the right.
- * @return Shifted (to the left) sequence with 'c' char on the right.
- */
- Seq<size_, T> operator<<(char c) const {
- if (is_nucl(c)) {
- c = dignucl(c);
- }
- Seq<size_, T> res(*this);
- std::array<T, DataSize>& data = res.data_;
- if (DataSize != 0) { // unless empty sequence
- T rm = data[DataSize - 1] & 3;
- T lastnuclshift_ = ((size_ + TNucl - 1) & (TNucl - 1)) << 1;
- data[DataSize - 1] = (data[DataSize - 1] >> 2) | ((T) c << lastnuclshift_);
-
- if (DataSize >= 2) { // if we have at least 2 elements in data
- int data_size = DataSize;
- for (int i = data_size - 2; i >= 0; --i){
- T new_rm = data[i] & 3;
- data[i] = (data[i] >> 2) | (rm << (TBits - 2)); // T must be unsigned: a right shift of a negative value would fill with ones
- rm = new_rm;
- }
- }
- }
- return res;
- }
-
- Seq<size_ + 1, T> pushBack(char c) const {
- if (is_nucl(c)) {
- c = dignucl(c);
- }
- //VERIFY(is_dignucl(c));
- Seq<size_ + 1, T> s;
- copy(this->data_.begin(), this->data_.end(), s.data_.begin());
- s.data_[s.DataSize - 1] = s.data_[s.DataSize - 1] | ((T) c << ((size_ & (TNucl - 1)) << 1));
-
- return s; //was: Seq<size_ + 1, T>(str() + nucl(c));
-
- }
-
- // /**
- // * @todo optimize!!!
- // */
- // Seq<size_ + 1, T> pushFront(char c) const {
- // if (is_nucl(c)) {
- // c = dignucl(c);
- // }
- // VERIFY(is_dignucl(c));
- // return Seq<size_ + 1, T> (nucl(c) + str());
- // }
-
- Seq<size_ + 1, T> pushFront(char c) const {
- if (is_nucl(c)) {
- c = dignucl(c);
- }
- VERIFY(is_dignucl(c));
- Seq<size_ + 1, T> res;
-
- //if new kmer has more Ts
- if (Seq<size_ + 1, T>::DataSize > DataSize) {
- res.data_[DataSize] = (data_[DataSize - 1] >> (TBits - 2)) & 3;
- }
-
- T rm = c;
- for (size_t i = 0; i < DataSize; ++i) {
- T new_rm = (data_[i] >> (TBits - 2)) & 3;
- res.data_[i] = (data_[i] << 2) | rm;
- rm = new_rm;
- }
-
- return res;
- }
-
- /**
- * Shift right
- *
- * @param c New 0123 char which should be added to the left.
- * @return Shifted (to the right) sequence with 'c' char on the left.
- */
- Seq<size_, T> operator>>(char c) const {
- if (is_nucl(c)) {
- c = dignucl(c);
- }
- VERIFY(is_dignucl(c));
- Seq<size_, T> res(*this);
- T rm = c;
- for (size_t i = 0; i < DataSize; ++i) {
- T new_rm = (res.data_[i] >> (TBits - 2)) & 3;
- res.data_[i] = (res.data_[i] << 2) | rm;
- rm = new_rm;
- }
- if ((size_ & (TNucl - 1)) != 0) {
- T lastnuclshift_ = (size_ & (TNucl - 1)) << 1;
- res.data_[DataSize - 1] = res.data_[DataSize - 1] & (((T) 1
- << lastnuclshift_) - 1);
- }
- return res;
- }
-
- /**
- * Sets i-th symbol of Seq with 0123-char
- */
- inline void set(const size_t i, char c) {
- data_[i >> TNuclBits] = (data_[i >> TNuclBits] & ~((T) 3 << ((i & (TNucl - 1)) << 1))) | ((T) c << ((i & (TNucl - 1)) << 1));
- }
-
- bool operator==(const Seq<size_, T>& s) const {
- for (size_t i = 0; i < DataSize; ++i)
- if (data_[i] != s.data_[i])
- return false;
- return true;
- }
-
- /**
- * @see operator ==()
- */
-
- bool operator!=(const Seq<size_, T>& s) const {
- return !operator==(s);
- }
-
- /**
- * String representation of this Seq
- *
- * @return ACGT-string of length size_
- * @see nucl()
- */
- std::string str() const {
- std::string res(size_, '-');
- for (size_t i = 0; i != size_; ++i) {
- res[i] = nucl(operator[](i));
- }
- return res;
- }
-
- static size_t size() {
- return size_;
- }
-
-
- void copy_data(void * dst) const {
- memcpy(dst, (const void *) data_.data(), TotalBytes);
- }
-
- /**
- * Reads sequence from the file (in the same format as BinWrite writes it)
- * and returns false if an error occurred, true otherwise.
- */
- static bool BinRead(std::istream& file, Seq<size_> *seq) {
- file.read((char *) seq->data_.data(), sizeof(T) * DataSize);
- return !file.fail();
- }
-
- /**
- * Writes sequence to the file (in the same format as BinRead reads it)
- * and returns false if an error occurred, true otherwise.
- */
- static bool BinWrite(std::ostream& file, const Seq<size_> &seq) {
- file.write((const char *) seq.data_.data(), sizeof(T) * DataSize);
- return !file.fail();
- }
-
- /**
- * Reads sequence from the file (in the same format as BinWrite writes it)
- * and returns false if an error occurred, true otherwise.
- */
- bool BinRead(std::istream& file) {
- return BinRead(file, this);
- }
-
- /**
- * Writes sequence to the file (in the same format as BinRead reads it)
- * and returns false if an error occurred, true otherwise.
- */
- bool BinWrite(std::ostream& file) const {
- return BinWrite(file, *this);
- }
-
- /**
- * @see Seq
- */
- template<size_t size2_, typename T2 = T>
- Seq<size2_, T2> start() const {
- VERIFY(size2_ <= size_);
- return Seq<size2_, T2> (*this);
- }
-
- template<size_t size2_/* = size_ - 1*/, typename T2 = T>
- Seq<size2_, T2> end() const {
- VERIFY(size2_ <= size_);
- return Seq<size2_, T2> (*this, size_ - size2_);
- }
-
- const T *data() const {
- return data_.data();
- }
-
- size_t data_size() const {
- return DataSize;
- }
-
-
- char last() const {
- return operator[](size_ - 1);
- }
-
- char first() const {
- return operator[](0);
- }
-
- static size_t GetHash(const DataType *data, size_t sz = DataSize, uint32_t seed = 0) {
- return CityHash64WithSeed((const char*)data, sz * sizeof(DataType), 0x9E3779B9 ^ seed);
- }
-
- size_t GetHash(uint32_t seed = 0) const {
- return GetHash(data_.data(), DataSize, seed);
- }
-
- struct hash {
- size_t operator()(const Seq<size_, T>& seq, uint32_t seed = 0) const {
- return seq.GetHash(seed);
- }
-
- size_t operator()(const DataType *data, size_t sz = DataSize, uint32_t seed = 0) {
- return GetHash(data, sz, seed);
- }
- };
-
- struct equal_to {
- bool operator()(const Seq<size_, T>& l, const Seq<size_, T>& r) const {
- return r == l;
- }
- };
-
- struct less2 {
- bool operator()(const Seq<size_, T> &l, const Seq<size_, T> &r) const {
- for (size_t i = 0; i < size_; ++i) {
- if (l[i] != r[i]) {
- return (l[i] < r[i]);
- }
- }
- return false;
- }
- };
-
- /**
- * Denotes some (weird) order on k-mers. Works fast.
- */
- struct less2_fast {
- bool operator()(const Seq<size_, T> &l, const Seq<size_, T> &r) const {
- return 0 > memcmp(l.data_.data(), r.data_.data(), sizeof(T) * DataSize);
- }
- };
-};
-
-template<size_t size_, typename T>
-std::ostream& operator<<(std::ostream& os, Seq<size_, T> seq) {
- os << seq.str();
- return os;
-}
-
-//namespace std {
-//
-//template<size_t size_, typename T = seq_element_type>
-//struct hash<Seq<size_, T> {
-// typedef size_t result_type;
-// typedef Seq<size_, T> argument_type;
-//
-// result_type operator() (const argument_type& arg) {
-// return Seq<size_, T>::hash()(arg);
-// }
-//};
-//
-//}
-
-#endif /* SEQ_HPP_ */
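
For reference, a minimal usage sketch of the removed Seq<> API (compile-time k-mer length, 2 bits per nucleotide). It is not code from the tree and assumes the header is still reachable under its old include path and that its own includes (city/city.h, nucl.hpp, seq_common.hpp) resolve.

    #include <iostream>
    #include "sequence/seq.hpp"

    int main() {
        Seq<5> kmer("ACGTA");                    // compile-time length, 2 bits per nucleotide
        std::cout << kmer << "\n";               // ACGTA
        std::cout << !kmer << "\n";              // reverse complement: TACGT
        std::cout << (kmer << 'G') << "\n";      // shift left, 'G' appended on the right: CGTAG
        std::cout << kmer.pushBack('T') << "\n"; // Seq<6>: ACGTAT
        return 0;
    }
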
diff --git a/src/include/sequence/sequence.hpp b/src/include/sequence/sequence.hpp
deleted file mode 100755
index 9bdef7a..0000000
--- a/src/include/sequence/sequence.hpp
+++ /dev/null
@@ -1,532 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#ifndef SEQUENCE_HPP_
-#define SEQUENCE_HPP_
-
-#include <vector>
-#include <string>
-#include <memory>
-#include <cstring>
-
-#include "sequence/seq.hpp"
-#include "sequence/rtseq.hpp"
-
-class Sequence {
- // Type to store Seq in Sequences
- typedef seq_element_type ST;
- // Number of bits in ST
- const static size_t STBits = sizeof(ST) << 3;
- // Number of nucleotides in ST
- const static size_t STN = (STBits >> 1);
- // Number of bits in STN (for faster div and mod)
- const static size_t STNBits = log_<STN, 2>::value;
-
- template<typename T>
- struct array_deleter {
- void operator()(const T* p) { delete[] p; }
- };
-
- private:
- size_t from_;
- size_t size_;
- bool rtl_; // If true, the sequence is read right to left and complemented (reverse-complement view)
- std::shared_ptr<ST> data_;
-
- static size_t DataSize(size_t size) {
- return (size + STN - 1) >> STNBits;
- }
-
- template<typename S>
- void InitFromNucls(const S &s, bool rc = false) {
- size_t bytes_size = DataSize(size_);
- ST * bytes = data_.get();
-
- VERIFY(is_dignucl(s[0]) || is_nucl(s[0]));
-
- // Which symbols does our string contain : 0123 or ACGT?
- bool digit_str = is_dignucl(s[0]);
-
- // data -- one temporary variable corresponding to the i-th array element
- // and some counters
- ST data = 0;
- size_t cnt = 0;
- size_t cur = 0;
-
- if (rc) {
- for (int i = (int) size_ - 1; i >= 0; --i) {
- //VERIFY(is_dignucl(s[i]) || is_nucl(s[i]));
- char c = complement(digit_str ? s[(unsigned)i] : dignucl(s[(unsigned)i]));
-
- data = data | (ST(c) << cnt);
- cnt += 2;
-
- if (cnt == STBits) {
- bytes[cur++] = data;
- cnt = 0;
- data = 0;
- }
- }
- } else {
- for (size_t i = 0; i < size_; ++i) {
- //VERIFY(is_dignucl(s[i]) || is_nucl(s[i]));
- char c = digit_str ? s[i] : dignucl(s[i]);
-
- data = data | (ST(c) << cnt);
- cnt += 2;
-
- if (cnt == STBits) {
- bytes[cur++] = data;
- cnt = 0;
- data = 0;
- }
- }
- }
-
- if (cnt != 0)
- bytes[cur++] = data;
-
- for (; cur < bytes_size; ++cur)
- bytes[cur] = 0;
- }
-
-
- public:
- /**
- * Sequence initialization (arbitrary size string)
- *
- * @param s ACGT or 0123-string
- */
- explicit Sequence(const char* s, bool rc = false) :
- from_(0), size_(strlen(s)), rtl_(false), data_(new ST[DataSize(size_)], array_deleter<ST>()) {
- InitFromNucls(s, rc);
- }
-
- explicit Sequence(char* s, bool rc = false) :
- from_(0), size_(strlen(s)), rtl_(false), data_(new ST[DataSize(size_)], array_deleter<ST>()) {
- InitFromNucls(s, rc);
- }
-
- template<typename S>
- explicit Sequence(const S &s, bool rc = false):
- from_(0), size_(s.size()), rtl_(false), data_(new ST[DataSize(size_)], array_deleter<ST>()) {
- InitFromNucls(s, rc);
- }
-
- Sequence():
- from_(0), size_(0), rtl_(false), data_(new ST[DataSize(size_)], array_deleter<ST>()) {
- memset(data_.get(), 0, DataSize(size_));
- }
-
- template<size_t size2_>
- explicit Sequence(const Seq<size2_> &kmer, size_t):
- from_(0), size_(kmer.size()), rtl_(false), data_(new ST[DataSize(size_)], array_deleter<ST>()) {
-
- kmer.copy_data(data_.get());
- }
-
- template<size_t size2_>
- explicit Sequence(const RuntimeSeq<size2_> &kmer, size_t) :
- from_(0), size_(kmer.size()), rtl_(false), data_(new ST[DataSize(size_)], array_deleter<ST>()) {
-
- kmer.copy_data(data_.get());
- }
-
- Sequence(const Sequence &seq, size_t from, size_t size, bool rtl) :
- from_(from), size_(size), rtl_(rtl), data_(seq.data_) {
- }
-
- Sequence(const Sequence &s) :
- from_(s.from_), size_(s.size_), rtl_(s.rtl_), data_(s.data_) {
- }
-
- ~Sequence() {}
-
- const Sequence& operator=(const Sequence &rhs) {
- if (&rhs != this) {
- from_ = rhs.from_;
- size_ = rhs.size_;
- rtl_ = rhs.rtl_;
- data_ = rhs.data_;
- }
-
- return *this;
- }
-
- char operator[](const size_t index) const {
- //todo can be put back after switching to distributing release without asserts
- //VERIFY(index < size_);
- const ST *bytes = data_.get();
- if (rtl_) {
- size_t i = from_ + size_ - 1 - index;
- return complement((bytes[i >> STNBits] >> ((i & (STN - 1)) << 1)) & 3);
- } else {
- size_t i = from_ + index;
- return (bytes[i >> STNBits] >> ((i & (STN - 1)) << 1)) & 3;
- }
- }
-
- bool operator==(const Sequence &that) const {
- if (size_ != that.size_) {
- return false;
- }
-
- if (data_ == that.data_ && from_ == that.from_ && rtl_ == that.rtl_) {
- return true;
- }
-
- for (size_t i = 0; i < size_; ++i) {
- if (this->operator[](i) != that[i]) {
- return false;
- }
- }
- return true;
- }
-
- bool operator!=(const Sequence &that) const {
- return !(operator==(that));
- }
-
- /**
- * @todo Might be optimized via int comparison (not so easy)
- */
- bool operator<(const Sequence &that) const {
- size_t s = std::min(size_, that.size_);
- for (size_t i = 0; i < s; ++i) {
- if (this->operator[](i) != that[i]) {
- return (this->operator[](i) < that[i]);
- }
- }
- return (size_ < that.size_);
- }
-
- Sequence operator!() const {
- return Sequence(*this, from_, size_, !rtl_);
- }
-
- inline Sequence operator<<(char c) const;
- /**
- * @param from inclusive
- * @param to exclusive;
- */
- inline Sequence Subseq(size_t from, size_t to) const;
- inline Sequence Subseq(size_t from) const; // up to size_ by default
- inline Sequence First(size_t count) const;
- inline Sequence Last(size_t count) const;
- inline Sequence operator+(const Sequence &s) const;
-
- /////todo what are these methods???
- inline size_t find(const Sequence &t, size_t from = 0) const;
- inline size_t similar(const Sequence &t, size_t k, char directed = 0) const;
- inline size_t leftSimilar(const Sequence &t, size_t k) const;
- inline size_t rightSimilar(const Sequence &t, size_t k) const;
-
- /**
- * @param t sequence to compare with, position by position
- * @return true if the two sequences agree at some position
- */
- inline bool intersects(const Sequence &t) const;
-
- template<size_t size2_>
- Seq<size2_> start() const;
-
- template<size_t size2_>
- Seq<size2_> fast_start() const;
-
- template<size_t size2_>
- Seq<size2_> end() const;
-
- template<class Seq>
- Seq start(size_t k) const;
-
- template<class Seq>
- Seq end(size_t k) const;
-
- inline std::string str() const;
- inline std::string err() const;
-
- size_t size() const {
- return size_;
- }
-
- private:
- inline bool ReadHeader(std::istream& file);
- inline bool WriteHeader(std::ostream& file) const;
-
- public:
- inline bool BinRead(std::istream& file);
- inline bool BinWrite(std::ostream& file) const;
-};
-
-inline std::ostream& operator<<(std::ostream& os, const Sequence& s);
-
-/**
- * Returns the first size2_ nucleotides of the Sequence as a Seq<size2_>.
- */
-template<size_t size2_>
-Seq<size2_> Sequence::start() const {
- //VERIFY(size2_ <= size_);
- return Seq<size2_> (*this);
-}
-
-template<size_t size2_>
-Seq<size2_> Sequence::fast_start() const {
- ST result[(size2_ + STN - 1) >> STNBits] = {0};
-
- size_t start = from_ >> STNBits;
- size_t end = (from_ + size_ - 1) >> STNBits;
- size_t shift = (from_ & (STN - 1)) << 1;
- const ST *bytes = data_.get();
-
- for (size_t i = start; i <= end; ++i) {
- result[i - start] = bytes[i] >> shift;
- }
-
- if (shift != 0) {
- shift = STBits - shift;
-
- for (size_t i = start + 1; i <= end; ++i) {
- result[i - start - 1] |= bytes[i] << shift;
- }
- }
-
- return (rtl_ ? !Seq<size2_>(result) : Seq<size2_>(result));
-}
-
-template<size_t size2_>
-Seq<size2_> Sequence::end() const {
- return Seq<size2_>(*this, size_ - size2_);
-}
-
-
-template<class Seq>
-Seq Sequence::start(size_t k) const {
- return Seq(unsigned(k), *this);
-}
-
-template<class Seq>
-Seq Sequence::end(size_t k) const {
- return Seq(unsigned(k), *this, size_ - k);
-}
-
-
-Sequence Sequence::First(size_t count) const {
- return Subseq(0, count);
-}
-
-Sequence Sequence::Last(size_t count) const {
- return Subseq(size_ - count);
-}
-
-bool Sequence::intersects(const Sequence &t) const {
- for (size_t i = 0; i < std::min(size_, t.size_); ++i) {
- if (this->operator[](i) == t[i]) {
- return true;
- }
- }
- return false;
-}
-
-// O(1)
-//including from, excluding to
-//bounds are checked via VERIFY unless NDEBUG is defined
-Sequence Sequence::Subseq(size_t from, size_t to) const {
- // cerr << endl<<"subseq:" << from <<" " << to << " " << this->str() << endl;
- VERIFY(to >= from);
- VERIFY(to <= size_);
- //VERIFY(to - from <= size_);
- if (rtl_) {
- return Sequence(*this, from_ + size_ - to, to - from, true);
- } else {
- return Sequence(*this, from_ + from, to - from, false);
- }
-}
-
-//including from, excluding to
-Sequence Sequence::Subseq(size_t from) const {
- return Subseq(from, size_);
-}
-
-/**
- * @todo : must be KMP or hashing instead of this
- */
-size_t Sequence::find(const Sequence &t, size_t from) const {
- for (size_t i = from; i <= size() - t.size(); i++) {
- if (Subseq(i, i + t.size()) == t) {
- return i;
- }
- }
- return -1ULL;
-}
-
-/**
- * @param t sequence to compare with
- * @param k minimal required overlap between the sequences
- * @param directed 1: only check whether t extends this to the right; -1: only check whether t extends this to the left; 0: check both directions
- * @return non-zero if the sequences overlap by at least k nucleotides in an allowed direction, 0 otherwise
- */
-size_t Sequence::similar(const Sequence &t, size_t k, char directed) const {
- size_t result = 0;
- if (directed != -1)
- result |= rightSimilar(t, k);
- if (directed != 1)
- result |= leftSimilar(t, k);
- return result;
-}
-
-size_t Sequence::leftSimilar(const Sequence &t, size_t k) const {
- return t.rightSimilar(*this, k);
-}
-
-size_t Sequence::rightSimilar(const Sequence &t, size_t k) const {
- size_t tsz = t.size();
- size_t sz = size();
- Sequence d(t.Subseq(0, k));
- for (size_t res = find(d, 0); res != -1ULL; res = find(d, res + 1)) {
- if (res + tsz < sz)
- continue;
- size_t i;
- for (i = k; i + res < sz; i++) {
- if (t[i] != this->operator[](i + res)) {
- break;
- };
- }
- if (i == sz - res)
- return 1;
- }
- return 0;
-}
-
-/**
- * @todo optimize
- */
-Sequence Sequence::operator+(const Sequence &s) const {
- return Sequence(str() + s.str());
- // TODO might be opposite to correct
- // int total = size_ + s.size_;
- // std::vector<Seq<4> > bytes((total + 3) >> 2);
- // for (size_t i = 0; i < size_; ++i) {
- // bytes[i / 4] = (bytes[i / 4] << operator [](i)); // TODO :-) use <<=
- // }
- // for (size_t i = 0, j = size_; i < s.size_; ++i, ++j) {
- // bytes[j / 4] = (bytes[j / 4]) << s[i];
- // }
- // return Sequence(new Data(bytes), 0, total, false);
-}
-
-std::string Sequence::str() const {
- std::string res(size_, '-');
- for (size_t i = 0; i < size_; ++i) {
- res[i] = nucl(this->operator[](i));
- }
- return res;
-}
-
-std::string Sequence::err() const {
- std::ostringstream oss;
- oss << "{ *data=" << data_ <<
- ", from_=" << from_ <<
- ", size_=" << size_ <<
- ", rtl_=" << int(rtl_) << " }";
- return oss.str();
-}
-
-std::ostream& operator<<(std::ostream& os, const Sequence& s) {
- os << s.str();
- return os;
-}
-
-bool Sequence::ReadHeader(std::istream& file) {
- file.read((char*) &size_, sizeof(size_));
-
- from_ = 0;
- rtl_ = false;
-
- return !file.fail();
-}
-
-bool Sequence::WriteHeader(std::ostream& file) const {
- VERIFY(from_ == 0);
- VERIFY(!rtl_);
-
- file.write((const char *) &size_, sizeof(size_));
-
- return !file.fail();
-}
-
-
-bool Sequence::BinRead(std::istream& file) {
- ReadHeader(file);
-
- data_ = std::shared_ptr<ST>(new ST[DataSize(size_)], array_deleter<ST>());
- file.read((char*)data_.get(), DataSize(size_) * sizeof(ST));
-
- return !file.fail();
-}
-
-
-bool Sequence::BinWrite(std::ostream& file) const {
- if (from_ != 0 || rtl_) {
- Sequence clear(this->str());
- return clear.BinWrite(file);
- }
-
- WriteHeader(file);
-
- file.write((const char*)data_.get(), DataSize(size_) * sizeof(ST));
-
- return !file.fail();
-}
-
-/**
- * @class SequenceBuilder
- * @section DESCRIPTION
- *
- * Helper class for building a Sequence incrementally; provides append(), size(), operator[], str() and BuildSequence().
- */
-
-class SequenceBuilder {
- std::vector<char> buf_;
- public:
- template<typename S>
- SequenceBuilder& append(const S &s) {
- for (size_t i = 0; i < s.size(); ++i) {
- buf_.push_back(s[i]);
- }
- return *this;
- }
-
- SequenceBuilder& append(char c) {
- buf_.push_back(c);
- return *this;
- }
-
- Sequence BuildSequence() {
- return Sequence(buf_);
- }
-
- size_t size() const {
- return buf_.size();
- }
-
- char operator[](const size_t index) const {
- VERIFY(index < buf_.size());
- return buf_[index];
- }
-
- std::string str() const {
- std::string s(buf_.size(), '-');
- for (size_t i = 0; i < s.size(); ++i) {
- s[i] = nucl(buf_[i]);
- }
- return s;
- }
-};
-
-#endif /* SEQUENCE_HPP_ */
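
For reference, a minimal usage sketch of the removed Sequence / SequenceBuilder API (shared 2-bit-packed storage with O(1) reverse-complement and Subseq views). Not code from the tree; it assumes the old include path and the seq.hpp/rtseq.hpp dependencies still resolve.

    #include <iostream>
    #include "sequence/sequence.hpp"

    int main() {
        Sequence s("ACCGTAAC");
        std::cout << s << "\n";                  // ACCGTAAC
        std::cout << !s << "\n";                 // O(1) reverse-complement view: GTTACGGT
        std::cout << s.Subseq(2, 6) << "\n";     // CGTA (shares the underlying buffer)

        SequenceBuilder sb;                      // accumulates 0123 chars, packs them once
        sb.append(s.Subseq(0, 4)).append(s.Subseq(4));
        std::cout << sb.BuildSequence() << "\n"; // ACCGTAAC again
        return 0;
    }
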
diff --git a/src/include/sequence/sequence_tools.hpp b/src/include/sequence/sequence_tools.hpp
deleted file mode 100644
index 7e33bdb..0000000
--- a/src/include/sequence/sequence_tools.hpp
+++ /dev/null
@@ -1,159 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#ifndef SEQUENCE_TOOLS_HPP_
-#define SEQUENCE_TOOLS_HPP_
-
-#include <sstream>
-#include <string>
-#include <vector>
-
-#include "sequence/nucl.hpp"
-#include "sequence/sequence.hpp"
-#include "levenshtein.hpp"
-
-inline const std::string Reverse(const std::string &s) {
- return std::string(s.rbegin(), s.rend());
-}
-
-inline const std::string Complement(const std::string &s) {
- std::string res(s.size(), 0);
- transform(s.begin(), s.end(), res.begin(), nucl_complement);
- return res;
-}
-
-inline const Sequence MergeOverlappingSequences(std::vector<Sequence>& ss,
- size_t overlap, bool safe_merging = true) {
- if (ss.empty()) {
- return Sequence();
- }
- SequenceBuilder sb;
- Sequence prev_end = ss.front().Subseq(0, overlap);
- sb.append(prev_end);
- for (auto it = ss.begin(); it != ss.end(); ++it) {
- if(safe_merging)
- VERIFY(prev_end == it->Subseq(0, overlap));
- sb.append(it->Subseq(overlap));
- prev_end = it->Subseq(it->size() - overlap);
- }
- return sb.BuildSequence();
-}
-
-inline size_t EditDistance(const Sequence& s1, const Sequence& s2) {
- return edit_distance(s1.str(), s2.str());
-}
-
-inline bool Relax(int& val, int new_val) {
- if (new_val > val) {
- val = new_val;
- return true;
- }
- return false;
-}
-
-inline std::pair<size_t, size_t> LocalSimilarity(const Sequence& s1, const Sequence& s2) {
- size_t m = s1.size();
- size_t n = s2.size();
- std::vector<std::vector<int>> a(m + 1);
- for (size_t i = 0; i <= m; ++i) {
- a[i].resize(n + 1);
- }
- for (size_t i = 0; i <= m; ++i) {
- for (size_t j = 0; j <= n; ++j) {
- a[i][j] = 0;
- }
- }
- for (size_t i = 1; i <= m; ++i) {
- for (size_t j = 1; j <= n; ++j) {
- Relax(a[i][j], a[i - 1][j] - 1);
- Relax(a[i][j], a[i][j - 1] - 1);
- if (s1[i - 1] == s2[j - 1]) {
- Relax(a[i][j], a[i - 1][j - 1] + 1);
- } else {
- Relax(a[i][j], a[i - 1][j - 1] - 1);
- }
- }
- }
-
- //finding local alignment
- int answer = 0;
- size_t i_m = 0;
- size_t j_m = 0;
- for (size_t i = 0; i <= m; ++i) {
- for (size_t j = 0; j <= n; ++j) {
- if (Relax(answer, a[i][j])) {
- i_m = i;
- j_m = j;
- }
- }
- }
-
- //finding alignment lengths
- size_t i = i_m;
- size_t j = j_m;
- while (a[i][j] > 0) {
- if (a[i][j] == a[i][j - 1] - 1) {
- j--;
- } else if (a[i][j] == a[i-1][j] - 1) {
- i--;
- } else if (a[i][j] == a[i-1][j-1] + 1) {
- VERIFY(s1[i-1] == s2[j-1]);
- i--;
- j--;
- } else {
- VERIFY(a[i-1][j-1] - 1 == a[i][j] && s1[i-1] != s2[j-1]);
- i--;
- j--;
- }
- }
- return std::make_pair(size_t(answer), std::min(i_m - i, j_m - j));
-}
-
-inline const std::string ReverseComplement(const std::string &s) {
- std::string res(s.size(), 0);
- transform(s.begin(), s.end(), res.rbegin(), nucl_complement); // only difference from Complement() is rbegin() instead of begin()
- return res;
-}
-
-class UniformPositionAligner {
-private:
- size_t upper_length_;
- size_t lower_length_;
-public:
- UniformPositionAligner(size_t upper_length, size_t lower_length) :
- upper_length_(upper_length), lower_length_(lower_length) {
- }
-
- size_t GetPosition(size_t upper_position) {
- if (upper_position * 2 + 1 >= upper_length_)
- return (2 * upper_position + 1) * lower_length_
- / (2 * upper_length_);
- else
- return lower_length_ - 1
- - GetPosition(upper_length_ - 1 - upper_position);
- }
-};
-
-class EnsureEndsPositionAligner {
-private:
- size_t upper_length_;
- size_t lower_length_;
-public:
- EnsureEndsPositionAligner(size_t upper_length, size_t lower_length) :
- upper_length_(upper_length), lower_length_(lower_length) {
- }
-
- size_t GetPosition(size_t upper_position) {
- VERIFY(upper_position > 0);
- if (lower_length_ == 1)
- return 1;
- return (2 * upper_position * lower_length_ + upper_length_)
- / (2 * upper_length_);
- }
-};
-
-#endif /* SEQUENCE_TOOLS_HPP_ */
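
For reference, a minimal usage sketch of the removed helpers above (MergeOverlappingSequences, ReverseComplement, LocalSimilarity). Not code from the tree; the old include paths and the levenshtein.hpp dependency are assumed to resolve as before.

    #include <iostream>
    #include <utility>
    #include <vector>
    #include "sequence/sequence_tools.hpp"

    int main() {
        // Two reads overlapping by 3 nucleotides: ACGTA and GTACC merge into ACGTACC.
        std::vector<Sequence> parts = { Sequence("ACGTA"), Sequence("GTACC") };
        std::cout << MergeOverlappingSequences(parts, 3) << "\n";   // ACGTACC

        std::cout << ReverseComplement("ACGTT") << "\n";            // AACGT

        // Best local alignment score and its length, via the simple DP above.
        std::pair<size_t, size_t> sim = LocalSimilarity(Sequence("ACGTACGT"), Sequence("TTACGTAA"));
        std::cout << sim.first << " " << sim.second << "\n";
        return 0;
    }
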
diff --git a/src/include/sequence/simple_seq.hpp b/src/include/sequence/simple_seq.hpp
deleted file mode 100644
index ecd6b9c..0000000
--- a/src/include/sequence/simple_seq.hpp
+++ /dev/null
@@ -1,154 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-/*
- * simple_seq.hpp
- *
- * Created on: Jul 23, 2012
- * Author: andrey
- */
-
-#ifndef SIMPLE_SEQ_HPP_
-#define SIMPLE_SEQ_HPP_
-
-#include <string>
-#include <array>
-#include <algorithm>
-#include <cstring>
-#include <iostream>
-
-#include "verify.hpp"
-#include "sequence/nucl.hpp"
-#include "log.hpp"
-#include "seq_common.hpp"
-/**
- * @param size_ maximal number of nucleotides
- * @param T type used for storage
- */
-template<size_t size_, typename T = seq_element_type>
-class SimpleSeq {
-public:
- /**
- * @variable Number of bits in type T (e.g. 8 for char)
- * @example 8 for char, 16 for a two-byte T
- */
- const static size_t TBits = sizeof(T) << 3;
-
- /**
- * @variable Number of nucleotides that can be stored in one type T (e.g. 4 for char)
- * TNucl MUST be a power of two
- * @example 4: 8/2 = 4 or 16/2 = 8
- */
- const static size_t TNucl = TBits >> 1;
-
- /**
- * @variable Number of bits in TNucl (e.g. 2 for char). Useful for shifts instead of divisions.
- */
- const static size_t TNuclBits = log_<TNucl, 2>::value;
-
- /**
- * @variable Number of Ts required to store the whole sequence.
- */
- const static size_t DataSize = (size_ + TNucl - 1) >> TNuclBits;
-
- typedef T DataType;
-
- /**
- * @variable Number of meaningful bytes in which the seq is stored
- */
- const static size_t TotalBytes = sizeof(T) * DataSize;
-
-private:
- // number of nucleotides in the last data_ bucket
- const static size_t NuclsRemain = size_ & (TNucl - 1);
-
- // useful mask to fill the last element of the data_ array
- const static size_t MaskForLastBucket = (((T) 1) << (NuclsRemain << 1) ) - 1;
-
-
- /**
- * @variable Inner representation of sequence: array of Ts with length = DataSize.
- *
- * @invariant Invariant: all nucleotides >= size_ are 'A's (useful for comparison)
- */
- std::array<T, DataSize> data_;
-
-
-public:
-
- SimpleSeq() {
- //VERIFY((T)(-1) >= (T)0);//be sure to use unsigned types
- std::fill(data_.begin(), data_.end(), 0);
- }
-
- explicit SimpleSeq(T * data_array) {
- memcpy(data_.data(), data_array, TotalBytes);
- }
-
-
- char operator[](const size_t i) const {
- //VERIFY(i >= 0);
- //VERIFY(i < size_);
- return (data_[i >> TNuclBits] >> ((i & (TNucl - 1)) << 1)) & 3;
- }
-
- std::string str() const {
- std::string res(size_, '-');
- for (size_t i = 0; i < size_; ++i) {
- res[i] = nucl(operator[](i));
- }
- return res;
- }
-
- void copy_data(void * dst) const {
- memcpy(dst, (const void *) data_.data(), TotalBytes);
- }
-
- static size_t GetHash(const DataType *data, size_t sz, uint32_t seed = 0) {
- return CityHash64WithSeed((const char*)data, sz * sizeof(DataType), 0x9E3779B9 ^ seed);
- }
-
- size_t GetHash(uint32_t seed = 0) const {
- return GetHash(data_.data(), DataSize, seed);
- }
-
- struct hash {
- size_t operator()(const SimpleSeq<size_, T>& seq, uint32_t seed = 0) const {
- return seq.GetHash(seed);
- }
-
- size_t operator()(const DataType *data, size_t sz, unsigned seed = 0) {
- return GetHash(data, sz, seed);
- }
- };
-
- struct equal_to {
- bool operator()(const SimpleSeq<size_, T>& l, const SimpleSeq<size_, T>& r) const {
- return memcmp(l.data_.data(), r.data_.data(), sizeof(T) * DataSize) == 0;
- }
- };
-
- struct less2 {
- int operator()(const SimpleSeq<size_, T> &l, const SimpleSeq<size_, T> &r) const {
- for (size_t i = 0; i < size_; ++i) {
- if (l[i] != r[i]) {
- return (l[i] < r[i]);
- }
- }
- return false;
- }
- };
-
-};
-
-template<size_t size_, typename T>
-std::ostream& operator<<(std::ostream& os, SimpleSeq<size_, T> seq) {
- os << seq.str();
- return os;
-}
-
-
-#endif /* SIMPLE_SEQ_HPP_ */
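
For reference, a minimal sketch showing how a SimpleSeq can be filled from a Seq via copy_data(); SimpleSeq itself is just packed storage with str(), copy_data() and hashing. Not code from the tree; old include paths are assumed.

    #include <iostream>
    #include "sequence/seq.hpp"
    #include "sequence/simple_seq.hpp"

    int main() {
        // SimpleSeq has no shift or reverse-complement operators, only raw storage,
        // so fill it from a Seq's packed buffer and print it back.
        Seq<4> kmer("ACGT");
        seq_element_type buf[Seq<4>::DataSize];
        kmer.copy_data(buf);

        SimpleSeq<4> packed(buf);
        std::cout << packed.str() << "\n";   // ACGT
        return 0;
    }
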
diff --git a/src/include/simple_tools.hpp b/src/include/simple_tools.hpp
deleted file mode 100644
index 3f8e859..0000000
--- a/src/include/simple_tools.hpp
+++ /dev/null
@@ -1,184 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-/*
- * simple_tools.hpp
- *
- * Created on: 27.05.2011
- * Author: vyahhi
- */
-
-#ifndef SIMPLE_TOOLS_HPP_
-#define SIMPLE_TOOLS_HPP_
-
-#include <sys/stat.h>
-#include <sys/types.h>
-#include <unistd.h>
-
-#include "verify.hpp"
-#include "io/ireader.hpp"
-#include "path_helper.hpp"
-#include <memory>
-#include <string>
-#include <set>
-#include <vector>
-
-/**
- * Converts anything to string (using ostringstream).
- */
-template <typename T>
-std::string ToString(const T& t) {
- std::ostringstream ss;
- ss << t;
- return ss.str();
-}
-
-template <typename T>
-std::string ToString(const T& t, size_t length) {
- std::ostringstream ss;
- ss << t;
- std::string result = ss.str();
- while(result.size() < length)
- result = "0" + result;
- return result;
-}
-
-template <typename T>
-std::string ToString(std::vector<T>& t) {
- std::ostringstream ss;
- ss << "Size "<<t.size()<<": [";
- for (auto it = t.begin(); it != t.end(); ++it)
- ss<<*it<<", ";
- ss<<"]";
- return ss.str();
-}
-
-template <typename T>
-std::string ToString(std::set<T>& t) {
- std::ostringstream ss;
- ss << "Size "<<t.size()<<": [";
- for (auto it = t.begin(); it != t.end(); ++it)
- ss<<*it<<", ";
- ss<<"]";
- return ss.str();
-}
-
-template<typename T>
-inline const std::pair<T, T> ReversePair(std::pair<T, T> ep) {
- return std::pair<T, T>(ep.second, ep.first);
-}
-
-template <class ContainerT1, class ContainerT2>
-void push_back_all(ContainerT1& target, const ContainerT2& to_insert) {
- target.insert(target.end(), to_insert.begin(), to_insert.end());
-}
-
-template <class ContainerT1, class ContainerT2>
-void insert_all(ContainerT1& target, const ContainerT2& to_insert) {
- target.insert(to_insert.begin(), to_insert.end());
-}
-
-template<class MapT>
-std::set<typename MapT::key_type> key_set(const MapT& m) {
- std::set<typename MapT::key_type> answer;
- for (auto it = m.begin(); it != m.end(); ++it) {
- answer.insert(it->first);
- }
- return answer;
-}
-
-template<class MapT>
-std::set<typename MapT::mapped_type> value_set(const MapT& m) {
- std::set<typename MapT::mapped_type> answer;
- for (auto it = m.begin(); it != m.end(); ++it) {
- answer.insert(it->second);
- }
- return answer;
-}
-
-template <class MapT>
-const typename MapT::mapped_type& get(const MapT& from, const typename MapT::key_type& key) {
- auto it = from.find(key);
- VERIFY(it != from.end());
- return it->second;
-}
-
-template <class MapT>
-typename MapT::mapped_type& get(MapT& from, const typename MapT::key_type& key) {
- auto it = from.find(key);
- VERIFY(it != from.end());
- return it->second;
-}
-
-template <class MMapT>
-const std::vector<typename MMapT::mapped_type> get_all(const MMapT& from, const typename MMapT::key_type& key) {
- std::vector<typename MMapT::mapped_type> answer;
- for (auto it = from.lower_bound(key); it != from.upper_bound(key); ++it) {
- answer.push_back(it->second);
- }
- return answer;
-}
-
-class TmpFolderFixture
-{
- std::string tmp_folder_;
-
-public:
- TmpFolderFixture(std::string tmp_folder = "tmp") :
- tmp_folder_(tmp_folder)
- {
- path::make_dir(tmp_folder_);
- }
-
- ~TmpFolderFixture()
- {
- path::remove_dir(tmp_folder_);
- }
-};
-
-namespace std
-{
-template<class T1, class T2>
-std::ostream& operator<< (std::ostream& os, std::pair<T1, T2> const& pair)
-{
- return os << "(" << pair.first << ", " << pair.second << ")";
-}
-//}
-
-//namespace omnigraph
-//{
-template<class T>
-std::ostream& operator<< (std::ostream& os, const std::vector<T>& v)
-{
- os << "[";
- std::string delim = "";
- for (auto it = v.begin(); it != v.end(); ++it) {
- os << delim << *it;
- delim = ", ";
- }
-// std::copy(v.begin(), v.end(), std::ostream_iterator<T>(os, ", "));
- os << "]";
- return os;
-}
-
-template<class T>
-std::ostream& operator<< (std::ostream& os, const std::set<T>& set)
-{
- os << "{";
- bool delim = false;
- for (const auto& i : set) {
- if (delim) os << ", ";
- os << i;
- delim = true;
- }
- os << "}";
- return os;
-}
-
-}
-
-#endif /* SIMPLE_TOOLS_HPP_ */
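
For reference, a minimal usage sketch of the small helpers above (ToString, get, key_set). Not code from the tree; it assumes simple_tools.hpp and its own includes (verify.hpp, path_helper.hpp, io/ireader.hpp) still resolve, and the map contents are arbitrary examples.

    #include <iostream>
    #include <map>
    #include <string>
    #include "simple_tools.hpp"

    int main() {
        std::cout << ToString(3.14) << "\n";                  // "3.14"
        std::cout << ToString(7, 4) << "\n";                  // zero-padded to width 4: "0007"

        std::map<std::string, size_t> cov = {{"edge1", 10}, {"edge2", 25}};
        std::cout << get(cov, std::string("edge2")) << "\n";  // 25; VERIFYs that the key exists
        std::cout << key_set(cov).size() << "\n";             // 2
        return 0;
    }
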
diff --git a/src/include/smooth.hpp b/src/include/smooth.hpp
deleted file mode 100644
index f65d5ce..0000000
--- a/src/include/smooth.hpp
+++ /dev/null
@@ -1,193 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#ifndef __SMOTH_HPP__
-#define __SMOTH_HPP__
-
-#include <cmath>
-
-namespace math {
-
-template<typename T>
-static T MedianOf3(T u, T v, T w) {
- /* Median(u,v,w): */
- if ((u <= v && v <= w) ||
- (u >= v && v >= w))
- return v;
- if ((u <= w && w <= v) ||
- (u >= w && w >= v))
- return w;
-
- /* else */ return u;
-}
-
-/* Return (Index-1) of median(u,v,w) , i.e.,
- -1 : u
- 0 : v
- 1 : w
-*/
-template<typename T>
-static int IndexOfMedianOf3(T u, T v, T w) {
- if ((u <= v && v <= w) ||
- (u >= v && v >= w)) return 0;
- if ((u <= w && w <= v) ||
- (u >= w && w >= v)) return 1;
-
- /* else */ return -1;
-}
-
-enum {
- SmoothNoEndRule,
- SmoothCopyEndRule,
- SmoothTukeyEndRule
-};
-
-template<typename T>
-static bool SmoothEndStep(const T *x, T *y, size_t n, unsigned end_rule) {
- switch (end_rule) {
- default:
- case SmoothNoEndRule:
- return false;
- case SmoothCopyEndRule:
- y[0] = x[0];
- y[n-1] = x[n-1];
- return false;
- case SmoothTukeyEndRule: {
- bool chg = false;
- y[0] = MedianOf3(3*y[1] - 2*y[2], x[0], y[1]);
- chg = chg || (y[0] != x[0]);
- y[n-1] = MedianOf3(y[n-2], x[n-1], 3*y[n-2] - 2*y[n-3]);
- chg = chg || (y[n-1] != x[n-1]);
- return chg;
- }
- }
-
- return false;
-}
-
-template<typename T>
-static bool Smooth3(const T *x, T *y, size_t n, unsigned end_rule) {
- // y[] := Running Median of three (x) = "3 (x[])" with "copy ends"
- // --- return chg := ( y != x )
- bool chg = false;
-
- for (size_t i = 1; i < n-1; i++) {
- int j = IndexOfMedianOf3(x[i-1], x[i], x[i+1]);
- y[i] = x[(int)i + j];
- chg = chg || j;
- }
-
- chg |= SmoothEndStep(x, y, n, end_rule);
-
- return chg;
-}
-
-template<typename T>
-static size_t Smooth3R(const T *x, T *y, T *z, size_t n, unsigned end_rule) {
- // y[] := "3R"(x) ; 3R = Median of three, repeated until convergence
- size_t iter;
- bool chg;
-
- iter = chg = Smooth3(x, y, n, SmoothCopyEndRule);
-
- while (chg) {
- if ((chg = Smooth3(y, z, n, SmoothNoEndRule))) {
- iter += 1;
- for (size_t i = 1; i < n-1; i++)
- y[i] = z[i];
- }
- }
-
- chg |= SmoothEndStep(x, y, n, end_rule);
-
- return (iter ? iter : chg);
- /* = 0 <==> only one "3" w/o any change
- = 1 <==> either ["3" w/o change + endchange]
- or [two "3"s, 2nd w/o change ] */
-}
-
-
-template<typename T>
-static bool SplitTest(const T *x, size_t i) {
- // Split test:
- // Are we at a /-\ or \_/ location => split should be made ?
-
- if (x[i] != x[i+1])
- return false;
-
- if ((x[i-1] <= x[i] && x[i+1] <= x[i+2]) ||
- (x[i-1] >= x[i] && x[i+1] >= x[i+2]))
- return false;
-
- /* else */ return true;
-}
-
-template<typename T>
-static bool SmoothSplit3(const T *x, T *y, size_t n, bool do_ends) {
- // y[] := S(x[]) where S() = "sm_split3"
- bool chg = false;
-
- for (size_t i = 0; i < n; i++)
- y[i] = x[i];
-
- if (do_ends && SplitTest(x, 1)) {
- chg = true;
- y[1] = x[0];
- y[2] = MedianOf3(x[2], x[3], 3*x[3] - 2*x[4]);
- }
-
- for (size_t i = 2; i < n-3; i++) {
- if (SplitTest(x, i)) {
- int j;
- // plateau at x[i] == x[i+1]
-
- // at left:
- if (-1 < (j = IndexOfMedianOf3(x[i ], x[i-1], 3*x[i-1] - 2*x[i-2]))) {
- y[i] = (j == 0 ? x[i-1] : 3*x[i-1] - 2*x[i-2]);
- chg = (y[i] != x[i]);
- }
-
- // at right:
- if (-1 < (j = IndexOfMedianOf3(x[i+1], x[i+2], 3*x[i+2] - 2*x[i+3]))) {
- y[i+1] = (j == 0 ? x[i+2] : 3*x[i+2] - 2*x[i+3]);
- chg = (y[i+1] != x[i+1]);
- }
- }
- }
-
- if (do_ends && SplitTest(x, n-3)) {
- chg = true;
- y[n-2] = x[n-1];
- y[n-3] = MedianOf3(x[n-3], x[n-4], 3*x[n-4] - 2*x[n-5]);
- }
-
- return chg;
-}
-
-template<typename T>
-size_t Smooth3RS3R(std::vector<T> &y, const std::vector<T> &x,
- unsigned end_rule = SmoothTukeyEndRule, bool split_ends = false) {
- // y[1:n] := "3R S 3R"(x[1:n]); z = "work";
- size_t iter;
- bool chg;
- size_t n = x.size();
-
- y.resize(n);
- std::vector<T> z(n), w(n);
-
- iter = Smooth3R (&x[0], &y[0], &z[0], n, end_rule);
- chg = SmoothSplit3(&y[0], &z[0], n, split_ends);
- if (chg)
- iter += Smooth3R(&z[0], &y[0], &w[0], n, end_rule);
-
- /* else y == z already */
- return (iter + chg);
-}
-
-};
-
-#endif
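
For reference, a minimal usage sketch of the Tukey 3RS3R running-median smoother above, applied to a coverage-like signal with a single-point spike. Not code from the tree; the input values are arbitrary examples.

    #include <cstdio>
    #include <vector>
    #include "smooth.hpp"

    int main() {
        // A one-point spike at 90: the running-median smoother replaces it with a
        // local median while keeping the surrounding trend intact.
        std::vector<double> x = {10, 11, 12, 90, 13, 14, 15, 16};
        std::vector<double> y;
        size_t iterations = math::Smooth3RS3R(y, x);

        for (double v : y) std::printf("%.0f ", v);
        std::printf("\n(%zu smoothing iterations)\n", iterations);
        return 0;
    }
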
diff --git a/src/include/ssw/ssw_cpp.h b/src/include/ssw/ssw_cpp.h
deleted file mode 100644
index cdcf717..0000000
--- a/src/include/ssw/ssw_cpp.h
+++ /dev/null
@@ -1,219 +0,0 @@
-#ifndef COMPLETE_STRIPED_SMITH_WATERMAN_CPP_H_
-#define COMPLETE_STRIPED_SMITH_WATERMAN_CPP_H_
-
-#include <stdint.h>
-#include <string>
-#include <vector>
-
-namespace StripedSmithWaterman {
-
-struct Alignment {
- uint16_t sw_score; // The best alignment score
- uint16_t sw_score_next_best; // The next best alignment score
- int32_t ref_begin; // Reference begin position of the best alignment
- int32_t ref_end; // Reference end position of the best alignment
- int32_t query_begin; // Query begin position of the best alignment
- int32_t query_end; // Query end position of the best alignment
- int32_t ref_end_next_best; // Reference end position of the next best alignment
- int32_t mismatches; // Number of mismatches of the alignment
- std::string cigar_string; // Cigar string of the best alignment
- std::vector<uint32_t> cigar; // Cigar stored in the BAM format
- // high 28 bits: length
- // low 4 bits: M/I/D/S/X (0/1/2/4/8);
- void Clear() {
- sw_score = 0;
- sw_score_next_best = 0;
- ref_begin = 0;
- ref_end = 0;
- query_begin = 0;
- query_end = 0;
- ref_end_next_best = 0;
- mismatches = 0;
- cigar_string.clear();
- cigar.clear();
- };
-};
-
-struct Filter {
- // NOTE: Regardless of the filter, these five fields of Alignment are always filled in:
- // sw_score; sw_score_next_best; ref_end; query_end; ref_end_next_best.
- // NOTE: If you only need the alignment scores, set 'report_begin_position'
- // and 'report_cigar' to false.
-
- bool report_begin_position; // Give ref_begin and query_begin.
- // If it is not set, ref_begin and query_begin are -1.
- bool report_cigar; // Give cigar_string and cigar.
- // report_begin_position is automatically TRUE.
-
- // When *report_cigar* is true and alignment passes these two filters,
- // cigar_string and cigar will be given.
- uint16_t score_filter; // score >= score_filter
- uint16_t distance_filter; // ((ref_end - ref_begin) < distance_filter) &&
- // ((query_end - read_begin) < distance_filter)
-
- Filter()
- : report_begin_position(true)
- , report_cigar(true)
- , score_filter(0)
- , distance_filter(32767)
- {};
-
- Filter(const bool& pos, const bool& cigar, const uint16_t& score, const uint16_t& dis)
- : report_begin_position(pos)
- , report_cigar(cigar)
- , score_filter(score)
- , distance_filter(dis)
- {};
-};
-
-class Aligner {
- public:
- // =========
- // @function Construct an Aligner on default values.
- // The function will build the {A,C,G,T,N} aligner.
- // If you need an aligner for a different alphabet, please
- // use the other constructor and pass the corresponding matrix in.
- // =========
- Aligner(void);
-
- // =========
- // @function Construct an Aligner by assigning scores.
- // The function will build the {A,C,G,T,N} aligner.
- // If you need an aligner for a different alphabet, please
- // use the other constructor and pass the corresponding matrix in.
- // =========
- Aligner(const uint8_t& match_score,
- const uint8_t& mismatch_penalty,
- const uint8_t& gap_opening_penalty,
- const uint8_t& gap_extending_penalty);
-
- // =========
- // @function Construct an Aligner from the specified matrices.
- // =========
- Aligner(const int8_t* score_matrix,
- const int& score_matrix_size,
- const int8_t* translation_matrix,
- const int& translation_matrix_size);
-
- ~Aligner(void);
-
- // =========
- // @function Build the reference sequence so that
- // Align(const char* query, s_align* alignment) can be used;
- // otherwise the reference has to be given on every Align call.
- // [NOTICE] If a reference sequence already exists, it will be deleted
- // and replaced.
- // @param seq The reference bases;
- // [NOTICE] it does not need to be null terminated.
- // @param length The number of bases to build.
- // @return The length of the built reference.
- // =========
- int SetReferenceSequence(const char* seq, const int& length);
-
- void CleanReferenceSequence(void);
-
- // =========
- // @function Set penalties for opening and extending gaps
- // [NOTICE] The defaults are 3 and 1 respectively.
- // =========
- void SetGapPenalty(const uint8_t& opening, const uint8_t& extending) {
- gap_opening_penalty_ = opening;
- gap_extending_penalty_ = extending;
- };
-
- // =========
- // @function Align the query against the reference that is set by
- // SetReferenceSequence.
- // @param query The query sequence.
- // @param filter The filter for the alignment.
- // @param alignment The container contains the result.
- // @return True: succeed; false: fail.
- // =========
- bool Align(const char* query, const Filter& filter, Alignment* alignment) const;
-
- // =========
- // @function Align the query against the reference.
- // [NOTICE] The reference won't replace the reference
- // set by SetReferenceSequence.
- // @param query The query sequence.
- // @param ref The reference sequence.
- // [NOTICE] It does not need to be null terminated.
- // @param ref_len The length of the reference sequence.
- // @param filter The filter for the alignment.
- // @param alignment The container contains the result.
- // @return True: succeed; false: fail.
- // =========
- bool Align(const char* query, const char* ref, const int& ref_len,
- const Filter& filter, Alignment* alignment) const;
-
- // @function Clear up all containers and thus the aligner is disabled.
- // To rebuild the aligner, please use the ReBuild functions.
- void Clear(void);
-
- // =========
- // @function Rebuild the aligner with default values.
- // [NOTICE] If the aligner is not cleaned, rebuilding will fail.
- // @return True: succeed; false: fail.
- // =========
- bool ReBuild(void);
-
- // =========
- // @function Rebuild the aligner by assigning scores.
- // [NOTICE] If the aligner is not cleaned, rebuilding will fail.
- // @return True: succeed; false: fail.
- // =========
- bool ReBuild(
- const uint8_t& match_score,
- const uint8_t& mismatch_penalty,
- const uint8_t& gap_opening_penalty,
- const uint8_t& gap_extending_penalty);
-
- // =========
- // @function Rebuild the aligner from the specified matrices.
- // [NOTICE] If the aligner is not cleaned, rebuilding will fail.
- // @return True: succeed; false: fail.
- // =========
- bool ReBuild(
- const int8_t* score_matrix,
- const int& score_matrix_size,
- const int8_t* translation_matrix,
- const int& translation_matrix_size);
-
- private:
- int8_t* score_matrix_;
- int score_matrix_size_;
- int8_t* translation_matrix_;
-
- uint8_t match_score_; // default: 2
- uint8_t mismatch_penalty_; // default: 2
- uint8_t gap_opening_penalty_; // default: 3
- uint8_t gap_extending_penalty_; // default: 1
-
- int8_t* translated_reference_;
- int32_t reference_length_;
-
- int TranslateBase(const char* bases, const int& length, int8_t* translated) const;
- void SetAllDefault(void);
- void BuildDefaultMatrix(void);
- void ClearMatrices(void);
-
- Aligner& operator= (const Aligner&);
- Aligner (const Aligner&);
-}; // class Aligner
-
-
-// ================
-// inline functions
-// ================
-inline void Aligner::CleanReferenceSequence(void) {
- if (reference_length_ == 0) return;
-
- // delete the current buffer
- if (reference_length_ > 1) delete [] translated_reference_;
- else delete translated_reference_;
-
- reference_length_ = 0;
-}
-} // namespace StripedSmithWaterman
-
-#endif // COMPLETE_STRIPED_SMITH_WATERMAN_CPP_H_
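
For reference, a minimal usage sketch of the StripedSmithWaterman::Aligner API declared above. Not code from the tree; it assumes the accompanying SSW implementation sources elsewhere in the tree are compiled and linked, and the reference/query strings are arbitrary examples.

    #include <cstring>
    #include <iostream>
    #include "ssw/ssw_cpp.h"

    int main() {
        StripedSmithWaterman::Aligner aligner;      // default {A,C,G,T,N} scoring
        StripedSmithWaterman::Filter filter;        // report begin positions and CIGAR
        StripedSmithWaterman::Alignment alignment;

        const char* ref   = "CAGCCTTTCTGACCCGGAAATCAAAATAGGCACAACAAA";
        const char* query = "CTGAGCCGGTAAATC";

        if (aligner.Align(query, ref, (int)std::strlen(ref), filter, &alignment)) {
            std::cout << "score=" << alignment.sw_score
                      << " ref=[" << alignment.ref_begin << "," << alignment.ref_end << "]"
                      << " cigar=" << alignment.cigar_string << "\n";
        }
        return 0;
    }
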
diff --git a/src/include/standard_base.hpp b/src/include/standard_base.hpp
deleted file mode 100644
index 7c0e012..0000000
--- a/src/include/standard_base.hpp
+++ /dev/null
@@ -1,142 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-/*
- * standard_base.hpp
- *
- * Created on: 1 Sep 2011
- * Author: valery
- */
-
-#pragma once
-
-//==crt and stl
-#include <memory>
-#include <cstdlib>
-#include <cstdio>
-#include <time.h>
-#include <signal.h>
-#include <execinfo.h>
-
-#include <iostream>
-#include <iterator>
-#include <algorithm>
-#include <map>
-#include <vector>
-#include <set>
-#include <string>
-#include <sstream>
-#include <utility>
-#include <array>
-#include <unordered_map>
-#include <unordered_set>
-#include <deque>
-#include <cmath>
-#include <limits>
-
-using std::cin;
-using std::cout;
-using std::cerr;
-using std::endl;
-using std::map;
-using std::multimap;
-using std::unordered_map;
-using std::unordered_set;
-using std::vector;
-using std::array;
-using std::set;
-using std::string;
-using std::pair;
-using std::make_pair;
-using std::ifstream;
-using std::istream;
-using std::ofstream;
-using std::ostream;
-using std::min;
-using std::max;
-using std::abs;
-using std::stringstream;
-using std::numeric_limits;
-using std::ostream_iterator;
-using std::copy;
-
-using std::shared_ptr;
-using std::make_shared;
-
-//==boost
-
-#ifndef NDEBUG
-#define BOOST_ENABLE_ASSERT_HANDLER
-#endif
-
-#include <boost/optional.hpp>
-
-#include <boost/lexical_cast.hpp>
-#include <boost/noncopyable.hpp>
-
-using boost::optional;
-using boost::make_optional;
-using boost::none;
-
-using boost::lexical_cast;
-using boost::noncopyable;
-
-// err handling
-#include "stacktrace.hpp"
-
-// path manipulation instead of boost filesystem
-#include "path_helper.hpp"
-using path::make_dir;
-using path::remove_dir;
-
-#ifndef NDEBUG
-namespace boost {
-inline void assertion_failed(char const * expr, char const * function,
- char const * file, long line) {
- std::cerr << "Aborted by assert: " << std::endl;
- print_stacktrace();
-#if __DARWIN_UNIX03
- __assert_rtn (expr, file, (int)line, function);
-#elif __DARWIN
- __assert (expr, file, (int)line, function);
-#else
- __assert_fail (expr, file, (unsigned)line, function);
-#endif
-}
-
-inline void assertion_failed_msg(char const * expr, char const * msg,
- char const * function, char const * file,
- long line) {
- std::cerr << "Aborted by assert: " << msg << std::endl;
- print_stacktrace();
-#if __DARWIN_UNIX03
- __assert_rtn (expr, file, (int)line, function);
-#elif __DARWIN
- __assert (expr, file, (int)line, function);
-#else
- __assert_fail (expr, file, (unsigned)line, function);
-#endif
-}
-
-} // namespace boost
-
-#endif // NDEBUG
-
-//==sys
-#include <sys/stat.h>
-#include <sys/types.h>
-#include <sys/time.h>
-
-//our
-//math
-#include "xmath.h"
-#include "func.hpp"
-#include "verify.hpp"
-// log
-#include "logger/logger.hpp"
-
-
diff --git a/src/include/verify.hpp b/src/include/verify.hpp
deleted file mode 100644
index d64f641..0000000
--- a/src/include/verify.hpp
+++ /dev/null
@@ -1,34 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-#include "stacktrace.hpp"
-
-#include "boost/current_function.hpp"
-#include <sstream>
-#include <iostream>
-#include <cassert>
-
-#define VERIFY(expr) \
- do { \
- if(!(expr))\
- print_stacktrace();\
- assert(expr); \
- } while(0);
-
-#define VERIFY_MSG(expr, msg) \
- if (!(expr)) { \
- std::stringstream ss; \
- print_stacktrace();\
- ss << "Verification of expression '" << #expr << "' failed in function '" << BOOST_CURRENT_FUNCTION << \
- "'. In file '" << __FILE__ << "' on line " << __LINE__ << ". Message '" << msg << "'." ; \
- std::cout << ss.str() << std::endl; \
- std::cerr << ss.str() << std::endl; \
- fflush(stdout); \
- fflush(stderr); \
- assert(expr); \
- }
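
A hedged illustration of how the VERIFY/VERIFY_MSG macros above are meant to be used; the checked condition and the message are invented.

    #include "verify.hpp"
    #include <vector>

    size_t checked_sum(const std::vector<size_t>& v) {
      VERIFY(!v.empty());  // prints a stacktrace and asserts if v is empty
      size_t sum = 0;
      for (size_t x : v)
        sum += x;
      VERIFY_MSG(sum >= v.front(), "unexpected overflow");  // logs an explanatory message before assert()
      return sum;
    }
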
diff --git a/src/include/xmath.h b/src/include/xmath.h
deleted file mode 100644
index 6f5ef71..0000000
--- a/src/include/xmath.h
+++ /dev/null
@@ -1,346 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#ifndef XMATH_H_
-#define XMATH_H_
-
-#include <limits>
-#include <cmath>
-
-namespace math {
-// Copyright 2005, Google Inc.
-// All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// * Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-// * Redistributions in binary form must reproduce the above
-// copyright notice, this list of conditions and the following disclaimer
-// in the documentation and/or other materials provided with the
-// distribution.
-// * Neither the name of Google Inc. nor the names of its
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Authors: wan at google.com (Zhanyong Wan), eefacm at gmail.com (Sean Mcafee)
-//
-// The Google C++ Testing Framework (Google Test)
-
-
-// This template class serves as a compile-time function from size to
-// type. It maps a size in bytes to a primitive type with that
-// size. e.g.
-//
-// TypeWithSize<4>::UInt
-//
-// is typedef-ed to be unsigned int (unsigned integer made up of 4
-// bytes).
-//
-// Such functionality should belong to STL, but I cannot find it
-// there.
-//
-// Google Test uses this class in the implementation of floating-point
-// comparison.
-//
-// For now it only handles UInt (unsigned int) as that's all Google Test
-// needs. Other types can be easily added in the future if need
-// arises.
-template <size_t size>
-class TypeWithSize {
- public:
- // This prevents the user from using TypeWithSize<N> with incorrect
- // values of N.
- typedef void UInt;
-};
-
-// The specialization for size 4.
-template <>
-class TypeWithSize<4> {
- public:
- // unsigned int has size 4 in both gcc and MSVC.
- //
- // As base/basictypes.h doesn't compile on Windows, we cannot use
- // uint32, uint64, and etc here.
- typedef int Int;
- typedef unsigned int UInt;
-};
-
-// The specialization for size 8.
-template <>
-class TypeWithSize<8> {
- public:
- typedef long long Int; // NOLINT
- typedef unsigned long long UInt; // NOLINT
-};
-
-// This template class represents an IEEE floating-point number
-// (either single-precision or double-precision, depending on the
-// template parameters).
-//
-// The purpose of this class is to do more sophisticated number
-// comparison. (Due to round-off error, etc., it's very unlikely that
-// two floating-point numbers will be exactly equal. Hence a naive
-// comparison by the == operation often doesn't work.)
-//
-// Format of IEEE floating-point:
-//
-// The most-significant bit being the leftmost, an IEEE
-// floating-point looks like
-//
-// sign_bit exponent_bits fraction_bits
-//
-// Here, sign_bit is a single bit that designates the sign of the
-// number.
-//
-// For float, there are 8 exponent bits and 23 fraction bits.
-//
-// For double, there are 11 exponent bits and 52 fraction bits.
-//
-// More details can be found at
-// http://en.wikipedia.org/wiki/IEEE_floating-point_standard.
-//
-// Template parameter:
-//
-// RawType: the raw floating-point type (either float or double)
-template <typename RawType>
-class FloatingPoint {
- public:
- // Defines the unsigned integer type that has the same size as the
- // floating point number.
- typedef typename TypeWithSize<sizeof(RawType)>::UInt Bits;
-
- // Constants.
-
- // # of bits in a number.
- static const size_t kBitCount = 8 * sizeof(RawType);
-
- // # of fraction bits in a number.
- static const size_t kFractionBitCount = std::numeric_limits<RawType>::digits - 1;
-
- // # of exponent bits in a number.
- static const size_t kExponentBitCount = kBitCount - 1 - kFractionBitCount;
-
- // The mask for the sign bit.
- static const Bits kSignBitMask = static_cast<Bits>(1) << (kBitCount - 1);
-
- // The mask for the fraction bits.
- static const Bits kFractionBitMask = ~static_cast<Bits>(0) >> (kExponentBitCount + 1);
-
- // The mask for the exponent bits.
- static const Bits kExponentBitMask = ~(kSignBitMask | kFractionBitMask);
-
- // How many ULP's (Units in the Last Place) we want to tolerate when
- // comparing two numbers. The larger the value, the more error we
- // allow. A 0 value means that two numbers must be exactly the same
- // to be considered equal.
- //
- // The maximum error of a single floating-point operation is 0.5
- // units in the last place. On Intel CPU's, all floating-point
- // calculations are done with 80-bit precision, while double has 64
- // bits. Therefore, 4 should be enough for ordinary use.
- //
- // See the following article for more details on ULP:
- // http://www.cygnus-software.com/papers/comparingfloats/comparingfloats.htm.
- static const size_t kMaxUlps = 4;
-
- // Constructs a FloatingPoint from a raw floating-point number.
- //
- // On an Intel CPU, passing a non-normalized NAN (Not a Number)
- // around may change its bits, although the new value is guaranteed
- // to be also a NAN. Therefore, don't expect this constructor to
- // preserve the bits in x when x is a NAN.
- explicit FloatingPoint(const RawType& x) { u_.value_ = x; }
-
- // Static methods
-
- // Reinterprets a bit pattern as a floating-point number.
- //
- // This function is needed to test the AlmostEquals() method.
- static RawType ReinterpretBits(const Bits bits) {
- FloatingPoint fp(0);
- fp.u_.bits_ = bits;
- return fp.u_.value_;
- }
-
-  // Returns the floating-point number that represents positive infinity.
- static RawType Infinity() {
- return ReinterpretBits(kExponentBitMask);
- }
-
- // Non-static methods
-
-  // Returns the bits that represent this number.
- const Bits &bits() const { return u_.bits_; }
-
- // Returns the exponent bits of this number.
- Bits exponent_bits() const { return kExponentBitMask & u_.bits_; }
-
- // Returns the fraction bits of this number.
- Bits fraction_bits() const { return kFractionBitMask & u_.bits_; }
-
- // Returns the sign bit of this number.
- Bits sign_bit() const { return kSignBitMask & u_.bits_; }
-
- // Returns true iff this is NAN (not a number).
- bool is_nan() const {
- // It's a NAN if the exponent bits are all ones and the fraction
- // bits are not entirely zeros.
- return (exponent_bits() == kExponentBitMask) && (fraction_bits() != 0);
- }
-
- // Returns true iff this number is at most kMaxUlps ULP's away from
- // rhs. In particular, this function:
- //
- // - returns false if either number is (or both are) NAN.
- // - treats really large numbers as almost equal to infinity.
-  //   - thinks +0.0 and -0.0 are 0 ULP's apart.
-
- template<class FloatingPoint2>
- bool AlmostEquals(const FloatingPoint2& rhs) const {
- static_assert(kBitCount == FloatingPoint2::kBitCount, "Can only compare similar sized types");
- // The IEEE standard says that any comparison operation involving
- // a NAN must return false.
- if (is_nan() || rhs.is_nan()) return false;
- //cout << "ULPS " << DistanceBetweenSignAndMagnitudeNumbers(u_.bits_, rhs.u_.bits_) << endl;
-
- return DistanceBetweenSignAndMagnitudeNumbers(u_.bits_, rhs.bits())
- <= kMaxUlps;
- }
-
- private:
- // The data type used to store the actual floating-point number.
- union FloatingPointUnion {
- RawType value_; // The raw floating-point number.
- Bits bits_; // The bits that represent the number.
- };
-
- // Converts an integer from the sign-and-magnitude representation to
- // the biased representation. More precisely, let N be 2 to the
- // power of (kBitCount - 1), an integer x is represented by the
- // unsigned number x + N.
- //
- // For instance,
- //
- // -N + 1 (the most negative number representable using
- // sign-and-magnitude) is represented by 1;
- // 0 is represented by N; and
- // N - 1 (the biggest number representable using
- // sign-and-magnitude) is represented by 2N - 1.
- //
- // Read http://en.wikipedia.org/wiki/Signed_number_representations
- // for more details on signed number representations.
- static Bits SignAndMagnitudeToBiased(const Bits &sam) {
- if (kSignBitMask & sam) {
- // sam represents a negative number.
- return ~sam + 1;
- } else {
- // sam represents a positive number.
- return kSignBitMask | sam;
- }
- }
-
- // Given two numbers in the sign-and-magnitude representation,
- // returns the distance between them as an unsigned number.
- static Bits DistanceBetweenSignAndMagnitudeNumbers(const Bits &sam1,
- const Bits &sam2) {
- const Bits biased1 = SignAndMagnitudeToBiased(sam1);
- const Bits biased2 = SignAndMagnitudeToBiased(sam2);
- return (biased1 >= biased2) ? (biased1 - biased2) : (biased2 - biased1);
- }
-
- FloatingPointUnion u_;
-};
-
-template<class T>
-T eps();
-
-template<>
-inline double eps<double>() { return 1e-10; }
-
-template<>
-inline float eps<float>() { return (float)1e-5; }
-
-template<class T> inline
-bool eq(T lhs, T rhs) {
- const FloatingPoint<T> lhs_(lhs), rhs_(rhs);
- return lhs_.AlmostEquals(rhs_);
-  //return !ls(lhs, rhs) && !ls(rhs, lhs) /* std::abs(lhs - rhs) < eps<T>() */;
-}
-
-template<class T, class U> inline
-bool eq(T lhs, U rhs) {
- const FloatingPoint<T> lhs_(lhs); const FloatingPoint<U> rhs_(rhs);
- return lhs_.AlmostEquals(rhs_);
-  //return !ls(lhs, rhs) && !ls(rhs, lhs) /* std::abs(lhs - rhs) < eps<T>() */;
-}
-
-template<class T, class U> inline
-bool ls(T lhs, U rhs) {
- if (!eq(lhs, rhs))
- return (lhs < rhs);
- return false;
- //T maxim = max(std::abs(rhs), std::abs(lhs));
- //if (maxim < 1)
- //return (lhs + eps<T>() < rhs);
- //else
- //return (eps<T>() < (rhs - lhs) / maxim);
-}
-
-template<class T, class U> inline
-bool gr(T lhs, U rhs) { return ls(rhs, lhs); }
-
-template<class T, class U> inline
-bool le(T lhs, U rhs) { return !ls(rhs, lhs); }
-
-template<class T, class U> inline
-bool ge(T lhs, U rhs) { return !ls(lhs, rhs); }
-
-template<class T> inline
-T floor(T t) { return std::floor(t + eps<T>()); }
-
-template<class T> inline
-T round(T t) { return floor(t + (T)0.5); }
-
-template<class T> inline
-int round_to_zero(T t) {
- using math::ls;
- int res = (int) math::round(std::abs(t));
- if (ls(t, (T)0.))
- res = -res;
- return res;
-}
-
-// updates floating point @variable only if it differs from @new_value noticeably (by more than the ULP tolerance)
-// @returns true if the @variable was indeed updated
-template<class T> inline
-bool update_value_if_needed(T& variable, T new_value) {
- bool result = !eq<T>(variable, new_value);
-
- if (result) {
- variable = new_value;
- }
- return result;
-}
-
-}
-
-#endif /* XMATH_H_ */
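
A small sketch of the ULP-based comparison helpers defined above; the values are chosen only to illustrate the behaviour.

    #include "xmath.h"
    #include <cassert>

    void xmath_example() {
      double a = 0.1 + 0.2;  // 0.30000000000000004, one ULP away from 0.3
      double b = 0.3;

      assert(math::eq(a, b));         // equal within the 4-ULP tolerance, although a != b bitwise
      assert(!math::ls(a, b));        // ls() never reports "less" for ULP-equal values
      assert(math::round(2.5) == 3);  // floor(t + 0.5) with a small eps nudge

      double v = 1.0;
      // 1.0 + 1e-18 collapses to 1.0 in double, so the value is considered unchanged.
      assert(!math::update_value_if_needed(v, 1.0 + 1e-18));
    }
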
diff --git a/src/io/CMakeLists.txt b/src/io/CMakeLists.txt
deleted file mode 100644
index 865dc2c..0000000
--- a/src/io/CMakeLists.txt
+++ /dev/null
@@ -1,20 +0,0 @@
-############################################################################
-# Copyright (c) 2015 Saint Petersburg State University
-# Copyright (c) 2011-2014 Saint Petersburg Academic University
-# All Rights Reserved
-# See file LICENSE for details.
-############################################################################
-
-project(input CXX)
-
-add_library(input STATIC
- parser.cpp
- path_helper.cpp
- copy_file.cpp
- library.cpp
- logger_impl.cpp
- sam/read.cpp
- sam/sam_reader.cpp)
-
-target_link_libraries(input BamTools samtools yaml-cpp)
-
diff --git a/src/io/copy_file.cpp b/src/io/copy_file.cpp
deleted file mode 100644
index 9ed9c10..0000000
--- a/src/io/copy_file.cpp
+++ /dev/null
@@ -1,158 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#include "copy_file.hpp"
-
-#include "path_helper.hpp"
-#include "logger/logger.hpp"
-
-#include <boost/algorithm/string.hpp>
-
-#include <iostream>
-#include <fstream>
-#include <string>
-#include <vector>
-#include <cstring>   // strerror() used in hard_link()
-#include <cerrno>    // errno
-
-#include <unistd.h>
-#include <dirent.h>
-
-#include <sys/stat.h>
-#include <sys/types.h>
-
-namespace path {
-
-namespace details {
-
-using namespace path;
-
-void copy_file(std::string from_path, std::string to_path) {
- using namespace std;
-
- make_full_path(from_path);
- make_full_path(to_path );
-
- if (from_path == to_path)
- return;
-
- std::ifstream source(from_path, ios::binary);
- std::ofstream dest (to_path.c_str() , ios::binary);
-
- dest << source.rdbuf();
-}
-
-
-void hard_link(std::string from_path, std::string to_path) {
- make_full_path(from_path);
- make_full_path(to_path );
-
- if (from_path == to_path)
- return;
-
- if (link(from_path.c_str(), to_path.c_str()) == -1) {
- WARN("Failed to create link. Reason: " << strerror(errno) << ". Error code: " << errno << ". Copying instead");
- copy_file(from_path, to_path);
- }
-}
-
-files_t files_in_folder(std::string const& path) {
- DIR *dp;
- if ((dp = opendir(path.c_str())) == NULL)
- throw std::runtime_error("can not open folder " + path);
-
- files_t files;
-
- struct dirent *dirp;
- while ((dirp = readdir(dp)) != NULL)
- if (dirp->d_type == DT_REG)
- files.push_back(append_path(path, dirp->d_name));
-
- closedir(dp);
- return files;
-}
-
-files_t folders_in_folder(std::string const& path) {
- DIR *dp;
- if ((dp = opendir(path.c_str())) == NULL)
- throw std::runtime_error("can not open folder " + path);
-
- files_t folders;
-
- struct dirent *dirp;
- while ((dirp = readdir(dp)) != NULL)
- if (dirp->d_type == DT_DIR) {
- std::string folder = dirp->d_name;
-
- if (folder != "." && folder != "..")
- folders.push_back(append_path(path, folder));
- }
-
- closedir(dp);
- return folders;
-}
-
-} // details
-
-path::files_t files_by_prefix(std::string const& path) {
- using namespace details;
- files_t files;
-
- std::string folder(parent_path(path));
- std::string prefix = filename(path);
-
- files_t out_files;
- const files_t all_files = files_in_folder(folder);
-
- for (auto it = all_files.begin(); it != all_files.end(); ++it) // no std::copy_if before C++11
- if (boost::starts_with(filename(*it), prefix))
- out_files.push_back(*it);
-
- return out_files;
-}
-
-void copy_files_by_prefix(path::files_t const& files, std::string const& to_folder) {
- using namespace details;
-
- for (auto it = files.begin(); it != files.end(); ++it) {
- files_t files_to_copy = files_by_prefix(*it);
-
- for (auto it = files_to_copy.begin(); it != files_to_copy.end(); ++it)
- copy_file(*it, append_path(to_folder, filename(*it)));
- }
-}
-
-void link_files_by_prefix(path::files_t const& files, std::string const& to_folder) {
- using namespace details;
-
- for (auto it = files.begin(); it != files.end(); ++it) {
- files_t files_to_copy = files_by_prefix(*it);
-
- for (auto it = files_to_copy.begin(); it != files_to_copy.end(); ++it)
- hard_link(*it, append_path(to_folder, filename(*it)));
- }
-}
-
-void copy_files_by_ext(std::string const& from_folder, std::string const& to_folder, std::string const& ext, bool recursive) {
- using namespace details;
-
- files_t files = files_in_folder(from_folder);
-
- for (auto it = files.begin(); it != files.end(); ++it)
- if (boost::ends_with(*it, ext))
- copy_file(*it, append_path(to_folder, filename(*it)));
-
- if (recursive) {
- files_t folders = folders_in_folder(from_folder);
-
- for (auto it = folders.begin(); it != folders.end(); ++it) {
- std::string subdir = append_path(to_folder, filename(*it));
- path:: make_dir(subdir);
- copy_files_by_ext(*it, subdir, ext, recursive);
- }
- }
-}
-
-}
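
An illustrative call sequence for the copy/link helpers above; all directory and prefix names are placeholders.

    #include "copy_file.hpp"
    #include "path_helper.hpp"

    void stage_results_example() {
      // Copy every *.fasta file from a work directory into an output directory,
      // descending into subdirectories.
      path::copy_files_by_ext("/tmp/run/work", "/tmp/run/out", ".fasta", true);

      // Hard-link (falling back to copying) every file whose name starts with "contigs".
      path::files_t prefixes;
      prefixes.push_back("/tmp/run/work/contigs");
      path::link_files_by_prefix(prefixes, "/tmp/run/out");
    }
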
diff --git a/src/io/library.cpp b/src/io/library.cpp
deleted file mode 100644
index e6b2f67..0000000
--- a/src/io/library.cpp
+++ /dev/null
@@ -1,179 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#include "io/library.hpp"
-
-#include <yaml-cpp/yaml.h>
-
-#include <string>
-#include <iostream>
-
-using namespace io;
-
-namespace YAML {
-template<>
-struct convert<LibraryOrientation> {
- static Node encode(const LibraryOrientation &rhs) {
- switch (rhs) {
- case LibraryOrientation::FR:
- return Node("fr");
- case LibraryOrientation::RF:
- return Node("rf");
- case LibraryOrientation::FF:
- return Node("ff");
- case LibraryOrientation::RR:
- return Node("rr");
- case LibraryOrientation::Undefined:
- default:
- return Node("undefined");
- }
- }
-
- static bool decode(const Node& node, LibraryOrientation& rhs) {
- std::string orientation = node.as<std::string>("");
-
- if (orientation == "fr")
- rhs = LibraryOrientation::FR;
- else if (orientation == "rf")
- rhs = LibraryOrientation::RF;
- else if (orientation == "ff")
- rhs = LibraryOrientation::FF;
- else if (orientation == "rr")
- rhs = LibraryOrientation::RR;
- else
- rhs = LibraryOrientation::Undefined;
-
- return true;
- }
-};
-
-template<>
-struct convert<LibraryType> {
- static Node encode(const LibraryType &rhs) {
- switch (rhs) {
- case LibraryType::PairedEnd:
- return Node("paired-end");
- case LibraryType::SingleReads:
- return Node("single");
- case LibraryType::MatePairs:
- return Node("mate-pairs");
- case LibraryType::HQMatePairs:
- return Node("hq-mate-pairs");
- case LibraryType::PacBioReads:
- return Node("pacbio");
- case LibraryType::SangerReads:
- return Node("sanger");
- case LibraryType::NanoporeReads:
- return Node("nanopore");
- case LibraryType::TrustedContigs:
- return Node("trusted-contigs");
- case LibraryType::UntrustedContigs:
- return Node("untrusted-contigs");
- case LibraryType::PathExtendContigs:
- return Node("path-extend-contigs");
- default:
- return Node();
- }
- }
-
- static bool decode(const Node& node, LibraryType& rhs) {
- std::string type = node.as<std::string>();
-
- if (type == "paired-end")
- rhs = LibraryType::PairedEnd;
- else if (type == "mate-pairs")
- rhs = LibraryType::MatePairs;
- else if (type == "hq-mate-pairs")
- rhs = LibraryType::HQMatePairs;
- else if (type == "pacbio")
- rhs = LibraryType::PacBioReads;
- else if (type == "single")
- rhs = LibraryType::SingleReads;
- else if (type == "sanger")
- rhs = LibraryType::SangerReads;
- else if (type == "nanopore")
- rhs = LibraryType::NanoporeReads;
- else if (type == "trusted-contigs")
- rhs = LibraryType::TrustedContigs;
- else if (type == "untrusted-contigs")
- rhs = LibraryType::UntrustedContigs;
- else if (type == "path-extend-contigs")
- rhs = LibraryType::PathExtendContigs;
- else
- return false;
- return true;
- }
-
-};
-
-Node convert<SequencingLibraryBase>::encode(const io::SequencingLibraryBase& rhs) {
- Node node;
-
- node["orientation"] = rhs.orientation();
- node["type"] = rhs.type();
-
- for (const auto& read_pair : rhs.paired_reads()) {
- node["left reads"].push_back(read_pair.first);
- node["right reads"].push_back(read_pair.second);
- }
- for (const auto& reads : rhs.single_reads())
- node["single reads"].push_back(reads);
-
- return node;
-}
-
-bool convert<SequencingLibraryBase>::decode(const Node& node, SequencingLibraryBase& rhs) {
- rhs.load(node);
- return true;
-}
-
-Node convert<io::SequencingLibrary<> >::encode(const io::SequencingLibrary<>& rhs) {
- return convert<io::SequencingLibraryBase>::encode(rhs);
-}
-
-bool convert<io::SequencingLibrary<> >::decode(const Node& node, io::SequencingLibrary<>& rhs) {
- rhs.load(node);
- return true;
-}
-
-} // namespace YAML
-
-void SequencingLibraryBase::load(const YAML::Node &node) {
- orientation_ = node["orientation"].as<io::LibraryOrientation>(LibraryOrientation::Undefined);
- type_ = node["type"].as<LibraryType>();
-
- switch (type_) {
- case LibraryType::PairedEnd:
- case LibraryType::MatePairs:
- case LibraryType::HQMatePairs:
- left_paired_reads_ = node["left reads"].as<std::vector<std::string> >();
- right_paired_reads_ = node["right reads"].as<std::vector<std::string> >();
-
- if (left_paired_reads_.size() != right_paired_reads_.size())
- throw("Left and right reads lists should have equal length");
-
- if (orientation_ == LibraryOrientation::Undefined)
- throw("Orientation for paired reads should be specified");
-
- // FALLTHROUGH in case of single reads present!
- if (!node["single reads"])
- break;
- case LibraryType::SingleReads:
- case LibraryType::PacBioReads:
- case LibraryType::SangerReads:
- case LibraryType::NanoporeReads:
- case LibraryType::TrustedContigs:
- case LibraryType::UntrustedContigs:
- case LibraryType::PathExtendContigs:
- single_reads_ = node["single reads"].as<std::vector<std::string> >();
- break;
- default:
- // Impossible
- std::cerr << node << std::endl;
- throw("Unsupported library type");
- }
-}
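
A hedged round-trip through the YAML converters defined above. The inline YAML document and read file names are invented; the field names mirror the encode()/load() code, and io::SequencingLibrary<> is assumed to be default-constructible, as the decode() specialization implies.

    #include "io/library.hpp"
    #include <yaml-cpp/yaml.h>
    #include <iostream>

    void library_yaml_example() {
      YAML::Node node = YAML::Load(
          "{orientation: fr, type: paired-end,"
          " \"left reads\": [left.fastq.gz], \"right reads\": [right.fastq.gz]}");

      // decode() delegates to SequencingLibraryBase::load(), which checks that the
      // read lists have equal length and that the orientation is defined.
      io::SequencingLibrary<> lib = node.as<io::SequencingLibrary<> >();

      // encode() writes the same field names back out.
      std::cout << YAML::Node(lib) << std::endl;
    }
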
diff --git a/src/io/logger_impl.cpp b/src/io/logger_impl.cpp
deleted file mode 100644
index bbe0686..0000000
--- a/src/io/logger_impl.cpp
+++ /dev/null
@@ -1,148 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#include <boost/algorithm/string.hpp>
-#include <cppformat/format.h>
-
-#include <string>
-#include <map>
-#include <fstream>
-#include <vector>
-
-#include "logger/logger.hpp"
-
-#include "config.hpp"
-
-#ifdef SPADES_USE_JEMALLOC
-# include <jemalloc/jemalloc.h>
-#endif
-
-namespace logging {
-
-properties::properties(level default_level)
- : def_level(default_level), all_default(true) {}
-
-properties::properties(std::string filename, level default_level)
- : def_level(default_level), all_default(true) {
- if (filename.empty())
- return;
-
- std::ifstream in(filename.c_str());
-
- std::map<std::string, level> remap = {
- {"TRACE", L_TRACE},
- {"DEBUG", L_DEBUG},
- {"INFO" , L_INFO },
- {"WARN" , L_WARN },
- {"ERROR", L_ERROR}
- };
-
- while (!in.eof()) {
- using namespace boost;
-
- char buf [0x400] = {};
- in.getline(buf, sizeof buf);
-
- std::string str(buf);
- trim(str);
-
- if (str.empty() || boost::starts_with(str, "#"))
- continue;
-
- std::vector<std::string> entry;
- split(entry, str, is_any_of("="));
-
- if(entry.size() != 2)
- throw std::runtime_error("invalid log file property entry: " + str);
-
- trim (entry[0]);
- trim (entry[1]);
- to_upper(entry[1]);
-
- auto it = remap.find(entry[1]);
- if(it == remap.end())
- throw std::runtime_error("invalid log file level description: " + entry[1]);
-
- levels[entry[0]] = it->second;
- }
-
- auto def = levels.find("default");
- if (def != levels.end())
- def_level = def->second;
-
- for (auto I = levels.begin(), E = levels.end(); I != E; ++I) {
- if (I->second != def_level) {
- all_default = false;
- break;
- }
- }
-}
-
-
-logger::logger(properties const& props)
- : props_(props) { }
-
-bool logger::need_log(level desired_level, const char* source) const {
- level source_level = props_.def_level;
-
- if (!props_.all_default) {
- auto it = props_.levels.find(source);
- if (it != props_.levels.end())
- source_level = it->second;
- }
-
- return desired_level >= source_level;
-}
-
-#ifdef SPADES_USE_JEMALLOC
-
-void logger::log(level desired_level, const char* file, size_t line_num, const char* source, const char* msg) {
- double time = timer_.time();
- const size_t *cmem = 0, *cmem_max = 0;
- size_t clen = sizeof(cmem);
-
- je_mallctl("stats.cactive", &cmem, &clen, NULL, 0);
- je_mallctl("stats.cactive_max", &cmem_max, &clen, NULL, 0);
-
- for (auto it = writers_.begin(); it != writers_.end(); ++it)
- (*it)->write_msg(time, (*cmem) / 1024, (*cmem_max) / 1024, desired_level, file, line_num, source, msg);
-}
-#else
-void logger::log(level desired_level, const char* file, size_t line_num, const char* source, const char* msg) {
- double time = timer_.time();
- size_t max_rss = get_max_rss();
-
- for (auto it = writers_.begin(); it != writers_.end(); ++it)
- (*it)->write_msg(time, max_rss, desired_level, file, line_num, source, msg);
-}
-#endif
-
-//
-void logger::add_writer(writer_ptr ptr)
-{
- writers_.push_back(ptr);
-}
-
-////////////////////////////////////////////////////
-std::shared_ptr<logger> &__logger() {
- static std::shared_ptr<logger> l;
- return l;
-}
-
-logger *create_logger(std::string filename, level default_level) {
- return new logger(properties(filename, default_level));
-}
-
-void attach_logger(logger *lg) {
- __logger().reset(lg);
-}
-
-void detach_logger() {
- __logger().reset();
-}
-
-
-} // logging
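
A minimal sketch of attaching the logger defined above. The empty properties file name is a placeholder, and the concrete writer type is not shown in this file, so it is only referenced in a comment.

    #include "logger/logger.hpp"

    void init_logging_example() {
      // An empty filename keeps the default level for every source.
      logging::logger* lg = logging::create_logger("", logging::L_INFO);

      // Normally a writer would be registered here via lg->add_writer(...);
      // without one, log() simply iterates an empty writer list.
      logging::attach_logger(lg);

      INFO("logging initialised");  // assumed to be provided alongside the WARN/TRACE macros used elsewhere in this tree
    }
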
diff --git a/src/io/parser.cpp b/src/io/parser.cpp
deleted file mode 100644
index e1f2e89..0000000
--- a/src/io/parser.cpp
+++ /dev/null
@@ -1,90 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-/**
- * @file parser.cpp
- * @author Mariya Fomkina
- * @version 1.0
- *
- * @section LICENSE
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation; either version 2 of
- * the License, or (at your option) any later version.
- *
- * @section DESCRIPTION
- *
- * Parser is the parent class for all streams that read data from
- * different file types (fastq, fasta, sam, etc.).
- * This file contains the functions used to select the exact parser
- * according to the file extension.
- */
-
-#include "standard_base.hpp"
-#include "logger/logger.hpp"
-#include "io/parser.hpp"
-#include "io/fasta_fastq_gz_parser.hpp"
-#include "io/bam_parser.hpp"
-
-
-namespace io {
-
-/*
- * Get extension from filename.
- *
- * @param filename The name of the file to read from.
- *
- * @return File extension (e.g. "fastq", "fastq.gz").
- */
-std::string GetExtension(const std::string& filename) {
- std::string name = filename;
- size_t pos = name.find_last_of(".");
- std::string ext = "";
- if (pos != std::string::npos) {
- ext = name.substr(name.find_last_of(".") + 1);
- if (ext == "gz") {
- ext = name.substr(name.find_last_of
- (".", name.find_last_of(".") - 1) + 1);
- }
- }
- return ext;
-}
-
-/*
- * Select parser type according to file extension.
- *
- * @param filename The name of the file to be opened.
- * @param offset The offset of the read quality.
-
- * @return Pointer to the new parser object with these filename and
- * offset.
- */
-Parser* SelectParser(const std::string& filename,
- OffsetType offset_type /*= PhredOffset*/) {
- std::string ext = GetExtension(filename);
- if (ext == "bam")
- return new BAMParser(filename, offset_type);
-
- return new FastaFastqGzParser(filename, offset_type);
- /*
- if ((ext == "fastq") || (ext == "fastq.gz") ||
- (ext == "fasta") || (ext == "fasta.gz") ||
- (ext == "fa") || (ext == "fq.gz") ||
- (ext == "fq") || (ext == "fa.gz") ||
- (ext == "seq") || (ext == "seq.gz")) {
- return new FastaFastqGzParser(filename, offset_type);
- }
-
-  ERROR("Unknown file extension in input!");
- return NULL; */
-}
-
-void first_fun(int) {
-}
-
-}
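
A brief sketch of the extension-based dispatch implemented above. The file name is invented, the Parser interface itself lives in io/parser.hpp, and relying on a single-argument call assumes the PhredOffset default noted in the comment.

    #include "io/parser.hpp"
    #include <memory>

    void parser_example() {
      // "reads.bam" would yield a BAMParser; anything else falls back to
      // FastaFastqGzParser, as SelectParser() above shows.
      std::unique_ptr<io::Parser> parser(io::SelectParser("reads.fastq.gz"));
      // ... iterate over reads through the Parser interface here.
    }
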
diff --git a/src/io/path_helper.cpp b/src/io/path_helper.cpp
deleted file mode 100644
index d4c6f1e..0000000
--- a/src/io/path_helper.cpp
+++ /dev/null
@@ -1,201 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#include "path_helper.hpp"
-
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <dirent.h>
-#include <unistd.h>
-
-#include <boost/tokenizer.hpp>
-#include <boost/algorithm/string.hpp>
-
-#include <string>
-#include <vector>
-#include <cstring>   // strcpy() used in make_temp_dir()
-
-namespace path {
-
-bool make_dir(std::string const& folder) {
- return mkdir(folder.c_str(), 0755) == 0;
-}
-
-std::string make_temp_dir(std::string const& prefix,
- std::string const& suffix) {
- std::string name = append_path(prefix, suffix + "_XXXXXX");
- char* actual;
- if ((actual = ::mkdtemp(strcpy(new char[name.length() + 1], name.c_str())))
- == NULL)
- throw std::runtime_error("Cannot create temporary dir " + name);
-
- std::string result(actual);
- if (result == name)
- throw std::runtime_error("Cannot create temporary dir " + name);
-
- delete[] actual;
-
- return result;
-}
-
-void remove_dir(std::string const& folder) {
- DIR *dp;
- if ((dp = opendir(folder.c_str())) == NULL)
- throw std::runtime_error("can not open folder " + folder);
-
- struct dirent *dirp;
- while ((dirp = readdir(dp)) != NULL) {
- std::string full_path = folder + "/" + dirp->d_name;
-
- if (dirp->d_type == DT_DIR) {
- if (std::string(".") != dirp->d_name
- && std::string("..") != dirp->d_name) {
- remove_dir(full_path);
- }
- } else
- remove(full_path.c_str());
- }
-
- closedir(dp);
- remove(folder.c_str());
-}
-
-bool is_regular_file(std::string const& path) {
- struct stat st;
- return (stat(path.c_str(), &st) == 0) && (S_ISREG(st.st_mode));
-}
-
-std::string append_path(std::string const& prefix, std::string const& suffix) {
- std::string delimiter = "";
-
- if (!boost::ends_with(prefix, "/") && !boost::starts_with(suffix, "/")
- && !prefix.empty()) {
- delimiter = "/";
- }
-
- return prefix + delimiter + suffix;
-}
-
-std::string current_dir() {
- char* cwd = getcwd(NULL, 0);
- std::string result = cwd;
-
- free(cwd);
- return result;
-}
-
-void make_full_path(std::string& path) {
- if (!boost::starts_with(path, "/")) // relative path
- path = append_path(current_dir(), path);
-}
-
-std::string filename(std::string const& path) {
- size_t pos = path.find_last_of('/');
- return pos != std::string::npos ? path.substr(pos + 1) : path;
-}
-
-std::string basename(std::string const& path) {
- size_t slash = path.find_last_of('/');
- size_t after_slash = slash == std::string::npos ? 0 : slash + 1;
-
- size_t dot = path.find_last_of('.');
- if (dot < after_slash)
- dot = std::string::npos;
-
- return path.substr(after_slash, dot - after_slash);
-}
-
-std::string extension(std::string const& path) {
- size_t slash = path.find_last_of('/');
- size_t after_slash = slash == std::string::npos ? 0 : slash + 1;
- size_t dot = path.find_last_of('.');
-
- if (dot < after_slash || dot == std::string::npos || dot + 1 == path.size())
- return std::string();
-
- return path.substr(dot);
-}
-
-std::string parent_path(std::string const& path) {
- std::string cpath(path);
-
- make_full_path(cpath);
- size_t slash_pos = cpath.find_last_of('/');
-
- return (slash_pos == 0 ? std::string("/") : cpath.substr(0, slash_pos));
-}
-
-bool check_existence(std::string const& path) {
- struct stat st_buf;
- return stat(path.c_str(), &st_buf) == 0
- && (S_ISREG(st_buf.st_mode) || S_ISDIR(st_buf.st_mode)); // exists and (file or dir)
-}
-
-void remove_if_exists(std::string const& path) {
- if (check_existence(path)) {
- if (is_regular_file(path)) // file
- remove(path.c_str());
- else // dir
- remove_dir(path);
- }
-}
-
-// doesn't support symlinks
-std::string resolve(std::string const& path) {
- typedef boost::char_delimiters_separator<char> separator_t;
- typedef boost::tokenizer<separator_t> tokenizer_t;
-
- tokenizer_t tok(path, separator_t(false, "", "/"));
-
- std::string result = "/";
- for (auto it = tok.begin(); it != tok.end(); ++it) {
- if (*it == "..")
- result = parent_path(result);
-
- else if (*it == ".")
- ; // Ignore
-
- else
-      // Just concatenate the other path entries
- result = append_path(result, *it);
- }
-
- return result;
-}
-
-std::string make_relative_path(std::string p, std::string base) {
- p = resolve(p);
- base = resolve(base);
-
- std::string pp = parent_path(p);
-
- typedef boost::char_delimiters_separator<char> separator_t;
- typedef boost::tokenizer<separator_t> tokenizer_t;
-
- tokenizer_t pp_tok(pp, separator_t(false, "", "/"));
- tokenizer_t base_tok(base, separator_t(false, "", "/"));
-
- auto i = pp_tok.begin();
- auto j = base_tok.begin();
-
- while (i != pp_tok.end() && j != base_tok.end() && *i == *j) {
- ++i;
- ++j;
- }
-
- std::string result;
- for (; j != base_tok.end(); ++j)
- result = append_path("..", result);
-
- for (; i != pp_tok.end(); ++i)
- result = append_path(result, *i);
-
- return append_path(result, filename(p));
-}
-
-typedef std::vector<std::string> files_t;
-
-}
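
A few concrete expectations for the string-based path helpers above, written as asserts; the paths are invented.

    #include "path_helper.hpp"
    #include <cassert>

    void path_helper_example() {
      assert(path::append_path("/data", "reads.fq") == "/data/reads.fq");
      assert(path::filename("/data/reads.fq")  == "reads.fq");
      assert(path::basename("/data/reads.fq")  == "reads");
      assert(path::extension("/data/reads.fq") == ".fq");

      // resolve() collapses "." and ".." purely lexically; symlinks are not followed.
      assert(path::resolve("/data/./tmp/../reads.fq") == "/data/reads.fq");
    }
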
diff --git a/src/io/sam/read.cpp b/src/io/sam/read.cpp
deleted file mode 100644
index de65d03..0000000
--- a/src/io/sam/read.cpp
+++ /dev/null
@@ -1,42 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#include <io/sam/read.hpp>
-
-using namespace std;
-
-namespace sam_reader {
-
-string SingleSamRead::cigar() const {
- uint32_t *cigar = bam1_cigar(data_);
- string res;
- res.reserve(data_->core.n_cigar);
- for (size_t k = 0; k < data_->core.n_cigar; ++k) {
- res += std::to_string(bam_cigar_oplen(cigar[k]));
- res += bam_cigar_opchr(cigar[k]);
-
- }
- return res;
-}
-
-string SingleSamRead::name() const {
- string res(bam1_qname(data_));
- return res;
-}
-
-string SingleSamRead::seq() const {
- string res = "";
- auto b = bam1_seq(data_);
- for (int k = 0; k < data_->core.l_qseq; ++k) {
- res += bam_nt16_rev_table[bam1_seqi(b, k)];
- }
- return res;
-}
-
-
-}
-;
diff --git a/src/io/sam/sam_reader.cpp b/src/io/sam/sam_reader.cpp
deleted file mode 100644
index 77e3f4f..0000000
--- a/src/io/sam/sam_reader.cpp
+++ /dev/null
@@ -1,75 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#include <io/sam/read.hpp>
-#include <io/sam/sam_reader.hpp>
-
-using namespace std;
-
-namespace sam_reader {
-
-bool MappedSamStream::eof() const {
- return eof_;
-}
-
-bool MappedSamStream::is_open() const {
- return is_open_;
-}
-
-MappedSamStream& MappedSamStream::operator>>(SingleSamRead& read) {
- if (!is_open_ || eof_)
- return *this;
- read.set_data(seq_);
- int tmp = samread(reader_, seq_);
- eof_ = (0 >= tmp);
- return *this;
-}
-
-MappedSamStream& MappedSamStream::operator >>(PairedSamRead& read) {
- TRACE("starting process paired read");
- SingleSamRead r1;
- MappedSamStream::operator >>(r1);
- SingleSamRead r2;
- MappedSamStream::operator >>(r2);
-
- read = PairedSamRead(r1, r2);
- TRACE(r1.seq());
- TRACE(r2.seq());
- TRACE(r1.name());
- return *this;
-}
-
-const char* MappedSamStream::get_contig_name(int i) const {
- VERIFY(i < reader_->header->n_targets);
- return (reader_->header->target_name[i]);
-}
-
-void MappedSamStream::close() {
- samclose(reader_);
- is_open_ = false;
- eof_ = true;
- bam_destroy1(seq_);
-}
-
-void MappedSamStream::reset() {
- close();
- open();
-}
-
-void MappedSamStream::open() {
- if ((reader_ = samopen(filename_.c_str(), "r", NULL)) == NULL) {
-        WARN("Failed to open SAM file " << filename_);
- is_open_ = false;
- eof_ = true;
- } else {
- is_open_ = true;
- int tmp = samread(reader_, seq_);
- eof_ = (0 >= tmp);
- }
-}
-
-}
diff --git a/src/ionhammer/CMakeLists.txt b/src/ionhammer/CMakeLists.txt
deleted file mode 100644
index 8b2f8a8..0000000
--- a/src/ionhammer/CMakeLists.txt
+++ /dev/null
@@ -1,33 +0,0 @@
-############################################################################
-# Copyright (c) 2015 Saint Petersburg State University
-# Copyright (c) 2011-2014 Saint-Petersburg Academic University
-# All Rights Reserved
-# See file LICENSE for details.
-############################################################################
-
-project(ionhammer CXX)
-
-include_directories(${CMAKE_CURRENT_SOURCE_DIR})
-
-add_executable(ionhammer
- kmer_data.cpp
- hamcluster.cpp
- subcluster.cpp
- err_helper_table.cpp
- config_struct.cpp
- expander.cpp
- seqeval/BaseHypothesisEvaluator.cpp
- seqeval/TreephaserLite.cpp
- main.cpp)
-
-target_link_libraries(ionhammer input cityhash BamTools yaml-cpp input ${COMMON_LIBRARIES})
-
-if (SPADES_STATIC_BUILD)
- set_target_properties(ionhammer PROPERTIES LINK_SEARCH_END_STATIC 1)
-endif()
-
-install(TARGETS ionhammer
- RUNTIME DESTINATION bin)
-install(DIRECTORY "${SPADES_CFG_DIR}/ionhammer"
- DESTINATION share/spades/configs
- FILES_MATCHING PATTERN "*.cfg.template")
diff --git a/src/ionhammer/HSeq.hpp b/src/ionhammer/HSeq.hpp
deleted file mode 100644
index 567f84f..0000000
--- a/src/ionhammer/HSeq.hpp
+++ /dev/null
@@ -1,289 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#ifndef __HAMMER_HSEQ_HPP__
-#define __HAMMER_HSEQ_HPP__
-
-#include "sequence/nucl.hpp"
-#include <city/city.h>
-
-#include <array>
-#include <string>
-#include <vector>
-#include <deque>
-
-#include <cstdint>
-
-namespace hammer {
-
-union HomopolymerRun {
- uint8_t raw;
- struct {
- uint8_t len : 6;
- uint8_t nucl : 2;
- };
-
- HomopolymerRun()
- : raw(0) {}
- HomopolymerRun(uint8_t nucl, uint8_t len)
- : len(len & 63), nucl(nucl & 3) {}
-
- bool operator==(const HomopolymerRun &that) const {
- return raw == that.raw;
- }
-
- bool operator!=(const HomopolymerRun &that) const {
- return raw != that.raw;
- }
-
- bool operator<(const HomopolymerRun &that) const {
- return raw < that.raw;
- }
-
- std::string str() const {
- return std::string(len, ::nucl(nucl));
- }
-};
-
-namespace iontorrent {
- // Container shall have push_back method
- template <typename Container>
- void toHomopolymerRuns(const std::string &seq, Container& runs) {
- if (seq.empty())
- return;
-
- char nucl = seq[0];
- uint8_t len = 1;
- for (size_t i = 1; i < seq.size(); ++i) {
- if (seq[i] != nucl) {
- runs.push_back(HomopolymerRun(dignucl(nucl), len));
- len = 1;
- nucl = seq[i];
- } else {
- ++len;
- }
- }
- if (len > 0) {
- runs.push_back(HomopolymerRun(dignucl(nucl), len));
- }
- }
-
-};
-
-template <size_t N = 16>
-class HSeq {
- public:
- typedef std::array<HomopolymerRun, N> StorageType;
-
- private:
- StorageType data_;
-
- const static size_t PrimeNum = 239;
-
- public:
- HSeq() {}
-
- HSeq(typename StorageType::const_iterator Start,
- typename StorageType::const_iterator End) {
- std::copy(Start, End, data_.begin());
- }
-
- typedef HomopolymerRun DataType;
- const static size_t DataSize = N;
- const static size_t TotalBytes = sizeof(DataType) * DataSize;
-
- static size_t GetDataSize(size_t size) {
- VERIFY(size == N);
- return N * sizeof(HomopolymerRun);
- }
-
- typename StorageType::const_iterator begin() const {
- return data_.begin();
- }
-
- typename StorageType::const_iterator end() const {
- return data_.end();
- }
-
- typename StorageType::const_reverse_iterator rbegin() const {
- return data_.rbegin();
- }
-
- typename StorageType::const_reverse_iterator rend() const {
- return data_.rend();
- }
-
- const HomopolymerRun *data() const {
- return data_.data();
- }
-
- size_t data_size() const {
- return DataSize;
- }
-
- HomopolymerRun &operator[](size_t idx) {
- return data_[idx];
- }
-
- const HomopolymerRun &operator[](size_t idx) const {
- return data_[idx];
- }
-
- HSeq<N> operator!() const {
- HSeq<N> res(*this);
-
- for (size_t i = 0; i < N / 2; ++i) {
- HomopolymerRun front = res[i], back = res[N - i - 1];
- front.nucl = complement(front.nucl) & 3;
- back.nucl = complement(back.nucl) & 3;
- res[i] = back;
- res[N - i - 1] = front;
- }
-
- if (N & 1)
- res[N/2].nucl = complement(res[N/2].nucl) & 3;
-
- return res;
- }
-
- HSeq<N> operator<<(char nucl) const {
- if (is_nucl(nucl))
- nucl = dignucl(nucl);
-
- HSeq<N> res(*this);
- // Easy case - just add to run
- HomopolymerRun &last = res[N-1];
- if (last.nucl == nucl) {
- last.len += 1;
- return res;
- }
-
- // Hard case - have to shift the stuff
- for (size_t i = 0; i < N - 1; ++i)
- res[i] = res[i + 1];
- res[N - 1].nucl = nucl;
- res[N - 1].len = 1;
-
- return res;
- }
-
- HSeq<N>& operator<<=(char nucl) {
- if (is_nucl(nucl))
- nucl = dignucl(nucl);
-
- // Easy case - just add to run
- HomopolymerRun &last = data_[N-1];
- if (last.nucl == nucl) {
- last.len = (last.len + 1) & 63;
- return *this;
- }
-
- // Hard case - have to shift the stuff
- for (size_t i = 0; i < N - 1; ++i)
- data_[i] = data_[i + 1];
- data_[N - 1].nucl = nucl & 3;
- data_[N - 1].len = 1;
-
- return *this;
- }
-
- HSeq<N> operator>>(char nucl) const {
- if (is_nucl(nucl))
- nucl = dignucl(nucl);
-
- HSeq<N> res(*this);
- // Easy case - just add to run
- HomopolymerRun &first = res[0];
- if (first.nucl == nucl) {
- first.len += 1;
- return res;
- }
-
- // Hard case - have to shift the stuff
- for (size_t i = 0; i < N - 1; ++i)
- res[i + 1] = res[i];
- res[0].nucl = nucl;
- res[0].len = 1;
-
- return res;
- }
-
- bool operator==(const HSeq<N> &that) const {
- return (data_ == that.data_);
- }
- bool operator!=(const HSeq<N> &that) const {
- return (data_ != that.data_);
- }
-
- size_t size() const {
- size_t res = 0;
- for (size_t i = 0; i < N; ++i)
- res += data_[i].len;
-
- return res;
- }
-
- std::string str() const {
- std::string res;
- for (size_t i = 0; i < N; ++i)
- res += data_[i].str();
-
- return res;
- }
-
- static size_t GetHash(const DataType *data, size_t sz = DataSize, uint32_t seed = 0) {
- return CityHash64WithSeed((const char*)data, sz * sizeof(DataType), 0x9E3779B9 ^ seed);
- }
-
- size_t GetHash(uint32_t seed = 0) const {
- return GetHash(data_.data(), DataSize, seed);
- }
-
- struct hash {
- size_t operator()(const HSeq<N> &seq, uint32_t seed = 0) const {
- return seq.GetHash(seed);
- }
-
- size_t operator()(const DataType *data, size_t sz = DataSize, uint32_t seed = 0) const {
- return GetHash(data, sz, seed);
- }
- };
-
- struct less2_fast {
- bool operator()(const HSeq<N> &l, const HSeq<N> &r) const {
- for (size_t i = 0; i < N; ++i) {
- const uint8_t lr = l[i].raw, rr = r[i].raw;
- if (lr != rr)
- return lr < rr;
- }
-
- return false;
- }
- };
-};
-
-template<size_t N>
-std::ostream& operator<<(std::ostream& os, const HSeq<N> &seq) {
- os << seq.str();
- return os;
-}
-
-namespace internal {
- template <size_t N>
- inline size_t getSize(const hammer::HSeq<N> &) {
- return N;
- }
-
- template <typename T>
- inline size_t getSize(const T& a) {
- return a.size();
- }
-}
-
-};
-
-#endif // __HAMMER_HSEQ_HPP__
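
A short sketch of the homopolymer-run utilities above; the input sequence is invented.

    #include "HSeq.hpp"
    #include <deque>
    #include <iostream>

    void hseq_example() {
      // "AAACCG" compresses into three runs: A x 3, C x 2, G x 1.
      std::deque<hammer::HomopolymerRun> runs;
      hammer::iontorrent::toHomopolymerRuns("AAACCG", runs);
      // runs.size() == 3, runs[0].str() == "AAA"

      // HSeq<N> keeps a fixed window of N runs; operator<<= shifts a nucleotide in,
      // extending the last run when the nucleotide repeats.
      hammer::HSeq<16> kmer;
      kmer <<= 'A';
      kmer <<= 'A';
      kmer <<= 'C';
      std::cout << kmer << std::endl;  // prints "AAC"; zero-length runs contribute nothing
    }
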
diff --git a/src/ionhammer/config_struct.cpp b/src/ionhammer/config_struct.cpp
deleted file mode 100644
index 48a4969..0000000
--- a/src/ionhammer/config_struct.cpp
+++ /dev/null
@@ -1,70 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#include "config_struct.hpp"
-
-#include "openmp_wrapper.h"
-
-#include <yaml-cpp/yaml.h>
-#include <string>
-
-namespace YAML {
-template<>
-struct convert<hammer_config::HammerStage> {
- static bool decode(const YAML::Node &node, hammer_config::HammerStage &rhs) {
- std::string val = node.as<std::string>();
-
- if (val == "count") {
- rhs = hammer_config::HammerStage::KMerCounting;
- return true;
- } else if (val == "hamcluster") {
- rhs = hammer_config::HammerStage::HammingClustering;
- return true;
- } else if (val == "subcluster") {
- rhs = hammer_config::HammerStage::SubClustering;
- return true;
- } else if (val == "correct") {
- rhs = hammer_config::HammerStage::ReadCorrection;
- return true;
- }
-
- return false;
- }
-};
-}
-
-
-namespace hammer_config {
-void load(hammer_config& cfg, const std::string &filename) {
- YAML::Node config = YAML::LoadFile(filename);
-
- cfg.dataset.load(config["dataset"].as<std::string>());
-
- cfg.working_dir = config["working_dir"].as<std::string>(".");
- cfg.output_dir = config["output_dir"].as<std::string>(".");
-
- // FIXME: Make trivial deserialization trivial
- cfg.hard_memory_limit = config["hard_memory_limit"].as<unsigned>();
-
- cfg.count_split_buffer = config["count_split_buffer"].as<size_t>(0);
-
- cfg.max_nthreads = config["max_nthreads"].as<unsigned>();
- // Fix number of threads according to OMP capabilities.
- cfg.max_nthreads = std::min(cfg.max_nthreads, (unsigned)omp_get_max_threads());
- // Inform OpenMP runtime about this :)
- omp_set_num_threads(cfg.max_nthreads);
-
- cfg.kmer_qual_threshold = config["kmer_qual_threshold"].as<double>();
- cfg.center_qual_threshold = config["center_qual_threshold"].as<double>();
- cfg.delta_score_threshold = config["delta_score_threshold"].as<double>();
- cfg.keep_uncorrected_ends = config["keep_uncorrected_ends"].as<bool>();
- cfg.tau = config["tau"].as<unsigned>();
-
- cfg.debug_mode = config["debug_mode"].as<bool>();
- cfg.start_stage = config["start_stage"].as<HammerStage>();
-}
-}
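
A minimal sketch of driving the loader above. The config file name is a placeholder; the keys it must contain are exactly the config["..."] lookups in load(), with "working_dir" and "output_dir" defaulting to ".".

    #include "config_struct.hpp"

    void load_config_example() {
      hammer_config::hammer_config config;
      // The YAML file must provide at least "dataset", "hard_memory_limit",
      // "max_nthreads", the quality thresholds, "tau", "keep_uncorrected_ends",
      // "debug_mode" and "start_stage".
      hammer_config::load(config, "ionhammer.cfg");
      // After load(), config.max_nthreads is clamped to omp_get_max_threads().
    }
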
diff --git a/src/ionhammer/config_struct.hpp b/src/ionhammer/config_struct.hpp
deleted file mode 100644
index 0b51891..0000000
--- a/src/ionhammer/config_struct.hpp
+++ /dev/null
@@ -1,49 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#ifndef __HAMMER_IT_CONFIG_HPP__
-#define __HAMMER_IT_CONFIG_HPP__
-
-#include "config_singl.hpp"
-
-#include "io/library.hpp"
-
-namespace hammer_config {
-enum class HammerStage {
- KMerCounting = 1,
- HammingClustering = 2,
- SubClustering = 3,
- ReadCorrection = 4
-};
-
-struct hammer_config {
- io::DataSet<> dataset;
-
- std::string working_dir;
- std::string output_dir;
-
- unsigned max_nthreads;
- unsigned tau;
- unsigned hard_memory_limit;
-
- size_t count_split_buffer;
-
- double kmer_qual_threshold;
- double center_qual_threshold;
- double delta_score_threshold;
- bool keep_uncorrected_ends;
-
- bool debug_mode;
- HammerStage start_stage;
-};
-
-void load(hammer_config& cfg, const std::string &filename);
-}
-
-typedef config_common::config<hammer_config::hammer_config> cfg;
-
-#endif // __HAMMER_IT_CONFIG_HPP__
diff --git a/src/ionhammer/err_helper_table.cpp b/src/ionhammer/err_helper_table.cpp
deleted file mode 100644
index 9e20546..0000000
--- a/src/ionhammer/err_helper_table.cpp
+++ /dev/null
@@ -1,39 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#include "err_helper_table.hpp"
-
-#include <fstream>
-#include <istream>
-
-#include "logger/logger.hpp"
-
-namespace hammer {
-namespace errHelper {
-
-namespace internal {
-
-static const uint32_t helper_table_data[] = {
-#include "err_helper_table.inc"
-};
-
-// numbers are cumulative sums of
-// (2 * 4^^2) / 32,
-// (2 * 4^^4) / 32,
-// ...
-const HelperTable helper_tables[] = {
- { 1, helper_table_data },
- { 2, helper_table_data + 1 },
- { 3, helper_table_data + 17 },
- { 4, helper_table_data + 273 },
- { 5, helper_table_data + 4369 }
-};
-
-}; // namespace internal
-
-}; // namespace errHelper
-}; // namespace hammer
diff --git a/src/ionhammer/err_helper_table.hpp b/src/ionhammer/err_helper_table.hpp
deleted file mode 100644
index 4d06383..0000000
--- a/src/ionhammer/err_helper_table.hpp
+++ /dev/null
@@ -1,117 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#ifndef __HAMMER_ERR_HELPER_TABLE_HPP__
-#define __HAMMER_ERR_HELPER_TABLE_HPP__
-
-#include "hkmer.hpp"
-
-#include <vector>
-#include <istream>
-#include <string>
-#include <cstdlib>
-#include <cassert>
-
-#include "logger/logger.hpp"
-
-namespace hammer {
-
-namespace errHelper {
-
-/// Type of error
-enum Hint {
- kMismatch,
- kInsertion,
- kDeletion
-};
-
-namespace internal {
-
-// maximum size of K-mers in the helper tables
-static const unsigned int MAX_K = 5;
-
-struct HelperTable {
- const unsigned k_;
- const uint32_t* storage_;
-
- template <typename It1, typename It2>
- Hint lookupHint(const It1 &x_it, const It2 &y_it,
- size_t x_nfront, size_t y_nfront) const {
-
- VERIFY(k_ <= MAX_K);
- unsigned x_code = getCode(x_it, x_nfront, k_);
- unsigned y_code = getCode(y_it, y_nfront, k_);
-
- unsigned code = x_code + (y_code << (2 * k_));
- uint32_t bt = storage_[code / 16]; // 16 hints per uint32_t
- unsigned shift = (code % 16) * 2;
- return static_cast<Hint>((bt >> shift) & 0x3);
- }
-
- template <typename HRunIter>
- static unsigned getCode(const HRunIter& x_it, size_t x_nfront, size_t k) {
- unsigned code = 0;
- unsigned len = 0;
- auto nucl = x_it->nucl;
- for (len = 0; len < x_nfront && len < k; ++len)
- code |= nucl << (2 * len);
-
- if (len == k)
- return code;
-
- for (HRunIter it = x_it + 1; ; ++it) {
- for (size_t i = 0; i < it->len; ++i) {
- code |= it->nucl << (2 * len++);
- if (len == k)
- return code;
- }
- }
-
- assert(false);
- }
-};
-
-// tables for k = 1, 2, ..., MAX_K
-extern const HelperTable helper_tables[];
-
-template <typename HRunIter>
-static inline size_t getNumberOfRemainingBases(const HRunIter &x_it,
- const HRunIter &x_end,
- size_t x_nfront) {
- size_t n = x_nfront;
- if (n >= MAX_K)
- return MAX_K;
-
- for (HRunIter it = x_it + 1; it != x_end; ++it) {
- n += it->len;
- if (n >= MAX_K)
- return MAX_K;
- }
-
- return n;
-}
-
-}; // namespace internal
-
-/// Estimate what kind of error occurred at the position
-template <typename It1, typename It2>
-static inline Hint getHint(const It1 &x_begin, const It1 &x_end,
- const It2 &y_begin, const It2 &y_end,
- size_t x_nfront, size_t y_nfront) {
- VERIFY(x_nfront <= x_begin->len);
- VERIFY(y_nfront <= y_begin->len);
- size_t x_rem = internal::getNumberOfRemainingBases(x_begin, x_end, x_nfront);
- size_t y_rem = internal::getNumberOfRemainingBases(y_begin, y_end, y_nfront);
-
- auto& table = internal::helper_tables[std::min(x_rem, y_rem) - 1];
- return table.lookupHint<It1, It2>(x_begin, y_begin, x_nfront, y_nfront);
-}
-
-}; // namespace errHelper
-}; // namespace hammer
-
-#endif // __HAMMER_ERR_HELPER_TABLE_HPP__
diff --git a/src/ionhammer/expander.cpp b/src/ionhammer/expander.cpp
deleted file mode 100644
index 0b10bf5..0000000
--- a/src/ionhammer/expander.cpp
+++ /dev/null
@@ -1,60 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#include "expander.hpp"
-
-#include "config_struct.hpp"
-#include "kmer_data.hpp"
-#include "valid_hkmer_generator.hpp"
-
-#include "io/file_reader.hpp"
-
-#include <vector>
-#include <cstring>
-
-bool Expander::operator()(const io::SingleRead &r) {
- size_t sz = r.size();
-
- std::vector<unsigned> covered_by_solid(sz, false);
- std::vector<size_t> kmer_indices(sz, -1ull);
-
- ValidHKMerGenerator<hammer::K> gen(r);
- while (gen.HasMore()) {
- hammer::HKMer kmer = gen.kmer();
- size_t idx = data_.seq_idx(kmer), kl = kmer.size();
- size_t read_pos = gen.pos() - kl;
-
- kmer_indices[read_pos] = idx;
- if (data_[idx].changeto == idx &&
- data_[idx].qual < cfg::get().center_qual_threshold) {
- for (size_t j = read_pos; j < read_pos + kl; ++j) {
- VERIFY_MSG(j < sz, "read_pos == " << read_pos << ", r.size() == " << r.size() << ", kmer: " << kmer << ", read: " << r.GetSequenceString());
- covered_by_solid[j] = true;
- }
- }
-
- gen.Next();
- }
-
- for (size_t j = 0; j < sz; ++j) {
- if (!covered_by_solid[j] || kmer_indices[j] == -1ull)
- continue;
-
- size_t idx = kmer_indices[j];
- auto &kmer_data = data_[idx];
- if (kmer_data.changeto != idx) {
-# pragma omp atomic
- changed_ += 1;
-
- kmer_data.lock();
- kmer_data.changeto = static_cast<unsigned>(idx);
- kmer_data.unlock();
- }
- }
-
- return false;
-}
diff --git a/src/ionhammer/flow_space_read.hpp b/src/ionhammer/flow_space_read.hpp
deleted file mode 100644
index b0aac58..0000000
--- a/src/ionhammer/flow_space_read.hpp
+++ /dev/null
@@ -1,77 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#ifndef __HAMMER_IT_FLOW_SPACE_READ_HPP__
-#define __HAMMER_IT_FLOW_SPACE_READ_HPP__
-
-#include "io/single_read.hpp"
-#include "HSeq.hpp"
-
-#include <deque>
-#include <cstddef>
-#include <string>
-
-namespace hammer {
-
-/// Read interpreted as series of homopolymer runs
-class FlowSpaceRead {
- std::string name_;
- std::deque<HomopolymerRun> runs_;
- public:
- FlowSpaceRead(const io::SingleRead& read) : name_(read.name()) {
- const auto& seq = read.GetSequenceString();
- hammer::iontorrent::toHomopolymerRuns(seq, runs_);
- }
-
- template <typename It>
- FlowSpaceRead(It runs_beg, It runs_end) :
- runs_(runs_beg, runs_end) {}
-
- size_t size() const {
- return runs_.size();
- }
-
- const std::string& name() const {
- return name_;
- }
-
- HomopolymerRun operator[](size_t index) const {
- return runs_[index];
- }
-
- HomopolymerRun& operator[](size_t index) {
- return runs_[index];
- }
-
- void TrimLeft(size_t n_runs) {
- if (n_runs >= runs_.size())
- runs_.clear();
- else
- runs_.erase(runs_.begin(), runs_.begin() + n_runs);
- }
-
- void TrimRight(size_t n_runs) {
- if (n_runs >= runs_.size())
- runs_.clear();
- else
- runs_.erase(runs_.end() - n_runs, runs_.end());
- }
-
- std::string GetSequenceString() const {
- std::string seq;
- for (size_t i = 0; i < runs_.size(); ++i)
- seq += runs_[i].str();
- return seq;
- }
-
- const std::deque<hammer::HomopolymerRun>& data() const {
- return runs_;
- }
-};
-
-} // namespace hammer
-#endif
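The removed FlowSpaceRead views a read as a sequence of homopolymer runs, the natural unit for IonTorrent flow-space data. A rough standalone sketch of that decomposition is below; Run and to_runs are simplified stand-ins for hammer::HomopolymerRun and hammer::iontorrent::toHomopolymerRuns, not the real types.

    #include <deque>
    #include <iostream>
    #include <string>

    // Simplified stand-in for hammer::HomopolymerRun: one nucleotide plus its run length.
    struct Run { char nucl; unsigned len; };

    static std::deque<Run> to_runs(const std::string &seq) {
        std::deque<Run> runs;
        for (char c : seq) {
            if (!runs.empty() && runs.back().nucl == c)
                ++runs.back().len;
            else
                runs.push_back({c, 1});
        }
        return runs;
    }

    int main() {
        for (const Run &r : to_runs("AAACCGTTTT"))
            std::cout << r.nucl << r.len << ' ';   // prints: A3 C2 G1 T4
        std::cout << '\n';
        return 0;
    }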
diff --git a/src/ionhammer/hamcluster.cpp b/src/ionhammer/hamcluster.cpp
deleted file mode 100644
index dda80ea..0000000
--- a/src/ionhammer/hamcluster.cpp
+++ /dev/null
@@ -1,219 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#include "hamcluster.hpp"
-
-#include "hkmer_distance.hpp"
-#include "adt/concurrent_dsu.hpp"
-#include "io/mmapped_reader.hpp"
-
-#include <iostream>
-#include <sstream>
-
-#ifdef USE_GLIBCXX_PARALLEL
-#include <parallel/algorithm>
-#endif
-
-struct SubKMerComparator {
- bool operator()(const SubKMerData &lhs, const SubKMerData &rhs) {
- return SubKMer::less2_fast()(lhs.data, rhs.data);
- }
-};
-
-std::pair<size_t, size_t> SubKMerSplitter::split() {
- std::vector<SubKMerData> data;
-
- MMappedReader ifs(ifname_, /* unlink */ true);
- std::ofstream ofs(ofname_, std::ios::out | std::ios::binary);
- VERIFY(ofs.good());
- size_t icnt = 0, ocnt = 0;
- while (ifs.good()) {
- SubKMerComparator comp;
-
- deserialize(data, ifs);
-
-#ifdef USE_GLIBCXX_PARALLEL
- // Explicitly force a call to parallel sort routine.
- __gnu_parallel::sort(data.begin(), data.end(), comp);
-#else
- std::sort(data.begin(), data.end(), comp);
-#endif
- for (auto start = data.begin(), end = data.end(); start != end;) {
- auto chunk_end = std::upper_bound(start + 1, data.end(), *start, comp);
- serialize(ofs, start, chunk_end);
- start = chunk_end;
- ocnt += 1;
- }
- icnt += 1;
- }
- VERIFY(!ofs.fail());
-
- ofs.close();
-
- return std::make_pair(icnt, ocnt);
-}
-
-#if 1
-static bool canMerge(const ConcurrentDSU &uf, unsigned x, unsigned y) {
- size_t szx = uf.set_size(x), szy = uf.set_size(y);
- const size_t hardthr = 2500;
-
- // Global threshold - no cluster larger than hard threshold
- if (szx + szy > hardthr)
- return false;
-
- // If one of the clusters is moderately large, then attach "almost" singletons
- // only.
- if ((szx > hardthr * 3 / 4 && szy > 50) ||
- (szy > hardthr * 3 / 4 && szx > 50))
- return false;
-
- return true;
-}
-#else
-static bool canMerge(const ConcurrentDSU &uf, unsigned x, unsigned y) {
- return (uf.set_size(x) + uf.set_size(y)) < 10000;
-}
-#endif
-
-
-static void processBlockQuadratic(ConcurrentDSU &uf,
- const std::vector<size_t> &block,
- const KMerData &data,
- unsigned tau) {
- size_t blockSize = block.size();
- for (size_t i = 0; i < blockSize; ++i) {
- auto x = static_cast<unsigned>(block[i]);
- hammer::HKMer kmerx = data[x].kmer;
- hammer::HKMer rkmerx = !kmerx;
- auto rcx = static_cast<unsigned>(data.seq_idx(rkmerx));
-
- for (size_t j = i + 1; j < blockSize; j++) {
- auto y = static_cast<unsigned>(block[j]);
- hammer::HKMer kmery = data[y].kmer;
- hammer::HKMer rkmery = !kmery;
- auto rcy = static_cast<unsigned>(data.seq_idx(rkmery));
- if ((uf.find_set(x) != uf.find_set(y) || uf.find_set(rcx) !=
- uf.find_set(rcy)) &&
- (canMerge(uf, x, y) || canMerge(uf, rcx, rcy)) &&
- (hammer::distanceHKMer(kmerx.begin(), kmerx.end(),
- kmery.begin(), kmery.end(), tau) <= tau ||
- hammer::distanceHKMer(rkmerx.begin(), rkmerx.end(),
- rkmery.begin(), rkmery.end(), tau) <= tau)) {
- uf.unite(x, y);
- uf.unite(rcx, rcy);
- }
- }
- }
-}
-
-void KMerHamClusterer::cluster(const std::string &prefix,
- const KMerData &data,
- ConcurrentDSU &uf) {
- // First pass - split & sort the k-mers
- std::ostringstream tmp;
- tmp << prefix << ".first";
- std::string fname(tmp.str());
- std::ofstream ofs(fname, std::ios::out | std::ios::binary);
- VERIFY(ofs.good());
-
- INFO("Serializing sub-kmers.");
- for (unsigned i = 0; i < tau_ + 1; ++i) {
- // size_t from = (*Globals::subKMerPositions)[i];
- // size_t to = (*Globals::subKMerPositions)[i+1];
- size_t from = 0 + i*hammer::K / (tau_ + 1);
- size_t to = 0 + (i+1)*hammer::K / (tau_ + 1);
-
- INFO("Serializing: [" << from << ", " << to << ")");
- serialize(ofs, data, NULL,
- SubKMerPartSerializer(from, to));
- }
- VERIFY(!ofs.fail());
- ofs.close();
-
- size_t big_blocks1 = 0;
- {
- INFO("Splitting sub-kmers, pass 1.");
- SubKMerSplitter Splitter(fname, fname + ".blocks");
- std::pair<size_t, size_t> stat = Splitter.split();
- INFO("Splitting done."
- " Processed " << stat.first << " blocks."
- " Produced " << stat.second << " blocks.");
-
- // Sanity check - there cannot be more blocks than (tau + 1) times the total
- // k-mer number. And on the first pass we have only tau + 1 input blocks!
- VERIFY(stat.first == tau_ + 1);
- VERIFY(stat.second <= (tau_ + 1) * data.size());
-
- // OK, now everything in the output file is grouped into blocks.
-
- std::vector<size_t> block;
-
- INFO("Merge sub-kmers, pass 1");
- SubKMerBlockFile blocks(fname + ".blocks", /* unlink */ true);
-
- std::ostringstream tmp;
- tmp << prefix << ".second";
- fname = tmp.str();
-
- ofs.open(fname, std::ios::out | std::ios::binary);
- VERIFY(ofs.good());
- while (blocks.get_block(block)) {
- // unsigned block_thr = cfg::get().hamming_blocksize_quadratic_threshold;
- unsigned block_thr = 50;
- if (block.size() < block_thr) {
- // Merge small blocks.
- processBlockQuadratic(uf, block, data, tau_);
- } else {
- big_blocks1 += 1;
- // Otherwise - dump for next iteration.
- for (unsigned i = 0; i < tau_ + 1; ++i) {
- serialize(ofs, data, &block,
- SubKMerStridedSerializer(i, tau_ + 1));
- }
- }
- }
- VERIFY(!ofs.fail());
- ofs.close();
- INFO("Merge done, total " << big_blocks1 << " new blocks generated.");
- }
-
- size_t big_blocks2 = 0;
- {
- INFO("Spliting sub-kmers, pass 2.");
- SubKMerSplitter Splitter(fname, fname + ".blocks");
- std::pair<size_t, size_t> stat = Splitter.split();
- INFO("Splitting done."
- " Processed " << stat.first << " blocks."
- " Produced " << stat.second << " blocks.");
-
- // Sanity check - there cannot be more blocks than (tau + 1) times the total
- // k-mer number. And there should be (tau + 1) * big_blocks1 input blocks.
- VERIFY(stat.first == (tau_ + 1)*big_blocks1);
- VERIFY(stat.second <= (tau_ + 1) * (tau_ + 1) * data.size());
-
- INFO("Merge sub-kmers, pass 2");
- SubKMerBlockFile blocks(fname + ".blocks", /* unlink */ true);
- std::vector<size_t> block;
-
- size_t nblocks = 0;
- while (blocks.get_block(block)) {
- if (block.size() > 50) {
- big_blocks2 += 1;
-#if 0
- for (size_t i = 0; i < block.size(); ++i) {
- std::string s(Globals::blob + data[block[i]], K);
- INFO("" << block[i] << ": " << s);
- }
-#endif
- }
- processBlockQuadratic(uf, block, data, tau_);
- nblocks += 1;
- }
- INFO("Merge done, saw " << big_blocks2 << " big blocks out of " << nblocks << " processed.");
- }
-}
diff --git a/src/ionhammer/hamcluster.hpp b/src/ionhammer/hamcluster.hpp
deleted file mode 100644
index f93c256..0000000
--- a/src/ionhammer/hamcluster.hpp
+++ /dev/null
@@ -1,192 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#ifndef HAMMER_SUBKMER_SORTER_HPP
-#define HAMMER_SUBKMER_SORTER_HPP
-
-#include "kmer_data.hpp"
-#include "io/mmapped_reader.hpp"
-
-#include "logger/logger.hpp"
-#include "HSeq.hpp"
-
-#include <iostream>
-#include <vector>
-
-class ConcurrentDSU;
-
-typedef hammer::HSeq<(hammer::K + 1) / 2> SubKMer;
-
-struct SubKMerData {
- uint64_t idx;
- SubKMer data;
-};
-
-template<class Reader>
-inline void binary_read(Reader &is, SubKMerData &s) {
- SubKMer::DataType seq_data[SubKMer::DataSize];
-
- is.read((char*)&s.idx, sizeof(s.idx));
- is.read((char*)seq_data, sizeof(seq_data));
-
- s.data = SubKMer(seq_data, seq_data + SubKMer::DataSize);
-}
-
-template<class Writer>
-inline Writer &binary_write(Writer &os, const SubKMerData &s) {
- os.write((char*)&s.idx, sizeof(s.idx));
- os.write((char*)s.data.data(), SubKMer::TotalBytes);
-
- return os;
-}
-
-static_assert(sizeof(SubKMerData) == 16, "Too big SubKMer");
-
-class SubKMerPartSerializer{
- size_t from_;
- size_t to_;
-
-public:
- SubKMerPartSerializer(size_t from, size_t to)
- :from_(from), to_(to) { VERIFY(to_ - from_ <= hammer::K); }
-
- SubKMerData serialize(hammer::HKMer k, size_t fidx) const {
- SubKMerData s;
-
- s.idx = fidx;
- s.data = SubKMer(k.data() + from_, k.data() + to_);
-
- // Yay for NRVO!
- return s;
- }
-};
-
-class SubKMerStridedSerializer{
- size_t from_;
- size_t to_;
- size_t stride_;
-
-public:
- SubKMerStridedSerializer(size_t from, size_t stride)
- :from_(from), stride_(stride) { VERIFY(from_ + stride_ <= hammer::K); }
-
- SubKMerData serialize(hammer::HKMer k, size_t fidx) const {
- SubKMerData s;
-
- s.idx = fidx;
-
- size_t sz = (hammer::K - from_ + stride_ - 1) / stride_;
-
- std::vector<hammer::HKMer::DataType> v(sz);
- for (size_t i = from_, j = 0; i < hammer::K; i+= stride_, ++j)
- v[j] = k[i];
-
- s.data = SubKMer(&v[0], &v[0] + sz);
-
- // Yay for NRVO!
- return s;
- }
-};
-
-class SubKMerBlockFile {
- MMappedReader ifs_;
-
- public:
- SubKMerBlockFile(const std::string &fname, bool unlink = false)
- : ifs_(fname, unlink) { }
-
- bool get_block(std::vector<size_t> &block) {
- block.clear();
-#if 0
- block.shrink_to_fit();
-#else
- std::vector<size_t>().swap(block);
-#endif
-
- if (!ifs_.good())
- return false;
-
- size_t sz;
- ifs_.read((char*)&sz, sizeof(sz));
- block.resize(sz);
- for (size_t i = 0; i < sz; ++i) {
- SubKMerData s;
- binary_read(ifs_, s);
- block[i] = s.idx;
- }
-
- return true;
- }
-};
-
-template<class Writer,
- class SubKMerSerializer>
-void serialize(Writer &os,
- const KMerData &data, const std::vector<size_t> *block = NULL,
- const SubKMerSerializer &serializer = SubKMerSerializer()) {
- size_t sz = (block == NULL ? data.size() : block->size());
- os.write((char*)&sz, sizeof(sz));
- for (size_t i = 0, e = sz; i != e; ++i) {
- size_t idx = (block == NULL ? i : (*block)[i]);
- SubKMerData s = serializer.serialize(data[idx].kmer, idx);
- binary_write(os, s);
- }
-}
-
-class SubKMerSplitter {
- const std::string ifname_;
- const std::string ofname_;
-
- public:
- SubKMerSplitter(const std::string &ifname, const std::string &ofname)
- : ifname_(ifname), ofname_(ofname) {}
-
- template<class Writer>
- void serialize(Writer &os,
- const std::vector<SubKMerData>::iterator &start,
- const std::vector<SubKMerData>::iterator &end) {
- size_t sz = end - start;
-
- os.write((char*)&sz, sizeof(sz));
- for (auto I = start, E = end; I != E; ++I)
- binary_write(os, *I);
- }
-
- template<class Reader>
- void deserialize(std::vector<SubKMerData> &res,
- Reader &is) {
- res.clear();
-#if 0
- res.shrink_to_fit();
-#else
- std::vector<SubKMerData>().swap(res);
-#endif
-
- size_t sz;
- is.read((char*)&sz, sizeof(sz));
- res.resize(sz);
-
- for (size_t i = 0, e = sz; i != e; ++i)
- binary_read(is, res[i]);
- }
-
- std::pair<size_t, size_t> split();
-};
-
-class KMerHamClusterer {
- unsigned tau_;
-
- public:
- KMerHamClusterer(unsigned tau)
- : tau_(tau) {}
-
- void cluster(const std::string &prefix, const KMerData &data, ConcurrentDSU &uf);
- private:
- DECL_LOGGER("Hamming Clustering");
-};
-
-#endif // HAMMER_SUBKMER_SORTER_HPP
diff --git a/src/ionhammer/kmer_data.cpp b/src/ionhammer/kmer_data.cpp
deleted file mode 100644
index 8ab468e..0000000
--- a/src/ionhammer/kmer_data.cpp
+++ /dev/null
@@ -1,245 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#include "kmer_data.hpp"
-#include "config_struct.hpp"
-#include "valid_hkmer_generator.hpp"
-
-#include "io/mmapped_writer.hpp"
-#include "io/file_reader.hpp"
-#include "io/read_processor.hpp"
-
-#include <libcxx/sort.hpp>
-
-using namespace hammer;
-
-class BufferFiller;
-
-class HammerKMerSplitter : public KMerSplitter<hammer::HKMer> {
- typedef std::vector<std::vector<HKMer> > KMerBuffer;
-
- void DumpBuffers(size_t num_files, size_t nthreads,
- std::vector<KMerBuffer> &buffers,
- const path::files_t &ostreams) const;
-
- public:
- HammerKMerSplitter(const std::string &work_dir)
- : KMerSplitter<hammer::HKMer>(work_dir, hammer::K) {}
-
- virtual path::files_t Split(size_t num_files);
-
- friend class BufferFiller;
-};
-
-void HammerKMerSplitter::DumpBuffers(size_t num_files, size_t nthreads,
- std::vector<KMerBuffer> &buffers,
- const path::files_t &ostreams) const {
-# pragma omp parallel for num_threads(nthreads)
- for (unsigned k = 0; k < num_files; ++k) {
- size_t sz = 0;
- for (size_t i = 0; i < nthreads; ++i)
- sz += buffers[i][k].size();
-
- std::vector<HKMer> SortBuffer;
- SortBuffer.reserve(sz);
- for (size_t i = 0; i < nthreads; ++i) {
- KMerBuffer &entry = buffers[i];
- SortBuffer.insert(SortBuffer.end(), entry[k].begin(), entry[k].end());
- }
- libcxx::sort(SortBuffer.begin(), SortBuffer.end(), HKMer::less2_fast());
- auto it = std::unique(SortBuffer.begin(), SortBuffer.end());
-
-# pragma omp critical
- {
- FILE *f = fopen(ostreams[k].c_str(), "ab");
- VERIFY_MSG(f, "Cannot open temporary file to write");
- fwrite(SortBuffer.data(), sizeof(HKMer), it - SortBuffer.begin(), f);
- fclose(f);
- }
- }
-
- for (unsigned i = 0; i < nthreads; ++i) {
- for (unsigned j = 0; j < num_files; ++j) {
- buffers[i][j].clear();
- }
- }
-}
-
-class BufferFiller {
- std::vector<HammerKMerSplitter::KMerBuffer> &tmp_entries_;
- unsigned num_files_;
- size_t cell_size_;
- size_t processed_;
- const HammerKMerSplitter &splitter_;
-
- public:
- BufferFiller(std::vector<HammerKMerSplitter::KMerBuffer> &tmp_entries, size_t cell_size, const HammerKMerSplitter &splitter):
- tmp_entries_(tmp_entries), num_files_((unsigned)tmp_entries[0].size()), cell_size_(cell_size), processed_(0), splitter_(splitter) {}
-
- size_t processed() const { return processed_; }
-
- bool operator()(const io::SingleRead &r) {
- ValidHKMerGenerator<hammer::K> gen(r);
- HammerKMerSplitter::KMerBuffer &entry = tmp_entries_[omp_get_thread_num()];
-
-# pragma omp atomic
- processed_ += 1;
-
- bool stop = false;
- while (gen.HasMore()) {
- HKMer seq = gen.kmer(); size_t idx;
-
- idx = splitter_.GetFileNumForSeq(seq, num_files_);
- entry[idx].push_back(seq);
- stop |= entry[idx].size() > cell_size_;
-
- seq = !seq;
-
- idx = splitter_.GetFileNumForSeq(seq, num_files_);
- entry[idx].push_back(seq);
- stop |= entry[idx].size() > cell_size_;
-
- gen.Next();
- }
-
- return stop;
- }
-};
-
-path::files_t HammerKMerSplitter::Split(size_t num_files) {
- unsigned nthreads = cfg::get().max_nthreads;
-
- INFO("Splitting kmer instances into " << num_files << " buckets. This might take a while.");
-
- // Determine the set of output files
- path::files_t out;
- for (unsigned i = 0; i < num_files; ++i)
- out.push_back(GetRawKMersFname(i));
-
- size_t reads_buffer_size = cfg::get().count_split_buffer;
- if (reads_buffer_size == 0) {
- reads_buffer_size = 536870912ull;
- size_t mem_limit = (size_t)((double)(get_free_memory()) / (nthreads * 3));
- INFO("Memory available for splitting buffers: " << (double)mem_limit / 1024.0 / 1024.0 / 1024.0 << " Gb");
- reads_buffer_size = std::min(reads_buffer_size, mem_limit);
- }
- size_t cell_size = reads_buffer_size / (num_files * sizeof(HKMer));
- // Set sane minimum cell size
- if (cell_size < 16384)
- cell_size = 16384;
-
- INFO("Using cell size of " << cell_size);
- std::vector<KMerBuffer> tmp_entries(nthreads);
- for (unsigned i = 0; i < nthreads; ++i) {
- KMerBuffer &entry = tmp_entries[i];
- entry.resize(num_files);
- for (unsigned j = 0; j < num_files; ++j) {
- entry[j].reserve((size_t)(1.1 * (double)cell_size));
- }
- }
-
- size_t n = 15;
- const auto& dataset = cfg::get().dataset;
- BufferFiller filler(tmp_entries, cell_size, *this);
- for (auto it = dataset.reads_begin(), et = dataset.reads_end(); it != et; ++it) {
- INFO("Processing " << *it);
- io::FileReadStream irs(*it, io::PhredOffset);
- hammer::ReadProcessor rp(nthreads);
- while (!irs.eof()) {
- rp.Run(irs, filler);
- DumpBuffers(num_files, nthreads, tmp_entries, out);
- VERIFY_MSG(rp.read() == rp.processed(), "Queue unbalanced");
-
- if (filler.processed() >> n) {
- INFO("Processed " << filler.processed() << " reads");
- n += 1;
- }
- }
- }
- INFO("Processed " << filler.processed() << " reads");
-
- return out;
-}
-
-static inline void Merge(KMerStat &lhs, const KMerStat &rhs) {
- if (lhs.count == 0)
- lhs.kmer = rhs.kmer;
-
- lhs.count += rhs.count;
- lhs.qual *= rhs.qual;
-}
-
-static void PushKMer(KMerData &data, HKMer kmer, double qual) {
- KMerStat &kmc = data[kmer];
- kmc.lock();
- Merge(kmc, KMerStat(1, kmer, qual));
- kmc.unlock();
-}
-
-static void PushKMerRC(KMerData &data, HKMer kmer, double qual) {
- kmer = !kmer;
-
- KMerStat &kmc = data[kmer];
- kmc.lock();
- Merge(kmc, KMerStat(1, kmer, qual));
- kmc.unlock();
-}
-
-class KMerDataFiller {
- KMerData &data_;
-
- public:
- KMerDataFiller(KMerData &data)
- : data_(data) {}
-
- bool operator()(const io::SingleRead &r) const {
- ValidHKMerGenerator<hammer::K> gen(r);
- while (gen.HasMore()) {
- HKMer kmer = gen.kmer();
- double correct = gen.correct_probability();
-
- PushKMer(data_, kmer, 1 - correct);
- PushKMerRC(data_, kmer, 1 - correct);
-
- gen.Next();
- }
-
- // Do not stop
- return false;
- }
-};
-
-void KMerDataCounter::FillKMerData(KMerData &data) {
- HammerKMerSplitter splitter(cfg::get().working_dir);
- KMerDiskCounter<hammer::HKMer> counter(cfg::get().working_dir, splitter);
- size_t sz = KMerIndexBuilder<HammerKMerIndex>(cfg::get().working_dir, num_files_, cfg::get().max_nthreads).BuildIndex(data.index_, counter);
-
- // Now use the index to fill the kmer quality information.
- INFO("Collecting K-mer information, this takes a while.");
- data.data_.resize(sz);
-
- const auto& dataset = cfg::get().dataset;
- for (auto it = dataset.reads_begin(), et = dataset.reads_end(); it != et; ++it) {
- INFO("Processing " << *it);
- io::FileReadStream irs(*it, io::PhredOffset);
- KMerDataFiller filler(data);
- hammer::ReadProcessor(cfg::get().max_nthreads).Run(irs, filler);
- }
-
- INFO("Collection done, postprocessing.");
-
- size_t singletons = 0;
- for (size_t i = 0; i < data.size(); ++i) {
- VERIFY(data[i].count);
-
- if (data[i].count == 1)
- singletons += 1;
- }
-
- INFO("Merge done. There are " << data.size() << " kmers in total. "
- "Among them " << singletons << " (" << 100.0 * double(singletons) / double(data.size()) << "%) are singletons.");
-}
diff --git a/src/ionhammer/kmer_data.hpp b/src/ionhammer/kmer_data.hpp
deleted file mode 100644
index 2c45f23..0000000
--- a/src/ionhammer/kmer_data.hpp
+++ /dev/null
@@ -1,124 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#ifndef __HAMMER_KMER_DATA_HPP__
-#define __HAMMER_KMER_DATA_HPP__
-
-#include "mph_index/kmer_index.hpp"
-#include "hkmer.hpp"
-
-#include <vector>
-
-#include <cstdlib>
-
-namespace hammer {
-
-struct KMerStat {
- size_t count;
- HKMer kmer;
- double qual;
- unsigned changeto;
- uint8_t lock_;
-
- KMerStat(size_t count = 0, HKMer kmer = HKMer(), double qual = 1.0, unsigned changeto = -1)
- : count(count), kmer(kmer), qual(qual), changeto(changeto), lock_(0) { }
-
- void lock() {
- while (__sync_val_compare_and_swap(&lock_, 0, 1) == 1)
- sched_yield();
- }
- void unlock() {
- lock_ = 0;
- __sync_synchronize();
- }
-};
-
-};
-
-typedef KMerIndex<kmer_index_traits<hammer::HKMer> > HammerKMerIndex;
-
-class KMerData {
- typedef std::vector<hammer::KMerStat> KMerDataStorageType;
-
- public:
- KMerData() {}
-
- size_t size() const { return data_.size(); }
- size_t capacity() const { return data_.capacity(); }
- void clear() {
- data_.clear();
- push_back_buffer_.clear();
- KMerDataStorageType().swap(data_);
- KMerDataStorageType().swap(push_back_buffer_);
- }
- size_t push_back(const hammer::KMerStat &k) {
- push_back_buffer_.push_back(k);
-
- return data_.size() + push_back_buffer_.size() - 1;
- }
-
- hammer::KMerStat& operator[](size_t idx) {
- size_t dsz = data_.size();
- return (idx < dsz ? data_[idx] : push_back_buffer_[idx - dsz]);
- }
- const hammer::KMerStat& operator[](size_t idx) const {
- size_t dsz = data_.size();
- return (idx < dsz ? data_[idx] : push_back_buffer_[idx - dsz]);
- }
- hammer::KMerStat& operator[](hammer::HKMer s) { return operator[](index_.seq_idx(s)); }
- const hammer::KMerStat& operator[](hammer::HKMer s) const { return operator[](index_.seq_idx(s)); }
- size_t seq_idx(hammer::HKMer s) const { return index_.seq_idx(s); }
-
- template <class Writer>
- void binary_write(Writer &os) {
- size_t sz = data_.size();
- os.write((char*)&sz, sizeof(sz));
- os.write((char*)&data_[0], sz*sizeof(data_[0]));
- index_.serialize(os);
- }
-
- template <class Reader>
- void binary_read(Reader &is) {
- size_t sz = 0;
- is.read((char*)&sz, sizeof(sz));
- data_.resize(sz);
- is.read((char*)&data_[0], sz*sizeof(data_[0]));
- index_.deserialize(is);
- }
-
- private:
- KMerDataStorageType data_;
- KMerDataStorageType push_back_buffer_;
- HammerKMerIndex index_;
-
- friend class KMerDataCounter;
-};
-
-struct CountCmp {
- const KMerData &kmer_data_;
-
- CountCmp(const KMerData &kmer_data)
- : kmer_data_(kmer_data) {}
-
- bool operator()(unsigned lhs, unsigned rhs) {
- return kmer_data_[lhs].count > kmer_data_[rhs].count;
- }
-};
-
-class KMerDataCounter {
- unsigned num_files_;
-
- public:
- KMerDataCounter(unsigned num_files) : num_files_(num_files) {}
-
- void FillKMerData(KMerData &data);
-
- private:
- DECL_LOGGER("K-mer Counting");
-};
-
-#endif // __HAMMER_KMER_DATA_HPP__
diff --git a/src/ionhammer/main.cpp b/src/ionhammer/main.cpp
deleted file mode 100644
index 0048cf8..0000000
--- a/src/ionhammer/main.cpp
+++ /dev/null
@@ -1,336 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#include "logger/log_writers.hpp"
-
-#include "io/file_reader.hpp"
-#include "io/bam_reader.hpp"
-#include "io/paired_readers.hpp"
-#include "io/osequencestream.hpp"
-#include "io/read_processor.hpp"
-
-#include "adt/concurrent_dsu.hpp"
-
-#include "segfault_handler.hpp"
-#include "memory_limit.hpp"
-
-#include "HSeq.hpp"
-#include "kmer_data.hpp"
-#include "hamcluster.hpp"
-#include "subcluster.hpp"
-#include "err_helper_table.hpp"
-#include "read_corrector.hpp"
-#include "expander.hpp"
-#include "config_struct.hpp"
-
-#include "openmp_wrapper.h"
-
-#include "version.hpp"
-
-#include <yaml-cpp/yaml.h>
-#include <fstream>
-#include <iomanip>
-
-#include <bamtools/api/BamReader.h>
-#include <bamtools/api/SamHeader.h>
-
-void create_console_logger() {
- using namespace logging;
-
- logger *lg = create_logger("");
- lg->add_writer(std::make_shared<console_writer>());
- attach_logger(lg);
-}
-
-struct UfCmp {
- bool operator()(const std::vector<unsigned long> &lhs,
- const std::vector<unsigned long> &rhs) {
- return lhs.size() > rhs.size();
- }
-};
-
- // This is a weird workaround for a bug in gcc 4.4.7
-static bool stage(hammer_config::HammerStage start, hammer_config::HammerStage current) {
- switch (start) {
- case hammer_config::HammerStage::KMerCounting:
- return true;
- case hammer_config::HammerStage::HammingClustering:
- return current != hammer_config::HammerStage::KMerCounting;
- case hammer_config::HammerStage::SubClustering:
- return (current != hammer_config::HammerStage::KMerCounting &&
- current != hammer_config::HammerStage::HammingClustering);
- case hammer_config::HammerStage::ReadCorrection:
- return current == hammer_config::HammerStage::ReadCorrection;
- }
- assert(0);
-}
-
-int main(int argc, char** argv) {
- segfault_handler sh;
-
- srand(42);
- srandom(42);
-
- try {
- create_console_logger();
-
- std::string config_file = "hammer-it.cfg";
- if (argc > 1) config_file = argv[1];
- INFO("Starting IonHammer, built from " SPADES_GIT_REFSPEC ", git revision " SPADES_GIT_SHA1);
- INFO("Loading config from " << config_file.c_str());
- cfg::create_instance(config_file);
-
- // hard memory limit
- const size_t GB = 1 << 30;
- limit_memory(cfg::get().hard_memory_limit * GB);
-
- KMerData kmer_data;
- if (stage(cfg::get().start_stage, hammer_config::HammerStage::KMerCounting)) {
- // FIXME: Actually it's num_files here
- KMerDataCounter(32).FillKMerData(kmer_data);
- if (cfg::get().debug_mode) {
- INFO("Debug mode on. Saving K-mer index.");
- std::ofstream ofs(path::append_path(cfg::get().working_dir, "count.kmdata"), std::ios::binary);
- kmer_data.binary_write(ofs);
- }
- } else {
- INFO("Loading K-mer index.");
- std::ifstream ifs(path::append_path(cfg::get().working_dir, "count.kmdata"), std::ios::binary);
- VERIFY(ifs.good());
- kmer_data.binary_read(ifs);
- INFO("Total " << kmer_data.size() << " entries were loader");
- }
-
- std::vector<std::vector<size_t> > classes;
- if (stage(cfg::get().start_stage, hammer_config::HammerStage::HammingClustering)) {
- ConcurrentDSU uf(kmer_data.size());
- KMerHamClusterer clusterer(cfg::get().tau);
- INFO("Clustering Hamming graph.");
- clusterer.cluster(path::append_path(cfg::get().working_dir, "kmers.hamcls"), kmer_data, uf);
- uf.get_sets(classes);
- size_t num_classes = classes.size();
- INFO("Clustering done. Total clusters: " << num_classes);
-
- if (cfg::get().debug_mode) {
- INFO("Debug mode on. Writing down clusters.");
- std::ofstream ofs(path::append_path(cfg::get().working_dir, "hamming.cls"), std::ios::binary);
-
- ofs.write((char*)&num_classes, sizeof(num_classes));
- for (size_t i=0; i < classes.size(); ++i) {
- size_t sz = classes[i].size();
- ofs.write((char*)&sz, sizeof(sz));
- ofs.write((char*)&classes[i][0], sz * sizeof(classes[i][0]));
- }
- }
- } else {
- INFO("Loading clusters.");
- std::ifstream ifs(path::append_path(cfg::get().working_dir, "hamming.cls"), std::ios::binary);
- VERIFY(ifs.good());
-
- size_t num_classes = 0;
- ifs.read((char*)&num_classes, sizeof(num_classes));
- classes.resize(num_classes);
-
- for (size_t i = 0; i < num_classes; ++i) {
- size_t sz = 0;
- ifs.read((char*)&sz, sizeof(sz));
- classes[i].resize(sz);
- ifs.read((char*)&classes[i][0], sz * sizeof(classes[i][0]));
- }
- }
-
- size_t singletons = 0;
- for (size_t i = 0; i < classes.size(); ++i)
- if (classes[i].size() == 1)
- singletons += 1;
- INFO("Singleton clusters: " << singletons);
-
- if (stage(cfg::get().start_stage, hammer_config::HammerStage::SubClustering)) {
- size_t nonread = 0;
-#if 1
- INFO("Subclustering.");
-# pragma omp parallel for shared(nonread, classes, kmer_data)
- for (size_t i = 0; i < classes.size(); ++i) {
- auto& cluster = classes[i];
-
-# pragma omp atomic
- nonread += subcluster(kmer_data, cluster);
- }
-#else
- INFO("Assigning centers");
-# pragma omp parallel for shared(nonread, classes, kmer_data)
- for (size_t i = 0; i < classes.size(); ++i) {
- const auto& cluster = classes[i];
-# pragma omp atomic
- nonread += assign(kmer_data, cluster);
- }
-#endif
- INFO("Total " << nonread << " nonread kmers were generated");
-
- if (cfg::get().debug_mode) {
- INFO("Debug mode on. Saving K-mer index.");
- std::ofstream ofs(path::append_path(cfg::get().working_dir, "cluster.kmdata"), std::ios::binary);
- kmer_data.binary_write(ofs);
- }
- } else {
- INFO("Loading K-mer index.");
- std::ifstream ifs(path::append_path(cfg::get().working_dir, "cluster.kmdata"), std::ios::binary);
- VERIFY(ifs.good());
- kmer_data.binary_read(ifs);
- INFO("Total " << kmer_data.size() << " entries were loader");
- }
-
-#if 0
- INFO("Starting solid k-mers expansion in " << cfg::get().max_nthreads << " threads.");
- while (true) {
- Expander expander(kmer_data);
- const io::DataSet<> &dataset = cfg::get().dataset;
- for (auto I = dataset.reads_begin(), E = dataset.reads_end(); I != E; ++I) {
- io::FileReadStream irs(*I, io::PhredOffset);
- hammer::ReadProcessor rp(cfg::get().max_nthreads);
- rp.Run(irs, expander);
- VERIFY_MSG(rp.read() == rp.processed(), "Queue unbalanced");
- }
- INFO("" << expander.changed() << " solid k-mers were generated");
- if (expander.changed() == 0)
- break;
- }
-#endif
-
-#if 0
- std::ofstream fasta_ofs("centers.fasta");
- fasta_ofs << std::fixed << std::setprecision(6) << std::setfill('0');
- std::sort(classes.begin(), classes.end(), UfCmp());
- for (size_t i = 0; i < classes.size(); ++i) {
- auto& cluster = classes[i];
- std::sort(cluster.begin(), cluster.end(), CountCmp(kmer_data));
- hammer::HKMer c = center(kmer_data, cluster);
- size_t idx = kmer_data.seq_idx(c);
- if (kmer_data[idx].kmer == c) {
- fasta_ofs << '>' << std::setw(6) << i
- << "-cov_" << std::setw(0) << kmer_data[idx].count
- << "-qual_" << 1.0 - kmer_data[idx].qual;
-
- if (cluster.size() == 1)
- fasta_ofs << "_singleton";
- fasta_ofs << '\n' << c << '\n';
- }
- }
-#endif
-
- INFO("Correcting reads.");
- using namespace hammer::correction;
- SingleReadCorrector::NoDebug debug_pred;
- SingleReadCorrector::SelectAll select_pred;
- const auto& dataset = cfg::get().dataset;
- io::DataSet<> outdataset;
- size_t ilib = 0;
- for (auto it = dataset.library_begin(), et = dataset.library_end(); it != et; ++it, ++ilib) {
- const auto& lib = *it;
- auto outlib = lib;
- outlib.clear();
-
- size_t iread = 0;
- // First, correct all the paired FASTQ files
- for (auto I = lib.paired_begin(), E = lib.paired_end(); I != E; ++I, ++iread) {
- if (path::extension(I->first) == ".bam" || path::extension(I->second) == ".bam")
- continue;
-
- INFO("Correcting pair of reads: " << I->first << " and " << I->second);
-
- std::string usuffix = std::to_string(ilib) + "_" +
- std::to_string(iread) + ".cor.fasta";
-
- std::string outcorl = path::append_path(cfg::get().output_dir, path::basename(I->first) + usuffix);
- std::string outcorr = path::append_path(cfg::get().output_dir, path::basename(I->second) + usuffix);
-
- io::PairedOutputSequenceStream ors(outcorl, outcorr);
-
- io::SeparatePairedReadStream irs(I->first, I->second, 0, false, false);
- PairedReadCorrector read_corrector(kmer_data, debug_pred, select_pred);
- hammer::ReadProcessor(cfg::get().max_nthreads).Run(irs, read_corrector, ors);
-
- outlib.push_back_paired(outcorl, outcorr);
- }
-
- // Second, correct all the single FASTQ files
- for (auto I = lib.single_begin(), E = lib.single_end(); I != E; ++I, ++iread) {
- if (path::extension(*I) == ".bam")
- continue;
-
- INFO("Correcting " << *I);
-
- std::string usuffix = std::to_string(ilib) + "_" +
- std::to_string(iread) + ".cor.fasta";
-
- std::string outcor = path::append_path(cfg::get().output_dir, path::basename(*I) + usuffix);
- io::osequencestream ors(outcor);
-
- io::FileReadStream irs(*I, io::PhredOffset);
- SingleReadCorrector read_corrector(kmer_data, debug_pred, select_pred);
- hammer::ReadProcessor(cfg::get().max_nthreads).Run(irs, read_corrector, ors);
-
- outlib.push_back_single(outcor);
- }
-
- // Finally, correct all the BAM stuff in a row
- for (auto I = lib.reads_begin(), E = lib.reads_end(); I != E; ++I, ++iread) {
- if (path::extension(*I) != ".bam")
- continue;
-
- INFO("Correcting " << *I);
-
- std::string usuffix = std::to_string(ilib) + "_" +
- std::to_string(iread) + ".cor.fasta";
-
- std::string outcor = path::append_path(cfg::get().output_dir, path::basename(*I) + usuffix);
- io::osequencestream ors(outcor);
-
- BamTools::BamReader bam_reader;
- bam_reader.Open(*I);
- auto header = bam_reader.GetHeader();
- bam_reader.Close();
-
- SingleReadCorrector read_corrector(kmer_data, &header, debug_pred, select_pred);
- io::UnmappedBamStream irs(*I);
- hammer::ReadProcessor(cfg::get().max_nthreads).Run(irs, read_corrector, ors);
-
- outlib.push_back_single(outcor);
- }
-
- outdataset.push_back(outlib);
- }
- cfg::get_writable().dataset = outdataset;
-
- std::string fname = path::append_path(cfg::get().output_dir, "corrected.yaml");
- INFO("Saving corrected dataset description to " << fname);
- cfg::get().dataset.save(fname);
-
-#if 0
- std::sort(classes.begin(), classes.end(), UfCmp());
- for (size_t i = 0; i < classes.size(); ++i) {
- auto& cluster = classes[i];
- std::sort(cluster.begin(), cluster.end(), CountCmp(kmer_data));
- dump(kmer_data, cluster);
- }
-#endif
- } catch (std::bad_alloc const& e) {
- std::cerr << "Not enough memory to run BayesHammer. " << e.what() << std::endl;
- return EINTR;
- } catch (const YAML::Exception &e) {
- std::cerr << "Error reading config file: " << e.what() << std::endl;
- return EINTR;
- } catch (std::exception const& e) {
- std::cerr << "Exception caught " << e.what() << std::endl;
- return EINTR;
- } catch (...) {
- std::cerr << "Unknown exception caught " << std::endl;
- return EINTR;
- }
-
- return 0;
-}
diff --git a/src/ionhammer/read_corrector.hpp b/src/ionhammer/read_corrector.hpp
deleted file mode 100644
index ad01e26..0000000
--- a/src/ionhammer/read_corrector.hpp
+++ /dev/null
@@ -1,1220 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#ifndef __HAMMER_IT_READ_CORRECTOR_HPP__
-#define __HAMMER_IT_READ_CORRECTOR_HPP__
-
-#include "HSeq.hpp"
-#include "flow_space_read.hpp"
-#include "hkmer_distance.hpp"
-#include "consensus.hpp"
-#include "valid_hkmer_generator.hpp"
-#include "config_struct.hpp"
-#include "io/single_read.hpp"
-
-#include <boost/optional.hpp>
-#include <boost/numeric/ublas/matrix.hpp>
-#include <boost/numeric/ublas/storage.hpp>
-
-#include <bamtools/api/BamAlignment.h>
-#include <bamtools/api/SamHeader.h>
-#include "seqeval/BaseHypothesisEvaluator.h"
-
-#include <deque>
-#include <vector>
-#include <iterator>
-#include <limits>
-#include <cassert>
-#include <list>
-#include <string>
-#include <algorithm>
-#include <fstream>
-
-#if 1
-#include "sequence/nucl.hpp"
-#include <iostream>
-#include <iomanip>
-#endif
-
-namespace hammer {
-namespace correction {
-
-namespace numeric = boost::numeric::ublas;
-
-typedef numeric::matrix<double> ScoreMatrix;
-typedef std::vector<ScoreMatrix> ScoreStorage;
-
-template <typename It1, typename It2>
-static bool exactAlignH(It1 a_begin, It1 a_initial_pos, It1 a_end,
- It2 b_initial_pos, It2 /*b_end*/,
- uint8_t max_offset, uint8_t n_cmp, int* p_offset)
-{
- int M = max_offset * 2 + 1;
- for (int i = 0; i < M; i++) {
- int offset = (i / 2) * ((i & 1) ? 1 : -1); // 0, -1, 1, -2, 2, ...
- auto a_it = a_initial_pos + offset;
- auto b_it = b_initial_pos;
- if (a_it < a_begin || a_it + n_cmp > a_end)
- continue;
- bool match = true;
- for (size_t j = 0; j < n_cmp; j++)
- if ((a_it + j)->raw != (b_it + j)->raw) {
- match = false;
- break;
- }
- if (match) {
- *p_offset = offset;
- return true;
- }
- }
- return false;
-}
-
-template <typename It1, typename It2>
-static int overlapAlignH(It1 a_begin, It1 a_end, It2 b_begin, It2 b_end,
- uint8_t max_offset)
-{
- // TODO: use dynamic programming
- int M = max_offset * 2 + 1;
- int best_offset = 0;
- int best_score = 0;
- for (int i = 0; i < M; i++) {
- int offset = (i / 2) * ((i & 1) ? 1 : -1); // 0, -1, 1, -2, 2, ...
- auto a_it = offset < 0 ? a_begin : a_begin + offset;
- auto b_it = offset < 0 ? b_begin - offset : b_begin;
- if (b_it < b_begin || a_it >= a_end)
- continue;
- int score = 0;
- for ( ; a_it != a_end && b_it != b_end; ++a_it, ++b_it)
- if (a_it->nucl == b_it->nucl)
- score += std::min(a_it->len, b_it->len);
- score -= i / 4;
- if (score > best_score) {
- best_offset = offset;
- best_score = score;
- }
- }
- return best_offset;
-}
-
-
-struct Score {
- short value;
- short dir;
- Score(short v, short d) : value(v), dir(d) {}
-};
-
-#if 1
-template <typename It1, typename It2>
-static void dump(boost::numeric::ublas::matrix<Score> &scores,
- It1 x_begin, It1 x_end, It2 y_begin, It2 y_end) {
- std::cerr << " ";
- for (auto it = x_begin; it != x_end; ++it)
- std::cerr << std::setw(3) << int(it->len) << nucl(it->nucl);
- std::cerr << "\n ";
- auto m = x_end - x_begin;
- auto n = y_end - y_begin;
- for (int i = 0; i <= m; i++)
- std::cerr << std::setw(4) << scores(i, 0).value;
- std::cerr << '\n';
- for (int i = 1; i <= n; i++) {
- auto run = *(y_begin + i - 1);
- std::cerr << std::setw(2) << int(run.len) << nucl(run.nucl) << ' ';
- for (int j = 0; j <= m; j++)
- std::cerr << std::setw(4) << scores(j, i).value;
- std::cerr << '\n';
- }
-}
-#endif
-
-template <typename It1, typename It2>
-static int alignH(It1 read_begin, It1 read_end,
- It2 consensus_begin, It2 consensus_end,
- int approx_read_offset, size_t n_skip_consensus,
- uint8_t n_side = 5, uint8_t n_cmp = 8) {
-
- int left_offset = n_side;
- int read_len = int(read_end - read_begin);
- int consensus_len = int(consensus_end - consensus_begin);
-
- It1 x_begin = read_begin + std::max(approx_read_offset - n_side, 0);
- if (x_begin == read_begin)
- left_offset = approx_read_offset;
-
- if (approx_read_offset - n_side + n_cmp >= read_len) {
- x_begin = read_end - std::min(n_cmp + 2 * n_side, read_len);
- left_offset = int(read_begin + approx_read_offset - x_begin);
- }
-
- auto x_end = x_begin + std::min(int(2 * n_side + n_cmp),
- int(read_end - x_begin));
-
- auto y_begin = consensus_begin +
- std::min(int(n_skip_consensus), consensus_len);
- if (y_begin == consensus_end)
- return 0; // weird situation
- auto y_end = y_begin + std::min(int(n_cmp),
- int(consensus_end - y_begin));
-
- // glocal alignment of homopolymer runs
- const short kDirUpLeft = 0;
- const short kDirUp = 1;
- const short kDirLeft = 2;
-
- const short kBaseDiff = -3;
- const short kRunInsertionStart = -4;
- const short kRunInsertionExtend = -5;
- const short kRunDeletionStart = -4;
- const short kRunDeletionExtend = -5;
- const short kNuclMismatch = -5;
- const short kNuclMatch = 1;
- const short kFullMatch = 5;
-
- int m = int(x_end - x_begin);
- int n = int(y_end - y_begin);
-
- using namespace boost::numeric::ublas;
- matrix<Score> scores(m + 1, n + 1, Score(0, 0));
-
- size_t highest_x = 0, highest_y = 0;
- int highest_entry = std::numeric_limits<int>::min();
-
- for (int i = 1; i <= m; i++) {
- for (int j = 1; j <= n; j++) {
- int best_score = std::numeric_limits<int>::min();
- short best_dir = 0;
-
- auto run_x = *(x_begin + i - 1);
- auto run_y = *(y_begin + j - 1);
-
- int score;
- if (run_x.raw == run_y.raw) {
- score = kNuclMatch * run_x.len + scores(i - 1, j - 1).value;
- score += kFullMatch;
- if (score > best_score) {
- best_score = score;
- best_dir = kDirUpLeft;
- }
- } else if (run_x.nucl == run_y.nucl) {
- score = kBaseDiff * std::abs(run_x.len - run_y.len);
- score += kNuclMatch * std::min(run_x.len, run_y.len);
- score += scores(i - 1, j - 1).value;
- if (score > best_score) {
- best_score = score;
- best_dir = kDirUpLeft;
- }
- } else {
- score = scores(i - 1, j - 1).value;
- score += kNuclMismatch * std::max(run_x.len, run_y.len);
-
- if (score > best_score) {
- best_score = score;
- best_dir = kDirUpLeft;
- }
- }
-
- int multiplier;
-
- if (scores(i - 1, j).dir == kDirUp)
- multiplier = kRunDeletionExtend;
- else
- multiplier = kRunDeletionStart;
- score = scores(i - 1, j).value + multiplier * run_x.len;
- if (score > best_score) {
- best_score = score;
- best_dir = kDirUp;
- }
-
- if (scores(i, j - 1).dir == kDirLeft)
- multiplier = kRunInsertionStart;
- else
- multiplier = kRunInsertionExtend;
- score = scores(i, j - 1).value + multiplier * run_y.len;
- if (score > best_score) {
- best_score = score;
- best_dir = kDirLeft;
- }
-
- scores(i, j) = Score(static_cast<short>(best_score), best_dir);
-
- if (i == m || j == n) {
- const int kOffset = 4;
- int approx_offset = i - j - left_offset;
- int offset_penalty = std::abs(approx_offset) * kOffset;
- if (best_score - offset_penalty > highest_entry) {
- highest_entry = best_score - offset_penalty;
- highest_x = i;
- highest_y = j;
- }
- }
- }
- }
-
- int min_acceptable_score = ((kNuclMatch + kFullMatch) * n_cmp * 4) / 5;
- if (scores(highest_x, highest_y).value < min_acceptable_score && n_cmp < 16U)
- return alignH(read_begin, read_end,
- consensus_begin, consensus_end,
- approx_read_offset, n_skip_consensus,
- n_side, uint8_t(n_cmp * 2));
-
- int x = int(highest_x);
- int y = int(highest_y);
- while (x > 0 && y > 0) {
- int dir = scores(x, y).dir;
- switch (dir) {
- case kDirUp:
- --x; break;
- case kDirLeft:
- --y; break;
- case kDirUpLeft:
- --x, --y; break;
- default:
- break;
- }
- }
-
-#if 0
- if (std::abs(x - y - left_offset) >= 4)
- dump(scores, x_begin, x_end, y_begin, y_end);
-#endif
-
- return x - y - left_offset;
-}
-
-// Not used now
-class HKMerProlonger {
- const KMerData& kmer_data_;
-
- public:
- struct RightSide {
- static size_t changingPosition() { return hammer::K - 1; }
- static hammer::HKMer shift(const hammer::HKMer &kmer) {
- hammer::HKMer res;
- for (size_t i = 1; i < hammer::K; ++i)
- res[i - 1] = kmer[i];
- return res;
- }
- template <typename T, typename U>
- static void append(T& cont, U obj) { cont.push_back(obj); }
- };
-
- struct LeftSide {
- static size_t changingPosition() { return 0; }
- static hammer::HKMer shift(const hammer::HKMer &kmer) {
- hammer::HKMer res;
- for (size_t i = 1; i < hammer::K; ++i)
- res[i] = kmer[i - 1];
- return res;
- }
- template <typename T, typename U>
- static void append(T& cont, U obj) { cont.push_front(obj); }
- };
-
- public:
-
- /// @param[in] seed kmer to prolong
- /// @param[in] bases_to_recover maximum number of bases to recover
- /// @param[in] side side to prolong to (RightSide/LeftSide)
- template <typename Side>
- std::deque<hammer::HomopolymerRun> prolong(const hammer::HKMer &seed,
- size_t bases_to_recover,
- Side side) {
- std::deque<hammer::HomopolymerRun> good_runs(hammer::K);
- for (size_t i = 0; i < hammer::K; ++i)
- good_runs[i] = seed[i];
-
- auto good_kmer = seed;
- auto changing_pos = Side::changingPosition();
-
- for (size_t recov = 0; recov < bases_to_recover; ++recov) {
- double inf = -std::numeric_limits<double>::infinity();
- double best_qual = inf;
- int best_nucl = -1;
- int best_len = -1;
- double next_best_qual = inf;
-
- auto kmer = Side::shift(good_kmer);
-
- for (size_t nucl = 0; nucl < 4; ++nucl) {
- if (nucl == good_kmer[changing_pos].nucl)
- continue;
- for (size_t len = 1; len <= 4; ++len) {
- kmer[changing_pos] = hammer::HomopolymerRun(nucl, len);
- auto &k = kmer_data_[kmer];
- auto qual = k.count * (1 - k.qual);
- if (qual > best_qual) {
- next_best_qual = best_qual;
- best_qual = qual;
- best_nucl = nucl;
- best_len = len;
- }
- }
- }
-
- // stop if high-quality kmer is not unique
- if (best_nucl == -1 || best_qual - next_best_qual < 0.8 * best_qual)
- break;
-
- kmer[changing_pos] = hammer::HomopolymerRun(best_nucl, best_len);
- Side::append(good_runs, kmer[changing_pos]);
- good_kmer = kmer;
- }
-
- return good_runs;
- }
-
- public:
- HKMerProlonger(const KMerData& kmer_data) : kmer_data_(kmer_data) {}
-};
-
-static const double kLowScoreThreshold = 1.0;
-
-class CorrectedRead {
- FlowSpaceRead raw_read_; // Uncorrected read
- const KMerData& kmer_data_;
- bool debug_mode_;
-
- // Stores runs after joining chunks
- std::vector<hammer::HomopolymerRun> corrected_runs_;
-
- // Contiguous part of read with strong consensus
- struct ConsensusChunk {
- int approx_read_offset; // in the vector of raw read runs
- int approx_end_read_offset_;
- unsigned rollback_end; // remove if don't align well
-
- int initial_read_offset_;
-
- enum {
- kChunkLeftAligned,
- kChunkRightAligned,
- kChunkNotAligned
- } alignment;
-
- const FlowSpaceRead& raw_read;
- size_t trimmed_left;
- size_t trimmed_right;
- bool debug_mode;
-
- std::vector<hammer::HomopolymerRun> consensus;
- std::vector<double> consensus_scores;
-
- int raw_start_offset() const {
- return initial_read_offset_;
- }
-
- ConsensusChunk(int initial_read_offset,
- int approximate_read_offset,
- int approximate_end_read_offset,
- const ScoreStorage &scores,
- unsigned rollback_end,
- const FlowSpaceRead &read,
- bool debug_mode)
- : approx_read_offset(approximate_read_offset),
- approx_end_read_offset_(approximate_end_read_offset),
- rollback_end(rollback_end),
- initial_read_offset_(initial_read_offset),
- alignment(kChunkNotAligned), raw_read(read),
- trimmed_left(0), trimmed_right(0), debug_mode(debug_mode)
- {
- bool left_trim = true;
- for (size_t i = 0; i < scores.size(); ++i) {
- auto run = hammer::iontorrent::consensus(scores[i]);
-
- // trim low-quality runs from the left side
- if (run.second <= kLowScoreThreshold && left_trim) {
- approx_read_offset += 1;
- trimmed_left += 1;
- continue;
- }
-
- if (debug_mode && left_trim) {
- std::cerr << "[ConsensusChunk] trimmed from left: " << trimmed_left << std::endl;
- std::cerr << "[ConsensusChunk] approx. read offset: " << approx_read_offset << std::endl;
- }
-
- left_trim = false;
- VERIFY(run.first.len > 0);
- consensus.push_back(run.first);
- consensus_scores.push_back(run.second);
- }
-
- size_t right_end = consensus_scores.size();
- if (right_end == 0)
- return;
-
- while (consensus_scores[right_end - 1] <= kLowScoreThreshold) {
- --right_end;
- if (right_end == 0)
- break;
- }
-
- trimmed_right = consensus.size() - right_end;
- consensus.resize(right_end);
- consensus_scores.resize(right_end);
- }
-
- void AlignLeftEndAgainstRead(size_t skip=0) {
- const auto& data = raw_read.data();
-
- int offset = alignH(data.begin(), data.end(),
- consensus.begin(), consensus.end(),
- approx_read_offset, skip);
-
- if (debug_mode) {
- std::cerr << "[approx. read offset (left)] before: " << approx_read_offset << "; after: "
- << approx_read_offset + offset << std::endl;
- }
-
- approx_read_offset += offset;
- alignment = kChunkLeftAligned;
- }
-
- void AlignRightEndAgainstRead(size_t skip=0) {
- const auto& data = raw_read.data();
- int position_on_read = approx_end_read_offset_ - 1;
- int offset = alignH(data.rbegin(), data.rend(),
- consensus.rbegin(), consensus.rend(),
- int(data.size()) - 1 - position_on_read, skip);
- if (debug_mode) {
- std::cerr << "[approx. read offset (right)] before: " << approx_read_offset << "; after: "
- << approx_read_offset - offset << std::endl;
- }
- approx_read_offset -= offset;
- alignment = kChunkRightAligned;
- }
-
- int approx_end_read_offset() const {
- return approx_end_read_offset_;
- }
-
- int approx_end_read_offset_untrimmed() const {
- return approx_end_read_offset() + int(trimmed_right);
- }
-
- private:
- void RollBack() {
- trimmed_right += rollback_end;
- auto old_size = consensus.size();
- VERIFY(old_size >= rollback_end);
- consensus.resize(old_size - rollback_end);
- approx_end_read_offset_ -= rollback_end;
- consensus_scores.resize(old_size - rollback_end);
- rollback_end = 0;
- }
-
- bool DoMerge(ConsensusChunk& chunk) {
- int right_end_offset = approx_end_read_offset();
-
- if (debug_mode) {
- std::cerr << "============== Merging chunks ===============" << std::endl;
- std::cerr << "(" << approx_read_offset << " .. " << right_end_offset << ")";
- std::cerr << " -- (" << chunk.approx_read_offset << " .. " << chunk.approx_end_read_offset() << ")" << std::endl;
-
- int white_l = 0;
- for (int i = right_end_offset - 1; i >= 0; --i)
- white_l += raw_read[i].len;
- for (size_t i = 0; i < consensus.size(); ++i)
- white_l -= consensus[i].len;
- for (int i = 0; i < white_l; ++i)
- std::cerr << ' ';
- for (size_t i = std::max(-white_l, 0); i < consensus.size(); ++i)
- std::cerr << consensus[i].str();
- std::cerr << std::endl;
-
- for (int i = 0; i < chunk.approx_read_offset; ++i)
- for (int j = 0; j < raw_read[i].len; ++j)
- std::cerr << ' ';
- for (size_t i = 0; i < chunk.consensus.size(); ++i)
- std::cerr << chunk.consensus[i].str();
- std::cerr << std::endl;
- }
-
- if (right_end_offset <= chunk.approx_read_offset) {
-
- for (int i = right_end_offset; i < chunk.approx_read_offset; ++i) {
- if (i >= static_cast<int>(raw_read.size()))
- return false;
- consensus.push_back(raw_read[i]);
- alignment = kChunkNotAligned;
-
- // TODO: maintain quality scores in raw_read_
- consensus_scores.push_back(0);
- }
-
- consensus.insert(consensus.end(),
- chunk.consensus.begin(), chunk.consensus.end());
-
- consensus_scores.insert(consensus_scores.end(),
- chunk.consensus_scores.begin(),
- chunk.consensus_scores.end());
-
- } else {
- int overlap = right_end_offset - chunk.approx_read_offset;
- overlap -= overlapAlignH(consensus.end() - overlap,
- consensus.end(),
- chunk.consensus.begin(),
- chunk.consensus.begin() + overlap,
- 5);
-
- if (overlap > static_cast<int>(chunk.consensus.size()))
- return false;
-
- if (overlap < 0) {
- chunk.approx_read_offset = right_end_offset - overlap;
- return DoMerge(chunk);
- }
-
- int n_trim = 0;
- int n_runs = int(consensus.size());
-
- // FIXME
- if (overlap > 0 && rollback_end > 0) {
- for (int i = 0; i < overlap; i++) {
- if (n_runs - overlap + i < 0 || n_runs - overlap + i >= consensus.size())
- continue;
- auto left_run = consensus[n_runs - overlap + i];
- auto right_run = chunk.consensus[i];
- if (left_run != right_run) {
- RollBack();
- AlignRightEndAgainstRead();
- return DoMerge(chunk);
- }
- }
- }
-
- if (overlap >= 3 && n_runs > overlap) {
- for ( ; n_trim < overlap / 3; ++n_trim) {
- auto score1 = consensus_scores[n_runs - n_trim - 1];
- auto score2 = chunk.consensus_scores[overlap - n_trim - 1];
- if (score1 > score2)
- break;
- }
-
- consensus.resize(consensus.size() - n_trim);
- consensus_scores.resize(consensus_scores.size() - n_trim);
- }
-
- consensus.insert(consensus.end(),
- chunk.consensus.begin() + overlap - n_trim,
- chunk.consensus.end());
-
- consensus_scores.insert(consensus_scores.end(),
- chunk.consensus_scores.begin() + overlap - n_trim,
- chunk.consensus_scores.end());
- }
-
- approx_end_read_offset_ = chunk.approx_end_read_offset();
- return true;
- }
-
- bool MergeWithDisjointChunk(ConsensusChunk& chunk) {
- if (debug_mode)
- std::cerr << "[MergeWithDisjointChunk]" << std::endl;
- AlignRightEndAgainstRead();
- if (chunk.alignment != kChunkLeftAligned)
- chunk.AlignLeftEndAgainstRead();
- return DoMerge(chunk);
- }
-
- bool MergeWithOverlappingChunk(ConsensusChunk& chunk) {
- if (debug_mode)
- std::cerr << "[MergeWithOverlappingChunk]" << std::endl;
- int right_end_offset = approx_end_read_offset_;
- size_t overlap = right_end_offset - chunk.approx_read_offset;
- if (overlap > chunk.consensus_scores.size())
- return false;
-
- AlignRightEndAgainstRead();
- if (chunk.alignment != kChunkLeftAligned)
- chunk.AlignLeftEndAgainstRead();
- return DoMerge(chunk);
- }
-
- public:
-
- bool TryMergeWith(ConsensusChunk& chunk) {
- if (chunk.consensus.empty())
- return true;
-
- alignment = kChunkNotAligned;
- int right_end_offset = approx_end_read_offset_;
-
- if (right_end_offset <= chunk.approx_read_offset)
- return MergeWithDisjointChunk(chunk);
- else
- return MergeWithOverlappingChunk(chunk);
- }
-
- };
-
- // Chunks where strong consensus was obtained
- std::list<ConsensusChunk> chunks_;
- int trimmed_by_gen_;
-
- void PushChunk(const ScoreStorage &scores,
- int initial_read_offset,
- int approx_read_offset,
- int approx_end_read_offset,
- unsigned rollback_end) {
- chunks_.push_back(ConsensusChunk(initial_read_offset, approx_read_offset,
- approx_end_read_offset, scores,
- rollback_end, raw_read_, debug_mode_));
- if (debug_mode_) {
- auto &consensus = chunks_.back().consensus;
- size_t len = consensus.size();
- size_t nucl_len = 0;
- for (size_t i = 0; i < len; ++i)
- nucl_len += consensus[i].len;
- }
-
- chunks_.back().AlignLeftEndAgainstRead();
- if (chunks_.size() == 1)
- trimmed_by_gen_ = chunks_.back().raw_start_offset();
- }
-
- const ConsensusChunk& LastChunk() const {
- return chunks_.back();
- }
-
- class ChunkCollector {
- CorrectedRead &cread_;
- const KMerData &kmer_data_;
- bool debug_mode_;
-
- ValidHKMerGenerator<hammer::K> gen;
- int pos;
- unsigned skipped;
- int raw_pos;
-
- struct Center {
- hammer::HKMer seq;
- int end_offset;
- };
-
- Center last_good_center;
- bool last_good_center_is_defined;
- bool is_first_center;
- bool replacing;
- int rollback_size;
-
- bool need_to_align;
-
- int approx_read_offset;
- int approx_end_read_offset;
- ScoreStorage scores;
- int chunk_pos;
- int raw_chunk_start_pos;
-
- unsigned approx_n_insertions;
-
- Center GetCenterOfCluster(const hammer::HKMer &seq, int start_pos) const {
- hammer::KMerStat k[2];
- k[0] = kmer_data_[kmer_data_[seq].changeto];
- k[1] = kmer_data_[kmer_data_[!seq].changeto];
- k[1].kmer = !k[1].kmer;
-
- if (k[0].qual > k[1].qual)
- std::swap(k[0], k[1]);
- using namespace hammer;
- for (size_t i = 0; i < 2; ++i) {
- auto &kmer = k[i].kmer;
- int end_diff;
- auto dist = distanceHKMer(kmer.begin(), kmer.end(), seq.begin(), seq.end(), 3, &end_diff);
- if (debug_mode_) {
- std::cerr << "[GetCenterOfCluster] distance("
- << seq << ", " << kmer << ") = " << dist << std::endl;
-
- }
- if (dist <= 2) {
- return Center{kmer, start_pos + int(hammer::K) + end_diff};
- }
- }
- return Center{seq, start_pos + int(hammer::K)};
- }
-
- bool IsInconsistent(const Center ¢er) const {
- if (!last_good_center_is_defined)
- return false;
-
- for (size_t i = 0; i < hammer::K - skipped - 1; ++i)
- if (last_good_center.seq[i + skipped + 1].nucl != center.seq[i].nucl)
- return true;
-
- return false;
- }
-
- void FlushCurrentChunk() {
- unsigned rollback_end = 0;
-
- if (replacing) {
- if (rollback_size < 0)
- rollback_size = 0;
- if (rollback_size < int(scores.size()))
- rollback_end = int(scores.size()) - rollback_size;
- replacing = false;
- rollback_size = 0;
- }
-
- if (scores.size() > hammer::K) {
- cread_.PushChunk(scores, raw_chunk_start_pos,
- approx_read_offset, approx_end_read_offset, rollback_end);
- pos = cread_.LastChunk().approx_end_read_offset_untrimmed() - hammer::K;
- pos += skipped;
- } else {
- pos -= approx_n_insertions;
- }
-
- scores.clear();
- need_to_align = false;
- chunk_pos = 0;
- skipped = 0;
- approx_n_insertions = 0;
- approx_read_offset = pos;
-
- last_good_center_is_defined = false;
- }
-
- // side effect: changes chunk_pos, pos, and approx_n_insertions
- bool TryToAlignCurrentCenter(const Center ¢er) {
- if (!last_good_center_is_defined)
- return true;
-
- if (debug_mode_) {
- std::cerr << "[TryToAlignCurrentCenter] " << center.seq.str()
- << " (previous good center is " << last_good_center.seq.str() << ","
- << " skipped " << skipped << " centers)" << std::endl;
- }
-
- // offset is how many positions the center should be shifted
- // in order to agree with last_good_center
- int offset;
- bool aligned = exactAlignH(last_good_center.seq.begin(),
- last_good_center.seq.begin() + skipped + 1,
- last_good_center.seq.end(),
- center.seq.begin(), center.seq.end(), 3, 8, &offset);
-
- bool result = aligned && chunk_pos + offset >= 0;
- if (result) {
- if (debug_mode_)
- std::cerr << "[TryToAlignCurrentCenter] offset = " << offset << std::endl;
- if (offset < 0)
- approx_n_insertions -= offset;
- pos += offset;
- chunk_pos += offset;
- }
-
- return result;
- }
-
- void IncludeIntoConsensus(const Center ¢er) {
- VERIFY(chunk_pos >= 0);
- VERIFY(chunk_pos < (1 << 16));
- is_first_center = false;
-
- if (chunk_pos + hammer::K > scores.size())
- scores.resize(chunk_pos + hammer::K, ScoreMatrix(4, 64, 0));
-
- auto k = kmer_data_[center.seq];
-
- for (size_t i = 0; i < hammer::K; ++i)
- scores[chunk_pos + i](center.seq[i].nucl, center.seq[i].len) += double(k.count) * (1.0 - k.qual);
-
- last_good_center = center;
- last_good_center_is_defined = true;
- if (raw_chunk_start_pos == -1)
- raw_chunk_start_pos = raw_pos;
- approx_end_read_offset = center.end_offset;
- if (debug_mode_) {
- std::cerr << "e.o. = " << approx_end_read_offset << std::endl;
- }
- need_to_align = false;
- skipped = 0;
- }
-
- public:
- ChunkCollector(const io::SingleRead& r, CorrectedRead &cread,
- const KMerData &kmer_data, bool debug_mode) :
- cread_(cread), kmer_data_(kmer_data), debug_mode_(debug_mode),
- gen(r), pos(int(gen.trimmed_left())), skipped(0),
- last_good_center(), last_good_center_is_defined(false),
- is_first_center(true),
- replacing(false), rollback_size(0),
- need_to_align(false),
- approx_read_offset(0), approx_end_read_offset(0),
- scores(), chunk_pos(0),
- raw_chunk_start_pos(-1),
- approx_n_insertions(0)
- {
- --pos;
- --chunk_pos;
- }
-
- void Run() {
- double lowQualThreshold = cfg::get().kmer_qual_threshold;
-
- raw_pos = int(gen.trimmed_left()) - 1;
-
- if (debug_mode_) {
- std::cerr << "gen. trimmed = " << gen.trimmed_left() << std::endl;
- }
-
- while (gen.HasMore()) {
- auto prev_chunk_pos = chunk_pos;
- auto seq = gen.kmer();
- gen.Next();
- ++pos;
- ++raw_pos;
- if (debug_mode_) {
- std::cerr << "=================================" << std::endl;
- std::cerr << "pos = " << pos << ", raw_pos = " << raw_pos <<
- ", last_good_center_is_defined = " << last_good_center_is_defined <<
- ", skipped = " << skipped << std::endl;
- }
- ++chunk_pos;
-
- auto center = Center{seq, raw_pos + int(hammer::K)};
- auto qual = kmer_data_[seq].qual;
-
- bool can_be_changed = last_good_center_is_defined || is_first_center;
- if (qual > lowQualThreshold && can_be_changed) {
- center = GetCenterOfCluster(seq, raw_pos);
- qual = kmer_data_[center.seq].qual;
- }
-
- if (qual > lowQualThreshold && last_good_center_is_defined && skipped == 0) {
- if (debug_mode_) {
- std::cerr << "raw_pos + hammer::K = " << raw_pos + hammer::K << std::endl;
- std::cerr << "last_good_center.end_offset + 1 = " << last_good_center.end_offset + 1 << std::endl;
- }
- // Finding a center by means of clustering failed.
- // Let's try the following: take the last good center and make a new one
- // from it by appending the next homopolymer run; if its quality is high, we use it.
- if (raw_pos + hammer::K < last_good_center.end_offset + 1) {
- --pos;
- --chunk_pos;
- if (debug_mode_) {
- std::cerr << "skipping low-quality hk-mer" << std::endl;
- }
- continue; // move to next hk-mer
- } else if (raw_pos + hammer::K == last_good_center.end_offset + 1) {
- auto seq_corr = last_good_center.seq;
- for (size_t i = 0; i < hammer::K - 1; ++i)
- seq_corr[i] = seq_corr[i + 1];
- seq_corr[hammer::K - 1] = seq[hammer::K - 1];
- center = Center{seq_corr, last_good_center.end_offset + 1};
- qual = kmer_data_[center.seq].qual;
- if (debug_mode_) {
- std::cerr << "seq_corr = " << seq_corr.str() << " , qual = " << qual << std::endl;
- }
-
- if (qual > lowQualThreshold && can_be_changed) {
- // our last resort...
- center = GetCenterOfCluster(seq_corr, raw_pos);
- qual = kmer_data_[center.seq].qual;
- }
- }
- }
-
- bool low_qual = qual > lowQualThreshold;
- bool inconsistent = IsInconsistent(center);
-
- if (debug_mode_ && !low_qual && seq != center.seq) {
- std::cerr << "replaced " << seq.str()
- << " (quality " << kmer_data_[seq].qual
- << ", count " << kmer_data_[seq].count << ")"
- << " with " << center.seq.str() << std::endl;
- }
-
- if (debug_mode_) {
- std::cerr << "quality of " << center.seq.str() << " is " << qual
- << " (count " << kmer_data_[center.seq].count << ") "
- << (inconsistent ? " INCONSISTENT" : "") << std::endl;
- }
-
- if (low_qual) {
- ++skipped;
- } else if (inconsistent) {
- if (!TryToAlignCurrentCenter(center)) {
- low_qual = true;
- ++skipped;
- }
- }
-
- if (skipped > hammer::K / 4) {
- FlushCurrentChunk();
- } else if (!low_qual) {
- if (seq != center.seq && !replacing) {
- rollback_size = prev_chunk_pos + hammer::K;
- replacing = true;
- } else if (seq == center.seq && replacing) {
- replacing = false;
- }
-
- if (debug_mode_) {
- std::cerr << "[include into consensus] raw_pos = " << raw_pos << std::endl;
- }
- IncludeIntoConsensus(center);
- }
- }
-
- FlushCurrentChunk();
- }
- };
-
- void CollectChunks(const io::SingleRead& r) {
- ChunkCollector chunk_collector(r, *this, kmer_data_, debug_mode_);
- chunk_collector.Run();
- }
-
- public:
- CorrectedRead(const io::SingleRead& read, const KMerData& kmer_data,
- bool debug_mode = false) :
- raw_read_(read),
- kmer_data_(kmer_data),
- debug_mode_(debug_mode)
- {
- CollectChunks(read);
- }
-
- void MergeChunks() {
- if (chunks_.empty())
- return;
-
- auto iter = chunks_.begin();
- ConsensusChunk& merged = *iter;
-
- if (debug_mode_) {
- if (chunks_.size() == 1) {
- iter->AlignLeftEndAgainstRead();
- for (int i = 0; i < iter->approx_read_offset; ++i)
- for (int j = 0; j < raw_read_[i].len; ++j)
- std::cerr << ' ';
- for (size_t i = 0; i < iter->consensus.size(); ++i)
- std::cerr << iter->consensus[i].str();
- std::cerr << std::endl;
- }
- }
-
- ++iter;
- while (iter != chunks_.end()) {
- if (iter->consensus.size() > hammer::K)
- merged.TryMergeWith(*iter);
- iter = chunks_.erase(iter);
- }
-
- corrected_runs_ = std::move(merged.consensus);
- }
-
- void AttachUncorrectedRuns() {
- // attach runs from the right
- const auto& data = raw_read_.data();
- int n_raw = int(raw_read_.size());
- int end_read_offset = LastChunk().approx_end_read_offset();
- if (end_read_offset < n_raw && end_read_offset >= 0) {
- corrected_runs_.insert(corrected_runs_.end(),
- data.begin() + end_read_offset,
- data.end());
- }
- if (debug_mode_) {
- std::cerr << "n_raw = " << n_raw << ", end_read_offset = " << end_read_offset << std::endl;
- }
-
- // attach runs from the left
- if (trimmed_by_gen_ > 0 && size_t(trimmed_by_gen_) <= data.size()) {
- std::vector<HomopolymerRun> runs;
- runs.reserve(corrected_runs_.size() + trimmed_by_gen_);
- runs.insert(runs.end(), data.begin(), data.begin() + trimmed_by_gen_);
- runs.insert(runs.end(), corrected_runs_.begin(), corrected_runs_.end());
- std::swap(runs, corrected_runs_);
- }
- }
-
- std::string GetSequenceString() const {
- if (chunks_.empty() && corrected_runs_.empty())
- return "";
- std::string res;
- if (!corrected_runs_.empty()) {
- for (auto it = corrected_runs_.begin(); it != corrected_runs_.end(); ++it)
- res += it->str();
- } else {
- auto& runs = chunks_.front().consensus;
- for (auto it = runs.begin(); it != runs.end(); ++it)
- res += it->str();
- }
- return res;
- }
-};
-
-class SingleReadCorrector {
- const KMerData &kmer_data_;
-
- public:
-
- struct ReadSelectionPredicate {
- virtual bool operator()(const io::SingleRead &read) = 0;
- };
-
- struct DebugOutputPredicate : public ReadSelectionPredicate {};
-
- struct NoDebug : public DebugOutputPredicate {
- virtual bool operator()(const io::SingleRead &) {
- return false;
- }
- };
-
- struct FullDebug : public DebugOutputPredicate {
- virtual bool operator()(const io::SingleRead &) {
- return true;
- }
- };
-
- class DebugIfContains : public DebugOutputPredicate {
- Sequence needle_;
- Sequence needle_rc_;
- public:
- DebugIfContains(const Sequence &seq) :
- needle_(seq), needle_rc_(!seq) {}
-
- virtual bool operator()(const io::SingleRead &read) {
- auto read_seq = read.sequence();
- if (read_seq.size() < needle_.size())
- return false;
- if (read_seq.find(needle_, 0) != -1ULL)
- return true;
- if (read_seq.find(needle_rc_, 0) != -1ULL)
- return true;
- return false;
- }
- };
-
- struct SelectPredicate : public ReadSelectionPredicate {};
- struct SelectAll : public SelectPredicate {
- virtual bool operator()(const io::SingleRead &) {
- return true;
- }
- };
-
- class SelectByName : public SelectPredicate {
- std::set<std::string> names_;
- public:
- SelectByName(const std::set<std::string>& names) :
- names_(names) {}
- virtual bool operator()(const io::SingleRead &r) {
- return names_.find(r.name()) != names_.end();
- }
- };
-
-private:
- BamTools::SamHeader* sam_header_;
- DebugOutputPredicate &debug_pred_;
- SelectPredicate &select_pred_;
-
-public:
- SingleReadCorrector(const KMerData &kmer_data,
- BamTools::SamHeader *sam_header,
- DebugOutputPredicate &debug,
- SelectPredicate &select) :
- kmer_data_(kmer_data), sam_header_(sam_header),
- debug_pred_(debug), select_pred_(select) {}
-
- SingleReadCorrector(const KMerData &kmer_data,
- DebugOutputPredicate &debug,
- SelectPredicate &select) :
- kmer_data_(kmer_data), sam_header_(NULL),
- debug_pred_(debug), select_pred_(select) {}
-
- boost::optional<io::SingleRead> operator()(const io::SingleRead &r) {
- if (!select_pred_(r)) return boost::optional<io::SingleRead>();
- bool debug_mode = debug_pred_(r);
- if (debug_mode) {
- std::cerr << "=============================================" << std::endl;
-
- std::cerr << '>' << r.name() << '\n'
- << r.GetSequenceString() << std::endl;
- }
-
- CorrectedRead read(r, kmer_data_, debug_mode);
- read.MergeChunks();
- if (cfg::get().keep_uncorrected_ends)
- read.AttachUncorrectedRuns();
-
- if (debug_mode) {
- std::cerr << "final result: " << read.GetSequenceString() << std::endl;
- }
-
- auto seq = read.GetSequenceString();
- if (seq.empty())
- return boost::optional<io::SingleRead>();
-
- return io::SingleRead(r.name(), seq);
- }
-
- boost::optional<io::BamRead>
- operator()(BamTools::BamAlignment &alignment) {
- VERIFY(sam_header_);
- io::SingleRead r(alignment.Name, alignment.QueryBases);
- // reverse strand means we're working with a mapped BAM, might be
- // the case for datasets downloaded from IonCommunity
- if (alignment.IsReverseStrand())
- r = !r;
- auto corrected_r = operator()(r);
- std::string rg;
- if (!alignment.GetTag("RG", rg) || !corrected_r)
- return boost::optional<io::BamRead>();
- auto flow_order = sam_header_->ReadGroups[rg].FlowOrder;
-
- float delta_score, fit_score;
- auto seq = corrected_r.get().GetSequenceString();
- if (alignment.IsReverseStrand()) {
- std::reverse(seq.begin(), seq.end());
- for (auto it = seq.begin(); it != seq.end(); ++it) {
- switch (*it) {
- case 'A': *it = 'T'; break;
- case 'C': *it = 'G'; break;
- case 'G': *it = 'C'; break;
- case 'T': *it = 'A'; break;
- default: break;
- }
- }
- }
-
- BaseHypothesisEvaluator(alignment, flow_order, seq,
- delta_score, fit_score, 0);
- std::stringstream ss;
- ss << alignment.Name << "_" << delta_score << "_" << fit_score;
- alignment.Name = ss.str();
- if (delta_score >= cfg::get().delta_score_threshold)
- return io::BamRead(alignment);
-
- BamTools::BamAlignment corrected(alignment);
- corrected.QueryBases = corrected_r.get().GetSequenceString();
- return io::BamRead(corrected);
- }
-};
-
-class PairedReadCorrector : public SingleReadCorrector {
- public:
- PairedReadCorrector(const KMerData &kmer_data,
- DebugOutputPredicate &debug,
- SelectPredicate &select)
- : SingleReadCorrector(kmer_data, debug, select) {}
-
- boost::optional<io::PairedRead> operator()(const io::PairedRead &r) {
- auto corrected_r = SingleReadCorrector::operator()(r.first());
- auto corrected_l = SingleReadCorrector::operator()(r.second());
-
- if (!corrected_r || !corrected_l)
- return boost::optional<io::PairedRead>();
-
- return io::PairedRead(corrected_r.get(), corrected_l.get(), 0);
- }
-};
-
-}; // namespace correction
-}; // namespace hammer
-#endif // __HAMMER_IT_READ_CORRECTOR_HPP__
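A note on the scoring used in the read corrector above: each accepted center votes into a per-position 4 x 64 score matrix with weight count * (1 - qual), and the consensus call for a position is the best-supported (nucleotide, run length) pair. The following is a minimal standalone sketch of that voting idea only; the type and function names here are illustrative, not the SPAdes ScoreMatrix API.

#include <array>
#include <iostream>
#include <utility>

using ScoreMatrix = std::array<std::array<double, 64>, 4>;  // 4 nucleotides x 64 run lengths

// Add one vote for (nucl, run_len), weighted by k-mer count and quality badness.
void AddVote(ScoreMatrix &m, int nucl, int run_len, double count, double qual) {
    m[nucl][run_len] += count * (1.0 - qual);
}

// Return the (nucl, run_len) pair with the highest accumulated score.
std::pair<int, int> Consensus(const ScoreMatrix &m) {
    std::pair<int, int> best{0, 0};
    double best_score = -1.0;
    for (int n = 0; n < 4; ++n)
        for (int l = 0; l < 64; ++l)
            if (m[n][l] > best_score) {
                best_score = m[n][l];
                best = {n, l};
            }
    return best;
}

int main() {
    ScoreMatrix m{};                // zero-initialized
    AddVote(m, 2, 3, 10, 0.05);     // ten high-confidence observations of a GGG run
    AddVote(m, 2, 4, 2, 0.40);      // two noisier observations of a GGGG run
    auto c = Consensus(m);
    std::cout << "nucl=" << c.first << " len=" << c.second << std::endl;  // nucl=2 len=3
}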
diff --git a/src/ionhammer/seqeval/BaseHypothesisEvaluator.cpp b/src/ionhammer/seqeval/BaseHypothesisEvaluator.cpp
deleted file mode 100644
index 6d494dc..0000000
--- a/src/ionhammer/seqeval/BaseHypothesisEvaluator.cpp
+++ /dev/null
@@ -1,302 +0,0 @@
-/* Copyright (C) 2013 Ion Torrent Systems, Inc. All Rights Reserved */
-
-//! @file BaseHypothesisEvaluator.cpp
-//! @ingroup SpadesHelpers
-//! @brief Combines code from the TS Basecaller and the TS Variant Caller to
-//! @brief give an indication about the feasibility of an alternative read sequence
-
-#include "BaseHypothesisEvaluator.h"
-
-// Function to fill in predicted signal values
-void BaseHypothesisEvaluator(BamTools::BamAlignment &alignment,
- const string &flow_order_str,
- const string &alt_base_hyp,
- float &delta_score,
- float &fit_score,
- int heavy_verbose) {
-
- // --- Step 1: Initialize Objects and retrieve relevant tags
-
- delta_score = 1e5;
- fit_score = 1e5;
- vector<string> Hypotheses(2);
- vector<float> measurements, phase_params;
- int start_flow, num_flows, prefix_flow=0;
-
- if (not GetBamTags(alignment, flow_order_str.length(), measurements, phase_params, start_flow))
- return;
- num_flows = measurements.size();
- ion::FlowOrder flow_order(flow_order_str, num_flows);
- BasecallerRead master_read;
- master_read.SetData(measurements, flow_order.num_flows());
- TreephaserLite treephaser(flow_order);
- treephaser.SetModelParameters(phase_params[0], phase_params[1]);
-
- // --- Step 2: Solve beginning of the read
- // Look at mapped vs. unmapped reads in BAM
- Hypotheses[0] = alignment.QueryBases;
- Hypotheses[1] = alt_base_hyp;
- // Safety: reverse complement reverse strand reads in mapped bam
- if (alignment.IsMapped() and alignment.IsReverseStrand()) {
- RevComplementInPlace(Hypotheses[0]);
- RevComplementInPlace(Hypotheses[1]);
- }
-
- prefix_flow = GetMasterReadPrefix(treephaser, flow_order, start_flow, Hypotheses[0], master_read);
- unsigned int prefix_size = master_read.sequence.size();
-
- // --- Step 3: creating predictions for the individual hypotheses
-
- vector<BasecallerRead> hypothesesReads(Hypotheses.size());
- vector<float> squared_distances(Hypotheses.size(), 0.0);
- int max_last_flow = 0;
-
- for (unsigned int i_hyp=0; i_hyp<hypothesesReads.size(); ++i_hyp) {
-
- hypothesesReads[i_hyp] = master_read;
- // --- add hypothesis sequence to clipped prefix
- unsigned int i_base = 0;
- int i_flow = prefix_flow;
-
- while (i_base<Hypotheses[i_hyp].length() and i_base<(2*(unsigned int)flow_order.num_flows()-prefix_size)) {
- while (i_flow < flow_order.num_flows() and flow_order.nuc_at(i_flow) != Hypotheses[i_hyp][i_base])
- i_flow++;
- if (i_flow < flow_order.num_flows() and i_flow > max_last_flow)
- max_last_flow = i_flow;
- if (i_flow >= flow_order.num_flows())
- break;
- // Add base to sequence only if it fits into flow order
- hypothesesReads[i_hyp].sequence.push_back(Hypotheses[i_hyp][i_base]);
- i_base++;
- }
- i_flow = min(i_flow, flow_order.num_flows()-1);
-
- // Solver simulates beginning of the read and then fills in the remaining clipped bases for which we have flow information
- treephaser.Solve(hypothesesReads[i_hyp], num_flows, i_flow);
- }
- // Compute L2-distance of measurements and predictions
- for (unsigned int i_hyp=0; i_hyp<hypothesesReads.size(); ++i_hyp) {
- for (int iFlow=0; iFlow<=max_last_flow; iFlow++)
- squared_distances[i_hyp] += (measurements.at(iFlow) - hypothesesReads[i_hyp].prediction.at(iFlow)) *
- (measurements.at(iFlow) - hypothesesReads[i_hyp].prediction.at(iFlow));
- }
-
- // Delta: L2-distance of alternative base Hypothesis - L2-distance of bases as called
- delta_score = squared_distances.at(1) - squared_distances.at(0);
- fit_score = min(squared_distances.at(1), squared_distances.at(0));
-
-
- // --- verbose ---
- if (heavy_verbose > 1 or (delta_score < 0 and heavy_verbose > 0)) {
- cout << "Processed read " << alignment.Name << endl;
- cout << "Delta Fit: " << delta_score << " Overall Fit: " << fit_score << endl;
- PredictionGenerationVerbose(Hypotheses, hypothesesReads, phase_params, flow_order, start_flow, prefix_size);
- }
-
-}
-
-// ----------------------------------------------------------------------
-
-bool GetBamTags(BamTools::BamAlignment &alignment,
- const int &num_flows,
- vector<float> &measurements,
- vector<float> &phase_params,
- int &start_flow) {
-
- vector<int16_t> quantized_measurements;
- // Retrieve normalized measurements from BAM file
- if (not alignment.GetTag("ZM", quantized_measurements)) {
- cerr << "ERROR: Normalized measurements ZM:tag is not present in read " << alignment.Name << endl;
- return false;
- }
- if ((int)quantized_measurements.size() > num_flows) {
- cerr << "ERROR: Normalized measurements ZM:tag length exceeds flow order length in read " << alignment.Name << endl;
- return false;
- }
- measurements.assign(quantized_measurements.size(), 0.0);
- for (size_t counter = 0; counter < quantized_measurements.size(); ++counter)
- measurements.at(counter) = (float)quantized_measurements.at(counter)/256;
-
- // Retrieve phasing parameters from BAM file
- if (not alignment.GetTag("ZP", phase_params)) {
- cerr << "ERROR: Phasing Parameters ZP:tag is not present in read " << alignment.Name << endl;
- return false;
- }
- if (phase_params.size() != 3) {
- cerr << "ERROR: Phasing Parameters ZP:tag does not have 3 phase parameters in read " << alignment.Name << endl;
- return false;
- }
- if (phase_params[0] < 0 or phase_params[0] > 1 or phase_params[1] < 0 or phase_params[1] > 1
- or phase_params[2] < 0 or phase_params[2] > 1) {
- cerr << "ERROR: Phasing Parameters ZP:tag outside of [0,1] range in read " << alignment.Name << endl;
- return false;
- }
- phase_params[2] = 0.0f; // ad-hoc corrector: zero droop
-
- // Retrieve start flow
- if (not alignment.GetTag("ZF", start_flow)) {
- cerr << "ERROR: Start Flow ZF:tag not found in read " << alignment.Name << endl;
- return false;
- }
- if (start_flow < 0 or start_flow >= num_flows) {
- cerr << "ERROR: Start flow outsize of [0,num_flows) range in read " << alignment.Name << endl;
- cerr << "Start flow: " << start_flow << " Number of flows: " << num_flows;
- return false;
- }
- // A start flow of zero indicates a read that did not pass basecaller filters
- if (start_flow == 0) {
- cerr << "WARNING: Start Flow ZF:tag has zero value in read " << alignment.Name << endl;
- return false;
- }
- return true;
-}
-
-// ----------------------------------------------------------------------
-
-int GetMasterReadPrefix(TreephaserLite &treephaser,
- const ion::FlowOrder &flow_order,
- const int &start_flow,
- const string &called_bases,
- BasecallerRead &master_read) {
-
- // Solve the beginning of the possibly clipped read
- int until_flow = min((start_flow+20), flow_order.num_flows());
- treephaser.Solve(master_read, until_flow, 0);
-
- // StartFlow clipped? Get solved HP length at startFlow.
- unsigned int base = 0;
- int flow = 0;
- unsigned int HPlength = 0;
- while (base < master_read.sequence.size()) {
- while (flow < flow_order.num_flows() and flow_order.nuc_at(flow) != master_read.sequence[base]) {
- flow++;
- }
- if (flow > start_flow or flow == flow_order.num_flows())
- break;
- if (flow == start_flow)
- HPlength++;
- base++;
- }
- //if (global_context.DEBUG>2)
- // printf("Solved %d bases until (not incl.) flow %d. HP of height %d at flow %d.\n", base, flow, HPlength, start_flow);
-
- // Get HP size at the start of the read as called in Hypotheses[0]
- unsigned int count = 1;
- while (count < called_bases.length() and called_bases.at(count) == called_bases.at(0))
- count++;
- //if (global_context.DEBUG>2)
- // printf("Hypothesis starts with an HP of length %d\n", count);
- // Adjust the length of the prefix and erase extra solved bases
- if (HPlength>count)
- base -= count;
- else
- base -= HPlength;
- master_read.sequence.erase(master_read.sequence.begin()+base, master_read.sequence.end());
-
- // Get flow of last prefix base
- int prefix_flow = 0;
- for (unsigned int i_base = 0; i_base < master_read.sequence.size(); i_base++) {
- while (prefix_flow < flow_order.num_flows() and flow_order.nuc_at(prefix_flow) != master_read.sequence[i_base])
- prefix_flow++;
- }
-
- return prefix_flow;
-}
-
-
-// ----------------------------------------------------------------------
-
-void PredictionGenerationVerbose(const vector<string> &Hypotheses,
- const vector<BasecallerRead> &hypothesesReads,
- const vector<float> &phase_params,
- const ion::FlowOrder &flow_order,
- const int &start_flow,
- const int &prefix_size) {
-
- printf("Calculating predictions for %d hypotheses starting at flow %d:\n", (int)Hypotheses.size(), start_flow);
- for (unsigned int iHyp=0; iHyp<Hypotheses.size(); ++iHyp) {
- for (unsigned int iBase=0; iBase<Hypotheses[iHyp].length(); ++iBase)
- printf("%c", Hypotheses[iHyp][iBase]);
- printf("\n");
- }
- printf("Solved read prefix: ");
- for (int iBase=0; iBase<prefix_size; ++iBase)
- printf("%c", hypothesesReads[0].sequence[iBase]);
- printf("\n");
- printf("Extended Hypotheses reads to:\n");
- for (unsigned int iHyp=0; iHyp<hypothesesReads.size(); ++iHyp) {
- for (unsigned int iBase=0; iBase<hypothesesReads[iHyp].sequence.size(); ++iBase)
- printf("%c", hypothesesReads[iHyp].sequence[iBase]);
- printf("\n");
- }
- printf("Phasing Parameters, cf: %f ie: %f dr: %f \n Predictions: \n",
- phase_params[0], phase_params[1], phase_params[2]);
- cout << "Flow Order : ";
- for (int i_flow=0; i_flow<flow_order.num_flows(); i_flow++) {
- cout << flow_order.nuc_at(i_flow) << " ";
- if (hypothesesReads[0].normalized_measurements[i_flow] < 0)
- cout << " ";
- }
- cout << endl << "Flow Index : ";
- for (int i_flow=0; i_flow<flow_order.num_flows(); i_flow++) {
- cout << i_flow << " ";
- if (i_flow<10) cout << " ";
- else if (i_flow<100) cout << " ";
- else if (i_flow<1000) cout << " ";
- if (hypothesesReads[0].normalized_measurements[i_flow] < 0)
- cout << " ";
- }
- cout << endl << "Measured : ";
- for (unsigned int i_flow=0; i_flow<hypothesesReads[0].normalized_measurements.size(); ++i_flow) {
- printf("%.2f", hypothesesReads[0].normalized_measurements[i_flow]);
- if (hypothesesReads[0].normalized_measurements[i_flow] < 10)
- cout << " ";
- }
- cout << endl;
- for (unsigned int i_Hyp=0; i_Hyp<hypothesesReads.size(); ++i_Hyp) {
- cout << "Prediction "<< i_Hyp << ": ";
- for (unsigned int i_flow=0; i_flow<hypothesesReads[i_Hyp].prediction.size(); ++i_flow) {
- printf("%.2f", hypothesesReads[i_Hyp].prediction[i_flow]);
- if (hypothesesReads[i_Hyp].prediction[i_flow] < 10)
- cout << " ";
- if (hypothesesReads[0].normalized_measurements[i_flow] < 0)
- cout << " ";
- }
- cout << endl;
- }
- cout << " ------------------- " << endl;
-}
-
-// ----------------------------------------------------------------------
-
-char NucComplement (char nuc)
-{
- switch(nuc) {
- case ('A') : return 'T';
- case ('C') : return 'G';
- case ('G') : return 'C';
- case ('T') : return 'A';
- case ('a') : return 't';
- case ('c') : return 'g';
- case ('g') : return 'c';
- case ('t') : return 'a';
-
- default: return nuc; // e.g. 'N' and '-' handled by default
- }
-}
-
-void RevComplementInPlace(string& seq) {
-
- char c;
- int forward_idx = 0;
- int backward_idx = seq.size()-1;
- while (forward_idx < backward_idx) {
- c = seq[forward_idx];
- seq[forward_idx] = NucComplement(seq[backward_idx]);
- seq[backward_idx] = NucComplement(c);
- forward_idx++;
- backward_idx--;
- }
- if (forward_idx == backward_idx)
- seq[forward_idx] = NucComplement(seq[forward_idx]);
-}
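For reference, the delta score produced by BaseHypothesisEvaluator above is just the difference of squared distances between the measured flow signal and the predictions for the called and alternative sequences: a positive delta means the called bases fit the signal better. Below is a toy sketch of that comparison with made-up values; plain vectors stand in for BasecallerRead and the treephaser predictions.

#include <algorithm>
#include <cstddef>
#include <iostream>
#include <vector>

// Sum of squared residuals between two signal vectors (truncated to the shorter one).
double SquaredDistance(const std::vector<float> &a, const std::vector<float> &b) {
    double d = 0.0;
    for (std::size_t i = 0; i < a.size() && i < b.size(); ++i) {
        double r = a[i] - b[i];
        d += r * r;
    }
    return d;
}

int main() {
    std::vector<float> measured    = {1.02f, 0.10f, 2.05f, 0.95f};  // made-up normalized measurements
    std::vector<float> called      = {1.00f, 0.00f, 2.00f, 1.00f};  // prediction for the called bases
    std::vector<float> alternative = {1.00f, 0.00f, 1.00f, 1.00f};  // prediction for the alternative bases

    double fit_called = SquaredDistance(measured, called);
    double fit_alt    = SquaredDistance(measured, alternative);
    // Positive delta means the called sequence explains the signal better.
    std::cout << "delta_score = " << (fit_alt - fit_called)
              << ", fit_score = " << std::min(fit_called, fit_alt) << std::endl;
}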
diff --git a/src/ionhammer/seqeval/TreephaserLite.cpp b/src/ionhammer/seqeval/TreephaserLite.cpp
deleted file mode 100644
index 14a428e..0000000
--- a/src/ionhammer/seqeval/TreephaserLite.cpp
+++ /dev/null
@@ -1,593 +0,0 @@
-/* Copyright (C) 2013 Ion Torrent Systems, Inc. All Rights Reserved */
-
-//! @file TreephaserLite.cpp
-//! @ingroup Spades Helper
-//! @brief A lighter version of TS-TreephaserLite. Performs dephasing and calls the base sequence by tree search.
-
-
-#include "TreephaserLite.h"
-
-//-------------------------------------------------------------------------
-
-void BasecallerRead::SetData(const vector<float> &measurements, int num_flows) {
-
- raw_measurements = measurements;
- raw_measurements.resize(num_flows, 0);
- for (int iFlow = 0; iFlow < num_flows; iFlow++) {
- if (isnan(measurements[iFlow])) {
- std::cerr << "Warning: Basecaller Read: NAN in measurements!"<< std::endl;
- raw_measurements.at(iFlow) = 0;
- }
- }
-
- key_normalizer = 1.0f;
- normalized_measurements = raw_measurements;
- sequence.reserve(2*num_flows);
- sequence.clear();
- prediction.assign(num_flows, 0);
-}
-
-//--------------------------------------------------------------------------
-
-void BasecallerRead::SetDataAndKeyNormalize(const float *measurements, int num_flows, const int *key_flows, int num_key_flows)
-{
- raw_measurements.resize(num_flows);
- normalized_measurements.resize(num_flows);
- prediction.assign(num_flows, 0);
- sequence.reserve(2*num_flows);
-
- float onemer_sum = 0.0f;
- int onemer_count = 0;
- for (int flow = 0; flow < num_key_flows; ++flow) {
- if (key_flows[flow] == 1) {
- onemer_sum += measurements[flow];
- ++onemer_count;
- }
- }
-
- key_normalizer = 1.0f;
- if (onemer_sum and onemer_count)
- key_normalizer = static_cast<float>(onemer_count) / onemer_sum;
-
- for (int flow = 0; flow < num_flows; ++flow) {
- raw_measurements[flow] = measurements[flow] * key_normalizer;
- normalized_measurements[flow] = raw_measurements[flow];
- }
-}
-
-//=========================================================================
-
-TreephaserLite::TreephaserLite(const ion::FlowOrder& flow_order, const int windowSize)
- : flow_order_(flow_order)
-{
- SetNormalizationWindowSize(windowSize);
- for (int i = 0; i < 8; i++) {
- transition_base_[i].resize(flow_order_.num_flows());
- transition_flow_[i].resize(flow_order_.num_flows());
- }
- path_.resize(kNumPaths);
- for (int p = 0; p < kNumPaths; ++p) {
- path_[p].state.resize(flow_order_.num_flows());
- path_[p].prediction.resize(flow_order_.num_flows());
- path_[p].sequence.reserve(2*flow_order_.num_flows());
- }
-}
-
-//-------------------------------------------------------------------------
-
-void TreephaserLite::SetModelParameters(double carry_forward_rate, double incomplete_extension_rate, double droop_rate)
-{
-
- double nuc_avaliability[8] = { 0, 0, 0, 0, 0, 0, 0, 0 };
- for (int flow = 0; flow < flow_order_.num_flows(); ++flow) {
- nuc_avaliability[flow_order_[flow]&7] = 1;
- for (int nuc = 0; nuc < 8; nuc++) {
- transition_base_[nuc][flow] = nuc_avaliability[nuc] * (1-droop_rate) * (1-incomplete_extension_rate);
- transition_flow_[nuc][flow] = (1-nuc_avaliability[nuc]) + nuc_avaliability[nuc] * (1-droop_rate) * incomplete_extension_rate;
- nuc_avaliability[nuc] *= carry_forward_rate;
- }
- }
-
-}
-
-// ----------------------------------------------------------------------
-
-void TreephaserLite::WindowedNormalize(BasecallerRead& read, int num_steps, int window_size) const
-{
- int num_flows = read.raw_measurements.size();
- float median_set[window_size];
-
- // Estimate and correct for additive offset
-
- float next_normalizer = 0;
- int estim_flow = 0;
- int apply_flow = 0;
-
- for (int step = 0; step < num_steps; ++step) {
-
- int window_end = estim_flow + window_size;
- int window_middle = estim_flow + window_size / 2;
- if (window_middle > num_flows)
- break;
-
- float normalizer = next_normalizer;
-
- int median_set_size = 0;
- for (; estim_flow < window_end and estim_flow < num_flows; ++estim_flow)
- if (read.prediction[estim_flow] < 0.3)
- median_set[median_set_size++] = read.raw_measurements[estim_flow] - read.prediction[estim_flow];
-
- if (median_set_size > 5) {
- std::nth_element(median_set, median_set + median_set_size/2, median_set + median_set_size);
- next_normalizer = median_set[median_set_size / 2];
- if (step == 0)
- normalizer = next_normalizer;
- }
-
- float delta = (next_normalizer - normalizer) / window_size;
-
- for (; apply_flow < window_middle and apply_flow < num_flows; ++apply_flow) {
- read.normalized_measurements[apply_flow] = read.raw_measurements[apply_flow] - normalizer;
- normalizer += delta;
- }
- }
-
- for (; apply_flow < num_flows; ++apply_flow) {
- read.normalized_measurements[apply_flow] = read.raw_measurements[apply_flow] - next_normalizer;
- }
-
- // Estimate and correct for multiplicative scaling
-
- next_normalizer = 1;
- estim_flow = 0;
- apply_flow = 0;
-
- for (int step = 0; step < num_steps; ++step) {
-
- int window_end = estim_flow + window_size;
- int window_middle = estim_flow + window_size / 2;
- if (window_middle > num_flows)
- break;
-
- float normalizer = next_normalizer;
-
- int median_set_size = 0;
- for (; estim_flow < window_end and estim_flow < num_flows; ++estim_flow)
- if (read.prediction[estim_flow] > 0.5 and read.normalized_measurements[estim_flow] > 0)
- median_set[median_set_size++] = read.normalized_measurements[estim_flow] / read.prediction[estim_flow];
-
- if (median_set_size > 5) {
- std::nth_element(median_set, median_set + median_set_size/2, median_set + median_set_size);
- next_normalizer = median_set[median_set_size / 2];
- if (step == 0)
- normalizer = next_normalizer;
- }
-
- float delta = (next_normalizer - normalizer) / window_size;
-
- for (; apply_flow < window_middle and apply_flow < num_flows; ++apply_flow) {
- read.normalized_measurements[apply_flow] /= normalizer;
- normalizer += delta;
- }
- }
-
- for (; apply_flow < num_flows; ++apply_flow) {
- read.normalized_measurements[apply_flow] /= next_normalizer;
- }
-}
-
-//-------------------------------------------------------------------------
-
-// Sliding window adaptive normalization and joint solving of sequence
-void TreephaserLite::NormalizeAndSolve(BasecallerRead& well, int max_flows, bool sliding_window)
-{
- int window_size = windowSize_;
- int solve_flows = 0;
-
- for (int num_steps = 1; solve_flows < max_flows; ++num_steps) {
- solve_flows = min((num_steps+1) * window_size, max_flows);
- int restart_flows = 0;
- if(sliding_window)
- restart_flows = max(solve_flows-100, 0);
-
- Solve(well, solve_flows, restart_flows);
- WindowedNormalize(well, num_steps, window_size);
- }
-
- Solve(well, max_flows);
-}
-
-//-------------------------------------------------------------------------
-
-void TreephaserLite::InitializeState(TreephaserPath *state) const
-{
- state->flow = 0;
- state->state[0] = 1;
- state->window_start = 0;
- state->window_end = 1;
- state->prediction.assign(flow_order_.num_flows(), 0);
- state->sequence.clear();
- state->sequence.reserve(2*flow_order_.num_flows());
- state->last_hp = 0;
-}
-
-
-//-------------------------------------------------------------------------
-
-void TreephaserLite::AdvanceState(TreephaserPath *child, const TreephaserPath *parent, char nuc, int max_flow) const
-{
- assert (child != parent);
-
- // Advance flow
- child->flow = parent->flow;
- while (child->flow < max_flow and flow_order_[child->flow] != nuc)
- child->flow++;
-
- if (child->flow == parent->flow)
- child->last_hp = parent->last_hp + 1;
- else
- child->last_hp = 1;
-
- // Initialize window
- child->window_start = parent->window_start;
- child->window_end = min(parent->window_end, max_flow);
-
- if (parent->flow != child->flow or parent->flow == 0) {
-
- // This nuc begins a new homopolymer
- float alive = 0;
- child->state[parent->window_start] = 0;
-
- for (int flow = parent->window_start; flow < child->window_end; ++flow) {
-
- // State progression according to phasing model
- if ((flow) < parent->window_end)
- alive += parent->state[flow];
- child->state[flow] = alive * transition_base_[nuc&7][flow];
- alive *= transition_flow_[nuc&7][flow];
-
- // Window maintenance
- if (flow == child->window_start and child->state[flow] < kStateWindowCutoff)
- child->window_start++;
-
- if (flow == child->window_end-1 and child->window_end < max_flow and alive > kStateWindowCutoff)
- child->window_end++;
- }
-
- } else {
- // This nuc simply prolongs current homopolymer, inherits state from parent
- //for (int flow = child->window_start; flow < child->window_end; ++flow)
- // child->state[flow] = parent->state[flow];
- memcpy(&child->state[child->window_start], &parent->state[child->window_start],
- (child->window_end-child->window_start)*sizeof(float));
- }
-
- for (int flow = parent->window_start; flow < parent->window_end; ++flow)
- child->prediction[flow] = parent->prediction[flow] + child->state[flow];
- for (int flow = parent->window_end; flow < child->window_end; ++flow)
- child->prediction[flow] = child->state[flow];
-}
-
-//-------------------------------------------------------------------------
-
-void TreephaserLite::AdvanceStateInPlace(TreephaserPath *state, char nuc, int max_flow) const
-{
-
- int old_flow = state->flow;
- int old_window_start = state->window_start;
- int old_window_end = state->window_end;
-
- // Advance in-phase flow
- while (state->flow < max_flow and flow_order_[state->flow] != nuc)
- state->flow++;
- if (state->flow == max_flow) // Immediately return if base does not fit any more
- return;
-
- if (old_flow == state->flow)
- state->last_hp++;
- else
- state->last_hp = 1;
-
- if (old_flow != state->flow or old_flow == 0) {
-
- // This nuc begins a new homopolymer, need to adjust state
- float alive = 0;
- for (int flow = old_window_start; flow < state->window_end; flow++) {
-
- // State progression according to phasing model
- if (flow < old_window_end)
- alive += state->state[flow];
- state->state[flow] = alive * transition_base_[nuc&7][flow];
- alive *= transition_flow_[nuc&7][flow];
-
- // Window maintenance
- if (flow == state->window_start and state->state[flow] < kStateWindowCutoff)
- state->window_start++;
-
- if (flow == state->window_end-1 and state->window_end < max_flow and alive > kStateWindowCutoff)
- state->window_end++;
- }
- }
-
- for (int flow = old_window_start; flow < state->window_end; ++flow)
- state->prediction[flow] += state->state[flow];
-}
-
-
-//-------------------------------------------------------------------------
-
-void TreephaserLite::Simulate(BasecallerRead& data, int max_flows)
-{
- InitializeState(&path_[0]);
-
- for (vector<char>::iterator nuc = data.sequence.begin(); nuc != data.sequence.end()
- and path_[0].flow < max_flows; ++nuc) {
- AdvanceStateInPlace(&path_[0], *nuc, flow_order_.num_flows());
- }
-
- data.prediction.swap(path_[0].prediction);
-}
-
-//-------------------------------------------------------------------------
-
-void TreephaserLite::Solve(BasecallerRead& read, int max_flows, int restart_flows)
-{
- static const char nuc_int_to_char[5] = "ACGT";
- assert(max_flows <= flow_order_.num_flows());
-
- // Initialize stack: just one root path
- for (int p = 1; p < kNumPaths; ++p)
- path_[p].in_use = false;
-
- InitializeState(&path_[0]);
- path_[0].path_metric = 0;
- path_[0].per_flow_metric = 0;
- path_[0].residual_left_of_window = 0;
- path_[0].dot_counter = 0;
- path_[0].in_use = true;
-
- int space_on_stack = kNumPaths - 1;
- float sum_of_squares_upper_bound = 1e20; //max_flows; // Squared distance of solution to measurements
-
- if (restart_flows > 0) {
- // The solver will not attempt to solve initial restart_flows
- // - Simulate restart_flows instead of solving
- // - If it turns out that solving was finished before restart_flows, simply exit without any changes to the read.
-
- restart_flows = min(restart_flows, flow_order_.num_flows());
-
- for (vector<char>::iterator nuc = read.sequence.begin();
- nuc != read.sequence.end() and path_[0].flow < restart_flows; ++nuc) {
- AdvanceStateInPlace(&path_[0], *nuc, flow_order_.num_flows());
- if (path_[0].flow < flow_order_.num_flows())
- path_[0].sequence.push_back(*nuc);
- }
-
- if (path_[0].flow < restart_flows-10) { // This read ended before restart_flows. No point re-solving it.
- read.prediction.swap(path_[0].prediction);
- return;
- }
-
- for (int flow = 0; flow < path_[0].window_start; ++flow) {
- float residual = read.normalized_measurements[flow] - path_[0].prediction[flow];
- path_[0].residual_left_of_window += residual * residual;
- }
- }
-
- // Initializing variables
- //read.solution.assign(flow_order_.num_flows(), 0);
- read.sequence.clear();
- read.sequence.reserve(2*flow_order_.num_flows());
- read.prediction.assign(flow_order_.num_flows(), 0);
-
- // Main loop to select / expand / delete paths
- while (1) {
-
- // ------------------------------------------
- // Step 1: Prune the content of the stack and make sure there are at least 4 empty slots
-
- // Remove paths that are more than 'kMaxPathDelay' behind the longest one
- if (space_on_stack < kNumPaths-3) {
- int longest_path = 0;
- for (int p = 0; p < kNumPaths; ++p)
- if (path_[p].in_use)
- longest_path = max(longest_path, path_[p].flow);
-
- if (longest_path > kMaxPathDelay) {
- for (int p = 0; p < kNumPaths; ++p) {
- if (path_[p].in_use and path_[p].flow < longest_path-kMaxPathDelay) {
- path_[p].in_use = false;
- space_on_stack++;
- }
- }
- }
- }
-
- // If necessary, remove paths with worst perFlowMetric
- while (space_on_stack < 4) {
- // find maximum per flow metric
- float max_per_flow_metric = -0.1;
- int max_metric_path = kNumPaths;
- for (int p = 0; p < kNumPaths; ++p) {
- if (path_[p].in_use and path_[p].per_flow_metric > max_per_flow_metric) {
- max_per_flow_metric = path_[p].per_flow_metric;
- max_metric_path = p;
- }
- }
-
- // killing path with largest per flow metric
- if (!(max_metric_path < kNumPaths)) {
- printf("Failed assertion in Treephaser\n");
- for (int p = 0; p < kNumPaths; ++p) {
- if (path_[p].in_use)
- printf("Path %d, in_use = true, per_flow_metric = %f\n", p, path_[p].per_flow_metric);
- else
- printf("Path %d, in_use = false, per_flow_metric = %f\n", p, path_[p].per_flow_metric);
- }
- fflush(NULL);
- }
- assert (max_metric_path < kNumPaths);
-
- path_[max_metric_path].in_use = false;
- space_on_stack++;
- }
-
- // ------------------------------------------
- // Step 2: Select a path to expand or break if there is none
-
- TreephaserPath *parent = NULL;
- float min_path_metric = 1000;
- for (int p = 0; p < kNumPaths; ++p) {
- if (path_[p].in_use and path_[p].path_metric < min_path_metric) {
- min_path_metric = path_[p].path_metric;
- parent = &path_[p];
- }
- }
- if (!parent)
- break;
-
-
- // ------------------------------------------
- // Step 3: Construct four expanded paths and calculate feasibility metrics
- assert (space_on_stack >= 4);
-
- TreephaserPath *children[4];
-
- for (int nuc = 0, p = 0; nuc < 4; ++p)
- if (not path_[p].in_use)
- children[nuc++] = &path_[p];
-
- float penalty[4] = { 0, 0, 0, 0 };
-
- for (int nuc = 0; nuc < 4; ++nuc) {
-
- TreephaserPath *child = children[nuc];
-
- AdvanceState(child, parent, nuc_int_to_char[nuc], max_flows);
-
- // Apply easy termination rules
-
- if (child->flow >= max_flows) {
- penalty[nuc] = 25; // Mark for deletion
- continue;
- }
-
- if (child->last_hp > kMaxHP) {
- penalty[nuc] = 25; // Mark for deletion
- continue;
- }
-
- if ((int)parent->sequence.size() >= (2 * flow_order_.num_flows() - 10)) {
- penalty[nuc] = 25; // Mark for deletion
- continue;
- }
-
- child->path_metric = parent->residual_left_of_window;
- child->residual_left_of_window = parent->residual_left_of_window;
-
- float penaltyN = 0;
- float penalty1 = 0;
-
- for (int flow = parent->window_start; flow < child->window_end; ++flow) {
-
- float residual = read.normalized_measurements[flow] - child->prediction[flow];
- float residual_squared = residual * residual;
-
- // Metric calculation
- if (flow < child->window_start) {
- child->residual_left_of_window += residual_squared;
- child->path_metric += residual_squared;
- } else if (residual <= 0)
- child->path_metric += residual_squared;
-
- if (residual <= 0)
- penaltyN += residual_squared;
- else if (flow < child->flow)
- penalty1 += residual_squared;
- }
-
-
- penalty[nuc] = penalty1 + kNegativeMultiplier * penaltyN;
- penalty1 += penaltyN;
-
- if (child->flow>0)
- child->per_flow_metric = (child->path_metric + 0.5 * penalty1) / child->flow;
-
- } //looping over nucs
-
-
- // Find out which nuc has the least penalty (the greedy choice nuc)
- int best_nuc = 0;
- if (penalty[best_nuc] > penalty[1])
- best_nuc = 1;
- if (penalty[best_nuc] > penalty[2])
- best_nuc = 2;
- if (penalty[best_nuc] > penalty[3])
- best_nuc = 3;
-
- // ------------------------------------------
- // Step 4: Use calculated metrics to decide which paths are worth keeping
-
- for (int nuc = 0; nuc < 4; ++nuc) {
-
- TreephaserPath *child = children[nuc];
-
- // Path termination rules
-
- if (penalty[nuc] >= 20)
- continue;
-
- if (child->path_metric > sum_of_squares_upper_bound)
- continue;
-
- // This is the only rule that depends on finding the "best nuc"
- if (penalty[nuc] - penalty[best_nuc] >= kExtendThreshold)
- continue;
-
- float dot_signal = (read.normalized_measurements[child->flow]
- - parent->prediction[child->flow])
- / child->state[child->flow];
-
- child->dot_counter = (dot_signal < kDotThreshold) ? (parent->dot_counter + 1) : 0;
- if (child->dot_counter > 1)
- continue;
-
- // Path survived termination rules and will be kept on stack
- child->in_use = true;
- space_on_stack--;
-
- // Fill out the remaining portion of the prediction
- memcpy(&child->prediction[0], &parent->prediction[0], (parent->window_start)*sizeof(float));
-
- for (int flow = child->window_end; flow < max_flows; ++flow)
- child->prediction[flow] = 0;
-
- // Fill out the solution
- child->sequence = parent->sequence;
- child->sequence.push_back(nuc_int_to_char[nuc]);
- }
-
- // ------------------------------------------
- // Step 5. Check if the selected path is in fact the best path so far
-
- // Computing sequence squared distance
- float sum_of_squares = parent->residual_left_of_window;
- for (int flow = parent->window_start; flow < max_flows; flow++) {
-
- float residual = read.normalized_measurements[flow] - parent->prediction[flow];
- sum_of_squares += residual * residual;
- }
-
- // Updating best path
- if (sum_of_squares < sum_of_squares_upper_bound) {
- read.prediction.swap(parent->prediction);
- read.sequence.swap(parent->sequence);
- sum_of_squares_upper_bound = sum_of_squares;
- }
-
- parent->in_use = false;
- space_on_stack++;
-
- } // main decision loop
-}
-
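The windowed normalization above estimates an additive offset per window as the median residual over flows whose predicted signal is near zero, using std::nth_element. The sketch below isolates that single-window estimate under the assumption of plain float vectors; it is illustrative only and not the TreephaserLite API.

#include <algorithm>
#include <cstddef>
#include <iostream>
#include <vector>

// Median residual over "empty" flows (prediction < 0.3), or 0 if too few points,
// mirroring the additive-offset step of the windowed normalization.
float MedianOffset(const std::vector<float> &raw, const std::vector<float> &pred) {
    std::vector<float> residuals;
    for (std::size_t i = 0; i < raw.size() && i < pred.size(); ++i)
        if (pred[i] < 0.3f)
            residuals.push_back(raw[i] - pred[i]);
    if (residuals.size() <= 5)
        return 0.0f;  // not enough evidence; the real code keeps the previous normalizer
    std::nth_element(residuals.begin(),
                     residuals.begin() + residuals.size() / 2, residuals.end());
    return residuals[residuals.size() / 2];
}

int main() {
    std::vector<float> raw  = {0.12f, 1.05f, 0.09f, 0.11f, 0.10f, 0.13f, 0.08f};
    std::vector<float> pred = {0.00f, 1.00f, 0.00f, 0.00f, 0.00f, 0.00f, 0.00f};
    std::cout << "additive offset = " << MedianOffset(raw, pred) << std::endl;  // ~0.11
}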
diff --git a/src/ionhammer/subcluster.cpp b/src/ionhammer/subcluster.cpp
deleted file mode 100644
index d733cf7..0000000
--- a/src/ionhammer/subcluster.cpp
+++ /dev/null
@@ -1,135 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#include "subcluster.hpp"
-#include "config_struct.hpp"
-#include "consensus.hpp"
-#include "hkmer_distance.hpp"
-#include "kmer_data.hpp"
-#include "logger/log_writers.hpp"
-
-#include <boost/numeric/ublas/matrix.hpp>
-
-#include <vector>
-#include <iostream>
-
-hammer::HKMer center(const KMerData &data, const std::vector<size_t>& kmers) {
- hammer::HKMer res;
- namespace numeric = boost::numeric::ublas;
-
- for (unsigned i = 0; i < hammer::K; ++i) {
- numeric::matrix<double> scores(4, 64, 0);
- for (size_t j = 0; j < kmers.size(); ++j) {
- const hammer::KMerStat &k = data[kmers[j]];
- // FIXME: switch to MLE when we have per-run quality values
-#if 1
- scores(k.kmer[i].nucl, k.kmer[i].len) += double(k.count) * (1 - k.qual);
-#else
- for (unsigned n = 0; n < 4; ++n)
- for (unsigned l = 1; l < 64; ++l)
- scores(n, l) += k.count * (n == k.kmer[i].nucl && l == k.kmer[i].len ?
- log(1 - k.qual) : log(k.qual) - log(4*63 - 1));
-#endif
- }
-
- res[i] = hammer::iontorrent::consensus(scores).first;
- }
-
- return res;
-}
-
-bool assign(KMerData &kmer_data, const std::vector<size_t> &cluster) {
- hammer::HKMer c = center(kmer_data, cluster);
- bool nonread = false;
-
- size_t idx = kmer_data.seq_idx(c);
- if (kmer_data[idx].kmer != c) {
-# pragma omp critical
- {
- idx = kmer_data.push_back(hammer::KMerStat(0, c, 1.0));
- }
- nonread = true;
- }
-
- for (size_t j = 0; j < cluster.size(); ++j)
- kmer_data[cluster[j]].changeto = unsigned(idx);
-
- return nonread;
-}
-
-void dump(const KMerData &kmer_data, const std::vector<size_t> &cluster) {
- std::cerr << "{ \n\"kmers\": {";
- for (size_t j = 0; j < cluster.size(); ++j) {
- if (j > 0) std::cerr << ", ";
- std::cerr << '"' << kmer_data[cluster[j]].kmer << "\": ["
- << kmer_data[cluster[j]].count << ", "
- << 1 - kmer_data[cluster[j]].qual << "] \n";
- }
- std::cerr << "}, \"center\": { \"status\": ";
- hammer::HKMer c = center(kmer_data, cluster);
- size_t idx = kmer_data.seq_idx(c);
- if (kmer_data[idx].kmer == c) {
- std::cerr << "\"ok\", \"center\": \"" << c << "\"}\n";
- } else {
- std::cerr << "\"not\", \"kmer\": \"" << kmer_data[idx].kmer
- << "\", \"center\": \"" << c << "\"}\n";
- }
- std::cerr << "}" << std::endl;
-}
-
-size_t subcluster(KMerData &kmer_data, std::vector<size_t> &cluster) {
- size_t nonread = 0;
-
- // First, sort the kmer indices wrt count
- std::sort(cluster.begin(), cluster.end(), CountCmp(kmer_data));
-
- // For now the number of subclusters is chosen very crudely: we assume that the quality should be 1.
- size_t k = 0;
- for (size_t i = 0; i < cluster.size(); ++i)
- k += kmer_data[cluster[i]].qual < cfg::get().center_qual_threshold;
-
- if (k <= 1) {
-#if 0
- dump(kmer_data, cluster);
-#endif
- return assign(kmer_data, cluster);
- }
-
- // Find the closest center
- std::vector<std::vector<size_t> > idx(k, std::vector<size_t>());
- for (size_t i = 0; i < k; ++i)
- idx[i].push_back(cluster[i]);
- for (size_t i = k; i < cluster.size(); ++i) {
- unsigned dist = std::numeric_limits<unsigned>::max();
- size_t cidx = k;
- hammer::HKMer kmerx = kmer_data[cluster[i]].kmer;
- for (size_t j = 0; j < k; ++j) {
- hammer::HKMer kmery = kmer_data[cluster[j]].kmer;
- unsigned cdist = hammer::distanceHKMer(kmerx.begin(), kmerx.end(),
- kmery.begin(), kmery.end());
- if (cdist < dist) {
- cidx = j;
- dist = cdist;
- }
- }
- VERIFY(cidx < k);
- idx[cidx].push_back(cluster[i]);
- }
-
- for (auto it = idx.begin(), et = idx.end(); it != et; ++it) {
- const std::vector<size_t> &subcluster = *it;
-
- if (assign(kmer_data, subcluster)) {
- nonread += 1;
-#if 0
- dump(kmer_data, subcluster);
-#endif
- }
- }
-
- return nonread;
-}
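The subclustering above counts how many cluster members pass the center quality threshold, keeps that many provisional centers, and assigns each remaining k-mer to the nearest center. Here is a simplified illustration of the nearest-center assignment loop only, with plain strings and a Hamming distance standing in for hk-mers and distanceHKMer; all values are hypothetical.

#include <cstddef>
#include <iostream>
#include <limits>
#include <string>
#include <vector>

// Plain Hamming distance; the real code uses distanceHKMer on homopolymer runs.
unsigned HammingDistance(const std::string &a, const std::string &b) {
    unsigned d = 0;
    for (std::size_t i = 0; i < a.size() && i < b.size(); ++i)
        d += (a[i] != b[i]);
    return d;
}

int main() {
    std::vector<std::string> centers = {"ACGTACGT", "TTGTACGA"};             // provisional centers
    std::vector<std::string> kmers   = {"ACGTACGA", "TTGTACGT", "ACGAACGT"}; // remaining members

    std::vector<std::vector<std::string>> idx(centers.size());
    for (const auto &kmer : kmers) {
        unsigned dist = std::numeric_limits<unsigned>::max();
        std::size_t cidx = 0;
        for (std::size_t j = 0; j < centers.size(); ++j) {
            unsigned cdist = HammingDistance(kmer, centers[j]);
            if (cdist < dist) { cidx = j; dist = cdist; }
        }
        idx[cidx].push_back(kmer);  // join the closest subcluster
    }
    for (std::size_t j = 0; j < centers.size(); ++j)
        std::cout << "center " << j << ": " << idx[j].size() << " member(s)" << std::endl;
}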
diff --git a/src/ionhammer/valid_hkmer_generator.hpp b/src/ionhammer/valid_hkmer_generator.hpp
deleted file mode 100644
index 86fb158..0000000
--- a/src/ionhammer/valid_hkmer_generator.hpp
+++ /dev/null
@@ -1,250 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#ifndef HAMMER_VALIDHKMERGENERATOR_HPP_
-#define HAMMER_VALIDHKMERGENERATOR_HPP_
-
-#include <deque>
-#include <string>
-#include <vector>
-
-#include "io/single_read.hpp"
-#include "HSeq.hpp"
-
-#include <cstdint>
-#include <cmath>
-
-template<size_t kK>
-class ValidHKMerGenerator {
- public:
- /**
- * @param read Read to generate k-mers from.
- * @param bad_quality_threshold This class virtually cuts
- * nucleotides with quality lower than the threshold from the ends of the
- * read.
- */
- // FIXME: Switch to delegating ctor.
- explicit ValidHKMerGenerator(const io::SingleRead &read,
- unsigned bad_quality_threshold = 2) {
- Reset(read.GetSequenceString().data(),
- read.GetQualityString().data(),
- read.GetSequenceString().size(),
- bad_quality_threshold);
- }
-
- /**
- * @param seq sequence to generate k-mers from.
- * @param qual quality string
- * @param bad_quality_threshold This class virtually cuts
- * nucleotides with quality lower than the threshold from the ends of the
- * read.
- */
- explicit ValidHKMerGenerator(const char *seq, const char *qual,
- size_t len,
- unsigned bad_quality_threshold = 2) {
- Reset(seq, qual, len, bad_quality_threshold);
- }
-
- ValidHKMerGenerator()
- : kmer_(), seq_(0), qual_(0),
- pos_(-1), nlen_(-1), end_(-1), len_(0),
- correct_probability_(1), bad_quality_threshold_(2),
- has_more_(false), first_(true) {}
-
- void Reset(const char *seq, const char *qual,
- size_t len,
- unsigned bad_quality_threshold = 2) {
- kmer_ = hammer::HSeq<kK>();
- seq_ = seq;
- qual_ = qual;
- pos_ = -1;
- nlen_ = -1;
- end_ = -1;
- len_ = len;
- correct_probability_ = 1.0;
- bad_quality_threshold_ = bad_quality_threshold;
- has_more_ = true;
- first_ = true;
- last_ = false;
- probs_.resize(0);
-
- TrimBadQuality();
- Next();
- }
-
- /**
- * @result true if Next() succeeded in generating a new k-mer, false
- * otherwise.
- */
- bool HasMore() const {
- return has_more_;
- }
-
- /**
- * @result last k-mer generated by Next().
- */
- const hammer::HSeq<kK>& kmer() const {
- return kmer_;
- }
-
- /**
- * @result last k-mer position in initial read.
- */
- size_t pos() const {
- return pos_;
- }
-
- size_t nlen() const {
- return nlen_;
- }
-
- /**
- * @result number of nucleotides trimmed from left end
- */
- size_t trimmed_left() const {
- return beg_;
- }
-
- /**
- * @result number of nucleotides trimmed from right end
- */
- size_t trimmed_right() const {
- return len_ - end_;
- }
-
- /**
- * @result probability that last generated k-mer is correct.
- */
- double correct_probability() const {
- return correct_probability_;
- }
-
- /**
- * This function reads the next k-mer from the read and sets has_more_ to true
- * if it succeeded. You can access the k-mer just read with kmer().
- */
- void Next();
- private:
- void TrimBadQuality();
-
- double Prob(unsigned qual) {
- return (qual < 3 ? 0.25 : 1 - pow(10.0, -(qual / 10.0)));
- // return Globals::quality_probs[qual];
- }
-
- unsigned GetQual(size_t pos) {
- if (pos >= len_) {
- return 2;
- } else {
- return qual_[pos];
- }
- }
-
- hammer::HSeq<kK> kmer_;
- const char* seq_;
- const char* qual_;
- size_t pos_;
- size_t nlen_;
- size_t beg_;
- size_t end_;
- size_t len_;
- double correct_probability_;
- unsigned bad_quality_threshold_;
- bool has_more_;
- bool first_;
- bool last_;
- std::deque<double> probs_;
-
- // Disallow copy and assign
- ValidHKMerGenerator(const ValidHKMerGenerator&) = delete;
- void operator=(const ValidHKMerGenerator&) = delete;
-};
-
-template<size_t kK>
-void ValidHKMerGenerator<kK>::TrimBadQuality() {
- pos_ = 0;
- if (qual_)
- for (; pos_ < len_; ++pos_) {
- if (GetQual(pos_) >= bad_quality_threshold_)
- break;
- }
- beg_ = pos_;
- end_ = len_;
- if (qual_)
- for (; end_ > pos_; --end_) {
- if (GetQual(end_ - 1) >= bad_quality_threshold_)
- break;
- }
-}
-
-template<size_t kK>
-void ValidHKMerGenerator<kK>::Next() {
- if (last_) {
- has_more_ = false;
- return;
- }
-
- size_t toadd = (first_ ? kK : 1);
- char pnucl = -1;
- double cprob = 1.0;
- nlen_ = 0;
- // Build the flow-space kmer looking over homopolymer stretches.
- while (toadd) {
- // If we went past the end, then there are no new kmers anymore.
- // The current one might be incomplete but we yield it anyway
- // because one hk-mer can't have much influence on the consensus.
- if (pos_ >= end_) {
- last_ = true;
- if (toadd > 0) {
- has_more_ = false;
- }
- return;
- }
-
- // Check whether the current nucl is good (not 'N')
- char cnucl = seq_[pos_ + nlen_];
- if (!is_nucl(cnucl)) {
- toadd = kK;
- pnucl = -1;
- pos_ += nlen_ + 1;
- nlen_ = 0;
- correct_probability_ = 1.0;
- probs_.resize(0);
- continue;
- }
- if (qual_)
- cprob *= Prob(GetQual(pos_ + nlen_));
-
- // If current nucl differs from previous nucl then either we're starting the
- // k-mer or just finished the homopolymer run.
- if (cnucl != pnucl) {
- // If previous nucl was valid then finish the current homopolymer run
- if (pnucl != -1) {
- toadd -= 1;
- correct_probability_ *= cprob;
- if (probs_.size() == kK) {
- correct_probability_ /= probs_[0];
- probs_.pop_front();
- }
-
- probs_.push_back(cprob);
- cprob = 1.0;
- }
- pnucl = cnucl;
- }
-
- // If we have something to add to the flow-space kmer, do it now.
- if (toadd) {
- kmer_ <<= cnucl;
- nlen_ += 1;
- }
- }
-
- pos_ += nlen_;
- first_ = false;
-}
-#endif // HAMMER_VALIDHKMERGENERATOR_HPP_
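The correct_probability() value above is maintained incrementally: the probability of each newly added homopolymer run is multiplied in, and once kK runs are in the window the oldest contribution is divided back out. Below is a small sketch of that sliding product with a toy window size and made-up qualities; the real generator uses hammer::K runs and the read's quality string.

#include <cmath>
#include <cstddef>
#include <deque>
#include <iostream>

// Phred-style probability that a base call is correct (clamped for very low qualities).
double Prob(unsigned qual) {
    return qual < 3 ? 0.25 : 1.0 - std::pow(10.0, -(qual / 10.0));
}

int main() {
    const std::size_t kK = 3;            // toy window size; the generator uses hammer::K runs
    std::deque<double> window;
    double correct_probability = 1.0;

    unsigned quals[] = {30, 20, 35, 10, 40};
    for (unsigned q : quals) {
        double p = Prob(q);
        correct_probability *= p;        // multiply in the incoming run
        window.push_back(p);
        if (window.size() > kK) {
            correct_probability /= window.front();  // divide out the run leaving the window
            window.pop_front();
        }
        std::cout << "P(correct) over last " << window.size()
                  << " run(s) = " << correct_probability << std::endl;
    }
}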
diff --git a/src/modules/CMakeLists.txt b/src/modules/CMakeLists.txt
new file mode 100644
index 0000000..280629f
--- /dev/null
+++ b/src/modules/CMakeLists.txt
@@ -0,0 +1,24 @@
+############################################################################
+# Copyright (c) 2015 Saint Petersburg State University
+# Copyright (c) 2011-2014 Saint Petersburg Academic University
+# All Rights Reserved
+# See file LICENSE for details.
+############################################################################
+
+project(spades_modules CXX)
+
+add_subdirectory(pipeline)
+add_subdirectory(assembly_graph)
+add_subdirectory(data_structures/sequence)
+add_subdirectory(math)
+add_subdirectory(algorithms/path_extend)
+add_subdirectory(algorithms)
+add_subdirectory(paired_info)
+add_subdirectory(stages)
+add_subdirectory(dev_support)
+add_subdirectory(io)
+add_subdirectory(data_structures/mph_index)
+
+add_library(spades_modules STATIC empty.cpp)
+
+target_link_libraries(spades_modules graph_support input sequence pipeline math_module path_extend paired_info stages dev_support mph_index algorithms)
diff --git a/src/modules/algorithms/CMakeLists.txt b/src/modules/algorithms/CMakeLists.txt
new file mode 100644
index 0000000..a4b8d60
--- /dev/null
+++ b/src/modules/algorithms/CMakeLists.txt
@@ -0,0 +1,11 @@
+############################################################################
+# Copyright (c) 2015 Saint Petersburg State University
+# Copyright (c) 2011-2014 Saint Petersburg Academic University
+# All Rights Reserved
+# See file LICENSE for details.
+############################################################################
+
+project(algorithms CXX)
+
+add_library(algorithms STATIC genome_consistance_checker.cpp)
+
diff --git a/src/modules/algorithms/dijkstra/dijkstra_algorithm.hpp b/src/modules/algorithms/dijkstra/dijkstra_algorithm.hpp
new file mode 100644
index 0000000..11c32d8
--- /dev/null
+++ b/src/modules/algorithms/dijkstra/dijkstra_algorithm.hpp
@@ -0,0 +1,288 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+#pragma once
+
+#include "dev_support/simple_tools.hpp"
+#include "dijkstra_settings.hpp"
+
+#include <queue>
+#include <vector>
+#include <set>
+#include <map>
+
+namespace omnigraph {
+
+template<typename Graph, typename distance_t = size_t>
+struct element_t{
+ typedef typename Graph::VertexId VertexId;
+ typedef typename Graph::EdgeId EdgeId;
+
+ distance_t distance;
+ VertexId curr_vertex;
+ VertexId prev_vertex;
+ EdgeId edge_between;
+
+ element_t(distance_t new_distance, VertexId new_cur_vertex, VertexId new_prev_vertex,
+ EdgeId new_edge_between) : distance(new_distance), curr_vertex(new_cur_vertex),
+ prev_vertex(new_prev_vertex), edge_between(new_edge_between) { }
+};
+
+template<typename T>
+class ReverseDistanceComparator {
+public:
+ ReverseDistanceComparator() {
+ }
+
+ bool operator()(T obj1, T obj2){
+ if(obj1.distance != obj2.distance)
+ return obj2.distance < obj1.distance;
+ if(obj2.curr_vertex != obj1.curr_vertex)
+ return obj2.curr_vertex < obj1.curr_vertex;
+ if(obj2.prev_vertex != obj1.prev_vertex)
+ return obj2.prev_vertex < obj1.prev_vertex;
+ return obj2.edge_between < obj1.edge_between;
+ }
+};
+
+template<class Graph, class DijkstraSettings, typename distance_t = size_t>
+class Dijkstra {
+ typedef typename Graph::VertexId VertexId;
+ typedef typename Graph::EdgeId EdgeId;
+ typedef distance_t DistanceType;
+
+ typedef std::map<VertexId, distance_t> distances_map;
+ typedef typename distances_map::const_iterator distances_map_ci;
+ typedef typename std::priority_queue<element_t<Graph, distance_t>, std::vector<element_t<Graph, distance_t>>,
+ ReverseDistanceComparator<element_t<Graph, distance_t>>> queue_t;
+
+ // constructor parameters
+ const Graph& graph_;
+ DijkstraSettings settings_;
+ const size_t max_vertex_number_;
+
+ // changeable parameters
+ bool finished_;
+ size_t vertex_number_;
+ bool vertex_limit_exceeded_;
+
+ // accumulative structures
+ distances_map distances_;
+ std::set<VertexId> processed_vertices_;
+ std::map<VertexId, pair<VertexId, EdgeId>> prev_vert_map_;
+
+ void Init(VertexId start, queue_t &queue) {
+ vertex_number_ = 0;
+ distances_.clear();
+ processed_vertices_.clear();
+ prev_vert_map_.clear();
+ set_finished(false);
+ settings_.Init(start);
+ queue.push(element_t<Graph, distance_t>(0, start, VertexId(0), EdgeId(0)));
+ prev_vert_map_[start] = std::pair<VertexId, EdgeId>(VertexId(0), EdgeId(0));
+ }
+
+ void set_finished(bool state) {
+ finished_ = state;
+ }
+
+ bool CheckPutVertex(VertexId vertex, EdgeId edge, distance_t length) const {
+ return settings_.CheckPutVertex(vertex, edge, length);
+ }
+
+ bool CheckProcessVertex(VertexId vertex, distance_t distance) {
+ ++vertex_number_;
+ if (vertex_number_ > max_vertex_number_) {
+ vertex_limit_exceeded_ = true;
+ return false;
+ }
+ return (vertex_number_ < max_vertex_number_) && settings_.CheckProcessVertex(vertex, distance);
+ }
+
+ distance_t GetLength(EdgeId edge) const {
+ return settings_.GetLength(edge);
+ }
+
+ void AddNeighboursToQueue(VertexId cur_vertex, distance_t cur_dist, queue_t& queue) {
+ auto neigh_iterator = settings_.GetIterator(cur_vertex);
+ while (neigh_iterator.HasNext()) {
+ TRACE("Checking new neighbour of vertex " << graph_.str(cur_vertex) << " started");
+ auto cur_pair = neigh_iterator.Next();
+ if (!DistanceCounted(cur_pair.vertex)) {
+ TRACE("Adding new entry to queue");
+ distance_t new_dist = GetLength(cur_pair.edge) + cur_dist;
+ TRACE("Entry: vertex " << graph_.str(cur_vertex) << " distance " << new_dist);
+ if (CheckPutVertex(cur_pair.vertex, cur_pair.edge, new_dist)) {
+ TRACE("CheckPutVertex returned true and new entry is added");
+ queue.push(element_t<Graph, distance_t>(new_dist, cur_pair.vertex,
+ cur_vertex, cur_pair.edge));
+ }
+ }
+ TRACE("Checking new neighbour of vertex " << graph_.str(cur_vertex) << " finished");
+ }
+ TRACE("All neighbours of vertex " << graph_.str(cur_vertex) << " processed");
+ }
+
+public:
+ Dijkstra(const Graph &graph, DijkstraSettings settings, size_t max_vertex_number = size_t(-1)) :
+ graph_(graph),
+ settings_(settings),
+ max_vertex_number_(max_vertex_number),
+ finished_(false),
+ vertex_number_(0),
+ vertex_limit_exceeded_(false) {}
+
+ Dijkstra(Dijkstra&& /*other*/) = default;
+
+ Dijkstra& operator=(Dijkstra&& /*other*/) = default;
+
+ Dijkstra(const Dijkstra& /*other*/) = delete;
+
+ Dijkstra& operator=(const Dijkstra& /*other*/) = delete;
+
+ bool finished() const {
+ return finished_;
+ }
+
+ bool DistanceCounted(VertexId vertex) const {
+ return distances_.find(vertex) != distances_.end();
+ }
+
+ distance_t GetDistance(VertexId vertex) const {
+ VERIFY(DistanceCounted(vertex));
+ return distances_.find(vertex)->second;
+ }
+
+ std::pair<distances_map_ci, distances_map_ci> GetDistances() const {
+ distances_map_ci begin = distances_.begin();
+ distances_map_ci end = distances_.end();
+ return make_pair(begin, end);
+ }
+
+ void Run(VertexId start) {
+ TRACE("Starting dijkstra run from vertex " << graph_.str(start));
+ queue_t queue;
+ Init(start, queue);
+ TRACE("Priority queue initialized. Starting search");
+
+ while (!queue.empty() && !finished()) {
+ TRACE("Dijkstra iteration started");
+ const element_t<Graph, distance_t>& next = queue.top();
+ distance_t distance = next.distance;
+ VertexId vertex = next.curr_vertex;
+
+ prev_vert_map_[vertex] = std::pair<VertexId, EdgeId>(next.prev_vertex, next.edge_between);
+ queue.pop();
+ TRACE("Vertex " << graph_.str(vertex) << " with distance " << distance << " fetched from queue");
+
+ if (DistanceCounted(vertex)) {
+ TRACE("Distance to vertex " << graph_.str(vertex) << " already counted. Proceeding to next queue entry.");
+ continue;
+ }
+ distances_.insert(make_pair(vertex, distance));
+
+ TRACE("Vertex " << graph_.str(vertex) << " is found to be at distance "
+ << distance << " from vertex " << graph_.str(start));
+ if (!CheckProcessVertex(vertex, distance)) {
+ TRACE("Check for processing vertex failed. Proceeding to the next queue entry.");
+ continue;
+ }
+ processed_vertices_.insert(vertex);
+ AddNeighboursToQueue(vertex, distance, queue);
+ }
+ set_finished(true);
+ TRACE("Finished dijkstra run from vertex " << graph_.str(start));
+ }
+
+ std::vector<EdgeId> GetShortestPathTo(VertexId vertex) {
+ std::vector<EdgeId> path;
+ if (prev_vert_map_.find(vertex) == prev_vert_map_.end())
+ return path;
+
+ VertexId curr_vertex = vertex;
+ VertexId prev_vertex = get(prev_vert_map_, vertex).first;
+ EdgeId edge = get(prev_vert_map_, curr_vertex).second;
+
+ while (prev_vertex != VertexId(0)) {
+ if (graph_.EdgeStart(edge) == prev_vertex)
+ path.insert(path.begin(), edge);
+ else
+ path.push_back(edge);
+ curr_vertex = prev_vertex;
+ const auto& prev_v_e = get(prev_vert_map_, curr_vertex);
+ prev_vertex = prev_v_e.first;
+ edge = prev_v_e.second;
+ }
+ return path;
+ }
+
+ vector<VertexId> ReachedVertices() const {
+ vector<VertexId> result;
+ for (auto it = distances_.begin(); it != distances_.end(); ++it) {
+ result.push_back(it->first);
+ }
+ return result;
+ }
+
+ const set<VertexId>& ProcessedVertices() const {
+ return processed_vertices_;
+ }
+
+ bool VertexLimitExceeded() const {
+ return vertex_limit_exceeded_;
+ }
+
+private:
+ DECL_LOGGER("Dijkstra");
+};
+
+template<class Graph>
+class DistanceCounter {
+ typedef typename Graph::VertexId VertexId;
+ typedef typename Graph::EdgeId EdgeId;
+ typedef ComposedDijkstraSettings<Graph,
+ LengthCalculator<Graph>,
+ VertexProcessChecker<Graph>,
+ VertexPutChecker<Graph>,
+ ForwardNeighbourIteratorFactory<Graph>> BaseDijkstraSettings;
+
+public:
+ DistanceCounter(const Graph& graph) :
+ graph_(graph),
+ dijkstra_(graph, BaseDijkstraSettings(
+ LengthCalculator<Graph>(),
+ VertexProcessChecker<Graph>(),
+ VertexPutChecker<Graph>(),
+ ForwardNeighbourIteratorFactory<Graph>())),
+ ready_(false) {
+ }
+
+ bool IsReachable(VertexId from, VertexId to) {
+ EnsureFrom(from);
+ return dijkstra_.DistanceCounted(to);
+ }
+
+ size_t Distance(VertexId from, VertexId to) {
+ EnsureFrom(from);
+ return dijkstra_.GetDistance(to);
+ }
+
+private:
+ void EnsureFrom(VertexId from) {
+ if (!ready_ || prev_ != from) {
+ dijkstra_.run(from);
+ ready_ = true;
+ prev_ = from;
+ }
+ }
+
+ const Graph& graph_;
+ Dijkstra<Graph, BaseDijkstraSettings> dijkstra_;
+ VertexId prev_;
+ bool ready_;
+};
+
+}
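
The Run() loop above implements Dijkstra with lazy deletion: a vertex may be pushed several times with different tentative distances, and stale queue entries are simply skipped once the vertex is already "counted" instead of being decreased in place. A minimal standalone sketch of that pattern on a plain adjacency list (the names below are illustrative only and are not part of the SPAdes sources):

    // Sketch of priority-queue Dijkstra with lazy deletion, mirroring the
    // skip-if-already-counted logic of omnigraph::Dijkstra::Run().
    #include <cstddef>
    #include <functional>
    #include <queue>
    #include <unordered_map>
    #include <utility>
    #include <vector>

    std::unordered_map<std::size_t, std::size_t> RunDijkstra(
            const std::vector<std::vector<std::pair<std::size_t, std::size_t>>>& adj,
            std::size_t start) {
        using Entry = std::pair<std::size_t, std::size_t>;   // (distance, vertex)
        std::priority_queue<Entry, std::vector<Entry>, std::greater<Entry>> queue;
        std::unordered_map<std::size_t, std::size_t> dist;   // settled ("counted") distances
        queue.push({0, start});
        while (!queue.empty()) {
            auto [d, v] = queue.top();
            queue.pop();
            if (dist.count(v))            // stale entry: distance already counted, skip it
                continue;
            dist[v] = d;
            for (auto [w, len] : adj[v])  // relax every outgoing edge of v
                if (!dist.count(w))
                    queue.push({d + len, w});
        }
        return dist;
    }
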
diff --git a/src/modules/algorithms/dijkstra/dijkstra_helper.hpp b/src/modules/algorithms/dijkstra/dijkstra_helper.hpp
new file mode 100644
index 0000000..756f2af
--- /dev/null
+++ b/src/modules/algorithms/dijkstra/dijkstra_helper.hpp
@@ -0,0 +1,163 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+#include "dijkstra_algorithm.hpp"
+
+namespace omnigraph {
+
+template<class Graph>
+class DijkstraHelper {
+ typedef typename Graph::VertexId VertexId;
+ typedef typename Graph::EdgeId EdgeId;
+public:
+ typedef Dijkstra<Graph, ComposedDijkstraSettings<Graph,
+ LengthCalculator<Graph>,
+ VertexProcessChecker<Graph>,
+ VertexPutChecker<Graph>,
+ UnorientedNeighbourIteratorFactory<Graph> > > UnorientedDijkstra;
+
+ //------------------------------
+
+ typedef Dijkstra<Graph, ComposedDijkstraSettings<Graph,
+ LengthCalculator<Graph>,
+ VertexProcessChecker<Graph>,
+ VertexPutChecker<Graph>,
+ BackwardNeighbourIteratorFactory<Graph> > > BackwardDijkstra;
+
+ //------------------------------
+ // bounded dijkstra
+ //------------------------------
+ typedef ComposedDijkstraSettings<Graph,
+ LengthCalculator<Graph>,
+ BoundProcessChecker<Graph>,
+ BoundPutChecker<Graph>,
+ ForwardNeighbourIteratorFactory<Graph> > BoundedDijkstraSettings;
+
+ typedef Dijkstra<Graph, BoundedDijkstraSettings> BoundedDijkstra;
+
+ static BoundedDijkstra CreateBoundedDijkstra(const Graph &graph, size_t length_bound,
+ size_t max_vertex_number = -1ul){
+ return BoundedDijkstra(graph, BoundedDijkstraSettings(
+ LengthCalculator<Graph>(graph),
+ BoundProcessChecker<Graph>(length_bound),
+ BoundPutChecker<Graph>(length_bound),
+ ForwardNeighbourIteratorFactory<Graph>(graph)),
+ max_vertex_number);
+ }
+
+ //------------------------------
+ // bounded backward dijkstra
+ //------------------------------
+
+ typedef ComposedDijkstraSettings<Graph,
+ LengthCalculator<Graph>,
+ BoundProcessChecker<Graph>,
+ BoundPutChecker<Graph>,
+ BackwardNeighbourIteratorFactory<Graph> > BackwardBoundedDijkstraSettings;
+
+ typedef Dijkstra<Graph, BackwardBoundedDijkstraSettings> BackwardBoundedDijkstra;
+
+ static BackwardBoundedDijkstra CreateBackwardBoundedDijkstra(const Graph &graph,
+ size_t bound, size_t max_vertex_number = size_t(-1)){
+ return BackwardBoundedDijkstra(graph, BackwardBoundedDijkstraSettings(
+ LengthCalculator<Graph>(graph),
+ BoundProcessChecker<Graph>(bound),
+ BoundPutChecker<Graph>(bound),
+ BackwardNeighbourIteratorFactory<Graph>(graph)), max_vertex_number);
+ }
+
+ //------------------------------
+
+ typedef Dijkstra<Graph, ComposedDijkstraSettings<Graph,
+ LengthCalculator<Graph>,
+ VertexProcessChecker<Graph>,
+ EdgeComponentPutChecker<Graph>,
+ UnorientedNeighbourIteratorFactory<Graph> > > ComponentFinder;
+ //------------------------------
+
+ typedef Dijkstra<Graph, ComposedDijkstraSettings<Graph,
+ ComponentLenCalculator<Graph>,
+ BoundProcessChecker<Graph>,
+ VertexPutChecker<Graph>,
+ UnorientedNeighbourIteratorFactory<Graph> > > NeighbourhoodFinder;
+ //------------------------------
+
+ typedef Dijkstra<Graph, ComposedDijkstraSettings<Graph,
+ LengthCalculator<Graph>,
+ VertexProcessChecker<Graph>,
+ SubgraphPutChecker<Graph>,
+ UnorientedNeighbourIteratorFactory<Graph> > > SubgraphDijkstra;
+
+ typedef ComposedDijkstraSettings<Graph,
+ PathIgnoringLengthCalculator<Graph>,
+ BoundProcessChecker<Graph>,
+ BoundPutChecker<Graph>,
+ ForwardNeighbourIteratorFactory<Graph> > PathIgnoringDijkstraSettings;
+
+
+ //------------------------------
+ // short edge dijkstra settings
+ //------------------------------
+ typedef ComposedDijkstraSettings<Graph,
+ BoundedEdgeLenCalculator<Graph>,
+ ZeroLengthProcessChecker<Graph>,
+ VertexPutChecker<Graph>,
+ UnorientedNeighbourIteratorFactory<Graph> > ShortEdgeDijkstraSettings;
+
+ typedef Dijkstra<Graph, ShortEdgeDijkstraSettings> ShortEdgeDijkstra;
+
+ static ShortEdgeDijkstra CreateShortEdgeDijkstra(const Graph &graph, size_t edge_length_bound,
+ size_t max_vertex_number = size_t(-1)){
+ return ShortEdgeDijkstra(graph, ShortEdgeDijkstraSettings(
+ BoundedEdgeLenCalculator<Graph>(graph, edge_length_bound),
+ ZeroLengthProcessChecker<Graph>(),
+ VertexPutChecker<Graph>(),
+ UnorientedNeighbourIteratorFactory<Graph>(graph)),
+ max_vertex_number);
+ }
+
+ //------------------------------
+ // counting dijkstra
+ //------------------------------
+ typedef CountingDijkstraSettings<Graph,
+ UnorientedNeighbourIteratorFactory<Graph> > UnorientCountingDijkstraSettings;
+
+ typedef Dijkstra<Graph, UnorientCountingDijkstraSettings> CountingDijkstra;
+
+ static CountingDijkstra CreateCountingDijkstra(const Graph &graph, size_t max_size,
+ size_t edge_length_bound, size_t max_vertex_number = size_t(-1)){
+ return CountingDijkstra(graph, UnorientCountingDijkstraSettings(graph,
+ UnorientedNeighbourIteratorFactory<Graph>(graph),
+ max_size, edge_length_bound), max_vertex_number);
+ }
+
+
+ //------------------------------
+ // targeted bounded dijkstra
+ //------------------------------
+
+ typedef ComposedDijkstraSettings<Graph,
+ LengthCalculator<Graph>,
+ BoundedVertexTargetedProcessChecker<Graph>,
+ BoundPutChecker<Graph>,
+ ForwardNeighbourIteratorFactory<Graph> > TargeredBoundedDijkstraSettings;
+
+ typedef Dijkstra<Graph, TargeredBoundedDijkstraSettings> TargeredBoundedDijkstra;
+
+ static TargeredBoundedDijkstra CreateTargeredBoundedDijkstra(const Graph &graph,
+ VertexId target_vertex, size_t bound, size_t max_vertex_number = size_t(-1)){
+ return TargeredBoundedDijkstra(graph,
+ TargeredBoundedDijkstraSettings(LengthCalculator<Graph>(graph),
+ BoundedVertexTargetedProcessChecker<Graph>(target_vertex, bound),
+ BoundPutChecker<Graph>(bound),
+ ForwardNeighbourIteratorFactory<Graph>(graph)),
+ max_vertex_number);
+ }
+};
+
+}
diff --git a/src/modules/algorithms/dijkstra/dijkstra_settings.hpp b/src/modules/algorithms/dijkstra/dijkstra_settings.hpp
new file mode 100644
index 0000000..4716250
--- /dev/null
+++ b/src/modules/algorithms/dijkstra/dijkstra_settings.hpp
@@ -0,0 +1,117 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "length_calculator.hpp"
+#include "vertex_process_checker.hpp"
+#include "vertex_put_checker.hpp"
+#include "neighbours_iterator.hpp"
+
+namespace omnigraph {
+
+template<class Graph,
+ class LengthCalculator,
+ class VertexProcessChecker,
+ class VertexPutChecker,
+ class NeighbourIteratorFactory,
+ typename distance_t = size_t>
+class ComposedDijkstraSettings {
+ typedef typename Graph::VertexId VertexId;
+ typedef typename Graph::EdgeId EdgeId;
+
+ LengthCalculator len_calc_;
+ VertexProcessChecker vert_proc_checker_;
+ VertexPutChecker vert_put_checker_;
+ NeighbourIteratorFactory neigh_iter_factory_;
+
+public:
+ typedef LengthCalculator LC;
+ typedef VertexProcessChecker VPrC;
+ typedef VertexPutChecker VPuC;
+ typedef NeighbourIteratorFactory NIF;
+
+ ComposedDijkstraSettings(LengthCalculator len_calc,
+ VertexProcessChecker vert_proc_checker,
+ VertexPutChecker vert_put_checker,
+ NeighbourIteratorFactory neigh_iter_factory) :
+ len_calc_(len_calc),
+ vert_proc_checker_(vert_proc_checker),
+ vert_put_checker_(vert_put_checker),
+ neigh_iter_factory_(neigh_iter_factory) { }
+
+ void Init(VertexId /*vertex*/){
+ }
+
+ distance_t GetLength(EdgeId edge) const{
+ return len_calc_.GetLength(edge);
+ }
+
+ bool CheckProcessVertex(VertexId vertex, distance_t distance){
+ return vert_proc_checker_.Check(vertex, distance);
+ }
+
+ bool CheckPutVertex(VertexId vertex, EdgeId edge, distance_t length) const{
+ return vert_put_checker_.Check(vertex, edge, length);
+ }
+
+ typename NeighbourIteratorFactory::NeighbourIterator GetIterator(VertexId vertex) {
+ return neigh_iter_factory_.CreateIterator(vertex);
+ }
+};
+
+template<class Graph, class NeighbourIteratorFactory, typename distance_t = size_t>
+class CountingDijkstraSettings {
+ typedef typename Graph::VertexId VertexId;
+ typedef typename Graph::EdgeId EdgeId;
+
+ const Graph &graph_;
+
+ NeighbourIteratorFactory neigh_iter_factory_;
+ static const distance_t inf = 100000000;
+ const size_t max_size_;
+ const size_t edge_length_bound_;
+ mutable size_t current_;
+
+public:
+ CountingDijkstraSettings(const Graph &graph,
+ NeighbourIteratorFactory neigh_iter_factory,
+ size_t max_size, size_t edge_length_bound) :
+ graph_(graph),
+ neigh_iter_factory_(neigh_iter_factory),
+ max_size_(max_size),
+ edge_length_bound_(edge_length_bound),
+ current_(0) { }
+
+ void Init(VertexId /*vertex*/){
+ current_ = 0;
+ }
+
+ distance_t GetLength(EdgeId edge) const{
+ if (graph_.length(edge) <= edge_length_bound_)
+ return graph_.length(edge);
+ return inf;
+ }
+
+ bool CheckProcessVertex(VertexId , distance_t ){
+ return current_ < max_size_;
+ }
+
+ bool CheckPutVertex(VertexId , EdgeId edge, distance_t ) const{
+ if (current_ < max_size_)
+ ++current_;
+ if (current_ < max_size_ && GetLength(edge) < inf)
+ return true;
+ return false;
+ }
+
+ typename NeighbourIteratorFactory::NeighbourIterator GetIterator(VertexId vertex) {
+ return neigh_iter_factory_.CreateIterator(vertex);
+ }
+};
+
+}
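
ComposedDijkstraSettings above is a policy-composition type: the search only ever asks its settings object for an edge length, for permission to process or enqueue a vertex, and for a neighbour iterator, so behaviour is changed by swapping policy types rather than by subclassing the search. A condensed standalone sketch of the same idea (illustrative names, not the SPAdes API):

    // Sketch of policy composition in the style of ComposedDijkstraSettings.
    #include <cstddef>

    struct UnitLength {                       // length policy: every edge has length 1
        std::size_t GetLength(int /*edge*/) const { return 1; }
    };

    struct BoundCheck {                       // process policy: stop past a distance bound
        std::size_t bound;
        bool CheckProcessVertex(int /*vertex*/, std::size_t distance) const {
            return distance <= bound;
        }
    };

    template<class LengthPolicy, class ProcessPolicy>
    struct ComposedSettings {
        LengthPolicy length;
        ProcessPolicy process;
        std::size_t GetLength(int e) const { return length.GetLength(e); }
        bool CheckProcessVertex(int v, std::size_t d) const {
            return process.CheckProcessVertex(v, d);
        }
    };

    // Analogue of DijkstraHelper::CreateBoundedDijkstra: bundle the policies once.
    inline ComposedSettings<UnitLength, BoundCheck> MakeBoundedSettings(std::size_t bound) {
        return {UnitLength{}, BoundCheck{bound}};
    }
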
diff --git a/src/modules/algorithms/dijkstra/length_calculator.hpp b/src/modules/algorithms/dijkstra/length_calculator.hpp
new file mode 100644
index 0000000..ec29690
--- /dev/null
+++ b/src/modules/algorithms/dijkstra/length_calculator.hpp
@@ -0,0 +1,112 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+#include "dev_support/standard_base.hpp"
+
+namespace omnigraph {
+
+template<class Graph, typename distance_t = size_t>
+class LengthCalculator {
+ typedef typename Graph::VertexId VertexId;
+ typedef typename Graph::EdgeId EdgeId;
+protected:
+ const Graph &graph_;
+public:
+ LengthCalculator(const Graph &graph) : graph_(graph) { }
+ virtual distance_t GetLength(EdgeId edge) const{
+ return distance_t(graph_.length(edge));
+ }
+ virtual ~LengthCalculator() { }
+};
+
+template<class Graph, typename distance_t = size_t>
+class ComponentLenCalculator : public LengthCalculator<Graph, distance_t> {
+ typedef typename Graph::VertexId VertexId;
+ typedef typename Graph::EdgeId EdgeId;
+ set<EdgeId> &component_;
+public:
+ ComponentLenCalculator(const Graph &graph, set<EdgeId> &component) :
+ LengthCalculator<Graph, distance_t>(graph), component_(component) { }
+
+ distance_t GetLength(EdgeId edge) const{
+ if (component_.count(edge) != 0)
+ return 0;
+ return this->graph_.length(edge);
+ }
+};
+
+template<class Graph, typename distance_t = size_t>
+class BoundedEdgeLenCalculator : public LengthCalculator<Graph, distance_t> {
+ typedef typename Graph::VertexId VertexId;
+ typedef typename Graph::EdgeId EdgeId;
+ distance_t bound_;
+public:
+ BoundedEdgeLenCalculator(const Graph &graph, distance_t bound) :
+ LengthCalculator<Graph, distance_t>(graph), bound_(bound) { }
+
+ distance_t GetLength(EdgeId edge) const{
+ if(this->graph_.length(edge) <= bound_)
+ return 0;
+ return 1;
+ }
+};
+
+template<class Graph, typename distance_t = size_t>
+class AlongPathLengthCalculator : public LengthCalculator<Graph, distance_t> {
+ typedef LengthCalculator<Graph, distance_t> base;
+ typedef typename Graph::VertexId VertexId;
+ typedef typename Graph::EdgeId EdgeId;
+ set<VertexId> vertex_path_;
+ distance_t bound_;
+
+ set<VertexId> CollectVertices(vector<EdgeId> &edge_path){
+ set<VertexId> result;
+ for(auto e = edge_path.begin(); e != edge_path.end(); e++){
+ result.insert(this->graph_.EdgeStart(*e));
+ result.insert(this->graph_.EdgeEnd(*e));
+ }
+ return result;
+ }
+
+public:
+ AlongPathLengthCalculator(const Graph &graph, vector<EdgeId> &edge_path, distance_t bound) :
+ LengthCalculator<Graph, distance_t>(graph),
+ vertex_path_(CollectVertices(edge_path)),
+ bound_(bound) { }
+
+ distance_t GetLength(EdgeId edge) const{
+ if (vertex_path_.count(this->graph_.EdgeStart(edge))
+ && vertex_path_.count(this->graph_.EdgeEnd(edge)))
+ return min(int(base::GetLength(edge)), 200);
+ return base::GetLength(edge);
+ }
+};
+
+template<class Graph, typename distance_t = size_t>
+class PathIgnoringLengthCalculator : public LengthCalculator<Graph, distance_t> {
+ typedef LengthCalculator<Graph, distance_t> base;
+ typedef typename Graph::VertexId VertexId;
+ typedef typename Graph::EdgeId EdgeId;
+ set<EdgeId> path_;
+ distance_t bound_;
+
+public:
+ PathIgnoringLengthCalculator(const Graph &graph, const vector<EdgeId> &edge_path) :
+ LengthCalculator<Graph, distance_t>(graph), path_(edge_path.begin(), edge_path.end())
+ { }
+
+ distance_t GetLength(EdgeId edge) const {
+ if (path_.find(edge) != path_.end()) {
+ return 0;
+ }
+ return base::GetLength(edge);
+ }
+};
+
+
+}
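
The calculators above change what "distance" means for the same search: ComponentLenCalculator treats edges inside a chosen component as free, BoundedEdgeLenCalculator turns lengths into a 0/1 indicator, and PathIgnoringLengthCalculator zeroes out a given path. A tiny standalone sketch of the component variant (EdgeId and the length map are stand-ins for Graph::EdgeId and graph_.length()):

    // Sketch of the ComponentLenCalculator idea: component edges contribute zero
    // length, so a Dijkstra run measures only the distance travelled outside it.
    #include <cstddef>
    #include <map>
    #include <set>

    using EdgeId = int;                               // stand-in for Graph::EdgeId

    struct ComponentLengthCalc {
        std::map<EdgeId, std::size_t> length;         // stand-in for graph_.length(edge)
        std::set<EdgeId> component;

        std::size_t GetLength(EdgeId e) const {
            return component.count(e) ? 0 : length.at(e);
        }
    };
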
diff --git a/src/modules/algorithms/dijkstra/neighbours_iterator.hpp b/src/modules/algorithms/dijkstra/neighbours_iterator.hpp
new file mode 100644
index 0000000..b7587d0
--- /dev/null
+++ b/src/modules/algorithms/dijkstra/neighbours_iterator.hpp
@@ -0,0 +1,164 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+namespace omnigraph {
+
+template<class Graph>
+struct vertex_neighbour {
+protected:
+ typedef typename Graph::VertexId VertexId;
+ typedef typename Graph::EdgeId EdgeId;
+public:
+ VertexId vertex;
+ EdgeId edge;
+
+ vertex_neighbour(VertexId new_vertex, EdgeId new_edge) :
+ vertex(new_vertex),
+ edge(new_edge) { }
+};
+
+template<class Graph>
+class NeighbourIterator {
+ typedef typename Graph::VertexId VertexId;
+ typedef typename Graph::EdgeId EdgeId;
+protected:
+ const Graph &graph_;
+ VertexId vertex_;
+public:
+ NeighbourIterator(const Graph &graph, VertexId vertex) :
+ graph_(graph),
+ vertex_(vertex) { }
+
+ virtual bool HasNext() = 0;
+ virtual vertex_neighbour<Graph> Next() = 0;
+ virtual ~NeighbourIterator() { }
+};
+
+template<class Graph>
+class ForwardNeighbourIterator : public NeighbourIterator<Graph>{
+ typedef typename Graph::VertexId VertexId;
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename VertexId::type::edge_const_iterator edge_const_iterator;
+
+ pair<edge_const_iterator, edge_const_iterator> out_edges_;
+public:
+ ForwardNeighbourIterator(const Graph &graph, VertexId vertex) :
+ NeighbourIterator<Graph>(graph, vertex),
+ out_edges_(make_pair(graph.OutgoingEdges(vertex).begin(),
+ graph.OutgoingEdges(vertex).end())) { }
+
+ bool HasNext(){
+ return out_edges_.first != out_edges_.second;
+ }
+
+ vertex_neighbour<Graph> Next() {
+ vertex_neighbour<Graph> res(this->graph_.EdgeEnd(*out_edges_.first), *out_edges_.first);
+ out_edges_.first++;
+ return res;
+ }
+};
+
+template<class Graph>
+class BackwardNeighbourIterator : public NeighbourIterator<Graph>{
+ typedef typename Graph::VertexId VertexId;
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename VertexId::type::edge_const_iterator edge_const_iterator;
+
+ pair<edge_const_iterator, edge_const_iterator> in_edges_;
+public:
+ BackwardNeighbourIterator(const Graph &graph, VertexId vertex) :
+ NeighbourIterator<Graph>(graph, vertex),
+ in_edges_(make_pair(graph.IncomingEdges(vertex).begin(),
+ graph.IncomingEdges(vertex).end())) { }
+
+ bool HasNext(){
+ return in_edges_.first != in_edges_.second;
+ }
+
+ vertex_neighbour<Graph> Next() {
+ vertex_neighbour<Graph> res(this->graph_.EdgeStart(*in_edges_.first), *in_edges_.first);
+ in_edges_.first++;
+ return res;
+ }
+};
+
+template<class Graph>
+class UnorientedNeighbourIterator : public NeighbourIterator<Graph>{
+ typedef typename Graph::VertexId VertexId;
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename VertexId::type::edge_const_iterator edge_const_iterator;
+
+ pair<edge_const_iterator, edge_const_iterator> in_edges_;
+ pair<edge_const_iterator, edge_const_iterator> out_edges_;
+public:
+ UnorientedNeighbourIterator(const Graph &graph, VertexId vertex) :
+ NeighbourIterator<Graph>(graph, vertex),
+ in_edges_(make_pair(graph.IncomingEdges(vertex).begin(),
+ graph.IncomingEdges(vertex).end())),
+ out_edges_(make_pair(graph.OutgoingEdges(vertex).begin(),
+ graph.OutgoingEdges(vertex).end())) { }
+
+ bool HasNext(){
+ return in_edges_.first != in_edges_.second;
+ }
+
+ // first all outgoing edges are visited
+ // then all incoming
+ vertex_neighbour<Graph> Next() {
+ if(out_edges_.first != out_edges_.second){
+ vertex_neighbour<Graph> res(this->graph_.EdgeEnd(*out_edges_.first), *out_edges_.first);
+ out_edges_.first++;
+ return res;
+ }
+ vertex_neighbour<Graph> res(this->graph_.EdgeStart(*in_edges_.first), *in_edges_.first);
+ in_edges_.first++;
+ return res;
+ }
+};
+
+template<class Graph>
+class ForwardNeighbourIteratorFactory {
+ typedef typename Graph::VertexId VertexId;
+ typedef typename Graph::EdgeId EdgeId;
+ const Graph &graph_;
+public:
+ typedef ForwardNeighbourIterator<Graph> NeighbourIterator;
+ ForwardNeighbourIteratorFactory(const Graph &graph) : graph_(graph) { }
+ NeighbourIterator CreateIterator(VertexId vertex){
+ return NeighbourIterator(graph_, vertex);
+ }
+};
+
+template<class Graph>
+class BackwardNeighbourIteratorFactory {
+ typedef typename Graph::VertexId VertexId;
+ typedef typename Graph::EdgeId EdgeId;
+ const Graph &graph_;
+public:
+ typedef BackwardNeighbourIterator<Graph> NeighbourIterator;
+ BackwardNeighbourIteratorFactory(const Graph &graph) : graph_(graph) { }
+ NeighbourIterator CreateIterator(VertexId vertex){
+ return NeighbourIterator(graph_, vertex);
+ }
+};
+
+template<class Graph>
+class UnorientedNeighbourIteratorFactory {
+ typedef typename Graph::VertexId VertexId;
+ typedef typename Graph::EdgeId EdgeId;
+ const Graph &graph_;
+public:
+ typedef UnorientedNeighbourIterator<Graph> NeighbourIterator;
+ UnorientedNeighbourIteratorFactory(const Graph &graph) : graph_(graph) { }
+ NeighbourIterator CreateIterator(VertexId vertex){
+ return NeighbourIterator(graph_, vertex);
+ }
+};
+
+}
diff --git a/src/modules/algorithms/dijkstra/vertex_process_checker.hpp b/src/modules/algorithms/dijkstra/vertex_process_checker.hpp
new file mode 100644
index 0000000..6a2f6be
--- /dev/null
+++ b/src/modules/algorithms/dijkstra/vertex_process_checker.hpp
@@ -0,0 +1,72 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+namespace omnigraph {
+
+template<class Graph, typename distance_t = size_t>
+class VertexProcessChecker {
+ typedef typename Graph::VertexId VertexId;
+ typedef typename Graph::EdgeId EdgeId;
+public:
+ VertexProcessChecker() {}
+ virtual bool Check(VertexId, distance_t) { return true; }
+ virtual ~VertexProcessChecker() {}
+};
+
+template<class Graph, typename distance_t = size_t>
+class BoundProcessChecker : public VertexProcessChecker<Graph, distance_t> {
+ typedef typename Graph::VertexId VertexId;
+ typedef typename Graph::EdgeId EdgeId;
+ const distance_t distance_bound_;
+public:
+ BoundProcessChecker(distance_t distance_bound) :
+ distance_bound_(distance_bound) {}
+
+ bool Check(VertexId, distance_t distance) override {
+ return distance <= distance_bound_;
+ }
+};
+
+template<class Graph, typename distance_t = size_t>
+class ZeroLengthProcessChecker : public VertexProcessChecker<Graph, distance_t> {
+ typedef typename Graph::VertexId VertexId;
+ typedef typename Graph::EdgeId EdgeId;
+public:
+ ZeroLengthProcessChecker() {}
+
+ bool Check(VertexId, distance_t distance) override {
+ return distance == 0;
+ }
+};
+
+template<class Graph, typename distance_t = size_t>
+class BoundedVertexTargetedProcessChecker : public BoundProcessChecker<Graph, distance_t> {
+ typedef BoundProcessChecker<Graph, distance_t> base;
+ typedef typename Graph::VertexId VertexId;
+ typedef typename Graph::EdgeId EdgeId;
+
+ VertexId target_vertex_;
+ bool target_reached_;
+public:
+ BoundedVertexTargetedProcessChecker(VertexId target_vertex, size_t bound) :
+ base(bound),
+ target_vertex_(target_vertex),
+ target_reached_(false) { }
+
+ bool Check(VertexId vertex, distance_t distance) override {
+ if (vertex == target_vertex_)
+ target_reached_ = true;
+ if (target_reached_)
+ return false;
+ else
+ return base::Check(vertex, distance);
+ }
+};
+
+}
diff --git a/src/modules/algorithms/dijkstra/vertex_put_checker.hpp b/src/modules/algorithms/dijkstra/vertex_put_checker.hpp
new file mode 100644
index 0000000..69f1bec
--- /dev/null
+++ b/src/modules/algorithms/dijkstra/vertex_put_checker.hpp
@@ -0,0 +1,63 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+namespace omnigraph {
+
+template<class Graph, typename distance_t = size_t>
+class VertexPutChecker {
+ typedef typename Graph::VertexId VertexId;
+ typedef typename Graph::EdgeId EdgeId;
+public:
+ VertexPutChecker() { }
+ virtual bool Check(VertexId, EdgeId, distance_t) const{ return true; }
+ virtual ~VertexPutChecker() { }
+};
+
+template<class Graph, typename distance_t = size_t>
+class EdgeComponentPutChecker : public VertexPutChecker<Graph, distance_t> {
+ typedef typename Graph::VertexId VertexId;
+ typedef typename Graph::EdgeId EdgeId;
+
+ set<EdgeId> &edges_;
+public:
+ EdgeComponentPutChecker(set<EdgeId> &edges) : VertexPutChecker<Graph, distance_t>(), edges_(edges) { }
+ bool Check(VertexId, EdgeId edge, distance_t) const{
+ return edges_.count(edge) != 0;
+ }
+};
+
+template<class Graph, typename distance_t = size_t>
+class SubgraphPutChecker : public VertexPutChecker<Graph, distance_t> {
+ typedef typename Graph::VertexId VertexId;
+ typedef typename Graph::EdgeId EdgeId;
+
+ const set<VertexId> &subgraph_;
+public:
+ SubgraphPutChecker(const set<VertexId>& subgraph) : VertexPutChecker<Graph, distance_t>(),
+ subgraph_(subgraph) { }
+ bool Check(VertexId vertex, EdgeId, distance_t) const{
+ return subgraph_.count(vertex) != 0;
+ }
+};
+
+template<class Graph, typename distance_t = size_t>
+class BoundPutChecker : public VertexPutChecker<Graph, distance_t> {
+ typedef typename Graph::VertexId VertexId;
+ typedef typename Graph::EdgeId EdgeId;
+
+ const distance_t bound_;
+public:
+ BoundPutChecker(distance_t bound) : VertexPutChecker<Graph, distance_t>(),
+ bound_(bound) { }
+ bool Check(VertexId, EdgeId, distance_t length) const{
+ return length <= bound_;
+ }
+};
+
+}
diff --git a/src/modules/algorithms/genome_consistance_checker.cpp b/src/modules/algorithms/genome_consistance_checker.cpp
new file mode 100644
index 0000000..c980d7c
--- /dev/null
+++ b/src/modules/algorithms/genome_consistance_checker.cpp
@@ -0,0 +1,238 @@
+#include "algorithms/genome_consistance_checker.hpp"
+#include "assembly_graph/graph_core/graph.hpp"
+#include <algorithm>
+#include <limits>
+namespace debruijn_graph {
+using omnigraph::MappingRange;
+using namespace std;
+
+//gap or overlap size (absolute value, no sign)
+static size_t gap(const Range &a, const Range &b) {
+ return max(a.end_pos, b.start_pos) - min (a.end_pos, b.start_pos);
+}
+bool GenomeConsistenceChecker::consequent(const Range &mr1, const Range &mr2) const{
+ if (mr1.end_pos > mr2.start_pos + absolute_max_gap_)
+ return false;
+ if (mr1.end_pos + absolute_max_gap_ < mr2.start_pos)
+ return false;
+ return true;
+
+}
+bool GenomeConsistenceChecker::consequent(const MappingRange &mr1, const MappingRange &mr2) const {
+ //do not want to think about handling gaps near 0 position.
+ if (!consequent(mr1.initial_range, mr2.initial_range) || !consequent(mr1.mapped_range, mr2.mapped_range))
+ return false;
+ size_t initial_gap = gap(mr1.initial_range, mr2.initial_range);
+ size_t mapped_gap = gap(mr1.mapped_range, mr2.mapped_range);
+ size_t max_gap = max(initial_gap, mapped_gap);
+ if ( max_gap > relative_max_gap_* double (max (min(mr1.initial_range.size(), mr1.mapped_range.size()), min(mr2.initial_range.size(), mr2.mapped_range.size()))))
+ return false;
+ return true;
+}
+
+PathScore GenomeConsistenceChecker::CountMisassemblies(const BidirectionalPath &path) const {
+ PathScore straight = CountMisassembliesWithStrand(path, "0");
+ PathScore reverse = CountMisassembliesWithStrand(path, "1");
+ size_t total_length = path.LengthAt(0);
+//TODO: constant;
+ if (total_length > std::max(straight.mapped_length, reverse.mapped_length) * 2) {
+ if (total_length > 10000) {
+            INFO("For path of length " << total_length << ", less than half was mapped; skipping");
+ }
+ return PathScore(0,0,0);
+ } else {
+ if (straight.mapped_length > reverse.mapped_length) {
+ return straight;
+ } else {
+ return reverse;
+ }
+ }
+}
+
+void GenomeConsistenceChecker::SpellGenome() {
+ vector<pair<EdgeId, MappingRange> > to_sort;
+ for(auto e: storage_) {
+ if (excluded_unique_.find(e) == excluded_unique_.end() ) {
+ set<MappingRange> mappings = gp_.edge_pos.GetEdgePositions(e, "fxd0");
+ if (mappings.size() > 1) {
+                INFO("edge " << e << " has multiple mappings, something strange");
+ } else if (mappings.size() == 0) {
+ continue;
+ } else {
+ to_sort.push_back(make_pair(e, *mappings.begin()));
+ }
+ }
+ }
+ sort(to_sort.begin(), to_sort.end(), [](const pair<EdgeId, MappingRange> & a, const pair<EdgeId, MappingRange> & b) -> bool
+ {
+ return a.second.initial_range.start_pos < b.second.initial_range.start_pos;
+ }
+ );
+ size_t count = 0;
+ for(auto p: to_sort) {
+ INFO("edge " << gp_.g.int_id(p.first) << " length "<< gp_.g.length(p.first) << " coverage " << gp_.g.coverage(p.first) << " mapped to " << p.second.mapped_range.start_pos << " - " << p.second.mapped_range.end_pos << " init_range " << p.second.initial_range.start_pos << " - " << p.second.initial_range.end_pos );
+ genome_spelled_[p.first] = count;
+ count++;
+ }
+}
+
+PathScore GenomeConsistenceChecker::CountMisassembliesWithStrand(const BidirectionalPath &path, const string strand) const {
+ if (strand == "1") {
+ return (CountMisassembliesWithStrand(*path.GetConjPath(), "0"));
+ }
+ PathScore res(0, 0, 0);
+ EdgeId prev;
+ size_t prev_in_genome = std::numeric_limits<std::size_t>::max();
+ size_t prev_in_path = std::numeric_limits<std::size_t>::max();
+ MappingRange prev_range;
+ for (int i = 0; i < (int) path.Size(); i++) {
+ if (genome_spelled_.find(path.At(i)) != genome_spelled_.end()) {
+ size_t cur_in_genome = genome_spelled_[path.At(i)];
+ MappingRange cur_range = *gp_.edge_pos.GetEdgePositions(path.At(i), "fxd0").begin();
+ if (prev_in_genome != std::numeric_limits<std::size_t>::max()) {
+ if (cur_in_genome == prev_in_genome + 1) {
+ int dist_in_genome = (int) cur_range.initial_range.start_pos - (int) prev_range.initial_range.end_pos;
+ int dist_in_path = (int) path.LengthAt(prev_in_path) - (int) path.LengthAt(i) + (int) cur_range.mapped_range.start_pos - (int) prev_range.mapped_range.end_pos;
+ DEBUG("Edge " << prev.int_id() << " position in genome ordering: " << prev_in_genome);
+ DEBUG("Gap in genome / gap in path: " << dist_in_genome << " / " << dist_in_path);
+ if (abs(dist_in_genome - dist_in_path) > absolute_max_gap_ && (dist_in_genome * (1 + relative_max_gap_) < dist_in_path || dist_in_path * (1 + relative_max_gap_) < dist_in_genome)) {
+
+ res.wrong_gap_size ++;
+ }
+ } else {
+ if (path.At(i) != circular_edge_ && path.At(prev_in_path) != circular_edge_)
+ res.misassemblies++;
+ else
+ INFO("Skipping fake(circular) misassembly");
+ }
+ }
+ res.mapped_length += cur_range.mapped_range.size();
+ prev = path.At(i);
+ prev_in_genome = cur_in_genome;
+ prev_range = cur_range;
+ prev_in_path = i;
+ }
+ }
+ if (prev_in_path != std::numeric_limits<std::size_t>::max())
+ DEBUG("Edge " << prev.int_id() << " position in genome ordering: " << prev_in_genome);
+ return res;
+}
+void GenomeConsistenceChecker::RefillPos() {
+ RefillPos("0");
+ RefillPos("1");
+}
+
+
+void GenomeConsistenceChecker::RefillPos(const string &strand) {
+ for (auto e: storage_) {
+ RefillPos(strand, e);
+ }
+}
+
+void GenomeConsistenceChecker::FindBestRangeSequence(const set<MappingRange>& old_mappings, vector<MappingRange>& used_mappings) const {
+ vector<MappingRange> to_process (old_mappings.begin(), old_mappings.end());
+ sort(to_process.begin(), to_process.end(), [](const MappingRange & a, const MappingRange & b) -> bool
+ {
+ return a.mapped_range.start_pos < b.mapped_range.start_pos;
+ } );
+ size_t sz = to_process.size();
+//maximum-weight path in the directed graph of mappings
+    TRACE("constructing mapping graph with " << sz << " vertices");
+ vector<vector<size_t>> consecutive_mappings(sz);
+ for(size_t i = 0; i < sz; i++) {
+ for (size_t j = i + 1; j < sz; j++) {
+ if (consequent(to_process[i], to_process[j])) {
+ consecutive_mappings[i].push_back(j);
+ } else {
+ if (to_process[j].mapped_range.start_pos > to_process[i].mapped_range.end_pos + absolute_max_gap_) {
+ break;
+ }
+ }
+ }
+ }
+ vector<size_t> scores(sz), prev(sz);
+ for(size_t i = 0; i < sz; i++) {
+ scores[i] = to_process[i].initial_range.size();
+ prev[i] = std::numeric_limits<std::size_t>::max();
+ }
+ for(size_t i = 0; i < sz; i++) {
+ for (size_t j = 0; j < consecutive_mappings[i].size(); j++) {
+ TRACE(consecutive_mappings[i][j]);
+ if (scores[consecutive_mappings[i][j]] < scores[i] + to_process[consecutive_mappings[i][j]].initial_range.size()) {
+ scores[consecutive_mappings[i][j]] = scores[i] + to_process[consecutive_mappings[i][j]].initial_range.size();
+ prev[consecutive_mappings[i][j]] = i;
+ }
+ }
+ }
+ size_t cur_max = 0;
+ size_t cur_i = 0;
+ for(size_t i = 0; i < sz; i++) {
+ if (scores[i] > cur_max) {
+ cur_max = scores[i];
+ cur_i = i;
+ }
+ }
+ used_mappings.clear();
+ while (cur_i != std::numeric_limits<std::size_t>::max()) {
+ used_mappings.push_back(to_process[cur_i]);
+ cur_i = prev[cur_i];
+ }
+ reverse(used_mappings.begin(), used_mappings.end());
+}
+
+void GenomeConsistenceChecker::RefillPos(const string &strand, const EdgeId &e) {
+ set<MappingRange> old_mappings = gp_.edge_pos.GetEdgePositions(e, strand);
+ TRACE("old mappings sz " << old_mappings.size() );
+ size_t total_mapped = 0;
+ for (auto mp:old_mappings) {
+ total_mapped += mp.initial_range.size();
+ }
+ if (total_mapped > (double) gp_.g.length(e) * 1.5) {
+        INFO("Edge " << gp_.g.int_id(e) << " is not unique, excluding");
+ excluded_unique_.insert(e);
+ return;
+ }
+//TODO: support non-unique edges;
+ if (total_mapped < (double) gp_.g.length(e) * 0.5) {
+        DEBUG("Edge " << gp_.g.int_id(e) << " is not mapped on strand " << strand << ", not used");
+ return;
+ }
+ TRACE(total_mapped << " " << gp_.g.length(e));
+ string new_strand = "fxd" + strand;
+ vector<MappingRange> used_mappings;
+ FindBestRangeSequence(old_mappings, used_mappings);
+
+ size_t cur_i = 0;
+ MappingRange new_mapping;
+ new_mapping = used_mappings[cur_i];
+ size_t used_mapped = new_mapping.initial_range.size();
+ TRACE ("Edge " << gp_.g.int_id(e) << " length "<< gp_.g.length(e));
+ TRACE ("new_mapping mp_range "<< new_mapping.mapped_range.start_pos << " - " << new_mapping.mapped_range.end_pos
+ << " init_range " << new_mapping.initial_range.start_pos << " - " << new_mapping.initial_range.end_pos );
+ while (cur_i < used_mappings.size() - 1) {
+ cur_i ++;
+ used_mapped += used_mappings[cur_i].initial_range.size();
+ new_mapping = new_mapping.Merge(used_mappings[cur_i]);
+ TRACE("new_mapping mp_range "<< new_mapping.mapped_range.start_pos << " - " << new_mapping.mapped_range.end_pos
+ << " init_range " << new_mapping.initial_range.start_pos << " - " << new_mapping.initial_range.end_pos );
+ }
+//used less than 0.9 of the aligned length
+ if (total_mapped * 10 >= used_mapped * 10 + gp_.g.length(e)) {
+        INFO("Edge " << gp_.g.int_id(e) << " length " << gp_.g.length(e) << " is potentially misassembled! mappings: ");
+ for (auto mp:old_mappings) {
+ INFO("mp_range "<< mp.mapped_range.start_pos << " - " << mp.mapped_range.end_pos << " init_range " << mp.initial_range.start_pos << " - " << mp.initial_range.end_pos );
+ if (mp.initial_range.start_pos < absolute_max_gap_) {
+ INFO ("Fake(linear order) misassembly on edge "<< e.int_id());
+ if (strand == "0") {
+ circular_edge_ = e;
+ }
+ }
+ }
+
+ }
+ gp_.edge_pos.AddEdgePosition(e, new_strand, new_mapping);
+}
+
+
+
+}
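
FindBestRangeSequence() above is a maximum-weight chain problem: mappings are sorted by mapped position, compatible pairs are linked, a forward dynamic program accumulates the best initial-range coverage, and the winning chain is recovered by backtracking through prev[]. A standalone sketch of that DP on plain intervals (Interval stands in for MappingRange; compatibility is simplified to non-overlap):

    // Sketch of the maximum-weight chain DP used by FindBestRangeSequence().
    #include <algorithm>
    #include <cstddef>
    #include <limits>
    #include <vector>

    struct Interval {
        std::size_t start, end, weight;   // weight plays the role of initial_range.size()
    };

    std::vector<Interval> BestChain(std::vector<Interval> v) {
        if (v.empty())
            return {};
        const std::size_t kNone = std::numeric_limits<std::size_t>::max();
        std::sort(v.begin(), v.end(),
                  [](const Interval& a, const Interval& b) { return a.start < b.start; });
        std::vector<std::size_t> score(v.size()), prev(v.size(), kNone);
        for (std::size_t i = 0; i < v.size(); ++i)
            score[i] = v[i].weight;
        for (std::size_t i = 0; i < v.size(); ++i)           // forward DP over compatible pairs
            for (std::size_t j = i + 1; j < v.size(); ++j)
                if (v[j].start >= v[i].end && score[i] + v[j].weight > score[j]) {
                    score[j] = score[i] + v[j].weight;
                    prev[j] = i;
                }
        std::size_t best = 0;                                // best chain end point
        for (std::size_t i = 1; i < v.size(); ++i)
            if (score[i] > score[best])
                best = i;
        std::vector<Interval> chain;                         // backtrack, then reverse
        for (std::size_t i = best; i != kNone; i = prev[i])
            chain.push_back(v[i]);
        std::reverse(chain.begin(), chain.end());
        return chain;
    }
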
diff --git a/src/modules/algorithms/genome_consistance_checker.hpp b/src/modules/algorithms/genome_consistance_checker.hpp
new file mode 100644
index 0000000..7c106f3
--- /dev/null
+++ b/src/modules/algorithms/genome_consistance_checker.hpp
@@ -0,0 +1,77 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+
+#pragma once
+#include "visualization/graph_labeler.hpp"
+#include "assembly_graph/handlers/edges_position_handler.hpp"
+#include "assembly_graph/paths/mapping_path.hpp"
+#include "data_structures/sequence/sequence.hpp"
+#include "pipeline/graph_pack.hpp"
+#include "visualization/position_filler.hpp"
+#include "assembly_graph/paths/bidirectional_path.hpp"
+#include "assembly_graph/graph_support/scaff_supplementary.hpp"
+
+namespace debruijn_graph {
+
+
+using path_extend::BidirectionalPath;
+using path_extend::ScaffoldingUniqueEdgeStorage;
+
+struct PathScore{
+ size_t misassemblies;
+ size_t wrong_gap_size;
+ size_t mapped_length;
+ PathScore(size_t m, size_t w, size_t ml): misassemblies(m), wrong_gap_size(w), mapped_length(ml) {}
+};
+class GenomeConsistenceChecker {
+
+private:
+ const conj_graph_pack &gp_;
+ const Graph &graph_;
+ //EdgesPositionHandler<Graph> &position_handler_;
+ Sequence genome_;
+ ScaffoldingUniqueEdgeStorage storage_;
+ size_t absolute_max_gap_;
+ double relative_max_gap_;
+ set<EdgeId> excluded_unique_;
+ EdgeId circular_edge_;
+//map from unique edges to their order in genome spelling;
+ mutable map<EdgeId, size_t> genome_spelled_;
+ bool consequent(const Range &mr1, const Range &mr2) const;
+ bool consequent(const MappingRange &mr1, const MappingRange &mr2) const ;
+
+ PathScore CountMisassembliesWithStrand(const BidirectionalPath &path, const string strand) const;
+//constructs the longest sequence of consecutive ranges, stores result in used_mappings
+ void FindBestRangeSequence(const set<MappingRange>& old_mappings, vector<MappingRange>& used_mappings) const;
+//Refills genomic positions, uniting alignments separated by small gaps
+ void RefillPos();
+ void RefillPos(const string &strand);
+ void RefillPos(const string &strand, const EdgeId &e);
+DECL_LOGGER("GenomeConsistenceChecker");
+
+
+public:
+ GenomeConsistenceChecker(const conj_graph_pack &gp, ScaffoldingUniqueEdgeStorage &storage, size_t max_gap, double relative_max_gap /*= 0.2*/) : gp_(gp),
+ graph_(gp.g), /*position_handler_(gp.edge_pos),*/ genome_(gp.genome.GetSequence()), storage_(storage),
+ absolute_max_gap_(max_gap), relative_max_gap_(relative_max_gap), excluded_unique_(), circular_edge_() {
+ if (!gp.edge_pos.IsAttached()) {
+ gp.edge_pos.Attach();
+ }
+ gp.edge_pos.clear();
+ FillPos(gp_, gp_.genome.GetSequence(), "0");
+ FillPos(gp_, !gp_.genome.GetSequence(), "1");
+ RefillPos();
+ }
+ PathScore CountMisassemblies(const BidirectionalPath &path) const;
+//spells the genome in the language of long unique edges from the storage;
+ void SpellGenome();
+
+};
+
+
+}
diff --git a/src/modules/algorithms/graph_construction.hpp b/src/modules/algorithms/graph_construction.hpp
new file mode 100644
index 0000000..ce32a7e
--- /dev/null
+++ b/src/modules/algorithms/graph_construction.hpp
@@ -0,0 +1,179 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+/*
+ * graph_construction.hpp
+ *
+ * Created on: Aug 12, 2011
+ * Author: sergey
+ */
+#pragma once
+
+#include "pipeline/graph_pack.hpp"
+
+#include "io/reads_io/io_helper.hpp"
+#include "assembly_graph/graph_core/graph.hpp"
+
+#include "data_structures/debruijn_graph/debruijn_graph_constructor.hpp"
+#include "data_structures/debruijn_graph/early_simplification.hpp"
+
+#include "dev_support/perfcounter.hpp"
+#include "io/dataset_support/read_converter.hpp"
+
+#include "assembly_graph/handlers/edges_position_handler.hpp"
+#include "assembly_graph/graph_support/detail_coverage.hpp"
+#include "data_structures/indices/storing_traits.hpp"
+#include "data_structures/indices/edge_index_builders.hpp"
+#include "dev_support/openmp_wrapper.h"
+
+namespace debruijn_graph {
+
+template<class StoringType>
+struct CoverageCollector {
+};
+
+template<>
+struct CoverageCollector<SimpleStoring> {
+ template<class Info>
+ static void CollectCoverage(Info edge_info) {
+ edge_info.edge_id->IncCoverage(edge_info.count);
+ }
+};
+
+template<>
+struct CoverageCollector<InvertableStoring> {
+ template<class Info>
+ static void CollectCoverage(Info edge_info) {
+ edge_info.edge_id->IncCoverage(edge_info.count);
+ edge_info.edge_id->conjugate()->IncCoverage(edge_info.count);
+ }
+};
+
+
+template<class Index>
+void FillCoverageFromIndex(const Index &index) {
+ for (auto I = index.value_cbegin(), E = index.value_cend();
+ I != E; ++I) {
+ const auto& edge_info = *I;
+ VERIFY(edge_info.offset != -1u);
+// VERIFY(edge_info.edge_id.get() != NULL);
+ if(edge_info.offset != -1u) {
+ CoverageCollector<typename Index::storing_type>::CollectCoverage(edge_info);
+ }
+ }
+ DEBUG("Coverage counted");
+}
+
+template<class Graph, class Readers, class Index>
+size_t ConstructGraphUsingOldIndex(Readers& streams, Graph& g,
+ Index& index, io::SingleStreamPtr contigs_stream = io::SingleStreamPtr()) {
+ INFO("Constructing DeBruijn graph");
+
+ TRACE("Filling indices");
+ size_t rl = 0;
+ VERIFY_MSG(streams.size(), "No input streams specified");
+
+ TRACE("... in parallel");
+ typedef typename Index::InnerIndexT InnerIndex;
+ typedef typename EdgeIndexHelper<InnerIndex>::CoverageFillingEdgeIndexBuilderT IndexBuilder;
+ InnerIndex& debruijn = index.inner_index();
+ //fixme hack
+ rl = IndexBuilder().BuildIndexFromStream(debruijn, streams, (contigs_stream == 0) ? 0 : &(*contigs_stream));
+
+    VERIFY(g.k() + 1 == debruijn.k());
+ // FIXME: output_dir here is damn ugly!
+
+ TRACE("Filled indices");
+
+ INFO("Condensing graph");
+ DeBruijnGraphConstructor<Graph, InnerIndex> g_c(g, debruijn);
+ TRACE("Constructor ok");
+ VERIFY(!index.IsAttached());
+ index.Attach();
+ g_c.ConstructGraph(100, 10000, 1.2); // TODO: move magic constants to config
+ INFO("Graph condensed");
+
+ return rl;
+}
+
+template<class ExtensionIndex>
+void EarlyClipTips(size_t k, const config::debruijn_config::construction& params, size_t rl, ExtensionIndex& ext) {
+ if (params.early_tc.enable) {
+ size_t length_bound = rl - k;
+ if (params.early_tc.length_bound)
+ length_bound = params.early_tc.length_bound.get();
+ AlternativeEarlyTipClipper(ext, length_bound).ClipTips();
+ }
+}
+
+template<class Graph, class Read, class Index>
+ReadStatistics ConstructGraphUsingExtentionIndex(const config::debruijn_config::construction params,
+ io::ReadStreamList<Read>& streams, Graph& g,
+ Index& index, io::SingleStreamPtr contigs_stream = io::SingleStreamPtr()) {
+
+ size_t k = g.k();
+ INFO("Constructing DeBruijn graph for k=" << k);
+
+ TRACE("Filling indices");
+ VERIFY_MSG(streams.size(), "No input streams specified");
+
+ TRACE("... in parallel");
+ // FIXME: output_dir here is damn ugly!
+ typedef DeBruijnExtensionIndex<> ExtensionIndex;
+ typedef typename ExtensionIndexHelper<ExtensionIndex>::DeBruijnExtensionIndexBuilderT ExtensionIndexBuilder;
+ ExtensionIndex ext((unsigned) k, index.inner_index().workdir());
+
+ //fixme hack
+ ReadStatistics stats = ExtensionIndexBuilder().BuildExtensionIndexFromStream(ext, streams, (contigs_stream == 0) ? 0 : &(*contigs_stream), params.read_buffer_size);
+
+ EarlyClipTips(k, params, stats.max_read_length_, ext);
+
+ INFO("Condensing graph");
+ VERIFY(!index.IsAttached());
+ DeBruijnGraphExtentionConstructor<Graph> g_c(g, ext);
+ g_c.ConstructGraph(100, 10000, 1.2, params.keep_perfect_loops);//TODO move these parameters to config
+
+    INFO("Building index from graph")
+ //todo pass buffer size
+ index.Refill();
+ index.Attach();
+
+ return stats;
+}
+
+template<class Graph, class Index, class Streams>
+ReadStatistics ConstructGraph(const config::debruijn_config::construction &params,
+ Streams& streams, Graph& g,
+ Index& index, io::SingleStreamPtr contigs_stream = io::SingleStreamPtr()) {
+ if (params.con_mode == config::construction_mode::extention) {
+ return ConstructGraphUsingExtentionIndex(params, streams, g, index, contigs_stream);
+// } else if(params.con_mode == construction_mode::con_old){
+// return ConstructGraphUsingOldIndex(k, streams, g, index, contigs_stream);
+ } else {
+ INFO("Invalid construction mode")
+ VERIFY(false);
+ return {0,0,0};
+ }
+}
+
+template<class Graph, class Index, class Streams>
+ReadStatistics ConstructGraphWithCoverage(const config::debruijn_config::construction &params,
+ Streams& streams, Graph& g,
+ Index& index, FlankingCoverage<Graph>& flanking_cov,
+ io::SingleStreamPtr contigs_stream = io::SingleStreamPtr()) {
+ ReadStatistics rs = ConstructGraph(params, streams, g, index, contigs_stream);
+
+ typedef typename Index::InnerIndexT InnerIndex;
+ typedef typename EdgeIndexHelper<InnerIndex>::CoverageAndGraphPositionFillingIndexBuilderT IndexBuilder;
+ INFO("Filling coverage index")
+ IndexBuilder().ParallelFillCoverage(index.inner_index(), streams);
+ INFO("Filling coverage and flanking coverage from index");
+ FillCoverageAndFlanking(index.inner_index(), g, flanking_cov);
+ return rs;
+}
+
+}
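
EarlyClipTips() above picks the tip-length bound as read length minus k unless the construction config supplies an explicit value. A minimal sketch of that decision (std::optional is used here in place of the boost optional carried by the upstream config struct):

    // Sketch of the early tip-clipping bound selection in EarlyClipTips().
    #include <cstddef>
    #include <optional>

    std::size_t EarlyTipLengthBound(std::size_t read_length, std::size_t k,
                                    std::optional<std::size_t> configured_bound) {
        if (configured_bound)
            return *configured_bound;   // explicit value from the config wins
        return read_length - k;         // default: longest tip a single read can induce
    }
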
diff --git a/src/modules/algorithms/graph_read_correction.hpp b/src/modules/algorithms/graph_read_correction.hpp
new file mode 100644
index 0000000..311891d
--- /dev/null
+++ b/src/modules/algorithms/graph_read_correction.hpp
@@ -0,0 +1,187 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "assembly_graph/paths/path_utils.hpp"
+#include "assembly_graph/paths/mapping_path.hpp"
+#include "assembly_graph/paths/path_finders.hpp"
+#include "assembly_graph/paths/path_processor.hpp"
+#include "io/reads_io/modifying_reader_wrapper.hpp"
+#include "assembly_graph/graph_core/order_and_law.hpp"
+#include "assembly_graph/graph_alignment/sequence_mapper.hpp"
+
+namespace debruijn_graph {
+
+template<class gp_t>
+class TipsProjector {
+ typedef typename gp_t::graph_t Graph;
+ typedef typename Graph::EdgeId EdgeId;
+
+ gp_t& gp_;
+
+ const omnigraph::UniquePathFinder<Graph> unique_path_finder_;
+
+ optional<EdgeId> UniqueAlternativeEdge(EdgeId tip, bool outgoing_tip) {
+ vector<EdgeId> edges;
+ if (outgoing_tip) {
+ push_back_all(edges, gp_.g.OutgoingEdges(gp_.g.EdgeStart(tip)));
+ } else {
+ push_back_all(edges, gp_.g.IncomingEdges(gp_.g.EdgeEnd(tip)));
+ }
+ restricted::set<EdgeId> edges_set(edges.begin(), edges.end());
+ edges_set.erase(tip);
+ if (edges_set.size() == 1)
+ return optional < EdgeId > (*edges_set.begin());
+ else
+ return boost::none;
+ }
+
+ vector<EdgeId> UniqueAlternativePath(EdgeId tip, bool outgoing_tip) {
+ optional<EdgeId> alt_edge = UniqueAlternativeEdge(tip, outgoing_tip);
+ if (alt_edge) {
+ if (outgoing_tip) {
+ return unique_path_finder_.UniquePathForward(*alt_edge);
+ } else {
+ return unique_path_finder_.UniquePathBackward(*alt_edge);
+ }
+ }
+ return vector<EdgeId>();
+ }
+
+ void AlignAndProject(const Sequence& tip_seq, const Sequence& alt_seq,
+ bool outgoing_tip) {
+ //todo refactor
+ Sequence aligned_tip = tip_seq;
+ Sequence aligned_alt = alt_seq;
+ if (outgoing_tip) {
+ if (tip_seq.size() >= alt_seq.size()) {
+ aligned_tip = tip_seq.Subseq(0, alt_seq.size());
+ } else {
+ aligned_alt = alt_seq.Subseq(0, tip_seq.size());
+ }
+ } else {
+ if (tip_seq.size() >= alt_seq.size()) {
+ aligned_tip = tip_seq.Subseq(tip_seq.size() - alt_seq.size());
+ } else {
+ aligned_alt = alt_seq.Subseq(alt_seq.size() - tip_seq.size());
+ }
+ }
+
+ INFO(
+ "Remapping " << aligned_tip.size()
+ << " kmers of aligned_tip to aligned_alt");
+ gp_.kmer_mapper.RemapKmers(aligned_tip, aligned_alt);
+ }
+
+public:
+ TipsProjector(gp_t& gp) :
+ gp_(gp), unique_path_finder_(gp.g) {
+
+ }
+
+ void ProjectTip(EdgeId tip) {
+ TRACE("Trying to project tip " << gp_.g.str(tip));
+ bool outgoing_tip = gp_.g.IsDeadEnd(gp_.g.EdgeEnd(tip));
+ Sequence tip_seq = gp_.g.EdgeNucls(tip);
+ vector<EdgeId> alt_path = UniqueAlternativePath(tip, outgoing_tip);
+ if (alt_path.empty()) {
+ TRACE(
+ "Failed to find unique alt path for tip " << gp_.g.str(tip)
+ << ". Wasn't projected!!!");
+ } else {
+ Sequence alt_seq = MergeSequences(gp_.g, alt_path);
+ if (tip_seq.size() > alt_seq.size()) {
+ TRACE(
+ "Can't fully project tip " << gp_.g.str(tip)
+ << " with seq length " << tip_seq.size()
+ << " because alt path length is "
+ << alt_seq.size()
+ << ". Trying to project partially");
+ }
+ AlignAndProject(tip_seq, alt_seq, outgoing_tip);
+ AlignAndProject(!tip_seq, !alt_seq, !outgoing_tip);
+ TRACE("Tip projected");
+ }
+ }
+private:
+ DECL_LOGGER("TipsProjector")
+ ;
+};
+
+//todo improve logging
+template<class Graph, class Mapper>
+class GraphReadCorrector: public io::SequenceModifier {
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+
+ const Graph& graph_;
+ const Mapper mapper_;
+ const MappingPathFixer<Graph> path_fixer_;
+
+public:
+ /*virtual*/
+ Sequence Modify(const Sequence& s) {
+// if(s < !s)
+// return !Refine(!s);
+ omnigraph::MappingPath<EdgeId> mapping_path = mapper_.MapSequence(s);
+
+ if (mapping_path.size() == 0 || s.size() < graph_.k() + 1
+ || mapping_path.front().second.initial_range.start_pos != 0
+ || mapping_path.back().second.initial_range.end_pos
+ != s.size() - graph_.k()) {
+ //todo reduce concat unmapped beginning and end in future???
+ TRACE(
+ "Won't fix because wasn't mapped or start/end fell on unprojected tip/erroneous connection");
+// TRACE(
+// "For sequence of length " << s.size()
+// << " returning empty sequence");
+ return s;
+// return Sequence();
+ }
+
+ Path<EdgeId> path = path_fixer_.TryFixPath(mapping_path.path());
+// TRACE("Mapped sequence to path " << graph_.str(path.sequence()));
+
+ if (!path_fixer_.CheckContiguous(path.sequence())) {
+ TRACE("Even fixed path wasn't contiguous");
+ return s;
+ } else {
+ TRACE("Fixed path is contiguous");
+ Sequence answer = PathSequence(graph_, path);
+// if (answer != s) {
+// if (answer.size() < 1000) {
+// TRACE(
+// "Initial sequence modified, edit distance= "
+// << EditDistance(answer, s));
+// } else {
+// TRACE("Sequence too large, won't count edit distance");
+// }
+// }
+ return answer;
+ }
+
+// else {
+// TRACE("Initial sequence unmodified!");
+// }
+ }
+
+ GraphReadCorrector(const Graph& graph, const Mapper& mapper) :
+ graph_(graph), mapper_(mapper), path_fixer_(graph) {
+ }
+
+private:
+ DECL_LOGGER("ContigRefiner");
+};
+
+template<class Graph, class Mapper>
+shared_ptr<GraphReadCorrector<Graph, Mapper>> GraphReadCorrectorInstance(
+ const Graph& graph, const Mapper& mapper) {
+ return std::make_shared<GraphReadCorrector<Graph, Mapper>>(graph, mapper);
+}
+
+}
diff --git a/src/modules/algorithms/mismatch_shall_not_pass.hpp b/src/modules/algorithms/mismatch_shall_not_pass.hpp
new file mode 100644
index 0000000..ed08660
--- /dev/null
+++ b/src/modules/algorithms/mismatch_shall_not_pass.hpp
@@ -0,0 +1,344 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "algorithms/simplification/compressor.hpp"
+#include "assembly_graph/handlers/id_track_handler.hpp"
+#include "dev_support/logger/logger.hpp"
+
+#include "data_structures/sequence/runtime_k.hpp"
+#include "assembly_graph/graph_alignment/sequence_mapper.hpp"
+
+#include "pipeline/config_struct.hpp"
+
+namespace debruijn_graph {
+
+namespace mismatches {
+struct NuclCount {
+ size_t counts_[4];
+
+ NuclCount() {
+ memset(counts_, 0, sizeof(counts_));
+ }
+
+ size_t &operator[](size_t nucl) {
+ return counts_[nucl];
+ }
+
+ NuclCount &operator+=(const NuclCount &other) {
+ counts_[0] += other.counts_[0];
+ counts_[1] += other.counts_[1];
+ counts_[2] += other.counts_[2];
+ counts_[3] += other.counts_[3];
+ return *this;
+ }
+};
+
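+//Per-position nucleotide counts collected for a single edge.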
+struct MismatchEdgeInfo {
+ NuclCount operator[](size_t i) const {
+ auto it = info_.find(i);
+ if (it == info_.end())
+ return NuclCount();
+ else
+ return it->second;
+ }
+
+ void operator+=(const MismatchEdgeInfo &other) {
+ for (auto it = other.info_.begin(); it != other.info_.end(); ++it) {
+ info_[it->first] += it->second;
+ }
+ }
+
+ void IncIfContains(size_t position, size_t nucl) {
+ auto it = info_.find(position);
+ if (it != info_.end()) {
+ it->second[nucl]++;
+ }
+ }
+
+ void AddPosition(size_t position) {
+        info_[position]; //creates an entry with a default value if the map did not contain this key
+ }
+
+public:
+ map<size_t, NuclCount> info_;
+};
+
+template<typename EdgeId>
+class MismatchStatistics {
+private:
+ typedef typename map<EdgeId, MismatchEdgeInfo>::const_iterator const_iterator;
+ map<EdgeId, MismatchEdgeInfo> statistics_;
+
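+    //Scans the k-mer mapper for k-mers that were remapped with a few substitutions
+    //(and no signs of indels) and registers the affected edge positions as potential mismatch sites.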
+ template<class graph_pack>
+ void CollectPotensialMismatches(const graph_pack &gp) {
+ auto &kmer_mapper = gp.kmer_mapper;
+ for (auto it = kmer_mapper.begin(); it != kmer_mapper.end(); ++it) {
+ runtime_k::RtSeq from = it->first;
+ runtime_k::RtSeq to = it->second;
+ size_t cnt = 0;
+ size_t cnt_arr[4];
+ for (size_t i = 0; i < 4; i++)
+ cnt_arr[i] = 0;
+ for (size_t i = 0; i < from.size(); i++) {
+ if (from[i] != to[i]) {
+ cnt++;
+ cnt_arr[(i * 4) / from.size()]++;
+ }
+ }
+            //last two conditions - to avoid excessive indels.
+            //if two thirds of the nucleotides in the first/last quarter are mismatches, it indicates an erroneous mapping
+
+ if (cnt >= 1 && cnt <= from.size() / 3 && cnt_arr[0] <= from.size() / 6 &&
+ cnt_arr[3] <= from.size() / 6) {
+ for (size_t i = 0; i < from.size(); i++) {
+ if (from[i] != to[i] && gp.index.contains(to)) {
+ pair<EdgeId, size_t> position = gp.index.get(to);
+ statistics_[position.first].AddPosition(position.second + i);
+ }
+ }
+ }
+ }
+ for (auto it = gp.g.ConstEdgeBegin(); !it.IsEnd(); ++it) {
+ if (gp.g.length(*it) < cfg::get().max_repeat_length) {
+ // INFO("edge id " <<gp.g.int_id(*it) << " added to stat" );
+ // for(size_t i = 0; i < gp.g.length(*it) + gp.g.k(); i++)
+ // statistics_[*it].AddPosition(i);
+ }
+ }
+ }
+
+ void operator+=(const MismatchStatistics<EdgeId> &other) {
+ for (auto it = other.statistics_.begin(); it != other.statistics_.end(); ++it) {
+ statistics_[it->first] += it->second;
+ }
+ }
+
+public:
+ template<class graph_pack>
+ MismatchStatistics(const graph_pack &gp) {
+ CollectPotensialMismatches(gp);
+ }
+
+ const_iterator begin() const {
+ return statistics_.begin();
+ }
+
+ const_iterator end() const {
+ return statistics_.end();
+ }
+
+ const_iterator find(const EdgeId &edge) const {
+ return statistics_.find(edge);
+ }
+
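+    //Maps each read to the graph; for reads mapping to a single edge with few mismatches,
+    //counts the observed nucleotides at the tracked positions of that edge.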
+ template<class graph_pack, class read_type>
+ void Count(io::ReadStream<read_type> &stream, const graph_pack &gp) {
+ stream.reset();
+ DEBUG("count started");
+ auto sm = MapperInstance(gp);
+ DEBUG("seq mapper created");
+ while (!stream.eof()) {
+ read_type read;
+ stream >> read;
+ const Sequence &s_read = read.sequence();
+ omnigraph::MappingPath<EdgeId> path = sm->MapSequence(s_read);
+ TRACE("read mapped");
+ if (path.size() == 1 && path[0].second.initial_range.size() == path[0].second.mapped_range.size()) {
+ Range initial_range = path[0].second.initial_range;
+ Range mapped_range = path[0].second.mapped_range;
+ const Sequence &s_edge = gp.g.EdgeNucls(path[0].first);
+ size_t len = initial_range.size() + gp.g.k();
+ size_t cnt = 0;
+ for (size_t i = 0; i < len; i++) {
+ if (s_read[initial_range.start_pos + i] != s_edge[mapped_range.start_pos + i]) {
+ cnt++;
+ }
+ }
+ if (cnt <= gp.g.k() / 3) {
+ TRACE("statistics changing");
+ auto it = statistics_.find(path[0].first);
+ if (it == statistics_.end()) {
+ // if (gp.g.length(path[0].first) < 4000)
+ // WARN ("id "<< gp.g.length(path[0].first)<<" " << len);
+ continue;
+ }
+ for (size_t i = 0; i < len; i++) {
+ size_t nucl_code = s_read[initial_range.start_pos + i];
+ it->second.IncIfContains(mapped_range.start_pos + i, nucl_code);
+ }
+ }
+ }
+ }
+ }
+
+ template<class graph_pack, class read_type>
+ void ParallelCount(io::ReadStreamList<read_type> &streams, const graph_pack &gp) {
+ size_t nthreads = streams.size();
+ std::vector<MismatchStatistics<EdgeId> *> statistics(nthreads);
+#pragma omp parallel for num_threads(nthreads) shared(streams, statistics)
+ for (size_t i = 0; i < nthreads; ++i) {
+ statistics[i] = new MismatchStatistics<EdgeId>(*this);
+ DEBUG("statistics created thread " << i);
+ statistics[i]->Count(streams[i], gp);
+ DEBUG("count finished thread " << i);
+ }
+
+ INFO("Finished collecting potential mismatches positions");
+ for (size_t i = 0; i < statistics.size(); i++) {
+ *this += *statistics[i];
+ delete statistics[i];
+ }
+ }
+};
+}
+
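+//Corrects likely mismatches on graph edges: nucleotide counts are collected from mapped reads
+//at suspicious positions, and a position is corrected when an alternative nucleotide dominates
+//the current one by relative_threshold_.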
+template<class graph_pack, class read_type>
+class MismatchShallNotPass {
+private:
+ typedef typename graph_pack::graph_t Graph;
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+ typedef runtime_k::RtSeq Kmer;
+
+ graph_pack &gp_;
+ double relative_threshold_;
+
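+    //Splits out the (2k+1)-nucleotide region centered on the mismatch, builds a corrected
+    //copy of it and glues the erroneous edge onto that copy.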
+ EdgeId CorrectNucl(EdgeId edge, size_t position, char nucl) {
+ VERIFY(position >= gp_.g.k());
+ if (position + 1 < gp_.g.length(edge)) {
+ edge = gp_.g.SplitEdge(edge, position + 1).first;
+ }
+ EdgeId mismatch = edge;
+ if (position > gp_.g.k()) {
+ auto tmp = gp_.g.SplitEdge(edge, position - gp_.g.k());
+ edge = tmp.first;
+ mismatch = tmp.second;
+ }
+ const Sequence &s_mm = gp_.g.EdgeNucls(mismatch);
+ Sequence correct = s_mm.Subseq(0, gp_.g.k()) + Sequence(string(1, nucl)) +
+ s_mm.Subseq(gp_.g.k() + 1, gp_.g.k() * 2 + 1);
+ if (!gp_.kmer_mapper.CheckCanRemap(s_mm, correct)) {
+ return edge;
+ }
+ VERIFY(nucl != s_mm[gp_.g.k()]);
+ EdgeId correct_edge = gp_.g.AddEdge(gp_.g.EdgeStart(mismatch), gp_.g.EdgeEnd(mismatch), correct);
+ if (position > gp_.g.k()) {
+ gp_.g.GlueEdges(mismatch, correct_edge);
+ return edge;
+ } else {
+ return gp_.g.GlueEdges(mismatch, correct_edge);
+ }
+ }
+
+ EdgeId CorrectNucls(EdgeId edge, const std::vector<pair<size_t, char>> &mismatches) {
+ for (auto it = mismatches.rbegin(); it != mismatches.rend(); ++it) {
+ edge = CorrectNucl(edge, it->first, it->second);
+ }
+ EdgeId tmp = Compressor<Graph>(gp_.g).CompressVertexEdgeId(gp_.g.EdgeEnd(edge));
+ if (tmp == EdgeId(0))
+ return edge;
+ else
+ return tmp;
+ }
+
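+    //For each position picks the most frequent alternative nucleotide and schedules a correction
+    //when it outweighs the current nucleotide by relative_threshold_; the next k positions are
+    //then skipped to avoid overlapping corrections.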
+ vector<pair<size_t, char>> FindMismatches(EdgeId edge, const mismatches::MismatchEdgeInfo &statistics) {
+ vector<pair<size_t, char>> to_correct;
+ const Sequence &s_edge = gp_.g.EdgeNucls(edge);
+ for (size_t i = gp_.g.k(); i < gp_.g.length(edge); i++) {
+ size_t cur_best = 0;
+ mismatches::NuclCount nc = statistics[i];
+ for (size_t j = 1; j < 4; j++) {
+ if (nc[j] > nc[cur_best]) {
+ cur_best = j;
+ }
+ }
+ size_t nucl_code = s_edge[i];
+ if ((double) nc[cur_best] > relative_threshold_ * (double) nc[nucl_code] + 1.) {
+ to_correct.push_back(make_pair(i, cur_best));
+ i += gp_.g.k();
+ }
+
+ }
+ return to_correct;
+ }
+
+ size_t CorrectEdge(EdgeId edge, const mismatches::MismatchEdgeInfo &statistics) {
+ vector<pair<size_t, char>> to_correct = FindMismatches(edge, statistics);
+ EdgeId new_edge = CorrectNucls(edge, to_correct);
+ if (new_edge == EdgeId(0))
+ new_edge = edge;
+
+ return to_correct.size();
+ }
+
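+    //Processes a single representative of every conjugate edge pair so that an edge
+    //and its reverse complement are not corrected twice.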
+ size_t CorrectAllEdges(const mismatches::MismatchStatistics<typename Graph::EdgeId> &statistics) {
+ size_t res = 0;
+ set<EdgeId> conjugate_fix;
+ for (auto it = gp_.g.ConstEdgeBegin(); !it.IsEnd(); ++it) {
+ if (conjugate_fix.find(gp_.g.conjugate(*it)) == conjugate_fix.end()) {
+ conjugate_fix.insert(*it);
+ }
+ }
+ for (auto it = conjugate_fix.begin(); it != conjugate_fix.end(); ++it) {
+ DEBUG("processing edge" << gp_.g.int_id(*it));
+
+ if (statistics.find(*it) != statistics.end()) {
+ if (!gp_.g.RelatedVertices(gp_.g.EdgeStart(*it), gp_.g.EdgeEnd(*it)))
+ res += CorrectEdge(*it, statistics.find(*it)->second);
+ }
+ }
+ INFO("All edges processed");
+ return res;
+ }
+
+ size_t StopMismatchIteration(io::ReadStream<read_type> &stream) {
+ mismatches::MismatchStatistics<typename Graph::EdgeId> statistics(gp_);
+ statistics.Count(stream, gp_);
+ return CorrectAllEdges(statistics);
+ }
+
+ size_t ParallelStopMismatchIteration(io::ReadStreamList<read_type> &streams) {
+ mismatches::MismatchStatistics<typename Graph::EdgeId> statistics(gp_);
+ statistics.ParallelCount(streams, gp_);
+ return CorrectAllEdges(statistics);
+ }
+
+public:
+ MismatchShallNotPass(graph_pack &gp, double relative_threshold = 1.5) : gp_(gp), relative_threshold_(
+ relative_threshold) {
+ VERIFY(relative_threshold >= 1);
+ }
+
+
+ size_t StopAllMismatches(io::ReadStream<read_type> &stream, size_t max_iterations = 1) {
+ size_t res = 0;
+ while (max_iterations > 0) {
+ size_t last = StopMismatchIteration(stream);
+ res += last;
+ if (last == 0)
+ break;
+ max_iterations--;
+ }
+ return res;
+ }
+
+ size_t ParallelStopAllMismatches(io::ReadStreamList<read_type> &streams, size_t max_iterations = 1) {
+ size_t res = 0;
+ while (max_iterations > 0) {
+ size_t last = ParallelStopMismatchIteration(streams);
+ res += last;
+ if (last == 0)
+ break;
+ max_iterations--;
+ }
+ return res;
+ }
+};
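+//Usage sketch (illustrative only; the graph pack, read type and stream list are assumed to be set up elsewhere):
+//  MismatchShallNotPass<conj_graph_pack, io::SingleRead> corrector(gp, 1.5);
+//  size_t corrected = corrector.ParallelStopAllMismatches(streams);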
+
+}
diff --git a/src/modules/algorithms/path_extend/CMakeLists.txt b/src/modules/algorithms/path_extend/CMakeLists.txt
new file mode 100644
index 0000000..03b447b
--- /dev/null
+++ b/src/modules/algorithms/path_extend/CMakeLists.txt
@@ -0,0 +1,18 @@
+############################################################################
+# Copyright (c) 2015 Saint Petersburg State University
+# Copyright (c) 2011-2014 Saint Petersburg Academic University
+# All Rights Reserved
+# See file LICENSE for details.
+############################################################################
+
+project(path_extend CXX)
+
+add_library(path_extend STATIC pe_config_struct.cpp
+ scaffolder2015/extension_chooser2015.cpp
+ scaffolder2015/scaffold_graph.cpp
+ scaffolder2015/scaffold_graph_constructor.cpp
+ scaffolder2015/scaffold_graph_visualizer.cpp
+ scaffolder2015/connection_condition2015.cpp)
+
+target_link_libraries(path_extend graph_support)
+
diff --git a/src/modules/algorithms/path_extend/extension_chooser.hpp b/src/modules/algorithms/path_extend/extension_chooser.hpp
new file mode 100644
index 0000000..13f197c
--- /dev/null
+++ b/src/modules/algorithms/path_extend/extension_chooser.hpp
@@ -0,0 +1,1511 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+/*
+ * extension.hpp
+ *
+ * Created on: Mar 5, 2012
+ * Author: andrey
+ */
+
+#ifndef EXTENSION_HPP_
+#define EXTENSION_HPP_
+
+#include <cfloat>
+#include <iostream>
+#include <fstream>
+#include "weight_counter.hpp"
+#include "pe_utils.hpp"
+#include "next_path_searcher.hpp"
+
+//#include "scaff_supplementary.hpp"
+
+namespace path_extend {
+
+typedef std::multimap<double, EdgeWithDistance> AlternativeContainer;
+
+
+class PathAnalyzer {
+protected:
+ const Graph& g_;
+
+public:
+ PathAnalyzer(const Graph& g): g_(g) {
+ }
+
+ void RemoveTrivial(const BidirectionalPath& path, std::set<size_t>& to_exclude, bool exclude_bulges = true) const {
+ if (exclude_bulges) {
+ ExcludeTrivialWithBulges(path, to_exclude);
+ } else {
+ ExcludeTrivial(path, to_exclude);
+ }
+ }
+
+protected:
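+    //Walks the path backwards from the given position while each vertex has a unique incoming edge,
+    //marking those trivially resolved positions for exclusion; returns the index at which the walk stopped.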
+ virtual int ExcludeTrivial(const BidirectionalPath& path, std::set<size_t>& edges, int from = -1) const {
+ int edgeIndex = (from == -1) ? (int) path.Size() - 1 : from;
+ if ((int) path.Size() <= from) {
+ return edgeIndex;
+ }
+ VertexId currentVertex = g_.EdgeEnd(path[edgeIndex]);
+ while (edgeIndex >= 0 && g_.CheckUniqueIncomingEdge(currentVertex)) {
+ EdgeId e = g_.GetUniqueIncomingEdge(currentVertex);
+ currentVertex = g_.EdgeStart(e);
+
+ edges.insert((size_t) edgeIndex);
+ --edgeIndex;
+ }
+ return edgeIndex;
+ }
+
+ virtual int ExcludeTrivialWithBulges(const BidirectionalPath& path, std::set<size_t>& edges) const {
+
+ if (path.Empty()) {
+ return 0;
+ }
+
+ int lastEdge = (int) path.Size() - 1;
+ do {
+ lastEdge = ExcludeTrivial(path, edges, lastEdge);
+ bool bulge = true;
+
+ if (lastEdge >= 0) {
+ VertexId v = g_.EdgeEnd(path[lastEdge]);
+ VertexId u = g_.EdgeStart(path[lastEdge]);
+ auto bulgeCandidates = g_.IncomingEdges(v);
+
+ for (const auto& candidate: bulgeCandidates) {
+ if (g_.EdgeStart(candidate) != u) {
+ bulge = false;
+ break;
+ }
+ }
+
+ if (!bulge) {
+ break;
+ }
+ --lastEdge;
+ }
+ } while (lastEdge >= 0);
+
+ return lastEdge;
+ }
+
+protected:
+ DECL_LOGGER("PathAnalyzer")
+};
+
+
+class PreserveSimplePathsAnalyzer: public PathAnalyzer {
+
+public:
+ PreserveSimplePathsAnalyzer(const Graph &g) : PathAnalyzer(g) {
+ }
+
+ int ExcludeTrivial(const BidirectionalPath& path, std::set<size_t>& edges, int from = -1) const override {
+ int edgeIndex = PathAnalyzer::ExcludeTrivial(path, edges, from);
+
+ //Preserving simple path
+ if (edgeIndex == -1) {
+ edges.clear();
+            return (from == -1) ? (int) path.Size() - 1 : from;
+ }
+ return edgeIndex;
+ }
+
+ int ExcludeTrivialWithBulges(const BidirectionalPath& path, std::set<size_t>& edges) const override {
+
+ if (path.Empty()) {
+ return 0;
+ }
+
+ int lastEdge = (int) path.Size() - 1;
+ bool has_bulge = false;
+ do {
+ lastEdge = PathAnalyzer::ExcludeTrivial(path, edges, lastEdge);
+
+ if (lastEdge >= 0) {
+ VertexId v = g_.EdgeEnd(path[lastEdge]);
+ VertexId u = g_.EdgeStart(path[lastEdge]);
+ auto bulgeCandidates = g_.IncomingEdges(v);
+ has_bulge = true;
+
+ for (auto iter = bulgeCandidates.begin(); iter != bulgeCandidates.end(); ++iter) {
+ if (g_.EdgeStart(*iter) != u) {
+ has_bulge = false;
+ break;
+ }
+ }
+
+ --lastEdge;
+ }
+ } while (lastEdge >= 0);
+
+ //Preserving simple path
+ if (!has_bulge && lastEdge == -1) {
+ edges.clear();
+ lastEdge = (int) path.Size() - 1;
+ }
+
+ return lastEdge;
+ }
+
+protected:
+ DECL_LOGGER("PathAnalyzer")
+
+};
+
+
+class ExtensionChooserListener {
+
+public:
+
+ virtual void ExtensionChosen(double weight) = 0;
+
+ virtual void ExtensionChosen(const AlternativeContainer& alts) = 0;
+
+ virtual ~ExtensionChooserListener() {
+
+ }
+};
+
+
+class ExtensionChooser {
+
+public:
+ typedef std::vector<EdgeWithDistance> EdgeContainer;
+
+protected:
+ const Graph& g_;
+ shared_ptr<WeightCounter> wc_;
+ //FIXME memory leak?!
+ std::vector<ExtensionChooserListener *> listeners_;
+
+ double weight_threshold_;
+
+public:
+ ExtensionChooser(const Graph& g, shared_ptr<WeightCounter> wc = nullptr, double weight_threshold = -1.):
+ g_(g), wc_(wc),
+ weight_threshold_(weight_threshold) {
+ }
+
+ virtual ~ExtensionChooser() {
+
+ }
+
+ virtual EdgeContainer Filter(const BidirectionalPath& path, const EdgeContainer& edges) const = 0;
+
+ bool CheckThreshold(double weight) const {
+ return math::ge(weight, weight_threshold_);
+ }
+
+ void Subscribe(ExtensionChooserListener * listener) {
+ listeners_.push_back(listener);
+ }
+
+ void NotifyAll(double weight) const {
+ for (auto listener_ptr : listeners_) {
+ listener_ptr->ExtensionChosen(weight);
+ }
+ }
+
+ void NotifyAll(const AlternativeContainer& alts) const {
+ for (auto listener_ptr : listeners_) {
+ listener_ptr->ExtensionChosen(alts);
+ }
+ }
+
+ bool WeightCounterBased() const {
+ return wc_ != nullptr;
+ }
+
+ const WeightCounter& wc() const {
+ VERIFY(wc_);
+ return *wc_;
+ }
+
+protected:
+ bool HasIdealInfo(EdgeId e1, EdgeId e2, size_t dist) const {
+ return math::gr(wc_->lib().IdealPairedInfo(e1, e2, (int) dist), 0.);
+ }
+
+ bool HasIdealInfo(const BidirectionalPath& p, EdgeId e, size_t gap) const {
+ for (int i = (int) p.Size() - 1; i >= 0; --i)
+ if (HasIdealInfo(p[i], e, gap + p.LengthAt(i)))
+ return true;
+ return false;
+ }
+
+private:
+ DECL_LOGGER("ExtensionChooser");
+};
+
+
+class JointExtensionChooser: public ExtensionChooser {
+
+protected:
+ shared_ptr<ExtensionChooser> first_;
+
+ shared_ptr<ExtensionChooser> second_;
+
+public:
+ JointExtensionChooser(const Graph& g, shared_ptr<ExtensionChooser> first, shared_ptr<ExtensionChooser> second): ExtensionChooser(g),
+ first_(first), second_(second)
+ {
+ }
+
+ EdgeContainer Filter(const BidirectionalPath& path, const EdgeContainer& edges) const override {
+ EdgeContainer e1 = first_->Filter(path, edges);
+ return second_->Filter(path, e1);
+ }
+};
+
+
+class TrivialExtensionChooser: public ExtensionChooser {
+
+public:
+ TrivialExtensionChooser(Graph& g): ExtensionChooser(g) {
+ }
+
+ EdgeContainer Filter(const BidirectionalPath& /*path*/, const EdgeContainer& edges) const override {
+ if (edges.size() == 1) {
+ return edges;
+ }
+ return EdgeContainer();
+ }
+};
+
+
+class TrivialExtensionChooserWithPI: public ExtensionChooser {
+
+public:
+ TrivialExtensionChooserWithPI(Graph& g, shared_ptr<WeightCounter> wc, double weight_threshold):
+ ExtensionChooser(g, wc, weight_threshold) {
+ }
+
+ EdgeContainer Filter(const BidirectionalPath& path, const EdgeContainer& edges) const override {
+ if (edges.size() == 1) {
+ double weight = wc_->CountWeight(path, edges.back().e_, std::set<size_t>());
+ NotifyAll(weight);
+
+ if (CheckThreshold(weight)) {
+ return edges;
+ }
+ }
+ return EdgeContainer();
+ }
+};
+
+class ExcludingExtensionChooser: public ExtensionChooser {
+ //FIXME what is the logic behind it?
+protected:
+ PathAnalyzer analyzer_;
+ double prior_coeff_;
+
+ AlternativeContainer FindWeights(const BidirectionalPath& path, const EdgeContainer& edges, const std::set<size_t>& to_exclude) const {
+ AlternativeContainer weights;
+ for (auto iter = edges.begin(); iter != edges.end(); ++iter) {
+ double weight = wc_->CountWeight(path, iter->e_, to_exclude);
+ weights.insert(std::make_pair(weight, *iter));
+ DEBUG("Candidate " << g_.int_id(iter->e_) << " weight " << weight << " length " << g_.length(iter->e_));
+ }
+ NotifyAll(weights);
+ return weights;
+ }
+
+ EdgeContainer FindPossibleEdges(const AlternativeContainer& weights,
+ double max_weight) const {
+ EdgeContainer top;
+ auto possible_edge = weights.lower_bound(max_weight / prior_coeff_);
+ for (auto iter = possible_edge; iter != weights.end(); ++iter) {
+ top.push_back(iter->second);
+ }
+ return top;
+ }
+
+ EdgeContainer FindFilteredEdges(const BidirectionalPath& path,
+ const EdgeContainer& edges, const std::set<size_t>& to_exclude) const {
+ AlternativeContainer weights = FindWeights(path, edges, to_exclude);
+ auto max_weight = (--weights.end())->first;
+ EdgeContainer top = FindPossibleEdges(weights, max_weight);
+ EdgeContainer result;
+ if (top.size() >= 1 && CheckThreshold(max_weight)) {
+ result = top;
+ }
+ return result;
+ }
+
+protected:
+
+ virtual void ExcludeEdges(const BidirectionalPath& path, const EdgeContainer& edges, std::set<size_t>& to_exclude) const = 0;
+
+public:
+ ExcludingExtensionChooser(const Graph& g, shared_ptr<WeightCounter> wc, PathAnalyzer analyzer, double weight_threshold, double priority) :
+ ExtensionChooser(g, wc, weight_threshold), analyzer_(analyzer), prior_coeff_(priority) {
+
+ }
+
+ virtual EdgeContainer Filter(const BidirectionalPath& path,
+ const EdgeContainer& edges) const {
+ DEBUG("Paired-end extension chooser");
+ if (edges.empty()) {
+ return edges;
+ }
+ std::set<size_t> to_exclude;
+ analyzer_.RemoveTrivial(path, to_exclude);
+ path.Print();
+ EdgeContainer result = edges;
+ ExcludeEdges(path, result, to_exclude);
+ result = FindFilteredEdges(path, result, to_exclude);
+ if (result.size() == 1) {
+ DEBUG("Paired-end extension chooser helped");
+ }
+ return result;
+ }
+
+private:
+ DECL_LOGGER("ExcludingExtensionChooser");
+
+};
+
+class SimpleExtensionChooser: public ExcludingExtensionChooser {
+protected:
+ void ExcludeEdges(const BidirectionalPath& path, const EdgeContainer& edges, std::set<size_t>& to_exclude) const override {
+ if (edges.size() < 2) {
+ return;
+ }
+        //excluding based on absence of ideal info
+ int index = (int) path.Size() - 1;
+ while (index >= 0) {
+ if (to_exclude.count(index)) {
+ index--;
+ continue;
+ }
+ EdgeId path_edge = path[index];
+
+ for (size_t i = 0; i < edges.size(); ++i) {
+ if (!HasIdealInfo(path_edge,
+ edges.at(i).e_,
+ path.LengthAt(index))) {
+ to_exclude.insert((size_t) index);
+ }
+ }
+
+ index--;
+ }
+
+        //excluding based on presence of ambiguous paired info
+ map<size_t, unsigned> edge_2_extension_cnt;
+ for (size_t i = 0; i < edges.size(); ++i) {
+ for (size_t e : wc_->PairInfoExist(path, edges.at(i).e_)) {
+ edge_2_extension_cnt[e] += 1;
+ }
+ }
+
+ for (auto e_w_ec : edge_2_extension_cnt) {
+ if (e_w_ec.second == edges.size()) {
+ to_exclude.insert(e_w_ec.first);
+ }
+ }
+ }
+
+public:
+
+ SimpleExtensionChooser(const Graph& g, shared_ptr<WeightCounter> wc, double weight_threshold, double priority) :
+ ExcludingExtensionChooser(g, wc, PathAnalyzer(g), weight_threshold, priority) {
+ }
+
+private:
+ DECL_LOGGER("SimpleExtensionChooser");
+};
+
+
+class RNAExtensionChooser: public ExcludingExtensionChooser {
+protected:
+ void ExcludeEdges(const BidirectionalPath& /*path*/, const EdgeContainer& /*edges*/, std::set<size_t>& /*to_exclude*/) const override {
+ }
+
+public:
+
+ RNAExtensionChooser(const Graph& g, shared_ptr<WeightCounter> wc, double weight_threshold, double priority) :
+ ExcludingExtensionChooser(g, wc, PreserveSimplePathsAnalyzer(g), weight_threshold, priority) {
+ }
+
+private:
+ DECL_LOGGER("SimpleExtensionChooser");
+};
+
+class LongEdgeExtensionChooser: public ExcludingExtensionChooser {
+protected:
+ virtual void ExcludeEdges(const BidirectionalPath& path, const EdgeContainer& edges, std::set<size_t>& to_exclude) const {
+ if (edges.size() < 2) {
+ return;
+ }
+ int index = (int) path.Size() - 1;
+ while (index >= 0) {
+ if (to_exclude.count(index)) {
+ index--;
+ continue;
+ }
+ EdgeId path_edge = path[index];
+ //FIXME configure!
+ if (path.graph().length(path_edge) < 200)
+ to_exclude.insert((size_t) index);
+ index--;
+ }
+ }
+public:
+ LongEdgeExtensionChooser(const Graph& g, shared_ptr<WeightCounter> wc, double weight_threshold, double priority) :
+ ExcludingExtensionChooser(g, wc, PathAnalyzer(g), weight_threshold, priority) {
+ }
+};
+
+class ScaffoldingExtensionChooser : public ExtensionChooser {
+
+protected:
+ typedef ExtensionChooser base;
+ double raw_weight_threshold_;
+ double cl_weight_threshold_;
+ const double is_scatter_coeff_ = 3.0;
+
+ void AddInfoFromEdge(const std::vector<int>& distances, const std::vector<double>& weights,
+ std::vector<pair<int, double>>& histogram, size_t len_to_path_end) const {
+ for (size_t l = 0; l < distances.size(); ++l) {
+ //todo commented out condition seems unnecessary and should be library dependent! do we need "max(0" there?
+ if (/*distances[l] > max(0, (int) len_to_path_end - int(1000)) && */math::ge(weights[l], raw_weight_threshold_)) {
+ histogram.push_back(make_pair(distances[l] - (int) len_to_path_end, weights[l]));
+ }
+ }
+ }
+
+ int CountMean(const vector<pair<int, double> >& histogram) const {
+ double dist = 0.0;
+ double sum = 0.0;
+ for (size_t i = 0; i < histogram.size(); ++i) {
+ dist += histogram[i].first * histogram[i].second;
+ sum += histogram[i].second;
+ }
+ dist /= sum;
+ return (int) round(dist);
+ }
+
+ void GetDistances(EdgeId e1, EdgeId e2, std::vector<int>& dist,
+ std::vector<double>& w) const {
+ wc_->lib().CountDistances(e1, e2, dist, w);
+ }
+
+ void CountAvrgDists(const BidirectionalPath& path, EdgeId e, std::vector<pair<int, double>> & histogram) const {
+ for (size_t j = 0; j < path.Size(); ++j) {
+ std::vector<int> distances;
+ std::vector<double> weights;
+ GetDistances(path.At(j), e, distances, weights);
+ if (distances.size() > 0) {
+ AddInfoFromEdge(distances, weights, histogram, path.LengthAt(j));
+ }
+ }
+ }
+
+ void FindBestFittedEdgesForClustered(const BidirectionalPath& path, const set<EdgeId>& edges, EdgeContainer& result) const {
+ for (EdgeId e : edges) {
+ std::vector<pair<int, double>> histogram;
+ CountAvrgDists(path, e, histogram);
+ double sum = 0.0;
+ for (size_t j = 0; j < histogram.size(); ++j) {
+ sum += histogram[j].second;
+ }
+ if (sum <= cl_weight_threshold_) {
+ continue;
+ }
+ int gap = CountMean(histogram);
+ if (HasIdealInfo(path, e, gap)) {
+ DEBUG("scaffolding " << g_.int_id(e) << " gap " << gap);
+ result.push_back(EdgeWithDistance(e, gap));
+ }
+ }
+ }
+
+ bool IsTip(EdgeId e) const {
+ return g_.IncomingEdgeCount(g_.EdgeStart(e)) == 0;
+ }
+
+ set<EdgeId> FindCandidates(const BidirectionalPath& path) const {
+ set<EdgeId> jumping_edges;
+ const auto& lib = wc_->lib();
+ //todo lib (and FindJumpEdges) knows its var so it can be counted there
+ int is_scatter = int(math::round(double(lib.GetIsVar()) * is_scatter_coeff_));
+ for (int i = (int) path.Size() - 1; i >= 0 && path.LengthAt(i) - g_.length(path.At(i)) <= lib.GetISMax(); --i) {
+ set<EdgeId> jump_edges_i;
+ lib.FindJumpEdges(path.At(i), jump_edges_i,
+ std::max(0, (int)path.LengthAt(i) - is_scatter),
+ //FIXME do we need is_scatter here?
+ int((path.LengthAt(i) + lib.GetISMax() + is_scatter)),
+ 0);
+ for (EdgeId e : jump_edges_i) {
+ if (IsTip(e)) {
+ jumping_edges.insert(e);
+ }
+ }
+ }
+ return jumping_edges;
+ }
+
+public:
+
+ ScaffoldingExtensionChooser(const Graph& g, shared_ptr<WeightCounter> wc, double is_scatter_coeff) :
+ ExtensionChooser(g, wc), raw_weight_threshold_(0.0),
+ cl_weight_threshold_(cfg::get().pe_params.param_set.scaffolder_options.cl_threshold),
+ is_scatter_coeff_(is_scatter_coeff) {
+ }
+
+ EdgeContainer Filter(const BidirectionalPath& path, const EdgeContainer& edges) const override {
+ if (edges.empty()) {
+ return edges;
+ }
+ set<EdgeId> candidates = FindCandidates(path);
+ EdgeContainer result;
+ FindBestFittedEdgesForClustered(path, candidates, result);
+ return result;
+ }
+private:
+ DECL_LOGGER("ScaffoldingExtensionChooser");
+};
+
+inline bool EdgeWithWeightCompareReverse(const pair<EdgeId, double>& p1,
+ const pair<EdgeId, double>& p2) {
+ return p1.second > p2.second;
+}
+
+class LongReadsUniqueEdgeAnalyzer {
+private:
+ DECL_LOGGER("LongReadsUniqueEdgeAnalyzer")
+public:
+ LongReadsUniqueEdgeAnalyzer(const Graph& g, const GraphCoverageMap& cov_map,
+ double filter_threshold, double prior_threshold, size_t max_repeat_length)
+ : g_(g),
+ cov_map_(cov_map),
+ filter_threshold_(filter_threshold),
+ prior_threshold_(prior_threshold),
+ max_repeat_length_(max_repeat_length) {
+ FindAllUniqueEdges();
+ }
+
+ bool IsUnique(EdgeId e) const {
+ return unique_edges_.count(e) > 0;
+ }
+
+private:
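+    //An edge longer than max_repeat_length_ is unique by definition; otherwise it is unique
+    //if no covering long-read path contains it twice and no pair of covering paths is
+    //significantly inconsistent around it.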
+ bool UniqueEdge(EdgeId e) const {
+ if (g_.length(e) > max_repeat_length_)
+ return true;
+ DEBUG("Analyze unique edge " << g_.int_id(e));
+ if (cov_map_.size() == 0) {
+ return false;
+ }
+ auto cov_paths = cov_map_.GetCoveringPaths(e);
+ for (auto it1 = cov_paths.begin(); it1 != cov_paths.end(); ++it1) {
+ auto pos1 = (*it1)->FindAll(e);
+ if (pos1.size() > 1) {
+ DEBUG("***not unique " << g_.int_id(e) << " len " << g_.length(e) << "***");
+ return false;
+ }
+ for (auto it2 = it1; it2 != cov_paths.end(); it2++) {
+ auto pos2 = (*it2)->FindAll(e);
+ if (pos2.size() > 1) {
+ DEBUG("***not unique " << g_.int_id(e) << " len " << g_.length(e) << "***");
+ return false;
+ }
+ if (!ConsistentPath(**it1, pos1[0], **it2, pos2[0])) {
+ DEBUG("Checking inconsistency");
+ if (CheckInconsistence(**it1, pos1[0], **it2, pos2[0],
+ cov_paths)) {
+ DEBUG("***not unique " << g_.int_id(e) << " len " << g_.length(e) << "***");
+ return false;
+ }
+ }
+ }
+ }
+ DEBUG("***edge " << g_.int_id(e) << " is unique.***");
+ return true;
+ }
+
+ bool ConsistentPath(const BidirectionalPath& path1, size_t pos1,
+ const BidirectionalPath& path2, size_t pos2) const {
+ return EqualBegins(path1, pos1, path2, pos2, false)
+ && EqualEnds(path1, pos1, path2, pos2, false);
+ }
+ bool SignificantlyDiffWeights(double w1, double w2) const {
+ if (w1 > filter_threshold_ and w2 > filter_threshold_) {
+ if (w1 > w2 * prior_threshold_ or w2 > w1 * prior_threshold_) {
+ return true;
+ }
+ return false;
+ }
+ return true;
+ }
+
+ bool CheckInconsistence(
+ const BidirectionalPath& path1, size_t pos1,
+ const BidirectionalPath& path2, size_t pos2,
+ const BidirectionalPathSet& cov_paths) const {
+ size_t first_diff_pos1 = FirstNotEqualPosition(path1, pos1, path2, pos2, false);
+ size_t first_diff_pos2 = FirstNotEqualPosition(path2, pos2, path1, pos1, false);
+ if (first_diff_pos1 != -1UL && first_diff_pos2 != -1UL) {
+ const BidirectionalPath cand1 = path1.SubPath(first_diff_pos1,
+ pos1 + 1);
+ const BidirectionalPath cand2 = path2.SubPath(first_diff_pos2,
+ pos2 + 1);
+ std::pair<double, double> weights = GetSubPathsWeights(cand1, cand2,
+ cov_paths);
+ DEBUG("Not equal begin " << g_.int_id(path1.At(first_diff_pos1)) << " weight " << weights.first << "; " << g_.int_id(path2.At(first_diff_pos2)) << " weight " << weights.second);
+ if (!SignificantlyDiffWeights(weights.first, weights.second)) {
+ DEBUG("not significantly different");
+ return true;
+ }
+ }
+ size_t last_diff_pos1 = LastNotEqualPosition(path1, pos1, path2, pos2, false);
+ size_t last_diff_pos2 = LastNotEqualPosition(path2, pos2, path1, pos1, false);
+ if (last_diff_pos1 != -1UL) {
+ const BidirectionalPath cand1 = path1.SubPath(pos1,
+ last_diff_pos1 + 1);
+ const BidirectionalPath cand2 = path2.SubPath(pos2,
+ last_diff_pos2 + 1);
+ std::pair<double, double> weights = GetSubPathsWeights(cand1, cand2,
+ cov_paths);
+ DEBUG("Not equal end " << g_.int_id(path1.At(last_diff_pos1)) << " weight " << weights.first << "; " << g_.int_id(path2.At(last_diff_pos2)) << " weight " << weights.second);
+ if (!SignificantlyDiffWeights(weights.first, weights.second)) {
+ DEBUG("not significantly different");
+ return true;
+ }
+ }
+ return false;
+ }
+
+ std::pair<double, double> GetSubPathsWeights(
+ const BidirectionalPath& cand1, const BidirectionalPath& cand2,
+ const BidirectionalPathSet& cov_paths) const {
+ double weight1 = 0.0;
+ double weight2 = 0.0;
+ for (auto iter = cov_paths.begin(); iter != cov_paths.end(); ++iter) {
+ BidirectionalPath* path = *iter;
+ if (ContainSubPath(*path, cand1)) {
+ weight1 += path->GetWeight();
+ } else if (ContainSubPath(*path, cand2)) {
+ weight2 += path->GetWeight();
+ }
+ }
+ return std::make_pair(weight1, weight2);
+ }
+
+ bool ContainSubPath(const BidirectionalPath& path,
+ const BidirectionalPath& subpath) const {
+ for (size_t i = 0; i < path.Size(); ++i) {
+ if (path.CompareFrom(i, subpath))
+ return true;
+ }
+ return false;
+ }
+
+ void FindAllUniqueCoverageEdges() {
+ if (cfg::get().uneven_depth) {
+ return;
+ }
+ double sum_cov = 0;
+ size_t sum_len = 0;
+ size_t total_len = 0;
+ for (auto iter = g_.ConstEdgeBegin(); !iter.IsEnd(); ++iter) {
+ total_len += g_.length(*iter);
+ if (g_.length(*iter) >= cfg::get().max_repeat_length) {
+ sum_cov += g_.coverage(*iter) * (double)g_.length(*iter);
+ sum_len += g_.length(*iter);
+ }
+ }
+ if (sum_len * 4 < total_len) return;
+ sum_cov /= (double)sum_len;
+ DEBUG("average coverage of long edges: " << sum_cov) ;
+ for (auto iter = g_.ConstEdgeBegin(); !iter.IsEnd(); ++iter) {
+ if (g_.length(*iter) > 500 && (double)g_.coverage(*iter) < 1.2 * sum_cov) {
+ if (unique_edges_.find(*iter) == unique_edges_.end()) {
+ unique_edges_.insert(*iter);
+ unique_edges_.insert(g_.conjugate(*iter));
+ DEBUG("Added coverage based unique edge " << g_.int_id(*iter) << " len "<< g_.length(*iter) << " " << g_.coverage(*iter));
+ }
+ }
+ }
+ }
+
+
+ void FindAllUniqueEdges() {
+ DEBUG("Looking for unique edges");
+ for (auto iter = g_.ConstEdgeBegin(); !iter.IsEnd(); ++iter) {
+ if (UniqueEdge(*iter)) {
+ unique_edges_.insert(*iter);
+ unique_edges_.insert(g_.conjugate(*iter));
+ }
+ }
+ DEBUG("coverage based uniqueness started");
+ FindAllUniqueCoverageEdges();
+ DEBUG("Unique edges are found");
+ }
+
+ const Graph& g_;
+ const GraphCoverageMap& cov_map_;
+ double filter_threshold_;
+ double prior_threshold_;
+ std::set<EdgeId> unique_edges_;
+ size_t max_repeat_length_;
+};
+
+class SimpleScaffolding {
+public:
+ SimpleScaffolding(const Graph& g) : g_(g) {}
+
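+    //Finds the longest subpath that occurs near the end (within max_diff_len) of every given path;
+    //this common "tail" is later used for scaffolding decisions.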
+ BidirectionalPath FindMaxCommonPath(const vector<BidirectionalPath*>& paths,
+ size_t max_diff_len) const {
+ BidirectionalPath max_end(g_);
+ for (auto it1 = paths.begin(); it1 != paths.end(); ++it1) {
+ BidirectionalPath* p1 = *it1;
+ for (size_t i = 0; i < p1->Size(); ++i) {
+ if (p1->Length() - p1->LengthAt(i) > max_diff_len) {
+ break;
+ }
+ bool contain_all = true;
+ for (size_t i1 = i + 1; i1 <= p1->Size() && contain_all; ++i1) {
+ BidirectionalPath subpath = p1->SubPath(i, i1);
+ for (auto it2 = paths.begin(); it2 != paths.end() && contain_all; ++it2) {
+ BidirectionalPath* p2 = *it2;
+ vector<size_t> positions2 = p2->FindAll(subpath.At(0));
+ bool contain = false;
+ for (size_t ipos2 = 0; ipos2 < positions2.size(); ++ipos2) {
+ size_t pos2 = positions2[ipos2];
+ if (p2->Length() - p2->LengthAt(pos2) <= max_diff_len
+ && EqualEnds(subpath, 0, *p2, pos2, false)) {
+ contain = true;
+ break;
+ }
+ }
+ if (!contain) {
+ contain_all = false;
+ }
+ }
+ if (contain_all && (i1 - i) >= max_end.Size()) {
+ max_end.Clear();
+ max_end.PushBack(subpath);
+ }
+ }
+ }
+ }
+ return max_end;
+ }
+
+private:
+ const Graph& g_;
+};
+
+class LongReadsExtensionChooser : public ExtensionChooser {
+public:
+ LongReadsExtensionChooser(const Graph& g, PathContainer& pc,
+ double filtering_threshold,
+ double weight_priority_threshold,
+ double unique_edge_priority_threshold,
+ size_t min_significant_overlap,
+ size_t max_repeat_length)
+ : ExtensionChooser(g),
+ filtering_threshold_(filtering_threshold),
+ weight_priority_threshold_(weight_priority_threshold),
+ min_significant_overlap_(min_significant_overlap),
+ cov_map_(g, pc),
+ unique_edge_analyzer_(g, cov_map_, filtering_threshold, unique_edge_priority_threshold, max_repeat_length),
+ simple_scaffolding_(g) {
+
+ }
+
+    /* Chooses an extension as correct only if there are reads that traverse both a unique edge of the path and this extension.
+     * An edge is unique if all reads mapped to it are consistent.
+     * Two reads are consistent if they can form a single path in the graph.
+     */
+ EdgeContainer Filter(const BidirectionalPath& path,
+ const EdgeContainer& edges) const override {
+ if (edges.empty()) {
+ return edges;
+ }DEBUG("We in Filter of LongReadsExtensionChooser");
+ path.Print();
+ map<EdgeId, double> weights_cands;
+ for (auto it = edges.begin(); it != edges.end(); ++it) {
+ weights_cands.insert(make_pair(it->e_, 0.0));
+ }
+ set<EdgeId> filtered_cands;
+ map<EdgeId, BidirectionalPathSet > support_paths_ends;
+ auto support_paths = cov_map_.GetCoveringPaths(path.Back());
+ DEBUG("Found " << support_paths.size() << " covering paths!!!");
+ for (auto it = support_paths.begin(); it != support_paths.end(); ++it) {
+ auto positions = (*it)->FindAll(path.Back());
+ (*it)->Print();
+ for (size_t i = 0; i < positions.size(); ++i) {
+ if ((int) positions[i] < (int) (*it)->Size() - 1
+ && EqualBegins(path, (int) path.Size() - 1, **it,
+ positions[i], false)) {
+ DEBUG("Checking unique path_back for " << (*it)->GetId());
+
+ if (UniqueBackPath(**it, positions[i])) {
+ DEBUG("Success");
+
+ EdgeId next = (*it)->At(positions[i] + 1);
+ weights_cands[next] += (*it)->GetWeight();
+ filtered_cands.insert(next);
+ if (support_paths_ends.count(next) == 0){
+ support_paths_ends[next] = BidirectionalPathSet();
+ }
+ support_paths_ends[next].insert(new BidirectionalPath((*it)->SubPath(positions[i] + 1)));
+ }
+ }
+ }
+ }
+ DEBUG("Candidates");
+ for (auto iter = weights_cands.begin(); iter != weights_cands.end(); ++iter) {
+ DEBUG("Candidate " << g_.int_id(iter->first) << " weight " << iter->second);
+ }
+ vector<pair<EdgeId, double> > sort_res = MapToSortVector(weights_cands);
+ DEBUG("sort res " << sort_res.size() << " tr " << weight_priority_threshold_);
+ if (sort_res.size() < 1 || sort_res[0].second < filtering_threshold_) {
+ filtered_cands.clear();
+ } else if (sort_res.size() > 1
+ && sort_res[0].second > weight_priority_threshold_ * sort_res[1].second) {
+ filtered_cands.clear();
+ filtered_cands.insert(sort_res[0].first);
+ } else if (sort_res.size() > 1) {
+ for (size_t i = 0; i < sort_res.size(); ++i) {
+ if (sort_res[i].second * weight_priority_threshold_ < sort_res[0].second) {
+ filtered_cands.erase(sort_res[i].first);
+ }
+ }
+ }
+ EdgeContainer result;
+ for (auto it = edges.begin(); it != edges.end(); ++it) {
+ if (filtered_cands.find(it->e_) != filtered_cands.end()) {
+ result.push_back(*it);
+ }
+ }
+ if (result.size() != 1) {
+ DEBUG("Long reads doesn't help =(");
+ }
+ return result;
+ }
+
+private:
+ bool UniqueBackPath(const BidirectionalPath& path, size_t pos) const {
+ int int_pos = (int) pos;
+ while (int_pos >= 0) {
+            if (unique_edge_analyzer_.IsUnique(path.At(int_pos)) && g_.length(path.At(int_pos)) >= min_significant_overlap_)
+ return true;
+ int_pos--;
+ }
+ return false;
+ }
+
+ vector<pair<EdgeId, double> > MapToSortVector(const map<EdgeId, double>& map) const {
+ vector<pair<EdgeId, double> > result1(map.begin(), map.end());
+ std::sort(result1.begin(), result1.end(), EdgeWithWeightCompareReverse);
+ return result1;
+ }
+
+ double filtering_threshold_;
+ double weight_priority_threshold_;
+ size_t min_significant_overlap_;
+ const GraphCoverageMap cov_map_;
+ LongReadsUniqueEdgeAnalyzer unique_edge_analyzer_;
+ SimpleScaffolding simple_scaffolding_;
+
+ DECL_LOGGER("LongReadsExtensionChooser");
+};
+
+class MatePairExtensionChooser : public ExtensionChooser {
+public:
+ MatePairExtensionChooser(const Graph& g, shared_ptr<PairedInfoLibrary> lib,
+ const PathContainer& paths, size_t max_number_of_paths_to_search)
+ : ExtensionChooser(g),
+ g_(g),
+ lib_(lib),
+ search_dist_(lib->GetISMax()),
+ weight_counter_(g, lib, 10),
+ cov_map_(g_, paths),
+ path_searcher_(g_, cov_map_, lib_->GetISMax(), PathsWeightCounter(g, lib, (size_t) lib->GetSingleThreshold()), max_number_of_paths_to_search),
+ unique_edge_analyzer_(g, cov_map_, 0., 1000., 8000.),
+ simple_scaffolder_(g) {
+ }
+
+ //Attention! Uses const_cast to modify path!!!
+ EdgeContainer Filter(const BidirectionalPath& path,
+ const EdgeContainer& init_edges) const override {
+ DEBUG("mp chooser");
+ path.Print();
+ if (path.Length() < lib_->GetISMin()) {
+ return EdgeContainer();
+ }
+ EdgeContainer edges = TryResolveBulge(path, init_edges);
+ map<EdgeId, BidirectionalPath*> best_paths;
+ for (size_t iedge = 0; iedge < edges.size(); ++iedge) {
+ BidirectionalPathSet following_paths = path_searcher_.FindNextPaths(path, edges[iedge].e_);
+ vector<BidirectionalPath*> max_weighted = MaxWeightedPath(path, following_paths);
+ if (max_weighted.size() == 0) {
+ DEBUG("too much paths or tip");
+ DeleteMapWithPaths(best_paths);
+ DeletePaths(following_paths);
+ best_paths.clear();
+ break;
+ } else {
+ best_paths[edges[iedge].e_] = new BidirectionalPath(*max_weighted[0]);
+ }
+ DeletePaths(following_paths);
+ }
+
+ BidirectionalPathSet next_paths;
+ if (edges.size() == 0) {
+ DEBUG("scaffolding edges size " << edges.size())
+ next_paths = path_searcher_.FindNextPaths(path, path.Back());
+ } else if (best_paths.size() == edges.size()) {
+ for (size_t iedge = 0; iedge < edges.size(); ++iedge) {
+ if (best_paths.count(edges[iedge].e_) > 0){
+ next_paths.insert(best_paths[edges[iedge].e_]);
+ }
+ }
+ }
+ EdgeContainer result = ChooseBest(path, next_paths);
+ if (result.size() != 1) {
+ DEBUG("scaffold tree");
+ result = ScaffoldTree(const_cast<BidirectionalPath&>(path));
+ }
+ DeletePaths(next_paths);
+ if (result.size() != 1) {
+ DEBUG("nobody can extend " << g_.int_id(path.Back()));
+ }
+ return result;
+ }
+
+private:
+ EdgeContainer ScaffoldTree(BidirectionalPath& path) const {
+ DEBUG("try scaffold tree");
+ vector<BidirectionalPath*> next_paths = path_searcher_.ScaffoldTree(path);
+ VERIFY(next_paths.size() <= 1);
+ EdgeContainer result;
+ if (!next_paths.empty() && next_paths.back()->Size() > 0) {
+ BidirectionalPath* res = next_paths.back();
+ for (size_t i = 0; i < res->Size() - 1; ++i) {
+ path.PushBack(res->At(i), res->GapAt(i), res->TrashPreviousAt(i), res->TrashCurrentAt(i));
+ }
+ result = EdgeContainer(1, EdgeWithDistance(res->Back(), res->GapAt(res->Size() - 1)));
+ }
+ DeletePaths(next_paths);
+ return result;
+ }
+
+ bool IsBulge(const EdgeContainer& edges) const {
+ if (edges.size() == 0)
+ return false;
+ for (EdgeWithDistance e : edges) {
+ if (!InBuble(e.e_, g_))
+ return false;
+ }
+ return true;
+ }
+
+ map<EdgeId, double> FindBulgeWeights(const BidirectionalPath& p, const EdgeContainer& edges) const {
+ map<EdgeId, double> result;
+ for (size_t i = 0; i < edges.size(); ++i) {
+ result[edges[i].e_] = 0.0;
+ }
+ for (size_t i = 0; i < p.Size(); ++i) {
+ bool common = true;
+ bool common_ideal = true;
+ for (EdgeWithDistance e : edges) {
+ common_ideal = common_ideal && weight_counter_.HasIdealPI(p.At(i), e.e_, (int) p.LengthAt(i));
+ common = common && weight_counter_.HasPI(p.At(i), e.e_, (int) p.LengthAt(i));
+ }
+ if (!common_ideal || common) {
+ continue;
+ }
+ for (size_t j = 0; j < edges.size(); ++j) {
+ result[edges[j].e_] += weight_counter_.PI(p.At(i), edges[j].e_, (int) p.LengthAt(i));
+ }
+ }
+ return result;
+ }
+
+ EdgeContainer TryResolveBulge(const BidirectionalPath& p, const EdgeContainer& edges) const {
+ if (!IsBulge(edges))
+ return edges;
+ map<EdgeId, double> weights = FindBulgeWeights(p, edges);
+ double max_w = 0.0;
+ EdgeContainer result;
+ for (EdgeWithDistance e : edges) {
+ double w = weights[e.e_];
+ DEBUG("bulge " << g_.int_id(e.e_) << " w = " << w);
+ if (math::gr(w, max_w)) {
+ max_w = w;
+ result.clear();
+ result.push_back(e);
+ } else if (math::eq(w, max_w)) {
+ result.push_back(e);
+ }
+ }
+ if (result.size() != 1) {
+ result = edges;
+ }
+ return result;
+ }
+
+ EdgeContainer ChooseBest(const BidirectionalPath& path, const BidirectionalPathSet& next_paths) const {
+ DEBUG("Try to choose from best paths...");
+ vector<BidirectionalPath*> best_path = MaxWeightedPath(path, next_paths);
+ EdgeContainer result;
+ if (best_path.size() == 1) {
+ result.push_back(EdgeWithDistance((*best_path.begin())->At(0), (*best_path.begin())->GapAt(0)));
+ } else if (best_path.size() > 1) {
+ result = TryToScaffold(path, best_path);
+ }
+ return result;
+ }
+
+ bool HasPIFromUniqueEdges(const BidirectionalPath& p1, const BidirectionalPath& p2, const set<size_t>& p1_unique_edges) const {
+ for (size_t i1 = 0; i1 < p1.Size(); ++i1) {
+ if (p1_unique_edges.find(i1) == p1_unique_edges.end()) {
+ continue;
+ }
+ for (size_t i2 = 0; i2 < p2.Size(); ++i2) {
+ int gap = (int) p1.LengthAt(i1) + (int) p2.Length() - (int) p2.LengthAt(i2);
+ if (unique_edge_analyzer_.IsUnique(p2.At(i2)) && weight_counter_.HasPI(p1.At(i1), p2.At(i2), gap)) {
+ DEBUG("has unique edge " << g_.int_id(p1.At(i1)) << " " << g_.int_id(p2.At(i2)));
+ return true;
+ }
+ }
+ }
+ return false;
+ }
+
+ bool SignificallyDifferentEdges(const BidirectionalPath& init_path, const BidirectionalPath& path1, const map<size_t, double>& pi1,
+ const BidirectionalPath& path2, const map<size_t, double>& pi2, const set<size_t>& unique_init_edges) const {
+ double not_common_w1 = 0.0;
+ double common_w = 0.0;
+ for (auto iter = pi1.begin(); iter != pi1.end(); ++iter) {
+ auto iter2 = pi2.find(iter->first);
+ double w = 0.0;
+ if (iter2 != pi2.end() && !math::eq(iter2->second, 0.0)) {
+ w = min(iter2->second, iter->second);
+ }
+ not_common_w1 += iter->second - w;
+ common_w += w;
+ }
+ if (common_w < 0.8 * (not_common_w1 + common_w)
+ || (HasPIFromUniqueEdges(init_path, path1, unique_init_edges) && !HasPIFromUniqueEdges(init_path, path2, unique_init_edges))) {
+ DEBUG("common_w " << common_w << " sum * 0.8 = " << 0.8 * (not_common_w1 + common_w))
+ return true;
+ }
+ return false;
+ }
+
+ set<size_t> FindNotCommonEdges(const BidirectionalPath& path, const BidirectionalPathMap< map<size_t, double> >& all_pi) const {
+ set<size_t> res;
+ for (size_t i = 0; i < path.Size(); ++i) {
+ if (!unique_edge_analyzer_.IsUnique(path.At(i))) {
+ continue;
+ }
+ size_t pi_count = 0;
+ for (auto iter = all_pi.begin(); iter != all_pi.end(); ++iter) {
+ const map<size_t, double>& info = iter->second;
+ if (info.count(i) > 0 && math::gr(info.at(i), 0.0)) {
+ pi_count++;
+ }
+ }
+ if (pi_count == 1)
+ res.insert(i);
+ }
+ return res;
+ }
+
+ void DeleteSmallWeights(const BidirectionalPath& path, BidirectionalPathSet& paths, BidirectionalPathMap< map<size_t, double> >& all_pi) const {
+ double max_weight = 0.0;
+ BidirectionalPath* max_path = NULL;
+ for (auto iter = paths.begin(); iter != paths.end(); ++iter) {
+ if ((*iter)->GetWeight() >= max_weight) {
+ max_weight = max(max_weight, (*iter)->GetWeight());
+ max_path = *iter;
+ }
+ }
+ BidirectionalPathSet to_del;
+ for (auto iter = paths.begin(); iter != paths.end(); ++iter) {
+ if (math::gr(max_weight, (*iter)->GetWeight() * 1.5) //TODO: move 1.5 to config
+ && SignificallyDifferentEdges(path, *max_path, all_pi.find(max_path)->second, **iter, all_pi.find(*iter)->second,
+ FindNotCommonEdges(path, all_pi)))
+ to_del.insert(*iter);
+ }
+ for (BidirectionalPath* p : to_del) {
+ paths.erase(p);
+ all_pi.erase(p);
+ }
+ }
+
+ void DeleteCommonPi(const BidirectionalPath& p, BidirectionalPathMap< map<size_t, double> >& all_pi) const {
+ weight_counter_.ClearCommonWeight();
+ for (size_t i = 0; i < p.Size(); ++i) {
+ double common = DBL_MAX;
+ for (auto iter = all_pi.begin(); iter != all_pi.end(); ++iter) {
+ common = iter->second.count(i) == 0 ? 0.0 : min(common, iter->second.at(i));
+ }
+ weight_counter_.SetCommonWeightFrom(i, common);
+ }
+ }
+
+ size_t FindCommonBegin(const BidirectionalPathSet& paths) const {
+ if (paths.size() == 0) {
+ return 0;
+ }
+ size_t common_begin = 0;
+ BidirectionalPath* p = *paths.begin();
+ while (common_begin < p->Size()) {
+ EdgeId e = p->At(common_begin);
+ for (BidirectionalPath* next : paths) {
+ if (common_begin >= next->Size() || next->At(common_begin) != e) {
+ return common_begin;
+ }
+ }
+ common_begin++;
+ }
+ return common_begin;
+ }
+
+ void CountAllPairInfo(const BidirectionalPath& path, const BidirectionalPathSet& next_paths,
+ BidirectionalPathMap<map<size_t, double>>& result) const {
+ result.clear();
+ size_t common_begin = FindCommonBegin(next_paths);
+ DEBUG("common begin " << common_begin);
+ for (BidirectionalPath* next : next_paths) {
+ result[next] = weight_counter_.FindPairInfoFromPath(path, 0, path.Size(), *next, common_begin, next->Size());
+ }
+ }
+
+ void CountWeightsAndFilter(const BidirectionalPath& path, BidirectionalPathSet& next_paths, bool delete_small_w) const {
+ BidirectionalPathMap<map<size_t, double> > all_pi;
+ CountAllPairInfo(path, next_paths, all_pi);
+ DeleteCommonPi(path, all_pi);
+ for (BidirectionalPath* next : next_paths) {
+ next->SetWeight((float) weight_counter_.CountPairInfo(path, 0, path.Size(), *next, 0, next->Size()));
+ }
+ if (delete_small_w) {
+ DeleteSmallWeights(path, next_paths, all_pi);
+ }
+ }
+
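+    //Orders candidate paths: those supported by paired info from unique edges come first,
+    //then heavier paths, then longer ones.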
+ struct PathWithWeightSort {
+ PathWithWeightSort(const MatePairExtensionChooser& mp_chooser, const BidirectionalPath& path, BidirectionalPathMap< map<size_t, double> >& all_pi)
+ : mp_chooser_(mp_chooser),
+ path_(path),
+ not_common_(mp_chooser_.FindNotCommonEdges(path_, all_pi)) {
+ }
+
+ bool operator()(const BidirectionalPath* p1, const BidirectionalPath* p2) {
+ if (mp_chooser_.HasPIFromUniqueEdges(path_, *p1, not_common_) && !mp_chooser_.HasPIFromUniqueEdges(path_, *p2, not_common_)) {
+ return true;
+ }
+ if (mp_chooser_.HasPIFromUniqueEdges(path_, *p2, not_common_) && !mp_chooser_.HasPIFromUniqueEdges(path_, *p1, not_common_)) {
+ return false;
+ }
+ if (!math::eq(p1->GetWeight(), p2->GetWeight())) {
+ return math::gr(p1->GetWeight(), p2->GetWeight());
+ }
+ if (p1->Length() != p2->Length()) {
+ return p1->Length() > p2->Length();
+ }
+ return p1->Size() > p2->Size();
+ }
+ const MatePairExtensionChooser& mp_chooser_;
+ const BidirectionalPath& path_;
+ const set<size_t> not_common_;
+ };
+
+ vector<BidirectionalPath*> SortResult(const BidirectionalPath& path, BidirectionalPathSet& next_paths) const {
+ BidirectionalPathMap< map<size_t, double> > all_pi;
+ CountAllPairInfo(path, next_paths, all_pi);
+ CountWeightsAndFilter(path, next_paths, false);
+ vector<BidirectionalPath*> to_sort(next_paths.begin(), next_paths.end());
+ PathWithWeightSort comparator(*this, path, all_pi);
+ std::sort(to_sort.begin(), to_sort.end(), comparator);
+ return to_sort;
+ }
+
+ vector<BidirectionalPath*> MaxWeightedPath(const BidirectionalPath& path, const BidirectionalPathSet& following_paths) const {
+ BidirectionalPathSet result(following_paths);
+ BidirectionalPathSet prev_result;
+ while (prev_result.size() != result.size()) {
+ prev_result = result;
+ DEBUG("iteration with paths " << result.size());
+ CountWeightsAndFilter(path, result, true);
+ if (result.size() == 0)
+ result = prev_result;
+ if (result.size() == 1)
+ break;
+ }
+ if (result.size() == 0) {
+ DEBUG("bad case");
+ return vector<BidirectionalPath*>();
+ }
+ return SortResult(path, result);
+ }
+
+    BidirectionalPath ChooseFromEnds(const BidirectionalPath& path, const vector<BidirectionalPath*>& paths, const BidirectionalPath& end) const { //TODO: rewrite
+ DEBUG("choose from ends " << paths.size());
+ end.Print();
+ vector<BidirectionalPath*> new_paths;
+ vector<BidirectionalPath*> paths_to_cover;
+ for (BidirectionalPath* p : paths) {
+ int from = 0;
+ int pos = p->FindFirst(end, from);
+ while (pos > -1) {
+ BidirectionalPath* new_p = new BidirectionalPath(path);
+ BidirectionalPath* new_end = new BidirectionalPath(p->SubPath(0, pos + end.Size()));
+ new_p->PushBack(*new_end);
+ new_paths.push_back(new_p);
+ paths_to_cover.push_back(new_end);
+ from = pos + 1;
+ pos = p->FindFirst(end, from);
+ }
+ }
+ BidirectionalPath max = **new_paths.begin();
+ size_t covered_edges_max = 0;
+ size_t min_size = max.Size();
+ for (BidirectionalPath* p : new_paths) {
+ size_t cov_edges = 0;
+ for (BidirectionalPath* e : paths_to_cover) {
+ vector<size_t> poses = p->FindAll(e->Back());
+ for (size_t pos : poses) {
+ if (EqualBegins(*p, pos, *e, e->Size() - 1, true)) {
+ cov_edges++;
+ break;
+ }
+ }
+ }
+ if (cov_edges > covered_edges_max || (cov_edges == covered_edges_max && min_size > p->Size())) {
+ DEBUG("cov_e " << cov_edges << " s " << p->Size());
+ max.Clear();
+ max.PushBack(*p);
+ covered_edges_max = cov_edges;
+ min_size = max.Size();
+ }
+ }
+ for (BidirectionalPath* p : new_paths) {
+ delete p;
+ }
+ for (BidirectionalPath* p : paths_to_cover) {
+ delete p;
+ }
+ BidirectionalPath result = max.SubPath(path.Size());
+ DEBUG("res");
+ result.Print();
+ return result;
+ }
+
+ int CheckPairInfo(const BidirectionalPath& path, const BidirectionalPath& result_end, int to_add) const {
+ while (to_add < (int)result_end.Size()) {
+ map<size_t, double> weights = weight_counter_.FindPairInfoFromPath(path, 0, path.Size(), result_end, to_add, to_add + 1);
+ double weight_to_edge = 0.0;
+ for (auto iter = weights.begin(); iter != weights.end(); ++iter) {
+ weight_to_edge += iter->second;
+ }
+ if (math::gr(weight_to_edge, 0.0)) {
+ break;
+ }
+ to_add++;
+ }
+ return to_add;
+ }
+
+ EdgeContainer TryToScaffold(const BidirectionalPath& path, const vector<BidirectionalPath*>& paths) const {
+ if (paths.size() == 0) {
+ return EdgeContainer();
+ }
+ DEBUG("Simple Scaffolding")
+ for (BidirectionalPath* p : paths) {
+ p->Print();
+ }
+ BidirectionalPath max_end = simple_scaffolder_.FindMaxCommonPath(paths, search_dist_);
+ if (max_end.Size() == 0) {
+ return EdgeContainer();
+ }
+ BidirectionalPath result_end = ChooseFromEnds(path, paths, max_end);
+ int to_add = result_end.FindFirst(max_end);
+ result_end.Print();
+ EdgeContainer result;
+ to_add = CheckPairInfo(path, result_end, to_add);
+ if (to_add < 0 || to_add >= (int) result_end.Size()) {
+ return EdgeContainer();
+ }
+ size_t gap_length = result_end.Length() - result_end.LengthAt(to_add);
+ DEBUG(" edge to add " << g_.int_id(result_end.At(to_add)) << " with length " << gap_length);
+ result.push_back(EdgeWithDistance(result_end.At(to_add), gap_length));
+ return result;
+ }
+
+ const Graph& g_;
+ shared_ptr<PairedInfoLibrary> lib_;
+ size_t search_dist_;
+ mutable PathsWeightCounter weight_counter_;
+ const GraphCoverageMap cov_map_;
+ NextPathSearcher path_searcher_;
+ LongReadsUniqueEdgeAnalyzer unique_edge_analyzer_;
+ SimpleScaffolding simple_scaffolder_;
+
+ DECL_LOGGER("MatePairExtensionChooser");
+};
+
+class CoordinatedCoverageExtensionChooser: public ExtensionChooser {
+public:
+ CoordinatedCoverageExtensionChooser(const Graph& g,
+ CoverageAwareIdealInfoProvider& coverage_provider,
+ size_t max_edge_length_in_repeat, double delta, size_t min_path_len) :
+ ExtensionChooser(g), provider_(coverage_provider),
+ max_edge_length_in_repeat_(max_edge_length_in_repeat), delta_(delta), min_path_len_(min_path_len) {
+ }
+
+ EdgeContainer Filter(const BidirectionalPath& path,
+ const EdgeContainer& edges) const override {
+
+ if(path.Length() < min_path_len_) {
+ DEBUG("Path is too short");
+ return EdgeContainer();
+ }
+
+ double path_coverage = provider_.EstimatePathCoverage(path);
+ if (math::eq(path_coverage, -1.0)) {
+ DEBUG("Path coverage can't be calculated");
+ return EdgeContainer();
+ }
+ DEBUG("Path coverage is " << path_coverage);
+
+ for (auto e_d : edges) {
+ if (path.Contains(g_.EdgeEnd(e_d.e_))) {
+ DEBUG("Avoid to create loops");
+ return EdgeContainer();
+ }
+ }
+ return FindExtensionTroughRepeat(edges, path_coverage);
+ }
+
+private:
+
+ void UpdateCanBeProcessed(VertexId v,
+ std::queue<VertexId>& can_be_processed, double path_coverage) const {
+ DEBUG("Updating can be processed");
+ for (EdgeId e : g_.OutgoingEdges(v)) {
+ VertexId neighbour_v = this->g_.EdgeEnd(e);
+ if (g_.length(e) < max_edge_length_in_repeat_ && GoodExtension(e, path_coverage)) {
+ DEBUG("Adding vertex " << neighbour_v.int_id()
+ << "through edge " << g_.str(e));
+ can_be_processed.push(neighbour_v);
+ }
+ }
+ }
+
+ GraphComponent<Graph> GetRepeatComponent(const VertexId start, double path_coverage) const {
+ set<VertexId> vertices_of_component;
+ vertices_of_component.insert(start);
+ std::queue<VertexId> can_be_processed;
+ UpdateCanBeProcessed(start, can_be_processed, path_coverage);
+ while (!can_be_processed.empty()) {
+ VertexId v = can_be_processed.front();
+ can_be_processed.pop();
+ if (vertices_of_component.count(v) != 0) {
+ DEBUG("Component is too complex");
+ return GraphComponent<Graph>(g_, false);
+ }
+ DEBUG("Adding vertex " << g_.str(v) << " to component set");
+ vertices_of_component.insert(v);
+ UpdateCanBeProcessed(v, can_be_processed, path_coverage);
+ }
+
+ GraphComponent<Graph> gc(g_, vertices_of_component.begin(),
+ vertices_of_component.end());
+ return gc;
+ }
+
+ EdgeContainer FinalFilter(const EdgeContainer& edges,
+ EdgeId edge_to_extend) const {
+ EdgeContainer result;
+ for (auto e_with_d : edges) {
+ if (e_with_d.e_ == edge_to_extend) {
+ result.push_back(e_with_d);
+ }
+ }
+ return result;
+ }
+
+ bool GoodExtension(EdgeId e, double path_coverage) const {
+ return math::ge(g_.coverage(e), path_coverage * delta_);
+ }
+
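+    //Examines the repeat component reachable through each candidate edge and keeps a candidate
+    //only if it (or an exit from its repeat component) has coverage compatible with the path;
+    //the extension is accepted only when exactly one candidate survives.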
+ EdgeContainer FindExtensionTroughRepeat(const EdgeContainer& edges, double path_coverage) const {
+ set<EdgeId> good_extensions;
+ for(auto edge : edges) {
+
+ if(!GoodExtension(edge.e_, path_coverage)) {
+ continue;
+ }
+
+ if (g_.length(edge.e_) > max_edge_length_in_repeat_) {
+ if (GoodExtension(edge.e_, path_coverage)) {
+ good_extensions.insert(edge.e_);
+ }
+ continue;
+ }
+
+ GraphComponent<Graph> gc = GetRepeatComponent(g_.EdgeEnd(edge.e_), path_coverage);
+ if (gc.v_size() == 0) {
+ return EdgeContainer();
+ }
+
+ for (auto e : gc.edges()) {
+ if (g_.length(e) > max_edge_length_in_repeat_) {
+ DEBUG("Repeat component contains long edges");
+ return EdgeContainer();
+ }
+ }
+
+ for (auto v : gc.sinks()) {
+ for (auto e : g_.OutgoingEdges(v)) {
+ if (GoodExtension(e, path_coverage)) {
+ good_extensions.insert(edge.e_);
+ }
+ }
+ }
+ }
+
+ DEBUG("Number of good extensions is " << good_extensions.size());
+
+ if (good_extensions.size() != 1) {
+ DEBUG("Returning");
+ return EdgeContainer();
+ }
+ auto extension = *good_extensions.begin();
+
+ if(math::ls(path_coverage, g_.coverage(extension) * delta_)) {
+ DEBUG("Extension coverage too high");
+ return EdgeContainer();
+ }
+
+ DEBUG("Filtering... Extend with edge " << extension.int_id());
+ return FinalFilter(edges, extension);
+ }
+
+ CoverageAwareIdealInfoProvider provider_;
+ const size_t max_edge_length_in_repeat_;
+ const double delta_;
+ const size_t min_path_len_;
+ DECL_LOGGER("CoordCoverageExtensionChooser");
+};
+
+}
+#endif /* EXTENSION_HPP_ */
diff --git a/src/modules/algorithms/path_extend/ideal_pair_info.hpp b/src/modules/algorithms/path_extend/ideal_pair_info.hpp
new file mode 100644
index 0000000..2a3fc1a
--- /dev/null
+++ b/src/modules/algorithms/path_extend/ideal_pair_info.hpp
@@ -0,0 +1,129 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+/*
+ * ideal_pair_info.hpp
+ *
+ * Created on: Oct 10, 2013
+ * Author: ira
+ */
+
+#ifndef IDEAL_PAIR_INFO_HPP_
+#define IDEAL_PAIR_INFO_HPP_
+#include <vector>
+#include "pipeline/graph_pack.hpp"
+
+namespace path_extend {
+
+using debruijn_graph::Graph;
+using debruijn_graph::EdgeId;
+
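+// Computes the expected ("ideal") amount of paired-read information linking two
+// edges at a given distance, averaged over the insert size distribution; results
+// are cached per pair of edge lengths.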
+class IdealPairInfoCounter {
+public:
+ IdealPairInfoCounter(const Graph& g, int d_min, int d_max, size_t read_size,
+ const std::map<int, size_t>& is_distribution)
+ : g_(g),
+ d_min_(d_min),
+ d_max_(d_max),
+ read_size_(read_size) {
+ size_t sum = 0;
+ for (auto iter = is_distribution.begin(); iter != is_distribution.end();
+ ++iter) {
+ sum += iter->second;
+ }
+ for (auto iter = is_distribution.begin(); iter != is_distribution.end();
+ ++iter) {
+ insert_size_distrib_[iter->first] = (double) iter->second
+ / (double) sum;
+ }
+ PreCalculateNotTotalReadsWeight();
+ }
+
+ double IdealPairedInfo(EdgeId e1, EdgeId e2, int dist, bool additive = false) const {
+ std::pair<size_t, size_t> lengths = make_pair(g_.length(e1), g_.length(e2));
+ if (pi_.find(lengths) == pi_.end()) {
+ pi_.insert(make_pair(lengths, std::map<int, double>()));
+ }
+ std::map<int, double>& weights = pi_[lengths];
+ if (weights.find(dist) == weights.end()) {
+ weights.insert(make_pair(dist, IdealPairedInfo(g_.length(e1), g_.length(e2), dist, additive)));
+ }
+ return weights[dist];
+ }
+
+ double IdealPairedInfo(size_t len1, size_t len2, int dist, bool additive = false) const {
+ double result = 0.0;
+ for (auto it = insert_size_distrib_.lower_bound(max(d_min_, 0)); it != insert_size_distrib_.upper_bound(d_max_); ++it) {
+ result += it->second * (double) IdealReads(len1, len2, dist, it->first, additive);
+ }
+ return result;
+ }
+
+private:
+
+ double IdealReads(size_t len1_1, size_t len2_1, int dist,
+ size_t is_1, bool additive) const {
+ int len1 = (int) len1_1;
+ int len2 = (int) len2_1;
+ int is = (int) is_1;
+ int k = (int) g_.k();
+ int rs = (int) read_size_;
+ double w = 0.0;
+ if (dist == 0) {
+ return len1 - is + 2 * rs - 2 - k + 1;
+ }
+ if (dist < 0) {
+ int tmp = len1;
+ len1 = len2;
+ len2 = tmp;
+ dist = -dist;
+ }
+ int gap_len = dist - len1;
+ int right_long = is - rs - 1;
+ int right_short = gap_len + len2 - 1;
+ int left_short = gap_len + k + 1 - rs;
+ int left_long = is - rs - len1 - rs + (k + 1);
+ int right = std::min(right_long, right_short);
+ int left = std::max(left_short, left_long);
+ int result = std::max(right - left + 1, 0);
+ int right_e2 = std::min(gap_len + len2 - rs + k, right_long);
+ int left_e2 = std::max(left_long, gap_len);
+ int right_not_full = std::max(right - right_e2, 0);
+ int left_not_full = std::max(left_e2 - left, 0);
+ w = result;
+ if (additive){
+ w = w - not_total_weights_right_[right_not_full] - not_total_weights_left_[left_not_full];
+ }
+ return w > 0.0 ? w : 0.0;
+ }
+
+ void PreCalculateNotTotalReadsWeight() {
+ not_total_weights_right_.push_back(0.0);
+ not_total_weights_left_.push_back(0.0);
+ for (int i = 1; i < int(read_size_) - int(g_.k()) + 1; ++i) {
+ double right = (double(i) + double(g_.k()) /2.0) / (double) read_size_;
+ double left = 1 - right;
+ not_total_weights_right_.push_back(not_total_weights_right_[i-1] + right);
+ not_total_weights_left_.push_back(not_total_weights_left_[i-1] + left);
+ }
+ }
+
+ const Graph& g_;
+ int d_min_;
+ int d_max_;
+ size_t read_size_;
+ std::vector<double> weights_;
+ std::map<int, double> insert_size_distrib_;
+ mutable std::map<std::pair<size_t, size_t>, std::map<int, double> > pi_;
+ std::vector<double> not_total_weights_right_;
+ std::vector<double> not_total_weights_left_;
+protected:
+ DECL_LOGGER("PathExtendPI");
+};
+} // path extend
+
+#endif /* IDEAL_PAIR_INFO_HPP_ */
diff --git a/src/modules/algorithms/path_extend/loop_traverser.hpp b/src/modules/algorithms/path_extend/loop_traverser.hpp
new file mode 100644
index 0000000..048615f
--- /dev/null
+++ b/src/modules/algorithms/path_extend/loop_traverser.hpp
@@ -0,0 +1,213 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+/*
+ * loop_traverser.hpp
+ *
+ * Created on: Jan 28, 2013
+ * Author: ira
+ */
+
+#ifndef LOOP_TRAVERSER_H_
+#define LOOP_TRAVERSER_H_
+
+#include "path_extender.hpp"
+#include "pe_resolver.hpp"
+#include "path_visualizer.hpp"
+
+namespace path_extend {
+
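+// Closes simple loops: the graph is split into small components bounded by long
+// edges, and for every component with a unique entering and a unique exiting edge
+// the paths covering those edges are joined, estimating the gap with a bounded
+// Dijkstra search when needed.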
+class LoopTraverser {
+
+ const Graph& g_;
+ GraphCoverageMap& covMap_;
+ shared_ptr<ContigsMaker> extender_;
+private:
+ EdgeId FindStart(const set<VertexId>& component_set) const{
+ EdgeId result;
+ for (auto it = component_set.begin(); it != component_set.end(); ++it) {
+ for (auto eit = g_.in_begin(*it); eit != g_.in_end(*it); ++eit) {
+ if (component_set.count(g_.EdgeStart(*eit)) == 0) {
+ if (result != EdgeId()) {
+ return EdgeId();
+ }
+ result = *eit;
+ }
+ }
+ }
+ return result;
+ }
+
+ EdgeId FindFinish(const set<VertexId>& component_set) {
+ EdgeId result;
+ for (auto it = component_set.begin(); it != component_set.end(); ++it) {
+ for (auto I = g_.out_begin(*it), E = g_.out_end(*it);
+ I != E; ++I) {
+ if (component_set.count(g_.EdgeEnd(*I)) == 0) {
+ if (result != EdgeId()) {
+ return EdgeId();
+ }
+ result = *I;
+ }
+ }
+ }
+ return result;
+ }
+
+ void TryToGrow(BidirectionalPath* path, EdgeId component_entrance) {
+ BidirectionalPath clone = *path;
+ extender_->GrowPathSimple(*path);
+ if (!path->Contains(component_entrance)) {
+ DEBUG("Grown paths do not contain initial edges, rolling back");
+ path->Clear();
+ path->PushBack(clone);
+ }
+ }
+
+ bool IsEndInsideComponent(const BidirectionalPath &path,
+ const set <VertexId> &component_set) {
+ if (component_set.count(g_.EdgeStart(path.Front())) == 0) {
+ return false;
+ }
+ for (size_t i = 0; i < path.Size(); ++i) {
+ if (component_set.count(g_.EdgeEnd(path.At(i))) == 0)
+ return false;
+ }
+ return true;
+ }
+
+
+ bool IsEndInsideComponent(const BidirectionalPath &path, EdgeId component_entrance,
+ const set <VertexId> &component_set,
+ bool conjugate = false) {
+ int i = path.FindLast(component_entrance);
+ VERIFY_MSG(i != -1, "Component edge is not found in the path")
+
+ if ((size_t) i == path.Size() - 1) {
+ if (conjugate)
+ return component_set.count(g_.conjugate(g_.EdgeEnd(path.Back()))) > 0;
+ else
+ return component_set.count(g_.EdgeEnd(path.Back())) > 0;
+ }
+
+ if (conjugate)
+ return IsEndInsideComponent(path.SubPath((size_t) i + 1).Conjugate(), component_set);
+ else
+ return IsEndInsideComponent(path.SubPath((size_t) i + 1), component_set);
+ }
+
+ void TraverseLoop(EdgeId start, EdgeId end, const set<VertexId>& component_set) {
+ DEBUG("start " << g_.int_id(start) << " end " << g_.int_id(end));
+ BidirectionalPathSet coveredStartPaths =
+ covMap_.GetCoveringPaths(start);
+ BidirectionalPathSet coveredEndPaths =
+ covMap_.GetCoveringPaths(end);
+
+ for (auto it_path = coveredStartPaths.begin();
+ it_path != coveredStartPaths.end(); ++it_path) {
+ if ((*it_path)->FindAll(end).size() > 0) {
+ return;
+ }
+ }
+ if (coveredStartPaths.size() < 1 or coveredEndPaths.size() < 1) {
+ DEBUG("TraverseLoop STRANGE SITUATION: start " << coveredStartPaths.size() << " end " << coveredEndPaths.size());
+ return;
+ }
+
+ if (coveredStartPaths.size() > 1 or coveredEndPaths.size() > 1) {
+ DEBUG("Ambiguous situation in path joining, quitting");
+ return;
+ }
+
+ BidirectionalPath* startPath = *coveredStartPaths.begin();
+ BidirectionalPath* endPath = *coveredEndPaths.begin();
+ if ((*startPath) == endPath->Conjugate()){
+ return;
+ }
+
+ //TryToGrow(startPath, start);
+ //TryToGrow(endPath->GetConjPath(), g_.conjugate(end));
+
+ //Checking that paths ends are within component
+ if (!IsEndInsideComponent(*startPath, start, component_set) ||
+ !IsEndInsideComponent(*endPath->GetConjPath(), g_.conjugate(end), component_set, true)) {
+ DEBUG("Some path goes outside of the component")
+ return;
+ }
+
+ size_t commonSize = startPath->CommonEndSize(*endPath);
+ size_t nLen = 0;
+ DEBUG("Str " << startPath->Size() << ", end" << endPath->Size());
+ if (commonSize == 0 && !startPath->Empty() > 0 && !endPath->Empty()) {
+ DEBUG("Estimating gap size");
+ VertexId lastVertex = g_.EdgeEnd(startPath->Back());
+ VertexId firstVertex = g_.EdgeStart(endPath->Front());
+
+ if (firstVertex == lastVertex) {
+ nLen = 0;
+ } else {
+ DijkstraHelper<Graph>::BoundedDijkstra dijkstra(DijkstraHelper<Graph>::CreateBoundedDijkstra(g_, 1000, 3000));
+ dijkstra.Run(lastVertex);
+ vector<EdgeId> shortest_path = dijkstra.GetShortestPathTo(g_.EdgeStart(endPath->Front()));
+
+ if (shortest_path.size() == 0) {
+ DEBUG("Failed to find closing path");
+ return;
+ } else if (!IsEndInsideComponent(BidirectionalPath(g_, shortest_path), component_set)) {
+ DEBUG("Closing path is outside the component");
+ return;
+ } else {
+ for (size_t i = 0; i < shortest_path.size(); ++i) {
+ nLen += g_.length(shortest_path[i]);
+ }
+ nLen += g_.k();
+ }
+ }
+ }
+ if (commonSize < endPath->Size()){
+ startPath->PushBack(endPath->At(commonSize), (int) nLen);
+ }
+ for (size_t i = commonSize + 1; i < endPath->Size(); ++i) {
+ startPath->PushBack(endPath->At(i), endPath->GapAt(i), endPath->TrashPreviousAt(i), endPath->TrashCurrentAt(i));
+ }
+ DEBUG("travers");
+ startPath->Print();
+ endPath->Print();
+ DEBUG("conj");
+ endPath->GetConjPath()->Print();
+ endPath->Clear();
+ }
+
+public:
+ LoopTraverser(const Graph& g, GraphCoverageMap& coverageMap, shared_ptr<ContigsMaker> extender) :
+ g_(g), covMap_(coverageMap), extender_(extender) {
+ }
+
+ void TraverseAllLoops() {
+ DEBUG("TraverseAllLoops");
+ shared_ptr<GraphSplitter<Graph>> splitter = LongEdgesExclusiveSplitter<Graph>(g_, 1000);
+ while (splitter->HasNext()) {
+ GraphComponent<Graph> component = splitter->Next();
+ if (component.v_size() > 10)
+ continue;
+ set<VertexId> component_set(component.v_begin(), component.v_end());
+ EdgeId start = FindStart(component_set);
+ EdgeId finish = FindFinish(component_set);
+ if (start == EdgeId() || finish == EdgeId()) {
+ continue;
+ }
+ TraverseLoop(start, finish, component_set);
+ }
+
+ }
+protected:
+ DECL_LOGGER("LoopTraverser");
+};
+
+}
+
+#endif /* LOOP_TRAVERSER_H_ */
diff --git a/src/modules/algorithms/path_extend/next_path_searcher.hpp b/src/modules/algorithms/path_extend/next_path_searcher.hpp
new file mode 100644
index 0000000..e332805
--- /dev/null
+++ b/src/modules/algorithms/path_extend/next_path_searcher.hpp
@@ -0,0 +1,1031 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+/*
+ * next_path_searcher.hpp
+ *
+ * Created on: Sep 27, 2013
+ * Author: ira
+ */
+#pragma once
+
+#include <set>
+#include <vector>
+#include <map>
+
+#include "pipeline/graph_pack.hpp"
+#include "assembly_graph/graph_core/graph.hpp"
+#include "assembly_graph/paths/bidirectional_path.hpp"
+#include "pe_utils.hpp"
+
+namespace path_extend {
+using debruijn_graph::Graph;
+using std::set;
+using std::vector;
+using std::multimap;
+
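+// A node in the tree of candidate extensions built by NextPathSearcher: stores the
+// graph edge, a link to the previous node, the accumulated length from the root and
+// the gap before this edge. Child nodes are owned and deleted by their parent.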
+class Edge {
+public:
+ Edge(const Graph& g, EdgeId id, Edge* prev_e, size_t dist, int gap = 0)
+ : g_(g),
+ id_(id),
+ prev_edge_(prev_e),
+ dist_(dist),
+ gap_(gap) {
+ }
+ ~Edge() {
+ for (size_t i = 0; i < out_edges_.size(); ++i) {
+ delete out_edges_[i];
+ }
+ for (size_t i = 0; i < not_out_edges_.size(); ++i) {
+ delete not_out_edges_[i];
+ }
+ }
+ Edge* AddOutEdge(EdgeId edge, int gap = 0) {
+ return AddIfNotExist(edge, gap, out_edges_);
+ }
+ Edge* AddIncorrectOutEdge(EdgeId edge, int gap = 0) {
+ for (size_t i = 0; i < out_edges_.size(); ++i) {
+ if (out_edges_[i]->GetId() == edge) {
+ not_out_edges_.push_back(out_edges_[i]);
+ out_edges_.erase(out_edges_.begin() + i);
+ break;
+ }
+ }
+ return AddIfNotExist(edge, gap, not_out_edges_);
+ }
+ Edge* AddPath(const BidirectionalPath& path, size_t from) {
+ Edge* e = this;
+ for (size_t i = from; i < path.Size(); ++i) {
+ e = e->AddOutEdge(path.At(i), path.GapAt(i));
+ }
+ return e;
+ }
+
+ int GetOutEdgeIndex(EdgeId edge) const {
+ return GetEdgeIndex(edge, out_edges_);
+ }
+
+ int GetIncorrectEdgeIndex(EdgeId edge) const {
+ return GetEdgeIndex(edge, not_out_edges_);
+ }
+
+ size_t OutSize() const {
+ return out_edges_.size();
+ }
+
+ Edge* GetOutEdge(size_t i) const {
+ return out_edges_[i];
+ }
+
+ BidirectionalPath GetPrevPath(size_t from) const {
+ BidirectionalPath result(g_);
+ vector<pair<EdgeId, int> > edges_wgaps;
+ const Edge* e = this;
+ edges_wgaps.push_back(make_pair(e->GetId(), e->Gap()));
+ while (e->prev_edge_) {
+ e = e->prev_edge_;
+ edges_wgaps.push_back(make_pair(e->GetId(), e->Gap()));
+ }
+ for (int i = (int) edges_wgaps.size() - 1 - (int) from; i >= 0; i--) {
+ result.PushBack(edges_wgaps[i].first, edges_wgaps[i].second);
+ }
+ return result;
+ }
+
+ bool IsCorrect() {
+ Edge* e = this;
+ while (e->prev_edge_) {
+ if (e->prev_edge_->GetOutEdgeIndex(e->GetId()) == -1) {
+ TRACE("after " << g_.int_id(e->prev_edge_->GetId()) << " souldn't go " << g_.int_id(e->GetId()));
+ return false;
+ }
+ e = e->prev_edge_;
+ }
+ return true;
+ }
+
+ bool EqualBegins(const BidirectionalPath& path, int pos) {
+ BidirectionalPath p = this->GetPrevPath(0);
+ return path_extend::EqualBegins(path, (size_t) pos, p, p.Size() - 1, true);
+ }
+ size_t Length() const {
+ return dist_;
+ }
+ set<Edge*> GetPrevEdges(size_t dist) {
+ size_t init_len = Length();
+ Edge* e = this;
+ set<Edge*> result;
+ while (e && init_len - e->Length() < dist) {
+ result.insert(e);
+ e = e->prev_edge_;
+ }
+ return result;
+ }
+ EdgeId GetId() const {
+ return id_;
+ }
+ int Gap() const {
+ return gap_;
+ }
+private:
+ Edge* AddIfNotExist(EdgeId e, int gap, vector<Edge*>& vect) {
+ int i = GetEdgeIndex(e, vect);
+ if (i != -1) {
+ return vect[i];
+ }
+ size_t dist = dist_ + gap + g_.length(e);
+ vect.push_back(new Edge(g_, e, this, dist, gap));
+ return vect.back();
+ }
+ int GetEdgeIndex(EdgeId e, const vector<Edge*>& vect) const {
+ for (size_t i = 0; i < vect.size(); ++i) {
+ if (vect[i]->GetId() == e)
+ return (int) i;
+ }
+ return -1;
+ }
+ const Graph& g_;
+ EdgeId id_;
+ vector<Edge*> out_edges_;
+ vector<Edge*> not_out_edges_;
+ Edge* prev_edge_;
+ size_t dist_;
+ int gap_;
+
+protected:
+ DECL_LOGGER("NextPathSearcher")
+};
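+// A constructed candidate path together with its estimated distance (gap) from the
+// path being extended.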
+struct PathWithDistance {
+ PathWithDistance(BidirectionalPath p, int dist)
+ : p_(p),
+ dist_(dist) {
+
+ }
+ BidirectionalPath p_;
+ int dist_;
+};
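+// Searches for possible continuations of a path within search_dist_ of its end.
+// A tree of candidate extensions is grown using paths from the coverage map and
+// paired info; when a tip is reached or too many branches appear, the scaffolding
+// routines try to jump over the gap or to order the candidates.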
+class NextPathSearcher {
+public:
+ typedef set<EdgeWithDistance, EdgeWithDistance::DistanceComparator> EdgeSet;
+ typedef multimap<EdgeId, PathWithDistance> ConstructedPathT;
+
+ NextPathSearcher(const Graph& g, const GraphCoverageMap& cover_map, size_t search_dist, PathsWeightCounter weight_counter, size_t max_number_of_paths_to_search);
+ BidirectionalPathSet FindNextPaths(const BidirectionalPath& path, EdgeId begin_edge, bool jump = true) const ;
+ vector<BidirectionalPath*> ScaffoldTree(const BidirectionalPath& path) const;
+private:
+ bool IsOutTip(VertexId v) const;
+ bool IsInTip(VertexId v) const;
+ vector<Edge*> GrowPath(const BidirectionalPath& init_path, Edge* e) const;
+ Edge* AddEdge(const BidirectionalPath& init_path, Edge* prev_e, EdgeId e_to_add, int gap) const;
+ bool AnalyzeBubble(const BidirectionalPath& p, EdgeId buldge_edge, size_t gap, Edge* prev_edge) const;
+
+ void ScaffoldTip(const BidirectionalPath& path, Edge * current_path, vector<Edge*>& result_edges, vector<Edge*>& stopped_paths, vector<Edge*>& to_add,
+ bool jump) const;
+ void ScaffoldChristmasTree(const BidirectionalPath& path, Edge * current_path, vector<Edge*>& to_add, size_t min_length_from) const;
+ void Scaffold(const BidirectionalPath& init_path, Edge* current_path, ConstructedPathT& constructed_paths, set<EdgeId>& seeds, bool is_gap) const;
+ void FindScaffoldingCandidates(const BidirectionalPath& init_path, Edge* current_path, EdgeSet& candidate_set, size_t min_length_from) const;
+ void FindScaffoldingCandidates(EdgeId e, size_t distance_to_tip, vector<EdgeWithDistance>& jump_edges) const;
+ void OrderScaffoldingCandidates(EdgeSet& candidate_set, const BidirectionalPath& init_path, Edge* current_path, ConstructedPathT& constructed_paths, set<EdgeId>& seeds, bool is_gap) const;
+ void RemoveRedundant(ConstructedPathT& constructed_paths) const;
+ void ConvertPaths(const ConstructedPathT& constructed_paths, Edge* current_path, vector<Edge*>& to_add) const;
+ void ProcessScaffoldingCandidate(EdgeWithDistance& e, EdgeSet& candidate_set, Edge* current_path, size_t grown_path_len,
+ ConstructedPathT& constructed_paths, bool is_gap) const;
+ int EstimateGapForPath(EdgeSet& candidate_set, const BidirectionalPath& p) const;
+ void AddConstructedPath(const BidirectionalPath& cp, size_t from, int gap, ConstructedPathT& constructed_paths) const;
+ void FilterBackPaths(BidirectionalPathSet& back_paths, EdgeId edge_to_reach, BidirectionalPathSet& reached_paths, size_t max_len = -1UL) const;
+ void JoinPathsByGraph(ConstructedPathT& constructed_paths) const;
+ void JoinPathsByPI(ConstructedPathT& constructed_paths) const;
+ void JoinPathsByDejikstra(const BidirectionalPath& init_path, ConstructedPathT& constructed_paths) const;
+ map<PathWithDistance*, size_t> FindDistances(const BidirectionalPath& p, vector<PathWithDistance*>& paths) const;
+ void FindConnections(vector<PathWithDistance*>& all_paths, map<PathWithDistance*, set<PathWithDistance*> >& connections) const;
+ vector<vector<PathWithDistance*> > FilterConnections(vector<PathWithDistance*>& all_paths, map<PathWithDistance*, set<PathWithDistance*> >& connections) const;
+ void ConnectPaths(const BidirectionalPath& init_path, vector<vector<PathWithDistance*> >& variants) const;
+
+ const Graph& g_;
+ const GraphCoverageMap& cover_map_;
+ size_t search_dist_;
+ PathsWeightCounter weight_counter_;
+ size_t long_edge_len_;
+ size_t max_paths_;
+
+protected:
+ DECL_LOGGER("NextPathSearcher")
+};
+
+inline NextPathSearcher::NextPathSearcher(const Graph& g, const GraphCoverageMap& cover_map, size_t search_dist, PathsWeightCounter weight_counter, size_t max_number_of_paths_to_search)
+ : g_(g),
+ cover_map_(cover_map),
+ search_dist_(search_dist),
+ weight_counter_(weight_counter),
+ long_edge_len_(500),
+ max_paths_(max_number_of_paths_to_search) {
+
+}
+
+inline vector<BidirectionalPath*> NextPathSearcher::ScaffoldTree(const BidirectionalPath& path) const {
+ Edge* start_e = new Edge(g_, path.At(0), NULL, g_.length(path.At(0)) + path.GapAt(0), path.GapAt(0));
+ Edge* e = start_e->AddPath(path, 1);
+ //jump forward when there are too many paths
+ DEBUG("Scaffolding tree for edge " << g_.int_id(start_e->GetId()));
+ path.Print();
+ vector<Edge*> result_edges;
+ ScaffoldChristmasTree(path, e, result_edges, 0);
+ std::vector<BidirectionalPath*> result_paths;
+ for (size_t i = 0; i < result_edges.size(); ++i) {
+ BidirectionalPath result_path = result_edges[i]->GetPrevPath(path.Size());
+ if (!result_path.Empty())
+ result_paths.push_back(new BidirectionalPath(result_path));
+ }
+ if (result_paths.size() != 1) {
+ for (size_t i = 0; i < result_paths.size(); ++i) {
+ delete result_paths[i];
+ }
+ result_paths.clear();
+ result_edges.clear();
+ ScaffoldChristmasTree(path, e, result_edges, long_edge_len_);
+ for (size_t i = 0; i < result_edges.size(); ++i) {
+ BidirectionalPath result_path = result_edges[i]->GetPrevPath(path.Size());
+ if (!result_path.Empty())
+ result_paths.push_back(new BidirectionalPath(result_path));
+ }
+ }
+ delete start_e;
+ DEBUG( "for path " << path.GetId() << " several extension " << result_paths.size());
+ return result_paths;
+}
+
+inline BidirectionalPathSet NextPathSearcher::FindNextPaths(const BidirectionalPath& path, EdgeId begin_edge, bool jump) const {
+ TRACE("begin find next paths");
+ vector<Edge*> grow_paths;
+ vector<Edge*> result_edges;
+ vector<Edge*> stopped_paths;
+ size_t max_len = search_dist_ + path.Length();
+ std::set<Edge*> used_edges;
+ int count_to_grow = 1;
+
+ Edge* start_e = new Edge(g_, path.At(0), NULL, g_.length(path.At(0)) + path.GapAt(0), path.GapAt(0));
+ Edge* e = start_e->AddPath(path, 1);
+ if (begin_edge != path.Back()) {
+ e = e->AddOutEdge(begin_edge);
+ DEBUG( "Try to find next path for path with edge " << g_.int_id(begin_edge));
+ } else {
+ DEBUG( "Try to search for path with last edge " << g_.int_id(path.Back()) << " Scaffolding: " << jump << ", next edges " << g_.OutgoingEdgeCount(g_.EdgeEnd(path.Back())));
+ }
+ grow_paths.push_back(e);
+
+ size_t ipath = 0;
+ DEBUG("Processing paths");
+ while (ipath < grow_paths.size()) {
+ DEBUG("Processing path " << ipath << " of " << grow_paths.size() << " need to grow " << count_to_grow);
+ Edge* current_path = grow_paths[ipath++];
+ DEBUG(" edge " << g_.int_id(current_path->GetId()));
+ if (used_edges.count(current_path) > 0) {
+ count_to_grow--;
+ continue;
+ }
+ used_edges.insert(current_path);
+ if (current_path->Length() >= max_len && current_path->IsCorrect()) {
+ result_edges.push_back(current_path);
+ count_to_grow--;
+ continue;
+ }
+ DEBUG("Growing path");
+ vector<Edge*> to_add = GrowPath(path, current_path);
+ DEBUG("Path grown");
+ if (to_add.empty() && current_path->IsCorrect()) {
+ DEBUG("scaffold tip");
+ ScaffoldTip(path, current_path, result_edges, stopped_paths, to_add, jump);
+ }
+ count_to_grow--;
+ for (Edge* e_to_add : to_add) {
+ grow_paths.push_back(e_to_add);
+ count_to_grow++;
+ }
+
+ if (count_to_grow > (int) max_paths_ || ipath > max_paths_ * 10) {
+ DEBUG("too many paths");
+ delete start_e;
+ return BidirectionalPathSet();
+ }
+ }
+ DEBUG("Paths processed");
+
+ BidirectionalPathSet result_paths;
+ TRACE("adding paths " << result_edges.size());
+ for (size_t i = 0; i < result_edges.size(); ++i) {
+ BidirectionalPath result_path = result_edges[i]->GetPrevPath(path.Size());
+ if (!result_path.Empty()) {
+ result_paths.insert(new BidirectionalPath(result_path));
+ }
+ }
+ delete start_e;
+ DEBUG( "for path " << path.GetId() << " several extension " << result_paths.size());
+ return result_paths;
+}
+
+inline bool NextPathSearcher::AnalyzeBubble(const BidirectionalPath& p, EdgeId buldge_edge, size_t gap, Edge* prev_edge) const {
+ EdgeId max_edge = buldge_edge;
+ if (prev_edge->GetOutEdgeIndex(buldge_edge) != -1 || prev_edge->GetIncorrectEdgeIndex(buldge_edge) != -1) {
+ return prev_edge->GetOutEdgeIndex(buldge_edge) != -1;
+ }
+ double max_w = 0.0;
+ for (EdgeId e : g_.OutgoingEdges(g_.EdgeStart(buldge_edge))) {
+ double w = weight_counter_.CountPairInfo(p, 0, p.Size(), e, gap);
+ if (math::gr(w, max_w) || (math::eq(w, max_w) && g_.int_id(e) < g_.int_id(max_edge))) {
+ max_w = w;
+ max_edge = e;
+ }
+ }
+ for (EdgeId e : g_.OutgoingEdges(g_.EdgeStart(buldge_edge))) {
+ if (e == max_edge) {
+ prev_edge->AddOutEdge(e);
+ } else {
+ prev_edge->AddIncorrectOutEdge(e);
+ }
+ }
+ return max_edge == buldge_edge;
+}
+
+inline Edge* NextPathSearcher::AddEdge(const BidirectionalPath& init_path, Edge* prev_e, EdgeId e_to_add, int gap) const {
+ Edge* e = prev_e;
+ if (e->GetIncorrectEdgeIndex(e_to_add) != -1) {
+ return e;
+ }
+ int inext = e->GetOutEdgeIndex(e_to_add);
+ if (inext != -1) {
+ return e->GetOutEdge(inext);
+ }
+ if (InBuble(e_to_add, g_)) {
+ if (AnalyzeBubble(init_path, e_to_add, gap, e)) {
+ return e->AddOutEdge(e_to_add);
+ }
+ } else if (e->GetId() != e_to_add) {
+ return e->AddOutEdge(e_to_add);
+ }
+ return e;
+}
+
+inline vector<Edge*> NextPathSearcher::GrowPath(const BidirectionalPath& init_path, Edge* e) const {
+ TRACE("in growing path");
+ vector<Edge*> to_add;
+ if (!e->IsCorrect()) {
+ TRACE("incorrect");
+ return to_add;
+ }
+ for (EdgeId next_edge : g_.OutgoingEdges(g_.EdgeEnd(e->GetId()))) {
+ TRACE("Analyze outgoing edge " << g_.int_id(next_edge));
+ BidirectionalPathSet cov_paths = cover_map_.GetCoveringPaths(next_edge);
+ TRACE("cov_map size " << cov_paths.size());
+ bool already_added = false;
+ for (auto inext_path = cov_paths.begin(); inext_path != cov_paths.end() && !already_added; ++inext_path) {
+ vector<size_t> positions = (*inext_path)->FindAll(next_edge);
+ for (size_t pos : positions) {
+ if (pos == 0 || e->EqualBegins(**inext_path, (int) pos - 1)) {
+ TRACE("Found equal begin");
+ Edge* new_edge = AddEdge(init_path, e, (*inext_path)->At(pos), (*inext_path)->GapAt(pos));
+ if (new_edge && new_edge != e) {
+ TRACE("Add edge")
+ to_add.push_back(new_edge);
+ already_added = true;
+ break;
+ }
+ }
+ }
+ }
+ }
+ if (to_add.size() == 0) {
+ for (EdgeId next_edge : g_.OutgoingEdges(g_.EdgeEnd(e->GetId()))) {
+ if (next_edge != e->GetId()) {
+ to_add.push_back(e->AddOutEdge(next_edge));
+ }
+ }
+ }
+ stringstream str;
+ str << " for edge " << g_.int_id(e->GetId()) << " add ";
+ for (Edge* e1 : to_add) {
+ str << " " << g_.int_id(e1->GetId());
+ }
+ TRACE(str.str());
+ return to_add;
+}
+
+inline void NextPathSearcher::ScaffoldTip(const BidirectionalPath& path, Edge * current_path, vector<Edge*>& result_edges, vector<Edge*>& stopped_paths,
+ vector<Edge*>& to_add, bool jump) const {
+
+ if (jump) {
+ //jump forward when a tip is reached
+ DEBUG("Scaffolding");
+ ConstructedPathT constructed_paths;
+ set<EdgeId> seeds;
+ Scaffold(path, current_path, constructed_paths, seeds, true);
+ if (constructed_paths.empty()) {
+ stopped_paths.push_back(current_path);
+ } else {
+ DEBUG("Jumped! " << to_add.size());
+ ConvertPaths(constructed_paths, current_path, to_add);
+ }
+ } else {
+ DEBUG("Not scaffolding because going back");
+ result_edges.push_back(current_path);
+ }
+}
+
+inline void NextPathSearcher::ScaffoldChristmasTree(const BidirectionalPath& path, Edge * current_path, vector<Edge*>& to_add, size_t min_length_from) const {
+ //jump forward when there are too many paths
+ DEBUG("========= Scaffolding when too many paths =========");
+ ConstructedPathT constructed_paths;
+ set<EdgeId> seeds;
+ //Scaffold(path, current_path, constructed_paths, seeds, false);
+ EdgeSet candidate_set;
+ FindScaffoldingCandidates(path, current_path, candidate_set, min_length_from);
+ for (EdgeWithDistance e : candidate_set) {
+ constructed_paths.insert(make_pair(e.e_,PathWithDistance(BidirectionalPath(g_, e.e_), e.d_)));
+ }
+ RemoveRedundant(constructed_paths);
+ JoinPathsByDejikstra(path, constructed_paths);
+
+ RemoveRedundant(constructed_paths);
+ DEBUG("Scafolding candidates");
+ for (EdgeWithDistance e : candidate_set) {
+ DEBUG( "Edge " << g_.int_id(e.e_) << " (" << g_.length(e.e_) << ")" << ", distance " << e.d_);
+ }
+
+ DEBUG("scaffolding candidates for tree " << constructed_paths.size());
+ for (auto iter = constructed_paths.begin(); iter != constructed_paths.end(); ++iter){
+ iter->second.p_.Print();
+ }
+
+ if (constructed_paths.size() > 0 && constructed_paths.upper_bound(constructed_paths.begin()->first) == constructed_paths.end()) {
+ DEBUG("All paths from one seed");
+ int first_seed_pos = 0;
+ auto p = constructed_paths.begin();
+ if (constructed_paths.size() > 1) {
+ //Searching for path with max number of seeds
+ DEBUG("Many paths from one seed " << constructed_paths.size());
+ int max_seeds = 0;
+ for (auto it = constructed_paths.begin(); it != constructed_paths.end(); ++it) {
+ int seed_count = 0;
+ for (EdgeId e : seeds) {
+ if (it->second.p_.Contains(e)) {
+ ++seed_count;
+ }
+ }
+ if (seed_count > max_seeds) {
+ max_seeds = seed_count;
+ p = it;
+ }
+ }
+ DEBUG("Max seed containing contains " << max_seeds << " seeds");
+ //Looking for first seed in that path
+ PathWithDistance& winner(p->second);
+ first_seed_pos = (int) winner.p_.Size() + 1;
+ for (EdgeId e : seeds) {
+ int pos = winner.p_.FindFirst(e);
+ if (pos != -1)
+ first_seed_pos = min(pos, first_seed_pos);
+ }
+ VERIFY(first_seed_pos != (int) winner.p_.Size() + 1);
+ DEBUG("First seed position " << first_seed_pos << " seeds");
+ }
+ PathWithDistance& path_to_add(p->second);
+ int distance = path_to_add.dist_ + (int) path_to_add.p_.Length() - (int) path_to_add.p_.LengthAt(first_seed_pos);
+ to_add.push_back(current_path->AddOutEdge(path_to_add.p_[first_seed_pos], distance));
+ to_add.back() = to_add.back()->AddPath(path_to_add.p_, first_seed_pos + 1);
+ }
+ DEBUG("========= Done scaffolding when too many paths =========");
+}
+
+inline void NextPathSearcher::Scaffold(const BidirectionalPath& init_path, Edge* current_path,
+ ConstructedPathT& constructed_paths, set<EdgeId>& seeds, bool is_gap) const {
+
+ EdgeSet candidate_set;
+ FindScaffoldingCandidates(init_path, current_path, candidate_set, 0);
+
+ DEBUG("Scafolding candidates");
+ for (EdgeWithDistance e : candidate_set) {
+ DEBUG( "Edge " << g_.int_id(e.e_) << " (" << g_.length(e.e_) << ")" << ", distance " << e.d_);
+ }
+
+ OrderScaffoldingCandidates(candidate_set, init_path, current_path, constructed_paths, seeds, is_gap);
+}
+
+inline void NextPathSearcher::FindScaffoldingCandidates(const BidirectionalPath& init_path, Edge* current_path, EdgeSet& candidate_set, size_t min_length_from) const {
+ set<EdgeId> path_end;
+ set<Edge*> prev_edges = current_path->GetPrevEdges(search_dist_);
+ for (Edge* e : prev_edges) {
+ path_end.insert(e->GetId());
+ path_end.insert(g_.conjugate(e->GetId()));
+ }
+ map<EdgeId, vector<int> > candidates;
+ //current_path->GetPrevPath(0).Print();
+ TRACE(current_path->Length() << " " << init_path.Length());
+ VERIFY(current_path->Length() >= init_path.Length());
+ size_t grown_path_len = current_path->Length() - init_path.Length();
+ TRACE("Path already grown to " << grown_path_len);
+
+ for (size_t i = 0; i < init_path.Size(); ++i) {
+ if (g_.length(init_path[i]) <= min_length_from) {
+ continue;
+ }
+ vector<EdgeWithDistance> jump_edges;
+ size_t distance_to_tip = init_path.LengthAt(i) + grown_path_len;
+ FindScaffoldingCandidates(init_path[i], distance_to_tip, jump_edges);
+ for (EdgeWithDistance e : jump_edges) {
+ if (candidates.find(e.e_) == candidates.end()) {
+ candidates[e.e_] = vector<int>();
+ }
+ DEBUG("ADD JUMP EDGE FROM " << g_.int_id(init_path[i]) << " TO " << g_.int_id(e.e_))
+ candidates[e.e_].push_back(/*max(e.d_ - (int) distance_to_tip, 100)*/100);
+ }
+ }
+
+ for (std::pair<EdgeId, vector<int> > e : candidates) {
+ if (path_end.count(e.first) > 0) {
+ continue;
+ }
+ int avg_distance = 0;
+ TRACE( "All distances for edge " << g_.int_id(e.first) << " (" << g_.length(e.first) << ")");
+ for (int dist : e.second) {
+ TRACE(dist);
+ avg_distance += dist;
+ }
+ avg_distance /= (int) e.second.size();
+ candidate_set.insert(EdgeWithDistance(e.first, avg_distance));
+ }
+}
+
+inline void NextPathSearcher::FindScaffoldingCandidates(EdgeId e, size_t distance_to_tip, vector<EdgeWithDistance>& jump_edges) const {
+ if (g_.length(e) < long_edge_len_ || distance_to_tip - g_.length(e) >= search_dist_)
+ return;
+
+ TRACE("Edge " << g_.int_id(e) << ", length " << g_.length(e));
+ TRACE( distance_to_tip << " " << distance_to_tip - g_.length(e) << " " << search_dist_);
+
+ set<EdgeId> candidate_edges;
+ int min_distance = std::max((int) distance_to_tip - (int) weight_counter_.GetLib()->GetLeftVar(), 0);
+ int max_distance = (int) search_dist_ + (int) g_.length(e);
+ TRACE("Looking in range " << min_distance << " " << max_distance);
+ weight_counter_.FindJumpCandidates(e, min_distance, max_distance, long_edge_len_, candidate_edges);
+ weight_counter_.FindJumpEdges(e, candidate_edges, min_distance, max_distance, jump_edges);
+ TRACE("Found " << jump_edges.size() << " candidate(s) from this edge");
+}
+
+inline void NextPathSearcher::OrderScaffoldingCandidates(EdgeSet& candidate_set, const BidirectionalPath& init_path,
+ Edge* current_path, ConstructedPathT& constructed_paths,
+ set<EdgeId>& seeds, bool is_gap) const {
+ size_t grown_path_len = current_path->Length() - init_path.Length();
+
+ TRACE("Order Scaffolding Candidates, is gap " << is_gap);
+ for (EdgeWithDistance e : candidate_set) {
+ TRACE("e " << g_.int_id(e.e_));
+ if (constructed_paths.count(e.e_) > 0) {
+ TRACE("visited");
+ continue;
+ }
+ ProcessScaffoldingCandidate(e, candidate_set, current_path, grown_path_len, constructed_paths, is_gap);
+ for (auto p1 = constructed_paths.begin(); p1 != constructed_paths.end(); ++p1) {
+ TRACE("current constructed paths " << g_.int_id(p1->first));
+ //p1->second.p_.Print();
+ }
+
+ }
+ RemoveRedundant(constructed_paths);
+ for (auto it = constructed_paths.begin(); it != constructed_paths.end(); ++it) {
+ seeds.insert(it->first);
+ }
+ JoinPathsByGraph(constructed_paths);
+ JoinPathsByPI(constructed_paths);
+
+ RemoveRedundant(constructed_paths);
+}
+
+inline void NextPathSearcher::ConvertPaths(const ConstructedPathT& constructed_paths, Edge* current_path, vector<Edge*>& to_add) const {
+ for (auto edge = constructed_paths.begin(); edge != constructed_paths.end(); ++edge) {
+ to_add.push_back(current_path->AddOutEdge(edge->second.p_[0], edge->second.dist_));
+ to_add.back() = to_add.back()->AddPath(edge->second.p_, 1);
+ }
+}
+
+inline void NextPathSearcher::RemoveRedundant(ConstructedPathT& constructed_paths) const {
+ for (auto edge = constructed_paths.begin(); edge != constructed_paths.end();) {
+ if (edge->second.p_.Empty()) {
+ edge = constructed_paths.erase(edge);
+ } else {
+ ++edge;
+ }
+ }
+}
+
+inline void NextPathSearcher::ProcessScaffoldingCandidate(EdgeWithDistance& e, EdgeSet& candidate_set, Edge* current_path, size_t grown_path_len,
+ ConstructedPathT& constructed_paths, bool is_gap) const {
+ bool looking_for_tip = is_gap;
+ //Search back from e till a tip or the maximum length is reached
+ TRACE(" === Searching back === ");
+ TRACE( "Distances: search = " << search_dist_ << ", grown = " << grown_path_len << ", estimated gap = " << e.d_);
+ VERIFY(search_dist_ >= grown_path_len);
+ VERIFY((int) search_dist_ >= e.d_);
+
+ size_t max_length_back = search_dist_ - grown_path_len;
+ TRACE(search_dist_ << " " << grown_path_len);
+ TRACE( "Searchin for edge of length " << g_.length(e.e_) << " to dist " << max_length_back);
+ NextPathSearcher back_searcher(g_, cover_map_, max_length_back, weight_counter_, max_paths_);
+ BidirectionalPath jumped_edge(g_, g_.conjugate(e.e_));
+ BidirectionalPathSet back_paths = back_searcher.FindNextPaths(jumped_edge, jumped_edge.Back(), false);
+ TRACE(" === DONE SEARCHING === ");
+ TRACE("Found " << back_paths.size() << " is tip " << IsInTip(g_.EdgeStart(e.e_)) << " look for tip " << looking_for_tip);
+
+ if (back_paths.empty()) {
+ if (IsInTip(g_.EdgeStart(e.e_)) && looking_for_tip) {
+ TRACE( "Added tip edge " << g_.int_id(e.e_) << " (" << g_.length(e.e_) << ")" << ", distance " << e.d_);
+ constructed_paths.insert(make_pair(e.e_, PathWithDistance(BidirectionalPath(g_, e.e_), e.d_)));
+ } else if (!IsInTip(g_.EdgeStart(e.e_)) && !looking_for_tip) {
+ constructed_paths.insert(make_pair(e.e_, PathWithDistance(BidirectionalPath(g_, e.e_), e.d_)));
+ }
+ } else {
+ TRACE("Found several back paths " << back_paths.size());
+ BidirectionalPathSet reached_paths;
+ FilterBackPaths(back_paths, g_.conjugate(current_path->GetId()), reached_paths, search_dist_ - grown_path_len);
+ //Found a path back to the init path
+ if (reached_paths.size() > 0 && !looking_for_tip) {
+ TRACE("Found " << reached_paths.size() << " direct path(s) back");
+ int i = 0;
+ for (BidirectionalPath* p : reached_paths) {
+ TRACE("Processing reached path " << i++);
+ BidirectionalPath cp = p->Conjugate();
+ //Adding jumped edge since it's not included in the path
+ cp.PushBack(e.e_);
+ //cp.Print();
+ int reached_edge_pos = cp.FindLast(current_path->GetId());
+ VERIFY(reached_edge_pos != -1);
+ AddConstructedPath(cp, reached_edge_pos + 1, 0, constructed_paths);
+ }
+ } else if (reached_paths.size() > 0 && looking_for_tip) {
+ DEBUG("Impossible: back path reaches tip");
+ } else if (looking_for_tip) {
+ TRACE( "Found " << back_paths.size() << " path(s) going back to tip");
+ int i = 0;
+ for (BidirectionalPath* p : back_paths) {
+ DEBUG("Processing tip path " << i++);
+ BidirectionalPath cp = p->Conjugate();
+ //Adding jumped edge since it's not included in the path
+ cp.PushBack(e.e_);
+ AddConstructedPath(cp, 0, EstimateGapForPath(candidate_set, cp), constructed_paths);
+ }
+ }
+ }
+ for (BidirectionalPath* p : back_paths) {
+ delete p;
+ }
+}
+
+inline int NextPathSearcher::EstimateGapForPath(EdgeSet& candidate_set, const BidirectionalPath& p) const {
+ int gap = 0;
+ int count = 0;
+ for (EdgeWithDistance e : candidate_set) {
+ int pos = p.FindFirst(e.e_);
+ if (pos != -1) {
+ size_t length_to_e = 0;
+ for (int i = 0; i < pos; ++i) {
+ length_to_e += p.LengthAt(i);
+ }
+ gap += e.d_ - (int) length_to_e;
+ }
+ ++count;
+ }
+ gap /= count;
+ return gap > 0 ? gap : 100;
+}
+
+inline void NextPathSearcher::AddConstructedPath(const BidirectionalPath& cp, size_t from, int gap, ConstructedPathT& constructed_paths) const {
+ VERIFY(!cp.Empty());
+
+ //Add only if no constructed path to this candidate already starts from the same edge
+ EdgeId candidate = cp.Back();
+ for (auto it = constructed_paths.lower_bound(candidate); it != constructed_paths.upper_bound(candidate); ++it) {
+ if (it->second.p_.Front() == cp.Front()) {
+ return;
+ }
+ }
+
+ TRACE("Adding path starting from " << from);
+ constructed_paths.insert(make_pair(candidate, PathWithDistance(cp.SubPath(from), gap)));
+ TRACE("add constructed path " << g_.int_id(candidate));
+ //cp.Print();
+
+ for (size_t i = 0; i < cp.Size() - 1; ++i) {
+ EdgeId edge = cp[i];
+ for (auto it = constructed_paths.lower_bound(edge); it != constructed_paths.upper_bound(edge); ++it) {
+ TRACE("found " << g_.int_id(edge));
+ //it->second.p_.Print();
+ TRACE("clear");
+ it->second.p_.Clear();
+ }
+ }
+}
+inline bool NextPathSearcher::IsOutTip(VertexId v) const {
+ if (g_.OutgoingEdgeCount(v) == 0) {
+ return true;
+ }
+ if (g_.OutgoingEdgeCount(v) != 1) {
+ return false;
+ }
+ EdgeId oute = *g_.OutgoingEdges(v).begin();
+ for (EdgeId ine : g_.IncomingEdges(v)) {
+ if (oute == ine) {
+ return true;
+ }
+ }
+ return false;
+}
+inline bool NextPathSearcher::IsInTip(VertexId v) const {
+ if (g_.IncomingEdgeCount(v) == 0) {
+ return true;
+ }
+ if (g_.IncomingEdgeCount(v) != 1) {
+ return false;
+ }
+ EdgeId ine = *g_.IncomingEdges(v).begin();
+ for (EdgeId oute : g_.OutgoingEdges(v)) {
+ if (oute == ine) {
+ return true;
+ }
+ }
+ return false;
+}
+inline void NextPathSearcher::FilterBackPaths(BidirectionalPathSet& back_paths, EdgeId edge_to_reach, BidirectionalPathSet& reached_paths,
+ size_t max_len) const {
+ TRACE("Searching for proper back paths");
+
+ int i = 0;
+ for (auto piter = back_paths.begin(); piter != back_paths.end();) {
+ BidirectionalPath* p = *piter;
+ VERIFY(!p->Empty());
+ EdgeId last_e = p->Back();
+ VertexId last_v = g_.EdgeEnd(last_e);
+ TRACE("Processing path " << i++);
+ //p->Print();
+ if (p->FindFirst(edge_to_reach) != -1) {
+ reached_paths.insert(p);
+ ++piter;
+ } else if (!IsInTip(last_v) && p->Length() < max_len) {
+ ++piter;
+ } else {
+ delete p;
+ piter = back_paths.erase(piter);
+ }
+ }
+}
+
+inline void NextPathSearcher::JoinPathsByGraph(ConstructedPathT& constructed_paths) const {
+ TRACE("== try to join paths using graph ==");
+ for (auto p1 = constructed_paths.begin(); p1 != constructed_paths.end(); ++p1) {
+ //p1->second.p_.Print();
+ }
+ TRACE("== printed ==");
+
+ //Removing edges whose seed is contained in any other path
+ set<EdgeId> to_remove;
+ for (auto p1 = constructed_paths.begin(); p1 != constructed_paths.end(); ++p1) {
+ if (to_remove.count(p1->first) > 0) {
+ continue;
+ }
+ for (auto p2 = constructed_paths.begin(); p2 != constructed_paths.end(); ++p2) {
+ if (p1->first == p2->first || to_remove.count(p2->first) > 0) {
+ continue;
+ }
+ if (p1->second.p_.Contains(p2->first)) {
+ to_remove.insert(p2->first);
+ }
+ }
+ }
+ for (auto p = constructed_paths.begin(); p != constructed_paths.end(); ) {
+ if (to_remove.count(p->first) > 0) {
+ p = constructed_paths.erase(p);
+ } else {
+ ++p;
+ }
+ }
+}
+
+inline void NextPathSearcher::JoinPathsByPI(ConstructedPathT& constructed_paths) const {
+ DEBUG("== try to join paths ===");
+ for (auto p1 = constructed_paths.begin(); p1 != constructed_paths.end(); ++p1) {
+ p1->second.p_.Print();
+ }
+ DEBUG("== printed ===");
+
+ //Checking paired info
+ set<EdgeId> visited;
+ for (auto p1 = constructed_paths.begin(); p1 != constructed_paths.end(); ++p1) {
+ if (visited.count(p1->first) > 0) {
+ continue;
+ }
+ for (auto p2 = constructed_paths.begin(); p2 != constructed_paths.end(); ++p2) {
+ if (p1->first == p2->first) {
+ continue;
+ }
+ BidirectionalPath& path1 = p1->second.p_;
+ BidirectionalPath& path2 = p2->second.p_;
+ bool has_pi = false;
+ for (size_t i = 0; i < path1.Size(); ++i) {
+
+ for (size_t j = 0; j < path2.Size(); ++j) {
+ size_t len_to_e2 = path2.Length() - path2.LengthAt(j);
+ size_t dist = path1.LengthAt(i) + len_to_e2;
+ size_t min_dist = (size_t) max(0, (int) dist - (int) weight_counter_.GetLib()->GetLeftVar());
+ size_t max_dist = dist + search_dist_;
+ DEBUG("try to find pair info between " << g_.int_id(path1[i]) << " and " << g_.int_id(path2[j])
+ << " distance from " << min_dist
+ <<" to " << max_dist);
+ if (path1[i] != path2[j] &&
+ weight_counter_.HasPI(path1[i], path2[j], min_dist, max_dist)) {
+ has_pi = true;
+ break;
+ }
+ }
+ if (has_pi) {
+ break;
+ }
+ }
+
+ set<EdgeId> edges_path1;
+ for (size_t i = 0; i < path1.Size(); ++i) {
+ edges_path1.insert(path1.At(i));
+ }
+ for (size_t i = 0; i < path2.Size(); ++i) {
+ if (edges_path1.count(path2.At(i)) > 0 || edges_path1.count(g_.conjugate(path2.At(i))) > 0) {
+ has_pi = false;
+ }
+ }
+ if (has_pi) {
+ DEBUG("has pi from ");
+ path1.Print();
+ DEBUG("to");
+ path2.Print();
+ path1.PushBack(path2.Front(), 100);
+ for (int i = 1; i < (int) path2.Size(); ++i) {
+ path1.PushBack(path2[i], path2.GapAt(i), path2.TrashPreviousAt(i), path2.TrashCurrentAt(i));
+ }
+ DEBUG("new path");
+ path1.Print();
+ path2.Clear();
+ visited.insert(p2->first);
+ }
+ }
+ }
+}
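+// Helpers for JoinPathsByDejikstra: enumerate the orderings of the candidate paths in
+// which every consecutive pair is connected; enumeration is skipped when there are
+// more than five paths.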
+inline void Generate(size_t l, size_t r, vector<size_t> a,
+ vector<vector<size_t> >& res, vector<PathWithDistance*>& all_paths, map<PathWithDistance*, set<PathWithDistance*> >& connections) {
+ if (l == r) {
+ DEBUG("result " << a.size())
+ res.push_back(a);
+ } else {
+ for (size_t i = l; i < r; ++i) {
+ if (l > 0 && connections[all_paths[a[l - 1]]].count(all_paths[a[i]]) == 0) {
+ DEBUG(" not connected " << a[l-1] << " and " << a[i])
+ continue;
+ }
+ DEBUG(" connected " << l-1 << " and " << i)
+ size_t v = a[l];
+ a[l] = a[i];
+ a[i] = v;
+ Generate(l + 1, r, a, res, all_paths, connections);
+ v = a[l];
+ a[l] = a[i];
+ a[i] = v;
+ }
+ }
+}
+
+inline vector<vector<size_t> > Generate(size_t n, vector<PathWithDistance*>& all_paths, map<PathWithDistance*, set<PathWithDistance*> >& connections) {
+ vector<vector<size_t> > result;
+ if (n > 5) {
+ return result;
+ }
+ vector<size_t> a;
+ for (size_t i = 0; i < n; ++i) {
+ a.push_back(i);
+ }
+ Generate(0, n, a, result, all_paths, connections);
+ return result;
+}
+
+inline map<PathWithDistance*, size_t> NextPathSearcher::FindDistances(const BidirectionalPath& p, vector<PathWithDistance*>& paths) const {
+ DEBUG("find distances from e " << g_.int_id(p.Back()))
+ map<PathWithDistance*, size_t> result;
+ DijkstraHelper<Graph>::BoundedDijkstra dijkstra(DijkstraHelper<Graph>::CreateBoundedDijkstra(g_, search_dist_, 3000));
+ dijkstra.Run(g_.EdgeEnd(p.Back()));
+ DEBUG("paths size " << paths.size());
+ for (auto ipath = paths.begin(); ipath != paths.end(); ++ipath) {
+ vector<EdgeId> shortest_path = dijkstra.GetShortestPathTo(g_.EdgeStart((*ipath)->p_.Front()));
+ if (shortest_path.size() != 0) {
+ int gap = 0;
+ for (size_t i = 0; i < shortest_path.size(); ++i) {
+ gap += (int) g_.length(shortest_path[i]);
+ }
+ gap += (int) g_.k();
+ result[*ipath] = gap;
+ }
+ }
+ DEBUG("return result " << result.size());
+ return result;
+}
+
+inline void NextPathSearcher::FindConnections(vector<PathWithDistance*>& all_paths, map<PathWithDistance*, set<PathWithDistance*> >& connections) const {
+ for (auto p1 = all_paths.begin(); p1 != all_paths.end(); ++p1) {
+ map<PathWithDistance*, size_t> distances = FindDistances((*p1)->p_, all_paths);
+ connections[*p1] = set<PathWithDistance*>();
+ for (auto iter = distances.begin(); iter != distances.end(); ++iter) {
+ if ((*p1)->p_.Length() + iter->second < search_dist_){
+ connections[*p1].insert(iter->first);
+ }
+ }
+ }
+}
+
+inline void NextPathSearcher::ConnectPaths(const BidirectionalPath& init_path, vector<vector<PathWithDistance*> >& variants) const {
+ if (variants.size() == 1 && variants[0].size() > 0) {
+ vector<PathWithDistance*> res = variants[0];
+ vector<PathWithDistance*> for_dijkstra;
+ BidirectionalPath& path1 = res[0]->p_;
+ for_dijkstra.push_back(res[0]);
+ map<PathWithDistance*, size_t> distances = FindDistances(init_path, for_dijkstra);
+ size_t gap = distances.count(res[0]) > 0 ? distances[res[0]] : 100 + g_.k();
+ BidirectionalPath p(path1);
+ path1.Clear();
+ path1.PushBack(p.Front(), (int)gap);
+ path1.PushBack(p.SubPath(1));
+ for (size_t i = 1; i < res.size(); ++i) {
+ for_dijkstra.clear();
+ for_dijkstra.push_back(res[i]);
+ BidirectionalPath& path2 = res[i]->p_;
+ distances = FindDistances(path1, for_dijkstra);
+ gap = distances.count(res[i]) > 0 ? distances[res[i]] : 100 + g_.k();
+ path1.PushBack(path2.Front(), (int)gap);
+ for (int i = 1; i < (int) path2.Size(); ++i) {
+ path1.PushBack(path2[i], path2.GapAt(i), path2.TrashPreviousAt(i), path2.TrashCurrentAt(i));
+ }
+ path2.Clear();
+ }
+ } else if (variants.size() > 1) {
+ vector<PathWithDistance*> res = variants[0];
+ EdgeId last = res.back()->p_.Back();
+ for (size_t i = 1; i < variants.size(); ++i) {
+ if (last != variants[i].back()->p_.Back()) {
+ return;
+ }
+ }
+ for (size_t i = 0; i < res.size(); ++i) {
+ res[i]->p_.Clear();
+ }
+ int gap = (int) 1000 + (int) g_.k();
+ res[0]->p_.PushBack(last, gap);
+ }
+}
+
+inline vector<vector<PathWithDistance*> > NextPathSearcher::FilterConnections(vector<PathWithDistance*>& all_paths, map<PathWithDistance*, set<PathWithDistance*> >& connections) const {
+ vector<vector<PathWithDistance*> > variants;
+ DEBUG("filter connections " << connections.size() << " all paths size " << all_paths.size())
+ vector<vector<size_t> > permutations = Generate(all_paths.size(), all_paths, connections);
+ DEBUG("generated all permutations " << permutations.size());
+ for (size_t i = 0; i < permutations.size(); ++i) {
+ vector<PathWithDistance*> variant;
+ for (size_t j = 0; j < permutations[i].size(); ++j) {
+ variant.push_back(all_paths[permutations[i][j]]);
+ }
+ variants.push_back(variant);
+ }
+ return variants;
+}
+
+inline void NextPathSearcher::JoinPathsByDejikstra(const BidirectionalPath& init_path, ConstructedPathT& constructed_paths) const {
+ DEBUG("== try to join paths by dejikstra ===");
+ for (auto p1 = constructed_paths.begin(); p1 != constructed_paths.end(); ++p1) {
+ p1->second.p_.Print();
+ }
+ DEBUG("== printed ===");
+
+ vector<PathWithDistance*> all_paths;
+ for (auto p1 = constructed_paths.begin(); p1 != constructed_paths.end(); ++p1) {
+ if (p1->second.p_.Size() != 0) {
+ all_paths.push_back(&p1->second);
+ }
+ }
+ map<PathWithDistance*, set<PathWithDistance*> > connections;
+ FindConnections(all_paths, connections);
+ vector<vector<PathWithDistance*> > variants = FilterConnections(all_paths, connections);
+ ConnectPaths(init_path, variants);
+
+ DEBUG("== after to join paths ===");
+ for (auto p1 = constructed_paths.begin(); p1 != constructed_paths.end(); ++p1) {
+ p1->second.p_.Print();
+ }
+ DEBUG("== printed ===");
+}
+
+} // namespace path_extend
diff --git a/src/modules/algorithms/path_extend/overlap_analysis.hpp b/src/modules/algorithms/path_extend/overlap_analysis.hpp
new file mode 100644
index 0000000..279dc4a
--- /dev/null
+++ b/src/modules/algorithms/path_extend/overlap_analysis.hpp
@@ -0,0 +1,113 @@
+#pragma once
+
+#include "dev_support/logger/logger.hpp"
+#include "dev_support/range.hpp"
+#include "ssw/ssw_cpp.h"
+
+namespace debruijn_graph {
+using omnigraph::Range;
+
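+// Result of an overlap alignment: the aligned ranges on both sequences and the
+// number of matching positions; identity() is the match count divided by the size
+// of the longer range.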
+struct OverlapInfo {
+ Range r1;
+ Range r2;
+ size_t match_cnt;
+
+ OverlapInfo(const Range& r1_, const Range& r2_, size_t match_cnt_)
+ : r1(r1_),
+ r2(r2_),
+ match_cnt(match_cnt_) {
+ VERIFY(match_cnt <= std::min(r1.size(), r2.size()));
+ }
+
+ OverlapInfo()
+ : match_cnt(0) {
+ }
+
+ double identity() const {
+ if (match_cnt == 0)
+ return 0.;
+ return (double)match_cnt / (double)size();
+ }
+
+ size_t size() const {
+ return std::max(r1.size(), r2.size());
+ }
+
+ bool operator==(const OverlapInfo &that) const {
+ return r1 == that.r1 && r2 == that.r2 && match_cnt == that.match_cnt;
+ }
+
+ bool operator!=(const OverlapInfo &that) const {
+ return !(*this == that);
+ }
+};
+
+std::ostream& operator<<(std::ostream& os, const OverlapInfo& info) {
+ return os << "R1: [" << info.r1.start_pos << ", " << info.r1.end_pos
+ << "]; R2: [" << info.r2.start_pos << ", " << info.r2.end_pos << "]"
+ << "; match_cnt: " << info.match_cnt;
+}
+
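+// Detects an overlap between the end of one sequence and the beginning of another
+// by aligning flanks of length flank_length_ with striped Smith-Waterman (ssw) and
+// counting matched positions from the CIGAR string.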
+class SWOverlapAnalyzer {
+ static const uint32_t CIGAR_FLAG_MASK = (1 << 4) - 1;
+ static const uint32_t CIGAR_MATCH_FLAG = 7;
+ typedef typename Graph::EdgeId EdgeId;
+ size_t flank_length_;
+
+ const StripedSmithWaterman::Aligner aligner_;
+ const StripedSmithWaterman::Filter filter_;
+
+ size_t CountMatches(std::vector<uint32_t> cigar) const {
+ size_t match_cnt = 0;
+ for (uint32_t entry : cigar) {
+ if ((entry & CIGAR_FLAG_MASK) == CIGAR_MATCH_FLAG) {
+ match_cnt += (entry >> 4);
+ }
+ }
+ return match_cnt;
+ }
+
+ OverlapInfo InnerAnalyze(const Sequence& s1, const Sequence& s2) const {
+ if (s1.size() == 0 || s2.size() == 0) {
+ return OverlapInfo();
+ }
+ StripedSmithWaterman::Alignment alignment;
+ if (aligner_.Align(s1.str().c_str(), s2.str().c_str(), int(s2.size()), filter_, &alignment)) {
+ if (alignment.sw_score > 0) {
+ return OverlapInfo(Range(alignment.query_begin, alignment.query_end + 1),
+ Range(alignment.ref_begin, alignment.ref_end + 1),
+ CountMatches(alignment.cigar));
+ }
+ }
+ return OverlapInfo();
+ }
+
+public:
+ SWOverlapAnalyzer(size_t flank_length)
+ : flank_length_(flank_length),
+ aligner_(/*match_score*/2,
+ /*mismatch_penalty*/6,
+ /*gap_opening_penalty*/8,
+ /*gap_extending_penalty*/8) {
+ }
+
+
+ OverlapInfo AnalyzeOverlap(const Sequence& s1, const Sequence& s2) const {
+ size_t start1 = flank_length_ > s1.size() ? 0 : s1.size() - flank_length_;
+ size_t end2 = flank_length_ > s2.size() ? s2.size() : flank_length_;
+
+ OverlapInfo result = InnerAnalyze(s1.Subseq(start1, s1.size()), s2.Subseq(0, end2));
+ if (result == OverlapInfo())
+ return result;
+
+ result.r1.shift(int(start1));
+ return result;
+ }
+
+ template<class Graph>
+ OverlapInfo AnalyzeOverlap(const Graph& g, EdgeId e1, EdgeId e2) const {
+ return AnalyzeOverlap(g.EdgeNucls(e1), g.EdgeNucls(e2));
+ }
+};
+
+}
diff --git a/src/modules/algorithms/path_extend/paired_library.hpp b/src/modules/algorithms/path_extend/paired_library.hpp
new file mode 100644
index 0000000..f176ab9
--- /dev/null
+++ b/src/modules/algorithms/path_extend/paired_library.hpp
@@ -0,0 +1,179 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+/*
+ * paired_library.hpp
+ *
+ * Created on: Feb 19, 2012
+ * Author: andrey
+ */
+
+#ifndef PAIRED_LIBRARY_HPP_
+#define PAIRED_LIBRARY_HPP_
+
+#include "pipeline/graph_pack.hpp"
+#include "paired_info/paired_info.hpp"
+#include "ideal_pair_info.hpp"
+
+#include "math/xmath.h"
+
+namespace path_extend {
+
+using debruijn_graph::Graph;
+using debruijn_graph::EdgeId;
+
+using omnigraph::de::PairedInfoIndexT;
+typedef omnigraph::de::PairInfo<EdgeId> DePairInfo;
+using omnigraph::de::Point;
+
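+// Describes a paired library: insert size statistics, read length, mate-pair flag
+// and an IdealPairInfoCounter. Access to the actual paired-info index is provided
+// by subclasses via the pure virtual methods below.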
+struct PairedInfoLibrary {
+ PairedInfoLibrary(size_t k, const Graph& g, size_t readS, size_t is,
+ size_t is_min, size_t is_max, size_t is_var,
+ bool is_mp,
+ const std::map<int, size_t>& is_distribution)
+ : g_(g),
+ k_(k),
+ read_size_(readS),
+ is_(is),
+ is_min_(is_min),
+ is_max_(is_max),
+ is_var_(is_var),
+ is_mp_(is_mp),
+ single_threshold_(-1.0),
+ coverage_coeff_(1.0),
+ ideal_pi_counter_(g, (int) is_min, (int) is_max, readS, is_distribution) {
+ }
+
+ virtual ~PairedInfoLibrary() {}
+
+ void SetCoverage(double cov) { coverage_coeff_ = cov; }
+ void SetSingleThreshold(double threshold) { single_threshold_ = threshold; }
+
+ virtual size_t FindJumpEdges(EdgeId e, set<EdgeId>& result, int min_dist, int max_dist, size_t min_len = 0) const = 0;
+ virtual void CountDistances(EdgeId e1, EdgeId e2, vector<int>& dist, vector<double>& w) const = 0;
+ virtual double CountPairedInfo(EdgeId e1, EdgeId e2, int distance, bool from_interval = false) const = 0;
+ virtual double CountPairedInfo(EdgeId e1, EdgeId e2, int dist_min, int dist_max) const = 0;
+
+ double IdealPairedInfo(EdgeId e1, EdgeId e2, int distance, bool additive = false) const {
+ return ideal_pi_counter_.IdealPairedInfo(e1, e2, distance, additive);
+ }
+
+ size_t GetISMin() const { return is_min_; }
+ double GetSingleThreshold() const { return single_threshold_; }
+ double GetCoverageCoeff() const { return coverage_coeff_; }
+ size_t GetISMax() const { return is_max_; }
+ size_t GetIsVar() const { return is_var_; }
+ size_t GetLeftVar() const { return is_ - is_min_; }
+ size_t GetRightVar() const { return is_max_ - is_; }
+ size_t GetReadSize() const { return read_size_; }
+ bool IsMp() const { return is_mp_; }
+
+ const Graph& g_;
+ size_t k_;
+ size_t read_size_;
+ size_t is_;
+ size_t is_min_;
+ size_t is_max_;
+ size_t is_var_;
+ bool is_mp_;
+ double single_threshold_;
+ double coverage_coeff_;
+ IdealPairInfoCounter ideal_pi_counter_;
+protected:
+ DECL_LOGGER("PathExtendPI");
+};
+
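+// PairedInfoLibrary backed by a concrete paired-info index: jump candidates,
+// distance histograms and paired-info weights are computed by querying the index.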
+template<class Index>
+struct PairedInfoLibraryWithIndex : public PairedInfoLibrary {
+
+ PairedInfoLibraryWithIndex(size_t k, const Graph& g, size_t readS, size_t is, size_t is_min, size_t is_max, size_t is_div,
+ const Index& index, bool is_mp,
+ const std::map<int, size_t>& is_distribution)
+ : PairedInfoLibrary(k, g, readS, is, is_min, is_max, is_div, is_mp, is_distribution),
+ index_(index) {}
+
+ size_t FindJumpEdges(EdgeId e, std::set<EdgeId>& result, int min_dist, int max_dist, size_t min_len = 0) const override {
+ VERIFY(index_.size() > 0);
+ result.clear();
+
+ auto infos = index_.Get(e);
+ // We do not care about iteration order here - all the edges collected
+ // will be inside std::set<EdgeId>
+ for (auto it : infos) {
+ EdgeId e2 = it.first;
+ if (e2 == e)
+ continue;
+ if (g_.length(e2) < min_len)
+ continue;
+ for (auto point : it.second) {
+ omnigraph::de::DEDistance dist = point.d;
+ if (math::le(dist, (omnigraph::de::DEDistance) max_dist) &&
+ math::ge(dist, (omnigraph::de::DEDistance) min_dist)) {
+ result.insert(e2);
+ }
+ }
+ }
+ return result.size();
+ }
+
+
+ void CountDistances(EdgeId e1, EdgeId e2, vector<int>& dist, vector<double>& w) const override {
+ VERIFY(index_.size() > 0);
+ if (e1 == e2)
+ return;
+
+ for (auto point : index_.Get(e1, e2)) {
+ int pairedDistance = de::rounded_d(point);
+ dist.push_back(pairedDistance);
+ w.push_back(point.weight);
+ }
+ }
+
+ double CountPairedInfo(EdgeId e1, EdgeId e2, int distance,
+ bool from_interval = false) const override {
+ VERIFY(index_.size() != 0);
+ double weight = 0.0;
+
+ for (auto point : index_.Get(e1, e2)) {
+ int pairedDistance = de::rounded_d(point);
+ int distanceDev = (int) point.variance(); //max((int) pointIter->var, (int) is_variation_);
+ //Can be modified according to distance comparison
+ int d_min = distance - distanceDev;
+ int d_max = distance + distanceDev;
+
+ if (from_interval) {
+ d_min -= (int) (is_ - is_min_);
+ d_max += (int) (is_max_ - is_);
+ }
+ if (pairedDistance >= d_min && pairedDistance <= d_max) {
+ weight += point.weight;
+ }
+ }
+ return weight;
+ }
+
+ double CountPairedInfo(EdgeId e1, EdgeId e2, int dist_min, int dist_max) const override {
+ VERIFY(index_.size() != 0);
+ double weight = 0.0;
+ for (const auto &point : index_.Get(e1, e2)) {
+ int dist = de::rounded_d(point);
+ if (dist >= dist_min && dist <= dist_max)
+ weight += point.weight;
+ }
+ return weight;
+ }
+
+ const Index& index_;
+protected:
+ DECL_LOGGER("PathExtendPI");
+};
+
+typedef std::vector<shared_ptr<PairedInfoLibrary> > PairedInfoLibraries;
+
+} // namespace path_extend
+
+#endif /* PAIRED_LIBRARY_HPP_ */
diff --git a/src/modules/algorithms/path_extend/path_extend_launch.hpp b/src/modules/algorithms/path_extend/path_extend_launch.hpp
new file mode 100644
index 0000000..7acdeff
--- /dev/null
+++ b/src/modules/algorithms/path_extend/path_extend_launch.hpp
@@ -0,0 +1,975 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+/*
+ * lc_launch.hpp
+ *
+ * Created on: Dec 1, 2011
+ * Author: andrey
+ */
+
+#ifndef PATH_EXTEND_LAUNCH_HPP_
+#define PATH_EXTEND_LAUNCH_HPP_
+
+#include "scaffolder2015/scaffold_graph_constructor.hpp"
+#include "pe_config_struct.hpp"
+#include "pe_resolver.hpp"
+#include "path_extender.hpp"
+#include "pe_io.hpp"
+#include "path_visualizer.hpp"
+#include "loop_traverser.hpp"
+#include "assembly_graph/graph_alignment/long_read_storage.hpp"
+#include "next_path_searcher.hpp"
+#include "scaffolder2015/extension_chooser2015.hpp"
+#include "algorithms/genome_consistance_checker.hpp"
+#include "scaffolder2015/scaffold_graph.hpp"
+#include "scaffolder2015/scaffold_graph_visualizer.hpp"
+
+namespace path_extend {
+
+using namespace debruijn_graph;
+typedef omnigraph::de::PairedInfoIndicesT<Graph> PairedInfoIndicesT;
+
+inline size_t FindMaxOverlapedLen(const vector<shared_ptr<PairedInfoLibrary> >& libs) {
+ size_t max = 0;
+ for (size_t i = 0; i < libs.size(); ++i) {
+ max = std::max(libs[i]->GetISMax(), max);
+ }
+ return max;
+}
+
+inline string GetEtcDir(const std::string& output_dir) {
+ return output_dir + cfg::get().pe_params.etc_dir + "/";
+}
+
+inline void DebugOutputPaths(const conj_graph_pack& gp,
+ const std::string& output_dir, const PathContainer& paths,
+ const string& name) {
+    if (!cfg::get().pe_params.debug_output) {
+        return;
+    }
+    PathInfoWriter path_writer;
+    PathVisualizer visualizer;
+
+    DefaultContigCorrector<ConjugateDeBruijnGraph> corrector(gp.g);
+    DefaultContigConstructor<ConjugateDeBruijnGraph> constructor(gp.g, corrector);
+    ContigWriter writer(gp.g, constructor, gp.components);
+
+    string etcDir = GetEtcDir(output_dir);
+ writer.OutputPaths(paths, etcDir + name);
+ if (cfg::get().pe_params.output.write_paths) {
+ path_writer.WritePaths(paths, etcDir + name + ".dat");
+ }
+ if (cfg::get().pe_params.viz.print_paths) {
+ visualizer.writeGraphWithPathsSimple(gp, etcDir + name + ".dot", name,
+ paths);
+ }
+}
+
+inline double GetWeightThreshold(shared_ptr<PairedInfoLibrary> lib, const pe_config::ParamSetT& pset) {
+ return lib->IsMp() ? pset.mate_pair_options.weight_threshold : pset.extension_options.weight_threshold;
+}
+
+inline double GetPriorityCoeff(shared_ptr<PairedInfoLibrary> lib, const pe_config::ParamSetT& pset) {
+ return lib->IsMp() ? pset.mate_pair_options.priority_coeff : pset.extension_options.priority_coeff;
+}
+
+inline void SetSingleThresholdForLib(shared_ptr<PairedInfoLibrary> lib, const pe_config::ParamSetT &pset, double threshold, double correction_coeff = 1.0) {
+ if (lib->IsMp()) {
+ lib->SetSingleThreshold(pset.mate_pair_options.use_default_single_threshold || math::le(threshold, 0.0) ?
+ pset.mate_pair_options.single_threshold : threshold);
+ }
+ else {
+ double t = pset.extension_options.use_default_single_threshold || math::le(threshold, 0.0) ?
+ pset.extension_options.single_threshold : threshold;
+ t = correction_coeff * t;
+ lib->SetSingleThreshold(t);
+ }
+}
+
+
+inline string MakeNewName(const std::string& contigs_name, const std::string& subname) {
+ return contigs_name.substr(0, contigs_name.rfind(".fasta")) + "_" + subname + ".fasta";
+}
+
+inline void OutputBrokenScaffolds(PathContainer& paths, int k,
+ const ContigWriter& writer,
+ const std::string& filename) {
+ if (!cfg::get().pe_params.param_set.scaffolder_options.on
+ or !cfg::get().use_scaffolder
+ or cfg::get().pe_params.obs == obs_none) {
+ return;
+ }
+
+ int min_gap = cfg::get().pe_params.obs == obs_break_all ? k / 2 : k;
+
+ ScaffoldBreaker breaker(min_gap, paths);
+ breaker.container().SortByLength();
+ writer.OutputPaths(breaker.container(), filename);
+}
+
+inline void AddPathsToContainer(const conj_graph_pack& gp,
+                                const std::vector<PathInfo<Graph> >& paths,
+ size_t size_threshold, PathContainer& result) {
+ for (size_t i = 0; i < paths.size(); ++i) {
+ auto path = paths.at(i);
+ vector<EdgeId> edges = path.getPath();
+ if (edges.size() <= size_threshold) {
+ continue;
+ }
+ BidirectionalPath* new_path = new BidirectionalPath(gp.g, edges);
+ BidirectionalPath* conj_path = new BidirectionalPath(new_path->Conjugate());
+ new_path->SetWeight((float) path.getWeight());
+ conj_path->SetWeight((float) path.getWeight());
+ result.AddPair(new_path, conj_path);
+ }
+ DEBUG("Long reads paths " << result.size() << " == ");
+}
+
+inline bool HasOnlyMPLibs() {
+ for (const auto& lib : cfg::get().ds.reads) {
+ if (!((lib.type() == io::LibraryType::MatePairs || lib.type() == io::LibraryType::HQMatePairs) &&
+ lib.data().mean_insert_size > 0.0)) {
+ return false;
+ }
+ }
+ return true;
+}
+
+inline bool UseCoverageResolverForSingleReads(const io::LibraryType& type) {
+ return HasOnlyMPLibs() && (type == io::LibraryType::HQMatePairs);
+}
+
+inline size_t CountEdgesInGraph(const Graph& g) {
+ size_t count = 0;
+ for (auto iter = g.ConstEdgeBegin(); !iter.IsEnd(); ++iter) {
+ count++;
+ }
+ return count;
+}
+
+inline size_t GetNumberMPPaths(const Graph& g) {
+ size_t count_edge = CountEdgesInGraph(g);
+ if (count_edge < 1000) {
+ return 1000;
+ }
+ if (count_edge < 10000) {
+ return 100;
+ }
+ return 50;
+}
+
+inline string LibStr(size_t count) {
+ return count == 1 ? "library" : "libraries";
+}
+
+inline void ClonePathContainer(PathContainer& spaths, PathContainer& tpaths, GraphCoverageMap& tmap) {
+ tpaths.clear();
+ tmap.Clear();
+
+ for (auto iter = spaths.begin(); iter != spaths.end(); ++iter) {
+ BidirectionalPath& path = *iter.get();
+ BidirectionalPath* new_path = new BidirectionalPath(path.graph());
+ new_path->Subscribe(&tmap);
+ new_path->PushBack(path);
+
+ BidirectionalPath& cpath = *iter.getConjugate();
+ BidirectionalPath* new_cpath = new BidirectionalPath(cpath.graph());
+ new_cpath->Subscribe(&tmap);
+ new_cpath->PushBack(cpath);
+
+ tpaths.AddPair(new_path, new_cpath);
+ }
+}
+
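+// Post-processes a path set: removes overlaps (or only equal paths), trims
+// mate-pair path ends if requested, filters inter-strand bulges and empty
+// paths, optionally adds uncovered edges and sorts the result by length.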
+inline void FinalizePaths(PathContainer& paths, GraphCoverageMap& cover_map, size_t min_edge_len, size_t max_path_diff, bool mate_pairs = false) {
+ PathExtendResolver resolver(cover_map.graph());
+
+
+ if (cfg::get().pe_params.param_set.remove_overlaps) {
+ resolver.removeOverlaps(paths, cover_map, min_edge_len, max_path_diff, cfg::get().pe_params.param_set.cut_all_overlaps);
+ }
+ else {
+ resolver.removeEqualPaths(paths, cover_map, min_edge_len);
+ }
+ if (mate_pairs) {
+ resolver.RemoveMatePairEnds(paths, min_edge_len);
+ }
+ if (cfg::get().avoid_rc_connections) {
+ paths.FilterInterstandBulges();
+ }
+ paths.FilterEmptyPaths();
+ if (!mate_pairs) {
+ resolver.addUncoveredEdges(paths, cover_map);
+ }
+ paths.SortByLength();
+ for(auto& path : paths) {
+ path.first->ResetOverlaps();
+ }
+
+}
+
+inline void TraverseLoops(PathContainer& paths, GraphCoverageMap& cover_map, shared_ptr<ContigsMaker> extender) {
+ INFO("Traversing tandem repeats");
+ LoopTraverser loopTraverser(cover_map.graph(), cover_map, extender);
+ loopTraverser.TraverseAllLoops();
+ paths.SortByLength();
+}
+
+inline bool IsForSingleReadExtender(const io::SequencingLibrary<config::DataSetData> &lib) {
+ io::LibraryType lt = lib.type();
+ return (lib.data().single_reads_mapped ||
+ lt == io::LibraryType::PacBioReads ||
+ lt == io::LibraryType::SangerReads ||
+ lt == io::LibraryType::NanoporeReads ||
+ lib.is_contig_lib());
+}
+
+inline bool IsForPEExtender(const io::SequencingLibrary<config::DataSetData> &lib) {
+ return (lib.type() == io::LibraryType::PairedEnd &&
+ lib.data().mean_insert_size > 0.0);
+}
+
+inline bool IsForShortLoopExtender(const io::SequencingLibrary<config::DataSetData> &lib) {
+ return (lib.type() == io::LibraryType::PairedEnd &&
+ lib.data().mean_insert_size > 0.0);
+}
+
+inline bool IsForScaffoldingExtender(const io::SequencingLibrary<config::DataSetData> &lib) {
+ return (lib.type() == io::LibraryType::PairedEnd &&
+ lib.data().mean_insert_size > 0.0);
+}
+
+inline bool IsForMPExtender(const io::SequencingLibrary<config::DataSetData> &lib) {
+ return lib.data().mean_insert_size > 0.0 &&
+ (lib.type() == io::LibraryType::HQMatePairs ||
+ lib.type() == io::LibraryType::MatePairs);
+}
+
+enum class PathExtendStage {
+ PEStage,
+ PEPolishing,
+ MPStage,
+ FinalizingPEStage,
+ FinalPolishing,
+};
+
+inline bool IsPEStage(PathExtendStage stage) {
+ return stage == PathExtendStage::PEPolishing || stage == PathExtendStage::PEStage;
+}
+
+inline bool IsMPStage(PathExtendStage stage) {
+ return stage == PathExtendStage::MPStage;
+}
+
+inline bool IsFinalStage(PathExtendStage stage) {
+ return stage == PathExtendStage::FinalizingPEStage || stage == PathExtendStage::FinalPolishing;
+}
+
+inline bool IsPolishingStage(PathExtendStage stage) {
+ return stage == PathExtendStage::PEPolishing || stage == PathExtendStage::FinalPolishing;
+}
+
+
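+// Wraps paired index #index together with the insert-size statistics of the
+// corresponding library from the dataset config into a PairedInfoLibraryWithIndex.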
+template<class Index>
+inline shared_ptr<PairedInfoLibrary> MakeNewLib(const conj_graph_pack::graph_t& g,
+ const Index& paired_index,
+ size_t index) {
+ const auto& lib = cfg::get().ds.reads[index];
+ size_t read_length = lib.data().read_length;
+ size_t is = (size_t) lib.data().mean_insert_size;
+ int is_min = (int) lib.data().insert_size_left_quantile;
+ int is_max = (int) lib.data().insert_size_right_quantile;
+ int var = (int) lib.data().insert_size_deviation;
+ bool is_mp = lib.type() == io::LibraryType::MatePairs || lib.type() == io::LibraryType::HQMatePairs ;
+ return make_shared< PairedInfoLibraryWithIndex<decltype(paired_index[index])> >(cfg::get().K, g, read_length,
+                                     is, is_min > 0 ? size_t(is_min) : 0, is_max > 0 ? size_t(is_max) : 0,
+ size_t(var),
+ paired_index[index], is_mp,
+ lib.data().insert_size_distribution);
+}
+
+inline pe_config::LongReads GetLongReadsConfig(const io::LibraryType& type) {
+ auto long_reads = cfg::get().pe_params.long_reads;
+ if (io::SequencingLibraryBase::is_long_read_lib(type)) {
+ return long_reads.pacbio_reads;
+ } else if (type == io::LibraryType::PathExtendContigs){
+ return long_reads.meta_contigs;
+ } else if (io::SequencingLibraryBase::is_contig_lib(type)) {
+ return long_reads.contigs;
+ }
+ return long_reads.single_reads;
+}
+
+inline shared_ptr<ExtensionChooser> MakeLongReadsExtensionChooser(const conj_graph_pack& gp,
+ size_t lib_index,
+ size_t max_repeat_length) {
+ PathContainer paths;
+ AddPathsToContainer(gp, gp.single_long_reads[lib_index].GetAllPaths(), 1, paths);
+
+ auto long_reads_config = GetLongReadsConfig(cfg::get().ds.reads[lib_index].type());
+ return make_shared<LongReadsExtensionChooser>(gp.g, paths, long_reads_config.filtering,
+ long_reads_config.weight_priority,
+ long_reads_config.unique_edge_priority,
+ long_reads_config.min_significant_overlap,
+ max_repeat_length);
+}
+
+inline shared_ptr<SimpleExtender> MakeLongReadsExtender(const conj_graph_pack& gp, const GraphCoverageMap& cov_map,
+ size_t lib_index,
+ const pe_config::ParamSetT& pset) {
+ const auto& lib = cfg::get().ds.reads[lib_index];
+ size_t resolvable_repeat_length_bound = 10000ul;
+ if (!lib.is_contig_lib()) {
+ resolvable_repeat_length_bound = std::max(resolvable_repeat_length_bound, lib.data().read_length);
+ }
+ INFO("resolvable_repeat_length_bound set to " << resolvable_repeat_length_bound);
+
+ auto long_read_ec = MakeLongReadsExtensionChooser(gp, lib_index, pset.extension_options.max_repeat_length);
+ return make_shared<SimpleExtender>(gp, cov_map, long_read_ec, resolvable_repeat_length_bound,
+ pset.loop_removal.max_loops, true, UseCoverageResolverForSingleReads(lib.type()));
+}
+
+inline shared_ptr<SimpleExtender> MakeLongEdgePEExtender(const conj_graph_pack& gp, const GraphCoverageMap& cov_map,
+ size_t lib_index, const pe_config::ParamSetT& pset, bool investigate_loops) {
+ shared_ptr<PairedInfoLibrary> lib = MakeNewLib(gp.g, gp.clustered_indices, lib_index);
+ SetSingleThresholdForLib(lib, pset, cfg::get().ds.reads[lib_index].data().pi_threshold);
+ INFO("Threshold for lib #" << lib_index << ": " << lib->GetSingleThreshold());
+
+ shared_ptr<WeightCounter> wc = make_shared<PathCoverWeightCounter>(gp.g, lib, pset.normalize_weight);
+ shared_ptr<ExtensionChooser> extension = make_shared<LongEdgeExtensionChooser>(gp.g, wc, GetWeightThreshold(lib, pset), GetPriorityCoeff(lib, pset));
+ return make_shared<SimpleExtender>(gp, cov_map, extension, lib->GetISMax(), pset.loop_removal.max_loops, investigate_loops, false);
+}
+
+inline shared_ptr<SimpleExtensionChooser> MakeMetaExtensionChooser(const conj_graph_pack& gp,
+ shared_ptr<PairedInfoLibrary> lib,
+ const pe_config::ParamSetT& pset) {
+ VERIFY(cfg::get().mode == config::pipeline_type::meta);
+ VERIFY(!lib->IsMp());
+ shared_ptr<WeightCounter> wc = make_shared<MetagenomicWeightCounter>(gp.g, lib, /*read_length*/cfg::get().ds.RL(),
+ /*normalized_threshold*/ 0.3, /*raw_threshold*/ 3, /*estimation_edge_length*/ 300);
+ return make_shared<SimpleExtensionChooser>(gp.g, wc,
+ pset.extension_options.weight_threshold,
+ pset.extension_options.priority_coeff);
+}
+
+inline shared_ptr<SimpleExtender> MakeMetaExtender(const conj_graph_pack& gp, const GraphCoverageMap& cov_map,
+ size_t lib_index, const pe_config::ParamSetT& pset, bool investigate_loops) {
+ shared_ptr<PairedInfoLibrary> lib = MakeNewLib(gp.g, gp.clustered_indices, lib_index);
+ return make_shared<SimpleExtender>(gp, cov_map, MakeMetaExtensionChooser(gp, lib, pset),
+ lib->GetISMax(), pset.loop_removal.max_loops,
+ investigate_loops, false);
+}
+
+inline shared_ptr<SimpleExtender> MakePEExtender(const conj_graph_pack& gp, const GraphCoverageMap& cov_map,
+ size_t lib_index, const pe_config::ParamSetT& pset, bool investigate_loops) {
+ shared_ptr<PairedInfoLibrary> lib = MakeNewLib(gp.g, gp.clustered_indices, lib_index);
+ SetSingleThresholdForLib(lib, pset, cfg::get().ds.reads[lib_index].data().pi_threshold);
+ INFO("Threshold for lib #" << lib_index << ": " << lib->GetSingleThreshold());
+
+ shared_ptr<WeightCounter> wc = make_shared<PathCoverWeightCounter>(gp.g, lib, pset.normalize_weight);
+ auto extension = make_shared<SimpleExtensionChooser>(gp.g, wc, GetWeightThreshold(lib, pset), GetPriorityCoeff(lib, pset));
+ return make_shared<SimpleExtender>(gp, cov_map, extension, lib->GetISMax(), pset.loop_removal.max_loops, investigate_loops, false);
+}
+
+inline shared_ptr<PathExtender> MakeScaffoldingExtender(const conj_graph_pack& gp, const GraphCoverageMap& cov_map,
+ size_t lib_index, const pe_config::ParamSetT& pset) {
+ shared_ptr<PairedInfoLibrary> lib = MakeNewLib(gp.g, gp.scaffolding_indices, lib_index);
+
+ shared_ptr<WeightCounter> counter = make_shared<ReadCountWeightCounter>(gp.g, lib);
+ //FIXME this variable was not used!
+ //double prior_coef = GetPriorityCoeff(lib, pset);
+ //FIXME review parameters
+ //todo put parameters in config
+ //FIXME remove max_must_overlap from config
+ double var_coeff = 3.0;
+ auto scaff_chooser = std::make_shared<ScaffoldingExtensionChooser>(gp.g, counter, var_coeff);
+
+ vector<shared_ptr<GapJoiner>> joiners;
+
+ if (pset.scaffolder_options.use_la_gap_joiner) {
+ joiners.push_back(std::make_shared<LAGapJoiner>(gp.g, pset.scaffolder_options.min_overlap_length,
+ pset.scaffolder_options.flank_multiplication_coefficient,
+ pset.scaffolder_options.flank_addition_coefficient));
+ }
+
+ joiners.push_back(std::make_shared<HammingGapJoiner>(gp.g, pset.scaffolder_options.min_gap_score,
+ pset.scaffolder_options.short_overlap,
+                                                          (int) (2 * cfg::get().ds.RL())));
+
+ auto composite_gap_joiner = std::make_shared<CompositeGapJoiner>(gp.g,
+ joiners,
+ size_t(pset.scaffolder_options.max_can_overlap * (double) gp.g.k()),
+ int(math::round((double) gp.g.k() - var_coeff * (double) lib->GetIsVar())),
+ pset.scaffolder_options.artificial_gap);
+
+ return make_shared<ScaffoldingPathExtender>(gp, cov_map, scaff_chooser, composite_gap_joiner, lib->GetISMax(), pset.loop_removal.max_loops, false);
+}
+
+
+inline shared_ptr<PathExtender> MakeScaffolding2015Extender(const conj_graph_pack& gp, const GraphCoverageMap& cov_map,
+ size_t lib_index, const pe_config::ParamSetT& pset, const ScaffoldingUniqueEdgeStorage& storage) {
+ shared_ptr<PairedInfoLibrary> lib;
+    INFO("Creating scaffolding 2015 extender for lib #" << lib_index);
+
+ //TODO:: temporary solution
+ if (gp.paired_indices[lib_index].size() > gp.clustered_indices[lib_index].size()) {
+ INFO("Paired unclustered indices not empty, using them");
+ lib = MakeNewLib(gp.g, gp.paired_indices, lib_index);
+ } else if (gp.clustered_indices[lib_index].size() != 0 ) {
+        INFO("Clustered indices not empty, using them");
+ lib = MakeNewLib(gp.g, gp.clustered_indices, lib_index);
+ } else {
+ ERROR("All paired indices are empty!");
+ }
+
+ shared_ptr<WeightCounter> counter = make_shared<ReadCountWeightCounter>(gp.g, lib);
+//TODO::was copypasted from MakeScaffoldingExtender
+//TODO::REWRITE
+ double var_coeff = 3.0;
+    DEBUG("Creating extension chooser");
+//TODO: 2 is relative weight cutoff, to config!
+ auto scaff_chooser = std::make_shared<ExtensionChooser2015>(gp.g, counter, var_coeff, storage, 2, lib_index);
+
+ auto gap_joiner = std::make_shared<HammingGapJoiner>(gp.g, pset.scaffolder_options.min_gap_score,
+ pset.scaffolder_options.short_overlap,
+                                                          (int) (2 * cfg::get().ds.RL()));
+
+ return make_shared<ScaffoldingPathExtender>(gp, cov_map, scaff_chooser, gap_joiner, lib->GetISMax(), pset.loop_removal.max_loops, false , false);
+}
+
+
+inline shared_ptr<SimpleExtender> MakeMPExtender(const conj_graph_pack& gp, const GraphCoverageMap& cov_map, const PathContainer& paths,
+ size_t lib_index, const pe_config::ParamSetT& pset) {
+
+ shared_ptr<PairedInfoLibrary> lib = MakeNewLib(gp.g, gp.paired_indices, lib_index);
+ SetSingleThresholdForLib(lib, pset, cfg::get().ds.reads[lib_index].data().pi_threshold);
+ INFO("Threshold for lib #" << lib_index << ": " << lib->GetSingleThreshold());
+
+ size_t max_number_of_paths_to_search = GetNumberMPPaths(gp.g);
+ DEBUG("max number of mp paths " << max_number_of_paths_to_search);
+
+ shared_ptr<MatePairExtensionChooser> chooser = make_shared<MatePairExtensionChooser>(gp.g, lib, paths, max_number_of_paths_to_search);
+ return make_shared<SimpleExtender>(gp, cov_map, chooser, lib->GetISMax(), pset.loop_removal.mp_max_loops, true, false);
+}
+
+inline shared_ptr<SimpleExtender> MakeCoordCoverageExtender(const conj_graph_pack& gp, const GraphCoverageMap& cov_map,
+ const pe_config::ParamSetT& pset) {
+ shared_ptr<PairedInfoLibrary> lib = MakeNewLib(gp.g, gp.clustered_indices, 0);
+ CoverageAwareIdealInfoProvider provider(gp.g, lib, -1ul, 0);
+ auto coord_chooser = make_shared<CoordinatedCoverageExtensionChooser>(gp.g, provider,
+ pset.coordinated_coverage.max_edge_length_in_repeat,
+ pset.coordinated_coverage.delta,
+ pset.coordinated_coverage.min_path_len);
+ auto chooser = make_shared<JointExtensionChooser>(gp.g, MakeMetaExtensionChooser(gp, lib, pset), coord_chooser);
+ return make_shared<SimpleExtender>(gp, cov_map, chooser, -1ul, pset.loop_removal.mp_max_loops, true, false);
+}
+
+inline shared_ptr<SimpleExtender> MakeRNAExtender(const conj_graph_pack& gp, const GraphCoverageMap& cov_map,
+ size_t lib_index, const pe_config::ParamSetT& pset, bool investigate_loops) {
+ shared_ptr<PairedInfoLibrary> lib = MakeNewLib(gp.g, gp.clustered_indices, lib_index);
+ SetSingleThresholdForLib(lib, pset, cfg::get().ds.reads[lib_index].data().pi_threshold);
+ INFO("Threshold for lib #" << lib_index << ": " << lib->GetSingleThreshold());
+
+ shared_ptr<WeightCounter> wc = make_shared<PathCoverWeightCounter>(gp.g, lib, pset.normalize_weight);
+ shared_ptr<RNAExtensionChooser> extension = make_shared<RNAExtensionChooser>(gp.g, wc, GetWeightThreshold(lib, pset), GetPriorityCoeff(lib, pset));
+ return make_shared<MultiExtender>(gp, cov_map, extension, lib->GetISMax(), pset.loop_removal.max_loops, investigate_loops, false);
+}
+
+inline shared_ptr<SimpleExtender> MakeRNALongReadsExtender(const conj_graph_pack& gp, const GraphCoverageMap& cov_map, size_t lib_index,
+ const pe_config::ParamSetT& pset) {
+    VERIFY_MSG(false, "Long reads RNA extender is not implemented yet");
+
+ const auto& lib = cfg::get().ds.reads[lib_index];
+ size_t resolvable_repeat_length_bound = 10000ul;
+ if (!lib.is_contig_lib()) {
+ resolvable_repeat_length_bound = std::max(resolvable_repeat_length_bound, lib.data().read_length);
+ }
+ INFO("resolvable_repeat_length_bound set to " << resolvable_repeat_length_bound);
+
+ auto long_reads_ec = MakeLongReadsExtensionChooser(gp, lib_index, pset.extension_options.max_repeat_length);
+ return make_shared<SimpleExtender>(gp, cov_map, long_reads_ec, resolvable_repeat_length_bound,
+ pset.loop_removal.max_loops, true, UseCoverageResolverForSingleReads(lib.type()));
+}
+
+inline bool InsertSizeCompare(const shared_ptr<PairedInfoLibrary> lib1,
+ const shared_ptr<PairedInfoLibrary> lib2) {
+ return lib1->GetISMax() < lib2->GetISMax();
+}
+
+template<typename Base, typename T>
+inline bool instanceof(const T *ptr) {
+ return dynamic_cast<const Base*>(ptr) != nullptr;
+}
+
+//Used for debug purpose only
+inline void PrintExtenders(const vector<shared_ptr<PathExtender> >& extenders) {
+ DEBUG("Extenders in vector:");
+ for(size_t i = 0; i < extenders.size(); ++i) {
+ string type = typeid(*extenders[i]).name();
+        DEBUG("Extender #" << i << ": " << type);
+ if (instanceof<SimpleExtender>(extenders[i].get())) {
+ auto ec = ((SimpleExtender *) extenders[i].get())->GetExtensionChooser();
+ string chooser_type = typeid(*ec).name();
+            DEBUG("    Extension chooser: " << chooser_type);
+ }
+ else if (instanceof<ScaffoldingPathExtender>(extenders[i].get())) {
+ auto ec = ((ScaffoldingPathExtender *) extenders[i].get())->GetExtensionChooser();
+ string chooser_type = typeid(*ec).name();
+            DEBUG("    Extension chooser: " << chooser_type);
+ }
+ }
+}
+
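+// Creates all path extenders for the given stage: long-read, paired-end,
+// short-loop, scaffolding and mate-pair extenders are instantiated per library
+// (depending on library type, pipeline mode and scaffolding mode) and returned
+// in the order they should be applied.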
+inline vector<shared_ptr<PathExtender> > MakeAllExtenders(PathExtendStage stage, const conj_graph_pack& gp, const GraphCoverageMap& cov_map,
+ const pe_config::ParamSetT& pset, const ScaffoldingUniqueEdgeStorage& storage, const PathContainer& paths_for_mp = PathContainer()) {
+
+ vector<shared_ptr<PathExtender> > result;
+ vector<shared_ptr<PathExtender> > pes;
+ vector<shared_ptr<PathExtender> > pes2015;
+ vector<shared_ptr<PathExtender> > pe_loops;
+ vector<shared_ptr<PathExtender> > pe_scafs;
+ vector<shared_ptr<PathExtender> > mps;
+
+ size_t single_read_libs = 0;
+ size_t pe_libs = 0;
+ size_t scf_pe_libs = 0;
+ size_t mp_libs = 0;
+
+ for (io::LibraryType lt : io::LibraryPriotity) {
+ for (size_t i = 0; i < cfg::get().ds.reads.lib_count(); ++i) {
+ const auto& lib = cfg::get().ds.reads[i];
+ if (lib.type() != lt)
+ continue;
+
+ //TODO: scaff2015 does not need any single read libs?
+ if (IsForSingleReadExtender(lib) && pset.sm != sm_2015) {
+ result.push_back(MakeLongReadsExtender(gp, cov_map, i, pset));
+ ++single_read_libs;
+ }
+ if (IsForPEExtender(lib)) {
+ ++pe_libs;
+ if (IsPEStage(stage) && (pset.sm == sm_old_pe_2015 || pset.sm == sm_old || pset.sm == sm_combined)) {
+ if (cfg::get().mode == config::pipeline_type::meta)
+ //TODO proper configuration via config
+ pes.push_back(MakeMetaExtender(gp, cov_map, i, pset, false));
+ else if (cfg::get().mode == config::pipeline_type::moleculo)
+ pes.push_back(MakeLongEdgePEExtender(gp, cov_map, i, pset, false));
+ else if (cfg::get().mode == config::pipeline_type::rna && !IsPolishingStage(stage))
+ pes.push_back(MakeRNAExtender(gp, cov_map, i, pset, false));
+ else
+ pes.push_back(MakePEExtender(gp, cov_map, i, pset, false));
+ }
+ else if (pset.sm == sm_2015) {
+ pes2015.push_back(MakeScaffolding2015Extender(gp, cov_map, i, pset, storage));
+ }
+ }
+ //FIXME logic is very cryptic!
+ if (IsForShortLoopExtender(lib) && (pset.sm == sm_old_pe_2015 || pset.sm == sm_old || pset.sm == sm_combined)) {
+ if (cfg::get().mode == config::pipeline_type::meta)
+ pes.push_back(MakeMetaExtender(gp, cov_map, i, pset, true));
+ else if (cfg::get().mode == config::pipeline_type::rna && !IsPolishingStage(stage))
+ pes.push_back(MakeRNAExtender(gp, cov_map, i, pset, true));
+ else
+ pe_loops.push_back(MakePEExtender(gp, cov_map, i, pset, true));
+ }
+ if (IsForScaffoldingExtender(lib) && cfg::get().use_scaffolder && pset.scaffolder_options.on) {
+ ++scf_pe_libs;
+ if (pset.sm == sm_old || pset.sm == sm_combined) {
+ pe_scafs.push_back(MakeScaffoldingExtender(gp, cov_map, i, pset));
+ }
+ if (pset.sm == sm_old_pe_2015 || pset.sm == sm_combined) {
+ pe_scafs.push_back(MakeScaffolding2015Extender(gp, cov_map, i, pset, storage));
+ }
+ }
+ if (IsForMPExtender(lib) && IsMPStage(stage)) {
+ ++mp_libs;
+ if (pset.sm == sm_old || pset.sm == sm_combined) {
+ mps.push_back(MakeMPExtender(gp, cov_map, paths_for_mp, i, pset));
+ }
+ if (is_2015_scaffolder_enabled(pset.sm)) {
+ mps.push_back(MakeScaffolding2015Extender(gp, cov_map, i, pset, storage));
+ }
+ }
+ }
+
+ //std::sort(scaff_libs.begin(), scaff_libs.end(), InsertSizeCompare);
+ result.insert(result.end(), pes.begin(), pes.end());
+ result.insert(result.end(), pes2015.begin(), pes2015.end());
+ result.insert(result.end(), pe_loops.begin(), pe_loops.end());
+ result.insert(result.end(), pe_scafs.begin(), pe_scafs.end());
+ result.insert(result.end(), mps.begin(), mps.end());
+ pes.clear();
+ pe_loops.clear();
+ pe_scafs.clear();
+ pes2015.clear();
+ mps.clear();
+ }
+
+ INFO("Using " << pe_libs << " paired-end " << LibStr(pe_libs));
+ INFO("Using " << scf_pe_libs << " paired-end scaffolding " << LibStr(scf_pe_libs));
+ INFO("Using " << mp_libs << " mate-pair " << LibStr(mp_libs));
+ INFO("Using " << single_read_libs << " single read " << LibStr(single_read_libs));
+ INFO("Scaffolder is " << (pset.scaffolder_options.on ? "on" : "off"));
+
+ if (pset.use_coordinated_coverage) {
+ INFO("Using additional coordinated coverage extender");
+ result.push_back(MakeCoordCoverageExtender(gp, cov_map, pset));
+ }
+
+ PrintExtenders(result);
+ return result;
+}
+
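+// Builds a scaffold graph over the unique-edge set, using paired connection
+// conditions from the usable paired libraries and, optionally, assembly graph
+// connectivity.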
+inline shared_ptr<scaffold_graph::ScaffoldGraph> ConstructScaffoldGraph(const conj_graph_pack& gp,
+ const ScaffoldingUniqueEdgeStorage& edge_storage,
+ const pe_config::ParamSetT::ScaffoldGraphParamsT& params) {
+ using namespace scaffold_graph;
+ vector<shared_ptr<ConnectionCondition>> conditions;
+
+ INFO("Constructing connections");
+ LengthEdgeCondition edge_condition(gp.g, edge_storage.GetMinLength());
+
+ for (size_t lib_index = 0; lib_index < cfg::get().ds.reads.lib_count(); ++lib_index) {
+ auto lib = cfg::get().ds.reads[lib_index];
+ if (lib.is_paired()) {
+ shared_ptr<PairedInfoLibrary> paired_lib;
+ if (IsForMPExtender(lib))
+ paired_lib = MakeNewLib(gp.g, gp.paired_indices, lib_index);
+ else if (IsForPEExtender(lib))
+ paired_lib = MakeNewLib(gp.g, gp.clustered_indices, lib_index);
+            else {
+                INFO("Unusable paired lib #" << lib_index);
+                //no paired-info index available for this library - skip it
+                continue;
+            }
+            conditions.push_back(make_shared<AdvancedPairedConnectionCondition>(gp.g, paired_lib, lib_index,
+ params.always_add,
+ params.never_add,
+ params.relative_threshold));
+ }
+ }
+ if (params.graph_connectivity) {
+ auto as_con = make_shared<AssemblyGraphConnectionCondition>(gp.g, params.max_path_length, edge_storage);
+ for (auto e_iter = gp.g.ConstEdgeBegin(); !e_iter.IsEnd(); ++e_iter) {
+ if (edge_condition.IsSuitable(*e_iter))
+ as_con->AddInterestingEdge(*e_iter);
+ }
+ conditions.push_back(as_con);
+ }
+ INFO("Total conditions " << conditions.size());
+
+ INFO("Constructing scaffold graph from set of size " << edge_storage.GetSet().size());
+
+ DefaultScaffoldGraphConstructor constructor(gp.g, edge_storage.GetSet(), conditions, edge_condition);
+ auto scaffoldGraph = constructor.Construct();
+
+ INFO("Scaffold graph contains " << scaffoldGraph->VertexCount() << " vertices and " << scaffoldGraph->EdgeCount() << " edges");
+ return scaffoldGraph;
+}
+
+
+inline void PrintScaffoldGraph(shared_ptr<scaffold_graph::ScaffoldGraph> scaffoldGraph,
+ const set<EdgeId> main_edge_set,
+ const string& filename) {
+ using namespace scaffold_graph;
+
+ auto vcolorer = make_shared<ScaffoldVertexSetColorer>(main_edge_set);
+ auto ecolorer = make_shared<ScaffoldEdgeColorer>();
+ CompositeGraphColorer <ScaffoldGraph> colorer(vcolorer, ecolorer);
+
+    INFO("Visualizing single graph");
+ ScaffoldGraphVisualizer singleVisualizer(*scaffoldGraph, false);
+ std::ofstream single_dot;
+ single_dot.open((filename + "_single.dot").c_str());
+ singleVisualizer.Visualize(single_dot, colorer);
+ single_dot.close();
+
+    INFO("Visualizing paired graph");
+ ScaffoldGraphVisualizer pairedVisualizer(*scaffoldGraph, true);
+ std::ofstream paired_dot;
+ paired_dot.open((filename + "_paired.dot").c_str());
+ pairedVisualizer.Visualize(paired_dot, colorer);
+ paired_dot.close();
+
+    INFO("Printing scaffold graph");
+ std::ofstream data_stream;
+ data_stream.open((filename + ".data").c_str());
+ scaffoldGraph->Print(data_stream);
+ data_stream.close();
+}
+
+
+inline size_t FindOverlapLenForStage(PathExtendStage stage) {
+ size_t res = 0;
+ for (const auto& lib : cfg::get().ds.reads) {
+ if (IsForPEExtender(lib) && IsPEStage(stage)) {
+ res = max(res, (size_t) lib.data().insert_size_right_quantile);
+ } else if (IsForShortLoopExtender(lib)) {
+ res = max(res, (size_t) lib.data().insert_size_right_quantile);
+ } else if (IsForMPExtender(lib) && IsMPStage(stage)) {
+ res = max(res, (size_t) lib.data().insert_size_right_quantile);
+ }
+ }
+ return res;
+}
+
+inline bool MPLibsExist() {
+ for (const auto& lib : cfg::get().ds.reads)
+ if (IsForMPExtender(lib))
+ return true;
+
+ return false;
+}
+
+inline void CountMisassembliesWithReference(debruijn_graph::GenomeConsistenceChecker& genome_checker, const PathContainer& paths) {
+ size_t total_mis = 0 , gap_mis = 0;
+ genome_checker.SpellGenome();
+ for (auto iter = paths.begin(); iter != paths.end(); ++iter) {
+ BidirectionalPath *path = iter.get();
+ auto map_res = genome_checker.CountMisassemblies(*path);
+ if (map_res.misassemblies > 0) {
+ INFO ("there are " << map_res.misassemblies << " misassemblies in path: ");
+ path->PrintInfo();
+ total_mis += map_res.misassemblies;
+ }
+ if (map_res.wrong_gap_size > 0) {
+ INFO ("there are " << map_res.wrong_gap_size << " wrong gaps in path: ");
+ path->PrintInfo();
+ gap_mis += map_res.wrong_gap_size;
+ }
+ }
+    INFO("In total found " << total_mis << " misassemblies and " << gap_mis << " gaps.");
+}
+
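+// Fills the storage of "unique" (presumably single-copy) edges used by the
+// 2015 scaffolder; length/variation parameters are either taken from config or
+// autodetected from the available PE/MP libraries.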
+inline ScaffoldingUniqueEdgeStorage FillUniqueEdgeStorage(const conj_graph_pack& gp,
+ size_t& min_unique_length,
+ double& unique_variation) {
+
+ ScaffoldingUniqueEdgeStorage main_unique_storage;
+ //Setting scaffolding2015 parameters
+ if (cfg::get().pe_params.param_set.scaffolding2015.autodetect) {
+ INFO("Autodetecting unique edge set parameters...");
+ bool pe_found = false;
+ //TODO constants
+ size_t min_MP_IS = 10000;
+ for (size_t i = 0; i < cfg::get().ds.reads.lib_count(); ++i) {
+
+ if (IsForPEExtender(cfg::get().ds.reads[i])) {
+ pe_found = true;
+ }
+ if (IsForMPExtender(cfg::get().ds.reads[i])) {
+ min_MP_IS = min(min_MP_IS, (size_t) cfg::get().ds.reads[i].data().mean_insert_size);
+ }
+ }
+ if (pe_found) {
+ //TODO constants
+ unique_variation = 0.5;
+ INFO("PE lib found, we believe in coverage");
+ } else {
+ unique_variation = 50;
+ INFO("No paired libs found, we do not believe in coverage");
+ }
+ min_unique_length = min_MP_IS;
+ INFO("Minimal unique edge length set to the smallest MP library IS: " << min_unique_length);
+
+ } else {
+ INFO("Unique edge set constructed with parameters from config : length " << min_unique_length
+ << " variation " << unique_variation);
+ }
+ ScaffoldingUniqueEdgeAnalyzer unique_edge_analyzer(gp, min_unique_length, unique_variation);
+ unique_edge_analyzer.FillUniqueEdgeStorage(main_unique_storage);
+
+ return main_unique_storage;
+}
+
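+// Entry point of the exSPAnder repeat resolution module: runs the paired-end
+// stage, an optional mate-pair stage and final polishing, writing intermediate
+// and final contigs/scaffolds along the way.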
+inline void ResolveRepeatsPe(conj_graph_pack& gp,
+ const std::string& output_dir,
+ const std::string& contigs_name,
+                             bool traverseLoops,
+ boost::optional<std::string> broken_contigs) {
+
+ INFO("ExSPAnder repeat resolving tool started");
+
+ ScaffoldingUniqueEdgeStorage main_unique_storage;
+ auto sc_mode = cfg::get().pe_params.param_set.sm;
+ auto min_unique_length = cfg::get().pe_params.param_set.scaffolding2015.min_unique_length;
+    auto unique_variation = cfg::get().pe_params.param_set.scaffolding2015.unique_coverage_variation;
+
+ if (is_2015_scaffolder_enabled(sc_mode)) {
+        main_unique_storage = FillUniqueEdgeStorage(gp, min_unique_length, unique_variation);
+ }
+
+ make_dir(output_dir);
+ make_dir(GetEtcDir(output_dir));
+ const pe_config::ParamSetT &pset = cfg::get().pe_params.param_set;
+
+ //Scaffold graph
+ shared_ptr<scaffold_graph::ScaffoldGraph> scaffoldGraph;
+ if (cfg::get().pe_params.param_set.scaffold_graph_params.construct) {
+ scaffoldGraph = ConstructScaffoldGraph(gp, main_unique_storage, cfg::get().pe_params.param_set.scaffold_graph_params);
+ if (cfg::get().pe_params.param_set.scaffold_graph_params.output) {
+ PrintScaffoldGraph(scaffoldGraph, main_unique_storage.GetSet(), GetEtcDir(output_dir) + "scaffold_graph");
+ }
+ }
+
+
+ DefaultContigCorrector<ConjugateDeBruijnGraph> corrector(gp.g);
+ DefaultContigConstructor<ConjugateDeBruijnGraph> constructor(gp.g, corrector);
+ ContigWriter writer(gp.g, constructor, gp.components);
+
+
+//make pe + long reads extenders
+ GraphCoverageMap cover_map(gp.g);
+    INFO("SUBSTAGE = paired-end libraries");
+ PathExtendStage exspander_stage = PathExtendStage::PEStage;
+ vector<shared_ptr<PathExtender> > all_libs = MakeAllExtenders(exspander_stage, gp, cover_map, pset,
+ main_unique_storage);
+
+ //Parameters are subject to change
+ size_t max_is_right_quantile = max(FindOverlapLenForStage(exspander_stage), gp.g.k() + 100);
+ size_t min_edge_len = 100;
+
+ shared_ptr<CompositeExtender> mainPE = make_shared<CompositeExtender>(gp.g, cover_map, all_libs,
+ max_is_right_quantile, main_unique_storage,
+ cfg::get().pe_params.param_set.extension_options.max_repeat_length);
+
+//extend pe + long reads
+ PathExtendResolver resolver(gp.g);
+ auto seeds = resolver.makeSimpleSeeds();
+ DebugOutputPaths(gp, output_dir, seeds, "init_paths");
+ seeds.SortByLength();
+ INFO("Growing paths using paired-end and long single reads");
+ auto paths = resolver.extendSeeds(seeds, *mainPE);
+ paths.SortByLength();
+ DebugOutputPaths(gp, output_dir, paths, "pe_before_overlap");
+
+ PathContainer clone_paths;
+ GraphCoverageMap clone_map(gp.g);
+ bool mp_exist = MPLibsExist();
+
+ if (mp_exist) {
+ ClonePathContainer(paths, clone_paths, clone_map);
+ }
+
+ exspander_stage = PathExtendStage::PEPolishing;
+ all_libs = MakeAllExtenders(exspander_stage, gp, cover_map, pset, main_unique_storage);
+ mainPE = make_shared<CompositeExtender>(gp.g, cover_map, all_libs,
+ max_is_right_quantile, main_unique_storage,
+ cfg::get().pe_params.param_set.extension_options.max_repeat_length);
+
+ //We do not run overlap removal in 2015 mode
+ if (!is_2015_scaffolder_enabled(sc_mode))
+ FinalizePaths(paths, cover_map, min_edge_len, max_is_right_quantile);
+ if (broken_contigs.is_initialized()) {
+ OutputBrokenScaffolds(paths, (int) gp.g.k(), writer,
+ output_dir + (mp_exist ? "pe_contigs" : broken_contigs.get()));
+ }
+ DebugOutputPaths(gp, output_dir, paths, "pe_before_traverse");
+    if (traverseLoops) {
+ TraverseLoops(paths, cover_map, mainPE);
+ FinalizePaths(paths, cover_map, min_edge_len, max_is_right_quantile);
+ }
+ DebugOutputPaths(gp, output_dir, paths, (mp_exist ? "pe_final_paths" : "final_paths"));
+ writer.OutputPaths(paths, output_dir + (mp_exist ? "pe_scaffolds" : contigs_name));
+
+ cover_map.Clear();
+ seeds.DeleteAllPaths();
+ paths.DeleteAllPaths();
+ if (!mp_exist) {
+ return;
+ }
+
+//MP
+ DebugOutputPaths(gp, output_dir, clone_paths, "mp_before_extend");
+
+    INFO("SUBSTAGE = mate-pair libraries");
+ exspander_stage = PathExtendStage::MPStage;
+ all_libs.clear();
+ max_is_right_quantile = FindOverlapLenForStage(exspander_stage);
+ PathContainer mp_paths(clone_paths);
+
+ if (is_2015_scaffolder_enabled(sc_mode)) {
+ //TODO: constants
+ for (auto cur_length = min_unique_length; cur_length > 500; cur_length -= 500) {
+ ScaffoldingUniqueEdgeStorage current_unique_storage;
+            ScaffoldingUniqueEdgeAnalyzer unique_edge_analyzer(gp, cur_length, unique_variation);
+ unique_edge_analyzer.FillUniqueEdgeStorage(current_unique_storage);
+ all_libs = MakeAllExtenders(exspander_stage, gp, clone_map, pset, current_unique_storage, clone_paths);
+ shared_ptr<CompositeExtender> mp_main_pe = make_shared<CompositeExtender>(gp.g, clone_map, all_libs,
+ max_is_right_quantile,
+ main_unique_storage,
+ cfg::get().pe_params.param_set.extension_options.max_repeat_length);
+ INFO("Growing paths using mate-pairs unique length " << cur_length);
+ mp_paths = resolver.extendSeeds(mp_paths, *mp_main_pe);
+ DebugOutputPaths(gp, output_dir, mp_paths, "mp_before_overlap_" + std::to_string(cur_length));
+ }
+ } else {
+ all_libs = MakeAllExtenders(exspander_stage, gp, clone_map, pset, main_unique_storage, clone_paths);
+ shared_ptr<CompositeExtender> mp_main_pe = make_shared<CompositeExtender>(gp.g, clone_map, all_libs,
+ max_is_right_quantile,
+ main_unique_storage,
+ cfg::get().pe_params.param_set.extension_options.max_repeat_length);
+ INFO("Growing paths using mate-pairs");
+ mp_paths = resolver.extendSeeds(clone_paths, *mp_main_pe);
+
+ DebugOutputPaths(gp, output_dir, mp_paths, "mp_before_overlap");
+ FinalizePaths(mp_paths, clone_map, max_is_right_quantile, max_is_right_quantile, true);
+ }
+ DebugOutputPaths(gp, output_dir, mp_paths, "mp_final_paths");
+ DEBUG("Paths are grown with mate-pairs");
+
+//MP end
+
+//pe again
+    INFO("SUBSTAGE = polishing paths");
+ exspander_stage = PathExtendStage::FinalizingPEStage;
+ all_libs.clear();
+ all_libs = MakeAllExtenders(exspander_stage, gp, clone_map, pset, main_unique_storage);
+ max_is_right_quantile = FindOverlapLenForStage(exspander_stage);
+ shared_ptr<CompositeExtender> last_extender = make_shared<CompositeExtender>(gp.g, clone_map, all_libs,
+ max_is_right_quantile, main_unique_storage,
+ cfg::get().pe_params.param_set.extension_options.max_repeat_length);
+
+ auto last_paths = resolver.extendSeeds(mp_paths, *last_extender);
+ DebugOutputPaths(gp, output_dir, last_paths, "mp2_before_overlap");
+
+ exspander_stage = PathExtendStage::FinalPolishing;
+ all_libs = MakeAllExtenders(exspander_stage, gp, clone_map, pset, main_unique_storage);
+ last_extender = make_shared<CompositeExtender>(gp.g, clone_map, all_libs,
+ max_is_right_quantile, main_unique_storage,
+ cfg::get().pe_params.param_set.extension_options.max_repeat_length);
+ if (!is_2015_scaffolder_enabled(sc_mode)) {
+ FinalizePaths(last_paths, clone_map, min_edge_len, max_is_right_quantile);
+ DebugOutputPaths(gp, output_dir, last_paths, "mp2_before_traverse");
+ }
+
+ TraverseLoops(last_paths, clone_map, last_extender);
+ FinalizePaths(last_paths, clone_map, min_edge_len, max_is_right_quantile);
+
+//result
+ if (broken_contigs.is_initialized()) {
+ OutputBrokenScaffolds(last_paths, (int) gp.g.k(), writer, output_dir + broken_contigs.get());
+ }
+ debruijn_graph::GenomeConsistenceChecker genome_checker (gp, main_unique_storage, 1000, 0.2);
+ DebugOutputPaths(gp, output_dir, last_paths, "mp2_final_paths");
+ writer.OutputPaths(last_paths, output_dir + contigs_name);
+ if (gp.genome.size() > 0)
+ CountMisassembliesWithReference(genome_checker, last_paths);
+ //FinalizeUniquenessPaths();
+
+//TODO:: destructor?
+ last_paths.DeleteAllPaths();
+ seeds.DeleteAllPaths();
+ mp_paths.DeleteAllPaths();
+ clone_paths.DeleteAllPaths();
+
+ INFO("ExSPAnder repeat resolving tool finished");
+}
+
+} /* path_extend */
+
+
+
+#endif /* PATH_EXTEND_LAUNCH_HPP_ */
diff --git a/src/modules/algorithms/path_extend/path_extender.hpp b/src/modules/algorithms/path_extend/path_extender.hpp
new file mode 100644
index 0000000..628a3ab
--- /dev/null
+++ b/src/modules/algorithms/path_extend/path_extender.hpp
@@ -0,0 +1,1458 @@
+//***************************************************************************
+//* Copyright (c) 2011-2014 Saint-Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//****************************************************************************
+
+/*
+ * path_extender.hpp
+ *
+ * Created on: Mar 5, 2012
+ * Author: andrey
+ */
+
+#pragma once
+
+
+#include "extension_chooser.hpp"
+#include "path_filter.hpp"
+#include "overlap_analysis.hpp"
+#include "assembly_graph/graph_support/scaff_supplementary.hpp"
+#include <cmath>
+
+
+namespace path_extend {
+
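+// Base class for short loop resolution strategies: UndoCycles() strips loop
+// iterations already present at the end of a path, MakeCycleStep() appends one
+// more traversal of the loop edge.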
+class ShortLoopResolver {
+public:
+ ShortLoopResolver(const Graph& g)
+ : g_(g) { }
+
+ virtual ~ShortLoopResolver() { }
+
+ virtual void ResolveShortLoop(BidirectionalPath& path) const = 0;
+
+protected:
+ DECL_LOGGER("PathExtender")
+ const Graph& g_;
+
+ void UndoCycles(BidirectionalPath& p, EdgeId next_edge) const {
+ if (p.Size() <= 2) {
+ return;
+ }
+ EdgeId first_edge = p.Back();
+ EdgeId second_edge = next_edge;
+ while (p.Size() > 2) {
+ if (p.At(p.Size() - 1) == first_edge && p.At(p.Size() - 2) == second_edge) {
+ p.PopBack(2);
+ } else {
+                return;
+ }
+ }
+ }
+
+ void MakeCycleStep(BidirectionalPath& path, EdgeId e) const {
+ if (path.Size() == 0) {
+ return;
+ }
+ EdgeId pathEnd = path.Back();
+ path.PushBack(e);
+ path.PushBack(pathEnd);
+ }
+};
+
+class CovShortLoopResolver : public ShortLoopResolver {
+public:
+ CovShortLoopResolver(const conj_graph_pack& gp)
+ : ShortLoopResolver(gp.g), gp_(gp) {
+
+ }
+
+ void ResolveShortLoop(BidirectionalPath& path) const override {
+ DEBUG("resolve short loop by coverage");
+ path.Print();
+
+ pair<EdgeId, EdgeId> edges;
+ if (path.Size() >= 1 && GetLoopAndExit(g_, path.Back(), edges)) {
+ DEBUG("Coverage Short Loop Resolver");
+ UndoCycles(path, edges.first);
+ EdgeId e1 = path.Back();
+ EdgeId e2 = edges.first;
+ EdgeId e_out = edges.second;
+ auto prob_e_in = g_.IncomingEdges(g_.EdgeEnd(e2));
+ EdgeId e_in = *prob_e_in.begin();
+ size_t count = 0;
+ for (auto edge = prob_e_in.begin(); edge != prob_e_in.end(); ++edge) {
+ if (*edge != e2)
+ e_in = *edge;
+ count++;
+ }
+ if (count != 2) {
+ return;
+ }
+ double in_cov = gp_.flanking_cov.GetOutCov(e_in); //g_.coverage(e_in);
+ double out_cov = gp_.flanking_cov.GetInCov(e_out); //g_.coverage(e_out);
+ double cov = (in_cov + out_cov) / 2.0;
+ double time1 = math::round(gp_.flanking_cov.GetInCov(e1) / cov);//math::round(gp_.g.coverage(e1) / cov);
+ double time2 = math::round(gp_.flanking_cov.GetInCov(e2) / cov);////math::round(gp_.g.coverage(e2) / cov);
+ size_t time = (size_t) std::max(0.0, std::min(time1 - 1.0, time2));
+ for (size_t i = 0; i < time; ++i) {
+ MakeCycleStep(path, edges.first);
+ }
+ path.PushBack(edges.second);
+ DEBUG("loop with start " << g_.int_id(e_in)
+ <<" e1 " << g_.int_id(e1)
+ << " e2 " << g_.int_id(e2)
+ << " out " <<g_.int_id(e_out)
+ << " cov in = " << in_cov
+ << " cov out " << out_cov
+ << " cov " << cov
+ << " cov e1 = " << gp_.g.coverage(e1)
+ << " cov e2 = " << gp_.g.coverage(e2)
+ << " time1 = " << time1
+ << " time2 = " << time2
+ << " time = " << time);
+ }
+ }
+private:
+ const conj_graph_pack& gp_;
+};
+
+class SimpleLoopResolver : public ShortLoopResolver {
+
+public:
+ SimpleLoopResolver(Graph& g) : ShortLoopResolver(g) { }
+
+ void ResolveShortLoop(BidirectionalPath& path) const override {
+ pair<EdgeId, EdgeId> edges;
+ if (path.Size() >= 1 && GetLoopAndExit(g_, path.Back(), edges)) {
+ DEBUG("Resolving short loop...");
+ EdgeId e = path.Back();
+ path.PushBack(edges.first);
+ path.PushBack(e);
+ path.PushBack(edges.second);
+ DEBUG("Resolving short loop done");
+ }
+ }
+
+protected:
+ DECL_LOGGER("PathExtender")
+};
+
+class LoopResolver : public ShortLoopResolver {
+ static const size_t ITER_COUNT = 10;
+ const WeightCounter& wc_;
+
+public:
+ LoopResolver(const Graph& g, const WeightCounter& wc)
+ : ShortLoopResolver(g),
+ wc_(wc) { }
+
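+    // Chooses how many times to traverse the loop (up to ITER_COUNT) by
+    // maximizing the paired-info weight of the exit edge, then appends the
+    // chosen number of loop iterations followed by the exit edge.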
+ void MakeBestChoice(BidirectionalPath& path, pair<EdgeId, EdgeId>& edges) const {
+ UndoCycles(path, edges.first);
+ BidirectionalPath experiment(path);
+ double max_weight = wc_.CountWeight(experiment, edges.second);
+ double diff = max_weight - wc_.CountWeight(experiment, edges.first);
+ size_t maxIter = 0;
+ for (size_t i = 1; i <= ITER_COUNT; ++i) {
+ double weight = wc_.CountWeight(experiment, edges.first);
+ if (weight > 0) {
+ MakeCycleStep(experiment, edges.first);
+ weight = wc_.CountWeight(experiment, edges.second);
+ double weight2 = wc_.CountWeight(experiment, edges.first);
+ if (weight > max_weight || (weight == max_weight && weight - weight2 > diff)
+ || (weight == max_weight && weight - weight2 == diff && i == 1)) {
+ max_weight = weight;
+ maxIter = i;
+ diff = weight - weight2;
+ }
+ }
+ }
+ for (size_t i = 0; i < maxIter; ++i) {
+ MakeCycleStep(path, edges.first);
+ }
+ path.PushBack(edges.second);
+ }
+
+ void ResolveShortLoop(BidirectionalPath& path) const override {
+ pair<EdgeId, EdgeId> edges;
+        if (path.Size() >= 1 && GetLoopAndExit(g_, path.Back(), edges)) {
+ DEBUG("Resolving short loop...");
+ MakeBestChoice(path, edges);
+ DEBUG("Resolving short loop done");
+ }
+ }
+};
+
+class GapJoiner {
+
+public:
+ static const int INVALID_GAP = -1000000;
+ GapJoiner(const Graph& g)
+ : g_(g) { }
+
+ virtual Gap FixGap( EdgeId source, EdgeId sink, int initial_gap) const = 0;
+
+ virtual ~GapJoiner() { }
+protected:
+ const Graph& g_;
+};
+
+class SimpleGapJoiner : public GapJoiner {
+
+public:
+ SimpleGapJoiner(const Graph& g) : GapJoiner(g) { }
+
+ Gap FixGap(EdgeId source, EdgeId sink, int initial_gap) const override {
+ if (initial_gap > 2 * (int) g_.k()) {
+ return Gap(initial_gap);
+ }
+ for (int l = (int) g_.k(); l > 0; --l) {
+ if (g_.EdgeNucls(sink).Subseq(g_.length(source) + g_.k() - l) == g_.EdgeNucls(sink).Subseq(0, l)) {
+ DEBUG("Found correct gap length");
+                DEBUG("Initial: " << initial_gap << ", new gap: " << g_.k() - l);
+ return Gap((int) g_.k() - l);
+ }
+ }
+        DEBUG("Perfect overlap is not found, initial: " << initial_gap);
+ return Gap(initial_gap);
+ }
+};
+
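+// Estimates gaps by comparing the source suffix with the sink prefix over a
+// range of candidate overlap lengths and scoring each candidate by Hamming
+// identity; the best candidate above min_gap_score defines the fixed gap.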
+class HammingGapJoiner: public GapJoiner {
+ const double min_gap_score_;
+ const size_t short_overlap_threshold_;
+ const size_t basic_overlap_length_;
+
+ vector<size_t> DiffPos(const Sequence& s1, const Sequence& s2) const {
+ VERIFY(s1.size() == s2.size());
+ vector < size_t > answer;
+ for (size_t i = 0; i < s1.size(); ++i)
+ if (s1[i] != s2[i])
+ answer.push_back(i);
+ return answer;
+ }
+
+ size_t HammingDistance(const Sequence& s1, const Sequence& s2) const {
+ VERIFY(s1.size() == s2.size());
+ size_t dist = 0;
+ for (size_t i = 0; i < s1.size(); ++i) {
+ if (s1[i] != s2[i]) {
+ dist++;
+ }
+ }
+ return dist;
+ }
+
+// double ScoreGap(const Sequence& s1, const Sequence& s2, int gap, int initial_gap) const {
+// VERIFY(s1.size() == s2.size());
+// return 1.0 - (double) HammingDistance(s1, s2) / (double) s1.size()
+// - (double) abs(gap - initial_gap) / (double) (2 * g_.k());
+// }
+
+
+ double ScoreGap(const Sequence& s1, const Sequence& s2) const {
+ VERIFY(s1.size() == s2.size());
+ return 1.0 - (double) HammingDistance(s1, s2) / (double) s1.size();
+ }
+
+public:
+
+ //todo review parameters in usages
+ HammingGapJoiner(const Graph& g,
+ double min_gap_score,
+ size_t short_overlap_threshold,
+ size_t basic_overlap_length):
+ GapJoiner(g),
+ min_gap_score_(min_gap_score),
+ short_overlap_threshold_(short_overlap_threshold),
+ basic_overlap_length_(basic_overlap_length)
+ {
+ DEBUG("HammingGapJoiner params: \n min_gap_score " << min_gap_score_ <<
+ "\n short_overlap_threshold " << short_overlap_threshold_ <<
+ "\n basic_overlap_length " << basic_overlap_length_);
+ }
+
+ //estimated_gap is in k-mers
+ Gap FixGap(EdgeId source, EdgeId sink, int estimated_gap) const override {
+
+ size_t corrected_start_overlap = basic_overlap_length_;
+ if (estimated_gap < 0) {
+ corrected_start_overlap -= estimated_gap;
+ }
+
+ corrected_start_overlap = min(corrected_start_overlap,
+ g_.k() + min(g_.length(source), g_.length(sink)));
+
+ DEBUG("Corrected max overlap " << corrected_start_overlap);
+
+ double best_score = min_gap_score_;
+ int fixed_gap = INVALID_GAP;
+
+ double overlap_coeff = 0.3;
+ size_t min_overlap = 1ul;
+ if (estimated_gap < 0) {
+ size_t estimated_overlap = g_.k() - estimated_gap;
+ min_overlap = max(size_t(math::round(overlap_coeff * double(estimated_overlap))), 1ul);
+ }
+ //todo better usage of estimated overlap
+ DEBUG("Min overlap " << min_overlap);
+
+ for (size_t l = corrected_start_overlap; l >= min_overlap; --l) {
+ //TRACE("Sink: " << g_.EdgeNucls(sink).Subseq(g_.length(sink) + g_.k() - l).str());
+ //TRACE("Source: " << g_.EdgeNucls(source).Subseq(0, l));
+ double score = 0;
+ score = ScoreGap(g_.EdgeNucls(source).Subseq(g_.length(source) + g_.k() - l),
+ g_.EdgeNucls(sink).Subseq(0, l));
+ if (math::gr(score, best_score)) {
+ TRACE("Curr overlap " << l);
+ TRACE("Score: " << score);
+ best_score = score;
+ fixed_gap = int(g_.k() - l);
+ }
+
+ if (l == short_overlap_threshold_ && fixed_gap != INVALID_GAP) {
+ //look at "short" overlaps only if long overlaps couldn't be found
+ DEBUG("Not looking at short overlaps");
+ break;
+ }
+ }
+
+ if (fixed_gap != INVALID_GAP) {
+ DEBUG("Found candidate gap length with score " << best_score);
+ DEBUG("Estimated gap: " << estimated_gap <<
+ ", fixed gap: " << fixed_gap << " (overlap " << g_.k() - fixed_gap<< ")");
+ }
+ return Gap(fixed_gap);
+ }
+
+private:
+ DECL_LOGGER("HammingGapJoiner");
+};
+
+//deprecated!
+//fixme reduce code duplication with HammingGapJoiner
+class LikelihoodHammingGapJoiner: public GapJoiner {
+ static const size_t DEFAULT_PADDING_LENGTH = 10;
+ const double min_gap_score_;
+ const size_t short_overlap_threshold_;
+ const size_t basic_overlap_length_;
+
+ vector<size_t> DiffPos(const Sequence& s1, const Sequence& s2) const {
+ VERIFY(s1.size() == s2.size());
+ vector < size_t > answer;
+ for (size_t i = 0; i < s1.size(); ++i)
+ if (s1[i] != s2[i])
+ answer.push_back(i);
+ return answer;
+ }
+
+ size_t HammingDistance(const Sequence& s1, const Sequence& s2) const {
+ VERIFY(s1.size() == s2.size());
+ size_t dist = 0;
+ for (size_t i = 0; i < s1.size(); ++i) {
+ if (s1[i] != s2[i]) {
+ dist++;
+ }
+ }
+ return dist;
+ }
+
+// double ScoreGap(const Sequence& s1, const Sequence& s2, int gap, int initial_gap) const {
+// VERIFY(s1.size() == s2.size());
+// return 1.0 - (double) HammingDistance(s1, s2) / (double) s1.size()
+// - (double) abs(gap - initial_gap) / (double) (2 * g_.k());
+// }
+
+ //FIXME use GC content, change match prob and use partition of tip sequence into bad and good part
+ double ScoreGap(const Sequence& s1, const Sequence& s2) const {
+ static double match_prob = 0.9;
+ static double log_match_prob = log2(match_prob);
+ static double log_mismatch_prob = log2(1. - match_prob);
+ VERIFY(s1.size() == s2.size());
+ size_t n = s1.size();
+ size_t mismatches = HammingDistance(s1, s2);
+ VERIFY(mismatches <= n);
+ return 2.*double(n) + double(n - mismatches) * log_match_prob + double(mismatches) * log_mismatch_prob;
+ }
+
+public:
+
+ //todo review parameters in usages
+ LikelihoodHammingGapJoiner(const Graph& g,
+ double min_gap_score,
+ size_t short_overlap_threshold,
+ size_t basic_overlap_length):
+ GapJoiner(g),
+ min_gap_score_(min_gap_score),
+ short_overlap_threshold_(short_overlap_threshold),
+ basic_overlap_length_(basic_overlap_length)
+ {
+ DEBUG("LikelihoodHammingGapJoiner params: \n min_gap_score " << min_gap_score_ <<
+ "\n short_overlap_threshold " << short_overlap_threshold_ <<
+ "\n basic_overlap_length " << basic_overlap_length_);
+ }
+
+ //estimated_gap is in k-mers
+ Gap FixGap(EdgeId source, EdgeId sink, int estimated_gap) const override {
+
+ size_t corrected_start_overlap = basic_overlap_length_;
+ if (estimated_gap < 0) {
+ corrected_start_overlap -= estimated_gap;
+ }
+
+ corrected_start_overlap = min(corrected_start_overlap,
+ g_.k() + min(g_.length(source), g_.length(sink)));
+
+ DEBUG("Corrected max overlap " << corrected_start_overlap);
+
+ double best_score = min_gap_score_;
+ int fixed_gap = INVALID_GAP;
+
+ double overlap_coeff = 0.3;
+ size_t min_overlap = 1ul;
+ if (estimated_gap < 0) {
+ size_t estimated_overlap = g_.k() - estimated_gap;
+ min_overlap = max(size_t(math::round(overlap_coeff * double(estimated_overlap))), 1ul);
+ }
+ //todo better usage of estimated overlap
+ DEBUG("Min overlap " << min_overlap);
+
+ for (size_t l = corrected_start_overlap; l >= min_overlap; --l) {
+ //TRACE("Sink: " << g_.EdgeNucls(sink).Subseq(g_.length(sink) + g_.k() - l).str());
+ //TRACE("Source: " << g_.EdgeNucls(source).Subseq(0, l));
+ double score = 0;
+ score = ScoreGap(g_.EdgeNucls(source).Subseq(g_.length(source) + g_.k() - l),
+ g_.EdgeNucls(sink).Subseq(0, l));
+ if (math::gr(score, best_score)) {
+ TRACE("Curr overlap " << l);
+ TRACE("Score: " << score);
+ best_score = score;
+ fixed_gap = int(g_.k() - l);
+ }
+
+ if (l == short_overlap_threshold_ && fixed_gap != INVALID_GAP) {
+ //look at "short" overlaps only if long overlaps couldn't be found
+ DEBUG("Not looking at short overlaps");
+ break;
+ }
+ }
+
+ if (fixed_gap != INVALID_GAP) {
+ DEBUG("Found candidate gap length with score " << best_score);
+ DEBUG("Estimated gap: " << estimated_gap <<
+ ", fixed gap: " << fixed_gap << " (overlap " << g_.k() - fixed_gap<< ")");
+ }
+ return Gap(fixed_gap);
+ }
+
+private:
+ DECL_LOGGER("LikelihoodHammingGapJoiner");
+};
+
+//if I was in LA
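+// Gap joiner based on local overlap alignment (SWOverlapAnalyzer) of the source
+// suffix against the sink prefix; alignments that are too short, have low
+// identity or leave overly long unaligned flanks are rejected.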
+class LAGapJoiner: public GapJoiner {
+public:
+ LAGapJoiner(const Graph& g, size_t min_la_length,
+ double flank_multiplication_coefficient,
+ double flank_addition_coefficient) :
+ GapJoiner(g), min_la_length_(min_la_length), flank_addition_coefficient_(
+ flank_addition_coefficient), flank_multiplication_coefficient_(
+ flank_multiplication_coefficient) {
+        DEBUG("flank_multiplication_coefficient - " << flank_multiplication_coefficient_);
+        DEBUG("flank_addition_coefficient_ - " << flank_addition_coefficient_);
+ }
+
+ Gap FixGap(EdgeId source, EdgeId sink, int initial_gap) const override {
+
+ DEBUG("Overlap doesn't exceed " << size_t(abs(initial_gap) * ESTIMATED_GAP_MULTIPLIER) + GAP_ADDITIONAL_COEFFICIENT);
+ SWOverlapAnalyzer overlap_analyzer(
+ size_t(abs(initial_gap) * ESTIMATED_GAP_MULTIPLIER) + GAP_ADDITIONAL_COEFFICIENT);
+
+ auto overlap_info = overlap_analyzer.AnalyzeOverlap(g_, source,
+ sink);
+
+ DEBUG(overlap_info);
+
+ if (overlap_info.size() < min_la_length_) {
+ DEBUG("Low alignment size");
+ return Gap(INVALID_GAP);
+ }
+
+ size_t max_flank_length = max(overlap_info.r2.start_pos,
+ g_.length(source) + g_.k() - overlap_info.r1.end_pos);
+ DEBUG("Max flank length - " << max_flank_length);
+
+ if ((double) max_flank_length * flank_multiplication_coefficient_
+ + flank_addition_coefficient_ > overlap_info.size()) {
+ DEBUG("Too long flanks for such alignment");
+ return Gap(INVALID_GAP);
+ }
+
+ if (overlap_info.identity() < IDENTITY_RATIO) {
+ DEBUG("Low identity score");
+ return Gap(INVALID_GAP);
+ }
+
+ if ((g_.length(source) + g_.k()) - overlap_info.r1.end_pos > g_.length(source)) {
+ DEBUG("Save kmers. Don't want to have edges shorter than k");
+ return Gap(INVALID_GAP);
+ }
+
+ if (overlap_info.r2.start_pos > g_.length(sink)) {
+ DEBUG("Save kmers. Don't want to have edges shorter than k");
+ return Gap(INVALID_GAP);
+ }
+
+ return Gap(
+ (int) (-overlap_info.r1.size() - overlap_info.r2.start_pos
+ + g_.k()),
+ (uint32_t) (g_.length(source) + g_.k()
+ - overlap_info.r1.end_pos),
+ (uint32_t) overlap_info.r2.start_pos);
+ }
+
+private:
+ DECL_LOGGER("LAGapJoiner");
+ const size_t min_la_length_;
+ const double flank_addition_coefficient_;
+ const double flank_multiplication_coefficient_;
+ constexpr static double IDENTITY_RATIO = 0.9;
+ constexpr static double ESTIMATED_GAP_MULTIPLIER = 2.0;
+ const size_t GAP_ADDITIONAL_COEFFICIENT = 30;
+};
+
+
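+//Delegates gap fixing to a list of joiners, in order, and returns the first valid
+//result. If no joiner succeeds, the connection is either rejected (estimated gap
+//below must_overlap_threshold_) or replaced by an artificial gap of at least
+//k + artificial_gap_.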
+class CompositeGapJoiner: public GapJoiner {
+public:
+
+ CompositeGapJoiner(const Graph& g,
+ const vector<shared_ptr<GapJoiner>>& joiners,
+ size_t may_overlap_threshold,
+ int must_overlap_threshold,
+ size_t artificial_gap) :
+ GapJoiner(g),
+ joiners_(joiners),
+ may_overlap_threshold_(may_overlap_threshold),
+ must_overlap_threshold_(must_overlap_threshold),
+ artificial_gap_(artificial_gap)
+ { }
+
+ Gap FixGap(EdgeId source, EdgeId sink, int estimated_gap) const override {
+ DEBUG("Trying to fix estimated gap " << estimated_gap <<
+ " between " << g_.str(source) << " and " << g_.str(sink));
+
+ if (estimated_gap > int(g_.k() + may_overlap_threshold_)) {
+ DEBUG("Edges are supposed to be too far to check overlaps");
+ return Gap(estimated_gap);
+ }
+
+ for (auto joiner : joiners_) {
+ Gap gap = joiner->FixGap(source, sink, estimated_gap);
+ if (gap.gap_ != GapJoiner::INVALID_GAP) {
+ return gap;
+ }
+ }
+
+ //couldn't find decent overlap
+ if (estimated_gap < must_overlap_threshold_) {
+ DEBUG("Estimated gap looks unreliable");
+ return Gap(INVALID_GAP);
+ } else {
+ DEBUG("Overlap was not found");
+ return Gap(max(estimated_gap, int(g_.k() + artificial_gap_)));
+ }
+ }
+
+private:
+ vector<shared_ptr<GapJoiner>> joiners_;
+ const size_t may_overlap_threshold_;
+ const int must_overlap_threshold_;
+ const size_t artificial_gap_;
+
+ DECL_LOGGER("CompositeGapJoiner");
+};
+
+//FIXME move to tests
+//For testing only; see overlap_analysis_tests.
+inline Gap MimicLAGapJoiner(Sequence& s1, Sequence& s2) {
+ const int INVALID_GAP = -1000000;
+ constexpr static double IDENTITY_RATIO = 0.9;
+
+ SWOverlapAnalyzer overlap_analyzer_(10000);
+ auto overlap_info = overlap_analyzer_.AnalyzeOverlap(s1, s2);
+ size_t min_la_length_ = 4;
+ if (overlap_info.size() < min_la_length_) {
+ DEBUG("Low alignment size");
+ return Gap(INVALID_GAP);
+ }
+ if (overlap_info.identity() < IDENTITY_RATIO) {
+ DEBUG("Low identity score");
+ return Gap(INVALID_GAP);
+ }
+ std::cout << overlap_info;
+
+ return Gap(
+ (int) (-overlap_info.r1.size() - overlap_info.r2.start_pos),
+ (uint32_t) (s1.size() - overlap_info.r1.end_pos),
+ (uint32_t) overlap_info.r2.start_pos);
+}
+
+
+//Detects a cycle: the minimal suffix longer than IS occurs earlier in the path. Overlap is allowed.
+class InsertSizeLoopDetector {
+protected:
+ const Graph& g_;
+ const GraphCoverageMap& cov_map_;
+ size_t min_cycle_len_;
+
+public:
+ InsertSizeLoopDetector(const Graph& g, const GraphCoverageMap& cov_map, size_t is): g_(g), cov_map_(cov_map), min_cycle_len_(is) {
+ }
+
+ size_t GetMinCycleLenth() const {
+ return min_cycle_len_;
+ }
+
+ bool CheckCycledNonIS(const BidirectionalPath& path) const {
+ if (path.Size() <= 2) {
+ return false;
+ }
+ BidirectionalPath last = path.SubPath(path.Size() - 2);
+ int pos = path.FindFirst(last);
+ VERIFY(pos >= 0);
+ return size_t(pos) != path.Size() - 2;
+ }
+
+ bool CheckCycled(const BidirectionalPath& path) const {
+ return FindCycleStart(path) != -1;
+ }
+//Returns the start position of the shortest suffix not shorter than min_cycle_len (-1 if none).
+ int FindPosIS(const BidirectionalPath& path) const {
+ int i = (int) path.Size() - 1;
+ while (i >= 0 && path.LengthAt(i) < min_cycle_len_) {
+ --i;
+ }
+ return i;
+ }
+ int FindCycleStart(const BidirectionalPath& path) const {
+ TRACE("Looking for IS cycle " << min_cycle_len_);
+ int i = FindPosIS(path);
+ TRACE("last is pos " << i);
+ if (i < 0) return -1;
+//Tail
+ BidirectionalPath last = path.SubPath(i);
+ //last.Print();
+
+ int pos = path.FindFirst(last);
+// not a cycle
+ if (pos == i) pos = -1;
+ TRACE("looking for 1st IS cycle " << pos);
+ return pos;
+ }
+
+//After a cycle has been detected, removes the minimal suffix longer than IS.
+//Returns the beginning of the cycle.
+ int RemoveCycle(BidirectionalPath& path) const {
+ int pos = FindCycleStart(path);
+ DEBUG("Found IS cycle " << pos);
+ if (pos == -1) {
+ return -1;
+ }
+
+ int last_edge_pos = FindPosIS(path);
+ VERIFY(last_edge_pos > -1);
+ DEBUG("last edge pos " << last_edge_pos);
+ VERIFY(last_edge_pos > pos);
+ for (int i = (int) path.Size() - 1; i >= last_edge_pos; --i) {
+ path.PopBack();
+ }
+ VERIFY((int) path.Size() == last_edge_pos);
+ VERIFY(pos < (int) path.Size());
+ DEBUG("result pos " <<pos);
+ return pos;
+ }
+};
+
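+//Remembers already processed paths and, for a given path, finds a previously seen
+//covering path that shares a suffix longer than repeat_len_ with it (a candidate
+//repeat). Returns an empty path if no such covering path exists.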
+class RepeatDetector {
+public:
+ RepeatDetector(const Graph& g, const GraphCoverageMap& cov_map, size_t max_repeat_len)
+ : g_(g),
+ cov_map_(cov_map),
+ used_paths_(),
+ repeat_len_(max_repeat_len){
+ empty_ = new BidirectionalPath(g_);
+ }
+ ~RepeatDetector() {
+ delete empty_;
+ }
+
+ BidirectionalPath* RepeatPath(const BidirectionalPath& p) {
+ if (p.Size() == 0) {
+ return empty_;
+ }
+ EdgeId last_e = p.Back();
+ BidirectionalPathSet cov_paths = cov_map_.GetCoveringPaths(last_e);
+ DEBUG("cov paths for e " << g_.int_id(last_e) << " size " << cov_paths.size());
+ size_t max_common_size = 0;
+ BidirectionalPath* result_p = empty_;
+ for (BidirectionalPath* cov_p : cov_paths) {
+ if (used_paths_.find(cov_p) == used_paths_.end() || cov_p == &p || cov_p == p.GetConjPath()) {
+ continue;
+ }
+ size_t common_size = MaxCommonSize(p, *cov_p);
+ DEBUG("max comon size with path " << cov_p->GetId() << " is " << common_size);
+ if (common_size == 0) {
+ continue;
+ }
+ VERIFY(common_size <= p.Size());
+ if (p.LengthAt(p.Size() - common_size) > repeat_len_) {
+ DEBUG("repeat from " << (p.Size() - common_size) << " length " << p.LengthAt(p.Size() - common_size) << " repeat length " << repeat_len_);
+ max_common_size = max(common_size, max_common_size);
+ result_p = cov_p;
+ }
+ }
+ used_paths_.insert(&p);
+ DEBUG("max common size " << max_common_size);
+ return result_p;
+ }
+ size_t MaxCommonSize(const BidirectionalPath& p1, const BidirectionalPath& p2) const {
+ DEBUG("max coomon size ")
+ EdgeId last_e = p1.Back();
+ vector<size_t> positions2 = p2.FindAll(last_e);
+ DEBUG("pos size " << positions2.size())
+ size_t max_common_size = 0;
+ for (size_t pos2 : positions2) {
+ size_t common_size = MaxCommonSize(p1, p1.Size() - 1, p2, pos2);
+ DEBUG("max common size from " << pos2 << " is " << common_size);
+ max_common_size = max(max_common_size, common_size);
+ }
+ return max_common_size;
+ }
+private:
+ size_t MaxCommonSize(const BidirectionalPath& p1, size_t pos1, const BidirectionalPath& p2, size_t pos2) const {
+ int i1 = (int) pos1;
+ int i2 = (int) pos2;
+ while (i1 >= 0 && i2 >= 0 &&
+ p1.At((size_t) i1) == p2.At((size_t) i2) &&
+ p1.GapAt((size_t) i1) == p2.GapAt((size_t) i2)) {
+ i1--;
+ i2--;
+ }
+ if (i1 >= 0 && i2 >= 0 && p1.At((size_t) i1) == p2.At((size_t) i2)) {
+ i1--;
+ i2--;
+ }
+
+ VERIFY(i1 <= (int)pos1);
+ return std::max(size_t((int) pos1 - i1), (size_t)1);
+ }
+ const Graph& g_;
+ const GraphCoverageMap& cov_map_;
+ set<const BidirectionalPath*> used_paths_;
+ size_t repeat_len_;
+ BidirectionalPath* empty_;
+};
+
+class ContigsMaker {
+public:
+ ContigsMaker(const Graph & g)
+ : g_(g) { }
+
+ virtual ~ContigsMaker() { }
+
+ virtual void GrowPath(BidirectionalPath& path, PathContainer* paths_storage = nullptr) = 0;
+
+ virtual void GrowPathSimple(BidirectionalPath& path, PathContainer* paths_storage = nullptr) = 0;
+
+ virtual void GrowAll(PathContainer & paths, PathContainer& paths_storage) = 0;
+
+protected:
+ const Graph& g_;
+ DECL_LOGGER("PathExtender")
+};
+
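+//Tracks unique edges (and their conjugates) that have already been placed into
+//paths, so that 2015 scaffolding modes do not reuse them.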
+struct UsedUniqueStorage {
+ set<EdgeId> used_;
+
+ const ScaffoldingUniqueEdgeStorage& unique_;
+ void insert(EdgeId e) {
+ if (unique_.IsUnique(e)) {
+ used_.insert(e);
+ used_.insert(e->conjugate());
+ }
+ }
+ bool IsUsedAndUnique(EdgeId e) {
+ return (unique_.IsUnique(e) && used_.find(e) != used_.end());
+ }
+ UsedUniqueStorage(const ScaffoldingUniqueEdgeStorage& unique): used_(), unique_(unique) {}
+};
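+
+//Base class for a single path-growing strategy: MakeGrowStep() tries to extend
+//the given path by one step and reports whether it succeeded.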
+class PathExtender {
+public:
+ PathExtender(const Graph & g): g_(g){ }
+
+ virtual ~PathExtender() { }
+
+ virtual bool MakeGrowStep(BidirectionalPath& path, PathContainer* paths_storage = nullptr) = 0;
+
+ void AddUniqueEdgeStorage(shared_ptr<UsedUniqueStorage> used_storage) {
+ used_storage_ = used_storage;
+ }
+protected:
+ const Graph& g_;
+ shared_ptr<UsedUniqueStorage> used_storage_;
+ DECL_LOGGER("PathExtender")
+};
+
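+//Grows paths by asking each registered PathExtender in turn to make a grow step;
+//optionally detects repeats online via RepeatDetector and resolves them by
+//redistributing the repeated suffix between the two involved paths.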
+class CompositeExtender : public ContigsMaker {
+public:
+ CompositeExtender(Graph & g, GraphCoverageMap& cov_map, size_t max_diff_len, size_t max_repeat_length)
+ : ContigsMaker(g),
+ cover_map_(cov_map),
+ repeat_detector_(g, cover_map_, 2 * max_repeat_length),
+ extenders_(),
+ max_diff_len_(max_diff_len) {
+ }
+
+ CompositeExtender(Graph & g, GraphCoverageMap& cov_map, vector<shared_ptr<PathExtender> > pes, size_t max_diff_len, const ScaffoldingUniqueEdgeStorage& unique, size_t max_repeat_length)
+ : ContigsMaker(g),
+ cover_map_(cov_map),
+ repeat_detector_(g, cover_map_, 2 * max_repeat_length),
+ extenders_(),
+ max_diff_len_(max_diff_len) {
+ extenders_ = pes;
+ used_storage_ = make_shared<UsedUniqueStorage>(UsedUniqueStorage( unique));
+ for (auto ex: extenders_) {
+ ex->AddUniqueEdgeStorage(used_storage_);
+ }
+ }
+
+ void AddExtender(shared_ptr<PathExtender> pe) {
+ extenders_.push_back(pe);
+ pe->AddUniqueEdgeStorage(used_storage_);
+ }
+
+ void GrowAll(PathContainer& paths, PathContainer& result) override {
+ result.clear();
+ GrowAllPaths(paths, result);
+ LengthPathFilter filter(g_, 0);
+ filter.filter(result);
+ }
+
+ void GrowPath(BidirectionalPath& path, PathContainer* paths_storage) override {
+ while (MakeGrowStep(path, paths_storage)) { }
+ }
+
+ void GrowPathSimple(BidirectionalPath& path, PathContainer* paths_storage) override {
+ while (MakeGrowStep(path, paths_storage, false)) { }
+ }
+
+ bool MakeGrowStep(BidirectionalPath& path, PathContainer* paths_storage, bool detect_repeats_online = true) {
+ DEBUG("make grow step composite extender");
+ auto sc_mode = cfg::get().pe_params.param_set.sm;
+ if (is_2015_scaffolder_enabled(sc_mode) || cfg::get().mode == config::pipeline_type::meta) {
+ DEBUG("force switch off online repeats detect, 2015 on");
+ //FIXME disable for all!
+ detect_repeats_online = false;
+ }
+ if (detect_repeats_online) {
+ BidirectionalPath *repeat_path = repeat_detector_.RepeatPath(path);
+ size_t repeat_size = repeat_detector_.MaxCommonSize(path, *repeat_path);
+
+ if (repeat_size > 0) {
+ DEBUG("repeat with length " << repeat_size);
+ path.Print();
+ repeat_path->Print();
+ BidirectionalPath repeat = path.SubPath(path.Size() - repeat_size);
+ int begin_repeat = repeat_path->FindLast(repeat);
+ VERIFY(begin_repeat > -1);
+ size_t end_repeat = (size_t) begin_repeat + repeat_size;
+ DEBUG("not consistent subpaths ");
+ BidirectionalPath begin1 = path.SubPath(0, path.Size() - repeat_size);
+ begin1.Print();
+ BidirectionalPath begin2 = repeat_path->SubPath(0, begin_repeat);
+ begin2.Print();
+ int gap_in_repeat_path = repeat_path->GapAt(begin_repeat);
+ BidirectionalPath end2 = repeat_path->SubPath(end_repeat);
+ BidirectionalPath begin1_conj = path.SubPath(0, path.Size() - repeat_size + 1).Conjugate();
+ BidirectionalPath begin2_conj = repeat_path->SubPath(0, begin_repeat + 1).Conjugate();
+ pair<size_t, size_t> last = ComparePaths(0, 0, begin1_conj, begin2_conj, max_diff_len_);
+ DEBUG("last " << last.first << " last2 " << last.second);
+ path.Clear();
+ repeat_path->Clear();
+ int gap_len = repeat.GapAt(0);
+
+ if (begin2.Size() == 0 || last.second != 0) { //TODO: incorrect: common edges, but then different ends
+ path.PushBack(begin1);
+ repeat_path->PushBack(begin2);
+ } else {
+ gap_len = gap_in_repeat_path;
+ path.PushBack(begin2);
+ repeat_path->PushBack(begin1);
+ }
+
+ path.PushBack(repeat.At(0), gap_len);
+ path.PushBack(repeat.SubPath(1));
+ path.PushBack(end2);
+ DEBUG("new path");
+ path.Print();
+ return false;
+ }
+ }
+
+ size_t current = 0;
+ while (current < extenders_.size()) {
+ DEBUG("step " << current << " from " <<extenders_.size());
+ if (extenders_[current]->MakeGrowStep(path, paths_storage)) {
+ return true;
+ }
+ ++current;
+ }
+ return false;
+ }
+
+private:
+ GraphCoverageMap& cover_map_;
+ RepeatDetector repeat_detector_;
+ vector<shared_ptr<PathExtender> > extenders_;
+ size_t max_diff_len_;
+ shared_ptr<UsedUniqueStorage> used_storage_;
+
+ void SubscribeCoverageMap(BidirectionalPath * path) {
+ path->Subscribe(&cover_map_);
+ for (size_t i = 0; i < path->Size(); ++i) {
+ cover_map_.BackEdgeAdded(path->At(i), path, path->GapAt(i));
+ }
+ }
+
+ void GrowAllPaths(PathContainer& paths, PathContainer& result) {
+ cover_map_.Clear();
+ for (size_t i = 0; i < paths.size(); ++i) {
+ VERBOSE_POWER_T2(i, 100, "Processed " << i << " paths from " << paths.size() << " (" << i * 100 / paths.size() << "%)");
+ if (paths.size() > 10 && i % (paths.size() / 10 + 1) == 0) {
+ INFO("Processed " << i << " paths from " << paths.size() << " (" << i * 100 / paths.size() << "%)");
+ }
+//In 2015 modes, do not use a seed that contains a unique edge already used in other paths.
+ auto sc_mode = cfg::get().pe_params.param_set.sm;
+ if (sc_mode == sm_old_pe_2015 || sc_mode == sm_2015 || sc_mode == sm_combined) {
+ bool was_used = false;
+ for (size_t ind =0; ind < paths.Get(i)->Size(); ind++) {
+ EdgeId eid = paths.Get(i)->At(ind);
+ if (used_storage_->IsUsedAndUnique(eid)) {
+ was_used = true; break;
+ } else {
+ used_storage_->insert(eid);
+ }
+ }
+ if (was_used) {
+ DEBUG("skipping already used seed");
+ continue;
+ }
+ }
+//TODO: coverage_map should be exterminated
+ if (!cover_map_.IsCovered(*paths.Get(i))) {
+ BidirectionalPath * path = new BidirectionalPath(*paths.Get(i));
+ BidirectionalPath * conjugatePath = new BidirectionalPath(*paths.GetConjugate(i));
+ result.AddPair(path, conjugatePath);
+ SubscribeCoverageMap(path);
+ SubscribeCoverageMap(conjugatePath);
+ size_t count_trying = 0;
+ size_t current_path_len = 0;
+ do {
+ current_path_len = path->Length();
+ count_trying++;
+ GrowPath(*path, &result);
+ GrowPath(*conjugatePath, &result);
+ } while (count_trying < 10 && (path->Length() != current_path_len));
+ path->CheckConjugateEnd(cfg::get().max_repeat_length);
+ DEBUG("result path " << path->GetId());
+ path->Print();
+ }
+ }
+ }
+
+};
+
+//All path extenders inherit from this class.
+
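+//MakeGrowStep() first rejects paths that end in an already visited cycle, then
+//removes insert-size cycles, resolves short loops (by coverage or by paired info,
+//depending on use_short_loop_cov_resolver_), and otherwise delegates to
+//MakeSimpleGrowStep() of the concrete extender.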
+class LoopDetectingPathExtender : public PathExtender {
+
+protected:
+ size_t maxLoops_;
+ bool investigateShortLoops_;
+ bool use_short_loop_cov_resolver_;
+ CovShortLoopResolver cov_loop_resolver_;
+
+ vector<shared_ptr<BidirectionalPath> > visited_cycles_;
+ InsertSizeLoopDetector is_detector_;
+ const GraphCoverageMap& cov_map_;
+
+public:
+ LoopDetectingPathExtender(const conj_graph_pack& gp, const GraphCoverageMap& cov_map, size_t max_loops, bool investigateShortLoops,
+ bool use_short_loop_cov_resolver, size_t is)
+ : PathExtender(gp.g),
+ maxLoops_(max_loops),
+ investigateShortLoops_(investigateShortLoops),
+ use_short_loop_cov_resolver_(use_short_loop_cov_resolver),
+ cov_loop_resolver_(gp),
+ is_detector_(gp.g, cov_map, is),
+ cov_map_(cov_map) {
+
+ }
+
+ size_t getMaxLoops() const {
+ return maxLoops_;
+ }
+
+ bool isInvestigateShortLoops() const {
+ return investigateShortLoops_;
+ }
+
+ void setInvestigateShortLoops(bool investigateShortLoops) {
+ this->investigateShortLoops_ = investigateShortLoops;
+ }
+
+ void setMaxLoops(size_t maxLoops) {
+ if (maxLoops != 0) {
+ this->maxLoops_ = maxLoops;
+ }
+ }
+//seems that this is out of date
+ bool InExistingLoop(const BidirectionalPath& path) {
+ TRACE("Checking existing loops");
+ int j = 0;
+ for (auto cycle : visited_cycles_) {
+ VERBOSE_POWER2(j++, "checking ");
+ int pos = path.FindLast(*cycle);
+ if (pos == -1)
+ continue;
+
+ int start_cycle_pos = pos + (int) cycle->Size();
+ bool only_cycles_in_tail = true;
+ int last_cycle_pos = start_cycle_pos;
+ DEBUG("start_cycle pos "<< last_cycle_pos);
+ for (int i = start_cycle_pos; i < (int) path.Size() - (int) cycle->Size(); i += (int) cycle->Size()) {
+ if (!path.CompareFrom(i, *cycle)) {
+ only_cycles_in_tail = false;
+ break;
+ } else {
+ last_cycle_pos = i + (int) cycle->Size();
+ DEBUG("last cycle pos changed " << last_cycle_pos);
+ }
+ }
+ DEBUG("last_cycle_pos " << last_cycle_pos);
+ only_cycles_in_tail = only_cycles_in_tail && cycle->CompareFrom(0, path.SubPath(last_cycle_pos));
+ if (only_cycles_in_tail) {
+// seems that most of this is useless, checking
+ VERIFY (last_cycle_pos == start_cycle_pos);
+ DEBUG("find cycle " << last_cycle_pos);
+ DEBUG("path");
+ path.Print();
+ DEBUG("last subpath");
+ path.SubPath(last_cycle_pos).Print();
+ DEBUG("cycle");
+ cycle->Print();
+ DEBUG("last_cycle_pos " << last_cycle_pos << " path size " << path.Size());
+ VERIFY(last_cycle_pos <= (int)path.Size());
+ DEBUG("last cycle pos + cycle " << last_cycle_pos + (int)cycle->Size());
+ VERIFY(last_cycle_pos + (int)cycle->Size() >= (int)path.Size());
+
+ return true;
+ }
+ }
+ return false;
+ }
+
+ void AddCycledEdges(const BidirectionalPath& path, size_t pos) {
+ if (pos >= path.Size()) {
+ DEBUG("Wrong position in IS cycle");
+ return;
+ }
+ visited_cycles_.push_back(std::make_shared<BidirectionalPath>(path.SubPath(pos)));
+ DEBUG("add cycle");
+ path.SubPath(pos).Print();
+ }
+
+ bool DetectCycle(BidirectionalPath& path) {
+ DEBUG("detect cycle");
+ if (is_detector_.CheckCycled(path)) {
+ DEBUG("Checking IS cycle");
+ int loop_pos = is_detector_.RemoveCycle(path);
+ DEBUG("Removed IS cycle");
+ if (loop_pos != -1) {
+ AddCycledEdges(path, loop_pos);
+ return true;
+ }
+ }
+ return false;
+ }
+
+ bool DetectCycleScaffolding(BidirectionalPath& path) {
+ return is_detector_.CheckCycledNonIS(path);
+ }
+
+ virtual bool MakeSimpleGrowStep(BidirectionalPath& path, PathContainer* paths_storage = nullptr) = 0;
+
+ virtual bool ResolveShortLoopByCov(BidirectionalPath& path) = 0;
+
+ virtual bool ResolveShortLoopByPI(BidirectionalPath& path) = 0;
+
+ virtual bool CanInvestigateShortLoop() const {
+ return false;
+ }
+
+ bool MakeGrowStep(BidirectionalPath& path, PathContainer* paths_storage) override {
+ if (InExistingLoop(path)) {
+ DEBUG("in existing loop");
+ return false;
+ }
+ bool result = false;
+ LoopDetector loop_detector(&path, cov_map_);
+ if (DetectCycle(path)) {
+ result = false;
+ } else if (path.Size() >= 1 && InvestigateShortLoop() && loop_detector.EdgeInShortLoop(path.Back()) && use_short_loop_cov_resolver_) {
+ DEBUG("edge in short loop");
+ result = ResolveShortLoop(path);
+ } else if (InvestigateShortLoop() && loop_detector.PrevEdgeInShortLoop() && use_short_loop_cov_resolver_) {
+ DEBUG("Prev edge in short loop");
+ path.PopBack();
+ result = ResolveShortLoop(path);
+ } else {
+ DEBUG("Making step");
+ result = MakeSimpleGrowStep(path, paths_storage);
+ DEBUG("Made step");
+ if (DetectCycle(path)) {
+ result = false;
+ } else if (path.Size() >= 1 && InvestigateShortLoop() && loop_detector.EdgeInShortLoop(path.Back())) {
+ DEBUG("Edge in short loop");
+ result = ResolveShortLoop(path);
+ } else if (InvestigateShortLoop() && loop_detector.PrevEdgeInShortLoop()) {
+ DEBUG("Prev edge in short loop");
+ path.PopBack();
+ result = ResolveShortLoop(path);
+ }
+ }
+ return result;
+ }
+
+private:
+ bool ResolveShortLoop(BidirectionalPath& p) {
+ if (use_short_loop_cov_resolver_) {
+ return ResolveShortLoopByCov(p);
+ } else {
+ return ResolveShortLoopByPI(p);
+ }
+ }
+
+ bool InvestigateShortLoop() {
+ return investigateShortLoops_ && (use_short_loop_cov_resolver_ || CanInvestigateShortLoop());
+ }
+protected:
+ DECL_LOGGER("LoopDetectingPathExtender")
+};
+
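+//Extends a path along the single candidate edge returned by the ExtensionChooser;
+//in 2015 scaffolding modes growth stops instead of reusing a unique edge.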
+class SimpleExtender: public LoopDetectingPathExtender {
+
+protected:
+
+ shared_ptr<ExtensionChooser> extensionChooser_;
+
+ void FindFollowingEdges(BidirectionalPath& path, ExtensionChooser::EdgeContainer * result) {
+ DEBUG("Looking for the following edges")
+ result->clear();
+ vector<EdgeId> edges;
+ DEBUG("Pushing back")
+ push_back_all(edges, g_.OutgoingEdges(g_.EdgeEnd(path.Back())));
+ result->reserve(edges.size());
+ for (auto iter = edges.begin(); iter != edges.end(); ++iter) {
+ DEBUG("Adding edge w distance " << g_.int_id(*iter));
+ result->push_back(EdgeWithDistance(*iter, 0));
+ }
+ DEBUG("Following edges found");
+ }
+
+
+public:
+
+ SimpleExtender(const conj_graph_pack& gp, const GraphCoverageMap& cov_map, shared_ptr<ExtensionChooser> ec,
+ size_t is, size_t max_loops, bool investigate_short_loops, bool use_short_loop_cov_resolver):
+ LoopDetectingPathExtender(gp, cov_map, max_loops, investigate_short_loops, use_short_loop_cov_resolver, is),
+ extensionChooser_(ec) {
+ }
+
+ std::shared_ptr<ExtensionChooser> GetExtensionChooser() const {
+ return extensionChooser_;
+ }
+
+ bool CanInvestigateShortLoop() const override {
+ return extensionChooser_->WeightCounterBased();
+ }
+
+ bool ResolveShortLoopByCov(BidirectionalPath& path) override {
+ LoopDetector loop_detector(&path, cov_map_);
+ size_t init_len = path.Length();
+ bool result = false;
+ while (path.Size() >= 1 && loop_detector.EdgeInShortLoop(path.Back())) {
+ cov_loop_resolver_.ResolveShortLoop(path);
+ if (init_len == path.Length()) {
+ return result;
+ } else {
+ result = true;
+ }
+ init_len = path.Length();
+ }
+ return true;
+ }
+
+ bool ResolveShortLoopByPI(BidirectionalPath& path) override {
+ if (extensionChooser_->WeightCounterBased()) {
+ LoopResolver loop_resolver(g_, extensionChooser_->wc());
+ LoopDetector loop_detector(&path, cov_map_);
+ size_t init_len = path.Length();
+ bool result = false;
+ while (path.Size() >= 1 && loop_detector.EdgeInShortLoop(path.Back())) {
+ loop_resolver.ResolveShortLoop(path);
+ if (init_len == path.Length()) {
+ return result;
+ } else {
+ result = true;
+ }
+ init_len = path.Length();
+ }
+ return true;
+ }
+ return false;
+ }
+
+ bool MakeSimpleGrowStep(BidirectionalPath& path, PathContainer* paths_storage) override {
+ ExtensionChooser::EdgeContainer candidates;
+ return FilterCandidates(path, candidates) and AddCandidates(path, paths_storage, candidates);
+ }
+
+protected:
+ virtual bool FilterCandidates(BidirectionalPath& path, ExtensionChooser::EdgeContainer& candidates) {
+ if (path.Size() == 0) {
+ return false;
+ }
+ DEBUG("Simple grow step");
+ path.Print();
+ FindFollowingEdges(path, &candidates);
+ DEBUG("found candidates");
+ DEBUG(candidates.size())
+ if (candidates.size() == 1) {
+ LoopDetector loop_detector(&path, cov_map_);
+ if (!investigateShortLoops_ && (loop_detector.EdgeInShortLoop(path.Back()) or loop_detector.EdgeInShortLoop(candidates.back().e_))
+ && extensionChooser_->WeightCounterBased()) {
+ return false;
+ }
+ }
+ DEBUG("more filtering");
+ candidates = extensionChooser_->Filter(path, candidates);
+ DEBUG("filtered candidates");
+ DEBUG(candidates.size())
+ return true;
+ }
+
+ virtual bool AddCandidates(BidirectionalPath& path, PathContainer* /*paths_storage*/, ExtensionChooser::EdgeContainer& candidates) {
+ if (candidates.size() != 1)
+ return false;
+
+ LoopDetector loop_detector(&path, cov_map_);
+ DEBUG("loop detecor");
+ if (!investigateShortLoops_ &&
+ (loop_detector.EdgeInShortLoop(path.Back()) or loop_detector.EdgeInShortLoop(candidates.back().e_))
+ && extensionChooser_->WeightCounterBased()) {
+ return false;
+ }
+ DEBUG("push");
+ auto sc_mode = cfg::get().pe_params.param_set.sm;
+ EdgeId eid = candidates.back().e_;
+//In 2015 modes, when an already used unique edge is encountered, it is not added and path growing stops.
+//This allows us to avoid the overlap removal hacks used earlier.
+ if (is_2015_scaffolder_enabled(sc_mode)) {
+ if (used_storage_->IsUsedAndUnique(eid)) {
+ return false;
+ } else {
+ used_storage_->insert(eid);
+ }
+ }
+ path.PushBack(eid, candidates.back().d_);
+ DEBUG("push done");
+ return true;
+ }
+
+protected:
+ DECL_LOGGER("SimpleExtender")
+
+};
+
+
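+//Variant of SimpleExtender that accepts several candidates: the first one extends
+//the current path, while each remaining candidate spawns a new path (and its
+//conjugate) in the provided paths storage, provided the number of candidates does
+//not exceed max_candidates_ (0 = unlimited).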
+class MultiExtender: public SimpleExtender {
+
+protected:
+ size_t max_candidates_;
+
+public:
+
+ MultiExtender(const conj_graph_pack& gp, const GraphCoverageMap& cov_map, shared_ptr<ExtensionChooser> ec,
+ size_t is, size_t max_loops, bool investigate_short_loops, bool use_short_loop_cov_resolver,
+ size_t max_candidates = 0):
+ SimpleExtender(gp, cov_map, ec, is, max_loops, investigate_short_loops, use_short_loop_cov_resolver),
+ max_candidates_(max_candidates) {
+ }
+
+protected:
+ virtual bool AddCandidates(BidirectionalPath& path, PathContainer* paths_storage, ExtensionChooser::EdgeContainer& candidates) override {
+ bool res = false;
+ if (candidates.size() >= 1 && (max_candidates_ == 0 || candidates.size() <= max_candidates_)) {
+ LoopDetector loop_detector(&path, cov_map_);
+ DEBUG("loop detector");
+ if (!investigateShortLoops_ && loop_detector.EdgeInShortLoop(path.Back())
+ && extensionChooser_->WeightCounterBased()) {
+ return false;
+ }
+//The first candidate is added to THIS path.
+ else if (not (!investigateShortLoops_ && loop_detector.EdgeInShortLoop(candidates.front().e_)
+ && extensionChooser_->WeightCounterBased())) {
+ DEBUG("push");
+ path.PushBack(candidates.front().e_, candidates.front().d_);
+ DEBUG("push done");
+ res = true;
+ }
+ if (candidates.size() > 1) {
+ DEBUG("Found " << candidates.size() << " candidates");
+ }
+//Create new paths for the remaining candidates.
+ for (size_t i = 1; i < candidates.size(); ++i) {
+ if (not (!investigateShortLoops_ && loop_detector.EdgeInShortLoop(candidates[i].e_)
+ && extensionChooser_->WeightCounterBased())) {
+ BidirectionalPath *p = new BidirectionalPath(path);
+ p->PushBack(candidates[i].e_, candidates[i].d_);
+ BidirectionalPath *cp = new BidirectionalPath(p->Conjugate());
+ paths_storage->AddPair(p, cp);
+ }
+ }
+ }
+
+ return res;
+ }
+
+protected:
+ DECL_LOGGER("MultiExtender")
+
+};
+
+
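+//Scaffolds paths: candidates are taken from the set of source edges (edges with no
+//incoming edges); when check_sink_ is set, only paths ending in a sink edge are
+//extended. The gap to the chosen candidate is refined with the configured GapJoiner.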
+class ScaffoldingPathExtender: public LoopDetectingPathExtender {
+ std::shared_ptr<ExtensionChooser> extension_chooser_;
+ ExtensionChooser::EdgeContainer sources_;
+ std::shared_ptr<GapJoiner> gap_joiner_;
+
+//When check_sink_ is set to false, we can scaffold not only tips.
+ bool check_sink_;
+
+ void InitSources() {
+ sources_.clear();
+
+ for (auto iter = g_.ConstEdgeBegin(); !iter.IsEnd(); ++iter) {
+ if (g_.IncomingEdgeCount(g_.EdgeStart(*iter)) == 0) {
+ sources_.push_back(EdgeWithDistance(*iter, 0));
+ }
+ }
+ }
+
+ bool IsSink(EdgeId e) const {
+ return g_.OutgoingEdgeCount(g_.EdgeEnd(e)) == 0;
+ }
+
+
+public:
+
+ ScaffoldingPathExtender(const conj_graph_pack& gp, const GraphCoverageMap& cov_map, std::shared_ptr<ExtensionChooser> extension_chooser,
+ std::shared_ptr<GapJoiner> gap_joiner, size_t is, size_t max_loops, bool investigateShortLoops, bool check_sink = true):
+ LoopDetectingPathExtender(gp, cov_map, max_loops, investigateShortLoops, false, is),
+ extension_chooser_(extension_chooser),
+ gap_joiner_(gap_joiner),check_sink_(check_sink)
+ {
+ InitSources();
+ }
+
+ bool MakeSimpleGrowStep(BidirectionalPath& path, PathContainer* /*paths_storage*/) override {
+ if (path.Size() < 1 || (check_sink_ && !IsSink(path.Back())) ) {
+ return false;
+ }
+ DEBUG("scaffolding:");
+ DEBUG("Simple grow step, growing path");
+ path.Print();
+ ExtensionChooser::EdgeContainer candidates = extension_chooser_->Filter(path, sources_);
+ DEBUG("scaffolding candidates " << candidates.size() << " from sources " << sources_.size());
+
+ if (candidates.size() == 1) {
+ if (candidates[0].e_ == path.Back() || (cfg::get().avoid_rc_connections && candidates[0].e_ == g_.conjugate(path.Back()))) {
+ return false;
+ }
+ BidirectionalPath temp_path(path);
+ temp_path.PushBack(candidates[0].e_);
+ if(this->DetectCycleScaffolding(temp_path)) {
+ return false;
+ }
+
+ auto sc_mode = cfg::get().pe_params.param_set.sm;
+ EdgeId eid = candidates.back().e_;
+ if(cfg::get().pe_params.param_set.scaffolder_options.fix_gaps && check_sink_) {
+ Gap gap = gap_joiner_->FixGap(path.Back(), candidates.back().e_, candidates.back().d_);
+ if (gap.gap_ != GapJoiner::INVALID_GAP) {
+ DEBUG("Scaffolding. PathId: " << path.GetId() << " path length: " << path.Length() <<
+ ", fixed gap length: " << gap.gap_ << ", trash length: " << gap.trash_previous_ << "-" <<
+ gap.trash_current_);
+
+ if (is_2015_scaffolder_enabled(sc_mode)) {
+ if (used_storage_->IsUsedAndUnique(eid)) {
+ return false;
+ } else {
+ used_storage_->insert(eid);
+ }
+ }
+ path.PushBack(eid, gap);
+ return true;
+ }
+ else {
+ DEBUG("Looks like wrong scaffolding. PathId: " << path.GetId() << " path length: " <<
+ path.Length() << ", fixed gap length: " << candidates.back().d_);
+ return false;
+ }
+ }
+ else {
+ DEBUG("Gap joiners off");
+ DEBUG("Scaffolding. PathId: " << path.GetId() << " path length: " << path.Length() << ", fixed gap length: " << candidates.back().d_ );
+ if (is_2015_scaffolder_enabled(sc_mode)) {
+ if (used_storage_->IsUsedAndUnique(eid)) {
+ return false;
+ } else {
+ used_storage_->insert(eid);
+ }
+ }
+ path.PushBack(candidates.back().e_, candidates.back().d_);
+ return true;
+ }
+ }
+ DEBUG("scaffolding end");
+ return false;
+ }
+
+ bool ResolveShortLoopByCov(BidirectionalPath&) override {
+ return false;
+ }
+
+ bool ResolveShortLoopByPI(BidirectionalPath&) override {
+ return false;
+ }
+
+ std::shared_ptr<ExtensionChooser> GetExtensionChooser() const {
+ return extension_chooser_;
+ }
+
+private:
+ DECL_LOGGER("ScaffoldingPathExtender");
+};
+
+}
diff --git a/src/modules/algorithms/path_extend/path_filter.hpp b/src/modules/algorithms/path_extend/path_filter.hpp
new file mode 100644
index 0000000..35f78c2
--- /dev/null
+++ b/src/modules/algorithms/path_extend/path_filter.hpp
@@ -0,0 +1,134 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+/*
+ * path_filter.hpp
+ *
+ * Created on: Mar 14, 2012
+ * Author: andrey
+ */
+
+#ifndef PATH_FILTER_HPP_
+#define PATH_FILTER_HPP_
+
+#include "assembly_graph/paths/bidirectional_path.hpp"
+
+namespace path_extend {
+
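+//Filters either build a new container (CopyOnWritePathFilter) or erase in place
+//(ErasingPathFilter below); a path pair is kept if the predicate holds for the path
+//OR its conjugate in the former, and for both (AND) in the latter.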
+class CopyOnWritePathFilter {
+
+protected:
+ Graph& g;
+
+public:
+ CopyOnWritePathFilter(Graph& g_): g(g_) {
+ }
+
+ virtual bool predicate(BidirectionalPath& path) = 0;
+
+ virtual bool conjugateOperator(bool p, bool cp) {
+ return p || cp;
+ }
+
+ PathContainer filter(PathContainer& paths) {
+ PathContainer result;
+
+ for (size_t i = 0; i < paths.size(); ++i) {
+ if (conjugateOperator(predicate(*paths.Get(i)), predicate(*paths.GetConjugate(i)))) {
+ result.AddPair(paths.Get(i), paths.GetConjugate(i));
+ }
+ }
+
+ return result;
+ }
+
+};
+
+
+class IdFilter: public CopyOnWritePathFilter {
+
+protected:
+ std::set<size_t> ids;
+
+public:
+
+ IdFilter(Graph& g_, std::set<size_t> ids_): CopyOnWritePathFilter(g_), ids(ids_) {
+ }
+
+ virtual bool predicate(BidirectionalPath& path) {
+ return ids.count(path.GetId()) > 0;
+ }
+};
+
+
+class ErasingPathFilter {
+
+protected:
+ const Graph& g;
+
+public:
+ ErasingPathFilter(const Graph& g_): g(g_) {
+ }
+
+ virtual bool predicate(BidirectionalPath& path) = 0;
+
+ virtual bool conjugateOperator(bool p, bool cp) {
+ return p && cp;
+ }
+
+ void filter(PathContainer& paths) {
+ for (PathContainer::Iterator iter = paths.begin(); iter != paths.end(); ) {
+ if (!conjugateOperator(predicate(*iter.get()), predicate(*iter.getConjugate()))) {
+ iter = paths.erase(iter);
+ }
+ else {
+ ++iter;
+ }
+ }
+ }
+
+};
+
+
+class CoveragePathFilter: public ErasingPathFilter {
+
+protected:
+ double minCoverage;
+
+public:
+ CoveragePathFilter(Graph& g_, double cov): ErasingPathFilter(g_), minCoverage(cov) {
+
+ }
+
+ virtual bool predicate(BidirectionalPath& path) {
+ for (size_t i = 0; i < path.Size(); ++i) {
+ if (math::ls(g.coverage(path[i]), minCoverage)) {
+ return false;
+ }
+ }
+ return true;
+ }
+};
+
+
+class LengthPathFilter: public ErasingPathFilter {
+
+protected:
+ size_t minLength;
+
+public:
+ LengthPathFilter(const Graph& g_, size_t len): ErasingPathFilter(g_), minLength(len) {
+ }
+
+ virtual bool predicate(BidirectionalPath& path) {
+ return path.Length() > minLength;
+ }
+};
+
+}
+
+#endif /* PATH_FILTER_HPP_ */
diff --git a/src/modules/algorithms/path_extend/path_visualizer.hpp b/src/modules/algorithms/path_extend/path_visualizer.hpp
new file mode 100644
index 0000000..abcd4ad
--- /dev/null
+++ b/src/modules/algorithms/path_extend/path_visualizer.hpp
@@ -0,0 +1,172 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+/*
+ * path_visualizer.hpp
+ *
+ * Created on: Mar 22, 2012
+ * Author: andrey
+ */
+
+#ifndef PATH_VISUALIZER_HPP_
+#define PATH_VISUALIZER_HPP_
+
+#include "assembly_graph/paths/bidirectional_path.hpp"
+#include "assembly_graph/stats/picture_dump.hpp"
+
+namespace path_extend {
+
+using namespace debruijn_graph;
+
+template<class Graph>
+class PathGraphLabeler : public AbstractGraphLabeler<Graph> {
+ typedef AbstractGraphLabeler<Graph> base;
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+
+ std::map<EdgeId, std::string> labels_;
+
+public:
+ PathGraphLabeler(const Graph& g, const PathContainer& paths) : base(g) {
+ for(size_t i = 0; i < paths.size(); ++i) {
+ BidirectionalPath * path = paths.Get(i);
+ for (size_t j = 0; j < path->Size(); ++j) {
+ if (labels_.count(path->At(j)) > 0) {
+ labels_[path->At(j)] += ", ";
+ }
+ labels_[path->At(j)] += "(" + ToString(path->GetId()) + " : " + ToString(j) + ")";
+ }
+
+ path = paths.GetConjugate(i);
+ for (size_t j = 0; j < path->Size(); ++j) {
+ if (labels_.count(path->At(j)) > 0) {
+ labels_[path->At(j)] += ", ";
+ }
+ labels_[path->At(j)] += "(" + ToString(path->GetId()) + " : " + ToString(j) + ")";
+ }
+ }
+ }
+
+ virtual std::string label(VertexId /*vertexId*/) const {
+ return "";
+ }
+
+ virtual std::string label(EdgeId edgeId) const {
+ auto label = labels_.find(edgeId);
+ return label == labels_.end() ? "" : label->second;
+ }
+};
+
+
+class PathVisualizer {
+
+protected:
+ bool writeLength;
+ bool writePos;
+
+public:
+
+ PathVisualizer(): writeLength(true), writePos(true) {
+
+ }
+
+ void writeGraphWithPathsSimple(const conj_graph_pack& gp, const string& file_name, const string& graph_name, const PathContainer& paths) const {
+ INFO("Visualizing graph " << graph_name << " to file " << file_name);
+ std::fstream filestr;
+ filestr.open(file_name.c_str(), std::fstream::out);
+
+ StrGraphLabeler<Graph> str_labeler(gp.g);
+ PathGraphLabeler<Graph> path_labeler(gp.g, paths);
+ CoverageGraphLabeler<Graph> cov_labler(gp.g);
+ EdgePosGraphLabeler<Graph> pos_labeler(gp.g, gp.edge_pos);
+
+ CompositeLabeler<Graph> composite_labeler(str_labeler, cov_labler, path_labeler, pos_labeler);
+ shared_ptr<omnigraph::visualization::GraphColorer<Graph>> colorer;
+ if (gp.index.IsAttached()) {
+ colorer = stats::DefaultColorer(gp);
+ } else {
+ colorer = omnigraph::visualization::DefaultColorer(gp.g);
+ }
+
+ omnigraph::visualization::ComponentVisualizer<Graph> visualizer(gp.g, false);
+ omnigraph::visualization::EmptyGraphLinker<Graph> linker;
+ visualizer.Visualize(filestr, composite_labeler, *colorer, linker);
+ filestr.close();
+ INFO("Visualizing graph done");
+ }
+
+ void writeGraphSimple(const conj_graph_pack& gp, const string& file_name, const string& graph_name) const{
+ INFO("Visualizing graph " << graph_name << " to file " << file_name);
+ std::fstream filestr;
+ filestr.open(file_name.c_str(), std::fstream::out);
+
+ StrGraphLabeler<Graph> str_labeler(gp.g);
+ EdgePosGraphLabeler<Graph> pos_labeler(gp.g, gp.edge_pos);
+ CoverageGraphLabeler<Graph> cov_labler(gp.g);
+ CompositeLabeler<Graph> composite_labeler(str_labeler, cov_labler, pos_labeler);
+
+ shared_ptr<omnigraph::visualization::GraphColorer<Graph>> colorer;
+
+ if (gp.index.IsAttached()) {
+ colorer = stats::DefaultColorer(gp);
+ } else {
+ Path<EdgeId> empty;
+ colorer = omnigraph::visualization::DefaultColorer(gp.g, empty, empty);
+ }
+
+ omnigraph::visualization::ComponentVisualizer<Graph> visualizer(gp.g, false);
+ omnigraph::visualization::EmptyGraphLinker<Graph> linker;
+ visualizer.Visualize(filestr, composite_labeler, *colorer, linker);
+ filestr.close();
+ INFO("Visualizing graph done");
+ }
+
+ void writeGraphSimple(const Graph& g, const string& file_name, const string& graph_name) const{
+ INFO("Visualizing graph " << graph_name << " to file " << file_name);
+ std::fstream filestr;
+ filestr.open(file_name.c_str(), std::fstream::out);
+
+ StrGraphLabeler<Graph> str_labeler(g);
+ CoverageGraphLabeler<Graph> cov_labler(g);
+ CompositeLabeler<Graph> composite_labeler(str_labeler, cov_labler);
+
+ shared_ptr<omnigraph::visualization::GraphColorer<Graph>> colorer;
+
+ Path<EdgeId> empty;
+ colorer = omnigraph::visualization::DefaultColorer(g, empty, empty);
+
+ omnigraph::visualization::ComponentVisualizer<Graph> visualizer(g, false);
+ omnigraph::visualization::EmptyGraphLinker<Graph> linker;
+ visualizer.Visualize(filestr, composite_labeler, *colorer, linker);
+ filestr.close();
+ INFO("Visualizing graph done");
+ }
+
+ bool isWriteLength() const
+ {
+ return writeLength;
+ }
+
+ bool isWritePos() const
+ {
+ return writePos;
+ }
+
+ void setWriteLength(bool writeLength)
+ {
+ this->writeLength = writeLength;
+ }
+
+ void setWritePos(bool writePos)
+ {
+ this->writePos = writePos;
+ }
+};
+
+}
+
+#endif /* PATH_VISUALIZER_HPP_ */
diff --git a/src/modules/algorithms/path_extend/pe_config_struct.cpp b/src/modules/algorithms/path_extend/pe_config_struct.cpp
new file mode 100644
index 0000000..5f1d5b5
--- /dev/null
+++ b/src/modules/algorithms/path_extend/pe_config_struct.cpp
@@ -0,0 +1,172 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#include "pe_config_struct.hpp"
+#include "pipeline/config_common.hpp"
+
+namespace path_extend {
+
+void load(output_broken_scaffolds& obs, boost::property_tree::ptree const& pt, std::string const& key, bool complete) {
+ if (complete || pt.find(key) != pt.not_found()) {
+ std::string ep = pt.get<std::string>(key);
+ obs = pe_config::output_broken_scaffolds_id(ep);
+ }
+}
+
+void load(scaffolding_mode &sm, boost::property_tree::ptree const& pt, std::string const& key, bool complete) {
+ if (complete || pt.find(key) != pt.not_found()) {
+ std::string ep = pt.get<std::string>(key);
+ sm = pe_config::scaffolding_mode_id(ep);
+ }
+}
+
+void load(pe_config::ParamSetT::ScaffoldGraphParamsT& sg, boost::property_tree::ptree const& pt, bool /*complete*/) {
+ using config_common::load;
+ load(sg.construct, pt, "construct" );
+ load(sg.output, pt, "output" );
+ load(sg.always_add, pt, "always_add" );
+ load(sg.never_add, pt, "never_add" );
+ load(sg.relative_threshold, pt, "relative_threshold" );
+ load(sg.graph_connectivity, pt, "graph_connectivity");
+ load(sg.max_path_length, pt, "max_path_length" );
+}
+
+void load(pe_config::OutputParamsT& o, boost::property_tree::ptree const& pt, bool complete) {
+ using config_common::load;
+
+ load(o.write_overlaped_paths, pt, "write_overlaped_paths" , complete);
+ load(o.write_paths, pt, "write_paths" , complete);
+}
+
+void load(pe_config::VisualizeParamsT& o, boost::property_tree::ptree const& pt, bool complete) {
+ using config_common::load;
+ load(o.print_overlaped_paths, pt, "print_overlaped_paths" , complete);
+ load(o.print_paths, pt, "print_paths" , complete);
+}
+
+void load(pe_config::ParamSetT::ExtensionOptionsT& es,
+ boost::property_tree::ptree const& pt, bool complete) {
+ using config_common::load;
+ load(es.use_default_single_threshold, pt, "use_default_single_threshold", complete);
+ load(es.priority_coeff, pt, "priority_coeff", complete);
+ load(es.weight_threshold, pt, "weight_threshold", complete);
+ load(es.single_threshold, pt, "single_threshold", complete);
+ load(es.max_repeat_length, pt, "max_repeat_length", complete);
+}
+
+void load(pe_config::ParamSetT::LoopRemovalT& lr,
+ boost::property_tree::ptree const& pt, bool complete) {
+ using config_common::load;
+ load(lr.max_loops, pt, "max_loops", complete);
+ load(lr.mp_max_loops, pt, "mp_max_loops", complete);
+}
+
+void load(pe_config::ParamSetT::CoordinatedCoverageT& coord_cov,
+ boost::property_tree::ptree const& pt, bool complete) {
+ using config_common::load;
+ load(coord_cov.max_edge_length_in_repeat, pt, "max_edge_length_repeat", complete);
+ load(coord_cov.delta, pt, "delta", complete);
+ load(coord_cov.min_path_len, pt, "min_path_len", complete);
+}
+
+void load(pe_config::ParamSetT::ScaffolderOptionsT& so,
+ boost::property_tree::ptree const& pt, bool complete)
+{
+ using config_common::load;
+ load(so.on , pt, "on" , complete);
+ load(so.cutoff , pt, "cutoff", complete);
+ load(so.rel_cutoff , pt, "rel_cutoff", complete);
+ load(so.sum_threshold , pt, "sum_threshold", complete);
+
+ load(so.cluster_info , pt, "cluster_info", complete);
+ load(so.cl_threshold , pt, "cl_threshold", complete);
+
+ load(so.fix_gaps , pt, "fix_gaps", complete);
+ load(so.use_la_gap_joiner , pt, "use_la_gap_joiner", complete);
+ load(so.min_gap_score , pt, "min_gap_score", complete);
+ load(so.max_must_overlap , pt, "max_must_overlap", complete);
+ load(so.max_can_overlap , pt, "max_can_overlap", complete);
+ load(so.short_overlap , pt, "short_overlap", complete);
+ load(so.artificial_gap , pt, "artificial_gap", complete);
+ load(so.use_old_score , pt, "use_old_score", complete);
+ load(so.min_overlap_length, pt, "min_overlap_length", complete);
+ load(so.flank_addition_coefficient, pt, "flank_addition_coefficient", complete);
+ load(so.flank_multiplication_coefficient, pt, "flank_multiplication_coefficient", complete);
+}
+
+void load(pe_config::ParamSetT& p, boost::property_tree::ptree const& pt, bool complete) {
+ using config_common::load;
+ load(p.sm, pt, "scaffolding_mode", complete);
+ load(p.normalize_weight, pt, "normalize_weight", complete);
+ load(p.cut_all_overlaps, pt, "cut_all_overlaps", complete);
+ load(p.remove_overlaps, pt, "remove_overlaps", complete);
+ load(p.multi_path_extend, pt, "multi_path_extend", complete);
+ load(p.split_edge_length, pt, "split_edge_length", complete);
+ load(p.extension_options, pt, "extension_options", complete);
+ load(p.mate_pair_options, pt, "mate_pair_options", complete);
+ load(p.scaffolder_options, pt, "scaffolder", complete);
+ load(p.loop_removal, pt, "loop_removal", complete);
+ load(p.coordinated_coverage, pt, "coordinated_coverage", complete);
+ load(p.use_coordinated_coverage, pt, "use_coordinated_coverage", complete);
+ load(p.scaffolding2015, pt, "scaffolding2015", complete);
+ load(p.scaffold_graph_params, pt, "scaffold_graph", complete);
+}
+
+
+void load(pe_config::LongReads& p, boost::property_tree::ptree const& pt,
+ bool complete) {
+ using config_common::load;
+ load(p.filtering, pt, "filtering", complete);
+ load(p.weight_priority, pt, "weight_priority", complete);
+ load(p.unique_edge_priority, pt, "unique_edge_priority", complete);
+ load(p.min_significant_overlap, pt, "min_significant_overlap", complete);
+
+}
+
+void load(pe_config::ParamSetT::Scaffolding2015& p, boost::property_tree::ptree const& pt,
+ bool) {
+ using config_common::load;
+ load(p.autodetect, pt, "autodetect");
+ load(p.min_unique_length, pt, "min_unique_length");
+ load(p.unique_coverage_variation, pt, "unique_coverage_variation");
+}
+
+void load(pe_config::AllLongReads& p, boost::property_tree::ptree const& pt,
+ bool complete) {
+ using config_common::load;
+ load(p.pacbio_reads, pt, "pacbio_reads", complete);
+ load(p.single_reads, pt, "single_reads", complete);
+ load(p.contigs, pt, "contigs", complete);
+ load(p.meta_contigs, pt, "meta_untrusted_contigs", complete);
+}
+
+void load(pe_config::MainPEParamsT& p, boost::property_tree::ptree const& pt,
+ bool complete) {
+ using config_common::load;
+ load(p.debug_output, pt, "debug_output", complete);
+ load(p.output, pt, "output", complete);
+ load(p.viz, pt, "visualize", complete);
+ load(p.obs, pt, "output_broken_scaffolds", complete);
+ load(p.param_set, pt, "params", complete);
+ load(p.long_reads, pt, "long_reads", complete);
+ if (!p.debug_output) {
+ p.output.DisableAll();
+ p.viz.DisableAll();
+ }
+ p.etc_dir = "path_extend";
+}
+
+//// main long contigs config load function
+//void load(pe_config& pe_cfg, boost::property_tree::ptree const& pt, bool complete) {
+// using config_common::load;
+//
+// load(pe_cfg.dataset_name , pt, "dataset", complete);
+// load(pe_cfg.params , pt, "pe_params", complete);
+//}
+
+};
+
diff --git a/src/modules/algorithms/path_extend/pe_config_struct.hpp b/src/modules/algorithms/path_extend/pe_config_struct.hpp
new file mode 100644
index 0000000..47578c7
--- /dev/null
+++ b/src/modules/algorithms/path_extend/pe_config_struct.hpp
@@ -0,0 +1,252 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+/*
+ * lc_config_struct.hpp
+ *
+ * Created on: Aug 16, 2011
+ * Author: Alexey.Gurevich
+ */
+
+#ifndef LC_CONFIG_STRUCT_HPP_
+#define LC_CONFIG_STRUCT_HPP_
+
+#include "pipeline/config_singl.hpp"
+#include "dev_support/cpp_utils.hpp"
+
+#include <boost/optional.hpp>
+#include <boost/property_tree/ptree_fwd.hpp>
+#include <boost/bimap.hpp>
+
+#include <string>
+#include <vector>
+
+namespace path_extend {
+
+enum output_broken_scaffolds {
+ obs_none,
+ obs_break_gaps,
+ obs_break_all
+};
+
+enum scaffolding_mode {
+ sm_old,
+ sm_2015,
+ sm_combined,
+ sm_old_pe_2015
+};
+
+inline bool is_2015_scaffolder_enabled(const scaffolding_mode mode) {
+ return (mode == sm_old_pe_2015 || mode == sm_2015 || mode == sm_combined);
+}
+
+// struct for path extend subproject's configuration file
+struct pe_config {
+
+ typedef boost::bimap<std::string, output_broken_scaffolds> output_broken_scaffolds_id_mapping;
+
+ static const output_broken_scaffolds_id_mapping FillOBSInfo() {
+ output_broken_scaffolds_id_mapping::value_type info[] = {
+ output_broken_scaffolds_id_mapping::value_type("none", obs_none),
+ output_broken_scaffolds_id_mapping::value_type("break_gaps", obs_break_gaps),
+ output_broken_scaffolds_id_mapping::value_type("break_all", obs_break_all)
+ };
+
+ return output_broken_scaffolds_id_mapping(info, utils::array_end(info));
+ }
+
+ static const output_broken_scaffolds_id_mapping& output_broken_scaffolds_info() {
+ static output_broken_scaffolds_id_mapping output_broken_scaffolds_info = FillOBSInfo();
+ return output_broken_scaffolds_info;
+ }
+
+ static const std::string& output_broken_scaffolds_name(output_broken_scaffolds obs) {
+ auto it = output_broken_scaffolds_info().right.find(obs);
+ VERIFY_MSG(it != output_broken_scaffolds_info().right.end(),
+ "No name for output broken scaffolds mode id = " << obs);
+
+ return it->second;
+ }
+
+ static output_broken_scaffolds output_broken_scaffolds_id(std::string name) {
+ auto it = output_broken_scaffolds_info().left.find(name);
+ VERIFY_MSG(it != output_broken_scaffolds_info().left.end(),
+ "There is no output broken scaffolds mode with name = " << name);
+
+ return it->second;
+ }
+
+ typedef boost::bimap<std::string, scaffolding_mode> scaffolding_mode_id_mapping;
+
+ static const scaffolding_mode_id_mapping FillSMInfo() {
+ scaffolding_mode_id_mapping::value_type info[] = {
+ scaffolding_mode_id_mapping::value_type("old", sm_old),
+ scaffolding_mode_id_mapping::value_type("2015", sm_2015),
+ scaffolding_mode_id_mapping::value_type("combined", sm_combined),
+ scaffolding_mode_id_mapping::value_type("old_pe_2015", sm_old_pe_2015)
+ };
+
+ return scaffolding_mode_id_mapping(info, utils::array_end(info));
+ }
+
+ static const scaffolding_mode_id_mapping& scaffolding_mode_info() {
+ static scaffolding_mode_id_mapping scaffolding_mode_info = FillSMInfo();
+ return scaffolding_mode_info;
+ }
+
+ static const std::string& scaffolding_mode_name(scaffolding_mode sm) {
+ auto it = scaffolding_mode_info().right.find(sm);
+ VERIFY_MSG(it != scaffolding_mode_info().right.end(),
+ "No name for scaffolding mode id = " << sm);
+
+ return it->second;
+ }
+
+ static scaffolding_mode scaffolding_mode_id(std::string name) {
+ auto it = scaffolding_mode_info().left.find(name);
+ VERIFY_MSG(it != scaffolding_mode_info().left.end(),
+ "There is no scaffolding mode with name = " << name);
+
+ return it->second;
+ }
+
+ struct OutputParamsT {
+ bool write_overlaped_paths;
+ bool write_paths;
+
+ void DisableAll() {
+ write_overlaped_paths = false;
+ write_paths = false;
+ }
+ };
+
+
+
+ struct VisualizeParamsT {
+ bool print_overlaped_paths;
+ bool print_paths;
+
+ void DisableAll() {
+ print_overlaped_paths = false;
+ print_paths = false;
+ }
+ };
+
+ struct ParamSetT {
+ scaffolding_mode sm;
+
+ bool normalize_weight;
+ size_t split_edge_length;
+
+ bool multi_path_extend;
+ bool remove_overlaps;
+ bool cut_all_overlaps;
+
+ struct ExtensionOptionsT {
+ bool use_default_single_threshold;
+ double single_threshold;
+ double weight_threshold;
+ double priority_coeff;
+ size_t max_repeat_length;
+ } extension_options;
+
+ ExtensionOptionsT mate_pair_options;
+
+
+ struct ScaffolderOptionsT {
+ bool on;
+ int cutoff;
+ double rel_cutoff;
+ double sum_threshold;
+
+ bool cluster_info;
+ double cl_threshold;
+
+ bool fix_gaps;
+ bool use_la_gap_joiner;
+ double min_gap_score;
+ double max_must_overlap;
+ double max_can_overlap;
+ int short_overlap;
+ size_t artificial_gap;
+
+ bool use_old_score;
+
+ size_t min_overlap_length;
+ double flank_addition_coefficient;
+ double flank_multiplication_coefficient;
+ } scaffolder_options;
+
+
+ struct LoopRemovalT {
+ size_t max_loops;
+ size_t mp_max_loops;
+ } loop_removal;
+
+
+ bool use_coordinated_coverage;
+
+ struct CoordinatedCoverageT {
+ size_t max_edge_length_in_repeat;
+ double delta;
+ size_t min_path_len;
+ } coordinated_coverage;
+ struct Scaffolding2015 {
+ bool autodetect;
+ size_t min_unique_length;
+ double unique_coverage_variation;
+ } scaffolding2015;
+ struct ScaffoldGraphParamsT {
+ bool construct;
+ bool output;
+ size_t always_add;
+ size_t never_add;
+ double relative_threshold;
+ bool graph_connectivity;
+ size_t max_path_length;
+ } scaffold_graph_params;
+ };
+
+ struct LongReads {
+ double filtering;
+ double weight_priority;
+ double unique_edge_priority;
+ size_t min_significant_overlap;
+ };
+
+ struct AllLongReads{
+ LongReads single_reads;
+ LongReads pacbio_reads;
+ LongReads contigs;
+ LongReads meta_contigs;
+ };
+
+
+ struct MainPEParamsT {
+ output_broken_scaffolds obs;
+
+ bool finalize_paths;
+ bool debug_output;
+ std::string etc_dir;
+
+ OutputParamsT output;
+ VisualizeParamsT viz;
+ ParamSetT param_set;
+ AllLongReads long_reads;
+ }; // params;
+
+};
+
+void load(pe_config::ParamSetT& p, boost::property_tree::ptree const& pt, bool complete = true);
+void load(pe_config::MainPEParamsT& p, boost::property_tree::ptree const& pt, bool complete = true);
+//void load(pe_config& pe_cfg, boost::property_tree::ptree const& pt, bool complete);
+
+}
+
+//typedef config_common::config<path_extend::pe_config> pe_cfg;
+
+#endif /* CONFIG_STRUCT_HPP_ */
diff --git a/src/modules/algorithms/path_extend/pe_io.hpp b/src/modules/algorithms/path_extend/pe_io.hpp
new file mode 100644
index 0000000..60c22f1
--- /dev/null
+++ b/src/modules/algorithms/path_extend/pe_io.hpp
@@ -0,0 +1,263 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#ifndef PE_IO_HPP_
+#define PE_IO_HPP_
+
+
+#include "algorithms/genome_consistance_checker.hpp"
+#include "assembly_graph/paths/bidirectional_path.hpp"
+#include "assembly_graph/graph_support/contig_output.hpp"
+#include "assembly_graph/components/connected_component.hpp"
+#include "io/reads_io/osequencestream.hpp"
+#include <stack>
+#include <algorithm>
+
+namespace path_extend {
+
+using namespace debruijn_graph;
+
+class ContigWriter {
+protected:
+ DECL_LOGGER("PathExtendIO")
+
+protected:
+ const Graph& g_;
+ ContigConstructor<Graph> &constructor_;
+ size_t k_;
+ map<EdgeId, ExtendedContigIdT> ids_;
+ const ConnectedComponentCounter &c_counter_;
+ //TODO: add constructor
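+ //Builds the contig sequence for a path: edge sequences are stitched together,
+ //gaps larger than k are filled with runs of 'N', overlaps up to k are collapsed,
+ //and the "trash" lengths recorded for the following junction are trimmed from
+ //the end of each edge.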
+ string ToString(const BidirectionalPath& path) const {
+ stringstream ss;
+ if (path.IsInterstrandBulge() && path.Size() == 1) {
+ ss << constructor_.construct(path.Back()).first.substr(k_, g_.length(path.Back()) - k_);
+ return ss.str();
+ }
+
+ if (!path.Empty()) {
+ ss << constructor_.construct(path[0]).first.substr(0, k_);
+ }
+
+ for (size_t i = 0; i < path.Size(); ++i) {
+ int gap = i == 0 ? 0 : path.GapAt(i);
+ if (gap > (int) k_) {
+ for (size_t j = 0; j < gap - k_; ++j) {
+ ss << "N";
+ }
+ ss << constructor_.construct(path[i]).first;
+ } else {
+ int overlapLen = (int) k_ - gap;
+ if (overlapLen >= (int) g_.length(path[i]) + (int) k_) {
+ if(overlapLen > (int) g_.length(path[i]) + (int) k_) {
+ WARN("Such scaffolding logic leads to local misassemblies");
+ }
+ continue;
+ }
+ auto temp_str = g_.EdgeNucls(path[i]).Subseq(overlapLen).str();
+ if(i != path.Size() - 1) {
+ for(size_t j = 0 ; j < path.TrashPreviousAt(i + 1); ++j) {
+ temp_str.pop_back();
+ if(temp_str.size() == 0) {
+ break;
+ }
+ }
+ }
+ ss << temp_str;
+ }
+ }
+ return ss.str();
+ }
+
+ string ToFASTGString(const BidirectionalPath& path) const {
+ if (path.Empty())
+ return "";
+ string res = ids_.at(path.Front()).short_id_;
+ for (size_t i = 1; i < path.Size(); ++i) {
+ if (g_.EdgeEnd(path[i - 1]) != g_.EdgeStart(path[i])) {
+ res += ";\n" + ids_.at(path[i]).short_id_;
+ }
+ else {
+ res += "," + ids_.at(path[i]).short_id_;
+ }
+ }
+ return res;
+ }
+
+
+public:
+ ContigWriter(const Graph& g, ContigConstructor<Graph> &constructor, const ConnectedComponentCounter &c_counter): g_(g), constructor_(constructor), k_(g.k()), ids_(), c_counter_(c_counter) {
+ MakeContigIdMap(g_, ids_, c_counter, "NODE");
+ }
+
+
+ void WriteEdges(const string &filename) const {
+ INFO("Outputting edges to " << filename);
+ io::osequencestream_with_id oss(filename);
+
+ set<EdgeId> included;
+ for (auto iter = g_.ConstEdgeBegin(); !iter.IsEnd(); ++iter) {
+ if (included.count(*iter) == 0) {
+ oss.setCoverage(g_.coverage(*iter));
+ oss.setID((int) g_.int_id(*iter));
+ oss << g_.EdgeNucls(*iter);
+
+ included.insert(*iter);
+ included.insert(g_.conjugate(*iter));
+ }
+ }
+ DEBUG("Contigs written");
+ }
+
+
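+ //Writes paths in a simple text format: the first line is the number of paths;
+ //each path starts with "PATH <id> <size> <length+k>" followed by one line per
+ //edge: "<edge id> <edge length> <gap> <trash_prev> <trash_current>".
+ //LoadPaths() below reads the same format back.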
+ void WritePaths(const PathContainer &paths, const string &filename) const {
+ INFO("Outputting path data to " << filename);
+ std::ofstream oss;
+ oss.open(filename.c_str());
+ int i = 1;
+ oss << paths.size() << endl;
+ for (auto iter = paths.begin(); iter != paths.end(); ++iter) {
+ //oss << i << endl;
+ i++;
+ BidirectionalPath* path = iter.get();
+ if (path->GetId() % 2 != 0) {
+ path = path->GetConjPath();
+ }
+ oss << "PATH " << path->GetId() << " " << path->Size() << " " << path->Length() + k_ << endl;
+ for (size_t j = 0; j < path->Size(); ++j) {
+ oss << g_.int_id(path->At(j)) << " " << g_.length(path->At(j)) << " " << path->GapAt(j) << " " << path->TrashPreviousAt(j) << " " << path->TrashCurrentAt(j) << endl;
+ }
+ //oss << endl;
+ }
+ oss.close();
+ DEBUG("Edges written");
+ }
+
+ void LoadPaths(PathContainer &paths, GraphCoverageMap &cover_map, const string &filename) const {
+ paths.clear();
+ map<size_t, EdgeId> int_ids;
+ for (auto iter = g_.ConstEdgeBegin(); !iter.IsEnd(); ++iter) {
+ int_ids.insert(make_pair(g_.int_id(*iter), *iter));
+ }
+
+ std::ifstream iss;
+ iss.open(filename);
+ size_t psize;
+ iss >> psize;
+ for(size_t i = 0; i < psize && !iss.eof(); ++i) {
+ string s;
+ size_t id;
+ size_t size;
+ size_t len;
+ iss >> s >> id >> size >> len;
+ VERIFY(s == "PATH");
+
+ BidirectionalPath * path = new BidirectionalPath(g_);
+ BidirectionalPath * conjugatePath = new BidirectionalPath(g_);
+ paths.AddPair(path, conjugatePath);
+ path->Subscribe(&cover_map);
+ conjugatePath->Subscribe(&cover_map);
+ for (size_t j = 0; !iss.eof() && j < size; ++j) {
+ size_t eid;
+ size_t elen;
+ int gap;
+ uint32_t trash_prev;
+ uint32_t trash_current;
+
+ iss >> eid >> elen >> gap >> trash_prev >> trash_current;
+ Gap gap_struct(gap, trash_prev, trash_current);
+ EdgeId edge = int_ids[eid];
+ conjugatePath->PushBack(edge, gap_struct);
+ VERIFY(g_.length(edge) == elen);
+ }
+ VERIFY(path->Length() + k_ == len);
+ }
+ VERIFY(psize == paths.size());
+ iss.close();
+ }
+
+ void WritePathsToFASTA(const PathContainer &paths,
+ const string &filename_base,
+ bool write_fastg = true) const {
+
+ INFO("Writing contigs to " << filename_base);
+ io::osequencestream_simple oss(filename_base + ".fasta");
+
+ std::ofstream os_fastg;
+ if (write_fastg)
+ os_fastg.open((filename_base + ".paths").c_str());
+
+ int i = 0;
+ for (auto iter = paths.begin(); iter != paths.end(); ++iter) {
+ if (iter.get()->Length() <= 0)
+ continue;
+ i++;
+ DEBUG("NODE " << i);
+ BidirectionalPath* path = iter.get();
+ path->Print();
+ string contig_id;
+ string path_string = ToString(*path);
+ if (cfg::get().pd) {
+ EdgeId e = path->At(0);
+ size_t component = c_counter_.GetComponent(e);
+ contig_id = io::MakeContigComponentId(i, path_string.length(), path->Coverage(), component);
+ } else {
+ contig_id = io::MakeContigId(i, path_string.length(), path->Coverage());
+ }
+ oss.set_header(contig_id);
+ if (write_fastg) {
+ os_fastg << contig_id<< endl;
+ os_fastg << ToFASTGString(*iter.get()) << endl;
+ os_fastg << contig_id << "'" << endl;
+ os_fastg << ToFASTGString(*iter.getConjugate()) << endl;
+ }
+ oss << path_string;
+ }
+ if (write_fastg)
+ os_fastg.close();
+ DEBUG("Contigs written");
+ }
+
+ void WriteFASTGPaths(const PathContainer& paths, const string& filename) const {
+ INFO("Writing FASTG paths to " << filename);
+ std::ofstream oss(filename.c_str());
+ for (auto iter = paths.begin(); iter != paths.end(); ++iter) {
+ if (iter.get()->Length() <= 0)
+ continue;
+ oss << ToFASTGString(*iter.get()) << endl;
+ oss << ToFASTGString(*iter.getConjugate()) << endl;
+ }
+ oss.close();
+ }
+
+ void OutputPaths(const PathContainer& paths, const string& filename_base) const {
+ WritePathsToFASTA(paths, filename_base);
+ }
+
+};
+
+
+class PathInfoWriter {
+protected:
+ DECL_LOGGER("PathExtendIO")
+
+public:
+
+ void WritePaths(const PathContainer &paths, const string &filename){
+ std::ofstream oss(filename.c_str());
+
+ for (auto iter = paths.begin(); iter != paths.end(); ++iter) {
+ iter.get()->Print(oss);
+ }
+
+ oss.close();
+ }
+};
+
+}
+
+#endif /* PE_IO_HPP_ */
diff --git a/src/modules/algorithms/path_extend/pe_resolver.hpp b/src/modules/algorithms/path_extend/pe_resolver.hpp
new file mode 100644
index 0000000..9729c70
--- /dev/null
+++ b/src/modules/algorithms/path_extend/pe_resolver.hpp
@@ -0,0 +1,520 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+/*
+ * pe_resolver.hpp
+ *
+ * Created on: Mar 12, 2012
+ * Author: andrey
+ */
+
+#ifndef PE_RESOLVER_HPP_
+#define PE_RESOLVER_HPP_
+
+#include "path_extender.hpp"
+#include "pe_io.hpp"
+
+namespace path_extend {
+
+
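+// Removes or truncates paths that duplicate, overlap or are subsumed by other paths,
+// using the shared GraphCoverageMap to locate candidate pairs.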
+class SimpleOverlapRemover {
+
+public:
+ SimpleOverlapRemover(const Graph& g, GraphCoverageMap& cm)
+ : g_(g), coverage_map_(cm) {
+ }
+
+ void RemoveOverlaps(PathContainer& paths) const {
+ for (size_t i = 0; i < paths.size(); i++) {
+ FindAndRemovePathOverlap(paths, paths.Get(i));
+ FindAndRemovePathOverlap(paths, paths.GetConjugate(i));
+ }
+ }
+
+ void CutPseudoSelfConjugatePaths(PathContainer& paths) {
+ vector<pair<BidirectionalPath *, BidirectionalPath *>> tmp_paths(paths.begin(), paths.end());
+ for(auto it = tmp_paths.begin(); it != tmp_paths.end(); ++it) {
+ BidirectionalPath * path1 = it->first;
+ BidirectionalPath * path2 = it->second;
+ bool ups = false;
+ if(path1 != path2) {
+ size_t last = 0;
+ while(last < path1->Size() && path1->operator [](last) == path2->operator [](last)) {
+ last++;
+ }
+ if(last > 0) {
+ AddOverlap(paths, path1, 0, last - 1);
+ path1->PopBack(last);
+ path2->PopBack(last);
+ }
+ }
+ if(ups) path1->Print();
+ }
+ }
+
+ void RemoveSimilarPaths(PathContainer& paths, size_t min_edge_len, size_t max_path_diff, bool del_only_equal, bool del_subpaths, bool del_begins, bool del_all, bool add_overlap_begins) const {
+ DEBUG("== Removing similar paths ==");
+ DEBUG("Min edge len " << min_edge_len << ", max path diff " << max_path_diff)
+ DEBUG("Only equal " << del_only_equal << ", subpaths " << del_subpaths << ", starts " << del_begins << ", all " << del_all << ", add starts " << add_overlap_begins);
+ std::vector<EdgeId> edges = GetSortedEdges();
+ for (size_t edgeIndex = 0; edgeIndex < edges.size(); ++edgeIndex) {
+ EdgeId edge = edges.at(edgeIndex);
+ BidirectionalPathSet cov_paths = coverage_map_.GetCoveringPaths(edge);
+ std::vector<BidirectionalPath*> cov_vect(cov_paths.begin(), cov_paths.end());
+ std::sort(cov_vect.begin(), cov_vect.end(), PathIdCompare);
+ for (size_t vect_i = 0; vect_i < cov_vect.size(); ++vect_i) {
+ BidirectionalPath* path1 = cov_vect.at(vect_i);
+ if (cov_paths.find(path1) == cov_paths.end()) {
+ continue;
+ }
+ for (size_t vect_i1 = vect_i + 1; vect_i1 < cov_vect.size(); ++vect_i1) {
+ BidirectionalPath* path2 = cov_vect.at(vect_i1);
+ if (path1 == path2 || path1 == path2->GetConjPath()) {
+ continue;
+ }
+ if (cov_paths.find(path2) == cov_paths.end())
+ continue;
+ if ((*path1) == (*path2)) {
+ if (path2->IsOverlap()) {
+ path1->SetOverlap(true);
+ }
+ DEBUG("Removing path " << path2->GetId() << " because of path " << path1->GetId());
+ path2->Print();
+ path1->Print();
+ path2->Clear();
+ cov_paths = coverage_map_.GetCoveringPaths(edge);
+ continue;
+ }
+ if (g_.length(edge) <= min_edge_len || path1->IsOverlap() || path2->IsOverlap() || del_only_equal) {
+ continue;
+ }
+ CompareAndCut(paths, edge, path1, path2, max_path_diff,
+ del_subpaths, del_begins, del_all, add_overlap_begins);
+ cov_paths = coverage_map_.GetCoveringPaths(edge);
+ }
+ }
+ }
+ DEBUG("== Emd removing similar paths ==");
+ }
+
+private:
+
+ void SubscribeCoverageMap(BidirectionalPath* path) const {
+ path->Subscribe(&coverage_map_);
+ for (size_t i = 0; i < path->Size(); ++i) {
+ coverage_map_.BackEdgeAdded(path->At(i), path, path->GapAt(i));
+ }
+ }
+
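+ // Compares path1 and path2 around every occurrence of 'edge' and cuts overlapping parts;
+ // whenever a cut changes the occurrence positions, the position lists are refreshed and the scan restarts.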
+ void CompareAndCut(PathContainer& paths, EdgeId edge, BidirectionalPath* path1, BidirectionalPath* path2,
+ size_t max_path_diff,
+ bool del_subpaths, bool del_begins,
+ bool del_all, bool add_overlap_begins) const {
+ vector<size_t> positions1 = path1->FindAll(edge);
+ vector<size_t> positions2 = path2->FindAll(edge);
+ size_t i1 = 0;
+ size_t i2 = 0;
+ bool renewed = false;
+ while (i1 < positions1.size()) {
+ while (i2 < positions2.size()) {
+ DEBUG("CompareAndCutFromPos paths " << g_.int_id(edge));
+ CompareAndCutFromPos(paths, path1, (int) positions1[i1], path2,
+ (int) positions2[i2], max_path_diff,
+ del_subpaths, del_begins, del_all, add_overlap_begins);
+
+ if (positions1[i1] >= path1->Size() || path1->At(positions1[i1]) != edge || positions2[i2] >= path2->Size() || path2->At(positions2[i2]) != edge) {
+ vector<size_t> new_positions1 = path1->FindAll(edge);
+ vector<size_t> new_positions2 = path2->FindAll(edge);
+
+ if (new_positions1.size() == positions1.size() && new_positions2.size() == positions2.size()) {
+ return;
+ }
+ else {
+ positions1 = new_positions1;
+ positions2 = new_positions2;
+ i1 = 0;
+ i2 = 0;
+ renewed = true;
+ break;
+ }
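+ // NB: the increment just below is unreachable; both branches above either return or break.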
+ ++i2;
+ }
+ ++i2;
+ }
+
+ if (renewed) {
+ renewed = false;
+ continue;
+ }
+ ++i1;
+ }
+ }
+
+ void CompareAndCutFromPos(PathContainer& paths, BidirectionalPath* path1, int pos1,
+ BidirectionalPath* path2, int pos2,
+ size_t max_path_diff,
+ bool delete_subpaths, bool delete_begins,
+ bool delete_all, bool add_overlap_begins) const {
+ int last2 = pos2;
+ int last1 = pos1;
+ if (last1 >= (int) path1->Size() || last2 >= (int) path2->Size()) {
+ return;
+ }
+ vector<int> other_path_end;
+ pair<int, int> posRes = ComparePaths(last1, last2, *path1, *path2, max_path_diff);
+ last1 = posRes.first;
+ last2 = posRes.second;
+ BidirectionalPath* conj1 = path1->GetConjPath();
+ BidirectionalPath* conj2 = path2->GetConjPath();
+ size_t first1 = conj1->Size() - pos1 - 1;
+ size_t first2 = conj2->Size() - pos2 - 1;
+ posRes = ComparePaths(first1, first2, *conj1, *conj2, max_path_diff);
+ first2 = conj2->Size() - posRes.second - 1;
+ first1 = conj1->Size() - posRes.first - 1;
+ if ((int)path2->LengthAt(last2) - (int)g_.length(path2->At(last2)) < (int) max_path_diff) {
+ last2 = (int)path2->Size() - 1;
+ }
+ if ((int)path2->Length() - (int)path2->LengthAt(first2) < (int) max_path_diff) {
+ first2 = 0;
+ }
+ if ((int)path1->LengthAt(last1) - (int)g_.length(path1->At(last1)) < (int) max_path_diff) {
+ last1 = (int)path1->Size() - 1;
+ }
+ if ((int)path1->Length() - (int)path1->LengthAt(first1) < (int) max_path_diff) {
+ first1 = 0;
+ }
+
+ CutOverlaps(paths, path1, first1, last1, path1->Size(), path2,
+ first2, last2, path2->Size(), delete_subpaths,
+ delete_begins, delete_all, add_overlap_begins);
+ }
+
+ void AddOverlap(PathContainer& paths, BidirectionalPath* path1, size_t first1, size_t last1) const {
+ BidirectionalPath* overlap = new BidirectionalPath(path1->SubPath(first1, last1 + 1));
+ BidirectionalPath* conj_overlap = new BidirectionalPath(overlap->Conjugate());
+ SubscribeCoverageMap(overlap);
+ SubscribeCoverageMap(conj_overlap);
+ paths.AddPair(overlap, conj_overlap);
+ }
+
+ bool CutOverlaps(PathContainer& paths, BidirectionalPath* path1, size_t first1, size_t last1, size_t size1, BidirectionalPath* path2, size_t first2,
+ size_t last2, size_t size2, bool del_subpaths, bool del_begins, bool del_all, bool add_overlap_begins) const {
+ if (first1 == 0 && last1 == size1 - 1 && del_subpaths) {
+ DEBUG("Removing path " << path1->GetId() << " because of path " << path2->GetId());
+ path1->Print();
+ path2->Print();
+ path1->Clear();
+ } else if (first2 == 0 && last2 == size2 - 1 && del_subpaths) {
+ DEBUG("Removing path " << path2->GetId() << " because of path " << path1->GetId());
+ path2->Print();
+ path1->Print();
+ path2->Clear();
+ } else if (first2 == 0 && first1 == 0 && del_begins) {
+ DEBUG("Path " << path1->GetId() << ", len " << path1->Length() << " and path " << path2->GetId() << ", len " << path2->Length() << " have similar starts");
+ DEBUG("Path 1: " << last1 << " edges of length " << path1->Length() - path1->LengthAt(min(last1 + 1, path1->Size() - 1)));
+ DEBUG("Path 2: " << last2 << " edges of length " << path2->Length() - path2->LengthAt(min(last2 + 1, path2->Size() - 1)));
+ DEBUG("Path 1 has overlap start " << path1->HasOverlapedBegin() << ", path 2 has overlap start " << path2->HasOverlapedBegin());
+
+ if (add_overlap_begins) {
+ AddOverlap(paths, path1, first1, last1);
+ DEBUG("Detaching overlap " << path2->GetId() << " and " << path1->GetId());
+ path2->Print();
+ path1->Print();
+ path1->GetConjPath()->PopBack(last1 + 1);
+ path2->GetConjPath()->PopBack(last2 + 1);
+ } else if (path1->Length() < path2->Length()) {
+ DEBUG("Detaching overlap from " << path1->GetId() << " because of "<< path2->GetId());
+ path1->Print();
+ path2->Print();
+ path1->GetConjPath()->PopBack(last1 + 1);
+ } else {
+ DEBUG("Detaching overlap from " << path2->GetId() << " because of "<< path1->GetId());
+ path2->Print();
+ path1->Print();
+ path2->GetConjPath()->PopBack(last2 + 1);
+ }
+ } else if ((last1 == size1 - 1 && last2 == size2 - 1) && del_begins) {
+ DEBUG("Path " << path1->GetId() << ", len " << path1->Length() << " and path " << path2->GetId() << ", len " << path2->Length() << " have similar ends");
+ DEBUG("Path 1: " << path1->Size() - first1 << " edges of length " << path1->LengthAt(first1));
+ DEBUG("Path 2: " << path2->Size() - first2 << " edges of length " << path2->LengthAt(first2));
+ DEBUG("Path 1 has overlap end " << path1->HasOverlapedEnd() << ", path 2 has overlap end " << path2->HasOverlapedEnd());
+
+ if (add_overlap_begins){
+ AddOverlap(paths, path1, first1, last1);
+ DEBUG("Detaching overlap " << path2->GetId() << " and " << path1->GetId());
+ path2->Print();
+ path1->Print();
+ path1->PopBack(last1 + 1 - first1);
+ path2->PopBack(last2 + 1 - first2);
+ }
+ if (path1->Length() < path2->Length()) {
+ DEBUG("Detaching overlap from " << path1->GetId() << " because of "<< path2->GetId());
+ path1->Print();
+ path2->Print();
+ path1->PopBack(last1 + 1 - first1);
+ } else {
+ DEBUG("Detaching overlap from " << path2->GetId() << " because of "<< path1->GetId());
+ path2->Print();
+ path1->Print();
+ path2->PopBack(last2 + 1 - first2);
+ }
+ } else if (first2 == 0 && del_all) {
+ DEBUG("Detaching overlap from " << path2->GetConjPath()->GetId() << " because of "<< path1->GetId());
+ DEBUG("Does it have overlap in the beginning: " << path2->HasOverlapedBegin());
+ path2->Print();
+ DEBUG(" >>>> ")
+ path1->Print();
+ DEBUG(" ==== ");
+ path2->GetConjPath()->PopBack(last2 + 1);
+ } else if (last2 == size2 - 1 && del_all) {
+ DEBUG("Detaching overlap from " << path2->GetId() << " because of "<< path1->GetId());
+ DEBUG("Does it have overlap in the end: " << path2->HasOverlapedEnd());
+ path2->Print();
+ DEBUG(" >>>> ")
+ path1->Print();
+ DEBUG(" ==== ");
+ path2->PopBack(last1 + 1 - first1);
+ } else if (first1 == 0 && del_all) {
+ DEBUG("Detaching overlap from " << path1->GetConjPath()->GetId() << " because of "<< path2->GetId());
+ DEBUG("Does it have overlap in the end: " << path1->HasOverlapedBegin());
+ path1->Print();
+ DEBUG(" >>>> ")
+ path2->Print();
+ DEBUG(" ==== ");
+ path1->GetConjPath()->PopBack(last1 + 1);
+ } else if (last1 == size1 - 1 && del_all) {
+ DEBUG("Detaching overlap from " << path1->GetId() << " because of "<< path2->GetId());
+ DEBUG("Does it have overlap in the end: " << path1->HasOverlapedBegin());
+ path1->Print();
+ DEBUG(" >>>> ")
+ path2->Print();
+ DEBUG(" ==== ");
+ path1->PopBack(last1 + 1 - first1);
+ } else {
+ return false;
+ }
+ return true;
+ }
+
+ std::vector<EdgeId> GetSortedEdges() const {
+ std::set<EdgeId> edges_set;
+ for (auto iter = g_.ConstEdgeBegin(); !iter.IsEnd(); ++iter) {
+ edges_set.insert(*iter);
+ edges_set.insert(g_.conjugate(*iter));
+ }
+ std::vector<EdgeId> edges(edges_set.begin(), edges_set.end());
+ std::sort(edges.begin(), edges.end(), EdgeLengthAndIdComparator(g_));
+ return edges;
+ }
+
+ bool HasAlreadyOverlapedEnd(BidirectionalPath * path) const {
+ return !path->IsOverlap() and path->HasOverlapedEnd();
+ }
+
+ bool HasAlreadyOverlapedBegin(BidirectionalPath * path) const {
+ return !path->IsOverlap() and path->HasOverlapedBegin();
+ }
+
+ bool IsSamePath(BidirectionalPath * path1,
+ BidirectionalPath * path2) const {
+ return *path2 == *path1 or *path2 == *path1->GetConjPath();
+ }
+
+ void RemoveOverlap(PathContainer& paths, BidirectionalPath* path1,
+ BidirectionalPath* path2, size_t overlap_size) const {
+ BidirectionalPath* conj2 = path2->GetConjPath();
+ if (path1->IsOverlap() && overlap_size == path1->Size()) {
+ DEBUG("Detaching overlap from " << path2->GetConjPath()->GetId() << " because of "<< path1->GetId());
+ path2->Print();
+ path1->Print();
+ conj2->PopBack(overlap_size);
+ path2->SetOverlapedBeginTo(path1);
+ } else if (path2->IsOverlap() && path2->Size() == overlap_size) {
+ DEBUG("Detaching overlap from " << path1->GetId() << " because of "<< path2->GetId());
+ path1->Print();
+ path2->Print();
+ path1->PopBack(overlap_size);
+ path1->SetOverlapedEndTo(path2);
+ } else if (overlap_size < path2->Size()
+ && overlap_size < path1->Size()) {
+ BidirectionalPath* overlap = new BidirectionalPath(g_, path1->Back());
+ BidirectionalPath* conj_overlap = new BidirectionalPath(g_, g_.conjugate(path1->Back()));
+ SubscribeCoverageMap(overlap);
+ SubscribeCoverageMap(conj_overlap);
+ paths.AddPair(overlap, conj_overlap);
+ DEBUG("Detaching overlap " << path1->GetId() << " and " << conj2->GetId());
+ path1->Print();
+ conj2->Print();
+ path1->PopBack();
+ conj2->PopBack();
+
+ for (size_t i = 1; i < overlap_size; ++i) {
+ conj_overlap->PushBack(g_.conjugate(path1->Back()));
+ path1->PopBack();
+ conj2->PopBack();
+ }
+ overlap->SetOverlap(true);
+ path1->SetOverlapedEndTo(overlap);
+ path2->SetOverlapedBeginTo(overlap);
+ }
+ }
+
+ void FindAndRemovePathOverlap(PathContainer& all_paths,
+ BidirectionalPath* path1) const {
+ int last = (int) path1->Size() - 1;
+ if (last <= 0 or coverage_map_.GetCoverage(path1->At(last)) <= 1) {
+ return;
+ }
+ BidirectionalPathSet paths =
+ coverage_map_.GetCoveringPaths(path1->At(last));
+ BidirectionalPath* overlap_path = NULL;
+ size_t overlap_size = 0;
+ for (auto path_iter = paths.begin(); path_iter != paths.end();
+ ++path_iter) {
+ if (IsSamePath(*path_iter, path1)) {
+ continue;
+ }
+ size_t over_size = path1->OverlapEndSize(*path_iter);
+ if (over_size > overlap_size) {
+ overlap_size = over_size;
+ overlap_path = *path_iter;
+ } else if (over_size == overlap_size &&
+ (overlap_path == NULL || (*path_iter)->GetId() < overlap_path->GetId())) {
+ overlap_path = *path_iter;
+ }
+ }
+ if (overlap_path == NULL) {
+ return;
+ }
+ if (overlap_size > 0) {
+ RemoveOverlap(all_paths, path1, overlap_path, overlap_size);
+ }
+ }
+
+ class EdgeLengthAndIdComparator {
+ public:
+ EdgeLengthAndIdComparator(const Graph& g)
+ : g_(g) {
+ }
+ bool operator()(const EdgeId& e1, const EdgeId& e2) const {
+ if (g_.length(e1) > g_.length(e2)) {
+ return true;
+ }
+ if (g_.length(e2) > g_.length(e1)) {
+ return false;
+ }
+ return e1.int_id() < e2.int_id();
+ }
+ private:
+ const Graph& g_;
+ };
+
+ const Graph& g_;
+ GraphCoverageMap& coverage_map_;
+protected:
+ DECL_LOGGER("PEResolver")
+};
+
+class PathExtendResolver {
+
+protected:
+ const Graph& g_;
+ size_t k_;
+
+public:
+ PathExtendResolver(const Graph& g): g_(g), k_(g.k()) {
+ }
+
+ PathContainer makeSimpleSeeds() {
+ std::set<EdgeId> included;
+ PathContainer edges;
+ for (auto iter = g_.ConstEdgeBegin(); !iter.IsEnd(); ++iter) {
+ if (g_.int_id(*iter) <= 0 or InTwoEdgeCycle(*iter, g_))
+ continue;
+ if (included.count(*iter) == 0) {
+ BidirectionalPath * first = new BidirectionalPath(g_, *iter);
+ BidirectionalPath * second = new BidirectionalPath(g_, g_.conjugate(*iter));
+ edges.AddPair(first,second);
+ included.insert(*iter);
+ included.insert(g_.conjugate(*iter));
+ }
+ }
+ return edges;
+ }
+
+ PathContainer extendSeeds(PathContainer& seeds, ContigsMaker& pathExtender) {
+ PathContainer paths;
+ pathExtender.GrowAll(seeds, paths);
+ return paths;
+ }
+
+ void removeEqualPaths(PathContainer& paths, GraphCoverageMap& coverage_map,
+ size_t max_overlap) {
+
+ SimpleOverlapRemover remover(g_, coverage_map);
+ remover.RemoveSimilarPaths(paths, max_overlap, max_overlap, true, false, false, false, false);
+ }
+
+ void removeOverlaps(PathContainer& paths, GraphCoverageMap& coverage_map, size_t min_edge_len, size_t max_path_diff, bool add_overlaps_begin) {
+ SimpleOverlapRemover remover(g_, coverage_map);
+ if (cfg::get().mode == config::pipeline_type::moleculo)
+ remover.CutPseudoSelfConjugatePaths(paths);
+ //writer.WritePathsToFASTA(paths, output_dir + "/before.fasta");
+ //DEBUG("Removing subpaths");
+ //delete not only equal paths, but also subpaths
+ remover.RemoveSimilarPaths(paths, min_edge_len, max_path_diff, false, true, false, false, add_overlaps_begin);
+ //writer.WritePathsToFASTA(paths, output_dir + "/remove_similar.fasta");
+ //DEBUG("Remove overlaps")
+ remover.RemoveOverlaps(paths);
+ //writer.WritePathsToFASTA(paths, output_dir + "/after_remove_overlaps.fasta");
+ remover.RemoveSimilarPaths(paths, min_edge_len, max_path_diff, true, false, false, false, add_overlaps_begin);
+ //writer.WritePathsToFASTA(paths, output_dir + "/remove_equal.fasta");
+ //DEBUG("remove similar path. Max difference " << max_overlap);
+ remover.RemoveSimilarPaths(paths, min_edge_len, max_path_diff, false, true, true, true, add_overlaps_begin);
+ DEBUG("end removing");
+ }
+
+ void RemoveMatePairEnds(PathContainer& paths, size_t min_edge_len) const {
+ DEBUG("remove mp ends");
+ for (size_t i = 0; i < paths.size(); ++i) {
+ RemoveMatePairEnd(*paths.Get(i), min_edge_len);
+ RemoveMatePairEnd(*paths.GetConjugate(i), min_edge_len);
+ }
+ }
+
+ void addUncoveredEdges(PathContainer& paths, GraphCoverageMap& coverageMap) {
+ std::set<EdgeId> included;
+ for (auto iter = g_.ConstEdgeBegin(); !iter.IsEnd(); ++iter) {
+ if (included.count(*iter) == 0 && !coverageMap.IsCovered(*iter)) {
+ BidirectionalPath* path = new BidirectionalPath(g_, *iter);
+ BidirectionalPath* conj = new BidirectionalPath(g_, g_.conjugate(*iter));
+ path->Subscribe(&coverageMap);
+ conj->Subscribe(&coverageMap);
+ coverageMap.BackEdgeAdded(path->At(0), path, path->GapAt(0));
+ coverageMap.BackEdgeAdded(conj->At(0), conj, conj->GapAt(0));
+ paths.AddPair(path, conj);
+ included.insert(*iter);
+ included.insert(g_.conjugate(*iter));
+ }
+ }
+ }
+
+private:
+ void RemoveMatePairEnd(BidirectionalPath& path, size_t min_edge_len) const {
+ int pos = int(path.Size()) - 1;
+ while (pos > 0 and g_.length(path.At(pos)) < min_edge_len) {
+ path.PopBack();
+ pos--;
+ }
+ }
+protected:
+ DECL_LOGGER("PEResolver")
+};
+
+} // namespace path_extend
+
+#endif /* PE_RESOLVER_HPP_ */
diff --git a/src/modules/algorithms/path_extend/pe_utils.hpp b/src/modules/algorithms/path_extend/pe_utils.hpp
new file mode 100644
index 0000000..f061af5
--- /dev/null
+++ b/src/modules/algorithms/path_extend/pe_utils.hpp
@@ -0,0 +1,462 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+/*
+ * pe_utils.hpp
+ *
+ * Created on: Nov 27, 2012
+ * Author: andrey
+ */
+
+#ifndef PE_UTILS_HPP_
+#define PE_UTILS_HPP_
+
+#include "assembly_graph/paths/bidirectional_path.hpp"
+
+using namespace debruijn_graph;
+
+namespace path_extend {
+
+//Checks whether we are in a cycle of length 2, used only for seed selection.
+inline bool InTwoEdgeCycle(EdgeId e, const Graph &g) {
+ auto v = g.EdgeEnd(e);
+ if (g.OutgoingEdgeCount(v) >= 1) {
+ auto edges = g.OutgoingEdges(v);
+ for (auto it = edges.begin(); it != edges.end(); ++it) {
+ if (g.EdgeStart(e) == g.EdgeEnd(*it)) {
+ return true;
+ }
+ }
+ }
+ return false;
+}
+
+inline bool InBuble(EdgeId e, const Graph& g) {
+ auto edges = g.OutgoingEdges(g.EdgeStart(e));
+ auto endVertex = g.EdgeEnd(e);
+ for (auto it = edges.begin(); it != edges.end(); ++it) {
+ if ((g.EdgeEnd(*it) == endVertex) and (*it != e)) {
+ return true;
+ }
+ }
+ return false;
+}
+
+
+// Handles all paths in PathContainer.
+// For each edge, reports all paths that _traverse_ that edge; if a path contains the edge several times, every occurrence is counted. The position of the edge within the path is not reported.
+//TODO: The internals are convoluted and should be rewritten.
+//TODO: Memory leaks, inefficient data structure.
+class GraphCoverageMap: public PathListener {
+
+public:
+ typedef BidirectionalPathMultiset MapDataT;
+
+
+protected:
+ const Graph& g_;
+
+ std::map <EdgeId, MapDataT * > edgeCoverage_;
+
+ MapDataT * empty_;
+
+ virtual void EdgeAdded(EdgeId e, BidirectionalPath * path, Gap /*gap*/) {
+ auto iter = edgeCoverage_.find(e);
+ if (iter == edgeCoverage_.end()) {
+ edgeCoverage_.insert(std::make_pair(e, new MapDataT()));
+ }
+ edgeCoverage_[e]->insert(path);
+ }
+
+ virtual void EdgeRemoved(EdgeId e, BidirectionalPath * path) {
+ auto iter = edgeCoverage_.find(e);
+ if (iter != edgeCoverage_.end()) {
+ if (iter->second->count(path) == 0) {
+ DEBUG("Error erasing path from coverage map");
+ } else {
+ auto entry = iter->second->find(path);
+ iter->second->erase(entry);
+ }
+ }
+ }
+
+public:
+ GraphCoverageMap(const Graph& g) : g_(g), edgeCoverage_() {
+ empty_ = new MapDataT();
+ }
+
+ GraphCoverageMap(const Graph& g, const PathContainer& paths) : g_(g), edgeCoverage_() {
+ empty_ = new MapDataT();
+ for (size_t i = 0; i < paths.size(); ++i) {
+ for (size_t j = 0; j < paths.Get(i)->Size(); ++j) {
+ EdgeAdded(paths.Get(i)->At(j), paths.Get(i), paths.Get(i)->GapAt(j));
+ }
+ for (size_t j = 0; j < paths.GetConjugate(i)->Size(); ++j) {
+ EdgeAdded(paths.GetConjugate(i)->At(j), paths.GetConjugate(i), paths.GetConjugate(i)->GapAt(j));
+ }
+ }
+ }
+
+ virtual ~GraphCoverageMap() {
+ delete empty_;
+ for (auto iter = edgeCoverage_.begin(); iter != edgeCoverage_.end(); ++iter) {
+ delete iter->second;
+ }
+ }
+
+ void Clear() {
+ for (auto iter = edgeCoverage_.begin(); iter != edgeCoverage_.end(); ++iter) {
+ MapDataT* cover_paths = iter->second;
+ for (auto ipath = cover_paths->begin(); ipath != cover_paths->end(); ++ipath) {
+ BidirectionalPath* p = *ipath;
+ p->Unsubscribe(this);
+ }
+ delete cover_paths;
+ }
+ edgeCoverage_.clear();
+ }
+
+ void Subscribe(BidirectionalPath * path) {
+ path->Subscribe(this);
+ for (size_t i = 0; i < path->Size(); ++i) {
+ BackEdgeAdded(path->At(i), path, path->GapAt(i));
+ }
+ }
+
+ virtual void FrontEdgeAdded(EdgeId e, BidirectionalPath * path, Gap gap) {
+ EdgeAdded(e, path, gap);
+ }
+
+ virtual void BackEdgeAdded(EdgeId e, BidirectionalPath * path, Gap gap) {
+ EdgeAdded(e, path, gap);
+ }
+
+ virtual void FrontEdgeRemoved(EdgeId e, BidirectionalPath * path) {
+ EdgeRemoved(e, path);
+ }
+
+ virtual void BackEdgeRemoved(EdgeId e, BidirectionalPath * path) {
+ EdgeRemoved(e, path);
+ }
+
+ MapDataT * GetEdgePaths(EdgeId e) const {
+ auto iter = edgeCoverage_.find(e);
+ if (iter != edgeCoverage_.end()) {
+ return iter->second;
+ }
+ return empty_;
+ }
+
+ int GetCoverage(EdgeId e) const {
+ return (int) GetEdgePaths(e)->size();
+ }
+
+ bool IsCovered(EdgeId e) const {
+ return GetCoverage(e) > 0;
+ }
+
+ bool IsCovered(const BidirectionalPath& path) const {
+ for (size_t i = 0; i < path.Size(); ++i) {
+ if (!IsCovered(path[i])) {
+ return false;
+ }
+ }
+ return true;
+ }
+
+ int GetCoverage(const BidirectionalPath& path) const {
+ if (path.Empty()) {
+ return 0;
+ }
+
+ int cov = GetCoverage(path[0]);
+ for (size_t i = 1; i < path.Size(); ++i) {
+ int currentCov = GetCoverage(path[i]);
+ if (cov > currentCov) {
+ cov = currentCov;
+ }
+ }
+
+ return cov;
+ }
+
+ BidirectionalPathSet GetCoveringPaths(EdgeId e) const {
+ auto mapData = GetEdgePaths(e);
+ return BidirectionalPathSet(mapData->begin(), mapData->end());
+ }
+
+ int GetUniqueCoverage(EdgeId e) const {
+ return (int) GetCoveringPaths(e).size();
+ }
+
+ std::map <EdgeId, MapDataT * >::const_iterator begin() const {
+ return edgeCoverage_.begin();
+ }
+
+ std::map <EdgeId, MapDataT * >::const_iterator end() const {
+ return edgeCoverage_.end();
+ }
+
+ // DEBUG
+
+ void PrintUncovered() const {
+ DEBUG("Uncovered edges");
+ int s = 0;
+ for (auto iter = g_.ConstEdgeBegin(); !iter.IsEnd(); ++iter) {
+ if (!IsCovered(*iter)) {
+ DEBUG(g_.int_id(*iter) << " (" << g_.length(*iter) << ") ~ " << g_.int_id(g_.conjugate(*iter)) << " (" << g_.length(g_.conjugate(*iter)) << ")");
+ s += 1;
+ }
+ }
+ DEBUG("Uncovered edges " << s / 2);
+ }
+
+ void PrintMulticovered() const {
+ DEBUG("Multicovered edges");
+ for (auto iter = g_.ConstEdgeBegin(); !iter.IsEnd(); ++iter) {
+ auto paths = GetCoveringPaths(*iter);
+ if (paths.size() > 1 && g_.length(*iter) > 1000) {
+ DEBUG(g_.int_id(*iter) << " (" << g_.length(*iter) << "). " << " Covered: " << paths.size());
+ for (auto path = paths.begin(); path != paths.end(); ++path) {
+ (*path)->Print();
+ }
+ DEBUG("=====");
+ }
+ }
+ }
+
+ size_t size() const {
+ return edgeCoverage_.size();
+ }
+
+ const Graph& graph() const {
+ return g_;
+ }
+
+private:
+ GraphCoverageMap(const GraphCoverageMap& t) : g_(t.g_), empty_(t.empty_) {}
+};
+
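+// Detects a short loop attached to e: e's end vertex must have exactly one incoming and two outgoing
+// edges, and e's start vertex one outgoing and two incoming; the outgoing edge that returns to e's
+// start is reported as the loop, the remaining one as the exit.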
+inline bool GetLoopAndExit(const Graph& g, EdgeId e, pair<EdgeId, EdgeId>& result) {
+ VertexId v = g.EdgeEnd(e);
+ VertexId start = g.EdgeStart(e);
+ if (g.OutgoingEdgeCount(v) != 2 || g.IncomingEdgeCount(v) != 1 || g.OutgoingEdgeCount(start) != 1 || g.IncomingEdgeCount(start) != 2) {
+ return false;
+ }
+ EdgeId loop;
+ EdgeId exit;
+ bool loop_found = false;
+ bool exit_found = false;
+ auto edges = g.OutgoingEdges(v);
+ for (auto edge = edges.begin(); edge != edges.end(); ++edge) {
+ if (g.EdgeEnd(*edge) == g.EdgeStart(e) && *edge != e) {
+ loop = *edge;
+ loop_found = true;
+ } else if (*edge != e) {
+ exit = *edge;
+ exit_found = true;
+ }
+ }
+ result = make_pair(loop, exit);
+ return exit_found && loop_found;
+}
+
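+// Detects cycling at the end of a path by counting how often its last edge occurs (via the coverage map)
+// and allows removing the redundant loop iterations.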
+class LoopDetector {
+public:
+ LoopDetector(BidirectionalPath* p, const GraphCoverageMap& cov_map);
+ size_t LoopEdges(size_t skip_identical_edges, size_t min_cycle_appearences) const;
+ size_t LoopLength(size_t skip_identical_edges, size_t min_cycle_appearences) const;
+ bool PathIsLoop(size_t edges) const;
+ size_t LastLoopCount(size_t skip_identical_edges, size_t min_cycle_appearences) const;
+ size_t LastLoopCount(size_t edges) const;
+ bool IsCycled(size_t loopLimit, size_t& skip_identical_edges) const;
+ size_t EdgesToRemove(size_t skip_identical_edges, bool fullRemoval = false) const;
+ void RemoveLoop(size_t skip_identical_edges, bool fullRemoval = true);
+ bool EdgeInShortLoop(EdgeId e) const;
+ bool PrevEdgeInShortLoop() const;
+private:
+ BidirectionalPath* path_;
+ const GraphCoverageMap& cov_map_;
+ DECL_LOGGER("BidirectionalPath");
+};
+
+inline LoopDetector::LoopDetector(BidirectionalPath* p, const GraphCoverageMap& cov_map)
+ : path_(p),
+ cov_map_(cov_map) {
+}
+
+inline size_t LoopDetector::LoopEdges(size_t skip_identical_edges, size_t min_cycle_appearences) const {
+ if (path_->Size() == 0) {
+ return 0;
+ }
+ EdgeId e = path_->Back();
+ size_t count = cov_map_.GetEdgePaths(e)->count(path_);
+ if (count <= 1 || count < min_cycle_appearences * (skip_identical_edges + 1)) {
+ return 0;
+ }
+ vector<size_t> edge_positions = path_->FindAll(e);
+ VERIFY(edge_positions.size() == count);
+ VERIFY(edge_positions.size() >= skip_identical_edges);
+ size_t loopSize = edge_positions.back() - edge_positions[edge_positions.size() - 1 - (skip_identical_edges + 1)];
+ return loopSize;
+}
+
+inline bool LoopDetector::PathIsLoop(size_t edges) const {
+ if (edges == 0 || path_->Size() <= 1)
+ return false;
+
+ for (size_t i = 0; i < edges; ++i) {
+ EdgeId e = path_->At(i);
+ for (int j = (int) path_->Size() - ((int) edges - (int) i); j >= 0; j -= (int) edges) {
+ if (path_->operator [](j) != e) {
+ return false;
+ }
+ }
+ }
+ return true;
+}
+
+inline size_t LoopDetector::LastLoopCount(size_t skip_identical_edges, size_t min_cycle_appearences) const {
+ size_t edges = LoopEdges(skip_identical_edges, min_cycle_appearences);
+ return LastLoopCount(edges);
+}
+
+inline size_t LoopDetector::LastLoopCount(size_t edges) const {
+ if (edges == 0) {
+ return 0;
+ }
+
+ BidirectionalPath loop = path_->SubPath(path_->Size() - edges);
+ size_t count = 0;
+ int i = (int) path_->Size() - (int) edges;
+ int delta = -(int) edges;
+
+ while (i >= 0) {
+ if (!path_->CompareFrom(i, loop)) {
+ break;
+ }
+ ++count;
+ i += delta;
+ }
+
+ return count;
+}
+
+inline bool LoopDetector::IsCycled(size_t loopLimit, size_t& skip_identical_edges) const {
+ if (path_->Size() == 0 or cov_map_.GetEdgePaths(path_->Back())->count(path_) < loopLimit) {
+ return false;
+ }
+ skip_identical_edges = 0;
+ size_t loop_count = LastLoopCount(skip_identical_edges, loopLimit);
+ while (loop_count > 0) {
+ if (loop_count >= loopLimit) {
+ return true;
+ }
+ loop_count = LastLoopCount(++skip_identical_edges, loopLimit);
+ }
+ return false;
+}
+
+inline size_t LoopDetector::EdgesToRemove(size_t skip_identical_edges, bool fullRemoval) const {
+ size_t edges = LoopEdges(skip_identical_edges, 1);
+ size_t count = LastLoopCount(edges);
+ bool onlyCycle = PathIsLoop(edges);
+ int result;
+
+ if (onlyCycle || path_->Size() <= count * edges) {
+ result = (int) path_->Size() - (int) edges;
+ } else if (fullRemoval) {
+ result = (int) count * (int) edges;
+ } else {
+ result = (int) (count - 1) * (int) edges;
+ }
+
+ return result < 0 ? 0 : result;
+}
+
+inline void LoopDetector::RemoveLoop(size_t skip_identical_edges, bool fullRemoval) {
+ size_t toRemove = EdgesToRemove(skip_identical_edges, fullRemoval);
+ for (size_t i = 0; i < toRemove; ++i) {
+ path_->PopBack();
+ }
+}
+
+inline bool LoopDetector::EdgeInShortLoop(EdgeId e) const {
+ pair<EdgeId, EdgeId> temp;
+ return GetLoopAndExit(path_->graph(), e, temp);
+}
+
+inline bool LoopDetector::PrevEdgeInShortLoop() const {
+ if (path_->Size() <= 2) {
+ return false;
+ }
+ const Graph& g = path_->graph();
+ EdgeId e2 = path_->At(path_->Size() - 1);
+ EdgeId e1 = path_->At(path_->Size() - 2);
+ VertexId v2 = g.EdgeEnd(e1);
+ if (g.OutgoingEdgeCount(v2) == 2 && g.EdgeEnd(e2) == g.EdgeStart(e1) && g.EdgeEnd(e1) == g.EdgeStart(e2)) {
+ return EdgeInShortLoop(e1);
+ }
+ return false;
+}
+
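+// Splits each input path at gaps larger than min_gap_ and stores the resulting pieces
+// (together with their conjugates) in an internal container.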
+class ScaffoldBreaker {
+private:
+
+ int min_gap_;
+
+ PathContainer container_;
+
+ void SplitPath(const BidirectionalPath& path) {
+ size_t i = 0;
+
+ while (i < path.Size()) {
+ BidirectionalPath * p = new BidirectionalPath(path.graph(), path[i]);
+ ++i;
+
+ while (i < path.Size() and path.GapAt(i) <= min_gap_) {
+ p->PushBack(path[i], path.GapAt(i), path.TrashPreviousAt(i), path.TrashCurrentAt(i));
+ ++i;
+ }
+
+ if (i < path.Size()) {
+ DEBUG("split path " << i << " gap " << path.GapAt(i));
+ p->Print();
+ }
+
+ BidirectionalPath * cp = new BidirectionalPath(p->Conjugate());
+ container_.AddPair(p, cp);
+ }
+ }
+
+public:
+
+ ScaffoldBreaker(int min_gap, const PathContainer &paths)
+ : min_gap_(min_gap) {
+ for (auto it = paths.begin(); it != paths.end(); ++it) {
+ SplitPath(*it.get());
+ }
+ }
+
+ ~ScaffoldBreaker() {
+ // FIXME: WTF, Why doesn't PathContainer own the paths?
+ container_.DeleteAllPaths();
+ }
+
+ void clear() {
+ container_.clear();
+ }
+
+ PathContainer& container() {
+ return container_;
+ }
+
+};
+
+}
+
+#endif /* PE_UTILS_HPP_ */
diff --git a/src/modules/algorithms/path_extend/scaffolder2015/connection_condition2015.cpp b/src/modules/algorithms/path_extend/scaffolder2015/connection_condition2015.cpp
new file mode 100644
index 0000000..14ba367
--- /dev/null
+++ b/src/modules/algorithms/path_extend/scaffolder2015/connection_condition2015.cpp
@@ -0,0 +1,144 @@
+#include "connection_condition2015.hpp"
+namespace path_extend {
+
+PairedLibConnectionCondition::PairedLibConnectionCondition(const debruijn_graph::Graph &graph,
+ shared_ptr <PairedInfoLibrary> lib,
+ size_t lib_index,
+ size_t min_read_count) :
+ graph_(graph),
+ lib_(lib),
+ lib_index_(lib_index),
+ min_read_count_(min_read_count),
+//TODO reconsider condition
+ left_dist_delta_(5 * (int) lib_->GetISMax()),
+ right_dist_delta_(max(5 * (int) lib_->GetIsVar(), int(lib_->is_))) {
+}
+
+size_t PairedLibConnectionCondition::GetLibIndex() const {
+ return lib_index_;
+}
+
+set <debruijn_graph::EdgeId> PairedLibConnectionCondition::ConnectedWith(debruijn_graph::EdgeId e) const {
+ set <debruijn_graph::EdgeId> all_edges;
+ int e_length = (int) graph_.length(e);
+ lib_->FindJumpEdges(e, all_edges, e_length - left_dist_delta_, e_length + right_dist_delta_);
+
+ set <debruijn_graph::EdgeId> result;
+ for (auto edge : all_edges) {
+ if (edge != e && edge != graph_.conjugate(e) &&
+ math::ge(GetWeight(e, edge), (double) min_read_count_)) {
+ result.insert(edge);
+ }
+ }
+ return result;
+}
+
+double PairedLibConnectionCondition::GetWeight(debruijn_graph::EdgeId e1, debruijn_graph::EdgeId e2) const {
+ int e_length = (int) graph_.length(e1);
+ return lib_->CountPairedInfo(e1, e2, e_length - left_dist_delta_, e_length + right_dist_delta_);
+}
+
+AdvancedPairedConnectionCondition::AdvancedPairedConnectionCondition(const debruijn_graph::Graph &graph,
+ shared_ptr <PairedInfoLibrary> lib,
+ size_t lib_index,
+ size_t always_add,
+ size_t never_add,
+ double relative_threshold):
+ PairedLibConnectionCondition(graph, lib, lib_index, never_add),
+ always_add_(always_add),
+ never_add_(never_add),
+ relative_threshold_(relative_threshold) {}
+
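+// Keeps only candidates whose paired-info weight reaches a threshold derived from the strongest
+// candidate: max(never_add_, min(always_add_, max_weight * relative_threshold_)).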
+set <debruijn_graph::EdgeId> AdvancedPairedConnectionCondition::ConnectedWith(debruijn_graph::EdgeId e) const {
+ set <debruijn_graph::EdgeId> all_edges;
+ int e_length = (int) graph_.length(e);
+ lib_->FindJumpEdges(e, all_edges, e_length - left_dist_delta_, e_length + right_dist_delta_);
+
+ double max_weight = 0;
+ for (auto edge : all_edges) {
+ if (edge != e && edge != graph_.conjugate(e)) {
+ double w = GetWeight(e, edge);
+ if (math::gr(w, max_weight))
+ max_weight = w;
+ }
+ }
+ double threshold = std::max((double) never_add_, std::min((double) always_add_, max_weight * relative_threshold_));
+
+ set <debruijn_graph::EdgeId> result;
+ for (auto edge : all_edges) {
+ if (edge != e && edge != graph_.conjugate(e) &&
+ math::ge(GetWeight(e, edge), threshold)) {
+ result.insert(edge);
+ }
+ }
+ return result;
+}
+
+
+//TODO: We use same part of index twice, is it necessary?
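+//Returns the weighted median of the admissible distances between e1 and e2, shifted by the length of e1
+//(i.e. the estimated gap between the two edges).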
+int PairedLibConnectionCondition::GetMedianGap(debruijn_graph::EdgeId e1, debruijn_graph::EdgeId e2) const {
+ std::vector<int> distances;
+ std::vector<double> weights;
+ int e_length = (int) graph_.length(e1);
+ lib_->CountDistances(e1, e2, distances, weights);
+ std::vector<pair<int, double> >h(distances.size());
+ for (size_t i = 0; i< distances.size(); i++) {
+ if (distances[i] >= e_length - left_dist_delta_ && distances[i] <= e_length + right_dist_delta_)
+ h.push_back(std::make_pair(distances[i], weights[i]));
+ }
+//TODO: is it really necessary?
+ std::sort(h.begin(), h.end());
+ double sum = 0.0;
+ double sum2 = 0.0;
+ for (size_t j = 0; j< h.size(); ++j) {
+ sum += h[j].second;
+ }
+ size_t i = 0;
+ for (; i < h.size(); ++i) {
+ sum2 += h[i].second;
+ if (sum2 * 2 > sum)
+ break;
+ }
+ return (int) round(h[i].first - e_length);
+}
+
+AssemblyGraphConnectionCondition::AssemblyGraphConnectionCondition(const debruijn_graph::Graph &g,
+ size_t max_connection_length, const ScaffoldingUniqueEdgeStorage & unique_edges) :
+ g_(g), max_connection_length_(max_connection_length), interesting_edge_set_(unique_edges.GetSet()), stored_distances_() {
+}
+
+set <debruijn_graph::EdgeId> AssemblyGraphConnectionCondition::ConnectedWith(debruijn_graph::EdgeId e) const {
+ VERIFY_MSG(interesting_edge_set_.find(e)!= interesting_edge_set_.end(), " edge "<< e.int_id() << " not applicable for connection condition");
+ if (stored_distances_.find(e) != stored_distances_.end()) {
+ return stored_distances_[e];
+ }
+ stored_distances_.insert(make_pair(e, set<debruijn_graph::EdgeId>()));
+ for (auto connected: g_.OutgoingEdges(g_.EdgeEnd(e))) {
+ if (interesting_edge_set_.find(connected) != interesting_edge_set_.end()) {
+ stored_distances_[e].insert(connected);
+ }
+ }
+ DijkstraHelper<debruijn_graph::Graph>::BoundedDijkstra dijkstra(
+ DijkstraHelper<debruijn_graph::Graph>::CreateBoundedDijkstra(g_, max_connection_length_));
+ dijkstra.Run(g_.EdgeEnd(e));
+ for (auto v: dijkstra.ReachedVertices()) {
+ for (auto connected: g_.OutgoingEdges(v)) {
+ if (interesting_edge_set_.find(connected) != interesting_edge_set_.end() && dijkstra.GetDistance(v) < max_connection_length_) {
+ stored_distances_[e].insert(connected);
+ }
+ }
+ }
+ return stored_distances_[e];
+}
+void AssemblyGraphConnectionCondition::AddInterestingEdge(debruijn_graph::EdgeId e) {
+ interesting_edge_set_.insert(e);
+}
+double AssemblyGraphConnectionCondition::GetWeight(debruijn_graph::EdgeId, debruijn_graph::EdgeId) const {
+ return 1.0;
+}
+
+size_t AssemblyGraphConnectionCondition::GetLibIndex() const {
+ return (size_t) - 1;
+}
+
+}
diff --git a/src/modules/algorithms/path_extend/scaffolder2015/connection_condition2015.hpp b/src/modules/algorithms/path_extend/scaffolder2015/connection_condition2015.hpp
new file mode 100644
index 0000000..0cfe58e
--- /dev/null
+++ b/src/modules/algorithms/path_extend/scaffolder2015/connection_condition2015.hpp
@@ -0,0 +1,90 @@
+
+#ifndef CONNECTION_CONDITION2015_HPP
+#define CONNECTION_CONDITION2015_HPP
+#include "algorithms/genome_consistance_checker.hpp"
+#include "dev_support/logger/logger.hpp"
+#include "algorithms/path_extend/paired_library.hpp"
+#include "assembly_graph/graph_support/scaff_supplementary.hpp"
+#include <map>
+#include <set>
+
+namespace path_extend {
+
+/* Connection conditions are used by both the scaffolder's extension chooser and the scaffold graph */
+
+class ConnectionCondition {
+public:
+// Returns the set of edges that e is connected with.
+//TODO performance issue: consider filtering inside this call. Return only unique connected edges?
+ virtual set <debruijn_graph::EdgeId> ConnectedWith(debruijn_graph::EdgeId e) const = 0;
+// Returns the weight of the pair (e1, e2)
+ virtual double GetWeight(debruijn_graph::EdgeId e1, debruijn_graph::EdgeId e2) const = 0;
+ virtual size_t GetLibIndex() const = 0;
+ virtual ~ConnectionCondition() {
+ }
+};
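+
+// Illustrative sketch of how a concrete condition is typically queried ('cond', 'e' and 'threshold'
+// are placeholder names, not part of this header):
+//   for (debruijn_graph::EdgeId next : cond.ConnectedWith(e))
+//       if (math::ge(cond.GetWeight(e, next), threshold))
+//           /* treat 'next' as a candidate connection of e */;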
+
+// Main (mate pair library) connection condition.
+class PairedLibConnectionCondition : public ConnectionCondition {
+protected:
+ const debruijn_graph::Graph &graph_;
+ shared_ptr <PairedInfoLibrary> lib_;
+ size_t lib_index_;
+//Minimal number of mate pairs required to consider a connection sound
+ size_t min_read_count_;
+public:
+//Only paired info with a gap between e1 and e2 in the range [-left_dist_delta_, right_dist_delta_] is taken into account
+ int left_dist_delta_;
+ int right_dist_delta_;
+
+ PairedLibConnectionCondition(const debruijn_graph::Graph &graph,
+ shared_ptr <PairedInfoLibrary> lib,
+ size_t lib_index,
+ size_t min_read_count);
+ size_t GetLibIndex() const override;
+ set <debruijn_graph::EdgeId> ConnectedWith(debruijn_graph::EdgeId e) const override;
+ double GetWeight(debruijn_graph::EdgeId e1, debruijn_graph::EdgeId e2) const override;
+//Returns median gap size
+ int GetMedianGap (debruijn_graph::EdgeId e1, debruijn_graph::EdgeId e2) const;
+};
+
+//Advanced mate-pair connection condition
+class AdvancedPairedConnectionCondition: public PairedLibConnectionCondition {
+protected:
+ size_t always_add_;
+ size_t never_add_;
+ double relative_threshold_;
+
+public:
+ AdvancedPairedConnectionCondition(const debruijn_graph::Graph &graph,
+ shared_ptr <PairedInfoLibrary> lib,
+ size_t lib_index,
+ size_t always_add,
+ size_t never_add,
+ double relative_threshold);
+
+ set <debruijn_graph::EdgeId> ConnectedWith(debruijn_graph::EdgeId e) const override;
+
+};
+
+/* Condition used to find edges connected in the assembly graph. */
+class AssemblyGraphConnectionCondition : public ConnectionCondition {
+protected:
+ const debruijn_graph::Graph &g_;
+//Maximal gap to the connection.
+ size_t max_connection_length_;
+ set<EdgeId> interesting_edge_set_;
+ mutable map <debruijn_graph::Graph::EdgeId, set<debruijn_graph::Graph::EdgeId>> stored_distances_;
+public:
+ AssemblyGraphConnectionCondition(const debruijn_graph::Graph &g, size_t max_connection_length,
+ const ScaffoldingUniqueEdgeStorage& unique_edges);
+ void AddInterestingEdge(debruijn_graph::EdgeId e);
+ set <debruijn_graph::EdgeId> ConnectedWith(debruijn_graph::EdgeId e) const override;
+ double GetWeight(debruijn_graph::EdgeId, debruijn_graph::EdgeId) const override;
+ size_t GetLibIndex() const override;
+};
+}
+
+#endif //CONNECTION_CONDITION2015_HPP
diff --git a/src/modules/algorithms/path_extend/scaffolder2015/extension_chooser2015.cpp b/src/modules/algorithms/path_extend/scaffolder2015/extension_chooser2015.cpp
new file mode 100644
index 0000000..1e2af32
--- /dev/null
+++ b/src/modules/algorithms/path_extend/scaffolder2015/extension_chooser2015.cpp
@@ -0,0 +1,82 @@
+//
+// Created by lab42 on 8/26/15.
+//
+
+#include "extension_chooser2015.hpp"
+
+namespace path_extend {
+using namespace std;
+
+std::pair<EdgeId, int> ExtensionChooser2015::FindLastUniqueInPath(const BidirectionalPath& path) const {
+ for (int i = (int)path.Size() - 1; i >= 0; --i) {
+ if (unique_edges_.IsUnique(path.At(i))) {
+ return std::make_pair(path.At(i), i);
+ }
+ }
+ return std::make_pair(EdgeId(0), -1);
+}
+
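+// Collects unique edges linked to 'from' by paired info, filters them with the absolute and relative
+// weight thresholds, and boosts candidates that are also reachable in the assembly graph.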
+ExtensionChooser::EdgeContainer ExtensionChooser2015::FindNextUniqueEdge(const EdgeId from) const {
+ VERIFY(unique_edges_.IsUnique(from));
+ EdgeContainer result;
+ set<EdgeId> candidate_edges = paired_connection_condition_.ConnectedWith(from);
+ vector<pair<double, pair<EdgeId, int >>> to_sort;
+ for (EdgeId e : candidate_edges) {
+ if (!unique_edges_.IsUnique(e)) {
+ continue;
+ }
+ double sum = paired_connection_condition_.GetWeight(from, e);
+ DEBUG("edge " << g_.int_id(e) << " weight " << sum);
+ if (sum < absolute_weight_threshold_) {
+ DEBUG("Edge " << g_.int_id(e) << " weight " << sum << " failed absolute weight threshold " << absolute_weight_threshold_);
+ continue;
+ }
+ int gap = paired_connection_condition_.GetMedianGap(from, e);
+
+ auto connected_with = graph_connection_condition_.ConnectedWith(from);
+ if (connected_with.find(e) != connected_with.end()) {
+ sum *= graph_connection_bonus_;
+ }
+ to_sort.push_back(make_pair(sum, make_pair(e, gap)));
+ }
+//sort in descending order (hence the reverse iterators)
+ sort(to_sort.rbegin(), to_sort.rend());
+ for(size_t j = 0; j < to_sort.size(); j++) {
+ if (j == 0 || to_sort[j].first* relative_weight_threshold_ > to_sort[j - 1].first) {
+ result.push_back(EdgeWithDistance(to_sort[j].second.first, to_sort[j].second.second));
+ DEBUG("Edge " << g_.int_id(to_sort[j].second.first) << " gap " << to_sort[j].second.second << " weight "<< to_sort[j].first << " passed absolute weight threshold " << absolute_weight_threshold_);
+ } else {
+ DEBUG ("Edge " << g_.int_id(to_sort[j].second.first) << " weight " << to_sort[j].first << " failed relative weight threshold " << relative_weight_threshold_);
+ DEBUG("other removed");
+ break;
+ }
+ }
+ return result;
+}
+
+ExtensionChooser::EdgeContainer ExtensionChooser2015::Filter(const BidirectionalPath& path, const ExtensionChooser::EdgeContainer& /*edges*/) const {
+// set<EdgeId> candidates = FindCandidates(path);
+ pair<EdgeId, int> last_unique = FindLastUniqueInPath(path);
+ EdgeContainer result;
+
+ if (last_unique.second < 0) {
+// No unique edge found
+ return result;
+ }
+
+ result = FindNextUniqueEdge(last_unique.first);
+//Backward check. We connect two edges iff each is the best continuation of the other.
+ if (result.size() == 1) {
+ //Reduce the gap size by the length of the edges that follow the last unique edge.
+ result[0].d_ -= int (path.LengthAt(last_unique.second) - g_.length(last_unique.first));
+
+ DEBUG("For edge " << g_.int_id(last_unique.first) << " unique next edge "<< result[0].e_ <<" found, doing backwards check ");
+ EdgeContainer backwards_check = FindNextUniqueEdge(g_.conjugate(result[0].e_));
+ if ((backwards_check.size() != 1) || (g_.conjugate(backwards_check[0].e_) != last_unique.first)) {
+ result.clear();
+ }
+ }
+ return result;
+}
+
+}
diff --git a/src/modules/algorithms/path_extend/scaffolder2015/extension_chooser2015.hpp b/src/modules/algorithms/path_extend/scaffolder2015/extension_chooser2015.hpp
new file mode 100644
index 0000000..5afa91b
--- /dev/null
+++ b/src/modules/algorithms/path_extend/scaffolder2015/extension_chooser2015.hpp
@@ -0,0 +1,49 @@
+//
+// Created by lab42 on 8/26/15.
+//
+#pragma once
+
+#include "algorithms/path_extend/extension_chooser.hpp"
+#include "connection_condition2015.hpp"
+#include "algorithms/genome_consistance_checker.hpp"
+#include "dev_support/logger/logger.hpp"
+#include <map>
+#include <set>
+namespace path_extend {
+class ExtensionChooser2015: public ScaffoldingExtensionChooser {
+private:
+ const ScaffoldingUniqueEdgeStorage& unique_edges_;
+// For possible connections e1 and e2: if weight(e1) > relative_weight_threshold_ * weight(e2), then e2 is ignored
+ double relative_weight_threshold_;
+ PairedLibConnectionCondition paired_connection_condition_;
+ AssemblyGraphConnectionCondition graph_connection_condition_;
+// Connections with weight < absolute_weight_threshold_ are ignored
+ size_t absolute_weight_threshold_;
+// Multiplier for pairs that are also connected in the assembly graph.
+ double graph_connection_bonus_;
+
+protected:
+//If the path contains no unique edges, returns -1 as the position
+ pair<EdgeId, int> FindLastUniqueInPath(const BidirectionalPath& path) const;
+//Finds all possible next unique edges confirmed by mate-pair information; (absolute/relative)_weight_threshold_ are used for filtering
+ EdgeContainer FindNextUniqueEdge(const EdgeId from) const;
+ DECL_LOGGER("ExtensionChooser2015")
+public:
+ ExtensionChooser2015(const Graph& g, shared_ptr<WeightCounter> wc, double is_scatter_coeff,
+ const ScaffoldingUniqueEdgeStorage& unique_edges ,double relative_threshold, size_t lib_index):
+ ScaffoldingExtensionChooser(g, wc, is_scatter_coeff), unique_edges_(unique_edges), relative_weight_threshold_(relative_threshold), paired_connection_condition_(g,
+ wc->get_libptr(), lib_index,
+//TODO: constants are subject to reconsider
+ 0), graph_connection_condition_(g, 2*unique_edges_.GetMinLength(), unique_edges), absolute_weight_threshold_(2), graph_connection_bonus_(2) {
+ INFO("ExtensionChooser2015 created");
+ }
+/* @param edges is not actually used and is kept only for interface compatibility
+ * @returns the possible next edge if there is a unique one, otherwise an empty container
+ */
+
+ EdgeContainer Filter(const BidirectionalPath& path, const EdgeContainer& edges) const override;
+};
+
+
+}
diff --git a/src/modules/algorithms/path_extend/scaffolder2015/scaffold_graph.cpp b/src/modules/algorithms/path_extend/scaffolder2015/scaffold_graph.cpp
new file mode 100644
index 0000000..7e3312a
--- /dev/null
+++ b/src/modules/algorithms/path_extend/scaffolder2015/scaffold_graph.cpp
@@ -0,0 +1,275 @@
+#include "scaffold_graph.hpp"
+
+
+namespace path_extend {
+namespace scaffold_graph {
+
+std::atomic<ScaffoldGraph::ScaffoldEdgeIdT> ScaffoldGraph::ScaffoldEdge::scaffold_edge_id_{0};
+
+void ScaffoldGraph::AddEdgeSimple(const ScaffoldGraph::ScaffoldEdge &e, size_t conjugate_id) {
+ edges_.emplace(e.getId(), e);
+ outgoing_edges_.emplace(e.getStart(), e.getId());
+ incoming_edges_.emplace(e.getEnd(), e.getId());
+ conjugate_[e.getId()] = conjugate_id;
+}
+
+void ScaffoldGraph::DeleteOutgoing(const ScaffoldGraph::ScaffoldEdge &e) {
+ auto e_range = outgoing_edges_.equal_range(e.getStart());
+ // Erase while iterating: advance via the iterator returned by erase() so an invalidated iterator is never incremented.
+ for (auto edge_id = e_range.first; edge_id != e_range.second; ) {
+ if (edges_.at(edge_id->second) == e) {
+ edge_id = outgoing_edges_.erase(edge_id);
+ } else {
+ ++edge_id;
+ }
+ }
+}
+
+void ScaffoldGraph::DeleteIncoming(const ScaffoldGraph::ScaffoldEdge &e) {
+ auto e_range = incoming_edges_.equal_range(e.getEnd());
+ // Erase while iterating: advance via the iterator returned by erase() so an invalidated iterator is never incremented.
+ for (auto edge_id = e_range.first; edge_id != e_range.second; ) {
+ if (edges_.at(edge_id->second) == e) {
+ edge_id = incoming_edges_.erase(edge_id);
+ } else {
+ ++edge_id;
+ }
+ }
+}
+
+void ScaffoldGraph::DeleteAllOutgoingEdgesSimple(ScaffoldGraph::ScaffoldVertex v) {
+ auto e_range = outgoing_edges_.equal_range(v);
+ for (auto edge_id = e_range.first; edge_id != e_range.second; ++edge_id) {
+ DeleteIncoming(edges_.at(edge_id->second));
+ }
+ outgoing_edges_.erase(v);
+}
+
+void ScaffoldGraph::DeleteEdgeFromStorage(const ScaffoldGraph::ScaffoldEdge &e) {
+ VERIFY(!Exists(e));
+
+ size_t conjugate_id = conjugate_[e.getId()];
+ edges_.erase(e.getId());
+ edges_.erase(conjugate_id);
+ conjugate_.erase(e.getId());
+ conjugate_.erase(conjugate_id);
+}
+
+void ScaffoldGraph::DeleteAllIncomingEdgesSimple(ScaffoldGraph::ScaffoldVertex v) {
+ auto e_range = incoming_edges_.equal_range(v);
+ for (auto edge_id = e_range.first; edge_id != e_range.second; ++edge_id) {
+ DeleteOutgoing(edges_.at(edge_id->second));
+ }
+ incoming_edges_.erase(v);
+}
+
+bool ScaffoldGraph::Exists(ScaffoldGraph::ScaffoldVertex assembly_graph_edge) const {
+ return vertices_.count(assembly_graph_edge) != 0;
+}
+
+bool ScaffoldGraph::Exists(const ScaffoldGraph::ScaffoldEdge &e) const {
+ auto e_range = outgoing_edges_.equal_range(e.getStart());
+ for (auto edge_id = e_range.first; edge_id != e_range.second; ++edge_id) {
+ if (edges_.at(edge_id->second) == e) {
+ return true;
+ }
+ }
+ return false;
+}
+
+ScaffoldGraph::ScaffoldVertex ScaffoldGraph::conjugate(ScaffoldGraph::ScaffoldVertex assembly_graph_edge) const {
+ return assembly_graph_.conjugate(assembly_graph_edge);
+}
+
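+// Returns the stored conjugate edge if one is registered; otherwise constructs it on the fly
+// from the conjugate vertices of e.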
+ScaffoldGraph::ScaffoldEdge ScaffoldGraph::conjugate(const ScaffoldGraph::ScaffoldEdge &e) const {
+ auto iter = conjugate_.find(e.getId());
+ if (iter != conjugate_.end()) {
+ return edges_.at(iter->second);
+ }
+ return ScaffoldEdge(conjugate(e.getEnd()), conjugate(e.getStart()), e.getColor(), e.getWeight());
+}
+
+bool ScaffoldGraph::AddVertex(ScaffoldGraph::ScaffoldVertex assembly_graph_edge) {
+ if (!Exists(assembly_graph_edge)) {
+ VERIFY(!Exists(conjugate(assembly_graph_edge)));
+ vertices_.insert(assembly_graph_edge);
+ vertices_.insert(conjugate(assembly_graph_edge));
+ return true;
+ }
+ return false;
+}
+
+void ScaffoldGraph::AddVertices(const set<ScaffoldGraph::ScaffoldVertex> &vertices) {
+ for (auto v : vertices) {
+ AddVertex(v);
+ }
+}
+
+bool ScaffoldGraph::AddEdge(ScaffoldGraph::ScaffoldVertex v1, ScaffoldGraph::ScaffoldVertex v2, size_t lib_id, double weight) {
+ VERIFY(Exists(v1));
+ VERIFY(Exists(v2));
+
+ ScaffoldEdge e(v1, v2, lib_id, weight);
+ if (Exists(e)) {
+ VERIFY(Exists(conjugate(e)));
+ return false;
+ }
+
+ auto conj = conjugate(e);
+ AddEdgeSimple(e, conj.getId());
+ AddEdgeSimple(conj, e.getId());
+ return true;
+}
+
+void ScaffoldGraph::Print(ostream &os) const {
+ for (auto v: vertices_) {
+ os << "Vertex " << int_id(v) << " ~ " << int_id(conjugate(v))
+ << ": len = " << assembly_graph_.length(v) << ", cov = " << assembly_graph_.coverage(v) << endl;
+ }
+ for (auto e_iter = edges_.begin(); e_iter != edges_.end(); ++e_iter) {
+ os << "Edge " << e_iter->second.getId() << " ~ " << conjugate(e_iter->second).getId() <<
+ ": " << int_id(e_iter->second.getStart()) << " -> " << int_id(e_iter->second.getEnd()) <<
+ ", lib index = " << e_iter->second.getColor() << ", weight " << e_iter->second.getWeight() << endl;
+ }
+}
+
+ScaffoldGraph::ScaffoldEdge ScaffoldGraph::UniqueIncoming(ScaffoldGraph::ScaffoldVertex assembly_graph_edge) const {
+ VERIFY(HasUniqueIncoming(assembly_graph_edge));
+ return edges_.at(incoming_edges_.find(assembly_graph_edge)->second);
+}
+
+ScaffoldGraph::ScaffoldEdge ScaffoldGraph::UniqueOutgoing(ScaffoldGraph::ScaffoldVertex assembly_graph_edge) const {
+ VERIFY(HasUniqueOutgoing(assembly_graph_edge));
+ return edges_.at(outgoing_edges_.find(assembly_graph_edge)->second);
+}
+
+bool ScaffoldGraph::HasUniqueIncoming(ScaffoldGraph::ScaffoldVertex assembly_graph_edge) const {
+ return IncomingEdgeCount(assembly_graph_edge) == 1;
+}
+
+bool ScaffoldGraph::HasUniqueOutgoing(ScaffoldGraph::ScaffoldVertex assembly_graph_edge) const {
+ return OutgoingEdgeCount(assembly_graph_edge) == 1;
+}
+
+size_t ScaffoldGraph::IncomingEdgeCount(ScaffoldGraph::ScaffoldVertex assembly_graph_edge) const {
+ return incoming_edges_.count(assembly_graph_edge);
+}
+
+size_t ScaffoldGraph::OutgoingEdgeCount(ScaffoldGraph::ScaffoldVertex assembly_graph_edge) const {
+ return outgoing_edges_.count(assembly_graph_edge);
+}
+
+vector<ScaffoldGraph::ScaffoldEdge> ScaffoldGraph::IncomingEdges(ScaffoldGraph::ScaffoldVertex assembly_graph_edge) const {
+ vector<ScaffoldEdge> result;
+ auto e_range = incoming_edges_.equal_range(assembly_graph_edge);
+ for (auto edge_id = e_range.first; edge_id != e_range.second; ++edge_id) {
+ result.push_back(edges_.at(edge_id->second));
+ }
+ return result;
+}
+
+vector<ScaffoldGraph::ScaffoldEdge> ScaffoldGraph::OutgoingEdges(ScaffoldGraph::ScaffoldVertex assembly_graph_edge) const {
+ vector<ScaffoldEdge> result;
+ auto e_range = outgoing_edges_.equal_range(assembly_graph_edge);
+ for (auto edge_id = e_range.first; edge_id != e_range.second; ++edge_id) {
+ result.push_back(edges_.at(edge_id->second));
+ }
+ return result;
+}
+
+const debruijn_graph::Graph &ScaffoldGraph::AssemblyGraph() const {
+ return assembly_graph_;
+}
+
+size_t ScaffoldGraph::EdgeCount() const {
+ return edges_.size();
+}
+
+size_t ScaffoldGraph::VertexCount() const {
+ return vertices_.size();
+}
+
+ScaffoldGraph::ScaffoldVertex ScaffoldGraph::EdgeEnd(ScaffoldEdge e) const {
+ return e.getEnd();
+}
+
+ScaffoldGraph::ScaffoldVertex ScaffoldGraph::EdgeStart(ScaffoldEdge e) const {
+ return e.getStart();
+}
+
+size_t ScaffoldGraph::int_id(ScaffoldGraph::ScaffoldEdge e) const {
+ return e.getId();
+}
+
+size_t ScaffoldGraph::int_id(ScaffoldGraph::ScaffoldVertex v) const {
+ return assembly_graph_.int_id(v);
+}
+
+ScaffoldGraph::ConstScaffoldEdgeIterator ScaffoldGraph::eend() const {
+ return ConstScaffoldEdgeIterator(edges_.cend());
+}
+
+ScaffoldGraph::ConstScaffoldEdgeIterator ScaffoldGraph::ebegin() const {
+ return ConstScaffoldEdgeIterator(edges_.cbegin());
+}
+
+ScaffoldGraph::VertexStorage::const_iterator ScaffoldGraph::vend() const {
+ return vertices_.cend();
+}
+
+ScaffoldGraph::VertexStorage::const_iterator ScaffoldGraph::vbegin() const {
+ return vertices_.cbegin();
+}
+
+adt::iterator_range<ScaffoldGraph::VertexStorage::const_iterator> ScaffoldGraph::vertices() const {
+ return adt::make_range(vbegin(), vend());
+}
+
+adt::iterator_range<ScaffoldGraph::ConstScaffoldEdgeIterator> ScaffoldGraph::edges() const {
+ return adt::make_range(ebegin(), eend());
+}
+
+bool ScaffoldGraph::IsVertexIsolated(ScaffoldGraph::ScaffoldVertex assembly_graph_edge) const {
+ bool result = incoming_edges_.count(assembly_graph_edge) == 0 && outgoing_edges_.count(assembly_graph_edge) == 0;
+ VERIFY((incoming_edges_.count(conjugate(assembly_graph_edge)) == 0
+ && incoming_edges_.count(assembly_graph_edge) == 0) == result);
+ return result;
+}
+
+bool ScaffoldGraph::RemoveVertex(ScaffoldGraph::ScaffoldVertex assembly_graph_edge) {
+ if (Exists(assembly_graph_edge)) {
+ VERIFY(Exists(conjugate(assembly_graph_edge)));
+
+ DeleteAllOutgoingEdgesSimple(assembly_graph_edge);
+ DeleteAllIncomingEdgesSimple(assembly_graph_edge);
+ DeleteAllOutgoingEdgesSimple(conjugate(assembly_graph_edge));
+ DeleteAllIncomingEdgesSimple(conjugate(assembly_graph_edge));
+
+ VERIFY(incoming_edges_.count(assembly_graph_edge) == 0);
+ VERIFY(outgoing_edges_.count(assembly_graph_edge) == 0);
+ VERIFY(incoming_edges_.count(conjugate(assembly_graph_edge)) == 0);
+ VERIFY(outgoing_edges_.count(conjugate(assembly_graph_edge)) == 0);
+
+ vertices_.erase(assembly_graph_edge);
+ vertices_.erase(conjugate(assembly_graph_edge));
+
+ return true;
+ }
+ return false;
+}
+
+bool ScaffoldGraph::RemoveEdge(const ScaffoldGraph::ScaffoldEdge &e) {
+ if (Exists(e)) {
+ VERIFY(Exists(conjugate(e)));
+ DeleteOutgoing(e);
+ DeleteIncoming(e);
+ DeleteOutgoing(conjugate(e));
+ DeleteIncoming(conjugate(e));
+ DeleteEdgeFromStorage(e);
+
+ return true;
+ }
+ return false;
+}
+
+bool ScaffoldGraph::AddEdge(const ScaffoldGraph::ScaffoldEdge &e) {
+ return AddEdge(e.getStart(), e.getEnd(), e.getColor(), e.getWeight());
+}
+
+} //scaffold_graph
+} //path_extend
\ No newline at end of file
diff --git a/src/modules/algorithms/path_extend/scaffolder2015/scaffold_graph.hpp b/src/modules/algorithms/path_extend/scaffolder2015/scaffold_graph.hpp
new file mode 100644
index 0000000..5e51863
--- /dev/null
+++ b/src/modules/algorithms/path_extend/scaffolder2015/scaffold_graph.hpp
@@ -0,0 +1,234 @@
+//
+// Created by andrey on 17.09.15.
+//
+#pragma once
+
+#include "dev_support/logger/logger.hpp"
+#include "assembly_graph/graph_core/graph.hpp"
+#include "algorithms/path_extend/paired_library.hpp"
+#include "connection_condition2015.hpp"
+
+#include "dev_support/standard_base.hpp"
+#include "utils/adt/iterator_range.hpp"
+
+namespace path_extend {
+namespace scaffold_graph {
+
+//do NOT add "using namespace debruijn_graph" here, to avoid confusion between the EdgeId typedefs
+
+class ScaffoldGraph {
+
+public:
+ //An EdgeId in the de Bruijn graph is a vertex in the scaffolding graph
+ typedef debruijn_graph::EdgeId ScaffoldVertex;
+
+ //Unique edge id
+ typedef size_t ScaffoldEdgeIdT;
+
+ //Scaffold edge information class
+ struct ScaffoldEdge {
+ private:
+ //unique id
+ ScaffoldEdgeIdT id_;
+ //id counter
+ static std::atomic<ScaffoldEdgeIdT> scaffold_edge_id_;
+
+ ScaffoldVertex start_;
+ ScaffoldVertex end_;
+ //color = lib#
+ size_t color_;
+ //read pair weight or anything else
+ double weight_;
+
+ public:
+
+ ScaffoldEdge(ScaffoldVertex start, ScaffoldVertex end, size_t lib_id = (size_t) -1, double weight = 0) :
+ id_(scaffold_edge_id_++),
+ start_(start), end_(end),
+ color_(lib_id),
+ weight_(weight) {
+ }
+
+ ScaffoldEdgeIdT getId() const {
+ return id_;
+ }
+
+
+ size_t getColor() const {
+ return color_;
+ }
+
+ double getWeight() const {
+ return weight_;
+ }
+
+ const ScaffoldVertex getStart() const {
+ return start_;
+ }
+
+ const ScaffoldVertex getEnd() const {
+ return end_;
+ }
+
+ bool operator==(const ScaffoldEdge &e) const {
+ return color_ == e.color_ && weight_ == e.weight_ && start_ == e.start_ && end_ == e.end_;
+ }
+
+ bool operator==(const ScaffoldEdge &e) {
+ return color_ == e.color_ && weight_ == e.weight_ && start_ == e.start_ && end_ == e.end_;
+ }
+ };
+
+ //typedefs that allow use in templated graph visualizers
+ typedef ScaffoldVertex VertexId;
+ typedef ScaffoldEdge EdgeId;
+
+ //All vertices are stored in a set
+ typedef std::set<ScaffoldVertex> VertexStorage;
+ //Edges are stored in a map: id -> edge information
+ typedef std::unordered_map<ScaffoldEdgeIdT, ScaffoldEdge> EdgeStorage;
+ //Adjacency lists map a vertex to edge ids (instead of storing whole edge information)
+ typedef std::unordered_multimap<ScaffoldVertex, ScaffoldEdgeIdT> AdjacencyStorage;
+
+ struct ConstScaffoldEdgeIterator: public boost::iterator_facade<ConstScaffoldEdgeIterator,
+ const ScaffoldEdge,
+ boost::forward_traversal_tag> {
+ private:
+ EdgeStorage::const_iterator iter_;
+
+ public:
+ ConstScaffoldEdgeIterator(EdgeStorage::const_iterator iter) : iter_(iter) {
+ }
+
+ private:
+ friend class boost::iterator_core_access;
+
+ void increment() {
+ ++iter_;
+ }
+
+ bool equal(const ConstScaffoldEdgeIterator &other) const {
+ return iter_ == other.iter_;
+ }
+
+ const ScaffoldEdge& dereference() const {
+ return iter_->second;
+ }
+ };
+
+//TODO:: fix this. Seems that only ebegin and eend are broken.
+private:
+ EdgeStorage edges_;
+
+ VertexStorage vertices_;
+
+ const debruijn_graph::Graph &assembly_graph_;
+
+ //Map for storing conjugate scaffolding edges
+ std::unordered_map<ScaffoldEdgeIdT, ScaffoldEdgeIdT> conjugate_;
+
+ AdjacencyStorage outgoing_edges_;
+
+ AdjacencyStorage incoming_edges_;
+
+ //Add an edge without any checks; the conjugate edge must be added separately
+ void AddEdgeSimple(const ScaffoldEdge &e, size_t conjugate_id);
+
+ //Delete an outgoing edge from the adjacency list without checks
+ //and without removing the conjugate or the respective incoming edge
+ void DeleteOutgoing(const ScaffoldEdge &e);
+
+ //Delete an incoming edge from the adjacency list without checks
+ //and without removing the conjugate or the respective outgoing edge
+ void DeleteIncoming(const ScaffoldEdge &e);
+
+ //Delete all edge info from storage
+ void DeleteEdgeFromStorage(const ScaffoldEdge &e);
+
+ //Delete all edges outgoing from v from the adjacency lists
+ void DeleteAllOutgoingEdgesSimple(ScaffoldVertex v);
+
+ //Delete all edges incoming to v from the adjacency lists
+ void DeleteAllIncomingEdgesSimple(ScaffoldVertex v);
+
+public:
+ ScaffoldGraph(const debruijn_graph::Graph &g) : assembly_graph_(g) {
+ }
+
+ bool Exists(ScaffoldVertex assembly_graph_edge) const;
+
+ bool Exists(const ScaffoldEdge &e) const;
+
+ ScaffoldVertex conjugate(ScaffoldVertex assembly_graph_edge) const;
+
+ //Return a structure that is equal to the conjugate of e (not necessarily the exact structure stored in the graph)
+ ScaffoldEdge conjugate(const ScaffoldEdge &e) const;
+
+ //Add an isolated vertex to the graph if it does not exist yet
+ bool AddVertex(ScaffoldVertex assembly_graph_edge);
+
+ void AddVertices(const set<ScaffoldVertex> &vertices);
+
+ //Add edge (and conjugate) if not exists
+ //v1 and v2 must exist
+ bool AddEdge(ScaffoldVertex v1, ScaffoldVertex v2, size_t lib_id, double weight);
+
+ bool AddEdge(const ScaffoldEdge &e);
+
+ //Remove the edge from the edge container and all adjacency lists
+ bool RemoveEdge(const ScaffoldEdge &e);
+
+ //Remove vertex and all adjacent edges
+ bool RemoveVertex(ScaffoldVertex assembly_graph_edge);
+
+ bool IsVertexIsolated(ScaffoldVertex assembly_graph_edge) const;
+
+ VertexStorage::const_iterator vbegin() const;
+
+ VertexStorage::const_iterator vend() const;
+
+ adt::iterator_range<VertexStorage::const_iterator> vertices() const;
+
+ ConstScaffoldEdgeIterator ebegin() const;
+
+ ConstScaffoldEdgeIterator eend() const;
+
+ adt::iterator_range<ScaffoldGraph::ConstScaffoldEdgeIterator> edges() const;
+
+ size_t int_id(ScaffoldVertex v) const;
+
+ size_t int_id(ScaffoldEdge e) const;
+
+ ScaffoldVertex EdgeStart(ScaffoldEdge e) const;
+
+ ScaffoldVertex EdgeEnd(ScaffoldEdge e) const;
+
+ size_t VertexCount() const;
+
+ size_t EdgeCount() const;
+
+ const debruijn_graph::Graph & AssemblyGraph() const;
+
+ vector<ScaffoldEdge> OutgoingEdges(ScaffoldVertex assembly_graph_edge) const;
+
+ vector<ScaffoldEdge> IncomingEdges(ScaffoldVertex assembly_graph_edge) const;
+
+ size_t OutgoingEdgeCount(ScaffoldVertex assembly_graph_edge) const;
+
+ size_t IncomingEdgeCount(ScaffoldVertex assembly_graph_edge) const;
+
+ bool HasUniqueOutgoing(ScaffoldVertex assembly_graph_edge) const;
+
+ bool HasUniqueIncoming(ScaffoldVertex assembly_graph_edge) const;
+
+ ScaffoldEdge UniqueOutgoing(ScaffoldVertex assembly_graph_edge) const;
+
+ ScaffoldEdge UniqueIncoming(ScaffoldVertex assembly_graph_edge) const;
+
+ void Print(ostream &os) const;
+
+};
+
+} //scaffold_graph
+} //path_extend
+
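A minimal usage sketch of the interface above (not part of the patch): it only calls methods declared in this header and assumes an assembly graph g and two of its edge ids e1 and e2 obtained elsewhere; the lib index 0 and weight 1.0 are placeholders.

    using namespace path_extend::scaffold_graph;

    ScaffoldGraph scaffold_graph(g);
    scaffold_graph.AddVertex(e1);
    scaffold_graph.AddVertex(e2);
    // AddEdge also inserts the conjugate edge (see AddEdge in scaffold_graph.cpp above)
    scaffold_graph.AddEdge(e1, e2, /*lib_id*/ 0, /*weight*/ 1.0);
    for (const auto &e : scaffold_graph.edges()) {
        std::cout << "edge " << e.getId() << ": " << scaffold_graph.int_id(e.getStart())
                  << " -> " << scaffold_graph.int_id(e.getEnd()) << std::endl;
    }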
diff --git a/src/modules/algorithms/path_extend/scaffolder2015/scaffold_graph_constructor.cpp b/src/modules/algorithms/path_extend/scaffolder2015/scaffold_graph_constructor.cpp
new file mode 100644
index 0000000..61a813b
--- /dev/null
+++ b/src/modules/algorithms/path_extend/scaffolder2015/scaffold_graph_constructor.cpp
@@ -0,0 +1,77 @@
+//
+// Created by andrey on 04.12.15.
+//
+
+#include "scaffold_graph_constructor.hpp"
+
+namespace path_extend {
+namespace scaffold_graph {
+
+
+bool LengthEdgeCondition::IsSuitable(debruijn_graph::EdgeId e) const {
+ return graph_.length(e) >= min_length_;
+}
+
+void BaseScaffoldGraphConstructor::ConstructFromEdgeConditions(const EdgeCondition &edge_condition,
+ vector<shared_ptr<ConnectionCondition>> &connection_conditions,
+ bool use_terminal_vertices_only) {
+ for (auto e = graph_->AssemblyGraph().ConstEdgeBegin(); !e.IsEnd(); ++e) {
+ if (edge_condition.IsSuitable(*e)) {
+ graph_->AddVertex(*e);
+ }
+ }
+ ConstructFromConditions(connection_conditions, use_terminal_vertices_only);
+}
+
+void BaseScaffoldGraphConstructor::ConstructFromSet(const set<EdgeId> edge_set,
+ vector<shared_ptr<ConnectionCondition>> &connection_conditions,
+ bool use_terminal_vertices_only) {
+ graph_->AddVertices(edge_set);
+ ConstructFromConditions(connection_conditions, use_terminal_vertices_only);
+}
+
+void BaseScaffoldGraphConstructor::ConstructFromConditions(vector<shared_ptr<ConnectionCondition>> &connection_conditions,
+ bool use_terminal_vertices_only) {
+//TODO:: awful. It depends on the ordering of the connection conditions.
+ for (auto condition : connection_conditions) {
+ if (condition->GetLibIndex() == (size_t) -1)
+ ConstructFromSingleCondition(condition, true);
+ else
+ ConstructFromSingleCondition(condition, use_terminal_vertices_only);
+ }
+}
+
+void BaseScaffoldGraphConstructor::ConstructFromSingleCondition(const shared_ptr<ConnectionCondition> condition,
+ bool use_terminal_vertices_only) {
+ for (const auto& v : graph_->vertices()) {
+ TRACE("Vertex " << graph_->int_id(v));
+
+ if (use_terminal_vertices_only && graph_->OutgoingEdgeCount(v) > 0)
+ continue;
+
+ auto connected_with = condition->ConnectedWith(v);
+ for (auto connected : connected_with) {
+ TRACE("Connected with " << graph_->int_id(connected));
+ if (graph_->Exists(connected)) {
+ if (use_terminal_vertices_only && graph_->IncomingEdgeCount(connected) > 0)
+ continue;
+ graph_->AddEdge(v, connected, condition->GetLibIndex(), condition->GetWeight(v, connected));
+ }
+ }
+ }
+}
+
+
+shared_ptr<ScaffoldGraph> SimpleScaffoldGraphConstructor::Construct() {
+ ConstructFromSet(edge_set_, connection_conditions_);
+ return graph_;
+}
+
+shared_ptr<ScaffoldGraph> DefaultScaffoldGraphConstructor::Construct() {
+ ConstructFromSet(edge_set_, connection_conditions_);
+ ConstructFromEdgeConditions(edge_condition_, connection_conditions_);
+ return graph_;
+}
+
+} //scaffold_graph
+} //path_extend
\ No newline at end of file
diff --git a/src/debruijn/path_extend/scaffolder2015/scaffold_graph_constructor.hpp b/src/modules/algorithms/path_extend/scaffolder2015/scaffold_graph_constructor.hpp
similarity index 100%
rename from src/debruijn/path_extend/scaffolder2015/scaffold_graph_constructor.hpp
rename to src/modules/algorithms/path_extend/scaffolder2015/scaffold_graph_constructor.hpp
diff --git a/src/modules/algorithms/path_extend/scaffolder2015/scaffold_graph_visualizer.cpp b/src/modules/algorithms/path_extend/scaffolder2015/scaffold_graph_visualizer.cpp
new file mode 100644
index 0000000..8e5aec6
--- /dev/null
+++ b/src/modules/algorithms/path_extend/scaffolder2015/scaffold_graph_visualizer.cpp
@@ -0,0 +1,72 @@
+//
+// Created by andrey on 21.09.15.
+//
+
+#include "scaffold_graph_visualizer.hpp"
+
+namespace path_extend{ namespace scaffold_graph {
+
+const map<size_t, string> ScaffoldEdgeColorer::color_map =
+ {{(size_t) -1, "black"},
+ {0, "red"},
+ {1, "blue"},
+ {2, "green"},
+ {3, "magenta"},
+ {4, "orange"},
+ {5, "cyan"}};
+
+const string ScaffoldEdgeColorer::default_color = "black";
+
+string ScaffoldGraphLabeler::label(EdgeId e) const {
+ return "ID: " + ToString(e.getId()) +
+ "\\n Weight: " + ToString(e.getWeight()) +
+ "\\n Lib#: " + ToString(e.getColor());
+}
+
+string ScaffoldGraphLabeler::label(VertexId v) const {
+ return "ID: " + ToString(graph_.int_id(v)) +
+ "\\n Len: " + ToString(graph_.AssemblyGraph().length(v)) +
+ "\\n Cov: " + ToString(graph_.AssemblyGraph().coverage(v));
+}
+
+void ScaffoldGraphVisualizer::Visualize(GraphPrinter<ScaffoldGraph> &printer) {
+ printer.open();
+ printer.AddVertices(graph_.vbegin(), graph_.vend());
+ //for (auto e = graph_.ebegin(); e != graph_.eend(); ++e) {
+ for (const auto& e : graph_.edges()) {
+ printer.AddEdge(e);
+ }
+ printer.close();
+}
+
+void ScaffoldGraphVisualizer::Visualize(ostream &os, CompositeGraphColorer<ScaffoldGraph>& colorer) {
+ ScaffoldGraphLabeler labeler(graph_);
+ EmptyGraphLinker<ScaffoldGraph> linker;
+
+ if (paired_) {
+ PairedGraphPrinter <ScaffoldGraph> printer(graph_, os, labeler, colorer, linker);
+ Visualize(printer);
+ } else {
+ SingleGraphPrinter <ScaffoldGraph> printer(graph_, os, labeler, colorer, linker);
+ Visualize(printer);
+ }
+}
+
+string ScaffoldEdgeColorer::GetValue(ScaffoldGraph::EdgeId e) const {
+ auto it = color_map.find(e.getColor());
+ if (it != color_map.end()) {
+ return it->second;
+ }
+ return default_color;
+}
+
+string ScaffoldVertexSetColorer::GetValue(ScaffoldGraph::VertexId v) const {
+ if (vertex_set_.count(v) > 0)
+ return "white";
+ return "yellow";
+}
+} //scaffold_graph
+} //path_extend
+
+
+
diff --git a/src/modules/algorithms/path_extend/scaffolder2015/scaffold_graph_visualizer.hpp b/src/modules/algorithms/path_extend/scaffolder2015/scaffold_graph_visualizer.hpp
new file mode 100644
index 0000000..2ed651c
--- /dev/null
+++ b/src/modules/algorithms/path_extend/scaffolder2015/scaffold_graph_visualizer.hpp
@@ -0,0 +1,73 @@
+//
+// Created by andrey on 21.09.15.
+//
+
+#ifndef PROJECT_SCAFFOLD_GRAPH_VISUALIZER_HPP
+#define PROJECT_SCAFFOLD_GRAPH_VISUALIZER_HPP
+
+#include "pipeline/graphio.hpp"
+#include "scaffold_graph.hpp"
+
+namespace path_extend { namespace scaffold_graph {
+
+using namespace omnigraph::visualization;
+
+
+class ScaffoldGraphLabeler : public GraphLabeler<ScaffoldGraph> {
+
+private:
+ const ScaffoldGraph &graph_;
+
+public:
+ ScaffoldGraphLabeler(const ScaffoldGraph &graph) : graph_(graph) {
+ }
+
+ string label(VertexId v) const;
+
+ string label(EdgeId e) const;
+};
+
+
+class ScaffoldEdgeColorer : public ElementColorer<ScaffoldGraph::EdgeId> {
+private:
+ static const map<size_t, string> color_map;
+
+ static const string default_color;
+
+public:
+ string GetValue(ScaffoldGraph::EdgeId e) const;
+};
+
+
+class ScaffoldVertexSetColorer : public ElementColorer<ScaffoldGraph::VertexId> {
+ private:
+ set<ScaffoldGraph::VertexId> vertex_set_;
+
+ public:
+ ScaffoldVertexSetColorer(const set<ScaffoldGraph::VertexId>& vertex_set): vertex_set_(vertex_set) {
+ }
+
+ string GetValue(ScaffoldGraph::VertexId v) const;
+};
+
+class ScaffoldGraphVisualizer {
+
+ const ScaffoldGraph &graph_;
+ const bool paired_;
+
+private:
+ void Visualize(GraphPrinter<ScaffoldGraph> &printer);
+
+public:
+ ScaffoldGraphVisualizer(const ScaffoldGraph &graph, bool paired = true) :
+ graph_(graph), paired_(paired) {
+ }
+
+ void Visualize(ostream &os, CompositeGraphColorer<ScaffoldGraph>& colorer);
+};
+
+} //scaffold_graph
+} //path_extend
+
+
+#endif //PROJECT_SCAFFOLD_GRAPH_VISUALIZER_HPP
diff --git a/src/modules/algorithms/path_extend/split_graph_pair_info.hpp b/src/modules/algorithms/path_extend/split_graph_pair_info.hpp
new file mode 100644
index 0000000..8991d57
--- /dev/null
+++ b/src/modules/algorithms/path_extend/split_graph_pair_info.hpp
@@ -0,0 +1,449 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+/*
+ * split_graph_pair_info.hpp
+ *
+ * Created on: May 14, 2013
+ * Author: ira
+ */
+
+#ifndef SPLIT_GRAPH_PAIR_INFO_HPP_
+#define SPLIT_GRAPH_PAIR_INFO_HPP_
+
+#include <paired_info/weights.hpp>
+#include "assembly_graph/graph_alignment/sequence_mapper_notifier.hpp"
+#include "io/dataset_support/read_converter.hpp"
+#include "ideal_pair_info.hpp"
+
+using namespace debruijn_graph;
+
+namespace path_extend {
+
+inline double FindIntersection(vector<double>& pi1, vector<double>& pi2) {
+ std::sort(pi1.begin(), pi1.end());
+ std::sort(pi2.begin(), pi2.end());
+ size_t iter1 = 0;
+ size_t iter2 = 0;
+ double threshold = 0.0;
+ double percent1 = 0.0;
+ double percent2 = 1.0;
+ while (percent1 < percent2 and iter1 < pi1.size() and iter2 < pi2.size()) {
+ threshold = pi1[iter1];
+ while (iter2 < pi2.size() and pi2[iter2] <= threshold) {
+ iter2++;
+ }
+ percent1 = (double) iter1 / (double) pi1.size();
+ percent2 = 1.0 - (double) iter2 / (double) pi2.size();
+ iter1 += 1;
+ }
+ return threshold;
+}
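FindIntersection picks the threshold at which the fraction of values from the first (expectedly higher) distribution lying below it catches up with the fraction of values from the second distribution lying above it; note that both input vectors are sorted in place. An illustrative call with made-up numbers:

    std::vector<double> good_pi = {0.9, 1.1, 1.4, 2.0};  // within-edge weights, expected to be high
    std::vector<double> bad_pi  = {0.1, 0.2, 0.3, 1.0};  // cross-edge weights, expected to be low
    double threshold = path_extend::FindIntersection(good_pi, bad_pi);  // 1.1 for these numbers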
+
+class Basket {
+ EdgeId edgeId_;
+ size_t index_;
+
+public:
+ Basket(EdgeId edgeId, size_t index)
+ : edgeId_(edgeId), index_(index) { }
+
+ Basket(const Basket& b)
+ : edgeId_(b.edgeId_), index_(b.index_) {}
+
+ const EdgeId edgeId() const {
+ return edgeId_;
+ }
+
+ size_t index() const {
+ return index_;
+ }
+
+ bool operator<(const Basket& rhs) const {
+ if (edgeId() != rhs.edgeId()) {
+ return edgeId() < rhs.edgeId();
+ }
+ return index() < rhs.index();
+ }
+
+ bool operator==(const Basket& rhs) const {
+ return edgeId() == rhs.edgeId() && index() == rhs.index();
+ }
+};
+
+struct PairInfo {
+ double weight_;
+ double distance_;
+ size_t count_;
+
+ PairInfo()
+ : weight_(0.), distance_(0.), count_(0) {}
+
+ PairInfo(double weight, double distance, size_t count = 0)
+ : weight_(weight), distance_(distance), count_(count) {}
+
+};
+
+class EdgePairInfo {
+ EdgeId edgeId_;
+ size_t basket_size_;
+ vector<map<Basket, PairInfo> > pair_info_;
+
+public:
+ EdgePairInfo() {
+ basket_size_ = 0;
+ }
+
+ EdgePairInfo(size_t length, EdgeId edgeId, size_t basket_size)
+ : edgeId_(edgeId),
+ basket_size_(basket_size) {
+ size_t count_baskets = length / basket_size_ + 1;
+ for (size_t index = 0; index < count_baskets; ++index) {
+ pair_info_.push_back(map<Basket, PairInfo>());
+ }
+ }
+
+ EdgePairInfo(const EdgePairInfo& pairInfo)
+ : edgeId_(pairInfo.edgeId_),
+ basket_size_(pairInfo.basket_size_) {
+ for (size_t index = 0; index < pairInfo.pair_info_.size(); ++index) {
+ pair_info_.push_back(pairInfo.pair_info_[index]);
+ }
+ }
+
+ void AddPairInfo(size_t pos_begin1, size_t pos_end1, EdgeId edgeId2,
+ size_t pos_begin2, size_t pos_end2, double weight,
+ double edge_distance) {
+ size_t begin_basket_index1 = GetBasketIndex(pos_begin1);
+ size_t end_basket_index1 = GetBasketIndex(pos_end1);
+ size_t begin_basket_index2 = GetBasketIndex(pos_begin2);
+ size_t end_basket_index2 = GetBasketIndex(pos_end2);
+ for (size_t index1 = begin_basket_index1; index1 <= end_basket_index1;
+ ++index1) {
+ for (size_t index2 = begin_basket_index2;
+ index2 <= end_basket_index2; ++index2) {
+ AddPairInfoToBasket(index1, edgeId2, index2, weight,
+ edge_distance);
+ }
+ }
+ }
+
+ void AddPairInfo(const EdgePairInfo& edgePairInfo) {
+ for (size_t index = 0; index < pair_info_.size(); ++index) {
+ const map<Basket, PairInfo>& basketInfoToAdd = edgePairInfo
+ .pair_info_[index];
+ map<Basket, PairInfo>& oldBasketInfo = pair_info_[index];
+ for (auto iter = basketInfoToAdd.begin();
+ iter != basketInfoToAdd.end(); ++iter) {
+ if (oldBasketInfo.find(iter->first) == oldBasketInfo.end()) {
+ oldBasketInfo[iter->first] = iter->second;
+ } else {
+ PairInfo& pairInfo = oldBasketInfo[iter->first];
+ oldBasketInfo[iter->first] = PairInfo(
+ pairInfo.weight_ + iter->second.weight_,
+ CountNewDistance(pairInfo, iter->second.distance_,
+ iter->second.count_),
+ iter->second.count_ + pairInfo.count_);
+ }
+ }
+ }
+ }
+
+ map<Basket, PairInfo>& GetInfo(size_t index) {
+ return pair_info_.at(index);
+ }
+
+ size_t size() {
+ return pair_info_.size();
+ }
+
+private:
+ size_t GetBasketIndex(size_t pos) const {
+ return pos / basket_size_;
+ }
+
+ void AddPairInfoToBasket(size_t index1, EdgeId edgeId2, size_t index2,
+ double weight, double edge_distance) {
+ Basket basket2(edgeId2, index2);
+ if (pair_info_[index1].find(basket2) == pair_info_[index1].end()) {
+ pair_info_[index1][basket2] = PairInfo(0.0, 0);
+ }
+ PairInfo oldPairInfo = pair_info_[index1][basket2];
+ double basket_distance = GetBasketDistance(edge_distance, index1,
+ index2);
+ pair_info_[index1][basket2] = PairInfo(
+ oldPairInfo.weight_ + weight,
+ CountNewDistance(oldPairInfo, basket_distance),
+ oldPairInfo.count_ + 1);
+ }
+
+ double CountNewDistance(PairInfo& oldPairInfo, double distance,
+ size_t count = 1) {
+ return (oldPairInfo.distance_ * (double) oldPairInfo.count_
+ + distance * (double) count)
+ / (double) (oldPairInfo.count_ + count);
+ }
+
+ double GetBasketDistance(double edge_distance, size_t index1,
+ size_t index2) {
+ return edge_distance - (double) index1 * (double) basket_size_
+ + (double) index2 * (double) basket_size_;
+ }
+};
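The basket index of a position is pos / basket_size, so AddPairInfo spreads a read pair's weight over every basket window that its mapping range touches. A standalone illustration of that arithmetic (the class itself needs real EdgeIds, which are not constructed here):

    const size_t basket_size = 100;
    size_t pos_begin = 120, pos_end = 310;
    // baskets touched: pos_begin / basket_size .. pos_end / basket_size, i.e. 1, 2 and 3
    for (size_t b = pos_begin / basket_size; b <= pos_end / basket_size; ++b)
        std::cout << "weight added to basket " << b << std::endl;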
+
+class BasketsPairInfoIndex {
+ const conj_graph_pack& gp_;
+ size_t basket_size_;
+ map<EdgeId, EdgePairInfo> pair_info_;
+
+public:
+ BasketsPairInfoIndex(const conj_graph_pack& gp, size_t basket_size)
+ : gp_(gp),
+ basket_size_(basket_size) {
+ }
+
+ void AddPairInfo(EdgeId edgeId1, size_t pos_begin1, size_t pos_end1,
+ EdgeId edgeId2, size_t pos_begin2, size_t pos_end2,
+ double weight, double edge_distance) {
+ if (pair_info_.find(edgeId1) == pair_info_.end()) {
+ EdgePairInfo edgePairInfo2(gp_.g.length(edgeId1), edgeId1,
+ basket_size_);
+ pair_info_.insert(make_pair(edgeId1, edgePairInfo2));
+ }
+ pair_info_[edgeId1].AddPairInfo(pos_begin1, pos_end1, edgeId2,
+ pos_begin2, pos_end2, weight,
+ edge_distance);
+ }
+
+ EdgePairInfo& GetEdgePairInfo(EdgeId edgeId) {
+ return pair_info_[edgeId];
+ }
+
+ void AddAll(const BasketsPairInfoIndex& index) {
+ for (auto it = index.pair_info_.begin(); it != index.pair_info_.end();
+ ++it) {
+ if (pair_info_.find(it->first) == pair_info_.end()) {
+ pair_info_.insert(make_pair(it->first, it->second));
+ } else {
+ pair_info_[it->first].AddPairInfo(it->second);
+ }
+ }
+ }
+
+ void Clear() {
+ pair_info_.clear();
+ }
+
+ size_t size() const {
+ return pair_info_.size();
+ }
+
+};
+
+class SplitGraphPairInfo : public SequenceMapperListener {
+
+public:
+ //TODO: d_min = ? d_max = ? for ideal_pi_counter_
+ SplitGraphPairInfo(conj_graph_pack& gp, size_t is,
+ size_t is_var,
+ size_t is_min, size_t is_max,
+ size_t read_size, size_t /* k */, size_t basket_size,
+ const std::map<int, size_t>& is_distribution)
+ : gp_(gp),
+ is_(is),
+ is_var_(is_var),
+ is_min_(is_min),
+ is_max_(is_max),
+ basket_size_(basket_size),
+ basket_index_(gp, basket_size),
+ threshold_(-1),
+ ideal_pi_counter_(gp.g, (int)is_min_,
+ (int)is_max_, read_size, is_distribution) {
+
+ }
+
+ void StartProcessLibrary(size_t threads_count) override {
+ baskets_buffer_.clear();
+ for (size_t i = 0; i < threads_count; ++i)
+ baskets_buffer_.emplace_back(gp_, basket_size_);
+ }
+
+ void ProcessPairedRead(size_t thread_index,
+ const io::PairedRead& r,
+ const MappingPath<EdgeId>& read1,
+ const MappingPath<EdgeId>& read2) override {
+ ProcessPairedRead(baskets_buffer_[thread_index], r.first().size(), r.second().size(),
+ read1, read2, r.distance());
+ }
+
+ void ProcessPairedRead(size_t thread_index,
+ const io::PairedReadSeq& r,
+ const MappingPath<EdgeId>& read1,
+ const MappingPath<EdgeId>& read2) override {
+ ProcessPairedRead(baskets_buffer_[thread_index], r.first().size(), r.second().size(),
+ read1, read2, r.distance());
+ }
+
+ void ProcessSingleRead(size_t, const io::SingleRead&, const MappingPath<EdgeId>&) override {
+ //only paired reads are interesting
+ }
+
+ void ProcessSingleRead(size_t, const io::SingleReadSeq&, const MappingPath<EdgeId>&) override {
+ //only paired reads are interesting
+ }
+ void MergeBuffer(size_t thread_index) override {
+ basket_index_.AddAll(baskets_buffer_[thread_index]);
+ baskets_buffer_[thread_index].Clear();
+ }
+
+ void StopProcessLibrary() override {
+ for (size_t i = 0; i < baskets_buffer_.size(); ++i)
+ MergeBuffer(i);
+
+ FindThreshold();
+
+ baskets_buffer_.clear();
+ }
+
+ double GetThreshold() const {
+ return threshold_;
+ }
+
+private:
+ void FindThreshold() {
+ size_t min_long_edge = basket_size_;
+ const Graph& g = gp_.g;
+ vector<double> good_pi;
+ vector<double> bad_pi;
+ double insert_size_min = (double) is_ - 2. * (double) is_var_;
+ double insert_size_max = (double) is_ + 2. * (double) is_var_;
+ for (auto e = g.ConstEdgeBegin(); !e.IsEnd(); ++e) {
+ EdgeId edge = *e;
+
+ if (g.length(edge) > min_long_edge) {
+ if (g.int_id(edge) <= 0)
+ continue;
+
+ EdgePairInfo& edge_pi = basket_index_.GetEdgePairInfo(edge);
+ if (edge_pi.size() == 0)
+ continue;
+ size_t count_baskets = LastBasketIndex(edge, (int) insert_size_max,
+ edge_pi);
+ for (size_t index = 0; index <= count_baskets; ++index) {
+ map<Basket, PairInfo>& basket_info = edge_pi.GetInfo(index);
+ set<size_t> pair_baskets = GetBaskets(index,
+ (int) insert_size_min,
+ (int) insert_size_max,
+ edge_pi);
+ for (auto iter = basket_info.begin(); iter != basket_info.end(); ++iter) {
+ PairInfo& pi = iter->second;
+ if (iter->first.edgeId() == edge &&
+ pair_baskets.find(iter->first.index()) != pair_baskets.end()) {
+ good_pi.push_back(GetNormalizedWeight(pi));
+ } else {
+ bad_pi.push_back(GetNormalizedWeight(pi));
+ }
+ }
+ }
+ }
+ }
+ DEBUG("good pi size " << good_pi.size() << " bad pi size " << bad_pi.size());
+ threshold_ = FindIntersection(good_pi, bad_pi);
+ INFO("Threshold for paired information " << threshold_);
+ }
+
+ size_t LastBasketIndex(EdgeId edgeId, int insert_size_max,
+ EdgePairInfo& edge_pair_info) {
+ return min((gp_.g.length(edgeId) - insert_size_max) / basket_size_,
+ edge_pair_info.size() - 1);
+ }
+
+ size_t FindBeginPairBasket(size_t index, int insert_size_min,
+ EdgePairInfo& edge_pair_info) {
+ return min(index + insert_size_min / basket_size_,
+ edge_pair_info.size() - 1);
+ }
+
+ size_t FindEndPairBasket(size_t index, int insert_size_max,
+ EdgePairInfo& edge_pair_info) {
+ return min(index + insert_size_max / basket_size_,
+ edge_pair_info.size() - 1);
+ }
+
+ set<size_t> GetBaskets(size_t index, int insert_size_min,
+ int insert_size_max, EdgePairInfo& edge_pair_info) {
+ set<size_t> result;
+ size_t begin = FindBeginPairBasket(index, insert_size_min,
+ edge_pair_info);
+ size_t end = FindEndPairBasket(index, insert_size_max, edge_pair_info);
+ for (size_t pair_index = begin; pair_index <= end; ++pair_index) {
+ result.insert(pair_index);
+ }
+ return result;
+ }
+
+ double GetNormalizedWeight(PairInfo& pi) {
+ return pi.weight_
+ / ideal_pi_counter_.IdealPairedInfo(basket_size_, basket_size_,
+ (int) pi.distance_);
+ }
+
+ void InnerProcess(BasketsPairInfoIndex& basket_index,
+ const MappingPath<EdgeId>& path1,
+ const MappingPath<EdgeId>& path2,
+ size_t read_distance) {
+ for (size_t i = 0; i < path1.size(); ++i) {
+ pair<EdgeId, MappingRange> mapping_edge_1 = path1[i];
+ for (size_t j = 0; j < path2.size(); ++j) {
+ pair<EdgeId, MappingRange> mapping_edge_2 = path2[j];
+ double weight = PairedReadCountWeight(mapping_edge_1.second,
+ mapping_edge_2.second);
+ size_t kmer_distance = read_distance
+ + mapping_edge_2.second.initial_range.end_pos
+ - mapping_edge_1.second.initial_range.start_pos;
+ int edge_distance = (int) kmer_distance
+ + (int) mapping_edge_1.second.mapped_range.start_pos
+ - (int) mapping_edge_2.second.mapped_range.end_pos;
+
+ basket_index.AddPairInfo(
+ mapping_edge_1.first,
+ mapping_edge_1.second.mapped_range.start_pos,
+ mapping_edge_1.second.mapped_range.end_pos,
+ mapping_edge_2.first,
+ mapping_edge_2.second.mapped_range.start_pos,
+ mapping_edge_2.second.mapped_range.end_pos, weight,
+ (double) edge_distance);
+ }
+ }
+ }
+
+ void ProcessPairedRead(BasketsPairInfoIndex& basket_index,
+ size_t r1_length,
+ size_t r2_length,
+ const MappingPath<EdgeId>& path1,
+ const MappingPath<EdgeId>& path2,
+ size_t read_distance) {
+ InnerProcess(basket_index, path1, path2, read_distance);
+ InnerProcess(basket_index, ConjugateMapping(gp_.g, path2, r2_length),
+ ConjugateMapping(gp_.g, path1, r1_length), read_distance);
+ }
+
+ const conj_graph_pack& gp_;
+ size_t is_;
+ size_t is_var_;
+ size_t is_min_;
+ size_t is_max_;
+ size_t basket_size_;
+ BasketsPairInfoIndex basket_index_;
+ vector<BasketsPairInfoIndex> baskets_buffer_;
+ double threshold_;
+ IdealPairInfoCounter ideal_pi_counter_;
+};
+
+} /* path_extend */
+
+#endif /* SPLIT_GRAPH_PAIR_INFO_HPP_ */
diff --git a/src/debruijn/path_extend/utils/CMakeLists.txt b/src/modules/algorithms/path_extend/utils/CMakeLists.txt
similarity index 100%
rename from src/debruijn/path_extend/utils/CMakeLists.txt
rename to src/modules/algorithms/path_extend/utils/CMakeLists.txt
diff --git a/src/debruijn/path_extend/utils/find_aligns.py b/src/modules/algorithms/path_extend/utils/find_aligns.py
similarity index 100%
rename from src/debruijn/path_extend/utils/find_aligns.py
rename to src/modules/algorithms/path_extend/utils/find_aligns.py
diff --git a/src/debruijn/path_extend/utils/find_single_threshold.py b/src/modules/algorithms/path_extend/utils/find_single_threshold.py
similarity index 100%
rename from src/debruijn/path_extend/utils/find_single_threshold.py
rename to src/modules/algorithms/path_extend/utils/find_single_threshold.py
diff --git a/src/modules/algorithms/path_extend/utils/paired_info_checker.cpp b/src/modules/algorithms/path_extend/utils/paired_info_checker.cpp
new file mode 100644
index 0000000..f77046a
--- /dev/null
+++ b/src/modules/algorithms/path_extend/utils/paired_info_checker.cpp
@@ -0,0 +1,204 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+/*
+ * paired_info_checker.cpp
+ *
+ * Created on: Sep 26, 2011
+ * Author: andrey
+ */
+
+#include "../lc_common.hpp"
+#include "../lc_io.hpp"
+
+using namespace debruijn_graph;
+
+class PairedInfoChecker {
+private:
+ Graph& g_;
+
+public:
+ PairedInfoChecker(Graph& g) : g_(g) {
+
+ }
+
+ bool IsSymmetric(PairedInfoIndex<Graph>& index) {
+ bool result = true;
+ for (auto iter = index.begin(); iter != index.end(); ++iter) {
+ auto pi = *iter;
+ if (pi.size() == 0) {
+ continue;
+ }
+ EdgeId e1 = pi.back().first;
+ EdgeId e2 = pi.back().second;
+
+ auto sym_pi = index.GetEdgePairInfo(e2, e1);
+
+ for (auto i1 = pi.begin(); i1 != pi.end(); ++i1) {
+ for (auto i2 = sym_pi.begin(); i2 != sym_pi.end(); ++i2) {
+ if (math::eq(i1->d, - i2->d) && !math::eq(i1->weight, i2->weight)) {
+ INFO("No symmetric found ");
+ result = false;
+ }
+ }
+ }
+
+ }
+ return result;
+ }
+
+ bool IsConjugateSymmetric(PairedInfoIndex<Graph>& index) {
+ bool result = true;
+ for (auto iter = index.begin(); iter != index.end(); ++iter) {
+ auto pi = *iter;
+ if (pi.size() == 0) {
+ continue;
+ }
+ EdgeId e1 = pi.back().first;
+ EdgeId e2 = pi.back().second;
+
+ auto conj_pi = index.GetEdgePairInfo(g_.conjugate(e1), g_.conjugate(e2));
+
+ for (auto i1 = pi.begin(); i1 != pi.end(); ++i1) {
+ for (auto i2 = conj_pi.begin(); i2 != conj_pi.end(); ++i2) {
+ double new_d = i1->d - g_.length(e1) + g_.length(e2);
+ if (math::eq(i1->d, - new_d) && !math::eq(i1->weight, i2->weight)) {
+ INFO("No conjugate found ");
+ result = false;
+ }
+ }
+ }
+
+ }
+ return result;
+ }
+
+ bool AreEqual(PairedInfoIndex<Graph>& index1, PairedInfoIndex<Graph>& index2) {
+ bool result = true;
+ for (auto iter = index1.begin(); iter != index1.end(); ++iter) {
+ auto pi = *iter;
+ if (pi.size() == 0) {
+ continue;
+ }
+ EdgeId e1 = pi.back().first;
+ EdgeId e2 = pi.back().second;
+
+ auto pi2 = index2.GetEdgePairInfo(e1, e2);
+
+ for (auto i1 = pi.begin(); i1 != pi.end(); ++i1) {
+ for (auto i2 = pi2.begin(); i2 != pi2.end(); ++i2) {
+ if (math::eq(i1->d, i2->d) && !math::eq(i1->weight, i2->weight)) {
+ INFO("Unequal weights");
+ result = false;
+ }
+ }
+ }
+
+ }
+ return result;
+ }
+
+ void AggregatePairedInfo(PairedInfoIndex<Graph>& clustered, PairedInfoIndex<Graph>& advanced,
+ size_t insert_size, size_t read_length,
+ PairedInfoIndex<Graph>* result) {
+
+ PairedInfoWeightNormalizer<Graph> normalizer(g_, insert_size, read_length, K);
+
+ for (auto iter = clustered.begin(); iter != clustered.end(); ++iter) {
+ auto pi = *iter;
+ if (pi.size() == 0) {
+ continue;
+ }
+
+ EdgeId e1 = pi.back().first;
+ EdgeId e2 = pi.back().second;
+
+ auto pi2 = advanced.GetEdgePairInfo(e1, e2);
+
+ for (auto i1 = pi.begin(); i1 != pi.end(); ++i1) {
+
+ auto norm_pi = normalizer.NormalizeWeight(*i1);
+
+ for (auto i2 = pi2.begin(); i2 != pi2.end(); ++i2) {
+ if (math::ge(i1->d, i2->d - lc_cfg::get().u.dev) && math::le(i1->d, i2->d + lc_cfg::get().u.dev) && math::gr(i2->weight, 0.0)) {
+ norm_pi.weight *= lc_cfg::get().es.advanced_coeff;
+ }
+ }
+
+ result->AddPairInfo(norm_pi, false);
+ }
+
+ }
+
+ }
+
+};
+
+
+int main() {
+ cfg::create_instance(cfg_filename);
+ lc_cfg::create_instance(long_contigs::lc_cfg_filename);
+
+ Graph g(K);
+ EdgeIndex<K + 1, Graph> index(g);
+ PairedInfoIndex<Graph> pairedIndex(g, 0);
+ KmerMapper<K+1, Graph> mapper(g);
+ Sequence sequence("");
+
+ long_contigs::LoadFromFile(lc_cfg::get().ds.graph_file, &g, sequence, &mapper);
+ PairedInfoChecker checker(g);
+
+ DataScanner<Graph> dataScanner(g);
+
+ switch (lc_cfg::get().u.mode) {
+ case 1: {
+ INFO("Checking " << lc_cfg::get().u.file1);
+ dataScanner.loadPaired(lc_cfg::get().u.file1, pairedIndex);
+ INFO("Symmetric: " << checker.IsSymmetric(pairedIndex));
+ INFO("Conjugate symmetric: " << checker.IsConjugateSymmetric(pairedIndex));
+ break;
+ }
+ case 2: {
+ PairedInfoIndex<Graph> pairedIndex2(g, 0);
+ dataScanner.loadPaired(lc_cfg::get().u.file1, pairedIndex);
+ dataScanner.loadPaired(lc_cfg::get().u.file2, pairedIndex2);
+
+ INFO("Checking " << lc_cfg::get().u.file1 << " and " << lc_cfg::get().u.file2);
+ INFO("1 is subset of 2 " << checker.AreEqual(pairedIndex, pairedIndex2));
+ INFO("2 is subset of 1 " << checker.AreEqual(pairedIndex2, pairedIndex));
+ break;
+ }
+ case 3: {
+ INFO("Aggregating paired info");
+
+ PairedInfoIndex<Graph> cl(g, 0);
+ PairedInfoIndex<Graph> ad(g, 0);
+ PairedInfoIndex<Graph> res(g, 0);
+
+ dataScanner.loadPaired(lc_cfg::get().u.clustered, cl);
+ dataScanner.loadPaired(lc_cfg::get().u.advanced, ad);
+
+ checker.AggregatePairedInfo(cl, ad,
+ lc_cfg::get().u.insert_size, lc_cfg::get().u.read_size,
+ &res);
+
+ DataPrinter<Graph> dataPrinter(g);
+ dataPrinter.savePaired( "./" + lc_cfg::get().paired_info_file_prefix + "IS" + ToString(lc_cfg::get().u.insert_size) + "_RS" + ToString(lc_cfg::get().u.read_size)
+ + "_agregate_" + ToString(lc_cfg::get().es.advanced_coeff), res);
+
+ INFO("Done");
+ break;
+
+ }
+ default: {
+ INFO("Unknown mode");
+ }
+ }
+
+ return 0;
+}
+
diff --git a/src/debruijn/path_extend/utils/run_all_parametrs.py b/src/modules/algorithms/path_extend/utils/run_all_parametrs.py
similarity index 100%
rename from src/debruijn/path_extend/utils/run_all_parametrs.py
rename to src/modules/algorithms/path_extend/utils/run_all_parametrs.py
diff --git a/src/modules/algorithms/path_extend/weight_counter.hpp b/src/modules/algorithms/path_extend/weight_counter.hpp
new file mode 100644
index 0000000..a2d224b
--- /dev/null
+++ b/src/modules/algorithms/path_extend/weight_counter.hpp
@@ -0,0 +1,544 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+/*
+ * weight_counter.hpp
+ *
+ * Created on: Feb 19, 2012
+ * Author: andrey
+ */
+
+#ifndef WEIGHT_COUNTER_HPP_
+#define WEIGHT_COUNTER_HPP_
+
+#include "assembly_graph/paths/bidirectional_path.hpp"
+#include "paired_library.hpp"
+#include <algorithm>
+#include <boost/math/special_functions/fpclassify.hpp>
+
+namespace path_extend {
+
+inline int median(const vector<int>& dist, const vector<double>& w, int min, int max) {
+ VERIFY(dist.size() == w.size());
+ double S = 0;
+ for (size_t i = 0; i < w.size(); ++i) {
+ if (dist[i] >= min && dist[i] <= max)
+ S += w[i];
+ }
+ if (S == 0) {
+ DEBUG("Empty histogram");
+ return 0;
+ }
+
+ double sum = S;
+ for (size_t i = 0; i < w.size(); ++i) {
+ if (dist[i] >= min && dist[i] <= max) {
+ sum -= w[i];
+ if (sum <= S / 2) {
+ return dist[i];
+ }
+ }
+ }
+ VERIFY(false);
+ return -1;
+}
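median returns the first distance inside [min, max] at which the accumulated weight reaches half of the total weight within that window; entries outside the window are ignored. An illustrative call with made-up numbers:

    std::vector<int>    dist = {90, 100, 110, 400};
    std::vector<double> w    = {1.0, 5.0, 2.0, 10.0};
    int m = path_extend::median(dist, w, 0, 200);  // the 400/10.0 entry lies outside [0, 200]; returns 100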
+
+struct EdgeWithPairedInfo {
+ size_t e_;
+ double pi_;
+
+ EdgeWithPairedInfo(size_t e_, double pi) :
+ e_(e_), pi_(pi) {
+
+ }
+};
+
+struct EdgeWithDistance {
+ EdgeId e_;
+ int d_;
+
+ EdgeWithDistance(EdgeId e, size_t d) :
+ e_(e), d_((int) d) {
+ }
+
+ struct DistanceComparator {
+ bool operator()(const EdgeWithDistance& e1, const EdgeWithDistance& e2) {
+ if (e1.d_ == e2.d_)
+ return e1.e_ < e2.e_;
+ return e1.d_ > e2.d_;
+ }
+ };
+
+ //static DistanceComparator comparator;
+};
+
+class IdealInfoProvider {
+public:
+ virtual ~IdealInfoProvider() {}
+
+ virtual std::vector<EdgeWithPairedInfo> FindCoveredEdges(const BidirectionalPath& path, EdgeId candidate) const = 0;
+};
+
+class BasicIdealInfoProvider : public IdealInfoProvider {
+ const shared_ptr<PairedInfoLibrary> lib_;
+public:
+ BasicIdealInfoProvider(const shared_ptr<PairedInfoLibrary>& lib) : lib_(lib) {
+ }
+
+ std::vector<EdgeWithPairedInfo> FindCoveredEdges(const BidirectionalPath& path, EdgeId candidate) const override {
+ std::vector<EdgeWithPairedInfo> covered;
+ for (int i = (int) path.Size() - 1; i >= 0; --i) {
+ double w = lib_->IdealPairedInfo(path[i], candidate,
+ (int) path.LengthAt(i));
+ //FIXME think if we need extremely low ideal weights
+ if (math::gr(w, 0.)) {
+ covered.push_back(EdgeWithPairedInfo(i, w));
+ }
+ }
+ return covered;
+ }
+};
+
+class WeightCounter {
+
+protected:
+ const Graph& g_;
+ const shared_ptr<PairedInfoLibrary> lib_;
+ bool normalize_weight_;
+ shared_ptr<IdealInfoProvider> ideal_provider_;
+
+public:
+
+ WeightCounter(const Graph& g, const shared_ptr<PairedInfoLibrary>& lib,
+ bool normalize_weight = true,
+ shared_ptr<IdealInfoProvider> ideal_provider = nullptr) :
+ g_(g), lib_(lib), normalize_weight_(normalize_weight), ideal_provider_(ideal_provider) {
+ if (!ideal_provider_) {
+ ideal_provider_ = make_shared<BasicIdealInfoProvider>(lib);
+ }
+ }
+
+ virtual std::set<size_t> PairInfoExist(const BidirectionalPath& path, EdgeId e,
+ int gap = 0) const = 0;
+
+ virtual double CountWeight(const BidirectionalPath& path, EdgeId e,
+ const std::set<size_t>& excluded_edges = std::set<size_t>(), int gapLength = 0) const = 0;
+
+ const PairedInfoLibrary& lib() const {
+ return *lib_;
+ }
+
+ const shared_ptr<PairedInfoLibrary> get_libptr() const {
+ return lib_;
+ };
+
+private:
+ DECL_LOGGER("WeightCounter");
+};
+
+class ReadCountWeightCounter: public WeightCounter {
+
+ std::vector<EdgeWithPairedInfo> CountLib(const BidirectionalPath& path, EdgeId e,
+ int add_gap = 0) const {
+ std::vector<EdgeWithPairedInfo> answer;
+
+ for (const EdgeWithPairedInfo& e_w_pi : ideal_provider_->FindCoveredEdges(path, e)) {
+ double w = lib_->CountPairedInfo(path[e_w_pi.e_], e,
+ (int) path.LengthAt(e_w_pi.e_) + add_gap);
+
+ if (normalize_weight_) {
+ w /= e_w_pi.pi_;
+ }
+ answer.push_back(EdgeWithPairedInfo(e_w_pi.e_, w));
+ }
+
+ return answer;
+ }
+
+public:
+
+ ReadCountWeightCounter(const Graph& g, const shared_ptr<PairedInfoLibrary>& lib,
+ bool normalize_weight = true,
+ shared_ptr<IdealInfoProvider> ideal_provider = nullptr) :
+ WeightCounter(g, lib, normalize_weight, ideal_provider) {
+ }
+
+ double CountWeight(const BidirectionalPath& path, EdgeId e,
+ const std::set<size_t>& excluded_edges, int gap) const override {
+ double weight = 0.0;
+
+ for (const auto& e_w_pi : CountLib(path, e, gap)) {
+ if (!excluded_edges.count(e_w_pi.e_)) {
+ weight += e_w_pi.pi_;
+ }
+ }
+
+ return weight;
+ }
+
+ std::set<size_t> PairInfoExist(const BidirectionalPath& path, EdgeId e,
+ int gap = 0) const override {
+ std::set<size_t> answer;
+ for (const auto& e_w_pi : CountLib(path, e, gap)) {
+ if (math::gr(e_w_pi.pi_, 0.)) {
+ answer.insert(e_w_pi.e_);
+ }
+ }
+
+ return answer;
+ }
+
+};
+
+class PathCoverWeightCounter: public WeightCounter {
+ double single_threshold_;
+
+ double TotalIdealNonExcluded(const std::vector<EdgeWithPairedInfo>& ideally_covered_edges,
+ const std::set<size_t>& excluded_edges) const {
+ double ideal_total = 0.0;
+
+ for (const EdgeWithPairedInfo& e_w_pi : ideally_covered_edges) {
+ if (!excluded_edges.count(e_w_pi.e_))
+ ideal_total += e_w_pi.pi_;
+ }
+
+ return ideal_total;
+ }
+
+ std::vector<EdgeWithPairedInfo> CountLib(const BidirectionalPath& path, EdgeId e,
+ const std::vector<EdgeWithPairedInfo>& ideally_covered_edges, int add_gap = 0) const {
+ std::vector<EdgeWithPairedInfo> answer;
+
+ for (const EdgeWithPairedInfo& e_w_pi : ideally_covered_edges) {
+ double ideal_weight = e_w_pi.pi_;
+
+ double weight = lib_->CountPairedInfo(
+ path[e_w_pi.e_], e,
+ (int) path.LengthAt(e_w_pi.e_) + add_gap);
+
+ if (normalize_weight_) {
+ weight /= ideal_weight;
+ }
+
+ if (math::ge(weight, single_threshold_)) {
+ answer.push_back(EdgeWithPairedInfo(e_w_pi.e_, ideal_weight));
+ }
+ }
+
+ return answer;
+ }
+
+public:
+
+ PathCoverWeightCounter(const Graph& g, const shared_ptr<PairedInfoLibrary>& lib,
+ bool normalize_weight = true,
+ double single_threshold = -1.,
+ shared_ptr<IdealInfoProvider> ideal_provider = nullptr) :
+ WeightCounter(g, lib, normalize_weight, ideal_provider), single_threshold_(single_threshold) {
+ if (math::ls(single_threshold_, 0.)) {
+ single_threshold_ = lib_->GetSingleThreshold();
+ }
+ }
+
+ double CountWeight(const BidirectionalPath& path, EdgeId e,
+ const std::set<size_t>& excluded_edges, int gap) const override {
+ double lib_weight = 0.;
+ const auto ideal_coverage = ideal_provider_->FindCoveredEdges(path, e);
+
+ for (const auto& e_w_pi : CountLib(path, e, ideal_coverage, gap)) {
+ if (!excluded_edges.count(e_w_pi.e_)) {
+ lib_weight += e_w_pi.pi_;
+ }
+ }
+
+ double total_ideal_coverage = TotalIdealNonExcluded(ideal_coverage, excluded_edges);
+ return math::eq(total_ideal_coverage, 0.) ? 0. : lib_weight / total_ideal_coverage;
+ }
+
+ std::set<size_t> PairInfoExist(const BidirectionalPath& path, EdgeId e,
+ int gap = 0) const override {
+ std::set<size_t> answer;
+ for (const auto& e_w_pi : CountLib(path, e, ideal_provider_->FindCoveredEdges(path, e), gap)) {
+ if (math::gr(e_w_pi.pi_, 0.)) {
+ answer.insert(e_w_pi.e_);
+ }
+ }
+ return answer;
+ }
+};
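Put differently, CountWeight reports which fraction of the ideal paired-info weight over the covered, non-excluded path edges is actually supported: an edge counts as supported once its observed weight (normalized by the ideal one when normalize_weight_ is set) reaches single_threshold_. A standalone sketch of that arithmetic with made-up numbers:

    struct Covered { double ideal; double observed_over_ideal; };   // hypothetical per-edge values
    std::vector<Covered> covered = {{10.0, 0.8}, {6.0, 0.2}};
    double threshold = 0.5, supported = 0.0, total_ideal = 0.0;
    for (const auto &c : covered) {
        total_ideal += c.ideal;
        if (c.observed_over_ideal >= threshold)                      // 0.8 passes, 0.2 does not
            supported += c.ideal;
    }
    double weight = supported / total_ideal;                         // 10 / 16 = 0.625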
+
+class CoverageAwareIdealInfoProvider : public BasicIdealInfoProvider {
+ static constexpr double MAGIC_COEFF = 2.;
+ const Graph& g_;
+ size_t read_length_;
+ size_t estimation_edge_length_;
+
+public:
+ //works for single lib only!!!
+ double EstimatePathCoverage(const BidirectionalPath& path) const {
+ double answer = -1.0;
+ for (int i = (int) path.Size() - 1; i >= 0; --i) {
+ EdgeId e = path.At(i);
+ if (g_.length(e) > estimation_edge_length_) {
+ if (answer < 0 || g_.coverage(e) < answer) {
+ answer = g_.coverage(e);
+ }
+ }
+ }
+ return answer;
+ }
+
+ CoverageAwareIdealInfoProvider(const Graph& g, const shared_ptr<PairedInfoLibrary>& lib,
+ size_t read_length, size_t estimation_edge_length) :
+ BasicIdealInfoProvider(lib), g_(g), read_length_(read_length),
+ estimation_edge_length_(estimation_edge_length) {
+ VERIFY(read_length_ > g_.k());
+ }
+
+ std::vector<EdgeWithPairedInfo> FindCoveredEdges(const BidirectionalPath& path, EdgeId candidate) const override {
+ VERIFY(read_length_ != -1ul);
+ double estimated_coverage = EstimatePathCoverage(path);
+ VERIFY(math::gr(estimated_coverage, 0.));
+
+ double correction_coeff = estimated_coverage / ((double(read_length_) - double(g_.k())) * MAGIC_COEFF);
+
+ std::vector<EdgeWithPairedInfo> answer = BasicIdealInfoProvider::FindCoveredEdges(path, candidate);
+ for (auto& e_w_pi : answer) {
+ e_w_pi.pi_ *= correction_coeff;
+ }
+ return answer;
+ }
+};
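Here every ideal weight from the base provider gets multiplied by estimated_coverage / ((read_length - k) * MAGIC_COEFF). With illustrative numbers only (estimated path coverage 30, read length 100, k = 55):

    double correction_coeff = 30.0 / ((100.0 - 55.0) * 2.0);  // = 1/3: ideal weights are scaled to a third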
+
+//FIXME optimize number of calls of EstimatePathCoverage(path)
+class MetagenomicWeightCounter: public WeightCounter {
+ static const size_t LENGTH_BOUND = 500;
+ shared_ptr<CoverageAwareIdealInfoProvider> cov_info_provider_;
+ shared_ptr<WeightCounter> normalizing_wc_;
+ shared_ptr<WeightCounter> raw_wc_;
+
+public:
+
+ //a negative raw_threshold effectively halts extension (weight 0) when the path contains no sufficiently long edges
+ MetagenomicWeightCounter(const Graph& g, const shared_ptr<PairedInfoLibrary>& lib,
+ size_t read_length, double normalized_threshold, double raw_threshold,
+ size_t estimation_edge_length = LENGTH_BOUND) :
+ WeightCounter(g, lib) {
+ cov_info_provider_ = make_shared<CoverageAwareIdealInfoProvider>(g, lib, read_length, estimation_edge_length);
+ normalizing_wc_ = make_shared<PathCoverWeightCounter>(g, lib, true, normalized_threshold, cov_info_provider_);
+ if (math::ge(raw_threshold, 0.)) {
+ raw_wc_ = make_shared<PathCoverWeightCounter>(g, lib, false, raw_threshold);
+ }
+ }
+
+ double CountWeight(const BidirectionalPath& path, EdgeId e,
+ const std::set<size_t>& excluded_edges, int gap = 0) const override {
+ if (math::gr(cov_info_provider_->EstimatePathCoverage(path), 0.)) {
+ return normalizing_wc_->CountWeight(path, e, excluded_edges, gap);
+ } else if (raw_wc_) {
+ return raw_wc_->CountWeight(path, e, excluded_edges, gap);
+ } else {
+ return 0.;
+ }
+ }
+
+ std::set<size_t> PairInfoExist(const BidirectionalPath& path, EdgeId e,
+ int gap = 0) const override {
+ static std::set<size_t> empty;
+ if (math::gr(cov_info_provider_->EstimatePathCoverage(path), 0.)) {
+ return normalizing_wc_->PairInfoExist(path, e, gap);
+ } else if (raw_wc_) {
+ return raw_wc_->PairInfoExist(path, e, gap);
+ } else {
+ return empty;
+ }
+ }
+};
+
+class PathsWeightCounter {
+public:
+ PathsWeightCounter(const Graph& g, shared_ptr<PairedInfoLibrary> lib, size_t min_read_count);
+ PathsWeightCounter(const PathsWeightCounter& w);
+ map<size_t, double> FindPairInfoFromPath(
+ const BidirectionalPath& path1, size_t from1, size_t to1,
+ const BidirectionalPath& path2, size_t from2, size_t to2) const;
+ double CountPairInfo(const BidirectionalPath& path1, size_t from1,
+ size_t to1, const BidirectionalPath& path2,
+ size_t from2, size_t to2, bool normalize = true) const;
+ double CountPairInfo(const BidirectionalPath& path1, size_t from1,
+ size_t to1, EdgeId edge, size_t gap) const;
+ void SetCommonWeightFrom(size_t iedge, double weight);
+ void ClearCommonWeight();
+ void FindJumpCandidates(EdgeId e, int min_dist, int max_dist, size_t min_len, set<EdgeId>& result) const;
+ void FindJumpEdges(EdgeId e, set<EdgeId>& candidates, int min_dist, int max_dist, vector<EdgeWithDistance>& result) const;
+ const shared_ptr<PairedInfoLibrary> GetLib() const {
+ return lib_;
+ }
+ bool HasPI(EdgeId e1, EdgeId e2, int dist) const;
+ bool HasPI(EdgeId e1, EdgeId e2, size_t dist_min, size_t dist_max) const;
+ double PI(EdgeId e1, EdgeId e2, int dist) const;
+ bool HasIdealPI(EdgeId e1, EdgeId e2, int dist) const;
+ double IdealPI(EdgeId e1, EdgeId e2, int dist) const;
+
+private:
+ void FindPairInfo(const BidirectionalPath& path1, size_t from1, size_t to1,
+ const BidirectionalPath& path2, size_t from2, size_t to2,
+ map<size_t, double>& pi, double& ideal_pi) const;
+ void FindPairInfo(EdgeId e1, EdgeId e2, size_t dist, double& ideal_w,
+ double& result_w) const;
+
+ const Graph& g_;
+ shared_ptr<PairedInfoLibrary> lib_;
+ std::map<size_t, double> common_w_;
+ size_t min_read_count_;
+ DECL_LOGGER("WeightCounter");
+};
+
+inline PathsWeightCounter::PathsWeightCounter(const Graph& g, shared_ptr<PairedInfoLibrary> lib, size_t min_read_count): g_(g), lib_(lib), min_read_count_(min_read_count) {
+
+}
+
+inline PathsWeightCounter::PathsWeightCounter(const PathsWeightCounter& w): g_(w.g_), lib_(w.lib_), min_read_count_(w.min_read_count_) {
+
+}
+
+inline double PathsWeightCounter::CountPairInfo(const BidirectionalPath& path1,
+ size_t from1, size_t to1,
+ const BidirectionalPath& path2,
+ size_t from2, size_t to2, bool normalize) const {
+ map<size_t, double> pi;
+ double ideal_pi = 0.0;
+ FindPairInfo(path1, from1, to1, path2, from2, to2,
+ pi, ideal_pi);
+ double result = 0.0;
+ double all_common = 0.0;
+ for (size_t i = from1; i < to1; ++i) {
+ if (common_w_.find(i) != common_w_.end()) {
+ all_common += common_w_.at(i);
+ }
+ result += pi[i];
+ }
+ DEBUG("ideal _pi " << ideal_pi << " common " << all_common << " result " << result);
+ ideal_pi -= all_common;
+ result -= all_common;
+ double total_result = math::gr(ideal_pi, 0.0) ? result / ideal_pi : 0.0;
+ total_result = math::gr(total_result, 0.0) ? total_result : 0.0;
+ DEBUG("ideal _pi " << ideal_pi << " result " << result << " total_result " << total_result);
+ return normalize ? total_result : result;
+}
+
+inline double PathsWeightCounter::CountPairInfo(const BidirectionalPath& path1,
+ size_t from1, size_t to1, EdgeId edge,
+ size_t gap) const {
+ double result = 0.0;
+ for (size_t i1 = from1; i1 < to1; ++i1) {
+ double ideal_w, w;
+ FindPairInfo(path1.At(i1), edge, gap + path1.LengthAt(i1), ideal_w, w);
+ result += w;
+ }
+ return result;
+}
+
+inline void PathsWeightCounter::FindPairInfo(const BidirectionalPath& path1,
+ size_t from1, size_t to1,
+ const BidirectionalPath& path2,
+ size_t from2, size_t to2,
+ map<size_t, double>& pi,
+ double& ideal_pi) const {
+ stringstream str;
+ for (size_t i = 0; i < path2.Size(); ++i) {
+ str << g_.int_id(path2.At(i)) << " ";
+ }
+ DEBUG("pair info for path " << str.str());
+ for (size_t i1 = from1; i1 < to1; ++i1) {
+ for (size_t i2 = from2; i2 < to2; ++i2) {
+ size_t dist = path1.LengthAt(i1) + path2.Length()
+ - path2.LengthAt(i2);
+ double ideal_w = 0.0;
+ double w = 0.0;
+ FindPairInfo(path1.At(i1), path2.At(i2), dist, ideal_w, w);
+ ideal_pi += ideal_w;
+ if (pi.find(i1) == pi.end()) {
+ pi[i1] = 0;
+ }
+ pi[i1] += w;
+ }
+ }
+}
+
+inline void PathsWeightCounter::FindPairInfo(EdgeId e1, EdgeId e2, size_t dist,
+ double& ideal_w, double& result_w) const {
+ ideal_w = lib_->IdealPairedInfo(e1, e2, (int) dist, true);
+ result_w = 0.0;
+ if (ideal_w == 0.0) {
+ return;
+ }
+ if (HasPI(e1, e2, (int) dist)) {
+ result_w = ideal_w;
+ }
+}
+
+inline map<size_t, double> PathsWeightCounter::FindPairInfoFromPath(
+ const BidirectionalPath& path1, size_t from1, size_t to1,
+ const BidirectionalPath& path2, size_t from2, size_t to2) const {
+ map<size_t, double> pi;
+ double ideal_pi = 0;
+ FindPairInfo(path1, from1, to1, path2, from2, to2, pi, ideal_pi);
+ return pi;
+}
+
+inline void PathsWeightCounter::FindJumpCandidates(EdgeId e, int min_dist, int max_dist, size_t min_len, set<EdgeId>& result) const {
+ result.clear();
+ lib_->FindJumpEdges(e, result, min_dist, max_dist, min_len);
+}
+
+inline void PathsWeightCounter::FindJumpEdges(EdgeId e, set<EdgeId>& edges, int min_dist, int max_dist, vector<EdgeWithDistance>& result) const {
+ result.clear();
+
+ for (auto e2 = edges.begin(); e2 != edges.end(); ++e2) {
+ vector<int> distances;
+ vector<double> weights;
+ lib_->CountDistances(e, *e2, distances, weights);
+ int median_distance = median(distances, weights, min_dist, max_dist);
+
+ if (HasPI(e, *e2, median_distance)) {
+ result.push_back(EdgeWithDistance(*e2, median_distance));
+ }
+ }
+}
+
+inline void PathsWeightCounter::SetCommonWeightFrom(size_t iedge, double weight) {
+ common_w_[iedge] = weight;
+}
+
+inline void PathsWeightCounter::ClearCommonWeight() {
+ common_w_.clear();
+}
+
+inline double PathsWeightCounter::PI(EdgeId e1, EdgeId e2, int dist) const {
+ double w = lib_->CountPairedInfo(e1, e2, dist, true);
+ return w > (double) min_read_count_ ? w : 0.0;
+}
+
+inline bool PathsWeightCounter::HasPI(EdgeId e1, EdgeId e2, int dist) const {
+ return lib_->CountPairedInfo(e1, e2, dist, true) > (double) min_read_count_;
+}
+
+inline bool PathsWeightCounter::HasIdealPI(EdgeId e1, EdgeId e2, int dist) const {
+ return lib_->IdealPairedInfo(e1, e2, dist, true) > 0.0;
+}
+
+inline double PathsWeightCounter::IdealPI(EdgeId e1, EdgeId e2, int dist) const {
+ return lib_->IdealPairedInfo(e1, e2, dist, true);
+}
+
+inline bool PathsWeightCounter::HasPI(EdgeId e1, EdgeId e2, size_t dist_min, size_t dist_max) const {
+ return lib_->CountPairedInfo(e1, e2, (int) dist_min, (int) dist_max) > min_read_count_;
+}
+};
+
+#endif /* WEIGHT_COUNTER_HPP_ */
diff --git a/src/modules/algorithms/simplification/bulge_remover.hpp b/src/modules/algorithms/simplification/bulge_remover.hpp
new file mode 100644
index 0000000..1ab3de6
--- /dev/null
+++ b/src/modules/algorithms/simplification/bulge_remover.hpp
@@ -0,0 +1,783 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+/*
+ * bulge_remover.hpp
+ *
+ * Created on: Apr 13, 2011
+ * Author: sergey
+ */
+
+#pragma once
+
+#include "assembly_graph/graph_support/parallel_processing.hpp"
+#include "assembly_graph/graph_support/basic_edge_conditions.hpp"
+#include "assembly_graph/graph_support/graph_processing_algorithm.hpp"
+#include "assembly_graph/paths/path_processor.hpp"
+#include "assembly_graph/graph_support/comparators.hpp"
+#include "assembly_graph/components/graph_component.hpp"
+#include "data_structures/sequence/sequence_tools.hpp"
+#include "dev_support/standard_base.hpp"
+#include <cmath>
+#include <stack>
+#include "math/xmath.h"
+
+namespace omnigraph {
+
+template<class Graph>
+struct SimplePathCondition {
+ typedef typename Graph::EdgeId EdgeId;
+ const Graph& g_;
+
+ SimplePathCondition(const Graph& g) :
+ g_(g) {
+
+ }
+
+ bool operator()(EdgeId edge, const vector<EdgeId>& path) const {
+ if (edge == g_.conjugate(edge))
+ return false;
+ for (size_t i = 0; i < path.size(); ++i)
+ if (edge == path[i] || edge == g_.conjugate(path[i]))
+ return false;
+ for (size_t i = 0; i < path.size(); ++i) {
+ if (path[i] == g_.conjugate(path[i])) {
+ return false;
+ }
+ for (size_t j = i + 1; j < path.size(); ++j)
+ if (path[i] == path[j] || path[i] == g_.conjugate(path[j]))
+ return false;
+ }
+ return true;
+ }
+};
+
+template<class Graph>
+bool TrivialCondition(typename Graph::EdgeId,
+ const vector<typename Graph::EdgeId>& path) {
+ for (size_t i = 0; i < path.size(); ++i)
+ for (size_t j = i + 1; j < path.size(); ++j)
+ if (path[i] == path[j])
+ return false;
+ return true;
+}
+
+template<class Graph>
+class MostCoveredSimpleAlternativePathChooser: public PathProcessor<Graph>::Callback {
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+
+ const Graph& g_;
+ EdgeId forbidden_edge_;
+
+ double max_coverage_;
+ vector<EdgeId> most_covered_path_;
+
+public:
+
+ MostCoveredSimpleAlternativePathChooser(const Graph& g, EdgeId edge) :
+ g_(g), forbidden_edge_(edge), max_coverage_(-1.0) {
+
+ }
+
+ void HandleReversedPath(const vector<EdgeId>& reversed_path) override {
+ vector<EdgeId> path = this->ReversePath(reversed_path);
+ double path_cov = AvgCoverage(g_, path);
+ for (size_t i = 0; i < path.size(); i++) {
+ if (path[i] == forbidden_edge_)
+ return;
+ }
+ if (path_cov > max_coverage_ && SimplePathCondition<Graph>(g_)(forbidden_edge_, path)) {
+ max_coverage_ = path_cov;
+ most_covered_path_ = path;
+ }
+ }
+
+ double max_coverage() {
+ return max_coverage_;
+ }
+
+ const vector<EdgeId>& most_covered_path() {
+ return most_covered_path_;
+ }
+};
+
+inline size_t CountMaxDifference(size_t absolute_diff, size_t length, double relative_diff) {
+ return std::max((size_t) std::floor(relative_diff * (double) length), absolute_diff);
+}
+
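+/**
+ * Performs the actual bulge projection: the bulge edge is split at positions aligned (via
+ * EnsureEndsPositionAligner) with the cumulative lengths of the alternative path edges, each piece is glued
+ * onto the corresponding path edge, and the start/end vertices are compressed afterwards. The optional
+ * callback and removal handler are invoked before gluing.
+ */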
+template<class Graph>
+class BulgeGluer {
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+ typedef std::function<void(EdgeId edge, const vector<EdgeId>& path)> BulgeCallbackF;
+ Graph& g_;
+ BulgeCallbackF opt_callback_;
+ std::function<void(EdgeId)> removal_handler_;
+
+ void InnerProcessBulge(EdgeId edge, const vector<EdgeId>& path) {
+
+ EnsureEndsPositionAligner aligner(CumulativeLength(g_, path),
+ g_.length(edge));
+ double prefix_length = 0.;
+ vector<size_t> bulge_prefix_lengths;
+
+ for (EdgeId e : path) {
+ prefix_length += (double) g_.length(e);
+ bulge_prefix_lengths.push_back(aligner.GetPosition((size_t) prefix_length));
+ }
+
+ EdgeId edge_to_split = edge;
+ size_t prev_length = 0;
+
+ TRACE("Process bulge " << path.size() << " edges");
+
+ //fixme remove after checking results
+ bool flag = false;
+ VERIFY(bulge_prefix_lengths.back() == g_.length(edge));
+
+ for (size_t i = 0; i < path.size(); ++i) {
+ if (bulge_prefix_lengths[i] > prev_length) {
+ if (bulge_prefix_lengths[i] - prev_length
+ != g_.length(edge_to_split)) {
+
+ TRACE("SplitEdge " << g_.str(edge_to_split));
+ TRACE(
+ "Start: " << g_.str(g_.EdgeStart(edge_to_split)));
+ TRACE(
+ "End: " << g_.str(g_.EdgeEnd(edge_to_split)));
+
+ pair<EdgeId, EdgeId> split_result = g_.SplitEdge(
+ edge_to_split,
+ bulge_prefix_lengths[i] - prev_length);
+
+ edge_to_split = split_result.second;
+
+ TRACE("GlueEdges " << g_.str(split_result.first));
+ flag = true;
+ g_.GlueEdges(split_result.first, path[i]);
+
+ } else {
+ TRACE("GlueEdges " << g_.str(edge_to_split));
+ flag = true;
+ g_.GlueEdges(edge_to_split, path[i]);
+ }
+ }
+ prev_length = bulge_prefix_lengths[i];
+ }
+ VERIFY(flag);
+ }
+
+public:
+
+ BulgeGluer(Graph& g, BulgeCallbackF opt_callback = 0,
+ std::function<void(EdgeId)> removal_handler = 0) :
+ g_(g),
+ opt_callback_(opt_callback),
+ removal_handler_(removal_handler) {
+
+ }
+
+ void operator()(EdgeId edge, const vector<EdgeId>& path) {
+ if (opt_callback_)
+ opt_callback_(edge, path);
+
+ if (removal_handler_)
+ removal_handler_(edge);
+
+ VertexId start = g_.EdgeStart(edge);
+ VertexId end = g_.EdgeEnd(edge);
+
+ TRACE("Projecting edge " << g_.str(edge));
+ InnerProcessBulge(edge, path);
+
+ TRACE("Compressing start vertex " << g_.str(start));
+ g_.CompressVertex(start);
+
+ TRACE("Compressing end vertex " << g_.str(end));
+ g_.CompressVertex(end);
+ }
+
+};
+
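+/**
+ * For an edge that is short enough (length <= max_length) and not too highly covered (coverage <= max_coverage),
+ * looks for the most covered simple alternative path between its endpoints whose length differs from the edge
+ * length by at most CountMaxDifference(max_delta, length, max_relative_delta); returns that path if the bulge
+ * condition is satisfied and an empty path otherwise.
+ */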
+template<class Graph>
+class AlternativesAnalyzer {
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+ const Graph& g_;
+ double max_coverage_;
+ size_t max_length_;
+ double max_relative_coverage_;
+ size_t max_delta_;
+ double max_relative_delta_;
+ size_t max_edge_cnt_;
+
+ static vector<EdgeId> EmptyPath() {
+ static vector<EdgeId> vec = {};
+ return vec;
+ }
+
+ /**
+ * Checks that the alternative path is simple (contains no repeated or mutually conjugate edges, no
+ * self-conjugate edges, and neither e nor conjugate(e)) and that its average coverage multiplied by
+ * max_relative_coverage_ is at least g.coverage(e).
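+ * (For example, with max_relative_coverage_ = 1.1, an alternative path of average coverage 10.0 justifies
+ * removing a bulge edge of coverage up to 11.0.)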
+ */
+ bool BulgeCondition(EdgeId e, const vector<EdgeId>& path,
+ double path_coverage) const {
+ return math::ge(path_coverage * max_relative_coverage_,
+ g_.coverage(e)) && SimplePathCondition<Graph>(g_)(e, path);
+ }
+
+public:
+ AlternativesAnalyzer(const Graph& g, double max_coverage, size_t max_length,
+ double max_relative_coverage, size_t max_delta,
+ double max_relative_delta, size_t max_edge_cnt) :
+ g_(g),
+ max_coverage_(max_coverage),
+ max_length_(max_length),
+ max_relative_coverage_(max_relative_coverage),
+ max_delta_(max_delta),
+ max_relative_delta_(max_relative_delta),
+ max_edge_cnt_(max_edge_cnt) {
+ DEBUG("Created alternatives analyzer max_length=" << max_length
+ << " max_coverage=" << max_coverage
+ << " max_relative_coverage=" << max_relative_coverage
+ << " max_delta=" << max_delta
+ << " max_relative_delta=" << max_relative_delta);
+ }
+
+ vector<EdgeId> operator()(EdgeId e) const {
+ if (g_.length(e) > max_length_ || math::gr(g_.coverage(e), max_coverage_)) {
+ return EmptyPath();
+ }
+
+ size_t kplus_one_mer_coverage = (size_t) math::round((double) g_.length(e) * g_.coverage(e));
+ TRACE("Processing edge " << g_.str(e) << " and coverage " << kplus_one_mer_coverage);
+
+ size_t delta = CountMaxDifference(max_delta_, g_.length(e), max_relative_delta_);
+
+ MostCoveredSimpleAlternativePathChooser<Graph> path_chooser(g_, e);
+
+ VertexId start = g_.EdgeStart(e);
+ TRACE("Start " << g_.str(start));
+ VertexId end = g_.EdgeEnd(e);
+ TRACE("End " << g_.str(end));
+
+ ProcessPaths(g_, (g_.length(e) > delta) ? g_.length(e) - delta : 0,
+ g_.length(e) + delta, start, end, path_chooser, max_edge_cnt_);
+
+ const vector<EdgeId>& path = path_chooser.most_covered_path();
+ if (!path.empty()) {
+ VERIFY(g_.EdgeStart(path[0]) == start);
+ VERIFY(g_.EdgeEnd(path.back()) == end);
+ }
+
+ double path_coverage = path_chooser.max_coverage();
+ if (math::gr(path_coverage, 0.)) {
+ TRACE("Best path with coverage " << path_coverage << " is " << PrintPath(g_, path));
+
+ if (BulgeCondition(e, path, path_coverage)) {
+ TRACE("Satisfied condition");
+ return path;
+ } else {
+ TRACE("Didn't satisfy condition");
+ return EmptyPath();
+ }
+ } else {
+ TRACE("Didn't find alternative");
+ return EmptyPath();
+ }
+ }
+
+ double max_coverage() const {
+ return max_coverage_;
+ }
+
+ size_t max_length() const {
+ return max_length_;
+ }
+
+private:
+ DECL_LOGGER("AlternativesAnalyzer");
+};
+
+template<class Graph>
+pred::TypedPredicate<typename Graph::EdgeId>
+NecessaryBulgeCondition(const Graph& g, size_t max_length, double max_coverage) {
+ return AddAlternativesPresenceCondition(g,
+ pred::And(LengthUpperBound<Graph>(g, max_length),
+ CoverageUpperBound<Graph>(g, max_coverage)));
+}
+
+/**
+ * This class removes simple bulges from the given graph with the following algorithm: it iterates through all
+ * edges of the graph, checks for each edge whether it is likely to be a simple bulge and, if the edge is judged
+ * to be one, removes it by gluing it onto the most covered alternative path. A usage sketch follows the
+ * BulgeRemover class below.
+ */
+//template<class Graph>
+//class OldBulgeRemover: public EdgeProcessingAlgorithm<Graph> {
+// typedef EdgeProcessingAlgorithm<Graph> base;
+// typedef typename Graph::EdgeId EdgeId;
+// typedef typename Graph::VertexId VertexId;
+//
+//protected:
+//
+// /*virtual*/
+// bool ProcessEdge(EdgeId e) {
+// TRACE("Considering edge " << this->g().str(e)
+// << " of length " << this->g().length(e)
+// << " and avg coverage " << this->g().coverage(e));
+//
+// if (!HasAlternatives(this->g(), e)) {
+// TRACE("Not possible bulge edge");
+// return false;
+// }
+//
+// for (const auto& analyzer : alternatives_analyzers_) {
+// vector<EdgeId> alternative = analyzer(e);
+// if (!alternative.empty()) {
+// gluer_(e, alternative);
+// return true;
+// }
+// }
+// return false;
+// }
+//
+//public:
+//
+// typedef std::function<void(EdgeId edge, const vector<EdgeId>& path)> BulgeCallbackF;
+//
+//// BulgeRemover(Graph& g, double max_coverage, size_t max_length,
+//// double max_relative_coverage, size_t max_delta,
+//// double max_relative_delta,
+//// size_t max_edge_cnt,
+//// BulgeCallbackF opt_callback = 0,
+//// std::function<void(EdgeId)> removal_handler = 0) :
+//// base(g, true),
+//// gluer_(g, opt_callback, removal_handler) {
+//// DEBUG("Launching br max_length=" << max_length
+//// << " max_coverage=" << max_coverage
+//// << " max_relative_coverage=" << max_relative_coverage
+//// << " max_delta=" << max_delta
+//// << " max_relative_delta=" << max_relative_delta
+//// << " max_number_edges=" << max_edge_cnt);
+//// alternatives_analyzers_.push_back(
+//// AlternativesAnalyzer<Graph>(g, max_coverage,
+//// max_length, max_relative_coverage,
+//// max_delta, max_relative_delta, max_edge_cnt));
+//// }
+//
+// OldBulgeRemover(Graph& g,
+// const std::vector<AlternativesAnalyzer<Graph>>& alternatives_analyzers,
+// BulgeCallbackF opt_callback = 0,
+// std::function<void(EdgeId)> removal_handler = 0) :
+// base(g, true),
+// alternatives_analyzers_(alternatives_analyzers),
+// gluer_(g, opt_callback, removal_handler) {
+// }
+//
+//private:
+// std::vector<AlternativesAnalyzer<Graph>> alternatives_analyzers_;
+// BulgeGluer<Graph> gluer_;
+//private:
+// DECL_LOGGER("BulgeRemover")
+//};
+
+template<class Graph>
+inline double AbsoluteMaxCoverage(const std::vector<AlternativesAnalyzer<Graph>>& alternatives_analyzers) {
+ double ans = -1.;
+ for (const auto& analyzer : alternatives_analyzers) {
+ ans = std::max(ans, analyzer.max_coverage());
+ }
+ return ans;
+}
+
+//fixme maybe switch on parallel finder?
+template<class Graph, class InterestingElementFinder>
+class BulgeRemover: public PersistentProcessingAlgorithm<Graph,
+ typename Graph::EdgeId,
+ InterestingElementFinder,
+ CoverageComparator<Graph>> {
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+ typedef PersistentProcessingAlgorithm<Graph, EdgeId,
+ InterestingElementFinder, CoverageComparator<Graph>> base;
+
+protected:
+
+ /*virtual*/
+ bool Process(EdgeId e) {
+ TRACE("Considering edge " << this->g().str(e)
+ << " of length " << this->g().length(e)
+ << " and avg coverage " << this->g().coverage(e));
+
+ if (!HasAlternatives(this->g(), e)) {
+ TRACE("Not possible bulge edge");
+ return false;
+ }
+
+ vector<EdgeId> alternative = alternatives_analyzer_(e);
+ if (!alternative.empty()) {
+ gluer_(e, alternative);
+ return true;
+ }
+ return false;
+ }
+
+public:
+
+ typedef std::function<void(EdgeId edge, const vector<EdgeId>& path)> BulgeCallbackF;
+
+// BulgeRemover(Graph& g, double max_coverage, size_t max_length,
+// double max_relative_coverage, size_t max_delta,
+// double max_relative_delta,
+// size_t max_edge_cnt,
+// BulgeCallbackF opt_callback = 0,
+// std::function<void(EdgeId)> removal_handler = 0) :
+// base(g, true),
+// gluer_(g, opt_callback, removal_handler) {
+// DEBUG("Launching br max_length=" << max_length
+// << " max_coverage=" << max_coverage
+// << " max_relative_coverage=" << max_relative_coverage
+// << " max_delta=" << max_delta
+// << " max_relative_delta=" << max_relative_delta
+// << " max_number_edges=" << max_edge_cnt);
+// alternatives_analyzers_.push_back(
+// AlternativesAnalyzer<Graph>(g, max_coverage,
+// max_length, max_relative_coverage,
+// max_delta, max_relative_delta, max_edge_cnt));
+// }
+
+ BulgeRemover(Graph& g, const InterestingElementFinder& interesting_finder,
+ const AlternativesAnalyzer<Graph>& alternatives_analyzer,
+ BulgeCallbackF opt_callback = 0,
+ std::function<void(EdgeId)> removal_handler = 0,
+ bool track_changes = true) :
+ base(g,
+ interesting_finder,
+ /*canonical_only*/true,
+ CoverageComparator<Graph>(g),
+ track_changes),
+ alternatives_analyzer_(alternatives_analyzer),
+ gluer_(g, opt_callback, removal_handler) {
+ }
+
+private:
+ AlternativesAnalyzer<Graph> alternatives_analyzer_;
+ BulgeGluer<Graph> gluer_;
+private:
+ DECL_LOGGER("BulgeRemover")
+};
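+// Usage sketch (illustrative only; the finder, the variable names and the parameter values below are
+// hypothetical, and Run() is assumed to be provided by the PersistentProcessingAlgorithm base):
+//
+//   AlternativesAnalyzer<Graph> analyzer(g, /*max_coverage*/ 1000., /*max_length*/ 150,
+//                                        /*max_relative_coverage*/ 1.1, /*max_delta*/ 3,
+//                                        /*max_relative_delta*/ 0.1, /*max_edge_cnt*/ 1000);
+//   BulgeRemover<Graph, InterestingElementFinder> remover(g, interesting_finder, analyzer);
+//   remover.Run();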
+
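+/**
+ * Buffered, partially parallel counterpart of BulgeRemover: interesting edges are collected (in coverage order)
+ * into a buffer whose coverage spread is limited by buff_cov_diff / buff_cov_rel_diff, alternative paths are
+ * searched in parallel, bulges that do not share edges are glued immediately, and the remaining interacting
+ * edges are re-processed one by one with the usual strategy.
+ */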
+template<class Graph, class InterestingElementFinder>
+class ParallelBulgeRemover : public PersistentAlgorithmBase<Graph> {
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+ typedef SmartSetIterator<Graph, EdgeId, CoverageComparator<Graph>> SmartEdgeSet;
+
+ size_t buff_size_;
+ double buff_cov_diff_;
+ double buff_cov_rel_diff_;
+ AlternativesAnalyzer<Graph> alternatives_analyzer_;
+ BulgeGluer<Graph> gluer_;
+ InterestingElementFinder interesting_edge_finder_;
+ //todo remove
+ bool tracking_;
+
+ size_t curr_iteration_;
+
+ SmartEdgeSet it_;
+
+ static vector<EdgeId> EmptyPath() {
+ static vector<EdgeId> vec = {};
+ return vec;
+ }
+
+ struct BulgeInfo : private boost::noncopyable {
+ size_t id;
+ EdgeId e;
+ std::vector<EdgeId> alternative;
+
+ BulgeInfo() :
+ id(-1ul) {
+ }
+
+ BulgeInfo(size_t id_, EdgeId e_, std::vector<EdgeId> alternative_) :
+ id(id_), e(e_), alternative(std::move(alternative_)) {
+
+ }
+
+ BulgeInfo(BulgeInfo&& that) {
+ *this = std::move(that);
+ }
+
+ BulgeInfo& operator= (BulgeInfo&& that) {
+ id = that.id;
+ e = that.e;
+ alternative = std::move(that.alternative);
+ return *this;
+ }
+
+// BulgeInfo(size_t id_, EdgeId e_, std::vector<EdgeId>&& alternative_) :
+// id(id_), e(e_), alternative(std::move(alternative_)) {
+//
+// }
+//
+ bool operator< (const BulgeInfo& that) const {
+// VERIFY_MSG(id != that.id, "Ooops " << id);
+ return id < that.id;
+ }
+
+ std::string str(const Graph& g) const {
+ std::stringstream ss;
+ ss << "BulgeInfo " << id
+ << " e: " << g.str(e)
+ << " path: " << PrintPath(g, alternative);
+ return ss.str();
+ }
+
+ };
+
+ bool CheckInteracting(const BulgeInfo& info, const std::unordered_set<EdgeId>& involved_edges) const {
+ if (involved_edges.count(info.e))
+ return true;
+ for (EdgeId e : info.alternative)
+ if (involved_edges.count(e))
+ return true;
+ return false;
+ }
+
+ void AccountEdge(EdgeId e, std::unordered_set<EdgeId>& involved_edges) const {
+ TRACE("Pushing edge " << this->g().str(e));
+ involved_edges.insert(e);
+ EdgeId conj = this->g().conjugate(e);
+ TRACE("Pushing edge " << this->g().str(conj));
+ involved_edges.insert(conj);
+ }
+
+ void AccountEdges(const BulgeInfo& info, std::unordered_set<EdgeId>& involved_edges) const {
+ AccountEdge(info.e, involved_edges);
+ for (EdgeId e : info.alternative) {
+ AccountEdge(e, involved_edges);
+ }
+ }
+
+ //false if time to stop
+ bool FillEdgeBuffer(vector<EdgeId>& buffer, pred::TypedPredicate<EdgeId> proceed_condition) {
+ VERIFY(buffer.empty());
+ DEBUG("Filling edge buffer of size " << buff_size_);
+ perf_counter perf;
+ double low_cov = 0.;
+ double cov_diff = 0.;
+ while (!it_.IsEnd() && buffer.size() < buff_size_) {
+ EdgeId e = *it_;
+ TRACE("Current edge " << this->g().str(e));
+ if (!proceed_condition(e)) {
+ TRACE("Stop condition was reached.");
+ //need to release last element of the iterator to make it replaceable by new elements
+ it_.ReleaseCurrent();
+ return false;
+ }
+
+ double cov = this->g().coverage(e);
+ if (buffer.empty()) {
+ low_cov = cov;
+ cov_diff = max(buff_cov_diff_, buff_cov_rel_diff_ * low_cov);
+ } else {
+ if (math::gr(cov, low_cov + cov_diff)) {
+ //need to release last element of the iterator to make it replaceable by new elements
+ it_.ReleaseCurrent();
+ return true;
+ }
+ }
+ TRACE("Potential bulge edge");
+ buffer.push_back(e);
+ ++it_;
+ }
+
+ DEBUG("Filled in " << perf.time() << " seconds");
+ if (buffer.size() == buff_size_) {
+ TRACE("Buffer filled");
+ return true;
+ } else {
+ TRACE("No more edges in iterator");
+ return false;
+ }
+ }
+
+ std::vector<std::vector<BulgeInfo>> FindBulges(const std::vector<EdgeId>& edge_buffer) const {
+ DEBUG("Looking for bulges (in parallel). Edge buffer size " << edge_buffer.size());
+ perf_counter perf;
+ std::vector<std::vector<BulgeInfo>> bulge_buffers(omp_get_max_threads());
+ size_t n = edge_buffer.size();
+ //order is in agreement with coverage
+ #pragma omp parallel for schedule(guided)
+ for (size_t i = 0; i < n; ++i) {
+ EdgeId e = edge_buffer[i];
+ auto alternative = alternatives_analyzer_(e);
+ if (!alternative.empty()) {
+ bulge_buffers[omp_get_thread_num()].push_back(BulgeInfo(i, e, std::move(alternative)));
+ }
+ }
+ DEBUG("Bulges found in " << perf.time() << " seconds");
+ return bulge_buffers;
+ }
+
+ std::vector<BulgeInfo> MergeBuffers(std::vector<std::vector<BulgeInfo>>&& buffers) const {
+ DEBUG("Merging bulge buffers");
+ perf_counter perf;
+
+ std::vector<BulgeInfo> merged_bulges;
+ for (auto& bulge_buffer : buffers) {
+ std::copy(std::make_move_iterator(bulge_buffer.begin()),
+ std::make_move_iterator(bulge_buffer.end()),
+ std::back_inserter(merged_bulges));
+ }
+
+ DEBUG("Sorting");
+ //order is in agreement with coverage
+ std::sort(merged_bulges.begin(), merged_bulges.end());
+ DEBUG("Total bulges " << merged_bulges.size());
+ DEBUG("Buffers merged in " << perf.time() << " seconds");
+ return merged_bulges;
+ }
+
+ SmartEdgeSet RetainIndependentBulges(std::vector<BulgeInfo>& bulges) const {
+ DEBUG("Looking for independent bulges");
+ size_t total_cnt = bulges.size();
+ perf_counter perf;
+
+ std::vector<BulgeInfo> filtered;
+ filtered.reserve(bulges.size());
+ //fixme switch to involved vertices to bring fully parallel glueing closer
+ std::unordered_set<EdgeId> involved_edges;
+ SmartEdgeSet interacting_edges(this->g(), false, CoverageComparator<Graph>(this->g()));
+
+ for (BulgeInfo& info : bulges) {
+ TRACE("Analyzing interactions of " << info.str(this->g()));
+ if (CheckInteracting(info, involved_edges)) {
+ TRACE("Interacting");
+ interacting_edges.push(info.e);
+ } else {
+ TRACE("Independent");
+ AccountEdges(info, involved_edges);
+ filtered.push_back(std::move(info));
+ }
+ }
+ bulges = std::move(filtered);
+
+ DEBUG("Independent bulges identified in " << perf.time() << " seconds");
+ DEBUG("Independent cnt " << bulges.size());
+ DEBUG("Interacting cnt " << interacting_edges.size());
+ VERIFY(bulges.size() + interacting_edges.size() == total_cnt);
+
+ return interacting_edges;
+ }
+
+ bool ProcessBulges(const std::vector<BulgeInfo>& independent_bulges, SmartEdgeSet&& interacting_edges) {
+ DEBUG("Processing bulges");
+ perf_counter perf;
+
+ bool triggered = false;
+
+ for (const BulgeInfo& info : independent_bulges) {
+ TRACE("Processing bulge " << info.str(this->g()));
+ triggered = true;
+ gluer_(info.e, info.alternative);
+ }
+
+ DEBUG("Independent bulges glued in " << perf.time() << " seconds");
+ perf.reset();
+
+ DEBUG("Processing remaining interacting bulges " << interacting_edges.size());
+ //usual br strategy
+ for (; !interacting_edges.IsEnd(); ++interacting_edges) {
+ EdgeId e = *interacting_edges;
+ TRACE("Processing edge " << this->g().str(e));
+ std::vector<EdgeId> alternative = alternatives_analyzer_(e);
+ if (!alternative.empty()) {
+ gluer_(e, alternative);
+ triggered = true;
+ }
+ }
+ DEBUG("Interacting edges processed in " << perf.time() << " seconds");
+ return triggered;
+ }
+
+public:
+
+ typedef std::function<void(EdgeId edge, const vector<EdgeId>& path)> BulgeCallbackF;
+
+ ParallelBulgeRemover(Graph& g, const InterestingElementFinder& interesting_edge_finder,
+ size_t buff_size, double buff_cov_diff,
+ double buff_cov_rel_diff, const AlternativesAnalyzer<Graph>& alternatives_analyzer,
+ BulgeCallbackF opt_callback = 0,
+ std::function<void(EdgeId)> removal_handler = 0,
+ bool track_changes = true) :
+ PersistentAlgorithmBase<Graph>(g),
+ buff_size_(buff_size),
+ buff_cov_diff_(buff_cov_diff),
+ buff_cov_rel_diff_(buff_cov_rel_diff),
+ alternatives_analyzer_(alternatives_analyzer),
+ gluer_(g, opt_callback, removal_handler),
+ interesting_edge_finder_(interesting_edge_finder),
+ tracking_(track_changes),
+ curr_iteration_(0),
+ it_(g, true, CoverageComparator<Graph>(g), true) {
+ VERIFY(buff_size_ > 0);
+ it_.Detach();
+ }
+
+ bool Run(bool force_primary_launch = false) override {
+ bool primary_launch = force_primary_launch ? true : curr_iteration_ == 0;
+ //todo remove if not needed;
+ //potentially can vary coverage threshold in coordination with ec threshold
+ auto proceed_condition = pred::AlwaysTrue<EdgeId>();
+
+ if (!it_.IsAttached()) {
+ it_.Attach();
+ }
+ if (primary_launch) {
+ it_.clear();
+ TRACE("Primary launch.");
+ TRACE("Start search for interesting edges");
+ interesting_edge_finder_.Run(it_);
+ TRACE(it_.size() << " interesting edges to process");
+ } else {
+ VERIFY(tracking_);
+ TRACE(it_.size() << " edges to process");
+ }
+
+ bool triggered = false;
+ bool proceed = true;
+ while (proceed) {
+ std::vector<EdgeId> edge_buffer;
+ edge_buffer.reserve(buff_size_);
+ proceed = FillEdgeBuffer(edge_buffer, proceed_condition);
+
+ std::vector<BulgeInfo> bulges = MergeBuffers(FindBulges(edge_buffer));
+
+ auto interacting_edges = RetainIndependentBulges(bulges);
+
+ bool inner_triggered = ProcessBulges(bulges, std::move(interacting_edges));
+ proceed |= inner_triggered;
+ triggered |= inner_triggered;
+ }
+
+ TRACE("Finished processing. Triggered = " << triggered);
+ if (!tracking_)
+ it_.Detach();
+
+ curr_iteration_++;
+
+ return triggered;
+ }
+
+private:
+ DECL_LOGGER("ParallelBulgeRemover")
+};
+
+}
diff --git a/src/modules/algorithms/simplification/cleaner.hpp b/src/modules/algorithms/simplification/cleaner.hpp
new file mode 100644
index 0000000..1787e56
--- /dev/null
+++ b/src/modules/algorithms/simplification/cleaner.hpp
@@ -0,0 +1,43 @@
+#pragma once
+
+#include "assembly_graph/graph_support/basic_vertex_conditions.hpp"
+#include "assembly_graph/graph_support/graph_processing_algorithm.hpp"
+#include "assembly_graph/graph_support/parallel_processing.hpp"
+
+namespace omnigraph {
+
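+/**
+ * Deletes isolated vertices (vertices satisfying IsolatedVertexCondition) from the graph; candidates are
+ * collected in parallel chunks by ParallelInterestingElementFinder.
+ */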
+template<class Graph>
+class Cleaner : public PersistentProcessingAlgorithm<Graph,
+ typename Graph::VertexId,
+ ParallelInterestingElementFinder < Graph, typename Graph::VertexId>> {
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+ typedef PersistentProcessingAlgorithm <Graph,
+ VertexId, ParallelInterestingElementFinder<Graph, VertexId>> base;
+ typedef IsolatedVertexCondition<Graph> ConditionT;
+
+ Graph &g_;
+ ConditionT isolated_condition_;
+
+public:
+ Cleaner(Graph &g, size_t chunk_cnt = 1) :
+ base(g,
+ ParallelInterestingElementFinder<Graph, VertexId>(g,
+ ConditionT(g), chunk_cnt),
+ /*canonical only*/true),
+ g_(g), isolated_condition_(g) {
+ }
+
+protected:
+
+ bool Process(VertexId v) {
+ if (isolated_condition_.Check(v)) {
+ g_.DeleteVertex(v);
+ return true;
+ } else {
+ return false;
+ }
+ }
+};
+
+}
diff --git a/src/modules/algorithms/simplification/complex_bulge_remover.hpp b/src/modules/algorithms/simplification/complex_bulge_remover.hpp
new file mode 100644
index 0000000..e3a531a
--- /dev/null
+++ b/src/modules/algorithms/simplification/complex_bulge_remover.hpp
@@ -0,0 +1,1162 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include <cmath>
+#include <stack>
+#include <queue>
+#include "utils/adt/concurrent_dsu.hpp"
+#include "dev_support/standard_base.hpp"
+#include "assembly_graph/components/graph_component.hpp"
+#include "math/xmath.h"
+#include "data_structures/sequence/sequence_tools.hpp"
+#include "assembly_graph/paths/path_processor.hpp"
+#include "visualization/visualization.hpp"
+#include "dominated_set_finder.hpp"
+
+
+namespace omnigraph {
+
+namespace complex_br {
+
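+/**
+ * Component of the graph grown from a single start vertex. For every vertex an inclusive range of possible
+ * distances from the start is stored, vertices are additionally indexed by their average height, and the set
+ * of end (exit) vertices is maintained. Implemented as a GraphActionHandler so that this information stays
+ * consistent under edge splits and vertex deletions.
+ */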
+template<class Graph>
+class LocalizedComponent: public GraphActionHandler<Graph> /*: public GraphComponent<Graph>*/{
+ typedef GraphActionHandler<Graph> base;
+ typedef typename Graph::VertexId VertexId;
+ typedef typename Graph::EdgeId EdgeId;
+
+ const Graph& g_;
+ VertexId start_vertex_;
+ set<VertexId> end_vertices_;
+ //usage of inclusive-inclusive range!!!
+ map<VertexId, Range> vertex_depth_;
+ multimap<size_t, VertexId> height_2_vertices_;
+ size_t diff_threshold_;
+
+ bool AllEdgeOut(VertexId v) const {
+ for (EdgeId e : g_.OutgoingEdges(v)) {
+ if (contains(g_.EdgeEnd(e)))
+ return false;
+ }
+ return true;
+ }
+
+ bool AllEdgeIn(VertexId v) const {
+ for (EdgeId e : g_.OutgoingEdges(v)) {
+ if (!contains(g_.EdgeEnd(e)))
+ return false;
+ }
+ return true;
+ }
+
+ size_t Average(Range r) const {
+ return r.start_pos;
+ }
+
+public:
+
+// template <class It>
+ LocalizedComponent(const Graph& g, //It begin, It end,
+ VertexId start_vertex/*, const vector<VertexId>& end_vertices*/) :
+ base(g, "br_component"), g_(g), start_vertex_(start_vertex) {
+ end_vertices_.insert(start_vertex);
+ vertex_depth_.insert(make_pair(start_vertex_, Range(0, 0)));
+ height_2_vertices_.insert(make_pair(0, start_vertex));
+ }
+
+ const Graph& g() const {
+ return g_;
+ }
+
+ bool IsEndVertex(VertexId v) const {
+ for (EdgeId e : g_.OutgoingEdges(v)) {
+ if (contains(g_.EdgeEnd(e)))
+ return false;
+ }
+ return true;
+ }
+
+ void AddVertex(VertexId v, Range dist_range) {
+// VERIFY(CheckCloseNeighbour(v));
+// Range r = NeighbourDistanceRange(v);
+ DEBUG("Adding vertex " << g_.str(v) << " to the component");
+ vertex_depth_.insert(make_pair(v, dist_range));
+ height_2_vertices_.insert(make_pair(Average(dist_range), v));
+ DEBUG(
+ "Range " << dist_range << " Average height " << Average(dist_range));
+ for (EdgeId e : g_.IncomingEdges(v)) {
+ end_vertices_.erase(g_.EdgeStart(e));
+ }
+ if (IsEndVertex(v)) {
+ end_vertices_.insert(v);
+ }
+ }
+
+ //todo what if path processor will fail inside
+ size_t TotalPathCount() const {
+ size_t answer = 0;
+ for (VertexId end_v : end_vertices_) {
+ PathStorageCallback<Graph> path_storage(g_);
+ Range r = vertex_depth_.find(end_v)->second;
+ ProcessPaths(g_, r.start_pos, r.end_pos, start_vertex_, end_v, path_storage);
+ answer += path_storage.size();
+ }
+ return answer;
+ }
+
+ bool CheckCompleteness() const {
+ for (VertexId v : key_set(vertex_depth_)) {
+ if (v == start_vertex_)
+ continue;
+ if (!AllEdgeIn(v) && !AllEdgeOut(v))
+ return false;
+ }
+ return true;
+ }
+
+ bool NeedsProjection() const {
+ DEBUG("Checking if component needs projection");
+ size_t tot_path_count = TotalPathCount();
+ bool answer = tot_path_count > end_vertices_.size();
+// VERIFY(tot_path_count >= end_vertices_.size()); // intentionally disabled: this way is more robust to path processor failures
+ if (answer) {
+ DEBUG("Needs projection");
+ } else {
+ DEBUG("Doesn't need projection");
+ }
+ return answer;
+ }
+
+ bool contains(VertexId v) const {
+ return vertex_depth_.count(v) > 0;
+ }
+
+ bool contains(EdgeId e) const {
+ return contains(g_.EdgeStart(e)) && contains(g_.EdgeEnd(e));
+ }
+
+ Range distance_range(VertexId v) const {
+ VERIFY(contains(v));
+ return vertex_depth_.find(v)->second;
+ }
+
+ size_t avg_distance(VertexId v) const {
+ VERIFY(contains(v));
+ return Average(vertex_depth_.find(v)->second);
+ }
+
+ set<size_t> avg_distances() const {
+ set<size_t> distances;
+ for (VertexId v : key_set(vertex_depth_)) {
+ distances.insert(avg_distance(v));
+ }
+ return distances;
+ }
+
+ VertexId start_vertex() const {
+ return start_vertex_;
+ }
+
+ const set<VertexId>& end_vertices() const {
+ return end_vertices_;
+ }
+
+ bool CheckCloseNeighbour(VertexId v) const {
+ DEBUG("Check if vertex " << g_.str(v) << " can be processed");
+ for (EdgeId e : g_.IncomingEdges(v)) {
+ if (!contains(g_.EdgeStart(e))) {
+ DEBUG(
+ "Blocked by unprocessed or external vertex " << g_.int_id(g_.EdgeStart(e)) << " that starts edge " << g_.int_id(e));
+ DEBUG("Check fail");
+ return false;
+ }
+ }
+ DEBUG("Check ok");
+ return true;
+ }
+
+ GraphComponent<Graph> AsGraphComponent() const {
+ set<VertexId> vertices = key_set(vertex_depth_);
+ return GraphComponent<Graph>(g_, vertices.begin(), vertices.end());
+ }
+
+ bool ContainsConjugateVertices() const {
+ set<VertexId> conjugate_vertices;
+ for (VertexId v : key_set(vertex_depth_)) {
+ if (conjugate_vertices.count(v) == 0) {
+ conjugate_vertices.insert(g_.conjugate(v));
+ } else {
+ return true;
+ }
+ }
+ return false;
+ }
+
+ virtual void HandleDelete(VertexId v) {
+ VERIFY(end_vertices_.count(v) == 0);
+ if (contains(v)) {
+ DEBUG("Deleting vertex " << g_.str(v) << " from the component");
+ size_t depth = avg_distance(v);
+ vertex_depth_.erase(v);
+ for (auto it = height_2_vertices_.lower_bound(depth);
+ it != height_2_vertices_.upper_bound(depth); ++it) {
+ if (it->second == v) {
+ height_2_vertices_.erase(it);
+ return;
+ }
+ }
+ VERIFY(false);
+ }
+
+ }
+
+ virtual void HandleDelete(EdgeId /*e*/) {
+ //empty for now
+ }
+
+ virtual void HandleMerge(const vector<EdgeId>& /*old_edges*/, EdgeId /*new_edge*/) {
+ VERIFY(false);
+ }
+
+ virtual void HandleGlue(EdgeId /*new_edge*/, EdgeId /*edge1*/, EdgeId /*edge2*/) {
+ //empty for now
+ }
+
+ virtual void HandleSplit(EdgeId old_edge, EdgeId new_edge_1, EdgeId /*new_edge_2*/) {
+ VERIFY(old_edge != g_.conjugate(old_edge));
+ VertexId start = g_.EdgeStart(old_edge);
+ VertexId end = g_.EdgeEnd(old_edge);
+ if (contains(start)) {
+ VERIFY(vertex_depth_.count(end) > 0);
+ VERIFY(avg_distance(end) > avg_distance(start));
+ VertexId new_vertex = g_.EdgeEnd(new_edge_1);
+ Range new_vertex_depth(distance_range(start));
+ new_vertex_depth.shift((int) g_.length(new_edge_1));
+ //todo do better later (needs to be synched with splitting strategy)
+// + (vertex_depth_[end] - vertex_depth_[start])
+// * g_.length(new_edge_1) / g_.length(old_edge);
+ DEBUG(
+ "Inserting vertex " << g_.str(new_vertex) << " to component during split");
+ vertex_depth_.insert(make_pair(new_vertex, new_vertex_depth));
+ height_2_vertices_.insert(
+ make_pair(Average(new_vertex_depth), new_vertex));
+ }
+ }
+
+ const multimap<size_t, VertexId>& height_2_vertices() const {
+ return height_2_vertices_;
+ }
+
+ const set<VertexId> vertices_on_height(size_t height) const {
+ set<VertexId> answer;
+ for (auto it = height_2_vertices_.lower_bound(height);
+ it != height_2_vertices_.upper_bound(height); ++it) {
+ answer.insert(it->second);
+ }
+ return answer;
+ }
+
+private:
+ DECL_LOGGER("LocalizedComponent")
+ ;
+};
+
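+/**
+ * Stores the set of tree edges (and their endpoint vertices) selected within a localized component and keeps
+ * it consistent under edge gluing and splitting; deletion of a tree edge or tree vertex is not expected and
+ * triggers a verification failure.
+ */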
+template<class Graph>
+class SkeletonTree: public GraphActionHandler<Graph> {
+ typedef GraphActionHandler<Graph> base;
+ typedef typename Graph::VertexId VertexId;
+ typedef typename Graph::EdgeId EdgeId;
+
+public:
+
+ const set<EdgeId>& edges() const {
+ return edges_;
+ }
+
+ const set<VertexId>& vertices() const {
+ return vertices_;
+ }
+
+ bool Contains(EdgeId e) const {
+// VertexId start = br_comp_.g().EdgeStart(e);
+// if (next_edges_.count(start) > 0) {
+// const vector<EdgeId> edges = next_edges_.find(start)->second;
+// return find(e, next_edges_.lower_bound(start), next_edges_.upper_bound(start)) != edges.end();
+// }
+// return false;
+ return edges_.count(e) > 0;
+ }
+
+ bool Contains(VertexId v) const {
+// return next_edges_.count(v) > 0;
+ return vertices_.count(v) > 0;
+ }
+
+ virtual void HandleDelete(VertexId v) {
+ //verify v not in the tree
+ VERIFY(!Contains(v));
+ }
+
+ virtual void HandleDelete(EdgeId e) {
+ //verify e not in the tree
+ DEBUG("Trying to delete " << br_comp_.g().str(e));
+ VERIFY(!Contains(e));
+ }
+
+ virtual void HandleMerge(const vector<EdgeId>& old_edges, EdgeId /*new_edge*/) {
+ //verify false
+ for (EdgeId e : old_edges) {
+ VERIFY(!Contains(e));
+ }
+ }
+
+ virtual void HandleGlue(EdgeId new_edge, EdgeId edge1, EdgeId edge2) {
+// verify edge2 in tree
+// put new_edge instead of edge2
+ DEBUG("Glueing " << br_comp_.g().str(new_edge) << " " << br_comp_.g().str(edge1) << " " << br_comp_.g().str(edge2));
+ if (Contains(edge2)) {
+ DEBUG("Erasing from tree: " << br_comp_.g().str(edge2));
+ DEBUG("Inserting to tree: " << br_comp_.g().str(new_edge));
+ edges_.erase(edge2);
+ edges_.insert(new_edge);
+ }
+ }
+
+ virtual void HandleSplit(EdgeId old_edge, EdgeId new_edge_1,
+ EdgeId new_edge_2) {
+ VERIFY(old_edge != br_comp_.g().conjugate(old_edge));
+ if (Contains(old_edge)) {
+ edges_.erase(old_edge);
+ vertices_.insert(br_comp_.g().EdgeEnd(new_edge_1));
+ edges_.insert(new_edge_1);
+ edges_.insert(new_edge_2);
+ }
+ }
+
+ SkeletonTree(const LocalizedComponent<Graph>& br_comp,
+ const set<EdgeId>& edges) :
+ base(br_comp.g(), "br_tree"), br_comp_(br_comp), edges_(edges) {
+ DEBUG("Tree edges " << br_comp.g().str(edges));
+ for (EdgeId e : edges_) {
+ vertices_.insert(br_comp_.g().EdgeStart(e));
+ vertices_.insert(br_comp_.g().EdgeEnd(e));
+ }
+ }
+
+private:
+ const LocalizedComponent<Graph>& br_comp_;
+ set<EdgeId> edges_;
+ set<VertexId> vertices_;
+
+private:
+ DECL_LOGGER("SkeletonTree")
+ ;
+};
+
+typedef size_t mask;
+typedef mask mixed_color_t;
+typedef unsigned primitive_color_t;
+
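+/**
+ * Assigns a bit mask ("mixed color") to every vertex of the component: each end vertex gets its own bit
+ * (primitive color), and every inner vertex gets the bitwise OR of the colors of its outgoing edges, computed
+ * from the deepest level back towards the start vertex. The color of an edge is defined as the color of its
+ * end vertex.
+ */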
+template<class Graph>
+class ComponentColoring: public GraphActionHandler<Graph> {
+ typedef GraphActionHandler<Graph> base;
+ typedef typename Graph::VertexId VertexId;
+ typedef typename Graph::EdgeId EdgeId;
+
+public:
+
+ size_t CountPrimitiveColors(mixed_color_t color) const {
+ size_t cnt = 0;
+ for (size_t shift = 0; shift < color_cnt_; ++shift) {
+ mixed_color_t prim_color = 1 << shift;
+ if ((prim_color & color) != 0) {
+ cnt++;
+ }
+ }
+ VERIFY(cnt > 0);
+ return cnt;
+ }
+
+ primitive_color_t GetAnyPrimitiveColor(mixed_color_t color) const {
+ for (size_t shift = 0; shift < color_cnt_; ++shift) {
+ if ((1 << shift & color) != 0) {
+ return primitive_color_t(shift);
+ }
+ }
+ VERIFY(false);
+ return 0;
+ }
+
+ bool IsSubset(mixed_color_t super_set, mixed_color_t sub_set) const {
+ return (super_set | sub_set) == super_set;
+ }
+
+private:
+
+ const LocalizedComponent<Graph>& comp_;
+ const size_t color_cnt_;
+ map<VertexId, mixed_color_t> vertex_colors_;
+
+ mixed_color_t CountVertexColor(VertexId v) const {
+ mixed_color_t answer = mixed_color_t(0);
+ for (EdgeId e : comp_.g().OutgoingEdges(v)) {
+ answer |= color(e);
+ }
+ return answer;
+ }
+
+ void CountAndSetVertexColor(VertexId v) {
+ vertex_colors_.insert(make_pair(v, CountVertexColor(v)));
+ }
+
+ void ColorComponent() {
+ DEBUG("Coloring component");
+ size_t cnt = 0;
+ for (VertexId v : comp_.end_vertices()) {
+ mixed_color_t color = 1 << cnt;
+ DEBUG("Coloring exit " << comp_.g().str(v));
+ vertex_colors_.insert(make_pair(v, color));
+ cnt++;
+ }
+ for (auto it = comp_.height_2_vertices().rbegin();
+ it != comp_.height_2_vertices().rend(); ++it) {
+ if (vertex_colors_.count(it->second) == 0) {
+ DEBUG("Coloring vertex " << comp_.g().str(it->second));
+ CountAndSetVertexColor(it->second);
+ }
+ }
+ DEBUG("Component colored");
+ }
+
+public:
+
+ ComponentColoring(const LocalizedComponent<Graph>& comp) :
+ base(comp.g(), "br_comp_coloring"), comp_(comp), color_cnt_(
+ comp_.end_vertices().size()) {
+ VERIFY(comp.end_vertices().size() <= sizeof(size_t) * 8);
+ ColorComponent();
+ }
+
+ mixed_color_t color(VertexId v) const {
+ auto it = vertex_colors_.find(v);
+ if (it == vertex_colors_.end()) {
+ DEBUG("No color for vertex " << comp_.g().str(v));
+ DEBUG(
+ "Incoming edges " << comp_.g().str(comp_.g().IncomingEdges(v)));
+ DEBUG(
+ "Outgoing edges " << comp_.g().str(comp_.g().OutgoingEdges(v)));
+ }
+ VERIFY(it != vertex_colors_.end());
+ return it->second;
+ }
+
+ mixed_color_t color(EdgeId e) const {
+ return color(comp_.g().EdgeEnd(e));
+ }
+
+ virtual void HandleDelete(VertexId v) {
+ vertex_colors_.erase(v);
+ }
+
+ virtual void HandleMerge(const vector<EdgeId>& /*old_edges*/, EdgeId /*new_edge*/) {
+ VERIFY(false);
+ }
+
+ virtual void HandleGlue(EdgeId /*new_edge*/, EdgeId edge1, EdgeId edge2) {
+ if (comp_.contains(edge1)) {
+ VERIFY(comp_.contains(edge2));
+ VERIFY(IsSubset(color(edge2), color(edge1)));
+ }
+ }
+
+ virtual void HandleSplit(EdgeId old_edge, EdgeId new_edge_1,
+ EdgeId /*new_edge_2*/) {
+ VERIFY(old_edge != comp_.g().conjugate(old_edge));
+ if (comp_.contains(old_edge)) {
+ CountAndSetVertexColor(comp_.g().EdgeEnd(new_edge_1));
+ }
+ }
+
+private:
+ DECL_LOGGER("ComponentColoring")
+ ;
+};
+
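+/**
+ * Searches for a skeleton tree connecting the start vertex to all exits of the component. Levels are processed
+ * from the deepest level back towards the start vertex; a disjoint-set structure over primitive colors tracks
+ * which groups of exits are already joined, and for every good vertex only the best-covered good subtree per
+ * color is kept (UpdateNextEdgesAndCoverage). FindTree() succeeds iff the start vertex itself ends up
+ * classified as good.
+ */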
+template<class Graph>
+class SkeletonTreeFinder {
+
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+ typedef ConcurrentDSU color_partition_ds_t;
+
+ const LocalizedComponent<Graph>& component_;
+ const ComponentColoring<Graph>& coloring_;
+
+ vector<size_t> level_heights_;
+
+ int current_level_;
+ color_partition_ds_t current_color_partition_;
+
+ set<VertexId> good_vertices_;
+ set<EdgeId> good_edges_;
+ map<VertexId, vector<EdgeId>> next_edges_;
+ map<VertexId, size_t> subtree_coverage_;
+
+ bool ConsistentWithPartition(mixed_color_t color) const {
+ return current_color_partition_.set_size(
+ GetCorrespondingDisjointSet(color))
+ == coloring_.CountPrimitiveColors(color);
+ }
+
+ bool IsGoodEdge(EdgeId e) const {
+// VertexId start = component_.g().EdgeStart(e);
+ VertexId end = component_.g().EdgeEnd(e);
+ //check if end is good
+ if (good_vertices_.count(end) == 0)
+ return false;
+
+// is subcase of next case
+// //check if end is from previous level
+// if (component_.avg_distance(end) == level_heights_[current_level_+1])
+// return true;
+
+ //check if end color is consistent with partition
+ //on level before the start
+ return ConsistentWithPartition(coloring_.color(end));
+ }
+
+ vector<EdgeId> GoodOutgoingEdges(VertexId v) const {
+ vector<EdgeId> answer;
+ for (EdgeId e : component_.g().OutgoingEdges(v)) {
+ if (IsGoodEdge(e)) {
+ DEBUG("Edge " << component_.g().str(e) << " is classified as good");
+ answer.push_back(e);
+ } else {
+ DEBUG("Edge " << component_.g().str(e) << " is classified as NOT good");
+ }
+ }
+ return answer;
+ }
+
+ vector<EdgeId> GoodOutgoingEdges(const vector<VertexId>& vertices) const {
+ vector<EdgeId> answer;
+ for (VertexId v : vertices) {
+ if (component_.end_vertices().count(v) == 0) {
+ push_back_all(answer, GoodOutgoingEdges(v));
+ }
+ }
+ return answer;
+ }
+
+ set<EdgeId> VectorAsSet(const vector<EdgeId>& edges) const {
+ return set<EdgeId>(edges.begin(), edges.end());
+ }
+
+ template<class T>
+ vector<T> SetAsVector(const set<T>& edges) const {
+ return vector<T>(edges.begin(), edges.end());
+ }
+
+ primitive_color_t GetCorrespondingDisjointSet(mixed_color_t color) const {
+ return (primitive_color_t) current_color_partition_.find_set(
+ coloring_.GetAnyPrimitiveColor(color));
+ }
+
+ void UpdateColorPartitionWithVertex(VertexId v) {
+ VERIFY(component_.g().OutgoingEdgeCount(v) > 0);
+ primitive_color_t ds = GetCorrespondingDisjointSet(
+ coloring_.color(*(component_.g().OutgoingEdges(v).begin())));
+ for (EdgeId e : component_.g().OutgoingEdges(v)) {
+ current_color_partition_.unite(ds,
+ GetCorrespondingDisjointSet(coloring_.color(e)));
+ }
+ }
+
+ bool IsGoodVertex(VertexId v) const {
+ if (!ConsistentWithPartition(coloring_.color(v)))
+ return false;
+ mixed_color_t union_color_of_good_children = mixed_color_t(0);
+ for (EdgeId e : component_.g().OutgoingEdges(v)) {
+ if (good_edges_.count(e) > 0) {
+ union_color_of_good_children |= coloring_.color(e);
+ }
+ }
+ return coloring_.color(v) == union_color_of_good_children;
+ }
+
+ void Init() {
+ current_level_ = (int) level_heights_.size() - 1;
+ size_t end_cnt = 0;
+ for (VertexId v : component_.end_vertices()) {
+ good_vertices_.insert(v);
+ subtree_coverage_[v] = 0;
+ end_cnt++;
+ }
+ }
+
+ size_t absolute_coverage(EdgeId e) {
+ return (size_t) (component_.g().coverage(e) * (double) component_.g().length(e));
+ }
+
+ void UpdateNextEdgesAndCoverage(VertexId v) {
+ map<mixed_color_t, size_t> best_subtrees_coverage;
+ map<mixed_color_t, EdgeId> best_alternatives;
+ for (EdgeId e : component_.g().OutgoingEdges(v)) {
+ if (good_edges_.count(e) > 0) {
+ VertexId end = component_.g().EdgeEnd(e);
+ mixed_color_t color = coloring_.color(e);
+ VERIFY(subtree_coverage_.count(end) > 0);
+ if (subtree_coverage_[end] + absolute_coverage(e)
+ >= best_subtrees_coverage[color]) {
+ best_subtrees_coverage[color] = subtree_coverage_[end]
+ + absolute_coverage(e);
+ best_alternatives[color] = e;
+ }
+ }
+ }
+ size_t coverage = 0;
+ for (size_t cov : value_set(best_subtrees_coverage)) {
+ coverage += cov;
+ }
+ next_edges_[v] = SetAsVector<EdgeId>(value_set(best_alternatives));
+ subtree_coverage_[v] = coverage;
+ }
+
+public:
+ SkeletonTreeFinder(const LocalizedComponent<Graph>& component,
+ const ComponentColoring<Graph>& coloring) :
+ component_(component),
+ coloring_(coloring),
+ level_heights_(SetAsVector<size_t>(component_.avg_distances())),
+ current_level_((int) level_heights_.size() - 1),
+ current_color_partition_(component_.end_vertices().size()) {
+
+ Init();
+ }
+
+ const set<EdgeId> GetTreeEdges() const {
+ set<EdgeId> answer;
+ std::queue<VertexId> vertex_queue;
+ vertex_queue.push(component_.start_vertex());
+ while (!vertex_queue.empty()) {
+ VertexId v = vertex_queue.front();
+ vertex_queue.pop();
+ if (next_edges_.count(v) == 0)
+ continue;
+ for (EdgeId e : next_edges_.find(v)->second) {
+ answer.insert(e);
+ vertex_queue.push(component_.g().EdgeEnd(e));
+ }
+ }
+ return answer;
+ }
+
+ const map<VertexId, vector<EdgeId>>& GetTree() const {
+ return next_edges_;
+ }
+
+ bool FindTree() {
+ DEBUG("Looking for tree");
+ while (current_level_ >= 0) {
+ size_t height = level_heights_[current_level_];
+ DEBUG("Processing level " << current_level_ << " on height " << height);
+ set<VertexId> level_vertices = component_.vertices_on_height(
+ height);
+ VERIFY(!level_vertices.empty());
+
+ //looking for good edges
+ insert_all(good_edges_,
+ GoodOutgoingEdges(
+ vector<VertexId>(level_vertices.begin(),
+ level_vertices.end())));
+
+
+
+ //counting colors and color partitions
+ for (VertexId v : level_vertices) {
+ if (component_.end_vertices().count(v) == 0) {
+ UpdateColorPartitionWithVertex(v);
+ if (IsGoodVertex(v)) {
+ DEBUG("Vertex " << component_.g().str(v) << " is classified as good");
+ good_vertices_.insert(v);
+ UpdateNextEdgesAndCoverage(v);
+ } else {
+ DEBUG("Vertex " << component_.g().str(v) << " is classified as NOT good");
+ }
+ }
+ }
+ current_level_--;
+ }
+ if (good_vertices_.count(component_.start_vertex()) > 0) {
+ DEBUG("Looking for tree was successful");
+ return true;
+ } else {
+ DEBUG("Looking for tree failed");
+ return false;
+ }
+ }
+
+private:
+ DECL_LOGGER("SkeletonTreeFinder")
+ ;
+};
+
+template<class Graph>
+void PrintComponent(const LocalizedComponent<Graph>& component,
+ const SkeletonTree<Graph>& tree, const string& file_name) {
+ typedef typename Graph::EdgeId EdgeId;
+ const set<EdgeId> tree_edges = tree.edges();
+ shared_ptr<omnigraph::visualization::ElementColorer<typename Graph::EdgeId>> edge_colorer = make_shared<omnigraph::visualization::MapColorer<EdgeId>>(
+ tree_edges.begin(), tree_edges.end(),"green", ""
+ );
+ visualization::WriteComponentSinksSources(component.AsGraphComponent(), file_name,
+ omnigraph::visualization::DefaultColorer(component.g(), edge_colorer),
+ *StrGraphLabelerInstance(component.g()));
+}
+
+template<class Graph>
+void PrintComponent(const LocalizedComponent<Graph>& component,
+ const string& file_name) {
+ visualization::WriteComponent(component.AsGraphComponent(), file_name,
+ omnigraph::visualization::DefaultColorer(component.g()),
+ *StrGraphLabelerInstance(component.g()));
+}
+
+
+
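+/**
+ * Projects a localized component onto its skeleton tree: edges are first split at every intermediate height
+ * level they span, and then every non-tree edge is glued onto a tree edge on the same level whose color set
+ * covers its own (CorrespondingTreeEdge).
+ */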
+template<class Graph>
+class ComponentProjector {
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+
+ Graph& g_;
+ const LocalizedComponent<Graph>& component_;
+ const ComponentColoring<Graph>& coloring_;
+ const SkeletonTree<Graph>& tree_;
+
+// DEBUG("Result: edges " << g_.str(split_res.first) << " " << g_.str(split_res.second));
+// DEBUG("New vertex" << g_.str(inner_v) << " ");
+
+ bool SplitComponent() {
+ DEBUG("Splitting component");
+ set<size_t> level_heights(component_.avg_distances());
+ DEBUG("Level heights " << ToString<size_t>(level_heights));
+
+ GraphComponent<Graph> gc = component_.AsGraphComponent();
+
+ for (auto it = gc.e_begin(); it != gc.e_end(); ++it) {
+ VertexId start_v = g_.EdgeStart(*it);
+ VertexId end_v = g_.EdgeEnd(*it);
+ size_t start_dist = component_.avg_distance(start_v);
+ size_t end_dist = component_.avg_distance(end_v);
+ DEBUG(
+ "Processing edge " << g_.str(*it) << " avg_start " << start_dist << " avg_end " << end_dist);
+ set<size_t> dist_to_split(level_heights.lower_bound(start_dist),
+ level_heights.upper_bound(end_dist));
+ DEBUG("Distances to split " << ToString<size_t>(dist_to_split));
+
+ size_t offset = start_dist;
+ EdgeId e = *it;
+ for (auto split_it = dist_to_split.begin();
+ split_it != dist_to_split.end(); ++split_it) {
+ size_t curr = *split_it;
+ if (curr == start_dist || curr == end_dist)
+ continue;
+ DEBUG("Splitting on " << curr);
+ size_t pos = curr - offset;
+ if(pos >= g_.length(e)) {
+ return false;
+ }
+ DEBUG("Splitting edge " << g_.str(e) << " on position " << pos);
+ pair<EdgeId, EdgeId> split_res = g_.SplitEdge(e, pos);
+ //checks accordance
+ VertexId inner_v = g_.EdgeEnd(split_res.first);
+ VERIFY(component_.avg_distance(inner_v) == curr);
+ e = split_res.second;
+ offset = curr;
+ }
+ }
+ DEBUG("Component split");
+ return true;
+ }
+
+ EdgeId CorrespondingTreeEdge(EdgeId e) const {
+ DEBUG("Getting height of vertex " << g_.str(g_.EdgeStart(e)));
+ size_t start_height = component_.avg_distance(g_.EdgeStart(e));
+ DEBUG("Done");
+ mixed_color_t color = coloring_.color(e);
+ DEBUG("Getting height of vertex " << g_.str(g_.EdgeEnd(e)));
+ size_t end_height = component_.avg_distance(g_.EdgeEnd(e));
+ DEBUG("Done");
+ for (VertexId v : component_.vertices_on_height(start_height)) {
+ if (component_.end_vertices().count(v) == 0) {
+ for (EdgeId e : g_.OutgoingEdges(v)) {
+ VERIFY(
+ component_.avg_distance(g_.EdgeEnd(e)) == end_height);
+ if (tree_.Contains(e)
+ && coloring_.IsSubset(coloring_.color(e), color)) {
+ return e;
+ }
+ }
+ }
+ }
+ VERIFY(false);
+ return EdgeId(NULL);
+ }
+
+public:
+
+ bool ProjectComponent() {
+ if(!SplitComponent()) {
+ DEBUG("Component can't be split");
+ return false;
+ }
+
+ DEBUG("Projecting split component");
+ GraphComponent<Graph> gc = component_.AsGraphComponent();
+
+ for (auto it = SmartSetIterator<Graph, EdgeId>(g_, gc.e_begin(),
+ gc.e_end()); !it.IsEnd(); ++it) {
+ DEBUG("Trying to project edge " << g_.str(*it));
+ EdgeId target = CorrespondingTreeEdge(*it);
+ DEBUG("Target found " << g_.str(target));
+ if (target != *it) {
+ DEBUG(
+ "Glueing " << g_.str(*it) << " to target " << g_.str(target));
+ g_.GlueEdges(*it, target);
+ DEBUG("Glued");
+ }
+ DEBUG("Edge processed");
+ }
+ DEBUG("Component projected");
+ return true;
+ }
+
+ ComponentProjector(Graph& g, const LocalizedComponent<Graph>& component,
+ const ComponentColoring<Graph>& coloring,
+ const SkeletonTree<Graph>& tree) :
+ g_(g), component_(component), coloring_(coloring), tree_(tree) {
+
+ }
+
+private:
+ DECL_LOGGER("ComponentProjector")
+ ;
+};
+
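+/**
+ * Incrementally grows a LocalizedComponent over the dominated set of a start vertex: ProceedFurther() repeatedly
+ * adds the closest dominated neighbour, closes the component over interfering vertices, and reports a candidate
+ * once the component needs projection and passes the path length, height difference, conjugacy and exit count
+ * checks.
+ */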
+template<class Graph>
+class LocalizedComponentFinder {
+ typedef typename Graph::VertexId VertexId;
+ typedef typename Graph::EdgeId EdgeId;
+
+ static const size_t exit_bound = 32;
+ static const size_t inf = -1ul;
+
+ Graph& g_;
+ size_t max_length_;
+ size_t length_diff_threshold_;
+
+ LocalizedComponent<Graph> comp_;
+
+ map<VertexId, Range> dominated_;
+ set<VertexId> interfering_;
+
+ std::string ToString(EdgeId e) const {
+ std::stringstream ss;
+ ss << g_.str(e)
+ << " start: "
+ << g_.str(g_.EdgeStart(e))
+ << " end: "
+ << g_.str(g_.EdgeEnd(e));
+ return ss.str();
+ }
+
+ bool CheckCompleteness() const {
+ if (interfering_.size() == 0) {
+ VERIFY(comp_.CheckCompleteness());
+ return true;
+ }
+ return false;
+ }
+
+ //false if new interfering vertex is not dominated
+ //can be slightly modified in new algorithm
+ bool ProcessLocality(VertexId processing_v) {
+ vector<VertexId> processed_neighb;
+ vector<VertexId> unprocessed_neighb;
+ for (EdgeId e : g_.OutgoingEdges(processing_v)) {
+ VertexId v = g_.EdgeEnd(e);
+ if (!comp_.contains(v)) {
+ unprocessed_neighb.push_back(v);
+ } else {
+ processed_neighb.push_back(v);
+ }
+ }
+ if (!processed_neighb.empty()) {
+ for (VertexId v : unprocessed_neighb) {
+ if (dominated_.count(v) > 0) {
+ interfering_.insert(v);
+ } else {
+ return false;
+ }
+ }
+ }
+ return true;
+ }
+
+ bool AddVertexWithBackwardPaths(VertexId v) {
+ DEBUG("Adding vertex with backward paths");
+ std::queue<VertexId> q;
+ q.push(v);
+ while (!q.empty()) {
+ VertexId next_v = q.front();
+ q.pop();
+ if (!ProcessLocality(next_v)) {
+ return false;
+ }
+ if (!comp_.contains(next_v)) {
+ VERIFY(dominated_.count(v) > 0);
+ comp_.AddVertex(next_v, dominated_.find(next_v)->second);
+ for (EdgeId e : g_.IncomingEdges(next_v)) {
+ q.push(g_.EdgeStart(e));
+ }
+ }
+ }
+ return true;
+ }
+
+ boost::optional<VertexId> ClosestNeigbour() const {
+ size_t min_dist = inf;
+ boost::optional<VertexId> answer = boost::none;
+ for (auto it = dominated_.begin(); it != dominated_.end(); ++it) {
+ if (!comp_.contains(it->first) && it->second.start_pos < min_dist) {
+ min_dist = it->second.start_pos;
+ answer = boost::optional<VertexId>(it->first);
+ }
+ }
+ return answer;
+ }
+
+ bool ProcessInterferingVertex(VertexId v) {
+ interfering_.erase(v);
+ return AddVertexWithBackwardPaths(v);
+ }
+
+ bool CheckPathLengths() const {
+ VERIFY(CheckCompleteness());
+ for (VertexId v : comp_.end_vertices()) {
+ if (comp_.distance_range(v).size() > length_diff_threshold_)
+ return false;
+ }
+ return true;
+ }
+
+ bool CheckPositiveHeightDiff() const {
+ DEBUG("Checking for positive height diff of each edge");
+ GraphComponent<Graph> gc = comp_.AsGraphComponent();
+ for (auto it = gc.e_begin(); it != gc.e_end(); ++it) {
+ size_t start_height = comp_.avg_distance(g_.EdgeStart(*it));
+ size_t end_height = comp_.avg_distance(g_.EdgeEnd(*it));
+ //VERIFY(end_height >= start_height);
+ if (end_height <= start_height) {
+ DEBUG("Check failed for edge " << g_.str(*it) << " start_height " << start_height << " end_height " << end_height);
+ return false;
+ }
+ }
+ return true;
+ }
+
+ bool CloseComponent() {
+ while (!interfering_.empty()) {
+ VertexId v = *interfering_.begin();
+ DEBUG("Processing interfering vertex " << g_.str(v));
+ if (!ProcessInterferingVertex(v)) {
+ DEBUG("Vertex processing failed");
+ return false;
+ }
+ }
+ return true;
+ }
+
+public:
+ LocalizedComponentFinder(Graph& g, size_t max_length,
+ size_t length_diff_threshold, VertexId start_v) :
+ g_(g), max_length_(max_length), length_diff_threshold_(
+ length_diff_threshold), comp_(g, start_v) {
+ DEBUG(
+ "Component finder from vertex " << g_.str(comp_.start_vertex()) << " created");
+ DominatedSetFinder<Graph> dominated_set_finder(g_, start_v, max_length);
+ dominated_set_finder.FillDominated();
+ dominated_ = dominated_set_finder.dominated();
+// ProcessStartVertex();
+ }
+
+ bool ProceedFurther() {
+ DEBUG("Processing further");
+
+ DEBUG("Choosing closest vertex");
+ do {
+ optional<VertexId> next_v = ClosestNeigbour();
+
+ if (next_v) {
+ DEBUG(
+ "Vertex " << g_.str(*next_v) << " was chosen as closest neighbour");
+ interfering_.insert(*next_v);
+ DEBUG("Trying to construct closure");
+ if (!CloseComponent()) {
+ DEBUG("Failed to close component");
+ return false;
+ } else {
+ DEBUG("Component closed");
+ }
+ } else {
+ DEBUG("No more vertices can be added");
+ return false;
+ }
+ } while (!comp_.NeedsProjection());
+
+ if (!CheckPathLengths()) {
+ DEBUG("Path lengths check failed");
+ return false;
+ }
+ if (!CheckPositiveHeightDiff()) {
+ DEBUG("Check for positive height diff of each edge failed");
+ return false;
+ }
+ if (comp_.ContainsConjugateVertices()) {
+ DEBUG("Found component contains conjugate vertices");
+ return false;
+ }
+ if (comp_.end_vertices().size() > exit_bound) {
+ DEBUG("Too many exits:" << comp_.end_vertices().size());
+ return false;
+ }
+ GraphComponent<Graph> gc = comp_.AsGraphComponent();
+ DEBUG("Found component candidate. Vertices: " << g_.str(gc.vertices()));
+ return true;
+ }
+
+ const LocalizedComponent<Graph>& component() {
+ return comp_;
+ }
+
+private:
+ DECL_LOGGER("LocalizedComponentFinder")
+ ;
+};
+
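+/**
+ * For every vertex, tries to grow a localized component (LocalizedComponentFinder), colors it by its exits,
+ * searches for a skeleton tree and, if one is found, projects the remaining edges of the component onto the
+ * tree, thereby collapsing the complex bulge. Successful and failed components can optionally be dumped as
+ * .dot pictures.
+ */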
+template<class Graph>
+class ComplexBulgeRemover {
+ typedef typename Graph::VertexId VertexId;
+ typedef typename Graph::EdgeId EdgeId;
+
+ Graph& g_;
+ size_t max_length_;
+ size_t length_diff_;
+
+ string pics_folder_;
+
+ bool ProcessComponent(LocalizedComponent<Graph>& component,
+ size_t candidate_cnt) {
+ DEBUG("Processing component");
+ ComponentColoring<Graph> coloring(component);
+ SkeletonTreeFinder<Graph> tree_finder(component, coloring);
+ DEBUG("Looking for a tree");
+ if (tree_finder.FindTree()) {
+ DEBUG("Tree found");
+
+ SkeletonTree<Graph> tree(component, tree_finder.GetTreeEdges());
+
+ if (!pics_folder_.empty()) {
+ PrintComponent(component, tree,
+ pics_folder_ + "success/"
+ + ToString(g_.int_id(component.start_vertex()))
+ + "_" + ToString(candidate_cnt) + ".dot");
+ }
+
+ ComponentProjector<Graph> projector(g_, component, coloring, tree);
+ if(!projector.ProjectComponent()) {
+ DEBUG("Component can't be projected");
+ return false;
+ }
+ DEBUG(
+ "Successfully processed component candidate " << candidate_cnt << " start_v " << g_.str(component.start_vertex()));
+ return true;
+ } else {
+ DEBUG(
+ "Failed to find skeleton tree for candidate " << candidate_cnt << " start_v " << g_.str(component.start_vertex()));
+ if (!pics_folder_.empty()) {
+ //todo check if we rewrite all of the previous pics!
+ PrintComponent(component,
+ pics_folder_ + "fail/"
+ + ToString(g_.int_id(component.start_vertex())) //+ "_" + ToString(candidate_cnt)
+ + ".dot");
+ }
+ return false;
+ }
+ }
+
+public:
+ ComplexBulgeRemover(Graph& g, size_t max_length, size_t length_diff,
+ const string& pics_folder = "") :
+ g_(g), max_length_(max_length), length_diff_(length_diff), pics_folder_(
+ pics_folder) {
+ }
+
+ bool Run() {
+ size_t cnt = 0;
+ DEBUG("Complex bulge remover started");
+ if (!pics_folder_.empty()) {
+// remove_dir(pics_folder_);
+ make_dir(pics_folder_);
+ make_dir(pics_folder_ + "success/");
+ make_dir(pics_folder_ + "fail/");
+ }
+ bool something_done_flag = false;
+ for (auto it = g_.SmartVertexBegin(); !it.IsEnd(); ++it) {
+ DEBUG("Processing vertex " << g_.str(*it));
+ size_t candidate_cnt = 0;
+ vector<VertexId> vertices_to_post_process;
+ { //important scope!!!
+ LocalizedComponentFinder<Graph> comp_finder(g_, max_length_,
+ length_diff_, *it);
+ while (comp_finder.ProceedFurther()) {
+ candidate_cnt++;
+ DEBUG(
+ "Found component candidate " << candidate_cnt << " start_v " << g_.str(*it));
+ LocalizedComponent<Graph> component =
+ comp_finder.component();
+ if (ProcessComponent(component, candidate_cnt)) {
+ something_done_flag = true;
+ cnt++;
+ GraphComponent<Graph> gc = component.AsGraphComponent();
+ vertices_to_post_process.insert(
+ vertices_to_post_process.end(), gc.v_begin(),
+ gc.v_end());
+ break;
+ }
+ }
+ }
+ for (VertexId v : vertices_to_post_process) {
+ it.HandleAdd(v);
+ g_.CompressVertex(v);
+ }
+ }
+ DEBUG("Complex bulge remover finished");
+ DEBUG("Bulges processed " << cnt);
+ return something_done_flag;
+ }
+
+private:
+ DECL_LOGGER("ComplexBulgeRemover")
+ ;
+};
+
+}
+
+}
diff --git a/src/modules/algorithms/simplification/complex_tip_clipper.hpp b/src/modules/algorithms/simplification/complex_tip_clipper.hpp
new file mode 100644
index 0000000..a5fd240
--- /dev/null
+++ b/src/modules/algorithms/simplification/complex_tip_clipper.hpp
@@ -0,0 +1,153 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include <limits>
+
+#include "visualization/visualization.hpp"
+#include "compressor.hpp"
+#include "dominated_set_finder.hpp"
+
+
+namespace omnigraph{
+
+
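+/**
+ * Removes "complex tips": for every vertex without incoming edges, the dominated set within twice the maximal
+ * path length is collected as a component and deleted, provided that all its edges and internal paths are
+ * short enough and that GetTipCoverage() / GetOutwardCoverage() is below the relative coverage threshold.
+ */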
+template<class Graph>
+class ComplexTipClipper {
+ typedef typename Graph::VertexId VertexId;
+ typedef typename Graph::EdgeId EdgeId;
+
+ Graph& g_;
+ double relative_coverage_treshold_;
+ size_t edge_length_treshold_;
+ size_t max_path_length_;
+ string pics_folder_;
+ std::function<void(const set<EdgeId>&)> removal_handler_;
+
+ bool CheckEdgeLenghts(const GraphComponent<Graph>& component) const {
+ for(auto e : component.edges()) {
+ if(g_.length(e) > edge_length_treshold_) {
+ return false;
+ }
+ }
+ return true;
+ }
+
+
+ bool CheckSize(const GraphComponent<Graph> & component) const {
+ return (component.vertices().size() > 1);
+ }
+
+ void RemoveComplexTip(GraphComponent<Graph>& component) {
+ ComponentRemover<Graph> remover(g_, removal_handler_);
+ remover.DeleteComponent(component.edges().begin(), component.edges().end());
+ }
+
+
+ bool CheckPathLengths(const map<VertexId, Range>& ranges) const {
+ for(auto r : ranges) {
+ if(r.second.start_pos > max_path_length_) {
+ return false;
+ }
+ }
+ return true;
+ }
+
+ double GetTipCoverage(const GraphComponent<Graph> & component) const {
+ double cov = numeric_limits<double>::max();
+ for(auto edge : component.edges()) {
+ cov = std::min(cov, g_.coverage(edge));
+ }
+ return cov;
+ }
+
+ double GetOutwardCoverage(const GraphComponent<Graph> & component) const {
+ double cov = 0.0;
+ for(auto v : component.vertices()) {
+ for(auto edge : g_.OutgoingEdges(v)) {
+ if(component.contains(edge)) {
+ cov = max(cov, g_.coverage(edge));
+ }
+ }
+
+ for(auto edge : g_.IncomingEdges(v)) {
+ if(component.contains(edge)) {
+ cov = max(cov, g_.coverage(edge));
+ }
+ }
+ }
+ return cov;
+ }
+
+ double GetRelativeTipCoverage(const GraphComponent<Graph> & component) const {
+ return GetTipCoverage(component) / GetOutwardCoverage(component);
+ }
+
+public:
+ ComplexTipClipper(Graph& g, double relative_coverage, size_t max_edge_len, size_t max_path_len, const string& pics_folder = "", std::function<void(const set<EdgeId>&)> removal_handler = 0) :
+ g_(g), relative_coverage_treshold_(math::ge(relative_coverage, 0.0) ? relative_coverage : std::numeric_limits<double>::max()), edge_length_treshold_(max_edge_len) ,max_path_length_(max_path_len), pics_folder_(pics_folder), removal_handler_(removal_handler)
+ { }
+
+ bool Run() {
+ size_t cnt = 0;
+ INFO("Complex tip clipper started");
+ if (!pics_folder_.empty()) {
+ make_dir(pics_folder_);
+ }
+
+ bool something_done_flag = false;
+ for (auto it = g_.SmartVertexBegin(); !it.IsEnd(); ++it) {
+ if(g_.IncomingEdgeCount(*it) != 0) {
+ continue;
+ }
+ DEBUG("Processing vertex " << g_.str(*it));
+
+ DominatedSetFinder<Graph> dom_finder(g_, *it, max_path_length_ * 2);
+ dom_finder.FillDominated();
+ auto component = dom_finder.AsGraphComponent();
+
+ if(!CheckEdgeLenghts(component)) {
+ DEBUG("Tip contains too long edges");
+ continue;
+ }
+
+ if(!CheckSize(component)) {
+ DEBUG("Component doesn't meet size requirements");
+ continue;
+ }
+ auto dominated = dom_finder.dominated();
+ if(!CheckPathLengths(dominated)) {
+ DEBUG("Tip contains too long paths");
+ continue;
+ }
+
+ if(math::ge(GetRelativeTipCoverage(component), relative_coverage_treshold_)) {
+ DEBUG("Tip is too high covered with respect to external edges");
+ continue;
+ }
+
+ if (!pics_folder_.empty()) {
+ visualization::WriteComponentSinksSources(component,
+ pics_folder_
+ + ToString(g_.int_id(*it)) //+ "_" + ToString(candidate_cnt)
+ + ".dot");
+ }
+
+ something_done_flag = true;
+ cnt++;
+ RemoveComplexTip(component);
+ }
+ CompressAllVertices(g_);
+ INFO("Complex tip clipper finished");
+ INFO("Tips processed " << cnt);
+ return something_done_flag;
+ }
+private:
+ DECL_LOGGER("ComplexTipClipper")
+};
+
+}
diff --git a/src/modules/algorithms/simplification/compressor.hpp b/src/modules/algorithms/simplification/compressor.hpp
new file mode 100644
index 0000000..27257f0
--- /dev/null
+++ b/src/modules/algorithms/simplification/compressor.hpp
@@ -0,0 +1,141 @@
+#pragma once
+#include "assembly_graph/graph_support/parallel_processing.hpp"
+#include "assembly_graph/graph_support/basic_vertex_conditions.hpp"
+namespace omnigraph {
+
+/**
+* Compressor compresses vertices that have a unique incoming and a unique outgoing edge in linear time,
+* whereas naive one-by-one compression has quadratic complexity.
+*/
+template<class Graph>
+class Compressor : public PersistentProcessingAlgorithm<Graph, typename Graph::VertexId,
+ ParallelInterestingElementFinder<Graph, typename Graph::VertexId>> {
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+ typedef PersistentProcessingAlgorithm <Graph,
+ VertexId, ParallelInterestingElementFinder<Graph, VertexId>> base;
+ typedef CompressCondition <Graph> ConditionT;
+
+ Graph &graph_;
+ ConditionT compress_condition_;
+ bool safe_merging_;
+
+ bool GoUniqueWayForward(EdgeId &e) {
+ VertexId u = graph_.EdgeEnd(e);
+ if (!graph_.CheckUniqueOutgoingEdge(u)
+ || !graph_.CheckUniqueIncomingEdge(u)) {
+ return false;
+ }
+ e = graph_.GetUniqueOutgoingEdge(u);
+ return true;
+ }
+
+ bool GoUniqueWayBackward(EdgeId &e) {
+ VertexId u = graph_.EdgeStart(e);
+ if (!graph_.CheckUniqueOutgoingEdge(u)
+ || !graph_.CheckUniqueIncomingEdge(u)) {
+ return false;
+ }
+ e = graph_.GetUniqueIncomingEdge(u);
+ return true;
+ }
+
+//do not use without checks:)
+ EdgeId CompressWithoutChecks(VertexId v) {
+
+ EdgeId e = graph_.GetUniqueOutgoingEdge(v);
+ EdgeId start_edge = e;
+ while (GoUniqueWayBackward(e) && e != start_edge
+ && !graph_.RelatedVertices(graph_.EdgeStart(e),
+ graph_.EdgeEnd(e))) {
+ }
+ vector <EdgeId> mergeList;
+ // e = graph_.conjugate(e);
+ start_edge = e;
+ do {
+ mergeList.push_back(e);
+ } while (GoUniqueWayForward(e) && e != start_edge
+ && !graph_.RelatedVertices(graph_.EdgeStart(e),
+ graph_.EdgeEnd(e)));
+ EdgeId new_edge = graph_.MergePath(mergeList, safe_merging_);
+ TRACE("Vertex compressed and is now part of edge "
+ << graph_.str(new_edge));
+ return new_edge;
+
+ }
+
+// //todo use graph method!
+// bool CanCompressVertex(VertexId v) const {
+// if (!graph_.CheckUniqueOutgoingEdge(v)
+// || !graph_.CheckUniqueIncomingEdge(v)) {
+// TRACE(
+// "Vertex "
+// << graph_.str(v)
+// << " judged NOT compressible. Proceeding to the next vertex");
+// TRACE("Processing vertex " << graph_.str(v) << " finished");
+// return false;
+// }
+// return true;
+// }
+public:
+ Compressor(Graph &graph, size_t chunk_cnt = 1, bool safe_merging = true) :
+ base(graph,
+ ParallelInterestingElementFinder<Graph, VertexId>(graph,
+ ConditionT(graph), chunk_cnt),
+ /*canonical only*/true),
+ graph_(graph),
+ compress_condition_(graph),
+ safe_merging_(safe_merging) {
+ }
+
+ /**
+     * Compresses the longest possible path containing the given vertex.
+     * @param v vertex to be compressed as part of a path
+     * @return true if the vertex could be compressed, false otherwise
+ */
+ bool CompressVertex(VertexId v) {
+ TRACE("Processing vertex " << graph_.str(v) << " started");
+ if (!compress_condition_.Check(v)) {
+ return false;
+ }
+ TRACE("Vertex " << graph_.str(v) << " judged compressible");
+ CompressWithoutChecks(v);
+ return true;
+ }
+
+ EdgeId CompressVertexEdgeId(VertexId v) {
+ TRACE("Processing vertex " << graph_.str(v) << " started");
+ if (!compress_condition_.Check(v)) {
+ return EdgeId(0);
+ }
+ TRACE("Vertex " << graph_.str(v) << " judged compressible");
+ return CompressWithoutChecks(v);
+ }
+
+// bool IsOfInterest(VertexId v) const {
+// return CanCompressVertex(v);
+// }
+
+protected:
+ bool Process(VertexId v) override {
+ if (compress_condition_.Check(v)) {
+ CompressWithoutChecks(v);
+ return true;
+ } else {
+ return false;
+ }
+ }
+
+private:
+ DECL_LOGGER("Compressor")
+};
+
+/**
+* Compresses all vertices that can be compressed.
+*/
+template<class Graph>
+bool CompressAllVertices(Graph &g, bool safe_merging = true, size_t chunk_cnt = 1) {
+ Compressor<Graph> compressor(g, chunk_cnt, safe_merging);
+ return compressor.Run();
+}
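+
+// Usage sketch (assumes a Graph type that provides the interface used in this header):
+//   bool changed = omnigraph::CompressAllVertices(g, /*safe_merging*/ true, /*chunk_cnt*/ 16);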
+}
diff --git a/src/modules/algorithms/simplification/dominated_set_finder.hpp b/src/modules/algorithms/simplification/dominated_set_finder.hpp
new file mode 100644
index 0000000..050777d
--- /dev/null
+++ b/src/modules/algorithms/simplification/dominated_set_finder.hpp
@@ -0,0 +1,137 @@
+#pragma once
+
+namespace omnigraph {
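+
+/**
+* DominatedSetFinder collects the set of vertices dominated by a given start vertex: vertices
+* reachable from it whose every incoming edge starts inside the collected set. For each dominated
+* vertex the range of possible distances from the start is tracked; FillDominated() returns false
+* as soon as max_length or max_count is exceeded.
+*/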
+template<class Graph>
+class DominatedSetFinder {
+ typedef typename Graph::VertexId VertexId;
+ typedef typename Graph::EdgeId EdgeId;
+
+ const Graph& g_;
+ VertexId start_vertex_;
+ size_t max_length_;
+ size_t max_count_;
+
+ size_t cnt_;
+ std::map<VertexId, Range> dominated_;
+
+ bool CheckCanBeProcessed(VertexId v) const {
+ DEBUG( "Check if vertex " << g_.str(v) << " is dominated close neighbour");
+ for (EdgeId e : g_.IncomingEdges(v)) {
+ if (dominated_.count(g_.EdgeStart(e)) == 0) {
+ DEBUG( "Blocked by external vertex " << g_.int_id(g_.EdgeStart(e)) << " that starts edge " << g_.int_id(e));
+ DEBUG("Check fail");
+ return false;
+ }
+ }
+ DEBUG("Check ok");
+ return true;
+ }
+
+ void UpdateCanBeProcessed(VertexId v,
+ std::queue<VertexId>& can_be_processed) const {
+ DEBUG("Updating can be processed");
+ for (EdgeId e : g_.OutgoingEdges(v)) {
+ DEBUG("Considering edge " << ToString(e));
+ VertexId neighbour_v = g_.EdgeEnd(e);
+ if (CheckCanBeProcessed(neighbour_v)) {
+ can_be_processed.push(neighbour_v);
+ }
+ }
+ }
+
+ Range NeighbourDistanceRange(VertexId v, bool dominated_only = true) const {
+ DEBUG("Counting distance range for vertex " << g_.str(v));
+ size_t min = numeric_limits<size_t>::max();
+ size_t max = 0;
+ VERIFY(g_.IncomingEdgeCount(v) > 0);
+ VERIFY(!dominated_only || CheckCanBeProcessed(v));
+ for (EdgeId e : g_.IncomingEdges(v)) {
+ //in case of dominated_only == false
+ if (dominated_.count(g_.EdgeStart(e)) == 0)
+ continue;
+ Range range = dominated_.find(g_.EdgeStart(e))->second;
+ range.shift((int) g_.length(e));
+ DEBUG("Edge " << g_.str(e) << " provide distance range " << range);
+ if (range.start_pos < min)
+ min = range.start_pos;
+ if (range.end_pos > max)
+ max = range.end_pos;
+ }
+ VERIFY((max > 0) && (min < numeric_limits<size_t>::max()) && (min <= max));
+ Range answer(min, max);
+ DEBUG("Range " << answer);
+ return answer;
+ }
+
+ bool CheckNoEdgeToStart(VertexId v) {
+ for (EdgeId e : g_.OutgoingEdges(v)) {
+ if (g_.EdgeEnd(e) == start_vertex_) {
+ return false;
+ }
+ }
+ return true;
+ }
+
+public:
+ DominatedSetFinder(const Graph& g, VertexId v, size_t max_length = -1ul,
+ size_t max_count = -1ul)
+ : g_(g),
+ start_vertex_(v),
+ max_length_(max_length),
+ max_count_(max_count),
+ cnt_(0) {
+
+ }
+
+ //true if no thresholds exceeded
+ bool FillDominated() {
+ DEBUG("Adding starting vertex " << g_.str(start_vertex_) << " to dominated set");
+ dominated_.insert(make_pair(start_vertex_, Range(0, 0)));
+ cnt_++;
+ std::queue<VertexId> can_be_processed;
+ UpdateCanBeProcessed(start_vertex_, can_be_processed);
+ while (!can_be_processed.empty()) {
+ if (++cnt_ > max_count_) {
+ return false;
+ }
+ VertexId v = can_be_processed.front();
+ can_be_processed.pop();
+ Range r = NeighbourDistanceRange(v);
+ if (r.start_pos > max_length_) {
+ return false;
+ }
+ //Currently dominated vertices cannot have edge to start vertex
+ if (CheckNoEdgeToStart(v)) {
+ DEBUG("Adding vertex " << g_.str(v) << " to dominated set");
+ dominated_.insert(make_pair(v, r));
+ UpdateCanBeProcessed(v, can_be_processed);
+ }
+ }
+ return true;
+ }
+
+ const map<VertexId, Range>& dominated() const {
+ return dominated_;
+ }
+
+ GraphComponent<Graph> AsGraphComponent() const {
+ set<VertexId> vertices = key_set(dominated_);
+ return GraphComponent<Graph>(g_, vertices.begin(), vertices.end());
+ }
+
+    //the result has little meaning if FillDominated returned false
+ const map<VertexId, Range> CountBorder() const {
+ map<VertexId, Range> border;
+        for (VertexId v : key_set(dominated_)) {
+ for (EdgeId e : g_.OutgoingEdges(v)) {
+ VertexId e_end = g_.EdgeEnd(e);
+ if (dominated_.count(e_end) == 0) {
+ border[e_end] = NeighbourDistanceRange(e_end, false);
+ }
+ }
+ }
+ return border;
+ }
+
+};
+}
diff --git a/src/modules/algorithms/simplification/ec_threshold_finder.hpp b/src/modules/algorithms/simplification/ec_threshold_finder.hpp
new file mode 100644
index 0000000..84d7af2
--- /dev/null
+++ b/src/modules/algorithms/simplification/ec_threshold_finder.hpp
@@ -0,0 +1,152 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#ifndef OMNI_TOOLS_HPP_
+#define OMNI_TOOLS_HPP_
+
+#include "dev_support/simple_tools.hpp"
+
+#include "dev_support/path_helper.hpp"
+#include "assembly_graph/graph_support/basic_edge_conditions.hpp"
+#include "assembly_graph/graph_support/parallel_processing.hpp"
+#include "assembly_graph/graph_support/basic_vertex_conditions.hpp"
+#include "assembly_graph/graph_core/basic_graph_stats.hpp"
+
+#ifdef USE_GLIBCXX_PARALLEL
+#include "parallel/algorithm"
+#endif
+
+namespace omnigraph {
+
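+/**
+* ErroneousConnectionThresholdFinder derives a coverage threshold for erroneous connection removal.
+* It builds a histogram of coverages of "interesting" short edges (edges of at most k+1 bp lying
+* between branching vertices), smooths it with a sliding bucket, and returns the coverage value at
+* which the smoothed histogram starts to grow again; if no such value is found, 0.1 of the average
+* coverage is returned instead.
+*/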
+template<class Graph>
+class ErroneousConnectionThresholdFinder {
+private:
+ typedef typename Graph::VertexId VertexId;
+ typedef typename Graph::EdgeId EdgeId;
+ const Graph &graph_;
+ size_t backet_width_;
+
+ bool IsInteresting(EdgeId e) const {
+ if (graph_.length(e) > graph_.k() + 1)
+ return false;
+
+ if (graph_.OutgoingEdgeCount(graph_.EdgeStart(e)) < 2 ||
+ graph_.IncomingEdgeCount(graph_.EdgeEnd(e)) < 2)
+ return false;
+
+ std::vector<EdgeId> v1;
+ push_back_all(v1, graph_.OutgoingEdges(graph_.EdgeStart(e)));
+ std::vector<EdgeId> v2;
+ push_back_all(v2, graph_.IncomingEdges(graph_.EdgeEnd(e)));
+        bool eq = (v1.size() == 2 && v2.size() == 2) && ((v1[0] == v2[0] && v1[1] == v2[1]) || (v1[0] == v2[1] && v1[1] == v2[0]));
+ return !eq;
+ }
+
+ double weight(size_t value, const map<size_t, size_t> &histogram,
+ size_t backet_width) const {
+ double result = 0;
+ for (size_t i = 0; i < backet_width && value + i < histogram.size(); i++) {
+ result += (double) (getValue(value + i, histogram) * std::min(i + 1, backet_width - i));
+ }
+ return result;
+ }
+
+ double Median(double thr = 500.0) const {
+ vector<double> coverages;
+ for (auto it = graph_.ConstEdgeBegin(); !it.IsEnd(); ++it) {
+ if (graph_.length(*it) > thr)
+ coverages.push_back(graph_.coverage(*it));
+ }
+
+ auto middle_it = coverages.begin() + coverages.size() / 2;
+#ifdef USE_GLIBCXX_PARALLEL
+ __gnu_parallel::nth_element(coverages.begin(), middle_it, coverages.end());
+#else
+ std::nth_element(coverages.begin(), middle_it, coverages.end());
+#endif
+ return coverages[coverages.size() / 2];
+ }
+
+ size_t getValue(size_t arg, const map<size_t, size_t> &ssmap) const {
+ auto it = ssmap.find(arg);
+ if (it == ssmap.end())
+ return 0;
+ else
+ return it->second;
+ }
+
+public:
+ ErroneousConnectionThresholdFinder(const Graph &graph, size_t backet_width = 0) :
+ graph_(graph), backet_width_(backet_width) {
+ }
+
+ double AvgCoverage() const {
+ double cov = 0;
+ double length = 0;
+ for (auto it = graph_.ConstEdgeBegin(); !it.IsEnd(); ++it) {
+ cov += graph_.coverage(*it) * (double) graph_.length(*it);
+ length += (double) graph_.length(*it);
+ }
+ return cov / length;
+ }
+
+ std::map<size_t, size_t> ConstructHistogram() const {
+ std::map<size_t, size_t> result;
+ for (auto it = graph_.ConstEdgeBegin(); !it.IsEnd(); ++it) {
+ if (IsInteresting(*it))
+ result[(size_t)graph_.coverage(*it)]++;
+ }
+ return result;
+ }
+
+ double FindThreshold(const map<size_t, size_t> &histogram) const {
+ size_t backet_width = backet_width_;
+ if (backet_width == 0) {
+ backet_width = (size_t)(0.3 * AvgCovereageCounter<Graph>(graph_).Count() + 5);
+ }
+ size_t size = 0;
+ if (histogram.size() != 0)
+ size = histogram.rbegin()->first + 1;
+ INFO("Bucket size: " << backet_width);
+ size_t cnt = 0;
+ for (size_t i = 1; i + backet_width < size; i++) {
+ if (weight(i, histogram, backet_width) > weight(i - 1, histogram, backet_width))
+ cnt++;
+
+ if (i > backet_width &&
+ weight(i - backet_width, histogram, backet_width) >
+ weight(i - backet_width - 1, histogram, backet_width)) {
+ cnt--;
+ }
+ if (2 * cnt >= backet_width)
+ return (double) i;
+
+ }
+ INFO("Proper threshold was not found. Threshold set to 0.1 of average coverage");
+ return 0.1 * AvgCovereageCounter<Graph>(graph_).Count();
+ }
+
+ double FindThreshold() const {
+ INFO("Finding threshold started");
+ std::map<size_t, size_t> histogram = ConstructHistogram(/*weights*/);
+        for (auto it = histogram.begin(); it != histogram.end(); ++it) {
+            TRACE(it->first << " " << it->second);
+        }
+ double result = FindThreshold(histogram);
+ INFO("Average edge coverage: " << AvgCoverage());
+ INFO("Graph threshold: " << result);
+ result = std::max(AvgCoverage(), result);
+ INFO("Threshold finding finished. Threshold is set to " << result);
+ return result;
+ }
+private:
+ DECL_LOGGER("ThresholdFinder");
+};
+
+}
+
+#endif /* OMNI_TOOLS_HPP_ */
diff --git a/src/modules/algorithms/simplification/erroneous_connection_remover.hpp b/src/modules/algorithms/simplification/erroneous_connection_remover.hpp
new file mode 100644
index 0000000..937baaa
--- /dev/null
+++ b/src/modules/algorithms/simplification/erroneous_connection_remover.hpp
@@ -0,0 +1,567 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+/*
+ * erroneous_connection_remover.hpp
+ *
+ * Created on: May 31, 2011
+ * Author: sergey
+ */
+
+#pragma once
+
+#include "assembly_graph/graph_support/graph_processing_algorithm.hpp"
+#include "assembly_graph/graph_support/basic_edge_conditions.hpp"
+#include "dev_support/func.hpp"
+#include "math/xmath.h"
+#include "algorithms/dijkstra/dijkstra_helper.hpp"
+#include "assembly_graph/graph_core/coverage.hpp"
+
+namespace omnigraph {
+
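+//Returns the condition for an edge to be a candidate erroneous connection: its length does not
+//exceed max_length, its coverage does not exceed max_coverage, and alternative paths are present
+//at both of its ends.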
+template<class Graph>
+pred::TypedPredicate<typename Graph::EdgeId>
+NecessaryECCondition(const Graph& g, size_t max_length, double max_coverage) {
+ return AddAlternativesPresenceCondition(g, pred::And(LengthUpperBound<Graph>(g, max_length),
+ CoverageUpperBound<Graph>(g, max_coverage)));
+}
+
+template<class Graph>
+bool RemoveErroneousEdgesInCoverageOrder(Graph &g,
+ pred::TypedPredicate<typename Graph::EdgeId> removal_condition,
+ double max_coverage,
+ std::function<void(typename Graph::EdgeId)> removal_handler) {
+ omnigraph::EdgeRemovingAlgorithm<Graph> erroneous_edge_remover(g,
+ AddAlternativesPresenceCondition(g, removal_condition),
+ removal_handler);
+
+ return erroneous_edge_remover.Run(CoverageComparator<Graph>(g),
+ CoverageUpperBound<Graph>(g, max_coverage));
+}
+
+template<class Graph>
+bool RemoveErroneousEdgesInLengthOrder(Graph &g,
+ pred::TypedPredicate<typename Graph::EdgeId> removal_condition,
+ size_t max_length,
+ std::function<void(typename Graph::EdgeId)> removal_handler) {
+ omnigraph::EdgeRemovingAlgorithm<Graph> erroneous_edge_remover(g,
+ AddAlternativesPresenceCondition(g, removal_condition),
+ removal_handler);
+
+ return erroneous_edge_remover.Run(LengthComparator<Graph>(g),
+ LengthUpperBound<Graph>(g, max_length));
+}
+
+template<class Graph>
+class SelfConjugateCondition : public EdgeCondition<Graph> {
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+ typedef EdgeCondition<Graph> base;
+
+ public:
+
+ SelfConjugateCondition(const Graph& g)
+ : base(g) {
+ }
+
+ bool Check(EdgeId e) const {
+ return e == this->g().conjugate(e);
+ }
+
+ private:
+ DECL_LOGGER("SelfConjugateCondition");
+};
+
+//coverage comparator
+//template<class Graph>
+//class RelativeCoverageCondition : public EdgeCondition<Graph> {
+// typedef typename Graph::EdgeId EdgeId;
+// typedef typename Graph::VertexId VertexId;
+// typedef EdgeCondition<Graph> base;
+//
+// double min_coverage_gap_;
+//
+// bool StrongNeighbourCondition(EdgeId neighbour_edge,
+// EdgeId possible_ec) const {
+// return neighbour_edge == possible_ec
+// || math::gr(this->g().coverage(neighbour_edge),
+// this->g().coverage(possible_ec) * min_coverage_gap_);
+//// || this->g().length(neighbour_edge)
+//// >= neighbour_length_threshold_;
+// }
+//
+// bool CheckAdjacent(const vector<EdgeId>& edges, EdgeId possible_ec) const {
+// FOREACH (EdgeId e, edges) {
+// if (!StrongNeighbourCondition(e, possible_ec))
+// return false;
+// }
+// return true;
+// }
+//
+// public:
+//
+// RelativeCoverageCondition(const Graph& g, double min_coverage_gap)
+// : base(g),
+// min_coverage_gap_(min_coverage_gap) {
+//
+// }
+//
+// bool Check(EdgeId e) const {
+// const Graph& g = this->g();
+// return CheckAdjacent(g.IncidentEdges(g.EdgeStart(e)), e)
+// && CheckAdjacent(g.IncidentEdges(g.EdgeEnd(e)), e);
+// }
+//
+// private:
+// DECL_LOGGER("RelativeCoverageCondition")
+// ;
+//
+//};
+
+//todo refactor
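+/**
+* ThornCondition marks likely "thorn" edges: an edge qualifies if its neighbourhood looks unique
+* (or passes the local coverage check) and, in addition, a bounded Dijkstra from its start reaches
+* a vertex related to its end (as reported by g().RelatedVertices), suggesting the edge connects
+* two copies of the same genomic region.
+*/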
+template<class Graph>
+class ThornCondition : public EdgeCondition<Graph> {
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+ typedef EdgeCondition<Graph> base;
+
+ size_t uniqueness_length_;
+ size_t dijkstra_depth_;
+
+ bool Unique(const vector<EdgeId>& edges, bool forward) const {
+ return edges.size() == 1 && CheckUniqueness(*edges.begin(), forward);
+ }
+
+ bool CheckUnique(EdgeId e) const {
+ TRACE("Checking conditions for edge start");
+ return Unique(vector<EdgeId>(this->g().in_begin(this->g().EdgeStart(e)), this->g().in_end(this->g().EdgeStart(e))), false)
+ || Unique(vector<EdgeId>(this->g().out_begin(this->g().EdgeEnd(e)), this->g().out_end(this->g().EdgeEnd(e))), true);
+ }
+
+ bool CheckThorn(EdgeId e) const {
+ if (this->g().EdgeStart(e) == this->g().EdgeEnd(e))
+ return false;
+ if (this->g().RelatedVertices(this->g().EdgeStart(e),
+ this->g().EdgeEnd(e))) {
+ return true;
+ }
+ if (this->g().OutgoingEdgeCount(this->g().EdgeStart(e)) != 2)
+ return false;
+ if (this->g().IncomingEdgeCount(this->g().EdgeStart(e)) != 1)
+ return false;
+ if (this->g().OutgoingEdgeCount(this->g().EdgeEnd(e)) != 1)
+ return false;
+ if (this->g().IncomingEdgeCount(this->g().EdgeEnd(e)) != 2)
+ return false;
+
+ auto dij = DijkstraHelper<Graph>::CreateBoundedDijkstra(this->g(), dijkstra_depth_);
+ dij.Run(this->g().EdgeStart(e));
+ vector<VertexId> reached = dij.ReachedVertices();
+ for (auto it = reached.begin(); it != reached.end(); ++it) {
+ if (*it != this->g().EdgeEnd(e)
+ && this->g().RelatedVertices(*it, this->g().EdgeEnd(e))) {
+ return true;
+ }
+ }
+ return false;
+ }
+
+ template<class EdgeContainer>
+ bool CheckAlternativeCoverage(const EdgeContainer& edges, EdgeId base) const {
+ for (EdgeId e: edges) {
+ if (e != base && this->g().length(e) < 400
+ && this->g().coverage(e) < 15 * this->g().coverage(base)) {
+ return false;
+ }
+ }
+ return true;
+ }
+
+ bool CheckCoverageAround(EdgeId e) const {
+ return CheckAlternativeCoverage(
+ this->g().IncidentEdges(this->g().EdgeStart(e)), e)
+ && CheckAlternativeCoverage(
+ this->g().IncidentEdges(this->g().EdgeEnd(e)), e);
+ }
+
+ bool CheckUniqueness(EdgeId e, bool /*forward*/) const {
+ return this->g().length(e) >= uniqueness_length_;
+ }
+
+ public:
+
+ ThornCondition(Graph& g, size_t uniqueness_length, size_t dijkstra_depth)
+ : base(g),
+ uniqueness_length_(uniqueness_length),
+ dijkstra_depth_(dijkstra_depth) {
+ }
+
+ bool Check(EdgeId e) const {
+ bool tmp = (CheckUnique(e) || CheckCoverageAround(e));
+ if (tmp)
+ tmp &= CheckThorn(e);
+ return tmp;
+ }
+
+ private:
+ DECL_LOGGER("ThornCondition")
+ ;
+
+};
+
+
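+/**
+* MultiplicityCounter estimates the multiplicity of an edge by exploring the neighbourhood of one of
+* its endpoints up to max_depth, following only short edges and counting long (>= uniqueness_length)
+* incoming and outgoing edges. The difference of these counts is returned as the estimate;
+* (size_t)-1 signals that the estimation failed.
+*/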
+template<class Graph>
+class MultiplicityCounter {
+private:
+ typedef typename Graph::VertexId VertexId;
+ typedef typename Graph::EdgeId EdgeId;
+ const Graph &graph_;
+ size_t uniqueness_length_;
+ size_t max_depth_;
+
+ bool search(VertexId a, VertexId start, EdgeId e, size_t depth,
+ std::set<VertexId> &was, pair<size_t, size_t> &result) const {
+ if (depth > max_depth_)
+ return false;
+ if (was.count(a) == 1)
+ return true;
+ was.insert(a);
+ if (graph_.OutgoingEdgeCount(a) == 0
+ || graph_.IncomingEdgeCount(a) == 0)
+ return false;
+ for (auto I = graph_.out_begin(a), E = graph_.out_end(a); I != E; ++I) {
+ if (*I == e) {
+ if (a != start) {
+ return false;
+ }
+ } else {
+ if (graph_.length(*I) >= uniqueness_length_) {
+ result.second++;
+ } else {
+ if (!search(graph_.EdgeEnd(*I), start, e,
+ depth + 1 /*graph_.length(*it)*/, was, result))
+ return false;
+ }
+ }
+ }
+ for (EdgeId in_e : graph_.IncomingEdges(a)) {
+ if (in_e == e) {
+ if (a != start) {
+ return false;
+ }
+ } else {
+ if (graph_.length(in_e) >= uniqueness_length_) {
+ result.first++;
+ } else {
+ if (!search(graph_.EdgeStart(in_e), start, e,
+ depth + 1 /*graph_.length(*it)*/, was, result))
+ return false;
+ }
+ }
+ }
+ return true;
+ }
+
+public:
+ MultiplicityCounter(const Graph &graph, size_t uniqueness_length,
+ size_t max_depth)
+ : graph_(graph),
+ uniqueness_length_(uniqueness_length),
+ max_depth_(max_depth) {
+ }
+
+ size_t count(EdgeId e, VertexId start) const {
+ std::pair<size_t, size_t> result;
+ std::set<VertexId> was;
+ bool valid = search(start, start, e, 0, was, result);
+ if (!valid) {
+ return (size_t) (-1);
+ }
+ if (graph_.EdgeStart(e) == start) {
+ if (result.first < result.second) {
+ return (size_t) (-1);
+ }
+ return result.first - result.second;
+ } else {
+ if (result.first > result.second) {
+ return (size_t) (-1);
+ }
+ return -result.first + result.second;
+ }
+ }
+};
+
+template<class Graph>
+class MultiplicityCountingCondition : public UniquenessPlausabilityCondition<Graph> {
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+ typedef pred::TypedPredicate<EdgeId> EdgePredicate;
+ typedef UniquenessPlausabilityCondition<Graph> base;
+
+ MultiplicityCounter<Graph> multiplicity_counter_;
+ EdgePredicate plausiblity_condition_;
+
+public:
+ bool CheckUniqueness(EdgeId e, bool forward) const {
+ TRACE( "Checking " << this->g().int_id(e) << " for uniqueness in " << (forward ? "forward" : "backward") << " direction");
+ VertexId start =
+ forward ? this->g().EdgeEnd(e) : this->g().EdgeStart(e);
+ bool result = multiplicity_counter_.count(e, start) <= 1;
+ TRACE( "Edge " << this->g().int_id(e) << " is" << (result ? "" : " not") << " unique");
+ return result;
+ }
+
+ bool CheckPlausibility(EdgeId e, bool) const {
+ return plausiblity_condition_(e);
+ }
+
+ MultiplicityCountingCondition(const Graph& g, size_t uniqueness_length,
+ EdgePredicate plausiblity_condition)
+ :
+ //todo why 8???
+ base(g),
+ multiplicity_counter_(g, uniqueness_length, 8),
+ plausiblity_condition_(plausiblity_condition) {
+
+ }
+
+ private:
+
+ DECL_LOGGER("MultiplicityCountingCondition")
+ ;
+};
+
+
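+/**
+* ECLoopRemover removes loops and reverse-complement loops that look like erroneous connections.
+* If one end of the loop edge has suspiciously low flanking coverage, only that part is cut off
+* (the edge is split and the bad side deleted); otherwise the whole loop edge is removed.
+* Removal statistics are reported by PrintLoopStats().
+*/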
+template<class Graph>
+class ECLoopRemover : public EdgeProcessingAlgorithm<Graph> {
+ typedef std::less<typename Graph::EdgeId> Comparator;
+ typedef EdgeProcessingAlgorithm<Graph> base;
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+
+ double ec_threshold_;
+ double relative_threshold_;
+ const AbstractFlankingCoverage<Graph> &flanking_coverage_;
+ EdgeRemover<Graph> edge_remover_;
+ size_t coverage_loops_removed = 0;
+ size_t dead_loops_removed = 0;
+ size_t not_dead_loops_removed = 0;
+ size_t coverage_rc_loops_removed = 0;
+ size_t dead_rc_loops_removed = 0;
+ size_t not_dead_rc_loops_removed = 0;
+
+ bool IsLoop(EdgeId e) {
+ return this->g().EdgeStart(e) == this->g().EdgeEnd(e);
+ }
+
+ bool IsRCLoop(EdgeId e) {
+ return this->g().EdgeStart(e) == this->g().conjugate(this->g().EdgeEnd(e));
+ }
+
+ bool IsAnyLoop(EdgeId e) {
+ return IsRCLoop(e) || IsLoop(e);
+ }
+
+ void RemoveHiddenLoopEC(EdgeId e, bool break_on_end) {
+ if (IsLoop(e))
+ coverage_loops_removed++;
+ else
+ coverage_rc_loops_removed++;
+ if (this->g().length(e) <= this->g().k())
+ edge_remover_.DeleteEdge(e);
+ else {
+ if (break_on_end) {
+ auto split_result = this->g().SplitEdge(e, this->g().length(e) - this->g().k());
+ edge_remover_.DeleteEdge(split_result.second);
+ } else {
+ auto split_result = this->g().SplitEdge(e, this->g().k());
+ edge_remover_.DeleteEdge(split_result.first);
+ }
+ }
+
+ }
+ void RemoveLoopWithNoCheck(EdgeId e) {
+ if (IsLoop(e)) {
+ if (this->g().IncomingEdgeCount(this->g().EdgeStart(e)) == 1 || this->g().OutgoingEdgeCount(this->g().EdgeStart(e)) == 1)
+ dead_loops_removed++;
+ else
+ not_dead_loops_removed++;
+ } else {
+ if (this->g().IncomingEdgeCount(this->g().EdgeStart(e)) == 2)
+ dead_rc_loops_removed++;
+ else
+ not_dead_rc_loops_removed++;
+
+ }
+ edge_remover_.DeleteEdge(e);
+ }
+
+ bool FindHiddenLoopEC(EdgeId e) {
+ if(flanking_coverage_.GetInCov(e) * relative_threshold_ < flanking_coverage_.GetOutCov(e) && flanking_coverage_.GetInCov(e) < ec_threshold_) {
+ //start is bad, end is OK.
+ RemoveHiddenLoopEC(e, false);
+ return true;
+ } else if(flanking_coverage_.GetOutCov(e) * relative_threshold_ < flanking_coverage_.GetInCov(e) && flanking_coverage_.GetOutCov(e) < ec_threshold_) {
+ //end is bad, start is OK.
+ RemoveHiddenLoopEC(e, true);
+ return true;
+ }
+ RemoveLoopWithNoCheck(e);
+ return false;
+ }
+
+ bool ProcessEdge(EdgeId e) {
+ if (IsAnyLoop(e)) {
+ DEBUG("Susp loop: " << this->g().int_id(e) << endl);
+ bool res = FindHiddenLoopEC(e);
+ if (res) {DEBUG ("was removed");} else {DEBUG("was not removed"); }
+ return res;
+ }
+ return false;
+ }
+
+
+public:
+ ECLoopRemover(Graph &g, const AbstractFlankingCoverage<Graph> &flanking_coverage, double ec_threshold, double relative_threshold,
+ HandlerF<Graph> removal_handler = 0): base(g),ec_threshold_(ec_threshold),
+ relative_threshold_(relative_threshold), flanking_coverage_(flanking_coverage),
+ edge_remover_(g, removal_handler){
+ }
+ void PrintLoopStats(){
+ INFO("Loops: accurately removed/deadend removed/other: "<< coverage_loops_removed <<"/" << dead_loops_removed << "/" <<not_dead_loops_removed);
+ INFO("RC loops: accurately removed/deadend removed/other: "<< coverage_rc_loops_removed <<"/" << dead_rc_loops_removed << "/" <<not_dead_rc_loops_removed);
+ }
+private:
+ DECL_LOGGER("ECLoopRemover");
+};
+
+
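+/**
+* HiddenECRemover looks for "hidden" erroneous connections at vertices with a single unique or long
+* incoming edge and exactly two outgoing edges. If even the stronger outgoing edge has unreliable
+* flanking coverage, all outgoing edges are disconnected from the vertex; otherwise, if the weaker
+* edge is much lower covered than the stronger one, only its starting part is removed.
+*/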
+template<class Graph>
+class HiddenECRemover: public EdgeProcessingAlgorithm<Graph> {
+ typedef EdgeProcessingAlgorithm<Graph> base;
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+private:
+ size_t uniqueness_length_;
+ double unreliability_threshold_;
+ double ec_threshold_;
+ double relative_threshold_;
+ const AbstractFlankingCoverage<Graph> &flanking_coverage_;
+ EdgeRemover<Graph> edge_remover_;
+ MultiplicityCountingCondition<Graph> condition_;
+private:
+ void RemoveHiddenEC(EdgeId edge) {
+ if (this->g().length(edge) <= this->g().k() || (edge == this->g().conjugate(edge) && this->g().length(edge) <= 2 * this->g().k()))
+ edge_remover_.DeleteEdge(edge);
+ else {
+ auto split_result = this->g().SplitEdge(edge, this->g().k());
+ edge_remover_.DeleteEdge(split_result.first);
+ }
+ }
+
+ void RemoveHiddenECWithNoCompression(EdgeId edge) {
+ if (this->g().length(edge) <= this->g().k() || (edge == this->g().conjugate(edge) && this->g().length(edge) <= 2 * this->g().k())) {
+ edge_remover_.DeleteEdgeWithNoCompression(edge);
+ } else {
+ auto split_result = this->g().SplitEdge(edge, this->g().k());
+ edge_remover_.DeleteEdgeWithNoCompression(split_result.first);
+ }
+ }
+
+ void DisconnectEdges(VertexId v) {
+ while(!this->g().IsDeadEnd(v)) {
+ RemoveHiddenECWithNoCompression(*(this->g().out_begin(v)));
+ }
+ }
+
+ bool FindHiddenEC(VertexId v) {
+ vector<EdgeId> edges(this->g().out_begin(v), this->g().out_end(v));
+ if(flanking_coverage_.GetInCov(edges[0]) > flanking_coverage_.GetInCov(edges[1])) {
+ auto tmp = edges[0];
+ edges[0] = edges[1];
+ edges[1] = tmp;
+ }
+// cout << flanking_coverage_.GetInCov(edges[0]) << " " << flanking_coverage_.GetInCov(edges[1]) << endl;
+ if(flanking_coverage_.GetInCov(edges[1]) < unreliability_threshold_) {
+ DisconnectEdges(v);
+// cout << "disconnected" << endl;
+ return true;
+ }
+ if(flanking_coverage_.GetInCov(edges[0]) * relative_threshold_ < flanking_coverage_.GetInCov(edges[1]) && flanking_coverage_.GetInCov(edges[0]) < ec_threshold_) {
+ RemoveHiddenEC(edges[0]);
+// cout << "success" << endl;
+ return true;
+ }
+ return false;
+ }
+
+ bool CheckSuspicious(VertexId v) {
+ if (this->g().IncomingEdgeCount(v) != 1 || this->g().OutgoingEdgeCount(v) != 2) {
+ return false;
+ }
+ vector<EdgeId> edges(this->g().out_begin(v), this->g().out_end(v));
+ return (edges.size() == 2 && this->g().conjugate(edges[0]) == edges[1] && condition_.CheckUniqueness(this->g().GetUniqueIncomingEdge(v), false)) || this->g().length(this->g().GetUniqueIncomingEdge(v)) >= uniqueness_length_;
+ }
+
+ bool ProcessEdge(EdgeId e) {
+ VertexId v = this->g().EdgeEnd(e);
+ if(CheckSuspicious(v)) {
+// cout << "client: " << this->g().int_id(v) << endl;
+ return FindHiddenEC(v);
+ }
+ return false;
+ }
+
+public:
+ HiddenECRemover(Graph& g, size_t uniqueness_length,
+ const AbstractFlankingCoverage<Graph> &flanking_coverage,
+ double unreliability_threshold, double ec_threshold,
+ double relative_threshold,
+ std::function<void(EdgeId)> removal_handler = 0)
+ : base(g), uniqueness_length_(uniqueness_length),
+ unreliability_threshold_(unreliability_threshold * ec_threshold), ec_threshold_(ec_threshold),
+ relative_threshold_(relative_threshold), flanking_coverage_(flanking_coverage),
+ edge_remover_(g, removal_handler),
+ condition_(g, uniqueness_length, pred::AlwaysTrue<EdgeId>()) {
+ }
+
+private:
+ DECL_LOGGER("HiddenECRemover");
+};
+
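+/**
+* SelfConjugateDisruptor breaks every self-conjugate edge: the edge is split in the middle and one
+* of the resulting halves is deleted, so that no edge remains equal to its own conjugate.
+*/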
+template<class Graph>
+class SelfConjugateDisruptor: public EdgeProcessingAlgorithm<Graph> {
+ typedef EdgeProcessingAlgorithm<Graph> base;
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+ EdgeRemover<Graph> edge_remover_;
+protected:
+
+ bool ProcessEdge(EdgeId e) override {
+ if (e == this->g().conjugate(e)) {
+ TRACE("Disrupting self-conjugate edge " << this->g().str(e));
+ EdgeId to_del = e;
+ size_t len = this->g().length(e);
+ if (len > 1) {
+ to_del = this->g().SplitEdge(e, len / 2).second;
+ }
+ edge_remover_.DeleteEdge(to_del);
+ return true;
+ }
+ return false;
+ }
+
+public:
+ SelfConjugateDisruptor(Graph& g,
+ std::function<void(EdgeId)> removal_handler = 0)
+ : base(g, true), edge_remover_(g, removal_handler) {
+ }
+
+private:
+ DECL_LOGGER("SelfConjugateDisruptor");
+};
+}
diff --git a/src/modules/algorithms/simplification/mf_ec_remover.hpp b/src/modules/algorithms/simplification/mf_ec_remover.hpp
new file mode 100644
index 0000000..08a7270
--- /dev/null
+++ b/src/modules/algorithms/simplification/mf_ec_remover.hpp
@@ -0,0 +1,514 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include <map>
+#include <queue>
+
+#include "assembly_graph/components/splitters.hpp"
+#include "cleaner.hpp"
+#include "assembly_graph/graph_support/graph_processing_algorithm.hpp"
+
+namespace omnigraph {
+
+using std::set;
+using std::map;
+using std::vector;
+using std::pair;
+using std::queue;
+using std::make_pair;
+
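+/**
+* FlowGraph is a small integer-capacity flow network built over a subset of assembly graph vertices,
+* with an explicit artificial source and sink. Capacities are stored in a nested map and PushFlow
+* updates residual capacities along a path. It is used below by MaxFlowECRemover.
+*/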
+template<class Graph>
+class FlowGraph {
+public:
+ typedef size_t FlowVertexId;
+ typedef pair<FlowVertexId, FlowVertexId> FlowEdgeId;
+
+private:
+ typedef typename Graph::VertexId OuterVertexId;
+ map<OuterVertexId, FlowVertexId> vertex_mapping_;
+ map<FlowVertexId, map<FlowVertexId, int>> capacities_;
+ set<FlowVertexId> vertices_;
+ size_t vertex_number_;
+ FlowVertexId source_;
+ FlowVertexId sink_;
+
+ FlowVertexId AddVertex() {
+ vertices_.insert(vertex_number_);
+ capacities_[vertex_number_];
+ vertex_number_++;
+ return vertex_number_ - 1;
+ }
+
+ void PushFlow(FlowEdgeId edge, int capacity) {
+ VERIFY(capacities_[EdgeStart(edge)][EdgeEnd(edge)] >= capacity);
+ capacities_[EdgeStart(edge)][EdgeEnd(edge)] -= capacity;
+ capacities_[EdgeEnd(edge)][EdgeStart(edge)] += capacity;
+ }
+
+ void AddEdge(FlowVertexId first, FlowVertexId second, int capacity = 10000) {
+        capacities_[first][second] += capacity; // operator[] creates an entry with default values when the key is not yet present
+ capacities_[second][first] += 0;
+ }
+
+public:
+ FlowGraph() :
+ vertex_number_(0), source_(AddVertex()), sink_(AddVertex()) {
+ }
+
+ FlowVertexId GetCorrespondingVertex(OuterVertexId v) const {
+ return vertex_mapping_.find(v)->second;
+ }
+
+ bool HasCorrespondingVertex(OuterVertexId v) const {
+        return vertex_mapping_.find(v) != vertex_mapping_.end();
+ }
+
+ FlowVertexId AddVertex(OuterVertexId vertex) {
+ FlowVertexId new_vertex = AddVertex();
+ vertex_mapping_[vertex] = new_vertex;
+ return new_vertex;
+ }
+
+ void AddEdge(OuterVertexId outer_first, OuterVertexId outer_second,
+ int capacity = 10000) {
+ VERIFY(
+ vertex_mapping_.find(outer_first) != vertex_mapping_.end()
+ && vertex_mapping_.find(outer_second)
+ != vertex_mapping_.end());
+ FlowVertexId first = vertex_mapping_[outer_first];
+ FlowVertexId second = vertex_mapping_[outer_second];
+ AddEdge(first, second, capacity);
+ }
+
+ void AddSource(OuterVertexId vertex, int capacity) {
+ AddEdge(source_, GetCorrespondingVertex(vertex), capacity);
+ }
+
+ void AddSink(OuterVertexId vertex, int capacity) {
+ AddEdge(GetCorrespondingVertex(vertex), sink_, capacity);
+ }
+
+ FlowVertexId Source() const {
+ return source_;
+ }
+
+ FlowVertexId Sink() const {
+ return sink_;
+ }
+
+ bool Connected(FlowVertexId start, FlowVertexId end) const {
+ return capacities_.find(start) != capacities_.end()
+ && capacities_.find(start)->second.find(end)
+ != capacities_.find(start)->second.end()
+ && capacities_.find(start)->second.find(end)->second > 0;
+ }
+
+ vector<FlowEdgeId> OutgoingEdges(FlowVertexId v) const {
+ vector<FlowEdgeId> result;
+ const map<FlowVertexId, int> &outgoing = capacities_.find(v)->second;
+ for (auto it = outgoing.begin(); it != outgoing.end(); ++it) {
+ if (it->second > 0) {
+ result.push_back(make_pair(v, it->first));
+ }
+ }
+ return result;
+ }
+
+ vector<FlowEdgeId> IncomingEdges(FlowVertexId v) const {
+ vector<FlowEdgeId> result;
+ const map<FlowVertexId, int> &outgoing = capacities_.find(v)->second;
+ for (auto it = outgoing.begin(); it != outgoing.end(); ++it) {
+ if (Connected(it->first, v)) {
+ result.push_back(make_pair(it->first, v));
+ }
+ }
+ return result;
+ }
+
+ size_t OutgoingEdgesCount(FlowVertexId v) const {
+ return OutgoingEdges(v).size();
+ }
+
+ size_t IncomingEdgesCount(FlowVertexId v) const {
+ return IncomingEdges(v).size();
+ }
+
+ FlowVertexId EdgeStart(FlowEdgeId edge) const {
+ return edge.first;
+ }
+
+ FlowVertexId EdgeEnd(FlowEdgeId edge) const {
+ return edge.second;
+ }
+
+ set<FlowVertexId>::iterator begin() const {
+ return vertices_.begin();
+ }
+
+ set<FlowVertexId>::iterator end() const {
+ return vertices_.end();
+ }
+
+ int GetCapacity(FlowVertexId first, FlowVertexId second) const {
+ auto it1 = capacities_.find(first);
+ if (it1 == capacities_.end())
+ return 0;
+ auto it2 = it1->second.find(second);
+ if (it2 == it1->second.end())
+ return 0;
+ return it2->second;
+ }
+
+ void PushFlow(vector<FlowVertexId> path, int capacity) {
+ size_t n = path.size();
+ VERIFY(path[0] == source_ && path[n - 1] == sink_);
+ for (size_t i = 0; i + 1 < n; i++) {
+ PushFlow(make_pair(path[i], path[i + 1]), capacity);
+ }
+ }
+
+// void Print() const {
+// for(auto it = vertex_mapping_.begin(); it != vertex_mapping_.end(); ++it) {
+// TRACE(it->first << " " << it->second);
+// }
+// for(auto it = vertices_.begin(); it != vertices_.end();) {
+// auto out = OutgoingEdges(*it);
+// for(auto it1 = out.begin(); it1 != out.end(); ++it1) {
+// TRACE("edge " << (*it1) << " " << GetCapacity(*it, it1->second));
+// }
+// ++it;
+// if(it == vertices_.end())
+// break;
+// }
+// }
+};
+
+template<class Graph>
+class BFS {
+private:
+ const Graph &graph_;
+ typedef typename Graph::FlowVertexId FlowVertexId;
+ typedef typename Graph::FlowEdgeId FlowEdgeId;
+
+ vector<FlowVertexId> RestoreAnswer(FlowVertexId start, FlowVertexId end,
+ const map<FlowVertexId, FlowVertexId> &prev) {
+ vector<FlowVertexId> result;
+ result.push_back(end);
+ FlowVertexId current = end;
+ while (current != start) {
+ current = prev.find(current)->second;
+ result.push_back(current);
+ }
+ return vector<FlowVertexId>(result.rbegin(), result.rend());
+ }
+
+public:
+ BFS(const Graph &graph) :
+ graph_(graph) {
+ }
+
+ vector<FlowVertexId> Go(FlowVertexId start, FlowVertexId finish) {
+ queue<FlowVertexId> q;
+ q.push(start);
+ map<FlowVertexId, FlowVertexId> prev;
+ prev[start] = start;
+ while (!q.empty()) {
+ FlowVertexId current = q.front();
+ q.pop();
+ vector<FlowEdgeId> outgoing = graph_.OutgoingEdges(current);
+ for (auto it = outgoing.begin(); it != outgoing.end(); ++it) {
+ if (prev.find(it->second) == prev.end()) {
+ q.push(it->second);
+ prev[it->second] = current;
+ }
+ if (it->second == finish) {
+ return RestoreAnswer(start, finish, prev);
+ }
+ }
+ }
+ return vector<FlowVertexId>();
+ }
+};
+
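+/**
+* MaxFlowFinder computes a maximum flow by repeatedly finding BFS augmenting paths from source to
+* sink and pushing the minimum residual capacity along them (Edmonds-Karp style).
+*/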
+template<class Graph>
+class MaxFlowFinder {
+private:
+ FlowGraph<Graph> &graph_;
+ typedef typename FlowGraph<Graph>::FlowVertexId FlowVertexId;
+ typedef typename FlowGraph<Graph>::FlowEdgeId FlowEdgeId;
+
+ int MinCapacity(vector<FlowVertexId> path) {
+ VERIFY(path.size() >= 2);
+ int result = graph_.GetCapacity(path[0], path[1]);
+ for (size_t i = 1; i + 1 < path.size(); i++) {
+ result = std::min(result, graph_.GetCapacity(path[i], path[i + 1]));
+ }
+ return result;
+ }
+
+public:
+ MaxFlowFinder(FlowGraph<Graph> &graph) :
+ graph_(graph) {
+ }
+
+ void Find() {
+ BFS<FlowGraph<Graph> > bfs(graph_);
+ while (true) {
+ vector<FlowVertexId> path = bfs.Go(graph_.Source(), graph_.Sink());
+ if (path.size() == 0)
+ break;
+ int capacity = MinCapacity(path);
+ VERIFY(capacity > 0);
+ graph_.PushFlow(path, capacity);
+// graph_.Print();
+ }
+ }
+};
+
+template<class Graph>
+class TopSorter {
+private:
+ typedef typename Graph::FlowVertexId FlowVertexId;
+ typedef typename Graph::FlowEdgeId FlowEdgeId;
+ const Graph &graph_;
+
+ void Find(FlowVertexId v, vector<FlowVertexId> &result, set<FlowVertexId> &visited) {
+ visited.insert(v);
+ vector<FlowEdgeId> outgoing = graph_.OutgoingEdges(v);
+ for (auto it = outgoing.begin(); it != outgoing.end(); ++it) {
+ FlowVertexId next = graph_.EdgeEnd(*it);
+ if (visited.count(next) == 0) {
+ Find(next, result, visited);
+ }
+ }
+ result.push_back(v);
+ }
+
+public:
+ TopSorter(const Graph &graph) :
+ graph_(graph) {
+ }
+
+ vector<FlowVertexId> Sort() {
+ vector<FlowVertexId> result;
+ set<FlowVertexId> visited;
+ for (auto it = graph_.begin(); it != graph_.end(); ++it) {
+ if (visited.count(*it) == 0) {
+ Find(*it, result, visited);
+ }
+ }
+ return result;
+ }
+};
+
+template<class Graph>
+class ReverseDFSComponentFinder {
+private:
+ typedef typename Graph::FlowVertexId FlowVertexId;
+ typedef typename Graph::FlowEdgeId FlowEdgeId;
+
+ const Graph &graph_;
+
+ void Find(FlowVertexId v, map<FlowVertexId, size_t> &result, size_t cc) {
+ result[v] = cc;
+ vector<FlowEdgeId> incoming = graph_.IncomingEdges(v);
+ for (auto it = incoming.begin(); it != incoming.end(); ++it) {
+ FlowVertexId next = graph_.EdgeStart(*it);
+ if (result.count(next) == 0) {
+ Find(next, result, cc);
+ }
+ }
+ }
+public:
+ ReverseDFSComponentFinder(const Graph &graph) :
+ graph_(graph) {
+ }
+
+ map<FlowVertexId, size_t> Find(const vector<FlowVertexId> &order) {
+ size_t cc = 0;
+ map<FlowVertexId, size_t> result;
+ for (auto it = order.rbegin(); it != order.rend(); ++it) {
+ if (result.count(*it) == 0) {
+ Find(*it, result, cc);
+ cc++;
+ }
+ }
+ return result;
+ }
+};
+
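+/**
+* StroglyConnectedComponentFinder colours strongly connected components of the flow graph using the
+* two-pass Kosaraju-style scheme: vertices are ordered by DFS post-order (TopSorter) and then a
+* reverse DFS (ReverseDFSComponentFinder) assigns component numbers in that order.
+*/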
+template<class Graph>
+class StroglyConnectedComponentFinder {
+private:
+ typedef typename Graph::FlowVertexId FlowVertexId;
+ const Graph &graph_;
+ bool ready_;
+public:
+ StroglyConnectedComponentFinder(const Graph &graph) :
+ graph_(graph), ready_(false) {
+ }
+
+ map<FlowVertexId, size_t> ColourComponents() {
+ map<FlowVertexId, size_t> result;
+ vector<FlowVertexId> order = TopSorter<Graph>(graph_).Sort();
+ return ReverseDFSComponentFinder<Graph>(graph_).Find(order);
+ }
+};
+
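+/**
+* MaxFlowECRemover removes suspicious short edges inside components bounded by long (unique) edges.
+* For every such component a FlowGraph is built: plausible or unique edges entering the component
+* add unit-capacity sources, edges leaving it add unit-capacity sinks, and short inner edges become
+* high-capacity arcs. If the maximum flow saturates all sources and sinks, short edges whose
+* endpoints end up in different strongly connected components of the residual network are deleted.
+*/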
+template<class Graph>
+class MaxFlowECRemover {
+private:
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+ Graph& g_;
+ size_t max_length_;
+ size_t uniqueness_length_;
+ size_t plausibility_length_;
+ ComponentRemover<Graph> component_remover_;
+
+ bool IsTerminal(VertexId vertex) {
+ return g_.OutgoingEdgeCount(vertex)
+ + g_.IncomingEdgeCount(vertex) == 1;
+ }
+
+ bool IsTip(EdgeId edge) {
+ VertexId start = g_.EdgeStart(edge);
+ VertexId end = g_.EdgeEnd(edge);
+ return IsTerminal(start) || IsTerminal(end);
+ }
+
+
+ bool IsSuspicious(EdgeId edge) {
+ return g_.length(edge) <= max_length_ && !IsTip(edge);
+ }
+
+ set<EdgeId> CollectUnusedEdges(set<VertexId> component, FlowGraph<Graph> fg,
+ const map<typename FlowGraph<Graph>::FlowVertexId, size_t> &colouring) {
+ set<EdgeId> result;
+ for (auto it_start = component.begin(); it_start != component.end();
+ ++it_start) {
+ VertexId start = *it_start;
+ auto outgoing = g_.OutgoingEdges(start);
+ for (auto it_edge = outgoing.begin(); it_edge != outgoing.end();
+ ++it_edge) {
+ EdgeId edge = *it_edge;
+ VertexId end = g_.EdgeEnd(edge);
+ if (component.count(end) == 1 && IsSuspicious(edge)
+ && colouring.find(fg.GetCorrespondingVertex(start))->second
+ != colouring.find(
+ fg.GetCorrespondingVertex(end))->second) {
+ result.insert(edge);
+ }
+ }
+ }
+ return result;
+ }
+
+ bool CheckCompleteFlow(FlowGraph<Graph> &fg) {
+ return fg.OutgoingEdges(fg.Source()).size() == 0
+ && fg.IncomingEdges(fg.Sink()).size() == 0;
+ }
+
+ bool IsPlausible(EdgeId edge) {
+ return g_.length(edge) >= plausibility_length_ && !IsTip(edge);
+ }
+
+ bool IsUnique(EdgeId edge) {
+ return g_.length(edge) >= uniqueness_length_;
+ }
+
+ bool IsInnerShortEdge(set<VertexId> component, EdgeId edge) {
+ return !IsUnique(edge) && component.count(g_.EdgeStart(edge)) == 1
+ && component.count(g_.EdgeEnd(edge)) == 1;
+ }
+
+ void ProcessShortEdge(FlowGraph<Graph> &fg, set<VertexId> component,
+ EdgeId edge) {
+ if (IsInnerShortEdge(component, edge)) {
+ fg.AddEdge(g_.EdgeStart(edge), g_.EdgeEnd(edge));
+ }
+ }
+
+ void ProcessSource(FlowGraph<Graph> &fg, set<VertexId> /*component*/,
+ EdgeId edge) {
+ if (IsPlausible(edge) || IsUnique(edge)) {
+ fg.AddSource(g_.EdgeEnd(edge), 1);
+ }
+ }
+
+ void ProcessSink(FlowGraph<Graph> &fg, set<VertexId> /*component*/,
+ EdgeId edge) {
+ if (IsPlausible(edge) || IsUnique(edge)) {
+ fg.AddSink(g_.EdgeStart(edge), 1);
+ }
+ }
+
+ void ConstructFlowGraph(FlowGraph<Graph> &fg, set<VertexId> component) {
+ for (auto it = component.begin(); it != component.end(); ++it) {
+ fg.AddVertex(*it);
+ }
+ for (auto it = component.begin(); it != component.end(); ++it) {
+ VertexId vertex = *it;
+ auto outgoing = g_.OutgoingEdges(vertex);
+ for (auto it_edge = outgoing.begin(); it_edge != outgoing.end();
+ ++it_edge) {
+ EdgeId edge = *it_edge;
+ ProcessShortEdge(fg, component, edge);
+ ProcessSink(fg, component, edge);
+ }
+ auto incoming = g_.IncomingEdges(vertex);
+ for (auto it_edge = incoming.begin(); it_edge != incoming.end();
+ ++it_edge) {
+ EdgeId edge = *it_edge;
+ ProcessSource(fg, component, edge);
+ }
+ }
+ }
+
+public:
+ MaxFlowECRemover(Graph& g, size_t max_length, size_t uniqueness_length,
+ size_t plausibility_length, std::function<void (EdgeId)>
+ /*fixme ignored, fix after merge with relative coverage branch!!! removal_handler*/) :
+ g_(g), max_length_(max_length), uniqueness_length_(
+ uniqueness_length), plausibility_length_(
+ plausibility_length), component_remover_(g, (std::function<void (set<EdgeId>)>) 0) {
+ VERIFY(uniqueness_length >= plausibility_length);
+ VERIFY(plausibility_length > max_length);
+ }
+
+ bool Process() {
+ for (shared_ptr<GraphSplitter<Graph>> splitter_ptr = LongEdgesExclusiveSplitter<Graph>(g_,
+ uniqueness_length_); splitter_ptr->HasNext();) {
+ set<VertexId> component = splitter_ptr->Next().vertices();
+ FlowGraph<Graph> fg;
+ ConstructFlowGraph(fg, component);
+// fg.Print();
+ MaxFlowFinder<Graph> mf_finder(fg);
+ mf_finder.Find();
+ if (!CheckCompleteFlow(fg)) {
+ TRACE("Suspicious component! No edge delition!");
+ continue;
+ }
+ StroglyConnectedComponentFinder<FlowGraph<Graph>> component_finder(
+ fg);
+ map<typename FlowGraph<Graph>::FlowVertexId, size_t> colouring =
+ component_finder.ColourComponents();
+ set<EdgeId> to_remove = CollectUnusedEdges(component, fg,
+ colouring);
+ component_remover_.DeleteComponent(to_remove.begin(), to_remove.end(), false);
+ }
+ CompressAllVertices(g_);
+ Cleaner<Graph>(g_).Run();
+
+ return false;
+ }
+private:
+ DECL_LOGGER("MaxFlowECRemover");
+};
+}
diff --git a/src/modules/algorithms/simplification/parallel_simplification_algorithms.hpp b/src/modules/algorithms/simplification/parallel_simplification_algorithms.hpp
new file mode 100644
index 0000000..bea146c
--- /dev/null
+++ b/src/modules/algorithms/simplification/parallel_simplification_algorithms.hpp
@@ -0,0 +1,820 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "cleaner.hpp"
+#include "bulge_remover.hpp"
+#include "dev_support/standard_base.hpp"
+#include "assembly_graph/graph_support/graph_processing_algorithm.hpp"
+#include "assembly_graph/graph_support/basic_edge_conditions.hpp"
+#include "assembly_graph/graph_core/construction_helper.hpp"
+#include "assembly_graph/graph_support/marks_and_locks.hpp"
+#include "compressor.hpp"
+
+namespace debruijn {
+
+namespace simplification {
+
+// bool EnableParallel() {
+// if (simplif_cfg_.presimp.parallel) {
+// INFO("Trying to enable parallel presimplification.");
+// if (gp_.g.AllHandlersThreadSafe()) {
+// return true;
+// } else {
+// WARN("Not all handlers are threadsafe, switching to non-parallel presimplif");
+// //gp.g.PrintHandlersNames();
+// }
+// }
+// return false;
+// }
+
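+/**
+* ParallelTipClippingFunctor clips incoming tips (short, low-covered dead-end edges) around a vertex.
+* Per-vertex locks make degree counting thread-safe; if all incoming edges of a vertex are tips,
+* the longest one is kept.
+*/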
+template<class Graph>
+class ParallelTipClippingFunctor {
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+ typedef std::function<void(EdgeId)> HandlerF;
+ typedef omnigraph::GraphElementLock<VertexId> VertexLockT;
+
+ Graph& g_;
+ size_t length_bound_;
+ double coverage_bound_;
+ HandlerF handler_f_;
+
+ size_t LockingIncomingCount(VertexId v) const {
+ VertexLockT lock(v);
+ return g_.IncomingEdgeCount(v);
+ }
+
+ size_t LockingOutgoingCount(VertexId v) const {
+ VertexLockT lock(v);
+ return g_.OutgoingEdgeCount(v);
+ }
+
+ bool IsIncomingTip(EdgeId e) const {
+ return g_.length(e) <= length_bound_ && math::le(g_.coverage(e), coverage_bound_)
+ && LockingIncomingCount(g_.EdgeStart(e)) + LockingOutgoingCount(g_.EdgeStart(e)) == 1;
+ }
+
+ void RemoveEdge(EdgeId e) {
+ //even full tip locking can't lead to deadlock
+ VertexLockT lock1(g_.EdgeStart(e));
+ VertexLockT lock2(g_.EdgeEnd(e));
+ g_.DeleteEdge(e);
+ }
+
+public:
+
+ ParallelTipClippingFunctor(Graph& g, size_t length_bound, double coverage_bound, HandlerF handler_f = 0)
+ : g_(g),
+ length_bound_(length_bound),
+ coverage_bound_(coverage_bound),
+ handler_f_(handler_f) {
+
+ }
+
+ bool Process(VertexId v) {
+ if (LockingOutgoingCount(v) == 0)
+ return false;
+
+ vector<EdgeId> tips;
+ //don't need lock here after the previous check
+ for (EdgeId e : g_.IncomingEdges(v)) {
+ if (IsIncomingTip(e)) {
+ tips.push_back(e);
+ }
+ }
+
+        //if all edges are tips, leave the longest one
+ if (!tips.empty() && tips.size() == g_.IncomingEdgeCount(v)) {
+ sort(tips.begin(), tips.end(), omnigraph::LengthComparator<Graph>(g_));
+ tips.pop_back();
+ }
+
+ for (EdgeId e : tips) {
+ if (handler_f_) {
+ handler_f_(e);
+ }
+ //don't need any synchronization here!
+ RemoveEdge(e);
+ }
+ return false;
+ }
+
+ bool ShouldFilterConjugate() const {
+ return false;
+ }
+};
+
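+/**
+* ParallelSimpleBRFunctor removes simple parallel bulges: a short, low-covered edge is glued onto an
+* alternative edge with the same endpoints, similar length and comparable or higher coverage
+* (controlled by max_relative_coverage). Vertex locks guard the neighbourhood while it is processed.
+*/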
+template<class Graph>
+class ParallelSimpleBRFunctor {
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+ typedef omnigraph::GraphElementLock<VertexId> VertexLockT;
+
+ Graph& g_;
+ size_t max_length_;
+ double max_coverage_;
+ double max_relative_coverage_;
+ size_t max_delta_;
+ double max_relative_delta_;
+ std::function<void(EdgeId)> handler_f_;
+
+ bool LengthDiffCheck(size_t l1, size_t l2, size_t delta) const {
+ return l1 <= l2 + delta && l2 <= l1 + delta;
+ }
+
+ EdgeId Alternative(EdgeId e, const vector<EdgeId>& edges) const {
+ size_t delta = omnigraph::CountMaxDifference(max_delta_, g_.length(e), max_relative_delta_);
+ for (auto it = edges.rbegin(); it != edges.rend(); ++it) {
+ EdgeId candidate = *it;
+ if (g_.EdgeEnd(candidate) == g_.EdgeEnd(e) && candidate != e && candidate != g_.conjugate(e)
+ && LengthDiffCheck(g_.length(candidate), g_.length(e), delta)) {
+ return candidate;
+ }
+ }
+ return EdgeId(0);
+ }
+
+ bool ProcessEdges(const vector<EdgeId>& edges) {
+ for (EdgeId e : edges) {
+ if (g_.length(e) <= max_length_ && math::le(g_.coverage(e), max_coverage_)) {
+ EdgeId alt = Alternative(e, edges);
+ if (alt != EdgeId(0) && math::ge(g_.coverage(alt) * max_relative_coverage_, g_.coverage(e))) {
+                    //todo does not work with multiple threads for now :)
+ //Reasons: id distribution, kmer-mapping
+ handler_f_(e);
+ g_.GlueEdges(e, alt);
+ return true;
+ }
+ }
+ }
+ return false;
+ }
+
+ vector<VertexId> MultiEdgeDestinations(VertexId v) const {
+ vector<VertexId> answer;
+ set<VertexId> destinations;
+ for (EdgeId e : g_.OutgoingEdges(v)) {
+ VertexId end = g_.EdgeEnd(e);
+ if (destinations.count(end) > 0) {
+ answer.push_back(end);
+ }
+ destinations.insert(end);
+ }
+ return answer;
+ }
+
+ VertexId SingleMultiEdgeDestination(VertexId v) const {
+ vector<VertexId> dests = MultiEdgeDestinations(v);
+ if (dests.size() == 1) {
+ return dests.front();
+ } else {
+ return VertexId(0);
+ }
+ }
+
+ void RemoveBulges(VertexId v) {
+ bool flag = true;
+ while (flag) {
+ vector<EdgeId> edges(g_.out_begin(v), g_.out_end(v));
+ if (edges.size() == 1)
+ return;
+ sort(edges.begin(), edges.end(), omnigraph::CoverageComparator<Graph>(g_));
+ flag = ProcessEdges(edges);
+ }
+ }
+
+ bool CheckVertex(VertexId v) const {
+ VertexLockT lock(v);
+ return MultiEdgeDestinations(v).size() == 1 && MultiEdgeDestinations(g_.conjugate(v)).size() == 0;
+ }
+
+ size_t MinId(VertexId v) const {
+ return std::min(v.int_id(), g_.conjugate(v).int_id());
+ }
+
+ bool IsMinimal(VertexId v1, VertexId v2) const {
+ return MinId(v1) < MinId(v2);
+ }
+
+public:
+
+ ParallelSimpleBRFunctor(Graph& g, size_t max_length, double max_coverage, double max_relative_coverage, size_t max_delta, double max_relative_delta,
+ std::function<void(EdgeId)> handler_f = 0)
+ : g_(g),
+ max_length_(max_length),
+ max_coverage_(max_coverage),
+ max_relative_coverage_(max_relative_coverage),
+ max_delta_(max_delta),
+ max_relative_delta_(max_relative_delta),
+ handler_f_(handler_f) {
+
+ }
+
+ bool operator()(VertexId v/*, need number of vertex for stable id distribution*/) {
+ vector<VertexId> multi_dest;
+
+ {
+ VertexLockT lock(v);
+ multi_dest = MultiEdgeDestinations(v);
+ }
+
+ if (multi_dest.size() == 1 && IsMinimal(v, multi_dest.front())) {
+ VertexId dest = multi_dest.front();
+ if (CheckVertex(v) && CheckVertex(g_.conjugate(dest))) {
+ VertexLockT lock1(v);
+ VertexLockT lock2(dest);
+ RemoveBulges(v);
+ }
+ }
+ return false;
+ }
+
+ bool ShouldFilterConjugate() const {
+ return false;
+ }
+};
+
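+/**
+* CriticalEdgeMarker marks, for every vertex, its highest-covered outgoing edge. Marked edges are
+* skipped by ParallelLowCoverageFunctor (see IsOfInterest below), so at least one outgoing edge per
+* vertex is protected from removal.
+*/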
+template<class Graph>
+class CriticalEdgeMarker {
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+ typedef std::function<void(EdgeId)> HandlerF;
+
+ Graph& g_;
+ size_t chunk_cnt_;
+ omnigraph::GraphElementMarker<EdgeId> edge_marker_;
+
+ void ProcessVertex(VertexId v) {
+ if (g_.OutgoingEdgeCount(v) > 0) {
+ auto max_cov_it =
+ std::max_element(g_.out_begin(v), g_.out_end(v), omnigraph::CoverageComparator<Graph>(g_));
+ DEBUG("Marking edge " << g_.str(*max_cov_it));
+ edge_marker_.mark(*max_cov_it);
+ }
+ }
+
+ template<class It>
+ void ProcessVertices(It begin, It end) {
+ for (auto it = begin; !(it == end); ++it) {
+ ProcessVertex(*it);
+ }
+ }
+
+public:
+
+ CriticalEdgeMarker(Graph& g, size_t chunk_cnt) : g_(g), chunk_cnt_(chunk_cnt) {
+ }
+
+ void PutMarks() {
+ auto chunk_iterators = omnigraph::IterationHelper<Graph, VertexId>(g_).Chunks(chunk_cnt_);
+
+ #pragma omp parallel for schedule(guided)
+ for (size_t i = 0; i < chunk_iterators.size() - 1; ++i) {
+ ProcessVertices(chunk_iterators[i], chunk_iterators[i + 1]);
+ }
+ }
+
+ void ClearMarks() {
+ auto chunk_iterators = omnigraph::IterationHelper<Graph, EdgeId>(g_).Chunks(chunk_cnt_);
+
+ #pragma omp parallel for schedule(guided)
+ for (size_t i = 0; i < chunk_iterators.size() - 1; ++i) {
+ for (auto it = chunk_iterators[i]; it != chunk_iterators[i + 1]; ++ it) {
+ edge_marker_.unmark(*it);
+ }
+ }
+ }
+private:
+ DECL_LOGGER("CriticalEdgeMarker");
+};
+
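+/**
+* ParallelLowCoverageFunctor removes short, low-covered edges that have alternatives at both ends
+* (classic erroneous connections), skipping edges marked as critical. Edges are unlinked under
+* vertex locks so that removal can run over parallel chunks of conjugate-filtered edges.
+*/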
+template<class Graph>
+class ParallelLowCoverageFunctor {
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+ typedef std::function<void(EdgeId)> HandlerF;
+ typedef omnigraph::GraphElementLock<VertexId> VertexLockT;
+
+ Graph& g_;
+ typename Graph::HelperT helper_;
+ pred::TypedPredicate<EdgeId> ec_condition_;
+ HandlerF handler_f_;
+
+ omnigraph::GraphElementMarker<EdgeId> edge_marker_;
+ vector<EdgeId> edges_to_remove_;
+
+ void UnlinkEdgeFromStart(EdgeId e) {
+ VertexId start = g_.EdgeStart(e);
+ VertexLockT lock(start);
+ helper_.DeleteLink(start, e);
+ }
+
+ void UnlinkEdge(EdgeId e) {
+ UnlinkEdgeFromStart(e);
+ if (g_.conjugate(e) != e)
+ UnlinkEdgeFromStart(g_.conjugate(e));
+ }
+
+public:
+
+ //should be launched with conjugate copies filtered
+ ParallelLowCoverageFunctor(Graph& g, size_t max_length, double max_coverage, HandlerF handler_f = 0)
+ : g_(g),
+ helper_(g_.GetConstructionHelper()),
+ ec_condition_(pred::And(pred::And(omnigraph::LengthUpperBound<Graph>(g, max_length),
+ omnigraph::CoverageUpperBound<Graph>(g, max_coverage)),
+ omnigraph::AlternativesPresenceCondition<Graph>(g))),
+ handler_f_(handler_f) {}
+
+ bool IsOfInterest(EdgeId e) const {
+ return !edge_marker_.is_marked(e) && ec_condition_(e);
+ }
+
+ void PrepareForProcessing(size_t /*interesting_cnt*/) {
+ }
+
+ //no conjugate copies here!
+ bool Process(EdgeId e, size_t /*idx*/) {
+ if (handler_f_)
+ handler_f_(e);
+ DEBUG("Removing edge " << g_.str(e));
+ g_.FireDeleteEdge(e);
+ UnlinkEdge(e);
+ helper_.DeleteUnlinkedEdge(e);
+ return true;
+ }
+
+ bool ShouldFilterConjugate() const {
+ return true;
+ }
+// bool operator()(EdgeId e) {
+// if (ec_condition_->Check(e)) {
+// edges_to_remove_.push_back(e);
+// }
+// return false;
+// }
+//
+// void RemoveCollectedEdges() {
+// omnigraph::SmartSetIterator<Graph, EdgeId> to_delete(g_, edges_to_remove_.begin(), edges_to_remove_.end());
+// while (!to_delete.IsEnd()) {
+// EdgeId e = *to_delete;
+// handler_f_(e);
+// g_.DeleteEdge(e);
+// ++to_delete;
+// }
+// }
+private:
+ DECL_LOGGER("ParallelLowCoverageFunctor");
+};
+
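+//Parallel compression of non-branching paths into single edges; processing starts from the
+//"leftmost" vertex of each chain (see IsOfInterest), with new edge ids reserved in PrepareForProcessing.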
+template<class Graph>
+class ParallelCompressor {
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::EdgeData EdgeData;
+ typedef typename Graph::VertexId VertexId;
+ typedef omnigraph::GraphElementLock<VertexId> VertexLockT;
+
+ Graph& g_;
+ typename Graph::HelperT helper_;
+ restricted::IdSegmentStorage segment_storage_;
+
+ bool IsBranching(VertexId v) const {
+// VertexLockT lock(v);
+ return !g_.CheckUniqueOutgoingEdge(v) || !g_.CheckUniqueIncomingEdge(v);
+ }
+
+ size_t LockingIncomingCount(VertexId v) const {
+ VertexLockT lock(v);
+ return g_.IncomingEdgeCount(v);
+ }
+
+ size_t LockingOutgoingCount(VertexId v) const {
+ VertexLockT lock(v);
+ return g_.OutgoingEdgeCount(v);
+ }
+
+ vector<VertexId> LockingNextVertices(VertexId v) const {
+ VertexLockT lock(v);
+ vector<VertexId> answer;
+ for (EdgeId e : g_.OutgoingEdges(v)) {
+ answer.push_back(g_.EdgeEnd(e));
+ }
+ return answer;
+ }
+
+ vector<VertexId> FilterBranchingVertices(const vector<VertexId>& vertices) const {
+ vector<VertexId> answer;
+ for (VertexId v : vertices) {
+ VertexLockT lock(v);
+ if (!IsBranching(v)) {
+ answer.push_back(v);
+ }
+ }
+ return answer;
+ }
+
+ //correctly handles self-conjugate case
+ bool IsMinimal(VertexId v1, VertexId v2) const {
+ return !(g_.conjugate(v2) < v1);
+ }
+
+    //returns true if we need to go further, false if we should stop for any reason!
+    //to_compress is non-empty only if compression needs to be done
+    //no additional checks needed for v == init | conjugate(init), because init is branching!
+ //fixme what about plasmids?! =)
+ bool ProcessNextAndGo(VertexId& v, VertexId init, vector<VertexId>& to_compress) {
+ VertexLockT lock(v);
+ if (!CheckConsistent(v)) {
+ to_compress.clear();
+ return false;
+ }
+ if (IsBranching(v)) {
+ if (!IsMinimal(init, v)) {
+ to_compress.clear();
+ }
+ return false;
+ } else {
+ to_compress.push_back(v);
+ v = g_.EdgeEnd(g_.GetUniqueOutgoingEdge(v));
+ return true;
+ }
+ }
+
+ void UnlinkEdge(VertexId v, EdgeId e) {
+ VertexLockT lock(v);
+ helper_.DeleteLink(v, e);
+ }
+
+ void UnlinkEdges(VertexId v) {
+ VertexLockT lock(v);
+ helper_.DeleteLink(v, g_.GetUniqueOutgoingEdge(v));
+ helper_.DeleteLink(g_.conjugate(v), g_.GetUniqueOutgoingEdge(g_.conjugate(v)));
+ }
+
+ //fixme duplication with abstract conj graph
+ //not locking!
+ vector<EdgeId> EdgesToDelete(const vector<EdgeId> &path) const {
+ set<EdgeId> edgesToDelete;
+ edgesToDelete.insert(path[0]);
+ for (size_t i = 0; i + 1 < path.size(); i++) {
+ EdgeId e = path[i + 1];
+ if (edgesToDelete.find(g_.conjugate(e)) == edgesToDelete.end())
+ edgesToDelete.insert(e);
+ }
+ return vector<EdgeId>(edgesToDelete.begin(), edgesToDelete.end());
+ }
+
+ //not locking!
+ //fixme duplication with abstract conj graph
+ vector<VertexId> VerticesToDelete(const vector<EdgeId> &path) const {
+ set<VertexId> verticesToDelete;
+ for (size_t i = 0; i + 1 < path.size(); i++) {
+ EdgeId e = path[i + 1];
+ VertexId v = g_.EdgeStart(e);
+ if (verticesToDelete.find(g_.conjugate(v)) == verticesToDelete.end())
+ verticesToDelete.insert(v);
+ }
+ return vector<VertexId>(verticesToDelete.begin(), verticesToDelete.end());
+ }
+ //todo end duplication with abstract conj graph
+
+ //not locking!
+ vector<EdgeId> CollectEdges(const vector<VertexId>& to_compress) const {
+ vector<EdgeId> answer;
+ answer.push_back(g_.GetUniqueIncomingEdge(to_compress.front()));
+ for (VertexId v : to_compress) {
+ answer.push_back(g_.GetUniqueOutgoingEdge(v));
+ }
+ return answer;
+ }
+
+ void CallHandlers(const vector<EdgeId>& edges, EdgeId new_edge) const {
+ g_.FireMerge(edges, new_edge);
+ g_.FireDeletePath(EdgesToDelete(edges), VerticesToDelete(edges));
+ g_.FireAddEdge(new_edge);
+ }
+
+ EdgeData MergedData(const vector<EdgeId>& edges) const {
+ vector<const EdgeData*> to_merge;
+ for (EdgeId e : edges) {
+ to_merge.push_back(&(g_.data(e)));
+ }
+ return g_.master().MergeData(to_merge);
+ }
+
+ EdgeId SyncAddEdge(VertexId v1, VertexId v2, const EdgeData& data, restricted::IdDistributor& id_distributor) {
+ EdgeId new_edge = helper_.AddEdge(data, id_distributor);
+ {
+ VertexLockT lock(v1);
+ helper_.LinkOutgoingEdge(v1, new_edge);
+ }
+ if (g_.conjugate(new_edge) != new_edge) {
+ VertexLockT lock(v2);
+ helper_.LinkIncomingEdge(v2, new_edge);
+ }
+ return new_edge;
+ }
+
+ void ProcessBranching(VertexId next, VertexId init, size_t idx) {
+ vector<VertexId> to_compress;
+ while (ProcessNextAndGo(next, init, to_compress)) {
+ }
+
+ if (!to_compress.empty()) {
+ //here we are sure that we are the ones to process the path
+ //so we can collect edges without any troubles (and actually without locks todo check!)
+ vector<EdgeId> edges = CollectEdges(to_compress);
+
+ restricted::ListIdDistributor<restricted::SegmentIterator> id_distributor = segment_storage_.GetSegmentIdDistributor(2 * idx, 2 * idx + 1);
+
+ EdgeId new_edge = SyncAddEdge(g_.EdgeStart(edges.front()), g_.EdgeEnd(edges.back()), MergeSequences(g_, edges), id_distributor);
+
+ CallHandlers(edges, new_edge);
+
+ VertexId final = g_.EdgeEnd(edges.back());
+ UnlinkEdge(init, edges.front());
+ for (VertexId v : VerticesToDelete(edges/*to_compress*/)) {
+ UnlinkEdges(v);
+ }
+
+ if (g_.conjugate(new_edge) != new_edge) {
+ UnlinkEdge(g_.conjugate(final), g_.conjugate(edges.back()));
+ }
+
+ for (EdgeId e : EdgesToDelete(edges)) {
+ helper_.DeleteUnlinkedEdge(e);
+ }
+ }
+ }
+
+    //vertex is not consistent if the path has already been compressed or is currently under compression
+ //not needed here, but could check if vertex is fully isolated
+ bool CheckConsistent(VertexId v) const {
+ //todo change to incoming edge count
+ return g_.OutgoingEdgeCount(g_.conjugate(v)) > 0;
+ }
+
+ //long, but safe way to get left neighbour
+ //heavily relies on the current graph structure!
+ VertexId LockingGetInit(VertexId v) {
+ VertexLockT lock(v);
+ if (!CheckConsistent(v))
+ return VertexId(0);
+
+ //works even if this edge is already unlinked from the vertex =)
+ VERIFY(g_.CheckUniqueIncomingEdge(v));
+ return g_.EdgeStart(g_.GetUniqueIncomingEdge(v));
+ }
+
+public:
+
+ ParallelCompressor(Graph& g)
+ : g_(g),
+ helper_(g_.GetConstructionHelper()) {
+
+ }
+
+ //returns true iff v is the "leftmost" vertex to compress in the chain
+ bool IsOfInterest(VertexId v) const {
+ return !IsBranching(v) && IsBranching(g_.EdgeStart(g_.GetUniqueIncomingEdge(v)));
+ }
+
+ void PrepareForProcessing(size_t interesting_cnt) {
+ segment_storage_ = g_.GetGraphIdDistributor().Reserve(interesting_cnt * 2);
+ }
+
+ bool Process(VertexId v, size_t idx) {
+ VertexId init = LockingGetInit(v);
+ if (init != VertexId(0))
+ ProcessBranching(v, init, idx);
+ return false;
+ }
+
+ bool ShouldFilterConjugate() const {
+ return false;
+ }
+
+};
+
+
+//todo add conjugate filtration
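+//Runs a single-pass algorithm over graph elements split into chunks that are processed in parallel with OpenMP.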
+template<class Graph, class ElementType>
+class AlgorithmRunner {
+ const Graph& g_;
+
+ template<class Algo, class It>
+ bool ProcessBucket(Algo& algo, It begin, It end) {
+ bool changed = false;
+ for (auto it = begin; it != end; ++it) {
+ changed |= algo.Process(*it);
+ }
+ return changed;
+ }
+
+public:
+
+ const Graph& g() const {
+ return g_;
+ }
+
+ AlgorithmRunner(Graph& g)
+ : g_(g) {
+
+ }
+
+ template<class Algo, class ItVec>
+ bool RunFromChunkIterators(Algo& algo, const ItVec& chunk_iterators) {
+ DEBUG("Running from " << chunk_iterators.size() - 1 << "chunks");
+ VERIFY(chunk_iterators.size() > 1);
+ bool changed = false;
+ #pragma omp parallel for schedule(guided) reduction(|:changed)
+ for (size_t i = 0; i < chunk_iterators.size() - 1; ++i) {
+ changed |= ProcessBucket(algo, chunk_iterators[i], chunk_iterators[i + 1]);
+ }
+ DEBUG("Finished");
+ return changed;
+ }
+private:
+ DECL_LOGGER("AlgorithmRunner")
+ ;
+};
+
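+//Two-phase runner: first collects elements of interest per chunk in parallel, then processes them with
+//global index offsets (used, e.g., by ParallelCompressor to reserve stable ids in PrepareForProcessing).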
+template<class Graph, class ElementType>
+class TwoStepAlgorithmRunner {
+ typedef typename Graph::VertexId VertexId;
+ typedef typename Graph::EdgeId EdgeId;
+
+ const Graph& g_;
+ const bool filter_conjugate_;
+ std::vector<std::vector<ElementType>> elements_of_interest_;
+
+ template<class Algo>
+ bool ProcessBucket(Algo& algo, const std::vector<ElementType>& bucket, size_t idx_offset) const {
+ bool changed = false;
+ for (ElementType el : bucket) {
+ changed |= algo.Process(el, idx_offset++);
+ }
+ return changed;
+ }
+
+ template<class Algo>
+ bool Process(Algo& algo) const {
+ std::vector<size_t> cumulative_bucket_sizes;
+ cumulative_bucket_sizes.push_back(0);
+ for (const auto& bucket : elements_of_interest_) {
+ cumulative_bucket_sizes.push_back(cumulative_bucket_sizes.back() + bucket.size());
+ }
+ DEBUG("Preparing for processing");
+ algo.PrepareForProcessing(cumulative_bucket_sizes.back());
+ bool changed = false;
+ DEBUG("Processing buckets");
+ #pragma omp parallel for schedule(guided) reduction(|:changed)
+ for (size_t i = 0; i < elements_of_interest_.size(); ++i) {
+ changed |= ProcessBucket(algo, elements_of_interest_[i], cumulative_bucket_sizes[i]);
+ }
+ return changed;
+ }
+
+ template<class Algo>
+ void CountElement(Algo& algo, ElementType el, size_t bucket) {
+ if (filter_conjugate_ && g_.conjugate(el) < el)
+ return;
+ if (algo.IsOfInterest(el)) {
+ TRACE("Element " << g_.str(el) << " is of interest");
+ elements_of_interest_[bucket].push_back(el);
+ } else {
+ TRACE("Element " << g_.str(el) << " is not interesting");
+ }
+ }
+
+ template<class Algo, class It>
+ void CountAll(Algo& algo, It begin, It end, size_t bucket) {
+ for (auto it = begin; !(it == end); ++it) {
+ CountElement(algo, *it, bucket);
+ }
+ }
+
+public:
+
+ const Graph& g() const {
+ return g_;
+ }
+
+ //conjugate elements are filtered based on ids
+    //should be used only if conjugate elements are always either both interesting or both uninteresting
+ //fixme filter_conjugate is redundant
+ TwoStepAlgorithmRunner(Graph& g, bool filter_conjugate)
+ : g_(g),
+ filter_conjugate_(filter_conjugate) {
+
+ }
+
+ template<class Algo, class ItVec>
+ bool RunFromChunkIterators(Algo& algo, const ItVec& chunk_iterators) {
+ DEBUG("Started running from " << chunk_iterators.size() - 1 << " chunks");
+ VERIFY(algo.ShouldFilterConjugate() == filter_conjugate_);
+ VERIFY(chunk_iterators.size() > 1);
+ elements_of_interest_.clear();
+ elements_of_interest_.resize(chunk_iterators.size() - 1);
+ DEBUG("Searching elements of interest");
+ #pragma omp parallel for schedule(guided)
+ for (size_t i = 0; i < chunk_iterators.size() - 1; ++i) {
+ CountAll(algo, chunk_iterators[i], chunk_iterators[i + 1], i);
+ }
+ DEBUG("Processing");
+ return Process(algo);
+ }
+
+// template<class Algo, class It>
+// void RunFromIterator(Algo& algo, It begin, It end) {
+// RunFromChunkIterators(algo, std::vector<It> { begin, end });
+// }
+private:
+ DECL_LOGGER("TwoStepAlgorithmRunner")
+ ;
+};
+
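+//Collects interesting elements from the chunk iterators and then processes them sequentially
+//through a SmartSetIterator.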
+template<class Graph, class ElementType>
+class SemiParallelAlgorithmRunner {
+ typedef typename Graph::VertexId VertexId;
+ typedef typename Graph::EdgeId EdgeId;
+
+ const Graph& g_;
+
+public:
+
+ const Graph& g() const {
+ return g_;
+ }
+
+ SemiParallelAlgorithmRunner(Graph& g)
+ : g_(g) {
+
+ }
+
+ template<class Algo, class ItVec, class Comparator = std::less<ElementType>>
+ bool RunFromChunkIterators(Algo& algo, const ItVec& chunk_iterators,
+ const Comparator& comp = Comparator()) {
+ VERIFY(chunk_iterators.size() > 1);
+ omnigraph::SmartSetIterator<Graph, ElementType, Comparator> it(g_, false, comp);
+
+ FillInterestingFromChunkIterators(chunk_iterators, it,
+ std::bind(&Algo::IsOfInterest, std::ref(algo), std::placeholders::_1));
+
+ bool changed = false;
+ for (; !it.IsEnd(); ++it) {
+ changed |= algo.Process(*it);
+ }
+ return changed;
+ }
+
+private:
+ DECL_LOGGER("SemiParallelAlgorithmRunner")
+ ;
+};
+
+//todo generalize to use for other algorithms if needed
+template<class Graph>
+class SemiParallelEdgeRemovingAlgorithm {
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+ Graph& g_;
+ pred::TypedPredicate<EdgeId> condition_;
+ omnigraph::EdgeRemover<Graph> edge_remover_;
+
+public:
+ SemiParallelEdgeRemovingAlgorithm(Graph& g,
+ pred::TypedPredicate<EdgeId> condition,
+ std::function<void(EdgeId)> removal_handler = 0) :
+ g_(g), condition_(condition), edge_remover_(g, removal_handler) {
+ }
+
+ bool IsOfInterest(EdgeId e) const {
+ return condition_->Check(e);
+ }
+
+ bool Process(EdgeId e) {
+ edge_remover_.DeleteEdge(e);
+ return true;
+ }
+};
+
+template<class Graph, class AlgoRunner, class Algo>
+bool RunVertexAlgorithm(Graph& g, AlgoRunner& runner, Algo& algo, size_t chunk_cnt) {
+ return runner.RunFromChunkIterators(algo, omnigraph::IterationHelper<Graph, typename Graph::VertexId>(g).Chunks(chunk_cnt));
+}
+
+template<class Graph, class AlgoRunner, class Algo>
+bool RunEdgeAlgorithm(Graph& g, AlgoRunner& runner, Algo& algo, size_t chunk_cnt) {
+ return runner.RunFromChunkIterators(algo, omnigraph::IterationHelper<Graph, typename Graph::EdgeId>(g).Chunks(chunk_cnt));
+}
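+//A possible usage sketch (illustrative only; assumes a conjugate graph g and a chunk count chunk_cnt):
+//    ParallelCompressor<Graph> compressor(g);
+//    TwoStepAlgorithmRunner<Graph, typename Graph::VertexId> runner(g, compressor.ShouldFilterConjugate());
+//    RunVertexAlgorithm(g, runner, compressor, chunk_cnt);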
+
+}
+
+}
diff --git a/src/modules/algorithms/simplification/relative_coverage_remover.hpp b/src/modules/algorithms/simplification/relative_coverage_remover.hpp
new file mode 100644
index 0000000..bc6da7e
--- /dev/null
+++ b/src/modules/algorithms/simplification/relative_coverage_remover.hpp
@@ -0,0 +1,674 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "dev_support/standard_base.hpp"
+#include "assembly_graph/components/graph_component.hpp"
+#include "visualization/graph_colorer.hpp"
+#include "assembly_graph/graph_support/graph_processing_algorithm.hpp"
+
+namespace omnigraph {
+
+namespace simplification {
+
+template<class EdgeContainer>
+void SingleEdgeAdapter(
+ const EdgeContainer& edges,
+ std::function<void(typename EdgeContainer::value_type)> single_edge_handler_f) {
+ for (auto e : edges) {
+ single_edge_handler_f(e);
+ }
+}
+
+namespace relative_coverage {
+
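+//A subgraph grown around a seed edge; keeps track of its edges, inner/border/terminating vertices,
+//cumulative length and whether it touches dead ends.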
+template<class Graph>
+class Component {
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+
+ const Graph& g_;
+ set<EdgeId> edges_;
+ set<VertexId> inner_vertices_;
+ set<VertexId> border_;
+ set<VertexId> terminating_vertices_;
+ //maybe use something more sophisticated in future
+ size_t cumm_length_;
+ bool contains_deadends_;
+
+    //if edge start == edge end == v, returns v
+ VertexId OppositeEnd(EdgeId e, VertexId v) const {
+ VERIFY(g_.EdgeStart(e) == v
+ || g_.EdgeEnd(e) == v);
+// VERIFY(remover_.g.EdgeStart(e) != remover_.g.EdgeEnd(e));
+ if (g_.EdgeStart(e) == v) {
+ return g_.EdgeEnd(e);
+ } else {
+ return g_.EdgeStart(e);
+ }
+ }
+
+ void RemoveFromBorder(VertexId v) {
+ size_t cnt = border_.erase(v);
+ VERIFY(cnt);
+ }
+
+public:
+
+ Component(const Graph& g, EdgeId e) : g_(g), cumm_length_(0), contains_deadends_(false) {
+ edges_.insert(e);
+ cumm_length_ += g_.length(e);
+ border_.insert(g.EdgeStart(e));
+ border_.insert(g.EdgeEnd(e));
+ }
+
+ void MakeInner(VertexId v) {
+ VERIFY(border_.count(v) > 0);
+ if (g_.IsDeadEnd(v) || g_.IsDeadStart(v)) {
+ contains_deadends_ = true;
+ }
+ inner_vertices_.insert(v);
+ for (EdgeId e : g_.IncidentEdges(v)) {
+ //seems to correctly handle loops
+ if (edges_.count(e) == 0) {
+ edges_.insert(e);
+ cumm_length_ += g_.length(e);
+ VertexId other_end = OppositeEnd(e, v);
+ if (inner_vertices_.count(other_end) == 0) {
+ border_.insert(other_end);
+ }
+ }
+ }
+ RemoveFromBorder(v);
+ }
+
+ void TerminateOnVertex(VertexId v) {
+ terminating_vertices_.insert(v);
+ RemoveFromBorder(v);
+ }
+
+ VertexId NextBorderVertex() const {
+ return *border_.begin();
+ }
+
+ bool IsBorderEmpty() const {
+ return border_.empty();
+ }
+
+ const set<EdgeId>& edges() const {
+ return edges_;
+ }
+
+ bool contains(EdgeId e) const {
+ return edges_.count(e) > 0;
+ }
+
+ const set<VertexId>& terminating_vertices() const {
+ return terminating_vertices_;
+ }
+
+ set<EdgeId> terminating_edges() const {
+ set<EdgeId> answer;
+ for (VertexId v : terminating_vertices()) {
+ for (EdgeId e : g_.IncidentEdges(v)) {
+ if (contains(e)) {
+ answer.insert(e);
+ }
+ }
+ }
+ return answer;
+ }
+
+ //terminating edges, going into the component
+ set<EdgeId> terminating_in_edges() const {
+ set<EdgeId> answer;
+ for (VertexId v : terminating_vertices()) {
+ for (EdgeId e : g_.OutgoingEdges(v)) {
+ if (contains(e)) {
+ answer.insert(e);
+ }
+ }
+ }
+ return answer;
+ }
+
+ //terminating edges, going out of the component
+ set<EdgeId> terminating_out_edges() const {
+ set<EdgeId> answer;
+ for (VertexId v : terminating_vertices()) {
+ for (EdgeId e : g_.IncomingEdges(v)) {
+ if (contains(e)) {
+ answer.insert(e);
+ }
+ }
+ }
+ return answer;
+ }
+
+ const Graph& g() const {
+ return g_;
+ }
+
+ size_t inner_vertex_cnt() const {
+ return inner_vertices_.size();
+ }
+
+ size_t length() const {
+ return cumm_length_;
+ }
+
+ bool contains_deadends() const {
+ return contains_deadends_;
+ }
+};
+
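+//Helper comparing the local coverage of an edge with the strongest competitor around a vertex,
+//using the supplied local coverage function and the min_coverage_gap threshold.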
+template<class Graph>
+class RelativeCoverageHelper {
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+ typedef std::function<double(EdgeId, VertexId)> LocalCoverageFT;
+
+ const Graph& g_;
+ LocalCoverageFT local_coverage_f_;
+ double min_coverage_gap_;
+
+public:
+ RelativeCoverageHelper(const Graph& g, LocalCoverageFT local_coverage_f,
+ double min_coverage_gap)
+ : g_(g),
+ local_coverage_f_(local_coverage_f),
+ min_coverage_gap_(min_coverage_gap) {
+ VERIFY(math::gr(min_coverage_gap, 1.));
+ }
+
+ double LocalCoverage(EdgeId e, VertexId v) const {
+ DEBUG("Local coverage of edge " << g_.str(e) << " around vertex " << g_.str(v) << " was " << local_coverage_f_(e, v));
+ return local_coverage_f_(e, v);
+ }
+
+ template<class EdgeContainer>
+ double MaxLocalCoverage(const EdgeContainer& edges, VertexId v) const {
+ double answer = 0.0;
+ for (EdgeId e : edges) {
+ answer = max(answer, LocalCoverage(e, v));
+ }
+ return answer;
+ }
+
+ template<class EdgeContainer>
+ bool CheckAnyHighlyCovered(const EdgeContainer& edges, VertexId v,
+ double base_coverage) const {
+ return math::gr(MaxLocalCoverage(edges, v),
+ base_coverage * min_coverage_gap_);
+ }
+
+ double RelativeCoverageToReport(VertexId v, double base_coverage) const {
+ return std::min(MaxLocalCoverage(g_.OutgoingEdges(v), v),
+ MaxLocalCoverage(g_.IncomingEdges(v), v))
+ / base_coverage;
+ }
+
+private:
+ DECL_LOGGER("RelativeCoverageHelper");
+};
+
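+//Iterative (explicit-stack) computation of the longest path length between terminating vertices of a component.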
+template<class Graph>
+class LongestPathFinder {
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+ const Component<Graph>& component_;
+ const Graph& g_;
+ map<VertexId, int> max_distance_;
+ vector<VertexId> vertex_stack_;
+ bool cycle_detected_;
+
+ //distance is changed!
+ bool TryGetMaxDistance(VertexId v, int& distance) {
+ if (max_distance_.count(v) > 0) {
+ distance = max_distance_[v];
+ return true;
+ }
+
+ //minus infinity for incoming tips
+ distance = std::numeric_limits<int>::min();
+ for (EdgeId e : g_.IncomingEdges(v)) {
+ VertexId start = g_.EdgeStart(e);
+ if (component_.contains(e)) {
+ if (max_distance_.count(start) == 0) {
+ if (std::find(vertex_stack_.begin(), vertex_stack_.end(), start) != vertex_stack_.end()) {
+ cycle_detected_ = true;
+ }
+ vertex_stack_.push_back(start);
+ return false;
+ } else {
+ distance = std::max(distance, max_distance_[start] + int(g_.length(e)));
+ }
+ }
+ }
+ //todo think...
+        //currently the whole length of a zig-zag path
+        //through several terminal vertices is counted
+ if (component_.terminating_vertices().count(v) > 0) {
+ distance = std::max(distance, 0);
+ }
+ return true;
+ }
+
+ void ProcessVertex(VertexId init_v) {
+ vertex_stack_.push_back(init_v);
+ while (!vertex_stack_.empty()) {
+ if (cycle_detected_)
+ return;
+
+ VertexId v = vertex_stack_.back();
+ int max_dist = 0;
+ if (TryGetMaxDistance(v, max_dist)) {
+ max_distance_[v] = max_dist;
+ vertex_stack_.pop_back();
+ }
+ }
+ }
+
+public:
+ LongestPathFinder(const Component<Graph>& component)
+ : component_(component), g_(component.g()), cycle_detected_(false) {
+ }
+
+    //returns -1u if the component contains a cycle or there is no path between terminating vertices
+ size_t Find() {
+ int answer = 0;
+ for (VertexId v : component_.terminating_vertices()) {
+ ProcessVertex(v);
+ if (cycle_detected_)
+ return -1u;
+ VERIFY(max_distance_.count(v) > 0);
+ answer = std::max(answer, get(max_distance_, v));
+ }
+ VERIFY(answer >= 0);
+ if (answer == 0)
+ return -1u;
+ return size_t(answer);
+ }
+};
+
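+//Checks a candidate component against vertex count, length, longest-connecting-path and coverage bounds.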
+template<class Graph>
+class ComponentChecker {
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+
+ const Graph& g_;
+ size_t vertex_count_limit_;
+ size_t length_bound_;
+ size_t tip_allowing_length_bound_;
+ size_t longest_connecting_path_bound_;
+ double max_coverage_;
+
+ bool CoverageCheck(const Component<Graph>& component) const {
+ for (EdgeId e : component.edges()) {
+ if (math::gr(g_.coverage(e), max_coverage_)) {
+ TRACE("Too high coverage! Component contains highly covered edge " << g_.str(e)
+ << " of coverage " << g_.coverage(e) << " while threshold was " << max_coverage_);
+ return false;
+ }
+ }
+ return true;
+ }
+
+public:
+ ComponentChecker(const Graph& g, size_t vertex_count_limit, size_t length_bound,
+ size_t tip_allowing_length_bound,
+ size_t longest_connecting_path_bound,
+ double max_coverage)
+ : g_(g), vertex_count_limit_(vertex_count_limit),
+ length_bound_(length_bound),
+ tip_allowing_length_bound_(tip_allowing_length_bound),
+ longest_connecting_path_bound_(longest_connecting_path_bound),
+ max_coverage_(max_coverage) {
+ }
+
+ bool SizeCheck(const Component<Graph>& component) const {
+ if (component.inner_vertex_cnt() > vertex_count_limit_) {
+ TRACE("Too many vertices : " << component.inner_vertex_cnt() << " ! More than " << vertex_count_limit_);
+ return false;
+ }
+ return true;
+ }
+
+ bool FullCheck(const Component<Graph>& component) const {
+ TRACE("Performing full check of the component");
+ size_t longest_connecting_path = LongestPathFinder<Graph>(component).Find();
+ if (longest_connecting_path != -1u) {
+ if (longest_connecting_path >= longest_connecting_path_bound_) {
+ TRACE("Length of longest path: " << longest_connecting_path << "; threshold: " << longest_connecting_path_bound_);
+ return false;
+ }
+ } else {
+ TRACE("Failed to find longest connecting path (check for cycles)");
+ }
+ if (!component.contains_deadends()
+ && component.length() > length_bound_) {
+ TRACE("Too long component of length " << component.length() << "! Longer than length bound " << length_bound_);
+ return false;
+ } else if (component.length() > tip_allowing_length_bound_) {
+ TRACE("Too long component of length " << component.length() << "! Longer than tip allowing length bound " << tip_allowing_length_bound_);
+ return false;
+ }
+
+ return SizeCheck(component) && CoverageCheck(component);
+ }
+
+private:
+ DECL_LOGGER("RelativelyLowCoveredComponentChecker");
+};
+
+//Disconnects an edge from its start vertex by splitting off and removing its first (k+1)-mer
+template<class Graph>
+class EdgeDisconnector {
+ typedef typename Graph::EdgeId EdgeId;
+ Graph& g_;
+ EdgeRemover<Graph> edge_remover_;
+
+public:
+ EdgeDisconnector(Graph& g,
+ HandlerF<Graph> removal_handler = nullptr):
+ g_(g), edge_remover_(g, removal_handler) {
+ }
+
+ EdgeId operator()(EdgeId e) {
+ VERIFY(g_.length(e) > 1);
+ pair<EdgeId, EdgeId> split_res = g_.SplitEdge(e, 1);
+ edge_remover_.DeleteEdge(split_res.first);
+ return split_res.first;
+ }
+};
+
+//todo make parallel
+template<class Graph>
+class RelativeCoverageDisconnector: public EdgeProcessingAlgorithm<Graph> {
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+ typedef std::function<double(EdgeId, VertexId)> LocalCoverageFT;
+ typedef EdgeProcessingAlgorithm<Graph> base;
+
+ const RelativeCoverageHelper<Graph> rel_helper_;
+ EdgeDisconnector<Graph> disconnector_;
+ size_t cnt_;
+public:
+ RelativeCoverageDisconnector(Graph& g,
+ LocalCoverageFT local_coverage_f, double diff_mult) :
+ base(g, false),
+ rel_helper_(g, local_coverage_f, diff_mult),
+ disconnector_(g),
+ cnt_(0) {
+ }
+
+ ~RelativeCoverageDisconnector() {
+ DEBUG("Disconnected edge cnt " << cnt_);
+ }
+
+protected:
+ bool ProcessEdge(EdgeId edge) {
+ DEBUG("Processing edge " << this->g().int_id(edge));
+ VertexId v = this->g().EdgeStart(edge);
+ double coverage_edge_around_v = rel_helper_.LocalCoverage(edge, v);
+ DEBUG("Local flanking coverage - " << coverage_edge_around_v);
+ DEBUG("Max local coverage incoming - " << rel_helper_.MaxLocalCoverage(this->g().IncomingEdges(v), v));
+ DEBUG("Max local coverage outgoing - " << rel_helper_.MaxLocalCoverage(this->g().OutgoingEdges(v), v));
+ if (this->g().length(edge) > 1 &&
+ rel_helper_.CheckAnyHighlyCovered(this->g().IncomingEdges(v), v, coverage_edge_around_v) &&
+ rel_helper_.CheckAnyHighlyCovered(this->g().OutgoingEdges(v), v, coverage_edge_around_v)) {
+ DEBUG("Disconnecting");
+ disconnector_(edge);
+ cnt_++;
+ return true;
+ } else {
+ DEBUG("No need to disconnect");
+ return false;
+ }
+ }
+
+private:
+
+ DECL_LOGGER("RelativeCoverageDisconnector");
+};
+
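+//Grows a component from a seed edge until every border vertex is classified as terminating
+//(i.e. shows a sufficient coverage gap), then applies ComponentChecker::FullCheck.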
+template<class Graph>
+class ComponentSearcher {
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+
+ const Graph& g_;
+ const RelativeCoverageHelper<Graph>& rel_helper_;
+ const ComponentChecker<Graph>& checker_;
+ Component<Graph> component_;
+
+public:
+ ComponentSearcher(const Graph& g,
+ const RelativeCoverageHelper<Graph>& rel_helper,
+ const ComponentChecker<Graph>& checker,
+ EdgeId first_edge)
+ : g_(g), rel_helper_(rel_helper), checker_(checker),
+ component_(g_, first_edge) {
+ }
+
+ bool FindComponent() {
+ while (!component_.IsBorderEmpty()) {
+ if (!checker_.SizeCheck(component_))
+ return false;
+
+ VertexId v = component_.NextBorderVertex();
+
+ TRACE("Checking if vertex " << g_.str(v) << " is terminating.");
+ //checking if there is a sufficient coverage gap
+ if (!IsTerminateVertex(v)) {
+ TRACE("Not terminating, adding neighbourhood");
+ component_.MakeInner(v);
+ if (component_.terminating_vertices().count(v) > 0) {
+ TRACE("Terminating vertex classified as non-terminating");
+ return false;
+ }
+ } else {
+ TRACE("Terminating");
+ component_.TerminateOnVertex(v);
+ }
+ }
+
+ return checker_.FullCheck(component_);
+ }
+
+ const Component<Graph>& component() const {
+ return component_;
+ }
+
+private:
+
+ bool IsTerminateVertex(VertexId v) const {
+ double base_coverage = rel_helper_.MaxLocalCoverage(
+ RetainEdgesFromComponent(g_.IncidentEdges(v)), v);
+ return CheckAnyFilteredHighlyCovered(g_.OutgoingEdges(v),
+ v, base_coverage)
+ && CheckAnyFilteredHighlyCovered(
+ g_.IncomingEdges(v), v, base_coverage);
+ }
+
+ template<class EdgeContainer>
+ bool CheckAnyFilteredHighlyCovered(const EdgeContainer& edges,
+ VertexId v,
+ double base_coverage) const {
+ return rel_helper_.CheckAnyHighlyCovered(
+ FilterEdgesFromComponent(edges), v, base_coverage);
+ }
+
+ template<class EdgeContainer>
+ vector<EdgeId> FilterEdgesFromComponent(
+ const EdgeContainer& edges) const {
+ vector<EdgeId> answer;
+ for (EdgeId e : edges) {
+ if (!component_.contains(e)) {
+ answer.push_back(e);
+ }
+ }
+ return answer;
+ }
+
+ template<class EdgeContainer>
+ vector<EdgeId> RetainEdgesFromComponent(
+ const EdgeContainer& edges) const {
+ vector<EdgeId> answer;
+ for (EdgeId e : edges) {
+ if (component_.contains(e)) {
+ answer.push_back(e);
+ }
+ }
+ return answer;
+ }
+
+ DECL_LOGGER("RelativelyLowCoveredComponentSearcher")
+ ;
+};
+
+//currently works with conjugate graphs only (due to the assumption in the outer cycle)
+template<class Graph>
+class RelativeCoverageComponentRemover : public EdgeProcessingAlgorithm<Graph> {
+ typedef EdgeProcessingAlgorithm<Graph> base;
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+ typedef std::function<double(EdgeId, VertexId)> LocalCoverageFT;
+ typedef typename ComponentRemover<Graph>::HandlerF HandlerF;
+ typedef pred::TypedPredicate<EdgeId> ProceedConditionT;
+
+ RelativeCoverageHelper<Graph> rel_helper_;
+ size_t length_bound_;
+ size_t tip_allowing_length_bound_;
+ size_t longest_connecting_path_bound_;
+ double max_coverage_;
+ //bound on the number of inner vertices
+ size_t vertex_count_limit_;
+ std::string vis_dir_;
+ ComponentRemover<Graph> component_remover_;
+
+ size_t fail_cnt_;
+ size_t succ_cnt_;
+
+ void VisualizeNontrivialComponent(const set<typename Graph::EdgeId>& edges, bool success) {
+ auto colorer = omnigraph::visualization::DefaultColorer(this->g());
+ auto edge_colorer = make_shared<visualization::CompositeEdgeColorer<Graph>>("black");
+ edge_colorer->AddColorer(colorer);
+ edge_colorer->AddColorer(make_shared<visualization::SetColorer<Graph>>(this->g(), edges, "green"));
+ // shared_ptr<visualization::GraphColorer<Graph>>
+ auto resulting_colorer = make_shared<visualization::CompositeGraphColorer<Graph>>(colorer, edge_colorer);
+
+ StrGraphLabeler<Graph> str_labeler(this->g());
+ CoverageGraphLabeler<Graph> cov_labler(this->g());
+ CompositeLabeler<Graph> labeler(str_labeler, cov_labler);
+
+ if (edges.size() > 1) {
+ set<typename Graph::VertexId> vertices;
+ for (auto e : edges) {
+ vertices.insert(this->g().EdgeStart(e));
+ vertices.insert(this->g().EdgeEnd(e));
+ }
+
+
+ auto filename = success ? vis_dir_ + "/success/" + ToString(succ_cnt_++) : vis_dir_ + "/fail/" + ToString(fail_cnt_++);
+ visualization::WriteComponent(
+ ComponentCloser<Graph>(this->g(), 0).CloseComponent(GraphComponent<Graph>(this->g(), vertices.begin(), vertices.end())),
+ filename + ".dot", colorer, labeler);
+ }
+ }
+
+public:
+ RelativeCoverageComponentRemover(
+ Graph& g, LocalCoverageFT local_coverage_f,
+ double min_coverage_gap,
+ size_t length_bound,
+ size_t tip_allowing_length_bound,
+ size_t longest_connecting_path_bound,
+ double max_coverage = std::numeric_limits<double>::max(),
+ HandlerF handler_function = 0, size_t vertex_count_limit = 10,
+ std::string vis_dir = "")
+ : base(g),
+ rel_helper_(g, local_coverage_f, min_coverage_gap),
+ length_bound_(length_bound),
+ tip_allowing_length_bound_(tip_allowing_length_bound),
+ longest_connecting_path_bound_(longest_connecting_path_bound),
+ max_coverage_(max_coverage),
+ vertex_count_limit_(vertex_count_limit),
+ vis_dir_(vis_dir),
+ component_remover_(g, handler_function),
+ fail_cnt_(0),
+ succ_cnt_(0) {
+ VERIFY(math::gr(min_coverage_gap, 1.));
+ VERIFY(tip_allowing_length_bound >= length_bound);
+ TRACE("Coverage gap " << min_coverage_gap);
+ if (!vis_dir_.empty()) {
+ path::make_dirs(vis_dir_);
+ path::make_dirs(vis_dir_ + "/success/");
+ path::make_dirs(vis_dir_ + "/fail/");
+ }
+ }
+
+protected:
+
+ bool ProcessEdge(EdgeId e) {
+ TRACE("Processing edge " << this->g().str(e));
+
+ //here we use that the graph is conjugate!
+ VertexId v = this->g().EdgeStart(e);
+ if (this->g().IsDeadEnd(v) && this->g().IsDeadStart(v)) {
+ TRACE("Isolated");
+ return false;
+ }
+ if (this->g().IsDeadEnd(v) || this->g().IsDeadStart(v)) {
+ TRACE("Tip");
+ return false;
+ }
+
+ double local_cov = rel_helper_.LocalCoverage(e, v);
+
+ TRACE("Local coverage around start " << this->g().str(v) << " is " << local_cov);
+
+ //since min_coverage_gap_ > 1, we don't need to think about e here
+ TRACE("Checking presence of highly covered edges around start")
+ if (rel_helper_.CheckAnyHighlyCovered(this->g().OutgoingEdges(v), v, local_cov)
+ && rel_helper_.CheckAnyHighlyCovered(this->g().IncomingEdges(v), v,
+ local_cov)) {
+ TRACE("Looking for component");
+ ComponentChecker<Graph> checker(this->g(), vertex_count_limit_, length_bound_,
+ tip_allowing_length_bound_,
+ longest_connecting_path_bound_, max_coverage_);
+ //case of e being loop is handled implicitly!
+ ComponentSearcher<Graph> component_searcher(
+ this->g(), rel_helper_, checker, e);
+ if (component_searcher.FindComponent()) {
+ TRACE("Deleting component");
+ const Component<Graph>& component = component_searcher.component();
+ component_remover_.DeleteComponent(component.edges());
+ return true;
+ } else {
+ TRACE("Failed to find component");
+ if (!vis_dir_.empty()) {
+ TRACE("Outputting image");
+ VisualizeNontrivialComponent(component_searcher.component().edges(), false);
+ }
+ }
+ } else {
+ TRACE("No highly covered edges around");
+ }
+
+ return false;
+ }
+
+private:
+ DECL_LOGGER("RelativeCoverageComponentRemover");
+};
+
+}
+}
+
+}
diff --git a/src/modules/algorithms/simplification/tip_clipper.hpp b/src/modules/algorithms/simplification/tip_clipper.hpp
new file mode 100644
index 0000000..32951e7
--- /dev/null
+++ b/src/modules/algorithms/simplification/tip_clipper.hpp
@@ -0,0 +1,269 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "math/xmath.h"
+#include "dev_support/func.hpp"
+#include "assembly_graph/graph_support/basic_edge_conditions.hpp"
+#include "assembly_graph/graph_support/graph_processing_algorithm.hpp"
+#include "data_structures/sequence/sequence.hpp"
+
+#include <set>
+
+namespace omnigraph {
+
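+//Condition that holds when the coverage of a candidate tip is low relative to its strongest
+//non-loop competitor edge at the adjacent junctions.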
+template<class Graph>
+class RelativeCoverageTipCondition: public EdgeCondition<Graph> {
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+ typedef EdgeCondition<Graph> base;
+
+ const double max_relative_coverage_;
+
+ template<class IteratorType>
+ double MaxCompetitorCoverage(EdgeId tip, IteratorType begin, IteratorType end) const {
+ const Graph &g = this->g();
+ double result = 0;
+ for (auto it = begin; it != end; ++it) {
+ EdgeId e = *it;
+ //update if competitor edge is not loop
+ if (e != tip && g.EdgeStart(e) != g.EdgeEnd(e))
+ result = std::max(result, g.coverage(*it));
+ }
+ return result;
+ }
+
+ double MaxCompetitorCoverage(EdgeId tip) const {
+ const Graph &g = this->g();
+ VertexId start = g.EdgeStart(tip), end = g.EdgeEnd(tip);
+ auto out = g.OutgoingEdges(start);
+ auto in = g.IncomingEdges(end);
+ return std::max(
+ MaxCompetitorCoverage(tip, out.begin(), out.end()),
+ MaxCompetitorCoverage(tip, in.begin(), in.end()));
+// return std::max(
+// MaxCompetitorCoverage(tip, g.out_begin(start),
+// g.out_end(start)),
+// MaxCompetitorCoverage(tip, g.in_begin(end), g.in_end(end)));
+ }
+
+public:
+
+ RelativeCoverageTipCondition(const Graph& g, double max_relative_coverage) :
+ base(g), max_relative_coverage_(max_relative_coverage) {
+ }
+
+ bool Check(EdgeId e) const override {
+ //+1 is a trick to deal with edges of 0 coverage from iterative run
+ double max_coverage = MaxCompetitorCoverage(e) + 1;
+ return math::le(this->g().coverage(e),
+ max_relative_coverage_ * max_coverage);
+ }
+};
+
+template<class Graph>
+class TipCondition : public EdgeCondition<Graph> {
+ typedef EdgeCondition<Graph> base;
+
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+
+ /**
+     * This method checks whether the given vertex topologically looks like the end of a tip.
+     * @param v vertex to be checked
+     * @return true if the vertex is judged to be a tip end, false otherwise.
+ */
+ bool IsTip(VertexId v) const {
+ return this->g().IncomingEdgeCount(v) + this->g().OutgoingEdgeCount(v) == 1;
+ }
+
+public:
+ TipCondition(const Graph& g) : base(g) {
+ }
+
+ /**
+     * This method checks whether the given edge topologically looks like a tip.
+     * @param e edge to be checked
+     * @return true if the edge is judged to be a tip, false otherwise.
+ */
+ bool Check(EdgeId e) const override {
+ return (IsTip(this->g().EdgeEnd(e)) || IsTip(this->g().EdgeStart(e)))
+ && (this->g().OutgoingEdgeCount(this->g().EdgeStart(e))
+ + this->g().IncomingEdgeCount(this->g().EdgeEnd(e)) > 2);
+ }
+
+};
+
+
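+//Condition that holds when the edge (or its conjugate) has a longer alternative outgoing edge
+//differing from it by at most max_diff_ mismatches beyond the first k positions.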
+template<class Graph>
+class MismatchTipCondition : public EdgeCondition<Graph> {
+ typedef EdgeCondition<Graph> base;
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+
+ size_t max_diff_;
+
+ size_t Hamming(EdgeId edge1, EdgeId edge2) const {
+ size_t cnt = 0;
+ Sequence seq1 = this->g().EdgeNucls(edge1);
+ Sequence seq2 = this->g().EdgeNucls(edge2);
+ size_t len = std::min(seq1.size(), seq2.size());
+ for(size_t i = this->g().k(); i < len; i++) {
+ if(seq1[i] != seq2[i])
+ cnt++;
+ }
+ return cnt;
+ }
+
+ bool InnerCheck(EdgeId e) const {
+ size_t len = this->g().length(e);
+ for (auto alt : this->g().OutgoingEdges(this->g().EdgeStart(e))) {
+ if (e != alt && len < this->g().length(alt) && Hamming(e, alt) <= max_diff_) {
+ return true;
+ }
+ }
+ return false;
+ }
+
+public:
+ MismatchTipCondition(const Graph& g, size_t max_diff) :
+ base(g), max_diff_(max_diff) {
+ }
+
+ bool Check(EdgeId e) const override {
+ return InnerCheck(e) || InnerCheck(this->g().conjugate(e));
+ }
+
+};
+
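+//Condition that holds when a single nucleotide dominates the edge sequence
+//(fraction above max_AT_percentage_), optionally restricted to tip-like edges.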
+template<class Graph>
+class ATCondition: public EdgeCondition<Graph> {
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+ typedef EdgeCondition<Graph> base;
+ const double max_AT_percentage_;
+ const size_t max_tip_length_;
+ const bool check_tip_ ;
+
+public:
+
+ ATCondition(const Graph& g, double max_AT_percentage, size_t max_tip_length, bool check_tip) :
+ base(g), max_AT_percentage_(max_AT_percentage), max_tip_length_(max_tip_length), check_tip_(check_tip) {
+ DEBUG("check_tip: " << check_tip_);
+ }
+
+ bool Check(EdgeId e) const {
+ size_t start = 0;
+ //TODO: Do we need this check?
+ if(this->g().length(e) > max_tip_length_)
+ return false;
+ size_t end = this->g().length(e) + this->g().k();
+ if (check_tip_) {
+ if (this->g().OutgoingEdgeCount(this->g().EdgeEnd(e)) == 0)
+ start = this->g().k();
+ else if (this->g().IncomingEdgeCount(this->g().EdgeStart(e)) == 0)
+ end = this->g().length(e);
+ else return false;
+ }
+ std::array<size_t, 4> counts = std::array<size_t, 4>();
+ const Sequence &s_edge = this->g().EdgeNucls(e);
+
+ for (size_t position = start; position < end; position ++) {
+ counts[s_edge[position]] ++;
+ }
+ size_t curm = *std::max_element(counts.begin(), counts.end());
+ if (curm > (end - start) * max_AT_percentage_) {
+ DEBUG("deleting edge" << s_edge.str());;
+ DEBUG("curm: " << curm);
+
+ DEBUG("start end cutoff" << start << " " << end << " " << this->g().length(e) * max_AT_percentage_);
+
+ return true;
+ } else {
+ return false;
+ }
+ }
+
+private:
+ DECL_LOGGER("ATCondition")
+};
+
+template<class Graph>
+pred::TypedPredicate<typename Graph::EdgeId> AddTipCondition(const Graph& g,
+ pred::TypedPredicate<typename Graph::EdgeId> condition) {
+ return pred::And(TipCondition<Graph>(g), condition);
+}
+
+template<class Graph>
+pred::TypedPredicate<typename Graph::EdgeId>
+NecessaryTipCondition(const Graph& g, size_t max_length, double max_coverage) {
+ return AddTipCondition(g, pred::And(LengthUpperBound<Graph>(g, max_length),
+ CoverageUpperBound<Graph>(g, max_coverage)));
+}
+
+template<class Graph>
+class DeadEndCondition : public EdgeCondition<Graph> {
+ typedef EdgeCondition<Graph> base;
+
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+
+ /**
+     * This method checks whether the given vertex is a dead end or a dead start (no incoming or no outgoing edges).
+     * @param v vertex to be checked
+     * @return true if the vertex is judged to be a dead end, false otherwise.
+ */
+ bool IsDeadEnd(VertexId v) const {
+ return this->g().IncomingEdgeCount(v) * this->g().OutgoingEdgeCount(v) == 0;
+ }
+
+public:
+ DeadEndCondition(const Graph& g) : base(g) {
+ }
+
+ /**
+     * This method checks whether the given edge starts or ends at a dead end.
+     * @param e edge to be checked
+     * @return true if the edge is judged to lead into a dead end, false otherwise.
+ */
+ /*virtual*/
+
+ //Careful - no alternative path check!
+ bool Check(EdgeId e) const {
+ return (IsDeadEnd(this->g().EdgeEnd(e)) || IsDeadEnd(this->g().EdgeStart(e)))
+ && (this->g().OutgoingEdgeCount(this->g().EdgeEnd(e))
+ + this->g().IncomingEdgeCount(this->g().EdgeStart(e)) >= 1);
+ }
+
+};
+
+template<class Graph>
+pred::TypedPredicate<typename Graph::EdgeId> AddDeadEndCondition(const Graph& g,
+ pred::TypedPredicate<typename Graph::EdgeId> condition) {
+ return pred::And<typename Graph::EdgeId>(DeadEndCondition<Graph>(g), condition);
+}
+
+
+//template<class Graph>
+//bool ClipTips(
+// Graph& g,
+// size_t max_length,
+// shared_ptr<Predicate<typename Graph::EdgeId>> condition
+// = make_shared<func::AlwaysTrue<typename Graph::EdgeId>>(),
+// std::function<void(typename Graph::EdgeId)> removal_handler = 0) {
+//
+// omnigraph::EdgeRemovingAlgorithm<Graph> tc(g,
+// AddTipCondition(g, condition),
+// removal_handler);
+//
+// return tc.Run(LengthComparator<Graph>(g),
+// make_shared<LengthUpperBound<Graph>>(g, max_length));
+//}
+
+} // namespace omnigraph
diff --git a/src/modules/assembly_graph/CMakeLists.txt b/src/modules/assembly_graph/CMakeLists.txt
new file mode 100644
index 0000000..5854450
--- /dev/null
+++ b/src/modules/assembly_graph/CMakeLists.txt
@@ -0,0 +1,12 @@
+############################################################################
+# Copyright (c) 2015 Saint Petersburg State University
+# Copyright (c) 2011-2014 Saint Petersburg Academic University
+# All Rights Reserved
+# See file LICENSE for details.
+############################################################################
+
+project(graph_support CXX)
+
+add_library(graph_support STATIC
+ components/connected_component.cpp paths/bidirectional_path.cpp graph_support/scaff_supplementary.cpp)
+target_link_libraries(graph_support hattrie)
diff --git a/src/include/omni/component_filters.hpp b/src/modules/assembly_graph/components/component_filters.hpp
similarity index 100%
rename from src/include/omni/component_filters.hpp
rename to src/modules/assembly_graph/components/component_filters.hpp
diff --git a/src/modules/assembly_graph/components/connected_component.cpp b/src/modules/assembly_graph/components/connected_component.cpp
new file mode 100644
index 0000000..69a9dce
--- /dev/null
+++ b/src/modules/assembly_graph/components/connected_component.cpp
@@ -0,0 +1,76 @@
+//
+// Created by lab42 on 8/24/15.
+//
+
+#include "connected_component.hpp"
+#include <stack>
+
+
+namespace debruijn_graph {
+
+
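+//Iterative (stack-based) traversal over undirected incident edges; components are then renumbered
+//in order of decreasing total length.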
+void ConnectedComponentCounter::CalculateComponents() const {
+ map <EdgeId, size_t> component_ids;
+ vector <pair<size_t, size_t>> to_sort;
+ map<size_t, size_t> comp_size;
+ size_t cur_id = 0;
+ for (auto e = g_.ConstEdgeBegin(); !e.IsEnd(); ++e) {
+ if (component_ids.find(*e) == component_ids.end()) {
+ std::stack <EdgeId> next;
+ next.push(*e);
+ set <EdgeId> used;
+ size_t ans = 0;
+ while (!next.empty()) {
+ auto cur = next.top();
+ next.pop();
+ if (used.find(cur) != used.end()) {
+ continue;
+ }
+ ans += g_.length(cur);
+ used.insert(cur);
+ vector <EdgeId> neighbours;
+ neighbours.push_back(g_.conjugate(cur));
+ auto start = g_.EdgeStart(cur);
+ auto tmp = g_.IncidentEdges(start);
+
+ neighbours.insert(neighbours.end(), tmp.begin(), tmp.end());
+ auto end = g_.EdgeEnd(cur);
+ tmp = g_.IncidentEdges(end);
+ neighbours.insert(neighbours.end(), tmp.begin(), tmp.end());
+ for (auto ee:neighbours) {
+ if (used.find(ee) == used.end()) {
+ next.push(ee);
+ }
+ }
+ }
+ for (auto edge: used) {
+ component_ids[edge] = cur_id;
+ }
+ to_sort.push_back(std::make_pair(ans, cur_id));
+ comp_size[cur_id] = ans;
+ cur_id ++;
+ }
+ }
+ std::sort(to_sort.begin(), to_sort.end());
+ std::reverse(to_sort.begin(), to_sort.end());
+ vector <size_t> perm(to_sort.size());
+ for (size_t i = 0; i < to_sort.size(); i++) {
+ perm[to_sort[i].second] = i;
+ component_total_len_[i] = comp_size[to_sort[i].second];
+ }
+ for (auto pair:component_ids) {
+ component_ids_[pair.first] = perm[pair.second];
+ component_edges_quantity_[perm[pair.second]]++;
+ }
+ return;
+}
+
+size_t ConnectedComponentCounter::GetComponent(EdgeId & e) const {
+ if (component_ids_.size() == 0) {
+ CalculateComponents();
+ }
+ return component_ids_[e];
+}
+
+
+}
diff --git a/src/modules/assembly_graph/components/connected_component.hpp b/src/modules/assembly_graph/components/connected_component.hpp
new file mode 100644
index 0000000..abc396e
--- /dev/null
+++ b/src/modules/assembly_graph/components/connected_component.hpp
@@ -0,0 +1,26 @@
+//
+// Created by lab42 on 8/24/15.
+//
+#pragma once
+#include <map>
+//#include "path_extend/bidirectional_path.hpp"
+#include "assembly_graph/graph_core/graph.hpp"
+
+namespace debruijn_graph{
+
+class ConnectedComponentCounter {
+public:
+ mutable std::map<EdgeId, size_t> component_ids_;
+ mutable std::map<size_t, size_t> component_edges_quantity_;
+ mutable std::map<size_t, size_t> component_total_len_;
+ const Graph &g_;
+ ConnectedComponentCounter(const Graph &g):g_(g) {}
+ void CalculateComponents() const;
+// size_t GetComponent(path_extend::BidirectionalPath * p) const;
+ size_t GetComponent(EdgeId & e) const;
+ bool IsFilled() const {
+ return (component_ids_.size() != 0);
+ }
+
+};
+}
diff --git a/src/modules/assembly_graph/components/graph_component.hpp b/src/modules/assembly_graph/components/graph_component.hpp
new file mode 100644
index 0000000..e92831b
--- /dev/null
+++ b/src/modules/assembly_graph/components/graph_component.hpp
@@ -0,0 +1,198 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "dev_support/standard_base.hpp"
+
+namespace omnigraph {
+//todo make handler!!!
+template<class Graph>
+class GraphComponent {
+ typedef typename Graph::VertexId VertexId;
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename std::set<VertexId>::const_iterator vertex_iterator;
+ typedef typename std::set<EdgeId>::const_iterator edge_iterator;
+ const Graph& graph_;
+ std::set<VertexId> vertices_;
+ std::set<EdgeId> edges_;
+ std::set<VertexId> sinks_;
+ std::set<VertexId> sources_;
+ std::string name_;
+
+
+ template<class VertexIt>
+ void FillVertices(VertexIt begin, VertexIt end) {
+ for (auto it = begin; it != end; ++it) {
+ vertices_.insert(*it);
+ }
+ }
+
+ template<class VertexIt>
+ void FillVertices(VertexIt begin, VertexIt end, bool add_conjugate) {
+ for (auto it = begin; it != end; ++it) {
+ vertices_.insert(*it);
+ if (add_conjugate)
+ vertices_.insert(graph_.conjugate(*it));
+ }
+ }
+
+ void FillEdges() {
+ for (auto v_it = vertices_.begin(); v_it != vertices_.end(); ++v_it) {
+ TRACE("working with vertex " << graph_.str(*v_it));
+ for (EdgeId e : graph_.OutgoingEdges(*v_it)) {
+ VertexId edge_end = graph_.EdgeEnd(e);
+ TRACE(graph_.coverage(e) << " " << graph_.length(e));
+ if (vertices_.count(edge_end) > 0) {
+ edges_.insert(e);
+ TRACE("Edge added");
+ }
+ }
+ }
+ }
+
+ template<class VertexIt>
+ void Fill(VertexIt begin, VertexIt end) {
+ FillVertices(begin, end);
+ FillEdges();
+ FindSinksAndSources();
+ }
+
+ template<class VertexIt>
+ void Fill(VertexIt begin, VertexIt end, bool add_conjugate) {
+ FillVertices(begin, end, add_conjugate);
+ FillEdges();
+ FindSinksAndSources();
+ }
+
+ void FindSinksAndSources() {
+ for(auto v : vertices_) {
+ for(auto e : graph_.IncomingEdges(v)) {
+ if(!contains(e) && !(contains(graph_.EdgeStart(e)))) {
+ sources_.insert(v);
+ break;
+ }
+ }
+
+ for(auto e : graph_.OutgoingEdges(v)) {
+ if(!contains(e) && !(contains(graph_.EdgeEnd(e)))) {
+ sinks_.insert(v);
+ break;
+ }
+ }
+ }
+ }
+
+public:
+ template<class VertexIt>
+ GraphComponent(const Graph &g, VertexIt begin, VertexIt end, const string &name = "") :
+ graph_(g), name_(name) {
+ Fill(begin, end);
+ }
+
+ //todo refactor and get rid of hack
+ template<class VertexIt>
+ GraphComponent(const Graph &g, VertexIt begin, VertexIt end,
+ bool add_conjugate, const string &name = "") : graph_(g), name_(name) {
+ Fill(begin, end, add_conjugate);
+ }
+
+ //Full graph component
+ GraphComponent(const Graph &g, bool fill = true, const string &name = "") : graph_(g), name_(name) {
+ if(fill) {
+ Fill(g.begin(), g.end());
+ }
+ }
+
+ //may be used for conjugate closure
+ GraphComponent(const GraphComponent& component, bool add_conjugate, const string &name = "") : graph_(component.graph_), name_(name)
+// vertices_(component.vertices_.begin(), component.vertices_.end()),
+// edges_(component.edges_.begin(), component.edges_.end())
+ {
+ Fill(component.v_begin(), component.v_end(), add_conjugate);
+ }
+
+ GraphComponent<Graph> &operator=(const GraphComponent<Graph> &that) {
+ VERIFY(&this->graph_ == &that.graph_);
+        this->vertices_ = that.vertices_;
+        this->edges_ = that.edges_;
+        this->sinks_ = that.sinks_;
+        this->sources_ = that.sources_;
+        this->name_ = that.name_;
+ return *this;
+ }
+
+ const Graph& g() const {
+ return graph_;
+ }
+
+ string name() const {
+ return name_;
+ }
+
+ size_t v_size() const {
+ return vertices_.size();
+ }
+
+ size_t e_size() const {
+ return edges_.size();
+ }
+
+ bool contains(EdgeId e) const {
+ return edges_.count(e) > 0;
+ }
+
+ bool contains(VertexId v) const {
+ return vertices_.count(v) > 0;
+ }
+
+ edge_iterator e_begin() const {
+ return edges_.begin();
+ }
+ edge_iterator e_end() const {
+ return edges_.end();
+ }
+
+ const std::set<EdgeId>& edges() const {
+ return edges_;
+ }
+
+ const std::set<VertexId>& vertices() const{
+ return vertices_;
+ }
+
+ vertex_iterator v_begin() const {
+ return vertices_.begin();
+ }
+ vertex_iterator v_end() const {
+ return vertices_.end();
+ }
+
+ const std::set<VertexId>& sinks() const {
+ return sinks_;
+ }
+
+ const std::set<VertexId>& sources() const {
+ return sources_;
+ }
+
+ bool IsBorder(VertexId v) const {
+ if(vertices_.count(v) == 0)
+ return false;
+ for (EdgeId e : graph_.IncidentEdges(v)) {
+ if (vertices_.count(graph_.EdgeStart(e)) == 0
+ || vertices_.count(graph_.EdgeEnd(e)) == 0) {
+ return true;
+ }
+ }
+ return false;
+ }
+
+};
+
+}
+
+
+
diff --git a/src/modules/assembly_graph/components/splitters.hpp b/src/modules/assembly_graph/components/splitters.hpp
new file mode 100644
index 0000000..3bb8f41
--- /dev/null
+++ b/src/modules/assembly_graph/components/splitters.hpp
@@ -0,0 +1,921 @@
+#pragma once
+
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#include "dev_support/standard_base.hpp"
+#include "graph_component.hpp"
+#include "algorithms/dijkstra/dijkstra_helper.hpp"
+#include "component_filters.hpp"
+
+namespace omnigraph {
+
+
+template<typename Element>
+class JSIterator {
+public:
+
+ virtual Element Next() = 0;
+
+ virtual bool HasNext() = 0;
+
+ virtual ~JSIterator() {
+ }
+};
+
+template<class Graph>
+class GraphSplitter : public JSIterator<GraphComponent<Graph>>{
+private:
+ const Graph& graph_;
+public:
+ GraphSplitter(const Graph& graph)
+ : graph_(graph) {
+ }
+
+ const Graph& graph() const {
+ return graph_;
+ }
+};
+
+template<class Graph>
+class PrecountedComponentSplitter : public GraphSplitter<Graph> {
+ bool HasNext_;
+ GraphComponent<Graph> component_;
+public:
+
+ template<class It>
+ PrecountedComponentSplitter(const Graph &graph, It begin, It end)
+ : GraphSplitter<Graph>(graph), HasNext_(false),
+ component_(graph, begin, end) {
+ }
+
+ template<class It>
+ PrecountedComponentSplitter(GraphComponent<Graph> component)
+ : GraphSplitter<Graph>(component.g()), HasNext_(false),
+ component_(component) {
+ }
+
+ GraphComponent<Graph> Next() {
+ HasNext_ = false;
+ return component_;
+ }
+
+// virtual bool CheckPutVertex(VertexId /*vertex*/, EdgeId edge, size_t /*length*/) const {
+// return edges_.count(edge) != 0;
+// }
+ bool HasNext() {
+ return HasNext_;
+ }
+};
+
+template<typename Element>
+class RelaxingIterator : public JSIterator<Element> {
+public:
+ template<typename It>
+ void Relax(It begin, It end) {
+ Relax(vector<Element>(begin, end));
+ }
+
+// virtual bool CheckProcessVertex(VertexId /*vertex*/, size_t distance) {
+// return distance <= bound_;
+// }
+ virtual void Relax(const vector<Element> &v) = 0;
+
+ virtual void Relax(Element) = 0;
+
+ virtual ~RelaxingIterator() {
+ }
+};
+
+template<class Collection>
+class CollectionIterator : public RelaxingIterator<typename Collection::value_type> {
+private:
+ typedef typename Collection::value_type Element;
+ typedef typename Collection::const_iterator Iter;
+ shared_ptr<Collection> storage_;
+ Iter current_;
+ const Iter end_;
+ set<Element> relaxed_;
+public:
+ CollectionIterator(const Collection &collection)
+ : current_(collection.begin()), end_(collection.end()) {
+ }
+
+// virtual bool CheckPutVertex(VertexId vertex, EdgeId /*edge*/, size_t /*length*/) const {
+// return subgraph_.count(vertex) != 0;
+// }
+ CollectionIterator(shared_ptr<Collection> collection)
+ : storage_(collection), current_(collection->begin()), end_(collection->end()) {
+ }
+
+ CollectionIterator(Iter begin, Iter end)
+ : current_(begin), end_(end) {
+ }
+
+ Element Next() {
+        if(!HasNext()) { //note: HasNext() actually advances current_ past relaxed elements, it is not just a check!
+ //fixme use VERIFY_MSG instead
+ VERIFY(HasNext());
+ }
+ Element next = *current_;
+ ++current_;
+ return next;
+ }
+
+//public:
+// ErrorComponentSplitter(const Graph &graph, const set<EdgeId> &black_edges) :
+// base(graph), black_edges_(black_edges), iterator_(
+// graph.SmartEdgeBegin()) {
+// TRACE("ErrorComponentSplitter created and SmartIterator initialized");
+// }
+//
+// virtual ~ErrorComponentSplitter() {
+// }
+//
+// vector<VertexId> FindComponent(VertexId start_vertex) {
+// ComponentFinder<Graph> cf(this->graph(), black_edges_);
+// cf.run(start_vertex);
+// return cf.ReachedVertices();
+// }
+//
+// vector<VertexId> FindNeighbourhood(VertexId start, size_t bound) {
+// NeighbourhoodFinder<Graph> nf(this->graph(), black_edges_, bound);
+// nf.run(start);
+// return nf.ReachedVertices();
+// }
+//
+// size_t FindDiameter(const vector<VertexId> &component) {
+// set < VertexId > component_set(component.begin(), component.end());
+// size_t result = 0;
+// VertexId current = *(component.begin());
+// for (size_t i = 0; i < 4; i++) {
+// pair<VertexId, size_t> next = GetFarthest(current, component_set);
+// current = next.first;
+// result = next.second;
+// }
+// return result;
+// }
+//
+// pair<VertexId, size_t> GetFarthest(VertexId v,
+// const set<VertexId> &component) {
+// SubgraphDijkstra<Graph> sd(this->graph(), component);
+// sd.run(v);
+// pair<VertexId, size_t> result(v, 0);
+// auto bounds = sd.GetDistances();
+// for (auto it = bounds.first; it != bounds.second; ++it) {
+// if (it->second > result.second) {
+// result = *it;
+// }
+// }
+// return result;
+// }
+//
+// virtual vector<VertexId> NextComponent() {
+// TRACE("Construction of next component started");
+// if (Finished()) {
+// VERIFY(false);
+// return vector<VertexId>();
+// }
+// EdgeId next = *iterator_;
+// ++iterator_;
+// vector < VertexId > component = FindComponent(
+// this->graph().EdgeEnd(next));
+// TRACE("Error edges component constructed. It contains "
+// << component.size() << " vertices");
+// size_t component_size = FindDiameter(component);
+// TRACE("Diameter of component is " << component_size);
+// vector < VertexId > neighbourhood = FindNeighbourhood(
+// this->graph().EdgeEnd(next), (size_t) math::round(1.5 * (double) component_size));
+// TRACE("Error edges component neighborhood constructed. It contains "
+// << neighbourhood.size() << " vertices");
+// visited_.insert(component.begin(), component.end());
+// return neighbourhood;
+// }
+//
+// virtual bool Finished() {
+// while (!iterator_.IsEnd()) {
+// if (black_edges_.find(*iterator_) != black_edges_.end()
+// && visited_.find(this->graph().EdgeEnd(*iterator_))
+// == visited_.end()) {
+// return false;
+// }
+// ++iterator_;
+// }
+// return true;
+// }
+ bool HasNext() {
+ while(current_ != end_ && relaxed_.count(*current_) == 1) {
+ ++current_;
+ }
+ return current_ != end_;
+ }
+
+ void Relax(Element e) {
+ relaxed_.insert(e);
+ }
+
+//template<class Graph>
+//class ShortEdgeComponentNeighbourhoodFinder: public UnorientedDijkstra<Graph> {
+//private:
+// typedef UnorientedDijkstra<Graph> base;
+//protected:
+// typedef typename base::VertexId VertexId;
+// typedef typename base::EdgeId EdgeId;
+// typedef typename base::DistanceType distance_t;
+//private:
+// distance_t bound_;
+//public:
+// ShortEdgeComponentNeighbourhoodFinder(const Graph &graph, distance_t bound) :
+// UnorientedDijkstra<Graph>(graph), bound_(bound) {
+// }
+//
+// virtual bool CheckProcessVertexVertexId (VertexId /*vertex*/, distance_t distance) {
+// return distance == 0;
+// }
+//
+// virtual distance_t GetLength(EdgeId edge) const {
+// if (this->graph().length(edge) <= bound_)
+// return 0;
+// else
+// return 1;
+// }
+ void Relax(const vector<Element> &v) {
+ for (auto it = v.begin(); it != v.end(); ++it)
+ Relax(*it);
+ }
+
+ virtual ~CollectionIterator() {
+ }
+};
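+
+//Illustrative usage sketch of the Next()/HasNext()/Relax() contract above (not part of the
+//interface itself; assumes `graph` is an already built graph whose vertex iterators are valid):
+//
+// set<VertexId> vertices(graph.begin(), graph.end());
+// CollectionIterator<set<VertexId>> it(vertices);
+// while (it.HasNext()) {
+// VertexId v = it.Next();
+// // ... process v ...
+// it.Relax(v); // relaxed elements are skipped by later HasNext()/Next() calls
+// }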
+
+template<class Graph>
+class PathIterator : public RelaxingIterator<typename Graph::VertexId> {
+private:
+ typedef typename Graph::VertexId VertexId;
+ typedef typename Graph::EdgeId EdgeId;
+ const Graph &graph_;
+ vector<VertexId> path_;
+ size_t current_;
+
+ static vector<VertexId> ExtractVertices(const Graph &graph, const vector<EdgeId> &path) {
+ vector<VertexId> result;
+ for(size_t i = 0; i < path.size(); i++) {
+ if(i == 0 || path[i] != path[i - 1]) {
+ result.push_back(graph.EdgeStart(path[i]));
+ result.push_back(graph.EdgeEnd(path[i]));
+ }
+ }
+ return result;
+ }
+
+public:
+ PathIterator(const Graph &graph, const vector<EdgeId> &path)
+ : graph_(graph), path_(ExtractVertices(graph, path)), current_(0) {
+ }
+
+ VertexId Next() {
+ if(!HasNext()) {
+ VERIFY(HasNext());
+ }
+ VertexId next = path_[current_];
+ Relax(next);
+ return next;
+ }
+
+ bool HasNext() {
+ return current_ < path_.size();
+ }
+
+ void Relax(const vector<VertexId> &v) {
+ set<VertexId> toRelax(v.begin(), v.end());
+ while(toRelax.count(path_[current_]) == 1)
+ current_++;
+ }
+
+//public:
+// CountingDijkstra(const Graph &graph, size_t max_size,
+// size_t edge_length_bound) :
+// base(graph), max_size_(max_size), edge_length_bound_(
+// edge_length_bound), current_(0) {
+// }
+//
+// virtual bool CheckPutVertex(VertexId /*vertex*/, EdgeId edge,
+// distance_t /*length*/) const {
+// if (current_ < max_size_) {
+// ++current_;
+// }
+// if (current_ < max_size_ && GetLength(edge) < inf) {
+// return true;
+// }
+// return false;
+// }
+//
+// virtual bool CheckProcessVertex(VertexId /*vertex*/, distance_t /*distance*/) {
+// return current_ < max_size_;
+// }
+//
+// virtual void init(VertexId /*start*/) {
+// current_ = 0;
+// }
+//
+// virtual size_t GetLength(EdgeId edge) const {
+// if (this->graph().length(edge) <= edge_length_bound_)
+// //todo change back
+//// return 1;
+// return this->graph().length(edge);
+// else
+// return inf;
+// }
+ void Relax(VertexId e) {
+ Relax(vector<VertexId>({e}));
+ }
+};
+
+template<class Graph>
+class AbstractNeighbourhoodFinder {
+private:
+ const Graph &graph_;
+public:
+ AbstractNeighbourhoodFinder(const Graph &graph) : graph_(graph) {
+ }
+
+ const Graph &graph() const {
+ return graph_;
+ }
+
+ virtual GraphComponent<Graph> Find(typename Graph::VertexId v) = 0;
+
+ virtual vector<typename Graph::VertexId> InnerVertices(const GraphComponent<Graph> &component) = 0;
+
+ virtual ~AbstractNeighbourhoodFinder() {
+ }
+};
+
+template<class Graph, typename distance_t = size_t>
+class ComponentCloser {
+private:
+ typedef typename Graph::VertexId VertexId;
+ typedef typename Graph::EdgeId EdgeId;
+
+ const Graph &graph_;
+ size_t edge_length_bound_;
+
+public:
+ ComponentCloser(const Graph &graph, size_t edge_length_bound)
+ : graph_(graph),
+ edge_length_bound_(edge_length_bound) {
+ }
+
+ void CloseComponent(set<VertexId> &component) const {
+ set<VertexId> additional_vertices;
+ for (auto it = component.begin(); it != component.end(); ++it) {
+ for (EdgeId e : graph_.OutgoingEdges(*it)) {
+ if (graph_.length(e) >= edge_length_bound_) {
+ additional_vertices.insert(graph_.EdgeEnd(e));
+ }
+ }
+ for (EdgeId e : graph_.IncomingEdges(*it)) {
+ if (graph_.length(e) >= edge_length_bound_) {
+ additional_vertices.insert(graph_.EdgeStart(e));
+ }
+ }
+ }
+ component.insert(additional_vertices.begin(),
+ additional_vertices.end());
+ }
+
+ GraphComponent<Graph> CloseComponent(const GraphComponent<Graph>& component) const {
+ set<VertexId> vertices(component.v_begin(), component.v_end());
+ CloseComponent(vertices);
+ return GraphComponent<Graph>(graph_, vertices.begin(), vertices.end());
+ }
+};
+
+//Finds a neighbourhood of a set of vertices. Vertices connected by an edge longer than edge_length_bound_ (500 by default) are not considered adjacent.
+template<class Graph>
+class ReliableNeighbourhoodFinder : public AbstractNeighbourhoodFinder<Graph> {
+private:
+ typedef typename Graph::VertexId VertexId;
+ typedef typename Graph::EdgeId EdgeId;
+
+ set<VertexId> FindNeighbours(const set<VertexId> &s) {
+ set<VertexId> result(s.begin(), s.end());
+ for (VertexId v : result) {
+ for (EdgeId e : this->graph().IncidentEdges(v)) {
+ if(this->graph().length(e) <= edge_length_bound_) {
+ result.insert(this->graph().EdgeEnd(e));
+ result.insert(this->graph().EdgeStart(e));
+ }
+ }
+ }
+ return result;
+ }
+
+ set<VertexId> FindNeighbours(const set<VertexId> &s, size_t eps) {
+ set<VertexId> result = s;
+ for(size_t i = 0; i < eps; i++) {
+ result = FindNeighbours(result);
+ }
+ return result;
+ }
+
+ set<VertexId> FindBorder(const GraphComponent<Graph> &component) {
+ set<VertexId> result;
+ for(auto it = component.vertices().begin(); it != component.vertices().end(); ++it) {
+ if(component.IsBorder(*it)) {
+ result.insert(*it);
+ }
+ }
+ return result;
+ }
+
+public:
+ static const size_t DEFAULT_EDGE_LENGTH_BOUND = 500;
+ static const size_t DEFAULT_MAX_SIZE = 100;
+
+ const size_t edge_length_bound_;
+ const size_t max_size_;
+
+ ReliableNeighbourhoodFinder(const Graph &graph, size_t edge_length_bound =
+ DEFAULT_EDGE_LENGTH_BOUND,
+ size_t max_size = DEFAULT_MAX_SIZE)
+ : AbstractNeighbourhoodFinder<Graph>(graph),
+ edge_length_bound_(edge_length_bound),
+ max_size_(max_size) {
+ }
+
+ GraphComponent<Graph> Find(typename Graph::VertexId v) {
+ auto cd = DijkstraHelper<Graph>::CreateCountingDijkstra(this->graph(), max_size_,
+ edge_length_bound_);
+ cd.Run(v);
+ vector<VertexId> result_vector = cd.ReachedVertices();
+ set<VertexId> result(result_vector.begin(), result_vector.end());
+ ComponentCloser<Graph> cc(this->graph(), edge_length_bound_);
+ cc.CloseComponent(result);
+ return GraphComponent<Graph>(this->graph(), result.begin(),
+ result.end());
+ }
+
+ vector<VertexId> InnerVertices(const GraphComponent<Graph> &component) {
+ set<VertexId> border = FindNeighbours(FindBorder(component), 2);
+ std::vector<VertexId> result;
+ std::set_difference(component.vertices().begin(), component.vertices().end(),
+ border.begin(), border.end(), std::inserter(result, result.end()));
+ return result;
+ }
+};
+
+template<class Graph>
+class PathNeighbourhoodFinder : public AbstractNeighbourhoodFinder<Graph> {
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+
+ VertexId OtherEnd(EdgeId e, VertexId v) const {
+ if (this->graph().EdgeStart(e) == v)
+ return this->graph().EdgeEnd(e);
+ else
+ return this->graph().EdgeStart(e);
+ }
+
+ bool Go(VertexId v, size_t curr_depth, set<VertexId>& grey, set<VertexId>& black) const {
+ //allows a single vertex to be visited multiple times with different depth values
+ TRACE("Came to vertex " << this->graph().str(v) << " on depth " << curr_depth);
+ if (curr_depth >= max_depth_) {
+ TRACE("Too deep");
+ return true;
+ }
+ if (grey.size() >= max_size_) {
+ TRACE("Too many vertices");
+ return false;
+ }
+
+ TRACE("Started processing of vertex " << this->graph().str(v));
+ grey.insert(v);
+
+ TRACE("Sorting incident edges");
+ vector<EdgeId> incident_path;
+ vector<EdgeId> incident_non_path;
+ for (EdgeId e : this->graph().IncidentEdges(v)) {
+ if (path_edges_.count(e) != 0) {
+ /*condition not to go backward*/
+ if (this->graph().EdgeStart(e) == v) {
+ incident_path.push_back(e);
+ }
+ } else {
+ incident_non_path.push_back(e);
+ }
+ }
+
+ for (EdgeId e : incident_non_path) {
+ if (this->graph().length(e) > edge_length_bound_) {
+ TRACE("Edge " << this->graph().str(e) << " is too long");
+ continue;
+ }
+ TRACE("Going along edge " << this->graph().str(e));
+ if (!Go(OtherEnd(e, v), curr_depth + 1, grey, black))
+ return false;
+ }
+
+ TRACE("End processing of vertex " << this->graph().str(v));
+ black.insert(v);
+
+ for (EdgeId e : incident_path) {
+ if (grey.count(OtherEnd(e, v)) != 0)
+ continue;
+ TRACE("Going along next path edge " << this->graph().str(e));
+ if (!Go(OtherEnd(e, v), 0, grey, black))
+ return false;
+ }
+
+ return true;
+ }
+
+public:
+ static const size_t DEFAULT_EDGE_LENGTH_BOUND = 500;
+ static const size_t DEFAULT_MAX_DEPTH = 2;
+ static const size_t DEFAULT_MAX_SIZE = 20;
+
+ set<EdgeId> path_edges_;
+ const size_t edge_length_bound_;
+ const size_t max_size_;
+ const size_t max_depth_;
+
+ set<VertexId> last_inner_;
+
+ PathNeighbourhoodFinder(const Graph &graph, const vector<EdgeId>& path, size_t edge_length_bound = DEFAULT_EDGE_LENGTH_BOUND,
+ size_t max_size = DEFAULT_MAX_SIZE, size_t max_depth = DEFAULT_MAX_DEPTH)
+ : AbstractNeighbourhoodFinder<Graph>(graph),
+ path_edges_(path.begin(), path.end()),
+ edge_length_bound_(edge_length_bound),
+ max_size_(max_size),
+ max_depth_(max_depth) {
+ }
+
+
+ GraphComponent<Graph> Find(VertexId v) {
+ TRACE("Starting from vertex " << this->graph().str(v));
+ last_inner_.clear();
+ set<VertexId> grey;
+ set<VertexId> black;
+ Go(v, 0, grey, black);
+ last_inner_ = black;
+ last_inner_.insert(v);
+ ComponentCloser<Graph>(this->graph(), 0).CloseComponent(grey);
+ return GraphComponent<Graph>(this->graph(), grey.begin(), grey.end());
+ }
+
+ vector<VertexId> InnerVertices(const GraphComponent<Graph> &/*component*/) {
+ return vector<VertexId>(last_inner_.begin(), last_inner_.end());
+ }
+private:
+ DECL_LOGGER("PathNeighbourhoodFinder");
+};
+
+//todo consider deleting this class and decide whether the hierarchy is really needed
+template<class Graph>
+class ShortEdgeComponentFinder : public AbstractNeighbourhoodFinder<Graph> {
+private:
+ typedef typename Graph::VertexId VertexId;
+ typedef typename Graph::EdgeId EdgeId;
+public:
+ static const size_t DEFAULT_EDGE_LENGTH_BOUND = 100;
+
+ const size_t edge_length_bound_;
+
+ ShortEdgeComponentFinder(const Graph &graph, size_t edge_length_bound = DEFAULT_EDGE_LENGTH_BOUND)
+ : AbstractNeighbourhoodFinder<Graph>(graph),
+ edge_length_bound_(edge_length_bound) {
+ }
+
+ GraphComponent<Graph> Find(VertexId v) {
+ auto cd = DijkstraHelper<Graph>::CreateShortEdgeDijkstra(this->graph(), edge_length_bound_);
+ cd.Run(v);
+ set<VertexId> result = cd.ProcessedVertices();
+ return GraphComponent<Graph>(this->graph(), result.begin(),
+ result.end());
+ }
+
+ vector<VertexId> InnerVertices(const GraphComponent<Graph> &component) {
+ return vector<VertexId>(component.v_begin(), component.v_end());
+ }
+};
+
+template<class Graph>
+class FilteringSplitterWrapper : public GraphSplitter<Graph> {
+private:
+ typedef typename Graph::VertexId VertexId;
+ typedef typename Graph::EdgeId EdgeId;
+
+ shared_ptr<GraphSplitter<Graph>> inner_splitter_;
+ shared_ptr<GraphComponentFilter<Graph>> checker_;
+ boost::optional<GraphComponent<Graph>> next_;
+public:
+ FilteringSplitterWrapper(
+ shared_ptr<GraphSplitter<Graph>> inner_splitter,
+ shared_ptr<GraphComponentFilter<Graph>> checker)
+ : GraphSplitter<Graph>(inner_splitter->graph()), inner_splitter_(inner_splitter),
+ checker_(checker) {
+ }
+
+ GraphComponent<Graph> Next() {
+ if (!HasNext()) {
+ VERIFY(false);
+ return omnigraph::GraphComponent<Graph>(this->graph());
+ }
+ GraphComponent<Graph> result = next_.get();
+ next_ = boost::optional<GraphComponent<Graph>>();
+ return result;
+ }
+
+ bool HasNext() {
+ while (!next_ && inner_splitter_->HasNext()) {
+ GraphComponent<Graph> ne = inner_splitter_->Next();
+ if (checker_->Check(ne)) {
+ next_ = ne;
+ }
+ }
+ return next_;
+ }
+private:
+ DECL_LOGGER("FilteringSplitterWrapper");
+};
+
+//TODO split combined component into several.
+template<class Graph>
+class CollectingSplitterWrapper : public GraphSplitter<Graph> {
+private:
+ typedef typename Graph::VertexId VertexId;
+ typedef typename Graph::EdgeId EdgeId;
+
+ shared_ptr<GraphSplitter<Graph>> inner_splitter_;
+ shared_ptr<GraphComponentFilter<Graph>> checker_;
+ boost::optional<GraphComponent<Graph>> next_;
+ set<VertexId> filtered_;
+public:
+ CollectingSplitterWrapper(
+ shared_ptr<GraphSplitter<Graph>> inner_splitter,
+ shared_ptr<GraphComponentFilter<Graph>> checker)
+ : GraphSplitter<Graph>(inner_splitter->graph()), inner_splitter_(inner_splitter),
+ checker_(checker) {
+ }
+
+ GraphComponent<Graph> Next() {
+ if (!HasNext()) {
+ VERIFY(false);
+ return omnigraph::GraphComponent<Graph>(this->graph());
+ } else {
+ if(next_) {
+ GraphComponent<Graph> result = next_.get();
+ next_ = boost::optional<GraphComponent<Graph>>();
+ return result;
+ } else {
+ GraphComponent<Graph> result(this->graph(), filtered_.begin(), filtered_.end(), false, "filtered");
+ filtered_.clear();
+ return result;
+ }
+ }
+ }
+
+ bool HasNext() {
+ while (!next_ && inner_splitter_->HasNext()) {
+ GraphComponent<Graph> ne = inner_splitter_->Next();
+ if (checker_->Check(ne)) {
+ next_ = ne;
+ } else {
+ filtered_.insert(ne.v_begin(), ne.v_end());
+ }
+ }
+ return next_ || !filtered_.empty();
+ }
+private:
+ DECL_LOGGER("FilteringSplitterWrapper");
+};
+
+template<class Graph>
+class CondensingSplitterWrapper : public GraphSplitter<Graph> {
+private:
+ typedef typename Graph::VertexId VertexId;
+ typedef typename Graph::EdgeId EdgeId;
+
+ shared_ptr<GraphSplitter<Graph>> inner_splitter_;
+ shared_ptr<GraphComponentFilter<Graph>> checker_;
+ boost::optional<GraphComponent<Graph>> next_;
+
+ string CutName(const string &name, size_t max_length) {
+ VERIFY(max_length >= 7);
+ size_t length = name.size();
+ if (length <= max_length)
+ return name;
+ else {
+ return name.substr(0, (max_length - 5) / 2) + "....." + name.substr(length - (max_length - 5) / 2, (max_length - 5) / 2);
+ }
+ }
+
+ GraphComponent<Graph> ConstructComponent() {
+ GraphComponent<Graph> next = inner_splitter_->Next();
+ if (checker_->Check(next)) {
+ return next;
+ }
+ set<VertexId> vertices(next.v_begin(), next.v_end());
+ string name = next.name();
+ for(size_t i = 0; i < 10 && inner_splitter_->HasNext(); i++) {
+ next = inner_splitter_->Next();
+ if (checker_->Check(next)) {
+ next_ = next;
+ break;
+ } else {
+ vertices.insert(next.v_begin(), next.v_end());
+ if (next.name() != "") {
+ name += ";";
+ name += next.name();
+ }
+ }
+ }
+ return GraphComponent<Graph>(this->graph(), vertices.begin(), vertices.end(), CutName(name, 60));
+ }
+
+public:
+ CondensingSplitterWrapper(
+ shared_ptr<GraphSplitter<Graph>> inner_splitter,
+ shared_ptr<GraphComponentFilter<Graph>> checker)
+ : GraphSplitter<Graph>(inner_splitter->graph()), inner_splitter_(inner_splitter),
+ checker_(checker) {
+ }
+
+ GraphComponent<Graph> Next() {
+ if (!HasNext()) {
+ VERIFY(false);
+ return omnigraph::GraphComponent<Graph>(this->graph());
+ }
+ if(next_) {
+ GraphComponent<Graph> result = next_.get();
+ next_ = boost::optional<GraphComponent<Graph>>();
+ return result;
+ } else {
+ return ConstructComponent();
+ }
+ }
+
+ bool HasNext() {
+ return next_ || inner_splitter_->HasNext();
+ }
+private:
+ DECL_LOGGER("FilteringSplitterWrapper");
+};
+
+template<class Graph>
+class NeighbourhoodFindingSplitter : public GraphSplitter<Graph> {
+private:
+ typedef typename Graph::VertexId VertexId;
+ typedef typename Graph::EdgeId EdgeId;
+ shared_ptr<RelaxingIterator<VertexId>> inner_iterator_;
+ shared_ptr<AbstractNeighbourhoodFinder<Graph>> neighbourhood_finder_;
+
+public:
+ NeighbourhoodFindingSplitter(
+ const Graph& graph,
+ shared_ptr<RelaxingIterator<VertexId>> inner_iterator,
+ shared_ptr<AbstractNeighbourhoodFinder<Graph>> neighbourhood_finder)
+ : GraphSplitter<Graph>(graph),
+ inner_iterator_(inner_iterator),
+ neighbourhood_finder_(neighbourhood_finder) {
+ }
+
+ NeighbourhoodFindingSplitter(
+ const Graph& graph,
+ shared_ptr<RelaxingIterator<VertexId>> inner_iterator)
+ : GraphSplitter<Graph>(graph),
+ inner_iterator_(inner_iterator),
+ neighbourhood_finder_(
+ make_shared<ReliableNeighbourhoodFinder<Graph>>(graph)) {
+ }
+
+ NeighbourhoodFindingSplitter(const Graph& graph)
+ : GraphSplitter<Graph>(graph),
+ inner_iterator_(
+ make_shared<CollectionIterator<set<VertexId>>>(graph.begin(), graph.end())),
+ neighbourhood_finder_(make_shared<ReliableNeighbourhoodFinder<Graph>>(graph)) {
+ }
+
+ GraphComponent<Graph> Next() {
+ VertexId next_vertex = inner_iterator_->Next();
+ GraphComponent<Graph> result = neighbourhood_finder_->Find(next_vertex);
+ vector<VertexId> to_relax = neighbourhood_finder_->InnerVertices(result);
+ to_relax.push_back(next_vertex);
+ inner_iterator_->Relax(to_relax);
+ return result;
+ }
+
+ bool HasNext() {
+ return inner_iterator_->HasNext();
+ }
+};
+
+template<class Graph>
+shared_ptr<GraphSplitter<Graph>> ReliableSplitter(const Graph &graph,
+ size_t edge_length_bound = ReliableNeighbourhoodFinder<Graph>::DEFAULT_EDGE_LENGTH_BOUND,
+ size_t max_size = ReliableNeighbourhoodFinder<Graph>::DEFAULT_MAX_SIZE) {
+ typedef typename Graph::VertexId VertexId;
+ shared_ptr<RelaxingIterator<VertexId>> inner_iterator = make_shared<CollectionIterator<set<VertexId>>>(graph.begin(), graph.end());
+ shared_ptr<AbstractNeighbourhoodFinder<Graph>> nf = make_shared<ReliableNeighbourhoodFinder<Graph>>(graph, edge_length_bound, max_size);
+ return make_shared<NeighbourhoodFindingSplitter<Graph>>(graph,
+ inner_iterator, nf);
+}
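+
+//Minimal usage sketch for the splitter factories defined here (illustrative only; assumes a
+//fully constructed Graph `g`, and that components are processed by caller-specific code):
+//
+// auto splitter = ReliableSplitter(g); // default bounds: edge length 500, component size 100
+// while (splitter->HasNext()) {
+// GraphComponent<Graph> component = splitter->Next();
+// // ... draw or analyse `component` here
+// }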
+
+template<class Graph>
+shared_ptr<GraphSplitter<Graph>> ConnectedSplitter(const Graph &graph,
+ size_t edge_length_bound = 1000000,
+ size_t max_size = 1000000) {
+ typedef typename Graph::VertexId VertexId;
+ shared_ptr<RelaxingIterator<VertexId>> inner_iterator = make_shared<CollectionIterator<set<VertexId>>>(graph.begin(), graph.end());
+ shared_ptr<AbstractNeighbourhoodFinder<Graph>> nf = make_shared<ReliableNeighbourhoodFinder<Graph>>(graph, edge_length_bound, max_size);
+ return make_shared<NeighbourhoodFindingSplitter<Graph>>(graph,
+ inner_iterator, nf);
+}
+
+template<class Graph>
+shared_ptr<GraphSplitter<Graph>> ReliableSplitterAlongPath(
+ const Graph &graph, const vector<typename Graph::EdgeId>& path, size_t edge_length_bound = PathNeighbourhoodFinder<Graph>::DEFAULT_EDGE_LENGTH_BOUND,
+ size_t max_size = PathNeighbourhoodFinder<Graph>::DEFAULT_MAX_SIZE,
+ size_t max_depth = PathNeighbourhoodFinder<Graph>::DEFAULT_MAX_DEPTH) {
+ typedef typename Graph::VertexId VertexId;
+ shared_ptr<RelaxingIterator<VertexId>> inner_iterator = make_shared<
+ PathIterator<Graph>>(graph, path);
+ shared_ptr<AbstractNeighbourhoodFinder<Graph>> nf = make_shared<PathNeighbourhoodFinder<Graph>>(graph, path,
+ edge_length_bound, max_size, max_depth);
+
+ return make_shared<NeighbourhoodFindingSplitter<Graph>>(graph,
+ inner_iterator, nf);
+}
+
+template<class Graph>
+shared_ptr<GraphSplitter<Graph>> LongEdgesExclusiveSplitter(
+ const Graph &graph, size_t bound =
+ ReliableNeighbourhoodFinder<Graph>::DEFAULT_EDGE_LENGTH_BOUND) {
+ typedef typename Graph::VertexId VertexId;
+ shared_ptr<RelaxingIterator<VertexId>> inner_iterator = make_shared<
+ CollectionIterator<set<VertexId>>>(graph.begin(), graph.end());
+ shared_ptr<AbstractNeighbourhoodFinder<Graph>> nf = make_shared<
+ ShortEdgeComponentFinder<Graph>>(graph, bound);
+ return make_shared<NeighbourhoodFindingSplitter<Graph>>(graph,
+ inner_iterator, nf);
+}
+
+template<class Graph, typename Collection>
+shared_ptr<GraphSplitter<Graph>> StandardSplitter(
+ const Graph &graph, const Collection &collection, size_t max_size = ReliableNeighbourhoodFinder<Graph>::DEFAULT_MAX_SIZE,
+ size_t edge_length_bound = ReliableNeighbourhoodFinder<Graph>::DEFAULT_EDGE_LENGTH_BOUND) {
+ typedef typename Graph::VertexId VertexId;
+ shared_ptr<RelaxingIterator<VertexId>> inner_iterator = make_shared<CollectionIterator<Collection>>(collection);
+ shared_ptr<AbstractNeighbourhoodFinder<Graph>> nf = make_shared<
+ ReliableNeighbourhoodFinder<Graph>>(graph, edge_length_bound,
+ max_size);
+ return make_shared<NeighbourhoodFindingSplitter<Graph>>(graph, inner_iterator, nf);
+}
+
+template<class Graph, typename Collection>
+shared_ptr<GraphSplitter<Graph>> StandardSplitter(
+ const Graph &graph, shared_ptr<Collection> collection, size_t max_size = ReliableNeighbourhoodFinder<Graph>::DEFAULT_MAX_SIZE,
+ size_t edge_length_bound = ReliableNeighbourhoodFinder<Graph>::DEFAULT_EDGE_LENGTH_BOUND) {
+ typedef typename Graph::VertexId VertexId;
+ shared_ptr<RelaxingIterator<VertexId>> inner_iterator = make_shared<CollectionIterator<Collection>>(collection);
+ shared_ptr<AbstractNeighbourhoodFinder<Graph>> nf = make_shared<
+ ReliableNeighbourhoodFinder<Graph>>(graph, edge_length_bound,
+ max_size);
+ return make_shared<NeighbourhoodFindingSplitter<Graph>>(graph, inner_iterator, nf);
+}
+
+template<class Graph>
+shared_ptr<GraphSplitter<Graph>> WholeGraphSplitter(
+ const Graph &graph, size_t max_size,
+ size_t edge_length_bound) {
+ //iterate over all vertices of the graph; equivalent to ReliableSplitter with explicit bounds
+ return ReliableSplitter(graph, edge_length_bound, max_size);
+}
+
+template<class Graph>
+GraphComponent<Graph> VertexNeighborhood(
+ const Graph &graph, typename Graph::VertexId vertex, size_t max_size = ReliableNeighbourhoodFinder<Graph>::DEFAULT_MAX_SIZE,
+ size_t edge_length_bound = ReliableNeighbourhoodFinder<Graph>::DEFAULT_EDGE_LENGTH_BOUND) {
+ vector<typename Graph::VertexId> vv = {vertex};
+ shared_ptr<vector<typename Graph::VertexId>> sh_vv = make_shared<vector<typename Graph::VertexId>>(vv);
+ return StandardSplitter<Graph>(graph, sh_vv, max_size, edge_length_bound)->Next();
+}
+
+//TODO add a method that is guaranteed to draw a picture containing a given set of edges; maybe refactor this into drawing rather than splitting?
+template<class Graph>
+GraphComponent<Graph> EdgeNeighborhood(
+ const Graph &graph, typename Graph::EdgeId edge, size_t max_size = ReliableNeighbourhoodFinder<Graph>::DEFAULT_MAX_SIZE,
+ size_t edge_length_bound = ReliableNeighbourhoodFinder<Graph>::DEFAULT_EDGE_LENGTH_BOUND) {
+ vector<typename Graph::VertexId> vv = {graph.EdgeStart(edge)};
+ shared_ptr<vector<typename Graph::VertexId>> sh_vv = make_shared<vector<typename Graph::VertexId>>(vv);
+ return StandardSplitter<Graph>(graph, sh_vv, max_size, edge_length_bound)->Next();
+}
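+
+//Sketch of the one-shot helpers above (assumes `g` is a Graph and `e` one of its edges;
+//intended mainly for drawing a local picture around a single vertex or edge):
+//
+// GraphComponent<Graph> around_edge = EdgeNeighborhood(g, e);
+// GraphComponent<Graph> around_start = VertexNeighborhood(g, g.EdgeStart(e));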
+
+}
diff --git a/src/modules/assembly_graph/graph_alignment/edge_index.hpp b/src/modules/assembly_graph/graph_alignment/edge_index.hpp
new file mode 100644
index 0000000..72a9d25
--- /dev/null
+++ b/src/modules/assembly_graph/graph_alignment/edge_index.hpp
@@ -0,0 +1,112 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "dev_support/openmp_wrapper.h"
+
+#include "assembly_graph/graph_core/graph.hpp"
+#include "assembly_graph/graph_core/action_handlers.hpp"
+#include "dev_support/standard_base.hpp"
+#include "data_structures/indices/edge_index_builders.hpp"
+
+namespace debruijn_graph {
+
+/**
+ * EdgeIndex is a structure that stores information about the location of certain k-mers in the graph.
+ * It delegates all container operations to inner_index_ and all update handling to
+ * updater_ (an EdgeInfoUpdater).
+ * @see DeBruijnKMerIndex
+ */
+//fixme template params
+template<class Graph, class Seq /*= runtime_k::RtSeq*/,
+ class Index /*= KmerFreeEdgeIndex<Graph, Seq>*/>
+class EdgeIndex: public omnigraph::GraphActionHandler<Graph> {
+
+public:
+ typedef typename Graph::EdgeId EdgeId;
+ typedef Index InnerIndexT;
+ typedef Graph GraphT;
+ typedef typename Index::KMer KMer;
+ typedef typename Index::KMerIdx KMerIdx;
+ typedef typename Index::Value Value;
+
+private:
+ Index inner_index_;
+ EdgeInfoUpdater<Index, Graph> updater_;
+ bool delete_index_;
+
+public:
+
+ EdgeIndex(const Graph& g, const std::string &workdir)
+ : omnigraph::GraphActionHandler<Graph>(g, "EdgeIndex"),
+ inner_index_(g, workdir),
+ updater_(g, inner_index_),
+ delete_index_(true) {
+ }
+
+ virtual ~EdgeIndex() {
+ TRACE("~EdgeIndex OK")
+ }
+
+ Index &inner_index() {
+ return inner_index_;
+ }
+
+ size_t k() const {
+ return inner_index_.k();
+ }
+
+ const Index &inner_index() const {
+ VERIFY(this->IsAttached());
+ return inner_index_;
+ }
+
+ virtual void HandleAdd(EdgeId e) {
+ updater_.UpdateKmers(e);
+ }
+
+ virtual void HandleDelete(EdgeId e) {
+ updater_.DeleteKmers(e);
+ }
+
+ bool contains(const KMer& kmer) const {
+ VERIFY(this->IsAttached());
+ return inner_index_.contains(inner_index_.ConstructKWH(kmer));
+ }
+
+ const pair<EdgeId, size_t> get(const KMer& kmer) const {
+ VERIFY(this->IsAttached());
+ auto kwh = inner_index_.ConstructKWH(kmer);
+ if (!inner_index_.contains(kwh)) {
+ return make_pair(EdgeId(0), -1u);
+ } else {
+ EdgeInfo<EdgeId> entry = inner_index_.get_value(kwh);
+ return std::make_pair(entry.edge_id, (size_t)entry.offset);
+ }
+ }
+
+ void Refill() {
+ clear();
+ typedef typename EdgeIndexHelper<InnerIndexT>::GraphPositionFillingIndexBuilderT IndexBuilder;
+ //building the index from the graph also updates it
+ //todo pass appropriate 3rd arg
+ IndexBuilder().BuildIndexFromGraph(inner_index_, this->g());
+ INFO("Index refilled");
+ }
+
+ void Update() {
+ updater_.UpdateAll();
+ }
+
+ void clear() {
+ inner_index_.clear();
+ }
+
+};
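+
+/*
+ * Illustrative usage sketch (assumes a graph `g`, a working directory `workdir`, a suitable
+ * inner index type `InnerIndex` and a k-mer `kmer` of matching length; all of these names are
+ * placeholders, not definitions from this header):
+ *
+ * EdgeIndex<Graph, runtime_k::RtSeq, InnerIndex> index(g, workdir);
+ * index.Refill();
+ * if (index.contains(kmer)) {
+ * auto pos = index.get(kmer); // pair of (EdgeId, offset of the k-mer on that edge)
+ * }
+ */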
+}
diff --git a/src/modules/assembly_graph/graph_alignment/kmer_map.hpp b/src/modules/assembly_graph/graph_alignment/kmer_map.hpp
new file mode 100644
index 0000000..e2d0f12
--- /dev/null
+++ b/src/modules/assembly_graph/graph_alignment/kmer_map.hpp
@@ -0,0 +1,151 @@
+//***************************************************************************
+//* Copyright (c) 2016 Saint Petersburg State University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#ifndef __KMER_MAP_HPP__
+#define __KMER_MAP_HPP__
+
+#include "data_structures/sequence/runtime_k.hpp"
+
+#include <htrie/hat-trie.h>
+#include <boost/iterator/iterator_facade.hpp>
+
+namespace debruijn_graph {
+class KMerMap {
+ typedef runtime_k::RtSeq Kmer;
+ typedef runtime_k::RtSeq Seq;
+ typedef typename Seq::DataType RawSeqData;
+
+ value_t* internal_tryget(const Kmer &key) const {
+ return hattrie_tryget(mapping_, (const char *)key.data(), rawcnt_ * sizeof(RawSeqData));
+ }
+
+ value_t* internal_get(const Kmer &key) const {
+ return hattrie_get(mapping_, (const char *)key.data(), rawcnt_ * sizeof(RawSeqData));
+ }
+
+ int internal_erase(const Kmer &key) {
+ return hattrie_del(mapping_, (const char *)key.data(), rawcnt_ * sizeof(RawSeqData));
+ }
+
+ class iterator : public boost::iterator_facade<iterator,
+ const std::pair<Kmer, Seq>,
+ std::forward_iterator_tag,
+ const std::pair<Kmer, Seq>> {
+ public:
+ iterator(unsigned k, hattrie_iter_t *start = nullptr)
+ : k_(k), iter_(start, [](hattrie_iter_t *p) { hattrie_iter_free(p); }) {}
+
+ private:
+ friend class boost::iterator_core_access;
+
+ void increment() {
+ hattrie_iter_next(iter_.get());
+ }
+
+ bool equal(const iterator &other) const {
+ // Special case: NULL and finished are equal
+ if (iter_.get() == nullptr || hattrie_iter_finished(iter_.get()))
+ return other.iter_.get() == nullptr || hattrie_iter_finished(other.iter_.get());
+
+ if (other.iter_.get() == nullptr)
+ return false;
+
+ return hattrie_iter_equal(iter_.get(), other.iter_.get());
+ }
+
+ const std::pair<Kmer, Seq> dereference() const {
+ size_t len;
+ Kmer k(k_, (const RawSeqData*)hattrie_iter_key(iter_.get(), &len));
+ Seq s(k_, (const RawSeqData*)(*hattrie_iter_val(iter_.get())));
+ return std::make_pair(k, s);
+ }
+
+ unsigned k_;
+ std::shared_ptr<hattrie_iter_t> iter_;
+ };
+
+ public:
+ KMerMap(unsigned k)
+ : k_(k), mapping_(hattrie_create()) {
+ rawcnt_ = (unsigned)Seq::GetDataSize(k_);
+ }
+
+ ~KMerMap() {
+ clear();
+ hattrie_free(mapping_);
+ }
+
+ void erase(const Kmer &key) {
+ value_t *vp = internal_tryget(key);
+ if (vp == nullptr)
+ return;
+
+ RawSeqData *value = reinterpret_cast<RawSeqData*>(*vp);
+ delete[] value;
+ int res = internal_erase(key);
+ VERIFY_MSG(res == 0, "Failed to delete from kmer mapper");
+ }
+
+ void set(const Kmer &key, const Seq &value) {
+ value_t *vp = internal_tryget(key);
+ RawSeqData *rawvalue = nullptr;
+ if (vp == nullptr) {
+ vp = internal_get(key);
+ rawvalue = new RawSeqData[rawcnt_];
+ *vp = reinterpret_cast<uintptr_t>(rawvalue);
+ } else {
+ rawvalue = reinterpret_cast<RawSeqData*>(*vp);
+ }
+
+ memcpy(rawvalue, value.data(), rawcnt_ * sizeof(RawSeqData));
+ }
+
+ bool count(const Kmer &key) const {
+ return internal_tryget(key) != nullptr;
+ }
+
+ const RawSeqData *find(const Kmer &key) const {
+ value_t *vp = internal_tryget(key);
+ if (vp == nullptr)
+ return nullptr;
+
+ return reinterpret_cast<const RawSeqData*>(*vp);
+ }
+
+ void clear() {
+ // Delete all the values
+ auto *iter = hattrie_iter_begin(mapping_, false);
+ while (!hattrie_iter_finished(iter)) {
+ RawSeqData *value = (RawSeqData*)(*hattrie_iter_val(iter));
+ delete[] value;
+ hattrie_iter_next(iter);
+ }
+ hattrie_iter_free(iter);
+ // Delete the mapping and all the keys
+ hattrie_clear(mapping_);
+ }
+
+ size_t size() const {
+ return hattrie_size(mapping_);
+ }
+
+ iterator begin() const {
+ return iterator(k_, hattrie_iter_begin(mapping_, false));
+ }
+
+ iterator end() const {
+ return iterator(k_);
+ }
+
+ private:
+ unsigned k_;
+ unsigned rawcnt_;
+ hattrie_t *mapping_;
+};
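+
+/*
+ * Illustrative usage sketch: KMerMap behaves as a k-mer -> sequence map backed by the HAT-trie.
+ * `k`, `key` and `value` below are assumed to be prepared by the caller.
+ *
+ * KMerMap m(k);
+ * m.set(key, value);
+ * if (m.count(key)) {
+ * runtime_k::RtSeq stored(k, m.find(key)); // rebuild the stored sequence from raw data
+ * }
+ * for (const auto &kv : m) { ... } // iterates (k-mer, sequence) pairs
+ * m.clear();
+ */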
+
+}
+
+#endif // __KMER_MAP_HPP__
diff --git a/src/modules/assembly_graph/graph_alignment/kmer_mapper.hpp b/src/modules/assembly_graph/graph_alignment/kmer_mapper.hpp
new file mode 100644
index 0000000..f905d2d
--- /dev/null
+++ b/src/modules/assembly_graph/graph_alignment/kmer_mapper.hpp
@@ -0,0 +1,234 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "data_structures/sequence/sequence_tools.hpp"
+#include "data_structures/sequence/runtime_k.hpp"
+#include "utils/adt/kmer_vector.hpp"
+#include "edge_index.hpp"
+
+#include "kmer_map.hpp"
+
+#include <set>
+#include <cstdlib>
+
+namespace debruijn_graph {
+template<class Graph>
+class KmerMapper : public omnigraph::GraphActionHandler<Graph> {
+ typedef omnigraph::GraphActionHandler<Graph> base;
+ typedef typename Graph::EdgeId EdgeId;
+ typedef runtime_k::RtSeq Kmer;
+ typedef runtime_k::RtSeq Seq;
+ typedef typename Seq::DataType RawSeqData;
+
+ unsigned k_;
+ KMerMap mapping_;
+ bool verification_on_;
+ bool normalized_;
+
+ bool CheckAllDifferent(const Sequence &old_s, const Sequence &new_s) const {
+ std::set<Kmer> kmers;
+ Kmer kmer = old_s.start<Kmer>(k_) >> 0;
+ for (size_t i = k_ - 1; i < old_s.size(); ++i) {
+ kmer <<= old_s[i];
+ kmers.insert(kmer);
+ }
+ kmer = new_s.start<Kmer>(k_) >> 0;
+ for (size_t i = k_ - 1; i < new_s.size(); ++i) {
+ kmer <<= new_s[i];
+ kmers.insert(kmer);
+ }
+ return kmers.size() == old_s.size() - k_ + 1 + new_s.size() - k_ + 1;
+ }
+
+public:
+ KmerMapper(const Graph &g, bool verification_on = true) :
+ base(g, "KmerMapper"), k_(unsigned(g.k() + 1)), mapping_(k_), verification_on_(verification_on), normalized_(false) {
+ }
+
+ virtual ~KmerMapper() {}
+
+ unsigned get_k() const { return k_; }
+
+ auto begin() const -> decltype(mapping_.begin()) {
+ return mapping_.begin();
+ }
+
+ auto end() const -> decltype(mapping_.end()) {
+ return mapping_.end();
+ }
+
+ void Normalize() {
+ if (normalized_)
+ return;
+
+ KMerVector<Kmer> all(k_, size());
+ for (auto it = begin(); it != end(); ++it)
+ all.push_back(it->first);
+
+ for (auto it = all.begin(); it != all.end(); ++it) {
+ Seq val(k_, it.data());
+ Normalize(val);
+ }
+ normalized_ = true;
+ }
+
+ void Revert(const Kmer &kmer) {
+ Kmer old_value = Substitute(kmer);
+ if (old_value != kmer) {
+ mapping_.erase(kmer);
+ mapping_.set(old_value, kmer);
+ normalized_ = false;
+ }
+ }
+
+ void Normalize(const Kmer &kmer) {
+ mapping_.set(kmer, Substitute(kmer));
+ }
+
+ bool CheckCanRemap(const Sequence &old_s, const Sequence &new_s) const {
+ if (!CheckAllDifferent(old_s, new_s))
+ return false;
+
+ size_t old_length = old_s.size() - k_ + 1;
+ size_t new_length = new_s.size() - k_ + 1;
+ UniformPositionAligner aligner(old_s.size() - k_ + 1,
+ new_s.size() - k_ + 1);
+ Kmer old_kmer = old_s.start<Kmer>(k_);
+ old_kmer >>= 0;
+ for (size_t i = k_ - 1; i < old_s.size(); ++i) {
+ old_kmer <<= old_s[i];
+ size_t old_kmer_offset = i - k_ + 1;
+ size_t new_kmer_offset = aligner.GetPosition(old_kmer_offset);
+ if (old_kmer_offset * 2 + 1 == old_length && new_length % 2 == 0) {
+ Kmer middle(k_ - 1, new_s, new_length / 2);
+ if (typename Kmer::less2()(middle, !middle)) {
+ new_kmer_offset = new_length - 1 - new_kmer_offset;
+ }
+ }
+ Kmer new_kmer(k_, new_s, new_kmer_offset);
+ if (mapping_.count(new_kmer)) {
+ if (Substitute(new_kmer) != old_kmer) {
+ return false;
+ }
+ }
+ }
+ return true;
+ }
+
+ void RemapKmers(const Sequence &old_s, const Sequence &new_s) {
+ VERIFY(this->IsAttached());
+ size_t old_length = old_s.size() - k_ + 1;
+ size_t new_length = new_s.size() - k_ + 1;
+ UniformPositionAligner aligner(old_s.size() - k_ + 1,
+ new_s.size() - k_ + 1);
+ Kmer old_kmer = old_s.start<Kmer>(k_);
+
+ for (size_t i = k_ - 1; i < old_s.size(); ++i) {
+ // Instead of shifting right
+ if (i != k_ - 1) {
+ old_kmer <<= old_s[i];
+ }
+
+ size_t old_kmer_offset = i - k_ + 1;
+ size_t new_kmer_offset = aligner.GetPosition(old_kmer_offset);
+ if (old_kmer_offset * 2 + 1 == old_length && new_length % 2 == 0) {
+ Kmer middle(k_-1, new_s, new_length / 2);
+ if (typename Kmer::less2()(middle, !middle)) {
+ new_kmer_offset = new_length - 1 - new_kmer_offset;
+ }
+ }
+ Kmer new_kmer(k_, new_s, new_kmer_offset);
+ if (mapping_.count(new_kmer)) {
+ if (verification_on_)
+ VERIFY(Substitute(new_kmer) == old_kmer);
+ mapping_.erase(new_kmer);
+ }
+ if (old_kmer != new_kmer) {
+ mapping_.set(old_kmer, new_kmer);
+ normalized_ = false;
+ }
+ }
+ }
+
+ void HandleGlue(EdgeId new_edge, EdgeId edge1, EdgeId edge2) override {
+ VERIFY(this->g().EdgeNucls(new_edge) == this->g().EdgeNucls(edge2));
+ RemapKmers(this->g().EdgeNucls(edge1), this->g().EdgeNucls(edge2));
+ }
+
+ Kmer Substitute(const Kmer &kmer) const {
+ VERIFY(this->IsAttached());
+ Kmer answer = kmer;
+ const auto *rawval = mapping_.find(answer);
+ while (rawval != nullptr) {
+ Seq val(k_, rawval);
+ if (verification_on_)
+ VERIFY(answer != val);
+
+ answer = val;
+ rawval = mapping_.find(answer);
+ }
+ return answer;
+ }
+
+ void BinWrite(std::ostream &file) const {
+ uint32_t sz = (uint32_t)size();
+ file.write((const char *) &sz, sizeof(uint32_t));
+
+ for (auto iter = begin(); iter != end(); ++iter) {
+ Kmer::BinWrite(file, iter->first);
+ Kmer::BinWrite(file, iter->second);
+ }
+ }
+
+ void BinRead(std::istream &file) {
+ clear();
+
+ uint32_t size;
+ file.read((char *) &size, sizeof(uint32_t));
+ for (uint32_t i = 0; i < size; ++i) {
+ Kmer key(k_);
+ Seq value(k_);
+ Kmer::BinRead(file, &key);
+ Seq::BinRead(file, &value);
+ mapping_.set(key, value);
+ }
+ normalized_ = false;
+ }
+
+ bool CompareTo(KmerMapper<Graph> const &m) {
+ if (size() != m.size()) {
+ INFO("Unequal sizes");
+ return false;
+ }
+
+ for (auto iter = begin(); iter != end(); ++iter) {
+ const auto *cmp = m.mapping_.find(iter->first);
+ if (cmp == nullptr || Seq(k_, cmp) != iter->second) {
+ return false;
+ }
+ }
+ return true;
+ }
+
+ void clear() {
+ normalized_ = false;
+ mapping_.clear();
+ }
+
+ size_t size() const {
+ return mapping_.size();
+ }
+
+ // "turn on = true" means turning of all verifies
+ void SetUnsafeMode(bool turn_on) {
+ verification_on_ = !turn_on;
+ }
+};
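+
+/*
+ * Illustrative usage sketch (assumes an attached graph `g`, two edge sequences
+ * `old_nucls`/`new_nucls` that pass CheckCanRemap, and some k-mer `some_kmer`; Attach() is
+ * assumed to be provided by GraphActionHandler):
+ *
+ * KmerMapper<Graph> mapper(g);
+ * mapper.Attach();
+ * if (mapper.CheckCanRemap(old_nucls, new_nucls))
+ * mapper.RemapKmers(old_nucls, new_nucls);
+ * Kmer substituted = mapper.Substitute(some_kmer); // follows the mapping chain to its end
+ */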
+
+}
diff --git a/src/modules/assembly_graph/graph_alignment/kmer_mapper_logger.hpp b/src/modules/assembly_graph/graph_alignment/kmer_mapper_logger.hpp
new file mode 100644
index 0000000..bb9ebe2
--- /dev/null
+++ b/src/modules/assembly_graph/graph_alignment/kmer_mapper_logger.hpp
@@ -0,0 +1,45 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+/*
+ * kmer_mapper_logger.hpp
+ *
+ * Created on: Nov 27, 2012
+ * Author: alex
+ */
+
+#ifndef KMER_MAPPER_LOGGER_H_
+#define KMER_MAPPER_LOGGER_H_
+
+#include "data_structures/sequence/sequence.hpp"
+#include "assembly_graph/graph_core/action_handlers.hpp"
+#include "dev_support/standard_base.hpp"
+
+namespace debruijn {
+
+template<class Graph>
+class KmerMapperLogger : public omnigraph::GraphActionHandler<Graph> {
+public:
+ typedef pair<Sequence, Sequence> MappedSeq;
+ typedef typename Graph::EdgeId EdgeId;
+
+ KmerMapperLogger(Graph& graph) : GraphActionHandler<Graph>(graph, "KmerMapperLogger") {}
+ virtual ~KmerMapperLogger() {}
+
+ virtual void HandleGlue(EdgeId new_edge, EdgeId edge1, EdgeId edge2) {
+ log_.push_back(MappedSeq(this->g().EdgeNucls(edge1), this->g().EdgeNucls(edge2)));
+ }
+
+ const vector<MappedSeq>& log() const {
+ return log_;
+ }
+
+ vector<MappedSeq> log_;
+};
+
+} /* namespace debruijn */
+#endif /* KMER_MAPPER_LOGGER_H_ */
diff --git a/src/modules/assembly_graph/graph_alignment/long_read_mapper.hpp b/src/modules/assembly_graph/graph_alignment/long_read_mapper.hpp
new file mode 100644
index 0000000..654bc21
--- /dev/null
+++ b/src/modules/assembly_graph/graph_alignment/long_read_mapper.hpp
@@ -0,0 +1,190 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+/*
+ * long_read_mapper.hpp
+ *
+ * Created on: Jun 17, 2013
+ * Author: andrey
+ */
+
+#ifndef LONG_READ_MAPPER_HPP_
+#define LONG_READ_MAPPER_HPP_
+
+#include "assembly_graph/graph_alignment/long_read_storage.hpp"
+#include "assembly_graph/graph_alignment/sequence_mapper_notifier.hpp"
+
+namespace debruijn_graph {
+
+class AbstractLongReadMapper: public SequenceMapperListener {
+public:
+ AbstractLongReadMapper(conj_graph_pack& gp, PathStorage<conj_graph_pack::graph_t>& storage)
+ : gp_(gp), storage_(storage), path_finder_(gp_.g) {
+ }
+
+ void StartProcessLibrary(size_t threads_count) override {
+ for (size_t i = 0; i < threads_count; ++i)
+ buffer_storages_.emplace_back(gp_.g);
+ }
+
+ void StopProcessLibrary() override {
+ for (size_t i = 0; i < buffer_storages_.size(); ++i) {
+ MergeBuffer(i);
+ }
+ buffer_storages_.clear();
+ }
+
+ void MergeBuffer(size_t thread_index) override {
+ DEBUG("Merge buffer " << thread_index << " with size " << buffer_storages_[thread_index].size());
+ storage_.AddStorage(buffer_storages_[thread_index]);
+ buffer_storages_[thread_index].Clear();
+ DEBUG("Now size " << storage_.size());
+ }
+
+ void ProcessPairedRead(size_t ,
+ const io::PairedReadSeq&,
+ const MappingPath<EdgeId>& ,
+ const MappingPath<EdgeId>&) override {
+ //nothing to do
+ }
+
+ void ProcessPairedRead(size_t ,
+ const io::PairedRead&,
+ const MappingPath<EdgeId>& ,
+ const MappingPath<EdgeId>&) override {
+ //nothing to do
+ }
+
+ void ProcessSingleRead(size_t thread_index,
+ const io::SingleRead&,
+ const MappingPath<EdgeId>& read) override {
+ ProcessSingleRead(thread_index, read);
+ }
+
+ void ProcessSingleRead(size_t thread_index,
+ const io::SingleReadSeq&,
+ const MappingPath<EdgeId>& read) override {
+ ProcessSingleRead(thread_index, read);
+ }
+
+ PathStorage<conj_graph_pack::graph_t>& GetPaths() {
+ return storage_;
+ }
+
+private:
+
+ virtual void ProcessSingleRead(size_t thread_index, const MappingPath<EdgeId>& read) = 0;
+
+protected:
+ conj_graph_pack& gp_;
+ PathStorage<conj_graph_pack::graph_t>& storage_;
+ ReadPathFinder<conj_graph_pack::graph_t> path_finder_;
+ std::vector<PathStorage<conj_graph_pack::graph_t> > buffer_storages_;
+
+};
+
+class SimpleLongReadMapper: public AbstractLongReadMapper {
+public:
+ SimpleLongReadMapper(conj_graph_pack& gp, PathStorage<conj_graph_pack::graph_t>& storage)
+ : AbstractLongReadMapper(gp, storage) {
+ }
+
+private:
+
+ void ProcessSingleRead(size_t thread_index, const MappingPath<EdgeId>& read) override {
+ vector<EdgeId> path = path_finder_.FindReadPath(read);
+ buffer_storages_[thread_index].AddPath(path, 1, false);
+ }
+};
+
+class GappedLongReadMapper : public AbstractLongReadMapper {
+private:
+ typedef MappingPathFixer<Graph> GraphMappingPathFixer;
+ const GraphMappingPathFixer path_fixer_;
+ const double MIN_MAPPED_RATIO = 0.3;
+ const size_t MIN_MAPPED_LENGTH = 100;
+public:
+ GappedLongReadMapper(conj_graph_pack& gp, PathStorage<conj_graph_pack::graph_t>& storage)
+ : AbstractLongReadMapper(gp, storage), path_fixer_(gp.g) {
+ }
+
+private:
+
+ size_t CountMappedEdgeSize(EdgeId edge, const MappingPath<EdgeId>& mapping_path, size_t& mapping_index) const {
+ while(mapping_path[mapping_index].first != edge) {
+ mapping_index++;
+ }
+ size_t start_idx = mapping_index;
+
+ while(mapping_path[mapping_index].first == edge) {
+ mapping_index++;
+ if(mapping_index >= mapping_path.size()) {
+ break;
+ }
+ }
+ size_t end_idx = mapping_index;
+ size_t total_len = 0;
+ for(size_t i = start_idx; i < end_idx; ++i) {
+ total_len += mapping_path[i].second.initial_range.size();
+ }
+
+ return total_len;
+ }
+
+ vector<EdgeId> FilterBadMappings(const vector<EdgeId>& corrected_path, const MappingPath<EdgeId>& mapping_path) const {
+ vector<EdgeId> new_corrected_path;
+ size_t mapping_index = 0;
+ for (auto edge : corrected_path) {
+ size_t mapping_size = CountMappedEdgeSize(edge, mapping_path, mapping_index);
+ size_t edge_len = gp_.g.length(edge);
+ //VERIFY(edge_len >= mapping_size);
+ if (mapping_size > MIN_MAPPED_LENGTH ||
+ math::gr((double) mapping_size / (double) edge_len, MIN_MAPPED_RATIO)) {
+ new_corrected_path.push_back(edge);
+ }
+ }
+ return new_corrected_path;
+ }
+
+
+ void ProcessSingleRead(size_t thread_index, const MappingPath<EdgeId>& read) override {
+ vector<EdgeId> corrected_path = path_fixer_.DeleteSameEdges(
+ read.simple_path());
+ corrected_path = FilterBadMappings(corrected_path, read);
+ vector<vector<EdgeId>> paths = FindReadPathWithGaps(read, corrected_path);
+ for(auto path : paths) {
+ buffer_storages_[thread_index].AddPath(path, 1, false);
+ }
+ }
+
+ vector<vector<EdgeId>> FindReadPathWithGaps(const MappingPath<EdgeId>& mapping_path, vector<EdgeId>& corrected_path) const {
+ if (mapping_path.size() == 0) {
+ TRACE("read unmapped");
+ return vector<vector<EdgeId>>();
+ }
+ vector<EdgeId> fixed_path = path_fixer_.TryFixPath(corrected_path);
+ return SplitUnfixedPoints(fixed_path);
+ }
+
+ vector<vector<EdgeId>> SplitUnfixedPoints(vector<EdgeId>& path) const {
+ vector<vector<EdgeId>> result;
+ size_t prev_start = 0;
+ for (size_t i = 1; i < path.size(); ++i) {
+ if (gp_.g.EdgeEnd(path[i - 1]) != gp_.g.EdgeStart(path[i])) {
+ result.push_back(vector<EdgeId>(path.begin() + prev_start, path.begin() + i));
+ prev_start = i;
+ }
+ }
+ result.push_back(vector<EdgeId>(path.begin() + prev_start, path.end()));
+ return result;
+ }
+};
+
+
+}/*longreads*/
+
+#endif /* LONG_READ_MAPPER_HPP_ */
diff --git a/src/modules/assembly_graph/graph_alignment/long_read_storage.hpp b/src/modules/assembly_graph/graph_alignment/long_read_storage.hpp
new file mode 100644
index 0000000..44bf89e
--- /dev/null
+++ b/src/modules/assembly_graph/graph_alignment/long_read_storage.hpp
@@ -0,0 +1,376 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+/*
+ * long_edge_storage.hpp
+ *
+ * Created on: Feb 7, 2013
+ * Author: lab42
+ */
+
+#pragma once
+
+#include <algorithm>
+#include <cstdio>
+#include <fstream>
+#include <map>
+#include <set>
+#include <sstream>
+#include <string>
+#include <vector>
+
+namespace debruijn_graph {
+
+template<class Graph>
+class PathInfo {
+public:
+ typedef typename Graph::EdgeId EdgeId;
+ vector<EdgeId> path;
+
+private:
+ mutable size_t w;
+
+public:
+ vector<EdgeId> getPath() const {
+ return path;
+ }
+
+ size_t getWeight() const {
+ return w;
+ }
+
+ void increaseWeight(int addition = 1) const {
+ w += addition;
+ }
+
+ bool operator<(const PathInfo<Graph> &other) const {
+ return path < other.path;
+ }
+
+ PathInfo(const vector<EdgeId> &p, size_t weight = 0) :
+ path(p), w(weight) {
+ }
+ PathInfo(const PathInfo<Graph> &other) {
+ path = other.path;
+ w = other.w;
+ }
+
+ string str(Graph &g_) {
+ stringstream s;
+ for(auto iter = path.begin(); iter != path.end(); iter ++ ){
+ s << g_.int_id(*iter) << " ";
+ }
+ return s.str();
+ }
+
+};
+
+template<class Graph>
+class PathStorage {
+ friend class PathInfo<Graph> ;
+ typedef typename Graph::EdgeId EdgeId;
+ typedef map<EdgeId, set<PathInfo<Graph> > > InnerIndex;
+private:
+ Graph &g_;
+ InnerIndex inner_index_;
+ const size_t kLongEdgeForStats = 500;
+
+ void HiddenAddPath(const vector<EdgeId> &p, int w){
+ if (p.size() == 0 ) return;
+ for (typename set<PathInfo<Graph> >::iterator iter = inner_index_[p[0]].begin(); iter != inner_index_[p[0]].end(); ++iter) {
+
+ if (iter->path == p) {
+ iter->increaseWeight(w);
+ return;
+ }
+ }
+ inner_index_[p[0]].insert(PathInfo<Graph>(p, w));
+ size_++;
+ }
+
+public:
+
+ PathStorage(Graph &g)
+ : g_(g),
+ inner_index_(),
+ size_(0) {
+ }
+ PathStorage(const PathStorage & p)
+ : g_(p.g_),
+ inner_index_(),
+ size_(0) {
+ for (auto iter = p.inner_index_.begin(); iter != p.inner_index_.end();
+ iter++) {
+ for (auto j_iter = iter->second.begin();
+ j_iter != iter->second.end(); j_iter++) {
+ this->AddPath(j_iter->path, (int) j_iter->getWeight());
+ }
+ }
+ }
+ void ReplaceEdges(map<EdgeId, EdgeId> &old_to_new){
+ map<int, EdgeId> tmp_map;
+// for (auto iter = g_.SmartEdgeBegin(); !iter.IsEnd(); ++iter ){
+// tmp_map[g_.int_id(*iter)] = *iter;
+// }
+ InnerIndex new_index;
+ for (auto iter = inner_index_.begin(); iter != inner_index_.end(); iter++) {
+ auto tmp = iter->second;
+ EdgeId new_first;
+ if (old_to_new.find(iter->first) == old_to_new.end())
+ new_first = iter->first;
+ else {
+ DEBUG("new first edge: "<< g_.int_id(old_to_new[iter->first]) << " with " << tmp.size() << " edges ");
+ new_first = old_to_new[iter->first];
+ }
+ set<PathInfo<Graph> > new_tmp;
+ for (auto j_iter = tmp.begin(); j_iter != tmp.end(); j_iter++) {
+ PathInfo<Graph> pi = *(j_iter);
+ for (size_t k = 0; k < pi.path.size(); k++)
+ if (old_to_new.find(pi.path[k]) != old_to_new.end()) {
+// INFO(g_.int_id(old_to_new[pi.path[k]]));
+ pi.path[k] = old_to_new[pi.path[k]];
+ }
+ DEBUG(pi.str(g_));
+ new_tmp.insert(pi);
+
+ }
+ if (new_first != iter->first) {
+ TRACE("and mmew_tmp.size: "<< new_tmp.size());
+ }
+ if (new_index.find(new_first) == new_index.end()) {
+ new_index[new_first] = new_tmp;
+ } else {
+ for (auto j_iter = new_tmp.begin(); j_iter != new_tmp.end(); j_iter++) {
+ new_index[new_first].insert(*j_iter);
+ }
+ }
+
+ }
+
+ inner_index_ = new_index;
+ }
+
+ void AddPath(const vector<EdgeId> &p, int w, bool add_rc = false) {
+ HiddenAddPath(p, w);
+ if (add_rc) {
+ vector<EdgeId> rc_p(p.size());
+ for (size_t i = 0; i < p.size(); i++)
+ rc_p[i] = g_.conjugate(p[p.size() - 1 - i]);
+ HiddenAddPath(rc_p, w);
+ }
+ }
+ void DumpToFile(const string filename) const{
+ map<EdgeId, EdgeId> auxiliary;
+ DumpToFile(filename, auxiliary);
+ }
+ void DumpToFile(const string filename, map<EdgeId, EdgeId> &replacement, size_t stats_weight_cutoff = 1, bool need_log = false) const {
+ ofstream filestr(filename);
+ set<EdgeId> continued_edges;
+
+ for(auto iter = inner_index_.begin(); iter != inner_index_.end(); ++iter){
+ filestr<< iter->second.size() << endl;
+ int non1 = 0;
+ for (auto j_iter = iter->second.begin(); j_iter != iter->second.end(); ++j_iter) {
+ filestr << " Weight: " << j_iter->getWeight();
+ if (j_iter->getWeight() > stats_weight_cutoff)
+ non1++;
+
+ filestr << " length: " << j_iter->path.size() << " ";
+ for (auto p_iter = j_iter->path.begin(); p_iter != j_iter->path.end(); ++p_iter) {
+ if (p_iter != j_iter->path.end() - 1 && j_iter->getWeight() > stats_weight_cutoff) {
+ continued_edges.insert(*p_iter);
+ }
+
+ filestr << g_.int_id(*p_iter) << "(" << g_.length(*p_iter) << ") ";
+ }
+ filestr << endl;
+ }
+ filestr << endl;
+ }
+
+ int noncontinued = 0;
+ int long_gapped = 0;
+ int continued = 0;
+ if (need_log) {
+ for (auto iter = g_.ConstEdgeBegin(); !iter.IsEnd(); ++iter) {
+ if (g_.length(*iter) > kLongEdgeForStats) {
+ if (!g_.IsDeadEnd(g_.EdgeEnd(*iter))) {
+ if (continued_edges.find(*iter) == continued_edges.end()) {
+ if ((replacement.find(*iter) != replacement.end() &&
+ continued_edges.find(replacement[*iter]) != continued_edges.end())) {
+ TRACE("found in teplacement, edges " << g_.int_id(*iter) << " " <<
+ g_.int_id(replacement[*iter]) << " skipping ");
+ continue;
+ }
+ TRACE("noncontinued end left " << g_.int_id(*iter));
+ noncontinued++;
+ } else
+ continued++;
+ } else {
+ TRACE("dead end left " << g_.int_id(*iter));
+ long_gapped++;
+ }
+ }
+ }
+ INFO("After PacBio (long reads) aligning, for edges longer than " << kLongEdgeForStats << ":");
+ INFO("No continuation found for " << noncontinued + long_gapped << " edges of " <<
+ noncontinued + continued + long_gapped);
+ }
+ }
+
+ vector<PathInfo<Graph> > GetAllPaths() const {
+ vector<PathInfo<Graph> > res;
+ for (auto iter = inner_index_.begin(); iter != inner_index_.end();
+ ++iter) {
+ for (auto j_iter = iter->second.begin();
+ j_iter != iter->second.end(); ++j_iter) {
+
+ res.push_back(*j_iter);
+ }
+ }
+ return res;
+ }
+
+
+ vector<PathInfo<Graph> > GetAllPathsNoConjugate() {
+ vector<PathInfo<Graph> > res;
+
+ std::set< PathInfo<Graph> > added;
+ for (auto iter = inner_index_.begin(); iter != inner_index_.end(); ++iter) {
+ for (auto j_iter = iter->second.begin(); j_iter != iter->second.end(); ++j_iter) {
+ if (added.count(*j_iter) > 0) {
+ continue;
+ }
+
+ added.insert(*j_iter);
+ vector<EdgeId> rc_p(j_iter->path.size()) ;
+ for (size_t i = 0; i < j_iter->path.size(); i++) {
+ rc_p[i] = g_.conjugate(j_iter->path[j_iter->path.size() - 1 - i]);
+ }
+ added.insert(PathInfo<Graph>(rc_p, j_iter->getWeight()));
+
+ res.push_back(*j_iter);
+ }
+ }
+ return res;
+ }
+
+
+ void LoadFromFile(const string s, bool force_exists = true) {
+ FILE* file = fopen(s.c_str(), "r");
+ if (force_exists) {
+ VERIFY(file != NULL);
+ } else if (file == NULL) {
+ INFO("Long reads not found, skipping");
+ return;
+ }
+ fclose(file);
+
+ INFO("Loading long reads alignment...");
+ ifstream filestr(s);
+ INFO("loading from " << s);
+ map<size_t, EdgeId> tmp_map;
+ for (auto iter = g_.ConstEdgeBegin(); !iter.IsEnd(); ++iter) {
+ tmp_map[g_.int_id(*iter)] = *iter;
+ }
+ int fl;
+
+ file = fopen((s).c_str(), "r");
+ char ss[14];
+ while (!feof(file)) {
+ int n;
+
+ fl = fscanf(file, "%d\n", &n);
+ if (fl != 1)
+ break;
+ TRACE(n);
+ for (int i = 0; i < n; i++) {
+
+ int w = -1, l = -1;
+ fl = fscanf(file, "Weight: %d length: %d", &w, &l);
+ TRACE(w << " " << l);
+ VERIFY(fl == 2);
+ vector<EdgeId> p;
+ for (int j = 0; j < l; j++) {
+ size_t e;
+ int x;
+ fl = fscanf(file, "%zu(%d)", &e, &x);
+ VERIFY(fl == 2);
+ VERIFY(tmp_map.find(e) != tmp_map.end());
+ p.push_back(tmp_map[e]);
+ }
+ fl = fscanf(file, "%[^\n]\n", ss);
+ TRACE(ss[0]);
+ AddPath(p, w);
+ }
+ }
+ fclose(file);
+ INFO("Loading finished.");
+ }
+
+ void AddStorage(PathStorage<Graph> & to_add) {
+
+ for(auto iter = to_add.inner_index_.begin(); iter != to_add.inner_index_.end(); iter++) {
+ for(auto j_iter = iter->second.begin(); j_iter != iter->second.end(); j_iter ++) {
+ this->AddPath(j_iter->path, (int) j_iter->getWeight());
+ }
+ }
+ }
+
+ void Clear() {
+ inner_index_.clear();
+ size_ = 0;
+ }
+
+ size_t size() {
+ return size_;
+ }
+
+// typename InnerIndex::iterator begin() const {
+// return inner_index.begin();
+// }
+//
+// typename InnerIndex::iterator end() const {
+// return inner_index.end();
+// }
+// typename InnerIndex::iterator operator*(){
+// return this->first;
+// }
+private:
+ size_t size_;
+};
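+
+/*
+ * Illustrative usage sketch for PathStorage (assumes `g` is a Graph and `path` a vector of its
+ * EdgeIds; the output file name is an arbitrary example):
+ *
+ * PathStorage<Graph> storage(g);
+ * storage.AddPath(path, 1, true); // weight 1, also store the reverse-complement path
+ * for (const auto &info : storage.GetAllPaths())
+ * std::cout << info.getWeight() << std::endl;
+ * storage.DumpToFile("long_reads.mpr");
+ */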
+
+template<class Graph>
+class LongReadContainer {
+ Graph& g_;
+ vector<PathStorage<Graph>> data_;
+
+public:
+
+ LongReadContainer(Graph& g, size_t count = 0): g_(g) {
+ for (size_t i = 0; i < count; ++i) {
+ data_.emplace_back(g_);
+ }
+ }
+
+ PathStorage<Graph>& operator[](size_t index) {
+ return data_[index];
+ }
+
+ const PathStorage<Graph>& operator[](size_t index) const {
+ return data_[index];
+ }
+
+ size_t size() const {
+ return data_.size();
+ }
+
+ void Clear() {
+ for (auto& storage : data_) {
+ storage.Clear();
+ }
+ }
+
+};
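+
+/*
+ * Sketch: LongReadContainer simply bundles one PathStorage per long-read library
+ * (`lib_count` is assumed to come from the dataset configuration):
+ *
+ * LongReadContainer<Graph> long_reads(g, lib_count);
+ * long_reads[0].AddPath(path, 1, false);
+ */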
+
+
+}
+
+
diff --git a/src/modules/assembly_graph/graph_alignment/pacbio/pac_index.hpp b/src/modules/assembly_graph/graph_alignment/pacbio/pac_index.hpp
new file mode 100644
index 0000000..155c560
--- /dev/null
+++ b/src/modules/assembly_graph/graph_alignment/pacbio/pac_index.hpp
@@ -0,0 +1,834 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+/*
+ * pac_index.hpp
+ *
+ * Created on: Jan 21, 2013
+ * Author: lab42
+ */
+#pragma once
+
+#include "data_structures/indices/edge_multi_index.hpp"
+#include "data_structures/indices/edge_index_builders.hpp"
+#include <algorithm>
+#include "pacbio_read_structures.hpp"
+
+namespace pacbio {
+#define UNDEF_COLOR -1
+#define DELETED_COLOR -2
+
+template<class Graph>
+struct MappingDescription {
+
+};
+
+template<class Graph>
+class PacBioMappingIndex {
+public:
+ typedef map<typename Graph::EdgeId, vector<MappingInstance> > MappingDescription;
+ typedef pair<typename Graph::EdgeId, vector<MappingInstance> > ClusterDescription;
+ typedef set<KmerCluster<Graph> > ClustersSet;
+ typedef typename Graph::VertexId VertexId;
+ typedef typename Graph::EdgeId EdgeId;
+ typedef debruijn_graph::DeBruijnEdgeMultiIndex<typename Graph::EdgeId> Index;
+ typedef typename Index::KeyWithHash KeyWithHash;
+
+private:
+ DECL_LOGGER("PacIndex")
+
+ const Graph &g_;
+ size_t pacbio_k;
+ size_t debruijn_k;
+ const static int short_edge_cutoff = 0;
+ const static size_t min_cluster_size = 8;
+ const static int max_similarity_distance = 500;
+ int good_follow = 0;
+ int half_bad_follow = 0;
+ int bad_follow = 0;
+
+ double compression_cutoff;
+ double domination_cutoff;
+ set<Sequence> banned_kmers;
+ debruijn_graph::DeBruijnEdgeMultiIndex<typename Graph::EdgeId> tmp_index;
+ map<pair<VertexId, VertexId>, vector<size_t> > distance_cashed;
+ size_t read_count;
+ bool ignore_map_to_middle;
+
+public:
+ MappingDescription Locate(const Sequence &s) const;
+
+ PacBioMappingIndex(const Graph &g, size_t k, size_t debruijn_k_, bool ignore_map_to_middle)
+ : g_(g),
+ pacbio_k(k),
+ debruijn_k(debruijn_k_),
+ tmp_index((unsigned) pacbio_k, cfg::get().output_dir), ignore_map_to_middle(ignore_map_to_middle) {
+ DEBUG("PB Mapping Index construction started");
+
+ typedef typename debruijn_graph::EdgeIndexHelper<debruijn_graph::DeBruijnEdgeMultiIndex<typename Graph::EdgeId>>::GraphPositionFillingIndexBuilderT Builder;
+
+ Builder().BuildIndexFromGraph(tmp_index, g_);
+ INFO("Index constructed");
+ FillBannedKmers();
+ compression_cutoff = cfg::get().pb.compression_cutoff; // 0.6
+ domination_cutoff = cfg::get().pb.domination_cutoff; //1.5
+ //INFO(tmp_index.size());
+ read_count = 0;
+ }
+ ~PacBioMappingIndex(){
+ DEBUG("good/ugly/bad counts:" << good_follow << " "<<half_bad_follow << " " << bad_follow);
+
+ }
+ void FillBannedKmers() {
+ for (int i = 0; i < 4; i++) {
+ auto base = nucl((unsigned char) i);
+ for (int j = 0; j < 4; j++) {
+ auto other = nucl((unsigned char) j);
+ for (size_t other_pos = 0; other_pos < pacbio_k; other_pos++) {
+ string s = "";
+ for (size_t k = 0; k < pacbio_k; k++) {
+ if (k != other_pos)
+ s += base;
+ else
+ s += other;
+ }
+ banned_kmers.insert(Sequence(s));
+ }
+ }
+ }
+ }
+
+ bool similar(const MappingInstance &a, const MappingInstance &b,
+ int shift = 0) const {
+ if (b.read_position + shift < a.read_position) {
+ return similar(b, a, -shift);
+ } else if (b.read_position == a.read_position) {
+ return (abs(int(b.edge_position) + shift - int(a.edge_position)) < 2);
+ } else {
+ return ((b.edge_position + shift - a.edge_position >= (b.read_position - a.read_position) * compression_cutoff) &&
+ ((b.edge_position + shift - a.edge_position) * compression_cutoff <= (b.read_position - a.read_position)));
+ }
+ }
+
+ void dfs_cluster(vector<int> &used, vector<MappingInstance> &to_add,
+ const int cur_ind,
+ const typename MappingDescription::iterator iter) const {
+ size_t len = iter->second.size();
+ for (size_t k = 0; k < len; k++) {
+ if (!used[k] && similar(iter->second[cur_ind], iter->second[k])) {
+ to_add.push_back(iter->second[k]);
+ used[k] = 1;
+ dfs_cluster(used, to_add, (int) k, iter);
+ }
+ }
+ }
+
+ void dfs_cluster_norec(vector<int> &used, vector<MappingInstance> &to_add,
+ const size_t cur_ind,
+ const typename MappingDescription::iterator iter, vector<vector<size_t> > &similarity_list) const {
+ std::deque<size_t> stack;
+ stack.push_back(cur_ind);
+ used[cur_ind] = 1;
+ while (stack.size() > 0) {
+ size_t k = stack.back();
+ stack.pop_back();
+ to_add.push_back(iter->second[k]);
+
+ for (size_t i = 0; i < similarity_list[k].size(); i++) {
+ if (!used[similarity_list[k][i]]) {
+ stack.push_back(similarity_list[k][i]);
+ used[similarity_list[k][i]] = 1;
+ }
+ }
+ }
+ }
+
+ ClustersSet GetOrderClusters(const Sequence &s) const {
+ MappingDescription descr = Locate(s);
+ ClustersSet res;
+ TRACE(read_count << " read_count");
+
+ DEBUG(descr.size() <<" clusters");
+ for (auto iter = descr.begin(); iter != descr.end(); ++iter) {
+ size_t edge_id = g_.int_id(iter->first);
+ DEBUG(edge_id);
+ sort(iter->second.begin(), iter->second.end(), ReadPositionComparator());
+ set<vector<MappingInstance> > edge_cluster_set;
+ size_t len = iter->second.size();
+ vector<vector<size_t> > similarity_list(len);
+ int cnt = 0;
+ for (size_t i = 0; i < len; i++){
+ for (size_t j = i + 1; j < len; j++){
+ if (iter->second[i].read_position + max_similarity_distance < iter->second[j].read_position) {
+ break;
+ }
+ if (similar(iter->second[i], iter->second[j])) {
+ similarity_list[i].push_back(j);
+ cnt ++;
+ if (cnt % 10000 == 0) {
+ DEBUG(cnt);
+ }
+ }
+ }
+ }
+
+ DEBUG(len <<" kmers in cluster");
+ vector<int> used(len);
+ for (size_t i = 0; i < len; i++) {
+ if (!used[i]) {
+ vector<size_t> new_cluster(len);
+ vector<size_t> prev(len);
+ for(size_t j = i; j < len; j++) {
+ if (!used[j]) {
+ if (new_cluster[j] == 0) new_cluster[j] = 1, prev[j] = size_t(-1);
+ for(size_t k = 0; k < similarity_list[j].size(); k++) {
+ size_t next_ind = similarity_list[j][k];
+ if (!used[next_ind]) {
+ if (new_cluster[next_ind] < new_cluster[j] + 1){
+ new_cluster[next_ind] = new_cluster[j] + 1;
+ prev[next_ind] = j;
+ }
+ }
+ }
+ }
+ }
+ size_t maxx = 0;
+ size_t maxj = i;
+ for(size_t j = i; j < len; j++) {
+ if (new_cluster[j] > maxx) maxj = j, maxx = new_cluster[j];
+ }
+ vector<MappingInstance> to_add;
+ size_t real_maxj = maxj, first_j = maxj;
+ while (maxj != size_t(-1)) {
+ to_add.push_back(iter->second[maxj]);
+ first_j = maxj;
+ maxj = prev[maxj];
+ }
+ for (auto j = first_j; j < real_maxj; j++)
+ used[j] = 1;
+ reverse(to_add.begin(), to_add.end());
+ TRACE("adding cluster "" edge "<< edge_id << " len " <<to_add.size() )
+ res.insert(KmerCluster<Graph>(iter->first, to_add));
+ }
+ }
+ }
+ FilterClusters(res);
+ return res;
+ }
+ //filter clusters that are too small or fully located on a vertex or dominated by some other cluster.
+ void FilterClusters(ClustersSet &clusters) const {
+ for (auto i_iter = clusters.begin(); i_iter != clusters.end();) {
+ size_t edge_id = g_.int_id(i_iter->edgeId);
+
+ int len = (int) g_.length(i_iter->edgeId);
+ auto sorted_by_edge = i_iter->sorted_positions;
+ sort(sorted_by_edge.begin(), sorted_by_edge.end());
+ double good = 0;
+ DEBUG("filtering cluster of size " << sorted_by_edge.size());
+ DEBUG(edge_id <<" : edgeId");
+ for (auto iter = sorted_by_edge.begin();
+ iter < sorted_by_edge.end(); iter++) {
+ if (iter->IsUnique())
+ good++;
+ //good += 1.0 / (iter->quality * iter->quality);
+ }
+ DEBUG("good " << good);
+
+ if (good < min_cluster_size || (len < short_edge_cutoff)) {
+ if (len < short_edge_cutoff) {
+ DEBUG("Life is too long, and edge is too short!");
+ }
+ auto tmp_iter = i_iter;
+ tmp_iter++;
+ clusters.erase(i_iter);
+ i_iter = tmp_iter;
+ } else {
+ if (sorted_by_edge[0].edge_position >= len
+ || sorted_by_edge[i_iter->size - 1].edge_position
+ <= int(debruijn_k) - int(pacbio_k)) {
+ DEBUG("All anchors in vertex");
+ auto tmp_iter = i_iter;
+ tmp_iter++;
+ clusters.erase(i_iter);
+ i_iter = tmp_iter;
+ } else {
+ i_iter++;
+ }
+ }
+ }
+ for (auto i_iter = clusters.begin(); i_iter != clusters.end();) {
+ size_t edge_id = g_.int_id(i_iter->edgeId);
+ auto sorted_by_edge = i_iter->sorted_positions;
+
+ DEBUG("filtering with cluster edge, stage 2 "<< edge_id << " len " << sorted_by_edge.size() << " clusters still alive: "<< clusters.size());
+ for (auto j_iter = clusters.begin(); j_iter != clusters.end();) {
+ if (i_iter != j_iter) {
+ if (dominates(*i_iter, *j_iter)) {
+ TRACE("cluster is dominated");
+ auto tmp_iter = j_iter;
+ tmp_iter++;
+ TRACE("cluster on edge " << g_.int_id(j_iter->edgeId));
+ TRACE("erased - dominated");
+ clusters.erase(j_iter);
+ j_iter = tmp_iter;
+ } else {
+ j_iter++;
+ }
+ } else {
+ j_iter++;
+ }
+ }
+ DEBUG("cluster size "<< i_iter->sorted_positions.size() << "survived filtering");
+ i_iter++;
+ }
+ }
+
+ // is "non strictly dominates" required?
+ inline bool dominates(const KmerCluster<Graph> &a,
+ const KmerCluster<Graph> &b) const {
+ size_t a_size = a.size;
+ size_t b_size = b.size;
+ if ((double) a_size < (double) b_size * domination_cutoff
+ || a.sorted_positions[a.first_trustable_index].read_position
+ > b.sorted_positions[b.first_trustable_index].read_position
+ || a.sorted_positions[a.last_trustable_index].read_position
+ < b.sorted_positions[b.last_trustable_index].read_position) {
+ return false;
+ } else {
+ return true;
+ }
+ }
+
+ vector<EdgeId> FillGapsInCluster(vector<pair<size_t, typename ClustersSet::iterator> > &cur_cluster,
+ const Sequence &s) {
+ vector<EdgeId> cur_sorted;
+ EdgeId prev_edge = EdgeId(0);
+
+ for (auto iter = cur_cluster.begin(); iter != cur_cluster.end();
+ ++iter) {
+ EdgeId cur_edge = iter->second->edgeId;
+ if (prev_edge != EdgeId(0)) {
+//Need to find sequence of edges between clusters
+ VertexId start_v = g_.EdgeEnd(prev_edge);
+ VertexId end_v = g_.EdgeStart(cur_edge);
+ auto prev_iter = iter - 1;
+ MappingInstance cur_first_index =
+ iter->second->sorted_positions[iter->second
+ ->first_trustable_index];
+ MappingInstance prev_last_index = prev_iter->second
+ ->sorted_positions[prev_iter->second
+ ->last_trustable_index];
+
+ if (start_v != end_v ||
+ (start_v == end_v &&
+ (double) (cur_first_index.read_position - prev_last_index.read_position) >
+ (double) (cur_first_index.edge_position + (int) g_.length(prev_edge) - prev_last_index.edge_position) * 1.3)) {
+ DEBUG(" traversing tangled hregion between "<< g_.int_id(prev_edge)<< " " << g_.int_id(cur_edge));
+ DEBUG(" first pair" << cur_first_index.str() << " edge_len" << g_.length(cur_edge));
+ DEBUG(" last pair" << prev_last_index.str() << " edge_len" << g_.length(prev_edge));
+ string s_add = "";
+ string e_add = "";
+ int seq_end = cur_first_index.read_position;
+ int seq_start = prev_last_index.read_position;
+ string tmp = g_.EdgeNucls(prev_edge).str();
+ s_add = tmp.substr(prev_last_index.edge_position,
+ g_.length(prev_edge) - prev_last_index.edge_position);
+ tmp = g_.EdgeNucls(cur_edge).str();
+ e_add = tmp.substr(0, cur_first_index.edge_position);
+ pair<int, int> limits = GetPathLimits(*(prev_iter->second),
+ *(iter->second),
+ (int) s_add.length(),
+ (int) e_add.length());
+ if (limits.first == -1)
+ return vector<EdgeId>(0);
+
+ vector<EdgeId> intermediate_path = BestScoredPath(s, start_v, end_v, limits.first, limits.second, seq_start, seq_end, s_add, e_add);
+ if (intermediate_path.size() == 0) {
+ DEBUG("Tangled region between edgees "<< g_.int_id(prev_edge) << " " << g_.int_id(cur_edge) << " is not closed, additions from edges: " << int(g_.length(prev_edge)) - int(prev_last_index.edge_position) <<" " << int(cur_first_index.edge_position) - int(debruijn_k - pacbio_k ) << " and seq "<< - seq_start + seq_end);
+ if (cfg::get().pb.additional_debug_info) {
+ DEBUG(" escpected gap length: " << -int(g_.length(prev_edge)) + int(prev_last_index.edge_position) - int(cur_first_index.edge_position) + int(debruijn_k - pacbio_k ) - seq_start + seq_end);
+ PathStorageCallback<Graph> callback(g_);
+ ProcessPaths(g_, 0, 4000,
+ start_v, end_v,
+ callback);
+ vector<vector<EdgeId> > paths = callback.paths();
+ stringstream s_buf;
+ for (auto p_iter = paths.begin();
+ p_iter != paths.end(); p_iter++) {
+ size_t tlen = 0;
+ for (auto path_iter = p_iter->begin();
+ path_iter != p_iter->end();
+ path_iter++) {
+ tlen += g_.length(*path_iter);
+ }
+ s_buf << tlen << " ";
+ }
+ DEBUG(s_buf.str());
+ }
+ return intermediate_path;
+ }
+ for (auto j_iter = intermediate_path.begin(); j_iter != intermediate_path.end(); j_iter++) {
+ cur_sorted.push_back(*j_iter);
+ }
+ }
+ }
+ cur_sorted.push_back(cur_edge);
+ prev_edge = cur_edge;
+ }
+ return cur_sorted;
+ }
+
+ bool TopologyGap(EdgeId first, EdgeId second, bool oriented) const {
+ bool res = (g_.IsDeadStart(g_.EdgeStart(first)) && g_.IsDeadEnd(g_.EdgeEnd(second)));
+ if (!oriented)
+ res |= g_.IsDeadEnd(g_.EdgeEnd(first)) && g_.IsDeadStart(g_.EdgeStart(second));
+ return res;
+ }
+
+ vector<int> GetWeightedColors(ClustersSet &mapping_descr, Sequence &s) {
+ int len = (int) mapping_descr.size();
+ DEBUG("getting colors, table size "<< len);
+ vector<vector<int> > cons_table(len);
+
+ vector<int> colors(len);
+ vector<int> cluster_size(len);
+ vector<int> max_size(len);
+ vector<int> prev(len);
+
+ for (int i = 0; i < len; i++) {
+ cons_table[i].resize(len);
+ cons_table[i][i] = 0;
+ prev[i] = -1;
+ }
+ int i = 0;
+
+ for (int i = 0; i < len; i++) {
+//-1 not initialized, -2 - removed as trash
+ colors[i] = UNDEF_COLOR;
+ }
+ for (auto i_iter = mapping_descr.begin(); i_iter != mapping_descr.end();
+ ++i_iter, ++i) {
+ cluster_size[i] = i_iter->size;
+ }
+ i = 0;
+ if (len > 1) {
+ TRACE(len << "clusters");
+ }
+
+ for (auto i_iter = mapping_descr.begin(); i_iter != mapping_descr.end();
+ ++i_iter, ++i) {
+ int j = i;
+ for (auto j_iter = i_iter;
+ j_iter != mapping_descr.end(); ++j_iter, ++j) {
+ if (i_iter == j_iter)
+ continue;
+ cons_table[i][j] = IsConsistent(s, *i_iter, *j_iter);
+ }
+ }
+ i = 0;
+ int cur_color = 0;
+
+ while (true) {
+ for (i = 0; i < len; i++) {
+ max_size[i] = 0;
+ prev[i] = -1;
+ }
+ i = 0;
+ for (auto i_iter = mapping_descr.begin(); i_iter != mapping_descr.end();
+ ++i_iter, ++i) {
+ if (colors[i] != UNDEF_COLOR) continue;
+ max_size[i] = cluster_size[i];
+ for (int j = 0; j < i; j ++) {
+ if (colors[j] != -1) continue;
+ if (cons_table[j][i] && max_size[i] < cluster_size[i] + max_size[j]) {
+ max_size[i] = max_size[j] + cluster_size[i];
+ prev[i] = j;
+ }
+ }
+ }
+ int maxx = 0;
+ int maxi = -1;
+ for (int j = 0; j < len; j++) {
+ if (max_size[j] > maxx) {
+ maxx = max_size[j];
+ maxi = j;
+ }
+ }
+ if (maxi == -1) {
+ break;
+ }
+ colors[maxi] = cur_color;
+ int real_maxi = maxi, min_i = maxi;
+
+ while (prev[maxi] != -1) {
+ min_i = maxi;
+ maxi = prev[maxi];
+ colors[maxi] = cur_color;
+ }
+ while (real_maxi >= min_i) {
+ if (colors[real_maxi] == UNDEF_COLOR) {
+ colors[real_maxi] = DELETED_COLOR;
+ }
+ real_maxi --;
+ }
+ cur_color ++;
+
+ }
+ return colors;
+ }
+
+
+
+
+ OneReadMapping<Graph> GetReadAlignment(Sequence &s) {
+ ClustersSet mapping_descr = GetOrderClusters(s);
+ DEBUG("clusters got");
+ int len = (int) mapping_descr.size();
+ vector<size_t> real_length;
+
+ vector<int> colors = GetWeightedColors(mapping_descr, s);
+ vector<vector<EdgeId> > sortedEdges;
+ vector<typename ClustersSet::iterator> start_clusters, end_clusters;
+ vector<GapDescription<Graph> > illumina_gaps;
+ vector<int> used(len);
+ size_t used_seed_count = 0;
+ auto iter = mapping_descr.begin();
+ for (int i = 0; i < len; i++, iter ++) {
+ used[i] = 0;
+ DEBUG(colors[i] <<" " << iter->str(g_));
+ }
+ for (int i = 0; i < len; i++) {
+ if (!used[i]) {
+ DEBUG("starting new subread");
+ size_t cur_seed_count = 0;
+ vector<pair<size_t, typename ClustersSet::iterator> > cur_cluster;
+ used[i] = 1;
+ int j = 0;
+ int cur_color = colors[i];
+ if (cur_color == DELETED_COLOR)
+ continue;
+ for (auto i_iter = mapping_descr.begin();
+ i_iter != mapping_descr.end(); ++i_iter, ++j) {
+ if (colors[j] == cur_color) {
+ cur_cluster.push_back(
+ make_pair(
+ i_iter->average_read_position,
+ i_iter));
+ used[j] = 1;
+ cur_seed_count += i_iter->sorted_positions.size();
+ }
+ }
+ sort(cur_cluster.begin(), cur_cluster.end(),
+ pair_iterator_less<typename ClustersSet::iterator>());
+ VERIFY(cur_cluster.size() > 0);
+ //if (cur_seed_count > used_seed_count)
+ used_seed_count += cur_seed_count;
+ auto cur_cluster_start = cur_cluster.begin();
+ for (auto iter = cur_cluster.begin(); iter != cur_cluster.end();
+ ++iter) {
+ auto next_iter = iter + 1;
+ if (next_iter == cur_cluster.end()
+ || !IsConsistent(s, *(iter->second),
+ *(next_iter->second))) {
+ if (next_iter != cur_cluster.end()) {
+ DEBUG("clusters splitted:");
+ DEBUG("on "<< iter->second->str(g_));
+ DEBUG("and " << next_iter->second->str(g_));
+ }
+ vector<pair<size_t, typename ClustersSet::iterator> > splitted_cluster(
+ cur_cluster_start, next_iter);
+ vector<EdgeId> cur_sorted = FillGapsInCluster(
+ splitted_cluster, s);
+ if (cur_sorted.size() > 0) {
+ start_clusters.push_back(cur_cluster_start->second);
+ end_clusters.push_back(iter->second);
+ sortedEdges.push_back(cur_sorted);
+ }
+ cur_cluster_start = next_iter;
+ } else {
+ DEBUG("connected consequtive clusters:");
+ DEBUG("on "<< iter->second->str(g_));
+ DEBUG("and " << next_iter->second->str(g_));
+
+ }
+
+ }
+ }
+ }
+ DEBUG("adding gaps between subreads");
+ int alignments = int(sortedEdges.size());
+ for (int i = 0; i < alignments; i++) {
+ for (int j = 0; j < alignments; j++) {
+ EdgeId before_gap = sortedEdges[j][sortedEdges[j].size() - 1];
+ EdgeId after_gap = sortedEdges[i][0];
+//do not add "gap" for rc-jumping
+ if (before_gap != after_gap
+ && before_gap != g_.conjugate(after_gap)) {
+ if (i != j && TopologyGap(before_gap, after_gap, true)) {
+ if (start_clusters[j]->CanFollow(*end_clusters[i])) {
+ illumina_gaps.push_back(
+ GapDescription<Graph>(*end_clusters[i],
+ *start_clusters[j], s,
+ (int) pacbio_k));
+ }
+
+ }
+ }
+ }
+ }
+ return OneReadMapping<Graph>(sortedEdges, illumina_gaps, real_length, used_seed_count);
+ }
+
+ std::pair<int, int> GetPathLimits(const KmerCluster<Graph> &a,
+ const KmerCluster<Graph> &b,
+ int s_add_len, int e_add_len) {
+ int start_pos = a.sorted_positions[a.last_trustable_index].read_position;
+ int end_pos = b.sorted_positions[b.first_trustable_index].read_position;
+ int seq_len = -start_pos + end_pos;
+ //int new_seq_len =
+//TODO::something more reasonable
+ int path_min_len = max(int(floor((seq_len - int(debruijn_k)) * cfg::get().pb.path_limit_pressing)), 0);
+ int path_max_len = (int) ((double) (seq_len + (int) debruijn_k) * cfg::get().pb.path_limit_stretching);
+ if (seq_len < 0) {
+ DEBUG("suspicious negative seq_len " << start_pos << " " << end_pos << " " << path_min_len << " " << path_max_len);
+ return std::make_pair(-1, -1);
+ }
+ path_min_len = max(path_min_len - int(s_add_len + e_add_len), 0);
+ path_max_len = max(path_max_len - int(s_add_len + e_add_len), 0);
+ return std::make_pair(path_min_len, path_max_len);
+ }
+
+//0 - No, 1 - Yes
+ int IsConsistent(Sequence &s, const KmerCluster<Graph> &a,
+ const KmerCluster<Graph> &b) {
+ EdgeId a_edge = a.edgeId;
+ EdgeId b_edge = b.edgeId;
+ size_t a_id = g_.int_id(a_edge);
+ size_t b_id = g_.int_id(b_edge);
+ DEBUG("clusters on " << a_id << " and " << b_id );
+ if (abs(a.sorted_positions[a.last_trustable_index].read_position - b.sorted_positions[b.first_trustable_index].read_position) > 5000) {
+ DEBUG("...to far5000");
+ return 0;
+ }
+ VertexId start_v = g_.EdgeEnd(a_edge);
+ size_t addition = g_.length(a_edge);
+ VertexId end_v = g_.EdgeStart(b_edge);
+ pair<VertexId, VertexId> vertex_pair = make_pair(start_v, end_v);
+ vector<size_t> result;
+ DEBUG("seq dist:" << s.size()/3);
+ if (distance_cashed.find(vertex_pair) == distance_cashed.end()) {
+ DistancesLengthsCallback<Graph> callback(g_);
+ ProcessPaths(g_, 0, s.size() / 3, start_v,
+ end_v, callback);
+ result = callback.distances();
+ distance_cashed[vertex_pair] = result;
+ } else {
+ DEBUG("taking from cashed");
+ }
+ DEBUG("addition: " << addition << " found " << result.size() << " lengths:" );
+ for (size_t i = 0; i < result.size(); i++) {
+ DEBUG(result[i]);
+ }
+ result = distance_cashed[vertex_pair];
+ //TODO: Serious optimization possible
+ for (size_t i = 0; i < result.size(); i++) {
+ for (auto a_iter = a.sorted_positions.begin();
+ a_iter != a.sorted_positions.end(); ++a_iter) {
+ if (a_iter - a.sorted_positions.begin() > 500 && a.sorted_positions.end() - a_iter >500) continue;
+ int cnt = 0;
+ for (auto b_iter = b.sorted_positions.begin();
+ b_iter != b.sorted_positions.end() && cnt <500; ++b_iter, cnt ++) {
+ if (similar(*a_iter, *b_iter,
+ (int) (result[i] + addition))) {
+ return 1;
+ }
+ }
+ cnt = 0;
+ if (b.sorted_positions.size() > 500) {
+ for (auto b_iter = b.sorted_positions.end() - 1;
+ b_iter != b.sorted_positions.begin() && cnt < 500; --b_iter, cnt ++) {
+ if (similar(*a_iter, *b_iter,
+ (int) (result[i] + addition))) {
+ return 1;
+ }
+ }
+ }
+ }
+ }
+ return 0;
+
+ }
+
+ string PathToString(const vector<EdgeId>& path) const {
+ string res = "";
+ for (auto iter = path.begin(); iter != path.end(); iter++) {
+ size_t len = g_.length(*iter);
+ string tmp = g_.EdgeNucls(*iter).First(len).str();
+ res = res + tmp;
+ }
+ return res;
+ }
+
+ vector<EdgeId> BestScoredPath(const Sequence &s, VertexId start_v, VertexId end_v,
+ int path_min_length, int path_max_length,
+ int start_pos, int end_pos, string &s_add,
+ string &e_add) {
+ DEBUG(" Traversing tangled region. Start and end vertices resp: " << g_.int_id(start_v) <<" " << g_.int_id(end_v));
+ PathStorageCallback<Graph> callback(g_);
+ ProcessPaths(g_,
+ path_min_length, path_max_length,
+ start_v, end_v,
+ callback);
+ vector<vector<EdgeId> > paths = callback.paths();
+ DEBUG("taking subseq" << start_pos <<" "<< end_pos <<" " << s.size());
+ int s_len = int(s.size());
+ string seq_string = s.Subseq(start_pos, min(end_pos + 1, s_len)).str();
+ size_t best_path_ind = paths.size();
+ size_t best_score = 1000000000;
+ DEBUG("need to find best scored path between "<<paths.size()<<" , seq_len " << seq_string.length());
+ if (paths.size() == 0)
+ return vector<EdgeId>(0);
+ for (size_t i = 0; i < paths.size(); i++) {
+ string cur_string = s_add + PathToString(paths[i]) + e_add;
+ if (paths.size() > 1 && paths.size() < 10) {
+ TRACE("candidate path number "<< i << " , len " << cur_string.length());
+ TRACE("graph candidate: " << cur_string);
+ TRACE("in pacbio read: " << seq_string);
+ for (auto j_iter = paths[i].begin(); j_iter != paths[i].end();
+ ++j_iter) {
+ DEBUG(g_.int_id(*j_iter));
+ }
+ }
+ size_t cur_score = StringDistance(cur_string, seq_string);
+ if (paths.size() > 1 && paths.size() < 10) {
+ DEBUG("score: "<< cur_score);
+ }
+ if (cur_score < best_score) {
+ best_score = cur_score;
+ best_path_ind = i;
+ }
+ }
+ if (best_score == 1000000000)
+ return vector<EdgeId>(0);
+ if (paths.size() > 1 && paths.size() < 10) {
+ DEBUG("best score found! Path " <<best_path_ind <<" score "<< best_score);
+ }
+ return paths[best_path_ind];
+ }
+
+ // Short read alignment
+ MappingPath<EdgeId> GetShortReadAlignment(const Sequence &s) const {
+ ClustersSet mapping_descr = GetOrderClusters(s);
+ map<EdgeId, KmerCluster<Graph> > largest_clusters;
+
+ //Selecting the biggest cluster for each edge
+ for (auto iter = mapping_descr.begin(); iter != mapping_descr.end(); ++iter) {
+
+ auto first_cluster = iter->sorted_positions[iter->first_trustable_index];
+ auto last_cluster = iter->sorted_positions[iter->last_trustable_index];
+ int read_range = last_cluster.read_position - first_cluster.read_position;
+ int edge_range = last_cluster.edge_position - first_cluster.edge_position;
+ int cluster_size = iter->last_trustable_index - iter->first_trustable_index;
+ if (cluster_size > 2 * read_range || edge_range < 0 || 2 * edge_range < read_range || edge_range > 2 * read_range) {
+ //skipping cluster
+ continue;
+ }
+
+ auto edge_cluster = largest_clusters.find(iter->edgeId);
+ if (edge_cluster != largest_clusters.end()) {
+ if (edge_cluster->second.last_trustable_index - edge_cluster->second.first_trustable_index
+ < iter->last_trustable_index - iter->first_trustable_index) {
+
+ edge_cluster->second = *iter;
+ }
+ }
+ else {
+ largest_clusters.insert(make_pair(iter->edgeId, *iter));
+ }
+ }
+
+ MappingPath<EdgeId> result;
+ for (auto iter = largest_clusters.begin(); iter != largest_clusters.end(); ++iter) {
+ auto first_cluster = iter->second.sorted_positions[iter->second.first_trustable_index];
+ auto last_cluster = iter->second.sorted_positions[iter->second.last_trustable_index];
+ MappingRange range(Range(first_cluster.read_position, last_cluster.read_position),
+ Range(first_cluster.edge_position, last_cluster.edge_position));
+ result.join(MappingPath<EdgeId>(vector<EdgeId>(1, iter->second.edgeId), vector<MappingRange>(1, range)));
+ }
+
+ return result;
+ }
+
+ pair<EdgeId, size_t> GetUniqueKmerPos(const runtime_k::RtSeq& kmer) const {
+ KeyWithHash kwh = tmp_index.ConstructKWH(kmer);
+
+ if (tmp_index.valid(kwh.key())) {
+ auto keys = tmp_index.get(kwh);
+ if (keys.size() == 1) {
+ return make_pair(keys[0].edge_id, keys[0].offset);
+ }
+ }
+ return make_pair(EdgeId(0), -1u);
+ }
+
+
+};
+
+template<class Graph>
+typename PacBioMappingIndex<Graph>::MappingDescription PacBioMappingIndex<Graph>::Locate(const Sequence &s) const {
+ MappingDescription res;
+ //WARNING: removed read_count from here to make const methods
+ int local_read_count = 0;
+ ++local_read_count;
+ if (s.size() < pacbio_k)
+ return res;
+
+ //runtime_k::RtSeq kmer = s.start<runtime_k::RtSeq>(pacbio_k);
+ KeyWithHash kwh = tmp_index.ConstructKWH(s.start<runtime_k::RtSeq>(pacbio_k));
+
+ for (size_t j = pacbio_k; j < s.size(); ++j) {
+ kwh = kwh << s[j];
+ if (!tmp_index.valid(kwh.key())) {
+// INFO("not valid kmer");
+ continue;
+ }
+ auto keys = tmp_index.get(kwh);
+ TRACE("Valid key, size: "<< keys.size());
+
+ for (auto iter = keys.begin(); iter != keys.end(); ++iter) {
+
+ int quality = (int) keys.size();
+ TRACE("and quality:" << quality);
+ if (banned_kmers.find(Sequence(kwh.key())) != banned_kmers.end())
+ continue;
+ int offset = (int)iter->offset;
+ int s_stretched = int ((double)s.size() * 1.2 + 50);
+ int edge_len = int(g_.length(iter->edge_id));
+ //No alignment within the vertex overlap; with ignore_map_to_middle, keep only positions within s+eps bp of the edge ends
+ bool correct_alignment = offset > int(debruijn_k - pacbio_k) && offset < edge_len;
+ if (ignore_map_to_middle) {
+ correct_alignment &= (offset < int(debruijn_k - pacbio_k) + s_stretched || offset > edge_len - s_stretched);
+ }
+ if (correct_alignment) {
+ res[iter->edge_id].push_back(MappingInstance((int) iter->offset, (int) (j - pacbio_k + 1), quality));
+ }
+ }
+ }
+
+ for (auto iter = res.begin(); iter != res.end(); ++iter) {
+ sort(iter->second.begin(), iter->second.end());
+ DEBUG("read count "<< local_read_count);
+ DEBUG("edge: " << g_.int_id(iter->first) << "size: " << iter->second.size());
+ for (auto j_iter = iter->second.begin(); j_iter != iter->second.end(); j_iter++) {
+ DEBUG(j_iter->str());
+ }
+ }
+
+ return res;
+}
+
+}
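
PacBioMappingIndex::similar() above accepts two k-mer anchors when the distance they span on the edge and the distance they span on the read agree up to a stretch factor (pb.compression_cutoff, 0.6 according to the comment in the constructor). A standalone restatement of that ratio test with made-up anchor values, ignoring the shift parameter, just to make the accepted window explicit:

#include <cstdio>

struct Anchor { int edge_position; int read_position; };

// Same inequality as the "b after a" case of similar(): the edge span must be
// at least cutoff * read_span and at most read_span / cutoff.
bool Similar(const Anchor& a, const Anchor& b, double cutoff) {
    if (b.read_position < a.read_position) return Similar(b, a, cutoff);
    if (b.read_position == a.read_position)
        return b.edge_position - a.edge_position < 2 &&
               a.edge_position - b.edge_position < 2;
    double read_span = b.read_position - a.read_position;
    double edge_span = b.edge_position - a.edge_position;
    return edge_span >= read_span * cutoff && edge_span * cutoff <= read_span;
}

int main() {
    Anchor a{100, 10}, ok{160, 60}, stretched{400, 60};
    std::printf("%d %d\n", Similar(a, ok, 0.6), Similar(a, stretched, 0.6));
    // 1 0: a 50 bp step on the read matching a 60 bp step on the edge is accepted,
    // while a 300 bp step on the edge is outside the allowed stretch.
}
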
diff --git a/src/modules/assembly_graph/graph_alignment/pacbio/pacbio_gap_closer.hpp b/src/modules/assembly_graph/graph_alignment/pacbio/pacbio_gap_closer.hpp
new file mode 100644
index 0000000..b742c3d
--- /dev/null
+++ b/src/modules/assembly_graph/graph_alignment/pacbio/pacbio_gap_closer.hpp
@@ -0,0 +1,394 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "pacbio_read_structures.hpp"
+
+#include "ConsensusCore/Poa/PoaConfig.hpp"
+#include "ConsensusCore/Poa/PoaConsensus.hpp"
+
+#include <algorithm>
+
+namespace pacbio {
+template<class Graph>
+class PacbioGapCloser;
+
+template<class Graph>
+class GapStorage {
+ friend class PacbioGapCloser<Graph> ;
+ typedef typename Graph::EdgeId EdgeId;
+private:
+ DECL_LOGGER("PacbioGaps")
+ ;
+ Graph &g_;
+ map<EdgeId, vector<GapDescription<Graph> > > inner_index;
+ void HiddenAddGap(const GapDescription<Graph> &p) {
+ inner_index[p.start].push_back(p);
+ }
+ vector<EdgeId> index;
+ set<pair<EdgeId, EdgeId> > nonempty_pairs;
+ set<pair<EdgeId, EdgeId> > transitively_ignored_pairs;
+ set<pair<EdgeId, EdgeId> > symmetrically_ignored_pairs;
+
+public:
+ size_t min_gap_quantity;
+ GapStorage(Graph &g, size_t min_gap_quantity)
+ : g_(g),
+ inner_index(), min_gap_quantity(min_gap_quantity){
+ }
+
+ size_t FillIndex() {
+ index.resize(0);
+ set<EdgeId> tmp;
+ for (auto iter = inner_index.begin(); iter != inner_index.end(); iter++) {
+ index.push_back(iter->first);
+ }
+ return index.size();
+ }
+
+ EdgeId operator[](size_t i) {
+ return index.at(i);
+ }
+
+ size_t size() const {
+ return index.size();
+ }
+
+ bool IsTransitivelyIgnored(pair<EdgeId, EdgeId> p) {
+ return (transitively_ignored_pairs.find(p) != transitively_ignored_pairs.end());
+ }
+ bool IsSymmetricallyIgnored(pair<EdgeId, EdgeId> p) {
+ return (symmetrically_ignored_pairs.find(p) != symmetrically_ignored_pairs.end());
+ }
+
+ bool IsIgnored(pair<EdgeId, EdgeId> p) {
+ return (IsTransitivelyIgnored(p) || IsSymmetricallyIgnored(p));
+ }
+ void AddGap(const GapDescription<Graph> &p, bool add_rc = false) {
+ HiddenAddGap(p);
+ if (add_rc) {
+ TRACE("Adding conjugate");
+ HiddenAddGap(p.conjugate(g_, (int) cfg::get().K));
+ }
+ }
+
+ void AddStorage(const GapStorage<Graph> & to_add) {
+ const auto& idx = to_add.inner_index;
+ for (auto iter = idx.begin(); iter != idx.end(); ++iter)
+ inner_index[iter->first].insert(inner_index[iter->first].end(), iter->second.begin(), iter->second.end());
+ }
+
+ void PostProcess() {
+ FillIndex();
+
+ for (auto j_iter = index.begin(); j_iter != index.end(); j_iter++) {
+ EdgeId e = *j_iter;
+ auto cl_start = inner_index[e].begin();
+ auto iter = inner_index[e].begin();
+ vector<GapDescription<Graph> > padded_gaps;
+ while (iter != inner_index[e].end()) {
+ auto next_iter = ++iter;
+ if (next_iter == inner_index[e].end() || next_iter->end != cl_start->end) {
+ size_t len = next_iter - cl_start;
+ if (len >= min_gap_quantity) {
+ nonempty_pairs.insert(make_pair(cl_start->start, cl_start->end));
+ }
+ cl_start = next_iter;
+ }
+ }
+ }
+
+ set<pair<EdgeId, EdgeId> > used_rc_pairs;
+ for (auto iter = nonempty_pairs.begin(); iter != nonempty_pairs.end(); ++iter) {
+ if (used_rc_pairs.find(*iter) != used_rc_pairs.end()) {
+ DEBUG("skipping pair " << g_.int_id(iter->first) << "," << g_.int_id(iter->second));
+ symmetrically_ignored_pairs.insert(make_pair(iter->first, iter->second));
+ } else {
+ DEBUG("Using pair" << g_.int_id(iter->first) << "," << g_.int_id(iter->second));
+ }
+
+ for (size_t i = 0; i < index.size(); i++) {
+ if (nonempty_pairs.find(make_pair(iter->first, index[i])) != nonempty_pairs.end()
+ && nonempty_pairs.find(make_pair(index[i], iter->second)) != nonempty_pairs.end()) {
+ DEBUG("pair " << g_.int_id(iter->first) << "," << g_.int_id(iter->second) << " is ignored because of edge between " << g_.int_id(index[i]));
+ transitively_ignored_pairs.insert(make_pair(iter->first, iter->second));
+ }
+ }
+ used_rc_pairs.insert(make_pair(g_.conjugate(iter->second), g_.conjugate(iter->first)));
+ }
+ }
+
+ void DumpToFile(const string filename) {
+ ofstream filestr(filename);
+ for (auto iter = inner_index.begin(); iter != inner_index.end(); ++iter) {
+ DEBUG( g_.int_id(iter->first)<< " " <<iter->second.size());
+ filestr << g_.int_id(iter->first) << " " << iter->second.size() << endl;
+ sort(iter->second.begin(), iter->second.end());
+ for (auto j_iter = iter->second.begin(); j_iter != iter->second.end(); ++j_iter) {
+ filestr << j_iter->str(g_);
+ }
+ filestr << endl;
+ }
+ }
+
+ void LoadFromFile(const string s) {
+ FILE* file = fopen((s).c_str(), "r");
+ int res;
+ char ss[5000];
+ map<int, EdgeId> tmp_map;
+ for (auto iter = g_.ConstEdgeBegin(); !iter.IsEnd(); ++iter) {
+ tmp_map[g_.int_id(*iter)] = *iter;
+ }
+ while (!feof(file)) {
+ int first_id, second_id, first_ind, second_ind;
+ int size;
+ res = fscanf(file, "%d %d\n", &first_id, &size);
+ VERIFY(res == 2);
+ for (int i = 0; i < size; i++) {
+ res = fscanf(file, "%d %d\n", &first_id, &first_ind);
+ VERIFY(res == 2);
+ res = fscanf(file, "%d %d\n", &second_id, &second_ind);
+ VERIFY(res == 2);
+ res = fscanf(file, "%s\n", ss);
+ VERIFY(res == 1);
+ GapDescription<Graph> gap(tmp_map[first_id], tmp_map[second_id], Sequence(ss), first_ind, second_ind);
+ this->AddGap(gap);
+ }
+ }
+ }
+
+ void PadGapStrings(EdgeId e) {
+ sort(inner_index[e].begin(), inner_index[e].end());
+ auto cl_start = inner_index[e].begin();
+ auto iter = inner_index[e].begin();
+ vector<GapDescription<Graph> > padded_gaps;
+ while (iter != inner_index[e].end()) {
+ auto next_iter = ++iter;
+ if (next_iter == inner_index[e].end() || next_iter->end != cl_start->end) {
+ int start_min = 1000000000;
+ int end_max = 0;
+ size_t long_seqs = 0;
+ size_t short_seqs = 0;
+ size_t long_seq_limit = cfg::get().pb.long_seq_limit; //400
+ bool exclude_long_seqs = false;
+ for (auto j_iter = cl_start; j_iter != next_iter; j_iter++) {
+ if (g_.length(j_iter->start) - j_iter->edge_gap_start_position > 500 || j_iter->edge_gap_end_position > 500) {
+ DEBUG("ignoring alingment to the middle of edge");
+ continue;
+ }
+ if (j_iter->gap_seq.size() > long_seq_limit)
+ long_seqs++;
+ else
+ short_seqs++;
+
+ if (j_iter->edge_gap_start_position < start_min)
+ start_min = j_iter->edge_gap_start_position;
+ if (j_iter->edge_gap_end_position > end_max)
+ end_max = j_iter->edge_gap_end_position;
+ }
+
+ if (short_seqs >= min_gap_quantity && short_seqs > long_seqs)
+ exclude_long_seqs = true;
+
+ for (auto j_iter = cl_start; j_iter != next_iter; j_iter++) {
+ if (g_.length(j_iter->start) - j_iter->edge_gap_start_position > 500 || j_iter->edge_gap_end_position > 500)
+ continue;
+
+ if (exclude_long_seqs && j_iter->gap_seq.size() > long_seq_limit)
+ continue;
+
+ string s = g_.EdgeNucls(j_iter->start).Subseq(start_min, j_iter->edge_gap_start_position).str();
+ s += j_iter->gap_seq.str();
+ s += g_.EdgeNucls(j_iter->end).Subseq(j_iter->edge_gap_end_position, end_max).str();
+ padded_gaps.push_back(GapDescription<Graph>(j_iter->start, j_iter->end, Sequence(s), start_min, end_max));
+ }
+ cl_start = next_iter;
+ }
+ }
+ inner_index[e] = padded_gaps;
+ }
+
+ void PadGapStrings() {
+ for (auto iter = inner_index.begin(); iter != inner_index.end(); ++iter) {
+ DEBUG("Padding gaps for first edge " << g_.int_id(iter->first));
+ PadGapStrings(iter->first);
+ }
+ PostProcess();
+ }
+};
+
+template<class Graph>
+class PacbioGapCloser {
+ typedef typename Graph::EdgeId EdgeId;
+ typedef runtime_k::RtSeq Kmer;
+ typedef vector<map<Kmer, int> > KmerStorage;
+private:
+ DECL_LOGGER("PacbioGaps")
+ ;
+ Graph &g_;
+ //first edge, second edge, weight, seq
+ map<EdgeId, map<EdgeId, pair<size_t, string> > > new_edges_;
+ int closed_gaps;
+ int not_unique_gaps;
+ int chained_gaps;
+ bool consensus_gap_closing;
+public:
+ void CloseGapsInGraph(map<EdgeId, EdgeId> &replacement) {
+ for (auto iter = new_edges_.begin(); iter != new_edges_.end(); ++iter) {
+ if (iter->second.size() != 1) {
+ DEBUG("non-unique gap!!");
+ not_unique_gaps ++;
+ continue;
+ }
+ EdgeId first = iter->first;
+ EdgeId second = (iter->second.begin()->first);
+ if (replacement.find(first) != replacement.end() || replacement.find(second) != replacement.end()) {
+ DEBUG("sorry, gap chains are not supported yet");
+ chained_gaps++;
+ continue;
+ }
+
+ EdgeId first_conj = g_.conjugate(first);
+ EdgeId second_conj = g_.conjugate(second);
+ size_t first_id = g_.int_id(first);
+ size_t second_id = g_.int_id(second);
+ size_t first_id_conj = g_.int_id(g_.conjugate(first));
+ size_t second_id_conj = g_.int_id(g_.conjugate(second));
+ DEBUG("closing gaps between "<< first_id << " " << second_id);
+ size_t len_f = g_.length(first);
+ size_t len_s = g_.length(second);
+ size_t len_sum = iter->second.begin()->second.second.length();
+ double cov = (double)g_.length(first) * g_.coverage(first) + (double)g_.length(second) * g_.coverage(second);
+
+ DEBUG("coverage was " << g_.coverage(first) << " " << g_.coverage(second));
+
+ EdgeId newEdge = g_.AddEdge(g_.EdgeStart(first), g_.EdgeEnd(second), Sequence(iter->second.begin()->second.second));
+ if (cov > UINT_MAX * 0.75 ) cov = UINT_MAX*0.75;
+ cov /= (double) g_.length(newEdge);
+ TRACE(g_.int_id(newEdge));
+ int len_split = int(((double) len_f * (double) len_sum) / ((double)len_s + (double)len_f));
+ if (len_split == 0) {
+ DEBUG(" zero split length, length are:" << len_f <<" " << len_sum <<" " << len_s);
+ len_split = 1;
+ }
+ g_.DeleteEdge(first);
+ g_.DeleteEdge(second);
+ g_.coverage_index().SetAvgCoverage(newEdge, cov);
+ g_.coverage_index().SetAvgCoverage(g_.conjugate(newEdge), cov);
+ size_t next_id = g_.int_id(newEdge);
+ DEBUG("and new coverage is " << g_.coverage(newEdge));
+ closed_gaps ++;
+ size_t next_id_conj = g_.int_id(g_.conjugate(newEdge));
+ TRACE(first_id << " " << second_id << " " << next_id << " " << first_id_conj << " " << second_id_conj << " " << next_id_conj << " ");
+ replacement[first] = newEdge;
+ replacement[second] = newEdge;
+ replacement[first_conj] = g_.conjugate(newEdge);
+ replacement[second_conj] = g_.conjugate(newEdge);
+ }
+ INFO("Closed " << closed_gaps << " gaps");
+ INFO("Total " << not_unique_gaps << " were not closed due to more than one possible pairing");
+ INFO("Total " << chained_gaps << " were skipped because of gap chains");
+ //TODO: chains of gaps!
+ }
+private:
+
+ void ConstructConsensus(EdgeId e, GapStorage<Graph> &storage, map<EdgeId, map<EdgeId, pair<size_t, string> > > & new_edges) {
+ auto cl_start = storage.inner_index[e].begin();
+ auto iter = storage.inner_index[e].begin();
+ size_t cur_len = 0;
+ while (iter != storage.inner_index[e].end()) {
+ auto next_iter = ++iter;
+ cur_len++;
+ if (next_iter == storage.inner_index[e].end() || next_iter->end != cl_start->end) {
+ if (cur_len >= storage.min_gap_quantity && !storage.IsIgnored(make_pair(cl_start->start, cl_start->end))) {
+ vector<string> gap_variants;
+
+ for (auto j_iter = cl_start; j_iter != next_iter; j_iter++) {
+ string s = j_iter->gap_seq.str();
+ transform(s.begin(), s.end(), s.begin(), ::toupper);
+ gap_variants.push_back(s);
+ }
+ if (consensus_gap_closing || (gap_variants.size() > 0 && gap_variants[0].length() < cfg::get().pb.max_contigs_gap_length)) {
+ map <EdgeId, pair<size_t, string>> tmp;
+ string tmp_string;
+ string s = g_.EdgeNucls(cl_start->start).Subseq(0, cl_start->edge_gap_start_position).str();
+ if (consensus_gap_closing) {
+ const ConsensusCore::PoaConsensus *pc = ConsensusCore::PoaConsensus::FindConsensus(
+ gap_variants,
+ ConsensusCore::PoaConfig::GLOBAL_ALIGNMENT);
+ tmp_string = pc->Sequence();
+ } else {
+ tmp_string = gap_variants[0];
+ if (gap_variants.size() > 1) {
+
+ stringstream ss;
+ for (size_t i = 0; i < gap_variants.size(); i++)
+ ss << gap_variants[i].length() << " ";
+ INFO(gap_variants.size() << " gap closing variant for contigs, lengths: " << ss.str());
+ }
+ }
+
+ DEBUG("consenus for " << g_.int_id(cl_start->start) << " and " << g_.int_id(cl_start->end) <<
+ "found: ");
+ DEBUG(tmp_string);
+ s += tmp_string;
+ s += g_.EdgeNucls(cl_start->end).Subseq(cl_start->edge_gap_end_position,
+ g_.length(cl_start->end) + g_.k()).str();
+ tmp.insert(make_pair(cl_start->end, make_pair(cur_len, s)));
+ new_edges[cl_start->start] = tmp;
+ } else {
+ INFO ("Skipping gap of size " << gap_variants[0].length() << " multiplicity " << gap_variants.size());
+ }
+ }
+ cl_start = next_iter;
+ cur_len = 0;
+ }
+ }
+ }
+
+public:
+ PacbioGapCloser(Graph &g, bool consensus_gap )
+ : g_(g), consensus_gap_closing(consensus_gap) {
+ closed_gaps = 0;
+ not_unique_gaps = 0;
+ chained_gaps = 0;
+ }
+
+ void ConstructConsensus(size_t nthreads, GapStorage<Graph> &storage) {
+ vector<map<EdgeId, map<EdgeId, pair<size_t, string> > > > new_edges_by_thread;
+ new_edges_by_thread.resize(nthreads);
+ size_t storage_size = storage.size();
+# pragma omp parallel for shared(storage, new_edges_by_thread) num_threads(nthreads)
+ for (size_t i = 0; i < storage_size; i++) {
+ EdgeId e = storage[i];
+ size_t thread_num = omp_get_thread_num();
+ DEBUG("constructing consenus for first edge " << g_.int_id(e) << " in thread " <<thread_num);
+ ConstructConsensus(e, storage, new_edges_by_thread[thread_num]);
+ }
+ for (size_t i = 0; i < nthreads; i++) {
+ for (auto iter = new_edges_by_thread[i].begin(); iter != new_edges_by_thread[i].end(); ++iter) {
+ new_edges_.insert(*iter);
+ }
+ }
+ }
+ void DumpToFile(const string filename) {
+ ofstream filestr(filename);
+ for (auto iter = new_edges_.begin(); iter != new_edges_.end(); ++iter) {
+ if (iter->second.size() > 1) {
+ DEBUG("nontrivial gap closing for edge" <<g_.int_id(iter->first));
+ }
+ for (auto j_iter = iter->second.begin(); j_iter != iter->second.end(); ++j_iter) {
+ filestr << ">" << g_.int_id(iter->first) << "_" << iter->second.size() << "_" << g_.int_id(j_iter->first) << "_" << j_iter->second.first << endl;
+ filestr << j_iter->second.second << endl;
+ }
+ }
+ }
+
+};
+
+}
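
When CloseGapsInGraph() above joins two edges across a gap, the new edge gets the length-weighted average of the old coverages spread over the new length, and len_split apportions the new length proportionally to the old edge lengths. A small numeric sketch of those two formulas (the lengths and coverages below are invented, not taken from any run):

#include <algorithm>
#include <cstdio>

int main() {
    // Hypothetical edge lengths and coverages around one closed gap.
    double len_f = 1000, cov_f = 30;   // first edge
    double len_s = 500,  cov_s = 20;   // second edge
    double len_new = 1700;             // length of the joined edge, gap included

    // cov = (len_f * cov_f + len_s * cov_s) / len_new, as in CloseGapsInGraph().
    double cov = (len_f * cov_f + len_s * cov_s) / len_new;

    // len_split = len_f * len_new / (len_f + len_s), clamped to at least 1.
    int len_split = std::max(1, int((len_f * len_new) / (len_s + len_f)));

    std::printf("new coverage %.2f, split at %d\n", cov, len_split);
    // prints: new coverage 23.53, split at 1133
}
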
diff --git a/src/modules/assembly_graph/graph_alignment/pacbio/pacbio_read_structures.hpp b/src/modules/assembly_graph/graph_alignment/pacbio/pacbio_read_structures.hpp
new file mode 100644
index 0000000..38bd2e2
--- /dev/null
+++ b/src/modules/assembly_graph/graph_alignment/pacbio/pacbio_read_structures.hpp
@@ -0,0 +1,326 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+/*
+ * pacbio_read_structures.hpp
+ *
+ * Created on: Jan 21, 2013
+ * Author: lab42
+ */
+#pragma once
+
+#include "data_structures/indices/perfect_hash_map.hpp"
+#include "pipeline/graph_pack.hpp"
+#include <algorithm>
+using std::map;
+using std::set;
+namespace pacbio {
+template<class T>
+struct pair_iterator_less {
+ bool operator ()(pair<size_t, T> const& a, pair<size_t, T> const& b) const {
+ return (a.first < b.first);
+ }
+};
+
+struct MappingInstance {
+ int edge_position;
+ int read_position;
+ //Quality is currently the same as multiplicity, so the best quality is 1.
+ int quality;
+ MappingInstance(int edge_position, int read_position, int quality) :
+ edge_position(edge_position), read_position(read_position), quality(quality) {
+ }
+
+ inline bool IsUnique() const {
+ return (quality == 1);
+ }
+
+ string str() {
+ stringstream s;
+ s << "E: " << edge_position << " R: " << read_position << " Q: " << quality;
+ return s.str();
+ }
+
+//Less by EDGE position
+ bool operator <(MappingInstance const& b) const {
+ if (edge_position < b.edge_position || (edge_position == b.edge_position && read_position < b.read_position))
+ return true;
+ else
+ return false;
+ }
+private:
+ DECL_LOGGER("MappingInstance")
+ ;
+};
+
+//Less by READ position
+struct ReadPositionComparator {
+ bool operator ()(MappingInstance const& a, MappingInstance const& b) const {
+ return (a.read_position < b.read_position || (a.read_position == b.read_position && a.edge_position < b.edge_position));
+ }
+};
+
+template<class Graph>
+struct KmerCluster {
+ typedef typename Graph::EdgeId EdgeId;
+ int last_trustable_index;
+ int first_trustable_index;
+ size_t average_read_position;
+ size_t average_edge_position;
+ EdgeId edgeId;
+ vector<MappingInstance> sorted_positions;
+ int size;
+
+ KmerCluster(EdgeId e, const vector<MappingInstance>& v) {
+ last_trustable_index = 0;
+ first_trustable_index = 0;
+ average_read_position = 0;
+ edgeId = e;
+ size = (int) v.size();
+ sorted_positions = v;
+ FillTrustableIndeces();
+ }
+
+ bool operator <(const KmerCluster & b) const {
+ return (average_read_position < b.average_read_position ||(average_read_position == b.average_read_position && edgeId < b.edgeId) ||
+ (average_read_position == b.average_read_position && edgeId == b.edgeId && sorted_positions < b.sorted_positions));
+ }
+
+ bool CanFollow(const KmerCluster &b) const {
+ return (b.sorted_positions[b.last_trustable_index].read_position < sorted_positions[first_trustable_index].read_position);
+ }
+
+ void FillTrustableIndeces() {
+ //ignore non-unique kmers for distance determination
+ int first_unique_ind = 0;
+ while (first_unique_ind != size - 1 && !(sorted_positions[first_unique_ind].IsUnique())) {
+ first_unique_ind += 1;
+ }
+ int last_unique_ind = size - 1;
+ while (last_unique_ind != 0 && !(sorted_positions[last_unique_ind].IsUnique())) {
+ last_unique_ind -= 1;
+ }
+ last_trustable_index = last_unique_ind;
+ first_trustable_index = first_unique_ind;
+ double tmp_read_position = 0, tmp_edge_position = 0;
+ vector<int> diffs;
+ for (auto mp : sorted_positions) {
+ tmp_read_position += mp.read_position;
+ tmp_edge_position += mp.edge_position;
+ diffs.push_back(mp.read_position - mp.edge_position);
+ }
+ sort(diffs.begin(), diffs.end());
+ int median_diff = diffs[size/2];
+
+ tmp_read_position /= size;
+ tmp_edge_position /= size;
+ average_read_position = (size_t)trunc(tmp_read_position);
+ average_edge_position = (size_t)trunc(tmp_edge_position);
+
+ if (size > 10) {
+ int max_debug_size = 10;
+ vector<int> distances(max_debug_size);
+ for (int df: diffs) {
+ int ind = abs(df - median_diff)/ 50;
+ if (ind > max_debug_size - 1) ind = max_debug_size - 1;
+ distances [ind] ++;
+ }
+ if (size > 100 || distances[0] * 5 < size * 4) {
+ stringstream s;
+
+ for (int d: distances) {
+ s << d << " ";
+ }
+// INFO(s.str());
+
+ }
+ }
+ }
+
+ string str(const Graph &g) const{
+ stringstream s;
+ s << "Edge: " << g.int_id(edgeId) << " on edge: " << sorted_positions[first_trustable_index].edge_position<< " - " << sorted_positions[last_trustable_index].edge_position<< ";on read: " << sorted_positions[first_trustable_index].read_position<< " - " << sorted_positions[last_trustable_index].read_position<< ";size "<< size;
+ return s.str();
+ }
+private:
+ DECL_LOGGER("KmerCluster")
+ ;
+};
+
+template<class Graph>
+struct GapDescription {
+ typedef typename Graph::EdgeId EdgeId;
+ typename Graph::EdgeId start, end;
+ Sequence gap_seq;
+ int edge_gap_start_position, edge_gap_end_position;
+
+
+ GapDescription(EdgeId start_e, EdgeId end_e, const Sequence &gap, int gap_start, int gap_end) :
+ start(start_e), end(end_e), gap_seq(gap.str()), edge_gap_start_position(gap_start), edge_gap_end_position(gap_end) {
+ }
+
+ GapDescription(const KmerCluster<Graph> &a, const KmerCluster<Graph> & b, Sequence read, int pacbio_k) {
+ edge_gap_start_position = a.sorted_positions[a.last_trustable_index].edge_position;
+ edge_gap_end_position = b.sorted_positions[b.first_trustable_index].edge_position + pacbio_k - 1;
+ start = a.edgeId;
+ end = b.edgeId;
+ DEBUG(read.str());
+ gap_seq = read.Subseq(a.sorted_positions[a.last_trustable_index].read_position, b.sorted_positions[b.first_trustable_index].read_position + pacbio_k - 1);
+ DEBUG(gap_seq.str());
+ DEBUG("gap added");
+ }
+
+ GapDescription<Graph> conjugate(Graph &g_, int shift) const {
+ GapDescription<Graph> res(
+ g_.conjugate(end), g_.conjugate(start), (!gap_seq),
+ (int) g_.length(end) + shift - edge_gap_end_position,
+ (int) g_.length(start) + shift - edge_gap_start_position);
+ DEBUG("conjugate created" << res.str(g_));
+ return res;
+ }
+
+ string str(Graph &g_) const {
+ stringstream s;
+ s << g_.int_id(start) << " " << edge_gap_start_position <<endl << g_.int_id(end) << " " << edge_gap_end_position << endl << gap_seq.str()<< endl;
+ return s.str();
+ }
+
+ bool operator <(const GapDescription & b) const {
+ return (start < b.start || (start == b.start && end < b.end) ||
+ (start == b.start && end == b.end && edge_gap_start_position < b.edge_gap_start_position));
+ }
+
+private:
+ DECL_LOGGER("PacIndex")
+ ;
+};
+
+template<class Graph>
+struct OneReadMapping {
+ typedef typename Graph::EdgeId EdgeId;
+ vector<vector<EdgeId> > main_storage;
+ vector<GapDescription<Graph> > gaps;
+ vector<size_t> real_length;
+//Total number of used seeds, summed over all subreads.
+ size_t seed_num;
+ OneReadMapping(vector<vector<EdgeId> > &paths_description, vector<GapDescription<Graph> > &gaps_description, vector<size_t> real_length, size_t seed_num) :
+ main_storage(paths_description), gaps(gaps_description), real_length(real_length), seed_num(seed_num) {
+ }
+
+};
+
+
+struct StatsCounter{
+
+ map<size_t,size_t> path_len_in_edges;
+ vector<size_t> subreads_length;
+ size_t total_len ;
+ size_t reads_with_conjugate;
+ size_t subreads_count;
+ map<size_t, size_t> seeds_percentage;
+ StatsCounter() {
+ total_len = 0;
+ reads_with_conjugate = 0;
+ }
+
+ void AddStorage(StatsCounter &other) {
+ total_len += other.total_len;
+ reads_with_conjugate += other.reads_with_conjugate;
+ for (auto iter = other.subreads_length.begin(); iter != other.subreads_length.end(); ++iter) {
+ subreads_length.push_back(*iter);
+ }
+
+ for (auto iter = other.path_len_in_edges.begin(); iter != other.path_len_in_edges.end(); ++iter){
+ auto j_iter = iter;
+ if (( j_iter = path_len_in_edges.find(iter->first)) == path_len_in_edges.end()){
+ path_len_in_edges.insert(make_pair(iter->first, iter->second));
+ } else {
+ path_len_in_edges[j_iter->first] += iter->second;
+ }
+ }
+ for (auto iter = other.seeds_percentage.begin(); iter != other.seeds_percentage.end(); ++iter){
+ auto j_iter = iter;
+ if (( j_iter = seeds_percentage.find(iter->first)) == seeds_percentage.end()){
+ seeds_percentage.insert(make_pair(iter->first, iter->second));
+ } else {
+ seeds_percentage[j_iter->first] += iter->second;
+ }
+ }
+ }
+
+ void report(){
+ size_t total = 0;
+ for (auto iter = seeds_percentage.begin(); iter != seeds_percentage.end(); ++iter){
+ total += iter->second;
+ }
+ size_t cur = 0;
+ size_t percentage = 0;
+ for (auto iter = seeds_percentage.begin(); iter != seeds_percentage.end(); ++iter){
+ cur += iter->second;
+ percentage = iter->first;
+ if (cur * 2 > total) break;
+ }
+ INFO("Median fraction of present seeds in maximal alignmnent among reads aligned to the graph: " << double(percentage) * 0.001);
+ }
+private:
+ DECL_LOGGER("StatsCounter");
+
+};
+
+inline int StringDistance(string &a, string &b) {
+ int a_len = (int) a.length();
+ int b_len = (int) b.length();
+ int d = min(a_len / 3, b_len / 3);
+ d = max(d, 10);
+ DEBUG(a_len << " " << b_len << " " << d);
+ vector<vector<int> > table(a_len);
+ //int d =
+ for (int i = 0; i < a_len; i++) {
+ table[i].resize(b_len);
+ int low = max(max(0, i - d - 1), i + b_len - a_len - d - 1);
+ int high = min(min(b_len, i + d + 1), i + a_len - b_len + d + 1);
+ TRACE(low << " " <<high);
+ for (int j = low; j < high; j++)
+ table[i][j] = 1000000;
+ }
+ table[a_len - 1][b_len - 1] = 1000000;
+ table[0][0] = 0;
+//free deletions on begin
+// for(int j = 0; j < b_len; j++)
+// table[0][j] = 0;
+
+ for (int i = 0; i < a_len; i++) {
+ int low = max(max(0, i - d), i + b_len - a_len - d);
+ int high = min(min(b_len, i + d), i + a_len - b_len + d);
+
+ TRACE(low << " " <<high);
+ for (int j = low; j < high; j++) {
+
+ if (i > 0)
+ table[i][j] = min(table[i][j], table[i - 1][j] + 1);
+ if (j > 0)
+ table[i][j] = min(table[i][j], table[i][j - 1] + 1);
+ if (i > 0 && j > 0) {
+ int add = 1;
+ if (a[i] == b[j])
+ add = 0;
+ table[i][j] = min(table[i][j], table[i - 1][j - 1] + add);
+ }
+ }
+ }
+ //return table[a_len - 1][b_len - 1];
+//free deletions on end
+ int res = table[a_len - 1][b_len - 1];
+ DEBUG(res);
+// for(int j = 0; j < b_len; j++){
+// res = min(table[a_len - 1][j], res);
+// }
+ return res;
+}
+
+
+}
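
StringDistance() above computes a banded edit distance: only cells within roughly d = max(min(|a|, |b|)/3, 10) of the diagonal are filled, which keeps scoring a candidate graph path against a PacBio subread fast. A compact standalone version of the same banding idea, without the logging macros; it is written for illustration and is not a drop-in replacement for the function above:

#include <algorithm>
#include <cstdio>
#include <string>
#include <vector>

// Banded edit distance: cells further than `d` from the diagonal stay "infinite".
int BandedDistance(const std::string& a, const std::string& b) {
    const int n = int(a.size()), m = int(b.size());
    const int d = std::max(std::min(n / 3, m / 3), 10);
    const int INF = 1000000;
    std::vector<std::vector<int>> t(n + 1, std::vector<int>(m + 1, INF));
    for (int j = 0; j <= std::min(m, d); ++j) t[0][j] = j;
    for (int i = 1; i <= n; ++i) {
        if (i - d <= 0) t[i][0] = i;
        int lo = std::max(1, i - d), hi = std::min(m, i + d);
        for (int j = lo; j <= hi; ++j) {
            int add = (a[i - 1] == b[j - 1]) ? 0 : 1;
            t[i][j] = std::min({t[i - 1][j - 1] + add,   // match / substitution
                                t[i - 1][j] + 1,         // deletion
                                t[i][j - 1] + 1});       // insertion
        }
    }
    return t[n][m];
}

int main() {
    std::printf("%d\n", BandedDistance("ACGTACGTACGT", "ACGTTCGTACG"));  // prints 2
}
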
diff --git a/src/modules/assembly_graph/graph_alignment/sequence_mapper.hpp b/src/modules/assembly_graph/graph_alignment/sequence_mapper.hpp
new file mode 100644
index 0000000..cab3ebe
--- /dev/null
+++ b/src/modules/assembly_graph/graph_alignment/sequence_mapper.hpp
@@ -0,0 +1,408 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "data_structures/sequence/sequence_tools.hpp"
+#include "assembly_graph/paths/path_processor.hpp"
+#include "assembly_graph/graph_core/basic_graph_stats.hpp"
+
+#include "data_structures/sequence/runtime_k.hpp"
+#include "edge_index.hpp"
+#include "kmer_mapper.hpp"
+
+#include <cstdlib>
+#include "assembly_graph/graph_core/basic_graph_stats.hpp"
+
+namespace debruijn_graph {
+using omnigraph::MappingPath;
+using omnigraph::Path;
+using omnigraph::MappingRange;
+using omnigraph::Range;
+
+template<class Graph>
+MappingPath<typename Graph::EdgeId> ConjugateMapping(const Graph& g,
+ const MappingPath<typename Graph::EdgeId>& mp,
+ size_t sequence_length) {
+ MappingPath<typename Graph::EdgeId> answer;
+ for (size_t i = mp.size(); i > 0; --i) {
+ auto p = mp[i-1];
+ auto e = p.first;
+ MappingRange mr = p.second;
+ answer.push_back(g.conjugate(e),
+ MappingRange(mr.initial_range.Invert(sequence_length - g.k()),
+ mr.mapped_range.Invert(g.length(e))));
+ }
+ return answer;
+}
+
+template<class Graph>
+class SequenceMapper {
+public:
+ typedef typename Graph::EdgeId EdgeId;
+ typedef runtime_k::RtSeq Kmer;
+
+protected:
+ const Graph& g_;
+
+public:
+ SequenceMapper(const Graph& g): g_(g) {
+
+ }
+
+ virtual ~SequenceMapper() {
+
+ }
+
+ virtual MappingPath<EdgeId> MapSequence(const Sequence &sequence) const = 0;
+
+
+ MappingPath<EdgeId> MapRead(const io::SingleRead &read) const {
+// VERIFY(read.IsValid());
+ DEBUG(read.name() << " is mapping");
+ string s = read.GetSequenceString();
+ size_t l = 0, r = 0;
+ MappingPath<EdgeId> result;
+ for(size_t i = 0; i < s.size(); i++) {
+ if (read.GetSequenceString()[i] == 'N') {
+ if (r > l) {
+ result.join(MapSequence(Sequence(s.substr(l, r - l))), int(l));
+ }
+ r = i + 1;
+ l = i + 1;
+ } else {
+ r++;
+ }
+ }
+ if (r > l) {
+ result.join(MapSequence(Sequence(s.substr(l, r - l))), int(l));
+ }
+ DEBUG(read.name() << " is mapped");
+ DEBUG("Number of edges is " << result.size());
+
+ return result;
+ }
+
+ virtual size_t KmerSize() const = 0;
+};
+
+template<class Graph>
+class MappingPathFixer {
+public:
+
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+
+ MappingPathFixer(const Graph& graph)
+ : g_(graph) {
+ }
+
+ bool CheckContiguous(const vector<typename Graph::EdgeId>& path) const {
+ for (size_t i = 1; i < path.size(); ++i) {
+ if (g_.EdgeEnd(path[i - 1]) != g_.EdgeStart(path[i]))
+ return false;
+ }
+ return true;
+ }
+
+ Path<EdgeId> TryFixPath(const Path<EdgeId>& path, size_t length_bound = 70) const {
+ return Path<EdgeId>(TryFixPath(path.sequence(), length_bound), path.start_pos(), path.end_pos());
+ }
+
+ vector<EdgeId> TryFixPath(const vector<EdgeId>& edges, size_t length_bound = 70) const {
+ vector<EdgeId> answer;
+ if (edges.empty()) {
+ // WARN("Mapping path was empty");
+ return vector<EdgeId>();
+ }
+ answer.push_back(edges[0]);
+ for (size_t i = 1; i < edges.size(); ++i) {
+ if (g_.EdgeEnd(edges[i - 1]) != g_.EdgeStart(edges[i])) {
+ vector<EdgeId> closure = TryCloseGap(g_.EdgeEnd(edges[i - 1]),
+ g_.EdgeStart(edges[i]),
+ length_bound);
+ answer.insert(answer.end(), closure.begin(), closure.end());
+ }
+ answer.push_back(edges[i]);
+ }
+ return answer;
+ }
+
+ vector<EdgeId> DeleteSameEdges(const vector<EdgeId>& path) const {
+ vector<EdgeId> result;
+ if (path.empty()) {
+ return result;
+ }
+ result.push_back(path[0]);
+ for (size_t i = 1; i < path.size(); ++i) {
+ if (path[i] != result[result.size() - 1]) {
+ result.push_back(path[i]);
+ }
+ }
+ return result;
+ }
+
+private:
+ vector<EdgeId> TryCloseGap(VertexId v1, VertexId v2, size_t length_bound) const {
+ if (v1 == v2)
+ return vector<EdgeId>();
+ TRACE("Trying to close gap between v1=" << g_.int_id(v1) << " and v2=" << g_.int_id(v2));
+ omnigraph::PathStorageCallback<Graph> path_store(g_);
+
+ TRACE("Path storage callback created");
+ //todo reduce value after investigation
+ omnigraph::ProcessPaths(g_, 0, length_bound, v1, v2, path_store);
+
+ TRACE("Paths processed");
+ if (path_store.size() == 0) {
+ TRACE("Failed to find closing path");
+ // TRACE("Failed to close gap between v1=" << graph_.int_id(v1)
+ // << " (conjugate "
+ // << graph_.int_id(g_.conjugate(v1))
+ // << ") and v2=" << g_.int_id(v2)
+ // << " (conjugate "
+ // << g_.int_id(g_.conjugate(v2)) << ")");
+ // return boost::none;
+ return vector<EdgeId>();
+ } else if (path_store.size() == 1) {
+ TRACE("Unique closing path found");
+ } else {
+ TRACE("Several closing paths found, first chosen");
+ }
+ TRACE("Taking answer ");
+ vector<EdgeId> answer = path_store.paths().front();
+ TRACE("Gap closed");
+ TRACE( "Cumulative closure length is " << CumulativeLength(g_, answer));
+ return answer;
+ }
+ const Graph& g_;
+};
+
+template<class Graph>
+class ReadPathFinder {
+private:
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+ const Graph& g_;
+ typedef MappingPathFixer<Graph> GraphMappingPathFixer;
+ const GraphMappingPathFixer path_fixer_;
+public:
+ ReadPathFinder (const Graph& g) :
+ g_(g), path_fixer_(g)
+ { }
+
+ vector<EdgeId> FindReadPath(const MappingPath<EdgeId>& mapping_path) const {
+ if (!IsMappingPathValid(mapping_path)) {
+ TRACE("read unmapped");
+ return vector<EdgeId>();
+ }
+ vector<EdgeId> corrected_path = path_fixer_.DeleteSameEdges(
+ mapping_path.simple_path());
+ PrintPathInfo(corrected_path);
+ if(corrected_path.size() != mapping_path.simple_path().size()) {
+ DEBUG("Some edges were deleted");
+ }
+ vector<EdgeId> fixed_path = path_fixer_.TryFixPath(corrected_path);
+ if (!path_fixer_.CheckContiguous(fixed_path)) {
+ TRACE("read unmapped");
+ std::stringstream debug_stream;
+ for (size_t i = 0; i < fixed_path.size(); ++i) {
+ debug_stream << g_.int_id(fixed_path[i]) << " ";
+ }
+ TRACE(debug_stream.str());
+ return vector<EdgeId>();
+ } else {
+ DEBUG("Path fix works");
+ }
+ return fixed_path;
+ }
+
+
+private:
+
+ bool IsTip(VertexId v) const {
+ return g_.IncomingEdgeCount(v) + g_.OutgoingEdgeCount(v) == 1;
+ }
+
+ bool IsMappingPathValid(const MappingPath<EdgeId>& path) const {
+ return path.size() != 0;
+ }
+
+ void PrintPathInfo(vector<EdgeId>& corrected_path) const {
+ for(size_t i = 0; i < corrected_path.size(); ++i) {
+ DEBUG(i + 1 << "-th edge is " << corrected_path[i].int_id());
+ }
+ }
+};
+
+template<class Graph, class Index>
+class NewExtendedSequenceMapper: public SequenceMapper<Graph> {
+ using SequenceMapper<Graph>::g_;
+
+ public:
+ typedef std::vector<MappingRange> RangeMappings;
+
+ private:
+ const Index& index_;
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+ typedef typename Index::KMer Kmer;
+ typedef KmerMapper<Graph> KmerSubs;
+ const KmerSubs& kmer_mapper_;
+ size_t k_;
+ bool optimization_on_;
+ // mutable size_t mapped_;
+ // mutable size_t unmapped_;
+
+ bool FindKmer(const Kmer &kmer, size_t kmer_pos, std::vector<EdgeId> &passed,
+ RangeMappings& range_mappings) const {
+ std::pair<EdgeId, size_t> position = index_.get(kmer);
+ if (position.second != -1u/*index contains this k-mer*/) {
+ if (passed.empty() || passed.back() != position.first ||
+ kmer_pos != range_mappings.back().initial_range.end_pos ||
+ position.second + 1 < range_mappings.back().mapped_range.end_pos) {
+ passed.push_back(position.first);
+ range_mappings.push_back(
+ MappingRange(Range(kmer_pos, kmer_pos + 1),
+ Range(position.second, position.second + 1)));
+ } else {
+ range_mappings.back().initial_range.end_pos = kmer_pos + 1;
+ range_mappings.back().mapped_range.end_pos = position.second + 1;
+ }
+ return true;
+ }
+ return false;
+ }
+
+ bool TryThread(const Kmer& kmer, size_t kmer_pos, std::vector<EdgeId> &passed,
+ RangeMappings& range_mappings) const {
+ EdgeId last_edge = passed.back();
+ size_t end_pos = range_mappings.back().mapped_range.end_pos;
+ if (end_pos < g_.length(last_edge)) {
+ if (g_.EdgeNucls(last_edge)[end_pos + k_ - 1] == kmer[k_ - 1]) {
+ range_mappings.back().initial_range.end_pos++;
+ range_mappings.back().mapped_range.end_pos++;
+ return true;
+ }
+ } else {
+ VertexId v = g_.EdgeEnd(last_edge);
+
+ if(!optimization_on_)
+ if(g_.OutgoingEdgeCount(v) > 1)
+ return false;
+
+ for (auto I = g_.out_begin(v), E = g_.out_end(v); I != E; ++I) {
+ EdgeId edge = *I;
+ if (g_.EdgeNucls(edge)[k_ - 1] == kmer[k_ - 1]) {
+ passed.push_back(edge);
+ range_mappings.push_back(
+ MappingRange(Range(kmer_pos, kmer_pos + 1),
+ Range(0, 1)));
+ return true;
+ }
+ }
+ }
+ return false;
+ }
+
+ bool Substitute(Kmer& kmer) const {
+ Kmer subs = kmer_mapper_.Substitute(kmer);
+ if (subs != kmer) {
+ kmer = subs;
+ return true;
+ }
+ return false;
+ }
+
+ bool ProcessKmer(Kmer kmer, size_t kmer_pos, std::vector<EdgeId> &passed_edges,
+ RangeMappings& range_mapping, bool try_thread) const {
+ if (try_thread) {
+ if (!TryThread(kmer, kmer_pos, passed_edges, range_mapping)) {
+ Substitute(kmer);
+ FindKmer(kmer, kmer_pos, passed_edges, range_mapping);
+ return false;
+ } else {
+ return true;
+ }
+ } else {
+ if (!Substitute(kmer)) {
+ return FindKmer(kmer, kmer_pos, passed_edges, range_mapping);
+ } else {
+ FindKmer(kmer, kmer_pos, passed_edges, range_mapping);
+ return false;
+ }
+ }
+ // if (!Substitute(kmer)) {
+ // if (try_thread) {
+ // return TryThread(kmer, kmer_pos, passed_edges, range_mapping);
+ // } else {
+ // return FindKmer(kmer, kmer_pos, passed_edges, range_mapping);
+ // }
+ // } else {
+ // FindKmer(kmer, kmer_pos, passed_edges, range_mapping);
+ // return false;
+ // }
+ }
+
+ public:
+ NewExtendedSequenceMapper(const Graph& g,
+ const Index& index,
+ const KmerSubs& kmer_mapper,
+ bool optimization_on = true) :
+ SequenceMapper<Graph>(g), index_(index), kmer_mapper_(kmer_mapper), k_(g.k()+1),
+ optimization_on_(optimization_on) { }
+
+ ~NewExtendedSequenceMapper() {
+ // TRACE("In destructor of sequence mapper");
+ // TRACE(mapped_ << " sequences were mapped");
+ // TRACE(unmapped_ << " sequences couldn't be mapped");
+ }
+
+ MappingPath<EdgeId> MapSequence(const Sequence &sequence) const {
+ std::vector<EdgeId> passed_edges;
+ RangeMappings range_mapping;
+
+ if (sequence.size() < k_) {
+ return MappingPath<EdgeId>();
+ }
+
+ Kmer kmer = sequence.start<Kmer>(k_);
+ //kmer >>= 0;
+ bool try_thread = false;
+ try_thread = ProcessKmer(kmer, 0, passed_edges,
+ range_mapping, try_thread);
+ for (size_t i = k_; i < sequence.size(); ++i) {
+ kmer <<= sequence[i];
+ try_thread = ProcessKmer(kmer, i - k_ + 1, passed_edges,
+ range_mapping, try_thread);
+ }
+
+ // if (passed_edges.empty()) {
+ //// TRACE("Sequence " << sequence << "couldn't be mapped");
+ // unmapped_++;
+ // //todo maybe check path consistency?
+ // } else {
+ // mapped_++;
+ // }
+
+ return MappingPath<EdgeId>(passed_edges, range_mapping);
+ }
+
+ size_t KmerSize() const {
+ return k_;
+ }
+
+ DECL_LOGGER("NewExtendedSequenceMapper");
+};
+
+
+template<class gp_t>
+std::shared_ptr<NewExtendedSequenceMapper<typename gp_t::graph_t, typename gp_t::index_t> > MapperInstance(const gp_t& gp) {
+ return std::make_shared<NewExtendedSequenceMapper<typename gp_t::graph_t, typename gp_t::index_t> >(gp.g, gp.index, gp.kmer_mapper);
+}
+
+}
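
A minimal standalone sketch (not part of the upstream patch) of the consecutive-duplicate collapsing done by MappingPathFixer::DeleteSameEdges above, with plain ints standing in for EdgeId; the function and variable names here are illustrative only.

#include <cassert>
#include <vector>

// Keep each edge only if it differs from the previously kept one
// (mirrors DeleteSameEdges; int stands in for EdgeId).
std::vector<int> CollapseConsecutiveDuplicates(const std::vector<int>& path) {
    std::vector<int> result;
    for (int e : path) {
        if (result.empty() || result.back() != e)
            result.push_back(e);
    }
    return result;
}

int main() {
    // A read that stays on the same edge over several consecutive k-mers
    // yields one occurrence per run; non-adjacent repeats are preserved.
    std::vector<int> path = {1, 1, 2, 2, 2, 3, 1};
    assert((CollapseConsecutiveDuplicates(path) == std::vector<int>{1, 2, 3, 1}));
    return 0;
}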
diff --git a/src/modules/assembly_graph/graph_alignment/sequence_mapper_notifier.hpp b/src/modules/assembly_graph/graph_alignment/sequence_mapper_notifier.hpp
new file mode 100644
index 0000000..ed7c41f
--- /dev/null
+++ b/src/modules/assembly_graph/graph_alignment/sequence_mapper_notifier.hpp
@@ -0,0 +1,175 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#ifndef SEQUENCE_MAPPER_NOTIFIER_HPP_
+#define SEQUENCE_MAPPER_NOTIFIER_HPP_
+
+#include "assembly_graph/graph_alignment/sequence_mapper.hpp"
+#include "short_read_mapper.hpp"
+#include "io/reads/paired_read.hpp"
+#include "pipeline/graph_pack.hpp"
+
+#include <vector>
+#include <cstdlib>
+
+namespace debruijn_graph {
+//todo think if we still need all this
+class SequenceMapperListener {
+public:
+ virtual void StartProcessLibrary(size_t threads_count) = 0;
+ virtual void StopProcessLibrary() = 0;
+
+ //TODO: think about read hierarchy
+ virtual void ProcessPairedRead(size_t thread_index, const io::PairedRead& pr, const MappingPath<EdgeId>& read1, const MappingPath<EdgeId>& read2) = 0;
+ virtual void ProcessPairedRead(size_t thread_index, const io::PairedReadSeq& pr, const MappingPath<EdgeId>& read1, const MappingPath<EdgeId>& read2) = 0;
+ virtual void ProcessSingleRead(size_t thread_index, const io::SingleRead& r, const MappingPath<EdgeId>& read) = 0;
+ virtual void ProcessSingleRead(size_t thread_index, const io::SingleReadSeq& r, const MappingPath<EdgeId>& read) = 0;
+
+ virtual void MergeBuffer(size_t thread_index) = 0;
+ virtual ~SequenceMapperListener() {}
+};
+
+class SequenceMapperNotifier {
+ static const size_t BUFFER_SIZE = 200000;
+public:
+ typedef SequenceMapper<conj_graph_pack::graph_t> SequenceMapperT;
+
+ SequenceMapperNotifier(const conj_graph_pack& gp)
+ : gp_(gp) { }
+
+ void Subscribe(size_t lib_index, SequenceMapperListener* listener) {
+ while ((int)lib_index >= (int)listeners_.size() - 1) {
+ std::vector<SequenceMapperListener*> vect;
+ listeners_.push_back(vect);
+ }
+ listeners_[lib_index].push_back(listener);
+ }
+
+ template<class ReadType>
+ void ProcessLibrary(io::ReadStreamList<ReadType>& streams,
+ size_t lib_index, const SequenceMapperT& mapper, size_t threads_count = 0) {
+ if (threads_count == 0)
+ threads_count = streams.size();
+
+ streams.reset();
+ NotifyStartProcessLibrary(lib_index, threads_count);
+
+ size_t counter = 0, n = 15;
+ size_t fmem = get_free_memory();
+
+ #pragma omp parallel for num_threads(threads_count) shared(counter)
+ for (size_t ithread = 0; ithread < threads_count; ++ithread) {
+ size_t size = 0;
+ ReadType r;
+ auto& stream = streams[ithread];
+ stream.reset();
+ while (!stream.eof()) {
+ if (size == BUFFER_SIZE ||
+ // Stop filling the buffer if available memory has dropped below
+ // 40% of what was free initially (and the buffer is non-trivial).
+ (10 * get_free_memory() / 4 < fmem && size > 10000)) {
+ #pragma omp critical
+ {
+ counter += size;
+ if (counter >> n) {
+ INFO("Processed " << counter << " reads");
+ n += 1;
+ }
+ size = 0;
+ NotifyMergeBuffer(lib_index, ithread);
+ }
+ }
+ stream >> r;
+ ++size;
+ NotifyProcessRead(r, mapper, lib_index, ithread);
+ }
+ }
+ INFO("Total " << counter << " reads processed");
+ NotifyStopProcessLibrary(lib_index);
+ }
+
+private:
+ template<class ReadType>
+ void NotifyProcessRead(const ReadType& r, const SequenceMapperT& mapper, size_t ilib, size_t ithread) const;
+
+ void NotifyStartProcessLibrary(size_t ilib, size_t thread_count) const {
+ for (const auto& listener : listeners_[ilib])
+ listener->StartProcessLibrary(thread_count);
+ }
+
+ void NotifyStopProcessLibrary(size_t ilib) const {
+ for (const auto& listener : listeners_[ilib])
+ listener->StopProcessLibrary();
+ }
+
+ void NotifyMergeBuffer(size_t ilib, size_t ithread) const {
+ for (const auto& listener : listeners_[ilib])
+ listener->MergeBuffer(ithread);
+ }
+ const conj_graph_pack& gp_;
+
+ std::vector<std::vector<SequenceMapperListener*> > listeners_; //outer vector is indexed by library (one entry per lib)
+};
+
+template<>
+inline void SequenceMapperNotifier::NotifyProcessRead(const io::PairedReadSeq& r,
+ const SequenceMapperT& mapper,
+ size_t ilib,
+ size_t ithread) const {
+
+ const Sequence& read1 = r.first().sequence();
+ const Sequence& read2 = r.second().sequence();
+ MappingPath<EdgeId> path1 = mapper.MapSequence(read1);
+ MappingPath<EdgeId> path2 = mapper.MapSequence(read2);
+ for (const auto& listener : listeners_[ilib]) {
+ TRACE("Dist: " << r.second().size() << " - " << r.insert_size() << " = " << r.second().size() - r.insert_size());
+ listener->ProcessPairedRead(ithread, r, path1, path2);
+ listener->ProcessSingleRead(ithread, r.first(), path1);
+ listener->ProcessSingleRead(ithread, r.second(), path2);
+ }
+}
+
+template<>
+inline void SequenceMapperNotifier::NotifyProcessRead(const io::PairedRead& r,
+ const SequenceMapperT& mapper,
+ size_t ilib,
+ size_t ithread) const {
+ MappingPath<EdgeId> path1 = mapper.MapRead(r.first());
+ MappingPath<EdgeId> path2 = mapper.MapRead(r.second());
+ for (const auto& listener : listeners_[ilib]) {
+ TRACE("Dist: " << r.second().size() << " - " << r.insert_size() << " = " << r.second().size() - r.insert_size());
+ listener->ProcessPairedRead(ithread, r, path1, path2);
+ listener->ProcessSingleRead(ithread, r.first(), path1);
+ listener->ProcessSingleRead(ithread, r.second(), path2);
+ }
+}
+
+template<>
+inline void SequenceMapperNotifier::NotifyProcessRead(const io::SingleReadSeq& r,
+ const SequenceMapperT& mapper,
+ size_t ilib,
+ size_t ithread) const {
+ const Sequence& read = r.sequence();
+ MappingPath<EdgeId> path = mapper.MapSequence(read);
+ for (const auto& listener : listeners_[ilib])
+ listener->ProcessSingleRead(ithread, r, path);
+}
+
+template<>
+inline void SequenceMapperNotifier::NotifyProcessRead(const io::SingleRead& r,
+ const SequenceMapperT& mapper,
+ size_t ilib,
+ size_t ithread) const {
+ MappingPath<EdgeId> path = mapper.MapRead(r);
+ for (const auto& listener : listeners_[ilib])
+ listener->ProcessSingleRead(ithread, r, path);
+}
+
+} /*debruijn_graph*/
+
+
+#endif /* SEQUENCE_MAPPER_NOTIFIER_HPP_ */
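
A usage sketch (not part of the upstream patch) of the listener interface declared above: a hypothetical listener that counts mapped single reads per thread, plus the subscribe/process calls that would drive it. The class name, counting logic, and the caller-provided objects gp, streams, lib, lib_index and threads_count are assumptions for illustration; the overridden signatures follow SequenceMapperListener as declared in this header.

#include "assembly_graph/graph_alignment/sequence_mapper_notifier.hpp"
#include <vector>

namespace debruijn_graph {

// Hypothetical listener: counts reads that received a non-empty mapping.
class CountingListener : public SequenceMapperListener {
    std::vector<size_t> counts_;
public:
    void StartProcessLibrary(size_t threads_count) override { counts_.assign(threads_count, 0); }
    void StopProcessLibrary() override {}
    void ProcessPairedRead(size_t, const io::PairedRead&,
                           const MappingPath<EdgeId>&, const MappingPath<EdgeId>&) override {}
    void ProcessPairedRead(size_t, const io::PairedReadSeq&,
                           const MappingPath<EdgeId>&, const MappingPath<EdgeId>&) override {}
    void ProcessSingleRead(size_t thread_index, const io::SingleRead&,
                           const MappingPath<EdgeId>& path) override {
        if (path.size() > 0) ++counts_[thread_index];
    }
    void ProcessSingleRead(size_t thread_index, const io::SingleReadSeq&,
                           const MappingPath<EdgeId>& path) override {
        if (path.size() > 0) ++counts_[thread_index];
    }
    void MergeBuffer(size_t) override {}   // nothing buffered in this toy listener
};

} // namespace debruijn_graph

// Wiring (assumed caller context):
//   SequenceMapperNotifier notifier(gp);
//   CountingListener listener;
//   notifier.Subscribe(lib_index, &listener);
//   notifier.ProcessLibrary(streams, lib_index, *ChooseProperMapper(gp, lib), threads_count);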
diff --git a/src/modules/assembly_graph/graph_alignment/short_read_mapper.hpp b/src/modules/assembly_graph/graph_alignment/short_read_mapper.hpp
new file mode 100644
index 0000000..2202400
--- /dev/null
+++ b/src/modules/assembly_graph/graph_alignment/short_read_mapper.hpp
@@ -0,0 +1,98 @@
+/*
+ * short_read_mapper.hpp
+ *
+ * Created on: Dec 4, 2013
+ * Author: andrey
+ */
+
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+
+#include "assembly_graph/graph_alignment/sequence_mapper.hpp"
+#include "assembly_graph/graph_alignment/pacbio/pac_index.hpp"
+
+namespace debruijn_graph {
+
+template<class Graph>
+class SensitiveReadMapper: public SequenceMapper<Graph> {
+ typedef typename Graph::EdgeId EdgeId;
+ using SequenceMapper<Graph>::g_;
+private:
+
+ size_t small_k_;
+
+ static map<size_t, pacbio::PacBioMappingIndex<Graph>* > indices_;
+ static size_t active_mappers_;
+
+ pacbio::PacBioMappingIndex<Graph>* index_;
+
+public:
+
+ SensitiveReadMapper(const Graph& g, size_t k, size_t graph_k) :
+ SequenceMapper<Graph>(g), small_k_(k)
+ {
+ if (indices_.find(small_k_) == indices_.end()) {
+ indices_.insert(make_pair(small_k_,
+ new pacbio::PacBioMappingIndex<Graph>(g, small_k_, graph_k, false)));
+ }
+ index_ = indices_[small_k_];
+ ++active_mappers_;
+ }
+
+ MappingPath<EdgeId> MapSequence(const Sequence &sequence) const {
+ return index_->GetShortReadAlignment(sequence);
+ }
+
+ size_t KmerSize() const {
+ return small_k_;
+ }
+
+ ~SensitiveReadMapper() {
+ --active_mappers_;
+ }
+
+ static void EraseIndices() {
+ if (active_mappers_ > 0) {
+ WARN("There are still active mappers");
+ }
+ for (auto iter = indices_.begin(); iter != indices_.end(); ++iter) {
+ delete iter->second;
+ }
+ indices_.clear();
+ }
+
+};
+
+template<class Graph>
+map<size_t, pacbio::PacBioMappingIndex<Graph>* > SensitiveReadMapper<Graph>::indices_;
+
+template<class Graph>
+size_t SensitiveReadMapper<Graph>::active_mappers_ = 0;
+
+
+template<class graph_pack, class SequencingLib>
+std::shared_ptr<SequenceMapper<typename graph_pack::graph_t>> ChooseProperMapper(const graph_pack& gp, const SequencingLib& library) {
+ if (library.type() == io::LibraryType::MatePairs) {
+ INFO("Mapping mate-pair library, selecting sensitive read mapper with k=" << cfg::get().sensitive_map.k);
+ return std::make_shared<SensitiveReadMapper<typename graph_pack::graph_t>>(gp.g, cfg::get().sensitive_map.k, gp.k_value);
+ }
+
+ size_t read_length = library.data().read_length;
+ if (read_length < gp.k_value && library.type() == io::LibraryType::PairedEnd) {
+ INFO("Read length = " << read_length << ", selecting short read mapper");
+ return std::make_shared<SensitiveReadMapper<typename graph_pack::graph_t>>(gp.g, read_length/ 3, gp.k_value);
+ }
+
+ INFO("Selecting usual mapper");
+ return MapperInstance(gp);
+}
+
+}
+
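
A standalone sketch (not part of the upstream patch) of the decision ChooseProperMapper makes above: mate-pair libraries always get the sensitive small-k mapper, paired-end libraries with reads shorter than the graph k fall back to it as well, and everything else uses the usual MapperInstance. The enum and function names below are illustrative only.

#include <cassert>
#include <cstddef>

enum class MapperKind { Sensitive, Usual };

MapperKind SelectMapper(bool mate_pairs, bool paired_end, std::size_t read_length, std::size_t k) {
    if (mate_pairs)
        return MapperKind::Sensitive;   // always use the small-k sensitive index
    if (paired_end && read_length < k)
        return MapperKind::Sensitive;   // reads shorter than k cannot seed (k+1)-mers
    return MapperKind::Usual;           // NewExtendedSequenceMapper via MapperInstance
}

int main() {
    assert(SelectMapper(true,  false, 100, 55) == MapperKind::Sensitive);
    assert(SelectMapper(false, true,   36, 55) == MapperKind::Sensitive);
    assert(SelectMapper(false, true,  100, 55) == MapperKind::Usual);
    return 0;
}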
diff --git a/src/modules/assembly_graph/graph_core/action_handlers.hpp b/src/modules/assembly_graph/graph_core/action_handlers.hpp
new file mode 100644
index 0000000..55d015d
--- /dev/null
+++ b/src/modules/assembly_graph/graph_core/action_handlers.hpp
@@ -0,0 +1,347 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#ifndef __OMNI_ACTION_HANDLERS_HPP__
+#define __OMNI_ACTION_HANDLERS_HPP__
+
+#include "dev_support/verify.hpp"
+#include "dev_support/logger/logger.hpp"
+
+#include <boost/noncopyable.hpp>
+#include <string>
+#include <vector>
+
+namespace omnigraph {
+
+using std::vector;
+
+/**
+* ActionHandler is the base listener class for graph events. Every structure or information storage
+* that has to stay synchronized with the graph should use it; to make a handler listen to graph
+* events, add it to the graph's listeners.
+* Normally the structure itself extends ActionHandler and overrides several handling methods: its
+* constructor adds it to the graph's handler list and its destructor removes it from that list.
+* Events are divided into two levels: low-level and high-level.
+* Low-level events are the addition/deletion of vertices/edges. They should be triggered only after
+* the high-level events, once all data has been transferred and the graph structure is consistent.
+* High-level events should be used to keep external data synchronized with the graph and internal data
+* consistent. The current high-level events are merge, glue and split; this list may be extended in the near future.
+*/
+template<typename VertexId, typename EdgeId>
+class ActionHandler : private boost::noncopyable {
+ const std::string handler_name_;
+private:
+ bool attached_;
+public:
+ /**
+ * Create an action handler with the given name. The name identifies what type of handler this is.
+ */
+ ActionHandler(const std::string &name)
+ : handler_name_(name), attached_(true) {
+ }
+
+ virtual ~ActionHandler() {
+ TRACE("~ActionHandler " << handler_name_);
+ }
+
+ /**
+ * Returns the name of this handler.
+ */
+ const std::string &name() const {
+ return handler_name_;
+ }
+
+ /**
+ * Low-level event triggered when a vertex is added to the graph.
+ * @param v new vertex
+ */
+ virtual void HandleAdd(VertexId /*v*/) { }
+
+ /**
+ * Low-level event triggered when an edge is added to the graph.
+ * @param e new edge
+ */
+ virtual void HandleAdd(EdgeId /*e*/) { }
+
+ /**
+ * Low-level event triggered when a vertex is deleted from the graph.
+ * @param v vertex to delete
+ */
+ virtual void HandleDelete(VertexId /*v*/) { }
+
+ /**
+ * Low-level event triggered when an edge is deleted from the graph.
+ * @param e edge to delete
+ */
+ virtual void HandleDelete(EdgeId /*e*/) { }
+
+ /**
+ * High-level event triggered when a merge operation is performed on the graph, i.e. when a path
+ * of edges whose inner vertices all have exactly one incoming and one outgoing edge is replaced
+ * with a single edge. Since this is a high-level operation, neither the creation event for the
+ * new edge nor the deletion events for the old edges have been triggered yet.
+ * @param old_edges path of edges to be replaced with a single edge
+ * @param new_edge new edge added as a replacement for the path
+ */
+ virtual void HandleMerge(const vector<EdgeId> & /*old_edges*/, EdgeId /*new_edge*/) { }
+
+ /**
+ * High-level event triggered when a glue operation is performed on the graph, i.e. when an edge
+ * is completely replaced with another edge. This operation is widely used in bulge removal, when
+ * an alternative path is glued to the main path. Since this is a high-level operation, the
+ * deletion event for the old edge has not been triggered yet.
+ * @param new_edge result of the glue
+ * @param edge1 edge to be glued to edge2
+ * @param edge2 edge that edge1 is glued onto
+ */
+ virtual void HandleGlue(EdgeId /*new_edge*/, EdgeId /*edge1*/, EdgeId /*edge2*/) { }
+
+ /**
+ * High-level event triggered when a split operation is performed on the graph, i.e. when an edge
+ * is split into several shorter edges. Split is the reverse of merge.
+ * Since this is a high-level operation, neither the deletion event for the old edge nor the
+ * creation events for the new edges have been triggered yet.
+ * @param old_edge edge to be split
+ * @param new_edges edges resulting from the split
+ */
+ virtual void HandleSplit(EdgeId /*old_edge*/, EdgeId /*new_edge_1*/,
+ EdgeId /*new_edge_2*/) { }
+
+ /**
+ * Every thread-safe descendant should override this method to enable correct concurrent graph processing.
+ */
+ virtual bool IsThreadSafe() const {
+ return false;
+ }
+
+ bool IsAttached() const {
+ return attached_;
+ }
+
+ void Attach() {
+ VERIFY(!attached_);
+ attached_ = true;
+ }
+
+ void Detach() {
+ VERIFY(attached_);
+ attached_ = false;
+ }
+};
+
+template<class Graph>
+class GraphActionHandler : public ActionHandler<typename Graph::VertexId,
+ typename Graph::EdgeId> {
+ typedef ActionHandler<typename Graph::VertexId, typename Graph::EdgeId> base;
+
+ const Graph &g_;
+
+protected:
+ const Graph &g() const {
+ return g_;
+ }
+
+public:
+ GraphActionHandler(const Graph &g, const std::string &name)
+ : base(name),
+ g_(g) {
+ TRACE("Adding new action handler: " << this->name());
+ g_.AddActionHandler(this);
+ }
+
+ GraphActionHandler(const GraphActionHandler<Graph> &other)
+ : base(other.name()),
+ g_(other.g_) {
+ TRACE("Adding new action handler: " << this->name());
+ g_.AddActionHandler(this);
+ }
+
+ virtual ~GraphActionHandler() {
+ TRACE("Removing action handler: " << this->name());
+ if (this->IsAttached())
+ this->Detach();
+ g_.RemoveActionHandler(this);
+ }
+};
+
+/**
+* HandlerApplier was introduced to support various types of graphs and to make the handler structure
+* more flexible. If a particular graph implementation requires a special handler-triggering scheme,
+* it can store an extension of HandlerApplier and trigger HandlerApplier methods instead
+* of the GraphHandler methods directly.
+* HandlerApplier contains one method per graph event, which defines exactly how that event
+* should be triggered.
+*/
+template<typename VertexId, typename EdgeId>
+class HandlerApplier {
+ typedef ActionHandler<VertexId, EdgeId> Handler;
+public:
+
+ virtual void
+ ApplyAdd(Handler &handler, VertexId v) const = 0;
+
+ virtual void
+ ApplyAdd(Handler &handler, EdgeId e) const = 0;
+
+ virtual void
+ ApplyDelete(Handler &handler, VertexId v) const = 0;
+
+ virtual void
+ ApplyDelete(Handler &handler, EdgeId e) const = 0;
+
+ virtual void ApplyMerge(Handler &handler, vector<EdgeId> old_edges,
+ EdgeId new_edge) const = 0;
+
+ virtual void ApplyGlue(Handler &handler, EdgeId new_edge, EdgeId edge1,
+ EdgeId edge2) const = 0;
+
+ virtual void ApplySplit(Handler &handler, EdgeId old_edge,
+ EdgeId new_edge_1, EdgeId new_edge2) const = 0;
+
+ virtual ~HandlerApplier() {
+ }
+};
+
+/**
+* SimpleHandlerApplier is a simple implementation of HandlerApplier that applies each event to the handler directly, with no special filtering.
+*/
+template<class Graph>
+class SimpleHandlerApplier : public HandlerApplier<typename Graph::VertexId,
+ typename Graph::EdgeId> {
+public:
+ typedef typename Graph::VertexId VertexId;
+ typedef typename Graph::EdgeId EdgeId;
+ typedef ActionHandler<VertexId, EdgeId> Handler;
+
+ virtual void ApplyAdd(Handler &handler, VertexId v) const {
+ handler.HandleAdd(v);
+ }
+
+ virtual void ApplyAdd(Handler &handler, EdgeId e) const {
+ handler.HandleAdd(e);
+ }
+
+ virtual void ApplyDelete(Handler &handler, VertexId v) const {
+ handler.HandleDelete(v);
+ }
+
+ virtual void ApplyDelete(Handler &handler, EdgeId e) const {
+ handler.HandleDelete(e);
+ }
+
+ virtual void ApplyMerge(Handler &handler, vector<EdgeId> old_edges,
+ EdgeId new_edge) const {
+ handler.HandleMerge(old_edges, new_edge);
+ }
+
+ virtual void ApplyGlue(Handler &handler, EdgeId new_edge, EdgeId edge1,
+ EdgeId edge2) const {
+ handler.HandleGlue(new_edge, edge1, edge2);
+ }
+
+ virtual void ApplySplit(Handler &handler, EdgeId old_edge, EdgeId new_edge1,
+ EdgeId new_edge2) const {
+ handler.HandleSplit(old_edge, new_edge1, new_edge2);
+ }
+
+};
+
+/**
+* PairedHandlerApplier is an implementation of HandlerApplier for graphs that keep actions on
+* vertices/edges synchronized with their reverse-complement counterparts. Although each applier
+* method is called only once, the event is triggered twice: once for the original parameters and
+* once for their reverse-complement parameters. Assertions guard against degenerate cases.
+*/
+template<class Graph>
+class PairedHandlerApplier : public HandlerApplier<typename Graph::VertexId,
+ typename Graph::EdgeId> {
+private:
+ Graph &graph_;
+public:
+ typedef typename Graph::VertexId VertexId;
+ typedef typename Graph::EdgeId EdgeId;
+ typedef ActionHandler<VertexId, EdgeId> Handler;
+
+ PairedHandlerApplier(Graph &graph)
+ : graph_(graph) {
+ }
+
+ virtual void ApplyAdd(Handler &handler, VertexId v) const {
+ VertexId rcv = graph_.conjugate(v);
+ handler.HandleAdd(v);
+ if (v != rcv) {
+ handler.HandleAdd(rcv);
+ }
+ }
+
+ virtual void ApplyAdd(Handler &handler, EdgeId e) const {
+ EdgeId rce = graph_.conjugate(e);
+ handler.HandleAdd(e);
+ if (e != rce) {
+ handler.HandleAdd(rce);
+ }
+ }
+
+ virtual void ApplyDelete(Handler &handler, VertexId v) const {
+ VertexId rcv = graph_.conjugate(v);
+ handler.HandleDelete(v);
+ if (v != rcv) {
+ handler.HandleDelete(rcv);
+ }
+ }
+
+ virtual void ApplyDelete(Handler &handler, EdgeId e) const {
+ EdgeId rce = graph_.conjugate(e);
+ handler.HandleDelete(e);
+ if (e != rce) {
+ handler.HandleDelete(rce);
+ }
+ }
+
+ virtual void ApplyMerge(Handler &handler, vector<EdgeId> old_edges,
+ EdgeId new_edge) const {
+ EdgeId rce = graph_.conjugate(new_edge);
+ handler.HandleMerge(old_edges, new_edge);
+ if (new_edge != rce) {
+ vector<EdgeId> rc_old_edges;
+ for (int i = (int) old_edges.size() - 1; i >= 0; i--) {
+ rc_old_edges.push_back(graph_.conjugate(old_edges[i]));
+ }
+ handler.HandleMerge(rc_old_edges, rce);
+ }
+ }
+
+ virtual void ApplyGlue(Handler &handler, EdgeId new_edge, EdgeId edge1,
+ EdgeId edge2) const {
+ EdgeId rc_edge1 = graph_.conjugate(edge1);
+ EdgeId rc_edge2 = graph_.conjugate(edge2);
+ VERIFY(edge1 != edge2);
+ VERIFY(edge2 != rc_edge2);
+ handler.HandleGlue(new_edge, edge1, edge2);
+ if (edge1 != rc_edge1) {
+ handler.HandleGlue(graph_.conjugate(new_edge), rc_edge1, rc_edge2);
+ }
+ }
+
+ virtual void ApplySplit(Handler &handler, EdgeId old_edge,
+ EdgeId new_edge_1, EdgeId new_edge2) const {
+ EdgeId rce = graph_.conjugate(old_edge);
+ //VERIFY(old_edge != rce);
+ handler.HandleSplit(old_edge, new_edge_1, new_edge2);
+ if (old_edge != rce) {
+ handler.HandleSplit(rce, graph_.conjugate(new_edge2),
+ graph_.conjugate(new_edge_1));
+ }
+ }
+
+private:
+ DECL_LOGGER("PairedHandlerApplier")
+};
+
+};
+
+#endif
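
A minimal sketch (not part of the upstream patch) of the subclassing pattern described above: a hypothetical handler that keeps an edge count in sync with the graph by overriding the low-level add/delete events. The class name and counting behaviour are illustrative assumptions; only the interface shown in this header is relied on.

#include "assembly_graph/graph_core/action_handlers.hpp"

// Hypothetical handler: registration happens in the GraphActionHandler
// constructor, detachment and deregistration in its destructor.
template<class Graph>
class EdgeCountHandler : public omnigraph::GraphActionHandler<Graph> {
    typedef omnigraph::GraphActionHandler<Graph> base;
    typedef typename Graph::EdgeId EdgeId;
    size_t count_;
public:
    EdgeCountHandler(const Graph& g) : base(g, "EdgeCountHandler"), count_(0) {}

    virtual void HandleAdd(EdgeId /*e*/) { ++count_; }     // low-level: edge created
    virtual void HandleDelete(EdgeId /*e*/) { --count_; }  // low-level: edge removed

    size_t count() const { return count_; }
};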
diff --git a/src/modules/assembly_graph/graph_core/basic_graph_stats.hpp b/src/modules/assembly_graph/graph_core/basic_graph_stats.hpp
new file mode 100644
index 0000000..52701ac
--- /dev/null
+++ b/src/modules/assembly_graph/graph_core/basic_graph_stats.hpp
@@ -0,0 +1,53 @@
+#pragma once
+
+#include "dev_support/standard_base.hpp"
+namespace omnigraph {
+
+template<class Graph>
+class AvgCovereageCounter {
+private:
+ const Graph &graph_;
+ const size_t min_length_;
+public:
+ AvgCovereageCounter(const Graph &graph, size_t min_length = 0) :
+ graph_(graph), min_length_(min_length) {
+ }
+
+ double Count() const {
+ double cov = 0;
+ size_t length = 0;
+ for (auto it = graph_.ConstEdgeBegin(); !it.IsEnd(); ++it) {
+ if (graph_.length(*it) >= min_length_) {
+ cov += graph_.coverage(*it) * (double) graph_.length(*it);
+ length += graph_.length(*it);
+ }
+ }
+ if (length == 0)
+ return 0.;
+ return cov / (double) length;
+ }
+};
+
+template<class Graph>
+size_t CumulativeLength(const Graph& g,
+ const std::vector<typename Graph::EdgeId>& path) {
+ size_t s = 0;
+ for (auto it = path.begin(); it != path.end(); ++it)
+ s += g.length(*it);
+
+ return s;
+}
+
+template<class Graph>
+double AvgCoverage(const Graph& g,
+ const std::vector<typename Graph::EdgeId>& path) {
+ double unnormalized_coverage = 0;
+ size_t path_length = 0;
+ for (auto edge : path) {
+ size_t length = g.length(edge);
+ path_length += length;
+ unnormalized_coverage += g.coverage(edge) * (double) length;
+ }
+ return unnormalized_coverage / (double) path_length;
+}
+}
\ No newline at end of file
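
A standalone sketch (not part of the upstream patch) of the length-weighted average computed by AvgCoverage above; the names below are illustrative. An edge of length 100 at coverage 10 and an edge of length 300 at coverage 30 average to (100*10 + 300*30) / 400 = 25, not (10 + 30) / 2 = 20.

#include <cassert>
#include <cstddef>
#include <utility>
#include <vector>

// Each pair is (edge length, edge coverage); mirrors AvgCoverage above.
double LengthWeightedAvgCoverage(const std::vector<std::pair<std::size_t, double>>& edges) {
    double unnormalized = 0;
    std::size_t total_length = 0;
    for (const auto& e : edges) {
        total_length += e.first;
        unnormalized += e.second * (double) e.first;
    }
    return unnormalized / (double) total_length;
}

int main() {
    assert(LengthWeightedAvgCoverage({{100, 10.0}, {300, 30.0}}) == 25.0);
    return 0;
}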
diff --git a/src/include/omni/construction_helper.hpp b/src/modules/assembly_graph/graph_core/construction_helper.hpp
similarity index 100%
rename from src/include/omni/construction_helper.hpp
rename to src/modules/assembly_graph/graph_core/construction_helper.hpp
diff --git a/src/modules/assembly_graph/graph_core/coverage.hpp b/src/modules/assembly_graph/graph_core/coverage.hpp
new file mode 100644
index 0000000..4f243eb
--- /dev/null
+++ b/src/modules/assembly_graph/graph_core/coverage.hpp
@@ -0,0 +1,343 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+/*
+ * coverage.hpp
+ *
+ * Created on: Jun 21, 2011
+ * Author: sergey
+ */
+
+#pragma once
+
+#include "dev_support/logger/logger.hpp"
+#include <iostream>
+#include <vector>
+#include <algorithm>
+#include "math/xmath.h"
+#include "action_handlers.hpp"
+namespace omnigraph {
+
+using std::vector;
+//todo save/load absolute coverage
+template<class Graph>
+class CoverageIndex : public GraphActionHandler<Graph> {
+ typedef typename Graph::VertexId VertexId;
+ typedef typename Graph::EdgeId EdgeId;
+ //typedef unordered_map<EdgeId, int> map_type;
+
+ Graph& g_;
+// map_type storage_;
+
+// size_t KPlusOneMerCoverage(EdgeId edge) const {
+// return (size_t) math::round(coverage(edge) * (double) this->g().length(edge));
+// }
+
+// template<class ReadThreader>
+// Path<EdgeId> ProcessSequence(const ReadThreader& threader,
+// const Sequence& sequence) const {
+// return threader.MapSequence(sequence);
+// }
+
+// void AddPathsToGraph(const Path<EdgeId>& path) {
+//
+// if (path.sequence().size() == 0)
+// return;
+//
+// const vector<EdgeId>& edges_list = path.sequence();
+//
+// for (auto it = edges_list.cbegin(); it != edges_list.cend(); ++it) {
+// IncCoverage(*it, this->g().length(*it));
+// }
+// IncCoverage(edges_list[0], -int(path.start_pos()));
+// EdgeId last = edges_list[edges_list.size() - 1];
+// IncCoverage(last, int(path.end_pos()) - int(this->g().length(last)));
+// }
+
+// void IncCoverageInMap(EdgeId edge, int toAdd, map_type& map) {
+// //VERIFY(toAdd >= 0);
+// map[edge] += toAdd;
+// VERIFY(map[edge] >= 0);
+// }
+//
+// void AddPathsToMap(const Path<EdgeId>& path, map_type& map) {
+//
+// if (path.sequence().size() == 0)
+// return;
+//
+// const vector<EdgeId>& edges_list = path.sequence();
+//
+// for (auto it = edges_list.cbegin(); it != edges_list.cend(); ++it) {
+// IncCoverageInMap(*it, this->g().length(*it), map);
+// }
+// IncCoverageInMap(edges_list[0], -int(path.start_pos()), map);
+// EdgeId last = edges_list[edges_list.size() - 1];
+// IncCoverageInMap(last,
+// int(path.end_pos()) - int(this->g().length(last)),
+// map);
+// }
+
+ public:
+ CoverageIndex(Graph &g)
+ : GraphActionHandler<Graph>(g, "CoverageIndex"), g_(g) {
+ }
+
+ virtual ~CoverageIndex() {
+ }
+
+ /**
+ * In NON averaged units
+ */
+ void SetRawCoverage(EdgeId e, unsigned cov) {
+ g_.data(e).set_raw_coverage(cov);
+ }
+
+ void IncRawCoverage(EdgeId e, unsigned count) {
+ g_.data(e).inc_raw_coverage((int)count);
+ }
+
+ void SetAvgCoverage(EdgeId e, double cov) {
+ g_.data(e).set_raw_coverage((int) math::round(cov * (double) this->g().length(e)));
+ }
+
+ /**
+ * Returns average coverage of the edge
+ */
+ double coverage(EdgeId edge) const {
+ return (double) RawCoverage(edge) / (double) this->g().length(edge);
+ }
+
+ unsigned RawCoverage(EdgeId edge) const {
+ return g_.data(edge).raw_coverage();
+ }
+// /**
+// * Returns average coverage of the edge
+// */
+// double operator[](EdgeId e) const {
+// return coverage(e);
+// }
+
+// /**
+// * Method increases coverage value
+// */
+// void IncCoverage(EdgeId edge, int to_add) {
+// edge->IncCoverage(to_add);
+// VERIFY(edge->GetRawCoverage() >= 0);
+// }
+//
+// /**
+// * Method increases coverage value by 1
+// */
+// void IncCoverage(EdgeId edge) {
+// IncCoverage(edge, 1);
+// }
+
+// template<class ReadThreader, class Read>
+// void Fill(io::IReader<Read>& stream, const ReadThreader& threader) {
+//
+// INFO("Processing reads (takes a while)");
+// size_t counter = 0;
+// stream.reset();
+//
+// while (!stream.eof()) {
+// Read r;
+// stream >> r;
+// Path<EdgeId> path = ProcessSequence(threader, r.sequence());
+// AddPathsToGraph(path);
+//
+// VERBOSE_POWER(++counter, " reads processed");
+// }
+//
+// INFO("DeBruijn graph coverage counted, reads used: " << counter);
+// }
+//
+// template<class ReadThreader, class Read>
+// void FillParallel(io::ReadStreamVector<io::IReader<Read> >& streams,
+// const ReadThreader& threader, size_t buffer_size) {
+//
+// INFO("Processing reads (takes a while)");
+// perf_counter pc;
+// size_t counter = 0;
+//
+// size_t nthreads = streams.size();
+// size_t buf_size = buffer_size
+// / (nthreads * (sizeof(Path<EdgeId> ) + 32));
+//
+//#pragma omp parallel num_threads(nthreads)
+// {
+//#pragma omp for reduction(+ : counter)
+// for (size_t i = 0; i < nthreads; ++i) {
+//
+// Read r;
+// io::IReader<Read>& stream = streams[i];
+// stream.reset();
+// std::vector<Path<EdgeId> > buffer(buf_size);
+//
+// size_t j = 0;
+// while (!stream.eof()) {
+// stream >> r;
+// ++counter;
+// buffer[j++] = ProcessSequence(threader, r.sequence());
+//
+// if (j == buf_size) {
+// j = 0;
+//
+//#pragma omp critical
+// {
+// for (size_t l = 0; l < buf_size; ++l) {
+// AddPathsToGraph(buffer[l]);
+// }
+// }
+// }
+// }
+//
+//#pragma omp critical
+// {
+// for (size_t l = 0; l < j; ++l) {
+// AddPathsToGraph(buffer[l]);
+// }
+// }
+// }
+//
+// }
+//
+// INFO("DeBruijn graph coverage counted, reads used: " << counter);
+//
+// INFO("Elapsed time: " << pc.time_ms());
+// }
+//
+// template<class ReadThreader, class Read>
+// void FillFastParallel(
+// io::ReadStreamVector<io::IReader<Read> >& streams,
+// const ReadThreader& threader) {
+//
+// INFO("Processing reads (takes a while)");
+// perf_counter pc;
+// size_t counter = 0;
+//
+// size_t nthreads = streams.size();
+////
+// std::vector<map_type*> maps(nthreads);
+//// maps[0] = &storage_;
+//
+// for (size_t i = 0; i < nthreads; ++i) {
+// maps[i] = new map_type();
+// }
+//
+//#pragma omp parallel num_threads(nthreads)
+// {
+//#pragma omp for reduction(+ : counter)
+// for (size_t i = 0; i < nthreads; ++i) {
+//
+// Read r;
+// io::IReader<Read>& stream = streams[i];
+// stream.reset();
+// Path<EdgeId> path;
+//
+// while (!stream.eof()) {
+// stream >> r;
+// ++counter;
+// path = ProcessSequence(threader, r.sequence());
+//
+// AddPathsToMap(path, *maps[i]);
+// }
+// }
+// }
+//
+// INFO("Merging maps");
+// for (size_t i = 0; i < nthreads; ++i) {
+// for (auto it = maps[i]->begin(); it != maps[i]->end(); ++it) {
+// it->first->IncCoverage(it->second);
+// }
+// delete maps[i];
+// }
+//
+// INFO("DeBruijn graph coverage counted, reads used: " << counter);
+//
+// INFO("Elapsed time: " << pc.time_ms());
+// }
+
+// template<class Index>
+// void FillFromIndex(Index& index) {
+// for (auto I = index.value_cbegin(), E = index.value_cend();
+// I != E; ++I) {
+// const auto& edge_info = *I;
+// VERIFY(edge_info.offset != -1u);
+// VERIFY(edge_info.edge_id.get() != NULL);
+// IncRawCoverage(edge_info.edge_id, edge_info.count);
+// }
+//
+// DEBUG("Coverage counted");
+// }
+
+ virtual void HandleDelete(EdgeId edge) {
+ SetRawCoverage(edge, 0);
+ }
+
+ virtual void HandleMerge(const vector<EdgeId>& old_edges, EdgeId new_edge) {
+ unsigned coverage = 0;
+ for (auto it = old_edges.begin(); it != old_edges.end(); ++it) {
+ coverage += RawCoverage(*it);
+ }
+ SetRawCoverage(new_edge, coverage);
+ }
+
+ virtual void HandleGlue(EdgeId new_edge, EdgeId edge1, EdgeId edge2) {
+ SetRawCoverage(new_edge, RawCoverage(edge1) + RawCoverage(edge2));
+ }
+
+ virtual void HandleSplit(EdgeId old_edge, EdgeId new_edge1, EdgeId new_edge2) {
+// size_t length1 = this->g().length(newEdge1);
+// size_t length = this->g().length(oldEdge);
+// size_t coverage = KPlusOneMerCoverage(oldEdge);
+// size_t coverage1 = coverage * length1 / length;
+// if (coverage1 == 0)
+// coverage1 = 1;
+// size_t coverage2 = coverage - coverage1;
+// if (coverage2 == 0)
+// coverage2 = 1;
+// SetCoverage(newEdge1, coverage1);
+// SetCoverage(newEdge2, coverage2);
+ double avg_cov = coverage(old_edge);
+ if (old_edge == g_.conjugate(old_edge)) {
+ int raw1 = std::max(1, (int) math::round(avg_cov * (double) this->g().length(new_edge1)));
+ SetRawCoverage(new_edge1, raw1);
+ SetRawCoverage(g_.conjugate(new_edge1), raw1);
+ SetRawCoverage(new_edge2, std::max(1, (int) math::round(avg_cov * (double) this->g().length(new_edge2))));
+ } else {
+ SetRawCoverage(new_edge1, std::max(1, (int) math::round(avg_cov * (double) this->g().length(new_edge1))));
+ SetRawCoverage(new_edge2, std::max(1, (int) math::round(avg_cov * (double) this->g().length(new_edge2))));
+ }
+ }
+
+ void Save(EdgeId e, std::ostream& out) const {
+ out << fmt::format("{:.6f}", coverage(e));
+ }
+
+ void Load(EdgeId e, std::istream& in) {
+ double cov;
+ in >> cov;
+ SetAvgCoverage(e, cov);
+ }
+
+ /*
+ * Is thread safe if different threads process different edges.
+ */
+ bool IsThreadSafe() const {
+ return true;
+ }
+};
+
+//todo discuss with Anton
+template<class Graph>
+class AbstractFlankingCoverage {
+public:
+ virtual double GetInCov(typename Graph::EdgeId edge) const = 0;
+ virtual double GetOutCov(typename Graph::EdgeId edge) const = 0;
+};
+
+}
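
A standalone sketch (not part of the upstream patch) of the redistribution HandleSplit performs above: each new edge receives round(average coverage of the old edge * its own length) as raw coverage, clamped to at least 1 so that split pieces never end up with zero coverage. std::round stands in for the project's math::round, and the function name is illustrative.

#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstddef>

int SplitRawCoverage(double avg_cov, std::size_t new_length) {
    // Raw coverage is the average coverage scaled back by the new length, never below 1.
    return std::max(1, (int) std::round(avg_cov * (double) new_length));
}

int main() {
    // An edge of average coverage 2.5 split into pieces of length 40 and 60:
    assert(SplitRawCoverage(2.5, 40) == 100);   // 2.5 * 40
    assert(SplitRawCoverage(2.5, 60) == 150);   // 2.5 * 60
    // Very low coverage never drops to zero after a split:
    assert(SplitRawCoverage(0.001, 10) == 1);
    return 0;
}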
diff --git a/src/modules/assembly_graph/graph_core/debruijn_data.hpp b/src/modules/assembly_graph/graph_core/debruijn_data.hpp
new file mode 100644
index 0000000..c775165
--- /dev/null
+++ b/src/modules/assembly_graph/graph_core/debruijn_data.hpp
@@ -0,0 +1,170 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include <vector>
+#include <set>
+#include <cstring>
+#include "dev_support/verify.hpp"
+#include "dev_support/logger/logger.hpp"
+#include "data_structures/sequence/sequence_tools.hpp"
+#include "dev_support/standard_base.hpp"
+
+namespace debruijn_graph {
+class DeBruijnMaster;
+
+class DeBruijnVertexData {
+ friend class DeBruijnMaster;
+public:
+ DeBruijnVertexData() {
+
+ }
+};
+
+class CoverageData {
+ private:
+ unsigned coverage_;
+
+ public:
+ CoverageData()
+ : coverage_(0) {
+ }
+
+ void inc_coverage(int value) {
+ VERIFY(value >= 0 || coverage_ > unsigned(-value));
+ coverage_ += value;
+ }
+
+ void set_coverage(unsigned coverage) {
+ coverage_ = coverage;
+ }
+
+ //not length normalized
+ unsigned coverage() const {
+ return coverage_;
+ }
+};
+
+class DeBruijnEdgeData {
+ friend class DeBruijnMaster;
+ CoverageData coverage_;
+ CoverageData flanking_cov_;
+ Sequence nucls_;
+public:
+
+ DeBruijnEdgeData(const Sequence &nucls) :
+ nucls_(nucls) {
+ }
+
+ const Sequence& nucls() const {
+ return nucls_;
+ }
+
+ void inc_raw_coverage(int value) {
+ coverage_.inc_coverage(value);
+ }
+
+ void set_raw_coverage(unsigned coverage) {
+ coverage_.set_coverage(coverage);
+ }
+
+ unsigned raw_coverage() const {
+ return coverage_.coverage();
+ }
+
+ void inc_flanking_coverage(int value) {
+ flanking_cov_.inc_coverage(value);
+ }
+
+ void set_flanking_coverage(unsigned flanking_coverage) {
+ flanking_cov_.set_coverage(flanking_coverage);
+ }
+
+ //not length normalized
+ unsigned flanking_coverage() const {
+ return flanking_cov_.coverage();
+ }
+
+ size_t size() const {
+ return nucls_.size();
+ }
+};
+
+class DeBruijnDataMaster {
+private:
+ const size_t k_;
+
+public:
+ typedef DeBruijnVertexData VertexData;
+ typedef DeBruijnEdgeData EdgeData;
+
+ DeBruijnDataMaster(size_t k) :
+ k_(k) {
+ }
+
+ const EdgeData MergeData(const std::vector<const EdgeData*>& to_merge, bool safe_merging = true) const;
+
+ std::pair<VertexData, std::pair<EdgeData, EdgeData>> SplitData(const EdgeData& edge, size_t position, bool is_self_conj = false) const;
+
+ EdgeData GlueData(const EdgeData&, const EdgeData& data2) const;
+
+ bool isSelfConjugate(const EdgeData &data) const {
+ return data.nucls() == !(data.nucls());
+ }
+
+ EdgeData conjugate(const EdgeData &data) const {
+ return EdgeData(!(data.nucls()));
+ }
+
+ VertexData conjugate(const VertexData & /*data*/) const {
+ return VertexData();
+ }
+
+ size_t length(const EdgeData& data) const {
+ return data.nucls().size() - k_;
+ }
+
+ size_t length(const VertexData& ) const {
+ return k_;
+ }
+
+ size_t k() const {
+ return k_;
+ }
+
+};
+
+//typedef DeBruijnVertexData VertexData;
+//typedef DeBruijnEdgeData EdgeData;
+//typedef DeBruijnDataMaster DataMaster;
+
+inline const DeBruijnEdgeData DeBruijnDataMaster::MergeData(const std::vector<const DeBruijnEdgeData*>& to_merge, bool safe_merging) const {
+ std::vector<Sequence> ss;
+ ss.reserve(to_merge.size());
+ for (auto it = to_merge.begin(); it != to_merge.end(); ++it) {
+ ss.push_back((*it)->nucls());
+ }
+ return EdgeData(MergeOverlappingSequences(ss, k_, safe_merging));
+}
+
+inline std::pair<DeBruijnVertexData, std::pair<DeBruijnEdgeData, DeBruijnEdgeData>> DeBruijnDataMaster::SplitData(const EdgeData& edge,
+ size_t position,
+ bool is_self_conj) const {
+ const Sequence& nucls = edge.nucls();
+ size_t end = nucls.size();
+ if (is_self_conj) {
+ VERIFY(position < end);
+ end -= position;
+ }
+ return std::make_pair(VertexData(), std::make_pair(EdgeData(edge.nucls().Subseq(0, position + k_)), EdgeData(nucls.Subseq(position, end))));
+}
+
+inline DeBruijnEdgeData DeBruijnDataMaster::GlueData(const DeBruijnEdgeData&, const DeBruijnEdgeData& data2) const {
+ return data2;
+}
+
+}
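
A standalone sketch (not part of the upstream patch) of the index arithmetic in DeBruijnDataMaster::SplitData above, with std::string standing in for Sequence: splitting at position p keeps positions [0, p + k) in the first piece and [p, end) in the second, so the two pieces share exactly the k nucleotides that label the newly created vertex (the self-conjugate end adjustment is omitted here). Names are illustrative.

#include <cassert>
#include <cstddef>
#include <string>
#include <utility>

std::pair<std::string, std::string> SplitNucls(const std::string& nucls, std::size_t p, std::size_t k) {
    // Prefix covers [0, p + k), suffix covers [p, end); they overlap in k characters.
    return {nucls.substr(0, p + k), nucls.substr(p)};
}

int main() {
    const std::size_t k = 3;
    std::string edge = "ACGTACGT";          // an edge of nucleotide length 8
    auto parts = SplitNucls(edge, 2, k);
    assert(parts.first == "ACGTA");          // positions [0, 2 + k)
    assert(parts.second == "GTACGT");        // positions [2, 8)
    assert(parts.first.substr(2) == parts.second.substr(0, k));  // shared k-mer "GTA"
    return 0;
}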
diff --git a/src/modules/assembly_graph/graph_core/directions.hpp b/src/modules/assembly_graph/graph_core/directions.hpp
new file mode 100644
index 0000000..16a7849
--- /dev/null
+++ b/src/modules/assembly_graph/graph_core/directions.hpp
@@ -0,0 +1,132 @@
+#pragma once
+
+namespace omnigraph {
+template<class Graph>
+class AbstractDirection {
+private:
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+
+ const Graph &graph_;
+
+protected:
+ const Graph &graph() const {
+ return graph_;
+ }
+
+public:
+ AbstractDirection(const Graph &graph)
+ : graph_(graph) { }
+
+ virtual ~AbstractDirection() { }
+
+ virtual const std::vector <EdgeId> OutgoingEdges(VertexId v) const = 0;
+
+ virtual const std::vector <EdgeId> IncomingEdges(VertexId v) const = 0;
+
+ virtual size_t OutgoingEdgeCount(VertexId v) const = 0;
+
+ virtual size_t IncomingEdgeCount(VertexId v) const = 0;
+
+ virtual VertexId EdgeStart(EdgeId edge) const = 0;
+
+ virtual VertexId EdgeEnd(EdgeId edge) const = 0;
+
+ bool CheckUniqueOutgoingEdge(VertexId v) const {
+ return OutgoingEdgeCount(v) == 1;
+ }
+
+ EdgeId GetUniqueOutgoingEdge(VertexId v) const {
+ return OutgoingEdges(v)[0];
+ }
+
+ bool CheckUniqueIncomingEdge(VertexId v) const {
+ return IncomingEdgeCount(v) == 1;
+ }
+
+ EdgeId GetUniqueIncomingEdge(VertexId v) const {
+ return IncomingEdges(v)[0];
+ }
+
+ virtual bool IsForward() const = 0;
+};
+
+template<class Graph>
+class ForwardDirection : public AbstractDirection<Graph> {
+private:
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+public:
+ ForwardDirection(const Graph &graph)
+ : AbstractDirection<Graph>(graph) {
+ }
+
+ virtual const std::vector <EdgeId> OutgoingEdges(VertexId v) const {
+ return std::vector<EdgeId>(this->graph().out_begin(v), this->graph().out_end(v));
+ }
+
+ virtual const std::vector <EdgeId> IncomingEdges(VertexId v) const {
+ return std::vector<EdgeId>(this->graph().in_begin(v), this->graph().in_end(v));
+ }
+
+ virtual size_t OutgoingEdgeCount(VertexId v) const {
+ return this->graph().OutgoingEdgeCount(v);
+ }
+
+ virtual size_t IncomingEdgeCount(VertexId v) const {
+ return this->graph().IncomingEdgeCount(v);
+ }
+
+ virtual VertexId EdgeStart(EdgeId edge) const {
+ return this->graph().EdgeStart(edge);
+ }
+
+ virtual VertexId EdgeEnd(EdgeId edge) const {
+ return this->graph().EdgeEnd(edge);
+ }
+
+ bool IsForward() const {
+ return true;
+ }
+};
+
+template<class Graph>
+class BackwardDirection : public AbstractDirection<Graph> {
+private:
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+public:
+ BackwardDirection(const Graph &graph)
+ : AbstractDirection<Graph>(graph) {
+ }
+
+ virtual const std::vector <EdgeId> OutgoingEdges(VertexId v) const {
+ return std::vector<EdgeId>(this->graph().in_begin(v), this->graph().in_end(v));
+ }
+
+ virtual const std::vector <EdgeId> IncomingEdges(VertexId v) const {
+ return std::vector<EdgeId>(this->graph().out_begin(v), this->graph().out_end(v));
+ }
+
+ virtual size_t OutgoingEdgeCount(VertexId v) const {
+ return this->graph().IncomingEdgeCount(v);
+ }
+
+ virtual size_t IncomingEdgeCount(VertexId v) const {
+ return this->graph().OutgoingEdgeCount(v);
+ }
+
+ virtual VertexId EdgeStart(EdgeId edge) const {
+ return this->graph().EdgeEnd(edge);
+ }
+
+ virtual VertexId EdgeEnd(EdgeId edge) const {
+ return this->graph().EdgeStart(edge);
+ }
+
+ bool IsForward() const {
+ return false;
+ }
+
+};
+}
\ No newline at end of file
diff --git a/src/modules/assembly_graph/graph_core/graph.hpp b/src/modules/assembly_graph/graph_core/graph.hpp
new file mode 100644
index 0000000..65268fa
--- /dev/null
+++ b/src/modules/assembly_graph/graph_core/graph.hpp
@@ -0,0 +1,110 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "observable_graph.hpp"
+#include "coverage.hpp"
+#include "debruijn_data.hpp"
+
+namespace debruijn_graph {
+
+using omnigraph::CoverageIndex;
+class DeBruijnGraph: public omnigraph::ObservableGraph<DeBruijnDataMaster> {
+public:
+ typedef omnigraph::ObservableGraph<DeBruijnDataMaster> base;
+ typedef base::DataMasterT DataMasterT;
+ typedef base::VertexData VertexData;
+ typedef base::EdgeData EdgeData;
+ typedef base::EdgeId EdgeId;
+ typedef base::VertexId VertexId;
+ typedef base::VertexIt VertexIt;
+ typedef VertexIt VertexIterator;
+ typedef VertexIterator iterator; // for for_each
+ typedef const VertexIterator const_iterator; // for for_each
+private:
+ CoverageIndex<DeBruijnGraph> coverage_index_;
+
+public:
+ DeBruijnGraph(size_t k) :
+ base(k), coverage_index_(*this) {
+ }
+
+ CoverageIndex<DeBruijnGraph>& coverage_index() {
+ return coverage_index_;
+ }
+
+ const CoverageIndex<DeBruijnGraph>& coverage_index() const {
+ return coverage_index_;
+ }
+
+ /**
+ * Method returns average coverage of the edge
+ */
+ double coverage(EdgeId edge) const {
+ return coverage_index_.coverage(edge);
+ }
+
+ using base::AddVertex;
+ using base::AddEdge;
+
+ VertexId AddVertex() {
+ return AddVertex(VertexData());
+ }
+
+ EdgeId AddEdge(VertexId from, VertexId to, const Sequence &nucls) {
+ VERIFY(nucls.size() > k());
+ return AddEdge(from, to, EdgeData(nucls));
+ }
+
+ size_t k() const {
+ return master().k();
+ }
+
+ /**
+ * Method returns Sequence stored in the edge
+ */
+ const Sequence& EdgeNucls(EdgeId edge) const {
+ return this->data(edge).nucls();
+ }
+
+ const Sequence VertexNucls(VertexId v) const {
+ //todo add verify on vertex nucls consistency
+ if (this->OutgoingEdgeCount(v) > 0) {
+ return EdgeNucls(*(this->out_begin(v))).Subseq(0, k());
+ } else if (this->IncomingEdgeCount(v) > 0) {
+ EdgeId inc = *(this->in_begin(v));
+ size_t length = EdgeNucls(inc).size();
+ return EdgeNucls(inc).Subseq(length - k(), length);
+ }
+ VERIFY(false);
+ return Sequence();
+ }
+
+ Sequence PathNucls(const vector<EdgeId> &path) const {
+ if(path.empty())
+ return Sequence("");
+ SequenceBuilder result;
+ result.append(Sequence(""));
+ result.append(this->EdgeNucls(path[0]).Subseq(0, this->k()));
+ for (size_t i = 0; i < path.size(); ++i) {
+ result.append(this->EdgeNucls(path[i]).Subseq(this->k()));
+ }
+
+ return result.BuildSequence();
+ }
+
+private:
+ DECL_LOGGER("DeBruijnGraph")
+};
+
+typedef DeBruijnGraph ConjugateDeBruijnGraph;
+
+typedef ConjugateDeBruijnGraph Graph;
+typedef Graph::EdgeId EdgeId;
+typedef Graph::VertexId VertexId;
+}
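
A standalone sketch (not part of the upstream patch) of the reconstruction done by DeBruijnGraph::PathNucls above, with std::string standing in for Sequence: the k-long prefix of the first edge is written once, and each edge then contributes only the part beyond the shared k-mer overlap. The function name is illustrative.

#include <cassert>
#include <cstddef>
#include <string>
#include <vector>

std::string PathNucls(const std::vector<std::string>& path, std::size_t k) {
    if (path.empty()) return "";
    std::string result = path.front().substr(0, k);  // k-prefix of the first edge, emitted once
    for (const auto& e : path)
        result += e.substr(k);                       // suffix beyond the k-mer overlap
    return result;
}

int main() {
    const std::size_t k = 3;
    // Two consecutive edges overlapping in the 3-mer "TAC":
    assert(PathNucls({"ACGTAC", "TACGGA"}, k) == "ACGTACGGA");
    return 0;
}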
diff --git a/src/modules/assembly_graph/graph_core/graph_core.hpp b/src/modules/assembly_graph/graph_core/graph_core.hpp
new file mode 100644
index 0000000..d45efb4
--- /dev/null
+++ b/src/modules/assembly_graph/graph_core/graph_core.hpp
@@ -0,0 +1,620 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include <vector>
+#include <set>
+#include "dev_support/verify.hpp"
+#include "dev_support/logger/logger.hpp"
+#include "order_and_law.hpp"
+#include <boost/iterator/iterator_facade.hpp>
+#include "dev_support/simple_tools.hpp"
+
+namespace omnigraph {
+
+using std::vector;
+template<class DataMaster>
+class GraphCore;
+
+template<class DataMaster>
+class ConstructionHelper;
+
+template<class T>
+class PairedElementManipulationHelper;
+
+template<class DataMaster>
+class PairedVertex;
+
+template<class DataMaster>
+class PairedEdge;
+
+template<class DataMaster>
+class PairedEdge {
+ private:
+ typedef typename DataMaster::EdgeData EdgeData;
+ typedef restricted::pure_pointer<PairedEdge<DataMaster>> EdgeId;
+ typedef restricted::pure_pointer<PairedVertex<DataMaster>> VertexId;
+ friend class GraphCore<DataMaster>;
+ friend class ConstructionHelper<DataMaster>;
+ friend class PairedElementManipulationHelper<EdgeId>;
+ //todo unfriend
+ friend class PairedVertex<DataMaster>;
+ VertexId end_;
+ EdgeData data_;
+ EdgeId conjugate_;
+
+ PairedEdge(VertexId end, const EdgeData &data)
+ : end_(end),
+ data_(data) {
+ }
+
+ EdgeData &data() {
+ return data_;
+ }
+
+ void set_data(const EdgeData &data) {
+ data_ = data;
+ }
+
+ VertexId end() const {
+ return end_;
+ }
+
+ VertexId start() const {
+ return conjugate_->end()->conjugate();
+ }
+
+ void set_conjugate(EdgeId conjugate) {
+ conjugate_ = conjugate;
+ }
+
+ void SetEndVertex(VertexId end) {
+ end_ = end;
+ }
+
+public:
+ EdgeId conjugate() const {
+ return conjugate_;
+ }
+
+ size_t length(size_t k) const {
+ return data_.size() - k;
+ }
+};
+
+template<class DataMaster>
+class PairedVertex {
+private:
+ typedef typename DataMaster::VertexData VertexData;
+ typedef restricted::pure_pointer<PairedEdge<DataMaster>> EdgeId;
+ typedef restricted::pure_pointer<PairedVertex<DataMaster>> VertexId;
+ typedef typename std::vector<EdgeId>::const_iterator edge_raw_iterator;
+
+ class conjugate_iterator : public boost::iterator_facade<conjugate_iterator,
+ EdgeId, boost::forward_traversal_tag, EdgeId> {
+ public:
+ explicit conjugate_iterator(edge_raw_iterator it,
+ bool conjugate = false)
+ : it_(it),
+ conjugate_(conjugate) {
+ }
+
+ //todo do we need it?
+ conjugate_iterator()
+ : conjugate_(false) {
+ }
+
+ private:
+ friend class boost::iterator_core_access;
+
+ void increment() {
+ it_++;
+ }
+
+ bool equal(const conjugate_iterator &other) const {
+ return other.it_ == it_ && other.conjugate_ == conjugate_;
+ }
+
+ EdgeId dereference() const {
+ return (conjugate_ ? (*it_)->conjugate() : *it_);
+ }
+
+ edge_raw_iterator it_;
+ bool conjugate_;
+ };
+
+public:
+ typedef conjugate_iterator edge_const_iterator;
+
+private:
+ friend class GraphCore<DataMaster>;
+ friend class ConstructionHelper<DataMaster>;
+ friend class PairedEdge<DataMaster>;
+ friend class PairedElementManipulationHelper<VertexId>;
+ friend class conjugate_iterator;
+
+ std::vector<EdgeId> outgoing_edges_;
+
+ VertexId conjugate_;
+
+ VertexData data_;
+
+ bool IsMinimal() const {
+ return conjugate_->conjugate_ <= conjugate_;
+ }
+
+ VertexId conjugate() const {
+ return conjugate_;
+ }
+
+ void set_conjugate(VertexId conjugate) {
+ conjugate_ = conjugate;
+ }
+
+ size_t OutgoingEdgeCount() const {
+ return outgoing_edges_.size();
+ }
+
+ edge_const_iterator out_begin() const {
+ return edge_const_iterator(outgoing_edges_.cbegin(), false);
+ }
+
+ edge_const_iterator out_end() const {
+ return edge_const_iterator(outgoing_edges_.cend(), false);
+ }
+
+ size_t IncomingEdgeCount() const {
+ return conjugate_->OutgoingEdgeCount();
+ }
+
+ size_t IncomingEdgesCount() const {
+ return conjugate_->OutgoingEdgeCount();
+ }
+
+ edge_const_iterator in_begin() const {
+ return edge_const_iterator(conjugate_->outgoing_edges_.cbegin(), true);
+ }
+
+ edge_const_iterator in_end() const {
+ return edge_const_iterator(conjugate_->outgoing_edges_.cend(), true);
+ }
+
+ PairedVertex(VertexData data)
+ : data_(data) {
+ }
+
+ VertexData &data() {
+ return data_;
+ }
+
+ void set_data(VertexData data) {
+ data_ = data;
+ }
+
+ const std::vector<EdgeId> OutgoingEdgesTo(VertexId v) const {
+ vector<EdgeId> result;
+ for (auto it = outgoing_edges_.begin(); it != outgoing_edges_.end(); ++it) {
+ if ((*it)->end() == v) {
+ result.push_back(*it);
+ }
+ }
+ return result;
+ }
+
+ void AddOutgoingEdge(EdgeId e) {
+ outgoing_edges_.insert(std::upper_bound(outgoing_edges_.begin(), outgoing_edges_.end(), e), e);
+ //outgoing_edges_.push_back(e);
+ }
+
+ bool RemoveOutgoingEdge(const EdgeId e) {
+ auto it = std::find(outgoing_edges_.begin(), outgoing_edges_.end(), e);
+ if (it == outgoing_edges_.end())
+ return false;
+
+ outgoing_edges_.erase(it);
+ return true;
+ }
+
+ ~PairedVertex() {
+ VERIFY(outgoing_edges_.size() == 0);
+ }
+};
+
+template<class DataMaster>
+class GraphCore: private boost::noncopyable {
+public:
+ typedef DataMaster DataMasterT;
+ typedef typename DataMasterT::VertexData VertexData;
+ typedef typename DataMasterT::EdgeData EdgeData;
+ typedef restricted::pure_pointer<PairedEdge<DataMaster>> EdgeId;
+ typedef restricted::pure_pointer<PairedVertex<DataMaster>> VertexId;
+ typedef typename std::set<VertexId>::const_iterator VertexIt;
+ typedef typename PairedVertex<DataMaster>::edge_const_iterator edge_const_iterator;
+
+private:
+ restricted::LocalIdDistributor id_distributor_;
+ DataMaster master_;
+ std::set<VertexId> vertices_;
+
+ friend class ConstructionHelper<DataMaster>;
+public:
+ VertexIt begin() const {
+ return vertices_.begin();
+ }
+
+ VertexIt end() const {
+ return vertices_.end();
+ }
+
+ const std::set<VertexId>& vertices() const {
+ return vertices_;
+ }
+
+ size_t size() const {
+ return vertices_.size();
+ }
+
+ edge_const_iterator out_begin(VertexId v) const {
+ return v->out_begin();
+ }
+
+ edge_const_iterator out_end(VertexId v) const {
+ return v->out_end();
+ }
+
+ edge_const_iterator in_begin(VertexId v) const {
+ return v->in_begin();
+ }
+
+ edge_const_iterator in_end(VertexId v) const {
+ return v->in_end();
+ }
+
+private:
+ void DeleteVertexFromGraph(VertexId vertex) {
+ this->vertices_.erase(vertex);
+ this->vertices_.erase(conjugate(vertex));
+ }
+
+ void DestroyVertex(VertexId vertex) {
+ VertexId conjugate = vertex->conjugate();
+ delete vertex.get();
+ delete conjugate.get();
+ }
+
+ bool AdditionalCompressCondition(VertexId v) const {
+ return !(EdgeEnd(GetUniqueOutgoingEdge(v)) == conjugate(v) && EdgeStart(GetUniqueIncomingEdge(v)) == conjugate(v));
+ }
+
+protected:
+
+ VertexId CreateVertex(const VertexData& data1, const VertexData& data2, restricted::IdDistributor& id_distributor) {
+ VertexId vertex1(new PairedVertex<DataMaster>(data1), id_distributor);
+ VertexId vertex2(new PairedVertex<DataMaster>(data2), id_distributor);
+ vertex1->set_conjugate(vertex2);
+ vertex2->set_conjugate(vertex1);
+ return vertex1;
+ }
+
+ VertexId CreateVertex(const VertexData &data, restricted::IdDistributor &id_distributor) {
+ return CreateVertex(data, master_.conjugate(data), id_distributor);
+ }
+
+ VertexId CreateVertex(const VertexData &data) {
+ return CreateVertex(data, id_distributor_);
+ }
+
+ void AddVertexToGraph(VertexId vertex) {
+ vertices_.insert(vertex);
+ vertices_.insert(conjugate(vertex));
+ }
+
+ VertexId HiddenAddVertex(const VertexData& data, restricted::IdDistributor& id_distributor) {
+ VertexId vertex = CreateVertex(data, id_distributor);
+ AddVertexToGraph(vertex);
+ return vertex;
+ }
+
+ VertexId HiddenAddVertex(const VertexData& data) {
+ return HiddenAddVertex(data, id_distributor_);
+ }
+
+ void HiddenDeleteVertex(VertexId vertex) {
+ DeleteVertexFromGraph(vertex);
+ DestroyVertex(vertex);
+ }
+
+ /////////////////////////low-level ops (move to helper?!)
+
+ ////what with this method?
+ EdgeId AddSingleEdge(VertexId v1, VertexId v2, const EdgeData &data,
+ restricted::IdDistributor &idDistributor) {
+ EdgeId newEdge(new PairedEdge<DataMaster>(v2, data), idDistributor);
+ if (v1 != VertexId(0))
+ v1->AddOutgoingEdge(newEdge);
+ return newEdge;
+ }
+
+ EdgeId HiddenAddEdge(const EdgeData& data, restricted::IdDistributor& id_distributor) {
+ EdgeId result = AddSingleEdge(VertexId(0), VertexId(0), data, id_distributor);
+ if (this->master().isSelfConjugate(data)) {
+ result->set_conjugate(result);
+ return result;
+ }
+ EdgeId rcEdge = AddSingleEdge(VertexId(0), VertexId(0), this->master().conjugate(data), id_distributor);
+ result->set_conjugate(rcEdge);
+ rcEdge->set_conjugate(result);
+ return result;
+ }
+
+ EdgeId HiddenAddEdge(const EdgeData &data) {
+ return HiddenAddEdge(data, id_distributor_);
+ }
+
+ EdgeId HiddenAddEdge(VertexId v1, VertexId v2, const EdgeData& data, restricted::IdDistributor& id_distributor) {
+ // todo was suppressed for concurrent execution reasons (see concurrent_graph_component.hpp)
+ // VERIFY(this->vertices_.find(v1) != this->vertices_.end() && this->vertices_.find(v2) != this->vertices_.end());
+ EdgeId result = AddSingleEdge(v1, v2, data, id_distributor);
+ if (this->master().isSelfConjugate(data) && (v1 == conjugate(v2))) {
+ // todo why was it removed???
+ // Because of some split issues: when a self-conjugate edge is split, armageddon happens
+ // VERIFY(v1 == conjugate(v2));
+ result->set_conjugate(result);
+ return result;
+ }
+ EdgeId rcEdge = AddSingleEdge(v2->conjugate(), v1->conjugate(), this->master().conjugate(data), id_distributor);
+ result->set_conjugate(rcEdge);
+ rcEdge->set_conjugate(result);
+ return result;
+ }
+
+ EdgeId HiddenAddEdge(VertexId v1, VertexId v2, const EdgeData &data) {
+ return HiddenAddEdge(v1, v2, data, id_distributor_);
+ }
+
+ void HiddenDeleteEdge(EdgeId edge) {
+ DEBUG("Hidden delete edge " << edge.int_id());
+ EdgeId rcEdge = conjugate(edge);
+ VertexId rcStart = conjugate(edge->end());
+ VertexId start = conjugate(rcEdge->end());
+ start->RemoveOutgoingEdge(edge);
+ rcStart->RemoveOutgoingEdge(rcEdge);
+ if (edge != rcEdge) {
+ delete rcEdge.get();
+ }
+ delete edge.get();
+ }
+
+ void HiddenDeletePath(const std::vector<EdgeId>& edgesToDelete, const std::vector<VertexId>& verticesToDelete) {
+ for (auto it = edgesToDelete.begin(); it != edgesToDelete.end(); ++it)
+ HiddenDeleteEdge(*it);
+ for (auto it = verticesToDelete.begin(); it != verticesToDelete.end(); ++it)
+ HiddenDeleteVertex(*it);
+ }
+
+public:
+
+ GraphCore(const DataMaster& master) : master_(master) {
+ }
+
+ virtual ~GraphCore() {
+ VERIFY(size() == 0);
+ }
+
+ class IteratorContainer {
+ public:
+ typedef edge_const_iterator const_iterator;
+ private:
+ const_iterator begin_;
+ const_iterator end_;
+ public:
+ IteratorContainer(const_iterator begin, const_iterator end) :
+ begin_(begin), end_(end) {
+
+ }
+
+ const_iterator begin() const {
+ return begin_;
+ }
+
+ const_iterator end() const {
+ return end_;
+ }
+ };
+
+ restricted::LocalIdDistributor &GetGraphIdDistributor() {
+ return id_distributor_;
+ }
+
+ const restricted::LocalIdDistributor &GetGraphIdDistributor() const {
+ return id_distributor_;
+ }
+
+ size_t int_id(EdgeId edge) const {
+ return edge.int_id();
+ }
+
+ size_t int_id(VertexId vertex) const {
+ return vertex.int_id();
+ }
+
+ const DataMaster& master() const {
+ return master_;
+ }
+
+ const EdgeData& data(EdgeId edge) const {
+ return edge->data();
+ }
+
+ const VertexData& data(VertexId v) const {
+ return v->data();
+ }
+
+ EdgeData& data(EdgeId edge) {
+ return edge->data();
+ }
+
+ VertexData& data(VertexId v) {
+ return v->data();
+ }
+
+ size_t OutgoingEdgeCount(VertexId v) const {
+ return v->OutgoingEdgeCount();
+ }
+
+ IteratorContainer OutgoingEdges(VertexId v) const {
+ //INFO("Outgoing");
+ return IteratorContainer(out_begin(v), out_end(v));
+ }
+
+ size_t IncomingEdgeCount(VertexId v) const {
+ return v->IncomingEdgeCount();
+ }
+
+ IteratorContainer IncomingEdges(VertexId v) const {
+ return IteratorContainer(in_begin(v), in_end(v));
+ }
+
+ std::vector<EdgeId> GetEdgesBetween(VertexId v, VertexId u) const {
+ return v->OutgoingEdgesTo(u);
+ }
+
+ bool RelatedVertices(VertexId v1, VertexId v2) const {
+ return v1 == v2 || v1 == conjugate(v2);
+ }
+
+ ////////////////////////edge information
+ VertexId EdgeStart(EdgeId edge) const {
+ return edge->start();
+ }
+
+ VertexId EdgeEnd(EdgeId edge) const {
+ //INFO("Edge end");
+ return edge->end();
+ }
+
+ VertexId conjugate(VertexId v) const {
+ return v->conjugate();
+ }
+
+ EdgeId conjugate(EdgeId edge) const {
+ return edge->conjugate();
+ }
+
+ size_t length(const EdgeId edge) const {
+ return master_.length(data(edge));
+ }
+
+ size_t length(const VertexId v) const {
+ return master_.length(data(v));
+ }
+
+ //////////////////////shortcut methods
+
+ std::vector<EdgeId> IncidentEdges(VertexId v) const {
+ vector<EdgeId> answer;
+ push_back_all(answer, IncomingEdges(v));
+ push_back_all(answer, OutgoingEdges(v));
+ return answer;
+ }
+
+ EdgeId GetUniqueOutgoingEdge(VertexId v) const {
+ VERIFY(CheckUniqueOutgoingEdge(v));
+ return *out_begin(v);
+ }
+
+ bool CheckUniqueIncomingEdge(VertexId v) const {
+ return IncomingEdgeCount(v) == 1;
+ }
+
+ EdgeId GetUniqueIncomingEdge(VertexId v) const {
+ VERIFY(CheckUniqueIncomingEdge(v));
+ return *in_begin(v);
+ }
+
+ bool CheckUniqueOutgoingEdge(VertexId v) const {
+ return OutgoingEdgeCount(v) == 1;
+ }
+
+ bool IsDeadEnd(VertexId v) const {
+ return OutgoingEdgeCount(v) == 0;
+ }
+
+ bool IsDeadStart(VertexId v) const {
+ return IncomingEdgeCount(v) == 0;
+ }
+
+ bool CanCompressVertex(VertexId v) const {
+ // TRACE("Compress vertex check: ");
+ // TRACE("Outgoing check: " << (OutgoingEdgeCount(v) == 1));
+ // TRACE("Outgoing check: " << (CheckUniqueOutgoingEdge(v)));
+ // TRACE("Incoming check: " << (IncomingEdgeCount(v) == 1));
+ // TRACE("Incoming check: " << (CheckUniqueIncomingEdge(v) == 1));
+ // if((OutgoingEdgeCount(v) == 1) && (IncomingEdgeCount(v) == 1)) {
+ // TRACE("Loop check: " << (GetUniqueOutgoingEdge(v) != GetUniqueIncomingEdge(v)));
+ // TRACE("Additional check: " << AdditionalCompressCondition(v));
+ // }
+ return OutgoingEdgeCount(v) == 1 && IncomingEdgeCount(v) == 1 &&
+ GetUniqueOutgoingEdge(v) != GetUniqueIncomingEdge(v) &&
+ AdditionalCompressCondition(v);
+ }
+
+ //////////////////////printing
+ std::string str(const EdgeId e) const {
+// return master_.str(data(edge));
+ std::stringstream ss;
+ ss << int_id(e) << " (" << length(e) << ")";
+ return ss.str();
+ }
+
+ std::string str(const VertexId v) const {
+// return master_.str(data(v));
+ return ToString(int_id(v));
+ }
+
+ std::string detailed_str(const VertexId v) const {
+ std::stringstream ss;
+ ss << str(v) << ";";
+ ss << "Incoming edges" << str(IncomingEdges(v)) << "; ";
+ ss << "Outgoing edges" << str(OutgoingEdges(v)) << ";";
+ return ss.str();
+ }
+
+ std::string detailed_str(const std::vector<EdgeId>& path) const {
+ std::stringstream ss;
+ ss << "Path: ";
+ ss << "Vertex " << detailed_str(EdgeStart(path[0])) << " | ";
+ for (auto it = path.begin(); it != path.end(); ++it) {
+ EdgeId e = *it;
+ ss << "Edge " << str(e) << " | ";
+ ss << "Vertex " << detailed_str(EdgeEnd(e)) << " | ";
+ }
+ return ss.str();
+ }
+
+ template<class Container>
+ std::string str(const Container& container) const {
+ return str(container.begin(), container.end());
+ }
+
+ template<class It>
+ std::string str(It begin, It end) const {
+ std::stringstream ss;
+ std::string delim = "";
+ for (auto it = begin; it != end; ++it) {
+ ss << delim << str(*it);
+ delim = ", ";
+ }
+ return ss.str();
+ }
+
+private:
+ DECL_LOGGER("GraphCore");
+};
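+
+// Illustrative usage sketch (editorial, not part of the upstream patch): the read-only
+// GraphCore interface above is typically consumed like this, where `g` is any graph
+// exposing this API (e.g. the ObservableGraph defined later in this patch):
+//
+//   for (auto v : g.vertices())
+//       for (auto e : g.OutgoingEdges(v))
+//           std::cout << g.str(e) << " -> " << g.str(g.EdgeEnd(e))
+//                     << " (conjugate: " << g.str(g.conjugate(e)) << ")" << std::endl;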
+
+}
diff --git a/src/modules/assembly_graph/graph_core/graph_iterators.hpp b/src/modules/assembly_graph/graph_core/graph_iterators.hpp
new file mode 100644
index 0000000..9879885
--- /dev/null
+++ b/src/modules/assembly_graph/graph_core/graph_iterators.hpp
@@ -0,0 +1,408 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "utils/adt/queue_iterator.hpp"
+#include "math/pred.hpp"
+#include "action_handlers.hpp"
+#include "dev_support/simple_tools.hpp"
+#include <boost/iterator/iterator_facade.hpp>
+
+namespace omnigraph {
+
+/**
+ * SmartIterator is able to iterate over a collection whose content may change while the
+ * iteration is in progress. Acting as a GraphActionHandler, it keeps the collection in sync
+ * with modifications of the graph. The order of iteration can be customized via Comparator.
+ */
+template<class Graph, typename ElementId, typename Comparator = std::less<ElementId>>
+class SmartIterator : public GraphActionHandler<Graph> {
+ typedef GraphActionHandler<Graph> base;
+ DynamicQueueIterator<ElementId, Comparator> inner_it_;
+ bool add_new_;
+ bool canonical_only_;
+ //todo think of checking it in HandleAdd
+ pred::TypedPredicate<ElementId> add_condition_;
+
+protected:
+
+ void push(const ElementId& el) {
+ if ((!canonical_only_ || el <= this->g().conjugate(el)) &&
+ add_condition_(el)) {
+ inner_it_.push(el);
+ }
+ }
+
+ template<typename InputIterator>
+ void insert(InputIterator begin, InputIterator end) {
+ for (auto it = begin; it != end; ++it) {
+ push(*it);
+ }
+ }
+
+ void erase(const ElementId& el) {
+ if (!canonical_only_ || el <= this->g().conjugate(el)) {
+ inner_it_.erase(el);
+ }
+ }
+
+ void clear() {
+ inner_it_.clear();
+ }
+
+ SmartIterator(const Graph &g, const std::string &name, bool add_new,
+ const Comparator& comparator, bool canonical_only,
+ pred::TypedPredicate<ElementId> add_condition = pred::AlwaysTrue<ElementId>())
+ : base(g, name),
+ inner_it_(comparator),
+ add_new_(add_new),
+ canonical_only_(canonical_only),
+ add_condition_(add_condition) {
+ }
+
+public:
+
+ bool canonical_only() const {
+ return canonical_only_;
+ }
+
+ bool IsEnd() const {
+ return inner_it_.IsEnd();
+ }
+
+ size_t size() const {
+ return inner_it_.size();
+ }
+
+ ElementId operator*() {
+ return *inner_it_;
+ }
+
+ void operator++() {
+ ++inner_it_;
+ }
+
+ void HandleAdd(ElementId v) override {
+ if (add_new_)
+ push(v);
+ }
+
+ void HandleDelete(ElementId v) override {
+ erase(v);
+ }
+
+ //use carefully!
+ void ReleaseCurrent() {
+ inner_it_.ReleaseCurrent();
+ }
+
+};
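+
+// Sketch of the intended iteration pattern (editorial note, not upstream code): Smart*
+// iterators are consumed via IsEnd()/operator*/operator++ rather than begin()/end(),
+// because the underlying queue may grow or shrink while the graph is being modified.
+// Concrete iterators are obtained from ObservableGraph::SmartVertexBegin()/SmartEdgeBegin():
+//
+//   for (auto it = g.SmartEdgeBegin(); !it.IsEnd(); ++it) {
+//       auto e = *it;   // stays valid even if edges are added/deleted in the loop body
+//       ...
+//   }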
+
+/**
+ * SmartSetIterator iterates over an explicitly supplied set of elements. Like its base
+ * SmartIterator it reacts to graph events: deleted elements are dropped from the queue,
+ * while newly created elements are picked up only if add_new is set. The order of
+ * iteration can be customized via Comparator.
+ */
+template<class Graph, typename ElementId,
+ typename Comparator = std::less<ElementId>>
+class SmartSetIterator : public SmartIterator<Graph, ElementId, Comparator> {
+ typedef SmartIterator<Graph, ElementId, Comparator> base;
+
+public:
+ SmartSetIterator(const Graph &g,
+ bool add_new = false,
+ const Comparator& comparator = Comparator(),
+ bool canonical_only = false,
+ pred::TypedPredicate<ElementId> add_condition = pred::AlwaysTrue<ElementId>())
+ : base(g, "SmartSet " + ToString(this), add_new, comparator, canonical_only, add_condition) {
+ }
+
+ template<class Iterator>
+ SmartSetIterator(const Graph &g, Iterator begin, Iterator end,
+ bool add_new = false,
+ const Comparator& comparator = Comparator(),
+ bool canonical_only = false,
+ pred::TypedPredicate<ElementId> add_condition = pred::AlwaysTrue<ElementId>())
+ : SmartSetIterator(g, add_new, comparator, canonical_only, add_condition) {
+ insert(begin, end);
+ }
+
+ template<typename InputIterator>
+ void insert(InputIterator begin, InputIterator end) {
+ base::insert(begin, end);
+ }
+
+ void push(const ElementId& el) {
+ base::push(el);
+ }
+
+ void clear() {
+ base::clear();
+ }
+};
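+
+// Hypothetical worklist sketch (not upstream code; Process() and `candidates` are
+// placeholders): a SmartSetIterator is seeded with an explicit set of elements and, with
+// the default add_new = false, ignores newly created elements, while deleted elements are
+// still dropped from the queue automatically.
+//
+//   SmartSetIterator<Graph, typename Graph::VertexId> worklist(g, candidates.begin(), candidates.end());
+//   for (; !worklist.IsEnd(); ++worklist)
+//       Process(g, *worklist);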
+
+/**
+ * SmartVertexIterator iterates through the vertices of a graph. It listens to AddVertex/DeleteVertex
+ * graph events and edits the set of vertices to iterate through accordingly. Note: high-level event
+ * handlers are triggered before low-level event handlers such as HandleAdd/HandleDelete. Thus, if the
+ * Comparator relies on a structure that is also updated by handlers, make sure that structure is
+ * updated in the high-level event handlers.
+ */
+template<class Graph, typename Comparator = std::less<typename Graph::VertexId> >
+class SmartVertexIterator : public SmartIterator<Graph,
+ typename Graph::VertexId, Comparator> {
+ public:
+ typedef typename Graph::VertexId VertexId;
+
+ static size_t get_id() {
+ static size_t id = 0;
+ return id++;
+ }
+
+ public:
+ SmartVertexIterator(const Graph &g, const Comparator& comparator =
+ Comparator(), bool canonical_only = false)
+ : SmartIterator<Graph, VertexId, Comparator>(
+ g, "SmartVertexIterator " + ToString(get_id()), true,
+ comparator, canonical_only) {
+ this->insert(g.begin(), g.end());
+ }
+
+};
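+
+// Sketch only (assumes the ObservableGraph API added later in this patch): because the
+// iterator tracks deletions, vertices removed as a side effect of CompressVertex() during
+// the walk are skipped instead of being dereferenced after destruction.
+//
+//   for (auto it = g.SmartVertexBegin(); !it.IsEnd(); ++it)
+//       g.CompressVertex(*it);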
+
+//todo return verifies when they can be switched off
+template<class Graph>
+class GraphEdgeIterator : public boost::iterator_facade<GraphEdgeIterator<Graph>
+ , typename Graph::EdgeId, boost::forward_traversal_tag
+ , typename Graph::EdgeId> {
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexIt const_vertex_iterator;
+ typedef typename Graph::edge_const_iterator const_edge_iterator;
+
+ const Graph& g_;
+ const_vertex_iterator v_it_;
+ const_edge_iterator e_it_;
+ bool canonical_only_;
+
+public:
+
+ GraphEdgeIterator(const Graph& g, const_vertex_iterator v_it, bool canonical_only = false)
+ : g_(g),
+ v_it_(v_it),
+ canonical_only_(canonical_only) {
+ if (v_it_ != g_.end()) {
+ e_it_ = g_.out_begin(*v_it_);
+ Skip();
+ }
+ }
+
+private:
+
+ bool Canonical(EdgeId e) const {
+ return e <= g_.conjugate(e);
+ }
+
+ friend class boost::iterator_core_access;
+
+ void Skip() {
+ //VERIFY(v_it_ != g_.end());
+ while (true) {
+ if (e_it_ == g_.out_end(*v_it_)) {
+ v_it_++;
+ if (v_it_ == g_.end())
+ return;
+ e_it_ = g_.out_begin(*v_it_);
+ } else {
+ if (!canonical_only_ || Canonical(*e_it_))
+ return;
+ else
+ e_it_++;
+ }
+ }
+ }
+
+ void increment() {
+ if (v_it_ == g_.end())
+ return;
+ e_it_++;
+ Skip();
+ }
+
+ bool equal(const GraphEdgeIterator &other) const {
+ if (other.v_it_ != v_it_)
+ return false;
+ if (v_it_ != g_.end() && other.e_it_ != e_it_)
+ return false;
+ if (other.canonical_only_ != canonical_only_)
+ return false;
+ return true;
+ }
+
+ EdgeId dereference() const {
+ //VERIFY(v_it_ != g_.end());
+ return *e_it_;
+ }
+
+};
+
+template<class Graph>
+class ConstEdgeIterator {
+ typedef typename Graph::EdgeId EdgeId;
+ GraphEdgeIterator<Graph> begin_, end_;
+
+ public:
+ ConstEdgeIterator(const Graph &g, bool canonical_only = false)
+ : begin_(g, g.begin(), canonical_only), end_(g, g.end(), canonical_only) {
+ }
+
+ bool IsEnd() const {
+ return begin_ == end_;
+ }
+
+ EdgeId operator*() const {
+ return *begin_;
+ }
+
+ const ConstEdgeIterator& operator++() {
+ begin_++;
+ return *this;
+ }
+};
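+
+// Illustrative sketch (editorial): ConstEdgeIterator provides a plain, non-listening pass
+// over all edges; with canonical_only = true each edge/conjugate pair is visited once.
+// `total_length` is a hypothetical local accumulator.
+//
+//   size_t total_length = 0;
+//   for (auto it = g.ConstEdgeBegin(true); !it.IsEnd(); ++it)
+//       total_length += g.length(*it);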
+
+/**
+ * SmartEdgeIterator iterates through the edges of a graph. It listens to AddEdge/DeleteEdge
+ * graph events and edits the set of edges to iterate through accordingly. Note: high-level event
+ * handlers are triggered before low-level event handlers such as HandleAdd/HandleDelete. Thus, if the
+ * Comparator relies on a structure that is also updated by handlers, make sure that structure is
+ * updated in the high-level event handlers.
+ */
+template<class Graph, typename Comparator = std::less<typename Graph::EdgeId> >
+class SmartEdgeIterator : public SmartIterator<Graph, typename Graph::EdgeId, Comparator> {
+ typedef GraphEdgeIterator<Graph> EdgeIt;
+ public:
+ typedef typename Graph::EdgeId EdgeId;
+
+ static size_t get_id() {
+ static size_t id = 0;
+ return id++;
+ }
+
+ public:
+ SmartEdgeIterator(const Graph &g, Comparator comparator = Comparator(),
+ bool canonical_only = false)
+ : SmartIterator<Graph, EdgeId, Comparator>(
+ g, "SmartEdgeIterator " + ToString(get_id()), true,
+ comparator, canonical_only) {
+ this->insert(EdgeIt(g, g.begin()), EdgeIt(g, g.end()));
+
+// for (auto it = graph.begin(); it != graph.end(); ++it) {
+// //todo: this solution doesn't work with parallel simplification
+// this->insert(graph.out_begin(*it), graph.out_end(*it));
+// //this does
+// //auto out = graph.OutgoingEdges(*it);
+// //this->base::insert(out.begin(), out.end());
+// }
+ }
+};
+
+//todo move out
+template<class Graph, class ElementId>
+class IterationHelper {
+};
+
+template<class Graph>
+class IterationHelper<Graph, typename Graph::VertexId> {
+ const Graph& g_;
+public:
+ typedef typename Graph::VertexId VertexId;
+ typedef typename Graph::VertexIt const_vertex_iterator;
+
+ IterationHelper(const Graph& g)
+ : g_(g) {
+ }
+
+ const_vertex_iterator begin() const {
+ return g_.begin();
+ }
+
+ const_vertex_iterator end() const {
+ return g_.end();
+ }
+
+ std::vector<const_vertex_iterator> Chunks(size_t chunk_cnt) const {
+ VERIFY(chunk_cnt > 0);
+ if (chunk_cnt == 1) {
+ return {begin(), end()};
+ }
+
+ //trying to split vertices into equal chunks; leftovers are put into the first chunk
+ vector<const_vertex_iterator> answer;
+ size_t vertex_cnt = g_.size();
+ size_t chunk_size = vertex_cnt / chunk_cnt;
+ auto it = g_.begin();
+ answer.push_back(it);
+ for (size_t i = 0; i + chunk_cnt * chunk_size < vertex_cnt; ++i) {
+ it++;
+ }
+ if (chunk_size > 0) {
+ size_t i = 0;
+ do {
+ ++it;
+ if (++i % chunk_size == 0)
+ answer.push_back(it);
+ } while (it != g_.end());
+
+ VERIFY(i == chunk_cnt * chunk_size);
+ } else {
+ VERIFY(it == g_.end());
+ answer.push_back(it);
+ }
+ VERIFY(answer.back() == g_.end());
+ return answer;
+ }
+
+};
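+
+// Assumed usage pattern (sketch, not upstream code; ProcessVertex() is a placeholder):
+// Chunks(n) returns n + 1 iterator bounds, so chunk i is the half-open range
+// [bounds[i], bounds[i + 1]) and can be handed to a separate OpenMP thread.
+//
+//   IterationHelper<Graph, typename Graph::VertexId> helper(g);
+//   auto bounds = helper.Chunks(omp_get_max_threads());
+//   #pragma omp parallel for
+//   for (size_t i = 0; i + 1 < bounds.size(); ++i)
+//       for (auto it = bounds[i]; it != bounds[i + 1]; ++it)
+//           ProcessVertex(g, *it);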
+
+//todo move out
+template<class Graph>
+class IterationHelper<Graph, typename Graph::EdgeId> {
+ typedef typename Graph::VertexId VertexId;
+
+ const Graph& g_;
+public:
+ typedef typename Graph::EdgeId EdgeId;
+ typedef GraphEdgeIterator<Graph> const_edge_iterator;
+
+ IterationHelper(const Graph& g)
+ : g_(g) {
+ }
+
+ const_edge_iterator begin() const {
+ return const_edge_iterator(g_, g_.begin());
+ }
+
+ const_edge_iterator end() const {
+ return const_edge_iterator(g_, g_.end());
+ }
+
+ std::vector<omnigraph::GraphEdgeIterator<Graph>> Chunks(size_t chunk_cnt) const {
+ if (chunk_cnt == 1) {
+ return {begin(), end()};
+ }
+
+ vector<omnigraph::GraphEdgeIterator<Graph>> answer;
+
+ for (auto v_it : IterationHelper<Graph, VertexId>(g_).Chunks(chunk_cnt)) {
+ answer.push_back(omnigraph::GraphEdgeIterator<Graph>(g_, v_it));
+ }
+ return answer;
+ }
+};
+
+}
diff --git a/src/modules/assembly_graph/graph_core/observable_graph.hpp b/src/modules/assembly_graph/graph_core/observable_graph.hpp
new file mode 100644
index 0000000..0286cc5
--- /dev/null
+++ b/src/modules/assembly_graph/graph_core/observable_graph.hpp
@@ -0,0 +1,499 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include <vector>
+#include <set>
+#include <cstring>
+#include "dev_support/logger/logger.hpp"
+#include "graph_core.hpp"
+#include "graph_iterators.hpp"
+
+namespace omnigraph {
+
+using std::vector;
+using std::set;
+template<class DataMaster>
+class ObservableGraph: public GraphCore<DataMaster> {
+public:
+ typedef GraphCore<DataMaster> base;
+ typedef typename base::DataMasterT DataMasterT;
+ typedef typename base::VertexData VertexData;
+ typedef typename base::EdgeData EdgeData;
+ typedef typename base::EdgeId EdgeId;
+ typedef typename base::VertexId VertexId;
+ typedef typename base::VertexIt VertexIt;
+ typedef typename base::edge_const_iterator edge_const_iterator;
+
+ typedef HandlerApplier<VertexId, EdgeId> Applier;
+ typedef SmartVertexIterator<ObservableGraph> SmartVertexIt;
+ typedef SmartEdgeIterator<ObservableGraph> SmartEdgeIt;
+ typedef ConstEdgeIterator<ObservableGraph> ConstEdgeIt;
+ typedef ActionHandler<VertexId, EdgeId> Handler;
+
+private:
+ //todo switch to smart iterators
+ mutable std::vector<Handler*> action_handler_list_;
+ const HandlerApplier<VertexId, EdgeId> *applier_;
+
+public:
+//todo move to graph core
+ typedef ConstructionHelper<DataMaster> HelperT;
+
+ HelperT GetConstructionHelper() {
+// TODO: fix everything and restore this check
+// VERIFY(this->VerifyAllDetached());
+ return HelperT(*this);
+ }
+
+ const Applier& GetHandlerApplier() const {
+ return *applier_;
+ }
+
+ void AddActionHandler(Handler* action_handler) const;
+
+ bool RemoveActionHandler(const Handler* action_handler) const;
+
+ bool AllHandlersThreadSafe() const;
+
+ // TODO: for debug. remove.
+ void PrintHandlersNames() const;
+
+ //todo make Fire* protected once again with helper friend class
+ void FireAddVertex(VertexId v) const;
+
+ void FireAddEdge(EdgeId e) const;
+
+ void FireDeleteVertex(VertexId v) const;
+
+ void FireDeleteEdge(EdgeId e) const;
+
+ void FireMerge(std::vector<EdgeId> old_edges, EdgeId new_edge) const;
+
+ void FireGlue(EdgeId new_edge, EdgeId edge1, EdgeId edge2) const;
+
+ void FireSplit(EdgeId edge, EdgeId new_edge1, EdgeId new_edge2) const;
+
+ bool VerifyAllDetached();
+
+ //smart iterators
+ template<typename Comparator>
+ SmartVertexIterator<ObservableGraph, Comparator> SmartVertexBegin(
+ const Comparator& comparator, bool canonical_only = false) const {
+ return SmartVertexIterator<ObservableGraph, Comparator>(*this,
+ comparator, canonical_only);
+ }
+
+ SmartVertexIterator<ObservableGraph> SmartVertexBegin(bool canonical_only = false) const {
+ return SmartVertexIterator<ObservableGraph>(*this, std::less<VertexId>(), canonical_only);
+ }
+
+ template<typename Comparator>
+ SmartEdgeIterator<ObservableGraph, Comparator> SmartEdgeBegin(
+ const Comparator& comparator, bool canonical_only = false) const {
+ return SmartEdgeIterator<ObservableGraph, Comparator>(*this, comparator, canonical_only);
+ }
+
+ SmartEdgeIterator<ObservableGraph> SmartEdgeBegin(bool canonical_only = false) const {
+ return SmartEdgeIterator<ObservableGraph>(*this, std::less<EdgeId>(), canonical_only);
+ }
+
+ ConstEdgeIterator<ObservableGraph> ConstEdgeBegin(bool canonical_only = false) const {
+ return ConstEdgeIterator<ObservableGraph>(*this, canonical_only);
+ }
+
+ void FireDeletePath(const std::vector<EdgeId>& edges_to_delete, const std::vector<VertexId>& vertices_to_delete) const;
+
+ ObservableGraph(const DataMaster& master) :
+ base(master), applier_(new PairedHandlerApplier<ObservableGraph>(*this)) {
+ }
+
+ virtual ~ObservableGraph();
+
+ /////////////////////////graph operations
+ //adding/removing vertices and edges
+ VertexId AddVertex(const VertexData& data) {
+ return AddVertex(data, GetGraphIdDistributor());
+ }
+
+ VertexId AddVertex(const VertexData& data, restricted::IdDistributor& id_distributor);
+
+ void DeleteVertex(VertexId v);
+
+ void ForceDeleteVertex(VertexId v);
+
+ using base::GetGraphIdDistributor;
+ using base::conjugate;
+
+ EdgeId AddEdge(const EdgeData &data) {
+ return AddEdge(data, GetGraphIdDistributor());
+ }
+
+ EdgeId AddEdge(const EdgeData& data, restricted::IdDistributor& id_distributor);
+
+ EdgeId AddEdge(VertexId v1, VertexId v2, const EdgeData &data) {
+ return AddEdge(v1, v2, data, GetGraphIdDistributor());
+ }
+
+ EdgeId AddEdge(VertexId v1, VertexId v2, const EdgeData& data, restricted::IdDistributor& id_distributor);
+
+ void DeleteEdge(EdgeId e);
+
+ void DeleteAllOutgoing(VertexId v);
+
+ void DeleteAllIncoming(VertexId v);
+
+ void CompressVertex(VertexId v);
+
+ EdgeId UnsafeCompressVertex(VertexId v);
+
+ std::vector<EdgeId> EdgesToDelete(const std::vector<EdgeId>& path) const;
+
+ std::vector<VertexId> VerticesToDelete(const std::vector<EdgeId>& path) const;
+
+ std::vector<EdgeId> CorrectMergePath(const std::vector<EdgeId>& path) const;
+
+ EdgeId MergePath(const std::vector<EdgeId>& path, bool safe_merging = true);
+
+ std::pair<EdgeId, EdgeId> SplitEdge(EdgeId edge, size_t position);
+
+ EdgeId GlueEdges(EdgeId edge1, EdgeId edge2);
+
+private:
+ DECL_LOGGER("ObservableGraph")
+};
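+
+// Minimal sketch of the observer mechanism (editorial; MyHandler, VertexData() and
+// EdgeData() are placeholders): every structural change made through the public
+// AddVertex/AddEdge/DeleteEdge/... calls is broadcast to all attached action handlers.
+//
+//   MyHandler handler(g);                      // a handler is expected to register via AddActionHandler()
+//   auto v1 = g.AddVertex(VertexData());
+//   auto v2 = g.AddVertex(VertexData());
+//   auto e  = g.AddEdge(v1, v2, EdgeData());   // fires HandleAdd on every attached handler
+//   g.DeleteEdge(e);                           // fires HandleDelete first, then deletes the edge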
+
+template<class DataMaster>
+typename ObservableGraph<DataMaster>::VertexId ObservableGraph<DataMaster>::AddVertex(const VertexData& data, restricted::IdDistributor& id_distributor) {
+ VertexId v = base::HiddenAddVertex(data, id_distributor);
+ FireAddVertex(v);
+ return v;
+}
+
+template<class DataMaster>
+void ObservableGraph<DataMaster>::DeleteVertex(VertexId v) {
+ VERIFY(base::IsDeadEnd(v) && base::IsDeadStart(v));
+ VERIFY(v != VertexId(NULL));
+ FireDeleteVertex(v);
+ base::HiddenDeleteVertex(v);
+}
+
+template<class DataMaster>
+void ObservableGraph<DataMaster>::ForceDeleteVertex(VertexId v) {
+ DeleteAllOutgoing(v);
+ DeleteAllIncoming(v);
+ DeleteVertex(v);
+}
+
+template<class DataMaster>
+typename ObservableGraph<DataMaster>::EdgeId ObservableGraph<DataMaster>::AddEdge(VertexId v1, VertexId v2, const EdgeData& data, restricted::IdDistributor& id_distributor) {
+ EdgeId e = base::HiddenAddEdge(v1, v2, data, id_distributor);
+ FireAddEdge(e);
+ return e;
+}
+
+template<class DataMaster>
+typename ObservableGraph<DataMaster>::EdgeId ObservableGraph<DataMaster>::AddEdge(const EdgeData& data, restricted::IdDistributor& id_distributor) {
+ EdgeId e = base::HiddenAddEdge(data, id_distributor);
+ FireAddEdge(e);
+ return e;
+}
+
+template<class DataMaster>
+void ObservableGraph<DataMaster>::DeleteEdge(EdgeId e) {
+ FireDeleteEdge(e);
+ base::HiddenDeleteEdge(e);
+}
+
+template<class DataMaster>
+void ObservableGraph<DataMaster>::DeleteAllOutgoing(VertexId v) {
+ while (base::OutgoingEdgeCount(v) > 0) {
+ EdgeId edge = *base::out_begin(v);
+ DeleteEdge(edge);
+ }
+}
+
+template<class DataMaster>
+void ObservableGraph<DataMaster>::DeleteAllIncoming(VertexId v) {
+ while (base::IncomingEdgeCount(v) > 0) {
+ EdgeId edge = *base::in_begin(v);
+ DeleteEdge(edge);
+ }
+}
+
+template<class DataMaster>
+void ObservableGraph<DataMaster>::CompressVertex(VertexId v) {
+ //VERIFY(CanCompressVertex(v));
+ if (base::CanCompressVertex(v)) {
+ UnsafeCompressVertex(v);
+ } else {
+ TRACE("Vertex " << base::str(v) << " can't be compressed");
+ }
+}
+
+template<class DataMaster>
+typename ObservableGraph<DataMaster>::EdgeId ObservableGraph<DataMaster>::UnsafeCompressVertex(VertexId v) {
+ VERIFY(base::CanCompressVertex(v));
+ std::vector<EdgeId> edges_to_merge;
+ edges_to_merge.push_back(base::GetUniqueIncomingEdge(v));
+ edges_to_merge.push_back(base::GetUniqueOutgoingEdge(v));
+ return MergePath(edges_to_merge);
+}
+
+template<class DataMaster>
+std::vector<typename ObservableGraph<DataMaster>::EdgeId> ObservableGraph<DataMaster>::EdgesToDelete(const std::vector<EdgeId>& path) const {
+ std::set<EdgeId> edgesToDelete;
+ edgesToDelete.insert(path[0]);
+ for (size_t i = 0; i + 1 < path.size(); i++) {
+ EdgeId e = path[i + 1];
+ if (edgesToDelete.find(base::conjugate(e)) == edgesToDelete.end())
+ edgesToDelete.insert(e);
+ }
+ return std::vector<EdgeId>(edgesToDelete.begin(), edgesToDelete.end());
+}
+
+template<class DataMaster>
+vector<typename ObservableGraph<DataMaster>::VertexId> ObservableGraph<DataMaster>::VerticesToDelete(const vector<EdgeId>& path) const {
+ std::set<VertexId> verticesToDelete;
+ for (size_t i = 0; i + 1 < path.size(); i++) {
+ EdgeId e = path[i + 1];
+ VertexId v = base::EdgeStart(e);
+ if (verticesToDelete.find(base::conjugate(v)) == verticesToDelete.end())
+ verticesToDelete.insert(v);
+ }
+ return vector<VertexId>(verticesToDelete.begin(), verticesToDelete.end());
+}
+
+template<class DataMaster>
+void ObservableGraph<DataMaster>::AddActionHandler(Handler* action_handler) const {
+#pragma omp critical(action_handler_list_modification)
+ {
+ TRACE("Action handler " << action_handler->name() << " added");
+ if (find(action_handler_list_.begin(), action_handler_list_.end(), action_handler) != action_handler_list_.end()) {
+ VERIFY_MSG(false, "Action handler " << action_handler->name() << " has already been added");
+ } else {
+ action_handler_list_.push_back(action_handler);
+ }
+ }
+}
+
+template<class DataMaster>
+bool ObservableGraph<DataMaster>::RemoveActionHandler(const Handler* action_handler) const {
+ bool result = false;
+#pragma omp critical(action_handler_list_modification)
+ {
+ auto it = std::find(action_handler_list_.begin(), action_handler_list_.end(), action_handler);
+ if (it != action_handler_list_.end()) {
+ action_handler_list_.erase(it);
+ TRACE("Action handler " << action_handler->name() << " removed");
+ result = true;
+ } else {
+ TRACE("Action handler " << action_handler->name() << " wasn't found among graph action handlers");
+ }
+ }
+ return result;
+}
+
+template<class DataMaster>
+bool ObservableGraph<DataMaster>::AllHandlersThreadSafe() const {
+ for (Handler* handler : action_handler_list_) {
+ if (handler->IsAttached() && !handler->IsThreadSafe()) {
+ return false;
+ }
+ }
+ return true;
+}
+
+template<class DataMaster>
+void ObservableGraph<DataMaster>::PrintHandlersNames() const {
+ for (Handler* handler : action_handler_list_) {
+ std::cout << handler->name() << " attached=" << handler->IsAttached() << std::endl;
+ }
+}
+
+template<class DataMaster>
+void ObservableGraph<DataMaster>::FireAddVertex(VertexId v) const {
+ for (Handler* handler_ptr : action_handler_list_) {
+ if (handler_ptr->IsAttached()) {
+ TRACE("FireAddVertex to handler " << handler_ptr->name());
+ applier_->ApplyAdd(*handler_ptr, v);
+ }
+ }
+}
+
+template<class DataMaster>
+void ObservableGraph<DataMaster>::FireAddEdge(EdgeId e) const {
+ for (Handler* handler_ptr : action_handler_list_) {
+ if (handler_ptr->IsAttached()) {
+ TRACE("FireAddEdge to handler " << handler_ptr->name());
+ applier_->ApplyAdd(*handler_ptr, e);
+ }
+ }
+}
+
+template<class DataMaster>
+void ObservableGraph<DataMaster>::FireDeleteVertex(VertexId v) const {
+ for (auto it = action_handler_list_.rbegin(); it != action_handler_list_.rend(); ++it) {
+ if ((*it)->IsAttached()) {
+ applier_->ApplyDelete(**it, v);
+ }
+ }
+}
+
+template<class DataMaster>
+void ObservableGraph<DataMaster>::FireDeleteEdge(EdgeId e) const {
+ for (auto it = action_handler_list_.rbegin(); it != action_handler_list_.rend(); ++it) {
+ if ((*it)->IsAttached()) {
+ applier_->ApplyDelete(**it, e);
+ }
+ };
+}
+
+template<class DataMaster>
+void ObservableGraph<DataMaster>::FireMerge(vector<EdgeId> old_edges, EdgeId new_edge) const {
+ for (Handler* handler_ptr : action_handler_list_) {
+ if (handler_ptr->IsAttached()) {
+ applier_->ApplyMerge(*handler_ptr, old_edges, new_edge);
+ }
+ }
+}
+
+template<class DataMaster>
+void ObservableGraph<DataMaster>::FireGlue(EdgeId new_edge, EdgeId edge1, EdgeId edge2) const {
+ for (Handler* handler_ptr : action_handler_list_) {
+ if (handler_ptr->IsAttached()) {
+ applier_->ApplyGlue(*handler_ptr, new_edge, edge1, edge2);
+ }
+ };
+}
+
+template<class DataMaster>
+void ObservableGraph<DataMaster>::FireSplit(EdgeId edge, EdgeId new_edge1, EdgeId new_edge2) const {
+ for (Handler* handler_ptr : action_handler_list_) {
+ if (handler_ptr->IsAttached()) {
+ applier_->ApplySplit(*handler_ptr, edge, new_edge1, new_edge2);
+ }
+ }
+}
+
+template<class DataMaster>
+bool ObservableGraph<DataMaster>::VerifyAllDetached() {
+ for (Handler* handler_ptr : action_handler_list_) {
+ if (handler_ptr->IsAttached()) {
+ return false;
+ }
+ }
+ return true;
+}
+
+template<class DataMaster>
+void ObservableGraph<DataMaster>::FireDeletePath(const vector<EdgeId>& edgesToDelete, const vector<VertexId>& verticesToDelete) const {
+ for (auto it = edgesToDelete.begin(); it != edgesToDelete.end(); ++it)
+ FireDeleteEdge(*it);
+ for (auto it = verticesToDelete.begin(); it != verticesToDelete.end(); ++it)
+ FireDeleteVertex(*it);
+}
+
+template<class DataMaster>
+ObservableGraph<DataMaster>::~ObservableGraph() {
+ while (base::size() > 0) {
+ ForceDeleteVertex(*base::begin());
+ }
+}
+
+template<class DataMaster>
+vector<typename ObservableGraph<DataMaster>::EdgeId> ObservableGraph<DataMaster>::CorrectMergePath(const vector<EdgeId>& path) const {
+ for (size_t i = 0; i < path.size(); i++) {
+ if (path[i] == base::conjugate(path[i])) {
+ vector<EdgeId> result;
+ if (i < path.size() - 1 - i) {
+ for (size_t j = 0; j < path.size(); j++)
+ result.push_back(base::conjugate(path[path.size() - 1 - j]));
+ i = path.size() - 1 - i;
+ } else {
+ result = path;
+ }
+ size_t size = 2 * i + 1;
+ for (size_t j = result.size(); j < size; j++) {
+ result.push_back(base::conjugate(result[size - 1 - j]));
+ }
+ return result;
+ }
+ }
+ return path;
+}
+
+template<class DataMaster>
+typename ObservableGraph<DataMaster>::EdgeId ObservableGraph<DataMaster>::MergePath(const vector<EdgeId>& path, bool safe_merging) {
+ VERIFY(!path.empty());
+ for (size_t i = 0; i < path.size(); i++)
+ for (size_t j = i + 1; j < path.size(); j++) {
+ VERIFY(path[i] != path[j]);
+ }
+ if (path.size() == 1) {
+ TRACE(
+ "Path of single edge " << base::str(*(path.begin())) << ". Nothing to merge.");
+ };
+ // cerr << "Merging " << PrintDetailedPath(pObservableGraph<DataMaster><VertexIdT, EdgeIdT, VertexIt>ath) << endl;
+ // cerr << "Conjugate " << PrintConjugatePath(path) << endl;
+ vector<EdgeId> corrected_path = CorrectMergePath(path);
+ VertexId v1 = base::EdgeStart(corrected_path[0]);
+ VertexId v2 = base::EdgeEnd(corrected_path[corrected_path.size() - 1]);
+ vector<const EdgeData*> to_merge;
+ for (auto it = corrected_path.begin(); it != corrected_path.end(); ++it) {
+ to_merge.push_back(&(base::data(*it)));
+ }
+ EdgeId new_edge = base::HiddenAddEdge(v1, v2, base::master().MergeData(to_merge, safe_merging));
+ FireMerge(corrected_path, new_edge);
+ vector<EdgeId> edges_to_delete = EdgesToDelete(corrected_path);
+ vector<VertexId> vertices_to_delete = VerticesToDelete(corrected_path);
+ FireDeletePath(edges_to_delete, vertices_to_delete);
+ FireAddEdge(new_edge);
+ base::HiddenDeletePath(edges_to_delete, vertices_to_delete);
+ return new_edge;
+}
+
+template<class DataMaster>
+std::pair<typename ObservableGraph<DataMaster>::EdgeId, typename ObservableGraph<DataMaster>::EdgeId> ObservableGraph<DataMaster>::SplitEdge(EdgeId edge, size_t position) {
+ bool sc_flag = (edge == conjugate(edge));
+ VERIFY_MSG(position > 0 && position < (sc_flag ? base::length(edge) / 2 + 1 : base::length(edge)),
+ "Edge length is " << base::length(edge) << " but split pos was " << position);
+ std::pair<VertexData, std::pair<EdgeData, EdgeData> > newData = base::master().SplitData(base::data(edge), position, sc_flag);
+ VertexId splitVertex = base::HiddenAddVertex(newData.first);
+ EdgeId new_edge1 = base::HiddenAddEdge(base::EdgeStart(edge), splitVertex, newData.second.first);
+ EdgeId new_edge2 = base::HiddenAddEdge(splitVertex, sc_flag ? conjugate(splitVertex) : base::EdgeEnd(edge), newData.second.second);
+ VERIFY(!sc_flag || new_edge2 == conjugate(new_edge2));
+ FireSplit(edge, new_edge1, new_edge2);
+ FireDeleteEdge(edge);
+ FireAddVertex(splitVertex);
+ FireAddEdge(new_edge1);
+ FireAddEdge(new_edge2);
+ base::HiddenDeleteEdge(edge);
+ return make_pair(new_edge1, new_edge2);
+}
+
+template<class DataMaster>
+typename ObservableGraph<DataMaster>::EdgeId ObservableGraph<DataMaster>::GlueEdges(EdgeId edge1, EdgeId edge2) {
+ EdgeId new_edge = base::HiddenAddEdge(base::EdgeStart(edge2), base::EdgeEnd(edge2), base::master().GlueData(base::data(edge1), base::data(edge2)));
+ FireGlue(new_edge, edge1, edge2);
+ FireDeleteEdge(edge1);
+ FireDeleteEdge(edge2);
+ FireAddEdge(new_edge);
+ VertexId start = base::EdgeStart(edge1);
+ VertexId end = base::EdgeEnd(edge1);
+ base::HiddenDeleteEdge(edge1);
+ base::HiddenDeleteEdge(edge2);
+ if (base::IsDeadStart(start) && base::IsDeadEnd(start)) {
+ DeleteVertex(start);
+ }
+ if (base::IsDeadStart(end) && base::IsDeadEnd(end)) {
+ DeleteVertex(end);
+ }
+ return new_edge;
+}
+}
diff --git a/src/modules/assembly_graph/graph_core/order_and_law.hpp b/src/modules/assembly_graph/graph_core/order_and_law.hpp
new file mode 100644
index 0000000..20ad96d
--- /dev/null
+++ b/src/modules/assembly_graph/graph_core/order_and_law.hpp
@@ -0,0 +1,644 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include <boost/utility.hpp>
+
+#include <ostream>
+#include <unordered_set>
+#include <unordered_map>
+#include "dev_support/stacktrace.hpp"
+#include <algorithm>
+#include <map>
+#include "dev_support/openmp_wrapper.h"
+#include "folly/PackedSyncPtr.h"
+
+
+namespace restricted {
+
+//todo discuss with Anton
+static const uint16_t MAX_THREAD_CNT = 128;
+
+class IdDistributor {
+public:
+ virtual size_t GetId() = 0;
+
+ virtual ~IdDistributor() {
+ }
+};
+
+template<class Iter>
+class ListIdDistributor : public IdDistributor {
+ friend class IdSegmentStorage;
+
+private:
+ Iter left_;
+ Iter right_;
+ size_t shift_;
+ size_t max_;
+
+ ListIdDistributor(Iter left, Iter right, size_t shift = 0, size_t max = size_t(-1)) : left_(left),
+ right_(right),
+ shift_(shift), max_(max) {
+ }
+
+public:
+ bool valid() {
+ return left_ < right_;
+ }
+
+ size_t GetId() {
+ size_t result = *(left_);
+ VERIFY(result < max_);
+ ++left_;
+ return shift_ + result;
+ }
+};
+
+class SegmentIterator {
+private:
+ size_t value_;
+public:
+ SegmentIterator(size_t value) : value_(value) {
+ }
+
+ size_t operator*() const {
+ return value_;
+ }
+
+ void operator++() {
+ value_++;
+ }
+
+ void operator++(int) {
+ ++value_;
+ }
+
+ bool operator==(const SegmentIterator &that) const {
+ return value_ == that.value_;
+ }
+
+ bool operator!=(const SegmentIterator &that) const {
+ return value_ != that.value_;
+ }
+};
+
+class IdSegmentStorage {
+ friend class LocalIdDistributor;
+
+public:
+ ListIdDistributor<SegmentIterator> GetSegmentIdDistributor(size_t left, size_t right) {
+ VERIFY(left < right);
+ VERIFY(right <= size_);
+ return ListIdDistributor<SegmentIterator>(SegmentIterator(left), SegmentIterator(right), min_value_, size_);
+ }
+
+ template<class Iter>
+ ListIdDistributor<Iter> GetSegmentIdDistributor(Iter left, Iter right) {
+ VERIFY(left < right);
+ return ListIdDistributor<Iter>(left, right, min_value_, size_);
+ }
+
+ IdSegmentStorage() : min_value_(0), size_(0) { }
+
+private:
+ IdSegmentStorage(size_t min_value, size_t size) : min_value_(min_value), size_(size) { }
+
+ size_t min_value_;
+ size_t size_;
+};
+
+// Id distributor for pure_pointer. Singleton.
+class LocalIdDistributor : public IdDistributor, boost::noncopyable {
+ friend class PeriodicIdDistributor;
+
+ static const size_t INITIAL_MAX_INT_ID = 2;
+public:
+ size_t GetId() {
+ return max_int_id_++;
+ }
+
+ IdSegmentStorage Reserve(size_t size) {
+ max_int_id_ += size;
+ return IdSegmentStorage(max_int_id_ - size, size);
+ }
+
+ IdSegmentStorage ReserveUpTo(size_t max) {
+ VERIFY(max_int_id_ == INITIAL_MAX_INT_ID);
+ max_int_id_ = max;
+ return IdSegmentStorage(0, max);
+ }
+
+// static GlobalIdDistributor &GetInstance() {
+// static GlobalIdDistributor instance(INITIAL_MAX_INT_ID);
+// return instance;
+// }
+
+ size_t GetMax() const {
+ return max_int_id_;
+ }
+
+ LocalIdDistributor(size_t min_id_value = INITIAL_MAX_INT_ID) : max_int_id_(min_id_value) { }
+
+private:
+ size_t max_int_id_;
+};
+
+/* Id distributor used by concurrent algorithms.
+ * Each thread uses its own PeriodicIdDistributor with a period equal to the number of
+ * threads. After a thread's job is done, a Synchronize() call is required to propagate
+ * its maximum id back to the shared LocalIdDistributor.
+ */
+class PeriodicIdDistributor : public IdDistributor {
+
+public:
+ PeriodicIdDistributor(LocalIdDistributor &id_distributor, size_t first_id, size_t period)
+ : id_distributor_(id_distributor), cur_id_(first_id), period_(period) {
+ }
+
+ virtual size_t GetId() {
+ size_t id = cur_id_;
+ cur_id_ += period_;
+
+ return id;
+ }
+
+ void Synchronize() const {
+ size_t &global_max_id = id_distributor_.max_int_id_;
+ global_max_id = std::max(cur_id_, global_max_id);
+ }
+
+private:
+ LocalIdDistributor &id_distributor_;
+ size_t cur_id_;
+ size_t period_;
+};
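+
+// Assumed multi-threaded usage (sketch; thread_id and n_threads are placeholders): thread t
+// of n draws ids first_id, first_id + n, first_id + 2n, ... and publishes its high-water
+// mark back to the shared LocalIdDistributor once the parallel section is over.
+//
+//   size_t base = local_ids.GetMax();
+//   PeriodicIdDistributor my_ids(local_ids, base + thread_id, n_threads);
+//   size_t fresh_id = my_ids.GetId();   // inside the parallel section
+//   my_ids.Synchronize();               // after the parallel work is done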
+
+template<class PurePtrT>
+class PurePtrLock;
+
+template<class PurePtrT>
+class PurePtrMarker;
+
+//todo maybe make it extend folly::PackedSyncPtr<T>?
+template<class T>
+struct pure_pointer {
+ typedef T type;
+ typedef T *pointer_type;
+
+ explicit pure_pointer()
+ : int_id_(0) {
+ ptr_.init(pointer_type(0), MAX_THREAD_CNT);
+ }
+
+ explicit pure_pointer(T *ptr)
+ : int_id_(size_t(ptr)) {
+ ptr_.init(ptr, MAX_THREAD_CNT);
+ VERIFY(int_id_ < 2);
+ }
+
+ explicit pure_pointer(T *ptr, IdDistributor &idDistributor)
+ : int_id_(generate_id(ptr, idDistributor)) {
+ ptr_.init(ptr, MAX_THREAD_CNT);
+ }
+
+// lock_pointer_type& get_lockable() {
+// return ptr_;
+// }
+
+ T *get() const {
+ return ptr_.get();
+ }
+
+ T &operator*() const {
+ return *ptr_;
+ }
+
+ T *operator->() const {
+ return ptr_.get();
+ }
+
+ bool operator==(const pure_pointer &rhs) const {
+ if (int_id_ == rhs.int_id_) {
+ VERIFY(ptr_.get() == rhs.ptr_.get());
+ return true;
+ }
+ return false;
+ }
+
+ bool operator!=(const pure_pointer &rhs) const {
+ return !operator==(rhs);
+ }
+
+ bool operator<(const pure_pointer &rhs) const {
+ return this->int_id_ < rhs.int_id_;
+ }
+
+ bool operator<=(const pure_pointer &rhs) const {
+ return *this < rhs || *this == rhs;
+ }
+
+ size_t hash() const {
+ return this->int_id_;
+ }
+
+ size_t int_id() const {
+ return int_id_;
+ }
+
+private:
+ friend class PurePtrLock<pure_pointer<T>>;
+
+ friend class PurePtrMarker<pure_pointer<T>>;
+
+ typedef folly::PackedSyncPtr<T> lock_pointer_type;
+
+ static size_t generate_id(T *ptr, IdDistributor &idDistributor) {
+ if (ptr == 0 || ptr == (T *) 1 || ptr == (T *) (-1)) {
+ return size_t(ptr);
+ }
+
+ return idDistributor.GetId();
+ }
+
+ lock_pointer_type ptr_;
+
+ size_t int_id_;
+};
+
+template<class LockT>
+class ReEnteringLock {
+ LockT &lock_;
+ bool reentered_;
+
+ uint16_t locking_thread() const {
+ //don't need barrier here (as folly documentation says)
+ return lock_.extra();
+ }
+
+ uint16_t current_thread() const {
+ return uint16_t(omp_get_thread_num());
+ }
+
+ void Lock() {
+ lock_.lock();
+ lock_.setExtra(current_thread());
+ }
+
+ void Unlock() {
+ lock_.setExtra(MAX_THREAD_CNT);
+ lock_.unlock();
+ }
+
+public:
+ ReEnteringLock(LockT &lock) :
+ lock_(lock),
+ reentered_(false) {
+ if (locking_thread() == current_thread()) {
+ reentered_ = true;
+ } else {
+ Lock();
+ }
+ }
+
+ ~ReEnteringLock() {
+ if (!reentered_) {
+ Unlock();
+ }
+ }
+};
+
+/**
+* Lock that uses a pure pointer as its target.
+* Be careful NOT to pass a COPY of the pure pointer you want to use as the locked object!
+*/
+template<class PurePtrT>
+class PurePtrLock {
+ ReEnteringLock<typename PurePtrT::lock_pointer_type> inner_lock_;
+
+public:
+ PurePtrLock(PurePtrT &pure_ptr) :
+ inner_lock_(pure_ptr.ptr_) {
+ }
+
+};
+
+/**
+* Way to "mark" pure pointer without using additional memory.
+* Marking/unmarking operations are atomic
+* Be careful NOT to pass a COPY of pure ptr you want to mark!
+* Do not use with PurePtrLocks, they use the same space for storing data...
+*/
+template<class PurePtrT>
+class PurePtrMarker {
+ typedef typename PurePtrT::lock_pointer_type LockWithData;
+
+ void ChangeMark(PurePtrT &pure_ptr, uint16_t new_mark) const {
+ LockWithData &lock_with_data = pure_ptr.ptr_;
+ lock_with_data.lock();
+ lock_with_data.setExtra(new_mark);
+ lock_with_data.unlock();
+ }
+
+public:
+
+ void mark(PurePtrT &pure_ptr) const {
+ ChangeMark(pure_ptr, 0);
+ }
+
+ void unmark(PurePtrT &pure_ptr) const {
+ ChangeMark(pure_ptr, MAX_THREAD_CNT);
+ }
+
+ bool is_marked(const PurePtrT &pure_ptr) const {
+ uint16_t curr_mark = pure_ptr.ptr_.extra();
+ VERIFY(curr_mark == 0 || curr_mark == MAX_THREAD_CNT);
+ return curr_mark == 0;
+ }
+
+};
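+
+// Illustrative sketch (editorial): the marker stores the flag in the PackedSyncPtr "extra"
+// field of that particular pure_pointer object, so mark() and is_marked() must be applied
+// to the same object; marking a copy has no effect on the original.
+//
+//   PurePtrMarker<EdgeId> marker;        // EdgeId being a pure_pointer<...> typedef
+//   marker.mark(edge);
+//   if (marker.is_marked(edge)) { /* ... */ }
+//   marker.unmark(edge);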
+
+//template<class T>
+//struct Comparator
+//{
+// typedef pure_pointer<T> pointer_type_t;
+//
+// bool operator()(pointer_type_t const& a, pointer_type_t const& b) const {
+// return a.get() < b.get();
+// }
+//};
+
+template<class T>
+struct Hash {
+ typedef pure_pointer<T> pointer_type_t;
+ std::hash<T *> inner_hash_;
+
+ size_t operator()(pointer_type_t const &a) const {
+ return inner_hash_(a.get());
+ }
+};
+
+template<class It>
+struct iterator_wrapper {
+ typedef typename It::value_type value_type;
+ typedef typename It::difference_type difference_type;
+ typedef typename It::reference reference;
+ typedef typename It::pointer pointer;
+
+ explicit iterator_wrapper(It it) : it_(it) { }
+
+ reference operator*() const { return it_.operator*(); }
+
+ pointer operator->() const { return it_.operator->(); }
+
+ bool operator==(const iterator_wrapper &rhs) const { return it_ == rhs.it_; }
+
+ bool operator!=(const iterator_wrapper &rhs) const { return it_ != rhs.it_; }
+
+private:
+ It it_;
+};
+
+template<class T>
+struct set {
+ typedef Hash<typename T::type> hash_t;
+ typedef std::unordered_set<T, hash_t> base_set_t;
+ typedef typename base_set_t::value_type value_type;
+
+ typedef iterator_wrapper<typename base_set_t::iterator> iterator;
+ typedef iterator_wrapper<typename base_set_t::const_iterator> const_iterator;
+
+public:
+ set() : base_set_(10, hash_t()) {
+ }
+
+ template<class It>
+ set(It begin, It end) : base_set_(begin, end, 10, hash_t()) {
+ }
+
+ const_iterator begin() const { return const_iterator(base_set_.begin()); }
+
+ const_iterator end() const { return const_iterator(base_set_.end()); }
+
+ iterator begin() { return iterator(base_set_.begin()); }
+
+ iterator end() { return iterator(base_set_.end()); }
+
+ const_iterator find(const T &key) const { return const_iterator(base_set_.find(key)); }
+
+ iterator find(const T &key) { return iterator(base_set_.find(key)); }
+
+ size_t count(T const &item) const { return base_set_.count(item); }
+
+ std::pair<iterator, bool> insert(value_type const &item) {
+ const std::pair<iterator, bool> &ret = base_set_.insert(item);
+ return make_pair(iterator(ret.first), ret.second);
+ }
+
+ template<class It>
+ void insert(It first, It last) { base_set_.insert(first, last); }
+
+ size_t erase(const T &x) { return base_set_.erase(x); }
+
+ void clear() { base_set_.clear(); }
+
+ size_t size() const { return base_set_.size(); }
+
+ bool operator==(const set &rhs) const {
+ if (this->size() != rhs.size())
+ return false;
+
+ for (auto i = base_set_.begin(), j = rhs.base_set_.begin();
+ i != base_set_.end() && j != rhs.base_set_.end();
+ ++i, ++j) {
+ if (*i != *j)
+ return false;
+ }
+
+ return true;
+ }
+
+ bool operator!=(const set &rhs) const {
+ return !(*this == rhs);
+ }
+
+ template<class Comparator>
+ void Copy(std::set<T, Comparator> &container) const {
+ container.insert(base_set_.begin(), base_set_.end());
+ }
+
+private:
+ base_set_t base_set_;
+};
+
+
+template<class Key, class Value>
+struct map {
+ typedef Hash<typename Key::type> hash_t;
+ typedef std::unordered_map<Key, Value, hash_t> base_map_t;
+ typedef typename base_map_t::value_type value_type;
+
+ typedef iterator_wrapper<typename base_map_t::iterator> iterator;
+ typedef iterator_wrapper<typename base_map_t::const_iterator> const_iterator;
+
+public:
+ map()
+ : base_map_(10, hash_t()) {
+ }
+
+ template<class It>
+ map(It begin, It end)
+ : base_map_(begin, end, 10, hash_t()) {
+ }
+
+ const_iterator begin() const { return const_iterator(base_map_.begin()); }
+
+ const_iterator end() const { return const_iterator(base_map_.end()); }
+
+ iterator begin() { return iterator(base_map_.begin()); }
+
+ iterator end() { return iterator(base_map_.end()); }
+
+ const_iterator find(const Key &key) const {
+ return const_iterator(base_map_.find(key));
+ }
+
+ iterator find(const Key &key) { return iterator(base_map_.find(key)); }
+
+ size_t count(Key const &item) const { return base_map_.count(item); }
+
+ Value &operator[](Key const &x) { return base_map_[x]; }
+
+ std::pair<iterator, bool> insert(value_type const &value) {
+ std::pair<iterator, bool> ret = base_map_.insert(value);
+ return make_pair(iterator(ret.first), ret.second);
+ }
+
+ template<class It>
+ void insert(It first, It last) { base_map_.insert(first, last); }
+
+ size_t erase(Key const &x) { return base_map_.erase(x); }
+
+ void clear() { base_map_.clear(); }
+
+ size_t size() const { return base_map_.size(); }
+
+ bool operator==(const map &rhs) const {
+ if (size() != rhs.size())
+ return false;
+
+ for (auto i = base_map_.begin(), j = rhs.base_map_.begin();
+ i != base_map_.end() && j != rhs.base_map_.end();
+ ++i, ++j) {
+ if (*i != *j)
+ return false;
+ }
+
+ return true;
+ }
+
+ bool operator!=(const map &rhs) const {
+ return !(*this == rhs);
+ }
+
+ template<class Comparator>
+ void Copy(std::map<Key, Value, Comparator> &container) const {
+ container.insert(base_map_.begin(), base_map_.end());
+ }
+
+private:
+ base_map_t base_map_;
+};
+
+template<class T>
+std::ostream &operator<<(std::ostream &stream, const pure_pointer<T> &pointer) {
+ stream << pointer.int_id();
+ return stream;
+}
+
+} // namespace restricted
+
+namespace std {
+template<class T>
+struct hash<restricted::pure_pointer<T>> {
+ size_t operator()(const restricted::pure_pointer<T> &pointer) const {
+ return pointer.hash();
+ }
+};
+}
+
+template<class T, class Comparator>
+class PairComparator {
+private:
+ Comparator comparator_;
+public:
+ PairComparator(Comparator comparator) : comparator_(comparator) {
+ }
+
+ bool operator()(std::pair<T, T> a, std::pair<T, T> b) const {
+ return a.first == b.first ? comparator_(a.second, b.second) : comparator_(a.first, b.first);
+ }
+};
+
+//
+//template<typename T, class Comparator>
+//class MixedComparator {
+//private:
+// Comparator c1_;
+// Comparator c2_;
+//public:
+// MixedComparator(const Comparator &c1, const Comparator &c2) : c1_(c1), c2_(c2) {
+// }
+//
+// bool operator()(const T &a, const T &b) const {
+// if(c1_.IsAFAKE(a) || c1_.IsAFAKE(b)) {
+// if(c1_.IsAFAKEMin(a))
+// return !c1_.IsAFAKEMin(b);
+// if(c1_.IsAFAKEMax(b))
+// return c1_.IsAFAKEMax(a);
+// return false;
+// }
+// if(c1_.IsValidId(a) && c1_.IsValidId(b))
+// return c1_(a, b);
+// if(c1_.IsValidId(a))
+// return true;
+// if(c1_.IsValidId(b))
+// return false;
+// if(c2_.IsValidId(a) && c2_.IsValidId(b)) {
+// return c2_(a, b);
+// }
+// VERIFY(false);
+// return false;
+// }
+//
+// bool IsValidId(T element) {
+// return c1_.IsValid(element) || c2_.IsValid(element);
+// }
+//};
+
+template<class Container, class Comparator>
+class ContainerComparator {
+private:
+ Comparator comparator_;
+public:
+ ContainerComparator(const Comparator &comparator) : comparator_(comparator) {
+ }
+
+ bool operator()(const Container &a, const Container &b) const {
+ for (auto ita = a.begin(), itb = b.begin(); ita != a.end() && itb != b.end(); ++ita, ++itb) {
+ if (*ita != *itb)
+ return comparator_(*ita, *itb);
+ }
+ if (a.size() < b.size()) {
+ return true;
+ }
+ return false;
+ }
+
+};
+
diff --git a/src/modules/assembly_graph/graph_support/basic_edge_conditions.hpp b/src/modules/assembly_graph/graph_support/basic_edge_conditions.hpp
new file mode 100644
index 0000000..68e3050
--- /dev/null
+++ b/src/modules/assembly_graph/graph_support/basic_edge_conditions.hpp
@@ -0,0 +1,272 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "dev_support/func.hpp"
+#include "math/pred.hpp"
+#include "assembly_graph/graph_core/basic_graph_stats.hpp"
+#include "assembly_graph/graph_core/directions.hpp"
+#include "assembly_graph/paths/path_finders.hpp"
+
+namespace omnigraph {
+
+using namespace func;
+
+template<class Graph>
+class EdgeCondition : public Predicate<typename Graph::EdgeId> {
+ typedef typename Graph::EdgeId EdgeId;
+
+ const Graph &g_;
+protected:
+
+ EdgeCondition(const Graph &g)
+ : g_(g) {
+ }
+
+ const Graph &g() const {
+ return g_;
+ }
+
+};
+
+template<class Graph>
+class IsolatedEdgeCondition : public EdgeCondition<Graph> {
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+ typedef EdgeCondition<Graph> base;
+
+ bool IsTerminalVertex(VertexId v) const {
+ return this->g().IncomingEdgeCount(v) + this->g().OutgoingEdgeCount(v) == 1;
+ }
+
+public:
+ IsolatedEdgeCondition(const Graph &g) : base(g) {
+ }
+
+ bool Check(EdgeId e) const {
+ return IsTerminalVertex(this->g().EdgeStart(e)) && IsTerminalVertex(this->g().EdgeEnd(e));
+ }
+
+};
+
+template<class Graph>
+inline bool HasAlternatives(const Graph &g, typename Graph::EdgeId e) {
+ return g.OutgoingEdgeCount(g.EdgeStart(e)) > 1
+ && g.IncomingEdgeCount(g.EdgeEnd(e)) > 1;
+}
+
+
+template<class Graph>
+class AlternativesPresenceCondition : public EdgeCondition<Graph> {
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+ typedef EdgeCondition<Graph> base;
+
+public:
+
+ AlternativesPresenceCondition(const Graph &g)
+ : base(g) {
+
+ }
+
+ bool Check(EdgeId e) const {
+ return HasAlternatives(this->g(), e);
+ }
+
+};
+
+template<class Graph>
+pred::TypedPredicate<typename Graph::EdgeId> AddAlternativesPresenceCondition(const Graph &g,
+ pred::TypedPredicate<typename Graph::EdgeId> condition) {
+ return pred::And(AlternativesPresenceCondition<Graph>(g), condition);
+}
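+
+// Sketch of how these conditions are meant to compose (editorial, not upstream code;
+// assumes EdgeCondition subclasses convert to pred::TypedPredicate as elsewhere in the
+// codebase): "short edge that has alternatives on both sides" becomes
+//
+//   auto condition = AddAlternativesPresenceCondition(g, LengthUpperBound<Graph>(g, max_len));
+//   bool removable = condition(e);   // e is a candidate EdgeId, max_len a length threshold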
+
+template<class Graph>
+class CoverageUpperBound : public EdgeCondition<Graph> {
+ typedef typename Graph::EdgeId EdgeId;
+ typedef EdgeCondition<Graph> base;
+ const double max_coverage_;
+
+public:
+
+ CoverageUpperBound(const Graph &g, double max_coverage)
+ : base(g),
+ max_coverage_(max_coverage) {
+ }
+
+ bool Check(EdgeId e) const {
+ return math::le(this->g().coverage(e), max_coverage_);
+ }
+
+};
+
+template<class Graph>
+class LengthUpperBound : public EdgeCondition<Graph> {
+ typedef typename Graph::EdgeId EdgeId;
+ typedef EdgeCondition<Graph> base;
+
+ const size_t max_length_;
+
+public:
+
+ LengthUpperBound(const Graph &g, size_t max_length)
+ : base(g),
+ max_length_(max_length) {
+ }
+
+ bool Check(EdgeId e) const {
+ return this->g().length(e) <= max_length_;
+ }
+
+};
+
+template<class Graph, class PathFinder>
+class PathLengthLowerBound : public EdgeCondition<Graph> {
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+ typedef EdgeCondition<Graph> base;
+
+ PathFinder path_finder_;
+ size_t min_length_;
+
+ ForwardDirection<Graph> forward_;
+ BackwardDirection<Graph> backward_;
+
+ size_t CumulativePathLength(EdgeId e, const AbstractDirection<Graph> &direction) const {
+ return CumulativeLength(this->g(), path_finder_(e, direction));
+ }
+
+public:
+ PathLengthLowerBound(const Graph &g, const PathFinder &path_finder,
+ size_t min_length)
+ : base(g),
+ path_finder_(path_finder),
+ min_length_(min_length),
+ forward_(g),
+ backward_(g) {
+
+ }
+
+ bool Check(EdgeId e) const {
+ size_t forward = CumulativePathLength(e, forward_);
+ size_t backward = CumulativePathLength(e, backward_);
+ //checking that path was trivial in one of directions
+ VERIFY(forward == this->g().length(e) || backward == this->g().length(e));
+ return std::max(forward, backward) >= min_length_;
+ }
+};
+
+template<class Graph, class PathFinder>
+PathLengthLowerBound<Graph, PathFinder>
+MakePathLengthLowerBound(const Graph &g, const PathFinder &path_finder, size_t min_length) {
+ return PathLengthLowerBound<Graph, PathFinder>(g, path_finder, min_length);
+}
+
+template<class Graph>
+class UniquenessPlausabilityCondition : public EdgeCondition<Graph> {
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+ typedef EdgeCondition<Graph> base;
+
+ virtual bool CheckUniqueness(EdgeId e, bool forward) const = 0;
+
+ virtual bool CheckPlausibility(EdgeId e, bool forward) const = 0;
+
+ bool SingleUnique(const vector<EdgeId> &edges, bool forward) const {
+ return edges.size() == 1 && CheckUniqueness(*edges.begin(), forward);
+ }
+
+ bool ExistPlausible(EdgeId init_e, const vector<EdgeId> &edges,
+ bool forward) const {
+ for (EdgeId e : edges) {
+ if (e == init_e)
+ continue;
+ if (CheckPlausibility(e, forward)) {
+ return true;
+ }
+ }
+ return false;
+ }
+
+ bool Check(EdgeId e, const AbstractDirection<Graph> &direction) const {
+ return SingleUnique(direction.IncomingEdges(direction.EdgeStart(e)),
+ !direction.IsForward())
+ && ExistPlausible(
+ e, direction.OutgoingEdges(direction.EdgeStart(e)),
+ direction.IsForward());
+ }
+
+public:
+
+ UniquenessPlausabilityCondition(const Graph &g)
+ : base(g) {
+
+ }
+
+ bool Check(EdgeId e) const {
+ return Check(e, ForwardDirection<Graph>(this->g()))
+ || Check(e, BackwardDirection<Graph>(this->g()));
+ }
+
+};
+
+template<class Graph>
+class PredicateUniquenessPlausabilityCondition :
+ public UniquenessPlausabilityCondition<Graph> {
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+ typedef pred::TypedPredicate<EdgeId> EdgePredicate;
+ typedef UniquenessPlausabilityCondition<Graph> base;
+
+ EdgePredicate uniqueness_condition_;
+ EdgePredicate plausiblity_condition_;
+
+ bool CheckUniqueness(EdgeId e, bool) const {
+ return uniqueness_condition_(e);
+ }
+
+ bool CheckPlausibility(EdgeId e, bool) const {
+ return plausiblity_condition_(e);
+ }
+
+public:
+
+ PredicateUniquenessPlausabilityCondition(
+ const Graph &g, EdgePredicate uniqueness_condition,
+ EdgePredicate plausiblity_condition)
+ : base(g),
+ uniqueness_condition_(uniqueness_condition),
+ plausiblity_condition_(plausiblity_condition) {
+ }
+
+};
+
+template<class Graph>
+class DefaultUniquenessPlausabilityCondition :
+ public PredicateUniquenessPlausabilityCondition<Graph> {
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+ typedef pred::TypedPredicate<EdgeId> EdgePredicate;
+ typedef PredicateUniquenessPlausabilityCondition<Graph> base;
+
+public:
+
+ DefaultUniquenessPlausabilityCondition(const Graph &g,
+ size_t uniqueness_length,
+ size_t plausibility_length)
+ : base(g,
+ MakePathLengthLowerBound(g,
+ UniquePathFinder<Graph>(g), uniqueness_length),
+ MakePathLengthLowerBound(g,
+ PlausiblePathFinder<Graph>(g, 2 * plausibility_length),
+ plausibility_length)) {
+ }
+
+};
+
+}
diff --git a/src/modules/assembly_graph/graph_support/basic_vertex_conditions.hpp b/src/modules/assembly_graph/graph_support/basic_vertex_conditions.hpp
new file mode 100644
index 0000000..2d9e05e
--- /dev/null
+++ b/src/modules/assembly_graph/graph_support/basic_vertex_conditions.hpp
@@ -0,0 +1,52 @@
+#pragma once
+#include "math/pred.hpp"
+#include "dev_support/func.hpp"
+
+namespace omnigraph {
+using func::Predicate;
+
+template<class Graph>
+class VertexCondition : public Predicate<typename Graph::VertexId> {
+ typedef typename Graph::VertexId VertexId;
+ const Graph &g_;
+protected:
+
+ VertexCondition(const Graph &g)
+ : g_(g) {
+ }
+
+ const Graph &g() const {
+ return g_;
+ }
+
+};
+
+template<class Graph>
+class CompressCondition : public VertexCondition<Graph> {
+ typedef typename Graph::VertexId VertexId;
+
+public:
+ CompressCondition(const Graph &g) :
+ VertexCondition<Graph>(g) {
+ }
+
+ bool Check(VertexId v) const override {
+ return this->g().CanCompressVertex(v);
+ }
+};
+
+template<class Graph>
+class IsolatedVertexCondition : public VertexCondition<Graph> {
+ typedef typename Graph::VertexId VertexId;
+
+public:
+ IsolatedVertexCondition(const Graph& g) :
+ VertexCondition<Graph>(g) {
+ }
+
+ bool Check(VertexId v) const override {
+ return this->g().IsDeadStart(v) && this->g().IsDeadEnd(v);
+ }
+};
+
+}
\ No newline at end of file
diff --git a/src/modules/assembly_graph/graph_support/chimera_stats.hpp b/src/modules/assembly_graph/graph_support/chimera_stats.hpp
new file mode 100644
index 0000000..86d19c3
--- /dev/null
+++ b/src/modules/assembly_graph/graph_support/chimera_stats.hpp
@@ -0,0 +1,266 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "assembly_graph/stats/statistics.hpp"
+#include "assembly_graph/graph_support/genomic_quality.hpp"
+
+namespace debruijn_graph {
+
+namespace stats {
+
+template<class Graph>
+class ChimericEdgeClassifier {
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+
+ const Graph& g_;
+ size_t length_bound_;
+ const EdgeQuality<Graph>& edge_qual_;
+ bool real_edges_mode_;
+
+ template<class EdgeContainer>
+ vector<EdgeId> FilterNotEqual(const EdgeContainer& edges,
+ EdgeId edge) const {
+ vector<EdgeId> answer;
+ for (EdgeId e : edges) {
+ if (e != edge) {
+ answer.push_back(e);
+ }
+ }
+ return answer;
+ }
+
+ bool TopologyAndQualCheck(const vector<EdgeId>& edges) const {
+ return edges.size() == 1 && edge_qual_.IsPositiveQuality(edges.front());
+ }
+
+ bool TopologyAndQualCheck(VertexId v, EdgeId e) const {
+ return TopologyAndQualCheck(
+ FilterNotEqual(g_.OutgoingEdges(v), e))
+ && TopologyAndQualCheck(
+ FilterNotEqual(g_.IncomingEdges(v), e));
+ }
+
+ bool TopologyAndQualCheck(EdgeId e) const {
+ return TopologyAndQualCheck(g_.EdgeStart(e), e)
+ && TopologyAndQualCheck(g_.EdgeEnd(e), e);
+ }
+
+public:
+ ChimericEdgeClassifier(const Graph& g, size_t length_bound, const EdgeQuality<Graph>& edge_qual, bool real_edges_mode = false)
+ : g_(g),
+ length_bound_(length_bound),
+ edge_qual_(edge_qual),
+ real_edges_mode_(real_edges_mode) {
+ }
+
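+ //An edge is considered a trivial chimera if its quality matches the expected mode (zero by default,
+ //positive in real_edges_mode), it is not longer than length_bound, and at each of its end vertices
+ //there is exactly one alternative incoming and one alternative outgoing edge, both of positive quality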
+ bool IsTrivialChimeric(EdgeId e) const {
+ bool correct_qual = real_edges_mode_ ? edge_qual_.IsPositiveQuality(e) : edge_qual_.IsZeroQuality(e);
+ return correct_qual && g_.length(e) <= length_bound_
+ && TopologyAndQualCheck(e);
+ }
+
+private:
+ DECL_LOGGER("ChimericEdgeClassifier");
+};
+
+template<class Graph>
+class InterstrandAnalyzer {
+ const static size_t infinity = -1u;
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+
+ const Graph& g_;
+ size_t dist_bound_;
+ const MappingPath<EdgeId> genome_path_;
+
+ bool Relax(size_t& a, size_t b) const {
+ if (b < a) {
+ a = b;
+ return true;
+ }
+ return false;
+ }
+
+ size_t GenomicDistance(size_t genome_path_pos, EdgeId e2,
+ size_t distance_bound) const {
+ for (size_t i = genome_path_pos + 1; i < genome_path_.size(); ++i) {
+ int gap =
+ (int)(genome_path_[i].second.initial_range.start_pos
+ - genome_path_[genome_path_pos].second.initial_range.end_pos);
+ VERIFY(gap >= 0);
+ if (size_t(gap) > distance_bound)
+ return infinity;
+ if (genome_path_[i].first == e2)
+ return gap;
+ }
+ return infinity;
+ }
+
+ size_t ShortestGenomicDistance(EdgeId e1, EdgeId e2,
+ size_t distance_bound) const {
+ size_t best = infinity;
+ for (size_t i = 0; i < genome_path_.size(); ++i) {
+ if (genome_path_[i].first == e1) {
+ Relax(best, GenomicDistance(i, e2, distance_bound));
+ }
+ }
+ return best;
+ }
+
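+ //Shortest genomic distance between the unique incoming edge of e and the conjugate of its unique
+ //outgoing edge (both orderings are tried), plus the length of the outgoing edge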
+ size_t InnerInterstrandDistance(EdgeId e) const {
+ size_t answer = infinity;
+ EdgeId e1 = g_.GetUniqueIncomingEdge(g_.EdgeStart(e));
+ EdgeId e2 = g_.GetUniqueOutgoingEdge(g_.EdgeEnd(e));
+ if (g_.length(e2) > dist_bound_)
+ return -1;
+ Relax(answer,
+ ShortestGenomicDistance(e1, g_.conjugate(e2),
+ dist_bound_ - g_.length(e2)));
+ Relax(answer,
+ ShortestGenomicDistance(e2, g_.conjugate(e1),
+ dist_bound_ - g_.length(e2)));
+ return answer + g_.length(e2);
+ }
+
+
+public:
+ InterstrandAnalyzer(const Graph& g, size_t dist_bound, const MappingPath<EdgeId> genome_path)
+ : g_(g),
+ dist_bound_(dist_bound),
+ genome_path_(genome_path) {
+ }
+
+ //todo rewrite; consider additionally detecting thorns with no connecting path
+ //Returns -1u if there is no interstrand path or if the interstrand distance exceeds dist_bound
+ size_t InterstrandDistance(EdgeId e) const {
+ size_t answer = infinity;
+ Relax(answer, InnerInterstrandDistance(e));
+ Relax(answer, InnerInterstrandDistance(g_.conjugate(e)));
+ //todo maybe unnecessary check
+ return answer <= dist_bound_ ? answer : -1u;
+ }
+
+private:
+ DECL_LOGGER("InterstrandAnalyzer");
+};
+
+template<class Graph>
+class ChimericEdgeStats {
+ const static size_t infinity = -1u;
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+
+ const Graph& g_;
+ const ChimericEdgeClassifier<Graph>& chimeric_edge_classifier_;
+ const InterstrandAnalyzer<Graph>& interstrand_analyzer_;
+ ostream& out_;
+
+protected:
+ virtual string Head() {
+ std::stringstream ss;
+ ss << "int_id\t"
+ << "length\t"
+ << "coverage\t"
+ << "interstrand_dist"
+ << endl;
+ return ss.str();
+ }
+
+ virtual string ReportChimera(EdgeId e, size_t interstrand_dist) {
+ std::stringstream ss;
+ ss << g_.int_id(e) << "\t"
+ << g_.length(e) << "\t"
+ << g_.coverage(e) << "\t";
+ if (interstrand_dist < infinity) {
+ ss << interstrand_dist;
+ } else {
+ ss << -1;
+ }
+ ss << endl;
+ return ss.str();
+ }
+
+ const Graph& g() const {
+ return g_;
+ }
+
+public:
+ ChimericEdgeStats(const Graph& g,
+ const ChimericEdgeClassifier<Graph>& chimeric_edge_classifier,
+ const InterstrandAnalyzer<Graph>& interstrand_analyzer,
+ ostream& out)
+ : g_(g),
+ chimeric_edge_classifier_(chimeric_edge_classifier),
+ interstrand_analyzer_(interstrand_analyzer),
+ out_(out) {
+ }
+
+ virtual ~ChimericEdgeStats() {
+ }
+
+ void operator()() {
+ out_ << Head() << endl;
+ set<EdgeId> visited;
+ for (auto it = g_.SmartEdgeBegin(); !it.IsEnd(); ++it) {
+ if (visited.count(*it) > 0)
+ continue;
+ visited.insert(*it);
+ visited.insert(g_.conjugate(*it));
+ if (chimeric_edge_classifier_.IsTrivialChimeric(*it)) {
+ out_ << ReportChimera(*it, interstrand_analyzer_.InterstrandDistance(*it)) << endl;
+ }
+ }
+ }
+};
+
+template<class Graph>
+class ChimeraRelativeCoverageStats : public ChimericEdgeStats<Graph> {
+ typedef ChimericEdgeStats<Graph> base;
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+ typedef std::function<double(EdgeId, VertexId)> LocalCoverageFT;
+
+ simplification::relative_coverage::RelativeCoverageHelper<Graph> rel_helper_;
+
+ double RelativeCoverage(VertexId v, EdgeId base_edge) {
+ return rel_helper_.RelativeCoverageToReport(v, rel_helper_.LocalCoverage(base_edge, v));
+ }
+
+public:
+ ChimeraRelativeCoverageStats(const Graph& g,
+ const ChimericEdgeClassifier<Graph>& edge_classifier,
+ const InterstrandAnalyzer<Graph>& interstrand_analyzer,
+ LocalCoverageFT local_coverage_f,
+ ostream& out)
+ : base(g, edge_classifier, interstrand_analyzer, out),
+ rel_helper_(g, local_coverage_f, 2.0/*any value works here*/) {
+ }
+
+protected:
+ virtual string Head() {
+ return base::Head() + "\tmin_rel_cov\tmax_rel_cov";
+ }
+
+ virtual string ReportChimera(EdgeId e, size_t interstrand_dist) {
+ double start_cov = RelativeCoverage(this->g().EdgeStart(e), e);
+ double end_cov = RelativeCoverage(this->g().EdgeEnd(e), e);
+ stringstream ss;
+ ss << base::ReportChimera(e, interstrand_dist) << "\t"
+ << std::min(start_cov, end_cov) << "\t"
+ << std::max(start_cov, end_cov);
+ return ss.str();
+ }
+
+private:
+ DECL_LOGGER("ChimeraRelativeCoverageStats");
+};
+
+}
+}
diff --git a/src/modules/assembly_graph/graph_support/comparators.hpp b/src/modules/assembly_graph/graph_support/comparators.hpp
new file mode 100644
index 0000000..2f493f8
--- /dev/null
+++ b/src/modules/assembly_graph/graph_support/comparators.hpp
@@ -0,0 +1,62 @@
+#pragma once
+namespace omnigraph {
+
+template<class Graph>
+struct CoverageComparator {
+private:
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+ const Graph &graph_;
+public:
+ CoverageComparator(const Graph &graph)
+ : graph_(graph) {
+ }
+
+ /**
+ * Standard comparator function as used in collections.
+ */
+ bool operator()(EdgeId edge1, EdgeId edge2) const {
+ if (math::eq(graph_.coverage(edge1), graph_.coverage(edge2))) {
+ return edge1 < edge2;
+ }
+ return math::ls(graph_.coverage(edge1), graph_.coverage(edge2));
+ }
+};
+
+/**
+ * This class defines which edge is more likely to be a tip. Here we simply assume that shorter edges
+ * are more likely to be tips than longer ones.
+ */
+ template<class Graph>
+ struct LengthComparator {
+ private:
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+ const Graph &graph_;
+ public:
+ /**
+ * This comparator should never be created with the default constructor, but one is necessary
+ * in order for the code to compile.
+ */
+ // TipComparator() {
+ // VERIFY(false);
+ // }
+ /**
+ * Constructs a LengthComparator for the given graph.
+ * @param graph graph for which the comparator is created
+ */
+ LengthComparator(const Graph &graph)
+ : graph_(graph) {
+ }
+
+ /**
+ * Standard comparator function as used in collections.
+ */
+ bool operator()(EdgeId edge1, EdgeId edge2) const {
+ if (graph_.length(edge1) == graph_.length(edge2)) {
+ return edge1 < edge2;
+ }
+ return graph_.length(edge1) < graph_.length(edge2);
+ }
+ };
+}
\ No newline at end of file
diff --git a/src/modules/assembly_graph/graph_support/contig_output.hpp b/src/modules/assembly_graph/graph_support/contig_output.hpp
new file mode 100644
index 0000000..a67166f
--- /dev/null
+++ b/src/modules/assembly_graph/graph_support/contig_output.hpp
@@ -0,0 +1,421 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "assembly_graph/stats/picture_dump.hpp"
+#include <io/reads_io/osequencestream.hpp>
+#include "assembly_graph/components/connected_component.hpp"
+#include "assembly_graph/stats/statistics.hpp"
+#include "assembly_graph/paths/path_finders.hpp"
+#include "assembly_graph/paths/path_utils.hpp"
+
+namespace debruijn_graph {
+
+//This class corrects mismatches, masks repeat differences, and applies similar corrections to the sequence of an edge
+template<class Graph>
+class ContigCorrector {
+private:
+ typedef typename Graph::EdgeId EdgeId;
+ const Graph &graph_;
+protected:
+ const Graph &graph() const {
+ return graph_;
+ }
+
+public:
+ ContigCorrector(const Graph &graph) : graph_(graph) {
+ }
+
+ virtual string correct(EdgeId e) = 0;
+
+ virtual ~ContigCorrector() {
+ }
+};
+
+template<class Graph>
+class DefaultContigCorrector : public ContigCorrector<Graph> {
+private:
+ typedef typename Graph::EdgeId EdgeId;
+public:
+ DefaultContigCorrector(const Graph &graph) : ContigCorrector<Graph>(graph) {
+ }
+
+ string correct(EdgeId e) {
+ return this->graph().EdgeNucls(e).str();
+ }
+};
+
+//This class uses corrected sequences to construct a contig (return it as is, find a unipath, or trim the contig)
+template<class Graph>
+class ContigConstructor {
+private:
+ typedef typename Graph::EdgeId EdgeId;
+ const Graph &graph_;
+ ContigCorrector<Graph> &corrector_;
+protected:
+ string correct(EdgeId e) {
+ return corrector_.correct(e);
+ }
+
+ const Graph &graph() const {
+ return graph_;
+ }
+
+public:
+
+ ContigConstructor(const Graph &graph, ContigCorrector<Graph> &corrector) : graph_(graph), corrector_(corrector) {
+ }
+
+ virtual pair<string, double> construct(EdgeId e) = 0;
+
+ virtual ~ContigConstructor(){
+ }
+};
+
+template<class Graph>
+class DefaultContigConstructor : public ContigConstructor<Graph> {
+private:
+ typedef typename Graph::EdgeId EdgeId;
+public:
+
+ DefaultContigConstructor(const Graph &graph, ContigCorrector<Graph> &corrector) : ContigConstructor<Graph>(graph, corrector) {
+ }
+
+ pair<string, double> construct(EdgeId e) {
+ return make_pair(this->correct(e), this->graph().coverage(e));
+ }
+};
+
+template<class Graph>
+vector<typename Graph::EdgeId> Unipath(const Graph& g, typename Graph::EdgeId e) {
+ omnigraph::UniquePathFinder<Graph> unipath_finder(g);
+ vector<typename Graph::EdgeId> answer = unipath_finder.UniquePathBackward(e);
+ const vector<typename Graph::EdgeId>& forward = unipath_finder.UniquePathForward(e);
+ for (size_t i = 1; i < forward.size(); ++i) {
+ answer.push_back(forward[i]);
+ }
+ return answer;
+}
+
+template<class Graph>
+class UnipathConstructor : public ContigConstructor<Graph> {
+private:
+ typedef typename Graph::EdgeId EdgeId;
+
+
+
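+ //Concatenates sequences that pairwise overlap by 'overlap' characters: the common prefix is written
+ //once and each sequence then contributes everything after its first 'overlap' characters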
+ string MergeOverlappingSequences(std::vector<string>& ss, size_t overlap) {
+ if (ss.empty()) {
+ return "";
+ }
+ stringstream result;
+ result << ss.front().substr(0, overlap);
+// prev_end = ss.front().substr(0, overlap);
+ for (auto it = ss.begin(); it != ss.end(); ++it) {
+// VERIFY(prev_end == it->substr(0, overlap));
+ result << it->substr(overlap);
+// prev_end = it->substr(it->size() - overlap);
+ }
+ return result.str();
+ }
+
+
+ string MergeSequences(const Graph& g,
+ const vector<typename Graph::EdgeId>& continuous_path) {
+ vector<string> path_sequences;
+ for (size_t i = 0; i < continuous_path.size(); ++i) {
+ if(i > 0)
+ VERIFY(
+ g.EdgeEnd(continuous_path[i - 1])
+ == g.EdgeStart(continuous_path[i]));
+ path_sequences.push_back(this->correct(continuous_path[i]));
+ }
+ return MergeOverlappingSequences(path_sequences, g.k());
+ }
+
+public:
+
+ UnipathConstructor(const Graph &graph, ContigCorrector<Graph> &corrector) : ContigConstructor<Graph>(graph, corrector) {
+ }
+
+ pair<string, double> construct(EdgeId e) {
+ vector<EdgeId> unipath = Unipath(this->graph(), e);
+ return make_pair(MergeSequences(this->graph(), unipath), stats::AvgCoverage(this->graph(), unipath));
+ }
+};
+
+template<class Graph>
+class CuttingContigConstructor : public ContigConstructor<Graph> {
+private:
+ typedef typename Graph::EdgeId EdgeId;
+
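+ //A contig end is cut at vertex v if all outgoing edges of v continue with the same nucleotide
+ //while at least two incoming edges end with different nucleotides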
+ bool ShouldCut(VertexId v) const {
+ const Graph &g = this->graph();
+ vector<EdgeId> edges;
+ push_back_all(edges, g.OutgoingEdges(v));
+ if(edges.size() == 0)
+ return false;
+ for(size_t i = 1; i < edges.size(); i++) {
+ if(g.EdgeNucls(edges[i])[g.k()] != g.EdgeNucls(edges[0])[g.k()])
+ return false;
+ }
+ edges.clear();
+ push_back_all(edges, g.IncomingEdges(v));
+ for(size_t i = 0; i < edges.size(); i++)
+ for(size_t j = i + 1; j < edges.size(); j++) {
+ if(g.EdgeNucls(edges[i])[g.length(edges[i]) - 1] != g.EdgeNucls(edges[j])[g.length(edges[j]) - 1])
+ return true;
+ }
+ return false;
+ }
+
+public:
+
+ CuttingContigConstructor(const Graph &graph, ContigCorrector<Graph> &corrector) : ContigConstructor<Graph>(graph, corrector) {
+ }
+
+ pair<string, double> construct(EdgeId e) {
+ string result = this->correct(e);
+ if(result.size() > this->graph().k() && ShouldCut(this->graph().EdgeEnd(e))) {
+ result = result.substr(0, result.size() - this->graph().k());
+ }
+ if(result.size() > this->graph().k() && ShouldCut(this->graph().conjugate(this->graph().EdgeStart(e)))) {
+ result = result.substr(this->graph().k(), result.size());
+ }
+ return make_pair(result, this->graph().coverage(e));
+ }
+};
+
+struct ExtendedContigIdT {
+ string full_id_;
+ string short_id_;
+
+ ExtendedContigIdT(): full_id_(""), short_id_("") {}
+
+ ExtendedContigIdT(string full_id, string short_id): full_id_(full_id), short_id_(short_id) {}
+};
+
+template <class Graph>
+void MakeContigIdMap(const Graph& graph, map<EdgeId, ExtendedContigIdT>& ids, const ConnectedComponentCounter &cc_counter_, string prefix) {
+ int counter = 0;
+ for (auto it = graph.ConstEdgeBegin(); !it.IsEnd(); ++it) {
+ EdgeId e = *it;
+ if (ids.count(e) == 0) {
+ string id;
+ if (cfg::get().pd) {
+ size_t c_id = cc_counter_.GetComponent(e);
+ id = io::MakeContigComponentId(++counter, graph.length(e) + graph.k(), graph.coverage(e), c_id, prefix);
+ }
+ else
+ id = io::MakeContigId(++counter, graph.length(e) + graph.k(), graph.coverage(e), prefix);
+ ids[e] = ExtendedContigIdT(id, ToString(counter) + "+");
+ if (e != graph.conjugate(e))
+ ids[graph.conjugate(e)] = ExtendedContigIdT(id + "'", ToString(counter) + "-");
+ }
+ }
+}
+
+template<class Graph>
+class ContigPrinter {
+private:
+ const Graph &graph_;
+ ContigConstructor<Graph> &constructor_;
+ template<class sequence_stream>
+ void ReportEdge(sequence_stream& oss
+ , const pair<string, double> sequence_data) {
+ oss << sequence_data.second;
+ oss << sequence_data.first;
+ }
+
+ void ReportEdge(io::osequencestream_for_fastg& oss,
+ const string& sequence,
+ const string& id,
+ const set<string>& nex_ids) {
+ oss.set_header(id);
+ oss << nex_ids;
+ oss << sequence;
+ }
+
+public:
+ ContigPrinter(const Graph &graph, ContigConstructor<Graph> &constructor) : graph_(graph), constructor_(constructor) {
+ }
+
+ template<class sequence_stream>
+ void PrintContigs(sequence_stream &os) {
+ for (auto it = graph_.ConstEdgeBegin(true); !it.IsEnd(); ++it) {
+ ReportEdge<sequence_stream>(os, constructor_.construct(*it));
+ }
+ }
+
+ template<class sequence_stream>
+ void PrintContigsFASTG(sequence_stream &os, const ConnectedComponentCounter & cc_counter) {
+ map<EdgeId, ExtendedContigIdT> ids;
+ MakeContigIdMap(graph_, ids, cc_counter, "EDGE");
+ for (auto it = graph_.SmartEdgeBegin(); !it.IsEnd(); ++it) {
+ set<string> next;
+ VertexId v = graph_.EdgeEnd(*it);
+ auto edges = graph_.OutgoingEdges(v);
+ for (auto next_it = edges.begin(); next_it != edges.end(); ++next_it) {
+ next.insert(ids[*next_it].full_id_);
+ }
+ ReportEdge(os, constructor_.construct(*it).first, ids[*it].full_id_, next);
+ //FASTG always needs both sets of edges
+ //it.HandleDelete(graph_.conjugate(*it));
+ }
+ }
+};
+
+template<class Graph>
+bool PossibleECSimpleCheck(const Graph& g
+ , typename Graph::EdgeId e) {
+ return g.OutgoingEdgeCount(g.EdgeStart(e)) > 1 && g.IncomingEdgeCount(g.EdgeEnd(e)) > 1;
+}
+
+template<class Graph>
+void ReportEdge(io::osequencestream_cov& oss
+ , const Graph& g
+ , typename Graph::EdgeId e
+ , bool output_unipath = false
+ , size_t solid_edge_length_bound = 0) {
+ typedef typename Graph::EdgeId EdgeId;
+ if (!output_unipath || (PossibleECSimpleCheck(g, e) && g.length(e) <= solid_edge_length_bound)) {
+ TRACE("Outputting edge " << g.str(e) << " as single edge");
+ oss << g.coverage(e);
+ oss << g.EdgeNucls(e);
+ } else {
+ TRACE("Outputting edge " << g.str(e) << " as part of unipath");
+ vector<EdgeId> unipath = Unipath(g, e);
+ TRACE("Unipath is " << g.str(unipath));
+ oss << stats::AvgCoverage(g, unipath);
+ TRACE("Merged sequence is of length " << MergeSequences(g, unipath).size());
+ oss << MergeSequences(g, unipath);
+ }
+}
+
+inline void OutputContigs(ConjugateDeBruijnGraph& g,
+ const string& contigs_output_filename,
+ bool output_unipath,
+ size_t ,
+ bool /*cut_bad_connections*/) {
+ INFO("Outputting contigs to " << contigs_output_filename << ".fasta");
+ DefaultContigCorrector<ConjugateDeBruijnGraph> corrector(g);
+ io::osequencestream_cov oss(contigs_output_filename + ".fasta");
+
+ if(!output_unipath) {
+ DefaultContigConstructor<ConjugateDeBruijnGraph> constructor(g, corrector);
+
+ ContigPrinter<ConjugateDeBruijnGraph>(g, constructor).PrintContigs(oss);
+ } else {
+ UnipathConstructor<ConjugateDeBruijnGraph> constructor(g, corrector);
+ ContigPrinter<ConjugateDeBruijnGraph>(g, constructor).PrintContigs(oss);
+ }
+
+// {
+// osequencestream_cov oss(contigs_output_filename);
+// set<ConjugateDeBruijnGraph::EdgeId> edges;
+// for (auto it = g.SmartEdgeBegin(); !it.IsEnd(); ++it) {
+// if (edges.count(*it) == 0) {
+// ReportEdge(oss, g, *it, output_unipath, solid_edge_length_bound + ".oppa.fasta");
+// edges.insert(g.conjugate(*it));
+// }
+// // oss << g.EdgeNucls(*it);
+// }
+// DEBUG("Contigs written");
+// }
+// if(!output_unipath) {
+// OutputContigs(g, contigs_output_filename + ".2.fasta", true, solid_edge_length_bound);
+// }
+}
+
+inline void OutputContigsToFASTG(ConjugateDeBruijnGraph& g,
+ const string& contigs_output_filename, const ConnectedComponentCounter & cc_counter) {
+
+ INFO("Outputting graph to " << contigs_output_filename << ".fastg");
+ DefaultContigCorrector<ConjugateDeBruijnGraph> corrector(g);
+ DefaultContigConstructor<ConjugateDeBruijnGraph> constructor(g, corrector);
+ io::osequencestream_for_fastg ossfg(contigs_output_filename + ".fastg");
+ ContigPrinter<ConjugateDeBruijnGraph>(g, constructor).PrintContigsFASTG(ossfg, cc_counter);
+}
+
+
+
+
+inline bool ShouldCut(ConjugateDeBruijnGraph& g, VertexId v) {
+ vector<EdgeId> edges;
+ push_back_all(edges, g.OutgoingEdges(v));
+
+ if(edges.size() == 0)
+ return false;
+ for(size_t i = 1; i < edges.size(); i++) {
+ if(g.EdgeNucls(edges[i])[g.k()] != g.EdgeNucls(edges[0])[g.k()])
+ return false;
+ }
+ edges.clear();
+ push_back_all(edges, g.IncomingEdges(v));
+ for(size_t i = 0; i < edges.size(); i++)
+ for(size_t j = i + 1; j < edges.size(); j++) {
+ if(g.EdgeNucls(edges[i])[g.length(edges[i]) - 1] != g.EdgeNucls(edges[j])[g.length(edges[j]) - 1])
+ return true;
+ }
+ return false;
+}
+
+inline void OutputCutContigs(ConjugateDeBruijnGraph& g,
+ const string& contigs_output_filename,
+ bool /*output_unipath*/ = false,
+ size_t /*solid_edge_length_bound*/ = 0) {
+ INFO("Outputting contigs to " << contigs_output_filename);
+ DefaultContigCorrector<ConjugateDeBruijnGraph> corrector(g);
+ io::osequencestream_cov oss(contigs_output_filename);
+ CuttingContigConstructor<ConjugateDeBruijnGraph> constructor(g, corrector);
+
+// osequencestream_cov oss(contigs_output_filename);
+// set<ConjugateDeBruijnGraph::EdgeId> edges;
+// for (auto it = g.SmartEdgeBegin(); !it.IsEnd(); ++it) {
+// EdgeId e = *it;
+// cout << g.length(e) << endl;
+// if (edges.count(e) == 0) {
+// Sequence s = g.EdgeNucls(e);
+// cout << s.size() << endl;
+// cout << "oppa " << ShouldCut(g, g.EdgeEnd(e)) << endl;
+// if(s.size() > g.k() && ShouldCut(g, g.EdgeEnd(e))) {
+// s = s.Subseq(0, s.size() - g.k());
+// cout << s.size() << endl;
+// }
+// cout << "oppa1 " << ShouldCut(g, g.conjugate(g.EdgeStart(e))) << endl;
+// if(s.size() > g.k() && ShouldCut(g, g.conjugate(g.EdgeStart(e)))) {
+// s = s.Subseq(g.k(), s.size());
+// cout << s.size() << endl;
+// }
+// oss << g.coverage(e);
+// oss << s;
+// edges.insert(g.conjugate(*it));
+// }
+// // oss << g.EdgeNucls(*it);
+// }
+}
+
+inline void OutputSingleFileContigs(ConjugateDeBruijnGraph& g,
+ const string& contigs_output_dir) {
+ INFO("Outputting contigs to " << contigs_output_dir);
+ int n = 0;
+ make_dir(contigs_output_dir);
+ char n_str[20];
+ set<ConjugateDeBruijnGraph::EdgeId> edges;
+ for (auto it = g.SmartEdgeBegin(); !it.IsEnd(); ++it) {
+ if (edges.count(*it) == 0) {
+ sprintf(n_str, "%d.fa", n);
+ edges.insert(g.conjugate(*it));
+ io::osequencestream oss(contigs_output_dir + n_str);
+ oss << g.EdgeNucls(*it);
+ n++;
+ }
+ }
+ DEBUG("SingleFileContigs(Conjugate) written");
+}
+
+}
diff --git a/src/modules/assembly_graph/graph_support/detail_coverage.hpp b/src/modules/assembly_graph/graph_support/detail_coverage.hpp
new file mode 100644
index 0000000..a203d75
--- /dev/null
+++ b/src/modules/assembly_graph/graph_support/detail_coverage.hpp
@@ -0,0 +1,258 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "data_structures/indices/perfect_hash_map.hpp"
+#include "assembly_graph/graph_core/coverage.hpp"
+#include "assembly_graph/graph_core/action_handlers.hpp"
+#include "dev_support/verify.hpp"
+#include <vector>
+#include <map>
+#include <set>
+#include <string>
+#include <iostream>
+#include <fstream>
+
+namespace debruijn_graph {
+
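+//Stores, for every edge, the raw k-mer counts accumulated over the first averaging_range positions of
+//the edge ("flanking" coverage of the edge start); the flanking coverage of the edge end is taken from
+//the conjugate edge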
+template<class Graph>
+class FlankingCoverage : public omnigraph::GraphActionHandler<Graph>,
+ public omnigraph::AbstractFlankingCoverage<Graph> {
+ typedef omnigraph::GraphActionHandler<Graph> base;
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+ typedef pair<EdgeId, unsigned> Pos;
+
+ Graph& g_;
+ const size_t averaging_range_;
+
+ void SetRawCoverage(EdgeId e, unsigned cov) {
+ g_.data(e).set_flanking_coverage(cov);
+ }
+
+ unsigned RawCoverage(EdgeId e) const {
+ return g_.data(e).flanking_coverage();
+ }
+
+ size_t EdgeAveragingRange(EdgeId e) const {
+ return std::min(this->g().length(e), averaging_range_);
+ }
+
+ double AverageFlankingCoverage(EdgeId e) const {
+ return double(RawCoverage(e)) / double(EdgeAveragingRange(e));
+ }
+
+ unsigned InterpolateCoverage(EdgeId e, size_t l) const {
+ VERIFY(l <= averaging_range_);
+ VERIFY(l < g_.length(e));
+ return unsigned(math::round(AverageFlankingCoverage(e) * double(l)));
+ }
+
+ void SetCoverageSimilarToAverageFlanking(EdgeId target, EdgeId source) {
+ SetRawCoverage(target, unsigned(math::round(AverageFlankingCoverage(source) * double(EdgeAveragingRange(target)))));
+ }
+
+ void SetCoverageSimilarToAverageGlobal(EdgeId target, EdgeId source) {
+ SetRawCoverage(target, unsigned(math::round(g_.coverage(source) * double(EdgeAveragingRange(target)))));
+ }
+
+public:
+
+ //todo think about interactions with gap closer
+ FlankingCoverage(Graph& g, size_t averaging_range)
+ : base(g, "FlankingCoverage"), g_(g),
+ averaging_range_(averaging_range) {
+ }
+
+ size_t averaging_range() const {
+ return averaging_range_;
+ }
+
+ //todo currently left for compatibility with old saves! remove later!
+ template<class CoverageIndex>
+ void Fill(const CoverageIndex& count_index) {
+ TRACE("Filling flanking coverage from index");
+
+ for (auto I = count_index.value_cbegin(), E = count_index.value_cend();
+ I != E; ++I) {
+ const auto& edge_info = *I;
+ EdgeId e = edge_info.edge_id;
+ unsigned offset = edge_info.offset;
+ unsigned count = edge_info.count;
+ VERIFY(offset != -1u);
+ VERIFY(e.get() != NULL);
+ if (offset < averaging_range_) {
+ IncRawCoverage(e, count);
+ }
+ }
+ }
+
+ void IncRawCoverage(EdgeId e, unsigned count) {
+ g_.data(e).inc_flanking_coverage(count);
+ }
+
+ double CoverageOfStart(EdgeId e) const {
+ return AverageFlankingCoverage(e);
+ }
+
+ double CoverageOfEnd(EdgeId e) const {
+ return CoverageOfStart(this->g().conjugate(e));
+ }
+
+ virtual void HandleAdd(EdgeId /*e*/) {
+ }
+
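+ //Accumulates the raw coverage of the merged prefix edges until averaging_range is exhausted,
+ //interpolating within the last partially covered edge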
+ virtual void HandleMerge(const vector<EdgeId>& old_edges, EdgeId new_edge) {
+// SetRawCoverage(new_edge, RawCoverage(old_edges.front()));
+ size_t kpomers_left = averaging_range_;
+ unsigned acc = 0;
+ for (EdgeId e : old_edges) {
+ if (kpomers_left >= g_.length(e)) {
+ acc += RawCoverage(e);
+ kpomers_left -= g_.length(e);
+ } else {
+ if (kpomers_left != 0)
+ acc += InterpolateCoverage(e, kpomers_left);
+ break;
+ }
+ }
+ SetRawCoverage(new_edge, acc);
+ }
+
+ virtual void HandleGlue(EdgeId new_edge, EdgeId edge1, EdgeId edge2) {
+ SetRawCoverage(new_edge, RawCoverage(edge1) + RawCoverage(edge2));
+ }
+
+ virtual void HandleSplit(EdgeId old_edge, EdgeId new_edge_1,
+ EdgeId new_edge_2) {
+ //todo maybe improve later
+ SetCoverageSimilarToAverageFlanking(new_edge_1, old_edge);
+ SetCoverageSimilarToAverageGlobal(new_edge_2, old_edge);
+ if (old_edge == g_.conjugate(old_edge)) {
+ SetCoverageSimilarToAverageGlobal(g_.conjugate(new_edge_1), old_edge);
+ }
+ }
+
+ virtual void HandleDelete(EdgeId e) {
+ SetRawCoverage(e, 0);
+ }
+
+ double LocalCoverage(EdgeId e, VertexId v) const {
+ if (this->g().EdgeStart(e) == v) {
+ return GetInCov(e);
+ } else if (this->g().EdgeEnd(e) == v) {
+ return GetOutCov(e);
+ } else {
+ VERIFY(false);
+ return 0.0;
+ }
+ }
+
+ //left for compatibility
+ //todo rename
+ double GetInCov(EdgeId e) const {
+ return CoverageOfStart(e);
+ }
+
+ //todo rename
+ double GetOutCov(EdgeId e) const {
+ return CoverageOfEnd(e);
+ }
+
+ //////////////////////////
+
+ void Save(EdgeId e, ostream& out) const {
+ out << RawCoverage(e);
+ }
+
+ void Load(EdgeId e, istream& in) {
+ unsigned cov;
+ in >> cov;
+ SetRawCoverage(e, cov);
+ }
+
+ /*
+ * Thread-safe, provided that different threads process different edges.
+ */
+ bool IsThreadSafe() const {
+ return true;
+ }
+
+private:
+ DECL_LOGGER("FlankingCoverage");
+};
+
+template<class StoringType>
+struct SimultaneousCoverageCollector {
+};
+
+template<>
+struct SimultaneousCoverageCollector<SimpleStoring> {
+ template<class SimultaneousCoverageFiller, class Info>
+ static void CollectCoverage(SimultaneousCoverageFiller& filler, const Info &edge_info) {
+ filler.inc_coverage(edge_info);
+ }
+};
+
+template<>
+struct SimultaneousCoverageCollector<InvertableStoring> {
+ template<class SimultaneousCoverageFiller, class Info>
+ static void CollectCoverage(SimultaneousCoverageFiller& filler, const Info &edge_info) {
+ filler.inc_coverage(edge_info);
+ filler.inc_coverage(edge_info.conjugate(filler.k()));
+ }
+};
+
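+//Fills both the regular edge coverage index and the flanking coverage in a single pass over the k-mer count index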
+template<class Graph, class CountIndex>
+class SimultaneousCoverageFiller {
+ const Graph& g_;
+ const CountIndex& count_index_;
+ FlankingCoverage<Graph>& flanking_coverage_;
+ omnigraph::CoverageIndex<Graph>& coverage_index_;
+ typedef typename CountIndex::Value Value;
+public:
+ SimultaneousCoverageFiller(const Graph& g, const CountIndex& count_index,
+ FlankingCoverage<Graph>& flanking_coverage,
+ omnigraph::CoverageIndex<Graph>& coverage_index) :
+ g_(g),
+ count_index_(count_index),
+ flanking_coverage_(flanking_coverage),
+ coverage_index_(coverage_index) {
+ }
+
+ size_t k() const {
+ return count_index_.k();
+ }
+
+ void inc_coverage(const Value &edge_info) {
+ coverage_index_.IncRawCoverage(edge_info.edge_id, edge_info.count);
+ if (edge_info.offset < flanking_coverage_.averaging_range()) {
+ flanking_coverage_.IncRawCoverage(edge_info.edge_id, edge_info.count);
+ }
+ }
+
+ void Fill() {
+ for (auto I = count_index_.value_cbegin(), E = count_index_.value_cend();
+ I != E; ++I) {
+ const auto& edge_info = *I;
+ VERIFY(edge_info.valid());
+ VERIFY(edge_info.edge_id.get() != NULL);
+ SimultaneousCoverageCollector<typename CountIndex::storing_type>::CollectCoverage(*this, edge_info);
+ }
+ }
+};
+
+template<class Graph, class CountIndex>
+void FillCoverageAndFlanking(const CountIndex& count_index, Graph& g,
+ FlankingCoverage<Graph>& flanking_coverage) {
+ SimultaneousCoverageFiller<Graph, CountIndex> filler(g, count_index, flanking_coverage, g.coverage_index());
+ filler.Fill();
+}
+
+}
diff --git a/src/modules/assembly_graph/graph_support/genomic_quality.hpp b/src/modules/assembly_graph/graph_support/genomic_quality.hpp
new file mode 100644
index 0000000..cdf6e12
--- /dev/null
+++ b/src/modules/assembly_graph/graph_support/genomic_quality.hpp
@@ -0,0 +1,554 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "visualization/visualization.hpp"
+#include "assembly_graph/graph_support/basic_edge_conditions.hpp"
+#include "assembly_graph/graph_alignment/sequence_mapper.hpp"
+#include "assembly_graph/graph_core/action_handlers.hpp"
+
+namespace debruijn_graph {
+
+template<class Graph>
+class EdgeQuality: public omnigraph::GraphLabeler<Graph>, public omnigraph::GraphActionHandler<Graph> {
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+ map<EdgeId, size_t> quality_;
+ size_t k_;
+
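+ //Slides a (k+1)-mer window along the genome and, for each window present in the index
+ //(after k-mer substitution), increments the hit count of the corresponding edge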
+ template<class Index>
+ void FillQuality(const Index &index
+ , const KmerMapper<Graph>& kmer_mapper, const Sequence &genome) {
+ if (genome.size() < k_)
+ return;
+ runtime_k::RtSeq cur = genome.start<runtime_k::RtSeq>(k_);
+ cur >>= 0;
+ for (size_t i = 0; i + k_ - 1 < genome.size(); i++) {
+ cur <<= genome[i + k_ - 1];
+ auto corr_cur = kmer_mapper.Substitute(cur);
+ if (index.contains(corr_cur)) {
+ quality_[index.get(corr_cur).first]++;
+ }
+ }
+ }
+
+public:
+
+ template<class Index>
+ void Fill(const Index &index
+ , const KmerMapper<Graph>& kmer_mapper
+ , const Sequence &genome) {
+ FillQuality(index, kmer_mapper, genome);
+ FillQuality(index, kmer_mapper, !genome);
+ }
+
+ EdgeQuality(const Graph &graph) :
+ omnigraph::GraphActionHandler<Graph>(graph, "EdgeQuality"),
+ k_(graph.k() + 1) {
+ }
+
+ virtual ~EdgeQuality() {
+ }
+
+ virtual void HandleAdd(EdgeId /*e*/) {
+ }
+
+ virtual void HandleDelete(EdgeId e) {
+ quality_.erase(e);
+ }
+
+ virtual void HandleMerge(const vector<EdgeId>& old_edges, EdgeId new_edge) {
+ size_t res = 0;
+ for (size_t i = 0; i < old_edges.size(); i++) {
+ res += quality_[old_edges[i]];
+ }
+ quality_[new_edge] += res;
+ }
+
+ virtual void HandleGlue(EdgeId new_edge, EdgeId edge1, EdgeId edge2) {
+ quality_[new_edge] += quality_[edge2];
+ quality_[new_edge] += quality_[edge1];
+ }
+
+ virtual void HandleSplit(EdgeId old_edge, EdgeId new_edge1,
+ EdgeId new_edge2) {
+ if (old_edge == this->g().conjugate(old_edge)) {
+ WARN("EdgeQuality does not support self-conjugate splits");
+ return;
+ }
+ VERIFY(old_edge != this->g().conjugate(old_edge));
+ quality_[new_edge1] = quality_[old_edge] * this->g().length(new_edge1)
+ / (this->g().length(new_edge1) + this->g().length(new_edge2));
+ quality_[new_edge2] = quality_[old_edge] * this->g().length(new_edge2)
+ / (this->g().length(new_edge1) + this->g().length(new_edge2));
+ }
+
+ double quality(EdgeId edge) const {
+ auto it = quality_.find(edge);
+ if (it == quality_.end())
+ return 0.;
+ else
+ return 1. * (double) it->second / (double) this->g().length(edge);
+ }
+
+ bool IsPositiveQuality(EdgeId edge) const {
+ return math::gr(quality(edge), 0.);
+ }
+
+ bool IsZeroQuality(EdgeId edge) const {
+ return math::eq(quality(edge), 0.);
+ }
+
+ virtual std::string label(VertexId /*vertexId*/) const {
+ return "";
+ }
+
+ virtual std::string label(EdgeId edge) const {
+ double q = quality(edge);
+ return (q == 0) ? "" : "quality: " + ToString(q);
+ }
+
+ void clear() {
+ quality_.clear();
+ }
+
+};
+
+template<class Graph>
+class QualityLoggingRemovalHandler {
+ typedef typename Graph::EdgeId EdgeId;
+ const Graph& g_;
+ const EdgeQuality<Graph>& quality_handler_;
+ size_t black_removed_;
+ size_t total_;
+ bool handle_all_;
+
+ virtual void HandlePositiveQuality(EdgeId /*e*/) {
+
+ }
+
+public:
+ QualityLoggingRemovalHandler(const Graph& g, const EdgeQuality<Graph>& quality_handler,
+ bool handle_all = false) :
+ g_(g), quality_handler_(quality_handler), black_removed_(0), total_(0), handle_all_(handle_all) {
+ }
+
+ void HandleDelete(EdgeId e) {
+ total_++;
+ if (handle_all_ || math::gr(quality_handler_.quality(e), 0.)) {
+ TRACE("Deleting good edge id = " << g_.int_id(e)
+ << "; length = " << g_.length(e)
+ << "; quality = " << quality_handler_.quality(e)
+ << "; cov = " << g_.coverage(e));
+ HandlePositiveQuality(e);
+ } else {
+ black_removed_++;
+ }
+ }
+
+ const Graph& g() const {
+ return g_;
+ }
+
+ const EdgeQuality<Graph>& quality_handler() const {
+ return quality_handler_;
+ }
+
+ virtual ~QualityLoggingRemovalHandler() {
+ TRACE("Overall stats: total removed = " << total_
+ << "; bad removed = " << black_removed_
+ << "; good removed = " << total_ - black_removed_);
+ }
+
+private:
+ DECL_LOGGER("QualityLoggingRemovalHandler");
+};
+
+template<class Graph>
+class QualityEdgeLocalityPrintingRH : public QualityLoggingRemovalHandler<Graph> {
+ typedef QualityLoggingRemovalHandler<Graph> base;
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+ omnigraph::visualization::LocalityPrintingRH<Graph> printing_rh_;
+public:
+ QualityEdgeLocalityPrintingRH(const Graph& g
+ , const EdgeQuality<Graph>& quality_handler
+ , const omnigraph::GraphLabeler<Graph>& labeler
+ , std::shared_ptr<omnigraph::visualization::GraphColorer<Graph>> colorer
+ , const string& output_folder, bool handle_all = false) :
+ base(g, quality_handler, handle_all),
+ printing_rh_(g, labeler, colorer, output_folder)
+ {}
+
+ virtual void HandlePositiveQuality(EdgeId e) {
+ printing_rh_.HandleDelete(e, "_" + ToString(this->quality_handler().quality(e)));
+ }
+
+private:
+ DECL_LOGGER("QualityEdgeLocalityPrintingRH");
+};
+
+//earlier version from rel_cov branch
+//template<class Graph>
+//class EdgeNeighborhoodFinder: public omnigraph::GraphSplitter<Graph> {
+//private:
+// typedef typename Graph::EdgeId EdgeId;
+// typedef typename Graph::VertexId VertexId;
+// EdgeId edge_;
+// size_t max_size_;
+// size_t edge_length_bound_;
+// bool finished_;
+//public:
+// EdgeNeighborhoodFinder(const Graph &graph, EdgeId edge, size_t max_size
+// , size_t edge_length_bound) :
+// GraphSplitter<Graph>(graph), edge_(edge), max_size_(
+// max_size), edge_length_bound_(edge_length_bound), finished_(
+// false) {
+// }
+//
+// GraphComponent<Graph> NextComponent() {
+// CountingDijkstra<Graph> cf(this->graph(), max_size_,
+// edge_length_bound_);
+// set<VertexId> result_set;
+// cf.run(this->graph().EdgeStart(edge_));
+// vector<VertexId> result_start = cf.ReachedVertices();
+// result_set.insert(result_start.begin(), result_start.end());
+// cf.run(this->graph().EdgeEnd(edge_));
+// vector<VertexId> result_end = cf.ReachedVertices();
+// result_set.insert(result_end.begin(), result_end.end());
+//
+// ComponentCloser<Graph> cc(this->graph(), edge_length_bound_);
+// cc.CloseComponent(result_set);
+//
+// finished_ = true;
+// return GraphComponent<Graph>(this->graph(), result_set.begin(), result_set.end());
+// }
+//
+// /*virtual*/ bool Finished() {
+// return finished_;
+// }
+//};
+//
+//template<class Graph>
+//class EdgeLocalityPrintingRH {
+// typedef typename Graph::EdgeId EdgeId;
+// typedef typename Graph::VertexId VertexId;
+// const Graph& g_;
+// const GraphLabeler<Graph>& labeler_;
+// const string& output_folder_;
+// std::function<double (EdgeId)>& quality_f_;
+//// size_t black_removed_;
+//// size_t colored_removed_;
+//public:
+// EdgeLocalityPrintingRH(const Graph& g
+// , const GraphLabeler<Graph>& labeler
+// , const string& output_folder
+// , std::function<double (EdgeId)> quality_f = 0) :
+// g_(g),
+// labeler_(labeler), output_folder_(output_folder),
+// quality_f_(quality_f){
+// }
+//
+// void HandleDelete(EdgeId edge) {
+// TRACE("Deleting edge " << g_.str(edge));
+// if (quality_f_ && math::gr(quality_f_(edge), 0.))
+// INFO("EdgeLocalityPrintRH handling the edge with positive quality : " << quality_f_(edge) << " " << g_.str(edge));
+//
+// string folder = output_folder_ + "edges_deleted/";
+// path::make_dir(folder);
+// //todo magic constant
+// map<EdgeId, string> empty_coloring;
+// omnigraph::visualization::WriteComponent(g_, EdgeNeighborhood<Graph>(g_, edge, 50, 250),
+// folder + "edge_" + ToString(g_.int_id(edge)) + ".dot", empty_coloring, labeler_);
+// }
+//
+//private:
+// DECL_LOGGER("QualityEdgeLocalityPrintingRH")
+// ;
+//};
+
+//template<class Graph, class Index>
+//class EdgeQuality: public GraphLabeler<Graph>, public GraphActionHandler<Graph> {
+// typedef typename Graph::EdgeId EdgeId;
+// typedef typename Graph::VertexId VertexId;
+// map<EdgeId, size_t> quality_;
+// size_t k_;
+//
+//public:
+//
+// void FillQuality(const Index &index
+// , const KmerMapper<Graph>& kmer_mapper, const Sequence &genome) {
+// if (genome.size() < k_)
+// return;
+// runtime_k::RtSeq cur = genome.start<runtime_k::RtSeq>(k_);
+// cur >>= 0;
+// for (size_t i = 0; i + k_ - 1 < genome.size(); i++) {
+// cur <<= genome[i + k_ - 1];
+// auto corr_cur = kmer_mapper.Substitute(cur);
+// if (index.contains(corr_cur)) {
+// quality_[index.get(corr_cur).first]++;
+// }
+// }
+// }
+//
+// EdgeQuality(const Graph &graph, const Index &index,
+// const KmerMapper<Graph>& kmer_mapper,
+// const Sequence &genome) :
+//
+// GraphActionHandler<Graph>(graph, "EdgeQualityLabeler"),
+// k_(kmer_mapper.get_k()) {
+// FillQuality(index, kmer_mapper, genome);
+// FillQuality(index, kmer_mapper, !genome);
+// }
+//
+// virtual ~EdgeQuality() {
+// }
+//
+// virtual void HandleAdd(EdgeId /*e*/) {
+// }
+//
+// virtual void HandleDelete(EdgeId e) {
+// quality_.erase(e);
+// }
+//
+// virtual void HandleMerge(const vector<EdgeId>& old_edges, EdgeId new_edge) {
+// size_t res = 0;
+// for (size_t i = 0; i < old_edges.size(); i++) {
+// res += quality_[old_edges[i]];
+// }
+// quality_[new_edge] += res;
+// }
+//
+// virtual void HandleGlue(EdgeId new_edge, EdgeId edge1, EdgeId edge2) {
+// quality_[new_edge] += quality_[edge2];
+// quality_[new_edge] += quality_[edge1];
+// }
+//
+// virtual void HandleSplit(EdgeId old_edge, EdgeId new_edge1,
+// EdgeId new_edge2) {
+// quality_[new_edge1] = quality_[old_edge] * this->g().length(new_edge1)
+// / (this->g().length(new_edge1) + this->g().length(new_edge2));
+// quality_[new_edge2] = quality_[old_edge] * this->g().length(new_edge2)
+// / (this->g().length(new_edge1) + this->g().length(new_edge2));
+// }
+//
+// double quality(EdgeId edge) const {
+// auto it = quality_.find(edge);
+// if (it == quality_.end())
+// return 0.;
+// else
+// return 1. * (double) it->second / (double) this->g().length(edge);
+// }
+//
+// bool IsPositiveQuality(EdgeId edge) const {
+// return math::gr(quality(edge), 0.);
+// }
+//
+// virtual std::string label(VertexId /*vertexId*/) const {
+// return "";
+// }
+//
+// virtual std::string label(EdgeId edge) const {
+// double q = quality(edge);
+// return (q == 0) ? "" : "quality: " + ToString(q);
+// }
+//
+//};
+//
+//template<class Graph, class Index>
+//class QualityLoggingRemovalHandler {
+// typedef typename Graph::EdgeId EdgeId;
+// const Graph& g_;
+// const EdgeQuality<Graph, Index>& quality_handler_;
+//// size_t black_removed_;
+//// size_t colored_removed_;
+//public:
+// QualityLoggingRemovalHandler(const Graph& g, const EdgeQuality<Graph, Index>& quality_handler) :
+// g_(g), quality_handler_(quality_handler)/*, black_removed_(0), colored_removed_(
+// 0)*/{
+//
+// }
+//
+// void HandleDelete(EdgeId edge) {
+// if (math::gr(quality_handler_.quality(edge), 0.)) {
+// TRACE("Deleting edge " << g_.str(edge) << " with quality " << quality_handler_.quality(edge));
+// } else {
+//// TRACE("Deleting edge " << g_.int_id(edge) << " with zero quality");
+// }
+//// if (math::gr(quality_handler_.quality(edge), 0.))
+//// colored_removed_++;
+//// else
+//// black_removed_++;
+// }
+//
+//private:
+// DECL_LOGGER("QualityLoggingRemovalHandler")
+// ;
+//};
+//
+//template<class Graph, class Index>
+//class QualityLoggingRemovalCountHandler {
+// typedef typename Graph::EdgeId EdgeId;
+// const Graph& g_;
+// const EdgeQuality<Graph, Index>& quality_handler_;
+// size_t black_removed_;
+// size_t total;
+//
+//public:
+// QualityLoggingRemovalCountHandler(const Graph& g, const EdgeQuality<Graph, Index>& quality_handler) :
+// g_(g), quality_handler_(quality_handler)/*, black_removed_(0), colored_removed_(
+// 0)*/{
+// black_removed_ = 0;
+// total = 0;
+// }
+//
+// void HandleDelete(EdgeId edge) {
+// total++;
+// if (math::gr(quality_handler_.quality(edge), 0.)) {
+// TRACE("Deleting good edge " << g_.int_id(edge) << " with quality " << quality_handler_.quality(edge) << " cov " << g_.coverage(edge) << " length " << g_.length(edge));
+// }else{
+// black_removed_++;
+// }
+// if ((total % (1<<10)) != 0)
+// TRACE("Removed still " << black_removed_ << " " << total);
+// }
+//
+//private:
+//};
+//
+//template<class Graph, class Index>
+//class QualityEdgeLocalityPrintingRH {
+// typedef typename Graph::EdgeId EdgeId;
+// typedef typename Graph::VertexId VertexId;
+// const Graph& g_;
+// const EdgeQuality<Graph, Index>& quality_handler_;
+// const omnigraph::GraphLabeler<Graph>& labeler_;
+// const omnigraph::visualization::GraphColorer<Graph>& colorer_;
+// const string& output_folder_;
+//// size_t black_removed_;
+//// size_t colored_removed_;
+//public:
+// QualityEdgeLocalityPrintingRH(const Graph& g
+// , const EdgeQuality<Graph, Index>& quality_handler
+// , const omnigraph::GraphLabeler<Graph>& labeler
+// , const omnigraph::visualization::GraphColorer<Graph>& colorer
+// , const string& output_folder) :
+// g_(g), quality_handler_(quality_handler),
+// labeler_(labeler), colorer_(colorer), output_folder_(output_folder){
+// }
+//
+// void HandleDelete(EdgeId edge) {
+// if (quality_handler_.IsPositiveQuality(edge)) {
+// DEBUG("Deleting edge " << g_.str(edge) << " with quality " << quality_handler_.quality(edge));
+// string folder = output_folder_ + "colored_edges_deleted/";
+// path::make_dir(folder);
+// //todo magic constant
+//// map<EdgeId, string> empty_coloring;
+// shared_ptr<GraphSplitter<Graph>> splitter = EdgeNeighborhoodFinder<Graph>(g_, edge, 50, 250);
+// omnigraph::visualization::WriteComponents(g_, *splitter/*, "locality_of_edge_" + ToString(g_.int_id(edge))*/
+// , folder + "edge_" + ToString(g_.int_id(edge)) + "_" + ToString(quality_handler_.quality(edge)) + ".dot"
+// , colorer_, labeler_);
+// } else {
+// TRACE("Deleting edge " << g_.str(edge) << " with zero quality");
+// }
+// }
+//
+//private:
+// DECL_LOGGER("QualityEdgeLocalityPrintingRH")
+// ;
+//};
+//
+//template<class Graph, class Index>
+//class QualityPairInfoHandler {
+// typedef typename Graph::EdgeId EdgeId;
+// typedef typename Graph::VertexId VertexId;
+// typedef omnigraph::PairInfo<EdgeId> PairInfo;
+// typedef vector<PairInfo> PairInfos;
+// const Graph& g_;
+// const EdgeQuality<Graph, Index>& quality_handler_;
+// const GraphLabeler<Graph>& labeler_;
+// const string& output_folder_;
+// const PairedInfoIndex<ConjugateDeBruijnGraph>& index_;
+//// size_t black_removed_;
+//// size_t colored_removed_;
+//public:
+// QualityPairInfoHandler(const Graph& g
+// , const EdgeQuality<Graph, Index>& quality_handler
+// , const GraphLabeler<Graph>& labeler
+// , const string& output_folder
+// , const PairedInfoIndex<ConjugateDeBruijnGraph>& index) :
+// g_(g), quality_handler_(quality_handler),
+// labeler_(labeler), output_folder_(output_folder), index_(index) {
+// }
+//
+// void HandleDelete(EdgeId edge) {
+// if (quality_handler_.IsPositiveQuality(edge)) {
+// cout << "Deleting edge " << g_.str(edge) << " with quality " << quality_handler_.quality(edge) << endl;
+// string folder = output_folder_ + "colored_edges_deleted/";
+// path::make_dir(folder);
+// //todo magic constant
+// PairInfos infos = index_.GetEdgeInfo(edge);
+// if (infos.size() > 0){
+// for (size_t i = 0; i<infos.size(); i++){
+// cout << "Tip Info " << g_.int_id(infos[i].first) << " " << g_.int_id(infos[i].second) << " " << infos[i].d << " " << infos[i].weight << " " << infos[i].variance << endl;
+// }
+// }
+// map<EdgeId, string> empty_coloring;
+// shared_ptr<GraphSplitter<Graph>> splitter = EdgeNeighborhoodFinder<Graph>(g_, edge, 50,
+// 250);
+//
+// omnigraph::visualization::WriteComponents(g_, *splitter, TrueFilter<vector<VertexId>>(), "locality_of_edge_" + ToString(g_.int_id(edge))
+// , folder + "edge_" + ToString(g_.int_id(edge)) + "_" + ToString(quality_handler_.quality(edge)) + ".dot"
+// , empty_coloring, labeler_);
+// }
+// }
+//
+//private:
+//};
+//
+////todo what is the difference with QELPRH?!
+//template<class Graph>
+//class EdgeLocalityPrintingRH {
+// typedef typename Graph::EdgeId EdgeId;
+// typedef typename Graph::VertexId VertexId;
+// const Graph& g_;
+// const GraphLabeler<Graph>& labeler_;
+// const string& output_folder_;
+// std::function<double (EdgeId)>& quality_f_;
+//// size_t black_removed_;
+//// size_t colored_removed_;
+//public:
+// EdgeLocalityPrintingRH(const Graph& g
+// , const GraphLabeler<Graph>& labeler
+// , const string& output_folder
+// , std::function<double (EdgeId)> quality_f = 0) :
+// g_(g),
+// labeler_(labeler), output_folder_(output_folder),
+// quality_f_(quality_f){
+// }
+//
+// void HandleDelete(EdgeId edge) {
+// TRACE("Deleting edge " << g_.str(edge));
+// if (quality_f_ && math::gr(quality_f_(edge), 0.))
+// INFO("Handling the edge with positive quality : " << quality_f_(edge) << " " << g_.str(edge));
+//
+// string folder = output_folder_ + "edges_deleted/";
+// path::make_dir(folder);
+// //todo magic constant
+// map<EdgeId, string> empty_coloring;
+// shared_ptr<GraphSplitter<Graph>> splitter = EdgeNeighborhoodFinder<Graph>(g_, edge, 50, 250);
+// omnigraph::visualization::WriteComponents(g_, *splitter, TrueFilter<vector<VertexId>>(), "locality_of_edge_" + ToString(g_.int_id(edge))
+// , folder + "edge_" + ToString(g_.int_id(edge)) + ".dot", empty_coloring, labeler_);
+// }
+//
+//private:
+// DECL_LOGGER("EdgeLocalityPrintingRH")
+// ;
+//};
+
+}
diff --git a/src/modules/assembly_graph/graph_support/graph_processing_algorithm.hpp b/src/modules/assembly_graph/graph_support/graph_processing_algorithm.hpp
new file mode 100644
index 0000000..cce6c20
--- /dev/null
+++ b/src/modules/assembly_graph/graph_support/graph_processing_algorithm.hpp
@@ -0,0 +1,262 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "dev_support/func.hpp"
+#include <boost/none.hpp>
+#include <atomic>
+#include "assembly_graph/graph_core/graph_iterators.hpp"
+#include "assembly_graph/components/graph_component.hpp"
+#include "math/pred.hpp"
+#include "dev_support/logger/logger.hpp"
+
+namespace omnigraph {
+
+template<class Graph>
+using HandlerF = std::function<void(typename Graph::EdgeId)>;
+
+template<class Graph>
+class EdgeProcessingAlgorithm {
+ typedef typename Graph::EdgeId EdgeId;
+ typedef pred::TypedPredicate<EdgeId> ProceedConditionT;
+
+ Graph& g_;
+ bool conjugate_symmetry_;
+ protected:
+
+ Graph& g() {
+ return g_;
+ }
+
+ const Graph& g() const {
+ return g_;
+ }
+
+ virtual bool ProcessEdge(EdgeId e) = 0;
+
+ public:
+ EdgeProcessingAlgorithm(Graph& g,
+ bool conjugate_symmetry = false)
+ : g_(g), conjugate_symmetry_(conjugate_symmetry) {
+
+ }
+
+ virtual ~EdgeProcessingAlgorithm() {
+ }
+
+// bool conjugate_symmetry() const {
+// return conjugate_symmetry_;
+// }
+
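+ //Processes edges in the order defined by comp, stopping as soon as proceed_condition fails;
+ //returns true if ProcessEdge returned true for at least one edge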
+ template<class Comparator = std::less<EdgeId>>
+ bool Run(const Comparator& comp = Comparator(), ProceedConditionT proceed_condition = pred::AlwaysTrue<EdgeId>()) {
+ bool triggered = false;
+ for (auto it = g_.SmartEdgeBegin(comp, conjugate_symmetry_); !it.IsEnd(); ++it) {
+ EdgeId e = *it;
+ TRACE("Current edge " << g_.str(e));
+ if (!proceed_condition(e)) {
+ TRACE("Stop condition was reached.");
+ break;
+ }
+
+ TRACE("Processing edge " << this->g().str(e));
+ triggered |= ProcessEdge(e);
+ };
+ return triggered;
+ }
+
+ private:
+ DECL_LOGGER("EdgeProcessingAlgorithm");
+};
+
+template<class Graph>
+class CountingCallback {
+ typedef typename Graph::EdgeId EdgeId;
+ bool report_on_destruction_;
+ std::atomic<size_t> cnt_;
+
+public:
+ CountingCallback(bool report_on_destruction = false) :
+ report_on_destruction_(report_on_destruction), cnt_(0) {
+ }
+
+ ~CountingCallback() {
+ if (report_on_destruction_)
+ Report();
+ }
+
+ void HandleDelete(EdgeId /*e*/) {
+ cnt_++;
+ }
+
+ void Report() {
+ TRACE(cnt_ << " edges were removed.")
+ cnt_ = 0;
+ }
+
+private:
+ DECL_LOGGER("CountingCallback");
+};
+
+template<class Graph>
+std::function<void(typename Graph::EdgeId)> AddCountingCallback(CountingCallback<Graph>& cnt_callback, std::function<void(typename Graph::EdgeId)> handler) {
+ std::function<void(typename Graph::EdgeId)> cnt_handler = std::bind(&CountingCallback<Graph>::HandleDelete, std::ref(cnt_callback), std::placeholders::_1);
+ return func::Composition<typename Graph::EdgeId>(handler, cnt_handler);
+}
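+
+//Deletes the vertex if it has become isolated; otherwise compresses it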
+template<class Graph>
+void RemoveIsolatedOrCompress(Graph& g, typename Graph::VertexId v) {
+ if (g.IsDeadStart(v) && g.IsDeadEnd(v)) {
+ g.DeleteVertex(v);
+ } else {
+ g.CompressVertex(v);
+ }
+}
+
+template<class Graph>
+class EdgeRemover {
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+ typedef std::function<void(EdgeId)> HandlerF;
+
+ Graph& g_;
+ HandlerF removal_handler_;
+
+ public:
+ EdgeRemover(Graph& g, HandlerF removal_handler = nullptr)
+ : g_(g),
+ removal_handler_(removal_handler) {
+ }
+
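+ //Deletes the edge and then compresses or removes its former end vertices;
+ //if the vertices are related, only the start vertex is processed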
+ void DeleteEdge(EdgeId e) {
+ VertexId start = g_.EdgeStart(e);
+ VertexId end = g_.EdgeEnd(e);
+ DeleteEdgeWithNoCompression(e);
+ // NOTE: e here is already dead!
+ TRACE("Compressing locality");
+ if (!g_.RelatedVertices(start, end)) {
+ TRACE("Vertices not related");
+ TRACE("Processing end");
+ RemoveIsolatedOrCompress(g_, end);
+ TRACE("End processed");
+ }
+ TRACE("Processing start");
+ RemoveIsolatedOrCompress(g_, start);
+ TRACE("Start processed");
+ }
+
+ void DeleteEdgeWithNoCompression(EdgeId e) {
+ TRACE("Deletion of edge " << g_.str(e));
+ TRACE("Start " << g_.str(g_.EdgeStart(e)));
+ TRACE("End " << g_.str(g_.EdgeEnd(e)));
+ if (removal_handler_) {
+ TRACE("Calling handler");
+ removal_handler_(e);
+ }
+ TRACE("Deleting edge");
+ g_.DeleteEdge(e);
+ }
+
+ private:
+ DECL_LOGGER("EdgeRemover");
+};
+
+template<class Graph>
+class EdgeRemovingAlgorithm : public EdgeProcessingAlgorithm<Graph> {
+ typedef EdgeProcessingAlgorithm<Graph> base;
+ typedef typename Graph::EdgeId EdgeId;
+
+ pred::TypedPredicate<EdgeId> remove_condition_;
+ EdgeRemover<Graph> edge_remover_;
+
+ protected:
+ bool ProcessEdge(EdgeId e) {
+ TRACE("Checking edge " << this->g().str(e) << " for the removal condition");
+ if (remove_condition_(e)) {
+ TRACE("Check passed, removing");
+ edge_remover_.DeleteEdge(e);
+ return true;
+ }
+ TRACE("Check not passed");
+ return false;
+ }
+
+ public:
+ EdgeRemovingAlgorithm(Graph& g,
+ pred::TypedPredicate<EdgeId> remove_condition,
+ std::function<void (EdgeId)> removal_handler = boost::none,
+ bool conjugate_symmetry = false)
+ : base(g, conjugate_symmetry),
+ remove_condition_(remove_condition),
+ edge_remover_(g, removal_handler) {}
+
+ private:
+ DECL_LOGGER("EdgeRemovingAlgorithm");
+};
+
+//todo rewrite with SmartSetIterator
+template<class Graph>
+class ComponentRemover {
+ public:
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+ typedef std::function<void(const std::set<EdgeId>&)> HandlerF;
+
+ private:
+ Graph& g_;
+ HandlerF removal_handler_;
+
+ template<class ElemType>
+ void InsertIfNotConjugate(std::set<ElemType>& elems, ElemType elem) {
+ if (elems.count(g_.conjugate(elem)) == 0) {
+ elems.insert(elem);
+ }
+ }
+
+ public:
+ ComponentRemover(Graph& g, HandlerF removal_handler = 0)
+ : g_(g),
+ removal_handler_(removal_handler) {
+ }
+
+ template<class EdgeIt>
+ void DeleteComponent(EdgeIt begin, EdgeIt end, bool alter_vertices = true) {
+ using std::set;
+ set<EdgeId> edges;
+ set<VertexId> vertices;
+
+ //cleaning conjugates and gathering vertices
+ for (EdgeIt it = begin; it != end; ++it) {
+ EdgeId e = *it;
+ InsertIfNotConjugate(edges, e);
+ InsertIfNotConjugate(vertices, g_.EdgeStart(e));
+ InsertIfNotConjugate(vertices, g_.EdgeEnd(e));
+ }
+
+ if (removal_handler_) {
+ removal_handler_(edges);
+ }
+
+ for (EdgeId e: edges) {
+ g_.DeleteEdge(e);
+ }
+
+ if (alter_vertices) {
+ for (VertexId v: vertices) {
+ RemoveIsolatedOrCompress(g_, v);
+ }
+ }
+ }
+
+ template<class Container>
+ void DeleteComponent(const Container& container, bool alter_vertices = true) {
+ DeleteComponent(container.begin(), container.end(), alter_vertices);
+ }
+
+};
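+
+// A hypothetical caller (sketch only) removing a connected component given as a container of
+// its edges; the handler receives the conjugate-cleaned edge set before anything is deleted.
+// `g` and `component_edges` are placeholders.
+//
+//   ComponentRemover<Graph> component_remover(g,
+//           [](const std::set<Graph::EdgeId>& edges) {
+//               INFO("Removing component of " << edges.size() << " edge(s)");
+//           });
+//   component_remover.DeleteComponent(component_edges);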
+
+}
diff --git a/src/include/omni/marks_and_locks.hpp b/src/modules/assembly_graph/graph_support/marks_and_locks.hpp
similarity index 100%
rename from src/include/omni/marks_and_locks.hpp
rename to src/modules/assembly_graph/graph_support/marks_and_locks.hpp
diff --git a/src/modules/assembly_graph/graph_support/parallel_processing.hpp b/src/modules/assembly_graph/graph_support/parallel_processing.hpp
new file mode 100644
index 0000000..9b5084b
--- /dev/null
+++ b/src/modules/assembly_graph/graph_support/parallel_processing.hpp
@@ -0,0 +1,290 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "dev_support/logger/logger.hpp"
+#include "assembly_graph/graph_core/graph_iterators.hpp"
+#include "assembly_graph/graph_support/graph_processing_algorithm.hpp"
+#include "dev_support/openmp_wrapper.h"
+
+namespace omnigraph {
+
+template<class ItVec, class SmartIt, class Predicate>
+void FillInterestingFromChunkIterators(const ItVec& chunk_iterators,
+ SmartIt& smart_it,
+ const Predicate& predicate) {
+ VERIFY(chunk_iterators.size() > 1);
+ typedef typename Predicate::checked_type ElementType;
+ std::vector<std::vector<ElementType>> of_interest(omp_get_max_threads());
+
+ #pragma omp parallel for schedule(guided)
+ for (size_t i = 0; i < chunk_iterators.size() - 1; ++i) {
+ for (auto it = chunk_iterators[i], end = chunk_iterators[i + 1]; it != end; ++it) {
+ ElementType t = *it;
+ if (predicate(t)) {
+ of_interest[omp_get_thread_num()].push_back(t);
+ }
+ }
+ }
+
+ for (auto& chunk : of_interest) {
+ smart_it.insert(chunk.begin(), chunk.end());
+ chunk.clear();
+ }
+}
+
+template<class Graph, class ElementId = typename Graph::EdgeId>
+class TrivialInterestingElementFinder {
+public:
+
+ TrivialInterestingElementFinder() {
+ }
+
+ template<class SmartIt>
+ bool Run(SmartIt& /*it*/) const {
+ return false;
+ }
+};
+
+template<class Graph, class ElementId = typename Graph::EdgeId>
+class SimpleInterestingElementFinder {
+ typedef GraphEdgeIterator<Graph> EdgeIt;
+
+ const Graph& g_;
+ pred::TypedPredicate<ElementId> condition_;
+public:
+
+ SimpleInterestingElementFinder(const Graph& g,
+ pred::TypedPredicate<ElementId> condition = pred::AlwaysTrue<ElementId>())
+ : g_(g), condition_(condition) {}
+
+ template<class SmartIt>
+ bool Run(SmartIt& interest) const {
+ for (EdgeIt it = EdgeIt(g_, g_.begin()), end = EdgeIt(g_, g_.end()); it != end; ++it) {
+ if (condition_(*it)) {
+ interest.push(*it);
+ }
+ }
+ return false;
+ }
+};
+
+template<class Graph, class ElementId = typename Graph::EdgeId>
+class ParallelInterestingElementFinder {
+ typedef GraphEdgeIterator<Graph> EdgeIt;
+
+ const Graph& g_;
+ pred::TypedPredicate<ElementId> condition_;
+ const size_t chunk_cnt_;
+public:
+
+ ParallelInterestingElementFinder(const Graph& g,
+ pred::TypedPredicate<ElementId> condition,
+ size_t chunk_cnt)
+ : g_(g), condition_(condition), chunk_cnt_(chunk_cnt) {}
+
+ template<class SmartIt>
+ bool Run(SmartIt& it) const {
+ TRACE("Looking for interesting elements");
+ TRACE("Splitting graph into " << chunk_cnt_ << " chunks");
+ FillInterestingFromChunkIterators(IterationHelper<Graph, ElementId>(g_).Chunks(chunk_cnt_), it, condition_);
+ TRACE("Found " << it.size() << " interesting elements");
+ return false;
+ }
+private:
+ DECL_LOGGER("ParallelInterestingElementFinder");
+};
+
+template<class Graph>
+class PersistentAlgorithmBase {
+ Graph& g_;
+protected:
+
+ PersistentAlgorithmBase(Graph& g) : g_(g) {}
+
+ Graph& g() { return g_; }
+ const Graph& g() const { return g_; }
+public:
+ virtual ~PersistentAlgorithmBase() {}
+ virtual bool Run(bool force_primary_launch = false) = 0;
+};
+
+//todo use add_condition in it_
+template<class Graph, class ElementId, class InterestingElementFinder,
+ class Comparator = std::less<ElementId>>
+class PersistentProcessingAlgorithm : public PersistentAlgorithmBase<Graph> {
+ InterestingElementFinder interest_el_finder_;
+
+ SmartSetIterator<Graph, ElementId, Comparator> it_;
+ //todo remove
+ bool tracking_;
+ size_t total_iteration_estimate_;
+
+ size_t curr_iteration_;
+
+protected:
+
+ virtual bool Process(ElementId el) = 0;
+ virtual bool Proceed(ElementId /*el*/) const { return true; }
+
+ virtual void PrepareIteration(size_t /*it_cnt*/, size_t /*total_it_estimate*/) {}
+
+public:
+
+ PersistentProcessingAlgorithm(Graph& g,
+ const InterestingElementFinder& interest_el_finder,
+ bool canonical_only = false,
+ const Comparator& comp = Comparator(),
+ bool track_changes = true,
+ size_t total_iteration_estimate = -1ul) :
+ PersistentAlgorithmBase<Graph>(g),
+ interest_el_finder_(interest_el_finder),
+ it_(g, true, comp, canonical_only),
+ tracking_(track_changes),
+ total_iteration_estimate_(total_iteration_estimate),
+ curr_iteration_(0) {
+ it_.Detach();
+ }
+
+ bool Run(bool force_primary_launch = false) {
+ bool primary_launch = !tracking_ || (curr_iteration_ == 0) || force_primary_launch ;
+ if (!it_.IsAttached()) {
+ it_.Attach();
+ }
+ if (primary_launch) {
+ it_.clear();
+ TRACE("Primary launch.");
+ TRACE("Start preprocessing");
+ interest_el_finder_.Run(it_);
+ TRACE(it_.size() << " edges to process after preprocessing");
+ } else {
+ TRACE(it_.size() << " edges to process");
+ VERIFY(tracking_);
+ }
+
+ if (curr_iteration_ >= total_iteration_estimate_) {
+ PrepareIteration(total_iteration_estimate_ - 1, total_iteration_estimate_);
+ } else {
+ PrepareIteration(curr_iteration_, total_iteration_estimate_);
+ }
+
+ bool triggered = false;
+ TRACE("Start processing");
+ for (; !it_.IsEnd(); ++it_) {
+ ElementId el = *it_;
+ if (!Proceed(el)) {
+ TRACE("Proceed condition turned false on element " << this->g().str(el));
+ it_.ReleaseCurrent();
+ break;
+ }
+ TRACE("Processing edge " << this->g().str(el));
+ triggered |= Process(el);
+ }
+ TRACE("Finished processing. Triggered = " << triggered);
+ if (!tracking_)
+ it_.Detach();
+
+ curr_iteration_++;
+ return triggered;
+ }
+
+};
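+
+// Sketch of the intended run loop (illustrative; ConcreteAlgorithm stands for any subclass
+// implementing Process()): with track_changes enabled the smart set iterator stays attached
+// between launches, so later Run() calls revisit only elements touched since the previous
+// launch, while a primary (or forced) launch re-runs the interesting-element finder over the
+// whole graph.
+//
+//   ConcreteAlgorithm algo(g, finder, /*canonical_only*/ true);
+//   while (algo.Run()) {
+//       // iterate until no element triggers a change
+//   }
+//   algo.Run(/*force_primary_launch*/ true);  // full rescan, e.g. after external graph edits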
+
+template<class Graph, class InterestingEdgeFinder,
+ class Comparator = std::less<typename Graph::EdgeId>>
+class PersistentEdgeRemovingAlgorithm : public PersistentProcessingAlgorithm<Graph,
+ typename Graph::EdgeId,
+ InterestingEdgeFinder, Comparator> {
+ typedef typename Graph::EdgeId EdgeId;
+ typedef PersistentProcessingAlgorithm<Graph, EdgeId, InterestingEdgeFinder, Comparator> base;
+ EdgeRemover<Graph> edge_remover_;
+public:
+ PersistentEdgeRemovingAlgorithm(Graph& g,
+ const InterestingEdgeFinder& interest_edge_finder,
+ std::function<void(EdgeId)> removal_handler = boost::none,
+ bool canonical_only = false,
+ const Comparator& comp = Comparator(),
+ bool track_changes = true,
+ size_t total_iteration_estimate = -1ul)
+ : base(g, interest_edge_finder,
+ canonical_only, comp, track_changes,
+ total_iteration_estimate),
+ edge_remover_(g, removal_handler) {
+
+ }
+
+protected:
+
+ virtual bool ShouldRemove(EdgeId e) const = 0;
+
+ bool Process(EdgeId e) override {
+ TRACE("Checking edge " << this->g().str(e) << " for the removal condition");
+ if (ShouldRemove(e)) {
+ TRACE("Check passed, removing");
+ edge_remover_.DeleteEdge(e);
+ return true;
+ }
+ TRACE("Check not passed");
+ return false;
+ }
+
+};
+
+template<class Graph, class InterestingEdgeFinder,
+ class Comparator = std::less<typename Graph::EdgeId>>
+class ConditionEdgeRemovingAlgorithm : public PersistentEdgeRemovingAlgorithm<Graph,
+ InterestingEdgeFinder, Comparator> {
+ typedef typename Graph::EdgeId EdgeId;
+ typedef PersistentEdgeRemovingAlgorithm<Graph, InterestingEdgeFinder, Comparator> base;
+ pred::TypedPredicate<EdgeId> remove_condition_;
+protected:
+
+ bool ShouldRemove(EdgeId e) const override {
+ return remove_condition_(e);
+ }
+
+public:
+ ConditionEdgeRemovingAlgorithm(Graph& g,
+ const InterestingEdgeFinder& interest_edge_finder,
+ pred::TypedPredicate<EdgeId> remove_condition,
+ std::function<void(EdgeId)> removal_handler = boost::none,
+ bool canonical_only = false,
+ const Comparator& comp = Comparator(),
+ bool track_changes = true)
+ : base(g, interest_edge_finder,
+ removal_handler,
+ canonical_only, comp, track_changes),
+ remove_condition_(remove_condition) {
+
+ }
+};
+
+template<class Graph, class Comparator = std::less<typename Graph::EdgeId>>
+class ParallelEdgeRemovingAlgorithm : public ConditionEdgeRemovingAlgorithm<Graph,
+ ParallelInterestingElementFinder<Graph>, Comparator> {
+ typedef ConditionEdgeRemovingAlgorithm<Graph,
+ ParallelInterestingElementFinder<Graph>, Comparator> base;
+ typedef typename Graph::EdgeId EdgeId;
+
+public:
+ ParallelEdgeRemovingAlgorithm(Graph& g,
+ pred::TypedPredicate<EdgeId> remove_condition,
+ size_t chunk_cnt,
+ std::function<void(EdgeId)> removal_handler = boost::none,
+ bool canonical_only = false,
+ const Comparator& comp = Comparator(),
+ bool track_changes = true)
+ : base(g,
+ ParallelInterestingElementFinder<Graph>(g, remove_condition, chunk_cnt),
+ remove_condition, removal_handler,
+ canonical_only, comp, track_changes) {
+ }
+
+};
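+
+// Illustrative only: parallel removal of all edges satisfying a predicate. Interesting edges
+// are gathered in parallel over chunk_cnt graph chunks and then removed sequentially; `g`,
+// `is_tip` and the chunk count are placeholders.
+//
+//   pred::TypedPredicate<Graph::EdgeId> is_tip = /* some removal condition */;
+//   ParallelEdgeRemovingAlgorithm<Graph> tip_clipper(g, is_tip, /*chunk_cnt*/ 16);
+//   bool removed_something = tip_clipper.Run();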
+
+}
diff --git a/src/debruijn/path_extend/scaffolder2015/scaff_supplementary.cpp b/src/modules/assembly_graph/graph_support/scaff_supplementary.cpp
similarity index 100%
rename from src/debruijn/path_extend/scaffolder2015/scaff_supplementary.cpp
rename to src/modules/assembly_graph/graph_support/scaff_supplementary.cpp
diff --git a/src/modules/assembly_graph/graph_support/scaff_supplementary.hpp b/src/modules/assembly_graph/graph_support/scaff_supplementary.hpp
new file mode 100644
index 0000000..71522f6
--- /dev/null
+++ b/src/modules/assembly_graph/graph_support/scaff_supplementary.hpp
@@ -0,0 +1,77 @@
+#pragma once
+
+#include "assembly_graph/graph_core/graph.hpp"
+#include "pipeline/graph_pack.hpp"
+#include "dev_support/logger/logger.hpp"
+
+namespace path_extend {
+ typedef debruijn_graph::EdgeId EdgeId;
+
+/* Storage of presumably unique, relatively long edges. Filled by ScaffoldingUniqueEdgeAnalyzer
+ *
+ */
+ class ScaffoldingUniqueEdgeStorage {
+ friend class ScaffoldingUniqueEdgeAnalyzer;
+ private:
+ set <EdgeId> unique_edges_;
+ size_t min_unique_length_;
+ public:
+ ScaffoldingUniqueEdgeStorage(): unique_edges_(), min_unique_length_(0) {
+ DEBUG("storage created, empty");
+ }
+
+ bool IsUnique(EdgeId e) const {
+ return (unique_edges_.find(e) != unique_edges_.end());
+ }
+
+ decltype(unique_edges_.begin()) begin() const {
+ return unique_edges_.begin();
+ }
+
+ decltype(unique_edges_.end()) end() const {
+ return unique_edges_.end();
+ }
+
+ size_t size() const {
+ return unique_edges_.size();
+ }
+ size_t GetMinLength() const {
+ return min_unique_length_;
+ }
+ void SetMinLength(size_t min_length) {
+ min_unique_length_ = min_length;
+ }
+
+ const set<EdgeId>& GetSet() const {
+ return unique_edges_;
+ }
+ protected:
+ DECL_LOGGER("ScaffoldingUniqueEdgeStorage")
+
+ };
+
+/* Auxiliary class required to fill in the unique edge storage.
+ *
+ */
+ class ScaffoldingUniqueEdgeAnalyzer {
+
+
+ private:
+ const debruijn_graph::conj_graph_pack &gp_;
+ size_t length_cutoff_;
+ double median_coverage_;
+ double relative_coverage_variation_;
+ protected:
+ DECL_LOGGER("ScaffoldingUniqueEdgeAnalyzer")
+
+
+ void SetCoverageBasedCutoff();
+ public:
+ ScaffoldingUniqueEdgeAnalyzer(const debruijn_graph::conj_graph_pack &gp, size_t apriori_length_cutoff, double max_relative_coverage)
+ : gp_(gp), length_cutoff_(apriori_length_cutoff), relative_coverage_variation_(max_relative_coverage) {
+ SetCoverageBasedCutoff();
+ }
+ void FillUniqueEdgeStorage(ScaffoldingUniqueEdgeStorage &storage_);
+ };
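+
+// A usage sketch under the assumption of a filled conj_graph_pack `gp` (parameter values are
+// made up): the analyzer derives a coverage-based cutoff from the graph pack and fills the
+// storage, which clients then query via IsUnique().
+//
+//   ScaffoldingUniqueEdgeStorage unique_storage;
+//   ScaffoldingUniqueEdgeAnalyzer analyzer(gp, /*apriori_length_cutoff*/ 2000,
+//                                          /*max_relative_coverage*/ 1.3);
+//   analyzer.FillUniqueEdgeStorage(unique_storage);
+//   // if (unique_storage.IsUnique(e)) { ... }
+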
+}
+
+
diff --git a/src/modules/assembly_graph/handlers/edge_labels_handler.hpp b/src/modules/assembly_graph/handlers/edge_labels_handler.hpp
new file mode 100644
index 0000000..4a8c653
--- /dev/null
+++ b/src/modules/assembly_graph/handlers/edge_labels_handler.hpp
@@ -0,0 +1,226 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+/*
+ *
+ * Stores the labeling of new_graph edges (obtained from the unresolved graph old_graph via various
+ * graph transformations) in terms of old_graph edges. Maintains both the forward mapping
+ * (new edge -> sequence of old edges) and the reverse one (old edge -> set of new edges containing it).
+ *
+ * Created on: Aug 5, 2011
+ * Author: undead
+ */
+
+#ifndef EDGE_LABELS_HANDLER_HPP_
+#define EDGE_LABELS_HANDLER_HPP_
+
+//#include "utils.hpp"
+#include "visualization/graph_labeler.hpp"
+#include "dev_support/simple_tools.hpp"
+#include <unordered_map>
+#include <map>
+
+using namespace omnigraph;
+
+namespace omnigraph {
+using std::map;
+
+//todo ask Shurik to remove new_graph_
+template<class Graph>
+class EdgeLabelHandler : public GraphActionHandler<Graph> {
+ typedef typename Graph::VertexId VertexId;
+ typedef typename Graph::EdgeId EdgeId;
+private:
+ Graph &new_graph_;
+ Graph &old_graph_;
+ //From new edge to sequence of old
+public:
+ map<EdgeId, vector<EdgeId> > edge_labels;
+ //From old edge to set of new ones, containing it.
+ map<EdgeId, set<EdgeId> > edge_inclusions;
+public:
+ //TODO: integrate this to resolver, remove "from_resolve" parameter
+ EdgeLabelHandler(Graph &new_graph, Graph &old_graph,
+ const std::map<EdgeId, EdgeId> &from_resolve)
+ : GraphActionHandler<Graph>(new_graph, "EdgeLabelHandler"),
+ new_graph_(new_graph),
+ old_graph_(old_graph) {
+ // printing from resolve
+ FillLabels(from_resolve);
+ /* for(auto iter = from_resolve.begin(); iter != from_resolve.end(); ++iter) {
+ if (edge_inclusions.find(iter->second) == edge_inclusions.end()){
+ set<EdgeId> tmp;
+ edge_inclusions.insert(make_pair(iter->second, tmp));
+ }
+ edge_inclusions[iter->second].insert(iter->first);
+
+ if (edge_labels.find(iter->first) == edge_labels.end()) {
+ set<EdgeId> tmp;
+ edge_labels.insert(make_pair(iter->first, tmp));
+ }
+ edge_labels[iter->second].push_back(iter->second);
+ }
+ */}
+
+ EdgeLabelHandler(Graph &new_graph, Graph &old_graph)
+ : GraphActionHandler<Graph>(new_graph, "EdgeLabelHandler"),
+ new_graph_(new_graph),
+ old_graph_(old_graph) {
+ }
+
+ void FillLabels(const map<EdgeId, EdgeId> &from_resolve) {
+ for (auto iter = from_resolve.begin(); iter != from_resolve.end();
+ ++iter) {
+ if (edge_inclusions.find(iter->second) == edge_inclusions.end()) {
+ set<EdgeId> tmp;
+ edge_inclusions.insert(make_pair(iter->second, tmp));
+ }
+ edge_inclusions.find(iter->second)->second.insert(iter->first);
+
+ if (edge_labels.find(iter->first) == edge_labels.end()) {
+ vector<EdgeId> tmp;
+ edge_labels.insert(make_pair(iter->first, tmp));
+ }
+ edge_labels[iter->first].push_back(iter->second);
+ }
+ }
+
+ virtual ~EdgeLabelHandler() {
+ }
+
+ virtual void HandleGlue(EdgeId new_edge, EdgeId edge1, EdgeId edge2) {
+ TRACE("Handle glue");
+ if (edge_labels[edge1] != edge_labels[edge2])
+ WARN("gluing two different edges is not a good idea on this step! EdgeLabel Handler can fail on such operation");
+ vector<EdgeId> tmp;
+ for (size_t i = 0; i < edge_labels[edge1].size(); i++) {
+ edge_inclusions.find(edge_labels[edge1][i])->second.insert(new_edge);
+ edge_inclusions.find(edge_labels[edge1][i])->second.erase(edge1);
+ tmp.push_back(edge_labels[edge1][i]);
+ }
+ // erase the label lists only after they have been fully traversed
+ edge_labels.erase(edge1);
+ for (size_t i = 0; i < edge_labels[edge2].size(); i++) {
+ edge_inclusions.find(edge_labels[edge2][i])->second.insert(new_edge);
+ edge_inclusions.find(edge_labels[edge2][i])->second.erase(edge2);
+ }
+ edge_labels.erase(edge2);
+
+ edge_labels.insert(make_pair(new_edge, tmp));
+
+ }
+
+ virtual void HandleSplit(EdgeId /*oldEdge*/, EdgeId /*newEdge1*/, EdgeId /*newEdge2*/) {
+ WARN("EdgesLabelHandler does not support splits");
+ }
+
+ virtual void HandleMerge(const vector<EdgeId> &oldEdges, EdgeId newEdge) {
+ TRACE("HandleMerge by edge labels handler");
+ size_t n = oldEdges.size();
+ vector<EdgeId> tmp;
+ for (size_t j = 0; j < n; j++) {
+ TRACE("Edge " << oldEdges[j] << " was labeled by " << edge_labels[oldEdges[j]]);
+ for (size_t i = 0; i < edge_labels[oldEdges[j]].size(); i++) {
+ edge_inclusions[edge_labels[oldEdges[j]][i]].insert(newEdge);
+ edge_inclusions[edge_labels[oldEdges[j]][i]].erase(oldEdges[j]);
+ tmp.push_back(edge_labels[oldEdges[j]][i]);
+ }
+ edge_labels.erase(oldEdges[j]);
+ }
+ if (edge_labels.find(newEdge) != edge_labels.end()) {
+ DEBUG("Unexpected finding of new edge labels");
+ };
+ edge_labels[newEdge] = tmp;
+
+ }
+
+ /*
+ virtual void HandleAdd(VertexId v) {
+ AddVertexIntId(v);
+ }
+ virtual void HandleDelete(VertexId v) {
+ ClearVertexId(v);
+ }
+ */
+ virtual void HandleAdd(EdgeId e) {
+ TRACE("Add edge " << e);
+
+ }
+
+ virtual void HandleDelete(EdgeId e) {
+ for (size_t i = 0; i < edge_labels[e].size(); i++) {
+ edge_inclusions[edge_labels[e][i]].erase(e);
+ }
+ edge_labels.erase(e);
+ }
+
+ std::string str(EdgeId edgeId) const {
+ std::stringstream ss;
+
+ auto it = edge_labels.find(edgeId);
+ if (it != edge_labels.end()) {
+ TRACE("Number of labels " << it->second.size());
+ for (auto label_it = it->second.begin(), end = it->second.end();
+ label_it != end; ++label_it) {
+ ss << this->g().str(*label_it) << "\\n";
+ }
+ }
+ return ss.str();
+ }
+
+ vector<pair<EdgeId, size_t> > resolvedPositions(EdgeId old_edge, size_t position_on_edge) {
+ vector<pair<EdgeId, size_t> > res;
+ for (auto it = edge_inclusions[old_edge].begin(); it != edge_inclusions[old_edge].end(); it++) {
+ EdgeId cur_edge = *it;
+ size_t cur_shift = 0;
+ for (size_t i = 0; i < edge_labels[cur_edge].size(); i++) {
+ if (edge_labels[cur_edge][i] == old_edge) {
+ res.push_back(make_pair(cur_edge, cur_shift + position_on_edge));
+ }
+ cur_shift += old_graph_.length(edge_labels[cur_edge][i]);
+ }
+ }
+ return res;
+ }
+
+};
+
+template<class Graph>
+class EdgesLabelsGraphLabeler : public GraphLabeler<Graph> {
+
+protected:
+ typedef GraphLabeler<Graph> super;
+ typedef typename super::EdgeId EdgeId;
+ typedef typename super::VertexId VertexId;
+ Graph &g_;
+public:
+ EdgeLabelHandler<Graph> &EdgesLabels;
+
+ EdgesLabelsGraphLabeler(Graph &g, EdgeLabelHandler<Graph> &EdgesLab)
+ : g_(g),
+ EdgesLabels(EdgesLab) {
+ }
+
+ virtual std::string label(VertexId vertexId) const {
+ return g_.str(vertexId);
+ }
+
+ virtual std::string label(EdgeId edgeId) const {
+ return EdgesLabels.str(edgeId) + ": " + g_.str(edgeId);
+ }
+
+ virtual ~EdgesLabelsGraphLabeler() {
+ TRACE("~EdgesPosGraphLabeler");
+ }
+
+};
+}
+
+#endif /* EDGE_LABELS_HANDLER_HPP_ */
diff --git a/src/modules/assembly_graph/handlers/edges_position_handler.hpp b/src/modules/assembly_graph/handlers/edges_position_handler.hpp
new file mode 100644
index 0000000..d3aefdf
--- /dev/null
+++ b/src/modules/assembly_graph/handlers/edges_position_handler.hpp
@@ -0,0 +1,207 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+/*
+ * edges_position_handler.hpp
+ *
+ * Created on: 22.07.2011
+ *
+ */
+
+#ifndef EDGES_POSITION_HANDLER_HPP_
+#define EDGES_POSITION_HANDLER_HPP_
+
+//#include "utils.hpp"
+#include "visualization/graph_labeler.hpp"
+#include "dev_support/simple_tools.hpp"
+#include "assembly_graph/paths/mapping_path.hpp"
+#include "assembly_graph/graph_core/action_handlers.hpp"
+
+namespace omnigraph {
+
+struct EdgePosition {
+ string contigId;
+ MappingRange mr;
+ EdgePosition(string _contigId, MappingRange _mr) : contigId(_contigId), mr(_mr) {
+ }
+
+ EdgePosition() {
+ }
+};
+
+inline ostream& operator <<(ostream& os, const EdgePosition& ep) {
+ return os << ep.contigId << " " << ep.mr;
+}
+
+template<class Graph>
+class EdgesPositionHandler: public GraphActionHandler<Graph> {
+ typedef typename Graph::VertexId VertexId;
+ typedef typename Graph::EdgeId EdgeId;
+
+ size_t max_mapping_gap_;
+ size_t max_gap_diff_;
+ map<EdgeId, map<string, std::set<MappingRange>>> edges_positions_;
+ //TODO extract set<MappingRange> as a storage class
+
+ MappingRange EraseAndExtract(set<MappingRange> &ranges, set<MappingRange>::iterator &position, const MappingRange &new_pos) {
+ auto &old_pos = *position;
+ if(old_pos.IntersectLeftOf(new_pos) || old_pos.StrictlyContinuesWith(new_pos, max_mapping_gap_, max_gap_diff_)) {
+ ranges.erase(position);
+ return old_pos.Merge(new_pos);
+ } else if(new_pos.IntersectLeftOf(old_pos) || new_pos.StrictlyContinuesWith(old_pos, max_mapping_gap_, max_gap_diff_)) {
+ ranges.erase(position);
+ return new_pos.Merge(old_pos);
+ } else {
+ return new_pos;
+ }
+ }
+
+public:
+ MappingRange EraseAndExtract(set<MappingRange> &ranges, MappingRange new_pos) {
+ auto it = ranges.lower_bound(new_pos);
+ if(it != ranges.end()) {
+ new_pos = EraseAndExtract(ranges, it, new_pos);
+ it = ranges.lower_bound(new_pos);
+ }
+ if(it != ranges.begin()) {
+ new_pos = EraseAndExtract(ranges, --it, new_pos);
+ }
+ return new_pos;
+ }
+
+ set<MappingRange> GetEdgePositions(EdgeId edge, string contig_id) const {
+ VERIFY(this->IsAttached());
+ auto edge_it = edges_positions_.find(edge);
+ if(edge_it == edges_positions_.end())
+ return set<MappingRange>();
+ const auto& positions = edge_it->second;
+ auto it = positions.find(contig_id);
+ if(it == positions.end())
+ return set<MappingRange>();
+ else
+ return it->second;
+ }
+
+ vector<EdgePosition> GetEdgePositions(EdgeId edge) const {
+ VERIFY(this->IsAttached());
+ auto edge_it = edges_positions_.find(edge);
+ if(edge_it == edges_positions_.end())
+ return vector<EdgePosition>();
+ vector<EdgePosition> result;
+ for(auto it = edge_it->second.begin(); it != edge_it->second.end(); ++it) {
+ for(auto pos_it = it->second.begin(); pos_it != it->second.end(); ++pos_it) {
+ result.push_back(EdgePosition(it->first, *pos_it));
+ }
+ }
+ return result;
+ }
+
+ void AddEdgePosition(EdgeId edge, string contig_id, size_t start, size_t end, size_t m_start, size_t m_end) {
+ VERIFY(this->IsAttached());
+ AddEdgePosition(edge, contig_id, MappingRange(start, end, m_start, m_end));
+ }
+
+ void AddEdgePosition(EdgeId edge, string contig_id, MappingRange new_pos) {
+ VERIFY(this->IsAttached());
+ if(new_pos.empty())
+ return;
+ set<MappingRange> &new_set = edges_positions_[edge][contig_id];
+ new_pos = EraseAndExtract(new_set, new_pos);
+ new_set.insert(new_pos);
+ }
+
+ void AddAndShiftEdgePositions(EdgeId edge, const map<string, set<MappingRange>> &contig_map, int shift = 0) {
+ VERIFY(this->IsAttached());
+ for(auto contig_it = contig_map.begin(); contig_it != contig_map.end(); ++contig_it) {
+ for(auto it = contig_it->second.begin(); it != contig_it->second.end(); ++it) {
+ AddEdgePosition(edge, contig_it->first, it->Shift(shift).Fit(this->g().length(edge)));
+ }
+ }
+ }
+
+ template<typename Iter>
+ void AddEdgePositions(EdgeId edge, Iter begin, Iter end) {
+ VERIFY(this->IsAttached());
+ for(auto it = begin; it != end; ++it) {
+ AddEdgePosition(edge, it->contigId, it->mr);
+ }
+ }
+
+ std::string str(EdgeId edge) const {
+ VERIFY(this->IsAttached());
+ std::stringstream ss;
+ vector<EdgePosition> positions = GetEdgePositions(edge);
+ for (auto pos_it = positions.begin(), end = positions.end(); pos_it != end; ++pos_it) {
+ ss << "(" << pos_it->contigId << ": " << pos_it->mr << ")\\n";
+ }
+ return ss.str();
+ }
+
+ /**
+ * @param max_mapping_gap - maximal difference in positions of
+ * original sequence for two mapping ranges to be merged.
+ * @param max_gap_diff - maximal difference between gaps in initial and mapped ranges for
+ * mapping ranges to be merged
+ */
+ EdgesPositionHandler(const Graph &g, size_t max_mapping_gap, size_t max_gap_diff = 0) :
+ GraphActionHandler<Graph>(g, "EdgePositionHandler"),
+ max_mapping_gap_(max_mapping_gap),
+ max_gap_diff_(max_gap_diff) {
+ }
+
+ virtual ~EdgesPositionHandler() {
+ TRACE("~EdgePositionHandler ok");
+ }
+
+ virtual void HandleGlue(EdgeId new_edge, EdgeId edge1, EdgeId edge2) {
+// TRACE("Handle glue ");
+ auto positions1 = GetEdgePositions(edge1);
+ auto positions2 = GetEdgePositions(edge2);
+ AddEdgePositions(new_edge, positions1.begin(), positions1.end());
+ AddEdgePositions(new_edge, positions2.begin(), positions2.end());
+ }
+
+ virtual void HandleSplit(EdgeId oldEdge, EdgeId newEdge1, EdgeId newEdge2) {
+ if (oldEdge == this->g().conjugate(oldEdge)) {
+ WARN("EdgesPositionHandler does not support self-conjugate splits");
+ return;
+ }
+ if (edges_positions_.count(oldEdge) != 0) {
+ auto contig_map = edges_positions_[oldEdge];
+ AddAndShiftEdgePositions(newEdge1, contig_map, 0);
+ AddAndShiftEdgePositions(newEdge2, contig_map, -int(this->g().length(newEdge1)));
+ }
+ }
+
+ virtual void HandleMerge(const vector<EdgeId>& oldEdges, EdgeId newEdge) {
+ int shift = 0;
+ for(auto it = oldEdges.begin(); it != oldEdges.end(); ++it) {
+ if (edges_positions_.count(*it) != 0) {
+ AddAndShiftEdgePositions(newEdge, edges_positions_[*it], shift);
+ }
+ shift += int(this->g().length(*it));
+ }
+ }
+
+ virtual void HandleAdd(EdgeId /*e*/) {
+ }
+
+ virtual void HandleDelete(EdgeId e) {
+ edges_positions_.erase(e);
+ }
+
+ void clear() {
+ edges_positions_.clear();
+ }
+
+private:
+ DECL_LOGGER("EdgesPositionHandler");
+};
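+
+// Sketch (coordinates and identifiers are invented): positions added for the same contig are
+// merged by EraseAndExtract when they intersect or continue each other within max_mapping_gap;
+// `g` is a placeholder graph and `e` one of its edges.
+//
+//   EdgesPositionHandler<Graph> positions(g, /*max_mapping_gap*/ 10);
+//   positions.AddEdgePosition(e, "contig_1", /*start*/ 0, /*end*/ 100, /*m_start*/ 0, /*m_end*/ 100);
+//   positions.AddEdgePosition(e, "contig_1", 100, 150, 100, 150);  // merged with the first range
+//   // positions.GetEdgePositions(e, "contig_1") now holds a single merged MappingRange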
+
+}
+
+#endif /* EDGES_POSITION_HANDLER_HPP_ */
diff --git a/src/modules/assembly_graph/handlers/id_track_handler.hpp b/src/modules/assembly_graph/handlers/id_track_handler.hpp
new file mode 100644
index 0000000..7ab0ec8
--- /dev/null
+++ b/src/modules/assembly_graph/handlers/id_track_handler.hpp
@@ -0,0 +1,110 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include <unordered_map>
+//#include "utils.hpp"
+#include "visualization/graph_labeler.hpp"
+#include "dev_support/simple_tools.hpp"
+#include "assembly_graph/graph_core/action_handlers.hpp"
+using namespace omnigraph;
+
+namespace omnigraph {
+template<class Graph>
+class GraphElementFinder : public GraphActionHandler<Graph> {
+private:
+ typedef typename Graph::VertexId VertexId;
+ typedef typename Graph::EdgeId EdgeId;
+ unordered_map<size_t, VertexId> id2vertex_;
+ unordered_map<size_t, EdgeId> id2edge_;
+
+public:
+ GraphElementFinder(const Graph &graph) : GraphActionHandler<Graph>(graph, "Graph element finder") {
+ }
+
+ virtual ~GraphElementFinder() {
+ }
+
+ virtual void HandleAdd(EdgeId e) {
+#pragma omp critical
+ {
+ id2edge_[e.int_id()] = e;
+ }
+ }
+
+ virtual void HandleAdd(VertexId v) {
+#pragma omp critical
+ {
+ id2vertex_[v.int_id()] = v;
+ }
+ }
+
+ virtual void HandleDelete(EdgeId e) {
+ id2edge_[e.int_id()] = e;
+ }
+
+ virtual void HandleDelete(VertexId v) {
+ id2vertex_[v.int_id()] = v;
+ }
+
+ VertexId ReturnVertexId(size_t id) const {
+ auto it = id2vertex_.find(id);
+ if(it == id2vertex_.end())
+ return VertexId();
+ else
+ return it->second;
+ }
+
+ EdgeId ReturnEdgeId(size_t id) const {
+ auto it = id2edge_.find(id);
+ if(it == id2edge_.end())
+ return EdgeId();
+ else
+ return it->second;
+ }
+
+ void Init() {
+ for(auto it = this->g().begin(); it != this->g().end(); ++it) {
+ HandleAdd(*it);
+ for(auto eit = this->g().OutgoingEdges(*it).begin(); eit != this->g().OutgoingEdges(*it).end(); ++eit) {
+ HandleAdd(*eit);
+ }
+ }
+ }
+};
+
+template<class VertexId, class EdgeId>
+class BaseIdTrackHandler {
+public:
+ BaseIdTrackHandler() {
+ }
+
+ size_t ReturnIntId(EdgeId e) const {
+ return e.int_id();
+ }
+
+ size_t ReturnIntId(VertexId v) const {
+ return v.int_id();
+ }
+};
+
+template<class Graph>
+class IdTrackHandler : public BaseIdTrackHandler<typename Graph::VertexId, typename Graph::EdgeId> {
+private:
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+ const Graph &graph_;
+public:
+ IdTrackHandler(const Graph& g) : graph_(g) {
+ }
+
+ ~IdTrackHandler() {
+ }
+};
+
+}
diff --git a/src/modules/assembly_graph/paths/bidirectional_path.cpp b/src/modules/assembly_graph/paths/bidirectional_path.cpp
new file mode 100644
index 0000000..0718c2c
--- /dev/null
+++ b/src/modules/assembly_graph/paths/bidirectional_path.cpp
@@ -0,0 +1,21 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+/*
+ * bidirectional_path.cpp
+ *
+ * Created on: Jun 25, 2015
+ * Author: andrey
+ */
+
+#include "dev_support/standard_base.hpp"
+#include "assembly_graph/paths/bidirectional_path.hpp"
+
+namespace path_extend {
+
+std::atomic<uint64_t> BidirectionalPath::path_id_{0};
+
+}
diff --git a/src/modules/assembly_graph/paths/bidirectional_path.hpp b/src/modules/assembly_graph/paths/bidirectional_path.hpp
new file mode 100644
index 0000000..36e6030
--- /dev/null
+++ b/src/modules/assembly_graph/paths/bidirectional_path.hpp
@@ -0,0 +1,1087 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+/*
+ * bidirectional_path.h
+ *
+ * Created on: Nov 14, 2011
+ * Author: andrey
+ */
+#pragma once
+
+#include <atomic>
+#include "assembly_graph/graph_core/graph.hpp"
+#include "assembly_graph/components/connected_component.hpp"
+
+using debruijn_graph::Graph;
+using debruijn_graph::EdgeId;
+using debruijn_graph::VertexId;
+
+namespace path_extend {
+
+class BidirectionalPath;
+
+struct Gap {
+ int gap_;
+ uint32_t trash_previous_;
+ uint32_t trash_current_;
+ Gap(int gap)
+ : gap_(gap), trash_previous_(0), trash_current_(0)
+ { }
+
+ Gap(int gap, uint32_t trash_previous, uint32_t trash_current)
+ : gap_(gap), trash_previous_(trash_previous), trash_current_(trash_current)
+ { }
+};
+
+
+class PathListener {
+public:
+ virtual void FrontEdgeAdded(EdgeId e, BidirectionalPath * path, Gap gap) = 0;
+ virtual void BackEdgeAdded(EdgeId e, BidirectionalPath * path, Gap gap) = 0;
+ virtual void FrontEdgeRemoved(EdgeId e, BidirectionalPath * path) = 0;
+ virtual void BackEdgeRemoved(EdgeId e, BidirectionalPath * path) = 0;
+ virtual ~PathListener() {
+ }
+};
+
+
+class BidirectionalPath : public PathListener {
+private:
+ static std::atomic<uint64_t> path_id_;
+
+
+public:
+ BidirectionalPath(const Graph& g)
+ : g_(g),
+ data_(),
+ conj_path_(NULL),
+ cumulative_len_(),
+ gap_len_(),
+ listeners_(),
+ id_(path_id_++),
+ weight_(1.0),
+ has_overlaped_begin_(false),
+ has_overlaped_end_(false),
+ overlap_(false) {
+ }
+
+ BidirectionalPath(const Graph& g, const std::vector<EdgeId>& path)
+ : BidirectionalPath(g) {
+ for (size_t i = 0; i < path.size(); ++i) {
+ PushBack(path[i]);
+ }
+ RecountLengths();
+ }
+
+ BidirectionalPath(const Graph& g, EdgeId startingEdge)
+ : BidirectionalPath(g) {
+ PushBack(startingEdge);
+ }
+
+ BidirectionalPath(const BidirectionalPath& path)
+ : g_(path.g_),
+ data_(path.data_),
+ conj_path_(NULL),
+ cumulative_len_(path.cumulative_len_),
+ gap_len_(path.gap_len_),
+ listeners_(),
+ id_(path_id_++),
+ weight_(path.weight_),
+ has_overlaped_begin_(path.has_overlaped_begin_),
+ has_overlaped_end_(path.has_overlaped_end_),
+ overlap_(path.overlap_) {
+ }
+
+public:
+ void Subscribe(PathListener * listener) {
+ listeners_.push_back(listener);
+ }
+
+ void Unsubscribe(PathListener * listener) {
+ for (auto it = listeners_.begin(); it != listeners_.end(); ++it) {
+ if (*it == listener) {
+ listeners_.erase(it);
+ break;
+ }
+ }
+ }
+
+ void SetConjPath(BidirectionalPath* path) {
+ conj_path_ = path;
+ }
+
+ const BidirectionalPath* GetConjPath() const {
+ return conj_path_;
+ }
+
+ BidirectionalPath* GetConjPath() {
+ return conj_path_;
+ }
+
+ void SetWeight(float w) {
+ weight_ = w;
+ }
+
+ double GetWeight() const {
+ return weight_;
+ }
+
+ size_t Size() const {
+ return data_.size();
+ }
+
+ const Graph& graph() const {
+ return g_;
+ }
+
+ bool Empty() const {
+ return data_.empty();
+ }
+
+ size_t Length() const {
+ if (gap_len_.size() == 0 || cumulative_len_.size() == 0) {
+ return 0;
+ }
+ return cumulative_len_[0] + gap_len_[0].gap_;
+ }
+
+ //TODO iterators forward/reverse
+ EdgeId operator[](size_t index) const {
+ return data_[index];
+ }
+
+ EdgeId At(size_t index) const {
+ return data_[index];
+ }
+
+ EdgeId ReverseAt(size_t index) const {
+ return data_[data_.size() - index - 1];
+ }
+
+
+ // Length from beginning of i-th edge to path end for forward directed path: L(e1 + e2 + ... + eN)
+ size_t LengthAt(size_t index) const {
+ return cumulative_len_[index];
+ }
+
+ int GapAt(size_t index) const {
+ return gap_len_[index].gap_;
+ }
+
+ uint32_t TrashCurrentAt(size_t index) const {
+ return gap_len_[index].trash_current_;
+ }
+
+ uint32_t TrashPreviousAt(size_t index) const {
+ return gap_len_[index].trash_previous_;
+ }
+
+ size_t GetId() const {
+ return id_;
+ }
+
+ EdgeId Back() const {
+ return data_.back();
+ }
+
+ EdgeId Front() const {
+ return data_.front();
+ }
+
+ void PushBack(EdgeId e, int gap = 0, uint32_t trash_previous = 0, uint32_t trash_current = 0) {
+ data_.push_back(e);
+ Gap gap_struct(gap, trash_previous, trash_current);
+ gap_len_.push_back(gap_struct);
+ IncreaseLengths(g_.length(e), gap_struct);
+ NotifyBackEdgeAdded(e, gap_struct);
+ }
+
+ void PushBack(EdgeId e, Gap gap) {
+ data_.push_back(e);
+ gap_len_.push_back(gap);
+ IncreaseLengths(g_.length(e), gap);
+ NotifyBackEdgeAdded(e, gap);
+ }
+
+ void PushBack(const BidirectionalPath& path) {
+ for (size_t i = 0; i < path.Size(); ++i) {
+ PushBack(path.At(i), path.GapAt(i), path.TrashPreviousAt(i), path.TrashCurrentAt(i));
+ }
+ }
+
+ void PopBack() {
+ if (data_.empty()) {
+ return;
+ }
+ EdgeId e = data_.back();
+ DecreaseLengths();
+ gap_len_.pop_back();
+ data_.pop_back();
+ NotifyBackEdgeRemoved(e);
+ }
+
+ void PopBack(size_t count) {
+ for (size_t i = 0; i < count; ++i) {
+ PopBack();
+ }
+ }
+
+ void Clear() {
+ while (!Empty()) {
+ PopBack();
+ }
+ }
+
+ virtual void FrontEdgeAdded(EdgeId, BidirectionalPath*, int) {
+ }
+
+ virtual void FrontEdgeAdded(EdgeId, BidirectionalPath*, Gap) {
+ }
+
+
+ virtual void BackEdgeAdded(EdgeId e, BidirectionalPath*, int gap) {
+ PushFront(g_.conjugate(e), gap);
+ }
+
+ virtual void BackEdgeAdded(EdgeId e, BidirectionalPath*, Gap gap) {
+ PushFront(g_.conjugate(e), gap);
+ }
+
+ virtual void FrontEdgeRemoved(EdgeId, BidirectionalPath*) {
+ }
+
+ virtual void BackEdgeRemoved(EdgeId, BidirectionalPath *) {
+ PopFront();
+ }
+
+ int FindFirst(EdgeId e) const {
+ for (size_t i = 0; i < Size(); ++i) {
+ if (data_[i] == e) {
+ return (int) i;
+ }
+ }
+ return -1;
+ }
+
+ int FindLast(EdgeId e) const {
+ for (int i = (int) Size() - 1; i >= 0; --i) {
+ if (data_[i] == e) {
+ return i;
+ }
+ }
+ return -1;
+ }
+
+ bool Contains(EdgeId e) const {
+ return FindFirst(e) != -1;
+ }
+
+ bool Contains(VertexId v) const {
+ for(auto edge : data_) {
+ if(g_.EdgeEnd(edge) == v || g_.EdgeStart(edge) == v ) {
+ return true;
+ }
+ }
+ return false;
+ }
+
+ vector<size_t> FindAll(EdgeId e, size_t start = 0) const {
+ vector<size_t> result;
+ for (size_t i = start; i < Size(); ++i) {
+ if (data_[i] == e) {
+ result.push_back(i);
+ }
+ }
+ return result;
+ }
+
+ bool CompareFrom(size_t from, const BidirectionalPath& sample) const {
+ if (from + sample.Size() > Size()) {
+ return false;
+ }
+
+ for (size_t i = 0; i < sample.Size(); ++i) {
+ if (At(from + i) != sample[i]) {
+ return false;
+ }
+ }
+ return true;
+ }
+
+ size_t CommonEndSize(const BidirectionalPath& p) const {
+ if (p.Size() == 0) {
+ return 0;
+ }
+ std::vector<size_t> begins = FindAll(p.At(0));
+ for (size_t i = 0; i < begins.size(); ++i) {
+ size_t it1 = begins[i];
+ size_t it2 = 0;
+ while (it2 < p.Size() and At(it1) == p.At(it2)) {
+ it1++;
+ it2++;
+ if (it1 == Size()) {
+ return it2;
+ }
+ }
+ }
+ return 0;
+ }
+
+ size_t OverlapEndSize(const BidirectionalPath* path2) const {
+ if (Size() == 0) {
+ return 0;
+ }
+ int last1 = (int) Size() - 1;
+ int max_over = 0;
+ vector<size_t> begins2 = path2->FindAll(At(last1));
+ for (size_t i = 0; i < begins2.size(); ++i) {
+ int begin2 = (int) begins2[i];
+ int cur1 = last1;
+ while (begin2 > 0 && cur1 > 0 && path2->At(begin2 - 1) == At(cur1 - 1)) {
+ cur1--;
+ begin2--;
+ }
+ int over = last1 - cur1 + 1;
+ if (begin2 == 0 && cur1 > 0 && over > max_over) {
+ max_over = over;
+ }
+ }
+ return (size_t) max_over;
+ }
+
+ int FindFirst(const BidirectionalPath& path, size_t from = 0) const {
+ if (path.Size() > Size()) {
+ return -1;
+ }
+ for (size_t i = from; i <= Size() - path.Size(); ++i) {
+ if (CompareFrom(i, path)) {
+ return (int) i;
+ }
+ }
+ return -1;
+ }
+//TODO: Why just naive search?
+ int FindLast(const BidirectionalPath& path) const {
+ if (path.Size() > Size()) {
+ return -1;
+ }
+ for (int i = (int) (Size() - path.Size()); i >= 0; --i) {
+ if (CompareFrom((size_t) i, path)) {
+ return i;
+ }
+ }
+ return -1;
+ }
+
+ bool Contains(const BidirectionalPath& path) const {
+ return FindFirst(path) != -1;
+ }
+
+ bool Equal(const BidirectionalPath& path) const {
+ return operator==(path);
+ }
+
+ bool operator==(const BidirectionalPath& path) const {
+ return Size() == path.Size() && CompareFrom(0, path);
+ }
+
+ bool operator!=(const BidirectionalPath& path) const {
+ return !operator==(path);
+ }
+
+ void CheckConjugateEnd(size_t max_repeat_length) {
+ size_t prev_size = 0;
+ while (prev_size != Size()) {
+ prev_size = Size();
+ FindConjEdges(max_repeat_length);
+ }
+ }
+
+ size_t GetComponent(const debruijn_graph::ConnectedComponentCounter &component_counter) const {
+ std::unordered_map <size_t, size_t> component_sizes;
+ for (size_t i = 0; i < this->Size(); i++) {
+ auto e = this->At(i);
+ size_t comp_id = component_counter.GetComponent(e);
+ if (component_sizes.find(comp_id) == component_sizes.end())
+ component_sizes[comp_id] = 0;
+ component_sizes[comp_id] += g_.length(e);
+ }
+ size_t ans = 0;
+ size_t maxans = 0;
+ for (auto pp: component_sizes) {
+ if (pp.second > maxans) {
+ ans = pp.first;
+ maxans = pp.second;
+ }
+ }
+ return ans;
+ }
+
+ void FindConjEdges(size_t max_repeat_length) {
+ for (size_t begin_pos = 0; begin_pos < Size(); ++begin_pos) {
+ size_t begin = begin_pos;
+ vector<size_t> conj_pos = FindAll(g_.conjugate(At(begin_pos)), begin + 1);
+ for (auto end_pos = conj_pos.rbegin(); end_pos != conj_pos.rend(); ++end_pos) {
+ VERIFY(*end_pos < Size());
+ size_t end = *end_pos;
+ if (end <= begin) {
+ continue;
+ }
+ while (begin < end && At(begin) == g_.conjugate(At(end))) {
+ begin++;
+ end--;
+ }
+ DEBUG("Found palindromic fragment from " << begin_pos << " to " << *end_pos);
+ Print();
+ VERIFY(*end_pos < Size());
+ size_t tail_size = Size() - *end_pos - 1;
+ size_t head_size = begin_pos;
+ size_t palindrom_half_size = begin - begin_pos;
+ size_t head_len = Length() - LengthAt(begin_pos);
+ size_t tail_len = *end_pos < Size() - 1 ? LengthAt(*end_pos + 1) : 0;
+//TODO: this is not true in the case of gaps inside the palindrome (palindrom_len);
+ size_t palindrom_len = (size_t) max((int) LengthAt(begin_pos) - (int) LengthAt(begin), 0);
+ size_t between = (size_t) max(0, (int) LengthAt(begin) - (int) (end < Size() - 1 ? LengthAt(end + 1) : 0));
+ DEBUG("tail len " << tail_len << " head len " << head_len << " palindrom_len "<< palindrom_len << " between " << between);
+ if (palindrom_len <= max_repeat_length) {
+ if (palindrom_len < head_len && palindrom_len < tail_len) {
+ DEBUG("too big head and end");
+ continue;
+ }
+ if (between > palindrom_len) {
+ DEBUG("too big part between");
+ continue;
+ }
+ }
+ bool delete_tail = tail_size < head_size;
+ if (tail_size == head_size) {
+ delete_tail = tail_len < head_len;
+ }
+ if (delete_tail) {
+ PopBack(tail_size + palindrom_half_size);
+ DEBUG("Deleting tail because of palindrom removal");
+ return;
+ } else {
+ GetConjPath()->PopBack(head_size + palindrom_half_size);
+ DEBUG("Deleting head because of palindrom removal");
+ return;
+ }
+ }
+ }
+ }
+
+ BidirectionalPath SubPath(size_t from, size_t to) const {
+ BidirectionalPath result(g_);
+ for (size_t i = from; i < min(to, Size()); ++i) {
+ result.PushBack(data_[i], gap_len_[i]);
+ }
+ return result;
+ }
+
+ BidirectionalPath SubPath(size_t from) const {
+ return SubPath(from, Size());
+ }
+
+ double Coverage() const {
+ double cov = 0.0;
+
+ for (size_t i = 0; i < Size(); ++i) {
+ cov += g_.coverage(data_[i]) * (double) g_.length(data_[i]);
+ }
+ return cov / (double) Length();
+ }
+
+ BidirectionalPath Conjugate() const {
+ BidirectionalPath result(g_);
+ if (Empty()) {
+ return result;
+ }
+ result.PushBack(g_.conjugate(Back()), 0);
+ for (int i = ((int) Size()) - 2; i >= 0; --i) {
+ result.PushBack(g_.conjugate(data_[i]), gap_len_[i + 1].gap_ + gap_len_[i + 1].trash_current_ - gap_len_[i + 1].trash_previous_, gap_len_[i + 1].trash_current_, gap_len_[i + 1].trash_previous_);
+ }
+
+ return result;
+ }
+
+ vector<EdgeId> ToVector() const {
+ return vector<EdgeId>(data_.begin(), data_.end());
+ }
+
+ bool CameToInterstrandBulge() const {
+ if (Empty())
+ return false;
+
+ EdgeId lastEdge = Back();
+ VertexId lastVertex = g_.EdgeEnd(lastEdge);
+
+ if (g_.OutgoingEdgeCount(lastVertex) == 2) {
+ vector<EdgeId> bulgeEdges(g_.out_begin(lastVertex), g_.out_end(lastVertex));
+ VertexId nextVertex = g_.EdgeEnd(bulgeEdges[0]);
+
+ if (bulgeEdges[0] == g_.conjugate(bulgeEdges[1]) && nextVertex == g_.EdgeEnd(bulgeEdges[1]) && g_.CheckUniqueOutgoingEdge(nextVertex)
+ && *(g_.out_begin(nextVertex)) == g_.conjugate(lastEdge)) {
+
+ DEBUG("Came to interstrand bulge " << g_.int_id(lastEdge));
+ return true;
+ }
+ }
+ return false;
+ }
+
+ bool IsInterstrandBulge() const {
+ if (Empty())
+ return false;
+
+ EdgeId lastEdge = Back();
+ VertexId lastVertex = g_.EdgeEnd(lastEdge);
+ VertexId prevVertex = g_.EdgeStart(lastEdge);
+
+ if (g_.OutgoingEdgeCount(prevVertex) == 2 && g_.IncomingEdgeCount(lastVertex) == 2 && g_.CheckUniqueOutgoingEdge(lastVertex)
+ && g_.CheckUniqueIncomingEdge(prevVertex) && *(g_.in_begin(prevVertex)) == g_.conjugate(*(g_.out_begin(lastVertex)))) {
+
+ vector<EdgeId> bulgeEdges(g_.out_begin(prevVertex), g_.out_end(prevVertex));
+ EdgeId bulgeEdge = bulgeEdges[0] == lastEdge ? bulgeEdges[1] : bulgeEdges[0];
+
+ if (bulgeEdge == g_.conjugate(lastEdge)) {
+ DEBUG("In interstrand bulge " << g_.int_id(lastEdge));
+ return true;
+ }
+ }
+ return false;
+ }
+
+ void Print() const {
+ DEBUG("Path " << id_);
+ DEBUG("Length " << Length());
+ DEBUG("Weight " << weight_);
+ DEBUG("#, edge, length, gap length, trash length, total length, total length from begin");
+ for (size_t i = 0; i < Size(); ++i) {
+ DEBUG(i << ", " << g_.int_id(At(i)) << ", "
+ << g_.length(At(i)) << ", " << GapAt(i) << ", "
+ << TrashPreviousAt(i) << "-" << TrashCurrentAt(i)
+ << ", " << LengthAt(i) << ", "
+ << ((Length() < LengthAt(i)) ? 0 : Length() - LengthAt(i)));
+ }
+ }
+
+ void PrintInString() const {
+ stringstream str;
+ for (size_t i = 0; i < Size(); ++i) {
+ str << g_.int_id(At(i)) << " ";
+ }
+ DEBUG(str.str());
+ }
+ void PrintInfo() const {
+ INFO("Path " << id_);
+ INFO("Length " << Length());
+ INFO("Weight " << weight_);
+ INFO("#, edge, length, gap length, total length");
+ for (size_t i = 0; i < Size(); ++i) {
+ INFO(i << ", " << g_.int_id(At(i)) << ", " << g_.length(At(i)) << ", " << GapAt(i) << ", " << LengthAt(i));
+ }
+ }
+
+ void Print(std::ostream& os) {
+ if (Empty()) {
+ return;
+ }
+ os << "Path " << GetId() << endl;
+ os << "Length " << Length() << endl;
+ os << "#, edge, length, gap, total length" << endl;
+ for (size_t i = 0; i < Size(); ++i) {
+ os << i << ", " << g_.int_id(At(i)) << ", " << g_.length(At(i)) << ", " << GapAt(i) << ", " << LengthAt(i) << endl;
+ }
+ }
+
+ void SetOverlapedBeginTo(BidirectionalPath* to) {
+ if (has_overlaped_begin_) {
+ to->SetOverlapBegin();
+ }
+ SetOverlapBegin();
+ to->SetOverlapEnd();
+ }
+
+ void SetOverlapedEndTo(BidirectionalPath* to) {
+ if (has_overlaped_end_) {
+ to->SetOverlapEnd();
+ }
+ SetOverlapEnd();
+ to->SetOverlapBegin();
+ }
+
+ void SetOverlap(bool overlap = true) {
+ overlap_ = overlap;
+ conj_path_->overlap_ = overlap;
+ }
+
+ bool HasOverlapedBegin() const {
+ return has_overlaped_begin_;
+ }
+
+ bool HasOverlapedEnd() const {
+ return has_overlaped_end_;
+ }
+
+ bool IsOverlap() const {
+ return overlap_;
+ }
+
+ void ResetOverlaps() {
+ overlap_ = false;
+ has_overlaped_begin_ = false;
+ has_overlaped_end_ = false;
+ conj_path_->overlap_ = false;
+ conj_path_->has_overlaped_begin_ = false;
+ conj_path_->has_overlaped_end_ = false;
+ }
+private:
+
+ void RecountLengths() {
+ cumulative_len_.clear();
+ size_t currentLength = 0;
+ for (auto iter = data_.rbegin(); iter != data_.rend(); ++iter) {
+ currentLength += g_.length((EdgeId) *iter);
+ cumulative_len_.push_front(currentLength);
+ }
+ }
+
+ void IncreaseLengths(size_t length, Gap gap_struct) {
+ for (auto iter = cumulative_len_.begin(); iter != cumulative_len_.end(); ++iter) {
+ *iter += length + gap_struct.gap_ - gap_struct.trash_previous_;
+ }
+ cumulative_len_.push_back(length);
+ }
+
+ void DecreaseLengths() {
+ size_t length = g_.length(data_.back()) + gap_len_.back().gap_ - gap_len_.back().trash_previous_;
+
+ for (auto iter = cumulative_len_.begin(); iter != cumulative_len_.end(); ++iter) {
+ *iter -= length;
+ }
+ cumulative_len_.pop_back();
+ }
+
+ void NotifyFrontEdgeAdded(EdgeId e, int gap) {
+ for (auto i = listeners_.begin(); i != listeners_.end(); ++i) {
+ (*i)->FrontEdgeAdded(e, this, gap);
+ }
+ }
+
+ void NotifyFrontEdgeAdded(EdgeId e, Gap gap) {
+ for (auto i = listeners_.begin(); i != listeners_.end(); ++i) {
+ (*i)->FrontEdgeAdded(e, this, gap);
+ }
+ }
+
+ void NotifyBackEdgeAdded(EdgeId e, int gap) {
+ for (auto i = listeners_.begin(); i != listeners_.end(); ++i) {
+ (*i)->BackEdgeAdded(e, this, gap);
+ }
+ }
+
+ void NotifyBackEdgeAdded(EdgeId e, Gap gap) {
+ for (auto i = listeners_.begin(); i != listeners_.end(); ++i) {
+ (*i)->BackEdgeAdded(e, this, gap);
+ }
+ }
+
+ void NotifyFrontEdgeRemoved(EdgeId e) {
+ for (auto i = listeners_.begin(); i != listeners_.end(); ++i) {
+ (*i)->FrontEdgeRemoved(e, this);
+ }
+ }
+
+ void NotifyBackEdgeRemoved(EdgeId e) {
+ for (auto i = listeners_.begin(); i != listeners_.end(); ++i) {
+ (*i)->BackEdgeRemoved(e, this);
+ }
+ }
+
+ void PushFront(EdgeId e, Gap gap) {
+ PushFront(e, gap.gap_ + gap.trash_current_ - gap.trash_previous_, gap.trash_current_, gap.trash_previous_);
+ }
+
+ void PushFront(EdgeId e, int gap = 0, uint32_t trash_previous = 0, uint32_t trash_current = 0) {
+ data_.push_front(e);
+ if (gap_len_.size() > 0) {
+ gap_len_[0].gap_ += gap;
+ gap_len_[0].trash_previous_ += trash_previous;
+ gap_len_[0].trash_current_ += trash_current;
+ }
+ gap_len_.push_front(Gap(0, 0, 0));
+
+ int length = (int) g_.length(e);
+ if (cumulative_len_.empty()) {
+ cumulative_len_.push_front(length);
+ } else {
+ cumulative_len_.push_front(length + cumulative_len_.front() + gap - trash_previous );
+ }
+ NotifyFrontEdgeAdded(e, gap);
+ }
+
+ void PopFront() {
+ EdgeId e = data_.front();
+ if (gap_len_.size() > 1) {
+ gap_len_[1].gap_ = 0;
+ gap_len_[1].trash_previous_ = 0;
+ gap_len_[1].trash_current_ = 0;
+ }
+ data_.pop_front();
+ gap_len_.pop_front();
+
+ cumulative_len_.pop_front();
+ NotifyFrontEdgeRemoved(e);
+ }
+
+ void SetOverlapBegin(bool overlap = true) {
+ if (has_overlaped_begin_ != overlap) {
+ has_overlaped_begin_ = overlap;
+ }
+ if (GetConjPath()->has_overlaped_end_ != overlap) {
+ GetConjPath()->has_overlaped_end_ = overlap;
+ }
+ }
+
+ void SetOverlapEnd(bool overlap = true) {
+ GetConjPath()->SetOverlapBegin(overlap);
+ }
+
+ const Graph& g_;
+ std::deque<EdgeId> data_;
+ BidirectionalPath* conj_path_;
+ std::deque<size_t> cumulative_len_; // Length from beginning of i-th edge to path end for forward directed path: L(e1 + e2 + ... + eN) ... L(eN)
+ std::deque<Gap> gap_len_; // e1 - gap2 - e2 - ... - gapN - eN
+ std::vector<PathListener *> listeners_;
+ const uint64_t id_; //Unique ID
+ float weight_;
+ bool has_overlaped_begin_;
+ bool has_overlaped_end_;
+ bool overlap_;
+ DECL_LOGGER("BidirectionalPath");
+};
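+
+// A small illustration (placeholders g, e1, e2): building a path edge by edge and querying its
+// cumulative lengths; subscribed listeners, e.g. the conjugate path registered through
+// PathContainer::AddPair, are notified on every push/pop.
+//
+//   BidirectionalPath path(g, e1);          // path = e1
+//   path.PushBack(e2, /*gap*/ 0);           // path = e1 - e2
+//   size_t total = path.Length();           // g.length(e1) + g.length(e2)
+//   size_t tail  = path.LengthAt(1);        // length from the start of e2 to the path end
+//   path.PopBack();                         // back to just e1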
+
+inline int SkipOneGap(EdgeId end, const BidirectionalPath& path, int gap, int pos, bool forward) {
+ size_t len = 0;
+ while (pos < (int) path.Size() && pos >= 0 && end != path.At(pos) && (int) len < 2 * gap) {
+ len += path.graph().length(path.At(pos));
+ forward ? pos++ : pos--;
+ }
+ if (pos < (int) path.Size() && pos >= 0 && end == path.At(pos)) {
+ return pos;
+ }
+ return -1;
+}
+
+inline void SkipGaps(const BidirectionalPath& path1, size_t& cur_pos1, int gap1, const BidirectionalPath& path2, size_t& cur_pos2, int gap2, bool use_gaps,
+ bool forward) {
+ if (use_gaps) {
+ if (gap1 > 0 && gap2 <= 0) {
+ int temp2 = SkipOneGap(path1.At(cur_pos1), path2, gap1, (int) cur_pos2, forward);
+ if (temp2 >= 0) {
+ cur_pos2 = (size_t) temp2;
+ }
+ } else if (gap2 > 0 && gap1 <= 0) {
+ int temp1 = SkipOneGap(path2.At(cur_pos2), path1, gap2, (int) cur_pos1, forward);
+ if (temp1 >= 0) {
+ cur_pos1 = (size_t) temp1;
+ }
+ } else if (gap1 > 0 && gap2 > 0 && gap1 != gap2) {
+ DEBUG("not equal gaps in two paths!!!");
+ }
+ }
+}
+
+inline size_t FirstNotEqualPosition(const BidirectionalPath& path1, size_t pos1, const BidirectionalPath& path2, size_t pos2, bool use_gaps) {
+ int cur_pos1 = (int) pos1;
+ int cur_pos2 = (int) pos2;
+ int gap1 = path1.GapAt(cur_pos1);
+ int gap2 = path2.GapAt(cur_pos2);
+ while (cur_pos1 >= 0 && cur_pos2 >= 0) {
+ if (path1.At(cur_pos1) == path2.At(cur_pos2)) {
+ cur_pos1--;
+ cur_pos2--;
+ } else {
+ DEBUG("Not Equal at " << cur_pos1 << " and " << cur_pos2);
+ return cur_pos1;
+ }
+ if (cur_pos1 >= 0 && cur_pos2 >= 0) {
+ size_t p1 = (size_t) cur_pos1;
+ size_t p2 = (size_t) cur_pos2;
+ SkipGaps(path1, p1, gap1, path2, p2, gap2, use_gaps, false);
+ cur_pos1 = (int) p1;
+ cur_pos2 = (int) p2;
+ gap1 = path1.GapAt(cur_pos1);
+ gap2 = path2.GapAt(cur_pos2);
+ }
+ }
+ DEBUG("Equal!!");
+ return -1UL;
+}
+inline bool EqualBegins(const BidirectionalPath& path1, size_t pos1, const BidirectionalPath& path2, size_t pos2, bool use_gaps) {
+ DEBUG("Checking for equal begins");
+ return FirstNotEqualPosition(path1, pos1, path2, pos2, use_gaps) == -1UL;
+}
+
+inline size_t LastNotEqualPosition(const BidirectionalPath& path1, size_t pos1, const BidirectionalPath& path2, size_t pos2, bool use_gaps) {
+ size_t cur_pos1 = pos1;
+ size_t cur_pos2 = pos2;
+ while (cur_pos1 < path1.Size() && cur_pos2 < path2.Size()) {
+ if (path1.At(cur_pos1) == path2.At(cur_pos2)) {
+ cur_pos1++;
+ cur_pos2++;
+ } else {
+ return cur_pos1;
+ }
+ int gap1 = cur_pos1 < path1.Size() ? path1.GapAt(cur_pos1) : 0;
+ int gap2 = cur_pos2 < path2.Size() ? path2.GapAt(cur_pos2) : 0;
+ SkipGaps(path1, cur_pos1, gap1, path2, cur_pos2, gap2, use_gaps, true);
+ }
+ return -1UL;
+}
+
+inline bool EqualEnds(const BidirectionalPath& path1, size_t pos1, const BidirectionalPath& path2, size_t pos2, bool use_gaps) {
+ return LastNotEqualPosition(path1, pos1, path2, pos2, use_gaps) == -1UL;
+}
+
+inline bool PathIdCompare(const BidirectionalPath* p1, const BidirectionalPath* p2) {
+ return p1->GetId() < p2->GetId();
+}
+
+
+
+typedef std::pair<BidirectionalPath*, BidirectionalPath*> PathPair;
+
+inline bool compare_path_pairs(const PathPair& p1, const PathPair& p2) {
+ if (p1.first->Length() != p2.first->Length() || p1.first->Size() == 0 || p2.first->Size() == 0) {
+ return p1.first->Length() > p2.first->Length();
+ }
+ const Graph& g = p1.first->graph();
+ return g.int_id(p1.first->Front()) < g.int_id(p2.first->Front());
+}
+
+class PathComparator {
+public:
+ bool operator()(const BidirectionalPath& p1, const BidirectionalPath& p2) const {
+ return p1.GetId() < p2.GetId();
+ }
+
+ bool operator()(const BidirectionalPath* p1, const BidirectionalPath* p2) const {
+ return p1->GetId() < p2->GetId();
+ }
+};
+
+typedef set<BidirectionalPath*, PathComparator> BidirectionalPathSet;
+
+template<class Value>
+using BidirectionalPathMap = map<BidirectionalPath*, Value, PathComparator>;
+
+typedef std::multiset <BidirectionalPath *, PathComparator> BidirectionalPathMultiset;
+
+class PathContainer {
+
+public:
+
+ typedef std::vector<PathPair> PathContainerT;
+
+ class Iterator : public PathContainerT::iterator {
+ public:
+ Iterator(const PathContainerT::iterator& iter)
+ : PathContainerT::iterator(iter) {
+ }
+ BidirectionalPath* get() const {
+ return this->operator *().first;
+ }
+ BidirectionalPath* getConjugate() const {
+ return this->operator *().second;
+ }
+ };
+
+ class ConstIterator : public PathContainerT::const_iterator {
+ public:
+ ConstIterator(const PathContainerT::const_iterator& iter)
+ : PathContainerT::const_iterator(iter) {
+ }
+ BidirectionalPath* get() const {
+ return this->operator *().first;
+ }
+ BidirectionalPath* getConjugate() const {
+ return this->operator *().second;
+ }
+ };
+
+ PathContainer() {
+ }
+
+ BidirectionalPath& operator[](size_t index) const {
+ return *(data_[index].first);
+ }
+
+ BidirectionalPath* Get(size_t index) const {
+ return data_[index].first;
+ }
+
+ BidirectionalPath* GetConjugate(size_t index) const {
+ return data_[index].second;
+ }
+
+ void DeleteAllPaths() {
+ for (size_t i = 0; i < data_.size(); ++i) {
+ delete data_[i].first;
+ delete data_[i].second;
+ }
+ clear();
+ }
+
+ size_t size() const {
+ return data_.size();
+ }
+
+ void clear() {
+ data_.clear();
+ }
+
+ void reserve(size_t size) {
+ data_.reserve(size);
+ }
+
+ bool AddPair(BidirectionalPath* p, BidirectionalPath* cp) {
+ p->SetConjPath(cp);
+ cp->SetConjPath(p);
+ p->Subscribe(cp);
+ cp->Subscribe(p);
+ data_.push_back(std::make_pair(p, cp));
+ return true;
+ }
+
+ void SortByLength() {
+ std::stable_sort(data_.begin(), data_.end(), compare_path_pairs);
+ }
+
+ Iterator begin() {
+ return Iterator(data_.begin());
+ }
+
+ Iterator end() {
+ return Iterator(data_.end());
+ }
+
+
+ ConstIterator begin() const {
+ return ConstIterator(data_.begin());
+ }
+
+ ConstIterator end() const {
+ return ConstIterator(data_.end());
+ }
+
+ Iterator erase(Iterator iter) {
+ return Iterator(data_.erase(iter));
+ }
+
+ void print() const {
+ for (size_t i = 0; i < size(); ++i) {
+ Get(i)->Print();
+ GetConjugate(i)->Print();
+ }
+ }
+
+ void FilterEmptyPaths() {
+ DEBUG ("try to delete empty paths");
+ for (Iterator iter = begin(); iter != end();) {
+ if (iter.get()->Size() == 0) {
+ // FIXME: This is trash. PathContainer should own paths
+ delete iter.get();
+ delete iter.getConjugate();
+ iter = erase(iter);
+ } else {
+ ++iter;
+ }
+ }
+ DEBUG("empty paths are removed");
+ }
+
+ void FilterInterstandBulges() {
+ DEBUG ("Try to delete paths with interstand bulges");
+ for (Iterator iter = begin(); iter != end(); ++iter) {
+ if (iter.get()->IsInterstrandBulge()) {
+ iter.get()->PopBack();
+ }
+ if (iter.getConjugate()->IsInterstrandBulge()) {
+ iter.getConjugate()->PopBack();
+ }
+ }
+ DEBUG("deleted paths with interstand bulges");
+ }
+
+private:
+ std::vector<PathPair> data_;
+
+protected:
+ DECL_LOGGER("BidirectionalPath");
+
+};
+
+inline pair<size_t, size_t> ComparePaths(size_t start_pos1, size_t start_pos2, const BidirectionalPath& path1, const BidirectionalPath& path2,
+ size_t max_diff) {
+ path1.Print();
+ path2.Print();
+ if (start_pos1 >= path1.Size() || start_pos2 >= path2.Size()) {
+ return make_pair(start_pos1, start_pos2);
+ }
+ const Graph& g = path1.graph();
+ size_t cur_pos = start_pos1;
+ size_t last2 = start_pos2;
+ size_t last1 = cur_pos;
+ cur_pos++;
+ size_t diff_len = 0;
+ while (cur_pos < path1.Size()) {
+ if (diff_len > max_diff) {
+ return make_pair(last1, last2);
+ }
+ EdgeId e = path1[cur_pos];
+ vector<size_t> poses2 = path2.FindAll(e);
+ bool found = false;
+ for (size_t pos2 = 0; pos2 < poses2.size(); ++pos2) {
+ if (poses2[pos2] > last2) {
+ if (path2.LengthAt(last2) - path2.LengthAt(poses2[pos2]) - g.length(path2.At(last2)) - path2.GapAt(poses2[pos2]) > max_diff) {
+ break;
+ }
+ last2 = poses2[pos2];
+ last1 = cur_pos;
+ DEBUG("found " << cur_pos);
+ found = true;
+ break;
+ }
+ }
+ if (!found) {
+ diff_len += g.length(e) + path1.GapAt(cur_pos);
+ DEBUG("not found " << cur_pos << " now diff len " << diff_len);
+ } else {
+ diff_len = 0;
+ }
+ cur_pos++;
+ }
+ return make_pair(last1, last2);
+}
+
+inline void DeletePaths(BidirectionalPathSet& paths) {
+ for (auto i = paths.begin(); i != paths.end(); ++i) {
+ delete (*i);
+ }
+}
+
+inline void DeletePaths(vector<BidirectionalPath*>& paths) {
+ for (auto i = paths.begin(); i != paths.end(); ++i) {
+ delete (*i);
+ }
+}
+
+inline void DeleteMapWithPaths(const map<EdgeId, BidirectionalPath*>& m) {
+ for (auto i = m.begin(); i != m.end(); ++i){
+ delete i->second;
+ }
+}
+
+} // namespace path_extend
+
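For orientation, a minimal usage sketch of the PathContainer API added above; it assumes the enclosing namespace is path_extend and that `path` and `conj` are heap-allocated BidirectionalPath objects constructed elsewhere (their constructors are outside this hunk), so all names here are illustrative only.

// Sketch only: `path` and `conj` are assumed to be new-allocated BidirectionalPath*.
void ContainerSketch(path_extend::BidirectionalPath* path, path_extend::BidirectionalPath* conj) {
    path_extend::PathContainer paths;
    paths.AddPair(path, conj);        // links the pair as conjugates and subscribes them to each other
    paths.SortByLength();             // stable sort using compare_path_pairs
    for (auto it = paths.begin(); it != paths.end(); ++it) {
        it.get()->Print();            // forward path
        it.getConjugate()->Print();   // its reverse-complement twin
    }
    paths.FilterEmptyPaths();         // deletes zero-length pairs and erases them from the container
    paths.DeleteAllPaths();           // frees the remaining paths and clears the container
}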
diff --git a/src/modules/assembly_graph/paths/mapping_path.hpp b/src/modules/assembly_graph/paths/mapping_path.hpp
new file mode 100644
index 0000000..d6cba65
--- /dev/null
+++ b/src/modules/assembly_graph/paths/mapping_path.hpp
@@ -0,0 +1,227 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "dev_support/range.hpp"
+
+namespace omnigraph {
+
+/**
+ * This class represents how a certain sequence is mapped to the genome. Needs further adjustment.
+ */
+template<typename ElementId>
+class Path {
+ std::vector<ElementId> sequence_;
+ size_t start_pos_;
+ size_t end_pos_;
+ public:
+ typedef typename vector<ElementId>::const_iterator iterator;
+
+ Path(const vector<ElementId>& sequence, size_t start_pos, size_t end_pos)
+ : sequence_(sequence), start_pos_(start_pos), end_pos_(end_pos) {
+ }
+
+ Path()
+ : sequence_(),
+ start_pos_(-1ul),
+ end_pos_(-1ul) {
+ }
+
+ size_t start_pos() const { return start_pos_; }
+ size_t end_pos() const { return end_pos_; }
+
+ size_t size() const { return sequence_.size(); }
+
+ const std::vector<ElementId>& sequence() const { return sequence_; }
+ ElementId operator[](size_t index) const { return sequence_[index]; }
+
+ iterator begin() const { return sequence_.begin(); }
+ iterator end() const { return sequence_.end(); }
+};
+
+struct MappingRange {
+ // range on the genome/contig (what the sequence maps from)
+ Range initial_range;
+ // range on the edge (what the sequence maps to)
+ Range mapped_range;
+
+ MappingRange() {
+ }
+
+ MappingRange(Range initial_range, Range mapped_range)
+ : initial_range(initial_range), mapped_range(mapped_range) {}
+
+ MappingRange(size_t i_start, size_t i_end, size_t m_start, size_t m_end)
+ : initial_range(i_start, i_end), mapped_range(m_start, m_end) {}
+
+ MappingRange Merge(const MappingRange &other) const {
+ return MappingRange(initial_range.Merge(other.initial_range), mapped_range.Merge(other.mapped_range));
+ }
+
+ MappingRange ShiftInitial(int shift) const {
+ MappingRange result(*this);
+ result.initial_range.shift(shift);
+ return result;
+ }
+
+ MappingRange Shift(int shift) const {
+ VERIFY(initial_range.end_pos >= initial_range.start_pos);
+ if(empty())
+ return MappingRange();
+ MappingRange result(*this);
+ if(int(result.mapped_range.end_pos) <= -shift)
+ return MappingRange();
+ result.mapped_range.end_pos += shift;
+ if(int(result.mapped_range.start_pos) <= -shift) {
+ result.initial_range.start_pos -= result.mapped_range.start_pos + shift;
+ if(result.initial_range.start_pos >= result.initial_range.end_pos)
+ result.initial_range.start_pos = result.initial_range.end_pos - 1;
+ result.mapped_range.start_pos = 0;
+ } else {
+ result.mapped_range.start_pos += shift;
+ }
+ return result;
+ }
+
+ MappingRange Fit(size_t length) const {
+ VERIFY(initial_range.end_pos >= initial_range.start_pos);
+ if(empty())
+ return MappingRange();
+ MappingRange result(*this);
+ if(result.mapped_range.start_pos >= length)
+ return MappingRange();
+ if(result.mapped_range.end_pos >= length) {
+ if(result.initial_range.end_pos + length < result.mapped_range.end_pos)
+ return MappingRange();
+ result.initial_range.end_pos -= result.mapped_range.end_pos - length;
+ result.mapped_range.end_pos = length;
+ }
+ return result;
+ }
+
+ bool empty() const {
+ return initial_range.empty() || mapped_range.empty();
+ }
+
+ bool operator<(const MappingRange &other) const {
+ if(this->initial_range != other.initial_range)
+ return this->initial_range < other.initial_range;
+ return this->mapped_range < other.mapped_range;
+ }
+ MappingRange& operator=(const MappingRange& other) {
+ initial_range = other.initial_range;
+ mapped_range = other.mapped_range;
+ return *this;
+ }
+
+ bool Intersect(const MappingRange &other) {
+ return initial_range.Intersect(other.initial_range) && mapped_range.Intersect(other.mapped_range);
+ }
+
+ bool IntersectLeftOf(const MappingRange &other) const {
+ return initial_range.IntersectLeftOf(other.initial_range) && mapped_range.IntersectLeftOf(other.mapped_range);
+ }
+
+ bool StrictlyContinuesWith(const MappingRange &other, size_t max_gap, size_t gap_diff = 0) const {
+ return this->initial_range.end_pos <= other.initial_range.start_pos
+ && this->mapped_range.end_pos <= other.mapped_range.start_pos
+ && other.initial_range.start_pos - this->initial_range.end_pos
+ <= other.mapped_range.start_pos - this->mapped_range.end_pos + gap_diff
+ && other.mapped_range.start_pos - this->mapped_range.end_pos
+ <= other.initial_range.start_pos - this->initial_range.end_pos + gap_diff
+ && other.initial_range.start_pos - this->initial_range.end_pos <= max_gap;
+ }
+
+ bool operator==(const MappingRange &that) const {
+ return initial_range == that.initial_range && mapped_range == that.mapped_range;
+ }
+
+ bool operator!=(const MappingRange &that) const {
+ return !(*this == that);
+ }
+
+};
+
+inline std::ostream& operator<<(std::ostream& os, const MappingRange& map_range) {
+ os << map_range.initial_range << " --> " << map_range.mapped_range;
+ return os;
+}
+
+template<typename ElementId>
+class MappingPath {
+ public:
+ MappingPath() {}
+
+ MappingPath(const std::vector<ElementId>& edges,
+ const std::vector<MappingRange>& range_mappings)
+ : edges_(edges),
+ range_mappings_(range_mappings) {}
+
+ size_t size() const { return edges_.size(); }
+
+ std::pair<const ElementId, const MappingRange> operator[](size_t idx) const {
+ return std::make_pair(edges_[idx], range_mappings_[idx]);
+ }
+
+ std::pair<const ElementId, const MappingRange> front() const {
+ return std::make_pair(edges_.front(), range_mappings_.front());
+ }
+
+ std::pair<const ElementId, const MappingRange> back() const {
+ return std::make_pair(edges_.back(), range_mappings_.back());
+ }
+
+ size_t start_pos() const {
+ return range_mappings_.front().mapped_range.start_pos;
+ }
+
+ size_t end_pos() const {
+ return range_mappings_.back().mapped_range.end_pos;
+ }
+
+ Path<ElementId> path() const {
+ if (edges_.size() != 0)
+ return Path<ElementId>(edges_,
+ range_mappings_[0].mapped_range.start_pos,
+ range_mappings_[range_mappings_.size() - 1].mapped_range.end_pos);
+ else
+ return Path<ElementId>();
+ }
+
+ const std::vector<ElementId>& simple_path() const {
+ return edges_;
+ }
+
+ void join(const MappingPath<ElementId>& that, int pos_shift = 0) {
+ for (size_t i = 0; i < that.size(); ++i) {
+ edges_.push_back(that.edges_[i]);
+ range_mappings_.push_back(that.range_mappings_[i].ShiftInitial(pos_shift));
+ }
+ }
+
+ void push_back(ElementId id, MappingRange range) {
+ edges_.push_back(id);
+ range_mappings_.push_back(range);
+ }
+
+ private:
+ std::vector<ElementId> edges_;
+ std::vector<MappingRange> range_mappings_;
+};
+
+template <typename ElementId>
+inline std::ostream& operator<<(std::ostream& os, const MappingPath<ElementId>& mp) {
+ os << "MappingPath ( ";
+ for(size_t i = 0; i < mp.size(); i++) {
+ os << mp[i] << " ";
+ }
+ os << " )";
+ return os;
+}
+
+}
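A small sketch of how MappingRange and MappingPath compose; EdgeId stands in for the graph's edge handle type and the coordinates are made-up values for illustration.

// Illustrative sketch: EdgeId is a placeholder for the real edge handle type.
template<class EdgeId>
omnigraph::Path<EdgeId> BuildToyMapping(EdgeId e1, EdgeId e2) {
    omnigraph::MappingPath<EdgeId> mp;
    mp.push_back(e1, omnigraph::MappingRange(0, 100, 0, 100));   // genome [0, 100) -> edge [0, 100)
    mp.push_back(e2, omnigraph::MappingRange(100, 150, 0, 50));  // genome [100, 150) -> edge [0, 50)
    // path() keeps the edge sequence plus the first/last mapped offsets (0 and 50 here)
    return mp.path();
}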
diff --git a/src/modules/assembly_graph/paths/path_finders.hpp b/src/modules/assembly_graph/paths/path_finders.hpp
new file mode 100644
index 0000000..40f5add
--- /dev/null
+++ b/src/modules/assembly_graph/paths/path_finders.hpp
@@ -0,0 +1,124 @@
+#pragma once
+
+#include "assembly_graph/graph_core/directions.hpp"
+
+namespace omnigraph {
+template<class Graph>
+class UniquePathFinder {
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+
+ const Graph& graph_;
+public:
+ //todo use length bound if needed
+ UniquePathFinder(const Graph& graph, size_t /*length_bound*/ =
+ std::numeric_limits<size_t>::max())
+ : graph_(graph) {}
+
+ std::vector<EdgeId> operator()(EdgeId e,
+ const AbstractDirection<Graph> &direction) const {
+ std::vector<EdgeId> answer;
+ EdgeId curr = e;
+ answer.push_back(curr);
+ std::set<EdgeId> was;
+ while (direction.CheckUniqueOutgoingEdge(direction.EdgeEnd(curr))) {
+ curr = direction.GetUniqueOutgoingEdge(direction.EdgeEnd(curr));
+ if (was.count(curr) > 0)
+ break;
+ was.insert(curr);
+ answer.push_back(curr);
+ }
+ return answer;
+ }
+
+ std::vector<EdgeId> UniquePathForward(EdgeId e) const {
+ return this->operator()(e, ForwardDirection<Graph>(graph_));
+ }
+
+ std::vector<EdgeId> UniquePathBackward(EdgeId e) const {
+ auto tmp = this->operator()(e, BackwardDirection<Graph>(graph_));
+ return std::vector<EdgeId>(tmp.rbegin(), tmp.rend());
+ }
+
+};
+
+template<class Graph>
+class TrivialPathFinder {
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+
+public:
+ TrivialPathFinder(const Graph&, size_t = 0) {}
+
+ std::vector<EdgeId> operator()(EdgeId e, const AbstractDirection<Graph> &) const {
+ return {e};
+ }
+
+};
+
+template<class Graph>
+class PlausiblePathFinder {
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+
+ //todo remove graph_ field???
+ const Graph& graph_;
+ const size_t length_bound_;
+
+ class DFS {
+ private:
+ const Graph &graph_;
+ const AbstractDirection<Graph> &direction_;
+ const size_t length_bound_;
+
+ std::pair<size_t, EdgeId> find(EdgeId edge, size_t length) {
+ length += graph_.length(edge);
+ VertexId cross = direction_.EdgeEnd(edge);
+ auto result = make_pair(length, edge);
+ if (length < length_bound_
+ && direction_.CheckUniqueIncomingEdge(cross)) {
+ std::vector<EdgeId> outgoing = direction_.OutgoingEdges(cross);
+ for (auto it = outgoing.begin(); it != outgoing.end(); ++it) {
+ auto candidate = find(*it, length);
+ if (candidate.first > result.first)
+ result = candidate;
+ }
+ }
+ return result;
+ }
+
+ std::vector<EdgeId> RestoreAnswer(EdgeId start, EdgeId end) {
+ std::vector<EdgeId> result;
+ while (end != start) {
+ result.push_back(end);
+ end = direction_.GetUniqueIncomingEdge(direction_.EdgeStart(end));
+ }
+ result.push_back(start);
+ return std::vector<EdgeId>(result.rbegin(), result.rend());
+ }
+
+ public:
+ DFS(const Graph &graph, const AbstractDirection<Graph> &direction,
+ size_t length_bound)
+ : graph_(graph),
+ direction_(direction),
+ length_bound_(length_bound) {
+ }
+
+ std::vector<EdgeId> find(EdgeId edge) {
+ return RestoreAnswer(edge, find(edge, 0).second);
+ }
+ };
+
+public:
+ PlausiblePathFinder(const Graph& graph, size_t length_bound)
+ : graph_(graph),
+ length_bound_(length_bound) {}
+
+ std::vector<EdgeId> operator()(EdgeId e,
+ const AbstractDirection<Graph> &direction) const {
+ return DFS(graph_, direction, length_bound_).find(e);
+ }
+
+};
+}
\ No newline at end of file
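A hedged sketch of how the two finders above are typically invoked; Graph, g, e and the length bound are assumptions standing in for a concrete assembly graph, one of its edges and a caller-chosen limit.

// Sketch under assumptions: Graph is the assembly graph type, g an instance, e one of its edges.
template<class Graph>
void FinderSketch(const Graph& g, typename Graph::EdgeId e) {
    omnigraph::UniquePathFinder<Graph> unique_finder(g);
    auto fwd = unique_finder.UniquePathForward(e);    // extend right while the outgoing edge is unique
    auto bwd = unique_finder.UniquePathBackward(e);   // same to the left, returned left-to-right

    omnigraph::PlausiblePathFinder<Graph> plausible_finder(g, /*length_bound*/ 1000);
    auto longest = plausible_finder(e, omnigraph::ForwardDirection<Graph>(g));  // longest bounded DFS extension
    (void) fwd; (void) bwd; (void) longest;
}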
diff --git a/src/modules/assembly_graph/paths/path_processor.hpp b/src/modules/assembly_graph/paths/path_processor.hpp
new file mode 100644
index 0000000..5f3d3b6
--- /dev/null
+++ b/src/modules/assembly_graph/paths/path_processor.hpp
@@ -0,0 +1,441 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "dev_support/standard_base.hpp"
+#include "utils/adt/bag.hpp"
+#include "algorithms/dijkstra/dijkstra_helper.hpp"
+
+namespace omnigraph {
+
+template<class Graph>
+const string PrintPath(const Graph& g, const vector<typename Graph::EdgeId>& edges) {
+ string delim = "";
+ std::stringstream ss;
+ for (size_t i = 0; i < edges.size(); ++i) {
+ ss << delim << g.str(edges[i]);
+ delim = " -> ";
+ }
+ return ss.str();
+}
+
+
+template<class Graph>
+class PathProcessor {
+
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+ typedef vector<EdgeId> Path;
+ typedef typename DijkstraHelper<Graph>::BoundedDijkstra DijkstraT;
+public:
+ class Callback {
+
+ public:
+ virtual ~Callback() {
+ }
+
+ virtual void Flush() {
+ }
+
+ virtual void HandleReversedPath(const vector<EdgeId>& reversed_path) = 0;
+
+
+ protected:
+ Path ReversePath(const Path& path) const {
+ Path result;
+ for (auto I = path.rbegin(), E = path.rend(); I != E; ++I)
+ result.push_back(*I);
+ return result;
+ }
+ };
+
+private:
+
+ class Traversal {
+ const PathProcessor& outer_;
+ VertexId end_;
+ size_t min_len_;
+ size_t max_len_;
+ Callback& callback_;
+ size_t edge_depth_bound_;
+
+ size_t curr_len_;
+ size_t curr_depth_;
+ size_t call_cnt_;
+ Path reversed_edge_path_;
+ bag<VertexId> vertex_cnts_;
+
+ const Graph& g_;
+ const DijkstraT& dijkstra_;
+
+ void Push(EdgeId e, VertexId start_v) {
+ TRACE("Pushing edge " << g_.str(e));
+ curr_len_ += g_.length(e);
+ curr_depth_++;
+ reversed_edge_path_.push_back(e);
+ vertex_cnts_.put(start_v);
+ }
+
+ void Pop() {
+ VERIFY(!reversed_edge_path_.empty());
+ EdgeId e = reversed_edge_path_.back();
+ size_t len = g_.length(e);
+ VERIFY(curr_len_ >= len);
+
+ TRACE("Popping edge " << g_.str(e));
+ vertex_cnts_.take(g_.EdgeStart(e));
+ reversed_edge_path_.pop_back();
+ curr_len_ -= len;
+ curr_depth_--;
+ }
+
+ bool CanGo(EdgeId e, VertexId start_v) {
+ if (!dijkstra_.DistanceCounted(start_v))
+ return false;
+ if (dijkstra_.GetDistance(start_v) + g_.length(e) + curr_len_ > max_len_)
+ return false;
+ if (curr_depth_ >= edge_depth_bound_)
+ return false;
+ if (vertex_cnts_.mult(start_v) >= PathProcessor::MAX_VERTEX_USAGE)
+ return false;
+ return true;
+ }
+
+ bool Go(VertexId v, const size_t min_len) {
+ TRACE("Got to vertex " << g_.str(v));
+ if (++call_cnt_ >= PathProcessor::MAX_CALL_CNT) {
+ TRACE("Maximal count " << MAX_CALL_CNT << " of recursive calls was exceeded!");
+ return true;
+ }
+
+ if (v == outer_.start_ && curr_len_ >= min_len) {
+ //TRACE("New path found: " << PrintPath(g_, path_));
+ callback_.HandleReversedPath(reversed_edge_path_);
+ }
+
+ TRACE("Iterating through incoming edges of vertex " << g_.int_id(v))
+ //TODO: doesn`t work with parallel simplification
+ vector<EdgeId> incoming;
+ incoming.reserve(4);
+ std::copy_if(g_.in_begin(v), g_.in_end(v), std::back_inserter(incoming), [&] (EdgeId e) {
+ return dijkstra_.DistanceCounted(g_.EdgeStart(e));
+ });
+
+ std::sort(incoming.begin(), incoming.end(), [&] (EdgeId e1, EdgeId e2) {
+ return dijkstra_.GetDistance(g_.EdgeStart(e1)) < dijkstra_.GetDistance(g_.EdgeStart(e2));
+ });
+
+ for (EdgeId e : incoming) {
+ VertexId start_v = g_.EdgeStart(e);
+ if (CanGo(e, start_v)) {
+ Push(e, start_v);
+ bool exceeded_limits = Go(start_v, min_len);
+ Pop();
+ if (exceeded_limits)
+ return true;
+ }
+ }
+ return false;
+ }
+
+ public:
+ Traversal(const PathProcessor& outer, VertexId end,
+ size_t min_len, size_t max_len,
+ Callback& callback, size_t edge_depth_bound) :
+ outer_(outer), end_(end),
+ min_len_(min_len), max_len_(max_len),
+ callback_(callback),
+ edge_depth_bound_(edge_depth_bound),
+ curr_len_(0), curr_depth_(0), call_cnt_(0),
+ g_(outer.g_),
+ dijkstra_(outer.dijkstra_) {
+ reversed_edge_path_.reserve(PathProcessor::MAX_CALL_CNT);
+ vertex_cnts_.put(end_);
+ }
+
+ //returns true iff limits were exceeded
+ bool Go() {
+ bool code = Go(end_, min_len_);
+ VERIFY(curr_len_ == 0);
+ VERIFY(curr_depth_ == 0);
+ vertex_cnts_.take(end_);
+ VERIFY(vertex_cnts_.size() == 0);
+ return code;
+ }
+ };
+
+ friend class Traversal;
+
+public:
+
+ PathProcessor(const Graph& g, VertexId start, size_t length_bound) :
+ g_(g),
+ start_(start),
+ dijkstra_(DijkstraHelper<Graph>::CreateBoundedDijkstra(g, length_bound, MAX_DIJKSTRA_VERTICES)) {
+ TRACE("Dijkstra launched");
+ dijkstra_.Run(start);
+ TRACE("Dijkstra finished");
+ }
+
+ // DFS from the end vertex towards start_
+ // return code: 0 = ok, 1 = DFS limits exceeded, 2 = Dijkstra vertex limit exceeded, 3 = both
+ int Process(VertexId end, size_t min_len, size_t max_len, Callback& callback, size_t edge_depth_bound = -1ul) const {
+ TRACE("Process launched");
+ int error_code = 0;
+
+ if (dijkstra_.VertexLimitExceeded()) {
+ TRACE("dijkstra : vertex limit exceeded");
+ error_code = 2;
+ }
+
+ TRACE("Start vertex is " << g_.str(start_));
+ TRACE("Bounds are " << min_len << " " << max_len);
+ TRACE("End vertex " << g_.str(end));
+
+ Traversal traversal(*this, end, min_len, max_len, callback, edge_depth_bound);
+ error_code |= int(traversal.Go());
+
+ callback.Flush();
+ TRACE("Process finished with error code " << error_code);
+ return error_code;
+ }
+
+private:
+ static const size_t MAX_CALL_CNT = 3000;
+ static const size_t MAX_DIJKSTRA_VERTICES = 3000;
+ static const size_t MAX_VERTEX_USAGE = 5;
+
+ const Graph& g_;
+ VertexId start_;
+ DijkstraT dijkstra_;
+
+ DECL_LOGGER("PathProcessor")
+};
+
+template<class Graph>
+int ProcessPaths(const Graph& g, size_t min_len, size_t max_len,
+ typename Graph::VertexId start, typename Graph::VertexId end,
+ typename PathProcessor<Graph>::Callback& callback, size_t max_edge_cnt = -1ul) {
+ PathProcessor<Graph> processor(g, start, max_len);
+ return processor.Process(end, min_len, max_len, callback, max_edge_cnt);
+}
+
+template<class Graph>
+class CompositeCallback: public PathProcessor<Graph>::Callback {
+ typedef typename Graph::EdgeId EdgeId;
+ typedef vector<EdgeId> Path;
+
+public:
+ void AddProcessor(typename PathProcessor<Graph>::Callback& processor) {
+ processors_.push_back(&processor);
+ }
+
+ void Flush() override {
+ for (auto it = processors_.begin(); it != processors_.end(); ++it) {
+ (*it)->Flush();
+ }
+ }
+
+ void HandleReversedPath(const Path& path) override {
+ for (auto it = processors_.begin(); it != processors_.end(); ++it) {
+ (*it)->HandleReversedPath(path);
+ }
+ }
+
+private:
+ vector<typename PathProcessor<Graph>::Callback*> processors_;
+};
+
+template<class Graph, class Comparator>
+class BestPathStorage: public PathProcessor<Graph>::Callback {
+ typedef typename Graph::EdgeId EdgeId;
+ typedef vector<EdgeId> Path;
+public:
+ BestPathStorage(const Graph& g, Comparator comparator) :
+ g_(g), cnt_(0), comparator_(comparator) {
+ }
+
+ void HandleReversedPath(const vector<EdgeId>& path) override {
+ cnt_++;
+ if(best_path_.size() == 0 || comparator_(path, best_path_))
+ best_path_ = path;
+ }
+
+ vector<EdgeId> BestPath() const {
+ return best_path_;
+ }
+
+ size_t size() const {
+ return cnt_;
+ }
+
+private:
+ const Graph& g_;
+ size_t cnt_;
+ Comparator comparator_;
+ Path best_path_;
+};
+
+
+template<class Graph>
+class PathStorageCallback: public PathProcessor<Graph>::Callback {
+ typedef typename Graph::EdgeId EdgeId;
+ typedef vector<EdgeId> Path;
+
+public:
+ PathStorageCallback(const Graph& g) :
+ g_(g) {
+ }
+
+ void Flush() override {
+ all_paths_.push_back(cur_paths_);
+ cur_paths_.clear();
+ }
+
+ void HandleReversedPath(const vector<EdgeId>& path) override {
+ cur_paths_.push_back(this->ReversePath(path));
+ }
+
+ size_t size(size_t k = 0) const {
+ return all_paths_[k].size();
+ }
+
+ const vector<Path>& paths(size_t k = 0) const {
+ return all_paths_[k];
+ }
+
+private:
+ const Graph& g_;
+ vector<vector<Path>> all_paths_;
+ vector<Path> cur_paths_;
+};
+
+template<class Graph>
+class NonEmptyPathCounter: public PathProcessor<Graph>::Callback {
+ typedef typename Graph::EdgeId EdgeId;
+ typedef vector<EdgeId> Path;
+
+public:
+ NonEmptyPathCounter(const Graph& g) :
+ g_(g), count_(0) {
+ }
+
+ void Flush() override {
+ all_paths_.push_back(cur_paths_);
+ counts_.push_back(count_);
+ cur_paths_.clear();
+ }
+
+ void HandleReversedPath(const Path& path) override {
+ if (path.size() > 0) {
+ ++count_;
+ cur_paths_.push_back(this->ReversePath(path));
+ }
+ }
+
+ size_t count(size_t k = 0) const {
+ return counts_[k];
+ }
+
+ const vector<Path>& paths(size_t k = 0) const {
+ return all_paths_[k];
+ }
+
+private:
+ const Graph& g_;
+ vector<size_t> counts_;
+ size_t count_;
+ vector<vector<Path> > all_paths_;
+ vector<Path> cur_paths_;
+};
+
+template<class Graph>
+class VertexLabelerCallback: public PathProcessor<Graph>::Callback {
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+ typedef vector<EdgeId> Path;
+
+public:
+ VertexLabelerCallback(const Graph& g) :
+ g_(g), count_(0) {
+ }
+
+ void Flush() override {
+ all_vertices_.push_back(vertices_);
+ vertices_.clear();
+ counts_.push_back(count_);
+ }
+
+ void HandleReversedPath(const Path& path) override {
+ for (auto it = path.rbegin(); it != path.rend(); ++it) {
+ if (path.size() > 0) {
+ vertices_.insert(g_.EdgeStart(*it));
+ vertices_.insert(g_.EdgeEnd(*it));
+ ++count_;
+ }
+ }
+ }
+
+ const set<VertexId>& vertices(size_t k = 0) const {
+ return all_vertices_[k];
+ }
+
+ size_t count(size_t k = 0) const {
+ return counts_[k];
+ }
+
+private:
+ const Graph& g_;
+ vector<size_t> counts_;
+ vector<set<VertexId>> all_vertices_;
+ size_t count_;
+ set<VertexId> vertices_;
+};
+
+template<class Graph>
+class DistancesLengthsCallback: public PathProcessor<Graph>::Callback {
+ typedef typename Graph::EdgeId EdgeId;
+ typedef vector<EdgeId> Path;
+
+public:
+ DistancesLengthsCallback(const Graph& g) :
+ g_(g) {
+ }
+
+ void Flush() override {
+ all_distances_.push_back(distances_);
+ distances_.clear();
+ }
+
+ void HandleReversedPath(const Path& path) override {
+ size_t path_length = PathLength(path);
+ distances_.insert(path_length);
+ }
+
+ vector<size_t> distances(size_t k = 0) const {
+ VERIFY(k < all_distances_.size());
+ const set<size_t>& tmp = all_distances_[k];
+ return vector<size_t>(tmp.begin(), tmp.end());
+ }
+
+private:
+ size_t PathLength(const Path& path) const {
+ size_t res = 0;
+ for (auto I = path.begin(); I != path.end(); ++I)
+ res += g_.length(*I);
+ return res;
+ }
+
+ const Graph& g_;
+ set<size_t> distances_;
+ vector<set<size_t>> all_distances_;
+
+ DECL_LOGGER("DistancesLengthsCallback");
+};
+
+}
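A minimal sketch of the intended call pattern for PathProcessor, mirroring how path_utils.hpp (next file) drives it through the ProcessPaths helper; g, the two vertices and the length bounds are placeholders.

#include <iostream>

// Sketch: collect all paths with length in [min_len, max_len] between two vertices.
template<class Graph>
void EnumeratePathsSketch(const Graph& g, typename Graph::VertexId v_start,
                          typename Graph::VertexId v_end, size_t min_len, size_t max_len) {
    omnigraph::PathStorageCallback<Graph> callback(g);
    int error_code = omnigraph::ProcessPaths(g, min_len, max_len, v_start, v_end, callback);
    if (error_code == 0) {
        // callback.paths() holds edge sequences already un-reversed by HandleReversedPath
        for (const auto& path : callback.paths())
            std::cout << omnigraph::PrintPath(g, path) << std::endl;
    }
}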
diff --git a/src/modules/assembly_graph/paths/path_utils.hpp b/src/modules/assembly_graph/paths/path_utils.hpp
new file mode 100644
index 0000000..212c81c
--- /dev/null
+++ b/src/modules/assembly_graph/paths/path_utils.hpp
@@ -0,0 +1,128 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+/*
+ * path_utils.hpp
+ *
+ */
+
+#pragma once
+
+#include "assembly_graph/paths/path_processor.hpp"
+
+namespace debruijn_graph {
+
+ // TODO: rewrite this function
+ template<class Graph>
+ vector<typename Graph::EdgeId> GetCommonPathsEnd(
+ const Graph& g,
+ typename Graph::EdgeId e1,
+ typename Graph::EdgeId e2,
+ size_t min_dist,
+ size_t max_dist,
+ const omnigraph::PathProcessor<Graph>& path_processor)
+ {
+ typedef typename Graph::EdgeId EdgeId;
+ typedef vector<EdgeId> Path;
+
+ //PathProcessor<Graph> path_processor(g,
+ //min_dist - g.length(e1),
+ //max_dist - g.length(e1),
+ //g.EdgeEnd(e1), g.EdgeStart(e2), callback);
+
+ omnigraph::PathStorageCallback<Graph> callback(g);
+ int error_code = path_processor.Process(g.EdgeStart(e2), min_dist - g.length(e1),
+ max_dist - g.length(e1), callback);
+ vector<Path> paths = callback.paths();
+
+ vector<EdgeId> result;
+ if (error_code != 0) {
+ DEBUG("Edge " << g.int_id(e1) << " path_processor problem")
+ return result;
+ }
+ if (paths.size() == 0)
+ return result;
+ if (paths.size() == 1)
+ return paths[0];
+ size_t j = 0;
+ while (j < paths[0].size()) {
+ for (size_t i = 1; i < paths.size(); ++i) {
+ if (j == paths[i].size()) {
+ vector<EdgeId> result(paths[0].begin()+(paths[0].size() - j), paths[0].end());
+ return result;
+ } else {
+ if (paths[0][paths[0].size()-1-j] != paths[i][paths[i].size()-1-j]) {
+ vector<EdgeId> result(paths[0].begin()+(paths[0].size() - j), paths[0].end());
+ return result;
+ }
+ }
+ }
+ ++j;
+ }
+ return paths[0];
+ }
+
+
+
+ template<class Graph>
+ vector<vector<typename Graph::EdgeId> > GetAllPathsBetweenEdges(
+ const Graph& g,
+ typename Graph::EdgeId& e1,
+ typename Graph::EdgeId& e2, size_t min_dist,
+ size_t max_dist) {
+ omnigraph::PathStorageCallback<Graph> callback(g);
+ ProcessPaths(g,
+ min_dist,
+ max_dist, //0, *cfg::get().ds.IS - K + size_t(*cfg::get().ds.is_var),
+ g.EdgeEnd(e1), g.EdgeStart(e2),
+ callback);
+ auto paths = callback.paths();
+ return paths;
+ }
+
+template<class graph_pack>
+size_t GetAllPathsQuantity(const graph_pack& origin_gp,
+ const typename graph_pack::graph_t::EdgeId& e1,
+ const typename graph_pack::graph_t::EdgeId& e2, double d, double is_var) {
+ omnigraph::PathStorageCallback<typename graph_pack::graph_t> callback(origin_gp.g);
+ size_t min_dist = (size_t) d - origin_gp.g.length(e1) - size_t(is_var);
+ size_t max_dist = (size_t) d - origin_gp.g.length(e1) + size_t(is_var);
+ omnigraph::PathProcessor<typename graph_pack::graph_t>
+ path_processor(origin_gp.g, origin_gp.g.EdgeEnd(e1), max_dist);
+ path_processor.Process(origin_gp.g.EdgeStart(e2), min_dist, max_dist, callback);
+ auto paths = callback.paths();
+ TRACE(e1.int_id() << " " << e2.int_id() << " " << paths.size());
+ return paths.size();
+}
+
+template<class Graph>
+Sequence MergeSequences(const Graph& g,
+ const vector<typename Graph::EdgeId>& continuous_path) {
+ vector<Sequence> path_sequences;
+ path_sequences.push_back(g.EdgeNucls(continuous_path[0]));
+ for (size_t i = 1; i < continuous_path.size(); ++i) {
+ VERIFY(
+ g.EdgeEnd(continuous_path[i - 1])
+ == g.EdgeStart(continuous_path[i]));
+ path_sequences.push_back(g.EdgeNucls(continuous_path[i]));
+ }
+ return MergeOverlappingSequences(path_sequences, g.k());
+}
+
+template<class Graph>
+Sequence PathSequence(const Graph& g, const omnigraph::Path<typename Graph::EdgeId>& path) {
+ Sequence path_sequence = MergeSequences(g, path.sequence());
+ size_t start = path.start_pos();
+ size_t end = path_sequence.size()
+ - g.length(path[path.size() - 1]) + path.end_pos();
+ return path_sequence.Subseq(start, end);
+}
+
+}
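A sketch combining GetAllPathsBetweenEdges and MergeSequences from this file; g, e1, e2 and the distance bounds are assumptions, and Sequence is the nucleotide sequence type used throughout the codebase.

// Sketch: enumerate paths between two edges and glue one of them into a nucleotide sequence.
template<class Graph>
void PathsBetweenEdgesSketch(const Graph& g, typename Graph::EdgeId e1, typename Graph::EdgeId e2) {
    auto paths = debruijn_graph::GetAllPathsBetweenEdges(g, e1, e2, /*min_dist*/ 0, /*max_dist*/ 500);
    if (!paths.empty() && !paths.front().empty()) {
        // MergeSequences stitches consecutive edge sequences with a k-sized overlap
        Sequence s = debruijn_graph::MergeSequences(g, paths.front());
        (void) s;
    }
}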
diff --git a/src/modules/assembly_graph/stats/picture_dump.hpp b/src/modules/assembly_graph/stats/picture_dump.hpp
new file mode 100644
index 0000000..cfaa696
--- /dev/null
+++ b/src/modules/assembly_graph/stats/picture_dump.hpp
@@ -0,0 +1,426 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "statistics.hpp"
+#include "assembly_graph/graph_core/graph.hpp"
+
+#include "pipeline/graph_pack.hpp"
+#include "assembly_graph/graph_alignment/sequence_mapper.hpp"
+#include "pipeline/graphio.hpp"
+//FIXME awful dependency to get write_lib_data
+#include "pipeline/config_struct.hpp"
+#include "visualization/position_filler.hpp"
+
+#include "visualization/visualization.hpp"
+#include "assembly_graph/handlers/edges_position_handler.hpp"
+#include "assembly_graph/components/graph_component.hpp"
+#include "io/reads_io/rc_reader_wrapper.hpp"
+#include "io/reads_io/delegating_reader_wrapper.hpp"
+#include "io/reads_io/io_helper.hpp"
+#include "io/reads_io/wrapper_collection.hpp"
+#include "io/reads_io/osequencestream.hpp"
+#include "io/dataset_support/dataset_readers.hpp"
+#include "dev_support/copy_file.hpp"
+
+#include <boost/algorithm/string.hpp>
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <cmath>
+
+namespace debruijn_graph {
+
+namespace stats {
+
+template<class Graph, class Index>
+MappingPath<typename Graph::EdgeId>
+FindGenomeMappingPath(const Sequence& genome, const Graph& g,
+ const Index& index,
+ const KmerMapper<Graph>& kmer_mapper) {
+ NewExtendedSequenceMapper<Graph, Index> srt(g, index, kmer_mapper);
+ return srt.MapSequence(genome);
+}
+
+template<class graph_pack>
+MappingPath<typename graph_pack::graph_t::EdgeId>
+FindGenomeMappingPath(const Sequence& genome, const graph_pack& gp) {
+ return FindGenomeMappingPath(genome, gp.g, gp.index, gp.kmer_mapper);
+}
+
+template <class graph_pack>
+shared_ptr<omnigraph::visualization::GraphColorer<Graph>> DefaultColorer(const graph_pack& gp) {
+ return omnigraph::visualization::DefaultColorer(gp.g,
+ FindGenomeMappingPath(gp.genome.GetSequence(), gp.g, gp.index, gp.kmer_mapper).path(),
+ FindGenomeMappingPath(!gp.genome.GetSequence(), gp.g, gp.index, gp.kmer_mapper).path());
+}
+
+template <class graph_pack>
+void CollectContigPositions(graph_pack &gp) {
+ if (!cfg::get().pos.contigs_for_threading.empty() &&
+ path::FileExists(cfg::get().pos.contigs_for_threading))
+ FillPos(gp, cfg::get().pos.contigs_for_threading, "thr_", true);
+
+ if (!cfg::get().pos.contigs_to_analyze.empty() &&
+ path::FileExists(cfg::get().pos.contigs_to_analyze))
+ FillPos(gp, cfg::get().pos.contigs_to_analyze, "anlz_", true);
+}
+
+template<class Graph, class Index>
+class GenomeMappingStat: public AbstractStatCounter {
+ private:
+ typedef typename Graph::EdgeId EdgeId;
+ const Graph &graph_;
+ const Index& index_;
+ Sequence genome_;
+ size_t k_;
+ public:
+ GenomeMappingStat(const Graph &graph, const Index &index, GenomeStorage genome, size_t k) :
+ graph_(graph), index_(index), genome_(genome.GetSequence()), k_(k) {}
+
+ virtual ~GenomeMappingStat() {}
+
+ virtual void Count() {
+ INFO("Mapping genome");
+ size_t break_number = 0;
+ size_t covered_kp1mers = 0;
+ size_t fail = 0;
+ if (genome_.size() <= k_)
+ return;
+
+ runtime_k::RtSeq cur = genome_.start<runtime_k::RtSeq>(k_ + 1);
+ cur >>= 0;
+ bool breaked = true;
+ pair<EdgeId, size_t> cur_position;
+ for (size_t cur_nucl = k_; cur_nucl < genome_.size(); cur_nucl++) {
+ cur <<= genome_[cur_nucl];
+ if (index_.contains(cur)) {
+ pair<EdgeId, size_t> next = index_.get(cur);
+ if (!breaked
+ && cur_position.second + 1
+ < graph_.length(cur_position.first)) {
+ if (next.first != cur_position.first
+ || cur_position.second + 1 != next.second) {
+ fail++;
+ }
+ }
+ cur_position = next;
+ covered_kp1mers++;
+ breaked = false;
+ } else {
+ if (!breaked) {
+ breaked = true;
+ break_number++;
+ }
+ }
+ }
+ INFO("Genome mapped");
+ INFO("Genome mapping results:");
+ INFO("Covered k+1-mers:" << covered_kp1mers << " of " << (genome_.size() - k_) << " which is "
+ << (100.0 * (double) covered_kp1mers / (double) (genome_.size() - k_)) << "%");
+ INFO("Covered k+1-mers form " << break_number + 1 << " contigious parts");
+ INFO("Continuity failtures " << fail);
+ }
+};
+
+template<class Graph>
+void WriteErrorLoc(const Graph &g,
+ const string& folder_name,
+ std::shared_ptr<omnigraph::visualization::GraphColorer<Graph>> genome_colorer,
+ const omnigraph::GraphLabeler<Graph>& labeler) {
+ INFO("Writing error localities for graph to folder " << folder_name);
+ GraphComponent<Graph> all(g, g.begin(), g.end());
+ set<typename Graph::EdgeId> edges = genome_colorer->ColoredWith(all.edges().begin(),
+ all.edges().end(), "black");
+ set<typename Graph::VertexId> to_draw;
+ for (auto it = edges.begin(); it != edges.end(); ++it) {
+ to_draw.insert(g.EdgeEnd(*it));
+ to_draw.insert(g.EdgeStart(*it));
+ }
+ shared_ptr<GraphSplitter<Graph>> splitter = StandardSplitter(g, to_draw);
+ WriteComponents(g, folder_name, splitter, genome_colorer, labeler);
+ INFO("Error localities written written to folder " << folder_name);
+}
+
+template<class graph_pack>
+void CountStats(const graph_pack& gp) {
+ typedef typename graph_pack::graph_t Graph;
+ typedef typename Graph::EdgeId EdgeId;
+ INFO("Counting stats");
+ StatList stats;
+ Path<EdgeId> path1 = FindGenomeMappingPath(gp.genome.GetSequence(), gp.g, gp.index,
+ gp.kmer_mapper).path();
+ Path<EdgeId> path2 = FindGenomeMappingPath(!gp.genome.GetSequence(), gp.g, gp.index,
+ gp.kmer_mapper).path();
+ stats.AddStat(new VertexEdgeStat<Graph>(gp.g));
+ stats.AddStat(new BlackEdgesStat<Graph>(gp.g, path1, path2));
+ stats.AddStat(new NStat<Graph>(gp.g, path1, 50));
+ stats.AddStat(new SelfComplementStat<Graph>(gp.g));
+ stats.AddStat(
+ new GenomeMappingStat<Graph, Index>(gp.g, gp.index,
+ gp.genome, gp.k_value));
+ stats.AddStat(new IsolatedEdgesStat<Graph>(gp.g, path1, path2));
+ stats.Count();
+ INFO("Stats counted");
+}
+
+template<class Graph>
+void WriteGraphComponentsAlongGenome(const Graph& g,
+ const GraphLabeler<Graph>& labeler,
+ const string& folder,
+ const Path<typename Graph::EdgeId>& path1,
+ const Path<typename Graph::EdgeId>& path2) {
+ INFO("Writing graph components along genome");
+
+ make_dir(folder);
+ omnigraph::visualization::WriteComponentsAlongPath(g, path1, folder, omnigraph::visualization::DefaultColorer(g, path1, path2), labeler);
+
+ INFO("Writing graph components along genome finished");
+}
+
+//todo refactoring needed: use graph pack instead!!!
+template<class Graph, class Mapper>
+void WriteGraphComponentsAlongContigs(const Graph& g,
+ Mapper &mapper,
+ const std::string& folder,
+ std::shared_ptr<omnigraph::visualization::GraphColorer<Graph>> colorer,
+ const GraphLabeler<Graph>& labeler) {
+ INFO("Writing graph components along contigs");
+ auto contigs_to_thread = io::EasyStream(cfg::get().pos.contigs_to_analyze, false);
+ contigs_to_thread->reset();
+ io::SingleRead read;
+ while (!contigs_to_thread->eof()) {
+ (*contigs_to_thread) >> read;
+ make_dir(folder + read.name());
+ omnigraph::visualization::WriteComponentsAlongPath(g, mapper.MapSequence(read.sequence()).simple_path(), folder + read.name() + "/",
+ colorer, labeler);
+ }
+ INFO("Writing graph components along contigs finished");
+}
+
+template<class Graph>
+void WriteKmerComponent(conj_graph_pack &gp, runtime_k::RtSeq const& kp1mer, const std::string& file,
+ std::shared_ptr<omnigraph::visualization::GraphColorer<Graph>> colorer,
+ const omnigraph::GraphLabeler<Graph>& labeler) {
+ if (!gp.index.contains(kp1mer)) {
+ WARN("No such k+1-mer in the graph");
+ return;
+ }
+ auto pos = gp.index.get(kp1mer);
+ typename Graph::VertexId v = pos.second * 2 < gp.g.length(pos.first) ? gp.g.EdgeStart(pos.first) : gp.g.EdgeEnd(pos.first);
+ GraphComponent<Graph> component = omnigraph::VertexNeighborhood<Graph>(gp.g, v);
+ omnigraph::visualization::WriteComponent<Graph>(component, file, colorer, labeler);
+}
+
+inline
+optional<runtime_k::RtSeq> FindCloseKP1mer(const conj_graph_pack &gp,
+ size_t genome_pos, size_t k) {
+ VERIFY(gp.genome.size() > 0);
+ VERIFY(genome_pos < gp.genome.size());
+ static const size_t magic_const = 200;
+ for (size_t diff = 0; diff < magic_const; diff++) {
+ for (int dir = -1; dir <= 1; dir += 2) {
+ size_t pos = (gp.genome.size() - k + genome_pos + dir * diff) % (gp.genome.size() - k);
+ runtime_k::RtSeq kp1mer = gp.kmer_mapper.Substitute(
+ runtime_k::RtSeq (k + 1, gp.genome.GetSequence(), pos));
+ if (gp.index.contains(kp1mer))
+ return optional<runtime_k::RtSeq>(kp1mer);
+ }
+ }
+ return boost::none;
+}
+
+inline
+void PrepareForDrawing(conj_graph_pack &gp) {
+ gp.EnsureDebugInfo();
+ CollectContigPositions(gp);
+}
+
+
+struct detail_info_printer {
+ detail_info_printer(conj_graph_pack &gp,
+ const omnigraph::GraphLabeler<Graph>& labeler,
+ const string& folder)
+ : gp_(gp),
+ labeler_(labeler),
+ folder_(folder) {
+ }
+
+ void operator() (config::info_printer_pos pos,
+ const string& folder_suffix = "") {
+ string pos_name = ModeName(pos, config::InfoPrinterPosNames());
+
+ ProduceDetailedInfo(pos_name + folder_suffix, pos);
+ }
+
+ private:
+
+ void ProduceDetailedInfo(const string &pos_name,
+ config::info_printer_pos pos) {
+ static size_t call_cnt = 0;
+
+ auto it = cfg::get().info_printers.find(pos);
+ VERIFY(it != cfg::get().info_printers.end());
+
+ const config::debruijn_config::info_printer & config = it->second;
+
+ if (config.basic_stats) {
+ VertexEdgeStat<conj_graph_pack::graph_t> stats(gp_.g);
+ INFO("Number of vertices : " << stats.vertices() << ", number of edges : "
+ << stats.edges() << ", sum length of edges : " << stats.edge_length());
+ }
+
+ if (config.save_full_graph) {
+ string saves_folder = path::append_path(path::append_path(folder_, "saves/"),
+ ToString(call_cnt++, 2) + "_" + pos_name + "/");
+ path::make_dirs(saves_folder);
+ graphio::ConjugateDataPrinter<conj_graph_pack::graph_t> printer(gp_.g);
+ graphio::PrintBasicGraph(saves_folder + "graph", printer);
+ }
+
+ if (config.lib_info) {
+ string saves_folder = path::append_path(path::append_path(folder_, "saves/"),
+ ToString(call_cnt++, 2) + "_" + pos_name + "/");
+ path::make_dirs(saves_folder);
+ config::write_lib_data(saves_folder + "lib_info");
+ }
+
+ if (config.extended_stats) {
+ VERIFY(cfg::get().developer_mode);
+ CountStats(gp_);
+ }
+
+ if (!(config.write_error_loc ||
+ config.write_full_graph ||
+ config.write_full_nc_graph ||
+ config.write_components ||
+ !config.components_for_kmer.empty() ||
+ config.write_components_along_genome ||
+ config.write_components_along_contigs ||
+ !config.components_for_genome_pos.empty())) {
+ return;
+ }
+
+ VERIFY(cfg::get().developer_mode);
+ string pics_folder = path::append_path(path::append_path(folder_, "pictures/"),
+ ToString(call_cnt++, 2) + "_" + pos_name + "/");
+ path::make_dirs(pics_folder);
+ PrepareForDrawing(gp_);
+
+ auto path1 = FindGenomeMappingPath(gp_.genome.GetSequence(), gp_.g, gp_.index,
+ gp_.kmer_mapper).path();
+
+ auto colorer = DefaultColorer(gp_);
+
+ if (config.write_error_loc) {
+ make_dir(pics_folder + "error_loc/");
+ WriteErrorLoc(gp_.g, pics_folder + "error_loc/", colorer, labeler_);
+ }
+
+ if (config.write_full_graph) {
+ WriteComponent(GraphComponent<Graph>(gp_.g, gp_.g.begin(), gp_.g.end()), pics_folder + "full_graph.dot", colorer, labeler_);
+ }
+
+ if (config.write_full_nc_graph) {
+ WriteSimpleComponent(GraphComponent<Graph>(gp_.g, gp_.g.begin(), gp_.g.end()), pics_folder + "nc_full_graph.dot", colorer, labeler_);
+ }
+
+ if (config.write_components) {
+ make_dir(pics_folder + "components/");
+ omnigraph::visualization::WriteComponents(gp_.g, pics_folder + "components/", omnigraph::ReliableSplitter<Graph>(gp_.g), colorer, labeler_);
+ }
+
+ if (!config.components_for_kmer.empty()) {
+ string kmer_folder = path::append_path(pics_folder, "kmer_loc/");
+ make_dir(kmer_folder);
+ auto kmer = runtime_k::RtSeq(gp_.k_value + 1, config.components_for_kmer.substr(0, gp_.k_value + 1).c_str());
+ string file_name = path::append_path(kmer_folder, pos_name + ".dot");
+ WriteKmerComponent(gp_, kmer, file_name, colorer, labeler_);
+ }
+
+ if (config.write_components_along_genome) {
+ make_dir(pics_folder + "along_genome/");
+ omnigraph::visualization::WriteComponentsAlongPath(gp_.g, path1.sequence(), pics_folder + "along_genome/", colorer, labeler_);
+ }
+
+ if (config.write_components_along_contigs) {
+ make_dir(pics_folder + "along_contigs/");
+ NewExtendedSequenceMapper<Graph, Index> mapper(gp_.g, gp_.index, gp_.kmer_mapper);
+ WriteGraphComponentsAlongContigs(gp_.g, mapper, pics_folder + "along_contigs/", colorer, labeler_);
+ }
+
+ if (!config.components_for_genome_pos.empty()) {
+ string pos_loc_folder = path::append_path(pics_folder, "pos_loc/");
+ make_dir(pos_loc_folder);
+ vector<string> positions;
+ boost::split(positions, config.components_for_genome_pos,
+ boost::is_any_of(" ,"), boost::token_compress_on);
+ for (auto it = positions.begin(); it != positions.end(); ++it) {
+ boost::optional<runtime_k::RtSeq> close_kp1mer = FindCloseKP1mer(gp_,
+ std::stoi(*it), gp_.k_value);
+ if (close_kp1mer) {
+ string locality_folder = path::append_path(pos_loc_folder, *it + "/");
+ make_dir(locality_folder);
+ WriteKmerComponent(gp_, *close_kp1mer, path::append_path(locality_folder, pos_name + ".dot"), colorer, labeler_);
+ } else {
+ WARN("Failed to find a graph k+1-mer close to genome position " << *it
+ << "; the genome k+1-mer there is "
+ << runtime_k::RtSeq(gp_.k_value + 1, gp_.genome.GetSequence(), std::stoi(*it)));
+ }
+ }
+ }
+ }
+
+ conj_graph_pack& gp_;
+ const omnigraph::GraphLabeler<Graph>& labeler_;
+ string folder_;
+};
+
+inline
+std::string ConstructComponentName(std::string file_name, size_t cnt) {
+ stringstream ss;
+ ss << cnt;
+ string res = file_name;
+ res.insert(res.length(), ss.str());
+ return res;
+}
+
+template<class Graph>
+double AvgCoverage(const Graph& g,
+ const std::vector<typename Graph::EdgeId>& edges) {
+ double total_cov = 0.;
+ size_t total_length = 0;
+ for (auto it = edges.begin(); it != edges.end(); ++it) {
+ total_cov += g.coverage(*it) * (double) g.length(*it);
+ total_length += g.length(*it);
+ }
+ return total_cov / (double) total_length;
+}
+
+template<class Graph>
+size_t Nx(Graph &g, double percent) {
+ size_t sum_edge_length = 0;
+ vector<size_t> lengths;
+ for (auto iterator = g.ConstEdgeBegin(); !iterator.IsEnd(); ++iterator) {
+ lengths.push_back(g.length(*iterator));
+ sum_edge_length += g.length(*iterator);
+ }
+ sort(lengths.begin(), lengths.end());
+ double len_perc = (1.0 - percent * 0.01) * (double) (sum_edge_length);
+ for (size_t i = 0; i < lengths.size(); i++) {
+ if (lengths[i] >= len_perc)
+ return lengths[i];
+ else
+ len_perc -= (double) lengths[i];
+ }
+ return 0;
+}
+
+}
+}
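To make the Nx logic above concrete, here it is restated over a plain length vector (a self-contained sketch, not part of the patch); with lengths {10, 20, 30, 40} and percent = 50 the result is 30, matching the usual N50 definition.

#include <algorithm>
#include <cstddef>
#include <vector>

// Mirrors stats::Nx, but over precomputed edge lengths instead of a graph.
size_t NxOfLengths(std::vector<size_t> lengths, double percent) {
    size_t total = 0;
    for (size_t l : lengths) total += l;
    std::sort(lengths.begin(), lengths.end());
    double to_skip = (1.0 - percent * 0.01) * (double) total;  // mass to discard from the short end
    for (size_t l : lengths) {
        if ((double) l >= to_skip) return l;  // first length at least as large as what remains to skip
        to_skip -= (double) l;
    }
    return 0;
}
// NxOfLengths({10, 20, 30, 40}, 50) == 30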
diff --git a/src/modules/assembly_graph/stats/statistics.hpp b/src/modules/assembly_graph/stats/statistics.hpp
new file mode 100644
index 0000000..3ab53a5
--- /dev/null
+++ b/src/modules/assembly_graph/stats/statistics.hpp
@@ -0,0 +1,273 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "dev_support/simple_tools.hpp"
+#include "math/xmath.h"
+#include "pipeline/config_struct.hpp"
+#include "assembly_graph/paths/mapping_path.hpp"
+
+#include <iostream>
+#include <fstream>
+#include <map>
+
+namespace debruijn_graph {
+namespace stats {
+
+using namespace math;
+using namespace omnigraph;
+
+class AbstractStatCounter {
+public:
+ AbstractStatCounter() {
+ }
+
+ virtual ~AbstractStatCounter() {
+ }
+
+ virtual void Count() = 0;
+ //protected:
+ // DECL_LOGGER("StatCounter")
+};
+
+class StatList : AbstractStatCounter {
+private:
+ vector<AbstractStatCounter *> to_count_;
+public:
+ StatList(vector<AbstractStatCounter *> to_count =
+ vector<AbstractStatCounter *>()) :
+ to_count_(to_count) {
+ }
+
+ virtual ~StatList() {
+ }
+
+ void AddStat(AbstractStatCounter *new_stat) {
+ to_count_.push_back(new_stat);
+ }
+
+ const vector<AbstractStatCounter *> stats() {
+ return to_count_;
+ }
+
+ virtual void Count() {
+ for (size_t i = 0; i < to_count_.size(); i++) {
+ to_count_[i]->Count();
+ }
+ }
+
+ void DeleteStats() {
+ for (size_t i = 0; i < to_count_.size(); i++)
+ delete to_count_[i];
+ to_count_.clear();
+ }
+};
+
+template<class Graph>
+class VertexEdgeStat : public AbstractStatCounter {
+private:
+ const Graph &graph_;
+public:
+ VertexEdgeStat(const Graph &graph) :
+ graph_(graph) {
+ }
+
+ virtual ~VertexEdgeStat() {
+ }
+
+ size_t vertices() {
+ return graph_.size();
+ }
+
+ size_t edges() {
+ size_t edgeNumber = 0;
+ size_t sum_edge_length = 0;
+ for (auto iterator = graph_.ConstEdgeBegin(); !iterator.IsEnd();
+ ++iterator) {
+ edgeNumber++;
+ // if (graph_.coverage(*iterator) > 30) {
+ sum_edge_length += graph_.length(*iterator);
+ // }
+ }
+ return edgeNumber;
+ }
+
+ size_t edge_length() {
+ size_t sum_edge_length = 0;
+ for (auto iterator = graph_.ConstEdgeBegin(); !iterator.IsEnd();
+ ++iterator) {
+ if (graph_.coverage(*iterator) > 30) {
+ sum_edge_length += graph_.length(*iterator);
+ }
+ }
+ return sum_edge_length;
+ }
+
+ virtual void Count() {
+ INFO("Vertex count=" << vertices() << "; Edge count=" << edges());
+ INFO("Sum length of edges: " << edge_length());
+ }
+};
+
+template<class Graph>
+class BlackEdgesStat : public AbstractStatCounter {
+private:
+ typedef typename Graph::EdgeId EdgeId;
+ const Graph &graph_;
+ Path<EdgeId> path1_;
+ Path<EdgeId> path2_;
+public:
+ BlackEdgesStat(const Graph &graph, Path<EdgeId> path1, Path<EdgeId> path2) :
+ graph_(graph), path1_(path1), path2_(path2) {
+ }
+
+ virtual ~BlackEdgesStat() {
+ }
+
+ virtual void Count() {
+ size_t black_count = 0;
+ size_t edge_count = 0;
+ const vector <EdgeId> path_edges1 = path1_.sequence();
+ const vector <EdgeId> path_edges2 = path2_.sequence();
+ set <EdgeId> colored_edges;
+ colored_edges.insert(path_edges1.begin(), path_edges1.end());
+ colored_edges.insert(path_edges2.begin(), path_edges2.end());
+ size_t sum_length = 0;
+ for (auto it = graph_.ConstEdgeBegin(); !it.IsEnd(); ++it) {
+ edge_count++;
+ if (colored_edges.count(*it) == 0) {
+ black_count++;
+ sum_length += graph_.length(*it);
+ }
+ }
+ if (edge_count > 0) {
+ INFO("Error edges count: " << black_count << " which is " <<
+ 100.0 * (double) black_count / (double) edge_count << "% of all edges");
+ INFO("Total length of all black edges: " << sum_length << ". While double genome length is " <<
+ (2 * cfg::get().ds.reference_genome.size()));
+ } else {
+ INFO("Error edges count: " << black_count << " which is 0% of all edges");
+ }
+ }
+};
+
+template<class Graph>
+class NStat : public AbstractStatCounter {
+private:
+ typedef typename Graph::EdgeId EdgeId;
+ const Graph &graph_;
+ Path<EdgeId> path_;
+ size_t perc_;
+public:
+ NStat(const Graph &graph, Path<EdgeId> path, size_t perc = 50) :
+ graph_(graph), path_(path), perc_(perc) {
+ }
+
+ virtual ~NStat() {
+ }
+
+ virtual void Count() {
+ vector <size_t> lengths;
+ size_t sum_all = 0;
+ for (size_t i = 0; i < path_.size(); i++) {
+ lengths.push_back(graph_.length(path_[i]));
+ sum_all += graph_.length(path_[i]);
+ }
+ sort(lengths.begin(), lengths.end());
+ size_t sum = 0;
+ size_t current = lengths.size();
+ while (current > 0 && (double) sum < (double) perc_ * 0.01 * (double) sum_all) {
+ current--;
+ sum += lengths[current];
+ }
+ if (current < lengths.size())
+ INFO("N" << perc_ << ": " << lengths[current]);
+ }
+};
+
+template<class Graph>
+class IsolatedEdgesStat : public AbstractStatCounter {
+private:
+ typedef typename Graph::EdgeId EdgeId;
+ const Graph &graph_;
+ set <EdgeId> black_edges_;
+ vector <size_t> lengths;
+public:
+ IsolatedEdgesStat(const Graph &graph, Path<EdgeId> path1,
+ Path<EdgeId> path2) :
+ graph_(graph) {
+ for (auto it = graph.ConstEdgeBegin(); !it.IsEnd(); ++it) {
+ black_edges_.insert(*it);
+ }
+ for (size_t i = 0; i < path1.size(); i++) {
+ black_edges_.erase(path1[i]);
+ }
+ for (size_t i = 0; i < path2.size(); i++) {
+ black_edges_.erase(path2[i]);
+ }
+ }
+
+ virtual ~IsolatedEdgesStat() {
+ }
+
+ virtual void Count() {
+ lengths.clear();
+ for (auto it = graph_.ConstEdgeBegin(); !it.IsEnd(); ++it) {
+ EdgeId edge = *it;
+ if (graph_.IsDeadEnd(graph_.EdgeEnd(edge))
+ && graph_.IsDeadStart(graph_.EdgeStart(edge))
+ && black_edges_.count(edge) == 0) {
+ lengths.push_back(graph_.length(edge));
+ }
+ }
+ INFO("Isolated not black edges: " << lengths.size());
+ WriteLengths(cfg::get().output_dir, "isolated_edges.txt");
+ }
+
+ void WriteLengths(string folder_name, string file_name) {
+ ofstream os;
+ os.open((folder_name + "/" + file_name).c_str());
+ WriteLengths(os);
+ os.close();
+ }
+
+ void WriteLengths(ostream &os) {
+ sort(lengths.begin(), lengths.end());
+ for (size_t i = 0; i < lengths.size(); i++) {
+ os << lengths[i] << endl;
+ }
+ }
+};
+
+template<class Graph>
+class SelfComplementStat : public AbstractStatCounter {
+private:
+ typedef typename Graph::EdgeId EdgeId;
+ const Graph &graph_;
+public:
+ SelfComplementStat(const Graph &graph) :
+ graph_(graph) {
+ }
+
+ virtual ~SelfComplementStat() {
+ }
+
+ virtual void Count() {
+ size_t sc_number = 0;
+ for (auto iterator = graph_.ConstEdgeBegin(); !iterator.IsEnd();
+ ++iterator)
+ if (graph_.conjugate(*iterator) == (*iterator))
+ sc_number++;
+ // INFO("Self-complement count failed!!! ");
+ INFO("Self-complement count=" << sc_number);
+ }
+};
+}
+}
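A short sketch of how these counters are meant to be composed, following the pattern of CountStats in picture_dump.hpp above; Graph and g are placeholders for a concrete graph type and instance.

// Sketch: compose counters, run them once, then release them.
template<class Graph>
void RunBasicStats(const Graph& g) {
    debruijn_graph::stats::StatList stats;
    stats.AddStat(new debruijn_graph::stats::VertexEdgeStat<Graph>(g));
    stats.AddStat(new debruijn_graph::stats::SelfComplementStat<Graph>(g));
    stats.Count();        // each counter reports its numbers via INFO
    stats.DeleteStats();  // the counters are heap-allocated; StatList frees them only on request
}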
diff --git a/src/modules/data_structures/debruijn_graph/debruijn_graph_constructor.hpp b/src/modules/data_structures/debruijn_graph/debruijn_graph_constructor.hpp
new file mode 100644
index 0000000..d6a3545
--- /dev/null
+++ b/src/modules/data_structures/debruijn_graph/debruijn_graph_constructor.hpp
@@ -0,0 +1,555 @@
+#pragma once
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+/*
+ * debruijn_graph_constructor.hpp
+ *
+ * Created on: Apr 5, 2011
+ * Author: sergey
+ */
+
+#include "assembly_graph/graph_core/graph.hpp"
+#include "assembly_graph/graph_core/construction_helper.hpp"
+#include "dev_support/standard_base.hpp"
+#include "data_structures/indices/kmer_extension_index.hpp"
+#include "dev_support/openmp_wrapper.h"
+#include "dev_support/parallel_wrapper.hpp"
+
+namespace debruijn_graph {
+
+/*
+ * Constructs a condensed de Bruijn graph from a k-mer index: DeBruijnGraphConstructor<Graph, Index>(graph, index).ConstructGraph(queue_min, queue_max, growth_rate).
+ */
+template<class Graph, class Index>
+class DeBruijnGraphConstructor {
+private:
+ typedef typename Graph::EdgeId EdgeId;
+ typedef Index DeBruijn;
+ typedef typename Graph::VertexId VertexId;
+ typedef typename Index::KMer Kmer;
+ typedef typename DeBruijn::KeyWithHash KeyWithHash;
+ typedef typename DeBruijn::kmer_iterator kmer_iterator;
+
+ Graph &graph_;
+ DeBruijn &origin_;
+ size_t kmer_size_;
+
+ bool StepRightIfPossible(KeyWithHash &kwh) {
+ // VERIFY(origin_.contains(edge));
+ if (origin_.RivalEdgeCount(kwh) == 1
+ && origin_.NextEdgeCount(kwh) == 1) {
+ kwh = origin_.NextEdge(kwh);
+ // VERIFY(origin_.contains(next_edge));
+ return true;
+ }
+ return false;
+ }
+
+ KeyWithHash &GoRight(KeyWithHash &kwh) {
+ KeyWithHash initial = kwh;
+ while (StepRightIfPossible(kwh) && kwh != initial) {
+ ;
+ }
+ return kwh;
+ }
+
+ KeyWithHash &GoLeft(KeyWithHash &kwh) {
+ //These strange things are in order to avoid making copies of kwh
+ kwh = !kwh;
+ kwh = !GoRight(kwh);
+ return kwh;
+ }
+
+ Sequence ConstructSeqGoingRight(KeyWithHash &kwh) {
+ SequenceBuilder s;
+ s.append(kwh.key());
+ KeyWithHash initial = kwh;
+ while (StepRightIfPossible(kwh) && kwh != initial) {
+ s.append(kwh[kmer_size_]);
+ }
+ return s.BuildSequence();
+ }
+
+ Sequence ConstructSequenceWithEdge(const KeyWithHash &kwh) {
+ KeyWithHash tmp = kwh;
+ return ConstructSeqGoingRight(GoLeft(tmp));
+ }
+
+ VertexId FindVertexByOutgoingEdges(Kmer kmer) {
+ for (char c = 0; c < 4; ++c) {
+ KeyWithHash edge = origin_.ConstructKWH(kmer.pushBack(c));
+ if (origin_.contains(edge))
+ return graph_.EdgeStart(origin_.get_value(edge).edge_id);
+ }
+ return VertexId(NULL);
+ }
+
+ VertexId FindVertexByIncomingEdges(Kmer kmer) {
+ for (char c = 0; c < 4; ++c) {
+ KeyWithHash edge = origin_.ConstructKWH(kmer.pushFront(c));
+ if (origin_.contains(edge)) {
+ return graph_.EdgeEnd(origin_.get_value(edge).edge_id);
+ }
+ }
+ return VertexId(NULL);
+ }
+
+ VertexId FindVertex(Kmer kmer) {
+ VertexId v = FindVertexByOutgoingEdges(kmer);
+ return v == VertexId(NULL) ? FindVertexByIncomingEdges(kmer) : v;
+ }
+
+ VertexId FindVertexMaybeMissing(Kmer kmer) {
+ VertexId v = FindVertex(kmer);
+ return v != VertexId(NULL) ? v : graph_.AddVertex();
+ }
+
+ VertexId FindEndMaybeMissing(const ConjugateDeBruijnGraph& graph,
+ VertexId start, Kmer start_kmer, Kmer end_kmer) {
+ if (start_kmer == end_kmer) {
+ return start;
+ } else if (start_kmer == !end_kmer) {
+ return graph.conjugate(start);
+ } else {
+ return FindVertexMaybeMissing(end_kmer);
+ }
+ }
+
+ void ConstructPart(const std::vector<KeyWithHash>& kwh_list,
+ std::vector<Sequence>& sequences) {
+ for (size_t i = 0; i < sequences.size(); ++i) {
+ if (origin_.contains(kwh_list[i])) {
+ continue;
+ }
+
+ Kmer start_kmer = sequences[i].start<Kmer>(kmer_size_);
+ Kmer end_kmer = sequences[i].end<Kmer>(kmer_size_);
+
+ VertexId start = FindVertexMaybeMissing(start_kmer);
+ VertexId end = FindEndMaybeMissing(graph_, start, start_kmer,
+ end_kmer);
+
+ graph_.AddEdge(start, end, sequences[i]);
+ }
+ }
+
+ void AddKmers(kmer_iterator &it, kmer_iterator &end, size_t queueSize,
+ std::vector<KeyWithHash>& kwh_list) {
+ for (; kwh_list.size() != queueSize && it != end; ++it) {
+ KeyWithHash kwh = origin_.ConstructKWH(Kmer(unsigned(kmer_size_ + 1), (*it).data()));
+
+ if (!origin_.contains(kwh))
+ kwh_list.push_back(kwh);
+ }
+ }
+
+ void CalculateSequences(std::vector<KeyWithHash> &kwh_list,
+ std::vector<Sequence> &sequences) {
+ size_t size = kwh_list.size();
+ sequences.resize(size);
+
+# pragma omp parallel for schedule(guided)
+ for (size_t i = 0; i < size; ++i) {
+ sequences[i] = ConstructSequenceWithEdge(kwh_list[i]);
+ }
+ }
+
+public:
+ DeBruijnGraphConstructor(Graph& graph, DeBruijn &origin) :
+ graph_(graph), origin_(origin), kmer_size_(graph_.k()) {
+ }
+
+ void ConstructGraph(size_t queueMinSize, size_t queueMaxSize,
+ double queueGrowthRate) {
+ kmer_iterator it = origin_.kmer_begin();
+ kmer_iterator end = origin_.kmer_end();
+ size_t queueSize = queueMinSize;
+ std::vector<KeyWithHash> kwh_list;
+ std::vector<Sequence> sequences;
+ kwh_list.reserve(queueSize);
+ sequences.reserve(queueMaxSize);
+ while (it != end) {
+ AddKmers(it, end, queueSize, kwh_list); // form a queue of k-mers that are not yet in the index
+ CalculateSequences(kwh_list, sequences); // in parallel
+ ConstructPart(kwh_list, sequences);
+ kwh_list.clear();
+ queueSize = min(size_t(double(queueSize) * queueGrowthRate), queueMaxSize);
+ }
+ }
+
+private:
+ DECL_LOGGER("DeBruijnGraphConstructor")
+};
+
+class UnbranchingPathFinder {
+private:
+ typedef DeBruijnExtensionIndex<> Index;
+ typedef runtime_k::RtSeq Kmer;
+ typedef Index::kmer_iterator kmer_iterator;
+ typedef Index::KeyWithHash KeyWithHash;
+ typedef Index::DeEdge DeEdge;
+
+ Index &origin_;
+ size_t kmer_size_;
+ bool clean_condensed_;
+
+
+public:
+ UnbranchingPathFinder(Index &origin, size_t kmer_size) : origin_(origin), kmer_size_(kmer_size), clean_condensed_(false) {
+ }
+
+ bool StepRightIfPossible(DeEdge &edge) {
+ if (origin_.CheckUniqueOutgoing(edge.end) && origin_.CheckUniqueIncoming(edge.end)) {
+ edge = DeEdge(edge.end, origin_.GetUniqueOutgoing(edge.end));
+ return true;
+ }
+ return false;
+ }
+
+ Sequence ConstructSeqGoingRight(DeEdge edge) {
+ SequenceBuilder s;
+ s.append(edge.start.key());
+ s.append(edge.end[kmer_size_ - 1]);
+ DeEdge initial = edge;
+ while (StepRightIfPossible(edge) && edge != initial) {
+ s.append(edge.end[kmer_size_ - 1]);
+ }
+ return s.BuildSequence();
+ }
+
+ Sequence ConstructSequenceWithEdge(DeEdge edge) {
+ return ConstructSeqGoingRight(edge);
+ }
+
+//TODO Think about what happens to self-RC perfect loops
+ Sequence ConstructLoopFromVertex(const KeyWithHash &kh) {
+ DeEdge break_point(kh, origin_.GetUniqueOutgoing(kh));
+ Sequence result = ConstructSequenceWithEdge(break_point);
+ if (clean_condensed_)
+ origin_.IsolateVertex(kh);
+ return result;
+ }
+};
+
+class UnbranchingPathExtractor {
+private:
+ typedef DeBruijnExtensionIndex<> Index;
+ typedef runtime_k::RtSeq Kmer;
+ typedef Index::kmer_iterator kmer_iterator;
+ typedef Index::DeEdge DeEdge;
+ typedef Index::KeyWithHash KeyWithHash;
+
+ Index &origin_;
+ size_t kmer_size_;
+
+ bool IsJunction(KeyWithHash kh) const {
+ return !(origin_.CheckUniqueOutgoing(kh) && origin_.CheckUniqueIncoming(kh));
+ }
+
+ void AddStartDeEdgesForVertex(KeyWithHash kh, std::vector<DeEdge>& start_edges) const {
+ for (char next = 0; next < 4; next++) {
+ if (origin_.CheckOutgoing(kh, next)) {
+ TRACE("Added to queue " << DeEdge(kh, origin_.GetOutgoing(kh, next)));
+ start_edges.push_back(DeEdge(kh, origin_.GetOutgoing(kh, next)));
+ }
+ }
+ }
+
+ void AddStartDeEdges(kmer_iterator &it, size_t queueSize,
+ std::vector<DeEdge>& start_edges) const {
+ for (; start_edges.size() < queueSize && it.good(); ++it) {
+ KeyWithHash kh = origin_.ConstructKWH(Kmer(kmer_size_, *it));
+ if (IsJunction(kh)) {
+ AddStartDeEdgesForVertex(kh, start_edges);
+ KeyWithHash kh_inv = !kh;
+ if(!(kh_inv.is_minimal())) {
+ AddStartDeEdgesForVertex(kh_inv, start_edges);
+ }
+ }
+ }
+ }
+
+ void CalculateSequences(std::vector<DeEdge> &edges,
+ std::vector<Sequence> &sequences, UnbranchingPathFinder &finder) const {
+ size_t size = edges.size();
+ size_t start = sequences.size();
+ sequences.resize(start + size);
+
+# pragma omp parallel for schedule(guided)
+ for (size_t i = 0; i < size; ++i) {
+ sequences[start + i] = finder.ConstructSequenceWithEdge(edges[i]);
+ TRACE("From " << edges[i] << " calculated sequence");
+ TRACE(sequences[start + i]);
+ }
+ }
+
+ void CleanCondensed(const Sequence &sequence) {
+ Kmer kmer = sequence.start<Kmer>(kmer_size_);
+ KeyWithHash kwh = origin_.ConstructKWH(kmer);
+ origin_.IsolateVertex(kwh);
+ for(size_t pos = kmer_size_; pos < sequence.size(); pos++) {
+ kwh = kwh << sequence[pos];
+ origin_.IsolateVertex(kwh);
+ }
+ }
+
+ void CleanCondensed(const std::vector<Sequence> &sequences) {
+# pragma omp parallel for schedule(guided)
+ for (size_t i = 0; i < sequences.size(); ++i) {
+ CleanCondensed(sequences[i]);
+ }
+ }
+
+ //This method collects all loops that were not extracted when finding unbranching paths, because loops contain no junctions.
+ //TODO make parallel
+ const std::vector<Sequence> CollectLoops() {
+ INFO("Collecting perfect loops");
+ UnbranchingPathFinder finder(origin_, kmer_size_);
+ std::vector<Sequence> result;
+ for (kmer_iterator it = origin_.kmer_begin(); it.good(); ++it) {
+ KeyWithHash kh = origin_.ConstructKWH(Kmer(kmer_size_, *it));
+ if (!IsJunction(kh)) {
+ Sequence loop = finder.ConstructLoopFromVertex(kh);
+ result.push_back(loop);
+ CleanCondensed(loop);
+ if(loop != (!loop)) {
+ CleanCondensed(!loop);
+ result.push_back(!loop);
+ }
+ }
+ }
+ INFO("Collecting perfect loops finished. " << result.size() << " loops collected");
+ return result;
+ }
+
+public:
+ UnbranchingPathExtractor(Index &origin, size_t k) : origin_(origin), kmer_size_(k) {
+ }
+
+ //TODO a very large vector is returned here, but I hate to make all the artificial changes that could avoid it.
+ const std::vector<Sequence> ExtractUnbranchingPaths(size_t queueMinSize, size_t queueMaxSize,
+ double queueGrowthRate) {
+ INFO("Extracting unbranching paths");
+ UnbranchingPathFinder finder(origin_, kmer_size_);
+ std::vector<Sequence> result;
+ size_t queueSize = queueMinSize;
+ std::vector<DeEdge> start_edges;
+ std::vector<Sequence> sequences;
+ start_edges.reserve(queueSize);
+ auto it = origin_.kmer_begin();
+ while (it.good()) {
+ AddStartDeEdges(it, queueSize, start_edges); // form a queue of junction k-mers
+ CalculateSequences(start_edges, sequences, finder); // in parallel
+ start_edges.clear();
+ queueSize = min((size_t) ((double) queueSize * queueGrowthRate), queueMaxSize);
+ }
+ INFO("Extracting unbranching paths finished. " << sequences.size() << " sequences extracted");
+ return sequences;
+ }
+
+ const std::vector<Sequence> ExtractUnbranchingPathsAndLoops(size_t queueMinSize, size_t queueMaxSize,
+ double queueGrowthRate) {
+ std::vector<Sequence> result = ExtractUnbranchingPaths(queueMinSize, queueMaxSize, queueGrowthRate);
+ CleanCondensed(result);
+ std::vector<Sequence> loops = CollectLoops();
+ for(auto it = loops.begin(); it != loops.end(); ++it) {
+ result.push_back(*it);
+ }
+ return result;
+ }
+
+private:
+ DECL_LOGGER("UnbranchingPathExtractor")
+};
+
+/*
+ * Only works for Conjugate dbg
+ */
+template<class Graph>
+class FastGraphFromSequencesConstructor {
+private:
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+ typedef runtime_k::RtSeq Kmer;
+ typedef DeBruijnExtensionIndex<> Index;
+ size_t kmer_size_;
+ Index &origin_;
+
+ class LinkRecord {
+ private:
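+ //hash_and_mask_ packs the vertex k-mer hash (bits 2 and up) with two flag bits:
+ //bit 0 is set when the record describes the start of an edge, bit 1 when the
+ //canonical k-mer is the reverse complement of the actual edge end; see
+ //GetHash()/IsStart()/IsRC() below.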
+ size_t hash_and_mask_;
+ EdgeId edge_;
+
+ size_t BitBool(bool flag) const {
+ if(flag)
+ return 1;
+ return 0;
+ }
+
+ public:
+ size_t GetHash() const {
+ return hash_and_mask_ >> 2;
+ }
+
+ bool IsRC() const {
+ return hash_and_mask_ & 2;
+ }
+
+ bool IsStart() const {
+ return hash_and_mask_ & 1;
+ }
+
+
+ EdgeId GetEdge() const {
+ return edge_;
+ }
+
+ LinkRecord(size_t hash, EdgeId edge, bool is_start, bool is_rc) :
+ hash_and_mask_((hash << 2) | (BitBool(is_rc) << 1)| BitBool(is_start)), edge_(edge) {
+ }
+
+ LinkRecord() :
+ hash_and_mask_(-1ul), edge_(0) {
+ }
+
+ bool IsInvalid() {
+ return hash_and_mask_ + 1 == 0 && edge_ == EdgeId(0);
+ }
+
+ bool operator<(const LinkRecord &other) const {
+ if(this->hash_and_mask_ == other.hash_and_mask_)
+ return this->edge_ < other.edge_;
+ return this->hash_and_mask_ < other.hash_and_mask_;
+ }
+ };
+
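+ //StartLink/EndLink key an edge by the canonical form (the smaller of the k-mer
+ //and its reverse complement) of its first/last k-mer, so that an edge and its
+ //conjugate hash to the same prospective vertex.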
+ LinkRecord StartLink(const EdgeId &edge, const Sequence &sequence) const {
+ Kmer kmer(kmer_size_, sequence);
+ Kmer kmer_rc = !kmer;
+ if(kmer < kmer_rc)
+ return LinkRecord(origin_.ConstructKWH(kmer).idx(), edge, true, false);
+ else
+ return LinkRecord(origin_.ConstructKWH(kmer_rc).idx(), edge, true, true);
+ }
+
+ LinkRecord EndLink(const EdgeId &edge, const Sequence &sequence) const {
+ Kmer kmer(kmer_size_, sequence, sequence.size() - kmer_size_);
+ Kmer kmer_rc = !kmer;
+ if(kmer < kmer_rc)
+ return LinkRecord(origin_.ConstructKWH(kmer).idx(), edge, false, false);
+ else
+ return LinkRecord(origin_.ConstructKWH(kmer_rc).idx(), edge, false, true);
+ }
+
+ void CollectLinkRecords(typename Graph::HelperT &helper, const Graph &graph, vector<LinkRecord> &records, const vector<Sequence> &sequences) const {
+ size_t size = sequences.size();
+ records.resize(size * 2, LinkRecord(0, EdgeId(0), false, false));
+ restricted::IdSegmentStorage id_storage = helper.graph().GetGraphIdDistributor().Reserve(size * 2);
+# pragma omp parallel for schedule(guided)
+ for (size_t i = 0; i < size; ++i) {
+ size_t j = i << 1;
+ auto id_distributor = id_storage.GetSegmentIdDistributor(j, j + 2);//indices for two edges are required
+ EdgeId edge = helper.AddEdge(DeBruijnEdgeData(sequences[i]), id_distributor);
+ records[j] = StartLink(edge, sequences[i]);
+ if(graph.conjugate(edge) != edge)
+ records[j + 1] = EndLink(edge, sequences[i]);
+ else
+ records[j + 1] = LinkRecord();
+ }
+ }
+
+ void LinkEdge(typename Graph::HelperT &helper, const Graph &graph, const VertexId v, const EdgeId edge, const bool is_start, const bool is_rc) const {
+ VertexId v1 = v;
+ if(is_rc) {
+ v1 = graph.conjugate(v);
+ }
+ if(is_start) {
+ helper.LinkOutgoingEdge(v1, edge);
+ } else {
+ helper.LinkIncomingEdge(v1, edge);
+ }
+ }
+
+public:
+ FastGraphFromSequencesConstructor(size_t k, Index &origin) : kmer_size_(k), origin_(origin) {
+ }
+
+ void ConstructGraph(Graph &graph, const vector<Sequence> &sequences) const {
+ typename Graph::HelperT helper = graph.GetConstructionHelper();
+ vector<LinkRecord> records;
+ CollectLinkRecords(helper, graph, records, sequences);//TODO make parallel
+ parallel::sort(records.begin(), records.end());
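+ //After sorting, records sharing a vertex hash are adjacent: the loop below
+ //creates one vertex per hash group and links every edge of that group to it.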
+ size_t size = records.size();
+ vector<vector<VertexId>> vertices_list(omp_get_max_threads());
+ restricted::IdSegmentStorage id_storage = helper.graph().GetGraphIdDistributor().Reserve(size * 2);
+# pragma omp parallel for schedule(guided)
+ for(size_t i = 0; i < size; i++) {
+ if(i != 0 && records[i].GetHash() == records[i - 1].GetHash()) {
+ continue;
+ }
+ if(records[i].IsInvalid())
+ continue;
+ auto id_distributor = id_storage.GetSegmentIdDistributor(i << 1, (i << 1) + 2);
+ VertexId v = helper.CreateVertex(DeBruijnVertexData(), id_distributor);
+ vertices_list[omp_get_thread_num()].push_back(v);
+ for(size_t j = i; j < size && records[j].GetHash() == records[i].GetHash(); j++) {
+ LinkEdge(helper, graph, v, records[j].GetEdge(), records[j].IsStart(), records[j].IsRC());
+ }
+ }
+ for(size_t i = 0; i < vertices_list.size(); i++)
+ helper.AddVerticesToGraph(vertices_list[i].begin(), vertices_list[i].end());
+ }
+};
+
+/*
+ * Condenses a DeBruijnExtensionIndex into a de Bruijn graph: create a DeBruijnGraphExtentionConstructor(graph, index) and call its ConstructGraph(...)
+ */
+template<class Graph>
+class DeBruijnGraphExtentionConstructor {
+private:
+ typedef typename Graph::EdgeId EdgeId;
+ typedef DeBruijnExtensionIndex<> DeBruijn;
+ typedef typename Graph::VertexId VertexId;
+ typedef runtime_k::RtSeq Kmer;
+
+ Graph &graph_;
+ DeBruijn &origin_;
+ size_t kmer_size_;
+
+ void FilterRC(std::vector<Sequence> &edge_sequences) {
+ size_t size = 0;
+ for(size_t i = 0; i < edge_sequences.size(); i++) {
+ if(!(edge_sequences[i] < !edge_sequences[i])) {
+ edge_sequences[size] = edge_sequences[i];
+ size++;
+ }
+ }
+ edge_sequences.resize(size);
+ }
+
+public:
+ DeBruijnGraphExtentionConstructor(Graph& graph, DeBruijn &origin) :
+ graph_(graph), origin_(origin), kmer_size_(graph.k()) {
+ }
+
+ void ConstructGraph(size_t queueMinSize, size_t queueMaxSize,
+ double queueGrowthRate, bool keep_perfect_loops) {
+ std::vector<Sequence> edge_sequences;
+ if(keep_perfect_loops)
+ edge_sequences = UnbranchingPathExtractor(origin_, kmer_size_).ExtractUnbranchingPathsAndLoops(queueMinSize, queueMaxSize, queueGrowthRate);
+ else
+ edge_sequences = UnbranchingPathExtractor(origin_, kmer_size_).ExtractUnbranchingPaths(queueMinSize, queueMaxSize, queueGrowthRate);
+ FilterRC(edge_sequences);
+ FastGraphFromSequencesConstructor<Graph>(kmer_size_, origin_).ConstructGraph(graph_, edge_sequences);
+ }
+
+private:
+ DECL_LOGGER("DeBruijnGraphConstructor")
+};
+
+}
diff --git a/src/modules/data_structures/debruijn_graph/early_simplification.hpp b/src/modules/data_structures/debruijn_graph/early_simplification.hpp
new file mode 100644
index 0000000..ccc89e6
--- /dev/null
+++ b/src/modules/data_structures/debruijn_graph/early_simplification.hpp
@@ -0,0 +1,269 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+#include "dev_support/standard_base.hpp"
+#include "data_structures/indices/perfect_hash_map.hpp"
+#include "data_structures/sequence/runtime_k.hpp"
+#include "data_structures/mph_index/kmer_index.hpp"
+
+namespace debruijn_graph {
+
+class LinkCleaner {
+private:
+ typedef DeBruijnExtensionIndex<> Index;
+ typedef Index::KMer Kmer;
+ typedef Index::KeyWithHash KeyWithHash;
+ Index &index_;
+
+ void CleanForwardLinks(KeyWithHash &kh, char i) {
+ if(index_.CheckOutgoing(kh, i)) {
+ KeyWithHash next_kh = index_.GetOutgoing(kh, i);
+ if(!index_.CheckIncoming(next_kh, kh[0])) {
+ index_.DeleteOutgoing(kh, i);
+ }
+ }
+ }
+
+ void CleanBackwardLinks(KeyWithHash &kh, char i) {
+ if(index_.CheckIncoming(kh, i)) {
+ KeyWithHash prev_kh = index_.GetIncoming(kh, i);
+ if(!index_.CheckOutgoing(prev_kh, kh[index_.k() - 1])) {
+ index_.DeleteIncoming(kh, i);
+ }
+ }
+ }
+
+public:
+ LinkCleaner(Index &index) : index_(index) {}
+
+ //TODO make parallel
+ void CleanLinks() {
+ vector<Index::kmer_iterator> iters = index_.kmer_begin(10 * cfg::get().max_threads);
+# pragma omp parallel for schedule(guided)
+ for(size_t i = 0; i < iters.size(); i++) {
+ for (Index::kmer_iterator &it = iters[i]; it.good(); ++it) {
+ KeyWithHash kh = index_.ConstructKWH(runtime_k::RtSeq(index_.k(), *it));
+ if (kh.is_minimal()) {
+ for (char i = 0; i < 4; i++) {
+ CleanForwardLinks(kh, i);
+ CleanBackwardLinks(kh, i);
+ }
+ }
+ }
+ }
+ }
+};
+
+
+class EarlyTipClipper {
+private:
+ typedef DeBruijnExtensionIndex<> Index;
+ typedef Index::KMer Kmer;
+ typedef Index::KeyWithHash KeyWithHash;
+ Index &index_;
+ size_t length_bound_;
+
+//Not optimal with respect to the number of large-array queries (the one that contains adjacency masks). Should be OK though if the cache works the way I think it does
+ size_t RemoveForward(KeyWithHash kh) {
+ std::vector<KeyWithHash> tip;
+ do {
+ tip.push_back(kh);
+ kh = index_.GetUniqueOutgoing(kh);
+ } while (tip.size() < length_bound_ && index_.CheckUniqueIncoming(kh) && index_.CheckUniqueOutgoing(kh));
+
+ if (!index_.CheckUniqueIncoming(kh)) {
+ for (size_t i = 0; i < tip.size(); i++) {
+ index_.IsolateVertex(tip[i]);
+ }
+ return tip.size();
+ }
+
+ return 0;
+ }
+
+ size_t RemoveBackward(KeyWithHash kh) {
+ std::vector<KeyWithHash> tip;
+ do {
+ tip.push_back(kh);
+ kh = index_.GetUniqueIncoming(kh);
+ } while(tip.size() < length_bound_ && index_.CheckUniqueIncoming(kh) && index_.CheckUniqueOutgoing(kh));
+
+ if (!index_.CheckUniqueOutgoing(kh)) {
+ for (size_t i = 0; i < tip.size(); i++) {
+ index_.IsolateVertex(tip[i]);
+ }
+ return tip.size();
+ }
+ return 0;
+ }
+
+ //TODO make parallel
+ size_t RoughClipTips() {
+ size_t result = 0;
+ for (auto it = index_.kmer_begin(); it.good(); ++it) {
+ KeyWithHash kh = index_.ConstructKWH(runtime_k::RtSeq(index_.k(), *it));
+ if (index_.IsDeadEnd(kh) && index_.CheckUniqueIncoming(kh)) {
+ result += RemoveBackward(kh);
+ } else if(index_.IsDeadStart(kh) && index_.CheckUniqueOutgoing(kh)) {
+ result += RemoveForward(kh);
+ }
+ }
+ return result;
+ }
+
+
+public:
+ EarlyTipClipper(Index &index, size_t length_bound) :
+ index_(index), length_bound_(length_bound) {}
+
+ /*
+ * Method returns the number of removed edges
+ */
+ size_t ClipTips() {
+ INFO("Early tip clipping");
+ size_t result = RoughClipTips();
+ LinkCleaner(index_).CleanLinks();
+ INFO(result << " " << (index_.k()+1) <<"-mers were removed by early tip clipper");
+ return result;
+ }
+protected:
+ DECL_LOGGER("Early tip clipping");
+};
+
+
+class AlternativeEarlyTipClipper {
+private:
+ typedef DeBruijnExtensionIndex<> Index;
+ typedef Index::KMer Kmer;
+ typedef Index::KeyWithHash KeyWithHash;
+ Index &index_;
+ size_t length_bound_;
+
+ /*
+ * This method starts from the k-mer that is second in the tip, counting from the junction vertex. It records all k-mers of the tip into the tip vector.
+ * The method returns the length of the tip.
+ * If the path does not end as a tip, or the tip is too long, the tip vector is cleared and an "infinite" length is returned.
+ * Thus the tip vector contains only k-mers to be removed, while the returned length still gives reasonable information about what happened.
+ */
+ size_t FindForward(KeyWithHash kh, vector<KeyWithHash> &tip) {
+ while(tip.size() < length_bound_ && index_.CheckUniqueIncoming(kh) && index_.CheckUniqueOutgoing(kh)) {
+ tip.push_back(kh);
+ kh = index_.GetUniqueOutgoing(kh);
+ }
+ tip.push_back(kh);
+ if(index_.CheckUniqueIncoming(kh) && index_.IsDeadEnd(kh)) {
+ return tip.size();
+ }
+ tip.clear();
+ return -1;
+ }
+
+ size_t FindBackward(KeyWithHash kh, vector<KeyWithHash> &tip) {
+ while(tip.size() < length_bound_ && index_.CheckUniqueOutgoing(kh) && index_.CheckUniqueIncoming(kh)) {
+ tip.push_back(kh);
+ kh = index_.GetUniqueIncoming(kh);
+ }
+ tip.push_back(kh);
+ if(index_.CheckUniqueOutgoing(kh) && index_.IsDeadStart(kh)) {
+ return tip.size();
+ }
+ tip.clear();
+ return -1;
+ }
+
+ size_t RemoveTip(vector<KeyWithHash > &tip) {
+ for(size_t i = 0; i < tip.size(); i++)
+ index_.IsolateVertex(tip[i]);
+ return tip.size();
+ }
+
+ size_t RemoveTips(vector<vector<KeyWithHash > > tips, size_t max) {
+ size_t result = 0;
+ for(char c = 0; c < 4; c++) {
+ if(tips[c].size() < max) {
+ result += RemoveTip(tips[c]);
+ }
+ }
+ return result;
+ }
+
+ size_t RemoveForward(KeyWithHash kh) {
+ vector<vector<KeyWithHash >> tips;
+ tips.resize(4);
+ size_t max = 0;
+ for(char c = 0; c < 4; c++) {
+ if(index_.CheckOutgoing(kh, c)) {
+ KeyWithHash khc = index_.GetOutgoing(kh, c);
+ size_t len = FindForward(khc, tips[c]);
+ if(len > max)
+ max = len;
+ }
+ }
+ return RemoveTips(tips, max);
+ }
+
+ size_t RemoveBackward(KeyWithHash kh) {
+ vector<vector<KeyWithHash >> tips;
+ tips.resize(4);
+ size_t max = 0;
+ for(char c = 0; c < 4; c++) {
+ if(index_.CheckIncoming(kh, c)) {
+ KeyWithHash khc = index_.GetIncoming(kh, c);
+ size_t len = FindBackward(khc, tips[c]);
+ if(len > max)
+ max = len;
+ }
+ }
+ return RemoveTips(tips, max);
+ }
+
+ //TODO make parallel
+ size_t RoughClipTips() {
+ vector<Index::kmer_iterator> iters = index_.kmer_begin(10 * cfg::get().max_threads);
+ vector<size_t> result(iters.size());
+# pragma omp parallel for schedule(guided)
+ for(size_t i = 0; i < iters.size(); i++) {
+ for(Index::kmer_iterator &it = iters[i]; it.good(); ++it) {
+ KeyWithHash kh = index_.ConstructKWH(runtime_k::RtSeq(index_.k(), *it));
+ if(kh.is_minimal()) {
+ if (index_.OutgoingEdgeCount(kh) >= 2) {
+ result[i] += RemoveForward(kh);
+ }
+ if (index_.IncomingEdgeCount(kh) >= 2) {
+ result[i] += RemoveBackward(kh);
+ }
+ }
+ }
+ }
+ size_t sum = 0;
+ for(size_t i = 0; i < result.size(); i++)
+ sum += result[i];
+ return sum;
+ }
+
+
+public:
+ AlternativeEarlyTipClipper(Index &index, size_t length_bound) : index_(index), length_bound_(length_bound) {
+ }
+
+ /*
+ * Method returns the number of removed edges
+ */
+ size_t ClipTips() {
+ INFO("Early tip clipping");
+ size_t result = RoughClipTips();
+ LinkCleaner(index_).CleanLinks();
+ INFO(result << " " << (index_.k()+1) <<"-mers were removed by early tip clipper");
+ return result;
+ }
+protected:
+ DECL_LOGGER("Early tip clipping");
+};
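+//Usage sketch (illustrative comment, not part of the original file): both
+//clippers operate on the extension index before graph condensation, roughly
+//  AlternativeEarlyTipClipper(ext_index, length_bound).ClipTips();
+//where `ext_index` is a filled DeBruijnExtensionIndex and `length_bound` is a
+//caller-chosen tip length threshold (both names are placeholders).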
+
+}
diff --git a/src/modules/data_structures/indices/edge_index_builders.hpp b/src/modules/data_structures/indices/edge_index_builders.hpp
new file mode 100644
index 0000000..6e20297
--- /dev/null
+++ b/src/modules/data_structures/indices/edge_index_builders.hpp
@@ -0,0 +1,179 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "edge_position_index.hpp"
+
+namespace debruijn_graph {
+
+template <class Builder>
+class GraphPositionFillingIndexBuilder : public Builder {
+ typedef Builder base;
+public:
+ typedef typename Builder::IndexT IndexT;
+ typedef typename IndexT::KMer Kmer;
+// typedef typename IndexT::GraphT GraphT;
+
+ template<class Graph>
+ void BuildIndexFromGraph(IndexT &index,
+ const Graph/*T*/ &g, size_t read_buffer_size = 0) const {
+ base::BuildIndexFromGraph(index, g, read_buffer_size);
+
+ // Now use the index to fill the coverage and EdgeId's
+ INFO("Collecting k-mer coverage information from graph, this takes a while.");
+ EdgeInfoUpdater<IndexT, Graph> updater(g, index);
+ updater.UpdateAll();
+ }
+
+};
+
+template<typename> struct Void { typedef void type; };
+
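+//SFINAE detection: has_contains<T> derives from std::true_type iff T provides a
+//contains(const T::KeyWithHash&) member. ContainsWrap() below dispatches on this
+//trait (the std::false_type overload currently just asserts).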
+template<typename T, typename Sfinae = void>
+struct has_contains: std::false_type {};
+
+template<typename T>
+struct has_contains<
+ T
+ , typename Void<
+ //decltype( std::declval<T&>().contains(typename T::KMerIdx(0), typename T::KMer()) )
+ decltype( ((T*)(0))->contains(*((typename T::KeyWithHash*)(0))) )
+ >::type
+>: std::true_type {};
+
+template <class Builder>
+class CoverageFillingEdgeIndexBuilder : public Builder {
+ typedef Builder base;
+ public:
+ typedef typename Builder::IndexT IndexT;
+ typedef typename IndexT::KMer Kmer;
+ typedef typename IndexT::KMerIdx KmerIdx;
+ typedef typename IndexT::KeyWithHash KeyWithHash;
+
+ private:
+
+
+ bool ContainsWrap(bool check_contains, IndexT& index, const KeyWithHash &kwh, std::true_type) const {
+ return !check_contains || index.contains(kwh);
+ }
+
+ bool ContainsWrap(bool /*check_contains*/, IndexT&/* index*/, const KeyWithHash &/*kwh*/, std::false_type) const {
+ VERIFY(false);
+// VERIFY(!check_contains);
+ return true;
+ }
+
+ template<class ReadStream>
+ size_t FillCoverageFromStream(ReadStream &stream,
+ IndexT &index, bool check_contains) const {
+ unsigned k = index.k();
+ size_t rl = 0;
+
+ while (!stream.eof()) {
+ typename ReadStream::ReadT r;
+ stream >> r;
+ rl = std::max(rl, r.size());
+
+ const Sequence &seq = r.sequence();
+ if (seq.size() < k)
+ continue;
+
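+ //Rolling k-mer: the dummy ">> 'A'" shift lets the loop below start at position
+ //k-1 and regenerate the first k-mer on its first iteration; canonical (minimal)
+ //k-mers with a valid index slot then get their count incremented atomically.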
+ KeyWithHash kwh = index.ConstructKWH(seq.start<Kmer>(k) >> 'A');
+ for (size_t j = k - 1; j < seq.size(); ++j) {
+ kwh <<= seq[j];
+ //contains() is not used since the index might still be empty here
+ if (kwh.is_minimal() && index.valid(kwh) && ContainsWrap(check_contains, index, kwh, has_contains<IndexT>())) {
+# pragma omp atomic
+ index.get_raw_value_reference(kwh).count += 1;
+ }
+ }
+ }
+
+ return rl;
+ }
+
+ public:
+
+ template<class Streams>
+ size_t ParallelFillCoverage(IndexT &index,
+ Streams &streams,
+ bool check_contains = true) const {
+ INFO("Collecting k-mer coverage information from reads, this takes a while.");
+ unsigned nthreads = (unsigned) streams.size();
+ size_t rl = 0;
+ streams.reset();
+#pragma omp parallel for num_threads(nthreads) shared(rl)
+ for (size_t i = 0; i < nthreads; ++i) {
+ size_t crl = FillCoverageFromStream(streams[i], index, check_contains);
+
+ // There is no max reduction in C/C++ OpenMP... Only in FORTRAN :(
+#pragma omp flush(rl)
+ if (crl > rl)
+#pragma omp critical
+ {
+ rl = std::max(rl, crl);
+ }
+ }
+
+ // Contigs have zero coverage!
+#if 0
+ if (contigs_stream) {
+ contigs_stream->reset();
+ FillCoverageFromStream(*contigs_stream, index, check_contains);
+ }
+#endif
+
+//todo if this verify is needed, put it outside
+//#ifndef NDEBUG
+// for (auto idx = index.kmer_idx_begin(), eidx = index.kmer_idx_end();
+// idx != eidx; ++idx) {
+//
+// Kmer k = index.kmer(idx);
+//
+// VERIFY(index[k].count == index[!k].count);
+// }
+//#endif
+
+ return rl;
+ }
+
+ template<class Streams>
+ size_t BuildIndexFromStream(IndexT &index,
+ Streams &streams,
+ io::SingleStream* contigs_stream = 0) const {
+ base::BuildIndexFromStream(index, streams, contigs_stream);
+
+ return ParallelFillCoverage(index, streams, false);
+ }
+
+// template<class Streams>
+// size_t BuildIndexWithCoverageFromGraph(
+// GraphT &graph, IndexT &index,
+// Streams &streams,
+// SingleReadStream* contigs_stream = 0) const {
+// this->BuildIndexFromGraph(index, graph);
+//
+// return ParallelFillCoverage(index, streams, contigs_stream, true);
+// }
+};
+
+template<class Index>
+struct EdgeIndexHelper {
+ typedef Index IndexT;
+ typedef typename IndexT::KMer Kmer;
+ typedef typename IndexT::KMerIdx KMerIdx;
+ typedef typename IndexT::traits_t traits_t;
+// typedef typename IndexT::IdType IdType;
+ typedef DeBruijnStreamKMerIndexBuilder<Kmer, IndexT> DeBruijnStreamKMerIndexBuilderT;
+ typedef CoverageFillingEdgeIndexBuilder<DeBruijnStreamKMerIndexBuilderT> CoverageFillingEdgeIndexBuilderT;
+ typedef DeBruijnGraphKMerIndexBuilder<IndexT> DeBruijnGraphKMerIndexBuilderT;
+ typedef GraphPositionFillingIndexBuilder<DeBruijnGraphKMerIndexBuilderT> GraphPositionFillingIndexBuilderT;
+ typedef CoverageFillingEdgeIndexBuilder<GraphPositionFillingIndexBuilderT> CoverageAndGraphPositionFillingIndexBuilderT;
+};
+
+}
diff --git a/src/modules/data_structures/indices/edge_info_updater.hpp b/src/modules/data_structures/indices/edge_info_updater.hpp
new file mode 100644
index 0000000..6c53b79
--- /dev/null
+++ b/src/modules/data_structures/indices/edge_info_updater.hpp
@@ -0,0 +1,107 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "dev_support/standard_base.hpp"
+#include "dev_support/openmp_wrapper.h"
+#include "modules/assembly_graph/graph_core/graph_iterators.hpp"
+
+namespace debruijn_graph {
+
+template<typename Index, typename Graph>
+class EdgeInfoUpdater {
+ typedef typename Index::KMer Kmer;
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Index::KeyWithHash KeyWithHash;
+ typedef typename Index::Value EdgeInfo;
+
+ const Graph &g_;
+ Index &index_;
+
+
+ void PutInIndex(const KeyWithHash &kwh, EdgeId id, size_t offset) {
+ if (index_.valid(kwh)) {
+ auto &entry = index_.get_raw_value_reference(kwh);
+ if (!entry.valid() || index_.contains(kwh)) {
+ index_.put_value(kwh, EdgeInfo(id, (unsigned)offset, entry.count));
+ }
+ }
+ }
+
+ //todo why do we need to check equality???!!!
+ bool DeleteIfEqual(const KeyWithHash &kwh, EdgeId e) {
+ if (!index_.contains(kwh))
+ return false;
+ if (index_.get_value(kwh).edge_id == e) {
+ index_.get_raw_value_reference(kwh).invalidate();
+ return true;
+ }
+ return false;
+ }
+
+ void UpdateKMers(const Sequence &nucls, EdgeId e) {
+ VERIFY(nucls.size() >= index_.k());
+ KeyWithHash kwh = index_.ConstructKWH(Kmer(index_.k(), nucls));
+ index_.PutInIndex(kwh, e, 0);
+ for (size_t i = index_.k(), n = nucls.size(); i < n; ++i) {
+ kwh <<= nucls[i];
+ index_.PutInIndex(kwh, e, i - index_.k() + 1);
+ }
+ }
+
+ void DeleteKMers(const Sequence &nucls, EdgeId e) {
+ VERIFY(nucls.size() >= index_.k());
+ KeyWithHash kwh = index_.ConstructKWH(Kmer(index_.k(), nucls));
+ DeleteIfEqual(kwh, e);
+ for (size_t i = index_.k(), n = nucls.size(); i < n; ++i) {
+ kwh <<= nucls[i];
+ DeleteIfEqual(kwh, e);
+ }
+ }
+
+ public:
+ /**
+ * Creates an EdgeInfoUpdater for the specified graph and index
+ * @param g graph to be indexed
+ * @param index index to be synchronized with graph
+ */
+ EdgeInfoUpdater(const Graph& g, Index& index)
+ : g_(g),
+ index_(index) {
+ }
+
+ void UpdateKmers(EdgeId e) {
+ Sequence nucls = g_.EdgeNucls(e);
+ UpdateKMers(nucls, e);
+ }
+
+ void DeleteKmers(EdgeId e) {
+ Sequence nucls = g_.EdgeNucls(e);
+ DeleteKMers(nucls, e);
+ }
+
+ void UpdateAll() {
+ unsigned nthreads = omp_get_max_threads();
+
+ omnigraph::IterationHelper<Graph, EdgeId> edges(g_);
+ auto iters = edges.Chunks(16 * nthreads);
+
+ #pragma omp parallel for schedule(guided)
+ for (size_t i = 0; i < iters.size() - 1; ++i) {
+ TRACE("Processing chunk #" << i);
+ for (auto it = iters[i]; it != iters[i + 1]; ++it) {
+ UpdateKmers(*it);
+ }
+ }
+ }
+
+ private:
+ DECL_LOGGER("EdgeInfoUpdater")
+};
+
+}
diff --git a/src/modules/data_structures/indices/edge_multi_index.hpp b/src/modules/data_structures/indices/edge_multi_index.hpp
new file mode 100644
index 0000000..e1e7e52
--- /dev/null
+++ b/src/modules/data_structures/indices/edge_multi_index.hpp
@@ -0,0 +1,161 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+/*
+ * edge_multi_index.hpp
+ *
+ * Created on: May 24, 2013
+ * Author: anton
+ */
+#include "perfect_hash_map.hpp"
+#include "edge_info_updater.hpp"
+#include "kmer_splitters.hpp"
+#include "edge_position_index.hpp"
+
+#include <folly/SmallLocks.h>
+
+namespace debruijn_graph {
+
+template<class IdType>
+class EdgeInfoStorage {
+public:
+ typedef vector<EdgeInfo<IdType>> Content;
+ typedef typename Content::iterator iterator;
+ typedef typename Content::const_iterator const_iterator;
+ Content content_;
+ folly::MicroSpinLock lock_;
+
+ EdgeInfoStorage(const Content &content) : content_(content) {
+ lock_.init();
+ }
+
+ EdgeInfoStorage() {
+ lock_.init();
+ }
+
+ EdgeInfo<IdType> &operator[](size_t i) {
+ return content_[i];
+ }
+
+ iterator begin() {
+ return content_.begin();
+ }
+
+ iterator end() {
+ return content_.end();
+ }
+
+ const_iterator begin() const {
+ return content_.cbegin();
+ }
+
+ const_iterator end() const {
+ return content_.cend();
+ }
+
+ iterator find(const EdgeInfo<IdType> &info) {
+ return content_.find(info);
+ }
+
+ const_iterator find(const EdgeInfo<IdType> &info) const {
+ return content_.find(info);
+ }
+
+ void push_back(const EdgeInfo<IdType> &info) {
+ folly::MSLGuard g(lock_);
+ content_.push_back(info);
+ }
+
+ template<class... Args>
+ void emplace_back(Args&&... args) {
+ folly::MSLGuard g(lock_);
+ content_.emplace_back(std::forward<Args>(args)...);
+ }
+
+ size_t size() const{
+ return content_.size();
+ }
+
+ bool valid() const {
+ //what's invalid edge info storage?
+ return true;
+ }
+
+ EdgeInfoStorage conjugate(size_t k) const {
+ EdgeInfoStorage result;
+ for(auto it = content_.rbegin(); it != content_.rend(); ++it) {
+ result.push_back(it->conjugate(k));
+ }
+ return result;
+ }
+};
+
+//todo it is not handling graph events!!!
+template<class IdType, class Seq = runtime_k::RtSeq,
+ class traits = kmer_index_traits<Seq>, class StoringType = SimpleStoring >
+class DeBruijnEdgeMultiIndex : public KeyStoringMap<Seq, EdgeInfoStorage<IdType>, traits, StoringType > {
+ typedef KeyStoringMap<Seq, EdgeInfoStorage<IdType>, traits, StoringType > base;
+ public:
+ typedef StoringType storing_type;
+ typedef typename base::traits_t traits_t;
+ typedef typename base::KMer KMer;
+ typedef typename base::KMerIdx KMerIdx;
+ typedef typename base::KeyWithHash KeyWithHash;
+ typedef EdgeInfoStorage<IdType> Value;
+
+ using base::ConstructKWH;
+// typedef typename base::IdType IdType;
+ //todo move this typedef up in hierarchy (need some c++ tricks)
+
+ DeBruijnEdgeMultiIndex(unsigned k, const std::string &workdir)
+ : base(k, workdir) {
+ INFO("Constructing multi-kmer index");
+ }
+
+ ~DeBruijnEdgeMultiIndex() {}
+
+
+ Value get(const KeyWithHash &kwh) const {
+ VERIFY(contains(kwh));
+ return base::get_value(kwh);
+ }
+
+ bool contains(const KeyWithHash &kwh) const {
+ if (!base::valid(kwh))
+ return false;
+ return this->get_raw_value_reference(kwh).valid();
+ }
+
+ bool valid(const KMer &kmer) const {
+ KeyWithHash kwh = base::ConstructKWH(kmer);
+ return base::valid(kwh);
+ }
+
+ void PutInIndex(const KeyWithHash &kwh, IdType id, size_t offset) {
+ if (!contains(kwh))
+ return;
+
+ EdgeInfoStorage<IdType> &entry = this->get_raw_value_reference(kwh);
+ entry.emplace_back(id, (unsigned int)offset);
+ }
+
+ const EdgeInfoStorage<IdType> get(const KMer& kmer) const {
+ auto kwh = base::ConstructKWH(kmer);
+ auto entry = this->get_value(kwh);
+ return entry;
+ }
+
+ //todo delete if equal seems to work improperly!!!
+ bool DeleteIfEqual(const KeyWithHash &, IdType) {
+ VERIFY(false);
+ return false;
+ }
+
+};
+
+}
diff --git a/src/modules/data_structures/indices/edge_position_index.hpp b/src/modules/data_structures/indices/edge_position_index.hpp
new file mode 100644
index 0000000..6652e48
--- /dev/null
+++ b/src/modules/data_structures/indices/edge_position_index.hpp
@@ -0,0 +1,191 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+/*
+ * edge_index.hpp
+ *
+ * Created on: May 24, 2013
+ * Author: anton
+ */
+
+#include "perfect_hash_map.hpp"
+#include "edge_info_updater.hpp"
+#include "kmer_splitters.hpp"
+
+namespace debruijn_graph {
+
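+//Value stored per k-mer in the edge index: the edge the k-mer belongs to, its
+//offset within that edge, and a coverage count. offset == -1u serves as the
+//"no position assigned" sentinel (see valid()/invalidate()).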
+template<class IdType>
+struct EdgeInfo {
+ IdType edge_id;
+ unsigned offset;
+ unsigned count;
+
+ EdgeInfo(IdType edge_id_ = IdType(), unsigned offset_ = -1u, unsigned count_ = 0) :
+ edge_id(edge_id_), offset(offset_), count(count_) { }
+
+ template<class KWH>
+ EdgeInfo conjugate(const KWH &kwh) const {
+ return conjugate(kwh.key().size());
+ }
+
+ EdgeInfo conjugate(size_t k) const {
+ if(!valid()) {
+ return EdgeInfo(IdType(0), unsigned(-1), count);
+ } else {
+ return EdgeInfo(edge_id->conjugate(), (unsigned)edge_id->length(k) - offset, count);
+ }
+ }
+
+ void invalidate() {
+ offset = unsigned(-1);
+ }
+
+ bool valid() const {
+ return offset != unsigned(-1);
+ }
+};
+
+template<class stream, class IdType>
+stream &operator<<(stream &s, const EdgeInfo<IdType> &info) {
+ return s << "EdgeInfo[" << info.edge_id << ", " << info.offset << ", " << info.count << "]";
+}
+
+template<class Graph, class Seq = runtime_k::RtSeq, class traits = kmer_index_traits<Seq>, class StoringType = DefaultStoring>
+class KmerFreeEdgeIndex : public KeyIteratingMap<Seq, EdgeInfo<typename Graph::EdgeId>, traits, StoringType> {
+ typedef KeyIteratingMap<Seq, EdgeInfo<typename Graph::EdgeId>, traits, StoringType> base;
+ const Graph &graph_;
+
+public:
+ typedef typename base::traits_t traits_t;
+ typedef StoringType storing_type;
+ typedef typename base::KMer KMer;
+ typedef typename base::KMerIdx KMerIdx;
+ typedef Graph GraphT;
+ typedef typename Graph::EdgeId IdType;
+ typedef typename base::KeyWithHash KeyWithHash;
+ typedef EdgeInfo<typename Graph::EdgeId> Value;
+ using base::valid;
+ using base::ConstructKWH;
+
+public:
+
+ KmerFreeEdgeIndex(const Graph &graph, const std::string &workdir)
+ : base(unsigned(graph.k() + 1), workdir), graph_(graph) {}
+
+ /**
+ * Returns whether the k-mer has an entry associated with it
+ */
+ bool contains(const KeyWithHash &kwh) const {
+ // Sanity check
+ if (!valid(kwh)) {
+ return false;
+ }
+
+ Value entry = base::get_value(kwh);
+
+ if (entry.offset == -1u) {
+ return false;
+ }
+
+ return kwh.key() == KMer(this->k(), graph_.EdgeNucls(entry.edge_id), entry.offset);
+ }
+
+ void PutInIndex(KeyWithHash &kwh, IdType id, size_t offset) {
+ if (valid(kwh)) {
+ auto &entry = this->get_raw_value_reference(kwh);
+ if (!entry.valid() || contains(kwh)) {
+ this->put_value(kwh, Value(id, (unsigned)offset, entry.count));
+ }
+ }
+ }
+
+ //Only the coverage counts are (de)serialized here; k-mers and edge ids are not stored
+ template<class Writer>
+ void BinWrite(Writer &writer) const {
+ this->index_.serialize(writer);
+ size_t sz = this->data_.size();
+ writer.write((char*)&sz, sizeof(sz));
+ for (size_t i = 0; i < sz; ++i)
+ writer.write((char*)&(this->data_[i].count), sizeof(this->data_[0].count));
+ }
+
+ template<class Reader>
+ void BinRead(Reader &reader, const std::string/* &FileName*/) {
+ this->clear();
+ this->index_.deserialize(reader);
+ size_t sz = 0;
+ reader.read((char*)&sz, sizeof(sz));
+ this->data_.resize(sz);
+ for (size_t i = 0; i < sz; ++i)
+ reader.read((char*)&(this->data_[i].count), sizeof(this->data_[0].count));
+ }
+};
+
+template<class Graph, class Seq = runtime_k::RtSeq, class traits = kmer_index_traits<Seq>, class StoringType = DefaultStoring>
+class KmerStoringEdgeIndex : public KeyStoringMap<Seq, EdgeInfo<typename Graph::EdgeId>, traits, StoringType> {
+ typedef KeyStoringMap<Seq, EdgeInfo<typename Graph::EdgeId>, traits, StoringType> base;
+
+public:
+ typedef typename base::traits_t traits_t;
+ typedef StoringType storing_type;
+ typedef typename base::KMer KMer;
+ typedef typename base::KMerIdx KMerIdx;
+ typedef Graph GraphT;
+ typedef typename Graph::EdgeId IdType;
+ typedef typename base::KeyWithHash KeyWithHash;
+ typedef EdgeInfo<typename Graph::EdgeId> Value;
+ using base::valid;
+ using base::ConstructKWH;
+
+
+ KmerStoringEdgeIndex(const Graph& g, const std::string &workdir)
+ : base(unsigned(g.k() + 1), workdir) {}
+
+ ~KmerStoringEdgeIndex() {}
+
+ /**
+ * Returns whether the k-mer has an entry associated with it
+ */
+ bool contains(const KeyWithHash &kwh) const {
+ if (!base::valid(kwh))
+ return false;
+ return this->get_raw_value_reference(kwh).valid();
+ }
+
+ template<class Writer>
+ void BinWrite(Writer &writer) const {
+ this->index_.serialize(writer);
+ size_t sz = this->data_.size();
+ writer.write((char*)&sz, sizeof(sz));
+ for (size_t i = 0; i < sz; ++i)
+ writer.write((char*)&(this->data_[i].count), sizeof(this->data_[0].count));
+ this->BinWriteKmers(writer);
+ }
+
+ template<class Reader>
+ void BinRead(Reader &reader, const std::string &FileName) {
+ this->clear();
+ this->index_.deserialize(reader);
+ size_t sz = 0;
+ reader.read((char*)&sz, sizeof(sz));
+ this->data_.resize(sz);
+ for (size_t i = 0; i < sz; ++i)
+ reader.read((char*)&(this->data_[i].count), sizeof(this->data_[0].count));
+ this->BinReadKmers(reader, FileName);
+ }
+ void PutInIndex(KeyWithHash &kwh, IdType id, size_t offset) {
+ if (valid(kwh)) {
+ auto &entry = this->get_raw_value_reference(kwh);
+ if (!entry.valid() || contains(kwh)) {
+ this->put_value(kwh, Value(id, (unsigned)offset, entry.count));
+ }
+ }
+ }
+};
+
+}
diff --git a/src/debruijn/indices/editable_index.hpp b/src/modules/data_structures/indices/editable_index.hpp
similarity index 100%
rename from src/debruijn/indices/editable_index.hpp
rename to src/modules/data_structures/indices/editable_index.hpp
diff --git a/src/debruijn/indices/key_with_hash.hpp b/src/modules/data_structures/indices/key_with_hash.hpp
similarity index 100%
rename from src/debruijn/indices/key_with_hash.hpp
rename to src/modules/data_structures/indices/key_with_hash.hpp
diff --git a/src/modules/data_structures/indices/kmer_extension_index.hpp b/src/modules/data_structures/indices/kmer_extension_index.hpp
new file mode 100644
index 0000000..6d1d5df
--- /dev/null
+++ b/src/modules/data_structures/indices/kmer_extension_index.hpp
@@ -0,0 +1,413 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+/*
+ * kmer_extension_index.hpp
+ *
+ * Created on: May 24, 2013
+ * Author: anton
+ */
+#include "perfect_hash_map.hpp"
+#include "kmer_splitters.hpp"
+#include "dev_support/simple_tools.hpp"
+#include "storing_traits.hpp"
+#include <bitset>
+
+namespace debruijn_graph {
+
+inline uint8_t invert_byte_slow(uint8_t a) {
+ size_t res = 0;
+ for(size_t i = 0; i < 8; i++) {
+ res <<= 1;
+ res += a & 1;
+ a = uint8_t(a >> 1);
+ }
+ return uint8_t(res);
+}
+
+inline vector<uint8_t> count_invert_byte() {
+ vector<uint8_t> result;
+ for(size_t a = 0; a < 256; a++) {
+ result.push_back(invert_byte_slow((uint8_t)a));
+ }
+ return result;
+}
+
+inline uint8_t invert_byte(uint8_t a) {
+ static vector<uint8_t> precalc = count_invert_byte();
+ return precalc[a];
+}
+
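+//Per-k-mer adjacency mask: the low nibble marks which of the four nucleotides
+//extend the k-mer to the right (outgoing), the high nibble which extend it to
+//the left (incoming). invert_byte() reverses the bit order, which maps the mask
+//of a k-mer onto the mask of its reverse complement (see conjugate()).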
+class InOutMask {
+private:
+ uint8_t mask_;
+
+ bool CheckUnique(uint8_t mask) const {
+ static bool unique[] =
+ { 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0 };
+ return unique[mask];
+ }
+
+ char GetUnique(uint8_t mask) const {
+ static char next[] = { -1, 0, 1, -1, 2, -1, -1, -1, 3, -1, -1, -1, -1,
+ -1, -1, -1 };
+ VERIFY(next[mask] != -1);
+ return next[mask];
+ }
+
+ size_t Count(uint8_t mask) const {
+ static char count[] = { 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4 };
+ return count[mask];
+ }
+
+
+ char inv_position(char nucl, bool as_is) const {
+ if(as_is)
+ return nucl;
+ else
+ return char(7 - nucl);
+ }
+
+public:
+ explicit InOutMask(uint8_t mask = 0) : mask_(mask){
+ }
+
+ uint8_t get_mask() const {
+ return mask_;
+ }
+
+ template<class Key>
+ InOutMask conjugate(const Key & /*k*/) const {
+ return InOutMask(invert_byte(mask_));
+ }
+
+ void AddOutgoing(char nnucl, bool as_is) {
+ unsigned nmask = (unsigned) (1 << inv_position(nnucl, as_is));
+ if (!(mask_ & nmask)) {
+# pragma omp atomic
+ mask_ |= (unsigned char) nmask;
+ }
+ }
+
+ void AddIncoming(char pnucl, bool as_is) {
+ unsigned pmask = (unsigned) (1 << inv_position(char(pnucl + 4), as_is));
+ if (!(mask_ & pmask)) {
+# pragma omp atomic
+ mask_|= (unsigned char) pmask;
+ }
+ }
+
+ void DeleteOutgoing(char nnucl, bool as_is) {
+ unsigned nmask = (1 << inv_position(nnucl, as_is));
+ if (mask_ & nmask) {
+# pragma omp atomic
+ mask_ &= (unsigned char) ~nmask;
+ }
+ }
+
+ void DeleteIncoming(char pnucl, bool as_is) {
+ unsigned pmask = (1 << inv_position(char(pnucl + 4), as_is));
+ if (mask_ & pmask) {
+# pragma omp atomic
+ mask_ &= (unsigned char) ~pmask;
+ }
+ }
+
+ void IsolateVertex() {
+ mask_ = 0;
+ }
+
+ bool CheckOutgoing(char nucl) const {
+ return mask_ & (1 << nucl);
+ }
+
+ bool CheckIncoming(char nucl) const {
+ return mask_ & (1 << (4 + nucl));
+ }
+
+ bool IsDeadEnd() const {
+ return !(mask_ & 15);
+ }
+
+ bool IsDeadStart() const {
+ return !(mask_ >> 4);
+ }
+
+ bool CheckUniqueOutgoing() const {
+ return CheckUnique(mask_ & 15);
+ }
+
+ bool CheckUniqueIncoming() const {
+ return CheckUnique(uint8_t(mask_ >> 4));
+ }
+
+ char GetUniqueOutgoing() const {
+ return GetUnique(mask_ & 15);
+ }
+
+ char GetUniqueIncoming() const {
+ return GetUnique(uint8_t(mask_ >> 4));
+ }
+
+ size_t OutgoingEdgeCount() const {
+ return Count(mask_ & 15);
+ }
+
+ size_t IncomingEdgeCount() const {
+ return Count(uint8_t(mask_ >> 4));
+ }
+};
+
+template<class Stream>
+Stream &operator<<(Stream& stream, const InOutMask &mask) {
+ return stream << std::bitset<8>(mask.get_mask());
+}
+
+template<class Seq>
+struct slim_kmer_index_traits : public kmer_index_traits<Seq> {
+ typedef kmer_index_traits<Seq> __super;
+
+ typedef MMappedRecordReader<typename Seq::DataType> FinalKMerStorage;
+
+ template<class Writer>
+ static void raw_serialize(Writer&, typename __super::RawKMerStorage*) {
+ VERIFY(false && "Cannot save extension index");
+ }
+
+ template<class Reader>
+ static typename __super::RawKMerStorage *raw_deserialize(
+ Reader&, const std::string &) {
+ VERIFY(false && "Cannot load extension index");
+ return NULL;
+ }
+
+};
+
+template<typename KeyWithHash>
+struct AbstractDeEdge {
+ KeyWithHash start;
+ KeyWithHash end;
+ AbstractDeEdge(KeyWithHash _start, KeyWithHash _end) : start(_start), end(_end) {
+ }
+
+ AbstractDeEdge<KeyWithHash> &operator=(const AbstractDeEdge<KeyWithHash> &that) {
+ this->start = that.start;
+ this->end = that.end;
+ return *this;
+ }
+
+ bool operator==(const AbstractDeEdge &other) {
+ return start.idx() == other.start.idx() && end.idx() == other.end.idx();
+ }
+
+ bool operator!=(const AbstractDeEdge &other) {
+ return !(*this == other);
+ }
+};
+
+template<class stream, class KWH>
+stream &operator<<(stream &s, const AbstractDeEdge<KWH> de_edge) {
+ return s << "DeEdge[" << de_edge.start << ", " << de_edge.end << "]";
+}
+
+template<class traits = slim_kmer_index_traits<runtime_k::RtSeq>, class StoringType = DefaultStoring>
+class DeBruijnExtensionIndex : public KeyIteratingMap<typename traits::SeqType, InOutMask, traits, StoringType> {
+ typedef KeyIteratingMap<typename traits::SeqType, InOutMask, traits, StoringType> base;
+
+public:
+ typedef typename base::traits_t traits_t;
+ typedef StoringType storing_type;
+ typedef typename base::KeyType KMer;
+ typedef typename base::IdxType KMerIdx;
+ typedef typename base::KeyWithHash KeyWithHash;
+ typedef AbstractDeEdge<KeyWithHash> DeEdge;
+ using base::ConstructKWH;
+
+ DeBruijnExtensionIndex(unsigned K, const std::string &workdir)
+ : base((size_t) K, workdir) {
+ }
+
+ void AddOutgoing(const KeyWithHash &kwh, char nucl) {
+ TRACE("Add outgoing " << kwh << " " << size_t(nucl) << " " << kwh.is_minimal());
+ this->get_raw_value_reference(kwh).AddOutgoing(nucl, kwh.is_minimal());
+ }
+
+ void AddIncoming(const KeyWithHash &kwh, char nucl) {
+ TRACE("Add incoming " << kwh << " " << size_t(nucl) << " " << kwh.is_minimal());
+ this->get_raw_value_reference(kwh).AddIncoming(nucl, kwh.is_minimal());
+ }
+
+ void DeleteOutgoing(const KeyWithHash &kwh, char nucl) {
+ TRACE("Delete outgoing " << kwh << " " << size_t(nucl) << " " << kwh.is_minimal());
+ this->get_raw_value_reference(kwh).DeleteOutgoing(nucl, kwh.is_minimal());
+ }
+
+ void DeleteIncoming(const KeyWithHash &kwh, char nucl) {
+ TRACE("Delete incoming " << kwh << " " << size_t(nucl) << " " << kwh.is_minimal());
+ this->get_raw_value_reference(kwh).DeleteIncoming(nucl, kwh.is_minimal());
+ }
+
+ void IsolateVertex(const KeyWithHash &kwh) {
+ TRACE("Isolate vertex " << kwh);
+ this->get_raw_value_reference(kwh).IsolateVertex();
+ }
+
+ bool CheckOutgoing(const KeyWithHash &kwh, char nucl) const {
+ return this->get_value(kwh).CheckOutgoing(nucl);
+ }
+
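+ //GetOutgoing/GetIncoming only compute the key of the neighbouring k-mer by
+ //shifting; callers are expected to check CheckOutgoing/CheckIncoming first.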
+ KeyWithHash GetOutgoing(const KeyWithHash &kwh, char nucl) const {
+ return kwh << nucl;
+ }
+
+ bool CheckIncoming(const KeyWithHash &kwh, char nucl) const {
+ return this->get_value(kwh).CheckIncoming(nucl);
+ }
+
+ KeyWithHash GetIncoming(const KeyWithHash &kwh, char nucl) const {
+ return kwh >> nucl;
+ }
+
+ bool IsDeadEnd(const KeyWithHash &kwh) const {
+ return this->get_value(kwh).IsDeadEnd();
+ }
+
+ bool IsDeadStart(const KeyWithHash &kwh) const {
+ return this->get_value(kwh).IsDeadStart();
+ }
+
+ bool CheckUniqueOutgoing(const KeyWithHash &kwh) const {
+ return this->get_value(kwh).CheckUniqueOutgoing();
+ }
+
+ KeyWithHash GetUniqueOutgoing(const KeyWithHash &kwh) const {
+ return GetOutgoing(kwh, this->get_value(kwh).GetUniqueOutgoing());
+ }
+
+ bool CheckUniqueIncoming(const KeyWithHash &kwh) const {
+ return this->get_value(kwh).CheckUniqueIncoming();
+ }
+
+ KeyWithHash GetUniqueIncoming(const KeyWithHash &kwh) const {
+ return GetIncoming(kwh, this->get_value(kwh).GetUniqueIncoming());
+ }
+
+ size_t OutgoingEdgeCount(const KeyWithHash &kwh) const {
+ return this->get_value(kwh).OutgoingEdgeCount();
+ }
+
+ size_t IncomingEdgeCount(const KeyWithHash &kwh) const {
+ return this->get_value(kwh).IncomingEdgeCount();
+ }
+
+ ~DeBruijnExtensionIndex() {
+ }
+
+private:
+ DECL_LOGGER("ExtentionIndex");
+};
+
+template<class Builder>
+class DeBruijnExtensionIndexBuilder : public Builder {
+ typedef Builder base;
+public:
+ typedef typename Builder::IndexT IndexT;
+
+ template<class ReadStream>
+ size_t FillExtensionsFromStream(ReadStream &stream, IndexT &index) const {
+ unsigned k = index.k();
+ size_t rl = 0;
+
+ while (!stream.eof()) {
+ typename ReadStream::read_type r;
+ stream >> r;
+ rl = std::max(rl, r.size());
+
+ const Sequence &seq = r.sequence();
+ if (seq.size() < k + 1)
+ continue;
+
+ typename IndexT::KeyWithHash kwh = index.ConstructKWH(seq.start<runtime_k::RtSeq>(k));
+ for (size_t j = k; j < seq.size(); ++j) {
+ char nnucl = seq[j], pnucl = kwh[0];
+ index.AddOutgoing(kwh, nnucl);
+ kwh <<= nnucl;
+ index.AddIncoming(kwh, pnucl);
+ }
+ }
+
+ return rl;
+ }
+
+ void FillExtensionsFromIndex(const std::string &KPlusOneMersFilename,
+ IndexT &index) const {
+ unsigned KPlusOne = index.k() + 1;
+
+ typename IndexT::kmer_iterator it(
+ KPlusOneMersFilename, runtime_k::RtSeq::GetDataSize(KPlusOne));
+ for (; it.good(); ++it) {
+ runtime_k::RtSeq kpomer(KPlusOne, *it);
+
+ char pnucl = kpomer[0], nnucl = kpomer[KPlusOne - 1];
+ TRACE("processing k+1-mer " << kpomer);
+ index.AddOutgoing(index.ConstructKWH(runtime_k::RtSeq(KPlusOne - 1, kpomer)),
+ nnucl);
+ // FIXME: This is extremely ugly. Need to add start/end methods to extract the first/last N symbols...
+ index.AddIncoming(index.ConstructKWH(runtime_k::RtSeq(KPlusOne - 1, kpomer << 0)),
+ pnucl);
+ }
+ }
+
+public:
+ template<class Streams>
+ ReadStatistics BuildExtensionIndexFromStream(
+ IndexT &index, Streams &streams, io::SingleStream* contigs_stream = 0,
+ size_t read_buffer_size = 0) const {
+ unsigned nthreads = (unsigned) streams.size();
+
+ // First, build a k+1-mer index
+ DeBruijnReadKMerSplitter<typename Streams::ReadT, StoringTypeFilter<typename IndexT::storing_type>> splitter(
+ index.workdir(), index.k() + 1, 0xDEADBEEF, streams,
+ contigs_stream, read_buffer_size);
+ KMerDiskCounter<runtime_k::RtSeq> counter(index.workdir(), splitter);
+ counter.CountAll(nthreads, nthreads, /* merge */false);
+
+ // Now, count unique k-mers from k+1-mers
+ DeBruijnKMerKMerSplitter<StoringTypeFilter<typename IndexT::storing_type> > splitter2(index.workdir(), index.k(),
+ index.k() + 1, IndexT::storing_type::IsInvertable(), read_buffer_size);
+ for (unsigned i = 0; i < nthreads; ++i)
+ splitter2.AddKMers(counter.GetMergedKMersFname(i));
+ KMerDiskCounter<runtime_k::RtSeq> counter2(index.workdir(), splitter2);
+
+ index.BuildIndex(counter2, 16, nthreads);
+
+ // Build the kmer extensions
+ INFO("Building k-mer extensions from k+1-mers");
+# pragma omp parallel for num_threads(nthreads)
+ for (unsigned i = 0; i < nthreads; ++i)
+ FillExtensionsFromIndex(counter.GetMergedKMersFname(i), index);
+ INFO("Building k-mer extensions from k+1-mers finished.");
+
+ return splitter.stats();
+ }
+
+private:
+ DECL_LOGGER("DeBruijnExtensionIndexBuilder");
+};
+
+template<class Index>
+struct ExtensionIndexHelper {
+ typedef Index IndexT;
+ typedef typename IndexT::traits_t traits_t;
+ typedef typename IndexT::KMer Kmer;
+ typedef typename IndexT::KMerIdx KMerIdx;
+ typedef DeBruijnStreamKMerIndexBuilder<Kmer, IndexT> DeBruijnStreamKMerIndexBuilderT;
+ typedef DeBruijnExtensionIndexBuilder<DeBruijnStreamKMerIndexBuilderT> DeBruijnExtensionIndexBuilderT;
+};
+
+}
diff --git a/src/modules/data_structures/indices/kmer_splitters.hpp b/src/modules/data_structures/indices/kmer_splitters.hpp
new file mode 100644
index 0000000..ba56656
--- /dev/null
+++ b/src/modules/data_structures/indices/kmer_splitters.hpp
@@ -0,0 +1,445 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "io/reads_io/io_helper.hpp"
+#include "storing_traits.hpp"
+
+#include "dev_support/file_limit.hpp"
+#include "data_structures/sequence/runtime_k.hpp"
+
+namespace debruijn_graph {
+
+template<class StoringType>
+struct StoringTypeFilter {
+};
+
+template<>
+struct StoringTypeFilter<SimpleStoring> {
+ template<class Kmer>
+ bool filter(const Kmer &/*kmer*/) const {
+ return true;
+ }
+};
+
+template<>
+struct StoringTypeFilter<InvertableStoring> {
+ template<class Kmer>
+ bool filter(const Kmer &kmer) const {
+ return kmer.IsMinimal();
+ }
+};
+
+// used for temporary reads storage during parallel reading
+static const size_t READS_BUFFER_SIZE = 536870912; // 512 MB in bytes
+
+typedef ::KMerSplitter<runtime_k::RtSeq> RtSeqKMerSplitter;
+
+typedef KMerVector<runtime_k::RtSeq> RtSeqKMerVector;
+typedef std::vector<RtSeqKMerVector> KMerBuffer;
+
+template<class KmerFilter>
+class DeBruijnKMerSplitter : public RtSeqKMerSplitter {
+ private:
+ bool skip_not_minimal_;
+ KmerFilter kmer_filter_;
+ protected:
+ size_t read_buffer_size_;
+ protected:
+ size_t FillBufferFromSequence(const Sequence &seq,
+ KMerBuffer &buffer, unsigned num_files) const {
+ size_t kmers = 0;
+
+ if (seq.size() < this->K_)
+ return kmers;
+
+ runtime_k::RtSeq kmer = seq.start<runtime_k::RtSeq>(this->K_) >> 'A';
+ for (size_t j = this->K_ - 1; j < seq.size(); ++j) {
+ kmer <<= seq[j];
+ if(kmer_filter_.filter(kmer)) {
+ buffer[this->GetFileNumForSeq(kmer, num_files)].push_back(kmer);
+ kmers++;
+ }
+ }
+ return kmers;
+ }
+
+
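+ //Merges the per-thread buffers destined for each output file, sorts and
+ //deduplicates the k-mers, and appends them to the corresponding temporary
+ //file; the plain fopen/fwrite is serialized with an omp critical section.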
+ void DumpBuffers(size_t num_files, size_t nthreads,
+ std::vector<KMerBuffer> &buffers,
+ const path::files_t &ostreams) const{
+ # pragma omp parallel for
+ for (unsigned k = 0; k < num_files; ++k) {
+ size_t sz = 0;
+ for (size_t i = 0; i < nthreads; ++i)
+ sz += buffers[i][k].size();
+
+ KMerVector<runtime_k::RtSeq> SortBuffer(this->K_, sz);
+ for (size_t i = 0; i < nthreads; ++i) {
+ KMerBuffer &entry = buffers[i];
+ for (size_t j = 0; j < entry[k].size(); ++j)
+ SortBuffer.push_back(entry[k][j]);
+ }
+ libcxx::sort(SortBuffer.begin(), SortBuffer.end(), KMerVector<runtime_k::RtSeq>::less2_fast());
+ auto it = std::unique(SortBuffer.begin(), SortBuffer.end(), KMerVector<runtime_k::RtSeq>::equal_to());
+
+ # pragma omp critical
+ {
+ FILE *f = fopen(ostreams[k].c_str(), "ab");
+ VERIFY_MSG(f, "Cannot open temporary file to write");
+ fwrite(SortBuffer.data(), SortBuffer.el_data_size(), it - SortBuffer.begin(), f);
+ fclose(f);
+ }
+ }
+
+ for (unsigned i = 0; i < nthreads; ++i) {
+ for (unsigned j = 0; j < num_files; ++j) {
+ buffers[i][j].clear();
+ }
+ }
+ }
+
+ public:
+ DeBruijnKMerSplitter(const std::string &work_dir,
+ unsigned K, KmerFilter kmer_filter, size_t read_buffer_size = 0, uint32_t seed = 0)
+ : RtSeqKMerSplitter(work_dir, K, seed), kmer_filter_(kmer_filter), read_buffer_size_(read_buffer_size) {
+ }
+ protected:
+ DECL_LOGGER("DeBruijnKMerSplitter");
+};
+
+struct ReadStatistics {
+ size_t reads_;
+ size_t max_read_length_;
+ size_t bases_;
+};
+
+template<class Read, class KmerFilter>
+class DeBruijnReadKMerSplitter : public DeBruijnKMerSplitter<KmerFilter> {
+ io::ReadStreamList<Read> &streams_;
+ io::SingleStream *contigs_;
+
+ template<class ReadStream>
+ ReadStatistics
+ FillBufferFromStream(ReadStream& stream,
+ KMerBuffer &tmp_entries,
+ unsigned num_files, size_t cell_size) const;
+
+ ReadStatistics rs_;
+
+ public:
+ DeBruijnReadKMerSplitter(const std::string &work_dir,
+ unsigned K, uint32_t seed,
+ io::ReadStreamList<Read>& streams,
+ io::SingleStream* contigs_stream = 0,
+ size_t read_buffer_size = 0)
+ : DeBruijnKMerSplitter<KmerFilter>(work_dir, K, KmerFilter(), read_buffer_size, seed),
+ streams_(streams), contigs_(contigs_stream), rs_({0 ,0 ,0}) {
+ }
+
+ virtual path::files_t Split(size_t num_files);
+
+ size_t read_length() const { return rs_.max_read_length_; }
+ ReadStatistics stats() const { return rs_; }
+};
+
+template<class Read, class KmerFilter> template<class ReadStream>
+ReadStatistics
+DeBruijnReadKMerSplitter<Read, KmerFilter>::FillBufferFromStream(ReadStream &stream,
+ KMerBuffer &buffer,
+ unsigned num_files, size_t cell_size) const {
+ typename ReadStream::ReadT r;
+ size_t reads = 0, kmers = 0, rl = 0, bases = 0;
+
+ while (!stream.eof() && kmers < num_files * cell_size) {
+ stream >> r;
+ rl = std::max(rl, r.size());
+ reads += 1;
+ bases += r.size();
+
+ kmers += this->FillBufferFromSequence(r.sequence(), buffer, num_files);
+ }
+ return { reads, rl, bases };
+}
+
+template<class Read, class KmerFilter>
+path::files_t DeBruijnReadKMerSplitter<Read, KmerFilter>::Split(size_t num_files) {
+ unsigned nthreads = (unsigned) streams_.size();
+
+ INFO("Splitting kmer instances into " << num_files << " buckets. This might take a while.");
+
+ // Determine the set of output files
+ path::files_t out;
+ for (unsigned i = 0; i < num_files; ++i)
+ out.push_back(this->GetRawKMersFname(i));
+
+ size_t file_limit = num_files + 2*nthreads;
+ size_t res = limit_file(file_limit);
+ if (res < file_limit) {
+ WARN("Failed to setup necessary limit for number of open files. The process might crash later on.");
+ WARN("Do 'ulimit -n " << file_limit << "' in the console to overcome the limit");
+ }
+
+ size_t reads_buffer_size = DeBruijnKMerSplitter<KmerFilter>::read_buffer_size_;
+ if (reads_buffer_size == 0) {
+ reads_buffer_size = READS_BUFFER_SIZE;
+ size_t mem_limit = (size_t)((double)(get_free_memory()) / (nthreads * 3));
+ INFO("Memory available for splitting buffers: " << (double)mem_limit / 1024.0 / 1024.0 / 1024.0 << " Gb");
+ reads_buffer_size = std::min(reads_buffer_size, mem_limit);
+ }
+ size_t cell_size = reads_buffer_size /
+ (num_files * runtime_k::RtSeq::GetDataSize(this->K_) * sizeof(runtime_k::RtSeq::DataType));
+
+ // Set sane minimum cell size
+ if (cell_size < 16384)
+ cell_size = 16384;
+ INFO("Using cell size of " << cell_size);
+
+ std::vector<KMerBuffer> tmp_entries(nthreads);
+ for (unsigned i = 0; i < nthreads; ++i) {
+ KMerBuffer &entry = tmp_entries[i];
+ entry.resize(num_files, RtSeqKMerVector(this->K_, (size_t) (1.1 * (double) cell_size)));
+ }
+
+ size_t counter = 0, rl = 0, bases = 0, n = 15;
+ streams_.reset();
+ while (!streams_.eof()) {
+# pragma omp parallel for num_threads(nthreads) reduction(+ : counter) reduction(+ : bases) shared(rl)
+ for (size_t i = 0; i < nthreads; ++i) {
+ ReadStatistics stats = FillBufferFromStream(streams_[i], tmp_entries[i], (unsigned) num_files, cell_size);
+ counter += stats.reads_;
+ bases += stats.bases_;
+
+ // OpenMP before 3.1 offers no max reduction for C/C++, so the shared maximum is updated manually below.
+# pragma omp flush(rl)
+ if (stats.max_read_length_ > rl)
+# pragma omp critical
+ {
+ rl = std::max(rl, stats.max_read_length_);
+ }
+ }
+
+ this->DumpBuffers(num_files, nthreads, tmp_entries, out);
+
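+ // Report progress roughly every time the counter passes the next power of two.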
+ if (counter >> n) {
+ INFO("Processed " << counter << " reads");
+ n += 1;
+ }
+ }
+
+ if (contigs_) {
+ INFO("Adding contigs from previous K");
+ size_t cnt = 0;
+ contigs_->reset();
+ while (!contigs_->eof()) {
+ FillBufferFromStream(*contigs_, tmp_entries[cnt], (unsigned) num_files, cell_size);
+ this->DumpBuffers(num_files, nthreads, tmp_entries, out);
+ if (++cnt >= nthreads)
+ cnt = 0;
+ }
+ }
+
+ INFO("Used " << counter << " reads. Maximum read length " << rl);
+ INFO("Average read length " << double(bases) / double(counter));
+ rs_ = { counter, rl, bases };
+
+ return out;
+}
+
+template<class Graph, class KmerFilter>
+class DeBruijnGraphKMerSplitter : public DeBruijnKMerSplitter<KmerFilter> {
+ typedef typename Graph::ConstEdgeIt EdgeIt;
+ typedef typename Graph::EdgeId EdgeId;
+
+ const Graph &g_;
+
+ size_t FillBufferFromEdges(EdgeIt &edge,
+ KMerBuffer &tmp_entries,
+ unsigned num_files, size_t cell_size) const;
+
+ public:
+ DeBruijnGraphKMerSplitter(const std::string &work_dir,
+ unsigned K, const Graph &g, size_t read_buffer_size = 0)
+ : DeBruijnKMerSplitter<KmerFilter>(work_dir, K, KmerFilter(), read_buffer_size), g_(g) {}
+
+ virtual path::files_t Split(size_t num_files);
+};
+
+template<class Graph, class KmerFilter>
+size_t
+DeBruijnGraphKMerSplitter<Graph, KmerFilter>::FillBufferFromEdges(EdgeIt &edge,
+ KMerBuffer &buffer,
+ unsigned num_files, size_t cell_size) const {
+ size_t seqs = 0;
+ for (size_t kmers = 0; !edge.IsEnd() && kmers < num_files * cell_size; ++edge) {
+ const Sequence &nucls = g_.EdgeNucls(*edge);
+
+ kmers += this->FillBufferFromSequence(nucls, buffer, num_files);
+ seqs += 1;
+ }
+
+ return seqs;
+}
+
+template<class Graph, class KmerFilter>
+path::files_t DeBruijnGraphKMerSplitter<Graph, KmerFilter>::Split(size_t num_files) {
+ INFO("Splitting kmer instances into " << num_files << " buckets. This might take a while.");
+
+ // Determine the set of output files
+ path::files_t out;
+ for (unsigned i = 0; i < num_files; ++i)
+ out.push_back(this->GetRawKMersFname(i));
+
+ size_t file_limit = num_files + 2*16;
+ size_t res = limit_file(file_limit);
+ if (res < file_limit) {
+ WARN("Failed to set up the necessary limit for the number of open files. The process might crash later on.");
+ WARN("Run 'ulimit -n " << file_limit << "' in the console to raise the limit");
+ }
+
+ size_t reads_buffer_size = DeBruijnKMerSplitter<KmerFilter>::read_buffer_size_;
+ if (reads_buffer_size == 0) {
+ reads_buffer_size = READS_BUFFER_SIZE;
+ size_t mem_limit = (size_t)((double)(get_free_memory()) / (3));
+ INFO("Memory available for splitting buffers: " << (double)mem_limit / 1024.0 / 1024.0 / 1024.0 << " Gb");
+ reads_buffer_size = std::min(reads_buffer_size, mem_limit);
+ }
+ size_t cell_size = reads_buffer_size /
+ (num_files * runtime_k::RtSeq::GetDataSize(this->K_) * sizeof(runtime_k::RtSeq::DataType));
+ INFO("Using cell size of " << cell_size);
+
+ std::vector<KMerBuffer> tmp_entries(1);
+ KMerBuffer &entry = tmp_entries[0];
+ entry.resize(num_files, RtSeqKMerVector(this->K_, (size_t) (1.1 * (double) cell_size)));
+
+ size_t counter = 0, n = 10;
+ for (auto it = g_.ConstEdgeBegin(); !it.IsEnd(); ) {
+ counter += FillBufferFromEdges(it, tmp_entries[0], (unsigned) num_files, cell_size);
+
+ this->DumpBuffers(num_files, 1, tmp_entries, out);
+
+ if (counter >> n) {
+ INFO("Processed " << counter << " edges");
+ n += 1;
+ }
+ }
+
+ INFO("Used " << counter << " sequences.");
+
+ return out;
+}
+
+
+template<class KmerFilter>
+class DeBruijnKMerKMerSplitter : public DeBruijnKMerSplitter<KmerFilter> {
+ typedef MMappedFileRecordArrayIterator<runtime_k::RtSeq::DataType> kmer_iterator;
+
+ unsigned K_source_;
+ std::vector<std::string> kmers_;
+ bool add_rc_;
+
+ size_t FillBufferFromKMers(kmer_iterator &kmer,
+ KMerBuffer &tmp_entries,
+ unsigned num_files, size_t cell_size) const;
+
+ public:
+ DeBruijnKMerKMerSplitter(const std::string &work_dir,
+ unsigned K_target, unsigned K_source, bool add_rc, size_t read_buffer_size = 0)
+ : DeBruijnKMerSplitter<KmerFilter>(work_dir, K_target, KmerFilter(), read_buffer_size), K_source_(K_source), add_rc_(add_rc) {}
+
+ void AddKMers(const std::string &file) {
+ kmers_.push_back(file);
+ }
+
+ virtual path::files_t Split(size_t num_files);
+};
+
+template<class KmerFilter>
+inline size_t DeBruijnKMerKMerSplitter<KmerFilter>::FillBufferFromKMers(kmer_iterator &kmer,
+ KMerBuffer &buffer,
+ unsigned num_files, size_t cell_size) const {
+ size_t seqs = 0;
+ for (size_t kmers = 0; kmer.good() && kmers < num_files * cell_size; ++kmer) {
+ Sequence nucls(runtime_k::RtSeq(K_source_, *kmer));
+ kmers += this->FillBufferFromSequence(nucls, buffer, num_files);
+ if(add_rc_)
+ kmers += this->FillBufferFromSequence(!nucls, buffer, num_files);
+ seqs += 1;
+ }
+
+ return seqs;
+}
+
+template<class KmerFilter>
+inline path::files_t DeBruijnKMerKMerSplitter<KmerFilter>::Split(size_t num_files) {
+ unsigned nthreads = (unsigned) kmers_.size();
+ INFO("Splitting kmer instances into " << num_files << " buckets. This might take a while.");
+
+ // Determine the set of output files
+ path::files_t out;
+ for (unsigned i = 0; i < num_files; ++i)
+ out.push_back(this->GetRawKMersFname(i));
+
+ size_t file_limit = num_files + 2*nthreads;
+ size_t res = limit_file(file_limit);
+ if (res < file_limit) {
+ WARN("Failed to set up the necessary limit for the number of open files. The process might crash later on.");
+ WARN("Run 'ulimit -n " << file_limit << "' in the console to raise the limit");
+ }
+
+ size_t reads_buffer_size = DeBruijnKMerSplitter<KmerFilter>::read_buffer_size_;
+ if (reads_buffer_size == 0) {
+ reads_buffer_size = READS_BUFFER_SIZE;
+ size_t mem_limit = (size_t)((double)(get_free_memory()) / (nthreads * 3));
+ INFO("Memory available for splitting buffers: " << (double)mem_limit / 1024.0 / 1024.0 / 1024.0 << " Gb");
+ reads_buffer_size = std::min(reads_buffer_size, mem_limit);
+ }
+ size_t cell_size = reads_buffer_size /
+ (num_files * runtime_k::RtSeq::GetDataSize(this->K_) * sizeof(runtime_k::RtSeq::DataType));
+ // Set sane minimum cell size
+ if (cell_size < 16384)
+ cell_size = 16384;
+ INFO("Using cell size of " << cell_size);
+
+ std::vector<KMerBuffer> tmp_entries(nthreads);
+ for (unsigned i = 0; i < nthreads; ++i) {
+ KMerBuffer &entry = tmp_entries[i];
+ entry.resize(num_files, RtSeqKMerVector(this->K_, (size_t) (1.1 * (double) cell_size)));
+ }
+
+ size_t counter = 0, n = 10;
+ std::vector<kmer_iterator> its;
+ its.reserve(nthreads);
+ for (auto it = kmers_.begin(), et = kmers_.end(); it != et; ++it)
+ its.emplace_back(*it, runtime_k::RtSeq::GetDataSize(K_source_));
+
+ bool anygood = false;
+ do {
+# pragma omp parallel for num_threads(nthreads) reduction(+ : counter)
+ for (size_t i = 0; i < nthreads; ++i)
+ counter += FillBufferFromKMers(its[i], tmp_entries[i], (unsigned) num_files, cell_size);
+
+ this->DumpBuffers(num_files, nthreads, tmp_entries, out);
+
+ if (counter >> n) {
+ INFO("Processed " << counter << " kmers");
+ n += 1;
+ }
+
+ anygood = false;
+ for (auto it = its.begin(), et = its.end(); it != et; ++it)
+ anygood |= it->good();
+ } while (anygood);
+
+ INFO("Used " << counter << " kmers.");
+
+ return out;
+}
+
+
+}
diff --git a/src/modules/data_structures/indices/perfect_hash_map.hpp b/src/modules/data_structures/indices/perfect_hash_map.hpp
new file mode 100644
index 0000000..6a58abc
--- /dev/null
+++ b/src/modules/data_structures/indices/perfect_hash_map.hpp
@@ -0,0 +1,396 @@
+#pragma once
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#include "dev_support/openmp_wrapper.h"
+
+#include "io/reads_io/io_helper.hpp"
+
+#include "data_structures/mph_index/kmer_index.hpp"
+#include "utils/adt/kmer_vector.hpp"
+
+#include "libcxx/sort.hpp"
+
+#include "kmer_splitters.hpp"
+#include "key_with_hash.hpp"
+#include "values.hpp"
+#include "storing_traits.hpp"
+
+#include <vector>
+#include <cstdlib>
+#include <cstdio>
+#include <cstdint>
+#include <io/kmers_io/kmer_iterator.hpp>
+
+namespace debruijn_graph {
+
+template<class K, class traits>
+class IndexWrapper {
+ static const size_t InvalidIdx = size_t(-1);
+public:
+ typedef size_t IdxType;
+ typedef K KeyType;
+ typedef traits traits_t;
+protected:
+ typedef KMerIndex<traits> KMerIndexT;
+ //these fields are protected only for reduction of storage in edge indices BinWrite
+ KMerIndexT index_;
+private:
+ std::string workdir_;
+ unsigned k_;
+
+protected:
+ size_t raw_seq_idx(const typename KMerIndexT::KMerRawReference s) const {
+ return index_.raw_seq_idx(s);
+ }
+
+ bool valid(const size_t idx) const {
+ return idx != InvalidIdx && idx < index_.size();
+ }
+public:
+ IndexWrapper(size_t k, const std::string &workdir) : k_((unsigned) k) {
+ //fixme string literal
+ workdir_ = path::make_temp_dir(workdir, "kmeridx");
+ }
+
+ ~IndexWrapper() {
+ path::remove_dir(workdir_);
+ }
+
+ void clear() {
+ index_.clear();
+ }
+
+ unsigned k() const { return k_; }
+
+public:
+ template<class Writer>
+ void BinWrite(Writer &writer) const {
+ index_.serialize(writer);
+ }
+
+ template<class Reader>
+ void BinRead(Reader &reader, const std::string &) {
+ clear();
+ index_.deserialize(reader);
+ }
+
+ const std::string &workdir() const {
+ return workdir_;
+ }
+};
+
+template<class K, class V, class traits, class StoringType>
+class PerfectHashMap : public ValueArray<V>, public IndexWrapper<K, traits> {
+public:
+ typedef size_t IdxType;
+ typedef K KeyType;
+ typedef ValueArray<V> ValueBase;
+ typedef IndexWrapper<KeyType, traits> KeyBase;
+ using KeyBase::index_;
+ typedef typename KeyBase::KMerIndexT KMerIndexT;
+ typedef typename StoringTraits<K, KMerIndexT, StoringType>::KeyWithHash KeyWithHash;
+
+ KeyWithHash ConstructKWH(const KeyType &key) const {
+ return KeyWithHash(key, index_);
+ }
+
+ bool valid(const KeyWithHash &kwh) const {
+ return KeyBase::valid(kwh.idx());
+ }
+
+ PerfectHashMap(size_t k, const std::string &workdir) : KeyBase(k, workdir) {
+ }
+
+ ~PerfectHashMap() {
+ }
+
+ void clear() {
+ KeyBase::clear();
+ ValueBase::clear();
+ }
+
+ const V get_value(const KeyWithHash &kwh) const {
+ return StoringType::get_value(*this, kwh);
+ }
+
+ //Think twice or ask AntonB if you want to use it!
+ V &get_raw_value_reference(const KeyWithHash &kwh) {
+ return ValueBase::operator[](kwh.idx());
+ }
+
+ const V &get_raw_value_reference(const KeyWithHash &kwh) const {
+ return ValueBase::operator[](kwh.idx());
+ }
+
+ void put_value(const KeyWithHash &kwh, const V &value) {
+ StoringType::set_value(*this, kwh, value);
+ }
+
+ template<class Writer>
+ void BinWrite(Writer &writer) const {
+ ValueBase::BinWrite(writer);
+ KeyBase::BinWrite(writer);
+ }
+
+ template<class Reader>
+ void BinRead(Reader &reader, const std::string &tmp) {
+ KeyBase::BinRead(reader, tmp);
+ ValueBase::BinRead(reader, tmp);
+ }
+//todo think more about hierarchy
+protected:
+ template <class KmerCounter>
+ void BuildIndex(KmerCounter& counter, size_t bucket_num, size_t thread_num, bool save_final = true) {
+ KMerIndexBuilder<KMerIndexT> builder(this->workdir(),
+ (unsigned) bucket_num,
+ (unsigned) thread_num);
+ size_t sz = builder.BuildIndex(index_, counter, save_final);
+ ValueBase::resize(sz);
+ }
+};
+
+
+template<class K, class V, class traits, class StoringType>
+class KeyStoringMap : public PerfectHashMap<K, V, traits, StoringType> {
+private:
+ typedef PerfectHashMap<K, V, traits, StoringType> base;
+
+public:
+ typedef traits traits_t;
+ typedef K KMer;
+ typedef typename base::IdxType KMerIdx;
+ typedef typename traits::FinalKMerStorage::iterator kmer_iterator;
+ typedef typename traits::FinalKMerStorage::const_iterator const_kmer_iterator;
+ typedef typename base::KeyWithHash KeyWithHash;
+ using base::ConstructKWH;
+
+private:
+ typename traits::FinalKMerStorage *kmers_;
+
+ void SortUniqueKMers() const {
+ size_t swaps = 0;
+ INFO("Arranging kmers in hash map order");
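+ // In-place permutation: chase swap cycles until every k-mer sits in the slot given by its perfect-hash index.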
+ for (auto I = kmers_->begin(), E = kmers_->end(); I != E; ++I) {
+ size_t cidx = I - kmers_->begin();
+ size_t kidx = this->raw_seq_idx(*I);
+ while (cidx != kidx) {
+ auto J = kmers_->begin() + kidx;
+ using std::swap;
+ swap(*I, *J);
+ swaps += 1;
+ kidx = this->raw_seq_idx(*I);
+ }
+ }
+ INFO("Done. Total swaps: " << swaps);
+ }
+
+protected:
+ template<class Writer>
+ void BinWriteKmers(Writer &writer) const {
+ traits::raw_serialize(writer, this->kmers_);
+ }
+
+ template<class Reader>
+ void BinReadKmers(Reader &reader, const std::string &FileName) {
+ this->kmers_ = traits_t::raw_deserialize(reader, FileName);
+ }
+
+ template<class Writer>
+ void BinWrite(Writer &writer) const {
+ base::BinWrite(writer);
+ BinWriteKmers(writer);
+ }
+
+ template<class Reader>
+ void BinRead(Reader &reader, const std::string &FileName) {
+ base::BinRead(reader, FileName);
+ BinReadKmers(reader, FileName);
+ }
+
+public:
+
+ KeyStoringMap(size_t k, const std::string &workdir)
+ : base(k, workdir),
+ kmers_(NULL) {
+ }
+
+ ~KeyStoringMap() {
+ delete kmers_;
+ }
+
+ KMer true_kmer(KeyWithHash kwh) const {
+ VERIFY(this->valid(kwh));
+
+ auto it = this->kmers_->begin() + kwh.idx();
+ return (typename traits_t::raw_create()(this->k(), *it));
+ }
+
+ void clear() {
+ base::clear();
+ delete kmers_;
+ kmers_ = NULL;
+ }
+
+ kmer_iterator kmer_begin() {
+ return kmers_->begin();
+ }
+ const_kmer_iterator kmer_begin() const {
+ return kmers_->cbegin();
+ }
+
+ kmer_iterator kmer_end() {
+ return kmers_->end();
+ }
+ const_kmer_iterator kmer_end() const {
+ return kmers_->cend();
+ }
+
+ bool valid(const KeyWithHash &kwh) const {
+ if (!base::valid(kwh))
+ return false;
+
+ auto it = this->kmers_->begin() + kwh.idx();
+ if(!kwh.is_minimal())
+ return (typename traits_t::raw_equal_to()(!kwh.key(), *it));
+ else
+ return (typename traits_t::raw_equal_to()(kwh.key(), *it));
+ }
+
+ /**
+ * Number of edges going out of the param edge's end
+ */
+ unsigned NextEdgeCount(const KeyWithHash &kwh) const {
+ unsigned res = 0;
+ for (char c = 0; c < 4; ++c)
+ if (valid(kwh << c))
+ res += 1;
+
+ return res;
+ }
+
+ KeyWithHash NextEdge(const KeyWithHash &kwh) const { // returns any next edge
+ for (char c = 0; c < 4; ++c) {
+ if (valid(kwh << c))
+ //hack for this code to work with long seqs! (otherwise return s is totally fine)
+ return ConstructKWH(true_kmer(kwh));//s;
+ }
+
+ VERIFY_MSG(false, "Couldn't find requested edge!");
+ return ConstructKWH(KMer(this->k()));
+ // unreachable: no next edge exists, so the VERIFY above has already fired.
+ }
+
+ /**
+ * Number of edges coming into param edge's end
+ */
+ unsigned RivalEdgeCount(const KeyWithHash &kwh) const {
+ KeyWithHash next = kwh << 'A';
+ unsigned res = 0;
+ for (char c = 0; c < 4; ++c)
+ if (valid(next >> c))
+ res += 1;
+
+ return res;
+ }
+
+ template<class KmerCounter>
+ void BuildIndex(KmerCounter& counter, size_t bucket_num,
+ size_t thread_num) {
+ base::BuildIndex(counter, bucket_num, thread_num);
+ VERIFY(!kmers_);
+ kmers_ = counter.GetFinalKMers();
+ VERIFY(kmers_);
+ SortUniqueKMers();
+ }
+};
+
+template<class K, class V, class traits, class StoringType>
+class KeyIteratingMap : public PerfectHashMap<K, V, traits, StoringType> {
+ typedef PerfectHashMap<K, V, traits, StoringType> base;
+
+ std::string KMersFilename_;
+
+public:
+ typedef StoringType storing_type;
+ typedef typename base::traits_t traits_t;
+ typedef typename base::KeyType KMer;
+ typedef typename base::IdxType KMerIdx;
+ using base::ConstructKWH;
+
+public:
+
+ KeyIteratingMap(size_t k, const std::string &workdir)
+ : base(k, workdir),
+ KMersFilename_("") {
+ }
+
+ ~KeyIteratingMap() {
+ }
+
+ typedef MMappedFileRecordArrayIterator<typename KMer::DataType> kmer_iterator;
+
+ kmer_iterator kmer_begin() const {
+ return kmer_iterator(this->KMersFilename_, KMer::GetDataSize(base::k()));
+ }
+
+ std::vector<kmer_iterator> kmer_begin(size_t parts) const {
+ return io::make_kmer_iterator<KMer>(this->KMersFilename_, base::k(), parts);
+ }
+
+
+ template<class KmerCounter>
+ void BuildIndex(KmerCounter& counter, size_t bucket_num,
+ size_t thread_num) {
+ base::BuildIndex(counter, bucket_num, thread_num);
+ KMersFilename_ = counter.GetFinalKMersFname();
+ }
+};
+
+//Seq is here for partial specialization
+template <class Seq, class Index>
+class DeBruijnStreamKMerIndexBuilder {
+
+};
+
+template<class Index>
+class DeBruijnStreamKMerIndexBuilder<runtime_k::RtSeq, Index> {
+ public:
+ typedef Index IndexT;
+
+ template <class Streams>
+ size_t BuildIndexFromStream(IndexT &index,
+ Streams &streams,
+ io::SingleStream* contigs_stream = 0) const {
+ DeBruijnReadKMerSplitter<typename Streams::ReadT, StoringTypeFilter<typename IndexT::storing_type>>
+ splitter(index.workdir(), index.k(), 0, streams, contigs_stream);
+ KMerDiskCounter<runtime_k::RtSeq> counter(index.workdir(), splitter);
+
+ index.BuildIndex(counter, 16, streams.size());
+ return 0;
+ }
+};
+
+//fixme makes hierarchy a bit strange
+template <class Index, class Seq = typename Index::KMer>
+class DeBruijnGraphKMerIndexBuilder;
+
+template <class Index>
+class DeBruijnGraphKMerIndexBuilder<Index, runtime_k::RtSeq> {
+ public:
+ typedef Index IndexT;
+
+ template<class Graph>
+ void BuildIndexFromGraph(IndexT &index, const Graph &g, size_t read_buffer_size = 0) const {
+ DeBruijnGraphKMerSplitter<Graph, StoringTypeFilter<typename Index::storing_type>> splitter(index.workdir(), index.k(),
+ g, read_buffer_size);
+ KMerDiskCounter<runtime_k::RtSeq> counter(index.workdir(), splitter);
+ index.BuildIndex(counter, 16, 1);
+ }
+};
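+// Usage sketch (illustrative, assuming an Index type keyed by runtime_k::RtSeq):
+//   DeBruijnGraphKMerIndexBuilder<Index>().BuildIndexFromGraph(index, graph);
+// counts the graph k-mers on disk and builds the perfect-hash index over them.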
+
+}
diff --git a/src/modules/data_structures/indices/storing_traits.hpp b/src/modules/data_structures/indices/storing_traits.hpp
new file mode 100644
index 0000000..b91406f
--- /dev/null
+++ b/src/modules/data_structures/indices/storing_traits.hpp
@@ -0,0 +1,61 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+/*
+ * storing_traits.hpp
+ *
+ * Created on: Nov 7, 2013
+ * Author: anton
+ */
+
+#include "values.hpp"
+
+namespace debruijn_graph {
+
+
+struct SimpleStoring {
+ template<class K, class V>
+ static V get_value(const ValueArray<V> &values, const K& key) {
+ return values[key.idx()];
+ }
+
+ template<class K, class V>
+ static void set_value(ValueArray<V> &values, const K& key, const V& value) {
+ values[key.idx()] = value;
+ }
+
+ static bool IsInvertable() {
+ return false;
+ }
+};
+
+struct InvertableStoring {
+ template<class K, class V>
+ static V get_value(const ValueArray<V> &values, const K& key) {
+ if(key.is_minimal())
+ return values[key.idx()];
+ else
+ return values[key.idx()].conjugate(key);
+ }
+
+ template<class K, class V>
+ static void set_value(ValueArray<V> &values, const K& key, const V& value) {
+ if(key.is_minimal())
+ values[key.idx()] = value;
+ else
+ values[key.idx()] = value.conjugate(key);
+ }
+
+ static bool IsInvertable() {
+ return true;
+ }
+};
+
+typedef InvertableStoring DefaultStoring;
+
+}
diff --git a/src/debruijn/indices/values.hpp b/src/modules/data_structures/indices/values.hpp
similarity index 100%
rename from src/debruijn/indices/values.hpp
rename to src/modules/data_structures/indices/values.hpp
diff --git a/src/modules/data_structures/mph_index/CMakeLists.txt b/src/modules/data_structures/mph_index/CMakeLists.txt
new file mode 100644
index 0000000..cf07729
--- /dev/null
+++ b/src/modules/data_structures/mph_index/CMakeLists.txt
@@ -0,0 +1,13 @@
+############################################################################
+# Copyright (c) 2015 Saint Petersburg State University
+# Copyright (c) 2011-2014 Saint Petersburg Academic University
+# All Rights Reserved
+# See file LICENSE for details.
+############################################################################
+
+project(mph_index CXX)
+
+add_library(mph_index STATIC bitpair_vector.cpp)
+
+target_link_libraries(mph_index cityhash)
+
diff --git a/src/include/mph_index/base_hash.hpp b/src/modules/data_structures/mph_index/base_hash.hpp
similarity index 100%
rename from src/include/mph_index/base_hash.hpp
rename to src/modules/data_structures/mph_index/base_hash.hpp
diff --git a/src/modules/data_structures/mph_index/bitpair_vector.cpp b/src/modules/data_structures/mph_index/bitpair_vector.cpp
new file mode 100644
index 0000000..de151bb
--- /dev/null
+++ b/src/modules/data_structures/mph_index/bitpair_vector.cpp
@@ -0,0 +1,77 @@
+//
+// Created by anton on 3/22/16.
+//
+
+#include "bitpair_vector.hpp"
+
+#include <iostream>
+
+void emphf::bitpair_vector::resize(uint64_t n) {
+ // can only grow, for now
+ assert(n >= size());
+ m_size = n;
+ m_bits.resize((m_size + 31) / 32);
+}
+
+size_t emphf::bitpair_vector::size() const {
+ return m_size;
+}
+
+size_t emphf::bitpair_vector::mem_size() const {
+ return m_bits.size() * sizeof(m_bits[0]);
+}
+
+uint64_t emphf::bitpair_vector::operator[](uint64_t pos) const {
+ return (m_bits[pos / 32] >> ((pos % 32) * 2)) % 4;
+}
+
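+// Each 64-bit word packs 32 two-bit values; clear the target pair, then OR in the new value.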
+void emphf::bitpair_vector::set(uint64_t pos, uint64_t val) {
+ assert(val < 4);
+ uint64_t word_pos = pos / 32;
+ uint64_t word_offset = (pos % 32) * 2;
+ m_bits[word_pos] &= ~(3ULL << word_offset);
+ m_bits[word_pos] |= val << word_offset;
+}
+
+uint64_t emphf::bitpair_vector::range_nonzeros(uint64_t begin, uint64_t end) const {
+ assert(begin <= end);
+ assert(end <= size());
+
+ uint64_t word_begin = begin / 32;
+ uint64_t offset_begin = (begin % 32) * 2;
+ uint64_t word_end = end / 32;
+ uint64_t offset_end = (end % 32) * 2;
+ uint64_t r = 0;
+
+ uint64_t word = (m_bits[word_begin] >> offset_begin) << offset_begin;
+ for (uint64_t w = word_begin; w < word_end; ++w) {
+ r += nonzero_pairs(word);
+ word = m_bits[w + 1];
+ }
+
+ uint64_t mask = (uint64_t(1) << offset_end) - 1;
+ r += nonzero_pairs(word & mask);
+
+ return r;
+}
+
+void emphf::bitpair_vector::swap(bitpair_vector& other) {
+ std::swap(m_size, other.m_size);
+ m_bits.swap(other.m_bits);
+}
+
+
+void emphf::bitpair_vector::save(std::ostream& os) const {
+ os.write(reinterpret_cast<char const*>(&m_size), sizeof(m_size));
+ os.write(reinterpret_cast<char const*>(m_bits.data()), (std::streamsize)(sizeof(m_bits[0]) * m_bits.size()));
+}
+
+void emphf::bitpair_vector::load(std::istream& is) {
+ is.read(reinterpret_cast<char*>(&m_size), sizeof(m_size));
+ m_bits.resize((m_size + 31) / 32);
+ is.read(reinterpret_cast<char*>(m_bits.data()), (std::streamsize)(sizeof(m_bits[0]) * m_bits.size()));
+}
+
+std::vector<uint64_t> const &emphf::bitpair_vector::data() const {
+ return m_bits;
+}
diff --git a/src/modules/data_structures/mph_index/bitpair_vector.hpp b/src/modules/data_structures/mph_index/bitpair_vector.hpp
new file mode 100644
index 0000000..0ecd88e
--- /dev/null
+++ b/src/modules/data_structures/mph_index/bitpair_vector.hpp
@@ -0,0 +1,27 @@
+#pragma once
+
+#include "common.hpp"
+#include <vector>
+
+namespace emphf {
+
+ class bitpair_vector {
+ public:
+ bitpair_vector(): m_size(0) {}
+ bitpair_vector(uint64_t n): m_size(0) { resize(n); }
+ void resize(uint64_t n);
+ size_t size() const;
+ size_t mem_size() const;
+ uint64_t operator[](uint64_t pos) const;
+ void set(uint64_t pos, uint64_t val);
+ uint64_t range_nonzeros(uint64_t begin, uint64_t end) const;
+ void swap(bitpair_vector& other);
+ void save(std::ostream& os) const;
+ void load(std::istream& is);
+ std::vector<uint64_t> const & data() const;
+ protected:
+ std::vector<uint64_t> m_bits;
+ uint64_t m_size;
+ };
+
+}
diff --git a/src/modules/data_structures/mph_index/common.hpp b/src/modules/data_structures/mph_index/common.hpp
new file mode 100644
index 0000000..b39e686
--- /dev/null
+++ b/src/modules/data_structures/mph_index/common.hpp
@@ -0,0 +1,66 @@
+#pragma once
+
+#include <cstdint>
+#include <iterator>
+#include <memory>
+#include <cassert>
+
+#include "emphf_config.hpp"
+
+namespace emphf {
+
+ template <typename Iterator>
+ struct iter_range
+ {
+ iter_range(Iterator b, Iterator e)
+ : m_begin(b)
+ , m_end(e)
+ {}
+
+ Iterator begin() const
+ { return m_begin; }
+
+ Iterator end() const
+ { return m_end; }
+
+ Iterator m_begin, m_end;
+ };
+
+ typedef std::pair<uint8_t const*, uint8_t const*> byte_range_t;
+
+ struct identity_adaptor
+ {
+ byte_range_t operator()(byte_range_t s) const
+ {
+ return s;
+ }
+ };
+
+ template <typename Iterator>
+ iter_range<Iterator> range(Iterator begin, Iterator end)
+ {
+ return iter_range<Iterator>(begin, end);
+ }
+
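+ // Count how many 2-bit pairs of x are nonzero: collapse each pair to a single bit, then count set bits (hardware popcount or SWAR fallback).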
+ inline uint64_t nonzero_pairs(uint64_t x)
+ {
+ static const uint64_t ones_step_4 = 0x1111111111111111ULL;
+ x = (x | (x >> 1)) & (0x5 * ones_step_4);
+
+#if EMPHF_USE_POPCOUNT
+ return (uint64_t)__builtin_popcountll(x);
+#else
+ static const uint64_t ones_step_8 = 0x0101010101010101ULL;
+ x = (x & 3 * ones_step_4) + ((x >> 2) & 3 * ones_step_4);
+ x = (x + (x >> 4)) & 0x0f * ones_step_8;
+ return (x * ones_step_8) >> 56;
+#endif
+ }
+
+ inline uint64_t msb(uint64_t x)
+ {
+ assert(x);
+ return 63 - __builtin_clzll(x);
+ }
+
+}
diff --git a/src/include/mph_index/emphf_config.hpp b/src/modules/data_structures/mph_index/emphf_config.hpp
similarity index 100%
rename from src/include/mph_index/emphf_config.hpp
rename to src/modules/data_structures/mph_index/emphf_config.hpp
diff --git a/src/include/mph_index/hypergraph.hpp b/src/modules/data_structures/mph_index/hypergraph.hpp
similarity index 100%
rename from src/include/mph_index/hypergraph.hpp
rename to src/modules/data_structures/mph_index/hypergraph.hpp
diff --git a/src/modules/data_structures/mph_index/hypergraph_sorter_seq.hpp b/src/modules/data_structures/mph_index/hypergraph_sorter_seq.hpp
new file mode 100644
index 0000000..649be20
--- /dev/null
+++ b/src/modules/data_structures/mph_index/hypergraph_sorter_seq.hpp
@@ -0,0 +1,130 @@
+#pragma once
+
+#include <cassert>
+#include <cstdint>
+#include <tuple>
+#include <cmath>
+#include <vector>
+#include <iterator>
+#include <algorithm>
+#include <stdexcept>
+
+#include "common.hpp"
+#include "hypergraph.hpp"
+
+#include "dev_support/logger/logger.hpp"
+
+namespace emphf {
+
+ template <typename HypergraphType>
+ class hypergraph_sorter_seq {
+ public:
+ typedef HypergraphType hg;
+ typedef typename hg::node_t node_t;
+ typedef typename hg::hyperedge hyperedge;
+ typedef typename hg::xored_adj_list xored_adj_list;
+
+ hypergraph_sorter_seq()
+ {}
+
+ template <typename Range, typename EdgeGenerator>
+ bool try_generate_and_sort(Range const& input_range,
+ EdgeGenerator const& edge_gen,
+ size_t n,
+ size_t hash_domain,
+ bool verbose = true)
+ {
+ using std::get;
+ std::vector<xored_adj_list> adj_lists;
+
+ size_t m = hash_domain * 3;
+
+ // do all the allocations upfront
+ m_peeling_order.clear();
+ m_peeling_order.reserve(n);
+ adj_lists.resize(m);
+
+ // generate edges
+ if (verbose) {
+ //logger() << "Generating hyperedges and populating adjacency lists"
+ // << std::endl;
+ }
+
+ for (auto const& val: input_range) {
+ auto edge = edge_gen(val);
+ // canonical by construction
+ assert(orientation(edge) == 0);
+
+ adj_lists[edge.v0].add_edge(edge);
+
+ std::swap(edge.v0, edge.v1);
+ adj_lists[edge.v0].add_edge(edge);
+
+ std::swap(edge.v0, edge.v2);
+ adj_lists[edge.v0].add_edge(edge);
+ }
+
+ // peel
+ if (verbose) {
+ // logger() << "Peeling" << std::endl;
+ }
+
+ auto visit = [&](node_t v0) {
+ if (adj_lists[v0].degree == 1) {
+ auto edge = adj_lists[v0].edge_from(v0);
+ m_peeling_order.push_back(edge);
+
+ edge = canonicalize_edge(edge);
+ adj_lists[edge.v0].delete_edge(edge);
+
+ std::swap(edge.v0, edge.v1);
+ adj_lists[edge.v0].delete_edge(edge);
+
+ std::swap(edge.v0, edge.v2);
+ adj_lists[edge.v0].delete_edge(edge);
+ }
+ };
+
+ size_t queue_position = 0;
+ for (node_t v0 = 0; v0 < m; ++v0) {
+ visit(v0);
+
+ while (queue_position < m_peeling_order.size()) {
+ auto const& cur_edge = m_peeling_order[queue_position];
+
+ visit(cur_edge.v1);
+ visit(cur_edge.v2);
+ queue_position += 1;
+ }
+ }
+
+ if (m_peeling_order.size() < n) {
+ if (verbose) {
+ // logger() << "Hypergraph is not peelable: "
+ // << (n - m_peeling_order.size()) << " edges remaining"
+ // << std::endl;
+ }
+ return false;
+ }
+
+ assert(m_peeling_order.size() == n);
+
+ return true;
+ }
+
+ typedef typename std::vector<hyperedge>::const_reverse_iterator
+ peeling_iterator;
+
+ std::pair<peeling_iterator, peeling_iterator>
+ get_peeling_order() const
+ {
+ return std::make_pair(m_peeling_order.crbegin(),
+ m_peeling_order.crend());
+ }
+
+ private:
+
+ size_t m_hash_domain;
+ std::vector<hyperedge> m_peeling_order;
+ };
+}
diff --git a/src/modules/data_structures/mph_index/kmer_index.hpp b/src/modules/data_structures/mph_index/kmer_index.hpp
new file mode 100644
index 0000000..16d2c66
--- /dev/null
+++ b/src/modules/data_structures/mph_index/kmer_index.hpp
@@ -0,0 +1,530 @@
+#pragma once
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#include "io/kmers_io/mmapped_reader.hpp"
+#include "io/kmers_io/mmapped_writer.hpp"
+#include "utils/adt/pointer_iterator.hpp"
+
+#include "mphf.hpp"
+#include "base_hash.hpp"
+#include "hypergraph.hpp"
+#include "hypergraph_sorter_seq.hpp"
+
+#include "dev_support/openmp_wrapper.h"
+
+#include "dev_support/logger/logger.hpp"
+#include "dev_support/path_helper.hpp"
+
+#include "dev_support/memory_limit.hpp"
+
+#include <libcxx/sort.hpp>
+
+#include <algorithm>
+#ifdef USE_GLIBCXX_PARALLEL
+#include <parallel/algorithm>
+#endif
+#include <fstream>
+#include <vector>
+#include <cmath>
+
+#include "config.hpp"
+
+#ifdef SPADES_USE_JEMALLOC
+# include <jemalloc/jemalloc.h>
+#endif
+
+template<class Index>
+class KMerIndexBuilder;
+
+template<class Seq>
+struct kmer_index_traits {
+ typedef Seq SeqType;
+ typedef MMappedRecordArrayReader<typename Seq::DataType> RawKMerStorage;
+ typedef MMappedRecordArrayReader<typename Seq::DataType> FinalKMerStorage;
+ typedef typename RawKMerStorage::iterator raw_data_iterator;
+ typedef typename RawKMerStorage::const_iterator raw_data_const_iterator;
+ typedef typename RawKMerStorage::iterator::value_type KMerRawData;
+ typedef typename RawKMerStorage::iterator::reference KMerRawReference;
+ typedef typename RawKMerStorage::const_iterator::reference KMerRawConstReference;
+
+ struct raw_equal_to {
+ bool operator()(const Seq &lhs, const KMerRawReference rhs) {
+ return (array_equal_to<typename Seq::DataType>()(lhs.data(), lhs.data_size(), rhs));
+ }
+ };
+
+ struct raw_create {
+ Seq operator()(unsigned K, const KMerRawReference kmer) {
+ return Seq(K, kmer.data());
+ }
+ Seq operator()(unsigned K, const KMerRawConstReference kmer) {
+ return Seq(K, kmer.data());
+ }
+ };
+
+ struct hash_function {
+ uint64_t operator()(const Seq &k) const{
+ return typename Seq::hash()(k);
+ }
+ uint64_t operator()(const KMerRawReference k) const {
+ return typename Seq::hash()(k.data(), k.size());
+ }
+ };
+
+ struct KMerRawReferenceAdaptor {
+ emphf::byte_range_t operator()(const KMerRawReference k) const {
+ const uint8_t * data = (const uint8_t*)k.data();
+ return std::make_pair(data, data + k.data_size());
+ }
+ };
+
+ struct KMerSeqAdaptor {
+ emphf::byte_range_t operator()(const Seq &k) const {
+ const uint8_t * data = (const uint8_t*)k.data();
+ return std::make_pair(data, data + k.data_size() * sizeof(typename Seq::DataType));
+ }
+ };
+
+ template<class Writer>
+ static void raw_serialize(Writer &writer, RawKMerStorage *data) {
+ size_t sz = data->data_size(), elcnt = data->elcnt();
+ unsigned PageSize = getpagesize();
+ writer.write((char*)&sz, sizeof(sz));
+ writer.write((char*)&elcnt, sizeof(elcnt));
+ // Make sure data is aligned to the page boundary
+ size_t cpos = writer.tellp();
+ size_t pos = (cpos + PageSize - 1 + sizeof(size_t)) / PageSize * PageSize;
+ size_t off = pos - writer.tellp();
+ writer.write((char*)&off, sizeof(off));
+ writer.seekp(pos);
+ writer.write((char*)data->data(), data->data_size());
+ }
+
+ template<class Reader>
+ static RawKMerStorage *raw_deserialize(Reader &reader, const std::string &FileName) {
+ size_t sz, off, elcnt;
+ reader.read((char*)&sz, sizeof(sz));
+ reader.read((char*)&elcnt, sizeof(elcnt));
+ reader.read((char*)&off, sizeof(off));
+ off -= sizeof(off);
+ off += reader.tellg();
+
+ return new RawKMerStorage(FileName, elcnt, false, off, sz);
+ }
+
+};
+
+template<class traits>
+class KMerIndex {
+ public:
+ typedef traits kmer_index_traits;
+ typedef typename traits::SeqType KMerSeq;
+ typedef typename traits::hash_function hash_function;
+ typedef typename traits::KMerRawData KMerRawData;
+ typedef typename traits::KMerRawReference KMerRawReference;
+ typedef size_t IdxType;
+
+ private:
+ using KMerDataIndex = emphf::mphf<emphf::city_hasher>;
+ typedef KMerIndex __self;
+
+ public:
+ KMerIndex(): index_(NULL), num_buckets_(0), size_(0) {}
+
+ KMerIndex(const KMerIndex&) = delete;
+ KMerIndex& operator=(const KMerIndex&) = delete;
+
+ ~KMerIndex() { clear(); }
+
+ void clear() {
+ num_buckets_ = 0;
+ bucket_starts_.clear();
+
+ delete[] index_;
+ index_ = NULL;
+ }
+
+ size_t mem_size() {
+ size_t sz = 0;
+ for (size_t i = 0; i < num_buckets_; ++i)
+ sz += index_[i].mem_size();
+
+ return sz;
+ }
+
+ void count_size() {
+ if (index_ == NULL)
+ return;
+ size_ = 0;
+ for (size_t i = 0; i < num_buckets_; i++)
+ size_ += index_[i].size();
+ }
+
+ size_t size() const {
+ return size_;
+ }
+
+ size_t seq_idx(const KMerSeq &s) const {
+ size_t bucket = seq_bucket(s);
+
+ return bucket_starts_[bucket] +
+ index_[bucket].lookup(s, typename traits::KMerSeqAdaptor());
+ }
+
+ size_t raw_seq_idx(const KMerRawReference data) const {
+ size_t bucket = raw_seq_bucket(data);
+
+ return bucket_starts_[bucket] +
+ index_[bucket].lookup(data, typename traits::KMerRawReferenceAdaptor());
+ }
+
+ template<class Writer>
+ void serialize(Writer &os) const {
+ os.write((char*)&num_buckets_, sizeof(num_buckets_));
+ for (size_t i = 0; i < num_buckets_; ++i)
+ index_[i].save(os);
+ os.write((char*)&bucket_starts_[0], (num_buckets_ + 1) * sizeof(bucket_starts_[0]));
+ }
+
+ template<class Reader>
+ void deserialize(Reader &is) {
+ clear();
+
+ is.read((char*)&num_buckets_, sizeof(num_buckets_));
+
+ index_ = new KMerDataIndex[num_buckets_];
+ for (size_t i = 0; i < num_buckets_; ++i)
+ index_[i].load(is);
+
+ bucket_starts_.resize(num_buckets_ + 1);
+ is.read((char*)&bucket_starts_[0], (num_buckets_ + 1) * sizeof(bucket_starts_[0]));
+ count_size();
+ }
+
+ void swap(KMerIndex<traits> &other) {
+ std::swap(index_, other.index_);
+ std::swap(num_buckets_, other.num_buckets_);
+ std::swap(size_, other.size_);
+ std::swap(bucket_starts_, other.bucket_starts_);
+ }
+
+ private:
+ KMerDataIndex *index_;
+
+ size_t num_buckets_;
+ std::vector<size_t> bucket_starts_;
+ size_t size_;
+
+ size_t seq_bucket(const KMerSeq &s) const {
+ return hash_function()(s) % num_buckets_;
+ }
+ size_t raw_seq_bucket(const KMerRawReference data) const {
+ return hash_function()(data) % num_buckets_;
+ }
+
+ friend class KMerIndexBuilder<__self>;
+};
+
+template<class Seq>
+class KMerSplitter {
+ public:
+ typedef typename Seq::hash hash_function;
+
+ KMerSplitter(const std::string &work_dir, unsigned K, uint32_t seed = 0)
+ : work_dir_(work_dir), K_(K), seed_(seed) {}
+
+ virtual ~KMerSplitter() {}
+
+ virtual path::files_t Split(size_t num_files) = 0;
+
+ unsigned K() const { return K_; }
+
+ protected:
+ const std::string &work_dir_;
+ hash_function hash_;
+ unsigned K_;
+ uint32_t seed_;
+
+ std::string GetRawKMersFname(unsigned suffix) const {
+ return path::append_path(work_dir_, "kmers.raw." + std::to_string(suffix));
+ }
+
+ unsigned GetFileNumForSeq(const Seq &s, unsigned total) const {
+ return (unsigned)(hash_(s, seed_) % total);
+ }
+
+ DECL_LOGGER("K-mer Splitting");
+};
+
+template<class Seq, class traits = kmer_index_traits<Seq> >
+class KMerCounter {
+ public:
+ typedef typename traits::raw_data_iterator iterator;
+ typedef typename traits::raw_data_const_iterator const_iterator;
+ typedef typename traits::RawKMerStorage RawKMerStorage;
+ typedef typename traits::FinalKMerStorage FinalKMerStorage;
+
+ virtual size_t KMerSize() const = 0;
+
+ virtual size_t Count(unsigned num_buckets, unsigned num_threads) = 0;
+ virtual size_t CountAll(unsigned num_buckets, unsigned num_threads, bool merge = true) = 0;
+ virtual void MergeBuckets(unsigned num_buckets) = 0;
+
+ virtual void OpenBucket(size_t idx, bool unlink = true) = 0;
+ virtual void ReleaseBucket(size_t idx) = 0;
+ virtual RawKMerStorage* TransferBucket(size_t idx) = 0;
+ virtual FinalKMerStorage* GetFinalKMers() = 0;
+
+ virtual iterator bucket_begin(size_t idx) = 0;
+ virtual iterator bucket_end(size_t idx) = 0;
+
+ virtual ~KMerCounter() {}
+
+protected:
+ DECL_LOGGER("K-mer Counting");
+};
+
+template<class Seq, class traits = kmer_index_traits<Seq> >
+class KMerDiskCounter : public KMerCounter<Seq> {
+ typedef KMerCounter<Seq, traits> __super;
+public:
+ KMerDiskCounter(const std::string &work_dir, KMerSplitter<Seq> &splitter)
+ : work_dir_(work_dir), splitter_(splitter) {
+ std::string prefix = path::append_path(work_dir, "kmers_XXXXXX");
+ char *tempprefix = strcpy(new char[prefix.length() + 1], prefix.c_str());
+ VERIFY_MSG(-1 != (fd_ = ::mkstemp(tempprefix)), "Cannot create temporary file");
+ kmer_prefix_ = tempprefix;
+ delete[] tempprefix;
+ }
+
+ ~KMerDiskCounter() {
+ for (size_t i = 0; i < buckets_.size(); ++i)
+ ReleaseBucket(i);
+
+ ::close(fd_);
+ ::unlink(kmer_prefix_.c_str());
+ }
+
+ size_t KMerSize() const {
+ return Seq::GetDataSize(splitter_.K()) * sizeof(typename Seq::DataType);
+ }
+
+ void OpenBucket(size_t idx, bool unlink = true) {
+ unsigned K = splitter_.K();
+
+ buckets_[idx] = new MMappedRecordArrayReader<typename Seq::DataType>(GetMergedKMersFname((unsigned)idx), Seq::GetDataSize(K), unlink);
+ }
+
+ void ReleaseBucket(size_t idx) {
+ delete buckets_[idx];
+ buckets_[idx] = NULL;
+ }
+
+ MMappedRecordArrayReader<typename Seq::DataType>* TransferBucket(size_t idx) {
+ MMappedRecordArrayReader<typename Seq::DataType> *res = buckets_[idx];
+ buckets_[idx] = NULL;
+
+ return res;
+ }
+
+ typename __super::iterator bucket_begin(size_t idx) {
+ return buckets_[idx]->begin();
+ }
+ typename __super::iterator bucket_end(size_t idx) {
+ return buckets_[idx]->end();
+ }
+
+ size_t Count(unsigned num_buckets, unsigned num_threads) {
+ unsigned K = splitter_.K();
+
+ // Split k-mers into buckets.
+ path::files_t raw_kmers = splitter_.Split(num_buckets * num_threads);
+
+ INFO("Starting k-mer counting.");
+ size_t kmers = 0;
+# pragma omp parallel for shared(raw_kmers) num_threads(num_threads) schedule(dynamic) reduction(+:kmers)
+ for (unsigned iFile = 0; iFile < raw_kmers.size(); ++iFile) {
+ kmers += MergeKMers(raw_kmers[iFile], GetUniqueKMersFname(iFile), K);
+ }
+ INFO("K-mer counting done. There are " << kmers << " kmers in total. ");
+
+ INFO("Merging temporary buckets.");
+ for (unsigned i = 0; i < num_buckets; ++i) {
+ std::string ofname = GetMergedKMersFname(i);
+ std::ofstream ofs(ofname.c_str(), std::ios::out | std::ios::binary);
+ for (unsigned j = 0; j < num_threads; ++j) {
+ MMappedRecordArrayReader<typename Seq::DataType> ins(GetUniqueKMersFname(i + j * num_buckets), Seq::GetDataSize(K), /* unlink */ true);
+ ofs.write((const char*)ins.data(), ins.data_size());
+ }
+ }
+
+ buckets_.resize(num_buckets);
+
+ return kmers;
+ }
+
+ void MergeBuckets(unsigned num_buckets) {
+ unsigned K = splitter_.K();
+
+ INFO("Merging final buckets.");
+ for (unsigned i = 0; i < num_buckets; ++i)
+ VERIFY(buckets_[i] == NULL);
+
+ buckets_.clear();
+
+ MMappedRecordArrayWriter<typename Seq::DataType> os(GetFinalKMersFname(), Seq::GetDataSize(K));
+ std::string ofname = GetFinalKMersFname();
+ std::ofstream ofs(ofname.c_str(), std::ios::out | std::ios::binary);
+ for (unsigned j = 0; j < num_buckets; ++j) {
+ MMappedRecordArrayReader<typename Seq::DataType> ins(GetMergedKMersFname(j), Seq::GetDataSize(K), /* unlink */ true);
+ ofs.write((const char*)ins.data(), ins.data_size());
+ }
+ ofs.close();
+ }
+
+ size_t CountAll(unsigned num_buckets, unsigned num_threads, bool merge = true) {
+ size_t kmers = Count(num_buckets, num_threads);
+ if (merge)
+ MergeBuckets(num_buckets);
+
+ return kmers;
+ }
+
+ typename __super::FinalKMerStorage *GetFinalKMers() {
+ unsigned K = splitter_.K();
+ return new MMappedRecordArrayReader<typename Seq::DataType>(GetFinalKMersFname(), Seq::GetDataSize(K), /* unlink */ true);
+ }
+
+ std::string GetMergedKMersFname(unsigned suffix) const {
+ return kmer_prefix_ + ".merged." + std::to_string(suffix);
+ }
+
+ std::string GetFinalKMersFname() const {
+ return kmer_prefix_ + ".final";
+ }
+
+private:
+ std::string work_dir_;
+ KMerSplitter<Seq> &splitter_;
+ int fd_;
+ std::string kmer_prefix_;
+
+ std::vector<MMappedRecordArrayReader<typename Seq::DataType>*> buckets_;
+
+ std::string GetUniqueKMersFname(unsigned suffix) const {
+ return kmer_prefix_ + ".unique." + std::to_string(suffix);
+ }
+
+ size_t MergeKMers(const std::string &ifname, const std::string &ofname,
+ unsigned K) {
+ MMappedRecordArrayReader<typename Seq::DataType> ins(ifname, Seq::GetDataSize(K), /* unlink */ true);
+
+ // Sort the stuff
+ libcxx::sort(ins.begin(), ins.end(), array_less<typename Seq::DataType>());
+
+ // FIXME: Use something like parallel version of unique_copy but with explicit
+ // resizing.
+ auto it = std::unique(ins.begin(), ins.end(), array_equal_to<typename Seq::DataType>());
+
+ MMappedRecordArrayWriter<typename Seq::DataType> os(ofname, Seq::GetDataSize(K));
+ os.resize(it - ins.begin());
+ std::copy(ins.begin(), it, os.begin());
+
+ return it - ins.begin();
+ }
+};
+
+template<class Index>
+class KMerIndexBuilder {
+ typedef typename Index::KMerSeq Seq;
+ typedef typename Index::kmer_index_traits kmer_index_traits;
+
+ std::string work_dir_;
+ unsigned num_buckets_;
+ unsigned num_threads_;
+
+ public:
+ KMerIndexBuilder(const std::string &workdir,
+ unsigned num_buckets, unsigned num_threads)
+ : work_dir_(workdir), num_buckets_(num_buckets), num_threads_(num_threads) {}
+ size_t BuildIndex(Index &out, KMerCounter<Seq> &counter,
+ bool save_final = false);
+
+ unsigned num_buckets() const { return num_buckets_; }
+
+ private:
+
+ DECL_LOGGER("K-mer Index Building");
+};
+
+template<class Index>
+size_t KMerIndexBuilder<Index>::BuildIndex(Index &index, KMerCounter<Seq> &counter,
+ bool save_final) {
+ index.clear();
+
+ INFO("Building kmer index ");
+
+ // First, count the unique k-mers
+ size_t kmers = counter.Count(num_buckets_, num_threads_);
+
+ index.num_buckets_ = num_buckets_;
+ index.bucket_starts_.resize(num_buckets_ + 1);
+ index.index_ = new typename KMerIndex<kmer_index_traits>::KMerDataIndex[num_buckets_];
+
+ INFO("Building perfect hash indices");
+
+ // Index building requires roughly 36 bytes of working memory per k-mer plus the packed k-mer itself. Limit the number of threads so the per-bucket buffers fit within the memory limit.
+ unsigned num_threads = num_threads_;
+# ifdef SPADES_USE_JEMALLOC
+ const size_t *cmem = 0;
+ size_t clen = sizeof(cmem);
+
+ je_mallctl("stats.cactive", &cmem, &clen, NULL, 0);
+ size_t bucket_size = (36 * kmers + kmers * counter.KMerSize()) / num_buckets_;
+ num_threads = std::min<unsigned>((unsigned) ((get_memory_limit() - *cmem) / bucket_size), num_threads);
+ if (num_threads < 1)
+ num_threads = 1;
+ if (num_threads < num_threads_)
+ WARN("Number of threads was limited down to " << num_threads << " in order to fit the memory limits during the index construction");
+# endif
+
+# pragma omp parallel for shared(index) num_threads(num_threads)
+ for (unsigned iFile = 0; iFile < num_buckets_; ++iFile) {
+ typename KMerIndex<kmer_index_traits>::KMerDataIndex &data_index = index.index_[iFile];
+ counter.OpenBucket(iFile, !save_final);
+ size_t sz = counter.bucket_end(iFile) - counter.bucket_begin(iFile);
+ index.bucket_starts_[iFile + 1] = sz;
+ typename kmer_index_traits::KMerRawReferenceAdaptor adaptor;
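+ // Estimated hypergraph node count for this bucket (1.23x expansion, rounded up to a multiple of 3); it decides whether 32-bit node ids suffice.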
+ size_t max_nodes = (size_t(std::ceil(double(sz) * 1.23)) + 2) / 3 * 3;
+ if (max_nodes >= uint64_t(1) << 32) {
+ emphf::hypergraph_sorter_seq<emphf::hypergraph<uint64_t> > sorter;
+ typename KMerIndex<kmer_index_traits>::KMerDataIndex(sorter,
+ sz, emphf::range(counter.bucket_begin(iFile), counter.bucket_end(iFile)),
+ adaptor).swap(data_index);
+ } else {
+ emphf::hypergraph_sorter_seq<emphf::hypergraph<uint32_t> > sorter;
+ typename KMerIndex<kmer_index_traits>::KMerDataIndex(sorter,
+ sz, emphf::range(counter.bucket_begin(iFile), counter.bucket_end(iFile)),
+ adaptor).swap(data_index);
+ }
+
+ counter.ReleaseBucket(iFile);
+ }
+
+ // Finally, record the sizes of buckets.
+ for (unsigned iFile = 1; iFile < num_buckets_; ++iFile)
+ index.bucket_starts_[iFile] += index.bucket_starts_[iFile - 1];
+
+ if (save_final)
+ counter.MergeBuckets(num_buckets_);
+
+ double bits_per_kmer = 8.0 * (double)index.mem_size() / (double)kmers;
+ INFO("Index built. Total " << index.mem_size() << " bytes occupied (" << bits_per_kmer << " bits per kmer).");
+ index.count_size();
+ return kmers;
+}
diff --git a/src/modules/data_structures/mph_index/mphf.hpp b/src/modules/data_structures/mph_index/mphf.hpp
new file mode 100644
index 0000000..a00c6fd
--- /dev/null
+++ b/src/modules/data_structures/mph_index/mphf.hpp
@@ -0,0 +1,136 @@
+#pragma once
+
+#include <random>
+
+#include "bitpair_vector.hpp"
+#include "ranked_bitpair_vector.hpp"
+
+#include "dev_support/logger/logger.hpp"
+
+namespace emphf {
+
+ template <typename BaseHasher>
+ class mphf {
+ public:
+ mphf()
+ {}
+
+ template <typename HypergraphSorter, typename Range, typename Adaptor>
+ mphf(HypergraphSorter& sorter, size_t n,
+ Range const& input_range, Adaptor adaptor,
+ double gamma = 1.23)
+ : m_n(n)
+ , m_hash_domain(std::max((size_t(std::ceil(double(m_n) * gamma)) + 2) / 3, size_t(2)))
+ {
+ typedef typename HypergraphSorter::node_t node_t;
+ typedef typename HypergraphSorter::hyperedge hyperedge;
+ typedef decltype(*std::begin(input_range)) value_type;
+
+ size_t nodes_domain = m_hash_domain * 3;
+
+ if (nodes_domain >= std::numeric_limits<node_t>::max()) {
+ throw std::invalid_argument("Too many nodes for node_t");
+ }
+
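+ // Each key is hashed to a hyperedge with one vertex in each third of the node domain.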
+ auto edge_gen = [&](value_type s) {
+ using std::get;
+ auto hashes = m_hasher(adaptor(s));
+ return hyperedge((node_t)(get<0>(hashes) % m_hash_domain),
+ (node_t)(m_hash_domain +
+ (get<1>(hashes) % m_hash_domain)),
+ (node_t)(2 * m_hash_domain +
+ (get<2>(hashes) % m_hash_domain)));
+ };
+
+ std::mt19937_64 rng(37); // deterministic seed
+
+ for (size_t trial = 0; ; ++trial) {
+ //logger() << "Hypergraph generation: trial " << trial << std::endl;
+
+ m_hasher = BaseHasher::generate(rng);
+ if (sorter.try_generate_and_sort(input_range, edge_gen,
+ m_n, m_hash_domain)) break;
+ }
+
+ auto peeling_order = sorter.get_peeling_order();
+ bitpair_vector bv(nodes_domain);
+
+ //logger() << "Assigning values" << std::endl;
+
+ for (auto edge = peeling_order.first;
+ edge != peeling_order.second;
+ ++edge) {
+
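+ // Choose the value at v0 so that the three endpoint values sum to the edge orientation modulo 3.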
+ uint64_t target = orientation(*edge);
+ uint64_t assigned = bv[edge->v1] + bv[edge->v2];
+
+ // "assigned values" must be nonzeros to be ranked, so
+ // if the result is 0 we assign 3
+ bv.set(edge->v0, ((target - assigned + 9) % 3) ?: 3);
+ }
+
+ m_bv.build(std::move(bv));
+ }
+
+ uint64_t size() const
+ {
+ return m_n;
+ }
+
+ size_t mem_size() const {
+ return m_bv.mem_size();
+ }
+
+ BaseHasher const& base_hasher() const
+ {
+ return m_hasher;
+ }
+
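+ // Lookup: the sum of the three 2-bit values modulo 3 selects the node assigned to this key; its rank among nonzero entries is the hash value.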
+ template <typename T, typename Adaptor>
+ uint64_t lookup(T val, Adaptor adaptor)
+ {
+ using std::get;
+ auto hashes = m_hasher(adaptor(val));
+ uint64_t nodes[3] = {get<0>(hashes) % m_hash_domain,
+ m_hash_domain + (get<1>(hashes) % m_hash_domain),
+ 2 * m_hash_domain + (get<2>(hashes) % m_hash_domain)};
+
+ uint64_t hidx = (m_bv[nodes[0]] + m_bv[nodes[1]] + m_bv[nodes[2]]) % 3;
+ return m_bv.rank(nodes[hidx]);
+ }
+
+ void swap(mphf& other)
+ {
+ std::swap(m_n, other.m_n);
+ std::swap(m_hash_domain, other.m_hash_domain);
+ m_hasher.swap(other.m_hasher);
+ m_bv.swap(other.m_bv);
+ }
+
+ void save(std::ostream& os) const
+ {
+ os.write(reinterpret_cast<char const*>(&m_n), sizeof(m_n));
+ os.write(reinterpret_cast<char const*>(&m_hash_domain),
+ sizeof(m_hash_domain));
+ m_hasher.save(os);
+ m_bv.save(os);
+ }
+
+ void load(std::istream& is)
+ {
+ is.read(reinterpret_cast<char*>(&m_n), sizeof(m_n));
+ is.read(reinterpret_cast<char*>(&m_hash_domain),
+ sizeof(m_hash_domain));
+ m_hasher.load(is);
+ m_bv.load(is);
+ }
+
+
+ private:
+
+ uint64_t m_n;
+ uint64_t m_hash_domain;
+ BaseHasher m_hasher;
+ ranked_bitpair_vector m_bv;
+ };
+}
diff --git a/src/include/mph_index/ranked_bitpair_vector.hpp b/src/modules/data_structures/mph_index/ranked_bitpair_vector.hpp
similarity index 100%
rename from src/include/mph_index/ranked_bitpair_vector.hpp
rename to src/modules/data_structures/mph_index/ranked_bitpair_vector.hpp
diff --git a/src/modules/data_structures/sequence/CMakeLists.txt b/src/modules/data_structures/sequence/CMakeLists.txt
new file mode 100644
index 0000000..f465519
--- /dev/null
+++ b/src/modules/data_structures/sequence/CMakeLists.txt
@@ -0,0 +1,10 @@
+############################################################################
+# Copyright (c) 2015 Saint Petersburg State University
+# Copyright (c) 2011-2014 Saint Petersburg Academic University
+# All Rights Reserved
+# See file LICENSE for details.
+############################################################################
+
+project(sequence CXX)
+
+add_library(sequence STATIC genome_storage.cpp)
diff --git a/src/modules/data_structures/sequence/genome_storage.cpp b/src/modules/data_structures/sequence/genome_storage.cpp
new file mode 100644
index 0000000..f2f262e
--- /dev/null
+++ b/src/modules/data_structures/sequence/genome_storage.cpp
@@ -0,0 +1,45 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+//
+// Created by lab42 on 8/19/15.
+//
+
+#include "genome_storage.hpp"
+#include "data_structures/sequence/nucl.hpp"
+using namespace std;
+
+namespace debruijn_graph {
+//TODO exterminate this where possible
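+ // Concatenate the maximal runs of valid nucleotides, dropping any non-ACGT characters.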
+ Sequence GenomeStorage::GetSequence() const{
+ stringstream ss;
+ size_t l = 0, r = 0;
+ for(size_t i = 0; i < s_.size(); i++) {
+ if (! is_nucl(s_[i]) ) {
+ if (r > l) {
+ ss << s_.substr(l, r - l);
+ }
+ r = i + 1;
+ l = i + 1;
+ } else {
+ r++;
+ }
+ }
+ if (r > l) {
+ ss << s_.substr(l, r - l);
+ }
+ return Sequence(ss.str());
+ }
+ void GenomeStorage::SetSequence(const Sequence &s) {
+ s_ = s.str();
+ }
+ string GenomeStorage::str() const{
+ return s_;
+ }
+ size_t GenomeStorage::size() const {
+ return s_.size();
+ }
+}
\ No newline at end of file
diff --git a/src/modules/data_structures/sequence/genome_storage.hpp b/src/modules/data_structures/sequence/genome_storage.hpp
new file mode 100644
index 0000000..401576d
--- /dev/null
+++ b/src/modules/data_structures/sequence/genome_storage.hpp
@@ -0,0 +1,33 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+//
+// Created by lab42 on 8/19/15.
+//
+
+#ifndef GENOME_STORAGE_HPP_
+#define GENOME_STORAGE_HPP_
+
+#include <string>
+#include "data_structures/sequence/sequence.hpp"
+namespace debruijn_graph {
+ class GenomeStorage {
+ private:
+ std::string s_;
+ public:
+ GenomeStorage():s_(""){
+ }
+
+ GenomeStorage(const std::string &s): s_(s){
+ }
+
+ Sequence GetSequence() const;
+ void SetSequence(const Sequence &s);
+ std::string str() const;
+ size_t size() const;
+ };
+}
+#endif // GENOME_STORAGE_HPP_
diff --git a/src/modules/data_structures/sequence/nucl.hpp b/src/modules/data_structures/sequence/nucl.hpp
new file mode 100755
index 0000000..905d8c2
--- /dev/null
+++ b/src/modules/data_structures/sequence/nucl.hpp
@@ -0,0 +1,123 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+/**
+ * @file nucl.hpp
+ * @author vyahhi
+ * @version 1.0
+ *
+ * @section LICENSE
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of
+ * the License, or (at your option) any later version.
+ *
+ * @section DESCRIPTION
+ *
+ * Simple operations and checks for nucleotide-letters
+ *
+ */
+
+
+#ifndef NUCL_HPP_
+#define NUCL_HPP_
+
+#include "dev_support/verify.hpp"
+#include <iostream>
+
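+// Lookup tables indexed by ASCII code: dignucl_map maps 'A'/'C'/'G'/'T' to 0..3, isnucl_map flags the four bases, and nucl_complement_map maps each base to its complement ('N' -> 'N').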
+const char dignucl_map['T' + 1] = {
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3};
+
+const bool isnucl_map[256] = {
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1};
+
+const char nucl_map[4] = {'A', 'C', 'G', 'T'};
+
+const char nucl_complement_map['T' + 1] = {
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 'T', 0, 'G', 0, 0, 0, 'C', 0, 0, 0, 0, 0, 0, 'N', 0, 0, 0, 0, 0, 'A'};
+
+/**
+ * ACGT -> true
+ * @param char c
+ * @return true if c is 'A', 'C', 'G' or 'T'.
+ */
+inline bool is_nucl(char c) { // is ACGT
+ return isnucl_map[(unsigned)c];
+}
+
+/**
+ * 0123 -> true
+ * @param char c
+ * @return true if c is 0, 1, 2 or 3.
+ */
+inline bool is_dignucl(char c) { // is 0123
+ return (c < 4);
+}
+
+/**
+ * 0123 -> 3210
+ * @param char c
+ * @return c ^ 3
+ */
+inline char complement(char c) {
+ // VERIFY(is_dignucl(c));
+ return c ^ 3;
+}
+
+/**
+ * ACGT -> TGCA
+ * @param char c is 'A', 'C', 'G', 'T' or 'N'
+ * @return complement symbol, i.e. 'A' => 'T', 'C' => 'G', 'G' => 'C', 'T' => 'A', 'N' => 'N'
+ */
+
+struct nucl_complement_functor { // still unused
+ inline char operator() (char c) const {
+ char cc = nucl_complement_map[(unsigned)c];
+ return cc ? cc : 'N';
+ }
+};
+
+inline char nucl_complement(char c){
+ // TODO: deal with 'N' case
+ //VERIFY(is_nucl(c));
+ char cc = nucl_complement_map[(unsigned)c];
+ return cc ? cc : 'N';
+}
+
+/**
+ * 0123 -> ACGT
+ * @param char c is 0, 1, 2 or 3
+ * @return 0 => 'A', 1 => 'C', 2 => 'G', 3 => 'T'
+ */
+inline char nucl(char c) {
+ return nucl_map[(unsigned)c];
+}
+
+/**
+ * ACGT -> 0123
+ * @param char c is 'A', 'C', 'G' or 'T'
+ * @return A => 0, C => 1, G => 2, T => 3
+ */
+
+/*
+struct dignucl : public unary_function<int,bool> {
+ bool operator()(signed char c) const {
+ return dignucl_map[c];
+ }
+};*/
+
+inline char dignucl(char c) {
+ // VERIFY(is_nucl(c));
+ return dignucl_map[(unsigned)c];
+}
+
+
+#endif /* NUCL_HPP_ */
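The lookup tables above implement the letter/2-bit conversions used throughout the k-mer code. A minimal usage sketch, assuming nucl.hpp and its dev_support/verify.hpp dependency are on the include path:

    // Exercises the ACGT <-> 0123 mappings and complements defined in nucl.hpp.
    #include <cassert>
    #include <iostream>
    #include "data_structures/sequence/nucl.hpp"

    int main() {
        // Letters map to the 2-bit codes 0..3 and back.
        assert(dignucl('A') == 0 && dignucl('C') == 1 && dignucl('G') == 2 && dignucl('T') == 3);
        assert(nucl(2) == 'G');

        // complement() works on 2-bit codes via XOR with 3; nucl_complement() on letters.
        assert(complement(dignucl('A')) == dignucl('T'));
        assert(nucl_complement('C') == 'G');
        assert(nucl_complement('N') == 'N');   // non-ACGT letters fall back to 'N'

        std::cout << "nucl checks passed" << std::endl;
        return 0;
    }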
diff --git a/src/modules/data_structures/sequence/quality.hpp b/src/modules/data_structures/sequence/quality.hpp
new file mode 100755
index 0000000..7410c3e
--- /dev/null
+++ b/src/modules/data_structures/sequence/quality.hpp
@@ -0,0 +1,39 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+/*
+ * qual.hpp
+ *
+ * Created on: 03.03.2011
+ * Author: vyahhi
+ */
+
+#ifndef QUAL_HPP_
+#define QUAL_HPP_
+
+#include <string>
+//todo really strange class
+class Quality {
+public:
+
+ Quality(const std::string &s) : qual_(s) {
+ }
+
+ int operator[](size_t i) const {
+ return qual_[i];
+ }
+
+ std::string str() const { // copying (defensive)!
+ return qual_;
+ }
+
+private:
+ std::string qual_;
+ //friend class ireadstream;
+};
+
+#endif /* QUAL_HPP_ */
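Quality is only a thin wrapper around the raw quality string; any decoding stays with the caller. A small sketch (the Phred+33 offset is an assumption about typical FASTQ input, not something quality.hpp enforces):

    #include <cstddef>
    #include <iostream>
    #include "data_structures/sequence/quality.hpp"

    int main() {
        Quality q("IIIII#");                      // raw ASCII quality characters
        const std::size_t n = q.str().size();
        for (std::size_t i = 0; i < n; ++i) {
            int phred = q[i] - 33;                // Phred+33 offset, assumed
            std::cout << phred << " ";            // prints: 40 40 40 40 40 2
        }
        std::cout << std::endl;
        return 0;
    }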
diff --git a/src/modules/data_structures/sequence/rtseq.hpp b/src/modules/data_structures/sequence/rtseq.hpp
new file mode 100644
index 0000000..e67e855
--- /dev/null
+++ b/src/modules/data_structures/sequence/rtseq.hpp
@@ -0,0 +1,736 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+/*
+ * rtseq.hpp
+ *
+ * Created on: Jun 28, 2012
+ * Author: andrey
+ */
+
+#ifndef RTSEQ_HPP_
+#define RTSEQ_HPP_
+
+#include <string>
+#include "dev_support/verify.hpp"
+#include <array>
+#include <algorithm>
+#include "data_structures/sequence/nucl.hpp"
+#include "dev_support/log.hpp"
+#include "seq_common.hpp"
+#include "seq.hpp"
+#include "simple_seq.hpp"
+
+#include <cstring>
+#include <iostream>
+
+template<size_t max_size_, typename T = seq_element_type>
+class RuntimeSeq {
+public:
+ /**
+ * @variable Number of bits in type T (e.g. 8 for char)
+ * @example 8 (for char) or 16 (for a 16-bit T)
+ */
+ const static size_t TBits = sizeof(T) << 3;
+
+ /**
+ * @variable Number of nucleotides that can be stored in one type T (e.g. 4 for char)
+ * TNucl MUST be a power of two
+ * @example 4: 8/2 = 4 or 16/2 = 8
+ */
+ const static size_t TNucl = TBits >> 1;
+
+ /**
+ * @variable Number of bits in TNucl (e.g. 2 for char). Useful for shifts instead of divisions.
+ */
+ const static size_t TNuclBits = log_<TNucl, 2>::value;
+
+ const static size_t Iterations = log_<TBits, 2>::value;
+
+ static const std::array<T, Iterations> ConstructLeftMasks() {
+ std::array<T, Iterations> result;
+ for (size_t i = 0; i < Iterations; i++) {
+ size_t shift = 1 << i;
+ T mask = T(T(1) << shift) - T(1);
+ result[i] = T(mask << shift);
+ for (size_t j = 0; j < i; j++) {
+ result[j] += T(result[j] << shift);
+ }
+ }
+ return result;
+ }
+
+ static const std::array<T, Iterations> ConstructRightMasks() {
+ std::array<T, Iterations> result(ConstructLeftMasks());
+ for (size_t i = 0; i < Iterations; i++) {
+ result[i] = T(~result[i]);
+ }
+ return result;
+ }
+
+
+ RuntimeSeq<max_size_, T> FastRC() const {
+ const static std::array<T, Iterations> LeftMasks(ConstructLeftMasks());
+ const static std::array<T, Iterations> RightMasks(ConstructRightMasks());
+ const static size_t LogTSize = log_<sizeof(T), 2>::value + 3;
+
+ RuntimeSeq<max_size_, T> res(this->size());
+
+ const size_t bit_size = size_ << 1;
+ const size_t extra = bit_size & ((1 << LogTSize) - 1);
+ const size_t to_extra = TBits - extra;
+ const size_t filled = bit_size >> LogTSize;
+ size_t real_length = filled;
+ if (extra == 0) {
+ for (size_t i = 0, j = filled - 1; i < filled; i++, j--) {
+ res.data_[i] = data_[j];
+ }
+ } else {
+ for (size_t i = 0, j = filled; i < filled && j > 0; i++, j--) {
+ res.data_[i] = (data_[j] << to_extra) + (data_[j - 1] >> extra);
+ }
+ res.data_[filled] = (data_[0] << to_extra);
+ real_length++;
+ }
+
+ for (size_t i = 0; i < real_length; i++) {
+ res.data_[i] = res.data_[i] ^ T(-1);
+ for (size_t it = 1; it < Iterations; it++) {
+ size_t shift = 1 << it;
+ res.data_[i] = T((res.data_[i] & LeftMasks[it]) >> shift) ^ T((res.data_[i] & RightMasks[it]) << shift);
+ }
+ }
+
+ if (extra != 0) {
+ res.data_[real_length - 1] = (res.data_[real_length - 1] & ((T(1) << extra) - 1));
+ }
+ return res;
+ }
+
+ /**
+ * @variable Number of Ts required to store the whole sequence.
+ */
+ const static size_t DataSize = (max_size_ + TNucl - 1) >> TNuclBits;
+
+ /**
+ * @variable Number of meaningful bytes in which the seq is stored
+ */
+ const static size_t TotalBytes = sizeof(T) * DataSize;
+
+ typedef T DataType;
+
+ static size_t GetDataSize(size_t size) {
+ return (size + TNucl - 1) >> TNuclBits;
+ }
+
+private:
+ /* *
+ * @variable Just some prime number to count the hash function of the kmer
+ * */
+ const static size_t PrimeNum = 239;
+
+
+ // number of nucleotides in the last data_ bucket
+ static size_t NuclsRemain(size_t size) {
+ return size & (TNucl - 1);
+ }
+
+ // useful mask to fill the last element of the data_ array
+ static size_t MaskForLastBucket(size_t size) {
+ size_t nr = NuclsRemain(size);
+ return nr != 0 ? (((T) 1) << (nr << 1)) - 1 : -1ul;
+ }
+
+
+ /**
+ * @variable Inner representation of sequence: array of Ts with length = DataSize.
+ *
+ * @invariant Invariant: all nucleotides >= size_ are 'A's (useful for comparison)
+ */
+ std::array<T, DataSize> data_;
+
+ size_t size_;
+
+ /**
+ * Initialize data_ array of this object with C-string
+ *
+ * @param s C-string (ACGT chars only), strlen(s) = size_
+ */
+ void init(const char *s) {
+ T data = 0;
+ size_t cnt = 0;
+ size_t cur = 0;
+ for (size_t pos = 0; pos < size_; ++pos, ++s) { // unsafe!
+ // VERIFY(is_nucl(*s)); // for performance
+ data = data | ((T) dignucl(*s) << cnt);
+ cnt += 2;
+ if (cnt == TBits) {
+ this->data_[cur++] = data;
+ cnt = 0;
+ data = 0;
+ }
+ }
+ if (cnt != 0) {
+ this->data_[cur++] = data;
+ }
+
+ for (; cur < DataSize; ++cur)
+ this->data_[cur] = 0;
+
+ VERIFY(*s == 0); // C-string always ends on 0
+ }
+
+ /**
+ * Sets i-th symbol of Seq with 0123-char
+ */
+ inline void set(const size_t i, char c) {
+ data_[i >> TNuclBits] =
+ (data_[i >> TNuclBits] & ~((T) 3 << ((i & (TNucl - 1)) << 1))) | ((T) c << ((i & (TNucl - 1)) << 1));
+ }
+
+ // Template voodoo to calculate the length of the string regardless of whether it is std::string or const char*
+ template<class S>
+ size_t size(const S &t,
+ typename std::enable_if<std::is_class<S>::value, T>::type * = 0) {
+ return t.size();
+ }
+
+ template<class S>
+ size_t size(const S &t,
+ typename std::enable_if<std::is_same<S, const char *>::value, T>::type * = 0) {
+ return strlen(t);
+ }
+
+
+public:
+
+ const static size_t max_size = max_size_;
+
+ RuntimeSeq() : size_(0) {
+ std::fill(data_.begin(), data_.end(), 0);
+ }
+
+ /**
+ * Default constructor, fills Seq with A's
+ */
+
+ explicit RuntimeSeq(size_t k) : size_(k) {
+ VERIFY(k <= max_size_);
+ //VERIFY((T)(-1) >= (T)0);//be sure to use unsigned types
+ std::fill(data_.begin(), data_.end(), 0);
+ }
+
+ RuntimeSeq(size_t k, const char *s) : size_(k) {
+ VERIFY(k <= max_size_);
+ //VERIFY((T)(-1) >= (T)0);//be sure to use unsigned types
+ init(s);
+ }
+
+
+ explicit RuntimeSeq(size_t k, const T *data_array) : size_(k) {
+ VERIFY(k <= max_size_);
+ std::fill(data_.begin(), data_.end(), 0);
+
+ size_t data_size = GetDataSize(size_);
+ memcpy(data_.data(), data_array, data_size * sizeof(T));
+
+ if (NuclsRemain(size_)) {
+ data_[data_size - 1] = data_[data_size - 1] & MaskForLastBucket(size_);
+ }
+ }
+
+ explicit RuntimeSeq(size_t k, T *data_array) : size_(k) {
+ VERIFY(k <= max_size_);
+ std::fill(data_.begin(), data_.end(), 0);
+
+ size_t data_size = GetDataSize(size_);
+ memcpy(data_.data(), data_array, data_size * sizeof(T));
+
+ if (NuclsRemain(size_)) {
+ data_[data_size - 1] = data_[data_size - 1] & MaskForLastBucket(size_);
+ }
+ }
+
+ template<size_t size2_, typename T2 = T>
+ explicit RuntimeSeq(const Seq<size2_, T2> &seq, bool) : size_(size2_) {
+ VERIFY(size_ <= max_size_);
+ std::fill(data_.begin(), data_.end(), 0);
+ seq.copy_data(data_.data());
+ }
+
+ template<size_t size2_, typename T2 = T>
+ explicit RuntimeSeq(const SimpleSeq<size2_, T2> &seq, size_t k) : size_(k) {
+ VERIFY(size_ <= max_size_);
+ VERIFY(size2_ <= max_size_);
+ std::fill(data_.begin(), data_.end(), 0);
+ seq.copy_data(data_.data());
+ }
+
+
+ /**
+ * Ultimate constructor from ACGT0123-string.
+ *
+ * @param s Any object with operator[], which returns 0123 chars
+ * @param offset Offset when this sequence starts
+ * @param k Number of nucleotides we want to fetch from this string
+ * @warning assuming that s is a correct string, filled with ACGT _OR_ 0123
+ * no init method, filling right here
+ */
+ template<typename S>
+ explicit RuntimeSeq(size_t k, const S &s, size_t offset = 0) : size_(k) {
+ VERIFY(size_ <= max_size_);
+ //TRACE("New Constructor for seq " << s[0] << " is first symbol");
+ VERIFY(size_ == 0 || is_dignucl(s[0]) || is_nucl(s[0]));
+ VERIFY(offset + size_ <= this->size(s));
+
+ // which symbols does our string contain : 0123 or ACGT?
+ bool digit_str = size_ == 0 || is_dignucl(s[0]);
+
+ // data -- one temporary variable corresponding to the i-th array element
+ // and some counters
+ T data = 0;
+ size_t cnt = 0;
+ size_t cur = 0;
+
+ for (size_t i = 0; i < size_; ++i) {
+ //VERIFY(is_dignucl(s[i]) || is_nucl(s[i]));
+
+ // we fill everything with zeros (As) by default.
+ char c = (char) (digit_str ? s[offset + i] : dignucl(s[offset + i]));
+
+ data = data | (T(c) << cnt);
+ cnt += 2;
+
+ if (cnt == TBits) {
+ this->data_[cur++] = data;
+ cnt = 0;
+ data = 0;
+ }
+ }
+
+ if (cnt != 0) {
+ this->data_[cur++] = data;
+ }
+
+ for (; cur < DataSize; ++cur)
+ this->data_[cur] = 0;
+ }
+
+ /**
+ * Reads sequence from the file (in the same format as BinWrite writes it)
+ * and returns false if an error occurred, true otherwise.
+ */
+ bool BinRead(std::istream &file) {
+ file.read((char *) data_.data(), sizeof(T) * GetDataSize(size_));
+ return !file.fail();
+ }
+
+ /**
+ * Writes sequence to the file (in the same format as BinRead reads it)
+ * and returns false if an error occurred, true otherwise.
+ */
+ bool BinWrite(std::ostream &file) const {
+ file.write((const char *) data_.data(), sizeof(T) * GetDataSize(size_));
+ return !file.fail();
+ }
+
+ /**
+ * Reads sequence from the file (in the same format as BinWrite writes it)
+ * and returns false if an error occurred, true otherwise.
+ */
+ static bool BinRead(std::istream &file, RuntimeSeq<max_size_, T> *seq) {
+ return seq->BinRead(file);
+ }
+
+ /**
+ * Writes sequence to the file (in the same format as BinRead reads it)
+ * and returns false if an error occurred, true otherwise.
+ */
+ static bool BinWrite(std::ostream &file, const RuntimeSeq<max_size_, T> &seq) {
+ return seq.BinWrite(file);
+ }
+
+
+ /**
+ * Get i-th symbol of Seq.
+ *
+ * @param i Index of the symbol (0 <= i < size_)
+ * @return 0123-char on position i
+ */
+ char operator[](const size_t i) const {
+ VERIFY(i < size_);
+ return (data_[i >> TNuclBits] >> ((i & (TNucl - 1)) << 1)) & 3;
+ }
+
+ /**
+ * Reverse complement.
+ *
+ * @return Reverse complement Seq.
+ */
+ RuntimeSeq<max_size_, T> operator!() const {
+// RuntimeSeq<max_size_, T> res(*this);
+// for (size_t i = 0; i < (size_ >> 1); ++i) {
+// auto front = complement(res[i]);
+// auto end = complement(res[size_ - 1 - i]);
+// res.set(i, end);
+// res.set(size_ - 1 - i, front);
+// }
+// if ((size_ & 1) == 1) {
+// res.set(size_ >> 1, complement(res[size_ >> 1]));
+// }
+ return FastRC();
+// return res;
+ }
+
+ /**
+ * Is the kmer minimal among this and !this.
+ *
+ * @return True if kmer <= !kmer (i.e. the kmer is canonical), false otherwise.
+ */
+ bool IsMinimal() const {
+ for (size_t i = 0; (i << 1) + 1 <= size_; ++i) {
+ auto front = this->operator[](i);
+ auto end = complement(this->operator[](size_ - 1 - i));
+ if (front != end)
+ return front < end;
+ }
+ return true;
+ }
+
+ /**
+ * Shift left
+ *
+ * @param c New 0123 char which should be added to the right.
+ * @return Shifted (to the left) sequence with 'c' char on the right.
+ */
+ RuntimeSeq<max_size_, T> operator<<(char c) const {
+ if (is_nucl(c)) {
+ c = dignucl(c);
+ }
+
+ RuntimeSeq<max_size_, T> res(*this);
+ std::array<T, DataSize> &data = res.data_;
+
+ size_t data_size = GetDataSize(size_);
+
+ if (data_size != 0) { // unless empty sequence
+ T rm = data[data_size - 1] & 3;
+ T lastnuclshift_ = ((size_ + TNucl - 1) & (TNucl - 1)) << 1;
+ data[data_size - 1] = (data[data_size - 1] >> 2) | ((T) c << lastnuclshift_);
+
+ if (data_size >= 2) { // if we have at least 2 elements in data
+ for (int i = (int) data_size - 2; i >= 0; --i) {
+ T new_rm = data[i] & 3;
+ data[i] = (data[i] >> 2) |
+ (rm << (TBits - 2)); // we need & here because if we shift negative, it fills with ones :(
+ rm = new_rm;
+ }
+ }
+ }
+ return res;
+ }
+
+ void operator<<=(char c) {
+ if (is_nucl(c)) {
+ c = dignucl(c);
+ }
+
+ size_t data_size = GetDataSize(size_);
+
+ if (data_size == 0) {
+ return;
+ }
+
+ for (size_t i = 0; i < data_size - 1; ++i) {
+ data_[i] = (data_[i] >> 2) | (((T) data_[i + 1] & 3) << (TBits - 2));
+ }
+
+ T lastnuclshift_ = ((size_ + TNucl - 1) & (TNucl - 1)) << 1;
+ data_[data_size - 1] = (data_[data_size - 1] >> 2) | ((T) c << lastnuclshift_);
+ }
+
+//todo naming convention violation!
+ RuntimeSeq<max_size_, T> pushBack(char c) const {
+ //VERIFY(size_ + 1 <= max_size_);
+
+ if (is_nucl(c)) {
+ c = dignucl(c);
+ }
+ //VERIFY(is_dignucl(c));
+ RuntimeSeq<max_size_, T> s(size_ + 1);
+ copy(this->data_.begin(), this->data_.end(), s.data_.begin());
+
+ size_t data_size = GetDataSize(size_ + 1);
+
+ s.data_[data_size - 1] |= ((T) c << ((size_ & (TNucl - 1)) << 1));
+
+ return s; //was: Seq<size_ + 1, T>(str() + nucl(c));
+ }
+
+
+//todo naming convention violation!
+ void pushBackThis(char c) {
+ VERIFY(size_ + 1 <= max_size_);
+
+ if (is_nucl(c)) {
+ c = dignucl(c);
+ }
+
+ size_ += 1;
+ size_t data_size = GetDataSize(size_);
+
+ data_[data_size - 1] |= ((T) c << (((size_ - 1) & (TNucl - 1)) << 1));
+ }
+
+ // /**
+ // * @todo optimize!!!
+ // */
+ // RuntimeSeq<max_size_, T> pushFront(char c) const {
+ // VERIFY(size_ + 1 < max_size_);
+ // if (is_nucl(c)) {
+ // c = dignucl(c);
+ // }
+ // VERIFY(is_dignucl(c));
+ // return RuntimeSeq<max_size_, T> (size_ + 1, nucl(c) + str());
+ // }
+
+ //todo naming convention violation!
+ RuntimeSeq<max_size_, T> pushFront(char c) const {
+ VERIFY(size_ + 1 <= max_size_);
+ if (is_nucl(c)) {
+ c = dignucl(c);
+ }
+ VERIFY(is_dignucl(c));
+ RuntimeSeq<max_size_, T> res(size_ + 1);
+
+ size_t data_size = GetDataSize(size_ + 1);
+
+ T rm = c;
+ for (size_t i = 0; i < data_size; ++i) {
+ T new_rm = (data_[i] >> (TBits - 2)) & 3;
+ res.data_[i] = (data_[i] << 2) | rm;
+ rm = new_rm;
+ }
+
+ return res;
+ }
+
+//todo naming convention violation!
+ void pushFrontThis(char c) {
+ VERIFY(size_ + 1 <= max_size_);
+
+ if (is_nucl(c)) {
+ c = dignucl(c);
+ }
+
+ size_ += 1;
+ size_t data_size = GetDataSize(size_);
+
+ T rm = c;
+ for (size_t i = 0; i < data_size; ++i) {
+ T new_rm = (data_[i] >> (TBits - 2)) & 3;
+ data_[i] = (data_[i] << 2) | rm;
+ rm = new_rm;
+ }
+ }
+
+ /**
+ * Shift right
+ *
+ * @param c New 0123 char which should be added to the left.
+ * @return Shifted (to the right) sequence with 'c' char on the left.
+ */
+ RuntimeSeq<max_size_, T> operator>>(char c) const {
+ if (is_nucl(c)) {
+ c = dignucl(c);
+ }
+ VERIFY(is_dignucl(c));
+
+ RuntimeSeq<max_size_, T> res(*this);
+ size_t data_size = GetDataSize(size_);
+
+ T rm = c;
+ for (size_t i = 0; i < data_size; ++i) {
+ T new_rm = (res.data_[i] >> (TBits - 2)) & 3;
+ res.data_[i] = (res.data_[i] << 2) | rm;
+ rm = new_rm;
+ }
+
+ res.data_[data_size - 1] &= MaskForLastBucket(size_);
+
+ return res;
+ }
+
+ //todo remove code duplication!
+ void operator>>=(char c) {
+ if (is_nucl(c)) {
+ c = dignucl(c);
+ }
+ VERIFY(is_dignucl(c));
+
+ size_t data_size = GetDataSize(size_);
+
+ T rm = (T) c;
+ for (size_t i = 0; i < data_size; ++i) {
+ T new_rm = (data_[i] >> (TBits - 2)) & 3;
+ data_[i] = (data_[i] << 2) | rm;
+ rm = new_rm;
+ }
+
+ data_[data_size - 1] &= MaskForLastBucket(size_);
+ }
+
+ bool operator==(const RuntimeSeq<max_size_, T> &s) const {
+ VERIFY(size_ == s.size_);
+ // INFO(this->full_str());
+ // INFO(s.full_str());
+ return 0 == memcmp(data_.data(), s.data_.data(), sizeof(T) * DataSize);
+ }
+
+ /**
+ * @see operator ==()
+ */
+
+
+
+ bool operator!=(const RuntimeSeq<max_size_, T> &s) const {
+ return !operator==(s);
+ }
+
+ /**
+ * String representation of this Seq
+ *
+ * @return ACGT-string of length size_
+ * @see nucl()
+ */
+ std::string str() const {
+ std::string res(size_, '-');
+ for (size_t i = 0; i < size_; ++i) {
+ res[i] = nucl(operator[](i));
+ }
+ return res;
+ }
+
+ std::string err() const {
+ return "";
+ }
+
+
+ std::string full_str() const {
+ std::string res(max_size, '-');
+ for (size_t i = 0; i < max_size; ++i) {
+ res[i] = nucl(operator[](i));
+ }
+ return res;
+ }
+
+ size_t size() const {
+ return size_;
+ }
+
+ size_t data_size() const {
+ return GetDataSize(size_);
+ }
+
+ const T *data() const {
+ return data_.data();
+ }
+
+ template<size_t size2_, typename T2 = T>
+ Seq<size2_, T2> get_seq() const {
+ VERIFY(size2_ == size_);
+ return Seq<size2_, T2>((T2 *) data_.data());
+ }
+
+ template<size_t size2_, typename T2 = T>
+ SimpleSeq<size2_, T2> get_sseq() const {
+ VERIFY(size2_ <= max_size_);
+ return SimpleSeq<size2_, T2>((T2 *) data_.data());
+ }
+
+ void copy_data(void *dst) const {
+ memcpy(dst, (const void *) data_.data(), GetDataSize(size_) * sizeof(T));
+ }
+
+ char last() const {
+ return operator[](size_ - 1);
+ }
+
+ char first() const {
+ return operator[](0);
+ }
+
+ static size_t GetHash(const DataType *data, size_t sz, uint32_t seed = 0) {
+ return CityHash64WithSeed((const char *) data, sz * sizeof(DataType), 0x9E3779B9 ^ seed);
+ }
+
+ size_t GetHash(unsigned seed = 0) const {
+ return GetHash(data_.data(), GetDataSize(size_), seed);
+ }
+
+ struct hash {
+ size_t operator()(const RuntimeSeq<max_size_, T> &seq, uint32_t seed = 0) const {
+ return seq.GetHash(seed);
+ }
+
+ size_t operator()(const DataType *data, size_t sz, unsigned seed = 0) {
+ return GetHash(data, sz, seed);
+ }
+ };
+
+ struct less2 {
+ int operator()(const RuntimeSeq<max_size_, T> &l, const RuntimeSeq<max_size_, T> &r) const {
+ for (size_t i = 0; i < l.size(); ++i) {
+ if (l[i] != r[i]) {
+ return (l[i] < r[i]);
+ }
+ }
+ return l.size() < r.size();
+ }
+ };
+
+ /**
+ * Denotes some (weird) order on k-mers. Works fast.
+ */
+ struct less2_fast {
+ bool operator()(const RuntimeSeq<max_size_, T> &l, const RuntimeSeq<max_size_, T> &r) const {
+ return 0 > memcmp(l.data(), r.data(), sizeof(T) * l.data_size());
+ }
+ };
+
+};
+
+template<size_t max_size_, typename T = seq_element_type>
+bool operator<(const RuntimeSeq<max_size_, T> &l, const RuntimeSeq<max_size_, T> &r) {
+ for (size_t i = 0; i < l.size(); ++i) {
+ if (l[i] != r[i]) {
+ return (l[i] < r[i]);
+ }
+ }
+
+ return l.size() < r.size();
+}
+
+
+template<size_t max_size_, typename T>
+std::ostream &operator<<(std::ostream &os, RuntimeSeq<max_size_, T> seq) {
+ os << seq.str();
+ return os;
+}
+
+namespace std {
+template<size_t max_size, typename T>
+struct hash<RuntimeSeq<max_size, T>> {
+ size_t operator()(const RuntimeSeq<max_size, T> &seq) const {
+ return seq.GetHash();
+ }
+};
+
+};
+
+
+#endif /* RTSEQ_HPP_ */
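RuntimeSeq packs up to max_size_ nucleotides, two bits each, into an array of words while keeping the actual length as a runtime value. A usage sketch (assuming the bundled city hash and dev_support headers pulled in via seq.hpp are available; the capacity 64 is arbitrary):

    #include <iostream>
    #include <string>
    #include "data_structures/sequence/rtseq.hpp"

    int main() {
        typedef RuntimeSeq<64> Kmer;                 // capacity chosen for this sketch only
        std::string s = "ACGTACGTACG";
        Kmer k(11, s);                               // pack 11 nucleotides, 2 bits each

        std::cout << k.str() << std::endl;           // ACGTACGTACG
        std::cout << (!k).str() << std::endl;        // CGTACGTACGT (reverse complement via FastRC)
        std::cout << (k << 'A').str() << std::endl;  // CGTACGTACGA (slide the window right by one)

        // Canonical k-mer test: is k <= its reverse complement?
        std::cout << std::boolalpha << k.IsMinimal() << std::endl;   // true
        return 0;
    }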
diff --git a/src/modules/data_structures/sequence/runtime_k.hpp b/src/modules/data_structures/sequence/runtime_k.hpp
new file mode 100644
index 0000000..bbb28b7
--- /dev/null
+++ b/src/modules/data_structures/sequence/runtime_k.hpp
@@ -0,0 +1,47 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#ifndef RUNTIME_K_HPP_
+#define RUNTIME_K_HPP_
+
+#include "data_structures/sequence/sequence.hpp"
+#include "data_structures/sequence/seq.hpp"
+#include "data_structures/sequence/simple_seq.hpp"
+#include "data_structures/sequence/rtseq.hpp"
+
+#include "k_range.hpp"
+
+namespace runtime_k {
+
+constexpr size_t t_size(void) {
+ return sizeof(seq_element_type);
+}
+
+constexpr size_t get_t_elements_number(size_t value) {
+ return ((value - 1) / (t_size() << 2) + 1);
+}
+
+constexpr size_t get_k_by_ts(size_t value) {
+ return (value * (t_size() << 2));
+}
+
+constexpr size_t get_upper_bound(size_t value) {
+ return get_k_by_ts(get_t_elements_number(value));
+}
+
+const size_t UPPER_BOUND = get_upper_bound(MAX_K); //((MAX_K - 1) / (sizeof(seq_element_type) << 2) + 1) * (sizeof(seq_element_type) << 2);
+
+const size_t MAX_TS = get_t_elements_number(MAX_K);
+
+const size_t MIN_TS = get_t_elements_number(MIN_K);
+
+
+typedef RuntimeSeq<UPPER_BOUND> RtSeq;
+
+} /* namespace runtime_k */
+
+#endif /* RUNTIME_K_HPP_ */
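The constants above simply round MAX_K up to a whole number of storage words. The same arithmetic evaluated standalone, under the assumption that seq_element_type is an 8-byte word (MAX_K itself comes from k_range.hpp and is not fixed here):

    #include <cstddef>
    #include <iostream>

    // Re-derivation of runtime_k's rounding for an assumed 8-byte word type.
    constexpr std::size_t t_size = 8;                 // sizeof(seq_element_type), assumed
    constexpr std::size_t nucls_per_t = t_size * 4;   // 2 bits per nucleotide -> 4 per byte

    constexpr std::size_t t_elements(std::size_t k) { return (k - 1) / nucls_per_t + 1; }
    constexpr std::size_t upper_bound(std::size_t k) { return t_elements(k) * nucls_per_t; }

    int main() {
        // e.g. a MAX_K of 127 needs ceil(127/32) = 4 words, so RtSeq is padded to 128 nucleotides.
        std::cout << upper_bound(127) << " " << upper_bound(128) << std::endl;   // 128 128
        return 0;
    }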
diff --git a/src/modules/data_structures/sequence/seq.hpp b/src/modules/data_structures/sequence/seq.hpp
new file mode 100755
index 0000000..3753b74
--- /dev/null
+++ b/src/modules/data_structures/sequence/seq.hpp
@@ -0,0 +1,529 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+/**
+ * @file seq.hpp
+ * @author vyahhi
+ * @version 1.0
+ *
+ * @section LICENSE
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of
+ * the License, or (at your option) any later version.
+ *
+ * @section DESCRIPTION
+ *
+ * Immutable ACGT-sequence with compile-time size.
+ * It compresses the sequence into an array of Ts (default: seq_element_type).
+ */
+
+#ifndef SEQ_HPP_
+#define SEQ_HPP_
+
+#include <string>
+#include <array>
+#include <algorithm>
+#include <cstring>
+#include <iostream>
+
+#include <city/city.h>
+
+#include "dev_support/verify.hpp"
+#include "data_structures/sequence/nucl.hpp"
+#include "dev_support/log.hpp"
+#include "seq_common.hpp"
+
+
+/**
+ * @param size_ max number of nucleotides; @param T type for storage
+ */
+template<size_t size_, typename T = seq_element_type>
+class Seq {
+public:
+ /**
+ * @variable Number of bits in type T (e.g. 8 for char)
+ * @example 8 (for char) or 16 (for a 16-bit T)
+ */
+ const static size_t TBits = sizeof(T) << 3;
+
+ /**
+ * @variable Number of nucleotides that can be stored in one type T (e.g. 4 for char)
+ * TNucl MUST be a power of two
+ * @example 4: 8/2 = 4 or 16/2 = 8
+ */
+ const static size_t TNucl = TBits >> 1;
+
+ /**
+ * @variable Number of bits in TNucl (e.g. 2 for char). Useful for shifts instead of divisions.
+ */
+ const static size_t TNuclBits = log_<TNucl, 2>::value;
+
+ /**
+ * @variable Number of Ts required to store the whole sequence.
+ */
+ const static size_t DataSize = (size_ + TNucl - 1) >> TNuclBits;
+
+ typedef T DataType;
+
+ /**
+ * @variable Number of meaningful bytes in which the seq is stored
+ */
+ const static size_t TotalBytes = sizeof(T) * DataSize;
+
+ static size_t GetDataSize(size_t size) {
+ VERIFY(size == size_);
+ return (size_ + TNucl - 1) >> TNuclBits;
+ }
+
+private:
+ /* *
+ * @variable Just some prime number to count the hash function of the kmer
+ * */
+ const static size_t PrimeNum = 239;
+
+ // number of nucleotides in the last data_ bucket
+ const static size_t NuclsRemain = size_ & (TNucl - 1);
+
+ // useful mask to fill the last element of the data_ array
+ const static size_t MaskForLastBucket = (((T) 1) << (NuclsRemain << 1)) - 1;
+
+
+ /**
+ * @variable Inner representation of sequence: array of Ts with length = DataSize.
+ *
+ * @invariant Invariant: all nucleotides >= size_ are 'A's (useful for comparison)
+ */
+ std::array<T, DataSize> data_;
+
+ friend class Seq<size_ - 1, T>;
+
+ /**
+ * Initialize data_ array of this object with C-string
+ *
+ * @param s C-string (ACGT chars only), strlen(s) = size_
+ */
+ void init(const char *s) {
+ T data = 0;
+ size_t cnt = 0;
+ int cur = 0;
+ for (size_t pos = 0; pos != size_; ++pos, ++s) { // unsafe!
+ // VERIFY(is_nucl(*s)); // for performance
+ data = data | (T) ((T) dignucl(*s) << cnt);
+ cnt += 2;
+ if (cnt == TBits) {
+ this->data_[cur++] = data;
+ cnt = 0;
+ data = 0;
+ }
+ }
+ if (cnt != 0) {
+ this->data_[cur++] = data;
+ }
+ VERIFY(*s == 0); // C-string always ends on 0
+ }
+
+ // Template voodoo to calculate the length of the string regardless of whether it is std::string or const char*
+ template<class S>
+ size_t size(const S &t,
+ typename std::enable_if<std::is_class<S>::value, T>::type * = 0) {
+ return t.size();
+ }
+
+ template<class S>
+ size_t size(const S &t,
+ typename std::enable_if<std::is_same<S, const char *>::value, T>::type * = 0) {
+ return strlen(t);
+ }
+
+public:
+ /**
+ * Default constructor, fills Seq with A's
+ */
+ Seq() {
+ std::fill(data_.begin(), data_.end(), 0);
+ }
+
+ Seq(const char *s) {
+ init(s);
+ }
+
+ explicit Seq(T *data_array) {
+ memcpy(data_.data(), data_array, TotalBytes);
+ }
+
+ explicit Seq(unsigned, const T *data_array) {
+ memcpy(data_.data(), data_array, TotalBytes);
+ }
+
+
+ /**
+ * Ultimate constructor from ACGT0123-string.
+ *
+ * @param s Any object with operator[], which returns 0123 chars
+ * @param offset Offset when this sequence starts
+ * @param number_to_read Number of nucleotides we want to fetch from this string
+ * @param raw If true, skip the check that offset + number_to_read fits into the string length
+ * @warning assuming that s is a correct string, filled with ACGT _OR_ 0123
+ * no init method, filling right here
+ */
+ template<typename S>
+ explicit Seq(const S &s, size_t offset = 0, size_t number_to_read = size_,
+ bool raw = false) {
+ if (this->size(s) == 0) {
+ return;
+ }
+ VERIFY(offset < this->size(s));
+ VERIFY(is_dignucl(s[offset]) || is_nucl(s[offset]));
+ if (!raw)
+ VERIFY(offset + number_to_read <= this->size(s));
+
+ // which symbols does our string contain : 0123 or ACGT?
+ bool digit_str = is_dignucl(s[offset]);
+
+ // data -- one temporary variable corresponding to the i-th array element
+ // and some counters
+ T data = 0;
+ size_t cnt = 0;
+ size_t cur = 0;
+
+ for (size_t i = 0; i < number_to_read; ++i) {
+ //VERIFY(is_dignucl(s[i]) || is_nucl(s[i]));
+
+ // we fill everything with zeros (As) by default.
+ char c = digit_str ? s[offset + i] : (char) dignucl(s[offset + i]);
+
+ data = data | (T(c) << cnt);
+ cnt += 2;
+
+ if (cnt == TBits) {
+ this->data_[cur++] = data;
+ cnt = 0;
+ data = 0;
+ }
+ }
+
+ if (cnt != 0) {
+ this->data_[cur++] = data;
+ }
+
+ for (; cur != DataSize; ++cur)
+ this->data_[cur] = 0;
+ }
+
+
+ /**
+ * Get i-th symbol of Seq.
+ *
+ * @param i Index of the symbol (0 <= i < size_)
+ * @return 0123-char on position i
+ */
+ char operator[](const size_t i) const {
+ return (data_[i >> TNuclBits] >> ((i & (TNucl - 1)) << 1)) & 3;
+ }
+
+ /**
+ * Reverse complement.
+ *
+ * @return Reverse complement Seq.
+ */
+ Seq<size_, T> operator!() const {
+ Seq<size_, T> res(*this);
+ for (size_t i = 0; i < (size_ >> 1); ++i) {
+ T front = complement(res[i]);
+ T end = complement(res[size_ - 1 - i]);
+ res.set(i, (char) end);
+ res.set(size_ - 1 - i, (char) front);
+ }
+ if ((size_ & 1) == 1) {
+ res.set(size_ >> 1, complement(res[size_ >> 1]));
+ }
+ // can be made without complement calls, but with xor on all bytes afterwards.
+ return res;
+ }
+
+ /**
+ * Shift left
+ *
+ * @param c New 0123 char which should be added to the right.
+ * @return Shifted (to the left) sequence with 'c' char on the right.
+ */
+ Seq<size_, T> operator<<(char c) const {
+ if (is_nucl(c)) {
+ c = dignucl(c);
+ }
+ Seq<size_, T> res(*this);
+ std::array<T, DataSize> &data = res.data_;
+ if (DataSize != 0) { // unless empty sequence
+ T rm = data[DataSize - 1] & 3;
+ T lastnuclshift_ = ((size_ + TNucl - 1) & (TNucl - 1)) << 1;
+ data[DataSize - 1] = (data[DataSize - 1] >> 2) | ((T) c << lastnuclshift_);
+
+ if (DataSize >= 2) { // if we have at least 2 elements in data
+ int data_size = DataSize;
+ for (int i = data_size - 2; i >= 0; --i) {
+ T new_rm = data[i] & 3;
+ data[i] = (data[i] >> 2) |
+ (rm << (TBits - 2)); // we need & here because if we shift negative, it fills with ones :(
+ rm = new_rm;
+ }
+ }
+ }
+ return res;
+ }
+
+ Seq<size_ + 1, T> pushBack(char c) const {
+ if (is_nucl(c)) {
+ c = dignucl(c);
+ }
+ //VERIFY(is_dignucl(c));
+ Seq<size_ + 1, T> s;
+ copy(this->data_.begin(), this->data_.end(), s.data_.begin());
+ s.data_[s.DataSize - 1] = s.data_[s.DataSize - 1] | ((T) c << ((size_ & (TNucl - 1)) << 1));
+
+ return s; //was: Seq<size_ + 1, T>(str() + nucl(c));
+
+ }
+
+ // /**
+ // * @todo optimize!!!
+ // */
+ // Seq<size_ + 1, T> pushFront(char c) const {
+ // if (is_nucl(c)) {
+ // c = dignucl(c);
+ // }
+ // VERIFY(is_dignucl(c));
+ // return Seq<size_ + 1, T> (nucl(c) + str());
+ // }
+
+ Seq<size_ + 1, T> pushFront(char c) const {
+ if (is_nucl(c)) {
+ c = dignucl(c);
+ }
+ VERIFY(is_dignucl(c));
+ Seq<size_ + 1, T> res;
+
+ //if new kmer has more Ts
+ if (Seq<size_ + 1, T>::DataSize > DataSize) {
+ res.data_[DataSize] = (data_[DataSize - 1] >> (TBits - 2)) & 3;
+ }
+
+ T rm = c;
+ for (size_t i = 0; i < DataSize; ++i) {
+ T new_rm = (data_[i] >> (TBits - 2)) & 3;
+ res.data_[i] = (data_[i] << 2) | rm;
+ rm = new_rm;
+ }
+
+ return res;
+ }
+
+ /**
+ * Shift right
+ *
+ * @param c New 0123 char which should be added to the left.
+ * @return Shifted (to the right) sequence with 'c' char on the left.
+ */
+ Seq<size_, T> operator>>(char c) const {
+ if (is_nucl(c)) {
+ c = dignucl(c);
+ }
+ VERIFY(is_dignucl(c));
+ Seq<size_, T> res(*this);
+ T rm = c;
+ for (size_t i = 0; i < DataSize; ++i) {
+ T new_rm = (res.data_[i] >> (TBits - 2)) & 3;
+ res.data_[i] = (res.data_[i] << 2) | rm;
+ rm = new_rm;
+ }
+ if ((size_ & (TNucl - 1)) != 0) {
+ T lastnuclshift_ = (size_ & (TNucl - 1)) << 1;
+ res.data_[DataSize - 1] = res.data_[DataSize - 1] & (((T) 1
+ << lastnuclshift_) - 1);
+ }
+ return res;
+ }
+
+ /**
+ * Sets i-th symbol of Seq with 0123-char
+ */
+ inline void set(const size_t i, char c) {
+ data_[i >> TNuclBits] =
+ (data_[i >> TNuclBits] & ~((T) 3 << ((i & (TNucl - 1)) << 1))) | ((T) c << ((i & (TNucl - 1)) << 1));
+ }
+
+ bool operator==(const Seq<size_, T> &s) const {
+ for (size_t i = 0; i < DataSize; ++i)
+ if (data_[i] != s.data_[i])
+ return false;
+ return true;
+ }
+
+ /**
+ * @see operator ==()
+ */
+
+ bool operator!=(const Seq<size_, T> &s) const {
+ return !operator==(s);
+ }
+
+ /**
+ * String representation of this Seq
+ *
+ * @return ACGT-string of length size_
+ * @see nucl()
+ */
+ std::string str() const {
+ std::string res(size_, '-');
+ for (size_t i = 0; i != size_; ++i) {
+ res[i] = nucl(operator[](i));
+ }
+ return res;
+ }
+
+ static size_t size() {
+ return size_;
+ }
+
+
+ void copy_data(void *dst) const {
+ memcpy(dst, (const void *) data_.data(), TotalBytes);
+ }
+
+ /**
+ * Reads sequence from the file (in the same format as BinWrite writes it)
+ * and returns false if an error occurred, true otherwise.
+ */
+ static bool BinRead(std::istream &file, Seq<size_> *seq) {
+ file.read((char *) seq->data_.data(), sizeof(T) * DataSize);
+ return !file.fail();
+ }
+
+ /**
+ * Writes sequence to the file (in the same format as BinRead reads it)
+ * and returns false if an error occurred, true otherwise.
+ */
+ static bool BinWrite(std::ostream &file, const Seq<size_> &seq) {
+ file.write((const char *) seq.data_.data(), sizeof(T) * DataSize);
+ return !file.fail();
+ }
+
+ /**
+ * Reads sequence from the file (in the same format as BinWrite writes it)
+ * and returns false if an error occurred, true otherwise.
+ */
+ bool BinRead(std::istream &file) {
+ return BinRead(file, this);
+ }
+
+ /**
+ * Writes sequence to the file (in the same format as BinRead reads it)
+ * and returns false if an error occurred, true otherwise.
+ */
+ bool BinWrite(std::ostream &file) const {
+ return BinWrite(file, *this);
+ }
+
+ /**
+ * @see Seq
+ */
+ template<size_t size2_, typename T2 = T>
+ Seq<size2_, T2> start() const {
+ VERIFY(size2_ <= size_);
+ return Seq<size2_, T2>(*this);
+ }
+
+ template<size_t size2_/* = size_ - 1*/, typename T2 = T>
+ Seq<size2_, T2> end() const {
+ VERIFY(size2_ <= size_);
+ return Seq<size2_, T2>(*this, size_ - size2_);
+ }
+
+ const T *data() const {
+ return data_.data();
+ }
+
+ size_t data_size() const {
+ return DataSize;
+ }
+
+
+ char last() const {
+ return operator[](size_ - 1);
+ }
+
+ char first() const {
+ return operator[](0);
+ }
+
+ static size_t GetHash(const DataType *data, size_t sz = DataSize, uint32_t seed = 0) {
+ return CityHash64WithSeed((const char *) data, sz * sizeof(DataType), 0x9E3779B9 ^ seed);
+ }
+
+ size_t GetHash(uint32_t seed = 0) const {
+ return GetHash(data_.data(), DataSize, seed);
+ }
+
+ struct hash {
+ size_t operator()(const Seq<size_, T> &seq, uint32_t seed = 0) const {
+ return seq.GetHash(seed);
+ }
+
+ size_t operator()(const DataType *data, size_t sz = DataSize, uint32_t seed = 0) {
+ return GetHash(data, sz, seed);
+ }
+ };
+
+ struct equal_to {
+ bool operator()(const Seq<size_, T> &l, const Seq<size_, T> &r) const {
+ return r == l;
+ }
+ };
+
+ struct less2 {
+ bool operator()(const Seq<size_, T> &l, const Seq<size_, T> &r) const {
+ for (size_t i = 0; i < size_; ++i) {
+ if (l[i] != r[i]) {
+ return (l[i] < r[i]);
+ }
+ }
+ return false;
+ }
+ };
+
+ /**
+ * Denotes some (weird) order on k-mers. Works fast.
+ */
+ struct less2_fast {
+ bool operator()(const Seq<size_, T> &l, const Seq<size_, T> &r) const {
+ return 0 > memcmp(l.data_.data(), r.data_.data(), sizeof(T) * DataSize);
+ }
+ };
+};
+
+template<size_t size_, typename T>
+std::ostream &operator<<(std::ostream &os, Seq<size_, T> seq) {
+ os << seq.str();
+ return os;
+}
+
+//namespace std {
+//
+//template<size_t size_, typename T = seq_element_type>
+//struct hash<Seq<size_, T> {
+// typedef size_t result_type;
+// typedef Seq<size_, T> argument_type;
+//
+// result_type operator() (const argument_type& arg) {
+// return Seq<size_, T>::hash()(arg);
+// }
+//};
+//
+//}
+
+#endif /* SEQ_HPP_ */
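Seq is the compile-time-sized counterpart of RuntimeSeq. A short sketch, assuming the bundled <city/city.h> that seq.hpp includes is on the include path:

    #include <iostream>
    #include "data_structures/sequence/seq.hpp"

    int main() {
        Seq<5> k("ACGTA");                            // exactly 5 nucleotides, 2 bits each
        std::cout << k.str() << std::endl;            // ACGTA
        std::cout << (!k).str() << std::endl;         // TACGT (reverse complement)
        std::cout << (k << 'C').str() << std::endl;   // CGTAC (drop leftmost, append 'C')
        std::cout << (k >> 'G').str() << std::endl;   // GACGT (drop rightmost, prepend 'G')
        return 0;
    }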
diff --git a/src/include/sequence/seq_common.hpp b/src/modules/data_structures/sequence/seq_common.hpp
similarity index 100%
rename from src/include/sequence/seq_common.hpp
rename to src/modules/data_structures/sequence/seq_common.hpp
diff --git a/src/modules/data_structures/sequence/sequence.hpp b/src/modules/data_structures/sequence/sequence.hpp
new file mode 100755
index 0000000..cf9304f
--- /dev/null
+++ b/src/modules/data_structures/sequence/sequence.hpp
@@ -0,0 +1,542 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#ifndef SEQUENCE_HPP_
+#define SEQUENCE_HPP_
+
+#include <vector>
+#include <string>
+#include <memory>
+#include <cstring>
+
+#include "data_structures/sequence/seq.hpp"
+#include "data_structures/sequence/rtseq.hpp"
+
+class Sequence {
+ // Type to store Seq in Sequences
+ typedef seq_element_type ST;
+ // Number of bits in ST
+ const static size_t STBits = sizeof(ST) << 3;
+ // Number of nucleotides in ST
+ const static size_t STN = (STBits >> 1);
+ // Number of bits in STN (for faster div and mod)
+ const static size_t STNBits = log_<STN, 2>::value;
+
+ template<typename T>
+ struct array_deleter {
+ void operator()(const T *p) { delete[] p; }
+ };
+
+private:
+ size_t from_;
+ size_t size_;
+ bool rtl_; // Right to left + complementary (reverse-complement view)
+ std::shared_ptr<ST> data_;
+
+ static size_t DataSize(size_t size) {
+ return (size + STN - 1) >> STNBits;
+ }
+
+ template<typename S>
+ void InitFromNucls(const S &s, bool rc = false) {
+ size_t bytes_size = DataSize(size_);
+ ST *bytes = data_.get();
+
+ VERIFY(is_dignucl(s[0]) || is_nucl(s[0]));
+
+ // Which symbols does our string contain : 0123 or ACGT?
+ bool digit_str = is_dignucl(s[0]);
+
+ // data -- one temporary variable corresponding to the i-th array element
+ // and some counters
+ ST data = 0;
+ size_t cnt = 0;
+ size_t cur = 0;
+
+ if (rc) {
+ for (int i = (int) size_ - 1; i >= 0; --i) {
+ //VERIFY(is_dignucl(s[i]) || is_nucl(s[i]));
+ char c = complement(digit_str ? s[(unsigned) i] : dignucl(s[(unsigned) i]));
+
+ data = data | (ST(c) << cnt);
+ cnt += 2;
+
+ if (cnt == STBits) {
+ bytes[cur++] = data;
+ cnt = 0;
+ data = 0;
+ }
+ }
+ } else {
+ for (size_t i = 0; i < size_; ++i) {
+ //VERIFY(is_dignucl(s[i]) || is_nucl(s[i]));
+ char c = digit_str ? s[i] : dignucl(s[i]);
+
+ data = data | (ST(c) << cnt);
+ cnt += 2;
+
+ if (cnt == STBits) {
+ bytes[cur++] = data;
+ cnt = 0;
+ data = 0;
+ }
+ }
+ }
+
+ if (cnt != 0)
+ bytes[cur++] = data;
+
+ for (; cur < bytes_size; ++cur)
+ bytes[cur] = 0;
+ }
+
+
+public:
+ /**
+ * Sequence initialization (arbitrary size string)
+ *
+ * @param s ACGT or 0123-string
+ */
+ explicit Sequence(const char *s, bool rc = false) :
+ from_(0), size_(strlen(s)), rtl_(false), data_(new ST[DataSize(size_)], array_deleter<ST>()) {
+ InitFromNucls(s, rc);
+ }
+
+ explicit Sequence(char *s, bool rc = false) :
+ from_(0), size_(strlen(s)), rtl_(false), data_(new ST[DataSize(size_)], array_deleter<ST>()) {
+ InitFromNucls(s, rc);
+ }
+
+ template<typename S>
+ explicit Sequence(const S &s, bool rc = false) :
+ from_(0), size_(s.size()), rtl_(false), data_(new ST[DataSize(size_)], array_deleter<ST>()) {
+ InitFromNucls(s, rc);
+ }
+
+ Sequence() :
+ from_(0), size_(0), rtl_(false), data_(new ST[DataSize(size_)], array_deleter<ST>()) {
+ memset(data_.get(), 0, DataSize(size_));
+ }
+
+ template<size_t size2_>
+ explicit Sequence(const Seq<size2_> &kmer, size_t) :
+ from_(0), size_(kmer.size()), rtl_(false), data_(new ST[DataSize(size_)], array_deleter<ST>()) {
+
+ kmer.copy_data(data_.get());
+ }
+
+ template<size_t size2_>
+ explicit Sequence(const RuntimeSeq<size2_> &kmer, size_t) :
+ from_(0), size_(kmer.size()), rtl_(false), data_(new ST[DataSize(size_)], array_deleter<ST>()) {
+
+ kmer.copy_data(data_.get());
+ }
+
+ Sequence(const Sequence &seq, size_t from, size_t size, bool rtl) :
+ from_(from), size_(size), rtl_(rtl), data_(seq.data_) {
+ }
+
+ Sequence(const Sequence &s) :
+ from_(s.from_), size_(s.size_), rtl_(s.rtl_), data_(s.data_) {
+ }
+
+ ~Sequence() { }
+
+ const Sequence &operator=(const Sequence &rhs) {
+ if (&rhs != this) {
+ from_ = rhs.from_;
+ size_ = rhs.size_;
+ rtl_ = rhs.rtl_;
+ data_ = rhs.data_;
+ }
+
+ return *this;
+ }
+
+ char operator[](const size_t index) const {
+ //todo can be put back once we switch to distributing release builds without asserts
+ //VERIFY(index < size_);
+ const ST *bytes = data_.get();
+ if (rtl_) {
+ size_t i = from_ + size_ - 1 - index;
+ return complement((bytes[i >> STNBits] >> ((i & (STN - 1)) << 1)) & 3);
+ } else {
+ size_t i = from_ + index;
+ return (bytes[i >> STNBits] >> ((i & (STN - 1)) << 1)) & 3;
+ }
+ }
+
+ bool operator==(const Sequence &that) const {
+ if (size_ != that.size_) {
+ return false;
+ }
+
+ if (data_ == that.data_ && from_ == that.from_ && rtl_ == that.rtl_) {
+ return true;
+ }
+
+ for (size_t i = 0; i < size_; ++i) {
+ if (this->operator[](i) != that[i]) {
+ return false;
+ }
+ }
+ return true;
+ }
+
+ bool operator!=(const Sequence &that) const {
+ return !(operator==(that));
+ }
+
+ /**
+ * @todo Might be optimized via int comparison (not so easy)
+ */
+ bool operator<(const Sequence &that) const {
+ size_t s = std::min(size_, that.size_);
+ for (size_t i = 0; i < s; ++i) {
+ if (this->operator[](i) != that[i]) {
+ return (this->operator[](i) < that[i]);
+ }
+ }
+ return (size_ < that.size_);
+ }
+
+ Sequence operator!() const {
+ return Sequence(*this, from_, size_, !rtl_);
+ }
+
+ inline Sequence operator<<(char c) const;
+
+ /**
+ * @param from inclusive
+ * @param to exclusive;
+ */
+ inline Sequence Subseq(size_t from, size_t to) const;
+
+ inline Sequence Subseq(size_t from) const; // up to size_ by default
+ inline Sequence First(size_t count) const;
+
+ inline Sequence Last(size_t count) const;
+
+ inline Sequence operator+(const Sequence &s) const;
+
+ /////todo what are these methods???
+ inline size_t find(const Sequence &t, size_t from = 0) const;
+
+ inline size_t similar(const Sequence &t, size_t k, char directed = 0) const;
+
+ inline size_t leftSimilar(const Sequence &t, size_t k) const;
+
+ inline size_t rightSimilar(const Sequence &t, size_t k) const;
+
+ /**
+ * @param t another sequence
+ * @return true if the two sequences share the same symbol
+ * at some common position
+ */
+ inline bool intersects(const Sequence &t) const;
+
+ template<size_t size2_>
+ Seq<size2_> start() const;
+
+ template<size_t size2_>
+ Seq<size2_> fast_start() const;
+
+ template<size_t size2_>
+ Seq<size2_> end() const;
+
+ template<class Seq>
+ Seq start(size_t k) const;
+
+ template<class Seq>
+ Seq end(size_t k) const;
+
+ inline std::string str() const;
+
+ inline std::string err() const;
+
+ size_t size() const {
+ return size_;
+ }
+
+private:
+ inline bool ReadHeader(std::istream &file);
+
+ inline bool WriteHeader(std::ostream &file) const;
+
+public:
+ inline bool BinRead(std::istream &file);
+
+ inline bool BinWrite(std::ostream &file) const;
+};
+
+inline std::ostream &operator<<(std::ostream &os, const Sequence &s);
+
+/**
+ * Prefix of the Sequence, returned as a Seq of the requested compile-time size
+ */
+template<size_t size2_>
+Seq<size2_> Sequence::start() const {
+ //VERIFY(size2_ <= size_);
+ return Seq<size2_>(*this);
+}
+
+template<size_t size2_>
+Seq<size2_> Sequence::fast_start() const {
+ ST result[(size2_ + STN - 1) >> STNBits] = {0};
+
+ size_t start = from_ >> STNBits;
+ size_t end = (from_ + size_ - 1) >> STNBits;
+ size_t shift = (from_ & (STN - 1)) << 1;
+ const ST *bytes = data_.get();
+
+ for (size_t i = start; i <= end; ++i) {
+ result[i - start] = bytes[i] >> shift;
+ }
+
+ if (shift != 0) {
+ shift = STBits - shift;
+
+ for (size_t i = start + 1; i <= end; ++i) {
+ result[i - start - 1] |= bytes[i] << shift;
+ }
+ }
+
+ return (rtl_ ? !Seq<size2_>(result) : Seq<size2_>(result));
+}
+
+template<size_t size2_>
+Seq<size2_> Sequence::end() const {
+ return Seq<size2_>(*this, size_ - size2_);
+}
+
+
+template<class Seq>
+Seq Sequence::start(size_t k) const {
+ return Seq(unsigned(k), *this);
+}
+
+template<class Seq>
+Seq Sequence::end(size_t k) const {
+ return Seq(unsigned(k), *this, size_ - k);
+}
+
+
+Sequence Sequence::First(size_t count) const {
+ return Subseq(0, count);
+}
+
+Sequence Sequence::Last(size_t count) const {
+ return Subseq(size_ - count);
+}
+
+bool Sequence::intersects(const Sequence &t) const {
+ for (size_t i = 0; i < std::min(size_, t.size_); ++i) {
+ if (this->operator[](i) == t[i]) {
+ return true;
+ }
+ }
+ return false;
+}
+
+// O(1)
+//including from, excluding to
+//safe if NDEBUG is not #defined
+Sequence Sequence::Subseq(size_t from, size_t to) const {
+ // cerr << endl<<"subseq:" << from <<" " << to << " " << this->str() << endl;
+ VERIFY(to >= from);
+ VERIFY(to <= size_);
+ //VERIFY(to - from <= size_);
+ if (rtl_) {
+ return Sequence(*this, from_ + size_ - to, to - from, true);
+ } else {
+ return Sequence(*this, from_ + from, to - from, false);
+ }
+}
+
+//including from, excluding to
+Sequence Sequence::Subseq(size_t from) const {
+ return Subseq(from, size_);
+}
+
+/**
+ * @todo : must be KMP or hashing instead of this
+ */
+size_t Sequence::find(const Sequence &t, size_t from) const {
+ for (size_t i = from; i <= size() - t.size(); i++) {
+ if (Subseq(i, i + t.size()) == t) {
+ return i;
+ }
+ }
+ return -1ULL;
+}
+
+/**
+ *
+ *@param k minimal overlap between the sequences
+ *@param directed 1: only check whether t extends this to the right; -1: only check whether t extends this to the left; 0: check both directions
+ *@return nonzero if the sequences overlap by at least k matching positions in an allowed direction, 0 otherwise
+ *
+ */
+size_t Sequence::similar(const Sequence &t, size_t k, char directed) const {
+ size_t result = 0;
+ if (directed != -1)
+ result |= rightSimilar(t, k);
+ if (directed != 1)
+ result |= leftSimilar(t, k);
+ return result;
+}
+
+size_t Sequence::leftSimilar(const Sequence &t, size_t k) const {
+ return t.rightSimilar(*this, k);
+}
+
+size_t Sequence::rightSimilar(const Sequence &t, size_t k) const {
+ size_t tsz = t.size();
+ size_t sz = size();
+ Sequence d(t.Subseq(0, k));
+ for (size_t res = find(d, 0); res != -1ULL; res = find(d, res + 1)) {
+ if (res + tsz < sz)
+ continue;
+ size_t i;
+ for (i = k; i + res < sz; i++) {
+ if (t[i] != this->operator[](i + res)) {
+ break;
+ };
+ }
+ if (i == sz - res)
+ return 1;
+ }
+ return 0;
+}
+
+/**
+ * @todo optimize
+ */
+Sequence Sequence::operator+(const Sequence &s) const {
+ return Sequence(str() + s.str());
+ // TODO might be opposite to correct
+ // int total = size_ + s.size_;
+ // std::vector<Seq<4> > bytes((total + 3) >> 2);
+ // for (size_t i = 0; i < size_; ++i) {
+ // bytes[i / 4] = (bytes[i / 4] << operator [](i)); // TODO :-) use <<=
+ // }
+ // for (size_t i = 0, j = size_; i < s.size_; ++i, ++j) {
+ // bytes[j / 4] = (bytes[j / 4]) << s[i];
+ // }
+ // return Sequence(new Data(bytes), 0, total, false);
+}
+
+std::string Sequence::str() const {
+ std::string res(size_, '-');
+ for (size_t i = 0; i < size_; ++i) {
+ res[i] = nucl(this->operator[](i));
+ }
+ return res;
+}
+
+std::string Sequence::err() const {
+ std::ostringstream oss;
+ oss << "{ *data=" << data_ <<
+ ", from_=" << from_ <<
+ ", size_=" << size_ <<
+ ", rtl_=" << int(rtl_) << " }";
+ return oss.str();
+}
+
+std::ostream &operator<<(std::ostream &os, const Sequence &s) {
+ os << s.str();
+ return os;
+}
+
+bool Sequence::ReadHeader(std::istream &file) {
+ file.read((char *) &size_, sizeof(size_));
+
+ from_ = 0;
+ rtl_ = false;
+
+ return !file.fail();
+}
+
+bool Sequence::WriteHeader(std::ostream &file) const {
+ VERIFY(from_ == 0);
+ VERIFY(!rtl_);
+
+ file.write((const char *) &size_, sizeof(size_));
+
+ return !file.fail();
+}
+
+
+bool Sequence::BinRead(std::istream &file) {
+ ReadHeader(file);
+
+ data_ = std::shared_ptr<ST>(new ST[DataSize(size_)], array_deleter<ST>());
+ file.read((char *) data_.get(), DataSize(size_) * sizeof(ST));
+
+ return !file.fail();
+}
+
+
+bool Sequence::BinWrite(std::ostream &file) const {
+ if (from_ != 0 || rtl_) {
+ Sequence clear(this->str());
+ return clear.BinWrite(file);
+ }
+
+ WriteHeader(file);
+
+ file.write((const char *) data_.get(), DataSize(size_) * sizeof(ST));
+
+ return !file.fail();
+}
+
+/**
+ * @class SequenceBuilder
+ * @section DESCRIPTION
+ *
+ * Helper class for building a Sequence incrementally; provides append(), size(), operator[], str() and BuildSequence().
+ */
+
+class SequenceBuilder {
+ std::vector<char> buf_;
+public:
+ template<typename S>
+ SequenceBuilder &append(const S &s) {
+ for (size_t i = 0; i < s.size(); ++i) {
+ buf_.push_back(s[i]);
+ }
+ return *this;
+ }
+
+ SequenceBuilder &append(char c) {
+ buf_.push_back(c);
+ return *this;
+ }
+
+ Sequence BuildSequence() {
+ return Sequence(buf_);
+ }
+
+ size_t size() const {
+ return buf_.size();
+ }
+
+ char operator[](const size_t index) const {
+ VERIFY(index < buf_.size());
+ return buf_[index];
+ }
+
+ std::string str() const {
+ std::string s(buf_.size(), '-');
+ for (size_t i = 0; i < s.size(); ++i) {
+ s[i] = nucl(buf_[i]);
+ }
+ return s;
+ }
+};
+
+#endif /* SEQUENCE_HPP_ */
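Sequence stores arbitrary-length 2-bit packed data behind a shared_ptr, so subsequences and reverse complements are O(1) views. A usage sketch:

    #include <iostream>
    #include "data_structures/sequence/sequence.hpp"

    int main() {
        Sequence s("ACGTTGCA");
        Sequence rc = !s;                                // reverse-complement view sharing s's data
        std::cout << rc.str() << std::endl;              // TGCAACGT
        std::cout << s.Subseq(2, 6).str() << std::endl;  // GTTG (from inclusive, to exclusive)
        std::cout << (s + rc).size() << std::endl;       // 16 (concatenation copies)

        SequenceBuilder sb;
        sb.append(s.Subseq(0, 4)).append(rc.Subseq(4));
        std::cout << sb.BuildSequence().str() << std::endl;   // ACGTACGT
        return 0;
    }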
diff --git a/src/modules/data_structures/sequence/sequence_tools.hpp b/src/modules/data_structures/sequence/sequence_tools.hpp
new file mode 100644
index 0000000..eea0e65
--- /dev/null
+++ b/src/modules/data_structures/sequence/sequence_tools.hpp
@@ -0,0 +1,159 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#ifndef SEQUENCE_TOOLS_HPP_
+#define SEQUENCE_TOOLS_HPP_
+
+#include <sstream>
+#include <string>
+#include <vector>
+
+#include "data_structures/sequence/nucl.hpp"
+#include "data_structures/sequence/sequence.hpp"
+#include "utils/levenshtein.hpp"
+
+inline const std::string Reverse(const std::string &s) {
+ return std::string(s.rbegin(), s.rend());
+}
+
+inline const std::string Complement(const std::string &s) {
+ std::string res(s.size(), 0);
+ transform(s.begin(), s.end(), res.begin(), nucl_complement);
+ return res;
+}
+
+inline const Sequence MergeOverlappingSequences(std::vector<Sequence>& ss,
+ size_t overlap, bool safe_merging = true) {
+ if (ss.empty()) {
+ return Sequence();
+ }
+ SequenceBuilder sb;
+ Sequence prev_end = ss.front().Subseq(0, overlap);
+ sb.append(prev_end);
+ for (auto it = ss.begin(); it != ss.end(); ++it) {
+ if(safe_merging)
+ VERIFY(prev_end == it->Subseq(0, overlap));
+ sb.append(it->Subseq(overlap));
+ prev_end = it->Subseq(it->size() - overlap);
+ }
+ return sb.BuildSequence();
+}
+
+inline size_t EditDistance(const Sequence& s1, const Sequence& s2) {
+ return edit_distance(s1.str(), s2.str());
+}
+
+inline bool Relax(int& val, int new_val) {
+ if (new_val > val) {
+ val = new_val;
+ return true;
+ }
+ return false;
+}
+
+inline std::pair<size_t, size_t> LocalSimilarity(const Sequence& s1, const Sequence& s2) {
+ size_t m = s1.size();
+ size_t n = s2.size();
+ std::vector<std::vector<int>> a(m + 1);
+ for (size_t i = 0; i <= m; ++i) {
+ a[i].resize(n + 1);
+ }
+ for (size_t i = 0; i <= m; ++i) {
+ for (size_t j = 0; j <= n; ++j) {
+ a[i][j] = 0;
+ }
+ }
+ for (size_t i = 1; i <= m; ++i) {
+ for (size_t j = 1; j <= n; ++j) {
+ Relax(a[i][j], a[i - 1][j] - 1);
+ Relax(a[i][j], a[i][j - 1] - 1);
+ if (s1[i - 1] == s2[j - 1]) {
+ Relax(a[i][j], a[i - 1][j - 1] + 1);
+ } else {
+ Relax(a[i][j], a[i - 1][j - 1] - 1);
+ }
+ }
+ }
+
+ //finding local alignment
+ int answer = 0;
+ size_t i_m = 0;
+ size_t j_m = 0;
+ for (size_t i = 0; i <= m; ++i) {
+ for (size_t j = 0; j <= n; ++j) {
+ if (Relax(answer, a[i][j])) {
+ i_m = i;
+ j_m = j;
+ }
+ }
+ }
+
+ //finding alignment lengths
+ size_t i = i_m;
+ size_t j = j_m;
+ while (a[i][j] > 0) {
+ if (a[i][j] == a[i][j - 1] - 1) {
+ j--;
+ } else if (a[i][j] == a[i-1][j] - 1) {
+ i--;
+ } else if (a[i][j] == a[i-1][j-1] + 1) {
+ VERIFY(s1[i-1] == s2[j-1]);
+ i--;
+ j--;
+ } else {
+ VERIFY(a[i-1][j-1] - 1 == a[i][j] && s1[i-1] != s2[j-1]);
+ i--;
+ j--;
+ }
+ }
+ return std::make_pair(size_t(answer), std::min(i_m - i, j_m - j));
+}
+
+inline const std::string ReverseComplement(const std::string &s) {
+ std::string res(s.size(), 0);
+ transform(s.begin(), s.end(), res.rbegin(), nucl_complement); // same as Complement(), but writing via rbegin() reverses the result
+ return res;
+}
+
+class UniformPositionAligner {
+private:
+ size_t upper_length_;
+ size_t lower_length_;
+public:
+ UniformPositionAligner(size_t upper_length, size_t lower_length) :
+ upper_length_(upper_length), lower_length_(lower_length) {
+ }
+
+ size_t GetPosition(size_t upper_position) {
+ if (upper_position * 2 + 1 >= upper_length_)
+ return (2 * upper_position + 1) * lower_length_
+ / (2 * upper_length_);
+ else
+ return lower_length_ - 1
+ - GetPosition(upper_length_ - 1 - upper_position);
+ }
+};
+
+class EnsureEndsPositionAligner {
+private:
+ size_t upper_length_;
+ size_t lower_length_;
+public:
+ EnsureEndsPositionAligner(size_t upper_length, size_t lower_length) :
+ upper_length_(upper_length), lower_length_(lower_length) {
+ }
+
+ size_t GetPosition(size_t upper_position) {
+ VERIFY(upper_position > 0);
+ if (lower_length_ == 1)
+ return 1;
+ return (2 * upper_position * lower_length_ + upper_length_)
+ / (2 * upper_length_);
+ }
+};
+
+#endif /* SEQUENCE_TOOLS_HPP_ */
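A sketch of the helpers above; the overlap length of 3 and the toy sequences are arbitrary choices for illustration:

    #include <iostream>
    #include <vector>
    #include "data_structures/sequence/sequence_tools.hpp"

    int main() {
        std::cout << ReverseComplement("ACGTT") << std::endl;                  // AACGT

        // Consecutive sequences overlapping by exactly 3 nucleotides get glued together.
        std::vector<Sequence> parts = { Sequence("ACGTAC"), Sequence("TACGGA") };
        std::cout << MergeOverlappingSequences(parts, 3).str() << std::endl;   // ACGTACGGA

        // Best local alignment: (score, aligned length).
        std::pair<size_t, size_t> sim = LocalSimilarity(Sequence("ACGT"), Sequence("TACGTT"));
        std::cout << sim.first << " " << sim.second << std::endl;              // 4 4
        return 0;
    }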
diff --git a/src/modules/data_structures/sequence/simple_seq.hpp b/src/modules/data_structures/sequence/simple_seq.hpp
new file mode 100644
index 0000000..8c5642f
--- /dev/null
+++ b/src/modules/data_structures/sequence/simple_seq.hpp
@@ -0,0 +1,154 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+/*
+ * simple_seq.hpp
+ *
+ * Created on: Jul 23, 2012
+ * Author: andrey
+ */
+
+#ifndef SIMPLE_SEQ_HPP_
+#define SIMPLE_SEQ_HPP_
+
+#include <string>
+#include <array>
+#include <algorithm>
+#include <cstring>
+#include <iostream>
+
+#include "dev_support/verify.hpp"
+#include "data_structures/sequence/nucl.hpp"
+#include "dev_support/log.hpp"
+#include "seq_common.hpp"
+/**
+ * @param size_ max number of nucleotides; @param T type for storage
+ */
+template<size_t size_, typename T = seq_element_type>
+class SimpleSeq {
+public:
+ /**
+ * @variable Number of bits in type T (e.g. 8 for char)
+ * @example 8 (for char) or 16 (for a 16-bit T)
+ */
+ const static size_t TBits = sizeof(T) << 3;
+
+ /**
+ * @variable Number of nucleotides that can be stored in one type T (e.g. 4 for char)
+ * TNucl MUST be a power of two
+ * @example 4: 8/2 = 4 or 16/2 = 8
+ */
+ const static size_t TNucl = TBits >> 1;
+
+ /**
+ * @variable Number of bits in TNucl (e.g. 2 for char). Useful for shifts instead of divisions.
+ */
+ const static size_t TNuclBits = log_<TNucl, 2>::value;
+
+ /**
+ * @variable Number of Ts required to store the whole sequence.
+ */
+ const static size_t DataSize = (size_ + TNucl - 1) >> TNuclBits;
+
+ typedef T DataType;
+
+ /**
+ * @variable Number of meaningful bytes in which the seq is stored
+ */
+ const static size_t TotalBytes = sizeof(T) * DataSize;
+
+private:
+ // number of nucleotides in the last data_ bucket
+ const static size_t NuclsRemain = size_ & (TNucl - 1);
+
+ // useful mask to fill the last element of the data_ array
+ const static size_t MaskForLastBucket = (((T) 1) << (NuclsRemain << 1) ) - 1;
+
+
+ /**
+ * @variable Inner representation of sequence: array of Ts with length = DataSize.
+ *
+ * @invariant Invariant: all nucleotides >= size_ are 'A's (useful for comparison)
+ */
+ std::array<T, DataSize> data_;
+
+
+public:
+
+ SimpleSeq() {
+ //VERIFY((T)(-1) >= (T)0);//be sure to use unsigned types
+ std::fill(data_.begin(), data_.end(), 0);
+ }
+
+ explicit SimpleSeq(T * data_array) {
+ memcpy(data_.data(), data_array, TotalBytes);
+ }
+
+
+ char operator[](const size_t i) const {
+ //VERIFY(i >= 0);
+ //VERIFY(i < size_);
+ return (data_[i >> TNuclBits] >> ((i & (TNucl - 1)) << 1)) & 3;
+ }
+
+ std::string str() const {
+ std::string res(size_, '-');
+ for (size_t i = 0; i < size_; ++i) {
+ res[i] = nucl(operator[](i));
+ }
+ return res;
+ }
+
+ void copy_data(void * dst) const {
+ memcpy(dst, (const void *) data_.data(), TotalBytes);
+ }
+
+ static size_t GetHash(const DataType *data, size_t sz, uint32_t seed = 0) {
+ return CityHash64WithSeed((const char*)data, sz * sizeof(DataType), 0x9E3779B9 ^ seed);
+ }
+
+ size_t GetHash(uint32_t seed = 0) const {
+ return GetHash(data_.data(), DataSize, seed);
+ }
+
+ struct hash {
+ size_t operator()(const SimpleSeq<size_, T>& seq, uint32_t seed = 0) const {
+ return seq.GetHash(seed);
+ }
+
+ size_t operator()(const DataType *data, size_t sz, unsigned seed = 0) {
+ return GetHash(data, sz, seed);
+ }
+ };
+
+ struct equal_to {
+ bool operator()(const SimpleSeq<size_, T>& l, const SimpleSeq<size_, T>& r) const {
+ return memcmp(l.data_.data(), r.data_.data(), sizeof(T) * DataSize) == 0;
+ }
+ };
+
+ struct less2 {
+ int operator()(const SimpleSeq<size_, T> &l, const SimpleSeq<size_, T> &r) const {
+ for (size_t i = 0; i < size_; ++i) {
+ if (l[i] != r[i]) {
+ return (l[i] < r[i]);
+ }
+ }
+ return false;
+ }
+ };
+
+};
+
+template<size_t size_, typename T>
+std::ostream& operator<<(std::ostream& os, SimpleSeq<size_, T> seq) {
+ os << seq.str();
+ return os;
+}
+
+
+#endif /* SIMPLE_SEQ_HPP_ */
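SimpleSeq holds only the packed words, without even a length field; it is meant to be filled from already-packed data. A sketch round-tripping through a Seq (again assuming the bundled city hash headers are available, since seq.hpp includes them):

    #include <iostream>
    #include "data_structures/sequence/seq.hpp"
    #include "data_structures/sequence/simple_seq.hpp"

    int main() {
        Seq<8> k("ACGTACGT");
        seq_element_type words[Seq<8>::DataSize];
        k.copy_data(words);                       // export the raw 2-bit packed words

        SimpleSeq<8> s(words);                    // reinterpret the same packed words
        std::cout << s.str() << std::endl;        // ACGTACGT
        return 0;
    }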
diff --git a/src/modules/dev_support/CMakeLists.txt b/src/modules/dev_support/CMakeLists.txt
new file mode 100644
index 0000000..d719227
--- /dev/null
+++ b/src/modules/dev_support/CMakeLists.txt
@@ -0,0 +1,13 @@
+############################################################################
+# Copyright (c) 2015 Saint Petersburg State University
+# Copyright (c) 2011-2014 Saint Petersburg Academic University
+# All Rights Reserved
+# See file LICENSE for details.
+############################################################################
+
+project(dev_support CXX)
+
+add_library(dev_support STATIC
+ copy_file.cpp
+ path_helper.cpp
+ logger/logger_impl.cpp)
diff --git a/src/modules/dev_support/autocompletion.cpp b/src/modules/dev_support/autocompletion.cpp
new file mode 100644
index 0000000..6b5060d
--- /dev/null
+++ b/src/modules/dev_support/autocompletion.cpp
@@ -0,0 +1,51 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#include <vector>
+#include <string>
+#include <queue>
+#include <cstring>
+#include <readline/readline.h>
+
+namespace online_visualization {
+
+std::vector<std::string> commands;
+
+char* CommandGenerator(const char* text, int state) {
+ static std::queue<std::string> list_possible_matches;
+
+ if (state != 0) {
+ if (!list_possible_matches.empty()) {
+ char* answer = strdup(list_possible_matches.front().c_str());
+ list_possible_matches.pop();
+ return answer;
+ } else
+ return NULL;
+ } else {
+ for (size_t i = 0; i < commands.size(); ++i) {
+ std::string name = commands[i];
+ if (!name.compare(0, strlen(text), text))
+ list_possible_matches.push(name);
+ }
+ return CommandGenerator(text, 1);
+ }
+ return NULL;
+}
+
+char** GafCompletion(const char* text, int start, int /*end*/) {
+ if (start == 0) {
+ return rl_completion_matches(text, CommandGenerator);
+ } else
+ return NULL;
+}
+
+void InitAutocompletion(const std::vector<std::string>& available_commands) {
+ commands = available_commands;
+ rl_attempted_completion_function = GafCompletion;
+}
+
+}
+
diff --git a/src/modules/dev_support/autocompletion.hpp b/src/modules/dev_support/autocompletion.hpp
new file mode 100644
index 0000000..f6f04d2
--- /dev/null
+++ b/src/modules/dev_support/autocompletion.hpp
@@ -0,0 +1,16 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+#include <vector>
+#include <string>
+
+namespace online_visualization {
+
+void InitAutocompletion(const std::vector<std::string>& commands);
+
+}
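
A minimal sketch of wiring this into a readline loop (hypothetical command names; assumes linking against GNU readline):

    #include <cstdio>
    #include <cstdlib>
    #include <readline/readline.h>
    #include "dev_support/autocompletion.hpp"

    int main() {
        online_visualization::InitAutocompletion({"load_genome", "print_contigs", "exit"});
        while (char* line = readline("> ")) {   // TAB now completes command names at the start of the line
            // dispatch the command in `line` ...
            free(line);
        }
    }
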
diff --git a/src/modules/dev_support/copy_file.cpp b/src/modules/dev_support/copy_file.cpp
new file mode 100644
index 0000000..f68d9d2
--- /dev/null
+++ b/src/modules/dev_support/copy_file.cpp
@@ -0,0 +1,158 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#include "copy_file.hpp"
+
+#include "dev_support/path_helper.hpp"
+#include "dev_support/logger/logger.hpp"
+
+#include <boost/algorithm/string.hpp>
+
+#include <iostream>
+#include <fstream>
+#include <string>
+#include <vector>
+#include <cstring>
+#include <cerrno>
+
+#include <unistd.h>
+#include <dirent.h>
+
+#include <sys/stat.h>
+#include <sys/types.h>
+
+namespace path {
+
+namespace details {
+
+using namespace path;
+
+void copy_file(std::string from_path, std::string to_path) {
+ using namespace std;
+
+ make_full_path(from_path);
+ make_full_path(to_path );
+
+ if (from_path == to_path)
+ return;
+
+ std::ifstream source(from_path, ios::binary);
+ std::ofstream dest (to_path.c_str() , ios::binary);
+
+ dest << source.rdbuf();
+}
+
+
+void hard_link(std::string from_path, std::string to_path) {
+ make_full_path(from_path);
+ make_full_path(to_path );
+
+ if (from_path == to_path)
+ return;
+
+ if (link(from_path.c_str(), to_path.c_str()) == -1) {
+ WARN("Failed to create link. Reason: " << strerror(errno) << ". Error code: " << errno << ". Copying instead");
+ copy_file(from_path, to_path);
+ }
+}
+
+files_t files_in_folder(std::string const& path) {
+ DIR *dp;
+ if ((dp = opendir(path.c_str())) == NULL)
+ throw std::runtime_error("can not open folder " + path);
+
+ files_t files;
+
+ struct dirent *dirp;
+ while ((dirp = readdir(dp)) != NULL)
+ if (dirp->d_type == DT_REG)
+ files.push_back(append_path(path, dirp->d_name));
+
+ closedir(dp);
+ return files;
+}
+
+files_t folders_in_folder(std::string const& path) {
+ DIR *dp;
+ if ((dp = opendir(path.c_str())) == NULL)
+ throw std::runtime_error("can not open folder " + path);
+
+ files_t folders;
+
+ struct dirent *dirp;
+ while ((dirp = readdir(dp)) != NULL)
+ if (dirp->d_type == DT_DIR) {
+ std::string folder = dirp->d_name;
+
+ if (folder != "." && folder != "..")
+ folders.push_back(append_path(path, folder));
+ }
+
+ closedir(dp);
+ return folders;
+}
+
+} // details
+
+path::files_t files_by_prefix(std::string const& path) {
+ using namespace details;
+ files_t files;
+
+ std::string folder(parent_path(path));
+ std::string prefix = filename(path);
+
+ files_t out_files;
+ const files_t all_files = files_in_folder(folder);
+
+ for (auto it = all_files.begin(); it != all_files.end(); ++it) // no std::copy_if before C++11
+ if (boost::starts_with(filename(*it), prefix))
+ out_files.push_back(*it);
+
+ return out_files;
+}
+
+void copy_files_by_prefix(path::files_t const& files, std::string const& to_folder) {
+ using namespace details;
+
+ for (auto it = files.begin(); it != files.end(); ++it) {
+ files_t files_to_copy = files_by_prefix(*it);
+
+ for (auto it = files_to_copy.begin(); it != files_to_copy.end(); ++it)
+ copy_file(*it, append_path(to_folder, filename(*it)));
+ }
+}
+
+void link_files_by_prefix(path::files_t const& files, std::string const& to_folder) {
+ using namespace details;
+
+ for (auto it = files.begin(); it != files.end(); ++it) {
+ files_t files_to_copy = files_by_prefix(*it);
+
+ for (auto it = files_to_copy.begin(); it != files_to_copy.end(); ++it)
+ hard_link(*it, append_path(to_folder, filename(*it)));
+ }
+}
+
+void copy_files_by_ext(std::string const& from_folder, std::string const& to_folder, std::string const& ext, bool recursive) {
+ using namespace details;
+
+ files_t files = files_in_folder(from_folder);
+
+ for (auto it = files.begin(); it != files.end(); ++it)
+ if (boost::ends_with(*it, ext))
+ copy_file(*it, append_path(to_folder, filename(*it)));
+
+ if (recursive) {
+ files_t folders = folders_in_folder(from_folder);
+
+ for (auto it = folders.begin(); it != folders.end(); ++it) {
+ std::string subdir = append_path(to_folder, filename(*it));
+ path:: make_dir(subdir);
+ copy_files_by_ext(*it, subdir, ext, recursive);
+ }
+ }
+}
+
+}
diff --git a/src/modules/dev_support/copy_file.hpp b/src/modules/dev_support/copy_file.hpp
new file mode 100644
index 0000000..f402772
--- /dev/null
+++ b/src/modules/dev_support/copy_file.hpp
@@ -0,0 +1,18 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#include "dev_support/path_helper.hpp"
+#include <string>
+
+namespace path {
+
+path::files_t files_by_prefix(std::string const& path);
+void copy_files_by_prefix(path::files_t const& files, std::string const& to_folder);
+void link_files_by_prefix(path::files_t const& files, std::string const& to_folder);
+void copy_files_by_ext(std::string const& from_folder, std::string const& to_folder, std::string const& ext, bool recursive);
+
+}
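
A short usage sketch (hypothetical folder names):

    #include "dev_support/copy_file.hpp"

    // Copy every *.fasta file from in/ to out/, descending into subfolders.
    path::copy_files_by_ext("/tmp/in", "/tmp/out", ".fasta", /*recursive*/ true);

    // Hard-link (falling back to copying) every file in /tmp/in whose name starts with "contigs".
    path::link_files_by_prefix({"/tmp/in/contigs"}, "/tmp/out");
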
diff --git a/src/modules/dev_support/cpp_utils.hpp b/src/modules/dev_support/cpp_utils.hpp
new file mode 100644
index 0000000..b59b7ab
--- /dev/null
+++ b/src/modules/dev_support/cpp_utils.hpp
@@ -0,0 +1,40 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+/*
+ * cpp_utils.hpp
+ *
+ * Created on: Nov 14, 2011
+ * Author: valery
+ */
+
+#pragma once
+
+namespace utils {
+
+// arrays
+template<class T, size_t N>
+size_t array_size(T (&/*arr*/)[N]) {
+ return N;
+}
+
+template<class T, size_t N>
+T *array_end(T (&arr)[N]) {
+ return &arr[N];
+}
+
+template<size_t EXPECTED_SIZE, class T, size_t N>
+void check_array_size(T (&/*arr*/)[N]) {
+ static_assert(EXPECTED_SIZE == N, "Unexpected array size");
+}
+
+template<class T>
+T identity_function(const T &t) {
+ return t;
+}
+
+} // namespace utils
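
These helpers replace sizeof tricks on C arrays; a small sketch:

    #include <algorithm>

    int primes[] = {2, 3, 5, 7};
    size_t n = utils::array_size(primes);          // 4
    std::sort(primes, utils::array_end(primes));   // array_end() is the one-past-the-end pointer
    utils::check_array_size<4>(primes);            // fails to compile if the size ever changes
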
diff --git a/src/modules/dev_support/file_limit.hpp b/src/modules/dev_support/file_limit.hpp
new file mode 100644
index 0000000..6990b6f
--- /dev/null
+++ b/src/modules/dev_support/file_limit.hpp
@@ -0,0 +1,33 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include <sys/types.h>
+#include <sys/time.h>
+#include <sys/resource.h>
+
+#include "dev_support/verify.hpp"
+
+inline rlim_t limit_file(size_t limit) {
+ struct rlimit rl;
+
+ int res = getrlimit(RLIMIT_NOFILE, &rl);
+ VERIFY_MSG(res == 0,
+ "getrlimit(2) call failed, errno = " << errno);
+
+ // We cannot go beyond hard limit and we might not have enough privileges to
+ // increase the hard limit
+ limit = std::max<size_t>(limit, rl.rlim_cur);
+ rl.rlim_cur = std::min<size_t>(limit, rl.rlim_max);
+ res = setrlimit(RLIMIT_NOFILE, &rl);
+ VERIFY_MSG(res == 0,
+ "setrlimit(2) call failed, errno = " << errno);
+ INFO("Open file limit set to " << rl.rlim_cur);
+
+ return rl.rlim_cur;
+}
diff --git a/src/modules/dev_support/func.hpp b/src/modules/dev_support/func.hpp
new file mode 100644
index 0000000..5a8343c
--- /dev/null
+++ b/src/modules/dev_support/func.hpp
@@ -0,0 +1,69 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include <functional>
+
+namespace func {
+
+//to use with std::function-s
+template<class T>
+void Compose(T t, std::function<void(T)> f1,
+ std::function<void(T)> f2) {
+ if (f1)
+ f1(t);
+ if (f2)
+ f2(t);
+}
+
+template<class T>
+std::function<void(T)> Composition(std::function<void(T)> f1,
+ std::function<void(T)> f2) {
+ return std::bind(func::Compose<T>, std::placeholders::_1, f1, f2);
+}
+
+template<class A, class B>
+class Func {
+public:
+ typedef std::function<B(A)> function_t;
+
+ virtual B Apply(A a) const = 0;
+
+ virtual ~Func() {
+ }
+};
+
+template<class T>
+class AndOperator;
+
+template<class T>
+class OrOperator;
+
+template<class T>
+class NotOperator;
+
+template<class T>
+class Predicate: public Func<T, bool> {
+public:
+ typedef T checked_type;
+
+ bool Apply(T t) const {
+ return Check(t);
+ }
+
+ virtual bool Check(T t) const = 0;
+
+ bool operator()(T t) const { return Check(t); }
+
+
+ virtual ~Predicate() {
+ }
+};
+
+
+}
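
Composition() chains two callbacks into a single std::function, skipping empty ones; a sketch:

    std::function<void(int)> log_it   = [](int x) { std::cout << x << std::endl; };
    std::function<void(int)> count_it;             // deliberately empty, will be skipped
    auto both = func::Composition<int>(log_it, count_it);
    both(42);                                      // runs log_it(42) only
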
diff --git a/src/modules/dev_support/log.hpp b/src/modules/dev_support/log.hpp
new file mode 100755
index 0000000..e5634fe
--- /dev/null
+++ b/src/modules/dev_support/log.hpp
@@ -0,0 +1,33 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+/*
+ * Compile time log(n,base) function for use in templates
+ *
+ * Created on: 02.03.2011
+ * Author: vyahhi
+ */
+
+#ifndef LOG_HPP_
+#define LOG_HPP_
+
+template <size_t N, size_t base = 2>
+struct log_ {
+ const static size_t value = 1 + log_<N/base, base>::value;
+};
+
+template <size_t base>
+struct log_<1, base> {
+ const static size_t value = 0;
+};
+
+template <size_t base>
+struct log_<0, base> {
+ const static size_t value = 0;
+};
+
+#endif /* LOG_HPP_ */
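
The recursion bottoms out at log_<1, base> (and log_<0, base>), so the value is computed entirely at compile time:

    static_assert(log_<8, 2>::value == 3,   "log2(8) == 3");
    static_assert(log_<256, 4>::value == 4, "log4(256) == 4");
    // SimpleSeq above uses exactly this to derive TNuclBits from TNucl.
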
diff --git a/src/modules/dev_support/logger/log_writers.hpp b/src/modules/dev_support/logger/log_writers.hpp
new file mode 100644
index 0000000..12330f3
--- /dev/null
+++ b/src/modules/dev_support/logger/log_writers.hpp
@@ -0,0 +1,43 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "dev_support/path_helper.hpp"
+#include "logger.hpp"
+
+#include <iostream>
+
+#include "config.hpp"
+
+namespace logging {
+
+struct console_writer : public writer {
+#ifdef SPADES_USE_JEMALLOC
+
+ void write_msg(double time, size_t cmem, size_t max_rss, level l, const char *file, size_t line_num,
+ const char *source, const char *msg) {
+ std::cout << fmt::format("{:14s} {:>5s} / {:<5s} {:6.6s} {:24.24s} ({:26.26s}:{:4d}) {:s}",
+ human_readable_time(time), human_readable_memory(cmem),
+ human_readable_memory(max_rss), logging::level_name(l),
+ source, path::filename(file), int(line_num), msg)
+ << std::endl;
+ }
+
+#else
+    void write_msg(double time, size_t max_rss, level l, const char *file, size_t line_num,
+                   const char *source, const char *msg) {
+        std::cout << fmt::format("{:14s} {:^5s} {:6.6s} {:24.24s} ({:26.26s}:{:4d}) {:s}",
+                                 human_readable_time(time), human_readable_memory(max_rss), logging::level_name(l),
+                                 source, path::filename(file), int(line_num), msg)
+                  << std::endl;
+    }
+#endif
+};
+
+} // logging
diff --git a/src/modules/dev_support/logger/logger.hpp b/src/modules/dev_support/logger/logger.hpp
new file mode 100644
index 0000000..e72329a
--- /dev/null
+++ b/src/modules/dev_support/logger/logger.hpp
@@ -0,0 +1,149 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+#include "dev_support/perfcounter.hpp"
+
+#include <vector>
+#include <unordered_map>
+#include <string>
+#include <sstream>
+#include <memory>
+
+#include "config.hpp"
+
+namespace logging
+{
+
+/////////////////////////////////////////////////////
+enum level
+{
+ L_TRACE,
+ L_DEBUG,
+ L_INFO,
+ L_WARN,
+ L_ERROR
+};
+
+inline std::string level_name(level l)
+{
+ static std::string names [] =
+ {
+ "TRACE",
+ "DEBUG",
+ "INFO" ,
+ "WARN" ,
+ "ERROR"
+ };
+
+ return names[l];
+}
+
+
+/////////////////////////////////////////////////////
+struct writer
+{
+#ifdef SPADES_USE_JEMALLOC
+ virtual void write_msg(double time_in_sec, size_t cmem, size_t max_rss, level l, const char* file, size_t line_num, const char* source, const char* msg) = 0;
+#else
+ virtual void write_msg(double time_in_sec, size_t max_rss, level l, const char* file, size_t line_num, const char* source, const char* msg) = 0;
+#endif
+ virtual ~writer(){}
+};
+
+typedef std::shared_ptr<writer> writer_ptr;
+
+/////////////////////////////////////////////////////
+struct properties
+{
+ /* Reads logger properties from a file.
+ *
+ * The file should contain lines like those below.
+ * Use a leading # for comments.
+ * An optional 'default' entry sets the default level; if no 'default' entry is found, the default is INFO.
+ * Valid levels: TRACE, DEBUG, INFO, WARN, ERROR
+ *
+ * default=INFO
+ * AbraCaDabra=TRACE
+ * #BubaZuba=WARN
+ * HariKrishna=INFO
+ *
+ */
+
+ properties(std::string filename = "", level default_level = L_INFO);
+ properties(level default_level = L_INFO);
+
+ std::unordered_map<std::string, level> levels;
+ level def_level;
+ bool all_default;
+};
+
+////////////////////////////////////////////////////
+struct logger
+{
+ logger(properties const& props);
+
+ //
+ bool need_log(level desired_level, const char* source) const;
+ void log(level desired_level, const char* file, size_t line_num, const char* source, const char* msg);
+
+ //
+ void add_writer(writer_ptr ptr);
+
+private:
+ properties props_ ;
+ std::vector<writer_ptr> writers_;
+ perf_counter timer_ ;
+};
+
+std::shared_ptr<logger>& __logger();
+logger* create_logger(std::string filename = "", level default_level = L_INFO);
+
+void attach_logger(logger *lg);
+void detach_logger();
+
+} // logging
+
+inline const char* __scope_source_name() {
+ return " General ";
+}
+
+#define DECL_LOGGER(source) \
+ static const char* __scope_source_name() { \
+ return source; \
+ }
+
+#define LOG_MSG(l, msg) \
+ do { \
+ std::shared_ptr<logging::logger> &__lg__ = logging::__logger(); \
+ if (__lg__.get() == NULL) \
+ break; \
+ \
+ if (__lg__->need_log((l), __scope_source_name())) { \
+ std::stringstream __logger__str__; \
+ __logger__str__ << msg; /* don't use brackets here! */ \
+ __lg__->log((l), __FILE__, __LINE__, __scope_source_name(), __logger__str__.str().c_str()); \
+ } \
+ } while(0);
+
+#ifdef SPADES_DEBUG_LOGGING
+# define DEBUG(message) LOG_MSG(logging::L_DEBUG, message)
+# define TRACE(message) LOG_MSG(logging::L_TRACE, message)
+#else
+# define DEBUG(message) /* No trace */
+# define TRACE(message) /* No trace */
+#endif
+#define INFO(message) LOG_MSG(logging::L_INFO , message)
+#define VERBOSE_T(n, T, message) {size_t n_copy = (n); if (n_copy % (T) == 0 && n_copy > 0) INFO(n_copy << message)}
+#define VERBOSE(n, message) VERBOSE_T((n), 10000, message)
+#define VERBOSE_POWER_T(n, T, message) {size_t n_copy = (n); if ((n_copy & (n_copy - 1)) == 0 && (n_copy > T)) INFO(n_copy << message)}
+#define VERBOSE_POWER(n, message) VERBOSE_POWER_T((n), 10000, message)
+#define VERBOSE_POWER_T2(n, T, message) {size_t n_copy = (n); if ((n_copy & (n_copy - 1)) == 0 && (n_copy > T)) INFO(message)}
+#define VERBOSE_POWER2(n, message) VERBOSE_POWER_T2((n), 10000, message)
+#define WARN(message) LOG_MSG(logging::L_WARN, message)
+#define ERROR(message) LOG_MSG(logging::L_ERROR, message)
+#define FATAL_ERROR(message) {ERROR(message); exit(-1);}
diff --git a/src/modules/dev_support/logger/logger_impl.cpp b/src/modules/dev_support/logger/logger_impl.cpp
new file mode 100644
index 0000000..c9d8570
--- /dev/null
+++ b/src/modules/dev_support/logger/logger_impl.cpp
@@ -0,0 +1,148 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#include <boost/algorithm/string.hpp>
+#include <cppformat/format.h>
+
+#include <string>
+#include <map>
+#include <fstream>
+#include <vector>
+
+#include "dev_support/logger/logger.hpp"
+
+#include "config.hpp"
+
+#ifdef SPADES_USE_JEMALLOC
+# include <jemalloc/jemalloc.h>
+#endif
+
+namespace logging {
+
+properties::properties(level default_level)
+ : def_level(default_level), all_default(true) {}
+
+properties::properties(std::string filename, level default_level)
+ : def_level(default_level), all_default(true) {
+ if (filename.empty())
+ return;
+
+ std::ifstream in(filename.c_str());
+
+ std::map<std::string, level> remap = {
+ {"TRACE", L_TRACE},
+ {"DEBUG", L_DEBUG},
+ {"INFO" , L_INFO },
+ {"WARN" , L_WARN },
+ {"ERROR", L_ERROR}
+ };
+
+ while (!in.eof()) {
+ using namespace boost;
+
+ char buf [0x400] = {};
+ in.getline(buf, sizeof buf);
+
+ std::string str(buf);
+ trim(str);
+
+ if (str.empty() || boost::starts_with(str, "#"))
+ continue;
+
+ std::vector<std::string> entry;
+ split(entry, str, is_any_of("="));
+
+ if(entry.size() != 2)
+ throw std::runtime_error("invalid log file property entry: " + str);
+
+ trim (entry[0]);
+ trim (entry[1]);
+ to_upper(entry[1]);
+
+ auto it = remap.find(entry[1]);
+ if(it == remap.end())
+ throw std::runtime_error("invalid log file level description: " + entry[1]);
+
+ levels[entry[0]] = it->second;
+ }
+
+ auto def = levels.find("default");
+ if (def != levels.end())
+ def_level = def->second;
+
+ for (auto I = levels.begin(), E = levels.end(); I != E; ++I) {
+ if (I->second != def_level) {
+ all_default = false;
+ break;
+ }
+ }
+}
+
+
+logger::logger(properties const& props)
+ : props_(props) { }
+
+bool logger::need_log(level desired_level, const char* source) const {
+ level source_level = props_.def_level;
+
+ if (!props_.all_default) {
+ auto it = props_.levels.find(source);
+ if (it != props_.levels.end())
+ source_level = it->second;
+ }
+
+ return desired_level >= source_level;
+}
+
+#ifdef SPADES_USE_JEMALLOC
+
+void logger::log(level desired_level, const char* file, size_t line_num, const char* source, const char* msg) {
+ double time = timer_.time();
+ const size_t *cmem = 0, *cmem_max = 0;
+ size_t clen = sizeof(cmem);
+
+ je_mallctl("stats.cactive", &cmem, &clen, NULL, 0);
+ je_mallctl("stats.cactive_max", &cmem_max, &clen, NULL, 0);
+
+ for (auto it = writers_.begin(); it != writers_.end(); ++it)
+ (*it)->write_msg(time, (*cmem) / 1024, (*cmem_max) / 1024, desired_level, file, line_num, source, msg);
+}
+#else
+void logger::log(level desired_level, const char* file, size_t line_num, const char* source, const char* msg) {
+ double time = timer_.time();
+ size_t max_rss = get_max_rss();
+
+ for (auto it = writers_.begin(); it != writers_.end(); ++it)
+ (*it)->write_msg(time, max_rss, desired_level, file, line_num, source, msg);
+}
+#endif
+
+//
+void logger::add_writer(writer_ptr ptr)
+{
+ writers_.push_back(ptr);
+}
+
+////////////////////////////////////////////////////
+std::shared_ptr<logger> &__logger() {
+ static std::shared_ptr<logger> l;
+ return l;
+}
+
+logger *create_logger(std::string filename, level default_level) {
+ return new logger(properties(filename, default_level));
+}
+
+void attach_logger(logger *lg) {
+ __logger().reset(lg);
+}
+
+void detach_logger() {
+ __logger().reset();
+}
+
+
+} // logging
diff --git a/src/modules/dev_support/md5.h b/src/modules/dev_support/md5.h
new file mode 100644
index 0000000..471dc5a
--- /dev/null
+++ b/src/modules/dev_support/md5.h
@@ -0,0 +1,393 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#ifndef MD5_H
+#define MD5_H
+
+//Taken from http://bobobobo.wordpress.com/2010/10/17/md5-c-implementation/
+
+// Copyright (C) 1991-2, RSA Data Security, Inc. Created 1991. All
+// rights reserved.
+
+// License to copy and use this software is granted provided that it
+// is identified as the "RSA Data Security, Inc. MD5 Message-Digest
+// Algorithm" in all material mentioning or referencing this software
+// or this function.
+//
+// License is also granted to make and use derivative works provided
+// that such works are identified as "derived from the RSA Data
+// Security, Inc. MD5 Message-Digest Algorithm" in all material
+// mentioning or referencing the derived work.
+//
+// RSA Data Security, Inc. makes no representations concerning either
+// the merchantability of this software or the suitability of this
+// software for any particular purpose. It is provided "as is"
+// without express or implied warranty of any kind.
+//
+// These notices must be retained in any copies of any part of this
+// documentation and/or software.
+
+
+
+// The original md5 implementation avoids external libraries.
+// This version has dependency on stdio.h for file input and
+// string.h for memcpy.
+#include <stdio.h>
+#include <string.h>
+
+#pragma region MD5 defines
+// Constants for MD5Transform routine.
+#define S11 7
+#define S12 12
+#define S13 17
+#define S14 22
+#define S21 5
+#define S22 9
+#define S23 14
+#define S24 20
+#define S31 4
+#define S32 11
+#define S33 16
+#define S34 23
+#define S41 6
+#define S42 10
+#define S43 15
+#define S44 21
+
+
+
+
+
+
+static unsigned char PADDING[64] = {
+ 0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+};
+
+// F, G, H and I are basic MD5 functions.
+#define F(x, y, z) (((x) & (y)) | ((~x) & (z)))
+#define G(x, y, z) (((x) & (z)) | ((y) & (~z)))
+#define H(x, y, z) ((x) ^ (y) ^ (z))
+#define I(x, y, z) ((y) ^ ((x) | (~z)))
+
+// ROTATE_LEFT rotates x left n bits.
+#define ROTATE_LEFT(x, n) (((x) << (n)) | ((x) >> (32-(n))))
+
+// FF, GG, HH, and II transformations for rounds 1, 2, 3, and 4.
+// Rotation is separate from addition to prevent recomputation.
+#define FF(a, b, c, d, x, s, ac) { \
+ (a) += F ((b), (c), (d)) + (x) + (UINT4)(ac); \
+ (a) = ROTATE_LEFT ((a), (s)); \
+ (a) += (b); \
+ }
+#define GG(a, b, c, d, x, s, ac) { \
+ (a) += G ((b), (c), (d)) + (x) + (UINT4)(ac); \
+ (a) = ROTATE_LEFT ((a), (s)); \
+ (a) += (b); \
+ }
+#define HH(a, b, c, d, x, s, ac) { \
+ (a) += H ((b), (c), (d)) + (x) + (UINT4)(ac); \
+ (a) = ROTATE_LEFT ((a), (s)); \
+ (a) += (b); \
+ }
+#define II(a, b, c, d, x, s, ac) { \
+ (a) += I ((b), (c), (d)) + (x) + (UINT4)(ac); \
+ (a) = ROTATE_LEFT ((a), (s)); \
+ (a) += (b); \
+ }
+#pragma endregion
+
+typedef unsigned char BYTE ;
+
+// POINTER defines a generic pointer type
+typedef unsigned char *POINTER;
+
+// UINT2 defines a two byte word
+typedef unsigned short int UINT2;
+
+// UINT4 defines a four byte word
+typedef unsigned long int UINT4;
+
+
+// convenient object that wraps
+// the C-functions for use in C++ only
+class MD5
+{
+private:
+ struct __context_t {
+ UINT4 state[4]; /* state (ABCD) */
+ UINT4 count[2]; /* number of bits, modulo 2^64 (lsb first) */
+ unsigned char buffer[64]; /* input buffer */
+ } context ;
+
+ #pragma region static helper functions
+ // The core of the MD5 algorithm is here.
+ // MD5 basic transformation. Transforms state based on block.
+ static void MD5Transform( UINT4 state[4], unsigned char block[64] )
+ {
+ UINT4 a = state[0], b = state[1], c = state[2], d = state[3], x[16];
+
+ Decode (x, block, 64);
+
+ /* Round 1 */
+ FF (a, b, c, d, x[ 0], S11, 0xd76aa478); /* 1 */
+ FF (d, a, b, c, x[ 1], S12, 0xe8c7b756); /* 2 */
+ FF (c, d, a, b, x[ 2], S13, 0x242070db); /* 3 */
+ FF (b, c, d, a, x[ 3], S14, 0xc1bdceee); /* 4 */
+ FF (a, b, c, d, x[ 4], S11, 0xf57c0faf); /* 5 */
+ FF (d, a, b, c, x[ 5], S12, 0x4787c62a); /* 6 */
+ FF (c, d, a, b, x[ 6], S13, 0xa8304613); /* 7 */
+ FF (b, c, d, a, x[ 7], S14, 0xfd469501); /* 8 */
+ FF (a, b, c, d, x[ 8], S11, 0x698098d8); /* 9 */
+ FF (d, a, b, c, x[ 9], S12, 0x8b44f7af); /* 10 */
+ FF (c, d, a, b, x[10], S13, 0xffff5bb1); /* 11 */
+ FF (b, c, d, a, x[11], S14, 0x895cd7be); /* 12 */
+ FF (a, b, c, d, x[12], S11, 0x6b901122); /* 13 */
+ FF (d, a, b, c, x[13], S12, 0xfd987193); /* 14 */
+ FF (c, d, a, b, x[14], S13, 0xa679438e); /* 15 */
+ FF (b, c, d, a, x[15], S14, 0x49b40821); /* 16 */
+
+ /* Round 2 */
+ GG (a, b, c, d, x[ 1], S21, 0xf61e2562); /* 17 */
+ GG (d, a, b, c, x[ 6], S22, 0xc040b340); /* 18 */
+ GG (c, d, a, b, x[11], S23, 0x265e5a51); /* 19 */
+ GG (b, c, d, a, x[ 0], S24, 0xe9b6c7aa); /* 20 */
+ GG (a, b, c, d, x[ 5], S21, 0xd62f105d); /* 21 */
+ GG (d, a, b, c, x[10], S22, 0x2441453); /* 22 */
+ GG (c, d, a, b, x[15], S23, 0xd8a1e681); /* 23 */
+ GG (b, c, d, a, x[ 4], S24, 0xe7d3fbc8); /* 24 */
+ GG (a, b, c, d, x[ 9], S21, 0x21e1cde6); /* 25 */
+ GG (d, a, b, c, x[14], S22, 0xc33707d6); /* 26 */
+ GG (c, d, a, b, x[ 3], S23, 0xf4d50d87); /* 27 */
+ GG (b, c, d, a, x[ 8], S24, 0x455a14ed); /* 28 */
+ GG (a, b, c, d, x[13], S21, 0xa9e3e905); /* 29 */
+ GG (d, a, b, c, x[ 2], S22, 0xfcefa3f8); /* 30 */
+ GG (c, d, a, b, x[ 7], S23, 0x676f02d9); /* 31 */
+ GG (b, c, d, a, x[12], S24, 0x8d2a4c8a); /* 32 */
+
+ /* Round 3 */
+ HH (a, b, c, d, x[ 5], S31, 0xfffa3942); /* 33 */
+ HH (d, a, b, c, x[ 8], S32, 0x8771f681); /* 34 */
+ HH (c, d, a, b, x[11], S33, 0x6d9d6122); /* 35 */
+ HH (b, c, d, a, x[14], S34, 0xfde5380c); /* 36 */
+ HH (a, b, c, d, x[ 1], S31, 0xa4beea44); /* 37 */
+ HH (d, a, b, c, x[ 4], S32, 0x4bdecfa9); /* 38 */
+ HH (c, d, a, b, x[ 7], S33, 0xf6bb4b60); /* 39 */
+ HH (b, c, d, a, x[10], S34, 0xbebfbc70); /* 40 */
+ HH (a, b, c, d, x[13], S31, 0x289b7ec6); /* 41 */
+ HH (d, a, b, c, x[ 0], S32, 0xeaa127fa); /* 42 */
+ HH (c, d, a, b, x[ 3], S33, 0xd4ef3085); /* 43 */
+ HH (b, c, d, a, x[ 6], S34, 0x4881d05); /* 44 */
+ HH (a, b, c, d, x[ 9], S31, 0xd9d4d039); /* 45 */
+ HH (d, a, b, c, x[12], S32, 0xe6db99e5); /* 46 */
+ HH (c, d, a, b, x[15], S33, 0x1fa27cf8); /* 47 */
+ HH (b, c, d, a, x[ 2], S34, 0xc4ac5665); /* 48 */
+
+ /* Round 4 */
+ II (a, b, c, d, x[ 0], S41, 0xf4292244); /* 49 */
+ II (d, a, b, c, x[ 7], S42, 0x432aff97); /* 50 */
+ II (c, d, a, b, x[14], S43, 0xab9423a7); /* 51 */
+ II (b, c, d, a, x[ 5], S44, 0xfc93a039); /* 52 */
+ II (a, b, c, d, x[12], S41, 0x655b59c3); /* 53 */
+ II (d, a, b, c, x[ 3], S42, 0x8f0ccc92); /* 54 */
+ II (c, d, a, b, x[10], S43, 0xffeff47d); /* 55 */
+ II (b, c, d, a, x[ 1], S44, 0x85845dd1); /* 56 */
+ II (a, b, c, d, x[ 8], S41, 0x6fa87e4f); /* 57 */
+ II (d, a, b, c, x[15], S42, 0xfe2ce6e0); /* 58 */
+ II (c, d, a, b, x[ 6], S43, 0xa3014314); /* 59 */
+ II (b, c, d, a, x[13], S44, 0x4e0811a1); /* 60 */
+ II (a, b, c, d, x[ 4], S41, 0xf7537e82); /* 61 */
+ II (d, a, b, c, x[11], S42, 0xbd3af235); /* 62 */
+ II (c, d, a, b, x[ 2], S43, 0x2ad7d2bb); /* 63 */
+ II (b, c, d, a, x[ 9], S44, 0xeb86d391); /* 64 */
+
+ state[0] += a;
+ state[1] += b;
+ state[2] += c;
+ state[3] += d;
+
+ // Zeroize sensitive information.
+ memset((POINTER)x, 0, sizeof (x));
+ }
+
+ // Encodes input (UINT4) into output (unsigned char). Assumes len is
+ // a multiple of 4.
+ static void Encode( unsigned char *output, UINT4 *input, unsigned int len )
+ {
+ unsigned int i, j;
+
+ for (i = 0, j = 0; j < len; i++, j += 4) {
+ output[j] = (unsigned char)(input[i] & 0xff);
+ output[j+1] = (unsigned char)((input[i] >> 8) & 0xff);
+ output[j+2] = (unsigned char)((input[i] >> 16) & 0xff);
+ output[j+3] = (unsigned char)((input[i] >> 24) & 0xff);
+ }
+ }
+
+ // Decodes input (unsigned char) into output (UINT4). Assumes len is
+ // a multiple of 4.
+ static void Decode( UINT4 *output, unsigned char *input, unsigned int len )
+ {
+ unsigned int i, j;
+
+ for (i = 0, j = 0; j < len; i++, j += 4)
+ output[i] = ((UINT4)input[j]) | (((UINT4)input[j+1]) << 8) |
+ (((UINT4)input[j+2]) << 16) | (((UINT4)input[j+3]) << 24);
+ }
+ #pragma endregion
+
+
+public:
+ // MAIN FUNCTIONS
+ MD5()
+ {
+ Init() ;
+ }
+
+ // MD5 initialization. Begins an MD5 operation, writing a new context.
+ void Init()
+ {
+ context.count[0] = context.count[1] = 0;
+
+ // Load magic initialization constants.
+ context.state[0] = 0x67452301;
+ context.state[1] = 0xefcdab89;
+ context.state[2] = 0x98badcfe;
+ context.state[3] = 0x10325476;
+ }
+
+ // MD5 block update operation. Continues an MD5 message-digest
+ // operation, processing another message block, and updating the
+ // context.
+ void Update(
+ unsigned char *input, // input block
+ unsigned int inputLen ) // length of input block
+ {
+ unsigned int i, index, partLen;
+
+ // Compute number of bytes mod 64
+ index = (unsigned int)((context.count[0] >> 3) & 0x3F);
+
+ // Update number of bits
+ if ((context.count[0] += ((UINT4)inputLen << 3))
+ < ((UINT4)inputLen << 3))
+ context.count[1]++;
+ context.count[1] += ((UINT4)inputLen >> 29);
+
+ partLen = 64 - index;
+
+ // Transform as many times as possible.
+ if (inputLen >= partLen) {
+ memcpy((POINTER)&context.buffer[index], (POINTER)input, partLen);
+ MD5Transform (context.state, context.buffer);
+
+ for (i = partLen; i + 63 < inputLen; i += 64)
+ MD5Transform (context.state, &input[i]);
+
+ index = 0;
+ }
+ else
+ i = 0;
+
+ /* Buffer remaining input */
+ memcpy((POINTER)&context.buffer[index], (POINTER)&input[i], inputLen-i);
+ }
+
+ // MD5 finalization. Ends an MD5 message-digest operation, writing
+ // the message digest and zeroizing the context.
+ // Writes to digestRaw
+ void Final()
+ {
+ unsigned char bits[8];
+ unsigned int index, padLen;
+
+ // Save number of bits
+ Encode( bits, context.count, 8 );
+
+ // Pad out to 56 mod 64.
+ index = (unsigned int)((context.count[0] >> 3) & 0x3f);
+ padLen = (index < 56) ? (56 - index) : (120 - index);
+ Update( PADDING, padLen );
+
+ // Append length (before padding)
+ Update( bits, 8 );
+
+ // Store state in digest
+ Encode( digestRaw, context.state, 16);
+
+ // Zeroize sensitive information.
+ memset((POINTER)&context, 0, sizeof (context));
+
+ writeToString() ;
+ }
+
+ /// Buffer must be 32+1 (nul) = 33 chars long at least
+ void writeToString()
+ {
+ int pos ;
+
+ for( pos = 0 ; pos < 16 ; pos++ )
+ sprintf( digestChars+(pos*2), "%02x", digestRaw[pos] ) ;
+ }
+
+
+public:
+ // an MD5 digest is a 16-byte number (32 hex digits)
+ BYTE digestRaw[ 16 ] ;
+
+ // This version of the digest is actually
+ // a "printf'd" version of the digest.
+ char digestChars[ 33 ] ;
+
+ /// Load a file from disk and digest it
+ // Digests a file and returns the result.
+ char* digestFile( char *filename )
+ {
+ Init() ;
+
+ FILE *file;
+
+ int len;
+ unsigned char buffer[1024] ;
+
+ if( (file = fopen (filename, "rb")) == NULL )
+ printf( "%s can't be opened\n", filename ) ;
+ else
+ {
+ while( (len = (int)fread( buffer, 1, 1024, file )) > 0 )
+ Update( buffer, len ) ;
+ Final();
+
+ fclose( file );
+ }
+
+ return digestChars ;
+ }
+
+ /// Digests a byte-array already in memory
+ char* digestMemory( BYTE *memchunk, int len )
+ {
+ Init() ;
+ Update( memchunk, len ) ;
+ Final() ;
+
+ return digestChars ;
+ }
+
+ // Digests a string and returns the result.
+ char* digestString( char *string )
+ {
+ Init() ;
+ Update( (unsigned char*)string, strlen(string) ) ;
+ Final() ;
+
+ return digestChars ;
+ }
+} ;
+
+#endif
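
A short usage sketch of the wrapper class (hypothetical input; the returned pointer refers to the object's internal 33-byte hex buffer):

    MD5 md5;
    char* hex = md5.digestString((char*)"hello world");   // 32 lowercase hex digits
    // md5.digestFile((char*)"/tmp/data.bin") and md5.digestMemory(...) work the same way.
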
diff --git a/src/include/memory.hpp b/src/modules/dev_support/memory.hpp
similarity index 100%
rename from src/include/memory.hpp
rename to src/modules/dev_support/memory.hpp
diff --git a/src/modules/dev_support/memory_limit.hpp b/src/modules/dev_support/memory_limit.hpp
new file mode 100644
index 0000000..5aee818
--- /dev/null
+++ b/src/modules/dev_support/memory_limit.hpp
@@ -0,0 +1,97 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#if __DARWIN || __DARWIN_UNIX03
+#include <mach/task.h>
+#include <mach/mach.h>
+#else
+
+#include <sys/resource.h>
+
+#endif
+
+#include <sys/time.h>
+#include <sys/resource.h>
+
+#include "config.hpp"
+
+#ifdef SPADES_USE_JEMALLOC
+
+# include <jemalloc/jemalloc.h>
+
+#endif
+
+inline void limit_memory(size_t limit) {
+ rlimit rl;
+ if (sizeof(rlim_t) < 8) {
+ INFO("Can't limit virtual memory because of 32-bit system");
+ return;
+ }
+
+ int res = getrlimit(RLIMIT_AS, &rl);
+ VERIFY_MSG(res == 0,
+ "getrlimit(2) call failed, errno = " << errno);
+
+ // We cannot go beyond hard limit and we might not have enough privileges to
+ // increase the hard limit
+ rl.rlim_cur = std::min<size_t>(limit, rl.rlim_max);
+ res = setrlimit(RLIMIT_AS, &rl);
+ VERIFY_MSG(res == 0,
+ "setrlimit(2) call failed, errno = " << errno);
+ INFO("Memory limit set to " << (1.0 * (double) rl.rlim_cur / 1024 / 1024 / 1024) << " Gb");
+}
+
+inline size_t get_memory_limit() {
+ rlimit rl;
+ int res = getrlimit(RLIMIT_AS, &rl);
+ VERIFY_MSG(res == 0,
+ "getrlimit(2) call failed, errno = " << errno);
+
+ return rl.rlim_cur;
+}
+
+#if __DARWIN || __DARWIN_UNIX03
+inline size_t get_max_rss() {
+ struct task_basic_info t_info;
+ mach_msg_type_number_t t_info_count = TASK_BASIC_INFO_COUNT;
+
+ if (KERN_SUCCESS !=
+ task_info(mach_task_self(),
+ TASK_BASIC_INFO, (task_info_t)&t_info, &t_info_count))
+ return -1U;
+
+ return t_info.resident_size / 1024;
+}
+#else
+
+inline size_t get_max_rss() {
+ rusage ru;
+ getrusage(RUSAGE_SELF, &ru);
+
+ return ru.ru_maxrss;
+}
+
+#endif
+
+inline size_t get_used_memory() {
+#ifdef SPADES_USE_JEMALLOC
+ const size_t *cmem = 0;
+ size_t clen = sizeof(cmem);
+
+ je_mallctl("stats.cactive", &cmem, &clen, NULL, 0);
+ return *cmem;
+#else
+    return get_max_rss();
+#endif
+}
+
+
+inline size_t get_free_memory() {
+ return get_memory_limit() - get_used_memory();
+}
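
A sketch of how these fit together (hypothetical 16 GB cap):

    limit_memory(16ull << 30);            // cap the address space at 16 GB (no-op with a warning on 32-bit)
    size_t cap  = get_memory_limit();     // what the kernel actually accepted
    size_t used = get_used_memory();      // jemalloc "stats.cactive" if available, max RSS otherwise
    size_t left = get_free_memory();      // cap - used
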
diff --git a/src/include/openmp_wrapper.h b/src/modules/dev_support/openmp_wrapper.h
similarity index 100%
rename from src/include/openmp_wrapper.h
rename to src/modules/dev_support/openmp_wrapper.h
diff --git a/src/include/parallel_wrapper.hpp b/src/modules/dev_support/parallel_wrapper.hpp
similarity index 100%
rename from src/include/parallel_wrapper.hpp
rename to src/modules/dev_support/parallel_wrapper.hpp
diff --git a/src/modules/dev_support/path_helper.cpp b/src/modules/dev_support/path_helper.cpp
new file mode 100644
index 0000000..534d459
--- /dev/null
+++ b/src/modules/dev_support/path_helper.cpp
@@ -0,0 +1,249 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#include "dev_support/path_helper.hpp"
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <dirent.h>
+#include <unistd.h>
+
+#include <boost/tokenizer.hpp>
+#include <boost/algorithm/string.hpp>
+
+#include <cstring>
+#include <string>
+#include <vector>
+
+namespace path {
+
+bool make_dir(std::string const& folder) {
+ return mkdir(folder.c_str(), 0755) == 0;
+}
+
+std::string make_temp_dir(std::string const& prefix,
+ std::string const& suffix) {
+ std::string name = append_path(prefix, suffix + "_XXXXXX");
+ char* actual;
+ if ((actual = ::mkdtemp(strcpy(new char[name.length() + 1], name.c_str())))
+ == NULL)
+ throw std::runtime_error("Cannot create temporary dir " + name);
+
+ std::string result(actual);
+ if (result == name)
+ throw std::runtime_error("Cannot create temporary dir " + name);
+
+ delete[] actual;
+
+ return result;
+}
+
+void remove_dir(std::string const& folder) {
+ DIR *dp;
+ if ((dp = opendir(folder.c_str())) == NULL)
+ throw std::runtime_error("can not open folder " + folder);
+
+ struct dirent *dirp;
+ while ((dirp = readdir(dp)) != NULL) {
+ std::string full_path = folder + "/" + dirp->d_name;
+
+ if (dirp->d_type == DT_DIR) {
+ if (std::string(".") != dirp->d_name
+ && std::string("..") != dirp->d_name) {
+ remove_dir(full_path);
+ }
+ } else
+ remove(full_path.c_str());
+ }
+
+ closedir(dp);
+ remove(folder.c_str());
+}
+
+bool is_regular_file(std::string const& path) {
+ struct stat st;
+ return (stat(path.c_str(), &st) == 0) && (S_ISREG(st.st_mode));
+}
+
+std::string append_path(std::string const& prefix, std::string const& suffix) {
+ std::string delimiter = "";
+
+ if (!boost::ends_with(prefix, "/") && !boost::starts_with(suffix, "/")
+ && !prefix.empty()) {
+ delimiter = "/";
+ }
+
+ return prefix + delimiter + suffix;
+}
+
+std::string current_dir() {
+ char* cwd = getcwd(NULL, 0);
+ std::string result = cwd;
+
+ free(cwd);
+ return result;
+}
+
+void make_full_path(std::string& path) {
+ if (!boost::starts_with(path, "/")) // relative path
+ path = append_path(current_dir(), path);
+}
+
+std::string filename(std::string const& path) {
+ size_t pos = path.find_last_of('/');
+ return pos != std::string::npos ? path.substr(pos + 1) : path;
+}
+
+std::string basename(std::string const& path) {
+ size_t slash = path.find_last_of('/');
+ size_t after_slash = slash == std::string::npos ? 0 : slash + 1;
+
+ size_t dot = path.find_last_of('.');
+ if (dot < after_slash)
+ dot = std::string::npos;
+
+ return path.substr(after_slash, dot - after_slash);
+}
+
+std::string extension(std::string const& path) {
+ size_t slash = path.find_last_of('/');
+ size_t after_slash = slash == std::string::npos ? 0 : slash + 1;
+ size_t dot = path.find_last_of('.');
+
+ if (dot < after_slash || dot == std::string::npos || dot + 1 == path.size())
+ return std::string();
+
+ return path.substr(dot);
+}
+
+std::string parent_path(std::string const& path) {
+ std::string cpath(path);
+
+ make_full_path(cpath);
+ size_t slash_pos = cpath.find_last_of('/');
+
+ return (slash_pos == 0 ? std::string("/") : cpath.substr(0, slash_pos));
+}
+
+bool check_existence(std::string const& path) {
+ struct stat st_buf;
+ return stat(path.c_str(), &st_buf) == 0
+ && (S_ISREG(st_buf.st_mode) || S_ISDIR(st_buf.st_mode)); // exists and (file or dir)
+}
+
+void remove_if_exists(std::string const& path) {
+ if (check_existence(path)) {
+ if (is_regular_file(path)) // file
+ remove(path.c_str());
+ else // dir
+ remove_dir(path);
+ }
+}
+
+//TODO do we need to screen anything but whitespaces?
+std::string screen_whitespaces(std::string const &path) {
+ std::string res = "";
+ for (size_t i = 0; i < path.size(); i++) {
+ if ((i == 0) || (path[i] != ' ') || (path[i - 1] == '\\')) {
+ res += path[i];
+ } else {
+ res +='\\';
+ res +=' ';
+ }
+ }
+// res += "'";
+ return res;
+}
+
+//todo reduce code duplication!!!
+bool FileExists(std::string const &filename) {
+ struct stat st_buf;
+ return stat(filename.c_str(), &st_buf) == 0 && S_ISREG(st_buf.st_mode);
+}
+
+void CheckFileExistenceFATAL(std::string const &filename) {
+ if (!FileExists(filename)) FATAL_ERROR("File " << filename << " doesn't exist or can't be read!");
+}
+
+void make_dirs(std::string const &path) {
+ VERIFY(!path.empty());
+
+ size_t slash_pos = 0;
+ while ((slash_pos = path.find_first_of('/', slash_pos + 1)) != std::string::npos) {
+ make_dir(path.substr(0, slash_pos));
+ }
+ if (path[path.size() - 1] != '/') {
+ make_dir(path);
+ }
+}
+
+// doesn't support symlinks
+std::string resolve(std::string const& path) {
+ typedef boost::char_delimiters_separator<char> separator_t;
+ typedef boost::tokenizer<separator_t> tokenizer_t;
+
+ tokenizer_t tok(path, separator_t(false, "", "/"));
+
+ std::string result = "/";
+ for (auto it = tok.begin(); it != tok.end(); ++it) {
+ if (*it == "..")
+ result = parent_path(result);
+
+ else if (*it == ".")
+ ; // Ignore
+
+ else
+ // Just cat other path entries
+ result = append_path(result, *it);
+ }
+
+ return result;
+}
+
+std::string make_relative_path(std::string p, std::string base) {
+ p = resolve(p);
+ base = resolve(base);
+
+ std::string pp = parent_path(p);
+
+ typedef boost::char_delimiters_separator<char> separator_t;
+ typedef boost::tokenizer<separator_t> tokenizer_t;
+
+ tokenizer_t pp_tok(pp, separator_t(false, "", "/"));
+ tokenizer_t base_tok(base, separator_t(false, "", "/"));
+
+ auto i = pp_tok.begin();
+ auto j = base_tok.begin();
+
+ while (i != pp_tok.end() && j != base_tok.end() && *i == *j) {
+ ++i;
+ ++j;
+ }
+
+ std::string result;
+ for (; j != base_tok.end(); ++j)
+ result = append_path("..", result);
+
+ for (; i != pp_tok.end(); ++i)
+ result = append_path(result, *i);
+
+ return append_path(result, filename(p));
+}
+
+std::string MakeLaunchTimeDirName() {
+ time_t rawtime;
+ struct tm * timeinfo;
+ char buffer[80];
+
+ time(&rawtime);
+ timeinfo = localtime(&rawtime);
+
+ strftime(buffer, 80, "%m.%d_%H.%M.%S", timeinfo);
+ return std::string(buffer);
+}
+
+}
diff --git a/src/modules/dev_support/path_helper.hpp b/src/modules/dev_support/path_helper.hpp
new file mode 100644
index 0000000..372c6f4
--- /dev/null
+++ b/src/modules/dev_support/path_helper.hpp
@@ -0,0 +1,74 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <dirent.h>
+#include <unistd.h>
+
+#include <string>
+#include <vector>
+#include "dev_support/logger/logger.hpp"
+#include "dev_support/verify.hpp"
+
+namespace path {
+//todo review and make names consistent!
+
+typedef std::vector<std::string> files_t;
+
+bool make_dir(std::string const &folder);
+
+std::string make_temp_dir(std::string const &prefix, std::string const &suffix);
+
+void remove_dir(std::string const &folder);
+
+bool is_regular_file(std::string const &path);
+
+std::string append_path(std::string const &prefix, std::string const &suffix);
+
+std::string current_dir();
+
+//todo why non-const argument?!
+void make_full_path(std::string &path);
+
+std::string filename(std::string const &path);
+
+std::string basename(std::string const &path);
+
+std::string extension(std::string const &path);
+
+std::string parent_path(std::string const &path);
+
+bool check_existence(std::string const &path);
+
+void remove_if_exists(std::string const &path);
+
+std::string screen_whitespaces(std::string const &path);
+
+/**
+* Checks if file exists.
+* Analogs: http://www.techbytes.ca/techbyte103.html , http://www.gamedev.net/topic/211918-determining-if-a-file-exists-c/
+*/
+bool FileExists(std::string const &filename);
+
+/**
+* Exits with code 1 if the file doesn't exist, writing a FATAL log message.
+*/
+void CheckFileExistenceFATAL(std::string const &filename);
+
+void make_dirs(std::string const &path);
+
+// doesn't support symlinks
+std::string resolve(std::string const &path);
+
+std::string make_relative_path(std::string p, std::string base = current_dir());
+
+std::string MakeLaunchTimeDirName();
+
+}
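
A few of the helpers in action (hypothetical path):

    std::string p = "/data/run 1/contigs.fasta";
    path::filename(p);            // "contigs.fasta"
    path::basename(p);            // "contigs"
    path::extension(p);           // ".fasta"
    path::parent_path(p);         // "/data/run 1"
    path::screen_whitespaces(p);  // "/data/run\ 1/contigs.fasta"
    path::make_dirs("/tmp/out/a/b");   // creates the whole chain, mkdir -p style
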
diff --git a/src/modules/dev_support/perfcounter.hpp b/src/modules/dev_support/perfcounter.hpp
new file mode 100644
index 0000000..3487888
--- /dev/null
+++ b/src/modules/dev_support/perfcounter.hpp
@@ -0,0 +1,123 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+#include <sys/time.h>
+#include <string>
+#include <cppformat/format.h>
+
+struct perf_counter
+{
+ perf_counter()
+ {
+ reset();
+ }
+
+ double time() const
+ {
+ struct timeval now;
+ gettimeofday(&now, NULL);
+
+ return (double)(now.tv_sec - time_.tv_sec) + (double)(now.tv_usec - time_.tv_usec) * 1e-6;
+ }
+
+ double time_ms() const
+ {
+ return time() * 1e3;
+ }
+
+ void reset()
+ {
+ gettimeofday(&time_, NULL);
+ }
+
+private:
+ struct timeval time_;
+};
+
+
+inline std::string human_readable_time(double time_in_sec)
+{
+// assert(time_in_sec > 0);
+
+ size_t msec = size_t(time_in_sec * 1000) % 1000;
+ size_t sec = size_t(time_in_sec);
+ size_t hours = sec / 3600;
+ size_t mins = (sec / 60) % 60;
+ sec %= 60;
+
+ return fmt::format("{:3d}:{:02d}:{:02d}.{:03d}", hours, mins, sec, msec);
+}
+
+inline std::string human_readable_memory(size_t max_rss) {
+ if (max_rss < 1024 * 1024) {
+ return fmt::format("{:d}M", (max_rss / 1024));
+ } else {
+ return fmt::format("{:d}G", (max_rss / (1024 * 1024)));
+ }
+}
+
+struct avg_perf_counter
+{
+ avg_perf_counter(/*const string& name*/)// : name_(name)
+ {
+ reset();
+ }
+
+// ~avg_perf_counter() {
+// cout << "Time in counter " << name_ << ": " << human_readable_time(time()) << endl;
+// }
+
+ int start(int ret = 0)
+ {
+ p_cnt_.reset();
+ return ret;
+ }
+
+ int stop(int ret = 0)
+ {
+ counter_++;
+ whole_time_ += p_cnt_.time();
+ return ret;
+ }
+ double time() const
+ {
+ return whole_time_;
+ }
+ size_t counts()
+ {
+ return counter_;
+ }
+ double time_ms() const
+ {
+ return time() * 1e3;
+ }
+
+ double avg_time() const
+ {
+ return counter_ > 0 ? whole_time_/(double)counter_ : 0.;
+ }
+
+ double avg_time_ms() const
+ {
+ return avg_time() * 1e3;
+ }
+
+ void reset()
+ {
+ p_cnt_.reset();
+ whole_time_ = 0;
+ counter_ = 0;
+ }
+
+private:
+ const std::string name_;
+ perf_counter p_cnt_;
+ double whole_time_;
+ size_t counter_;
+
+};
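
Typical wall-clock timing with these counters (hypothetical workload):

    perf_counter pc;
    // ... run the stage ...
    std::cout << human_readable_time(pc.time()) << std::endl;   // e.g. "  0:01:23.456"

    avg_perf_counter avg;
    for (size_t i = 0; i < 100; ++i) {
        avg.start();
        // ... one iteration ...
        avg.stop();
    }
    double per_iter_ms = avg.avg_time_ms();
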
diff --git a/src/modules/dev_support/range.hpp b/src/modules/dev_support/range.hpp
new file mode 100644
index 0000000..bf2595d
--- /dev/null
+++ b/src/modules/dev_support/range.hpp
@@ -0,0 +1,92 @@
+#pragma once
+
+#include "dev_support/verify.hpp"
+
+namespace omnigraph {
+
+struct Range {
+private:
+ bool inside(size_t left, size_t right, size_t point) const {
+ return left <= point && point <= right;
+ }
+
+public:
+ //inclusive
+ size_t start_pos;
+ //exclusive
+ size_t end_pos;
+
+ size_t size() const {
+ VERIFY(end_pos >= start_pos);
+ return end_pos - start_pos;
+ }
+
+ void shift(int shift) {
+ VERIFY(shift > 0 || size_t(-shift) <= start_pos);
+ start_pos += shift;
+ end_pos += shift;
+ }
+
+ Range(): start_pos(0), end_pos(0) {
+ VERIFY(end_pos >= start_pos);
+ }
+
+ Range(size_t start_pos, size_t end_pos)
+ : start_pos(start_pos),
+ end_pos(end_pos) {
+ VERIFY(end_pos >= start_pos);
+ }
+
+ bool operator<(const Range &other) const {
+ if (start_pos != other.start_pos)
+ return start_pos < other.start_pos;
+ return end_pos < other.end_pos;
+ }
+
+ bool contains(const Range& that) const {
+ return start_pos <= that.start_pos && end_pos >= that.end_pos;
+ }
+
+ Range Merge(const Range &other) const {
+ return Range(this->start_pos, other.end_pos);
+ }
+
+ Range Invert(size_t base_length) const {
+ VERIFY(base_length >= end_pos);
+ return Range(base_length - end_pos, base_length - start_pos);
+ }
+
+ Range& operator=(const Range& other) {
+ start_pos = other.start_pos;
+ end_pos = other.end_pos;
+ return *this;
+ }
+
+ bool empty() const {
+ return start_pos == end_pos;
+ }
+
+ bool Intersect(const Range &other) const {
+ return inside(start_pos, end_pos, other.start_pos) || inside(start_pos, end_pos, other.end_pos) ||
+ inside(other.start_pos, other.end_pos, start_pos);
+ }
+
+ bool IntersectLeftOf(const Range &other) const {
+ return inside(start_pos, end_pos, other.start_pos) && inside(other.start_pos, other.end_pos, end_pos);
+ }
+
+ bool operator==(const Range &that) const {
+ return start_pos == that.start_pos && end_pos == that.end_pos;
+ }
+
+ bool operator!=(const Range &that) const {
+ return !(*this == that);
+ }
+};
+
+inline std::ostream& operator<<(std::ostream& os, const Range& range) {
+ os << "[" << (range.start_pos + 1) << " - " << range.end_pos << "]";
+ return os;
+}
+
+}
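
Range is a half-open [start_pos, end_pos) interval; a quick sketch of the operations:

    omnigraph::Range a(10, 20), b(15, 30);
    a.Intersect(b);               // true, the ranges overlap
    a.Merge(b);                   // Range(10, 30)
    a.Invert(100);                // Range(80, 90) on a sequence of length 100
    a.contains(omnigraph::Range(12, 18));   // true
    std::cout << a << std::endl;  // "[11 - 20]" (printed 1-based, end inclusive)
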
diff --git a/src/modules/dev_support/segfault_handler.hpp b/src/modules/dev_support/segfault_handler.hpp
new file mode 100644
index 0000000..836e2f2
--- /dev/null
+++ b/src/modules/dev_support/segfault_handler.hpp
@@ -0,0 +1,58 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+
+#pragma once
+
+#include "dev_support/stacktrace.hpp"
+#include "boost/noncopyable.hpp"
+
+#include <signal.h>
+
+struct segfault_handler : boost::noncopyable {
+ typedef std::function<void()> callback_t;
+
+ typedef void (*seg_handler_t)(int);
+
+ segfault_handler(callback_t const &cb = 0) {
+ if (callback() != 0)
+ throw std::runtime_error("failed to initialize segfault_handler, it has been already initialized");
+
+ callback() = cb;
+ old_func_ = signal(SIGSEGV, &segfault_handler::handler);
+ }
+
+ ~segfault_handler() {
+ callback() = 0;
+ signal(SIGSEGV, old_func_);
+ }
+
+private:
+ static callback_t &callback() {
+ static callback_t cb = 0;
+ return cb;
+ }
+
+ static void handler(int signum) {
+ if (signum == SIGSEGV) {
+ std::cerr << "The program was terminated by segmentation fault" << std::endl;
+ print_stacktrace();
+
+ if (callback())
+ callback()();
+ }
+
+ //TEST!!
+ exit(1);
+
+ signal(signum, SIG_DFL);
+ kill(getpid(), signum);
+ }
+
+private:
+ seg_handler_t old_func_;
+};
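
The handler is installed for the lifetime of a single RAII object (a second instance throws); a sketch:

    int main() {
        segfault_handler sh([] {
            // last-chance cleanup: flush logs, remove temp dirs, ...
        });
        // any SIGSEGV from here on prints a stack trace, runs the callback and exits
    }
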
diff --git a/src/modules/dev_support/simple_tools.hpp b/src/modules/dev_support/simple_tools.hpp
new file mode 100644
index 0000000..00690a5
--- /dev/null
+++ b/src/modules/dev_support/simple_tools.hpp
@@ -0,0 +1,184 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+/*
+ * simple_tools.hpp
+ *
+ * Created on: 27.05.2011
+ * Author: vyahhi
+ */
+
+#ifndef SIMPLE_TOOLS_HPP_
+#define SIMPLE_TOOLS_HPP_
+
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "dev_support/verify.hpp"
+#include "io/reads_io/ireader.hpp"
+#include "dev_support/path_helper.hpp"
+#include <memory>
+#include <string>
+#include <set>
+#include <vector>
+
+/**
+ * Converts anything to string (using ostringstream).
+ */
+template <typename T>
+std::string ToString(const T& t) {
+ std::ostringstream ss;
+ ss << t;
+ return ss.str();
+}
+
+template <typename T>
+std::string ToString(const T& t, size_t length) {
+ std::ostringstream ss;
+ ss << t;
+ std::string result = ss.str();
+ while(result.size() < length)
+ result = "0" + result;
+ return result;
+}
+
+template <typename T>
+std::string ToString(std::vector<T>& t) {
+ std::ostringstream ss;
+ ss << "Size "<<t.size()<<": [";
+ for (auto it = t.begin(); it != t.end(); ++it)
+ ss<<*it<<", ";
+ ss<<"]";
+ return ss.str();
+}
+
+template <typename T>
+std::string ToString(std::set<T>& t) {
+ std::ostringstream ss;
+ ss << "Size "<<t.size()<<": [";
+ for (auto it = t.begin(); it != t.end(); ++it)
+ ss<<*it<<", ";
+ ss<<"]";
+ return ss.str();
+}
+
+template<typename T>
+inline const std::pair<T, T> ReversePair(std::pair<T, T> ep) {
+ return std::pair<T, T>(ep.second, ep.first);
+}
+
+template <class ContainerT1, class ContainerT2>
+void push_back_all(ContainerT1& target, const ContainerT2& to_insert) {
+ target.insert(target.end(), to_insert.begin(), to_insert.end());
+}
+
+template <class ContainerT1, class ContainerT2>
+void insert_all(ContainerT1& target, const ContainerT2& to_insert) {
+ target.insert(to_insert.begin(), to_insert.end());
+}
+
+template<class MapT>
+std::set<typename MapT::key_type> key_set(const MapT& m) {
+ std::set<typename MapT::key_type> answer;
+ for (auto it = m.begin(); it != m.end(); ++it) {
+ answer.insert(it->first);
+ }
+ return answer;
+}
+
+template<class MapT>
+std::set<typename MapT::mapped_type> value_set(const MapT& m) {
+ std::set<typename MapT::mapped_type> answer;
+ for (auto it = m.begin(); it != m.end(); ++it) {
+ answer.insert(it->second);
+ }
+ return answer;
+}
+
+template <class MapT>
+const typename MapT::mapped_type& get(const MapT& from, const typename MapT::key_type& key) {
+ auto it = from.find(key);
+ VERIFY(it != from.end());
+ return it->second;
+}
+
+template <class MapT>
+typename MapT::mapped_type& get(MapT& from, const typename MapT::key_type& key) {
+ auto it = from.find(key);
+ VERIFY(it != from.end());
+ return it->second;
+}
+
+template <class MMapT>
+const std::vector<typename MMapT::mapped_type> get_all(const MMapT& from, const typename MMapT::key_type& key) {
+ std::vector<typename MMapT::mapped_type> answer;
+ for (auto it = from.lower_bound(key); it != from.upper_bound(key); ++it) {
+ answer.push_back(it->second);
+ }
+ return answer;
+}
+
+class TmpFolderFixture
+{
+ std::string tmp_folder_;
+
+public:
+ TmpFolderFixture(std::string tmp_folder = "tmp") :
+ tmp_folder_(tmp_folder)
+ {
+ path::make_dir(tmp_folder_);
+ }
+
+ ~TmpFolderFixture()
+ {
+ path::remove_dir(tmp_folder_);
+ }
+};
+
+namespace std
+{
+template<class T1, class T2>
+std::ostream& operator<< (std::ostream& os, std::pair<T1, T2> const& pair)
+{
+ return os << "(" << pair.first << ", " << pair.second << ")";
+}
+//}
+
+//namespace omnigraph
+//{
+template<class T>
+std::ostream& operator<< (std::ostream& os, const std::vector<T>& v)
+{
+ os << "[";
+ std::string delim = "";
+ for (auto it = v.begin(); it != v.end(); ++it) {
+ os << delim << *it;
+ delim = ", ";
+ }
+// std::copy(v.begin(), v.end(), std::ostream_iterator<T>(os, ", "));
+ os << "]";
+ return os;
+}
+
+template<class T>
+std::ostream& operator<< (std::ostream& os, const std::set<T>& set)
+{
+ os << "{";
+ bool delim = false;
+ for (const auto& i : set) {
+ if (delim) os << ", ";
+ os << i;
+ delim = true;
+ }
+ os << "}";
+ return os;
+}
+
+}
+
+#endif /* SIMPLE_TOOLS_HPP_ */
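For reviewers skimming this header, a minimal usage sketch of the helpers above (file names and values are illustrative; the container printers live in namespace std, so plain streaming works):

    #include "dev_support/simple_tools.hpp"
    #include <iostream>
    #include <map>

    int main() {
        std::map<std::string, int> cov = {{"chr1", 3}, {"chr2", 7}};
        std::set<std::string> names = key_set(cov);   // copies the keys into a std::set
        std::cout << names << std::endl;              // {chr1, chr2}
        std::cout << ToString(42, 5) << std::endl;    // zero-padded: 00042
        std::cout << ReversePair(std::make_pair(1, 2)) << std::endl; // (2, 1)
        return 0;
    }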
diff --git a/src/include/stacktrace.hpp b/src/modules/dev_support/stacktrace.hpp
similarity index 100%
rename from src/include/stacktrace.hpp
rename to src/modules/dev_support/stacktrace.hpp
diff --git a/src/modules/dev_support/standard_base.hpp b/src/modules/dev_support/standard_base.hpp
new file mode 100644
index 0000000..9adc83b
--- /dev/null
+++ b/src/modules/dev_support/standard_base.hpp
@@ -0,0 +1,140 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+/*
+ * standard_base.hpp
+ *
+ * Created on: 1 Sep 2011
+ * Author: valery
+ */
+
+#pragma once
+
+//==crt and stl
+#include <memory>
+#include <cstdlib>
+#include <cstdio>
+#include <time.h>
+#include <signal.h>
+#include <execinfo.h>
+
+#include <iostream>
+#include <fstream>
+#include <iterator>
+#include <algorithm>
+#include <map>
+#include <vector>
+#include <set>
+#include <string>
+#include <sstream>
+#include <utility>
+#include <array>
+#include <unordered_map>
+#include <unordered_set>
+#include <deque>
+#include <cmath>
+#include <limits>
+
+using std::cin;
+using std::cout;
+using std::cerr;
+using std::endl;
+using std::map;
+using std::multimap;
+using std::unordered_map;
+using std::unordered_set;
+using std::vector;
+using std::array;
+using std::set;
+using std::string;
+using std::pair;
+using std::make_pair;
+using std::ifstream;
+using std::istream;
+using std::ofstream;
+using std::ostream;
+using std::min;
+using std::max;
+using std::abs;
+using std::stringstream;
+using std::numeric_limits;
+using std::ostream_iterator;
+using std::copy;
+
+using std::shared_ptr;
+using std::make_shared;
+
+//==boost
+
+#ifndef NDEBUG
+#define BOOST_ENABLE_ASSERT_HANDLER
+#endif
+
+#include <boost/optional.hpp>
+
+#include <boost/noncopyable.hpp>
+
+using boost::optional;
+using boost::make_optional;
+using boost::none;
+
+using boost::noncopyable;
+
+// err handling
+#include "dev_support/stacktrace.hpp"
+
+// path manipulation instead of boost filesystem
+#include "dev_support/path_helper.hpp"
+using path::make_dir;
+using path::remove_dir;
+
+#ifndef NDEBUG
+namespace boost {
+inline void assertion_failed(char const * expr, char const * function,
+ char const * file, long line) {
+ std::cerr << "Aborted by assert: " << std::endl;
+ print_stacktrace();
+#if __DARWIN_UNIX03
+ __assert_rtn (expr, file, (int)line, function);
+#elif __DARWIN
+ __assert (expr, file, (int)line, function);
+#else
+ __assert_fail (expr, file, (unsigned)line, function);
+#endif
+}
+
+inline void assertion_failed_msg(char const * expr, char const * msg,
+ char const * function, char const * file,
+ long line) {
+ std::cerr << "Aborted by assert: " << msg << std::endl;
+ print_stacktrace();
+#if __DARWIN_UNIX03
+ __assert_rtn (expr, file, (int)line, function);
+#elif __DARWIN
+ __assert (expr, file, (int)line, function);
+#else
+ __assert_fail (expr, file, (unsigned)line, function);
+#endif
+}
+
+} // namespace boost
+
+#endif // NDEBUG
+
+//==sys
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <sys/time.h>
+
+//our
+//math
+#include "math/xmath.h"
+#include "dev_support/func.hpp"
+#include "dev_support/verify.hpp"
+// log
+#include "dev_support/logger/logger.hpp"
+
+
diff --git a/src/modules/dev_support/verify.hpp b/src/modules/dev_support/verify.hpp
new file mode 100644
index 0000000..337828e
--- /dev/null
+++ b/src/modules/dev_support/verify.hpp
@@ -0,0 +1,33 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+#include "dev_support/stacktrace.hpp"
+#include "boost/current_function.hpp"
+#include <sstream>
+#include <iostream>
+#include <cassert>
+
+#define VERIFY(expr) \
+ do { \
+ if(!(expr))\
+ print_stacktrace();\
+ assert(expr); \
+ } while(0)
+
+#define VERIFY_MSG(expr, msg) \
+ do { \
+ if (!(expr)) { \
+ std::stringstream ss; \
+ print_stacktrace();\
+ ss << "Verification of expression '" << #expr << "' failed in function '" << BOOST_CURRENT_FUNCTION << \
+ "'. In file '" << __FILE__ << "' on line " << __LINE__ << ". Message '" << msg << "'."; \
+ std::cout << ss.str() << std::endl; \
+ std::cerr << ss.str() << std::endl; \
+ fflush(stdout); \
+ fflush(stderr); \
+ assert(expr); \
+ } \
+ } while(0)
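A short sketch of the intended call pattern (the message argument is spliced into a stream expression, so operator<< chains are allowed; the function and values below are purely illustrative):

    #include "dev_support/verify.hpp"

    size_t safe_div(size_t a, size_t b) {
        VERIFY_MSG(b != 0, "division by zero, numerator = " << a);
        return a / b;
    }

    int main() {
        VERIFY(safe_div(10, 2) == 5);   // prints a stacktrace and aborts on failure
        return 0;
    }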
diff --git a/src/modules/empty.cpp b/src/modules/empty.cpp
new file mode 100644
index 0000000..e69de29
diff --git a/src/modules/io/CMakeLists.txt b/src/modules/io/CMakeLists.txt
new file mode 100644
index 0000000..5c0fd41
--- /dev/null
+++ b/src/modules/io/CMakeLists.txt
@@ -0,0 +1,16 @@
+############################################################################
+# Copyright (c) 2015 Saint Petersburg State University
+# Copyright (c) 2011-2014 Saint Petersburg Academic University
+# All Rights Reserved
+# See file LICENSE for details.
+############################################################################
+
+project(input CXX)
+
+add_library(input STATIC
+ reads_io/parser.cpp
+ sam_io/read.cpp
+ sam_io/sam_reader.cpp)
+
+target_link_libraries(input BamTools samtools)
+
diff --git a/src/modules/io/dataset_support/dataset_readers.hpp b/src/modules/io/dataset_support/dataset_readers.hpp
new file mode 100644
index 0000000..5d56151
--- /dev/null
+++ b/src/modules/io/dataset_support/dataset_readers.hpp
@@ -0,0 +1,122 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "dev_support/logger/logger.hpp"
+#include "dev_support/simple_tools.hpp"
+#include "io/reads_io/io_helper.hpp"
+#include "pipeline/library.hpp"
+
+#include "pipeline/config_struct.hpp"
+
+namespace debruijn_graph {
+
+inline
+io::PairedStreamPtr paired_easy_reader(const io::SequencingLibrary<config::DataSetData> &lib,
+ bool followed_by_rc,
+ size_t insert_size,
+ bool change_read_order = false,
+ bool use_orientation = true,
+ io::OffsetType offset_type = io::PhredOffset) {
+ io::ReadStreamList<io::PairedRead> streams;
+ for (auto read_pair : lib.paired_reads()) {
+ streams.push_back(io::PairedEasyStream(read_pair.first, read_pair.second, followed_by_rc, insert_size, change_read_order,
+ use_orientation, lib.orientation(), offset_type));
+ }
+ return io::MultifileWrap<io::PairedRead>(streams);
+}
+
+inline
+io::ReadStreamList<io::SingleRead> single_easy_readers(const io::SequencingLibrary<config::DataSetData> &lib,
+ bool followed_by_rc,
+ bool including_paired_reads,
+ bool handle_Ns = true,
+ io::OffsetType offset_type = io::PhredOffset) {
+ io::ReadStreamList<io::SingleRead> streams;
+ if (including_paired_reads) {
+ for (const auto& read : lib.reads()) {
+ //do we need input_file function here?
+ streams.push_back(io::EasyStream(read, followed_by_rc, handle_Ns, offset_type));
+ }
+ } else {
+ for (const auto& read : lib.single_reads()) {
+ streams.push_back(io::EasyStream(read, followed_by_rc, handle_Ns, offset_type));
+ }
+ }
+ return streams;
+}
+
+inline
+io::SingleStreamPtr single_easy_reader(const io::SequencingLibrary<config::DataSetData> &lib,
+ bool followed_by_rc,
+ bool including_paired_reads,
+ bool handle_Ns = true,
+ io::OffsetType offset_type = io::PhredOffset) {
+ return io::MultifileWrap<io::SingleRead>(
+ single_easy_readers(lib, followed_by_rc, including_paired_reads, handle_Ns, offset_type));
+}
+
+inline
+io::PairedStreamPtr paired_easy_reader_for_libs(std::vector<size_t> libs,
+ bool followed_by_rc,
+ size_t insert_size,
+ bool change_read_order = false,
+ bool use_orientation = true,
+ io::OffsetType offset_type = io::PhredOffset) {
+ io::ReadStreamList<io::PairedRead> streams;
+ for (size_t i = 0; i < libs.size(); ++i) {
+ streams.push_back(paired_easy_reader(cfg::get().ds.reads[libs[i]],
+ followed_by_rc, insert_size, change_read_order, use_orientation, offset_type));
+ }
+ return io::MultifileWrap<io::PairedRead>(streams);
+}
+
+
+inline
+io::PairedStreamPtr paired_easy_reader(bool followed_by_rc,
+ size_t insert_size,
+ bool change_read_order = false,
+ bool use_orientation = true,
+ io::OffsetType offset_type = io::PhredOffset) {
+
+ std::vector<size_t> all_libs(cfg::get().ds.reads.lib_count());
+ for (size_t i = 0; i < cfg::get().ds.reads.lib_count(); ++i)
+ all_libs[i] = i;
+
+ // FIXME: Should we use only first library?
+ // No, this one is for all libs together
+ return paired_easy_reader_for_libs(all_libs, followed_by_rc, insert_size, change_read_order, use_orientation, offset_type);
+}
+
+
+inline
+io::SingleStreamPtr single_easy_reader_for_libs(vector<size_t> libs,
+ bool followed_by_rc,
+ bool including_paired_reads,
+ io::OffsetType offset_type = io::PhredOffset) {
+ io::ReadStreamList<io::SingleRead> streams;
+ for (size_t i = 0; i < libs.size(); ++i) {
+ streams.push_back(single_easy_reader(cfg::get().ds.reads[libs[i]],
+ followed_by_rc, including_paired_reads, offset_type));
+ }
+ return io::MultifileWrap<io::SingleRead>(streams);
+}
+
+inline
+io::SingleStreamPtr single_easy_reader(bool followed_by_rc,
+ bool including_paired_reads,
+ io::OffsetType offset_type = io::PhredOffset) {
+
+ std::vector<size_t> all_libs(cfg::get().ds.reads.lib_count());
+ for (size_t i = 0; i < cfg::get().ds.reads.lib_count(); ++i)
+ all_libs[i] = i;
+
+ return single_easy_reader_for_libs(all_libs, followed_by_rc, including_paired_reads, offset_type);
+}
+
+}
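A hedged sketch of how these readers are meant to be called once the pipeline has populated cfg (the library index, insert size and the usual eof()/operator>> stream interface are assumptions here, not part of this header):

    // inside debruijn_graph, after the config and dataset have been loaded
    const auto &lib = cfg::get().ds.reads[0];
    io::PairedStreamPtr stream = paired_easy_reader(lib,
                                                    /*followed_by_rc*/ true,
                                                    /*insert_size*/ 250);
    io::PairedRead pr;
    while (!stream->eof()) {
        *stream >> pr;
        // ... hand pr to the consumer stage
    }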
diff --git a/src/modules/io/dataset_support/read_converter.hpp b/src/modules/io/dataset_support/read_converter.hpp
new file mode 100644
index 0000000..736c793
--- /dev/null
+++ b/src/modules/io/dataset_support/read_converter.hpp
@@ -0,0 +1,360 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+/*
+ * read_converter.hpp
+ *
+ * Created on: Apr 13, 2012
+ * Author: andrey
+ */
+
+#pragma once
+
+#include "io/reads_io/binary_converter.hpp"
+#include "io/reads_io/io_helper.hpp"
+#include "dataset_readers.hpp"
+#include "dev_support/simple_tools.hpp"
+
+#include <fstream>
+
+namespace debruijn_graph {
+
+class ReadConverter {
+
+private:
+ const static size_t current_binary_format_version = 10;
+
+ void convert_reads_to_binary() {
+ if (path::FileExists(cfg::get().temp_bin_reads_info)) {
+ std::ifstream info;
+ info.open(cfg::get().temp_bin_reads_info.c_str(), std::ios_base::in);
+
+ size_t thread_num = 0;
+ size_t format = 0;
+ size_t lib_count = 0;
+
+ info >> format;
+ if (!info.eof()) {
+ info >> thread_num;
+ }
+ if (!info.eof()) {
+ info >> lib_count;
+ }
+
+ if (thread_num == cfg::get().max_threads && format == current_binary_format_version && lib_count == cfg::get().ds.reads.lib_count()) {
+ INFO("Binary reads detected");
+
+ io::ReadStreamStat stat;
+ info >> stat.read_count_;
+ info >> stat.max_len_;
+ info >> stat.total_len_;
+
+ auto &dataset = cfg::get_writable().ds.reads;
+ for (size_t i = 0; i < dataset.lib_count(); ++i) {
+ info >> dataset[i].data().binary_coverted;
+ info >> dataset[i].data().read_length;
+ info >> dataset[i].data().total_nucls;
+
+ dataset[i].data().thread_num = cfg::get().max_threads;
+ dataset[i].data().paired_read_prefix = cfg::get().paired_read_prefix + "_" + ToString(i);
+ dataset[i].data().single_read_prefix = cfg::get().single_read_prefix + "_" + ToString(i);
+ }
+ info.close();
+ return;
+ }
+ info.close();
+ }
+
+ std::ofstream info;
+ info.open(cfg::get().temp_bin_reads_info.c_str(), std::ios_base::out);
+ info << "0 0";
+ info.close();
+
+ io::ReadStreamStat total_stat;
+ auto& dataset = cfg::get_writable().ds.reads;
+
+ INFO("Converting reads to binary format (takes a while)");
+ for (size_t i = 0; i < dataset.lib_count(); ++i) {
+ if (cfg::get().bwa.bwa_enable && dataset[i].is_bwa_alignable()) {
+ INFO("Library #" << i << " will be used by BWA only and thus will not be converted");
+ continue;
+ }
+ else if (dataset[i].is_binary_covertable()) {
+ INFO("Paired reads for library #" << i);
+ dataset[i].data().thread_num = cfg::get().max_threads;
+ dataset[i].data().paired_read_prefix = cfg::get().paired_read_prefix + "_" + ToString(i);
+
+ io::PairedStreamPtr paired_reader = paired_easy_reader(dataset[i], false, 0, false, false);
+ io::BinaryWriter paired_converter
+ (dataset[i].data().paired_read_prefix, cfg::get().max_threads, cfg::get().buffer_size);
+ io::ReadStreamStat paired_stat = paired_converter.ToBinary(*paired_reader, dataset[i].orientation());
+ paired_stat.read_count_ *= 2;
+ total_stat.merge(paired_stat);
+
+ INFO("Single reads for library #" << i);
+ dataset[i].data().single_read_prefix = cfg::get().single_read_prefix + "_" + ToString(i);
+ io::SingleStreamPtr single_reader = single_easy_reader(dataset[i], false, false);
+ io::BinaryWriter single_converter
+ (dataset[i].data().single_read_prefix, cfg::get().max_threads, cfg::get().buffer_size);
+ io::ReadStreamStat single_stat = single_converter.ToBinary(*single_reader);
+ total_stat.merge(single_stat);
+
+ paired_stat.merge(single_stat);
+ dataset[i].data().read_length = paired_stat.max_len_;
+ dataset[i].data().total_nucls = paired_stat.total_len_;
+ dataset[i].data().binary_coverted = true;
+ }
+ else {
+ INFO("Library #" << i << " doesn't need to be converted");
+ }
+ }
+ info.open(cfg::get().temp_bin_reads_info.c_str(), std::ios_base::out);
+ info << current_binary_format_version << " " << cfg::get().max_threads << " " << cfg::get().ds.reads.lib_count() << " " <<
+ total_stat.read_count_ << " " << total_stat.max_len_ << " " << total_stat.total_len_ << "\n";
+
+ for (size_t i = 0; i < dataset.lib_count(); ++i) {
+ info << dataset[i].data().binary_coverted
+ << " " << dataset[i].data().read_length
+ << " " << dataset[i].data().total_nucls << "\n";
+ }
+ info.close();
+ }
+
+public:
+ ReadConverter() {
+ convert_reads_to_binary();
+ }
+};
+
+
+inline
+void convert_if_needed() {
+ static ReadConverter converter;
+}
+
+inline
+io::BinaryPairedStreams raw_paired_binary_readers(const io::SequencingLibrary<config::DataSetData> &lib,
+ bool followed_by_rc,
+ size_t insert_size = 0) {
+ convert_if_needed();
+ VERIFY_MSG(lib.data().binary_coverted, "Lib was not converted to binary, cannot produce binary stream");
+
+ io::ReadStreamList<io::PairedReadSeq> paired_streams;
+ for (size_t i = 0; i < lib.data().thread_num; ++i) {
+ paired_streams.push_back(make_shared<io::BinaryFilePairedStream>(lib.data().paired_read_prefix, i, insert_size));
+ }
+ return io::apply_paired_wrappers(followed_by_rc, paired_streams);
+}
+
+inline
+io::BinarySingleStreams raw_single_binary_readers(const io::SequencingLibrary<config::DataSetData> &lib,
+ bool followed_by_rc,
+ bool including_paired_reads) {
+ convert_if_needed();
+ VERIFY_MSG(lib.data().binary_coverted, "Lib was not converted to binary, cannot produce binary stream");
+
+ io::BinarySingleStreams single_streams;
+ for (size_t i = 0; i < lib.data().thread_num; ++i) {
+ single_streams.push_back(make_shared<io::BinaryFileSingleStream>(lib.data().single_read_prefix, i));
+ }
+ if (including_paired_reads) {
+ io::BinaryPairedStreams paired_streams;
+ for (size_t i = 0; i < lib.data().thread_num; ++i) {
+ paired_streams.push_back(make_shared<io::BinaryFilePairedStream>(lib.data().paired_read_prefix, i, 0));
+ }
+
+ return io::apply_single_wrappers(followed_by_rc, single_streams, &paired_streams);
+ }
+ else {
+ return io::apply_single_wrappers(followed_by_rc, single_streams);
+ }
+}
+
+
+inline
+io::BinaryPairedStreams paired_binary_readers(const io::SequencingLibrary<config::DataSetData> &lib,
+ bool followed_by_rc,
+ size_t insert_size = 0) {
+ convert_if_needed();
+ return raw_paired_binary_readers(lib, followed_by_rc, insert_size);
+}
+
+
+inline
+io::BinarySingleStreams single_binary_readers(const io::SequencingLibrary<config::DataSetData> &lib,
+ bool followed_by_rc,
+ bool including_paired_reads) {
+ convert_if_needed();
+ return raw_single_binary_readers(lib, followed_by_rc, including_paired_reads);
+}
+
+
+inline
+//todo simplify
+io::BinaryPairedStreams paired_binary_readers_for_libs(const std::vector<size_t>& libs,
+ bool followed_by_rc,
+ size_t insert_size = 0) {
+ convert_if_needed();
+
+ std::vector<io::BinaryPairedStreams> streams(cfg::get().max_threads);
+ for (size_t i = 0; i < libs.size(); ++i) {
+ io::BinaryPairedStreams lib_streams = raw_paired_binary_readers(cfg::get().ds.reads[libs[i]], followed_by_rc, insert_size);
+
+ for (size_t j = 0; j < cfg::get().max_threads; ++j) {
+ streams[j].push_back(lib_streams.ptr_at(j));
+ }
+ }
+
+ io::BinaryPairedStreams joint_streams;
+ for (size_t j = 0; j < cfg::get().max_threads; ++j) {
+ joint_streams.push_back(io::MultifileWrap<io::PairedReadSeq>(streams[j]));
+ }
+ return joint_streams;
+}
+
+inline
+io::BinarySingleStreams single_binary_readers_for_libs(const std::vector<size_t>& libs,
+ bool followed_by_rc,
+ bool including_paired_reads) {
+ convert_if_needed();
+
+ std::vector<io::BinarySingleStreams> streams(cfg::get().max_threads);
+ for (size_t i = 0; i < libs.size(); ++i) {
+ io::BinarySingleStreams lib_streams = raw_single_binary_readers(cfg::get().ds.reads[libs[i]], followed_by_rc, including_paired_reads);
+
+ for (size_t j = 0; j < cfg::get().max_threads; ++j) {
+ streams[j].push_back(lib_streams.ptr_at(j));
+ }
+ }
+
+ io::BinarySingleStreams joint_streams;
+ for (size_t j = 0; j < cfg::get().max_threads; ++j) {
+ joint_streams.push_back(io::MultifileWrap<io::SingleReadSeq>(streams[j]));
+ }
+ return joint_streams;
+}
+
+inline
+io::BinaryPairedStreams paired_binary_readers(bool followed_by_rc,
+ size_t insert_size = 0) {
+ std::vector<size_t> all_libs(cfg::get().ds.reads.lib_count());
+ for (size_t i = 0; i < cfg::get().ds.reads.lib_count(); ++i) {
+ all_libs[i] = i;
+ }
+ return paired_binary_readers_for_libs(all_libs, followed_by_rc, insert_size);
+}
+
+inline
+io::BinarySingleStreams single_binary_readers(bool followed_by_rc,
+ bool including_paired_reads) {
+ std::vector<size_t> all_libs(cfg::get().ds.reads.lib_count());
+ for (size_t i = 0; i < cfg::get().ds.reads.lib_count(); ++i) {
+ all_libs[i] = i;
+ }
+ return single_binary_readers_for_libs(all_libs, followed_by_rc, including_paired_reads);
+}
+
+inline
+io::BinarySingleStreamPtr single_binary_multireader(bool followed_by_rc, bool including_paired_reads) {
+ return io::MultifileWrap<io::SingleReadSeq>(single_binary_readers(followed_by_rc, including_paired_reads));
+}
+
+inline
+io::BinaryPairedStreamPtr paired_binary_multireader(bool followed_by_rc, size_t insert_size = 0) {
+ return io::MultifileWrap<io::PairedReadSeq>(paired_binary_readers(followed_by_rc, insert_size));
+}
+
+/*
+
+class BufferedReadersStorage {
+
+private:
+
+ std::vector< SequenceSingleReadStream* > * single_streams_;
+
+ std::vector< SequencePairedReadStream* > * paired_streams_;
+
+ BufferedReadersStorage() {
+ INFO("Creating buffered read storage");
+
+ INFO("Buffering single reads... (takes a while)");
+ single_streams_ = new std::vector< SequenceSingleReadStream* >(cfg::get().max_threads);
+ for (size_t i = 0; i < cfg::get().max_threads; ++i) {
+ io::PredictableIReader<io::SingleReadSeq> * s_stream = new io::SeqSingleReadStream(cfg::get().single_read_prefix, i);
+ single_streams_->at(i) = new io::ReadBufferedStream<io::SingleReadSeq> (*s_stream);
+ }
+
+ INFO("Buffering paired reads... (takes a while)");
+ paired_streams_ = new std::vector< SequencePairedReadStream* >(cfg::get().max_threads);
+ for (size_t i = 0; i < cfg::get().max_threads; ++i) {
+ io::PredictableIReader<io::PairedReadSeq> * p_stream = new io::SeqPairedReadStream(cfg::get().paired_read_prefix, i, 0);
+ paired_streams_->at(i) = new io::ReadBufferedStream<io::PairedReadSeq> (*p_stream);
+ }
+ }
+
+ BufferedReadersStorage(const BufferedReadersStorage&);
+
+ BufferedReadersStorage& operator=(const BufferedReadersStorage&);
+
+public:
+
+ static BufferedReadersStorage * GetInstance() {
+ static BufferedReadersStorage instance;
+ return &instance;
+ }
+
+
+ std::vector< SequenceSingleReadStream* > * GetSingleReaders() const {
+ return single_streams_;
+ }
+
+ std::vector< SequencePairedReadStream* > * GetPairedReaders() const {
+ return paired_streams_;
+ }
+
+};
+
+
+std::vector< SequenceSingleReadStream* > single_buffered_binary_readers(bool followed_by_rc, bool including_paired_reads) {
+ convert_if_needed();
+
+ BufferedReadersStorage * storage = BufferedReadersStorage::GetInstance();
+
+ if (including_paired_reads) {
+ return apply_single_wrappers(followed_by_rc, *(storage->GetSingleReaders()), storage->GetPairedReaders());
+ }
+ else {
+ return apply_single_wrappers(followed_by_rc, *(storage->GetSingleReaders()));
+ }
+}
+
+std::vector< SequencePairedReadStream* > paired_buffered_binary_readers(bool followed_by_rc, size_t insert_size) {
+ convert_if_needed();
+
+ BufferedReadersStorage * storage = BufferedReadersStorage::GetInstance();
+
+ std::vector<SequencePairedReadStream*> paired_streams(cfg::get().max_threads);
+ for (size_t i = 0; i < cfg::get().max_threads; ++i) {
+ paired_streams[i] = new io::InsertSizeModifyingWrapper(*(storage->GetPairedReaders()->at(i)), insert_size);
+ }
+ return apply_paired_wrappers(followed_by_rc, paired_streams);
+}
+
+auto_ptr<SequenceSingleReadStream> single_buffered_binary_multireader(bool followed_by_rc, bool including_paired_reads) {
+ convert_if_needed();
+
+ return auto_ptr<SequenceSingleReadStream>(new io::MultifileReader<io::SingleReadSeq>(single_buffered_binary_readers(followed_by_rc, including_paired_reads)));
+}
+
+auto_ptr<SequencePairedReadStream> paired_buffered_binary_multireader(bool followed_by_rc, size_t insert_size) {
+ convert_if_needed();
+
+ return auto_ptr<SequencePairedReadStream>(new io::MultifileReader<io::PairedReadSeq>(paired_buffered_binary_readers(followed_by_rc, insert_size)));
+}
+*/
+
+}
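Downstream stages typically use this as below; a sketch, assuming the config (binary read prefixes, thread count, library list) has already been filled in by the pipeline:

    convert_if_needed();   // runs ReadConverter once per process (static local)

    const auto &lib = cfg::get().ds.reads[0];
    io::BinarySingleStreams streams =
            single_binary_readers(lib, /*followed_by_rc*/ true, /*including_paired_reads*/ true);
    // one stream per worker thread (lib.data().thread_num entries);
    // each entry can be consumed independently, e.g. inside an OpenMP loop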
diff --git a/src/modules/io/graph_io/graph_print_utils.hpp b/src/modules/io/graph_io/graph_print_utils.hpp
new file mode 100755
index 0000000..abed05f
--- /dev/null
+++ b/src/modules/io/graph_io/graph_print_utils.hpp
@@ -0,0 +1,328 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#ifndef GRAPH_PRINTER_HPP_
+#define GRAPH_PRINTER_HPP_
+
+#include "dev_support/standard_base.hpp"
+
+namespace gvis {
+
+template<class VertexId>
+struct BaseVertex {
+ VertexId id_;
+ string label_;
+ string href_;
+ string fill_color_;
+ BaseVertex(VertexId id, string label, string reference, string fill_color) :id_(id), label_(label), href_(reference), fill_color_(fill_color) {
+ }
+};
+
+template<class VertexId>
+struct BaseEdge {
+ VertexId from;
+ VertexId to;
+ string label;
+ string color;
+ BaseEdge(VertexId _from, VertexId _to, string _label, string _color) {
+ from = _from;
+ to = _to;
+ label = _label;
+ color = _color;
+ }
+};
+
+class StreamRecorder {
+private:
+ ostream &os_;
+protected:
+ virtual ostream &os() {
+ return os_;
+ }
+public:
+ StreamRecorder(ostream &os) : os_(os) {
+ }
+
+ virtual ~StreamRecorder() {
+ }
+};
+
+template<class Vertex, class Edge>
+class GraphRecorder {
+public:
+ virtual void recordVertex(Vertex vertex) = 0;
+
+ virtual void recordEdge(Edge edge) = 0;
+
+ virtual inline void startGraphRecord(const string &name) = 0;
+
+ virtual inline void endGraphRecord() = 0;
+
+ virtual ~GraphRecorder(){
+ }
+};
+
+template<class VertexId>
+class SingleGraphRecorder : public GraphRecorder<BaseVertex<VertexId>, BaseEdge<VertexId>> {
+protected:
+ typedef BaseVertex<VertexId> Vertex;
+ typedef BaseEdge<VertexId> Edge;
+};
+
+template<class VertexId>
+class PairedGraphRecorder : public GraphRecorder<pair<BaseVertex<VertexId>, BaseVertex<VertexId>>, BaseEdge<pair<VertexId, VertexId>>> {
+protected:
+ typedef pair<BaseVertex<VertexId>, BaseVertex<VertexId>> Vertex;
+ typedef BaseEdge<pair<VertexId, VertexId>> Edge;
+};
+
+template<class VertexId>
+class DotGraphRecorder : public StreamRecorder {
+public:
+ DotGraphRecorder(ostream &os) : StreamRecorder(os) {
+ }
+
+protected:
+ template<class vid>
+ void recordVertexId(vid id) {
+ this->os() << "vertex_" << id;
+ }
+
+ string IdToStr(VertexId u) {
+ stringstream ss;
+ ss << u;
+ return ss.str();
+ }
+
+ string constructNodeId(VertexId v) {
+ return constructNodePairId(v, v);
+ }
+
+ inline void recordParameter(ostream &os, const string &name, const string &value) {
+ os << name << "=" << "<" << value << "> ";
+ }
+
+ inline void recordParameter(const string &name, const string &value) {
+ recordParameter(this->os(), name, value);
+ }
+
+ inline void recordParameterInQuotes(ostream &os, const string &name, const string &value) {
+ os << name << "=" << "\"" << value << "\" ";
+ }
+
+ inline void recordParameterInQuotes(const string &name, const string &value) {
+ recordParameterInQuotes(this->os(), name, value);
+ }
+
+ inline double getColorParameter(int l, int r, double perc) {
+ return l * perc + r * (1 - perc);
+ }
+
+ inline string getColor(int currentLength, int approximateLength) {
+ currentLength %= approximateLength;
+ int points[8][3] = {{0, 0, 1}, {0, 1, 1}, {1, 1, 1}, {0, 1, 0}, {1, 1, 0}, {1, 0, 1}, {0, 0, 1}};
+ stringstream ss;
+ int bound = approximateLength / 6;
+ int num = currentLength / bound;
+ double perc = (currentLength % bound) * 1. / bound;
+ for(int i = 0; i < 3; i++) {
+ ss << getColorParameter(points[num][i], points[num + 1][i], perc);
+ if(i != 2)
+ ss << ",";
+ }
+ return ss.str();
+ }
+
+};
+
+
+template<class SingleVertexId>
+class DotSingleGraphRecorder: public SingleGraphRecorder<SingleVertexId>, public DotGraphRecorder<SingleVertexId> {
+private:
+ typedef BaseVertex<SingleVertexId> Vertex;
+ typedef BaseEdge<SingleVertexId> Edge;
+
+public:
+ DotSingleGraphRecorder(ostream &os) : DotGraphRecorder<SingleVertexId>(os) {
+ }
+
+ void recordVertex(Vertex vertex) {
+ this->recordVertexId(vertex.id_);
+ this->os() << "[";
+ this->recordParameterInQuotes("label", vertex.label_);
+ this->os() << ",";
+ this->recordParameter("style", "filled");
+ this->os() << ",";
+ this->recordParameter("color", "black");
+ this->os() << ",";
+ if(vertex.href_ != "") {
+ this->recordParameterInQuotes("href", vertex.href_);
+ this->os() << ",";
+ }
+ this->recordParameter("fillcolor", vertex.fill_color_);
+ this->os() << "]" << endl;
+ }
+
+ void recordEdge(Edge edge) {
+ this->recordVertexId(edge.from);
+ this->os() << "->";
+ this->recordVertexId(edge.to);
+ this->os() << "[";
+ this->recordParameterInQuotes("label", edge.label);
+ this->os() << ",";
+ this->recordParameter("color", edge.color);
+ this->os() << "]" << endl;
+ }
+
+ inline void startGraphRecord(const string &name) {
+ this->os() << "digraph " << name << " {" << endl;
+ this->os() << "node" << "[";
+ this->recordParameter("fontname", "Courier");
+ this->recordParameter("penwidth", "1.8");
+ this->os() << "]" << endl;
+ }
+
+ inline void endGraphRecord() {
+ this->os() << "}" << endl;
+ }
+};
+
+template<class SingleVertexId>
+class DotPairedGraphRecorder: public PairedGraphRecorder<SingleVertexId>, public DotGraphRecorder<SingleVertexId> {
+private:
+ typedef BaseVertex<SingleVertexId> SingleVertex;
+ typedef BaseEdge<SingleVertexId> SingleEdge;
+ typedef typename PairedGraphRecorder<SingleVertexId>::Vertex Vertex;
+ typedef typename PairedGraphRecorder<SingleVertexId>::Edge Edge;
+
+
+ string constructNodePairId(SingleVertexId u, SingleVertexId v) {
+ stringstream ss;
+ string u_str = this->IdToStr(u);
+ string v_str = this->IdToStr(v);
+ if (u == v)
+ ss << u;
+ else if (u_str > v_str)
+ ss << v_str << "_" << u_str;
+ else
+ ss << u_str << "_" << v_str;
+ return ss.str();
+ }
+
+ inline string constructPortCell(const string &port, string href, const string &color) {
+ stringstream ss;
+ ss << "<TD BORDER=\"0\" PORT = \"port_" << port << "\" ";
+ this->recordParameterInQuotes(ss, "color", color);
+ this->recordParameterInQuotes(ss, "bgcolor", color);
+ if(href != "") {
+ ss <<"href=\"" << href << "\"";
+ }
+ ss << "></TD>";
+ return ss.str();
+ }
+
+ inline string constructLabelCell(const string &label, const string &href, const string &color) {
+ stringstream ss;
+ ss << "<TD BORDER=\"0\" ";
+ this->recordParameterInQuotes(ss, "color", color);
+ this->recordParameterInQuotes(ss, "bgcolor", color);
+ if(href != "") {
+ ss <<"href=\"" << href << "\"";
+ }
+ ss << ">"
+ << label << "</TD>";
+ return ss.str();
+ }
+
+ string constructComplexNodeId(string pairId, SingleVertexId v) {
+ stringstream ss;
+ ss << pairId << ":port_" << v;
+ return ss.str();
+ }
+
+ string constructTableEntry(SingleVertex v/*, const string &label, const string &href*/) {
+ stringstream ss;
+ ss << "<TR>";
+ ss << constructPortCell(ToString(v.id_) + "_in", v.href_, v.fill_color_);
+ ss << constructLabelCell(v.label_, v.href_, v.fill_color_);
+ ss << constructPortCell(ToString(v.id_) + "_out", v.href_, v.fill_color_);
+ ss << "</TR>\n";
+ return ss.str();
+ }
+
+ string constructReverceTableEntry(SingleVertex v/*, const string &label, const string &href*/) {
+ stringstream ss;
+ ss << "<TR>";
+ ss << constructPortCell(ToString(v.id_) + "_out", v.href_, v.fill_color_);
+ ss << constructLabelCell(v.label_, v.href_, v.fill_color_);
+ ss << constructPortCell(ToString(v.id_) + "_in", v.href_, v.fill_color_);
+ ss << "</TR>\n";
+ return ss.str();
+ }
+
+ string constructComplexNodeLabel(Vertex v) {
+ return "<TABLE BORDER=\"1\" CELLSPACING=\"0\" >\n" + constructTableEntry(v.first)
+ + constructReverceTableEntry(v.second) + "</TABLE>";
+ }
+
+ string constructVertexInPairId(SingleVertexId v, SingleVertexId rc) {
+ return constructComplexNodeId(constructNodePairId(v, rc), v);
+ }
+
+
+public:
+ DotPairedGraphRecorder(ostream &os) : DotGraphRecorder<SingleVertexId>(os) {
+ }
+
+ void recordPairedVertexId(SingleVertexId id1, SingleVertexId id2) {
+ this->os() << "vertex_" << constructNodePairId(id1, id2);
+ }
+
+ void recordVertex(Vertex vertex) {
+ string pairLabel = constructComplexNodeLabel(vertex);
+ recordPairedVertexId(vertex.first.id_, vertex.second.id_);
+ this->os() << "[";
+ this->recordParameter("label", constructComplexNodeLabel(vertex));
+ this->os() << ",";
+ this->recordParameter("color", "black");
+ this->os() << ",";
+ this->recordParameter("URL", "/vertex/" + std::to_string(vertex.first.id_) + ".svg");
+ this->os() << "]" << endl;
+ }
+
+ void recordEdge(Edge edge) {
+ this->recordVertexId(constructVertexInPairId(edge.from.first, edge.from.second));
+ this->os() << "_out";
+ this->os() << "->";
+ this->recordVertexId(constructVertexInPairId(edge.to.first, edge.to.second));
+ this->os() << "_in";
+ this->os() << "[";
+ this->recordParameterInQuotes("label", edge.label);
+ this->os() << ",";
+ this->recordParameter("color", edge.color);
+ this->os() << "]" << endl;
+ }
+
+ inline void startGraphRecord(const string &name) {
+ this->os() << "digraph " << name << " {" << endl;
+ this->os() << "node" << "[";
+ this->recordParameter("fontname", "Courier");
+ this->os() << ",";
+ this->recordParameter("penwidth", "1.8");
+ this->os() << ",";
+ this->recordParameter("shape", "plaintext");
+ this->os() << "]" << endl;
+ }
+
+ inline void endGraphRecord() {
+ this->os() << "}" << endl;
+ }
+};
+
+
+}
+#endif //GRAPH_PRINTER_HPP_//
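A self-contained sketch that emits a two-vertex DOT graph with the recorder above (vertex ids, labels and colours are arbitrary):

    #include "io/graph_io/graph_print_utils.hpp"
    #include <iostream>

    int main() {
        gvis::DotSingleGraphRecorder<size_t> rec(std::cout);
        rec.startGraphRecord("toy");
        rec.recordVertex(gvis::BaseVertex<size_t>(1, "A", /*href*/ "", /*fill*/ "white"));
        rec.recordVertex(gvis::BaseVertex<size_t>(2, "B", "", "grey"));
        rec.recordEdge(gvis::BaseEdge<size_t>(1, 2, "AC 2x", "black"));
        rec.endGraphRecord();       // pipe the output through `dot -Tsvg` to render
        return 0;
    }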
diff --git a/src/modules/io/kmers_io/kmer_iterator.hpp b/src/modules/io/kmers_io/kmer_iterator.hpp
new file mode 100644
index 0000000..0e7a38e
--- /dev/null
+++ b/src/modules/io/kmers_io/kmer_iterator.hpp
@@ -0,0 +1,54 @@
+#ifndef __IO_KMER_ITERATOR_HPP__
+#define __IO_KMER_ITERATOR_HPP__
+
+#include "io/kmers_io/mmapped_reader.hpp"
+#include <string>
+
+namespace io {
+
+template<class Seq>
+using raw_kmer_iterator = MMappedFileRecordArrayIterator<typename Seq::DataType>;
+
+template<class Seq>
+raw_kmer_iterator<Seq> make_kmer_iterator(const std::string &FileName,
+ unsigned K) {
+ return raw_kmer_iterator<Seq>(FileName, Seq::GetDataSize(K));
+}
+
+template<class Seq>
+std::vector<raw_kmer_iterator<Seq>> make_kmer_iterator(const std::string &FileName,
+ size_t K, size_t amount) {
+ std::vector<raw_kmer_iterator<Seq>> res;
+ if (amount == 1) {
+ res.emplace_back(FileName, Seq::GetDataSize(K));
+ return res;
+ }
+
+ // Determine the file size
+ struct stat buf;
+ VERIFY_MSG(stat(FileName.c_str(), &buf) != -1,
+ "stat(2) failed. Reason: " << strerror(errno) << ". Error code: " << errno);
+ size_t file_size = buf.st_size;
+
+ // Now start creating the iterators keeping in mind, that offset should be
+ // multiple of page size.
+ size_t chunk = round_up(file_size / amount,
+ getpagesize() * Seq::GetDataSize(K) * sizeof(typename Seq::DataType));
+ size_t offset = 0;
+ if (chunk > file_size)
+ chunk = file_size;
+
+ while (offset < file_size) {
+ res.emplace_back(FileName, Seq::GetDataSize(K),
+ offset,
+ offset + chunk > file_size ? file_size - offset : chunk);
+ offset += chunk;
+ }
+
+ return res;
+}
+
+
+};
+
+#endif
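A sketch of the chunked iteration this enables; RtSeq and the file name are placeholders for any Seq type exposing DataType and GetDataSize(K) and for a file written in the matching binary layout:

    std::vector<io::raw_kmer_iterator<RtSeq>> chunks =
            io::make_kmer_iterator<RtSeq>("kmers.bin", /*K*/ 55, /*amount*/ 4);
    for (auto &it : chunks) {          // each chunk starts on a page-aligned offset
        for (; it.good(); ++it) {
            const RtSeq::DataType *kmer_words = *it;   // packed k-mer representation
            (void) kmer_words;                         // ... process
        }
    }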
diff --git a/src/modules/io/kmers_io/mmapped_reader.hpp b/src/modules/io/kmers_io/mmapped_reader.hpp
new file mode 100644
index 0000000..0fbe335
--- /dev/null
+++ b/src/modules/io/kmers_io/mmapped_reader.hpp
@@ -0,0 +1,396 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#ifndef HAMMER_MMAPPED_READER_HPP
+#define HAMMER_MMAPPED_READER_HPP
+
+#include "utils/adt/pointer_iterator.hpp"
+#include "utils/adt/array_vector.hpp"
+
+#include "dev_support/verify.hpp"
+
+#include <boost/iterator/iterator_facade.hpp>
+
+#include <fcntl.h>
+#include <unistd.h>
+#include <sys/mman.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+
+#include <cstring>
+#include <cerrno>
+
+#include <string>
+#include <algorithm>
+
+class MMappedReader {
+ int StreamFile;
+ bool Unlink;
+ std::string FileName;
+
+ void remap() {
+ VERIFY(BlockSize != FileSize);
+
+ if (MappedRegion)
+ munmap(MappedRegion, BlockSize);
+
+ BlockOffset += BlockSize;
+
+ if (BlockOffset + BlockSize > FileSize)
+ BlockSize = FileSize - BlockOffset;
+
+ // We intentionally do not add PROT_WRITE here - remapping plus write access
+ // is pretty error-prone.
+ if (BlockSize)
+ MappedRegion =
+ (uint8_t *) mmap(NULL, BlockSize,
+ PROT_READ, MAP_FILE | MAP_PRIVATE,
+ StreamFile, InitialOffset + BlockOffset);
+ else
+ MappedRegion = NULL;
+ VERIFY_MSG((intptr_t) MappedRegion != -1L,
+ "mmap(2) failed. Reason: " << strerror(errno) << ". Error code: " << errno);
+ }
+
+ void read_internal(void *buf, size_t amount) {
+ memcpy(buf, MappedRegion + BytesRead - BlockOffset, amount);
+ BytesRead += amount;
+ }
+
+protected:
+ uint8_t *MappedRegion;
+ size_t FileSize, BlockOffset, BytesRead, BlockSize;
+ off_t InitialOffset;
+
+public:
+ MMappedReader()
+ : StreamFile(-1), Unlink(false), FileName(""), MappedRegion(0), FileSize(0), BytesRead(0),
+ InitialOffset(0) { }
+
+ MMappedReader(const std::string &filename, bool unlink = false,
+ size_t blocksize = 64 * 1024 * 1024, off_t off = 0, size_t sz = 0)
+ : Unlink(unlink), FileName(filename), BlockSize(blocksize) {
+ struct stat buf;
+
+ InitialOffset = off;
+ FileSize = (sz ? sz : (stat(FileName.c_str(), &buf) != 0 ? 0 : buf.st_size - InitialOffset));
+
+ StreamFile = open(FileName.c_str(), O_RDONLY);
+ VERIFY_MSG(StreamFile != -1,
+ "open(2) failed. Reason: " << strerror(errno) << ". Error code: " << errno << ". File: " <<
+ FileName);
+
+ if (BlockSize != -1ULL) {
+ size_t PageSize = getpagesize();
+ BlockSize = BlockSize / PageSize * PageSize;
+ } else
+ BlockSize = FileSize;
+
+ if (BlockSize) {
+ MappedRegion =
+ (uint8_t *) mmap(NULL, BlockSize, PROT_READ | PROT_WRITE, MAP_FILE | MAP_PRIVATE,
+ StreamFile, InitialOffset);
+ VERIFY_MSG((intptr_t) MappedRegion != -1L,
+ "mmap(2) failed. Reason: " << strerror(errno) << ". Error code: " << errno);
+ } else
+ MappedRegion = NULL;
+
+ BlockOffset = BytesRead = 0;
+ }
+
+ MMappedReader(MMappedReader &&other) {
+ // First, copy out the stuff
+ MappedRegion = other.MappedRegion;
+ FileSize = other.FileSize;
+ BlockOffset = other.BlockOffset;
+ BytesRead = other.BytesRead;
+ BlockSize = other.BlockSize;
+ FileName = std::move(other.FileName);
+ Unlink = other.Unlink;
+ StreamFile = other.StreamFile;
+ InitialOffset = other.InitialOffset;
+
+ // Now, zero out inside other, so we won't do crazy thing in dtor
+ other.StreamFile = -1;
+ other.Unlink = false;
+ other.MappedRegion = 0;
+ }
+
+ MMappedReader &operator=(MMappedReader &&other) {
+ // Swap-based move assignment: the previously held resources are
+ // released when 'other' is destroyed.
+ std::swap(MappedRegion, other.MappedRegion); std::swap(FileSize, other.FileSize);
+ std::swap(BlockOffset, other.BlockOffset); std::swap(BytesRead, other.BytesRead);
+ std::swap(BlockSize, other.BlockSize); std::swap(FileName, other.FileName);
+ std::swap(Unlink, other.Unlink); std::swap(StreamFile, other.StreamFile);
+ std::swap(InitialOffset, other.InitialOffset);
+ return *this;
+ }
+
+ virtual ~MMappedReader() {
+ if (StreamFile != -1)
+ close(StreamFile);
+ if (MappedRegion)
+ munmap(MappedRegion, BlockSize);
+
+ if (Unlink) {
+ int res = unlink(FileName.c_str());
+ VERIFY_MSG(res == 0,
+ "unlink(2) failed. Reason: " << strerror(errno) << ". Error code: " << errno);
+ }
+ }
+
+ void read(void *buf, size_t amount) {
+ if (BytesRead + amount < BlockOffset + BlockSize) {
+ // Easy case, no remap is necessary
+ read_internal(buf, amount);
+ return;
+ }
+
+ // Hard case - remapping is necessary. First - finish the current block.
+ size_t ToRead = BlockSize - (BytesRead - BlockOffset);
+ uint8_t *cbuf = (uint8_t *) buf;
+
+ read_internal(cbuf, ToRead);
+ amount -= ToRead;
+ cbuf += ToRead;
+
+ // Next, read as much BlockSize blocks as possible.
+ while (amount >= BlockSize) {
+ remap();
+ read_internal(cbuf, BlockSize);
+ amount -= BlockSize;
+ cbuf += BlockSize;
+ }
+
+ // Finally, remap and read remaining.
+ remap();
+ read_internal(cbuf, amount);
+ }
+
+ void *skip(size_t amount) {
+ // Easy case, no remapping is needed
+ if (BytesRead + amount <= BlockOffset + BlockSize) {
+ void *out = MappedRegion + BytesRead - BlockOffset;
+ BytesRead += amount;
+
+ return out;
+ }
+
+ // Make sure data does not cross the block boundary
+ VERIFY(BytesRead == BlockOffset + BlockSize);
+
+ // Now, remap and read from the beginning of the block
+ remap();
+
+ return skip(amount);
+ }
+
+ bool good() const {
+ return BytesRead < FileSize;
+ }
+
+ size_t size() const { return FileSize; }
+
+ size_t data_size() const { return FileSize; }
+
+ void *data() const { return MappedRegion; }
+};
+
+template<typename T>
+class MMappedRecordReader : public MMappedReader {
+public:
+ typedef pointer_iterator<T> iterator;
+ typedef const pointer_iterator<T> const_iterator;
+
+ MMappedRecordReader(const std::string &FileName, bool unlink = true,
+ size_t blocksize = 64 * 1024 * 1024 / (sizeof(T) * (unsigned) getpagesize()) *
+ (sizeof(T) * (unsigned) getpagesize()),
+ off_t off = 0, size_t sz = 0) :
+ MMappedReader(FileName, unlink, blocksize, off, sz) {
+ VERIFY(FileSize % sizeof(T) == 0);
+ }
+
+ void read(T *el, size_t amount) {
+ MMappedReader::read(el, amount * sizeof(T));
+ }
+
+ size_t size() const { return FileSize / sizeof(T); }
+
+ size_t data_size() const { return FileSize; }
+
+ T *data() { return (T *) MappedRegion; }
+
+ const T *data() const { return (const T *) MappedRegion; }
+
+ T &operator[](size_t idx) { return data()[idx]; }
+
+ const T &operator[](size_t idx) const { return data()[idx]; }
+
+ iterator begin() { return iterator(data()); }
+
+ const_iterator begin() const { return const_iterator(data()); }
+
+ iterator end() { return iterator(data() + size()); }
+
+ const_iterator end() const { return const_iterator(data() + size()); }
+};
+
+template<class T>
+class MMappedFileRecordIterator :
+ public boost::iterator_facade<MMappedFileRecordIterator<T>,
+ const T,
+ std::input_iterator_tag> {
+public:
+ // Default ctor, used to implement "end" iterator
+ MMappedFileRecordIterator() : good_(false) { }
+
+ MMappedFileRecordIterator(const std::string &FileName)
+ : reader_(FileName, false), good_(true) {
+ reader_.read(&value_, sizeof(value_));
+ }
+
+ MMappedFileRecordIterator(MMappedRecordReader<T> &&reader)
+ : reader_(std::move(reader)), good_(true) {
+ reader_.read(&value_, sizeof(value_));
+ }
+
+ bool good() const {
+ return good_;
+ }
+
+private:
+ friend class boost::iterator_core_access;
+
+ void increment() {
+ good_ = reader_.good();
+ if (good_)
+ reader_.read(&value_, sizeof(value_));
+ }
+
+ bool equal(const MMappedFileRecordIterator &other) {
+ // Iterators are equal iff:
+ // 1) both are not good (i.e. at the end of the stream), or
+ // 2) they share the same mapped region
+ return ((!reader_.good() && !other.reader_.good()) ||
+ reader_.data() == other.reader_.data());
+ }
+
+ const T dereference() const { return value_; }
+
+ T value_;
+ MMappedRecordReader<T> reader_;
+ bool good_;
+};
+
+template<typename T>
+class MMappedRecordArrayReader : public MMappedReader {
+ size_t elcnt_;
+
+public:
+ typedef typename array_vector<T>::iterator iterator;
+ typedef typename array_vector<T>::const_iterator const_iterator;
+
+ MMappedRecordArrayReader(const std::string &FileName,
+ size_t elcnt = 1,
+ bool unlink = true,
+ off_t off = 0, size_t sz = 0) :
+ MMappedReader(FileName, unlink, -1ULL, off, sz), elcnt_(elcnt) {
+ VERIFY(FileSize % (sizeof(T) * elcnt_) == 0);
+ }
+
+ void read(T *el, size_t amount) {
+ MMappedReader::read(el, amount * sizeof(T) * elcnt_);
+ }
+
+ size_t size() const { return FileSize / sizeof(T) / elcnt_; }
+
+ size_t data_size() const { return FileSize; }
+
+ size_t elcnt() const { return elcnt_; }
+
+ T *data() { return (T *) MappedRegion; }
+
+ const T *data() const { return (const T *) MappedRegion; }
+
+ T &operator[](size_t idx) { return data()[idx * elcnt_]; }
+
+ const T &operator[](size_t idx) const { return data()[idx * elcnt_]; }
+
+ iterator begin() { return iterator(data(), /* size */ elcnt_); }
+
+ const_iterator begin() const { return const_iterator(data(), /* size */ elcnt_); }
+
+ const_iterator cbegin() const { return const_iterator(data(), /* size */ elcnt_); }
+
+ iterator end() { return iterator(data() + size() * elcnt_, elcnt_); }
+
+ const_iterator end() const { return const_iterator(data() + size() * elcnt_, elcnt_); }
+
+ const_iterator cend() const { return const_iterator(data() + size() * elcnt_, elcnt_); }
+};
+
+static inline size_t round_up(size_t value, size_t boundary) {
+ return (value + boundary - 1) / boundary * boundary;
+}
+
+template<class T>
+class MMappedFileRecordArrayIterator :
+ public boost::iterator_facade<MMappedFileRecordArrayIterator<T>,
+ const T *,
+ std::input_iterator_tag,
+ const T *> {
+public:
+ // Default ctor, used to implement "end" iterator
+ MMappedFileRecordArrayIterator() : value_(NULL), array_size_(0), reader_(), good_(false) { }
+
+ MMappedFileRecordArrayIterator(const std::string &FileName,
+ size_t elcnt,
+ off_t offset = 0, size_t filesize = 0)
+ : value_(NULL),
+ array_size_(sizeof(T) * elcnt),
+ reader_(FileName, false,
+ round_up(filesize > 0 ? std::min(size_t(64 * 1024 * 1024), filesize) : 64 * 1024 * 1024,
+ array_size_ * (unsigned) getpagesize()),
+ offset, filesize),
+ good_(false) {
+ increment();
+ }
+
+ MMappedFileRecordArrayIterator(MMappedRecordReader<T> &&reader, size_t elcnt)
+ : value_(NULL), array_size_(sizeof(T) * elcnt), reader_(std::move(reader)), good_(false) {
+ increment();
+ }
+
+ MMappedFileRecordArrayIterator(const MMappedFileRecordArrayIterator &) = delete;
+
+ MMappedFileRecordArrayIterator(MMappedFileRecordArrayIterator &&other)
+ : value_(other.value_), array_size_(other.array_size_),
+ reader_(std::move(other.reader_)), good_(other.good_) { }
+
+ bool good() const { return good_; }
+
+ const MMappedRecordReader<T> &reader() const { return reader_; }
+
+private:
+ friend class boost::iterator_core_access;
+
+ void increment() {
+ good_ = reader_.good();
+ value_ = (good_ ? (T *) reader_.skip(array_size_) : NULL);
+ }
+
+ bool equal(const MMappedFileRecordArrayIterator &other) const {
+ return value_ == other.value_;
+ }
+
+ const T *dereference() const { return value_; }
+
+ T *value_;
+ size_t array_size_;
+ MMappedRecordReader<T> reader_;
+ bool good_;
+};
+
+#endif // HAMMER_MMAPPED_READER_HPP
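A minimal sketch of the record reader (the struct layout and file name are illustrative; the file must hold a whole number of Record entries, as enforced by the VERIFY in the constructor):

    #include "io/kmers_io/mmapped_reader.hpp"
    #include <cstdint>
    #include <vector>
    #include <iostream>

    struct Record { uint64_t kmer; uint32_t count; };

    int main() {
        MMappedRecordReader<Record> reader("records.bin", /*unlink*/ false);
        std::vector<Record> buf(reader.size());
        reader.read(buf.data(), buf.size());   // remaps block by block under the hood
        std::cout << "records: " << buf.size() << std::endl;
        return 0;
    }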
diff --git a/src/modules/io/kmers_io/mmapped_writer.hpp b/src/modules/io/kmers_io/mmapped_writer.hpp
new file mode 100644
index 0000000..1f90a42
--- /dev/null
+++ b/src/modules/io/kmers_io/mmapped_writer.hpp
@@ -0,0 +1,191 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#ifndef HAMMER_MMAPPED_WRITER_HPP
+#define HAMMER_MMAPPED_WRITER_HPP
+
+#include "utils/adt/pointer_iterator.hpp"
+#include "utils/adt/array_vector.hpp"
+
+#include <string>
+
+#include <fcntl.h>
+#include <unistd.h>
+#include <sys/mman.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <strings.h>
+
+class MMappedWriter {
+ int StreamFile;
+
+ MMappedWriter(const MMappedWriter &) = delete;
+
+protected:
+ uint8_t *MappedRegion;
+ size_t BytesWritten, BytesReserved, FileOffset, BufOffset;
+public:
+ MMappedWriter() = default;
+
+ MMappedWriter(const std::string &FileName) {
+ open(FileName);
+ }
+
+ void open(const std::string &FileName) {
+ StreamFile = ::open(FileName.c_str(), O_RDWR | O_CREAT | O_TRUNC, (mode_t) 0660);
+ VERIFY_MSG(StreamFile != -1,
+ "open(2) failed. Reason: " << strerror(errno) << ". Error code: " << errno);
+
+ FileOffset = BytesWritten = 0;
+ MappedRegion = NULL;
+ }
+
+ virtual ~MMappedWriter() {
+ if (MappedRegion)
+ munmap(MappedRegion, BytesReserved);
+ close(StreamFile);
+ }
+
+ void write(void *buf, size_t amount) {
+ memcpy(MappedRegion + BufOffset + BytesWritten, buf, amount);
+ BytesWritten += amount;
+ }
+
+ bool good() const {
+ return BytesWritten < BytesReserved;
+ }
+
+ void reserve(size_t amount) {
+ if (MappedRegion) {
+ munmap(MappedRegion, BytesReserved);
+ FileOffset += BytesWritten;
+ MappedRegion = NULL;
+ }
+
+ if (amount == 0)
+ return;
+
+ int res = (int) lseek(StreamFile, amount - 1, SEEK_CUR);
+ VERIFY_MSG(res != -1,
+ "lseek(2) failed. Reason: " << strerror(errno) << ". Error code: " << errno);
+ res = (int) ::write(StreamFile, "", 1);
+ VERIFY_MSG(res != -1,
+ "write(2) failed. Reason: " << strerror(errno) << ". Error code: " << errno);
+
+ // FileOffset here should be aligned to page boundary. Tune the stuff due to this fact.
+ int PageSize = getpagesize();
+ size_t FileOffsetAligned = FileOffset / PageSize * PageSize;
+ size_t Residual = FileOffset - FileOffsetAligned;
+
+ BytesReserved = amount + Residual;
+ BytesWritten = 0;
+ BufOffset = Residual;
+ MappedRegion =
+ (uint8_t *) mmap(NULL, BytesReserved,
+ PROT_READ | PROT_WRITE, MAP_FILE | MAP_SHARED,
+ StreamFile, FileOffsetAligned);
+ VERIFY_MSG((intptr_t) MappedRegion != -1L,
+ "mmap(2) failed. Reason: " << strerror(errno) << ". Error code: " << errno);
+ }
+
+ size_t size() const { return BytesReserved; }
+};
+
+template<typename T>
+class MMappedRecordWriter : public MMappedWriter {
+public:
+ typedef pointer_iterator<T> iterator;
+ typedef const pointer_iterator<T> const_iterator;
+
+ MMappedRecordWriter() = default;
+
+ MMappedRecordWriter(const std::string &FileName) :
+ MMappedWriter(FileName) {
+ }
+
+ void write(const T *el, size_t amount) {
+ MMappedWriter::write((void *) el, amount * sizeof(T));
+ }
+
+ void reserve(size_t amount) {
+ MMappedWriter::reserve(amount * sizeof(T));
+ }
+
+ void resize(size_t amount) {
+ MMappedWriter::reserve(amount * sizeof(T));
+ }
+
+ size_t size() const { return BytesReserved / sizeof(T); }
+
+ T *data() { return (T *) MappedRegion; }
+
+ const T *data() const { return (const T *) MappedRegion; }
+
+ T &operator[](size_t idx) { return data()[idx]; }
+
+ const T &operator[](size_t idx) const { return data()[idx]; }
+
+ iterator begin() { return iterator(data()); }
+
+ const_iterator begin() const { return const_iterator(data()); }
+
+ iterator end() { return iterator(data() + size()); }
+
+ const_iterator end() const { return const_iterator(data() + size()); }
+};
+
+template<typename T>
+class MMappedRecordArrayWriter : public MMappedWriter {
+ size_t elcnt_;
+public:
+ typedef typename array_vector<T>::iterator iterator;
+ typedef typename array_vector<T>::const_iterator const_iterator;
+
+ MMappedRecordArrayWriter() = default;
+
+ MMappedRecordArrayWriter(const std::string &FileName,
+ size_t elcnt = 1) :
+ MMappedWriter(FileName), elcnt_(elcnt) { }
+
+ void open(const std::string &FileName,
+ size_t elcnt = 1) {
+ elcnt_ = elcnt;
+ MMappedWriter::open(FileName);
+ }
+
+ void write(const T *el, size_t amount) {
+ MMappedWriter::write((void *) el, amount * sizeof(T) * elcnt_);
+ }
+
+ void reserve(size_t amount) {
+ MMappedWriter::reserve(amount * sizeof(T) * elcnt_);
+ }
+
+ void resize(size_t amount) {
+ MMappedWriter::reserve(amount * sizeof(T) * elcnt_);
+ }
+
+ size_t size() const { return BytesReserved / sizeof(T) / elcnt_; }
+
+ T *data() { return (T *) MappedRegion; }
+
+ const T *data() const { return (const T *) MappedRegion; }
+
+ T &operator[](size_t idx) { return data()[idx * elcnt_]; }
+
+ const T &operator[](size_t idx) const { return data()[idx * elcnt_]; }
+
+ iterator begin() { return iterator(data(), elcnt_); }
+
+ const_iterator begin() const { return const_iterator(data(), elcnt_); }
+
+ iterator end() { return iterator(data() + size() * elcnt_, elcnt_); }
+
+ const_iterator end() const { return const_iterator(data() + size() * elcnt_, elcnt_); }
+};
+
+#endif // HAMMER_MMAPPED_WRITER_HPP
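And the matching write/read round trip, as a sketch (the file name is illustrative; reserve() must be called before write() so the region is actually mapped):

    #include "io/kmers_io/mmapped_writer.hpp"
    #include "io/kmers_io/mmapped_reader.hpp"
    #include <cstdint>
    #include <vector>

    int main() {
        std::vector<uint64_t> values = {1, 2, 3, 42};
        {
            MMappedRecordWriter<uint64_t> writer("values.bin");
            writer.reserve(values.size());              // grow the file and mmap it
            writer.write(values.data(), values.size());
        }                                               // dtor unmaps and closes
        MMappedRecordReader<uint64_t> reader("values.bin", /*unlink*/ true);
        VERIFY(reader.size() == values.size());
        return 0;
    }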
diff --git a/src/modules/io/reads/paired_read.hpp b/src/modules/io/reads/paired_read.hpp
new file mode 100644
index 0000000..2c498d7
--- /dev/null
+++ b/src/modules/io/reads/paired_read.hpp
@@ -0,0 +1,186 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "single_read.hpp"
+
+#include <string>
+#include <utility>
+
+namespace io {
+
+
+class PairedRead {
+public:
+ typedef SingleRead SingleReadT;
+ typedef int16_t size_type;
+
+ PairedRead() : first_(), second_(), insert_size_(0) { }
+
+ PairedRead(const SingleRead &first,
+ const SingleRead &second,
+ size_t insert_size)
+ : first_(first), second_(second), insert_size_(insert_size) { }
+
+ const SingleRead &first() const {
+ return first_;
+ }
+
+ const SingleRead &second() const {
+ return second_;
+ }
+
+ size_t insert_size() const {
+ return insert_size_;
+ }
+
+ size_t distance() const {
+ return insert_size_ - second_.size();
+ }
+
+ size_t gap() const {
+ return insert_size_ - first_.size() - second_.size();
+ }
+
+ size_t size() const {
+ return std::max(first_.size(), second_.size());
+ }
+
+ size_t nucl_count() const {
+ return first_.size() + second_.size();
+ }
+
+ bool IsValid() const {
+ return first_.IsValid() && second_.IsValid();
+ }
+
+ const SingleRead &operator[](size_t i) const {
+ if (i == 0) {
+ return first_;
+ } else if (i == 1) {
+ return second_;
+ }
+ VERIFY(false);
+ return first_;
+ }
+
+ const PairedRead operator!() const {
+ return PairedRead(!second_, !first_, insert_size_);
+ }
+
+ bool operator==(const PairedRead &pairedread) const {
+ return first_ == pairedread.first_ &&
+ second_ == pairedread.second_ &&
+ insert_size_ == pairedread.insert_size_;
+ }
+
+ bool BinWrite(std::ostream &file, bool rc1 = false, bool rc2 = false) const {
+ first_.BinWrite(file, rc1);
+ second_.BinWrite(file, rc2);
+
+ return !file.fail();
+ }
+
+ void print_size() const {
+ first_.print_size();
+ second_.print_size();
+ }
+
+private:
+ SingleRead first_;
+ SingleRead second_;
+ size_t insert_size_;
+
+};
+
+inline std::ostream &operator<<(std::ostream &os, const PairedRead &read) {
+ os << "Single read first=" << read.first() << " second=" << read.second() << std::endl;
+ return os;
+}
+
+class PairedReadSeq {
+public:
+ typedef SingleReadSeq SingleReadT;
+private:
+ SingleReadSeq first_;
+ SingleReadSeq second_;
+ size_t insert_size_;
+
+public:
+ PairedReadSeq() : first_(), second_(), insert_size_(0) { }
+
+ bool BinRead(std::istream &file, size_t is = 0) {
+ first_.BinRead(file);
+ second_.BinRead(file);
+
+ insert_size_ = is - (size_t) first_.GetLeftOffset() - (size_t) second_.GetRightOffset();
+ return !file.fail();
+ }
+
+ bool BinWrite(std::ostream &file, bool rc1 = false, bool rc2 = false) const {
+ first_.BinWrite(file, rc1);
+ second_.BinWrite(file, rc2);
+
+ return !file.fail();
+ }
+
+ const SingleReadSeq &first() const {
+ return first_;
+ }
+
+ const SingleReadSeq &second() const {
+ return second_;
+ }
+
+ size_t insert_size() const {
+ return insert_size_;
+ }
+
+ size_t distance() const {
+ return insert_size_ - second_.size();
+ }
+
+ size_t gap() const {
+ return insert_size_ - first_.size() - second_.size();
+ }
+
+ size_t size() const {
+ return std::max(first_.size(), second_.size());
+ }
+
+ size_t nucl_count() const {
+ return first_.size() + second_.size();
+ }
+
+ PairedReadSeq(const SingleReadSeq &first,
+ const SingleReadSeq &second,
+ size_t insert_size)
+ : first_(first), second_(second), insert_size_(insert_size) { }
+
+ const SingleReadSeq &operator[](size_t i) const {
+ if (i == 0) {
+ return first_;
+ } else if (i == 1) {
+ return second_;
+ }
+ VERIFY(false);
+ return first_;
+ }
+
+ const PairedReadSeq operator!() const {
+ return PairedReadSeq(!second_, !first_, insert_size_);
+ }
+
+};
+
+inline std::ostream &operator<<(std::ostream &os, const PairedReadSeq &read) {
+ os << "Paired read first=" << read.first() << " second=" << read.second() << std::endl;
+ return os;
+}
+
+}
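The three length accessors are easiest to read off a worked example (numbers are illustrative, and assume the usual convention that insert_size() is the outer fragment length):

    // first.size() = 100, second.size() = 90, insert_size() = 300
    // distance()   = insert_size() - second.size()                 = 210  (5' start of read 1 to 5' start of read 2)
    // gap()        = insert_size() - first.size() - second.size()  = 110  (inner, unsequenced part of the fragment)
    // size()       = max(first.size(), second.size())              = 100
    // nucl_count() = first.size() + second.size()                  = 190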
diff --git a/src/modules/io/reads/read.hpp b/src/modules/io/reads/read.hpp
new file mode 100644
index 0000000..02f4c74
--- /dev/null
+++ b/src/modules/io/reads/read.hpp
@@ -0,0 +1,244 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+/*
+ * read.hpp
+ *
+ * Created on: 29.03.2011
+ * Author: vyahhi
+ */
+
+#ifndef READ_HPP_
+#define READ_HPP_
+
+#include <string>
+#include <iostream>
+#include <fstream>
+#include "dev_support/verify.hpp"
+#include "data_structures/sequence/quality.hpp"
+#include "data_structures/sequence/sequence.hpp"
+#include "data_structures/sequence/nucl.hpp"
+#include "data_structures/sequence/sequence_tools.hpp"
+#include "dev_support/simple_tools.hpp"
+
+//fixme deprecated!!! used in hammer!
+class Read {
+public:
+ static const int PHRED_OFFSET = 33;
+
+ bool isValid() const {
+ return valid_;
+ }
+
+ Sequence getSequence() const {
+ VERIFY(valid_);
+ return Sequence(seq_);
+ }
+
+ Sequence getSubSequence(size_t start, size_t length) const __attribute__ ((deprecated)) {
+ VERIFY(length > 0 && start + length <= seq_.size());
+ return Sequence(seq_.substr(start, length));
+ }
+
+ Quality getQuality() const {
+ VERIFY(valid_);
+ return Quality(qual_);
+ }
+
+ const std::string &getSequenceString() const {
+ return seq_;
+ }
+
+ const std::string &getQualityString() const {
+ return qual_;
+ }
+
+ std::string getPhredQualityString(int offset = PHRED_OFFSET) const {
+ std::string res = qual_;
+ for (size_t i = 0; i < res.size(); ++i) {
+ res[i] = (char) (res[i] + offset);
+ }
+ return res;
+ }
+
+ const std::string &getName() const {
+ return name_;
+ }
+
+ size_t size() const {
+ return seq_.size();
+ }
+
+ char operator[](size_t i) const {
+ VERIFY(is_nucl(seq_[i]));
+ return dignucl(seq_[i]);
+ }
+
+ /**
+ * trim read
+ * @param ltrim first good base
+ * @param rtrim last good base
+ * @return whether there is anything left
+ */
+ bool trimLeftRight(int ltrim, int rtrim) {
+ if (ltrim >= (int) seq_.size() || rtrim < 0 || rtrim < ltrim) {
+ seq_ = "";
+ qual_ = "";
+ valid_ = false;
+ return false;
+ }
+ bool donesomething = false;
+ if (ltrim > 0) {
+ ltrim_ += ltrim;
+ seq_.erase(0, ltrim);
+ qual_.erase(0, ltrim);
+ donesomething = true;
+ }
+ if (rtrim - ltrim + 1 < (int) seq_.size() && rtrim < (int) seq_.size() - ltrim - 1) {
+ rtrim_ -= ((int) seq_.size() - (rtrim - ltrim + 1));
+ seq_.erase(rtrim - ltrim + 1, std::string::npos);
+ qual_.erase(rtrim - ltrim + 1, std::string::npos);
+ donesomething = true;
+ }
+ if (donesomething) valid_ = updateValid();
+ return true;
+ }
+
+ size_t trimNsAndBadQuality(int threshold) {
+ int start = 0;
+ for (; start < (int) seq_.size(); ++start) {
+ if (seq_[start] != 'N' && (int) qual_[start] > threshold) break;
+ }
+ int end = 0;
+ for (end = (int) seq_.size() - 1; end > -1; --end) {
+ if (seq_[end] != 'N' && (int) qual_[end] > threshold) break;
+ }
+ if (!trimLeftRight(start, end)) return 0;
+ else return seq_.size();
+ }
+
+ /**
+ * @param k k as in k-mer
+ * @param start start point
+ * @return the first starting point of a valid k-mer >=start; return -1 if no such place exists
+ */
+ size_t firstValidKmer(size_t start, size_t k) const __attribute__ ((deprecated)) {
+ size_t curHypothesis = start;
+ size_t i = start;
+ for (; i < seq_.size(); ++i) {
+ if (i >= k + curHypothesis)
+ return curHypothesis;
+ if (!is_nucl(seq_[i])) {
+ curHypothesis = i + 1;
+ }
+ }
+ if (i >= k + curHypothesis) {
+ return curHypothesis;
+ }
+ return -1ULL;
+ }
+
+ void setSequence(const char *s, bool preserve_trimming = false) {
+ seq_ = s;
+ if (!preserve_trimming) {
+ ltrim_ = 0;
+ rtrim_ = initial_size_ = (int) seq_.size();
+ }
+ valid_ = updateValid();
+ }
+
+ void setQuality(const char *s, int offset = PHRED_OFFSET) {
+ qual_ = s;
+ for (size_t i = 0; i < qual_.size(); ++i) {
+ qual_[i] = (char) (qual_[i] - offset);
+ }
+ }
+
+ void setName(const char *s) {
+ name_ = s;
+ }
+
+ Read()
+ : valid_(false), ltrim_(0), rtrim_(0), initial_size_(0) {
+ ;
+ }
+
+ Read(const std::string &name, const std::string &seq, const std::string &qual) :
+ name_(name), seq_(seq), qual_(qual) { // for test only!
+ ltrim_ = 0;
+ initial_size_ = rtrim_ = (int) seq_.size();
+ valid_ = updateValid();
+ }
+
+ int ltrim() const { return ltrim_; }
+
+ void set_ltrim(unsigned val) { ltrim_ = val; };
+
+ int rtrim() const { return rtrim_; }
+
+ int initial_size() const { return initial_size_; }
+
+private:
+ std::string name_;
+ std::string seq_;
+ std::string qual_;
+ bool valid_;
+ int ltrim_;
+ int rtrim_;
+ int initial_size_;
+
+ friend class ireadstream;
+
+ friend uint32_t TrimBadQuality(Read *, int);
+
+ bool updateValid() const {
+ if (seq_.size() == 0) {
+ return false;
+ }
+ for (size_t i = 0; i < seq_.size(); ++i) {
+ if (!is_nucl(seq_[i])) {
+ return false;
+ }
+ }
+ return true;
+ }
+
+public:
+ Read operator!() const {
+ std::string newName;
+ if (name_ == "" || name_[0] != '!') {
+ newName = '!' + name_;
+ } else {
+ newName = name_.substr(1, name_.length());
+ }
+ return Read(newName, ReverseComplement(seq_), Reverse(qual_));
+ }
+
+ void print(std::ostream &outf, int offset) const {
+ outf << "@" << name_.c_str() << "\n";
+ for (int i = 0; i < ltrim_; ++i) outf << "N";
+ outf << seq_.c_str();
+ for (int i = 0; i < initial_size_ - rtrim_; ++i) outf << "N";
+ outf << "\n" << "+" << name_.c_str();
+ if (ltrim_ > 0) outf << " ltrim=" << ltrim_;
+ if (rtrim_ < initial_size_)
+ outf << " rtrim=" << (initial_size_ - rtrim_);
+ outf << "\n";
+ char badq = (char) (offset + 2);
+ for (int i = 0; i < ltrim_; ++i) outf << badq;
+ outf << getPhredQualityString(offset).c_str();
+ for (int i = 0; i < initial_size_ - rtrim_; ++i) outf << badq;
+ outf << "\n";
+ }
+};
+
+// todo: put this to *.cpp
+//ostream& operator<<(ostream& os, const Read& read) {
+// return os << read.getSequenceString();
+//}
+
+#endif /* READ_HPP_ */
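A short, hypothetical sketch of driving the deprecated Read interface above; the read name, bases, quality string and threshold are made up for illustration.

    #include "io/reads/read.hpp"

    void read_trim_demo() {
        Read r;
        r.setName("example_read");
        r.setSequence("NNACGTACGT");
        r.setQuality("##IIIIIIII");              // Phred+33: '#' -> 2, 'I' -> 40 after offset removal
        size_t left = r.trimNsAndBadQuality(2);  // drops leading Ns and bases with quality <= 2
        // r.getSequenceString() == "ACGTACGT", left == 8, r.ltrim() == 2
    }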
diff --git a/src/modules/io/reads/single_read.hpp b/src/modules/io/reads/single_read.hpp
new file mode 100644
index 0000000..c307eaa
--- /dev/null
+++ b/src/modules/io/reads/single_read.hpp
@@ -0,0 +1,334 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "dev_support/verify.hpp"
+#include "data_structures/sequence/quality.hpp"
+#include "data_structures/sequence/sequence.hpp"
+#include "data_structures/sequence/nucl.hpp"
+#include "data_structures/sequence/sequence_tools.hpp"
+#include "dev_support/simple_tools.hpp"
+
+#include <string>
+
+namespace io {
+
+/*
+* This enumeration lists the supported quality offset types.
+* UnknownOffset is equal to "offset = 0".
+* PhredOffset is equal to "offset = 33".
+* SolexaOffset is equal to "offset = 64".
+*/
+enum OffsetType {
+ UnknownOffset = 0,
+ PhredOffset = 33,
+ SolexaOffset = 64
+};
+
+//todo extract code about offset from here
+
+typedef uint16_t SequenceOffsetT;
+
+
+class SingleRead {
+public:
+
+ static std::string EmptyQuality(const std::string &seq) {
+ return std::string(seq.size(), (char) 33);
+ }
+
+ static const int BAD_QUALITY_THRESHOLD = 2;
+
+ SingleRead() :
+ name_(""), seq_(""), qual_(""), left_offset_(0), right_offset_(0), valid_(false) {
+ DEBUG(name_ << " created");
+ }
+
+ SingleRead(const std::string &name, const std::string &seq,
+ const std::string &qual, OffsetType offset,
+ SequenceOffsetT left_offset = 0, SequenceOffsetT right_offset = 0) :
+ name_(name), seq_(seq), qual_(qual), left_offset_(left_offset), right_offset_(right_offset) {
+ Init();
+ DEBUG(name_ << " created");
+ for (size_t i = 0; i < qual_.size(); ++i) {
+ qual_[i] = (char) (qual_[i] - offset);
+ }
+ }
+
+ SingleRead(const std::string &name, const std::string &seq,
+ const std::string &qual,
+ SequenceOffsetT left_offset = 0, SequenceOffsetT right_offset = 0) :
+ name_(name), seq_(seq), qual_(qual), left_offset_(left_offset), right_offset_(right_offset) {
+ DEBUG(name_ << " created");
+ Init();
+ }
+
+ SingleRead(const std::string &name, const std::string &seq,
+ SequenceOffsetT left_offset = 0, SequenceOffsetT right_offset = 0) :
+ name_(name), seq_(seq), qual_(EmptyQuality(seq_)), left_offset_(left_offset),
+ right_offset_(right_offset) {
+ DEBUG(name_ << " created");
+ Init();
+ }
+
+ bool IsValid() const {
+ return valid_;
+ }
+
+ Sequence sequence(bool rc = false) const {
+ VERIFY(valid_);
+ return Sequence(seq_, rc);
+ }
+
+ Quality quality() const {
+ VERIFY(valid_);
+ return Quality(qual_);
+ }
+
+ const std::string &name() const {
+ return name_;
+ }
+
+ size_t size() const {
+ return seq_.size();
+ }
+
+ size_t nucl_count() const {
+ return size();
+ }
+
+ const std::string &GetSequenceString() const {
+ return seq_;
+ }
+
+ const std::string &GetQualityString() const {
+ return qual_;
+ }
+
+ std::string GetPhredQualityString() const {
+ int offset = PhredOffset;
+ std::string res = qual_;
+ for (size_t i = 0; i < res.size(); ++i) {
+ res[i] = (char) (res[i] + offset);
+ }
+ return res;
+ }
+
+ /*
+ * Return the i-th nucleotide of the SingleRead sequence in numeric form
+ * (0, 1, 2 or 3).
+ *
+ * @param i Nucleotide index.
+ * @return Nucleotide on ith position of SingleRead sequence.
+ */
+ char operator[](size_t i) const {
+ VERIFY(is_nucl(seq_[i]));
+ return dignucl(seq_[i]);
+ }
+
+ SingleRead operator!() const {
+ std::string new_name;
+ if (name_.length() >= 3 && name_.substr(name_.length() - 3) == "_RC") {
+ new_name = name_.substr(0, name_.length() - 3);
+ } else {
+ new_name = name_ + "_RC";
+ }
+ // TODO make naming nicer
+ // if (name_ == "" || name_[0] != '!') {
+ // new_name = '!' + name_;
+ // } else {
+ // new_name = name_.substr(1, name_.length());
+ // }
+ return SingleRead(new_name, ReverseComplement(seq_), Reverse(qual_), right_offset_, left_offset_);
+ }
+
+ SingleRead SubstrStrict(size_t from, size_t to) const {
+ size_t len = to - from;
+ // return SingleRead(name_, seq_.substr(from, len), qual_.substr(from, len));
+ // TODO remove naming?
+ std::string new_name;
+ if (name_.length() >= 3 && name_.substr(name_.length() - 3) == "_RC") {
+ new_name = name_.substr(0, name_.length() - 3) + "_SUBSTR(" + ToString(size() - to) + "," +
+ ToString(size() - from) + ")" + "_RC";
+ } else {
+ new_name = name_ + "_SUBSTR(" + ToString(from) + "," + ToString(to) + ")";
+ }
+ return SingleRead(new_name, seq_.substr(from, len), qual_.substr(from, len),
+ SequenceOffsetT(from + (size_t) left_offset_),
+ SequenceOffsetT(size() - to + (size_t) right_offset_));
+ }
+
+ SingleRead Substr(size_t from, size_t to) const {
+ size_t len = to - from;
+ if (len == size()) {
+ return *this;
+ }
+ if (len == 0) {
+ return SingleRead();
+ }
+ return SubstrStrict(from, to);
+ }
+
+ bool operator==(const SingleRead &singleread) const {
+ return seq_ == singleread.seq_;
+ }
+
+ void ChangeName(const std::string &new_name) {
+ name_ = new_name;
+ }
+
+ static bool IsValid(const std::string &seq) {
+ for (size_t i = 0; i < seq.size(); ++i) {
+ if (!is_nucl(seq[i])) {
+ return false;
+ }
+ }
+ return true;
+ }
+
+ SequenceOffsetT GetLeftOffset() const {
+ return left_offset_;
+ }
+
+ SequenceOffsetT GetRightOffset() const {
+ return right_offset_;
+ }
+
+ bool BinWrite(std::ostream &file, bool rc = false) const {
+ sequence(rc).BinWrite(file);
+ if (rc) {
+ file.write((const char *) &right_offset_, sizeof(right_offset_));
+ file.write((const char *) &left_offset_, sizeof(left_offset_));
+ } else {
+ file.write((const char *) &left_offset_, sizeof(left_offset_));
+ file.write((const char *) &right_offset_, sizeof(right_offset_));
+ }
+ return !file.fail();
+ }
+
+
+ void print_size() const {
+ std::cerr << size() << std::endl;
+ }
+
+
+private:
+ /*
+ * @variable The name of SingleRead in input file.
+ */
+ std::string name_;
+ /*
+ * @variable The sequence of nucleotides.
+ */
+ std::string seq_;
+ /*
+ * @variable The quality of SingleRead.
+ */
+ std::string qual_;
+
+ //Left and right offsets with respect to original sequence
+ SequenceOffsetT left_offset_;
+
+ SequenceOffsetT right_offset_;
+
+ /*
+ * @variable The flag of SingleRead correctness.
+ */
+ bool valid_;
+
+ void Init() {
+ VERIFY(seq_.size() == qual_.size());
+ valid_ = SingleRead::IsValid(seq_);
+ }
+
+};
+
+inline std::ostream &operator<<(std::ostream &os, const SingleRead &read) {
+ os << "Single read name=" << read.name() << " sequence=" << read.GetSequenceString() << std::endl;
+ return os;
+}
+
+class SingleReadSeq {
+
+public:
+ SingleReadSeq(const Sequence &s,
+ SequenceOffsetT left_offset = 0, SequenceOffsetT right_offset = 0) :
+ seq_(s), left_offset_(left_offset), right_offset_(right_offset) {
+ }
+
+ SingleReadSeq() : seq_(), left_offset_(0), right_offset_(0) {
+ }
+
+ bool BinRead(std::istream &file) {
+ seq_.BinRead(file);
+ file.read((char *) &left_offset_, sizeof(left_offset_));
+ file.read((char *) &right_offset_, sizeof(right_offset_));
+ return !file.fail();
+ }
+
+ bool BinWrite(std::ostream &file, bool rc = false) const {
+ if (rc)
+ (!seq_).BinWrite(file);
+ else
+ seq_.BinWrite(file);
+ if (rc) {
+ file.write((const char *) &right_offset_, sizeof(right_offset_));
+ file.write((const char *) &left_offset_, sizeof(left_offset_));
+ } else {
+ file.write((const char *) &left_offset_, sizeof(left_offset_));
+ file.write((const char *) &right_offset_, sizeof(right_offset_));
+ }
+ return !file.fail();
+ }
+
+ // SingleReadSeq(std::istream& file): seq_(file, true) {
+ // }
+
+ bool operator==(const SingleReadSeq &singleread) const {
+ return seq_ == singleread.seq_;
+ }
+
+ const Sequence sequence() const {
+ return seq_;
+ }
+
+ size_t size() const {
+ return seq_.size();
+ }
+
+ size_t nucl_count() const {
+ return size();
+ }
+
+ SingleReadSeq operator!() const {
+ return SingleReadSeq(!seq_);
+ }
+
+ SequenceOffsetT GetLeftOffset() const {
+ return left_offset_;
+ }
+
+ SequenceOffsetT GetRightOffset() const {
+ return right_offset_;
+ }
+
+private:
+ Sequence seq_;
+
+ //Left and right offsets with respect to original sequence
+ SequenceOffsetT left_offset_;
+
+ SequenceOffsetT right_offset_;
+};
+
+inline std::ostream &operator<<(std::ostream &os, const SingleReadSeq &read) {
+ os << "Single read sequence=" << read.sequence() << std::endl;
+ return os;
+}
+
+}
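A hypothetical snippet showing how the SingleRead operations above compose; the read name, bases and qualities are invented.

    #include "io/reads/single_read.hpp"

    void single_read_demo() {
        io::SingleRead r("read1", "ACGTAC", "IIIIII", io::PhredOffset);
        io::SingleRead rc  = !r;             // "read1_RC", reverse-complemented sequence "GTACGT"
        io::SingleRead sub = r.Substr(1, 4); // "read1_SUBSTR(1,4)", bases "CGT", offsets tracked
    }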
diff --git a/src/modules/io/reads_io/binary_converter.hpp b/src/modules/io/reads_io/binary_converter.hpp
new file mode 100644
index 0000000..7da965f
--- /dev/null
+++ b/src/modules/io/reads_io/binary_converter.hpp
@@ -0,0 +1,295 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+/*
+ * binary_io.hpp
+ *
+ * Created on: Apr 12, 2012
+ * Author: andrey
+ */
+
+#ifndef BINARY_IO_HPP_
+#define BINARY_IO_HPP_
+
+#include <fstream>
+
+#include "dev_support/verify.hpp"
+#include "ireader.hpp"
+#include "io/reads/single_read.hpp"
+#include "io/reads/paired_read.hpp"
+#include "pipeline/library.hpp"
+
+namespace io {
+
+template<class Read>
+class ReadBinaryWriter {
+
+public:
+
+ ReadBinaryWriter(LibraryOrientation /*orientation*/ = LibraryOrientation::Undefined) {
+ }
+
+ bool Write(std::ostream& file, const Read& r) const {
+ return r.BinWrite(file);
+ }
+};
+
+template<>
+class ReadBinaryWriter<PairedRead> {
+
+private:
+
+ bool rc1_;
+
+ bool rc2_;
+
+public:
+
+ ReadBinaryWriter(LibraryOrientation orientation) {
+ switch (orientation) {
+ case LibraryOrientation::FF: {
+ rc1_ = false;
+ rc2_ = false;
+ break;
+ }
+ case LibraryOrientation::RR: {
+ rc1_ = true;
+ rc2_ = true;
+ break;
+ }
+ case LibraryOrientation::FR: {
+ rc1_ = false;
+ rc2_ = true;
+ break;
+ }
+ case LibraryOrientation::RF: {
+ rc1_ = true;
+ rc2_ = false;
+ break;
+ }
+ default: {
+ rc1_ = false;
+ rc2_ = false;
+ break;
+ }
+ }
+
+ }
+
+ bool Write(std::ostream& file, const PairedRead& r) const {
+ return r.BinWrite(file, rc1_, rc2_);
+ }
+};
+
+
+class BinaryWriter {
+
+private:
+ const std::string file_name_prefix_;
+
+ size_t file_num_;
+
+ std::vector<std::ofstream*> file_ds_;
+
+ size_t buf_size_;
+
+ template<class Read>
+ void FlushBuffer(const std::vector<Read>& buffer, const ReadBinaryWriter<Read>& read_writer, std::ostream& file, size_t from, size_t to) {
+ for (size_t i = from; i < to; ++i) {
+ read_writer.Write(file, buffer[i]);
+ }
+ }
+
+ template<class Read>
+ void FlushBuffer(const std::vector<Read>& buffer, const ReadBinaryWriter<Read>& read_writer, std::ostream& file) {
+ FlushBuffer(buffer, read_writer, file, 0, buffer.size());
+ }
+
+ template<class Read>
+ ReadStreamStat ToBinary(io::ReadStream<Read>& stream, size_t buf_size,
+ LibraryOrientation orientation) {
+
+ ReadBinaryWriter<Read> read_writer(orientation);
+ size_t buffer_reads = buf_size / (sizeof (Read) * 4);
+ size_t reads_to_flush = buffer_reads * file_num_;
+
+ std::vector< std::vector<Read> > buf(file_num_, std::vector<Read>(buffer_reads) );
+ std::vector< ReadStreamStat > read_stats(file_num_);
+ std::vector< size_t > current_buf_sizes(file_num_, 0);
+ size_t read_count = 0;
+
+ for (size_t i = 0; i < file_num_; ++i) {
+ file_ds_[i]->seekp(0);
+ read_stats[i].write(*file_ds_[i]);
+ }
+
+ size_t buf_index;
+ while (!stream.eof()) {
+ buf_index = read_count % file_num_;
+
+ Read& r = buf[buf_index][current_buf_sizes[buf_index]];
+ stream >> r;
+ read_stats[buf_index].increase(r);
+
+ ++current_buf_sizes[buf_index];
+ VERBOSE_POWER(++read_count, " reads processed");
+
+ if (read_count % reads_to_flush == 0) {
+ for (size_t i = 0; i < file_num_; ++i) {
+ FlushBuffer(buf[i], read_writer, *file_ds_[i]);
+ current_buf_sizes[i] = 0;
+ }
+ }
+ }
+
+ ReadStreamStat result;
+ for (size_t i = 0; i < file_num_; ++i) {
+ buf[i].resize(current_buf_sizes[i]);
+ FlushBuffer(buf[i], read_writer, *file_ds_[i]);
+
+ file_ds_[i]->seekp(0);
+ read_stats[i].write(*file_ds_[i]);
+ result.merge(read_stats[i]);
+ }
+
+ INFO(read_count << " reads written");
+ return result;
+ }
+
+
+ template<class Read>
+ ReadStreamStat ToBinaryForThread(io::ReadStream<Read>& stream, size_t buf_size,
+ size_t thread_num, LibraryOrientation orientation) {
+
+ ReadBinaryWriter<Read> read_writer(orientation);
+ size_t buffer_reads = buf_size / (sizeof (Read) * 4);
+ std::vector<Read> buf(buffer_reads);
+
+ ReadStreamStat stat;
+ file_ds_[thread_num]->seekp(0);
+ stat.write(*file_ds_[thread_num]);
+
+ size_t current = 0;
+
+ while (!stream.eof()) {
+ Read& r = buf[current];
+ stream >> r;
+ stat.increase(r);
+ ++current;
+
+ if (stat.read_count_ % buffer_reads == 0) {
+ FlushBuffer(buf, read_writer, *file_ds_[thread_num]);
+ current = 0;
+ }
+ }
+
+ buf.resize(current);
+ FlushBuffer(buf, read_writer, *file_ds_[thread_num]);
+
+ file_ds_[thread_num]->seekp(0);
+ stat.write(*file_ds_[thread_num]);
+
+ return stat;
+ }
+
+
+public:
+
+ BinaryWriter(const std::string& file_name_prefix, size_t file_num,
+ size_t buf_size):
+ file_name_prefix_(file_name_prefix), file_num_(file_num),
+ file_ds_(), buf_size_(buf_size) {
+
+ std::string fname;
+ for (size_t i = 0; i < file_num_; ++i) {
+ fname = file_name_prefix_ + "_" + ToString(i) + ".seq";
+ file_ds_.push_back(new std::ofstream(fname, std::ios_base::binary));
+ }
+ }
+
+ ~BinaryWriter() {
+ for (size_t i = 0; i < file_num_; ++i) {
+ if (file_ds_[i]->is_open()) {
+ file_ds_[i]->close();
+ }
+ delete file_ds_[i];
+ }
+ }
+
+
+ ReadStreamStat ToBinary(io::ReadStream<io::SingleReadSeq>& stream) {
+ return ToBinary(stream, buf_size_ / file_num_, LibraryOrientation::Undefined);
+ }
+
+ ReadStreamStat ToBinary(io::ReadStream<io::SingleRead>& stream) {
+ return ToBinary(stream, buf_size_ / file_num_, LibraryOrientation::Undefined);
+ }
+
+ ReadStreamStat ToBinary(io::ReadStream<io::PairedReadSeq>& stream) {
+ return ToBinary(stream, buf_size_ / (2 * file_num_), LibraryOrientation::Undefined);
+ }
+
+ ReadStreamStat ToBinary(io::ReadStream<io::PairedRead>& stream, LibraryOrientation orientation) {
+ return ToBinary(stream, buf_size_ / (2 * file_num_), orientation);
+ }
+
+ ReadStreamStat ToBinaryForThread(io::ReadStream<io::SingleReadSeq>& stream, size_t thread_num) {
+ return ToBinaryForThread(stream, buf_size_ / file_num_, thread_num, LibraryOrientation::Undefined);
+ }
+
+ ReadStreamStat ToBinaryForThread(io::ReadStream<io::SingleRead>& stream, size_t thread_num) {
+ return ToBinaryForThread(stream, buf_size_ / file_num_, thread_num, LibraryOrientation::Undefined);
+ }
+
+ ReadStreamStat ToBinaryForThread(io::ReadStream<io::PairedReadSeq>& stream, size_t thread_num) {
+ return ToBinaryForThread(stream, buf_size_ / (2 * file_num_), thread_num, LibraryOrientation::Undefined);
+ }
+
+ ReadStreamStat ToBinaryForThread(io::ReadStream<io::PairedRead>& stream, size_t thread_num, LibraryOrientation orientation) {
+ return ToBinaryForThread(stream, buf_size_ / (2 * file_num_), thread_num, orientation);
+ }
+
+// template<class Read>
+// void WriteReads(std::vector<Read>& data) {
+// size_t chunk_size = data.size() / file_num_;
+// size_t last_chunk_size = chunk_size + data.size() % file_num_;
+//
+// for (size_t i = 0; i < file_num_ - 1; ++i) {
+// file_ds_[i]->write((const char *) &chunk_size, sizeof(chunk_size));
+// }
+// file_ds_.back()->write((const char *) &last_chunk_size, sizeof(last_chunk_size));
+//
+// size_t start_pos = 0;
+// for (size_t i = 0; i < file_num_ - 1; ++i, start_pos += chunk_size) {
+// FlushBuffer(data, *file_ds_[i], start_pos, start_pos + chunk_size);
+// }
+// FlushBuffer(data, file_ds_.back(), start_pos, data.size());
+// }
+//
+// template<class Read>
+// void WriteSeparatedReads(std::vector< std::vector<Read> >& data) {
+// if (data.size() != file_num_) {
+// WARN("Cannot write reads, number of vectors is not equal to thread number");
+// return;
+// }
+//
+// for (size_t i = 0; i < file_num_; ++i) {
+// size_t size = data[i].size();
+// file_ds_[i]->write((const char *) &size, sizeof(size));
+// }
+//
+// for (size_t i = 0; i < file_num_; ++i) {
+// FlushBuffer(data[i], *file_ds_[i]);
+// }
+// }
+};
+
+
+}
+
+
+#endif /* BINARY_IO_HPP_ */
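A sketch of the intended conversion flow, assuming an existing FASTQ file, a writable tmp/ directory and the FileReadStream declared further below in this patch; the path, chunk count and buffer size are placeholders.

    #include "io/reads_io/binary_converter.hpp"
    #include "io/reads_io/file_reader.hpp"

    void binary_convert_demo() {
        io::FileReadStream input("reads.fastq");                    // assumed to exist
        io::BinaryWriter writer("tmp/reads_binary", 16, 1 << 24);   // prefix, chunk files, buffer bytes
        io::ReadStreamStat stat = writer.ToBinary(input);
        // reads are now spread round-robin over tmp/reads_binary_<i>.seq, i in [0, 16)
    }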
diff --git a/src/modules/io/reads_io/binary_streams.hpp b/src/modules/io/reads_io/binary_streams.hpp
new file mode 100644
index 0000000..d7679f2
--- /dev/null
+++ b/src/modules/io/reads_io/binary_streams.hpp
@@ -0,0 +1,357 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include <fstream>
+
+#include "dev_support/verify.hpp"
+#include "ireader.hpp"
+#include "io/reads/single_read.hpp"
+#include "io/reads/paired_read.hpp"
+
+namespace io {
+
+// == Deprecated classes ==
+// Use FileReadStream and InsertSizeModifying instead
+
+class BinaryFileSingleStream: public PredictableReadStream<SingleReadSeq> {
+private:
+ std::ifstream stream_;
+ ReadStreamStat read_stat_;
+ size_t current_;
+
+public:
+
+ BinaryFileSingleStream(const std::string& file_name_prefix, size_t file_num) {
+ std::string fname;
+ fname = file_name_prefix + "_" + ToString(file_num) + ".seq";
+ stream_.open(fname.c_str(), std::ios_base::binary | std::ios_base::in);
+
+ reset();
+ }
+
+ virtual bool is_open() {
+ return stream_.is_open();
+ }
+
+ virtual bool eof() {
+ return current_ == read_stat_.read_count_;
+ }
+
+ virtual BinaryFileSingleStream& operator>>(SingleReadSeq& read) {
+ read.BinRead(stream_);
+ VERIFY(current_ < read_stat_.read_count_);
+
+ ++current_;
+ return *this;
+ }
+
+ virtual void close() {
+ current_ = 0;
+ stream_.close();
+ }
+
+ virtual void reset() {
+ stream_.clear();
+ stream_.seekg(0);
+ VERIFY(stream_.good());
+ read_stat_.read(stream_);
+ current_ = 0;
+ }
+
+ virtual size_t size() const {
+ return read_stat_.read_count_;
+ }
+
+ virtual ReadStreamStat get_stat() const {
+ return read_stat_;
+ }
+
+};
+
+class BinaryFilePairedStream: public PredictableReadStream<PairedReadSeq> {
+
+private:
+ std::ifstream stream_;
+
+ size_t insert_size_;
+
+ ReadStreamStat read_stat_;
+
+ size_t current_;
+
+
+public:
+
+ BinaryFilePairedStream(const std::string& file_name_prefix, size_t file_num, size_t insert_size): stream_(), insert_size_(insert_size) {
+ std::string fname;
+ fname = file_name_prefix + "_" + ToString(file_num) + ".seq";
+ stream_.open(fname.c_str(), std::ios_base::binary | std::ios_base::in);
+
+ reset();
+ }
+
+ virtual bool is_open() {
+ return stream_.is_open();
+ }
+
+ virtual bool eof() {
+ return current_ >= read_stat_.read_count_;
+ }
+
+ virtual BinaryFilePairedStream& operator>>(PairedReadSeq& read) {
+ read.BinRead(stream_, insert_size_);
+ VERIFY(current_ < read_stat_.read_count_);
+
+ ++current_;
+ return *this;
+ }
+
+ virtual void close() {
+ current_ = 0;
+ stream_.close();
+ }
+
+
+ virtual void reset() {
+ stream_.clear();
+ stream_.seekg(0);
+ VERIFY(stream_.good());
+ read_stat_.read(stream_);
+ current_ = 0;
+ }
+
+ virtual size_t size() const {
+ return read_stat_.read_count_;
+ }
+
+ ReadStreamStat get_stat() const {
+ ReadStreamStat stat = read_stat_;
+ stat.read_count_ *= 2;
+ return stat;
+ }
+};
+
+
+//template <class Read>
+//class FileReadStream: public io::PredictableIReader<Read> {
+//
+//private:
+// std::ifstream stream_;
+//
+// ReadStat read_stat_;
+//
+// size_t current_;
+//
+//public:
+//
+// FileReadStream(const std::string& file_name_prefix, size_t file_num) {
+// std::string fname;
+// fname = file_name_prefix + "_" + ToString(file_num) + ".seq";
+// stream_.open(fname.c_str(), std::ios_base::binary | std::ios_base::in);
+//
+// reset();
+// }
+//
+// virtual ~FileReadStream() {
+// if (stream_.is_open()) {
+// stream_.close();
+// }
+// }
+//
+// virtual bool is_open() {
+// return stream_.is_open();
+// }
+//
+// virtual bool eof() {
+// return current_ == read_stat_.read_count_;
+// }
+//
+// virtual FileReadStream& operator>>(Read& read) {
+// read.BinRead(stream_);
+// VERIFY(current_ < read_stat_.read_count_);
+//
+// ++current_;
+// return *this;
+// }
+//
+// virtual void close() {
+// current_ = 0;
+// stream_.close();
+// }
+//
+// virtual void reset() {
+// stream_.clear();
+// stream_.seekg(0);
+// VERIFY(stream_.good());
+// read_stat_.read(stream_);
+// current_ = 0;
+// }
+//
+// virtual size_t size() const {
+// return read_stat_.read_count_;
+// }
+//
+// virtual ReadStat get_stat() const {
+// return read_stat_;
+// }
+//};
+
+//template <class Read>
+//class ReadBufferedStream: public io::PredictableIReader<Read> {
+//
+//private:
+// std::vector<Read> * data_;
+//
+// ReadStat read_stat_;
+//
+// size_t current_;
+//
+//public:
+//
+// ReadBufferedStream(io::PredictableIReader<Read>& stream) {
+// read_stat_ = stream.get_stat();
+// data_ = new std::vector<Read>(read_stat_.read_count_);
+//
+// size_t i = 0;
+// while (!stream.eof()) {
+// stream >> (*data_)[i++];
+// }
+//
+// reset();
+// }
+//
+// virtual ~ReadBufferedStream() {
+// delete data_;
+// }
+//
+// virtual bool is_open() {
+// return true;
+// }
+//
+// virtual bool eof() {
+// return current_ == read_stat_.read_count_;
+// }
+//
+// virtual ReadBufferedStream& operator>>(Read& read) {
+// read = (*data_)[current_];
+// VERIFY(current_ < read_stat_.read_count_);
+//
+// ++current_;
+// return *this;
+// }
+//
+// virtual void close() {
+// current_ = 0;
+// }
+//
+// virtual void reset() {
+// current_ = 0;
+// }
+//
+// virtual size_t size() const {
+// return read_stat_.read_count_;
+// }
+//
+// virtual ReadStat get_stat() const {
+// return read_stat_;
+// }
+//};
+
+//class SeqSingleReadStreamWrapper: public Reader<SingleReadSeq> {
+//
+//private:
+// io::IReader<io::PairedReadSeq>& stream_;
+//
+// PairedReadSeq current_read_;
+//
+// bool is_read_;
+//
+//public:
+//
+// SeqSingleReadStreamWrapper(io::IReader<io::PairedReadSeq>& stream): stream_(stream), current_read_(), is_read_(false) {
+// }
+//
+// virtual ~SeqSingleReadStreamWrapper() {}
+//
+// virtual bool is_open() {
+// return stream_.is_open();
+// }
+//
+// virtual bool eof() {
+// return stream_.eof() && !is_read_;
+// }
+//
+// virtual SeqSingleReadStreamWrapper& operator>>(io::SingleReadSeq& read) {
+// if (!is_read_) {
+// stream_ >> current_read_;
+// read = current_read_.first();
+// } else {
+// read = current_read_.second();
+// }
+// is_read_ = !is_read_;
+// return *this;
+// }
+//
+// virtual void close() {
+// stream_.close();
+// }
+//
+// virtual void reset() {
+// stream_.reset();
+// is_read_ = false;
+// }
+//
+// virtual ReadStat get_stat() const {
+// return stream_.get_stat();
+// }
+//};
+
+//class InsertSizeModifyingWrapper: public io::IReader<io::PairedReadSeq> {
+//
+//private:
+// io::IReader<io::PairedReadSeq>& stream_;
+//
+// size_t insert_size_;
+//
+//public:
+//
+// InsertSizeModifyingWrapper(io::IReader<io::PairedReadSeq>& stream, size_t insert_size): stream_(stream), insert_size_(insert_size) {
+// }
+//
+// virtual ~InsertSizeModifyingWrapper() {
+// }
+//
+// virtual bool is_open() {
+// return stream_.is_open();
+// }
+//
+// virtual bool eof() {
+// return stream_.eof();
+// }
+//
+// virtual InsertSizeModifyingWrapper& operator>>(io::PairedReadSeq& read) {
+// stream_ >> read;
+// read.inc_insert_size(insert_size_);
+// return *this;
+// }
+//
+// virtual void close() {
+// stream_.close();
+// }
+//
+// virtual void reset() {
+// stream_.reset();
+// }
+//
+// virtual ReadStat get_stat() const {
+// return stream_.get_stat();
+// }
+//};
+
+}
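A minimal sketch of reading one chunk back with BinaryFileSingleStream; the prefix matches the hypothetical BinaryWriter example earlier and is not part of the patch.

    #include "io/reads_io/binary_streams.hpp"

    void binary_read_demo() {
        io::BinaryFileSingleStream chunk("tmp/reads_binary", 0);  // opens tmp/reads_binary_0.seq
        io::SingleReadSeq read;
        size_t total_len = 0;
        while (!chunk.eof()) {
            chunk >> read;
            total_len += read.size();
        }
    }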
diff --git a/src/modules/io/reads_io/careful_filtering_reader_wrapper.hpp b/src/modules/io/reads_io/careful_filtering_reader_wrapper.hpp
new file mode 100644
index 0000000..188ba6b
--- /dev/null
+++ b/src/modules/io/reads_io/careful_filtering_reader_wrapper.hpp
@@ -0,0 +1,183 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+#pragma once
+//todo rename file
+#include "io/reads_io/delegating_reader_wrapper.hpp"
+#include "pipeline/library.hpp"
+
+namespace io {
+
+const size_t none = -1ul;
+
+inline std::pair<size_t, size_t> LongestValidCoords(const SingleRead& r) {
+ size_t best_len = 0;
+ size_t best_pos = none;
+ size_t pos = none;
+ std::string seq = r.GetSequenceString();
+ for (size_t i = 0; i <= seq.size(); ++i) {
+ if (i < seq.size() && is_nucl(seq[i])) {
+ if (pos == none) {
+ pos = i;
+ }
+ } else {
+ if (pos != none) {
+ size_t len = i - pos;
+ if (len > best_len) {
+ best_len = len;
+ best_pos = pos;
+ }
+ }
+ pos = none;
+ }
+ }
+ if (best_len == 0) {
+ return std::make_pair(0, 0);
+ }
+ return std::make_pair(best_pos, best_pos + best_len);
+}
+
+inline SingleRead LongestValid(const SingleRead& r,
+ bool /*use_orientation*/ = false,
+ LibraryOrientation /*orientation*/ = LibraryOrientation::FR) {
+
+ std::pair<size_t, size_t> p = LongestValidCoords(r);
+ return r.Substr(p.first, p.second);
+}
+
+inline PairedRead LongestValid(const PairedRead& r,
+ bool use_orientation = false,
+ LibraryOrientation orientation = LibraryOrientation::FR) {
+ std::pair<size_t, size_t> c1 = LongestValidCoords(r.first());
+ std::pair<size_t, size_t> c2 = LongestValidCoords(r.second());
+ size_t len1 = c1.second - c1.first;
+ size_t len2 = c2.second - c2.first;
+ if (len1 == 0 || len2 == 0) {
+ return PairedRead();
+ }
+ if (len1 == r.first().size() && len2 == r.second().size()) {
+ return r;
+ }
+
+ size_t is;
+ if (!use_orientation) {
+ is = r.insert_size() - c1.first - r.second().size() + c2.second;
+ }
+ else {
+ switch (orientation) {
+ case LibraryOrientation::FF: {
+ is = r.insert_size() - c1.first - r.second().size() + c2.second;
+ break;
+ }
+ case LibraryOrientation::RR: {
+ is = r.insert_size() - r.first().size() + c1.second - c2.first;
+ break;
+ }
+ case LibraryOrientation::FR: {
+ is = r.insert_size() - c1.first - c2.first;
+ break;
+ }
+ case LibraryOrientation::RF: {
+ is = r.insert_size() - r.first().size() + c1.second - r.second().size() + c2.second;
+ break;
+ }
+ default: {
+ is = r.insert_size() - c1.first - r.second().size() + c2.second;
+ break;
+ }
+ }
+ }
+
+ return PairedRead(r.first().Substr(c1.first, c1.second), r.second().Substr(c2.first, c2.second), is);
+}
+
+
+//todo rewrite without eof
+template<typename ReadType>
+class CarefulFilteringWrapper : public DelegatingWrapper<ReadType> {
+ typedef DelegatingWrapper<ReadType> base;
+public:
+ /*
+ * Constructor.
+ *
+ * @param reader_ptr Pointer to any other read stream (child of ReadStream).
+ */
+ CarefulFilteringWrapper(typename base::ReadStreamPtrT reader_ptr,
+ bool use_orientation = false,
+ LibraryOrientation orientation = LibraryOrientation::Undefined) :
+ base(reader_ptr),
+ eof_(false),
+ use_orientation_(use_orientation),
+ orientation_(orientation) {
+ StepForward();
+ }
+
+ /* virtual */ bool eof() {
+ return eof_;
+ }
+
+ /*
+ * Read the next filtered read from the stream.
+ *
+ * @param read The read that will store the data.
+ *
+ * @return Reference to this stream.
+ */
+ /* virtual */ CarefulFilteringWrapper& operator>>(ReadType& read) {
+ read = next_read_;
+ StepForward();
+ return *this;
+ }
+
+ /* virtual */
+ void reset() {
+ base::reset();
+ eof_ = false;
+ StepForward();
+ }
+
+private:
+ bool eof_;
+ bool use_orientation_;
+ LibraryOrientation orientation_;
+ ReadType next_read_;
+
+ /*
+ * Read next valid read in the stream.
+ */
+ void StepForward() {
+ while (!base::eof()) {
+ base::operator >>(next_read_);
+ next_read_ = LongestValid(next_read_, use_orientation_, orientation_);
+ if (next_read_.IsValid()) {
+ return;
+ }
+ }
+ eof_ = true;
+ }
+};
+
+template<class ReadType>
+std::shared_ptr<ReadStream<ReadType>> CarefulFilteringWrap(std::shared_ptr<ReadStream<ReadType>> reader_ptr,
+ bool use_orientation = false,
+ LibraryOrientation orientation = LibraryOrientation::Undefined) {
+ //return reader_ptr = make_shared<CarefulFilteringWrapper<ReadType>>(reader_ptr, false, LibraryOrientation::Undefined);
+ return std::shared_ptr<CarefulFilteringWrapper<ReadType> >(
+ new CarefulFilteringWrapper<ReadType>(reader_ptr, use_orientation, orientation));
+}
+
+template<class ReadType>
+ReadStreamList<ReadType> CarefulFilteringWrap(const ReadStreamList<ReadType>& readers,
+ bool use_orientation = false,
+ LibraryOrientation orientation = LibraryOrientation::Undefined) {
+ ReadStreamList<ReadType> answer;
+ for (size_t i = 0; i < readers.size(); ++i) {
+ answer.push_back(CarefulFilteringWrap<ReadType>(readers.ptr_at(i), use_orientation, orientation));
+ }
+ return answer;
+}
+
+}
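A hypothetical example of the wrapping helper above: keep only the longest N-free fragment of every read coming from a plain file stream. The file name is invented; FileReadStream is the stream declared later in this patch.

    #include <memory>
    #include "io/reads_io/careful_filtering_reader_wrapper.hpp"
    #include "io/reads_io/file_reader.hpp"

    void careful_filtering_demo() {
        auto raw      = std::make_shared<io::FileReadStream>("reads.fastq");
        auto filtered = io::CarefulFilteringWrap<io::SingleRead>(raw);
        io::SingleRead r;
        while (!filtered->eof()) {
            *filtered >> r;   // r is the longest valid (ACGT-only) fragment of the original read
        }
    }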
diff --git a/src/modules/io/reads_io/converting_reader_wrapper.hpp b/src/modules/io/reads_io/converting_reader_wrapper.hpp
new file mode 100644
index 0000000..2f40fd7
--- /dev/null
+++ b/src/modules/io/reads_io/converting_reader_wrapper.hpp
@@ -0,0 +1,121 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "read_stream_vector.hpp"
+
+namespace io {
+
+/**
+* SquashingWrapper is a wrapper that reads single reads from a paired
+* read stream, returning the first and the second read of each pair
+* one after another.
+*/
+template<class PairedReadType>
+class SquashingWrapper : public ReadStream<typename PairedReadType::SingleReadT> {
+ typedef typename PairedReadType::SingleReadT SingleReadT;
+ typedef std::shared_ptr<ReadStream<PairedReadType>> PairedReaderPtrT;
+public:
+
+ explicit SquashingWrapper(PairedReaderPtrT reader)
+ : reader_(reader), pairedread_(), index_(0) {
+ }
+
+ /*
+ * Check whether the stream is opened.
+ *
+ * @return true if the stream is opened and false otherwise.
+ */
+ /* virtual */ bool is_open() {
+ return reader_->is_open();
+ }
+
+ /*
+ * Check whether we've reached the end of stream.
+ *
+ * @return true if the end of the stream is reached and false
+ * otherwise.
+ */
+ /* virtual */ bool eof() {
+ return (index_ == 0) && (reader_->eof());
+ }
+
+ /*
+ * Read SingleRead from stream (which is actually the part of
+ * PairedRead from stream).
+ *
+ * @param singleread The SingleRead that will store read data.
+ *
+ * @return Reference to this stream.
+ */
+ /* virtual */ SquashingWrapper &operator>>(
+ SingleReadT &singleread) {
+ if (index_ == 0) {
+ (*reader_) >> pairedread_;
+ }
+ singleread = pairedread_[index_];
+ index_ = 1 - index_;
+ return (*this);
+ }
+
+ /*
+ * Close the stream.
+ */
+ /* virtual */ void close() {
+ reader_->close();
+ }
+
+ /*
+ * Close the stream and open it again.
+ */
+ /* virtual */ void reset() {
+ index_ = 0;
+ reader_->reset();
+ }
+
+ ReadStreamStat get_stat() const {
+ return reader_->get_stat();
+ }
+
+private:
+ /*
+ * @variable Internal stream reader.
+ */
+ PairedReaderPtrT reader_;
+ /*
+ * @variable Element that stores the last read PairedRead from
+ * stream.
+ */
+ PairedReadType pairedread_;
+ /*
+ * @variable Index of current part of PairedRead.
+ */
+ size_t index_;
+
+};
+
+template<class PairedReadType>
+std::shared_ptr<ReadStream<typename PairedReadType::SingleReadT>> SquashingWrap(
+ std::shared_ptr<ReadStream<PairedReadType>> reader_ptr) {
+ return std::make_shared<SquashingWrapper<PairedReadType>>(reader_ptr);
+}
+
+template<class PairedReadType>
+ReadStreamList<typename PairedReadType::SingleReadT> SquashingWrap(ReadStreamList<PairedReadType> &readers) {
+ ReadStreamList<typename PairedReadType::SingleReadT> answer;
+ for (size_t i = 0; i < readers.size(); ++i) {
+ answer.push_back(SquashingWrap<PairedReadType>(readers.ptr_at(i)));
+ }
+ return answer;
+}
+
+//template<class ReaderPtrType>
+//std::shared_ptr<Reader<typename ReaderPtrType::element_type::ReadT::SingleReadT>> SquashingWrap(ReaderPtrType reader_ptr) {
+// return std::make_shared<SquashingWrapper<typename ReaderPtrType::element_type::ReadT>>(reader_ptr);
+//}
+}
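A sketch of the squashing helper, assuming PairedRead exposes SingleRead as its SingleReadT (analogous to PairedReadSeq above) and using the PairedEasyStream helper defined later in this patch; the file names and insert size are placeholders.

    #include "io/reads_io/converting_reader_wrapper.hpp"
    #include "io/reads_io/io_helper.hpp"

    void squashing_demo() {
        auto pairs   = io::PairedEasyStream("left.fastq", "right.fastq",
                                            /*followed_by_rc*/ false, /*insert_size*/ 250);
        auto singles = io::SquashingWrap<io::PairedRead>(pairs);
        io::SingleRead mate;
        while (!singles->eof()) {
            *singles >> mate;   // yields first mate, then second mate, then the next pair
        }
    }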
diff --git a/src/include/io/cutting_reader_wrapper.hpp b/src/modules/io/reads_io/cutting_reader_wrapper.hpp
similarity index 100%
rename from src/include/io/cutting_reader_wrapper.hpp
rename to src/modules/io/reads_io/cutting_reader_wrapper.hpp
diff --git a/src/modules/io/reads_io/delegating_reader_wrapper.hpp b/src/modules/io/reads_io/delegating_reader_wrapper.hpp
new file mode 100644
index 0000000..b63043a
--- /dev/null
+++ b/src/modules/io/reads_io/delegating_reader_wrapper.hpp
@@ -0,0 +1,64 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "ireader.hpp"
+
+namespace io {
+
+//todo rename file
+template<typename ReadType>
+class DelegatingWrapper: public ReadStream<ReadType> {
+public:
+ typedef std::shared_ptr<ReadStream<ReadType>> ReadStreamPtrT;
+
+ explicit DelegatingWrapper(ReadStreamPtrT reader) : reader_(reader) {}
+
+
+ /* virtual */ bool is_open() {
+ return reader_->is_open();
+ }
+
+ /* virtual */ bool eof() {
+ return reader_->eof();
+ }
+
+ /* virtual */ DelegatingWrapper& operator>>(ReadType& read) {
+ (*reader_) >> read;
+ return *this;
+ }
+
+ /* virtual */
+ void close() {
+ reader_->close();
+ }
+
+ /*
+ * Close the stream and open it again.
+ */
+ /* virtual */
+ void reset() {
+ reader_->reset();
+ }
+
+ /* virtual */
+ ReadStreamStat get_stat() const {
+ return reader_->get_stat();
+ }
+
+protected:
+ ReadStream<ReadType>& reader() {
+ return *reader_;
+ }
+
+private:
+ ReadStreamPtrT reader_;
+
+};
+
+}
diff --git a/src/modules/io/reads_io/easy_reader.hpp b/src/modules/io/reads_io/easy_reader.hpp
new file mode 100644
index 0000000..98df7fb
--- /dev/null
+++ b/src/modules/io/reads_io/easy_reader.hpp
@@ -0,0 +1,122 @@
+////***************************************************************************
+////* Copyright (c) 2011-2014 Saint-Petersburg Academic University
+////* All Rights Reserved
+////* See file LICENSE for details.
+////****************************************************************************
+//
+//#pragma once
+//
+//#include "ireader.hpp"
+//#include "paired_readers.hpp"
+//#include "delegating_reader_wrapper.hpp"
+//#include "splitting_wrapper.hpp"
+//#include "rc_reader_wrapper.hpp"
+//#include "filtering_reader_wrapper.hpp"
+//#include "careful_filtering_reader_wrapper.hpp"
+//#include "single_read.hpp"
+//#include "io_helper.hpp"
+//
+//#include <memory>
+//
+//namespace io {
+//
+//////todo refactor, and maybe merge them once again
+////class EasyReader: public DelegatingReaderWrapper<SingleRead> {
+//// explicit EasyReader(const EasyReader& reader);
+//// void operator=(const EasyReader& reader);
+////
+//// Reader raw_reader_;
+////// FilteringReaderWrapper<ReadType> filtered_reader_;
+//// CarefulFilteringReaderWrapper<SingleRead> filtered_reader_;
+//// RCReaderWrapper<SingleRead> rc_reader_;
+////
+////public:
+//// explicit EasyReader(const string& filename,
+//// bool followed_by_rc, OffsetType offset_type = PhredOffset) :
+//// raw_reader_(filename, offset_type), filtered_reader_(raw_reader_), rc_reader_(
+//// filtered_reader_) {
+//// if (followed_by_rc) {
+//// Init(rc_reader_);
+//// } else {
+//// Init(filtered_reader_);
+//// }
+//// }
+////
+//// /*
+//// * Default destructor.
+//// */
+//// /* virtual */
+//// ~EasyReader() {
+//// }
+////
+////};
+////
+//////todo refactor, and maybe merge them once again
+////class EasySplittingReader: public DelegatingReaderWrapper<io::SingleRead> {
+//// explicit EasySplittingReader(const EasySplittingReader& reader);
+//// void operator=(const EasySplittingReader& reader);
+////
+//// Reader raw_reader_;
+////// FilteringReaderWrapper<ReadType> filtered_reader_;
+//// SplittingWrapper splitting_reader_;
+//// RCReaderWrapper<io::SingleRead> rc_reader_;
+////
+////public:
+//// explicit EasySplittingReader(const io::SingleRead::FilenameType& filename,
+//// bool followed_by_rc, OffsetType offset_type = PhredOffset) :
+//// raw_reader_(filename, offset_type), splitting_reader_(raw_reader_), rc_reader_(
+//// splitting_reader_) {
+//// if (followed_by_rc) {
+//// Init(rc_reader_);
+//// } else {
+//// Init(splitting_reader_);
+//// }
+//// }
+////
+//// /*
+//// * Default destructor.
+//// */
+//// /* virtual */
+//// ~EasySplittingReader() {
+//// }
+////
+////};
+//
+////class PairedEasyReader: public DelegatingReaderWrapper<io::PairedRead> {
+//// std::unique_ptr<IReader<io::PairedRead>> raw_reader_;
+//// CarefulFilteringReaderWrapper<io::PairedRead> filtered_reader_;
+//// RCReaderWrapper<io::PairedRead> rc_reader_;
+////
+////public:
+//// PairedEasyReader(const io::PairedRead::FilenamesType& filenames,
+//// bool followed_by_rc, size_t insert_size, bool change_read_order =
+//// false, bool use_orientation = true, LibraryOrientation orientation = LibraryOrientation::FR,
+//// OffsetType offset_type = PhredOffset) :
+//// raw_reader_(
+//// new SeparateReader(filenames, insert_size,
+//// change_read_order, use_orientation, orientation, offset_type)), filtered_reader_(
+//// *raw_reader_), rc_reader_(filtered_reader_) {
+//// if (followed_by_rc) {
+//// Init(rc_reader_);
+//// } else {
+//// Init(filtered_reader_);
+//// }
+//// }
+////
+//// PairedEasyReader(const std::string& filename, bool followed_by_rc,
+//// size_t insert_size, bool change_read_order = false,
+//// bool use_orientation = true, LibraryOrientation orientation = LibraryOrientation::FR,
+//// OffsetType offset_type = PhredOffset) :
+//// raw_reader_(
+//// new MixedReader(filename, insert_size, change_read_order,
+//// use_orientation, orientation, offset_type)), filtered_reader_(
+//// *raw_reader_), rc_reader_(filtered_reader_) {
+//// if (followed_by_rc) {
+//// Init(rc_reader_);
+//// } else {
+//// Init(filtered_reader_);
+//// }
+//// }
+////};
+//
+//}
diff --git a/src/modules/io/reads_io/fasta_fastq_gz_parser.hpp b/src/modules/io/reads_io/fasta_fastq_gz_parser.hpp
new file mode 100644
index 0000000..7cb42c0
--- /dev/null
+++ b/src/modules/io/reads_io/fasta_fastq_gz_parser.hpp
@@ -0,0 +1,165 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+/**
+ * @file fasta_fastq_gz_parser.hpp
+ * @author Mariya Fomkina
+ * @version 1.0
+ *
+ * @section LICENSE
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of
+ * the License, or (at your option) any later version.
+ *
+ * @section DESCRIPTION
+ *
+ * FastaFastqGzParser is the parser stream that reads data from .fastq.gz
+ * files.
+ */
+
+#ifndef COMMON_IO_FASTAFASTQGZPARSER_HPP
+#define COMMON_IO_FASTAFASTQGZPARSER_HPP
+
+#include <zlib.h>
+#include <string>
+#include "kseq/kseq.h"
+#include "dev_support/verify.hpp"
+#include "io/reads/single_read.hpp"
+#include "io/reads_io/parser.hpp"
+#include "data_structures/sequence/quality.hpp"
+#include "data_structures/sequence/nucl.hpp"
+
+namespace io {
+
+namespace fastafastqgz {
+// STEP 1: declare the type of file handler and the read() function
+KSEQ_INIT(gzFile, gzread)
+}
+
+class FastaFastqGzParser: public Parser {
+public:
+ /*
+ * Constructor.
+ *
+ * @param filename The name of the file to be opened.
+ * @param offset_type The offset type of the read quality values.
+ */
+ FastaFastqGzParser(const std::string& filename, OffsetType offset_type =
+ PhredOffset) :
+ Parser(filename, offset_type), fp_(), seq_(NULL) {
+ open();
+ }
+
+ /*
+ * Default destructor.
+ */
+ /* virtual */
+ ~FastaFastqGzParser() {
+ close();
+ }
+
+ /*
+ * Read SingleRead from stream.
+ *
+ * @param read The SingleRead that will store read data.
+ *
+ * @return Reference to this stream.
+ */
+ /* virtual */
+ FastaFastqGzParser& operator>>(SingleRead& read) {
+ if (!is_open_ || eof_) {
+ return *this;
+ }
+ //todo offset_type_ should be used in future
+ if (seq_->qual.s) {
+ read = SingleRead(seq_->name.s, seq_->seq.s, seq_->qual.s, offset_type_);
+ } else {
+ read = SingleRead(seq_->name.s, seq_->seq.s);
+// size_t len = strlen(seq_->seq.s);
+// char* qual = (char*) malloc(len + 1);
+// char q = '\2' + 64;
+// for (size_t i = 0; i < len; ++i) {
+// qual[i] = q;
+// }
+// qual[len] = '\0';
+// read.SetAll(seq_->name.s, seq_->seq.s, qual, SolexaOffset);
+// free(qual);
+ }
+ ReadAhead();
+ return *this;
+ }
+
+ /*
+ * Close the stream.
+ */
+ /* virtual */
+ void close() {
+ if (is_open_) {
+ // STEP 5: destroy seq
+ fastafastqgz::kseq_destroy(seq_);
+ // STEP 6: close the file handler
+ gzclose(fp_);
+ is_open_ = false;
+ eof_ = true;
+ }
+ }
+
+private:
+ /*
+ * @variable File that is associated with gzipped data file.
+ */
+ gzFile fp_;
+ /*
+ * @variable Data element that stores last SingleRead got from
+ * stream.
+ */
+ fastafastqgz::kseq_t* seq_;
+
+ /*
+ * Open a stream.
+ */
+ /* virtual */
+ void open() {
+ // STEP 2: open the file handler
+ fp_ = gzopen(filename_.c_str(), "r");
+ if (!fp_) {
+ is_open_ = false;
+ return;
+ }
+ // STEP 3: initialize seq
+ seq_ = fastafastqgz::kseq_init(fp_);
+ eof_ = false;
+ is_open_ = true;
+ ReadAhead();
+ }
+
+ /*
+ * Read next SingleRead from file.
+ */
+ void ReadAhead() {
+ VERIFY(is_open_);
+ VERIFY(!eof_);
+ if (fastafastqgz::kseq_read(seq_) < 0) {
+ eof_ = true;
+ }
+ }
+
+ /*
+ * Hidden copy constructor.
+ */
+ FastaFastqGzParser(const FastaFastqGzParser& parser);
+ /*
+ * Hidden assign operator.
+ */
+ void operator=(const FastaFastqGzParser& parser);
+};
+
+}
+
+#endif /* COMMON_IO_FASTAFASTQGZPARSER_HPP */
diff --git a/src/modules/io/reads_io/file_reader.hpp b/src/modules/io/reads_io/file_reader.hpp
new file mode 100644
index 0000000..c9152d0
--- /dev/null
+++ b/src/modules/io/reads_io/file_reader.hpp
@@ -0,0 +1,129 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+/**
+
+* Reader<SingleRead> is the basic class that reads from one file
+* through a Parser object.
+* Reader<PairedRead> is the class that reads data from two input
+* files and builds PairedReads from this data and distance information.
+*/
+
+#pragma once
+
+#include "ireader.hpp"
+#include "io/reads/single_read.hpp"
+#include "parser.hpp"
+#include "dev_support/path_helper.hpp"
+
+namespace io {
+
+class FileReadStream : public ReadStream<SingleRead> {
+public:
+ /*
+ * Constructor.
+ *
+ * @param filename The name of the file to be opened.
+ * @param offset_type The quality offset type of the reads.
+ */
+ explicit FileReadStream(const std::string &filename,
+ OffsetType offset_type = PhredOffset)
+ : filename_(filename), offset_type_(offset_type), parser_(NULL) {
+ path::CheckFileExistenceFATAL(filename_);
+ parser_ = SelectParser(filename_, offset_type_);
+ }
+
+ /*
+ * Default destructor.
+ */
+ /* virtual */ ~FileReadStream() {
+ close();
+ delete parser_;
+ }
+
+ /*
+ * Check whether the stream is opened.
+ *
+ * @return true if the stream is opened and false otherwise.
+ */
+ /* virtual */ bool is_open() {
+ if (parser_ != NULL) {
+ return parser_->is_open();
+ } else {
+ return false;
+ }
+ }
+
+ /*
+ * Check whether we've reached the end of stream.
+ *
+ * @return true if the end of stream is reached and false
+ * otherwise.
+ */
+ /* virtual */ bool eof() {
+ if (parser_ != NULL) {
+ return parser_->eof();
+ } else {
+ return true;
+ }
+ }
+
+ /*
+ * Read SingleRead from stream.
+ *
+ * @param singleread The SingleRead that will store read data.
+ *
+ * @return Reference to this stream.
+ */
+ /* virtual */ FileReadStream &operator>>(SingleRead &singleread) {
+ if (parser_ != NULL) {
+ (*parser_) >> singleread;
+ }
+ return *this;
+ }
+
+ /*
+ * Close the stream.
+ */
+ /* virtual */ void close() {
+ if (parser_ != NULL) {
+ parser_->close();
+ }
+ }
+
+ /*
+ * Close the stream and open it again.
+ */
+ /* virtual */ void reset() {
+ if (parser_ != NULL) {
+ parser_->reset();
+ }
+ }
+
+ ReadStreamStat get_stat() const {
+ return ReadStreamStat();
+ }
+
+private:
+ /*
+ * @variable The name of the file which stream reads from.
+ */
+ std::string filename_;
+ /*
+ * @variable Quality offset type.
+ */
+ OffsetType offset_type_;
+ /*
+ * @variable Internal stream that reads from file.
+ */
+ Parser *parser_;
+
+};
+
+}
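A minimal read loop over FileReadStream; the file name and offset type are illustrative only.

    #include "io/reads_io/file_reader.hpp"

    void file_read_demo() {
        io::FileReadStream stream("contigs.fasta", io::PhredOffset);
        io::SingleRead r;
        size_t n = 0;
        while (!stream.eof()) {
            stream >> r;
            ++n;   // n counts the records in contigs.fasta
        }
    }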
diff --git a/src/modules/io/reads_io/filtering_reader_wrapper.hpp b/src/modules/io/reads_io/filtering_reader_wrapper.hpp
new file mode 100644
index 0000000..e038b4b
--- /dev/null
+++ b/src/modules/io/reads_io/filtering_reader_wrapper.hpp
@@ -0,0 +1,148 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+/**
+ * @file filtering_reader_wrapper.hpp
+ * @author Sergey Nurk
+ * @version 1.0
+ *
+ * @section LICENSE
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of
+ * the License, or (at your option) any later version.
+ *
+ * @section DESCRIPTION
+ *
+ * FilteringReaderWrapper is the class-wrapper that gets only valid
+ * reads.
+ */
+
+#ifndef COMMON_IO_FILTERINGREADERWRAPPER_HPP_
+#define COMMON_IO_FILTERINGREADERWRAPPER_HPP_
+
+#include "io/ireader.hpp"
+
+namespace io {
+
+template<typename ReadType>
+class FilteringReaderWrapper: public IReader<ReadType> {
+public:
+ /*
+ * Constructor.
+ *
+ * @param reader Reference to any other reader (child of IReader).
+ */
+ explicit FilteringReaderWrapper(IReader<ReadType>& reader) :
+ reader_(reader), eof_(false) {
+ StepForward();
+ }
+
+ /*
+ * Default destructor.
+ */
+ /* virtual */ ~FilteringReaderWrapper() {
+ close();
+ }
+
+ /*
+ * Check whether the stream is opened.
+ *
+ * @return true if the stream is opened and false otherwise.
+ */
+ /* virtual */ bool is_open() {
+ return reader_.is_open();
+ }
+
+ /*
+ * Check whether we've reached the end of stream.
+ *
+ * @return true if the end of stream is reached and false
+ * otherwise.
+ */
+ /* virtual */ bool eof() {
+ return eof_;
+ }
+
+ /*
+ * Read SingleRead or PairedRead from stream (according to ReadType).
+ *
+ * @param read The SingleRead or PairedRead that will store read
+ * data.
+ *
+ * @return Reference to this stream.
+ */
+ /* virtual */ FilteringReaderWrapper& operator>>(ReadType& read) {
+ read = next_read_;
+ StepForward();
+ return *this;
+ }
+
+ /*
+ * Close the stream.
+ */
+ /* virtual */
+ void close() {
+ reader_.close();
+ }
+
+ /*
+ * Close the stream and open it again.
+ */
+ /* virtual */
+ void reset() {
+ reader_.reset();
+ eof_ = false;
+ StepForward();
+ }
+
+ ReadStat get_stat() const {
+ return reader_.get_stat();
+ }
+
+private:
+ /*
+ * @variable Internal stream reader.
+ */
+ IReader<ReadType>& reader_;
+ /*
+ * @variable Flag that shows whether the end of stream reached.
+ */
+ bool eof_;
+ /*
+ * @variable Next read to be outputted by stream.
+ */
+ ReadType next_read_;
+
+ /*
+ * Read next valid read in the stream.
+ */
+ void StepForward() {
+ while (!reader_.eof()) {
+ reader_ >> next_read_;
+ if (next_read_.IsValid()) {
+ return;
+ }
+ }
+ eof_ = true;
+ }
+
+ /*
+ * Hidden copy constructor.
+ */
+ explicit FilteringReaderWrapper(
+ const FilteringReaderWrapper<ReadType>& reader);
+ /*
+ * Hidden assign operator.
+ */
+ void operator=(const FilteringReaderWrapper<ReadType>& reader);
+};
+
+}
+
+#endif /* COMMON_IO_FILTERINGREADERWRAPPER_HPP_ */
diff --git a/src/modules/io/reads_io/io_helper.hpp b/src/modules/io/reads_io/io_helper.hpp
new file mode 100644
index 0000000..2f42348
--- /dev/null
+++ b/src/modules/io/reads_io/io_helper.hpp
@@ -0,0 +1,118 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "read_stream_vector.hpp"
+#include "io/reads/single_read.hpp"
+#include "io/reads/paired_read.hpp"
+#include "file_reader.hpp"
+#include "paired_readers.hpp"
+#include "binary_streams.hpp"
+#include "multifile_reader.hpp"
+#include "converting_reader_wrapper.hpp"
+#include "careful_filtering_reader_wrapper.hpp"
+#include "rc_reader_wrapper.hpp"
+
+namespace io {
+ typedef ReadStream<SingleRead> SingleStream;
+ typedef std::shared_ptr<SingleStream> SingleStreamPtr;
+ typedef ReadStreamList<SingleRead> SingleStreams;
+
+ typedef ReadStream<PairedRead> PairedStream;
+ typedef std::shared_ptr<PairedStream> PairedStreamPtr;
+ typedef ReadStreamList<PairedRead> PairedStreams;
+
+ typedef ReadStream<SingleReadSeq> BinarySingleStream;
+ typedef std::shared_ptr<BinarySingleStream> BinarySingleStreamPtr;
+ typedef ReadStreamList<SingleReadSeq> BinarySingleStreams;
+
+ typedef ReadStream<PairedReadSeq> BinaryPairedStream;
+ typedef std::shared_ptr<BinaryPairedStream> BinaryPairedStreamPtr;
+ typedef ReadStreamList<PairedReadSeq> BinaryPairedStreams;
+
+ //old
+// typedef io::IReader<io::SingleReadSeq> SequenceSingleReadStream;
+// typedef io::IReader<io::PairedReadSeq> SequencePairedReadStream;
+// typedef io::MultifileReader<io::PairedRead> MultiPairedStream;
+// typedef io::MultifileReader<io::SingleRead> MultiSingleStream;
+
+ inline BinarySingleStreams apply_single_wrappers(bool followed_by_rc,
+ BinarySingleStreams& single_readers,
+ BinaryPairedStreams* paired_readers = 0) {
+ VERIFY(single_readers.size() != 0);
+ BinarySingleStreams readers = single_readers;
+
+ if (paired_readers != 0) {
+ VERIFY(single_readers.size() == paired_readers->size());
+ BinarySingleStreams squashed_paired = SquashingWrap<PairedReadSeq>(*paired_readers);
+ readers = WrapPairsInMultifiles<SingleReadSeq>(squashed_paired, readers);
+ }
+
+ if (followed_by_rc) {
+ readers = RCWrap<SingleReadSeq>(readers);
+ }
+ return readers;
+ }
+
+ //todo make deprecated
+ inline BinaryPairedStreams apply_paired_wrappers(bool followed_by_rc,
+ BinaryPairedStreams& readers) {
+ VERIFY(readers.size() != 0);
+ if (followed_by_rc) {
+ return RCWrap<PairedReadSeq>(readers);
+ } else {
+ return readers;
+ }
+ }
+
+ inline SingleStreamPtr EasyStream(const std::string& filename, bool followed_by_rc,
+ bool handle_Ns = true, OffsetType offset_type = PhredOffset) {
+ SingleStreamPtr reader = make_shared<FileReadStream>(filename, offset_type);
+ if (handle_Ns) {
+ reader = CarefulFilteringWrap<SingleRead>(reader);
+ }
+ if (followed_by_rc) {
+ reader = RCWrap<SingleRead>(reader);
+ }
+ return reader;
+ }
+
+ inline PairedStreamPtr WrapPairedStream(PairedStreamPtr reader,
+ bool followed_by_rc,
+ bool use_orientation = false,
+ LibraryOrientation orientation = LibraryOrientation::Undefined) {
+ PairedStreamPtr answer = reader;
+ answer = CarefulFilteringWrap<PairedRead>(answer, use_orientation, orientation);
+ if (followed_by_rc) {
+ answer = RCWrap<PairedRead>(answer);
+ }
+ return answer;
+
+ }
+
+ inline PairedStreamPtr PairedEasyStream(const std::string& filename1, const std::string& filename2,
+ bool followed_by_rc, size_t insert_size, bool change_read_order = false,
+ bool use_orientation = true, LibraryOrientation orientation = LibraryOrientation::FR,
+ OffsetType offset_type = PhredOffset) {
+ PairedStreamPtr reader = make_shared<SeparatePairedReadStream>(filename1, filename2, insert_size,
+ change_read_order, use_orientation,
+ orientation, offset_type);
+ // Use orientation for insert size calculation only if it is not already handled by the orientation changer
+ return WrapPairedStream(reader, followed_by_rc, !use_orientation, orientation);
+ }
+
+ inline PairedStreamPtr PairedEasyStream(const std::string& filename, bool followed_by_rc,
+ size_t insert_size, bool change_read_order = false,
+ bool use_orientation = true, LibraryOrientation orientation = LibraryOrientation::FR,
+ OffsetType offset_type = PhredOffset) {
+ PairedStreamPtr reader = make_shared<InterleavingPairedReadStream>(filename, insert_size, change_read_order,
+ use_orientation, orientation, offset_type);
+ // Use orientation for insert size calculation only if it is not already handled by the orientation changer
+ return WrapPairedStream(reader, followed_by_rc, !use_orientation, orientation);
+ }
+}
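
A minimal usage sketch for the stream helpers above; the function name, the file arguments and the insert size are illustrative assumptions, not part of this commit:

    #include "io/reads_io/io_helper.hpp"
    #include <string>

    // Stream a paired-end library, letting the RC wrapper emit every pair
    // followed by its reverse complement, and count what comes out.
    size_t count_pairs(const std::string& left, const std::string& right) {
        io::PairedStreamPtr stream =
                io::PairedEasyStream(left, right, /*followed_by_rc*/ true,
                                     /*insert_size*/ 250);
        io::PairedRead pr;
        size_t count = 0;
        while (!stream->eof()) {
            *stream >> pr;
            ++count;  // twice the number of pairs in the files, due to RC expansion
        }
        stream->close();
        return count;
    }
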
diff --git a/src/modules/io/reads_io/ireader.hpp b/src/modules/io/reads_io/ireader.hpp
new file mode 100644
index 0000000..e3e286d
--- /dev/null
+++ b/src/modules/io/reads_io/ireader.hpp
@@ -0,0 +1,117 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+//todo rename to reader
+#pragma once
+
+#include <boost/noncopyable.hpp>
+#include "dev_support/standard_base.hpp"
+
+namespace io {
+
+struct ReadStreamStat {
+ size_t read_count_;
+ size_t max_len_;
+ uint64_t total_len_;
+
+
+ ReadStreamStat(): read_count_(0), max_len_(0), total_len_(0) { }
+
+ void write(std::ostream& stream) const {
+ stream.write((const char *) &read_count_, sizeof(read_count_));
+ stream.write((const char *) &max_len_, sizeof(max_len_));
+ stream.write((const char *) &total_len_, sizeof(total_len_));
+ }
+
+ void read(std::istream& stream) {
+ stream.read((char *) &read_count_, sizeof(read_count_));
+ stream.read((char *) &max_len_, sizeof(max_len_));
+ stream.read((char *) &total_len_, sizeof(total_len_));
+ }
+
+ template<class Read>
+ void increase(const Read& read) {
+ size_t len = read.size();
+
+ ++read_count_;
+ if (max_len_ < len) {
+ max_len_ = len;
+ }
+ total_len_ += read.nucl_count();
+ }
+
+ void merge(const ReadStreamStat& stat) {
+ read_count_ += stat.read_count_;
+ if (max_len_ < stat.max_len_) {
+ max_len_ = stat.max_len_;
+ }
+ total_len_ += stat.total_len_;
+ }
+
+ bool valid() const {
+ return read_count_ != 0;
+ }
+
+};
+
+/**
+ * ReadStream is the base interface for all readers and reader wrappers.
+ */
+template<typename ReadType>
+class ReadStream: boost::noncopyable {
+ public:
+ typedef ReadType ReadT;
+
+ /*
+ * Virtual destructor.
+ */
+ virtual ~ReadStream() {}
+
+ /*
+ * Check whether the stream is opened.
+ *
+ * @return true if the stream is opened and false otherwise.
+ */
+ virtual bool is_open() = 0;
+
+ /*
+ * Check whether we've reached the end of stream.
+ *
+ * @return true if the end of the stream is reached and false
+ * otherwise.
+ */
+ virtual bool eof() = 0;
+
+ /*
+ * Read SingleRead or PairedRead from stream (according to ReadType).
+ *
+ * @param read The SingleRead or PairedRead that will store read data.
+ *
+ * @return Reference to this stream.
+ */
+ virtual ReadStream& operator>>(ReadType& read) = 0;
+
+ /*
+ * Close the stream.
+ */
+ virtual void close() = 0;
+
+ /*
+ * Close the stream and open it again.
+ */
+ virtual void reset() = 0;
+
+ virtual ReadStreamStat get_stat() const = 0;
+
+};
+
+template<class Read>
+class PredictableReadStream: public ReadStream<Read> {
+public:
+ virtual size_t size() const = 0;
+};
+
+}
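
A minimal sketch of what implementing the ReadStream interface above takes; the class name InMemoryStream is made up for illustration and is not part of this commit:

    #include "io/reads_io/ireader.hpp"
    #include <utility>
    #include <vector>

    // Serves reads from an in-memory vector; mirrors the interface contract:
    // check eof() before operator>>, reset() rewinds, close() ends the stream.
    template<class ReadType>
    class InMemoryStream : public io::ReadStream<ReadType> {
        std::vector<ReadType> reads_;
        size_t pos_;
    public:
        explicit InMemoryStream(std::vector<ReadType> reads)
                : reads_(std::move(reads)), pos_(0) {}

        bool is_open() override { return true; }
        bool eof() override { return pos_ == reads_.size(); }

        InMemoryStream& operator>>(ReadType& read) override {
            read = reads_[pos_++];  // callers are expected to check eof() first
            return *this;
        }

        void close() override { pos_ = reads_.size(); }
        void reset() override { pos_ = 0; }
        io::ReadStreamStat get_stat() const override { return io::ReadStreamStat(); }
    };
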
diff --git a/src/modules/io/reads_io/ireadstream.hpp b/src/modules/io/reads_io/ireadstream.hpp
new file mode 100644
index 0000000..3cc34d0
--- /dev/null
+++ b/src/modules/io/reads_io/ireadstream.hpp
@@ -0,0 +1,170 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+/*
+* ireadstream.hpp
+*
+* Created on: 03.03.2011
+* Author: vyahhi
+*/
+
+#ifndef IREADSTREAM_HPP_
+#define IREADSTREAM_HPP_
+
+#include "kseq/kseq.h"
+#include <zlib.h>
+#include "dev_support/verify.hpp"
+#include "io/reads/read.hpp"
+#include "data_structures/sequence/nucl.hpp"
+
+// STEP 1: declare the type of file handler and the read() function
+KSEQ_INIT(gzFile, gzread)
+
+/*
+* Read name, seq and qual strings from FASTQ data (one by one)
+*/
+//fixme deprecated!!! remove usages!
+class ireadstream {
+
+public:
+typedef Read ReadT;
+
+ireadstream(const std::string &filename) : offset_(Read::PHRED_OFFSET) {
+ filename_ = filename;
+ is_open_ = open(filename);
+}
+
+ireadstream(const std::string &filename, int offset) : offset_(offset) {
+ filename_ = filename;
+ is_open_ = open(filename);
+}
+
+virtual ~ireadstream() {
+ close();
+}
+
+bool is_open() const {
+ return is_open_;
+}
+
+bool eof() const {
+ return eof_;
+}
+
+static std::vector <Read> *readAll(std::string filename, int cnt = -1) {
+ ireadstream irs(filename);
+ VERIFY(irs.is_open());
+ std::vector <Read> *res = new std::vector<Read>();
+ Read r;
+ while (cnt-- && irs.is_open() && !irs.eof()) {
+ irs >> r;
+ if (!r.isValid()) {
+ cnt++;
+ continue;
+ }
+ res->push_back(r);
+ }
+ irs.close();
+ return res;
+}
+
+static void readAllNoValidation(std::vector <Read> *res, std::string filename, uint64_t *totalsize,
+ int qvoffset = Read::PHRED_OFFSET, int trim_quality = -1, int cnt = -1) {
+ ireadstream irs(filename, qvoffset);
+ VERIFY(irs.is_open());
+ *totalsize = 0;
+ Read r;
+ while (cnt-- && irs.is_open() && !irs.eof()) {
+ irs >> r;
+ size_t read_size = r.trimNsAndBadQuality(trim_quality);
+ res->push_back(r);
+ *totalsize += read_size;
+ }
+ irs.close();
+}
+
+ireadstream &operator>>(Read &r) {
+ VERIFY(is_open());
+ VERIFY(!eof());
+ if (!is_open() || eof()) {
+ return *this;
+ }
+ r.setName(seq_->name.s);
+ if (seq_->qual.s) {
+ r.setQuality(seq_->qual.s, offset_);
+ }
+ r.setSequence(seq_->seq.s);
+ read_ahead(); // make actual read for the next result
+ return *this;
+}
+
+void close() {
+ if (is_open()) {
+ kseq_destroy(seq_); // STEP 5: destroy seq
+ gzclose(fp_); // STEP 6: close the file handler
+ is_open_ = false;
+ }
+}
+
+void reset() {
+ close();
+ open(filename_);
+}
+
+private:
+std::string filename_;
+gzFile fp_;
+kseq_t *seq_;
+bool is_open_;
+bool eof_;
+int offset_;
+
+/*
+ * Open the file with FASTQ reads;
+ * return true if the file was opened successfully, false otherwise.
+ */
+bool open(std::string filename) {
+ fp_ = gzopen(filename.c_str(), "r"); // STEP 2: open the file handler
+ if (!fp_) {
+ return false;
+ }
+ is_open_ = true;
+ seq_ = kseq_init(fp_); // STEP 3: initialize seq
+ eof_ = false;
+ read_ahead();
+ return true;
+}
+
+void read_ahead() {
+ VERIFY(is_open());
+ VERIFY(!eof());
+ if (kseq_read(seq_) < 0) {
+ eof_ = true;
+ }
+}
+};
+
+//return -1 if failed to determine offset
+inline int determine_offset(const std::string &filename) {
+ireadstream stream(filename, 0);
+size_t count = 0;
+Read r;
+while (!stream.eof() && count++ < 10000) {
+ stream >> r;
+ std::string q_str = r.getQualityString();
+ for (size_t i = 0; i < q_str.size(); ++i) {
+ int q_val = q_str[i];
+ if (q_val < 59)
+ return 33;
+ if (q_val > 74)
+ return 64;
+ }
+}
+return -1;
+}
+
+#endif /* IREADSTREAM_HPP_ */
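
A short sketch of how determine_offset() above is typically used before opening a library; the header path, function name and reporting style are illustrative assumptions:

    #include "io/reads_io/ireadstream.hpp"
    #include <iostream>
    #include <string>

    void report_offset(const std::string& path) {
        int offset = determine_offset(path);  // inspects up to 10000 reads
        if (offset == -1)
            std::cout << path << ": quality offset could not be determined" << std::endl;
        else
            std::cout << path << ": phred+" << offset << std::endl;  // 33 or 64
    }
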
diff --git a/src/modules/io/reads_io/is_corrupting_wrapper.hpp b/src/modules/io/reads_io/is_corrupting_wrapper.hpp
new file mode 100644
index 0000000..f2993f3
--- /dev/null
+++ b/src/modules/io/reads_io/is_corrupting_wrapper.hpp
@@ -0,0 +1,33 @@
+////***************************************************************************
+////* Copyright (c) 2011-2014 Saint-Petersburg Academic University
+////* All Rights Reserved
+////* See file LICENSE for details.
+////****************************************************************************
+// todo remove!!!
+//#ifndef IS_CORRUPTING_WRAPPER_HPP_
+//#define IS_CORRUPTING_WRAPPER_HPP_
+//
+//namespace io {
+//
+//class ISCorruptingWrapper: public DelegatingReaderWrapper<PairedRead> {
+//private:
+// const size_t is_;
+//public:
+// typedef PairedRead ReadType;
+//
+// explicit ISCorruptingWrapper(IReader<ReadType>& reader, size_t is) :
+// DelegatingReaderWrapper<PairedRead>(reader), is_(is) {
+// }
+//
+// /* virtual */
+// ISCorruptingWrapper& operator>>(ReadType& read) {
+// (this->reader()) >> read;
+// read = PairedRead(read.first(), read.second(), is_);
+// return *this;
+// }
+//
+//};
+//
+//}
+//
+//#endif /* IS_CORRUPTING_WRAPPER_HPP_ */
diff --git a/src/modules/io/reads_io/modifying_reader_wrapper.hpp b/src/modules/io/reads_io/modifying_reader_wrapper.hpp
new file mode 100644
index 0000000..8039db0
--- /dev/null
+++ b/src/modules/io/reads_io/modifying_reader_wrapper.hpp
@@ -0,0 +1,114 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "dev_support/verify.hpp"
+#include "io/reads_io/delegating_reader_wrapper.hpp"
+
+#include <memory>
+#include <io/reads/single_read.hpp>
+
+namespace io {
+
+class SequenceModifier {
+public:
+ virtual ~SequenceModifier() {}
+
+ SingleRead Modify(const SingleRead& read) {
+ return SingleRead(read.name(), Modify(read.sequence()).str());
+ }
+
+ SingleReadSeq Modify(const SingleReadSeq& read) {
+ return SingleReadSeq(Modify(read.sequence()));
+ }
+
+ virtual Sequence Modify(const Sequence& s) = 0;
+};
+
+class TrivialModifier : public SequenceModifier {
+public:
+
+ virtual Sequence Modify(const Sequence& s) {
+ return s;
+ }
+};
+
+/**
+ * Attention: this wrapper discards read quality values!
+ */
+template<class ReadType>
+class ModifyingWrapper;
+
+template<>
+class ModifyingWrapper<SingleRead>: public DelegatingWrapper<SingleRead> {
+ typedef DelegatingWrapper<SingleRead> base;
+ std::shared_ptr<SequenceModifier> modifier_;
+
+public:
+ ModifyingWrapper(base::ReadStreamPtrT reader, std::shared_ptr<SequenceModifier> modifier) :
+ base(reader), modifier_(modifier) {}
+
+ ModifyingWrapper& operator>>(SingleRead& read) {
+ this->reader() >> read;
+ read = modifier_->Modify(read);
+ return *this;
+ }
+};
+
+template<>
+class ModifyingWrapper<PairedRead>: public DelegatingWrapper<PairedRead> {
+ typedef DelegatingWrapper<PairedRead> base;
+ std::shared_ptr<SequenceModifier> modifier_;
+
+public:
+ ModifyingWrapper(base::ReadStreamPtrT reader, std::shared_ptr<SequenceModifier> modifier) :
+ base(reader), modifier_(modifier) {}
+
+ ModifyingWrapper& operator>>(PairedRead& read) {
+ this->reader() >> read;
+ read = PairedRead(modifier_->Modify(read.first()),
+ modifier_->Modify(read.second()),
+ read.insert_size());
+ return *this;
+ }
+};
+
+template<>
+class ModifyingWrapper<SingleReadSeq>: public DelegatingWrapper<SingleReadSeq> {
+ typedef DelegatingWrapper<SingleReadSeq> base;
+ std::shared_ptr<SequenceModifier> modifier_;
+
+public:
+ ModifyingWrapper(base::ReadStreamPtrT reader, std::shared_ptr<SequenceModifier> modifier) :
+ base(reader), modifier_(modifier) {}
+
+ ModifyingWrapper& operator>>(SingleReadSeq& read) {
+ this->reader() >> read;
+ read = modifier_->Modify(read.sequence());
+ return *this;
+ }
+};
+
+template<>
+class ModifyingWrapper<PairedReadSeq>: public DelegatingWrapper<PairedReadSeq> {
+ typedef DelegatingWrapper<PairedReadSeq> base;
+ std::shared_ptr<SequenceModifier> modifier_;
+
+public:
+ ModifyingWrapper(base::ReadStreamPtrT reader, std::shared_ptr<SequenceModifier> modifier) :
+ base(reader), modifier_(modifier) {}
+
+ ModifyingWrapper& operator>>(PairedReadSeq& read) {
+ this->reader() >> read;
+ read = PairedReadSeq(modifier_->Modify(read.first().sequence())
+ , SingleReadSeq(modifier_->Modify(read.second())), read.insert_size());
+ return *this;
+ }
+};
+
+}
diff --git a/src/modules/io/reads_io/mpmc_bounded.hpp b/src/modules/io/reads_io/mpmc_bounded.hpp
new file mode 100644
index 0000000..d82ced5
--- /dev/null
+++ b/src/modules/io/reads_io/mpmc_bounded.hpp
@@ -0,0 +1,153 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+/* Multi-consumer/multi-producer bounded queue
+
+Copyright (c) 2011 Dmitry Vyukov. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification, are
+permitted provided that the following conditions are met:
+
+1. Redistributions of source code must retain the above copyright notice, this list of
+conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright notice, this list
+of conditions and the following disclaimer in the documentation and/or other materials
+provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY DMITRY VYUKOV "AS IS" AND ANY EXPRESS OR IMPLIED
+WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
+FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL DMITRY VYUKOV OR
+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <ciso646>
+
+#if __GNUC__ > 4 || (__GNUC__ >= 4 && __GNUC_MINOR__ >= 5) || _LIBCPP_VERSION
+
+#include <atomic>
+
+#else
+#include <cstdatomic>
+#endif
+
+#include <cstring>
+#include <unistd.h>
+
+template<typename T>
+class mpmc_bounded_queue {
+public:
+ mpmc_bounded_queue(size_t buffer_size)
+ : buffer_(new cell_t[buffer_size]), buffer_mask_(buffer_size - 1) {
+ assert((buffer_size >= 2) && ((buffer_size & (buffer_size - 1)) == 0));
+ for (size_t i = 0; i != buffer_size; i += 1)
+ buffer_[i].sequence_.store(i, std::memory_order_relaxed);
+ enqueue_pos_.store(0, std::memory_order_relaxed);
+ dequeue_pos_.store(0, std::memory_order_relaxed);
+ closed_.store(false, std::memory_order_relaxed);
+ }
+
+ ~mpmc_bounded_queue() {
+ delete[] buffer_;
+ }
+
+ bool is_closed() const {
+ return closed_.load(std::memory_order_relaxed);
+ }
+
+ void close() {
+ closed_.store(true, std::memory_order_release);
+ }
+
+ bool enqueue(T const &data) {
+ if (is_closed())
+ return false;
+
+ cell_t *cell;
+ size_t pos = enqueue_pos_.load(std::memory_order_relaxed);
+ for (; ;) {
+ cell = &buffer_[pos & buffer_mask_];
+ size_t seq = cell->sequence_.load(std::memory_order_acquire);
+ intptr_t dif = (intptr_t) seq - (intptr_t) pos;
+ if (dif == 0) {
+ if (enqueue_pos_.compare_exchange_weak(pos, pos + 1, std::memory_order_relaxed))
+ break;
+ } else if (dif < 0)
+ return false;
+ else
+ pos = enqueue_pos_.load(std::memory_order_relaxed);
+ }
+
+ cell->data_ = data;
+ cell->sequence_.store(pos + 1, std::memory_order_release);
+
+ return true;
+ }
+
+ bool dequeue(T &data) {
+ cell_t *cell;
+ size_t pos = dequeue_pos_.load(std::memory_order_relaxed);
+ for (; ;) {
+ cell = &buffer_[pos & buffer_mask_];
+ size_t seq = cell->sequence_.load(std::memory_order_acquire);
+ intptr_t dif = (intptr_t) seq - (intptr_t) (pos + 1);
+ if (dif == 0) {
+ if (dequeue_pos_.compare_exchange_weak(pos, pos + 1, std::memory_order_relaxed))
+ break;
+ } else if (dif < 0)
+ return false;
+ else
+ pos = dequeue_pos_.load(std::memory_order_relaxed);
+ }
+
+ data = cell->data_;
+ cell->sequence_.store(pos + buffer_mask_ + 1, std::memory_order_release);
+
+ return true;
+ }
+
+ bool wait_dequeue(T &data) {
+ bool res = false;
+ do {
+ res = dequeue(data);
+ if (!res)
+ usleep(1);
+ } while (!res && !is_closed());
+
+ return res;
+ }
+
+private:
+ struct cell_t {
+ std::atomic<size_t> sequence_;
+ T data_;
+ };
+
+ static size_t const cacheline_size = 64;
+ typedef char cacheline_pad_t[cacheline_size];
+
+ cacheline_pad_t pad0_;
+ cell_t *const buffer_;
+ size_t const buffer_mask_;
+ cacheline_pad_t pad1_;
+ std::atomic<size_t> enqueue_pos_;
+ cacheline_pad_t pad2_;
+ std::atomic<size_t> dequeue_pos_;
+ cacheline_pad_t pad3_;
+ std::atomic<bool> closed_;
+ cacheline_pad_t pad4_;
+
+ mpmc_bounded_queue(mpmc_bounded_queue const &);
+
+ void operator=(mpmc_bounded_queue const &);
+};
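
A sketch of the intended producer/consumer usage of the queue above; the thread layout and values are illustrative, and the capacity must be a power of two (asserted by the constructor):

    #include "io/reads_io/mpmc_bounded.hpp"
    #include <iostream>
    #include <thread>

    void queue_demo() {
        mpmc_bounded_queue<int> queue(1024);

        std::thread producer([&queue] {
            for (int i = 0; i < 100; ++i)
                while (!queue.enqueue(i))   // spin if the queue is momentarily full
                    ;
            queue.close();                  // makes wait_dequeue() return false once drained
        });

        std::thread consumer([&queue] {
            int value;
            long sum = 0;
            while (queue.wait_dequeue(value))
                sum += value;
            std::cout << "sum = " << sum << std::endl;
        });

        producer.join();
        consumer.join();
    }
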
diff --git a/src/modules/io/reads_io/multifile_reader.hpp b/src/modules/io/reads_io/multifile_reader.hpp
new file mode 100644
index 0000000..b658be0
--- /dev/null
+++ b/src/modules/io/reads_io/multifile_reader.hpp
@@ -0,0 +1,99 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "read_stream_vector.hpp"
+#include <vector>
+
+namespace io {
+
+/**
+ * MultifileStream is a stream that reads data from a number of files
+ * given in the constructor.
+ */
+template<typename ReadType>
+class MultifileStream: public ReadStream<ReadType> {
+ typedef ReadStream<ReadType> StreamT;
+ typedef std::shared_ptr<StreamT> ReadStreamPtrT;
+public:
+ MultifileStream(const ReadStreamList<ReadType>& readers) :
+ readers_(readers), current_reader_index_(0) {
+ }
+
+ MultifileStream(ReadStreamPtrT reader_1, ReadStreamPtrT reader_2) :
+ current_reader_index_(0) {
+ VERIFY(reader_1->is_open() && reader_2->is_open());
+ readers_.push_back(reader_1);
+ readers_.push_back(reader_2);
+ }
+
+ /* virtual */
+ bool is_open() {
+ return (readers_.size() > 0) && readers_[0].is_open();
+ }
+
+ /* virtual */
+ bool eof() {
+ while ((current_reader_index_ < readers_.size()) && readers_[current_reader_index_].eof()) {
+ ++current_reader_index_;
+ }
+ return current_reader_index_ == readers_.size();
+ }
+
+ /* virtual */
+ MultifileStream& operator>>(ReadType& read) {
+ if (!eof()) {
+ readers_[current_reader_index_] >> read;
+ }
+ return (*this);
+ }
+
+ /* virtual */
+ void close() {
+ readers_.close();
+ }
+
+ /* virtual */
+ void reset() {
+ readers_.reset();
+ current_reader_index_ = 0;
+ }
+
+ /* virtual */
+ ReadStreamStat get_stat() const {
+ return readers_.get_stat();
+ }
+
+private:
+ ReadStreamList<ReadType> readers_;
+ size_t current_reader_index_;
+};
+
+template<class ReadType>
+std::shared_ptr<ReadStream<ReadType>> MultifileWrap(std::shared_ptr<ReadStream<ReadType>> reader_1,
+ std::shared_ptr<ReadStream<ReadType>> reader_2) {
+ return std::make_shared<MultifileStream<ReadType>>(reader_1, reader_2);
+}
+
+template<class ReadType>
+std::shared_ptr<ReadStream<ReadType>> MultifileWrap(const ReadStreamList<ReadType>& readers) {
+ return std::make_shared<MultifileStream<ReadType>>(readers);
+}
+
+template<class ReadType>
+ReadStreamList<ReadType> WrapPairsInMultifiles(ReadStreamList<ReadType> readers_1,
+ ReadStreamList<ReadType> readers_2) {
+ VERIFY(readers_1.size() == readers_2.size());
+ ReadStreamList<ReadType> answer;
+ for (size_t i = 0; i < readers_1.size(); ++i) {
+ answer.push_back(MultifileWrap<ReadType>(readers_1.ptr_at(i), readers_2.ptr_at(i)));
+ }
+ return answer;
+}
+
+}
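
A sketch of chaining two single-read files into one logical stream with MultifileWrap; EasyStream comes from io_helper.hpp, and the function name and file names are placeholders:

    #include "io/reads_io/io_helper.hpp"

    io::SingleStreamPtr concat_two_files() {
        return io::MultifileWrap<io::SingleRead>(
                io::EasyStream("lib1.fastq", /*followed_by_rc*/ false),
                io::EasyStream("lib2.fastq", /*followed_by_rc*/ false));
    }
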
diff --git a/src/modules/io/reads_io/orientation.hpp b/src/modules/io/reads_io/orientation.hpp
new file mode 100644
index 0000000..ce75d05
--- /dev/null
+++ b/src/modules/io/reads_io/orientation.hpp
@@ -0,0 +1,93 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "pipeline/library.hpp"
+
+namespace io {
+
+template<typename ReadType>
+class OrientationChanger {
+
+public:
+
+ virtual ReadType Perform(const ReadType& r) const = 0;
+
+ virtual ~OrientationChanger() {
+ }
+};
+
+template<typename ReadType>
+class IdeticalChanger : public OrientationChanger<ReadType> {
+
+public:
+
+ virtual ReadType Perform(const ReadType& r) const {
+ return r;
+ }
+};
+
+template<typename ReadType>
+class ReverseSecondChanger : public OrientationChanger<ReadType> {
+
+public:
+
+ virtual ReadType Perform(const ReadType& r) const {
+ return ReadType(r.first(), !r.second(), r.insert_size());
+ }
+};
+
+template<typename ReadType>
+class ReverseFirstChanger : public OrientationChanger<ReadType> {
+
+public:
+
+ virtual ReadType Perform(const ReadType& r) const {
+ return ReadType(!r.first(), r.second(), r.insert_size());
+ }
+};
+
+template<typename ReadType>
+class ReverseChanger : public OrientationChanger<ReadType> {
+
+public:
+
+ virtual ReadType Perform(const ReadType& r) const {
+ return ReadType(!r.first(), !r.second(), r.insert_size());
+ }
+};
+
+template<typename ReadType>
+std::unique_ptr<OrientationChanger<ReadType>> GetOrientationChanger(LibraryOrientation orientation) {
+ OrientationChanger<ReadType> * result;
+ switch (orientation) {
+ case LibraryOrientation::FF: {
+ result = new IdeticalChanger<ReadType>();
+ break;
+ }
+ case LibraryOrientation::RR: {
+ result = new ReverseChanger<ReadType>();
+ break;
+ }
+ case LibraryOrientation::FR: {
+ result = new ReverseSecondChanger<ReadType>();
+ break;
+ }
+ case LibraryOrientation::RF: {
+ result = new ReverseFirstChanger<ReadType>();
+ break;
+ }
+ default: {
+ result = new IdeticalChanger<ReadType>();
+ break;
+ }
+ }
+ return std::unique_ptr<OrientationChanger<ReadType>>(result);
+}
+
+}
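
A sketch of the changer selected for an FR library, which reverse-complements the second mate; it assumes, as the code above does, that LibraryOrientation is visible in namespace io, and the helper function name is made up:

    #include "io/reads_io/orientation.hpp"
    #include "io/reads/paired_read.hpp"

    io::PairedRead to_forward_forward(const io::PairedRead& pr) {
        auto changer = io::GetOrientationChanger<io::PairedRead>(io::LibraryOrientation::FR);
        // ReverseSecondChanger is chosen: PairedRead(pr.first(), !pr.second(), insert size).
        return changer->Perform(pr);
    }
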
diff --git a/src/modules/io/reads_io/osequencestream.hpp b/src/modules/io/reads_io/osequencestream.hpp
new file mode 100644
index 0000000..6124aef
--- /dev/null
+++ b/src/modules/io/reads_io/osequencestream.hpp
@@ -0,0 +1,374 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+/*
+ * oreadstream.hpp
+ *
+ * Created on: 23.06.2011
+ * Author: vyahhi
+ */
+
+#pragma once
+
+#include <fstream>
+#include <string>
+#include <vector>
+#include "io/reads/single_read.hpp"
+#include "io/reads/paired_read.hpp"
+
+namespace io {
+
+inline std::string MakeContigId(int number, size_t length, const std::string& prefix = "NODE") {
+ return prefix + "_" + ToString(number) + "_length_" + ToString(length);
+}
+
+inline std::string MakeContigId(int number, size_t length, double coverage, const std::string& prefix = "NODE") {
+ return prefix + "_" + ToString(number) + "_length_" + ToString(length) + "_cov_" + ToString(coverage);
+}
+
+inline std::string MakeContigId(int number, size_t length, double coverage, size_t id, const std::string& prefix = "NODE") {
+ return prefix + "_" + ToString(number) + "_length_" + ToString(length) + "_cov_" + ToString(coverage) + "_ID_" + ToString(id);
+}
+inline std::string MakeContigComponentId(int number, size_t length, double coverage, size_t id, size_t component_id, const std::string& prefix = "NODE") {
+ return prefix + "_" + ToString(number) + "_length_" + ToString(length) + "_cov_" + ToString(coverage) + "_ID_" + ToString(id) + "_component_" + ToString(component_id);
+}
+inline std::string MakeContigComponentId(int number, size_t length, double coverage, size_t component_id, const std::string& prefix = "NODE") {
+ return prefix + "_" + ToString(number) + "_length_" + ToString(length) + "_cov_" + ToString(coverage) + "_component_" + ToString(component_id);
+}
+
+
+class osequencestream {
+protected:
+ std::ofstream ofstream_;
+
+ int id_;
+
+ void write_str(const std::string& s) {
+ size_t cur = 0;
+ while (cur < s.size()) {
+ ofstream_ << s.substr(cur, 60) << std::endl;
+ cur += 60;
+ }
+ }
+
+ virtual void write_header(const std::string& s) {
+ // Velvet format: NODE_1_length_24705_cov_358.255249
+ ofstream_ << ">" << MakeContigId(id_++, s.size()) << std::endl;
+ }
+
+public:
+ osequencestream(const std::string& filename): id_(1) {
+ ofstream_.open(filename.c_str());
+ }
+
+ virtual ~osequencestream() {
+ ofstream_.close();
+ }
+
+ virtual osequencestream& operator<<(const std::string& s) {
+ write_header(s);
+ write_str(s);
+ return *this;
+ }
+
+ virtual osequencestream& operator<<(const Sequence& seq) {
+ std::string s = seq.str();
+ return operator <<(s);
+ }
+
+ /**
+ * Uses a different way of making headers.
+ * Does not increase the contig counter; do not mix with the other output methods!
+ */
+ virtual osequencestream& operator<<(const SingleRead& read) {
+ ofstream_ << ">" << read.name() << std::endl;
+ size_t cur = 0;
+ std::string s = read.GetSequenceString();
+ while (cur < s.size()) {
+ ofstream_ << s.substr(cur, 60) << std::endl;
+ cur += 60;
+ }
+ return *this;
+ }
+};
+
+class PairedOutputSequenceStream {
+protected:
+ std::ofstream ofstreaml_;
+ std::ofstream ofstreamr_;
+
+ static void write(const SingleRead& read, std::ofstream& stream) {
+ stream << ">" << read.name() << std::endl;
+ size_t cur = 0;
+ std::string s = read.GetSequenceString();
+ while (cur < s.size()) {
+ stream << s.substr(cur, 60) << std::endl;
+ cur += 60;
+ }
+ }
+
+public:
+ PairedOutputSequenceStream(const std::string& filename1, const std::string &filename2) {
+ ofstreaml_.open(filename1);
+ ofstreamr_.open(filename2);
+ }
+
+ virtual ~PairedOutputSequenceStream() {
+ ofstreaml_.close();
+ ofstreamr_.close();
+ }
+
+ PairedOutputSequenceStream& operator<<(const PairedRead& read) {
+ write(read.first(), ofstreaml_);
+ write(read.second(), ofstreamr_);
+ return *this;
+ }
+};
+
+
+class osequencestream_cov: public osequencestream {
+protected:
+ double coverage_;
+
+ virtual void write_header(const std::string& s) {
+ // Velvet format: NODE_1_length_24705_cov_358.255249
+ ofstream_ << ">" << MakeContigId(id_++, s.size(), coverage_) << std::endl;
+ }
+
+
+public:
+ osequencestream_cov(const std::string& filename)
+ : osequencestream(filename), coverage_(0.) { }
+
+ virtual ~osequencestream_cov() {
+ ofstream_.close();
+ }
+
+ osequencestream_cov& operator<<(double coverage) {
+ coverage_ = coverage;
+ return *this;
+ }
+
+ osequencestream_cov& operator<<(const std::string& s) {
+ write_header(s);
+ write_str(s);
+ return *this;
+ }
+
+ osequencestream_cov& operator<<(const Sequence& seq) {
+ std::string s = seq.str();
+ return operator <<(s);
+ }
+
+};
+
+
+class osequencestream_simple: public osequencestream {
+protected:
+ std::string header_;
+
+ double cov_;
+
+ virtual void write_header(const std::string& /*s*/) {
+ ofstream_ << ">" << header_ << std::endl;
+ }
+
+public:
+ osequencestream_simple(const std::string& filename)
+ : osequencestream(filename), header_("") { }
+
+ virtual ~osequencestream_simple() {
+ ofstream_.close();
+ }
+
+ void set_header(const std::string &header) {
+ header_ = header;
+ }
+
+ osequencestream_simple& operator<<(const std::string& s) {
+ write_header(s);
+ write_str(s);
+ return *this;
+ }
+
+ osequencestream_simple& operator<<(const Sequence& seq) {
+ std::string s = seq.str();
+ return operator <<(s);
+ }
+
+};
+
+class osequencestream_with_id: public osequencestream {
+protected:
+ size_t uid_;
+
+ double cov_;
+
+ virtual void write_header(const std::string& s) {
+ ofstream_ << ">" << GetId(s) << std::endl;
+ id_++;
+ }
+
+public:
+ osequencestream_with_id(const std::string& filename)
+ : osequencestream(filename), uid_(0), cov_(0.0) { }
+
+ virtual ~osequencestream_with_id() {
+ ofstream_.close();
+ }
+
+ std::string GetId(const std::string& s) const {
+ return MakeContigId(id_, s.size(), cov_, uid_);
+ }
+
+ void setCoverage(double c) {
+ cov_ = c;
+ }
+
+ void setID(size_t uid) {
+ uid_ = uid;
+ }
+
+ osequencestream_with_id& operator<<(const std::string& s) {
+ write_header(s);
+ write_str(s);
+ return *this;
+ }
+
+ osequencestream_with_id& operator<<(double coverage) {
+ cov_ = coverage;
+ return *this;
+ }
+
+ osequencestream_with_id& operator<<(const Sequence& seq) {
+ std::string s = seq.str();
+ return operator <<(s);
+ }
+
+};
+
+class osequencestream_with_manual_node_id: public osequencestream_with_id {
+ bool is_id_set_;
+ virtual void write_header(const std::string& s) {
+ // For manual NODE ID setting, osequencestream needs to check that the node ID was really set manually
+ if (!is_id_set_) {
+ WARN ("NODE ID is not set manually, setting to 0");
+ id_ = 0;
+ }
+ ofstream_ << ">" << MakeContigId(id_, s.size(), cov_, uid_) << std::endl;
+ is_id_set_ = false;
+ }
+
+public:
+// Unfortunately, constructor inheritance is only supported since g++ 4.8
+ osequencestream_with_manual_node_id(const std::string& filename): osequencestream_with_id(filename) {
+ is_id_set_ = false;
+ }
+
+ void setNodeID(int id) {
+ id_ = id;
+ is_id_set_ = true;
+ }
+
+ osequencestream_with_manual_node_id& operator<<(const std::string& s) {
+ write_header(s);
+ write_str(s);
+ return *this;
+ }
+
+ osequencestream_with_manual_node_id& operator<<(const Sequence& seq) {
+ std::string s = seq.str();
+ return operator <<(s);
+ }
+
+
+};
+
+
+class osequencestream_with_data_for_scaffold: public osequencestream_with_id {
+protected:
+ std::ofstream scstream_;
+
+ virtual void write_header(const std::string& s) {
+ scstream_ << id_ << "\tNODE_" << id_ << "\t" << s.size() << "\t" << (int) round(cov_) << std::endl;
+ ofstream_ << ">" << MakeContigId(id_++, s.size(), cov_, uid_) << std::endl;
+ }
+
+public:
+ osequencestream_with_data_for_scaffold(const std::string& filename): osequencestream_with_id(filename) {
+ id_ = 1;
+ std::string sc_filename = filename + ".info";
+ scstream_.open(sc_filename.c_str());
+ }
+
+ virtual ~osequencestream_with_data_for_scaffold() {
+ ofstream_.close();
+ scstream_.close();
+ }
+
+ osequencestream_with_data_for_scaffold& operator<<(const std::string& s) {
+ write_header(s);
+ write_str(s);
+ return *this;
+ }
+
+ osequencestream_with_data_for_scaffold& operator<<(const Sequence& seq) {
+ std::string s = seq.str();
+ return operator <<(s);
+ }
+};
+
+class osequencestream_for_fastg: public osequencestream_with_id {
+protected:
+ std::string header_;
+
+ virtual void write_header(const std::string& s) {
+ ofstream_ << ">" << s;
+ }
+
+public:
+ osequencestream_for_fastg(const std::string& filename):
+ osequencestream_with_id(filename) {
+ id_ = 1;
+ }
+
+ virtual ~osequencestream_for_fastg() {
+ ofstream_.close();
+ }
+
+ void set_header(const std::string& h) {
+ header_= h;
+ }
+
+ osequencestream_for_fastg& operator<<(const std::set<std::string>& s) {
+ write_header(header_);
+ if (s.size() > 0) {
+ auto iter = s.begin();
+ ofstream_ << ":" << *iter;
+ ++iter;
+ while (iter != s.end()) {
+ ofstream_ << "," << *iter;
+ ++iter;
+ }
+ }
+ ofstream_ << ";" << std::endl;
+ return *this;
+ }
+
+ osequencestream_for_fastg& operator<<(const std::string& s) {
+ write_str(s);
+ return *this;
+ }
+
+ osequencestream_for_fastg& operator<<(const Sequence& seq) {
+ std::string s = seq.str();
+ return operator <<(s);
+ }
+
+};
+
+}
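
A sketch of writing Velvet-style contigs with coverage in the header using osequencestream_cov above; the output path, function name and containers are placeholders:

    #include "io/reads_io/osequencestream.hpp"
    #include <vector>

    void write_contigs(const std::vector<Sequence>& contigs,
                       const std::vector<double>& coverage) {
        io::osequencestream_cov out("contigs.fasta");
        for (size_t i = 0; i < contigs.size(); ++i) {
            out << coverage[i];  // stored and used for the next header
            out << contigs[i];   // ">NODE_<n>_length_<len>_cov_<cov>" plus 60-column sequence
        }
    }
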
diff --git a/src/modules/io/reads_io/paired_readers.hpp b/src/modules/io/reads_io/paired_readers.hpp
new file mode 100644
index 0000000..78cc4ba
--- /dev/null
+++ b/src/modules/io/reads_io/paired_readers.hpp
@@ -0,0 +1,251 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include <string>
+#include "ireader.hpp"
+#include "io/reads/paired_read.hpp"
+#include "orientation.hpp"
+
+namespace io {
+
+class SeparatePairedReadStream : public ReadStream<PairedRead> {
+ public:
+ /*
+ * Constructor.
+ *
+ * @param filename1 The name of the file with the left reads.
+ * @param filename2 The name of the file with the right reads.
+ * @param insert_size Expected insert size of the paired library.
+ * @param offset_type The offset type of the read quality.
+ */
+ explicit SeparatePairedReadStream(const std::string& filename1, const std::string& filename2,
+ size_t insert_size, bool change_order = false,
+ bool use_orientation = true, LibraryOrientation orientation = LibraryOrientation::FR,
+ OffsetType offset_type = PhredOffset)
+ : insert_size_(insert_size),
+ change_order_(change_order),
+ use_orientation_(use_orientation),
+ changer_(GetOrientationChanger<PairedRead>(orientation)),
+ offset_type_(offset_type),
+ first_(new FileReadStream(filename1, offset_type_)),
+ second_(new FileReadStream(filename2, offset_type_)),
+ filename1_(filename1),
+ filename2_(filename2){}
+
+ /*
+ * Check whether the stream is opened.
+ *
+ * @return true if the stream is opened and false otherwise.
+ */
+ /* virtual */ bool is_open() {
+ return first_->is_open() && second_->is_open();
+ }
+
+ /*
+ * Check whether we've reached the end of stream.
+ *
+ * @return true if the end of stream is reached and false
+ * otherwise.
+ */
+ /* virtual */ bool eof() {
+
+ if (first_->eof() != second_->eof()) {
+ if (first_->eof()) {
+ ERROR("The number of right read-pairs is larger than the number of left read-pairs");
+ } else {
+ ERROR("The number of left read-pairs is larger than the number of right read-pairs");
+ }
+ FATAL_ERROR("Unequal number of read-pairs detected in the following files: " << filename1_ << " " << filename2_ << "");
+ }
+ return first_->eof();
+ }
+
+ /*
+ * Read PairedRead from stream.
+ *
+ * @param pairedread The PairedRead that will store read data.
+ *
+ * @return Reference to this stream.
+ */
+ /* virtual */ SeparatePairedReadStream& operator>>(PairedRead& pairedread) {
+ SingleRead sr1, sr2;
+ (*first_) >> sr1;
+ (*second_) >> sr2;
+
+ if (use_orientation_) {
+ pairedread = changer_->Perform(PairedRead(sr1, sr2, insert_size_));
+ }
+ else {
+ pairedread = PairedRead(sr1, sr2, insert_size_);
+ }
+
+ if (change_order_) {
+ pairedread = PairedRead(pairedread.second(), pairedread.first(), insert_size_);
+ }
+
+ return *this;
+ }
+
+ /*
+ * Close the stream.
+ */
+ /* virtual */ void close() {
+ first_->close();
+ second_->close();
+ }
+
+ /*
+ * Close the stream and open it again.
+ */
+ /* virtual */ void reset() {
+ first_->reset();
+ second_->reset();
+ }
+
+ ReadStreamStat get_stat() const {
+ return ReadStreamStat();
+ }
+
+ private:
+
+ size_t insert_size_;
+
+ bool change_order_;
+
+ bool use_orientation_;
+
+ std::unique_ptr<OrientationChanger<PairedRead>> changer_;
+
+ /*
+ * @variable Quality offset type.
+ */
+ OffsetType offset_type_;
+
+ /*
+ * @variable The first stream (reads from first file).
+ */
+ std::unique_ptr<ReadStream<SingleRead>> first_;
+ /*
+ * @variable The second stream (reads from second file).
+ */
+ std::unique_ptr<ReadStream<SingleRead>> second_;
+
+ // Kept only to report the file names in user-facing error messages
+ std::string filename1_;
+ std::string filename2_;
+};
+
+class InterleavingPairedReadStream : public ReadStream<PairedRead> {
+ public:
+ /*
+ * Constructor.
+ *
+ * @param filename The name of the interleaved file with paired reads.
+ * @param insert_size Expected insert size of the paired library.
+ * @param offset_type The offset type of the read quality.
+ */
+ explicit InterleavingPairedReadStream(const std::string& filename, size_t insert_size, bool change_order = false,
+ bool use_orientation = true, LibraryOrientation orientation = LibraryOrientation::FR,
+ OffsetType offset_type = PhredOffset)
+ : filename_(filename), insert_size_(insert_size),
+ change_order_(change_order),
+ use_orientation_(use_orientation),
+ changer_(GetOrientationChanger<PairedRead>(orientation)),
+ offset_type_(offset_type),
+ single_(new FileReadStream(filename_, offset_type_)) {}
+
+ /*
+ * Check whether the stream is opened.
+ *
+ * @return true if the stream is opened and false otherwise.
+ */
+ /* virtual */ bool is_open() {
+ return single_->is_open();
+ }
+
+ /*
+ * Check whether we've reached the end of stream.
+ *
+ * @return true if the end of stream is reached and false
+ * otherwise.
+ */
+ /* virtual */ bool eof() {
+ return single_->eof();
+ }
+
+ /*
+ * Read PairedRead from stream.
+ *
+ * @param pairedread The PairedRead that will store read data.
+ *
+ * @return Reference to this stream.
+ */
+ /* virtual */ InterleavingPairedReadStream& operator>>(PairedRead& pairedread) {
+ SingleRead sr1, sr2;
+ (*single_) >> sr1;
+ (*single_) >> sr2;
+
+ if (use_orientation_) {
+ pairedread = changer_->Perform(PairedRead(sr1, sr2, insert_size_));
+ }
+ else {
+ pairedread = PairedRead(sr1, sr2, insert_size_);
+ }
+
+ if (change_order_) {
+ pairedread = PairedRead(pairedread.second(), pairedread.first(), insert_size_);
+ }
+
+ return *this;
+ }
+
+ /*
+ * Close the stream.
+ */
+ /* virtual */ void close() {
+ single_->close();
+ }
+
+ /*
+ * Close the stream and open it again.
+ */
+ /* virtual */ void reset() {
+ single_->reset();
+ }
+
+ ReadStreamStat get_stat() const {
+ return ReadStreamStat();
+ }
+
+ private:
+ /*
+ * @variable The name of the file the stream reads from.
+ */
+ std::string filename_;
+
+ size_t insert_size_;
+
+ bool change_order_;
+
+ bool use_orientation_;
+
+ std::unique_ptr<OrientationChanger<PairedRead>> changer_;
+
+ /*
+ * @variable Quality offset type.
+ */
+ OffsetType offset_type_;
+
+ /*
+ * @variable The single read stream.
+ */
+ std::unique_ptr<ReadStream<SingleRead>> single_;
+
+};
+}
diff --git a/src/modules/io/reads_io/parser.cpp b/src/modules/io/reads_io/parser.cpp
new file mode 100644
index 0000000..f750810
--- /dev/null
+++ b/src/modules/io/reads_io/parser.cpp
@@ -0,0 +1,90 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+/**
+ * @file parser.cpp
+ * @author Mariya Fomkina
+ * @version 1.0
+ *
+ * @section LICENSE
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of
+ * the License, or (at your option) any later version.
+ *
+ * @section DESCRIPTION
+ *
+ * Parser is the parent class for all streams that read data from
+ * different file types (fastq, fasta, sam etc).
+ * This file contains functions that are used to select the exact parser
+ * according to the file extension.
+ */
+
+#include <io/reads/single_read.hpp>
+#include "io/reads_io/fasta_fastq_gz_parser.hpp"
+#include "io/reads_io/parser.hpp"
+#include "io/sam_io/bam_parser.hpp"
+#include "dev_support/standard_base.hpp"
+
+
+namespace io {
+
+/*
+ * Get extension from filename.
+ *
+ * @param filename The name of the file to read from.
+ *
+ * @return File extension (e.g. "fastq", "fastq.gz").
+ */
+std::string GetExtension(const std::string& filename) {
+ std::string name = filename;
+ size_t pos = name.find_last_of(".");
+ std::string ext = "";
+ if (pos != std::string::npos) {
+ ext = name.substr(name.find_last_of(".") + 1);
+ if (ext == "gz") {
+ ext = name.substr(name.find_last_of
+ (".", name.find_last_of(".") - 1) + 1);
+ }
+ }
+ return ext;
+}
+
+/*
+ * Select parser type according to file extension.
+ *
+ * @param filename The name of the file to be opened.
+ * @param offset The offset of the read quality.
+ *
+ * @return Pointer to the new parser object with the given filename and
+ * offset.
+ */
+Parser* SelectParser(const std::string& filename,
+ OffsetType offset_type /*= PhredOffset*/) {
+ std::string ext = GetExtension(filename);
+ if (ext == "bam")
+ return new BAMParser(filename, offset_type);
+
+ return new FastaFastqGzParser(filename, offset_type);
+ /*
+ if ((ext == "fastq") || (ext == "fastq.gz") ||
+ (ext == "fasta") || (ext == "fasta.gz") ||
+ (ext == "fa") || (ext == "fq.gz") ||
+ (ext == "fq") || (ext == "fa.gz") ||
+ (ext == "seq") || (ext == "seq.gz")) {
+ return new FastaFastqGzParser(filename, offset_type);
+ }
+
+ ERROR("Unknown file extention in input!");
+ return NULL; */
+}
+
+void first_fun(int) {
+}
+
+}
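
A few checks illustrating the extension logic above; the inputs and the checking function are illustrative assumptions:

    #include "io/reads_io/parser.hpp"
    #include <cassert>

    void check_extensions() {
        assert(io::GetExtension("reads.fastq") == "fastq");
        assert(io::GetExtension("reads.fastq.gz") == "fastq.gz");  // "gz" keeps the inner extension
        assert(io::GetExtension("alignments.bam") == "bam");       // SelectParser routes this to BAMParser
    }
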
diff --git a/src/modules/io/reads_io/parser.hpp b/src/modules/io/reads_io/parser.hpp
new file mode 100644
index 0000000..f384446
--- /dev/null
+++ b/src/modules/io/reads_io/parser.hpp
@@ -0,0 +1,145 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+/**
+* @file parser.hpp
+* @author Mariya Fomkina
+* @version 1.0
+*
+* @section LICENSE
+*
+* This program is free software; you can redistribute it and/or
+* modify it under the terms of the GNU General Public License as
+* published by the Free Software Foundation; either version 2 of
+* the License, or (at your option) any later version.
+*
+* @section DESCRIPTION
+*
+* Parser is the parent class for all streams that read data from
+* different file types (fastq, fasta, sam etc).
+*/
+
+#ifndef COMMON_IO_PARSER_HPP
+#define COMMON_IO_PARSER_HPP
+
+#include <string>
+#include "io/reads/single_read.hpp"
+
+namespace io {
+
+class Parser {
+public:
+ /*
+ * Default constructor.
+ *
+ * @param filename The name of the file to be opened.
+ * @param offset The offset of the read quality.
+ */
+ Parser(const std::string &filename,
+ OffsetType offset_type = PhredOffset)
+ : filename_(filename), offset_type_(offset_type),
+ is_open_(false), eof_(true) { }
+
+ /*
+ * Default destructor.
+ */
+ virtual ~Parser() { }
+
+ /*
+ * Check whether the stream is opened.
+ *
+ * @return true if the stream is opened and false otherwise.
+ */
+ virtual bool is_open() const {
+ return is_open_;
+ }
+
+ /*
+ * Check whether we've reached the end of stream.
+ *
+ * @return true if the end of stream is reached and false
+ * otherwise.
+ */
+ virtual bool eof() const {
+ return eof_;
+ }
+
+ /*
+ * Read SingleRead from stream.
+ *
+ * @param read The SingleRead that will store read data.
+ *
+ * @return Reference to this stream.
+ */
+ virtual Parser &operator>>(SingleRead &read) = 0;
+
+ /*
+ * Close the stream.
+ */
+ virtual void close() = 0;
+
+ /*
+ * Close the stream and open it again.
+ */
+ void reset() {
+ close();
+ open();
+ }
+
+protected:
+ /*
+ * @variable The name of the file the stream reads from.
+ */
+ std::string filename_;
+ /*
+ * @variable Quality offset type.
+ */
+ OffsetType offset_type_;
+ /*
+ * @variable Flag that shows whether the stream is opened.
+ */
+ bool is_open_;
+ /*
+ * @variable Flag that shows whether the end of the stream is
+ * reached.
+ */
+ bool eof_;
+
+private:
+ /*
+ * Open a stream.
+ */
+ virtual void open() = 0;
+};
+
+/*
+* Get extension from filename.
+*
+* @param filename The name of the file to read from.
+*
+* @return File extension (e.g. "fastq", "fastq.gz").
+*/
+std::string GetExtension(const std::string &filename);
+
+/*
+* Select parser type according to file extension.
+*
+* @param filename The name of the file to be opened.
+* @param offset The offset of the read quality.
+*
+* @return Pointer to the new parser object with the given filename and
+* offset.
+*/
+Parser *SelectParser(const std::string &filename,
+ OffsetType offset_type = PhredOffset);
+
+//todo delete???
+void first_fun(int);
+
+}
+
+#endif /* COMMON_IO_PARSER_HPP */
diff --git a/src/modules/io/reads_io/rc_reader_wrapper.hpp b/src/modules/io/reads_io/rc_reader_wrapper.hpp
new file mode 100644
index 0000000..2320648
--- /dev/null
+++ b/src/modules/io/reads_io/rc_reader_wrapper.hpp
@@ -0,0 +1,137 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include <boost/noncopyable.hpp>
+
+#include "read_stream_vector.hpp"
+#include "delegating_reader_wrapper.hpp"
+#include "orientation.hpp"
+
+namespace io {
+
+/**
+ * RCWrapper is the wrapper class that yields reads and their reverse
+ * complements from the given reader (one by one).
+ */
+template<typename ReadType>
+class RCWrapper: public DelegatingWrapper<ReadType> {
+ typedef DelegatingWrapper<ReadType> base;
+public:
+ explicit RCWrapper(typename base::ReadStreamPtrT reader) :
+ base(reader), rc_read_(), was_rc_(true) {
+ }
+
+ /* virtual */
+ bool eof() {
+ return was_rc_ && base::eof();
+ }
+
+ /* virtual */
+ RCWrapper& operator>>(ReadType& read) {
+ if (was_rc_) {
+ base::operator >>(read);
+ rc_read_ = read;
+ } else {
+ read = !rc_read_;
+ }
+ was_rc_ = !was_rc_;
+ return (*this);
+ }
+
+ /* virtual */
+ void reset() {
+ was_rc_ = true;
+ base::reset();
+ }
+
+ /* virtual */
+ ReadStreamStat get_stat() const {
+ ReadStreamStat stat = base::get_stat();
+ stat.merge(stat);
+ return stat;
+ }
+
+private:
+ ReadType rc_read_;
+ bool was_rc_;
+};
+
+template<class ReadType>
+std::shared_ptr<ReadStream<ReadType>> RCWrap(std::shared_ptr<ReadStream<ReadType>> reader_ptr) {
+ return std::make_shared<RCWrapper<ReadType>>(reader_ptr);
+}
+
+template<class ReadType>
+ReadStreamList<ReadType> RCWrap(ReadStreamList<ReadType>& readers) {
+ ReadStreamList<ReadType> answer;
+ for (size_t i = 0; i < readers.size(); ++i) {
+ answer.push_back(RCWrap<ReadType>(readers.ptr_at(i)));
+ }
+ return answer;
+}
+
+template<typename ReadType>
+class OrientationChangingWrapper: public DelegatingWrapper<ReadType> {
+ typedef DelegatingWrapper<ReadType> base;
+ typedef std::unique_ptr<OrientationChanger<ReadType>> ChangerPtrT;
+public:
+
+ OrientationChangingWrapper(typename base::ReadStreamPtrT reader,
+ LibraryOrientation orientation) :
+ base(reader), changer_(GetOrientationChanger<ReadType>(orientation)) {
+ }
+
+ /*virtual*/
+ OrientationChangingWrapper& operator>>(ReadType& read) {
+ base::operator >>(read);
+ read = changer_->Perform(read);
+ return (*this);
+ }
+
+private:
+ ChangerPtrT changer_;
+ bool delete_reader_;
+};
+
+template<typename ReadType>
+class RCRemovingWrapper: public DelegatingWrapper<ReadType> {
+ typedef DelegatingWrapper<ReadType> base;
+public:
+
+ explicit RCRemovingWrapper(typename base::ReadStreamPtrT reader) : base(reader) {
+ }
+
+ /*virtual*/
+ RCRemovingWrapper& operator>>(ReadType& read) {
+ base::operator>>(read);
+
+ VERIFY(!this->eof());
+ ReadType skip;
+ base::operator>>(skip);
+
+ return *this;
+ }
+
+};
+
+template<class ReadType>
+std::shared_ptr<ReadStream<ReadType>> UnRCWrap(std::shared_ptr<ReadStream<ReadType>> reader_ptr) {
+ return std::make_shared<RCRemovingWrapper<ReadType>>(reader_ptr);
+}
+
+template<class ReadType>
+ReadStreamList<ReadType> UnRCWrap(ReadStreamList<ReadType>& readers) {
+ ReadStreamList<ReadType> answer;
+ for (size_t i = 0; i < readers.size(); ++i) {
+ answer.push_back(UnRCWrap<ReadType>(readers.ptr_at(i)));
+ }
+ return answer;
+}
+
+}
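
A sketch of the alternation produced by RCWrap above; the draining function is illustrative, not part of this commit:

    #include "io/reads_io/rc_reader_wrapper.hpp"
    #include "io/reads/single_read.hpp"
    #include <memory>

    void drain_with_rc(std::shared_ptr<io::ReadStream<io::SingleRead>> raw) {
        auto stream = io::RCWrap<io::SingleRead>(raw);
        io::SingleRead r;
        while (!stream->eof()) {
            *stream >> r;  // odd reads are originals, even reads their reverse complements
        }
    }
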
diff --git a/src/modules/io/reads_io/read_processor.hpp b/src/modules/io/reads_io/read_processor.hpp
new file mode 100644
index 0000000..2648852
--- /dev/null
+++ b/src/modules/io/reads_io/read_processor.hpp
@@ -0,0 +1,201 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#ifndef __HAMMER_READ_PROCESSOR_HPP__
+#define __HAMMER_READ_PROCESSOR_HPP__
+
+#include "io/reads_io/mpmc_bounded.hpp"
+
+#include "dev_support/openmp_wrapper.h"
+
+#pragma GCC diagnostic push
+#ifdef __clang__
+#pragma clang diagnostic ignored "-Wunused-private-field"
+#endif
+namespace hammer {
+class ReadProcessor {
+ static size_t const cacheline_size = 64;
+ typedef char cacheline_pad_t[cacheline_size];
+
+ unsigned nthreads_;
+ cacheline_pad_t pad0;
+ size_t read_;
+ cacheline_pad_t pad1;
+ size_t processed_;
+ cacheline_pad_t pad2;
+
+private:
+ template<class Reader, class Op>
+ bool RunSingle(Reader &irs, Op &op) {
+ while (!irs.eof()) {
+ typename Reader::ReadT r;
+ irs >> r;
+ read_ += 1;
+
+ processed_ += 1;
+ if (op(r))
+ return true;
+ }
+
+ return false;
+ }
+
+ template<class Reader, class Op, class Writer>
+ void RunSingle(Reader &irs, Op &op, Writer &writer) {
+ while (!irs.eof()) {
+ typename Reader::ReadT r;
+ irs >> r;
+ read_ += 1;
+
+ auto res = op(r);
+ processed_ += 1;
+
+ if (res)
+ writer << *res;
+ }
+ }
+
+public:
+ ReadProcessor(unsigned nthreads)
+ : nthreads_(nthreads), read_(0), processed_(0) { }
+
+ size_t read() const { return read_; }
+
+ size_t processed() const { return processed_; }
+
+ template<class Reader, class Op>
+ bool Run(Reader &irs, Op &op) {
+ if (nthreads_ < 2)
+ return RunSingle(irs, op);
+
+ // Round nthreads to next power of two
+ unsigned bufsize = nthreads_ - 1;
+ bufsize = (bufsize >> 1) | bufsize;
+ bufsize = (bufsize >> 2) | bufsize;
+ bufsize = (bufsize >> 4) | bufsize;
+ bufsize = (bufsize >> 8) | bufsize;
+ bufsize = (bufsize >> 16) | bufsize;
+ bufsize += 1;
+
+ mpmc_bounded_queue<typename Reader::ReadT> in_queue(2 * bufsize);
+
+ bool stop = false;
+# pragma omp parallel shared(in_queue, irs, op, stop) num_threads(nthreads_)
+ {
+# pragma omp master
+ {
+ while (!irs.eof()) {
+ typename Reader::ReadT r;
+ irs >> r;
+# pragma omp atomic
+ read_ += 1;
+
+ while (!in_queue.enqueue(r))
+ sched_yield();
+
+# pragma omp flush (stop)
+ if (stop)
+ break;
+ }
+
+ in_queue.close();
+ }
+
+ while (1) {
+ typename Reader::ReadT r;
+
+ if (!in_queue.wait_dequeue(r))
+ break;
+
+# pragma omp atomic
+ processed_ += 1;
+
+ bool res = op(r);
+ if (res) {
+# pragma omp atomic
+ stop |= res;
+ }
+ }
+ }
+
+# pragma omp flush(stop)
+ return stop;
+ }
+
+ template<class Reader, class Op, class Writer>
+ void Run(Reader &irs, Op &op, Writer &writer) {
+ if (nthreads_ < 2) {
+ RunSingle(irs, op, writer);
+ return;
+ }
+
+ // Round nthreads to next power of two
+ unsigned bufsize = nthreads_ - 1;
+ bufsize = (bufsize >> 1) | bufsize;
+ bufsize = (bufsize >> 2) | bufsize;
+ bufsize = (bufsize >> 4) | bufsize;
+ bufsize = (bufsize >> 8) | bufsize;
+ bufsize = (bufsize >> 16) | bufsize;
+ bufsize += 1;
+
+ mpmc_bounded_queue<typename Reader::ReadT> in_queue(bufsize), out_queue(2 * bufsize);
+# pragma omp parallel shared(in_queue, out_queue, irs, op, writer) num_threads(nthreads_)
+ {
+# pragma omp master
+ {
+ while (!irs.eof()) {
+ typename Reader::ReadT r;
+ irs >> r;
+
+ // First, try to provide read to the queue. If it's full, never mind.
+ bool status = in_queue.enqueue(r);
+
+ // Flush down the output queue
+ typename Reader::ReadT outr;
+ while (out_queue.dequeue(outr))
+ writer << outr;
+
+ // If the input queue was originally full, wait until we can insert
+ // the read once again.
+ if (!status)
+ while (!in_queue.enqueue(r))
+ sched_yield();
+ }
+
+ in_queue.close();
+
+ // Flush down the output queue while in master threads.
+ typename Reader::ReadT outr;
+ while (out_queue.dequeue(outr))
+ writer << outr;
+ }
+
+ while (1) {
+ typename Reader::ReadT r;
+
+ if (!in_queue.wait_dequeue(r))
+ break;
+
+ auto res = op(r);
+ if (res)
+ while (!out_queue.enqueue(*res))
+ sched_yield();
+ }
+ }
+
+ // Flush down the output queue
+ typename Reader::ReadT outr;
+ while (out_queue.dequeue(outr))
+ writer << outr;
+ }
+};
+
+#pragma GCC diagnostic pop
+
+}
+
+#endif // __HAMMER_READ_PROCESSOR_HPP__
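
A sketch of running an operation over a stream with ReadProcessor above; the predicate, the length threshold and the counter are illustrative, and the read type is assumed to expose size():

    #include "io/reads_io/read_processor.hpp"
    #include <atomic>

    template<class Stream>
    size_t count_long_reads(Stream& stream, unsigned nthreads) {
        std::atomic<size_t> long_reads(0);
        auto op = [&long_reads](const typename Stream::ReadT& r) {
            if (r.size() > 100)
                ++long_reads;
            return false;  // returning true would stop processing early
        };
        hammer::ReadProcessor rp(nthreads);
        rp.Run(stream, op);
        return long_reads;
    }
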
diff --git a/src/modules/io/reads_io/read_stream_vector.hpp b/src/modules/io/reads_io/read_stream_vector.hpp
new file mode 100644
index 0000000..632e8db
--- /dev/null
+++ b/src/modules/io/reads_io/read_stream_vector.hpp
@@ -0,0 +1,183 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "ireader.hpp"
+#include <vector>
+
+namespace io {
+//todo rename file
+
+//todo check destroy_readers logic and usages
+template<class ReadType>
+class ReadStreamList {
+public:
+ typedef ReadType ReadT;
+ typedef ReadStream<ReadType> ReaderT;
+ typedef std::shared_ptr<ReaderT> ReaderPtrT;
+
+private:
+ std::vector<ReaderPtrT> readers_;
+
+public:
+
+ explicit ReadStreamList(const std::vector<ReaderPtrT> &readers) : readers_(readers) {
+ }
+
+ ReadStreamList() {
+ }
+
+ explicit ReadStreamList(ReaderT *reader_ptr) : readers_(1, ReaderPtrT(reader_ptr)) {
+ }
+
+ explicit ReadStreamList(ReaderPtrT reader_ptr) : readers_(1, reader_ptr) {
+ }
+
+ explicit ReadStreamList(size_t size) : readers_(size) {
+ }
+
+// std::vector<Reader*>& get() {
+// destroy_readers_ = false;
+// return streams_;
+// }
+
+ //todo use boost iterator facade
+ class iterator : public std::iterator<std::input_iterator_tag, ReaderT> {
+ typedef typename std::vector<ReaderPtrT>::iterator vec_it;
+ vec_it it_;
+ public:
+
+ iterator(vec_it it) : it_(it) {
+ }
+
+ void operator++() {
+ ++it_;
+ }
+
+ bool operator==(const iterator &that) {
+ return it_ == that.it_;
+ }
+
+ bool operator!=(const iterator &that) {
+ return it_ != that.it_;
+ }
+
+ ReaderT &operator*() {
+ return *(*it_);
+ }
+ };
+
+// class const_iterator: public std::iterator<std::input_iterator_tag, Reader> {
+// typedef typename std::vector<Reader*>::iterator vec_it;
+// vec_it it_;
+// public:
+//
+// const_iterator(vec_it it) : it_(it) {
+// }
+//
+// void operator++ () {
+// ++it_;
+// }
+//
+// bool operator== (const const_iterator& that) {
+// return it_ == that.it_;
+// }
+//
+// bool operator!= (const const_iterator& that) {
+// return it_ != that.it_;
+// }
+//
+// ReaderT& operator*() {
+// return *(*it_);
+// }
+// };
+
+ ReaderT &operator[](size_t i) {
+ return *readers_.at(i);
+ }
+
+ ReaderPtrT &ptr_at(size_t i) {
+ return readers_.at(i);
+ }
+
+ ReaderT &back() {
+ return *readers_.back();
+ }
+
+ size_t size() const {
+ return readers_.size();
+ }
+
+ bool eof() const {
+ for (size_t i = 0; i < readers_.size(); ++i) {
+ if (!readers_[i]->eof()) {
+ return false;
+ }
+ }
+ return true;
+ }
+
+ iterator begin() {
+ return iterator(readers_.begin());
+ }
+
+ iterator end() {
+ return iterator(readers_.end());
+ }
+
+// const_iterator begin() const {
+// return iterator(streams_.begin());
+// }
+//
+// const_iterator end() const {
+// return iterator(streams_.end());
+// }
+
+ void push_back(ReaderT *reader_ptr) {
+ readers_.push_back(ReaderPtrT(reader_ptr));
+ }
+
+ void push_back(ReaderPtrT reader_ptr) {
+ readers_.push_back(reader_ptr);
+ }
+
+ void reset() {
+ for (size_t i = 0; i < readers_.size(); ++i) {
+ readers_[i]->reset();
+ }
+ }
+
+ void close() {
+ for (size_t i = 0; i < readers_.size(); ++i) {
+ readers_[i]->close();
+ }
+ }
+
+ void clear() {
+ readers_.clear();
+ }
+
+ ReadStreamStat get_stat() const {
+ ReadStreamStat stat;
+ for (size_t i = 0; i < readers_.size(); ++i) {
+ stat.merge(readers_[i]->get_stat());
+ }
+ return stat;
+ }
+
+// void release() {
+// destroy_readers_ = false;
+// }
+
+// const std::vector< Reader * >& get() const {
+// return streams_;
+// }
+
+};
+
+}
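
A minimal usage sketch for the ReadStreamList container above (the function name and reads are illustrative; VectorReadStream is defined in vector_reader.hpp later in this patch):

    #include "io/reads_io/read_stream_vector.hpp"
    #include "io/reads_io/vector_reader.hpp"
    #include "io/reads/single_read.hpp"

    void DrainAllStreams() {
        io::ReadStreamList<io::SingleRead> streams;
        // push_back() wraps raw pointers into shared_ptr and takes ownership
        streams.push_back(new io::VectorReadStream<io::SingleRead>(io::SingleRead("r1", "ACGT")));
        streams.push_back(new io::VectorReadStream<io::SingleRead>(io::SingleRead("r2", "TTGCA")));
        for (auto &stream : streams) {          // uses the iterator defined above
            io::SingleRead read;
            while (!stream.eof()) {
                stream >> read;
                // ... process read
            }
        }
        streams.reset();                        // rewind every wrapped stream
    }
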
diff --git a/src/modules/io/reads_io/sequence_reader.hpp b/src/modules/io/reads_io/sequence_reader.hpp
new file mode 100644
index 0000000..515cc9e
--- /dev/null
+++ b/src/modules/io/reads_io/sequence_reader.hpp
@@ -0,0 +1,77 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "io/reads_io/ireader.hpp"
+#include "io/reads/single_read.hpp"
+
+namespace io {
+
+//todo merge with VectorReader
+template<class ReadType>
+class SequenceReadStream : public ReadStream<ReadType> {
+public:
+ explicit SequenceReadStream(const Sequence &sequence, const std::string &name = "")
+ : sequence_(sequence),
+ name_(name),
+ opened_(true),
+ eof_(false) {
+ }
+
+ virtual ~SequenceReadStream() {
+ }
+
+ virtual bool is_open() {
+ return opened_;
+ }
+
+ virtual bool eof() {
+ return eof_;
+ }
+
+ virtual void close() {
+ opened_ = false;
+ }
+
+ void reset() {
+ eof_ = false;
+ opened_ = true;
+ }
+
+ ReadStreamStat get_stat() const {
+ return ReadStreamStat();
+ }
+
+ SequenceReadStream &operator>>(ReadType &read);
+
+private:
+ Sequence sequence_;
+ std::string name_;
+ bool opened_;
+ bool eof_;
+};
+
+template<>
+inline SequenceReadStream<SingleRead> &SequenceReadStream<SingleRead>::operator>>(SingleRead &read) {
+ if (!eof_) {
+ read = SingleRead(name_, sequence_.str());
+ eof_ = true;
+ }
+ return *this;
+}
+
+template<>
+inline SequenceReadStream<SingleReadSeq> &SequenceReadStream<SingleReadSeq>::operator>>(SingleReadSeq &read) {
+ if (!eof_) {
+ read = SingleReadSeq(sequence_);
+ eof_ = true;
+ }
+ return *this;
+}
+
+}
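
A sketch of feeding one in-memory sequence to code that expects a read stream; it assumes Sequence can be constructed from a nucleotide string, as is done elsewhere in the tree (the function name is illustrative):

    #include "io/reads_io/sequence_reader.hpp"

    void EmitOneRead() {
        // assumption: Sequence(const std::string&) exists
        io::SequenceReadStream<io::SingleRead> stream(Sequence("ACGTTGCA"), "contig_1");
        io::SingleRead read;
        stream >> read;    // yields exactly one read named "contig_1"
        // stream.eof() is now true; reset() makes the sequence available again
    }
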
diff --git a/src/modules/io/reads_io/splitting_wrapper.hpp b/src/modules/io/reads_io/splitting_wrapper.hpp
new file mode 100644
index 0000000..026dff2
--- /dev/null
+++ b/src/modules/io/reads_io/splitting_wrapper.hpp
@@ -0,0 +1,75 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+#include "io/reads/single_read.hpp"
+#include "delegating_reader_wrapper.hpp"
+
+namespace io {
+
+class SplittingWrapper: public DelegatingWrapper<SingleRead> {
+ typedef DelegatingWrapper<SingleRead> base;
+private:
+ std::vector<SingleRead> buffer_;
+ size_t buffer_position_;
+
+ void FillBuffer(SingleRead& tmp_read) {
+ buffer_.clear();
+ for(size_t i = 0; i < tmp_read.size(); i++) {
+ size_t j = i;
+ while(j < tmp_read.size() && is_nucl(tmp_read.GetSequenceString()[j])) {
+ j++;
+ }
+ if(j > i) {
+ buffer_.push_back(tmp_read.Substr(i, j));
+ i = j - 1;
+ }
+ }
+ buffer_position_ = 0;
+ }
+
+ bool Skip() {
+ while(!this->reader().eof() && buffer_position_ == buffer_.size()) {
+ SingleRead tmp_read;
+ this->reader() >> tmp_read;
+ FillBuffer(tmp_read);
+ }
+ return buffer_position_ != buffer_.size();
+ }
+
+public:
+
+ explicit SplittingWrapper(base::ReadStreamPtrT reader) :
+ base(reader), buffer_position_(0) {
+ }
+
+ /* virtual */
+ SplittingWrapper& operator>>(SingleRead& read) {
+ Skip();
+ read = buffer_[buffer_position_];
+ buffer_position_++;
+ return *this;
+ }
+
+ // TODO: fix needed; it seems eof() cannot be called multiple times in a row.
+ /* virtual */ bool eof() {
+ return !Skip();
+ }
+};
+
+inline std::shared_ptr<ReadStream<SingleRead>> SplittingWrap(std::shared_ptr<ReadStream<SingleRead>> reader_ptr) {
+ return std::make_shared<SplittingWrapper>(reader_ptr);
+}
+
+inline ReadStreamList<SingleRead> SplittingWrap(ReadStreamList<SingleRead>& readers) {
+ ReadStreamList<SingleRead> answer;
+ for (size_t i = 0; i < readers.size(); ++i) {
+ answer.push_back(SplittingWrap(readers.ptr_at(i)));
+ }
+ return answer;
+}
+}
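
A sketch of SplittingWrap on an in-memory stream (the function name and read are illustrative); a read containing non-ACGT characters should come out as its N-free fragments:

    #include "io/reads_io/splitting_wrapper.hpp"
    #include "io/reads_io/vector_reader.hpp"
    #include <memory>

    void SplitOnNs() {
        auto raw = std::make_shared<io::VectorReadStream<io::SingleRead>>(
            io::SingleRead("r", "ACGTNNGGTT"));
        auto split = io::SplittingWrap(raw);
        io::SingleRead part;
        while (!split->eof()) {
            *split >> part;    // expected fragments: "ACGT", then "GGTT"
        }
    }
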
diff --git a/src/modules/io/reads_io/vector_reader.hpp b/src/modules/io/reads_io/vector_reader.hpp
new file mode 100644
index 0000000..9059c6e
--- /dev/null
+++ b/src/modules/io/reads_io/vector_reader.hpp
@@ -0,0 +1,61 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+#include "io/reads_io/ireadstream.hpp"
+namespace io {
+
+/**
+ * Use vector<T> as input-stream with operator>>(T& t)
+ */
+template <typename T>
+class VectorReadStream : public ReadStream<T> {
+ std::vector<T> data_;
+ size_t pos_;
+ bool closed_;
+public:
+ VectorReadStream(const std::vector<T>& data) : data_(data), pos_(0), closed_(false) {
+
+ }
+
+ VectorReadStream(const T& item) : data_({item}), pos_(0), closed_(false) {
+
+ }
+
+ virtual bool eof() /*const */{
+ return pos_ == data_.size();
+ }
+
+ VectorReadStream<T>& operator>>(T& t) {
+ VERIFY(!eof());
+ t = data_[pos_++];
+ return *this;
+ }
+
+ void close() {
+ closed_ = true;
+ }
+
+ virtual bool is_open() /*const */{
+ return !closed_;
+ }
+
+ void reset() {
+ pos_ = 0;
+ }
+
+ ReadStreamStat get_stat() const {
+ //todo
+ ReadStreamStat stat;
+ stat.read_count_ = data_.size();
+
+ return stat;
+ }
+
+};
+
+}
diff --git a/src/modules/io/reads_io/wrapper_collection.hpp b/src/modules/io/reads_io/wrapper_collection.hpp
new file mode 100644
index 0000000..3b243bb
--- /dev/null
+++ b/src/modules/io/reads_io/wrapper_collection.hpp
@@ -0,0 +1,115 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "io/reads/single_read.hpp"
+#include "delegating_reader_wrapper.hpp"
+
+namespace io {
+
+//todo refactor!!!
+class IdSettingReaderWrapper: public DelegatingWrapper<SingleRead> {
+ typedef DelegatingWrapper<SingleRead> base;
+ size_t next_id_;
+public:
+ IdSettingReaderWrapper(base::ReadStreamPtrT reader, size_t start_id = 0) :
+ base(reader), next_id_(start_id) {
+ }
+
+ /* virtual */
+ IdSettingReaderWrapper& operator>>(SingleRead& read) {
+ this->reader() >> read;
+ read.ChangeName(ToString(next_id_++));
+ return *this;
+ }
+};
+
+class PrefixAddingReaderWrapper: public DelegatingWrapper<SingleRead> {
+ typedef DelegatingWrapper<SingleRead> base;
+ std::string prefix_;
+public:
+ PrefixAddingReaderWrapper(base::ReadStreamPtrT reader,
+ const std::string& prefix) :
+ base(reader), prefix_(prefix) {
+ }
+
+ /* virtual */
+ PrefixAddingReaderWrapper& operator>>(SingleRead& read) {
+ this->reader() >> read;
+ read.ChangeName(prefix_ + read.name());
+ return *this;
+ }
+};
+
+//fixme currently leads to long stretches of ACGTACGT...
+class FixingWrapper: public DelegatingWrapper<SingleRead> {
+ typedef DelegatingWrapper<SingleRead> base;
+
+ io::SingleRead MakeValid(const io::SingleRead& read) const {
+ std::string str = read.GetSequenceString();
+ for (size_t i = 0; i < str.length(); ++i) {
+ if (!is_nucl(str[i]))
+ str[i] = nucl(char(i % 4));
+ }
+ return io::SingleRead(read.name(), str);
+ }
+
+public:
+ FixingWrapper(base::ReadStreamPtrT reader) :
+ base(reader) {
+ }
+
+ /* virtual */
+ FixingWrapper& operator>>(SingleRead& read) {
+ this->reader() >> read;
+ if (!read.IsValid()) {
+ TRACE("Read " << read.name() << " was invalid. Fixing");
+ read = MakeValid(read);
+ VERIFY(read.IsValid());
+ }
+ return *this;
+ }
+
+private:
+ DECL_LOGGER("FixingWrapper");
+};
+
+class NonNuclCollapsingWrapper: public DelegatingWrapper<SingleRead> {
+ typedef DelegatingWrapper<SingleRead> base;
+
+ io::SingleRead MakeValid(const io::SingleRead& read) const {
+ std::string str = read.GetSequenceString();
+ std::stringstream ss;
+ for (size_t i = 0; i < read.size(); ++i) {
+ if (is_nucl(str[i]))
+ ss << str[i];
+ }
+ return io::SingleRead(read.name(), ss.str());
+ }
+
+public:
+ NonNuclCollapsingWrapper(base::ReadStreamPtrT reader) :
+ base(reader) {
+ }
+
+ /* virtual */
+ NonNuclCollapsingWrapper& operator>>(SingleRead& read) {
+ this->reader() >> read;
+ if (!read.IsValid()) {
+ TRACE("Read " << read.name() << " was invalid. Collapsing non-nucls");
+ read = MakeValid(read);
+ VERIFY(read.IsValid());
+ }
+ return *this;
+ }
+
+private:
+ DECL_LOGGER("NonNuclCollapsingWrapper");
+};
+
+}
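
A sketch of one of these wrappers in use; PrefixAddingReaderWrapper tags every read name with a library-specific prefix (stream source, prefix and function name are illustrative):

    #include "io/reads_io/wrapper_collection.hpp"
    #include "io/reads_io/vector_reader.hpp"
    #include <memory>

    void TagReads() {
        auto raw = std::make_shared<io::VectorReadStream<io::SingleRead>>(
            io::SingleRead("read1", "ACGT"));
        io::PrefixAddingReaderWrapper tagged(raw, "lib1_");
        io::SingleRead read;
        tagged >> read;    // read.name() is now "lib1_read1"
    }
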
diff --git a/src/modules/io/sam_io/bam_parser.hpp b/src/modules/io/sam_io/bam_parser.hpp
new file mode 100644
index 0000000..3a22c0d
--- /dev/null
+++ b/src/modules/io/sam_io/bam_parser.hpp
@@ -0,0 +1,67 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#ifndef COMMON_IO_BAMPARSER_HPP
+#define COMMON_IO_BAMPARSER_HPP
+
+#include "io/reads/single_read.hpp"
+#include "io/reads_io/parser.hpp"
+#include "data_structures/sequence/quality.hpp"
+#include "data_structures/sequence/nucl.hpp"
+#include "dev_support/verify.hpp"
+
+#include "bamtools/api/BamReader.h"
+
+#include <string>
+
+namespace io {
+
+class BAMParser: public Parser {
+public:
+ BAMParser(const std::string& filename, OffsetType offset_type = PhredOffset)
+ : Parser(filename, offset_type) {
+ open();
+ }
+
+ ~BAMParser() {
+ close();
+ }
+
+ BAMParser& operator>>(SingleRead& read) {
+ if (!is_open_ || eof_)
+ return *this;
+
+ read = SingleRead(seq_.Name, seq_.QueryBases, seq_.Qualities, offset_type_);
+ eof_ = (false == reader_.GetNextAlignment(seq_));
+
+ return *this;
+ }
+
+ void close() {
+ reader_.Close();
+ is_open_ = false;
+ eof_ = true;
+ }
+
+private:
+ BamTools::BamReader reader_;
+ BamTools::BamAlignment seq_;
+
+ void open() {
+ reader_.Open(filename_);
+ is_open_ = true;
+
+ eof_ = (false == reader_.GetNextAlignment(seq_));
+ }
+
+ BAMParser(const BAMParser& parser);
+ void operator=(const BAMParser& parser);
+};
+
+}
+
+#endif /* COMMON_IO_BAMPARSER_HPP */
diff --git a/src/modules/io/sam_io/bam_reader.hpp b/src/modules/io/sam_io/bam_reader.hpp
new file mode 100644
index 0000000..57c2c64
--- /dev/null
+++ b/src/modules/io/sam_io/bam_reader.hpp
@@ -0,0 +1,107 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+//todo rename to reader
+#pragma once
+
+#include "io/reads_io/ireader.hpp"
+#include "io/reads/single_read.hpp"
+
+#include <bamtools/api/BamReader.h>
+
+namespace io {
+class BamRead : public BamTools::BamAlignment {
+public:
+ BamRead() { }
+
+ BamRead(const BamTools::BamAlignment &other)
+ : BamTools::BamAlignment(other) { }
+
+ const std::string &name() const {
+ return Name;
+ }
+
+ size_t size() const {
+ return Length;
+ }
+
+ size_t nucl_count() const {
+ return size();
+ }
+
+ const std::string &GetSequenceString() const {
+ return QueryBases;
+ }
+
+ std::string GetPhredQualityString() const {
+ return Qualities;
+ }
+
+ operator io::SingleRead() {
+ // not including quality is intentional:
+ // during read correction bases might be inserted/deleted,
+ // and base qualities for them are not calculated
+ return io::SingleRead(name(), GetSequenceString());
+ }
+
+ char operator[](size_t i) const {
+ VERIFY(is_nucl(QueryBases[i]));
+ return dignucl(QueryBases[i]);
+ }
+};
+
+class UnmappedBamStream : public ReadStream<BamRead> {
+public:
+ UnmappedBamStream(const std::string &filename)
+ : filename_(filename) {
+ open();
+ }
+
+ virtual ~UnmappedBamStream() { }
+
+ bool is_open() { return is_open_; }
+
+ bool eof() { return eof_; }
+
+ UnmappedBamStream &operator>>(BamRead &read) {
+ if (!is_open_ || eof_)
+ return *this;
+
+ read = seq_;
+ eof_ = (false == reader_.GetNextAlignment(seq_));
+
+ return *this;
+ }
+
+ void close() {
+ reader_.Close();
+ is_open_ = false;
+ eof_ = true;
+ }
+
+ void reset() {
+ close();
+ open();
+ }
+
+ ReadStreamStat get_stat() const { return ReadStreamStat(); }
+
+private:
+ BamTools::BamReader reader_;
+ BamTools::BamAlignment seq_;
+ std::string filename_;
+ bool is_open_;
+ bool eof_;
+
+ void open() {
+ reader_.Open(filename_);
+ is_open_ = true;
+
+ eof_ = (false == reader_.GetNextAlignment(seq_));
+ }
+
+};
+}
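
A sketch of streaming a BAM file through UnmappedBamStream and converting each record to a SingleRead via the conversion operator above (the path and function name are placeholders):

    #include "io/sam_io/bam_reader.hpp"
    #include <string>

    void DumpBam(const std::string &bam_path) {
        io::UnmappedBamStream stream(bam_path);
        while (!stream.eof()) {
            io::BamRead record;
            stream >> record;
            io::SingleRead read = record;    // name + bases; qualities are dropped on purpose
            // ... hand read to downstream processing
        }
    }
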
diff --git a/src/modules/io/sam_io/read.cpp b/src/modules/io/sam_io/read.cpp
new file mode 100644
index 0000000..dc9a0e0
--- /dev/null
+++ b/src/modules/io/sam_io/read.cpp
@@ -0,0 +1,42 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#include <io/sam_io/read.hpp>
+
+using namespace std;
+
+namespace sam_reader {
+
+string SingleSamRead::cigar() const {
+ uint32_t *cigar = bam1_cigar(data_);
+ string res;
+ res.reserve(data_->core.n_cigar);
+ for (size_t k = 0; k < data_->core.n_cigar; ++k) {
+ res += std::to_string(bam_cigar_oplen(cigar[k]));
+ res += bam_cigar_opchr(cigar[k]);
+
+ }
+ return res;
+}
+
+string SingleSamRead::name() const {
+ string res(bam1_qname(data_));
+ return res;
+}
+
+string SingleSamRead::seq() const {
+ string res = "";
+ auto b = bam1_seq(data_);
+ for (int k = 0; k < data_->core.l_qseq; ++k) {
+ res += bam_nt16_rev_table[bam1_seqi(b, k)];
+ }
+ return res;
+}
+
+
+}
+;
diff --git a/src/include/io/sam/read.hpp b/src/modules/io/sam_io/read.hpp
similarity index 100%
rename from src/include/io/sam/read.hpp
rename to src/modules/io/sam_io/read.hpp
diff --git a/src/modules/io/sam_io/sam_reader.cpp b/src/modules/io/sam_io/sam_reader.cpp
new file mode 100644
index 0000000..5d338fa
--- /dev/null
+++ b/src/modules/io/sam_io/sam_reader.cpp
@@ -0,0 +1,75 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#include <io/sam_io/read.hpp>
+#include <io/sam_io/sam_reader.hpp>
+
+using namespace std;
+
+namespace sam_reader {
+
+bool MappedSamStream::eof() const {
+ return eof_;
+}
+
+bool MappedSamStream::is_open() const {
+ return is_open_;
+}
+
+MappedSamStream& MappedSamStream::operator>>(SingleSamRead& read) {
+ if (!is_open_ || eof_)
+ return *this;
+ read.set_data(seq_);
+ int tmp = samread(reader_, seq_);
+ eof_ = (0 >= tmp);
+ return *this;
+}
+
+MappedSamStream& MappedSamStream::operator >>(PairedSamRead& read) {
+ TRACE("starting process paired read");
+ SingleSamRead r1;
+ MappedSamStream::operator >>(r1);
+ SingleSamRead r2;
+ MappedSamStream::operator >>(r2);
+
+ read = PairedSamRead(r1, r2);
+ TRACE(r1.seq());
+ TRACE(r2.seq());
+ TRACE(r1.name());
+ return *this;
+}
+
+const char* MappedSamStream::get_contig_name(int i) const {
+ VERIFY(i < reader_->header->n_targets);
+ return (reader_->header->target_name[i]);
+}
+
+void MappedSamStream::close() {
+ samclose(reader_);
+ is_open_ = false;
+ eof_ = true;
+ bam_destroy1(seq_);
+}
+
+void MappedSamStream::reset() {
+ close();
+ open();
+}
+
+void MappedSamStream::open() {
+ if ((reader_ = samopen(filename_.c_str(), "r", NULL)) == NULL) {
+ WARN("Fail to open SAM file " << filename_);
+ is_open_ = false;
+ eof_ = true;
+ } else {
+ is_open_ = true;
+ int tmp = samread(reader_, seq_);
+ eof_ = (0 >= tmp);
+ }
+}
+
+}
diff --git a/src/modules/io/sam_io/sam_reader.hpp b/src/modules/io/sam_io/sam_reader.hpp
new file mode 100644
index 0000000..55dc297
--- /dev/null
+++ b/src/modules/io/sam_io/sam_reader.hpp
@@ -0,0 +1,49 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+#pragma once
+
+#include "read.hpp"
+
+#include "dev_support/logger/log_writers.hpp"
+
+#include <samtools/sam.h>
+#include <samtools/bam.h>
+
+#include <string>
+
+namespace sam_reader {
+
+class MappedSamStream {
+public:
+ MappedSamStream(const std::string &filename)
+ : filename_(filename) {
+ open();
+ }
+
+ virtual ~MappedSamStream() {
+ }
+
+ bool is_open() const;
+ bool eof() const;
+ MappedSamStream& operator >>(SingleSamRead& read);
+ MappedSamStream& operator >>(PairedSamRead& read);
+ const char* get_contig_name(int i) const;
+ void close();
+ void reset();
+
+private:
+ samfile_t *reader_;
+ bam1_t *seq_ = bam_init1();
+ std::string filename_;
+ bool is_open_;
+ bool eof_;
+
+ void open();
+};
+
+}
+;
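
A sketch of iterating alignments with MappedSamStream; SingleSamRead is default-constructed here exactly as in the operator>>(PairedSamRead&) overload above (the path and function name are placeholders):

    #include "io/sam_io/sam_reader.hpp"
    #include <string>

    void WalkAlignments(const std::string &sam_path) {
        sam_reader::MappedSamStream stream(sam_path);
        while (!stream.eof()) {
            sam_reader::SingleSamRead r;
            stream >> r;
            // r.name(), r.seq() and r.cigar() are available (see read.cpp above)
        }
    }
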
diff --git a/src/modules/math/CMakeLists.txt b/src/modules/math/CMakeLists.txt
new file mode 100644
index 0000000..28cb6c6
--- /dev/null
+++ b/src/modules/math/CMakeLists.txt
@@ -0,0 +1,14 @@
+############################################################################
+# Copyright (c) 2015 Saint Petersburg State University
+# Copyright (c) 2011-2014 Saint Petersburg Academic University
+# All Rights Reserved
+# See file LICENSE for details.
+############################################################################
+
+project(math_module CXX)
+
+add_library(math_module STATIC
+ kmer_coverage_model.cpp)
+
+target_link_libraries(math_module nlopt)
+
diff --git a/src/modules/math/kmer_coverage_model.cpp b/src/modules/math/kmer_coverage_model.cpp
new file mode 100644
index 0000000..c957546
--- /dev/null
+++ b/src/modules/math/kmer_coverage_model.cpp
@@ -0,0 +1,394 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#include "kmer_coverage_model.hpp"
+
+#include "math/xmath.h"
+#include "dev_support/logger/logger.hpp"
+#include "math/smooth.hpp"
+#include "dev_support/verify.hpp"
+
+#include <boost/math/special_functions/zeta.hpp>
+#include <boost/math/distributions/normal.hpp>
+#include <boost/math/distributions/skew_normal.hpp>
+#include <boost/math/distributions/geometric.hpp>
+#include <boost/math/distributions/pareto.hpp>
+
+#include <nlopt/nlopt.hpp>
+
+#include <vector>
+
+#include <cstring>
+#include <cstdint>
+#include <cstddef>
+#include <cmath>
+
+namespace cov_model {
+using std::isfinite;
+
+static const size_t MaxCopy = 10;
+
+static double dzeta(double x, double p) {
+ return pow(x, -p - 1) / boost::math::zeta(p + 1);
+}
+
+static double perr(size_t i, double scale, double shape) {
+ return pow((1 + shape * ((double) (i - 1)) / scale), -1.0 / shape) -
+ pow((1 + shape * ((double) i) / scale), -1.0 / shape);
+}
+
+static double pgood(size_t i, double zp, double u, double sd, double shape,
+ double *mixprobs = NULL) {
+ double res = 0;
+
+ for (unsigned copy = 0; copy < MaxCopy; ++copy) {
+ boost::math::skew_normal snormal((copy + 1) * u, sd * sqrt(copy + 1), shape);
+ // res += (mixprobs ? mixprobs[copy] : dzeta(copy + 1, zp)) * (boost::math::cdf(snormal, i + 1) - boost::math::cdf(snormal, i));
+ res += (mixprobs ? mixprobs[copy] : dzeta(copy + 1, zp)) * boost::math::pdf(snormal, i);
+ }
+
+ return res;
+}
+
+class CovModelLogLike {
+ const std::vector <size_t> &cov;
+
+public:
+ CovModelLogLike(const std::vector <size_t> &cov)
+ : cov(cov) { }
+
+ int getN() const { return 7; };
+
+private:
+
+ double eval_(const double *x) const {
+ double zp = x[0], p = x[1], shape = x[2], u = x[3], sd = x[4], scale = x[5], shape2 = x[6];
+
+ if (zp <= 1 || shape <= 0 || sd <= 0 || p < 1e-9 || p > 1 - 1e-9 || u <= 0 || scale <= 0 ||
+ !isfinite(zp) || !isfinite(shape) || !isfinite(sd) || !isfinite(p) || !isfinite(u) ||
+ !isfinite(scale) || !isfinite(shape2))
+ return +std::numeric_limits<double>::infinity();
+
+ std::vector <double> kmer_probs(cov.size());
+
+ // Error
+ for (size_t i = 0; i < kmer_probs.size(); ++i)
+ kmer_probs[i] += p * perr(i + 1, scale, shape);
+
+ // Good
+ for (size_t i = 0; i < kmer_probs.size(); ++i)
+ kmer_probs[i] += (1 - p) * pgood(i + 1, zp, u, sd, shape2);
+
+ double res = 0;
+ for (size_t i = 0; i < kmer_probs.size(); ++i)
+ res += (double) (cov[i]) * log(kmer_probs[i]);
+
+ return -res;
+ }
+};
+
+struct CovModelLogLikeEMData {
+ const std::vector <size_t> &cov;
+ const std::vector <double> &z;
+};
+
+static double CovModelLogLikeEM(unsigned, const double *x, double *, void *data) {
+ double zp = x[0], shape = x[1], u = x[2], sd = x[3], scale = x[4], shape2 = x[5];
+
+ // INFO("Entry: " << x[0] << " " << x[1] << " " << x[2] << " " << x[3] << " " << x[4]);
+
+ if (zp <= 1 || shape <= 0 || sd <= 0 || u <= 0 || scale <= 0 ||
+ !isfinite(zp) || !isfinite(shape) || !isfinite(sd) || !isfinite(u) ||
+ !isfinite(scale) || !isfinite(shape2))
+ return -std::numeric_limits<double>::infinity();
+
+ const std::vector <size_t> &cov = static_cast<CovModelLogLikeEMData *>(data)->cov;
+ const std::vector <double> &z = static_cast<CovModelLogLikeEMData *>(data)->z;
+
+ std::vector <double> kmer_probs(cov.size(), 0);
+
+ // Error
+ for (size_t i = 0; i < kmer_probs.size(); ++i) {
+ if (cov[i] == 0)
+ continue;
+
+ kmer_probs[i] += z[i] * log(perr(i + 1, scale, shape));
+ }
+
+ // Good
+ // Pre-compute mixing probabilities
+ std::vector <double> mixprobs(MaxCopy, 0);
+ for (unsigned copy = 0; copy < MaxCopy; ++copy)
+ mixprobs[copy] = dzeta(copy + 1, zp);
+
+ // Compute the density
+ for (size_t i = 0; i < kmer_probs.size(); ++i) {
+ if (cov[i] == 0)
+ continue;
+
+ double val = log(pgood(i + 1, zp, u, sd, shape2, &mixprobs[0]));
+ if (!isfinite(val))
+ val = -1000.0;
+ kmer_probs[i] += (1 - z[i]) * val;
+ }
+
+ double res = 0;
+ for (size_t i = 0; i < kmer_probs.size(); ++i)
+ res += (double) (cov[i]) * kmer_probs[i];
+
+ // INFO("f: " << res);
+ return res;
+}
+
+
+static std::vector <double> EStep(const std::vector <double> &x,
+ double p, size_t N) {
+ double zp = x[0], shape = x[1], u = x[2], sd = x[3], scale = x[4], shape2 = x[5];
+
+ std::vector <double> res(N);
+ for (size_t i = 0; i < N; ++i) {
+ double pe = p * perr(i + 1, scale, shape);
+ res[i] = pe / (pe + (1 - p) * pgood(i + 1, zp, u, sd, shape2));
+ if (!isfinite(res[i]))
+ res[i] = 1.0;
+ }
+
+ return res;
+}
+
+// Estimate the position of the first valley
+// of the smoothed k-mer coverage histogram.
+size_t KMerCoverageModel::EstimateValley() const {
+ // Smooth the histogram
+ std::vector <size_t> scov;
+ math::Smooth3RS3R(scov, cov_);
+
+ size_t Valley = scov[0];
+
+ // Start finding the valley
+ size_t Idx = 1;
+ while (Idx < scov.size() && scov[Idx] < Valley) {
+ Valley = scov[Idx];
+ Idx += 1;
+ }
+ Idx -= 1;
+
+ INFO("Kmer coverage valley at: " << Idx);
+
+ return Idx;
+}
+
+void KMerCoverageModel::Fit() {
+ VERIFY_MSG(cov_.size() > 10, "Invalid kmer coverage histogram");
+
+ // Find the minimal coverage point using smoothed histogram.
+ Valley_ = EstimateValley();
+
+ // First estimate of coverage is the first maximum after the valley.
+ MaxCov_ = Valley_ + 1;
+ size_t MaxHist = cov_[MaxCov_];
+ for (size_t i = Valley_ + 1; i < cov_.size(); ++i) {
+ if (cov_[i] > MaxHist) {
+ MaxHist = cov_[i];
+ MaxCov_ = i;
+ }
+ }
+ INFO("K-mer histogram maximum: " << MaxCov_);
+
+ // Refine the estimate via median
+ size_t AfterValley = 0, SecondValley = std::min(2 * MaxCov_ - Valley_, cov_.size());
+ for (size_t i = Valley_ + 1; i < SecondValley; ++i)
+ AfterValley += cov_[i];
+
+ size_t ccov = 0;
+ for (size_t i = Valley_ + 1; i < SecondValley; ++i) {
+ if (ccov > AfterValley / 2) {
+ MaxCov_ = std::max(i, MaxCov_);
+ break;
+ }
+ ccov += cov_[i];
+ }
+
+ if (MaxCov_ - Valley_ < 3)
+ WARN("Too much erroneous kmers, the estimates might be unreliable");
+
+ std::vector <size_t> mvals(1 + MaxCov_ - Valley_);
+ mvals[0] = cov_[MaxCov_];
+ size_t tmadcov = mvals[0];
+ for (size_t i = 1; i < std::min(MaxCov_ - Valley_, cov_.size() - MaxCov_); ++i) {
+ mvals[i] = cov_[MaxCov_ + i] + cov_[MaxCov_ - i];
+ tmadcov += mvals[i];
+ }
+ size_t madcov = 0;
+ double CovSd = sqrt(5.0 * (double) MaxCov_);
+ for (size_t i = 0; i < MaxCov_ - Valley_; ++i) {
+ if (madcov > tmadcov / 2) {
+ CovSd = i;
+ break;
+ }
+ madcov += mvals[i];
+ }
+ CovSd *= 1.4826;
+ INFO("Estimated median coverage: " << MaxCov_ << ". Coverage mad: " << CovSd);
+
+ // Estimate error probability as ratio of kmers before the valley.
+ size_t BeforeValley = 0, Total = 0;
+ double ErrorProb = 0;
+ for (size_t i = 0; i < cov_.size(); ++i) {
+ if (i <= Valley_)
+ BeforeValley += cov_[i];
+ Total += cov_[i];
+ }
+ ErrorProb = (double) BeforeValley / (double) Total;
+ // Allow some erroneous / good kmers.
+ ErrorProb = std::min(1 - 1e-3, ErrorProb);
+ ErrorProb = std::max(1e-3, ErrorProb);
+
+ TRACE("Total: " << Total << ". Before: " << BeforeValley);
+ TRACE("p: " << ErrorProb);
+
+ std::vector <double> x(6), lb(6), ub(6);
+
+ x[0] = 3;
+ lb[0] = 0;
+ ub[0] = 2000;
+ x[1] = 3;
+ lb[1] = 0;
+ ub[1] = 2000;
+ x[2] = MaxCov_;
+ lb[2] = 0;
+ ub[2] = 2 * MaxCov_;
+ x[3] = CovSd;
+ lb[3] = MaxCov_ - Valley_;
+ ub[3] = SecondValley;
+ x[4] = 1;
+ lb[4] = 0;
+ ub[4] = 2000;
+ x[5] = 0;
+ lb[5] = -6;
+ ub[5] = 6;
+
+ INFO("Fitting coverage model");
+ // Ensure that there will be at least 2 iterations.
+ double PrevErrProb = 2;
+ const double ErrProbThr = 1e-8;
+ auto GoodCov = cov_;
+ GoodCov.resize(std::min(cov_.size(), 5 * MaxCopy * MaxCov_ / 4));
+ converged_ = true;
+ unsigned it = 1;
+ while (fabs(PrevErrProb - ErrorProb) > ErrProbThr) {
+ // Recalculate the vector of posterior error probabilities
+ std::vector <double> z = EStep(x, ErrorProb, GoodCov.size());
+
+ // Recalculate the probability of error
+ PrevErrProb = ErrorProb;
+ ErrorProb = 0;
+ for (size_t i = 0; i < GoodCov.size(); ++i)
+ ErrorProb += z[i] * (double) GoodCov[i];
+ ErrorProb /= (double) Total;
+
+ bool LastIter = fabs(PrevErrProb - ErrorProb) <= ErrProbThr;
+
+ nlopt::opt opt(nlopt::LN_NELDERMEAD, 6);
+ CovModelLogLikeEMData data = {GoodCov, z};
+ opt.set_max_objective(CovModelLogLikeEM, &data);
+ if (!LastIter)
+ opt.set_maxeval(5 * 6 * it);
+ opt.set_xtol_rel(1e-8);
+ opt.set_ftol_rel(1e-8);
+
+ double fMin;
+ nlopt::result Results = nlopt::FAILURE;
+ try {
+ Results = opt.optimize(x, fMin);
+ } catch (nlopt::roundoff_limited &) {
+ }
+
+ VERBOSE_POWER_T2(it, 1, "... iteration " << it);
+ TRACE("Results: ");
+ TRACE("Converged: " << Results << " " << "F: " << fMin);
+
+ double zp = x[0], shape = x[1], u = x[2], sd = x[3], scale = x[4], shape2 = x[5];
+ TRACE("zp: " << zp << " p: " << ErrorProb << " shape: " << shape << " u: " << u << " sd: " << sd <<
+ " scale: " << scale << " shape2: " << shape2);
+
+ it += 1;
+ }
+
+ double delta = x[5] / sqrt(1 + x[5] * x[5]);
+ mean_coverage_ = x[2] + x[3] * delta * sqrt(2 / M_PI);
+ sd_coverage_ = x[3] * sqrt(1 - 2 * delta * delta / M_PI);
+ INFO("Fitted mean coverage: " << mean_coverage_ << ". Fitted coverage std. dev: " << sd_coverage_);
+
+ // Now let us check whether we have sane results
+ for (size_t i = 0; i < x.size(); ++i)
+ if (!isfinite(x[i])) {
+ converged_ = false;
+ break;
+ }
+
+ if (!isfinite(ErrorProb))
+ converged_ = false;
+
+ // See if we can deduce a proper threshold
+
+ // First, check whether initial estimate of Valley was sane.
+ ErrorThreshold_ = 0;
+ if (converged_ && Valley_ > x[2] && x[2] > 2) {
+ Valley_ = (size_t) math::round(x[2] / 2.0);
+ WARN("Valley value was estimated improperly, reset to " << Valley_);
+ }
+
+ // If the model converged, then use it to estimate the thresholds.
+ if (converged_) {
+ std::vector <double> z = EStep(x, ErrorProb, GoodCov.size());
+
+ INFO("Probability of erroneous kmer at valley: " << z[Valley_]);
+ converged_ = false;
+ for (size_t i = 0; i < z.size(); ++i)
+ if (z[i] > strong_probability_threshold_) //0.999
+ LowThreshold_ = std::min(i + 1, Valley_);
+ else if (z[i] < probability_threshold_) {//0.05?
+ ErrorThreshold_ = std::max(i + 1, Valley_);
+ converged_ = true;
+ break;
+ }
+
+#if 0
+for (size_t i = 0; i < z.size(); ++i) {
+ double zp = x[0], shape = x[1], u = x[2], sd = x[3], scale = x[4], shape2 = x[5];
+ double pe = ErrorProb * perr(i + 1, scale, shape);
+ double pg = (1 - ErrorProb) * pgood(i + 1, zp, u, sd, shape2);
+
+ fprintf(stderr, "%e %e %e %e\n", pe, pg, z[i], perr(i + 1, scale, shape));
+}
+#endif
+ }
+
+ // See if we have a sane ErrorThreshold_ and fall back to something conservative if not.
+ if (converged_) {
+ INFO("Preliminary threshold calculated as: " << ErrorThreshold_);
+ ErrorThreshold_ = (Valley_ < mean_coverage_ ?
+ std::min(Valley_ + (size_t) (mean_coverage_ - Valley_) / 2, ErrorThreshold_) :
+ Valley_);
+ INFO("Threshold adjusted to: " << ErrorThreshold_);
+ } else {
+ ErrorThreshold_ = Valley_;
+ LowThreshold_ = 1;
+ WARN("Failed to determine erroneous kmer threshold. Threshold set to: " << ErrorThreshold_);
+ }
+
+ // Now the bonus: estimate the genome size!
+ GenomeSize_ = 0;
+ for (size_t i = ErrorThreshold_ - 1; i < GoodCov.size(); ++i)
+ GenomeSize_ += GoodCov[i];
+ GenomeSize_ /= 2;
+
+ INFO("Estimated genome size (ignoring repeats): " << GenomeSize_);
+}
+
+};
diff --git a/src/modules/math/kmer_coverage_model.hpp b/src/modules/math/kmer_coverage_model.hpp
new file mode 100644
index 0000000..1e7ec38
--- /dev/null
+++ b/src/modules/math/kmer_coverage_model.hpp
@@ -0,0 +1,50 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#ifndef __KMER_COVERAGE_MODEL_HPP__
+#define __KMER_COVERAGE_MODEL_HPP__
+
+#include <vector>
+#include <cstddef>
+
+namespace cov_model {
+
+class KMerCoverageModel {
+ const std::vector <size_t> &cov_;
+ size_t MaxCov_, Valley_, ErrorThreshold_, LowThreshold_, GenomeSize_;
+ double probability_threshold_, strong_probability_threshold_, mean_coverage_, sd_coverage_;
+ bool converged_;
+
+public:
+ KMerCoverageModel(const std::vector <size_t> &cov, double probability_threshold,
+ double strong_probability_threshold)
+ : cov_(cov), LowThreshold_(0), probability_threshold_(probability_threshold),
+ strong_probability_threshold_(strong_probability_threshold),
+ mean_coverage_(0.0), sd_coverage_(0.0), converged_(false) { }
+
+ void Fit();
+
+ size_t GetErrorThreshold() const { return ErrorThreshold_; }
+
+ size_t GetLowThreshold() const { return LowThreshold_; }
+
+ size_t GetGenomeSize() const { return GenomeSize_; }
+
+ double GetMeanCoverage() const { return mean_coverage_; }
+
+ double GetSdCoverage() const { return sd_coverage_; }
+
+ bool converged() const { return converged_; }
+
+private:
+ size_t EstimateValley() const;
+};
+
+};
+
+
+#endif
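
A sketch of driving the model from a k-mer multiplicity histogram; the 0.05 / 0.999 thresholds mirror the values hinted at in the comments inside Fit(), and the include path follows the math/ convention used by the other new files (histogram contents and function name are illustrative):

    #include "math/kmer_coverage_model.hpp"
    #include <vector>

    void EstimateThresholds(const std::vector<size_t> &histogram) {
        // histogram[i] = number of distinct k-mers seen i times
        cov_model::KMerCoverageModel model(histogram, 0.05, 0.999);
        model.Fit();
        if (model.converged()) {
            size_t error_threshold = model.GetErrorThreshold();   // k-mers below it are treated as erroneous
            size_t genome_size = model.GetGenomeSize();
            (void) error_threshold; (void) genome_size;
        }
    }
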
diff --git a/src/modules/math/pred.hpp b/src/modules/math/pred.hpp
new file mode 100644
index 0000000..493626b
--- /dev/null
+++ b/src/modules/math/pred.hpp
@@ -0,0 +1,169 @@
+#ifndef __ADT_PRED_HPP__
+#define __ADT_PRED_HPP__
+
+#pragma once
+
+#include "utils/adt/function_traits.hpp"
+
+#include <memory>
+#include <functional>
+
+namespace pred {
+
+template<typename T>
+class TypedPredicate {
+public:
+ typedef T checked_type;
+
+ template<typename P>
+ TypedPredicate(P p)
+ : self_(std::make_shared<TypedPredicateModel < P> > (std::move(p))) { }
+
+ bool operator()(T x) const {
+ return self_->operator()(x);
+ }
+
+private:
+ struct TypedPredicateConcept {
+ virtual ~TypedPredicateConcept() { };
+
+ virtual bool operator()(T x) const = 0;
+ };
+
+ template<class P>
+ struct TypedPredicateModel : TypedPredicateConcept {
+ TypedPredicateModel(P p)
+ : data_(std::move(p)) { }
+
+ virtual bool operator()(T x) const override {
+ return data_(x);
+ }
+
+ P data_;
+ };
+
+ std::shared_ptr<const TypedPredicateConcept> self_;
+};
+
+template<typename T>
+class AlwaysTrueOperator {
+public:
+ typedef T checked_type;
+
+ bool operator()(T) const {
+ return true;
+ }
+};
+
+template<typename T>
+class AlwaysFalseOperator {
+ typedef T checked_type;
+
+public:
+ bool operator()(T) const {
+ return false;
+ }
+};
+
+template<typename T>
+class AndOperator {
+public:
+ typedef T checked_type;
+
+ AndOperator(TypedPredicate<T> lhs, TypedPredicate<T> rhs)
+ : lhs_(std::move(lhs)),
+ rhs_(std::move(rhs)) { }
+
+ bool operator()(T x) const {
+ return lhs_(x) && rhs_(x);
+ }
+
+private:
+ const TypedPredicate<T> lhs_, rhs_;
+};
+
+template<typename T>
+class OrOperator {
+public:
+ typedef T checked_type;
+
+ OrOperator(TypedPredicate<T> lhs, TypedPredicate<T> rhs)
+ : lhs_(std::move(lhs)), rhs_(std::move(rhs)) { }
+
+ bool operator()(T x) const {
+ return lhs_(x) || rhs_(x);
+ }
+
+private:
+ const TypedPredicate<T> lhs_, rhs_;
+};
+
+template<typename T>
+class NotOperator {
+public:
+ typedef T checked_type;
+
+ NotOperator(const TypedPredicate<T> p)
+ : p_(std::move(p)) { }
+
+ bool operator()(T x) const {
+ return !p_(x);
+ }
+
+private:
+ const TypedPredicate<T> p_;
+};
+
+template<class P,
+ bool = adt::function_traits<P>::arity == 1 &&
+ std::is_same<typename adt::function_traits<P>::return_type, bool>::value>
+struct is_predicate : public std::true_type {
+};
+
+template<class P>
+struct is_predicate<P, false> : public std::false_type {
+};
+
+template<class TP1, class TP2,
+ typename _T1 = typename adt::function_traits<TP1>::template arg<0>::type,
+ typename _T2 = typename adt::function_traits<TP2>::template arg<0>::type,
+ typename =
+ typename std::enable_if<std::is_same<_T1, _T2>::value &&
+ is_predicate<TP1>::value && is_predicate<TP2>::value
+ >::type>
+TypedPredicate<_T1> And(TP1 lhs, TP2 rhs) {
+ return AndOperator<_T1>(lhs, rhs);
+}
+
+template<class TP1, class TP2,
+ typename _T1 = typename adt::function_traits<TP1>::template arg<0>::type,
+ typename _T2 = typename adt::function_traits<TP2>::template arg<0>::type,
+ typename =
+ typename std::enable_if<std::is_same<_T1, _T2>::value &&
+ is_predicate<TP1>::value && is_predicate<TP2>::value
+ >::type>
+TypedPredicate<_T1> Or(TP1 lhs, TP2 rhs) {
+ return OrOperator<_T1>(lhs, rhs);
+}
+
+template<class TP,
+ typename _T = typename adt::function_traits<TP>::template arg<0>::type,
+ typename =
+ typename std::enable_if<is_predicate<TP>::value>::type>
+TypedPredicate<_T> Not(TP p) {
+ return NotOperator<_T>(p);
+}
+
+template<class T>
+TypedPredicate<T> AlwaysTrue() {
+ return AlwaysTrueOperator<T>();
+}
+
+template<class T>
+TypedPredicate<T> AlwaysFalse() {
+ return AlwaysFalseOperator<T>();
+}
+
+} // namespace pred
+
+#endif // __ADT_PRED_HPP__
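
A sketch of composing predicates with these helpers; it assumes adt::function_traits can introspect the call operator of lambdas and of TypedPredicate itself, which is what the enable_if machinery above relies on (include path and function name are illustrative):

    #include "math/pred.hpp"

    void ComposePredicates() {
        // assumption: function_traits handles generic functors / lambdas
        auto in_range = pred::And([](size_t x) { return x >= 10; },
                                  [](size_t x) { return x <= 100; });
        bool inside = in_range(42);        // true
        auto outside = pred::Not(in_range);
        bool is_outside = outside(5);      // true
        (void) inside; (void) is_outside;
    }
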
diff --git a/src/modules/math/smooth.hpp b/src/modules/math/smooth.hpp
new file mode 100644
index 0000000..eb53dc9
--- /dev/null
+++ b/src/modules/math/smooth.hpp
@@ -0,0 +1,195 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#ifndef __SMOOTH_HPP__
+#define __SMOOTH_HPP__
+
+#include <cmath>
+#include <vector>
+
+namespace math {
+
+template<typename T>
+static T MedianOf3(T u, T v, T w) {
+ /* Median(u,v,w): */
+ if ((u <= v && v <= w) ||
+ (u >= v && v >= w))
+ return v;
+ if ((u <= w && w <= v) ||
+ (u >= w && w >= v))
+ return w;
+
+ /* else */ return u;
+}
+
+/* Return (Index-1) of median(u,v,w) , i.e.,
+-1 : u
+0 : v
+1 : w
+*/
+template<typename T>
+static int IndexOfMedianOf3(T u, T v, T w) {
+ if ((u <= v && v <= w) ||
+ (u >= v && v >= w))
+ return 0;
+ if ((u <= w && w <= v) ||
+ (u >= w && w >= v))
+ return 1;
+
+ /* else */ return -1;
+}
+
+enum {
+ SmoothNoEndRule,
+ SmoothCopyEndRule,
+ SmoothTukeyEndRule
+};
+
+template<typename T>
+static bool SmoothEndStep(const T *x, T *y, size_t n, unsigned end_rule) {
+ switch (end_rule) {
+ default:
+ case SmoothNoEndRule:
+ return false;
+ case SmoothCopyEndRule:
+ y[0] = x[0];
+ y[n - 1] = x[n - 1];
+ return false;
+ case SmoothTukeyEndRule: {
+ bool chg = false;
+ y[0] = MedianOf3(3 * y[1] - 2 * y[2], x[0], y[1]);
+ chg = chg || (y[0] != x[0]);
+ y[n - 1] = MedianOf3(y[n - 2], x[n - 1], 3 * y[n - 2] - 2 * y[n - 3]);
+ chg = chg || (y[n - 1] != x[n - 1]);
+ return chg;
+ }
+ }
+
+ return false;
+}
+
+template<typename T>
+static bool Smooth3(const T *x, T *y, size_t n, unsigned end_rule) {
+ // y[] := Running Median of three (x) = "3 (x[])" with "copy ends"
+ // --- return chg := ( y != x )
+ bool chg = false;
+
+ for (size_t i = 1; i < n - 1; i++) {
+ int j = IndexOfMedianOf3(x[i - 1], x[i], x[i + 1]);
+ y[i] = x[(int) i + j];
+ chg = chg || j;
+ }
+
+ chg |= SmoothEndStep(x, y, n, end_rule);
+
+ return chg;
+}
+
+template<typename T>
+static size_t Smooth3R(const T *x, T *y, T *z, size_t n, unsigned end_rule) {
+ // y[] := "3R"(x) ; 3R = Median of three, repeated until convergence
+ size_t iter;
+ bool chg;
+
+ iter = chg = Smooth3(x, y, n, SmoothCopyEndRule);
+
+ while (chg) {
+ if ((chg = Smooth3(y, z, n, SmoothNoEndRule))) {
+ iter += 1;
+ for (size_t i = 1; i < n - 1; i++)
+ y[i] = z[i];
+ }
+ }
+
+ chg |= SmoothEndStep(x, y, n, end_rule);
+
+ return (iter ? iter : chg);
+ /* = 0 <==> only one "3" w/o any change
+ = 1 <==> either ["3" w/o change + endchange]
+ or [two "3"s, 2nd w/o change ] */
+}
+
+
+template<typename T>
+static bool SplitTest(const T *x, size_t i) {
+ // Split test:
+ // Are we at a /-\ or \_/ location => split should be made ?
+
+ if (x[i] != x[i + 1])
+ return false;
+
+ if ((x[i - 1] <= x[i] && x[i + 1] <= x[i + 2]) ||
+ (x[i - 1] >= x[i] && x[i + 1] >= x[i + 2]))
+ return false;
+
+ /* else */ return true;
+}
+
+template<typename T>
+static bool SmoothSplit3(const T *x, T *y, size_t n, bool do_ends) {
+ // y[] := S(x[]) where S() = "sm_split3"
+ bool chg = false;
+
+ for (size_t i = 0; i < n; i++)
+ y[i] = x[i];
+
+ if (do_ends && SplitTest(x, 1)) {
+ chg = true;
+ y[1] = x[0];
+ y[2] = MedianOf3(x[2], x[3], 3 * x[3] - 2 * x[4]);
+ }
+
+ for (size_t i = 2; i < n - 3; i++) {
+ if (SplitTest(x, i)) {
+ int j;
+ // plateau at x[i] == x[i+1]
+
+ // at left:
+ if (-1 < (j = IndexOfMedianOf3(x[i], x[i - 1], 3 * x[i - 1] - 2 * x[i - 2]))) {
+ y[i] = (j == 0 ? x[i - 1] : 3 * x[i - 1] - 2 * x[i - 2]);
+ chg = (y[i] != x[i]);
+ }
+
+ // at right:
+ if (-1 < (j = IndexOfMedianOf3(x[i + 1], x[i + 2], 3 * x[i + 2] - 2 * x[i + 3]))) {
+ y[i + 1] = (j == 0 ? x[i + 2] : 3 * x[i + 2] - 2 * x[i + 3]);
+ chg = (y[i + 1] != x[i + 1]);
+ }
+ }
+ }
+
+ if (do_ends && SplitTest(x, n - 3)) {
+ chg = true;
+ y[n - 2] = x[n - 1];
+ y[n - 3] = MedianOf3(x[n - 3], x[n - 4], 3 * x[n - 4] - 2 * x[n - 5]);
+ }
+
+ return chg;
+}
+
+template<typename T>
+size_t Smooth3RS3R(std::vector <T> &y, const std::vector <T> &x,
+ unsigned end_rule = SmoothTukeyEndRule, bool split_ends = false) {
+ // y[1:n] := "3R S 3R"(x[1:n]); z = "work";
+ size_t iter;
+ bool chg;
+ size_t n = x.size();
+
+ y.resize(n);
+ std::vector <T> z(n), w(n);
+
+ iter = Smooth3R(&x[0], &y[0], &z[0], n, end_rule);
+ chg = SmoothSplit3(&y[0], &z[0], n, split_ends);
+ if (chg)
+ iter += Smooth3R(&z[0], &y[0], &w[0], n, end_rule);
+
+ /* else y == z already */
+ return (iter + chg);
+}
+
+};
+
+#endif
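
A sketch mirroring the call in KMerCoverageModel::EstimateValley(): smooth a noisy integer histogram with the 3R-S-3R running-median filter (the function name is illustrative; the input should have more than a handful of bins, since the end rules look several elements back):

    #include <vector>

    #include "math/smooth.hpp"

    void SmoothHistogram(const std::vector<size_t> &raw) {
        std::vector<size_t> smoothed;
        size_t iterations = math::Smooth3RS3R(smoothed, raw);   // smoothed is resized inside
        (void) iterations;
    }
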
diff --git a/src/modules/math/xmath.h b/src/modules/math/xmath.h
new file mode 100644
index 0000000..b323ac9
--- /dev/null
+++ b/src/modules/math/xmath.h
@@ -0,0 +1,357 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#ifndef XMATH_H_
+#define XMATH_H_
+
+#include <limits>
+#include <cmath>
+
+namespace math {
+// Copyright 2005, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Authors: wan at google.com (Zhanyong Wan), eefacm at gmail.com (Sean Mcafee)
+//
+// The Google C++ Testing Framework (Google Test)
+
+
+// This template class serves as a compile-time function from size to
+// type. It maps a size in bytes to a primitive type with that
+// size. e.g.
+//
+// TypeWithSize<4>::UInt
+//
+// is typedef-ed to be unsigned int (unsigned integer made up of 4
+// bytes).
+//
+// Such functionality should belong to STL, but I cannot find it
+// there.
+//
+// Google Test uses this class in the implementation of floating-point
+// comparison.
+//
+// For now it only handles UInt (unsigned int) as that's all Google Test
+// needs. Other types can be easily added in the future if need
+// arises.
+template<size_t size>
+class TypeWithSize {
+public:
+ // This prevents the user from using TypeWithSize<N> with incorrect
+ // values of N.
+ typedef void UInt;
+};
+
+// The specialization for size 4.
+template<>
+class TypeWithSize<4> {
+public:
+ // unsigned int has size 4 in both gcc and MSVC.
+ //
+ // As base/basictypes.h doesn't compile on Windows, we cannot use
+ // uint32, uint64, and etc here.
+ typedef int Int;
+ typedef unsigned int UInt;
+};
+
+// The specialization for size 8.
+template<>
+class TypeWithSize<8> {
+public:
+ typedef long long Int; // NOLINT
+ typedef unsigned long long UInt; // NOLINT
+};
+
+// This template class represents an IEEE floating-point number
+// (either single-precision or double-precision, depending on the
+// template parameters).
+//
+// The purpose of this class is to do more sophisticated number
+// comparison. (Due to round-off error, etc, it's very unlikely that
+// two floating-points will be equal exactly. Hence a naive
+// comparison by the == operation often doesn't work.)
+//
+// Format of IEEE floating-point:
+//
+// The most-significant bit being the leftmost, an IEEE
+// floating-point looks like
+//
+// sign_bit exponent_bits fraction_bits
+//
+// Here, sign_bit is a single bit that designates the sign of the
+// number.
+//
+// For float, there are 8 exponent bits and 23 fraction bits.
+//
+// For double, there are 11 exponent bits and 52 fraction bits.
+//
+// More details can be found at
+// http://en.wikipedia.org/wiki/IEEE_floating-point_standard.
+//
+// Template parameter:
+//
+// RawType: the raw floating-point type (either float or double)
+template<typename RawType>
+class FloatingPoint {
+public:
+ // Defines the unsigned integer type that has the same size as the
+ // floating point number.
+ typedef typename TypeWithSize<sizeof(RawType)>::UInt Bits;
+
+ // Constants.
+
+ // # of bits in a number.
+ static const size_t kBitCount = 8 * sizeof(RawType);
+
+ // # of fraction bits in a number.
+ static const size_t kFractionBitCount = std::numeric_limits<RawType>::digits - 1;
+
+ // # of exponent bits in a number.
+ static const size_t kExponentBitCount = kBitCount - 1 - kFractionBitCount;
+
+ // The mask for the sign bit.
+ static const Bits kSignBitMask = static_cast<Bits>(1) << (kBitCount - 1);
+
+ // The mask for the fraction bits.
+ static const Bits kFractionBitMask = ~static_cast<Bits>(0) >> (kExponentBitCount + 1);
+
+ // The mask for the exponent bits.
+ static const Bits kExponentBitMask = ~(kSignBitMask | kFractionBitMask);
+
+ // How many ULP's (Units in the Last Place) we want to tolerate when
+ // comparing two numbers. The larger the value, the more error we
+ // allow. A 0 value means that two numbers must be exactly the same
+ // to be considered equal.
+ //
+ // The maximum error of a single floating-point operation is 0.5
+ // units in the last place. On Intel CPU's, all floating-point
+ // calculations are done with 80-bit precision, while double has 64
+ // bits. Therefore, 4 should be enough for ordinary use.
+ //
+ // See the following article for more details on ULP:
+ // http://www.cygnus-software.com/papers/comparingfloats/comparingfloats.htm.
+ static const size_t kMaxUlps = 4;
+
+ // Constructs a FloatingPoint from a raw floating-point number.
+ //
+ // On an Intel CPU, passing a non-normalized NAN (Not a Number)
+ // around may change its bits, although the new value is guaranteed
+ // to be also a NAN. Therefore, don't expect this constructor to
+ // preserve the bits in x when x is a NAN.
+ explicit FloatingPoint(const RawType &x) { u_.value_ = x; }
+
+ // Static methods
+
+ // Reinterprets a bit pattern as a floating-point number.
+ //
+ // This function is needed to test the AlmostEquals() method.
+ static RawType ReinterpretBits(const Bits bits) {
+ FloatingPoint fp(0);
+ fp.u_.bits_ = bits;
+ return fp.u_.value_;
+ }
+
+ // Returns the floating-point number that represent positive infinity.
+ static RawType Infinity() {
+ return ReinterpretBits(kExponentBitMask);
+ }
+
+ // Non-static methods
+
+ // Returns the bits that represents this number.
+ const Bits &bits() const { return u_.bits_; }
+
+ // Returns the exponent bits of this number.
+ Bits exponent_bits() const { return kExponentBitMask & u_.bits_; }
+
+ // Returns the fraction bits of this number.
+ Bits fraction_bits() const { return kFractionBitMask & u_.bits_; }
+
+ // Returns the sign bit of this number.
+ Bits sign_bit() const { return kSignBitMask & u_.bits_; }
+
+ // Returns true iff this is NAN (not a number).
+ bool is_nan() const {
+ // It's a NAN if the exponent bits are all ones and the fraction
+ // bits are not entirely zeros.
+ return (exponent_bits() == kExponentBitMask) && (fraction_bits() != 0);
+ }
+
+ // Returns true iff this number is at most kMaxUlps ULP's away from
+ // rhs. In particular, this function:
+ //
+ // - returns false if either number is (or both are) NAN.
+ // - treats really large numbers as almost equal to infinity.
+ // - thinks +0.0 and -0.0 are 0 ULP's apart.
+
+ template<class FloatingPoint2>
+ bool AlmostEquals(const FloatingPoint2 &rhs) const {
+ static_assert(kBitCount == FloatingPoint2::kBitCount, "Can only compare similar sized types");
+ // The IEEE standard says that any comparison operation involving
+ // a NAN must return false.
+ if (is_nan() || rhs.is_nan()) return false;
+ //cout << "ULPS " << DistanceBetweenSignAndMagnitudeNumbers(u_.bits_, rhs.u_.bits_) << endl;
+
+ return DistanceBetweenSignAndMagnitudeNumbers(u_.bits_, rhs.bits())
+ <= kMaxUlps;
+ }
+
+private:
+ // The data type used to store the actual floating-point number.
+ union FloatingPointUnion {
+ RawType value_; // The raw floating-point number.
+ Bits bits_; // The bits that represent the number.
+ };
+
+ // Converts an integer from the sign-and-magnitude representation to
+ // the biased representation. More precisely, let N be 2 to the
+ // power of (kBitCount - 1), an integer x is represented by the
+ // unsigned number x + N.
+ //
+ // For instance,
+ //
+ // -N + 1 (the most negative number representable using
+ // sign-and-magnitude) is represented by 1;
+ // 0 is represented by N; and
+ // N - 1 (the biggest number representable using
+ // sign-and-magnitude) is represented by 2N - 1.
+ //
+ // Read http://en.wikipedia.org/wiki/Signed_number_representations
+ // for more details on signed number representations.
+ static Bits SignAndMagnitudeToBiased(const Bits &sam) {
+ if (kSignBitMask & sam) {
+ // sam represents a negative number.
+ return ~sam + 1;
+ } else {
+ // sam represents a positive number.
+ return kSignBitMask | sam;
+ }
+ }
+
+ // Given two numbers in the sign-and-magnitude representation,
+ // returns the distance between them as an unsigned number.
+ static Bits DistanceBetweenSignAndMagnitudeNumbers(const Bits &sam1,
+ const Bits &sam2) {
+ const Bits biased1 = SignAndMagnitudeToBiased(sam1);
+ const Bits biased2 = SignAndMagnitudeToBiased(sam2);
+ return (biased1 >= biased2) ? (biased1 - biased2) : (biased2 - biased1);
+ }
+
+ FloatingPointUnion u_;
+};
+
+template<class T>
+T eps();
+
+template<>
+inline double eps<double>() { return 1e-10; }
+
+template<>
+inline float eps<float>() { return (float) 1e-5; }
+
+template<class T>
+inline
+bool eq(T lhs, T rhs) {
+ const FloatingPoint<T> lhs_(lhs), rhs_(rhs);
+ return lhs_.AlmostEquals(rhs_);
+ //return !ls(lhs, rhs) && !ls(rhs, lhs)[> std::abs(lhs - rhs) < eps<T>()<];
+}
+
+template<class T, class U>
+inline
+bool eq(T lhs, U rhs) {
+ const FloatingPoint<T> lhs_(lhs);
+ const FloatingPoint<U> rhs_(rhs);
+ return lhs_.AlmostEquals(rhs_);
+ //return !ls(lhs, rhs) && !ls(rhs, lhs)[> std::abs(lhs - rhs) < eps<T>()<];
+}
+
+template<class T, class U>
+inline
+bool ls(T lhs, U rhs) {
+ if (!eq(lhs, rhs))
+ return (lhs < rhs);
+ return false;
+ //T maxim = max(std::abs(rhs), std::abs(lhs));
+ //if (maxim < 1)
+ //return (lhs + eps<T>() < rhs);
+ //else
+ //return (eps<T>() < (rhs - lhs) / maxim);
+}
+
+template<class T, class U>
+inline
+bool gr(T lhs, U rhs) { return ls(rhs, lhs); }
+
+template<class T, class U>
+inline
+bool le(T lhs, U rhs) { return !ls(rhs, lhs); }
+
+template<class T, class U>
+inline
+bool ge(T lhs, U rhs) { return !ls(lhs, rhs); }
+
+template<class T>
+inline
+T floor(T t) { return std::floor(t + eps<T>()); }
+
+template<class T>
+inline
+T round(T t) { return floor(t + (T) 0.5); }
+
+template<class T>
+inline
+int round_to_zero(T t) {
+ using math::ls;
+ int res = (int) math::round(std::abs(t));
+ if (ls(t, (T) 0.))
+ res = -res;
+ return res;
+}
+
+// updates floating point @variable only if it does not differ from the @new_value too much
+// @returns true if the @variable was updated indeed
+template<class T>
+inline
+bool update_value_if_needed(T &variable, T new_value) {
+ bool result = !eq<T>(variable, new_value);
+
+ if (result) {
+ variable = new_value;
+ }
+ return result;
+}
+
+}
+
+#endif /* XMATH_H_ */
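
A sketch of the ULP-based comparison helpers in use; with the default tolerance of 4 ULPs, 0.1 + 0.2 compares equal to 0.3 even though operator== says otherwise (the function name is illustrative):

    #include "math/xmath.h"

    void CompareDoubles() {
        double sum = 0.1 + 0.2;                    // 0.30000000000000004
        bool naive = (sum == 0.3);                 // false
        bool ulp_equal = math::eq(sum, 0.3);       // true: the two values are 1 ULP apart
        bool strictly_less = math::ls(sum, 0.3);   // false: ls() defers to eq() first
        (void) naive; (void) ulp_equal; (void) strictly_less;
    }
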
diff --git a/src/modules/paired_info/CMakeLists.txt b/src/modules/paired_info/CMakeLists.txt
new file mode 100644
index 0000000..35d1605
--- /dev/null
+++ b/src/modules/paired_info/CMakeLists.txt
@@ -0,0 +1,14 @@
+############################################################################
+# Copyright (c) 2015 Saint Petersburg State University
+# Copyright (c) 2011-2014 Saint Petersburg Academic University
+# All Rights Reserved
+# See file LICENSE for details.
+############################################################################
+
+project(paired_info CXX)
+
+add_library(paired_info STATIC
+ bwa_pair_info_filler.cpp)
+
+target_link_libraries(paired_info input)
+
diff --git a/src/modules/paired_info/bwa_pair_info_filler.cpp b/src/modules/paired_info/bwa_pair_info_filler.cpp
new file mode 100644
index 0000000..6855138
--- /dev/null
+++ b/src/modules/paired_info/bwa_pair_info_filler.cpp
@@ -0,0 +1,408 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#include "bwa_pair_info_filler.hpp"
+
+
+namespace bwa_pair_info {
+
+void MapperReadT::ParseCigar(const string& cigar) {
+ string num = "";
+ bool left_side = true;
+ for (size_t i = 0; i < cigar.length(); ++i) {
+ if (isdigit(cigar[i])) {
+ num += cigar[i];
+ }
+ else {
+ if (cigar[i] == 'H') {
+ if (left_side)
+ left_hard_clip_ = (uint16_t) std::stoi(num);
+ else
+ right_hard_clip_ = (uint16_t) std::stoi(num);
+ num = "";
+ }
+ else if (cigar[i] == 'S') {
+ if (left_side)
+ left_soft_clip_ = (uint16_t) std::stoi(num);
+ else
+ right_soft_clip_ = (uint16_t) std::stoi(num);
+ num = "";
+ }
+ else {
+ left_side = false;
+ num = "";
+ }
+ }
+ }
+}
+
+// Correct read alignment according to orientation and clipping
+void BWACorrectingProcessor::ProcessPairedRead(const MapperReadT& l, const MapperReadT& r) {
+ using io::LibraryOrientation;
+
+ if (!l.IsValid() || !r.IsValid()) {
+ return;
+ }
+ ++count_;
+
+ MappedPositionT left_pos(edge_id_map_.at(stoi(l.get_contig_id())), l.pos());
+ MappedPositionT right_pos(edge_id_map_.at(stoi(r.get_contig_id())), r.pos());
+
+ // This function is overridden in BWAISCounter and BWAIndexFiller
+ if (!CheckAlignments(left_pos, right_pos)) {
+ return;
+ }
+
+ int r_from_pos_to_right_end = r.len() + r.right_hard_clip() - r.left_soft_clip();
+ int l_from_pos_to_left_end = l.left_soft_clip() + l.left_hard_clip();
+
+ if ((!l.is_forward() && (lib_.orientation() == LibraryOrientation::FF || lib_.orientation() == LibraryOrientation::FR)) ||
+ (l.is_forward() && (lib_.orientation() == LibraryOrientation::RF || lib_.orientation() == LibraryOrientation::RR))) {
+ left_pos.e = g_.conjugate(left_pos.e);
+ left_pos.pos = (int) g_.length(left_pos.e) - left_pos.pos - (l.len() - l.left_soft_clip() - l.right_soft_clip()) + (int) g_.k();
+ l_from_pos_to_left_end = l.right_soft_clip() + l.right_hard_clip();
+ }
+ if ((!r.is_forward() && (lib_.orientation() == LibraryOrientation::FF || lib_.orientation() == LibraryOrientation::RF)) ||
+ (r.is_forward() && (lib_.orientation() == LibraryOrientation::FR || lib_.orientation() == LibraryOrientation::RR))) {
+ right_pos.e = g_.conjugate(right_pos.e);
+ right_pos.pos = (int) g_.length(right_pos.e) - right_pos.pos - (r.len() - r.left_soft_clip() - r.right_soft_clip()) + (int) g_.k();
+ r_from_pos_to_right_end = r.len() + r.left_hard_clip() - r.right_soft_clip();
+ }
+
+ right_pos.pos = right_pos.pos + r_from_pos_to_right_end;
+ left_pos.pos = left_pos.pos - l_from_pos_to_left_end;
+
+ // This function is overridden in BWAISCounter and BWAIndexFiller
+ ProcessAlignments(left_pos, right_pos);
+}
+
+// ==== insert size counter overloads ====
+bool BWAISCounter::CheckAlignments(const MappedPositionT& l, const MappedPositionT& r) {
+ return l.e == r.e && g_.length(l.e) >= min_contig_len_;
+}
+
+void BWAISCounter::ProcessAlignments(const MappedPositionT& l, const MappedPositionT& r) {
+ ++mapped_count_;
+
+ int is = r.pos - l.pos;
+ if (is > 0 || !ignore_negative_) {
+ hist_[is] += 1;
+ } else {
+ ++negative_count_;
+ }
+}
+
+bool BWAISCounter::RefineInsertSize(SequencingLibraryT& reads) const {
+ using namespace omnigraph;
+ size_t correctly_mapped = mapped_count_ - negative_count_;
+ INFO(correctly_mapped << " paired reads (" << ((double) correctly_mapped * 100.0 / (double) count_) << "% of all) aligned to long edges");
+
+ if (negative_count_ > 3 * correctly_mapped)
+ WARN("Too much reads aligned with negative insert size. Is the library orientation set properly?");
+ if (mapped_count_ == 0)
+ return false;
+
+ std::map<size_t, size_t> percentiles;
+ find_mean(hist_, reads.data().mean_insert_size, reads.data().insert_size_deviation, percentiles);
+ find_median(hist_, reads.data().median_insert_size, reads.data().insert_size_mad, reads.data().insert_size_distribution);
+ if (reads.data().median_insert_size < reads.data().read_length) {
+ return false;
+ }
+
+ std::tie(reads.data().insert_size_left_quantile, reads.data().insert_size_right_quantile) =
+ GetISInterval(0.8, reads.data().insert_size_distribution);
+
+ return !reads.data().insert_size_distribution.empty();
+}
+
+// ==== pair info index filler overloads ====
+EdgePair BWAIndexFiller::ConjugatePair(EdgePair ep) const {
+ return make_pair(g_.conjugate(ep.second), g_.conjugate(ep.first));
+}
+
+void BWAIndexFiller::ProcessAlignments(const MappedPositionT& l, const MappedPositionT& r) {
+ EdgePair ep{l.e, r.e};
+ TRACE("Lpos " << l.pos << ", Rpos " << r.pos);
+ int edge_distance = (int) lib_.data().mean_insert_size - r.pos + l.pos;
+ TRACE("Distance " << edge_distance);
+
+ paired_index_.Add(ep.first, ep.second, omnigraph::de::RawPoint(edge_distance, 1.0));
+}
+
+bool BWAIndexFiller::CheckAlignments(const MappedPositionT& l, const MappedPositionT& r) {
+ return g_.length(l.e) >= min_contig_len_ && g_.length(r.e) >= min_contig_len_;
+}
+
+
+//Main class implementation
+void BWAPairInfoFiller::OutputEdges(const string &filename) const {
+ io::osequencestream_simple oss(filename);
+ for (auto it = g_.ConstEdgeBegin(true); !it.IsEnd(); ++it) {
+ debruijn_graph::EdgeId e = *it;
+ oss.set_header(ToString(g_.int_id(e)));
+ oss << g_.EdgeNucls(e);
+ }
+}
+void BWAPairInfoFiller::FillEdgeIdMap() {
+ for (auto it = g_.ConstEdgeBegin(true); !it.IsEnd(); ++it) {
+ debruijn_graph::EdgeId e = *it;
+ edge_id_map_.insert(make_pair(g_.int_id(e), e));
+ }
+}
+
+bool BWAPairInfoFiller::CreateIndex(const string& contigs) {
+ int run_res = 0;
+ string err_log = path::append_path(work_dir_, "index.err");
+ string index_line = bwa_path_ + string(" index ") + "-a is " + contigs + " 2>" + err_log;
+ index_line = path::screen_whitespaces(index_line);
+ INFO("Running bwa index ... ");
+ INFO("Command line: " << index_line);
+ run_res = system(index_line.c_str());
+ if (run_res != 0) {
+ ERROR("bwa index failed, cannot align reads");
+ return false;
+ }
+ return true;
+}
+
+
+bool BWAPairInfoFiller::RunBWA(const string& reads_file, const string& out_sam_file) const {
+ string run_command = bwa_path_ + " mem -t " + ToString(nthreads_) + " " + index_base_ + " " + reads_file + " > " + out_sam_file + " 2>"
+ + out_sam_file + ".txt";
+ run_command = path::screen_whitespaces(run_command);
+ INFO("Running bwa mem ...");
+ INFO("Command line: " << run_command);
+
+ int run_res = system(run_command.c_str());
+ if (run_res != 0) {
+ ERROR("bwa mem failed, cannot align reads");
+ return false;
+ }
+ return true;
+}
+
+bool BWAPairInfoFiller::AlignLib(const SequencingLibraryT& lib,
+ const string& sam_file_base,
+ vector<pair<string, string>>& resulting_sam_files) {
+
+ VERIFY_MSG(Init(), "BWA index was not constructed properly");
+ resulting_sam_files.clear();
+ size_t file_index = 0;
+ bool any_aligned = false;
+
+ for (auto iter = lib.paired_begin(); iter != lib.paired_end(); iter++) {
+ string left_reads = iter->first;
+ string left_sam = sam_file_base + "_1_" + ToString(file_index) + ".sam";
+ bool res = RunBWA(left_reads, left_sam);
+ if (!res) {
+ WARN("Failed to align left reads " << left_reads);
+ continue;
+ }
+ string right_reads = iter->second;
+ string right_sam = sam_file_base + "_2_" + ToString(file_index) + ".sam";
+ res = RunBWA(right_reads, right_sam);
+ if (!res) {
+ WARN("Failed to align right reads " << right_reads);
+ continue;
+ }
+
+ resulting_sam_files.push_back(make_pair(left_sam, right_sam));
+ any_aligned = true;
+ }
+ return any_aligned;
+}
+
+
+void BWAPairInfoFiller::ProcessSAMFiles(const string &left_sam, const string &right_sam,
+ BWAPairedReadProcessor& processor) {
+
+ //Left and right reads are stored in maps until pair is detected
+ unordered_map<string, MapperReadT> left_reads;
+ unordered_map<string, MapperReadT> right_reads;
+ size_t counter = 0;
+ //Check for duplicated read IDs
+ bool left_duplicated = false;
+ bool right_duplicated = false;
+
+ INFO("Reading SAM files " << left_sam << " and " << right_sam);
+ MappedSamStream lf(left_sam);
+ MappedSamStream rf(right_sam);
+ while (!lf.eof() || !rf.eof()) {
+ SingleSamRead left_read;
+ MapperReadT left_data;
+ string l_name = "";
+
+ SingleSamRead right_read;
+ MapperReadT right_data;
+ string r_name = "";
+
+ if (!lf.eof()) {
+ lf >> left_read;
+ l_name = left_read.name();
+ if (left_read.is_properly_aligned()) {
+ TRACE("Left read " << l_name);
+ left_data = MapperReadT(string(lf.get_contig_name(left_read.contig_id())),
+ left_read.pos(),
+ left_read.data_len(),
+ left_read.strand(),
+ left_read.cigar());
+ }
+ else if (!left_read.is_main_alignment()) {
+ //If not a primary alignment, ignore the mapping
+ TRACE("Ignoring left read");
+ l_name = "";
+ }
+ }
+ if (!rf.eof()) {
+ rf >> right_read;
+ r_name = right_read.name();
+ if (right_read.is_properly_aligned()) {
+ TRACE("Right read " << r_name);
+ right_data = MapperReadT(string(rf.get_contig_name(right_read.contig_id())),
+ right_read.pos(),
+ right_read.data_len(),
+ right_read.strand(),
+ right_read.cigar());
+ }
+ else if (!right_read.is_main_alignment()) {
+ //If not a primary alignment, ignore the mapping
+ TRACE("Ignoring right read");
+ r_name = "";
+ }
+ }
+
+ //Think about custom read names
+ if (l_name == r_name) {
+ TRACE("Equal processing");
+ //Process immediately if IDs are equal in both SAM entries
+ processor.ProcessPairedRead(left_data, right_data);
+ VERBOSE_POWER2(++counter, "Processed " << counter << " paired reads");
+ continue;
+ }
+
+ if (r_name != "") {
+ auto it = left_reads.find(r_name);
+ if (it != left_reads.end()) {
+ //Right read's mate found in map
+ TRACE("Right read's mate found, processing");
+ processor.ProcessPairedRead(it->second, right_data);
+ VERBOSE_POWER2(++counter, "Processed " << counter << " paired reads");
+ //Remove mate as used
+ left_reads.erase(it);
+ }
+ else {
+ TRACE("Right read's mate not found, adding to map");
+ if (right_reads.count(r_name) == 0) {
+ //Insert read without mate for further analysis
+ //TODO inspect map size and performance
+ right_reads.emplace(r_name, right_data);
+ } else {
+ DEBUG("Right read " << r_name << " is duplicated!");
+ //Report duplication
+ right_duplicated = true;
+ }
+ }
+ }
+
+ if (l_name != "") {
+ auto it = right_reads.find(l_name);
+ if (it != right_reads.end()) {
+ //Left read's mate found in map
+ TRACE("Left read's mate found, processing");
+ processor.ProcessPairedRead(left_data, it->second);
+ VERBOSE_POWER2(++counter, "Processed " << counter << " paired reads");
+ //Remove mate as used
+ right_reads.erase(it);
+ }
+ else {
+ TRACE("Left read's mate not found, adding to map");
+ if (left_reads.count(l_name) == 0) {
+ //Insert read without mate for further analysis
+ //TODO inspect map size and performance
+ left_reads.emplace(l_name, left_data);
+ } else {
+ DEBUG("Left read " << r_name << " is duplicated!");
+ //Report duplication
+ left_duplicated = true;
+ }
+
+ }
+ }
+ }
+
+ if (left_duplicated)
+ WARN("SAM file " << left_sam << " contains duplicated read ids");
+ if (right_duplicated)
+ WARN("SAM file " << right_sam << " contains duplicated read ids");
+}
+
+bool BWAPairInfoFiller::Init() {
+ if (!index_constructed_) {
+ INFO("Initializing bwa pair info counter, working dir " << work_dir_);
+ path::make_dir(base_dir_);
+ work_dir_ = path::make_temp_dir(base_dir_, "");
+ index_base_= path::append_path(work_dir_, "long_edges.fasta");
+ INFO("Saving edges to " << index_base_);
+ OutputEdges(index_base_);
+ FillEdgeIdMap();
+ index_constructed_ = CreateIndex(index_base_);
+ }
+ return index_constructed_;
+}
+
+bool BWAPairInfoFiller::ProcessLib(size_t lib_index,
+ SequencingLibraryT& lib,
+ PairedInfoIndexT& paired_index,
+ size_t counter_edge_len,
+ size_t index_filler_edge_len) {
+ //Initialize if needed
+ Init();
+ string lib_dir = path::append_path(work_dir_, ToString(lib_index));
+ path::make_dir(lib_dir);
+ vector<pair<string, string>> sam_files;
+ bool result = false;
+
+ INFO("Mapping lib #" << lib_index << " using BWA");
+ if (!AlignLib(lib, path::append_path(lib_dir, "single"), sam_files)) {
+ WARN("Failed to align lib #" << lib_index);
+ return false;
+ }
+
+ INFO("Estimating insert size for library #" << lib_index);
+ BWAISCounter counter(lib, edge_id_map_, g_, counter_edge_len);
+ for (const auto& sam_pair : sam_files) {
+ ProcessSAMFiles(sam_pair.first, sam_pair.second, counter);
+ }
+
+ if (!counter.RefineInsertSize(lib)) {
+ lib.data().mean_insert_size = 0.0;
+ WARN("Unable to estimate insert size paired library #" << lib_index);
+ }
+ else {
+ INFO(" Estimated insert size for paired library #" << lib_index);
+ INFO(" Insert size = " << lib.data().mean_insert_size <<
+ ", deviation = " << lib.data().insert_size_deviation <<
+ ", left quantile = " << lib.data().insert_size_left_quantile <<
+ ", right quantile = " << lib.data().insert_size_right_quantile <<
+ ", read length = " << lib.data().read_length);
+
+ INFO("Collecting paired information for library #" << lib_index);
+ paired_index.Init();
+
+ BWAIndexFiller filler(lib, edge_id_map_, g_, paired_index, index_filler_edge_len);
+ for (const auto& sam_pair : sam_files) {
+ ProcessSAMFiles(sam_pair.first, sam_pair.second, filler);
+ }
+ result = true;
+ }
+ if (remove_tmp_files_)
+ path::remove_dir(lib_dir);
+ return result;
+}
+
+
+}
diff --git a/src/modules/paired_info/bwa_pair_info_filler.hpp b/src/modules/paired_info/bwa_pair_info_filler.hpp
new file mode 100644
index 0000000..438fafe
--- /dev/null
+++ b/src/modules/paired_info/bwa_pair_info_filler.hpp
@@ -0,0 +1,253 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#include "assembly_graph/graph_core/graph.hpp"
+#include "pipeline/config_struct.hpp"
+
+#include <io/sam_io/sam_reader.hpp>
+#include <io/sam_io/read.hpp>
+
+#include <io/reads_io/osequencestream.hpp>
+#include <paired_info/paired_info.hpp>
+#include <paired_info/insert_size_refiner.hpp>
+
+#ifndef PROJECT_BWA_PAIR_INFO_FILLER_HPP_H
+#define PROJECT_BWA_PAIR_INFO_FILLER_HPP_H
+
+namespace bwa_pair_info {
+
+using namespace sam_reader;
+using debruijn_graph::EdgeId;
+
+typedef omnigraph::de::UnclusteredPairedInfoIndexT<debruijn_graph::Graph> PairedInfoIndexT;
+typedef io::SequencingLibrary<debruijn_graph::config::DataSetData> SequencingLibraryT;
+typedef std::pair<debruijn_graph::EdgeId, debruijn_graph::EdgeId> EdgePair;
+typedef unordered_map<size_t, debruijn_graph::EdgeId> EdgeIdMap;
+
+//More compact representation of aligned read for storing in map
+class MapperReadT {
+public:
+ MapperReadT(): contig_id_(""), pos_(-1), len_(-1), is_forward_(true),
+ left_hard_clip_(0), right_hard_clip_(0), left_soft_clip_(0), right_soft_clip_(0){}
+
+ MapperReadT(const string& ctg_id, int32_t pos, int32_t len, bool is_forward, const string& cigar):
+ contig_id_(ctg_id), pos_(pos), len_(len), is_forward_(is_forward),
+ left_hard_clip_(0), right_hard_clip_(0), left_soft_clip_(0), right_soft_clip_(0) {
+
+ ParseCigar(cigar);
+ }
+
+ bool IsValid() const {
+ return contig_id_ != "";
+ }
+
+private:
+
+ void ParseCigar(const string& cigar);
+
+public:
+ const string &get_contig_id() const {
+ return contig_id_;
+ }
+ int32_t pos() const {
+ return pos_;
+ }
+ int32_t len() const {
+ return len_;
+ }
+ bool is_forward() const {
+ return is_forward_;
+ }
+ uint32_t left_soft_clip() const {
+ return left_soft_clip_;
+ }
+ uint32_t right_soft_clip() const {
+ return right_soft_clip_;
+ }
+ uint32_t left_hard_clip() const {
+ return left_hard_clip_;
+ }
+ uint32_t right_hard_clip() const {
+ return right_hard_clip_;
+ }
+
+private:
+ string contig_id_;
+ int32_t pos_;
+ int32_t len_;
+ bool is_forward_;
+ uint32_t left_hard_clip_:16, right_hard_clip_:16;
+ uint32_t left_soft_clip_:16, right_soft_clip_:16;
+};
+
+//Base class for aligned read processor (simple analog of SequenceMapperListener)
+class BWAPairedReadProcessor {
+public:
+ virtual void ProcessPairedRead(const MapperReadT& l, const MapperReadT& r) = 0;
+
+ virtual ~BWAPairedReadProcessor() {
+
+ }
+};
+
+//Class that corrects mapping positions according to lib orientation and clippings
+class BWACorrectingProcessor: public BWAPairedReadProcessor {
+protected:
+ const SequencingLibraryT& lib_;
+
+ const EdgeIdMap& edge_id_map_;
+
+ const debruijn_graph::Graph& g_;
+
+ size_t count_;
+
+public:
+
+ struct MappedPositionT {
+ EdgeId e;
+ int pos;
+
+ MappedPositionT(EdgeId e_, int pos_): e(e_), pos(pos_) {
+
+ }
+ };
+
+ BWACorrectingProcessor(const SequencingLibraryT& lib, const EdgeIdMap& edge_id_map, const debruijn_graph::Graph& g):
+ lib_(lib), edge_id_map_(edge_id_map), g_(g), count_(0) {
+ }
+
+ virtual bool CheckAlignments(const MappedPositionT& l, const MappedPositionT& r) = 0;
+
+ virtual void ProcessAlignments(const MappedPositionT& l, const MappedPositionT& r) = 0;
+//Correct read alignment according to orientation and clippings
+ virtual void ProcessPairedRead(const MapperReadT& l, const MapperReadT& r);
+};
+
+//Insert size counter
+class BWAISCounter: public BWACorrectingProcessor {
+private:
+ HistType hist_;
+ size_t min_contig_len_;
+ bool ignore_negative_;
+ size_t mapped_count_;
+ size_t negative_count_;
+
+public:
+ BWAISCounter(const SequencingLibraryT& lib, const EdgeIdMap& edge_id_map, const debruijn_graph::Graph& g,
+ size_t min_contig_len, bool ignore_negative = false):
+ BWACorrectingProcessor(lib, edge_id_map, g), hist_(), min_contig_len_(min_contig_len),
+ ignore_negative_(ignore_negative), mapped_count_(0), negative_count_(0) {
+ }
+
+ bool CheckAlignments(const MappedPositionT& l, const MappedPositionT& r) override;
+
+ void ProcessAlignments(const MappedPositionT& l, const MappedPositionT& r) override;
+
+ bool RefineInsertSize(SequencingLibraryT& reads) const;
+
+};
+
+//Pair info filler
+class BWAIndexFiller: public BWACorrectingProcessor {
+
+private:
+ PairedInfoIndexT& paired_index_;
+
+ size_t min_contig_len_;
+
+ EdgePair ConjugatePair(EdgePair ep) const;
+
+public:
+ BWAIndexFiller(const SequencingLibraryT& lib, const EdgeIdMap& edge_id_map, const debruijn_graph::Graph& g,
+ PairedInfoIndexT& paired_index, size_t min_contig_len = 0):
+ BWACorrectingProcessor(lib, edge_id_map, g), paired_index_(paired_index), min_contig_len_(min_contig_len) {
+ }
+
+ bool CheckAlignments(const MappedPositionT& l, const MappedPositionT& r) override;
+
+ void ProcessAlignments(const MappedPositionT& l, const MappedPositionT& r) override;
+};
+
+//Class for running BWA, managing and parsing SAM files
+class BWAPairInfoFiller {
+public:
+ DECL_LOGGER("BWAPairInfo");
+
+private:
+ const debruijn_graph::Graph& g_;
+
+ string bwa_path_;
+
+ string base_dir_;
+
+ string work_dir_;
+
+ size_t nthreads_;
+
+ string index_base_;
+
+ bool index_constructed_;
+
+ bool remove_tmp_files_;
+
+ unordered_map<size_t, debruijn_graph::EdgeId> edge_id_map_;
+
+private:
+
+ //Save graph edges in FASTA format
+ void OutputEdges(const string& filename) const;
+
+ //Construct int_id -> EdgeId map
+ void FillEdgeIdMap();
+
+ //Run bwa index
+ bool CreateIndex(const string& contigs);
+
+ //Initialize for read alignment (includes all of the above)
+ bool Init();
+
+ //Run bwa mem on single file
+ bool RunBWA(const string& reads_file, const string& out_sam_file) const;
+
+ //Process single read library
+ bool AlignLib(const SequencingLibraryT& lib,
+ const string& sam_file_base,
+ vector<pair<string, string>>& resulting_sam_files);
+
+ //Parse a pair of SAM files and analyze alignments with the processor
+ void ProcessSAMFiles(const string &left_sam, const string &right_sam,
+ BWAPairedReadProcessor& processor);
+
+public:
+
+ BWAPairInfoFiller(const debruijn_graph::Graph& g,
+ const string& bwa_path,
+ const string& work_dir,
+ size_t nthreads = 1,
+ bool remove_tmp = true):
+ g_(g), bwa_path_(bwa_path), base_dir_(work_dir), work_dir_(""),
+ nthreads_(nthreads), index_base_(""), index_constructed_(false),
+ remove_tmp_files_(remove_tmp),
+ edge_id_map_() {
+ }
+
+ ~BWAPairInfoFiller() {
+ if (remove_tmp_files_)
+ path::remove_if_exists(work_dir_);
+ }
+
+ //Count IS and fill pair info index for the given lib
+ bool ProcessLib(size_t lib_index,
+ SequencingLibraryT& lib,
+ PairedInfoIndexT& paired_index,
+ size_t counter_edge_len,
+ size_t index_filler_edge_len);
+};
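+
+//Usage sketch (illustrative only; gp, lib and paired_index are assumed to be created by the caller,
+//and the bwa path, directories and thresholds below are made-up values):
+//  BWAPairInfoFiller bwa_filler(gp.g, "/usr/bin/bwa", "tmp/bwa_pair_info", /*nthreads*/ 16);
+//  bwa_filler.ProcessLib(0, lib, paired_index, /*counter_edge_len*/ 10000, /*index_filler_edge_len*/ 0);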
+
+}
+
+#endif //PROJECT_BWA_PAIR_INFO_FILLER_HPP_H
diff --git a/src/modules/paired_info/data_divider.hpp b/src/modules/paired_info/data_divider.hpp
new file mode 100644
index 0000000..7bd2c7b
--- /dev/null
+++ b/src/modules/paired_info/data_divider.hpp
@@ -0,0 +1,137 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+/*
+* data_divider.hpp
+*
+* Created on: Aug 16, 2011
+* Author: alexeyka
+*/
+
+
+#ifndef DATA_DIVIDER_HPP_
+#define DATA_DIVIDER_HPP_
+
+#include <iostream>
+#include <math.h>
+#include "dev_support/verify.hpp"
+#include <vector>
+#include <utility>
+#include <cstdlib>
+#include <cstdio>
+#include "index_point.hpp"
+
+namespace omnigraph {
+
+namespace de {
+
+template<class EdgeId>
+class DataDivider {
+ typedef pair<size_t, size_t> Interval;
+ typedef vector<PairInfo<EdgeId> > PairInfos;
+ typedef pair<EdgeId, EdgeId> EdgePair;
+ typedef vector<Point> PointArray;
+ typedef std::function<double(int)> WeightFunction;
+
+ // double LeftDerivative(int index, vector<int> x, vector<int> y) {
+ // return outf[dist - min_value_ + 1][0] - outf[dist - min][0];
+ // }
+ //
+ // double RightDerivative(index, std::vector<int> x, std::vector<int> y) {
+ // return outf[dist - min_value_][0] - outf[dist - min - 1][0];
+ // }
+ //
+ // double MiddleDerivative(int index, std::vector<int> x, std::vector<int> y) {
+ // return 0.5f * (outf[dist - min_value_ + 1][0] - outf[dist - min - 1][0]);
+ // }
+
+public:
+ DataDivider(size_t threshold, const PointArray &points) :
+ threshold_(threshold), points_(points) {
+ }
+
+ vector<Interval> DivideData() {
+ VERIFY(points_.size() > 0);
+ vector<Interval> answer;
+ min_value_ = rounded_d(points_.front());
+ max_value_ = rounded_d(points_.back());
+ size_t begin = 0;
+ for (size_t i = 0; i < points_.size() - 1; ++i) {
+ if (IsANewCluster(i)) {
+ answer.push_back(make_pair(begin, i + 1));
+ begin = i + 1;
+ }
+ }
+ answer.push_back(make_pair(begin, points_.size()));
+
+ return answer;
+ }
+
+ vector<Interval> DivideAndSmoothData(const EdgePair &ep,
+ PairInfos &new_data,
+ WeightFunction weight_f) {
+ VERIFY(points_.size() > 0);
+ vector<Interval> answer;
+
+ TRACE("Data");
+ //Print();
+ const Point &point = points_.front();
+ min_value_ = rounded_d(point);
+ max_value_ = rounded_d(points_.back());
+ size_t begin = 0;
+ for (size_t i = 0; i < points_.size(); ++i) {
+ if (i == points_.size() - 1 || IsANewCluster(i)) {
+ int low_val = rounded_d(points_[begin]);
+ int high_val = rounded_d(points_[i]);
+ size_t new_begin = new_data.size();
+ VERIFY(low_val <= high_val);
+ for (int j = low_val; j <= high_val; ++j) {
+ double val = 0.;
+ for (size_t k = begin; k <= i; ++k) {
+ val += points_[k].weight * weight_f(j - rounded_d(points_[k]));
+ }
+ new_data.push_back(PairInfo<EdgeId>(ep.first, ep.second, j, val, 0.));
+ }
+ size_t new_end = new_data.size();
+ answer.push_back(make_pair(new_begin, new_end));
+
+ begin = i + 1;
+ }
+ }
+ //answer.push_back(make_pair(beginc, new_data.size()));
+ TRACE("New_data ");
+ Print();
+
+ return answer;
+ }
+
+private:
+ int min_value_;
+ int max_value_;
+ size_t threshold_;
+ PointArray points_;
+
+ void Print() const {
+ for (size_t i = 0; i < points_.size(); ++i) {
+ TRACE(points_[i].d << " " << points_[i].weight);
+ }
+ }
+
+ bool IsANewCluster(size_t index) {
+ VERIFY(index < points_.size() - 1);
+ return (math::gr(abs(points_[index + 1].d - points_[index].d), (DEDistance) threshold_));
+ }
+
+ DECL_LOGGER("DataDivider");
+};
+
+}
+
+
+}
+
+#endif /* DATA_DIVIDER_HPP_ */
diff --git a/src/modules/paired_info/distance_estimation.hpp b/src/modules/paired_info/distance_estimation.hpp
new file mode 100644
index 0000000..7143ef3
--- /dev/null
+++ b/src/modules/paired_info/distance_estimation.hpp
@@ -0,0 +1,309 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#ifndef DISTANCE_ESTIMATION_HPP_
+#define DISTANCE_ESTIMATION_HPP_
+
+#include "math/xmath.h"
+#include "dev_support/openmp_wrapper.h"
+
+#include "paired_info.hpp"
+#include "assembly_graph/paths/path_processor.hpp"
+#include "paired_info/pair_info_bounds.hpp"
+
+namespace omnigraph {
+
+namespace de {
+
+//todo move to some more common place
+template<class Graph>
+class GraphDistanceFinder {
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+ typedef std::vector<EdgeId> Path;
+ typedef std::vector<size_t> GraphLengths;
+ typedef std::map<EdgeId, GraphLengths> LengthMap;
+
+public:
+ GraphDistanceFinder(const Graph &graph, size_t insert_size, size_t read_length, size_t delta) :
+ graph_(graph), insert_size_(insert_size), gap_((int) (insert_size - 2 * read_length)),
+ delta_((double) delta) { }
+
+ std::vector<size_t> GetGraphDistancesLengths(EdgeId e1, EdgeId e2) const {
+ LengthMap m;
+ m.insert({e2, {}});
+
+ FillGraphDistancesLengths(e1, m);
+
+ return m[e2];
+ }
+
+ // finds all distances from a current edge to a set of edges
+ void FillGraphDistancesLengths(EdgeId e1, LengthMap &second_edges) const {
+ vector<VertexId> end_points;
+ vector<size_t> path_lower_bounds;
+ for (const auto &entry : second_edges) {
+ EdgeId second_edge = entry.first;
+ end_points.push_back(graph_.EdgeStart(second_edge));
+ path_lower_bounds.push_back(PairInfoPathLengthLowerBound(graph_.k(), graph_.length(e1),
+ graph_.length(second_edge), gap_, delta_));
+ TRACE("Bounds for paths are " << path_lower_bounds.back());
+ }
+
+ size_t path_upper_bound = PairInfoPathLengthUpperBound(graph_.k(), insert_size_, delta_);
+
+ DistancesLengthsCallback<Graph> callback(graph_);
+
+ PathProcessor<Graph> paths_proc(graph_, graph_.EdgeEnd(e1), path_upper_bound);
+
+ for (size_t i = 0; i < end_points.size(); ++i) {
+ //FIXME should max dist also depend on the point?
+ paths_proc.Process(end_points[i], path_lower_bounds[i], path_upper_bound, callback);
+ }
+
+ vector<GraphLengths> result;
+
+ size_t i = 0;
+ for (auto &entry : second_edges) {
+ GraphLengths lengths = callback.distances(i++);
+ for (size_t j = 0; j < lengths.size(); ++j) {
+ lengths[j] += graph_.length(e1);
+ TRACE("Resulting distance set # " << i <<
+ " edge " << graph_.int_id(entry.first) << " #" << j << " length " << lengths[j]);
+ }
+
+ if (e1 == entry.first)
+ lengths.push_back(0);
+
+ std::sort(lengths.begin(), lengths.end());
+ entry.second = lengths;
+ }
+ }
+
+private:
+ DECL_LOGGER("GraphDistanceFinder");
+
+ const Graph &graph_;
+ const size_t insert_size_;
+ const int gap_;
+ const double delta_;
+};
+
+template<class Graph>
+class AbstractDistanceEstimator {
+protected:
+ typedef UnclusteredPairedInfoIndexT<Graph> InPairedIndex;
+ typedef PairedInfoIndexT<Graph> OutPairedIndex;
+ typedef typename InPairedIndex::HistProxy InHistogram;
+ typedef typename OutPairedIndex::Histogram OutHistogram;
+
+public:
+ AbstractDistanceEstimator(const Graph &graph,
+ const InPairedIndex &index,
+ const GraphDistanceFinder<Graph> &distance_finder,
+ size_t linkage_distance = 0)
+ : graph_(graph), index_(index),
+ distance_finder_(distance_finder), linkage_distance_(linkage_distance) { }
+
+ virtual void Estimate(PairedInfoIndexT<Graph> &result, size_t nthreads) const = 0;
+
+ virtual ~AbstractDistanceEstimator() { }
+
+protected:
+ typedef typename Graph::EdgeId EdgeId;
+ typedef pair<EdgeId, EdgeId> EdgePair;
+ typedef vector<pair<int, double> > EstimHist;
+ typedef vector<size_t> GraphLengths;
+ typedef std::map<EdgeId, GraphLengths> LengthMap;
+
+ const Graph &graph() const { return graph_; }
+
+ const InPairedIndex &index() const { return index_; }
+
+ void FillGraphDistancesLengths(EdgeId e1, LengthMap &second_edges) const {
+ distance_finder_.FillGraphDistancesLengths(e1, second_edges);
+ }
+
+ OutHistogram ClusterResult(EdgePair /*ep*/, const EstimHist &estimated) const {
+ OutHistogram result;
+ for (size_t i = 0; i < estimated.size(); ++i) {
+ size_t left = i;
+ double weight = estimated[i].second;
+ while (i + 1 < estimated.size() &&
+ (estimated[i + 1].first - estimated[i].first) <= (int) linkage_distance_) {
+ ++i;
+ weight += estimated[i].second;
+ }
+ double center = (estimated[left].first + estimated[i].first) * 0.5;
+ double var = (estimated[i].first - estimated[left].first) * 0.5;
+ result.insert(Point(center, weight, var));
+ }
+ return result;
+ }
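+
+ // Worked example (illustration): with linkage_distance_ = 5, the estimates
+ // {(100, 2.0), (103, 1.0), (250, 5.0)} collapse into Point(101.5, 3.0, 1.5) and Point(250, 5.0, 0).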
+
+ void AddToResult(const OutHistogram &clustered, EdgePair ep, PairedInfoBuffer<Graph> &result) const {
+ result.AddMany(ep.first, ep.second, clustered);
+ }
+
+private:
+ const Graph &graph_;
+ const InPairedIndex &index_;
+ const GraphDistanceFinder<Graph> &distance_finder_;
+ const size_t linkage_distance_;
+
+ virtual const string Name() const = 0;
+};
+
+template<class Graph>
+class DistanceEstimator : public AbstractDistanceEstimator<Graph> {
+ typedef AbstractDistanceEstimator<Graph> base;
+ typedef typename Graph::EdgeId EdgeId;
+ typedef vector<size_t> GraphLengths;
+ typedef vector<pair<int, double> > EstimHist;
+ typedef pair<EdgeId, EdgeId> EdgePair;
+
+protected:
+ typedef typename base::InPairedIndex InPairedIndex;
+ typedef typename base::OutPairedIndex OutPairedIndex;
+ typedef typename base::InHistogram InHistogram;
+ typedef typename base::OutHistogram OutHistogram;
+
+public:
+ DistanceEstimator(const Graph &graph,
+ const InPairedIndex &index,
+ const GraphDistanceFinder<Graph> &distance_finder,
+ size_t linkage_distance, size_t max_distance)
+ : base(graph, index, distance_finder, linkage_distance), max_distance_(max_distance) { }
+
+ virtual ~DistanceEstimator() { }
+
+ void Init() const {
+ INFO("Using " << this->Name() << " distance estimator");
+ }
+
+ virtual void Estimate(OutPairedIndex &result, size_t nthreads) const {
+ this->Init();
+ const auto &index = this->index();
+
+ DEBUG("Collecting edge infos");
+ std::vector<EdgeId> edges;
+ for (auto it = this->graph().ConstEdgeBegin(); !it.IsEnd(); ++it)
+ edges.push_back(*it);
+
+ DEBUG("Processing");
+ PairedInfoBuffersT<Graph> buffer(this->graph(), nthreads);
+# pragma omp parallel for num_threads(nthreads) schedule(guided, 10)
+ for (size_t i = 0; i < edges.size(); ++i) {
+ EdgeId edge = edges[i];
+ ProcessEdge(edge, index, buffer[omp_get_thread_num()]);
+ }
+
+ for (size_t i = 0; i < nthreads; ++i) {
+ result.Merge(buffer[i]);
+ buffer[i].Clear();
+ }
+ }
+
+protected:
+ const DEDistance max_distance_;
+
+ virtual EstimHist EstimateEdgePairDistances(EdgePair ep,
+ const InHistogram &histogram,
+ const GraphLengths &raw_forward) const {
+ using std::abs;
+ using namespace math;
+ EdgeId e1 = ep.first, e2 = ep.second;
+ size_t first_len = this->graph().length(e1), second_len = this->graph().length(e2);
+ int minD = rounded_d(histogram.min()), maxD = rounded_d(histogram.max());
+
+ TRACE("Bounds are " << minD << " " << maxD);
+ EstimHist result;
+ vector<int> forward;
+ forward.reserve(raw_forward.size());
+ for (auto raw_length : raw_forward) {
+ int length = int(raw_length);
+ if (minD - int(max_distance_) <= length && length <= maxD + int(max_distance_))
+ forward.push_back(length);
+ }
+ if (forward.size() == 0)
+ return result;
+
+ size_t cur_dist = 0;
+ vector<DEWeight> weights(forward.size(), 0);
+ for (auto point : histogram) {
+ if (ls(2 * point.d + second_len, DEDistance(first_len)))
+ continue;
+ while (cur_dist + 1 < forward.size() && forward[cur_dist + 1] < point.d)
+ ++cur_dist;
+
+ if (cur_dist + 1 < forward.size() &&
+ ls(forward[cur_dist + 1] - point.d, point.d - forward[cur_dist])) {
+ ++cur_dist;
+
+ if (le(abs(forward[cur_dist] - point.d), max_distance_))
+ weights[cur_dist] += point.weight;
+ } else if (cur_dist + 1 < forward.size() &&
+ eq(forward[cur_dist + 1] - point.d, point.d - forward[cur_dist])) {
+ if (le(abs(forward[cur_dist] - point.d), max_distance_))
+ weights[cur_dist] += point.weight * 0.5;
+ ++cur_dist;
+ if (le(abs(forward[cur_dist] - point.d), max_distance_))
+ weights[cur_dist] += point.weight * 0.5;
+ } else {
+ if (le(abs(forward[cur_dist] - point.d), max_distance_))
+ weights[cur_dist] += point.weight;
+ }
+ }
+
+ for (size_t i = 0; i < forward.size(); ++i)
+ if (ge(weights[i], DEWeight(0)))
+ result.push_back(make_pair(forward[i], weights[i]));
+
+ VERIFY(result.size() == forward.size());
+ return result;
+ }
+
+private:
+ virtual void ProcessEdge(EdgeId e1,
+ const InPairedIndex &pi,
+ PairedInfoBuffer<Graph> &result) const {
+ typename base::LengthMap second_edges;
+ auto inner_map = pi.GetHalf(e1);
+ for (auto i : inner_map)
+ second_edges[i.first];
+
+ this->FillGraphDistancesLengths(e1, second_edges);
+
+ for (const auto &entry: second_edges) {
+ EdgeId e2 = entry.first;
+ EdgePair ep(e1, e2);
+
+ VERIFY(ep <= pi.ConjugatePair(ep));
+
+ const GraphLengths &forward = entry.second;
+ TRACE("Edge pair is " << this->graph().int_id(ep.first)
+ << " " << this->graph().int_id(ep.second));
+ auto hist = pi.Get(e1, e2);
+ const EstimHist &estimated = this->EstimateEdgePairDistances(ep, hist, forward);
+ OutHistogram res = this->ClusterResult(ep, estimated);
+ this->AddToResult(res, ep, result);
+ }
+ }
+
+ virtual const string Name() const {
+ static const string my_name = "SIMPLE";
+ return my_name;
+ }
+
+ DECL_LOGGER("DistanceEstimator");
+};
+
+}
+
+}
+
+#endif /* DISTANCE_ESTIMATION_HPP_ */
diff --git a/src/modules/paired_info/histogram.hpp b/src/modules/paired_info/histogram.hpp
new file mode 100644
index 0000000..c326f6e
--- /dev/null
+++ b/src/modules/paired_info/histogram.hpp
@@ -0,0 +1,190 @@
+//***************************************************************************
+//* Copyright (c) 2015-2016 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include <btree/btree_set.h>
+#include "utils/adt/flat_set.hpp"
+#include "utils/adt/small_pod_vector.hpp"
+#include "index_point.hpp"
+
+namespace omnigraph {
+
+namespace de {
+
+template<class Point>
+class Histogram {
+ typedef Histogram<Point> self_type;
+ typedef typename std::less<Point> key_compare;
+ typedef typename std::allocator<Point> allocator_type;
+ typedef typename adt::flat_set<Point, key_compare, adt::SmallPODVector> Tree;
+
+public:
+ typedef typename Tree::key_type key_type;
+ typedef typename Tree::value_type value_type;
+ typedef typename Tree::pointer pointer;
+ typedef typename Tree::const_pointer const_pointer;
+ typedef typename Tree::reference reference;
+ typedef typename Tree::const_reference const_reference;
+ typedef typename Tree::size_type size_type;
+ typedef typename Tree::difference_type difference_type;
+ typedef typename Tree::iterator iterator;
+ typedef typename Tree::const_iterator const_iterator;
+ typedef typename Tree::reverse_iterator reverse_iterator;
+ typedef typename Tree::const_reverse_iterator const_reverse_iterator;
+
+ enum {
+ kValueSize = sizeof(Point)
+ };
+
+public:
+ // Default constructor.
+ Histogram() = default;
+
+ // Copy constructor.
+ Histogram(const self_type &x)
+ : tree_(x.tree_) {}
+
+ template <class InputIterator>
+ Histogram(InputIterator b, InputIterator e) {
+ insert(b, e);
+ }
+
+ // Iterator routines.
+ iterator begin() { return tree_.begin(); }
+ const_iterator begin() const { return tree_.begin(); }
+ iterator end() { return tree_.end(); }
+ const_iterator end() const { return tree_.end(); }
+ reverse_iterator rbegin() { return tree_.rbegin(); }
+ const_reverse_iterator rbegin() const { return tree_.rbegin(); }
+ reverse_iterator rend() { return tree_.rend(); }
+ const_reverse_iterator rend() const { return tree_.rend(); }
+
+ // Lookup routines.
+ iterator lower_bound(const key_type &key) { return tree_.lower_bound(key); }
+ const_iterator lower_bound(const key_type &key) const { return tree_.lower_bound(key); }
+ iterator upper_bound(const key_type &key) { return tree_.upper_bound(key); }
+ const_iterator upper_bound(const key_type &key) const { return tree_.upper_bound(key); }
+ std::pair<iterator,iterator> equal_range(const key_type &key) { return tree_.equal_range(key); }
+ std::pair<const_iterator,const_iterator> equal_range(const key_type &key) const { return tree_.equal_range(key); }
+
+ // Utility routines.
+ void clear() { tree_.clear(); }
+ void swap(self_type &x) { tree_.swap(x.tree_); }
+
+ // Size routines.
+ size_type size() const { return tree_.size(); }
+ size_type max_size() const { return tree_.max_size(); }
+ bool empty() const { return tree_.empty(); }
+ size_type bytes_used() const { return tree_.bytes_used(); }
+
+ // Lookup routines.
+ iterator find(const key_type &key) { return tree_.find(key); }
+ const_iterator find(const key_type &key) const { return tree_.find(key); }
+ size_type count(const key_type &key) const { return tree_.count(key); }
+
+ // Insertion routines.
+ std::pair<iterator,bool> insert(const value_type &x) { return tree_.insert(x); }
+ iterator insert(iterator position, const value_type &x) { return tree_.insert(position, x); }
+ template <typename InputIterator>
+ void insert(InputIterator b, InputIterator e) { tree_.insert(b, e); }
+
+ // Deletion routines.
+ size_type erase(const key_type &key) { return tree_.erase(key); }
+ // Erase the specified iterator from the underlying container. The iterator must be valid
+ // (i.e. not equal to end()). Return an iterator pointing to the element after
+ // the one that was erased (or end() if none exists).
+ iterator erase(const iterator &iter) { return tree_.erase(iter); }
+ void erase(const iterator &first, const iterator &last) { tree_.erase(first, last); }
+
+ bool operator==(const self_type& x) const {
+ if (size() != x.size())
+ return false;
+
+ for (const_iterator i = begin(), xi = x.begin(); i != end(); ++i, ++xi)
+ if (*i != *xi)
+ return false;
+
+ return true;
+ }
+
+ bool operator!=(const self_type& other) const {
+ return !operator==(other);
+ }
+
+protected:
+ Tree tree_;
+
+private:
+ // This is template voodoo which creates function overload depending on
+ // whether Point has const operator+= or not.
+ template<class>
+ struct true_helper : std::true_type {};
+ template<class T = Point>
+ static auto test_can_merge(int) -> true_helper<decltype(std::declval<const T>().operator+=(std::declval<const T>()))>;
+ template<class>
+ static auto test_can_merge(long) -> std::false_type;
+ template<class T = Point>
+ struct can_merge : decltype(test_can_merge<T>(0)) {};
+
+public:
+ // This function overload is enabled only when Point has const operator+= (e.g. RawPoint)
+ // and therefore we can update it in place.
+ template<class U = Point>
+ typename std::enable_if<can_merge<U>::value, size_t>::type
+ merge_point(const U &new_point) {
+ // First, try to insert a point
+ const auto &result = insert(new_point);
+ if (result.second)
+ return 1;
+ // We already have something there. Try to merge the new point in.
+ *result.first += new_point;
+ return 0;
+ }
+
+ // Otherwise this overload is used: it removes the point from the set,
+ // updates it, and re-inserts it.
+ template<class U = Point>
+ typename std::enable_if<!can_merge<U>::value, size_t>::type
+ merge_point(const U &new_point) {
+ auto result = insert(new_point);
+ if (result.second)
+ return 1;
+ Point updated = *result.first + new_point;
+ auto after_removed = erase(result.first);
+ insert(after_removed, updated);
+ return 0;
+ }
+
+ template<class OtherHist>
+ size_t merge(const OtherHist &other) {
+ size_t added = 0;
+ for (const auto &new_point : other) {
+ added += merge_point(new_point);
+ }
+ return added;
+ }
+};
+
+template<typename T>
+inline std::ostream &operator<<(std::ostream &os, const Histogram<T> &b) {
+ os << "{";
+ for (const auto& e : b)
+ os << e << "; ";
+ os << "}";
+ return os;
+}
+
+typedef Histogram<RawGapPoint> RawGapHistogram;
+typedef Histogram<GapPoint> GapHistogram;
+
+typedef Histogram<RawPoint> RawHistogram;
+typedef Histogram<Point> HistogramWithWeight;
+
+}
+
+}
diff --git a/src/modules/paired_info/index_point.hpp b/src/modules/paired_info/index_point.hpp
new file mode 100644
index 0000000..ee26121
--- /dev/null
+++ b/src/modules/paired_info/index_point.hpp
@@ -0,0 +1,370 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "math/xmath.h"
+
+#include <algorithm>
+#include <cstdint>
+#include <limits>
+#include <ostream>
+#include <sstream>
+#include <string>
+
+namespace omnigraph {
+
+namespace de {
+
+/**
+ * @brief Type for measuring distance between edges.
+ * ---edge1---> --edge2->
+ * |----distance-----|
+ * Can be negative if edges are overlapped.
+ * For paired reads, distance roughly equals to insert size.
+ */
+class DEDistance {
+public:
+ DEDistance() = default;
+ DEDistance(int d)
+ : d_((float)d) {}
+ DEDistance(double d)
+ : d_((float)d) {}
+ DEDistance(size_t d)
+ : d_((float)d) {}
+ operator float() const { return d_; }
+ DEDistance operator+= (double d) {
+ d_ += (float)d;
+ return *this;
+ }
+ DEDistance operator*= (double d) {
+ d_ *= (float)d;
+ return *this;
+ }
+private:
+ float d_;
+};
+
+/**
+ * @brief Scalar type for measuring the weight of a point.
+ * Should not be negative.
+ */
+class DEWeight {
+public:
+ DEWeight() = default;
+ DEWeight(double d)
+ : d_((float)d) {}
+ operator float() const { return d_; }
+ DEWeight operator+= (double d) {
+ d_ += (float)d;
+ return *this;
+ }
+ DEWeight operator*= (double d) {
+ d_ *= (float)d;
+ return *this;
+ }
+private:
+ float d_;
+};
+
+/**
+ * @brief Scalar type for measuring the variance of point distance in clustered index.
+ * |---- max distance ----|
+ * |min distance|
+ * |var-|
+ * Should not be negative.
+ */
+typedef float DEVariance;
+
+/**
+ * @brief A proof-of-concept bicyclish wrapper for small integer types with saturated arithmetics.
+ * All operations are in the interval of [-OFFSET .. MAX_VAL - OFFSET]
+ * where MAX_VAL is 2^(8 * sizeof(T)) - 1.
+ */
+template<typename T, T OFFSET = 0>
+class DESat {
+public:
+ DESat(): _val(OFFSET) {}
+ DESat(int val): _val(shrink(val)) {}
+ DESat(float val): _val(shrink((int)val)) {}
+
+ operator float() const {
+ return (float)_val - OFFSET;
+ }
+
+ DESat& operator+=(DESat rhs) {
+ _val = sadd(_val, rhs._val);
+ if (_val > OFFSET) //subtract the extra offset that came from the rhs
+ _val -= OFFSET;
+ else
+ _val = 0;
+ return *this;
+ }
+
+ static DESat Max() {
+ return DESat(std::numeric_limits<T>::max());
+ }
+
+private:
+ DESat(T val): _val(val) {}
+
+ T _val;
+ static const T MAX_VAL = std::numeric_limits<T>::max();
+
+ template<typename F>
+ static T shrink(F d) {
+ //Saturate to the allowed interval
+ //TODO: optimize
+ F t = d + OFFSET;
+ t = std::max(F(0), t);
+ t = std::min(F(MAX_VAL), t);
+ return (T)t;
+ }
+
+ static T sadd(T a, T b) {
+ //TODO: check
+ return (a > MAX_VAL - b) ? MAX_VAL : a + b;
+ }
+};
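+
+//Illustrative sketch (assuming T = uint8_t and OFFSET = 0): for DESat<uint8_t> a(250), b(10),
+//a += b saturates a at 255 instead of wrapping around to 4.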
+
+/**
+ * @brief Type for measuring a small gap between edges.
+ * Gap equals the distance minus the length of the first edge, and can be slightly negative in case of overlap.
+ * ---edge1---> --edge2->
+ * |-gap--|
+ */
+//typedef DESat<uint16_t, 512> DEGap;
+typedef float DEGap;
+
+/**
+ * @brief Type for weighting points in a paired info index.
+ */
+//typedef DESat<uint16_t> DECropWeight;
+typedef float DECropWeight;
+
+/**
+ * @brief Raw point of unclustered index. Parameterized by distance and weight types.
+ */
+template<typename D, typename W>
+struct __attribute((aligned(sizeof(D) + sizeof(W)))) RawPointT {
+ typedef RawPointT<D, W> Self;
+ D d;
+ mutable W weight;
+
+ RawPointT()
+ : RawPointT(0.0, 0.0) {}
+
+ RawPointT(D distance, W weight)
+ : d(distance), weight(weight) {}
+
+ Self& operator+=(const Self &rhs) {
+ weight += rhs.weight;
+ return *this;
+ }
+
+ std::string str() const {
+ std::ostringstream ss;
+ ss << "Point: " << " distance = " << this->d
+ << ", weight = " << this->weight;
+ return ss.str();
+ }
+
+ bool operator<(const Self& rhs) const {
+ return math::ls(this->d, rhs.d);
+ }
+
+ bool operator==(const Self& rhs) const {
+ return math::eq(this->d, rhs.d);
+ }
+
+ bool operator!=(const Self& rhs) const {
+ return !(operator==(rhs));
+ }
+
+ Self operator-() const {
+ return Self(-d, weight);
+ }
+
+ Self operator+(const Self &rhs) const {
+ return Self(d, rhs.weight + this->weight);
+ }
+
+ DEVariance variance() { return 0; } //TODO: remove
+};
+
+typedef RawPointT<DEDistance, DEWeight> RawPoint;
+typedef RawPointT<DEGap, DECropWeight> RawGapPoint;
+
+inline int rounded_d(const RawPoint& p) {
+ return math::round_to_zero(p.d);
+}
+
+inline std::ostream& operator<<(std::ostream& os, const RawPoint &point) {
+ return os << point.str();
+}
+
+/**
+ * @brief Clustered index point. Parameterized by distance and weight types, also has variance.
+ */
+template<typename D, typename W>
+struct PointT : public RawPointT<D, W> {
+ typedef PointT<D, W> Self;
+ DEVariance var;
+ PointT()
+ : PointT(0.0, 0.0, 0.0) {}
+
+ PointT(D distance, W weight, DEVariance variance)
+ : RawPointT<D, W>(distance, weight), var(variance) {}
+
+ PointT(const RawPointT<D, W> &rhs)
+ : RawPointT<D, W>(rhs), var(0.0) {}
+
+ bool operator<(const Self& rhs) const {
+ return math::ls(this->d, rhs.d);
+ }
+
+ bool operator==(const Self& rhs) const {
+ return math::eq(this->d, rhs.d);
+ }
+
+ bool operator!=(const Self& rhs) const {
+ return !(operator==(rhs));
+ }
+
+ Self operator+(const Self &rhs) const {
+ // compute new bounds for the case when we are merging pair infos with var != 0
+ auto left_bound = std::min(this->d - var, rhs.d - rhs.var);
+ auto right_bound = std::max(this->d + var, rhs.d + rhs.var);
+ auto new_dist = DEDistance((left_bound + right_bound) * 0.5f);
+ auto new_weight = this->weight + rhs.weight; //TODO: crop
+ auto new_variance = (right_bound - left_bound) * 0.5f;
+
+ return Self(new_dist, new_weight, new_variance);
+ }
+
+ bool lt(const Self &rhs) const {
+ return math::ls(this->weight, rhs.weight);
+ }
+
+ DEVariance variance() { return this->var; } //TODO: remove
+};
+
+typedef PointT<DEDistance, DEWeight> Point;
+typedef PointT<DEGap, DECropWeight> GapPoint;
+
+inline std::ostream& operator<<(std::ostream& os, const Point &point) {
+ return os << point.str();
+}
+
+/**
+ * @brief Policy-like type which provides associated point types for unclustered index
+ * and static methods for converting between them.
+ */
+struct RawPointTraits {
+ typedef RawGapPoint Gapped;
+ typedef RawPoint Expanded;
+
+ static Gapped Shrink(Expanded p, DEDistance edge) {
+ DEGap gap = DEGap(p.d - edge);
+ return RawGapPoint(gap, p.weight);
+ }
+
+ static Expanded Expand(Gapped p, DEDistance edge) {
+ RawPoint res(p.d, p.weight);
+ res.d += edge;
+ return res;
+ }
+};
+
+/**
+ * @brief Policy-like type which provides associated point types for clustered index
+ * and static methods for converting between them.
+ */
+struct PointTraits {
+ typedef GapPoint Gapped;
+ typedef Point Expanded;
+
+ static Gapped Shrink(const Expanded &p, DEDistance edge) {
+ DEGap gap = DEGap(p.d - edge);
+ return GapPoint(gap, p.weight, p.var);
+ }
+
+ static Expanded Expand(Gapped p, DEDistance edge) {
+ Point res(p.d, p.weight, p.var);
+ res.d += edge;
+ return res;
+ }
+};
+
+// tuple of a pair of edges @first, @second, and a @point
+template<typename EdgeId>
+struct PairInfo {
+ EdgeId first;
+ EdgeId second;
+ Point point;
+
+ PairInfo()
+ : first(), second(), point() {}
+
+ PairInfo(const PairInfo& pair_info)
+ : first(pair_info.first), second(pair_info.second), point(pair_info.point) {}
+
+ PairInfo(EdgeId first, EdgeId second, DEDistance d, DEWeight weight, DEDistance var)
+ : first(first), second(second), point(d, weight, var) {}
+
+ PairInfo(EdgeId first, EdgeId second, Point point)
+ : first(first), second(second), point(point) {}
+
+ // Two paired infos are considered equal
+ // if they coincide in all parameters except for weight and variance.
+ bool operator==(const PairInfo& rhs) const {
+ const PairInfo &lhs = *this;
+ return lhs.first == rhs.first && lhs.second == rhs.second && lhs.point == rhs.point;
+ }
+
+ bool operator!=(const PairInfo& rhs) const {
+ return !(*this == rhs);
+ }
+
+ bool operator<(const PairInfo<EdgeId>& rhs) const {
+ const PairInfo<EdgeId>& lhs = *this;
+ return lhs.first == rhs.first ?
+ (lhs.second == rhs.second ? lhs.point < rhs.point : lhs.second < rhs.second)
+ : lhs.first < rhs.first;
+ }
+
+ double d() const { return point.d; }
+ double weight() const { return point.weight; }
+ double var() const { return point.var; }
+};
+
+template<typename EdgeId>
+ostream& operator<<(ostream& os, const PairInfo<EdgeId>& info) {
+ return os << "PairInfo: first = " << info.first << ", second = " << info.second
+ << "Point : " << info.point;
+}
+
+/**
+ * Returns the approximate distance between occurrences of the edges in the genome, rounded to the
+ * nearest integer. In case of a tie the value closest to 0 is chosen, so one can assume that the distance
+ * is rounded the same way as the opposite one.
+ * todo check that written here is true
+ */
+template<typename EdgeId>
+inline int rounded_d(PairInfo<EdgeId> const& pi) {
+ return math::round_to_zero(pi.d());
+}
+
+template<typename EdgeId>
+inline PairInfo<EdgeId> BackwardInfo(const PairInfo<EdgeId>& pi) {
+ return PairInfo<EdgeId>(pi.second, pi.first, -pi.point);
+}
+
+}
+
+}
+
+namespace std {
+template<>
+class numeric_limits<omnigraph::de::DEDistance> : public numeric_limits<float> {};
+template<>
+class numeric_limits<omnigraph::de::DEWeight> : public numeric_limits<float> {};
+}
diff --git a/src/modules/paired_info/insert_size_refiner.hpp b/src/modules/paired_info/insert_size_refiner.hpp
new file mode 100644
index 0000000..cbaf257
--- /dev/null
+++ b/src/modules/paired_info/insert_size_refiner.hpp
@@ -0,0 +1,165 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "dev_support/standard_base.hpp"
+#include "dev_support/cpp_utils.hpp"
+#include "assembly_graph/stats/picture_dump.hpp"
+//#include "sequence_mapper.hpp"
+
+namespace omnigraph {
+
+typedef std::map<int, size_t> HistType;
+
+inline double get_median(const HistType &hist) {
+ double S = 0;
+ for (auto iter = hist.begin(); iter != hist.end(); ++iter)
+ S += (double) iter->second;
+
+ double sum = S;
+ for (auto iter = hist.begin(); iter != hist.end(); ++iter) {
+ sum -= (double) iter->second;
+ if (sum <= S / 2) {
+ return iter->first;
+ }
+ }
+ assert(false);
+ return -1;
+}
+
+inline double get_mad(const HistType &hist, double median) { // median absolute deviation
+ std::map<int, size_t> hist2;
+ for (auto iter = hist.begin(); iter != hist.end(); ++iter) {
+ int x = abs(iter->first - math::round_to_zero(median));
+ hist2[x] = iter->second;
+ }
+ return get_median(hist2);
+}
+
+inline void hist_crop(const HistType &hist, double low, double high, HistType &res) {
+ for (auto iter = hist.begin(); iter != hist.end(); ++iter) {
+ if (iter->first >= low && iter->first <= high) {
+ DEBUG("Cropped histogram " << iter->first << " " << iter->second);
+ res.insert(*iter);
+ }
+ }
+}
+
+inline
+std::pair<double, double> GetISInterval(double quantile,
+ const HistType &is_hist) {
+ // First, obtain the sum of the values
+ double S = 0;
+ for (auto iter : is_hist)
+ S += (double) iter.second;
+
+ double lval = S * (1 - quantile) / 2, rval = S * (1 + quantile) / 2;
+ double is_min, is_max;
+
+ // Now, find the quantiles
+ double cS = 0;
+ is_min = is_hist.begin()->first;
+ is_max = is_hist.rbegin()->first;
+ for (auto iter : is_hist) {
+ if (cS <= lval)
+ is_min = iter.first;
+ else if (cS <= rval)
+ is_max = iter.first;
+ cS += (double) iter.second;
+ }
+
+ return std::make_pair(is_min, is_max);
+}
+
+inline void find_median(const HistType &hist, double &median, double &mad, HistType &cropped_hist) {
+ DEBUG("Counting median and MAD");
+ median = get_median(hist);
+ mad = get_mad(hist, median);
+ double low = median - 5. * 1.4826 * mad;
+ double high = median + 5. * 1.4826 * mad;
+ omnigraph::hist_crop(hist, low, high, cropped_hist);
+ median = get_median(cropped_hist);
+ mad = get_mad(cropped_hist, median);
+}
+
+//Moved from insert size counter.
+//TODO: Please explain constants like 1.4826.
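+//(1.4826 is approximately 1 / Phi^-1(3/4), the standard factor that rescales the MAD into an
+//estimate of the standard deviation for normally distributed data, so 5 * 1.4826 * MAD ~ 5 sigma.)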
+inline void find_mean(const HistType &hist, double &mean, double &delta, std::map<size_t, size_t> &percentiles) {
+ double median = get_median(hist);
+ double mad = get_mad(hist, median);
+ double low = median - 5. * 1.4826 * mad;
+ double high = median + 5. * 1.4826 * mad;
+
+ DEBUG("Median IS: " << median);
+ DEBUG("MAD: " << mad);
+ DEBUG("Thresholds set to: [" << low << ", " << high << "]");
+
+ size_t n = 0;
+ double sum = 0.;
+ double sum2 = 0.;
+ DEBUG("Counting average");
+ for (auto iter = hist.begin(); iter != hist.end(); ++iter) {
+ if (iter->first < low || iter->first > high) {
+ continue;
+ }
+ n += iter->second;
+ sum += (double) iter->second * 1. * (double) iter->first;
+ sum2 += (double) iter->second * 1. * (double) iter->first * (double) iter->first;
+ }
+ mean = sum / (double) n;
+ delta = sqrt(sum2 / (double) n - mean * mean);
+
+ low = mean - 5 * delta;
+ high = mean + 5 * delta;
+
+ DEBUG("Mean IS: " << mean);
+ DEBUG("sd: " << delta);
+ DEBUG("Thresholds set to: [" << low << ", " << high << "]");
+
+ n = 0;
+ sum = 0.;
+ sum2 = 0.;
+ for (auto iter = hist.begin(); iter != hist.end(); ++iter) {
+ if (iter->first < low || iter->first > high) {
+ continue;
+ }
+ n += iter->second;
+ sum += (double) iter->second * 1. * (double) iter->first;
+ sum2 += (double) iter->second * 1. * (double) iter->first * (double) iter->first;
+ }
+ mean = sum / (double) n;
+ delta = sqrt(sum2 / (double) n - mean * mean);
+
+ DEBUG("Mean IS: " << mean);
+ DEBUG("sd: " << delta);
+
+ size_t m = 0;
+
+ DEBUG("Counting percentiles");
+ //todo optimize
+ size_t q[19];
+ for (size_t i = 1; i < 20; ++i) {
+ q[i - 1] = 5 * i;
+ }
+ for (auto iter = hist.begin(); iter != hist.end(); ++iter) {
+ if (iter->first < low || iter->first > high) {
+ continue;
+ }
+ size_t mm = m + iter->second;
+ for (size_t i = 0; i < utils::array_size(q); i++) {
+ size_t scaled_q_i((size_t) ((double) q[i] / 100. * (double) n));
+ if (m < scaled_q_i && mm >= scaled_q_i) {
+ percentiles[q[i]] = (size_t) iter->first;
+ }
+ }
+ m = mm;
+ }
+}
+
+
+}
diff --git a/src/modules/paired_info/is_counter.hpp b/src/modules/paired_info/is_counter.hpp
new file mode 100644
index 0000000..678387c
--- /dev/null
+++ b/src/modules/paired_info/is_counter.hpp
@@ -0,0 +1,167 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+/*
+ * is_counter.hpp
+ *
+ * Created on: May 25, 2014
+ * Author: andrey
+ */
+
+#ifndef IS_COUNTER_HPP_
+#define IS_COUNTER_HPP_
+
+
+#include "paired_info/insert_size_refiner.hpp"
+#include "assembly_graph/graph_alignment/sequence_mapper_notifier.hpp"
+
+namespace debruijn_graph {
+
+using namespace omnigraph;
+
+class InsertSizeCounter: public SequenceMapperListener {
+
+public:
+
+ InsertSizeCounter(const conj_graph_pack& gp,
+ size_t edge_length_threshold,
+ bool ignore_negative = false)
+ : gp_(gp),
+ edge_length_threshold_(edge_length_threshold),
+ ignore_negative_(ignore_negative) {
+ }
+
+ HistType hist() { return hist_; }
+ size_t total() const { return total_.total_; }
+ size_t mapped() const { return counted_.total_; }
+ size_t negative() const { return negative_.total_; }
+
+
+ virtual void StartProcessLibrary(size_t threads_count) {
+ hist_.clear();
+ tmp_hists_ = vector<HistType>(threads_count);
+
+ total_ = count_data(threads_count);
+ counted_ = count_data(threads_count);
+ negative_ = count_data(threads_count);
+ }
+
+ virtual void StopProcessLibrary() {
+ for (size_t i = 0; i < tmp_hists_.size(); ++i) {
+ MergeBuffer(i);
+ }
+ tmp_hists_.clear();
+ total_.merge();
+ counted_.merge();
+ negative_.merge();
+ }
+
+ virtual void ProcessPairedRead(size_t thread_index,
+ const io::PairedRead& r,
+ const MappingPath<EdgeId>& read1,
+ const MappingPath<EdgeId>& read2) {
+ ProcessPairedRead(thread_index, read1, read2, (int) r.second().size(),
+ (int) r.first().GetLeftOffset() + (int) r.second().GetRightOffset());
+ }
+
+ virtual void ProcessPairedRead(size_t thread_index,
+ const io::PairedReadSeq& r,
+ const MappingPath<EdgeId>& read1,
+ const MappingPath<EdgeId>& read2) {
+ ProcessPairedRead(thread_index, read1, read2, (int) r.second().size(),
+ (int) r.first().GetLeftOffset() + (int) r.second().GetRightOffset());
+ }
+
+ virtual void ProcessSingleRead(size_t /*thread_index*/, const io::SingleRead&, const MappingPath<EdgeId>& /*read*/) {
+ }
+
+ virtual void ProcessSingleRead(size_t /*thread_index*/, const io::SingleReadSeq&, const MappingPath<EdgeId>& /*read*/) {
+ }
+
+ virtual void MergeBuffer(size_t thread_index) {
+ for (const auto& kv: tmp_hists_[thread_index]) {
+ hist_[kv.first] += kv.second;
+ }
+ tmp_hists_[thread_index].clear();
+ }
+
+ void FindMean(double& mean, double& delta, std::map<size_t, size_t>& percentiles) const {
+ find_mean(hist_, mean, delta, percentiles);
+ }
+
+ void FindMedian(double& median, double& mad, HistType& histogram) const {
+ find_median(hist_, median, mad, histogram);
+ }
+
+private:
+ virtual void ProcessPairedRead(size_t thread_index,
+ const MappingPath<EdgeId>& read1,
+ const MappingPath<EdgeId>& read2,
+ int read2_size,
+ int is_delta) {
+
+ ++total_.arr_[thread_index];
+
+ if (read1.size() == 1 && read2.size() == 1 &&
+ read2.simple_path().front() == read1.simple_path().front() &&
+ gp_.g.length(read1.simple_path().front()) >= edge_length_threshold_) {
+
+ auto mapping_edge_1 = read1.front().second;
+ auto mapping_edge_2 = read2.front().second;
+
+ int read1_start = (int) mapping_edge_1.mapped_range.start_pos - (int) mapping_edge_1.initial_range.start_pos ;
+ TRACE("Read 1: " << (int) mapping_edge_1.mapped_range.start_pos << " - " << (int) mapping_edge_1.initial_range.start_pos << " = " << read1_start);
+ int read2_start = (int) mapping_edge_2.mapped_range.start_pos - (int) mapping_edge_2.initial_range.start_pos;
+ TRACE("Read 2: " << (int) mapping_edge_2.mapped_range.start_pos << " - " << (int) mapping_edge_2.initial_range.start_pos << " = " << read2_start);
+ int is = read2_start - read1_start + read2_size + is_delta;
+ TRACE("IS: " << read2_start << " - " << read1_start << " + " << (int) is_delta << " = " << is);
+
+ if (is > 0 || !ignore_negative_) {
+ tmp_hists_[thread_index][is] += 1;
+ ++counted_.arr_[thread_index];
+ } else {
+ ++negative_.arr_[thread_index];
+ }
+
+ }
+
+ }
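+ // Per-thread counters: arr_ holds one slot per thread and merge() folds them into total_.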
+ struct count_data {
+ size_t total_;
+ vector<size_t> arr_;
+ count_data(): total_(0) {
+ }
+ count_data(size_t nthreads): total_(0), arr_(nthreads, 0) {
+ }
+ void inc(size_t i) {
+ ++arr_[i];
+ }
+ void merge() {
+ for (size_t i = 0; i < arr_.size(); ++i) {
+ total_ += arr_[i];
+ }
+ }
+ };
+
+private:
+ const conj_graph_pack& gp_;
+
+ HistType hist_;
+ vector<HistType> tmp_hists_;
+
+ count_data total_;
+ count_data counted_;
+ count_data negative_;
+
+ size_t edge_length_threshold_;
+ bool ignore_negative_;
+};
+
+}
+
+
+#endif /* IS_COUNTER_HPP_ */
diff --git a/src/modules/paired_info/pair_info_bounds.hpp b/src/modules/paired_info/pair_info_bounds.hpp
new file mode 100644
index 0000000..ae0c041
--- /dev/null
+++ b/src/modules/paired_info/pair_info_bounds.hpp
@@ -0,0 +1,30 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#ifndef OMNI_UTILS_HPP_
+#define OMNI_UTILS_HPP_
+
+#include "dev_support/standard_base.hpp"
+
+namespace omnigraph {
+
+
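+// Bounds on the length of a genomic path connecting two paired edges, implied by the
+// insert size and its deviation delta (k accounts for the k-mer representation).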
+inline size_t PairInfoPathLengthUpperBound(size_t k, size_t insert_size,
+ double delta) {
+ double answer = 0. + (double) insert_size + delta - (double) k - 2.;
+ VERIFY(math::gr(answer, 0.));
+ return (size_t)std::floor(answer);
+}
+
+inline size_t PairInfoPathLengthLowerBound(size_t k, size_t l1, size_t l2,
+ int gap, double delta) {
+ double answer = 0. + (double) gap + (double) k + 2. - (double) l1 - (double) l2 - delta;
+ return math::gr(answer, 0.) ? (size_t)std::floor(answer) : 0;
+}
+
+}
+#endif /* OMNI_UTILS_HPP_ */
diff --git a/src/modules/paired_info/pair_info_filler.hpp b/src/modules/paired_info/pair_info_filler.hpp
new file mode 100644
index 0000000..3d2ef1b
--- /dev/null
+++ b/src/modules/paired_info/pair_info_filler.hpp
@@ -0,0 +1,119 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+/*
+ * pair_info_filler.hpp
+ *
+ * Created on: Oct 3, 2013
+ * Author: andrey
+ */
+
+#ifndef PAIR_INFO_FILLER_HPP_
+#define PAIR_INFO_FILLER_HPP_
+
+#include "assembly_graph/graph_alignment/sequence_mapper_notifier.hpp"
+
+namespace debruijn_graph {
+
+/**
+ * For now this ignores the sophisticated case of repeated consecutive
+ * occurrences of an edge in a path caused by gaps in the mapping.
+ *
+ * todo talk with Anton about simplification and speed-up of procedure with little quality loss
+ */
+class LatePairedIndexFiller : public SequenceMapperListener {
+ typedef std::function<double(MappingRange, MappingRange)> WeightF;
+ typedef std::pair<EdgeId, EdgeId> EdgePair;
+public:
+ LatePairedIndexFiller(const Graph &graph, WeightF weight_f, omnigraph::de::UnclusteredPairedInfoIndexT<Graph>& paired_index)
+ : graph_(graph),
+ weight_f_(weight_f),
+ paired_index_(paired_index) {
+ }
+
+ virtual void StartProcessLibrary(size_t threads_count) {
+ paired_index_.Init();
+ buffer_pi_ = {graph_, threads_count};
+ }
+
+ virtual void StopProcessLibrary() {
+ for (size_t i = 0; i < buffer_pi_.size(); ++i)
+ MergeBuffer(i);
+
+ buffer_pi_.Clear();
+ }
+
+ virtual void ProcessPairedRead(size_t thread_index,
+ const io::PairedRead& r,
+ const MappingPath<EdgeId>& read1,
+ const MappingPath<EdgeId>& read2) {
+ ProcessPairedRead(buffer_pi_[thread_index], read1, read2, r.distance());
+ }
+
+ virtual void ProcessPairedRead(size_t thread_index,
+ const io::PairedReadSeq& r,
+ const MappingPath<EdgeId>& read1,
+ const MappingPath<EdgeId>& read2) {
+ ProcessPairedRead(buffer_pi_[thread_index], read1, read2, r.distance());
+ }
+
+ virtual void ProcessSingleRead(size_t,
+ const io::SingleReadSeq&,
+ const MappingPath<EdgeId>&) {}
+
+ virtual void ProcessSingleRead(size_t,
+ const io::SingleRead&,
+ const MappingPath<EdgeId>&) {}
+
+ virtual void MergeBuffer(size_t thread_index) {
+ paired_index_.Merge(buffer_pi_[thread_index]);
+ buffer_pi_[thread_index].Clear();
+ }
+
+ virtual ~LatePairedIndexFiller() {}
+
+private:
+ void ProcessPairedRead(omnigraph::de::PairedInfoBuffer<Graph>& paired_index,
+ const MappingPath<EdgeId>& path1,
+ const MappingPath<EdgeId>& path2, size_t read_distance) const {
+ for (size_t i = 0; i < path1.size(); ++i) {
+ std::pair<EdgeId, MappingRange> mapping_edge_1 = path1[i];
+ for (size_t j = 0; j < path2.size(); ++j) {
+ std::pair<EdgeId, MappingRange> mapping_edge_2 = path2[j];
+
+ EdgePair ep{mapping_edge_1.first, mapping_edge_2.first};
+
+
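+ // Translate the read-coordinate distance into a distance between edge starts,
+ // using the initial (read) and mapped (edge) ranges of both mappings.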
+ omnigraph::de::DEWeight weight =
+ weight_f_(mapping_edge_1.second, mapping_edge_2.second);
+ size_t kmer_distance = read_distance
+ + mapping_edge_2.second.initial_range.end_pos
+ - mapping_edge_1.second.initial_range.start_pos;
+ int edge_distance = (int) kmer_distance
+ + (int) mapping_edge_1.second.mapped_range.start_pos
+ - (int) mapping_edge_2.second.mapped_range.end_pos;
+
+ paired_index.Add(mapping_edge_1.first, mapping_edge_2.first,
+ omnigraph::de::RawPoint(edge_distance, weight));
+ }
+ }
+ }
+
+private:
+ const Graph& graph_;
+ WeightF weight_f_;
+ omnigraph::de::UnclusteredPairedInfoIndexT<Graph>& paired_index_;
+ omnigraph::de::PairedInfoBuffersT<Graph> buffer_pi_;
+
+ DECL_LOGGER("LatePairedIndexFiller");
+};
+
+
+}
+
+
+#endif /* PAIR_INFO_FILLER_HPP_ */
diff --git a/src/modules/paired_info/pair_info_filters.hpp b/src/modules/paired_info/pair_info_filters.hpp
new file mode 100644
index 0000000..89f92d0
--- /dev/null
+++ b/src/modules/paired_info/pair_info_filters.hpp
@@ -0,0 +1,271 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#ifndef PAIR_INFO_FILTERS_HPP_
+#define PAIR_INFO_FILTERS_HPP_
+
+#include "paired_info_helpers.hpp"
+
+namespace omnigraph {
+
+namespace de {
+
+template<class Graph>
+class AbstractPairInfoChecker{
+private:
+ typedef typename Graph::VertexId VertexId;
+ typedef typename Graph::EdgeId EdgeId;
+ typedef PairInfo<EdgeId> PairInfoT;
+
+protected:
+ const Graph& graph_;
+
+public:
+ AbstractPairInfoChecker(const Graph &graph) : graph_(graph) { }
+
+ virtual bool Check(const PairInfoT&) {
+ return true;
+ }
+
+ virtual bool Check(EdgeId, EdgeId) {
+ return true;
+ }
+
+ virtual ~AbstractPairInfoChecker() { }
+};
+
+template<class Graph>
+class PairInfoWeightChecker : public AbstractPairInfoChecker<Graph>{
+ private:
+ typedef typename Graph::EdgeId EdgeId;
+ typedef PairInfo<EdgeId> PairInfoT;
+ double weight_threshold_;
+
+ public:
+ PairInfoWeightChecker(const Graph& graph, double weight_threshold) :
+ AbstractPairInfoChecker<Graph>(graph), weight_threshold_(weight_threshold) {
+ }
+
+ bool Check(const PairInfoT& info) {
+ return math::ge(info.weight(), weight_threshold_);
+ }
+};
+
+template<class Graph>
+class PairInfoWeightCheckerWithCoverage: public AbstractPairInfoChecker<Graph> {
+ private:
+ typedef typename Graph::EdgeId EdgeId;
+ typedef PairInfo<EdgeId> PairInfoT;
+ double weight_threshold_;
+
+ public:
+ PairInfoWeightCheckerWithCoverage(const Graph& graph, double weight_threshold) :
+ AbstractPairInfoChecker<Graph>(graph), weight_threshold_(weight_threshold){
+ }
+
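+ // Accepts pair info whose weight either exceeds the absolute threshold or
+ // reaches at least 10% of the coverage of one of the involved edges.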
+ bool Check(const PairInfoT& info) {
+ double info_weight = info.weight();
+ return math::ge(info_weight, weight_threshold_)
+ || (math::ge(info_weight, 0.1 * this->graph_.coverage(info.first)))
+ || (math::ge(info_weight, 0.1 * this->graph_.coverage(info.second)));
+ }
+};
+
+template <class Graph>
+class AmbiguousPairInfoChecker : public AbstractPairInfoChecker<Graph> {
+
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+ typedef PairInfo<EdgeId> PairInfoT;
+ typedef boost::optional<EdgeId> OptEdgeId;
+
+ AbstractPairInfoChecker<Graph> &standard_filter_;
+ const PairedInfoIndexT<Graph>& index_;
+
+ double haplom_threshold_;
+ double relative_length_threshold_;
+ double relative_seq_threshold_;
+
+ bool IsEdgeOneHaplome(EdgeId edge){
+ return this->graph_.coverage(edge) < 1.5 * haplom_threshold_;
+ }
+
+ bool IsPairInfoGood(EdgeId edge1, EdgeId edge2){
+ return index_.Get(edge1, edge2).size() <= 1;
+ }
+
+ bool EdgesAreFromSimpleBulgeWithAmbPI(const PairInfoT& info){
+ EdgeId edge1 = info.first;
+ EdgeId edge2 = info.second;
+ // the pair must not be self-conjugate (edge2 being the reverse complement of edge1)
+ TRACE("Check for auto reverse complementary");
+ if(this->graph_.conjugate(edge1) == info.second)
+ return false;
+ TRACE("Done");
+
+ TRACE("Check for coverage 1x haplome for edge from pair info");
+ if(!IsEdgeOneHaplome(edge1) || !IsEdgeOneHaplome(edge2))
+ return false;
+ TRACE("Done");
+
+ // first edge is not side of simple bulge
+ TRACE("Check for bulge side for the 1st edge");
+ OptEdgeId edge1_alt = GetOtherSideOfSimpleBulge(edge1);
+ if(!edge1_alt.is_initialized())
+ return false;
+ TRACE("Done");
+
+ // second edge is not side of simple bulge
+ TRACE("Check for bulge side for the 2nd edge");
+ OptEdgeId edge2_alt = GetOtherSideOfSimpleBulge(edge2);
+ if(!edge2_alt.is_initialized())
+ return false;
+ TRACE("Done");
+
+ TRACE("Check for coverage 1x haplome for edge from alternative bulge sides");
+ if(!IsEdgeOneHaplome(edge1_alt.get()) || !IsEdgeOneHaplome(edge2_alt.get()))
+ return false;
+ TRACE("Done");
+
+ TRACE("Check for multiplicity of pair info");
+ if(!(IsPairInfoGood(edge1, edge2_alt.get()) &&
+ IsPairInfoGood(edge1_alt.get(), edge2) &&
+ IsPairInfoGood(edge1_alt.get(), edge2_alt.get())))
+ return false;
+ TRACE("Done");
+
+ return true;
+ }
+
+ DEWeight GetPairInfoWeight(EdgeId edge1, EdgeId edge2){
+ auto hist = index_.Get(edge1, edge2);
+ return (hist.size() == 1) ? hist.begin()->weight : DEWeight(0);
+ }
+
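+ // For two edges that form sides of simple bulges, the pair info is kept only if the
+ // "direct" pairing (edge1-edge2 plus their alternatives) outweighs the "crossed" one.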
+ bool InnerCheck(const PairInfoT& info){
+
+ EdgeId edge1 = info.first;
+ EdgeId edge2 = info.second;
+
+ // get second edges of simple bulge
+ OptEdgeId opt_edge1_alt = GetOtherSideOfSimpleBulge(edge1);
+ VERIFY(opt_edge1_alt.is_initialized());
+ EdgeId edge1_alt = opt_edge1_alt.get();
+
+ OptEdgeId opt_edge2_alt = GetOtherSideOfSimpleBulge(edge2);
+ VERIFY(opt_edge2_alt.is_initialized());
+ EdgeId edge2_alt = opt_edge2_alt.get();
+
+ double direct_weight = GetPairInfoWeight(edge1, edge2) +
+ GetPairInfoWeight(edge1_alt, edge2_alt);
+
+ double reverse_weight = GetPairInfoWeight(edge1, edge2_alt) +
+ GetPairInfoWeight(edge1_alt, edge2);
+
+ TRACE("Direct_weight " << direct_weight << ", reverse_weight " << reverse_weight);
+ return direct_weight > reverse_weight;
+ }
+
+public:
+ AmbiguousPairInfoChecker(const Graph& graph, const PairedInfoIndexT<Graph>& index,
+ AbstractPairInfoChecker<Graph> &standard_filter, double haplom_threshold,
+ double relative_length_threshold, double relative_seq_threshold) :
+ AbstractPairInfoChecker<Graph>(graph),
+ standard_filter_(standard_filter),
+ index_(index),
+ haplom_threshold_(haplom_threshold),
+ relative_length_threshold_(relative_length_threshold),
+ relative_seq_threshold_(relative_seq_threshold) { }
+
+ bool Check(const PairInfoT& info) {
+ TRACE(this->graph_.int_id(info.first) << " " << this->graph_.int_id(info.second));
+ if(EdgesAreFromSimpleBulgeWithAmbPI(info)){
+ TRACE("Forward directed edges form a simple bulge");
+ return InnerCheck(info);
+ }
+
+ if(EdgesAreFromSimpleBulgeWithAmbPI(BackwardInfo(info))){
+ TRACE("Backward directed edges form a simple bulge");
+ return InnerCheck(BackwardInfo(info));
+ }
+
+ TRACE("Edges do not form a bulge. Applying default checker");
+ return standard_filter_.Check(info);
+ }
+
+private:
+ OptEdgeId GetOtherSideOfSimpleBulge(EdgeId edge){
+ auto edges = this->graph_.GetEdgesBetween(this->graph_.EdgeStart(edge),
+ this->graph_.EdgeEnd(edge));
+ TRACE("Number alternative edges - " << edges.size());
+ if(edges.size() == 1)
+ return OptEdgeId();
+
+ size_t edge_length = this->graph_.length(edge);
+ Sequence edge_seq = this->graph_.EdgeNucls(edge);
+ for(auto it_edge = edges.begin(); it_edge != edges.end(); it_edge++)
+ if(*it_edge != edge){
+ size_t it_edge_length = this->graph_.length(*it_edge);
+ Sequence it_edge_seq = this->graph_.EdgeNucls(*it_edge);
+ double length_ratio = double(min<size_t>(edge_length, it_edge_length)) /
+ double(max<size_t>(edge_length, it_edge_length));
+ if(length_ratio >= relative_length_threshold_){
+ // size_t edit_dist = EditDistance(edge_seq, it_edge_seq);
+ // double seq_ratio = edit_dist / min<size_t> (edge_seq.size(), it_edge_seq.size());
+ return *it_edge;
+ }
+ }
+ return OptEdgeId();
+ }
+};
+
+template<class Graph>
+class PairInfoFilter{
+private:
+ typedef typename Graph::VertexId VertexId;
+ typedef typename Graph::EdgeId EdgeId;
+ typedef PairInfo<EdgeId> PairInfoT;
+
+protected:
+ AbstractPairInfoChecker<Graph> &pair_info_checker_;
+
+public:
+ PairInfoFilter(AbstractPairInfoChecker<Graph> &pair_info_checker) :
+ pair_info_checker_(pair_info_checker)
+ {}
+
+ void Filter(PairedInfoIndexT<Graph>& index) {
+ INFO("Start filtering; index size: " << index.size());
+ //We can't filter while traversing, because Remove may invalidate iterators
+ //So let's save edge pairs first
+ using EdgePair = std::pair<EdgeId, EdgeId>;
+ std::vector<EdgePair> pairs;
+ for (auto i = pair_begin(index); i != pair_end(index); ++i)
+ if (pair_info_checker_.Check(i.first(), i.second()))
+ pairs.push_back({i.first(), i.second()});
+
+ //TODO: implement fast removing of the whole set of points
+ for (const auto& pair : pairs) {
+ //Same thing with invalidation
+ HistogramWithWeight hist;
+ for (auto point : index[pair])
+ if (!pair_info_checker_.Check(PairInfoT(pair.first, pair.second, point)))
+ hist.insert(point);
+ //index.RemoveMany(pair_hist.first.first, pair_hist.first.second, pair_hist.second);
+ for (const auto& point : hist)
+ index.Remove(pair.first, pair.second, point);
+ }
+
+ INFO("Done filtering");
+ }
+};
+
+}
+
+}
+
+#endif /* PAIR_INFO_FILTERS_HPP_ */
diff --git a/src/modules/paired_info/pair_info_improver.hpp b/src/modules/paired_info/pair_info_improver.hpp
new file mode 100644
index 0000000..89c8945
--- /dev/null
+++ b/src/modules/paired_info/pair_info_improver.hpp
@@ -0,0 +1,279 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "pipeline/graph_pack.hpp"
+#include "split_path_constructor.hpp"
+#include "paired_info/paired_info_helpers.hpp"
+#include "assembly_graph/paths/path_utils.hpp"
+#include <math.h>
+#include <io/reads_io/read_processor.hpp>
+
+namespace debruijn_graph {
+
+inline bool ClustersIntersect(omnigraph::de::Point p1, omnigraph::de::Point p2) {
+ return math::le(p1.d, p2.d + p1.var + p2.var) &&
+ math::le(p2.d, p1.d + p1.var + p2.var);
+}
+
+
+//todo move out
+template<class Graph>
+class ParallelEdgeProcessor {
+ class ConstEdgeIteratorWrapper {
+ public:
+ typedef typename Graph::EdgeId ReadT;
+
+ ConstEdgeIteratorWrapper(const Graph &g)
+ : it_(g) {}
+
+ bool eof() const { return it_.IsEnd(); }
+
+ ConstEdgeIteratorWrapper& operator>>(typename Graph::EdgeId &val) {
+ val = *it_;
+ ++it_;
+ return *this;
+ }
+
+ private:
+ ConstEdgeIterator<Graph> it_;
+ };
+
+public:
+ ParallelEdgeProcessor(const Graph &g, unsigned nthreads)
+ : rp_(nthreads), it_(g) {}
+
+ template <class Processor>
+ bool Run(Processor &op) { return rp_.Run(it_, op); }
+
+ bool IsEnd() const { return it_.eof(); }
+ size_t processed() const { return rp_.processed(); }
+
+private:
+ hammer::ReadProcessor rp_;
+ ConstEdgeIteratorWrapper it_;
+};
+
+template<class Graph>
+static
+bool TryToAddPairInfo(omnigraph::de::PairedInfoIndexT<Graph>& clustered_index,
+ typename Graph::EdgeId e1, typename Graph::EdgeId e2,
+ const omnigraph::de::Point& point_to_add) {
+ auto histogram = clustered_index.Get(e1, e2);
+ for (auto i : histogram) {
+ if (ClustersIntersect(i, point_to_add))
+ return false;
+ }
+
+ clustered_index.Add(e1, e2, point_to_add);
+ return true;
+}
+
+template<class Graph>
+class PairInfoImprover {
+ typedef typename Graph::EdgeId EdgeId;
+ typedef std::vector<omnigraph::de::PairInfo<EdgeId> > PairInfos;
+ typedef std::pair<EdgeId, EdgeId> EdgePair;
+ typedef omnigraph::de::PairedInfoIndexT<Graph> Index;
+
+ public:
+ PairInfoImprover(const Graph& g,
+ Index& clustered_index,
+ const io::SequencingLibrary<config::DataSetData> &lib)
+ : graph_(g), index_(clustered_index), lib_(lib) { }
+
+ void ImprovePairedInfo(unsigned num_threads = 1) {
+ CorrectPairedInfo(num_threads);
+ CorrectPairedInfo(num_threads);
+ }
+
+ private:
+ void CorrectPairedInfo(unsigned nthreads) {
+ size_t missing_paired_info_count = 0;
+ size_t extra_paired_info_count = 0;
+ extra_paired_info_count = RemoveContradictional(nthreads);
+ missing_paired_info_count = FillMissing(nthreads);
+
+ INFO("Paired info stats: missing = " << missing_paired_info_count
+ << "; contradictional = " << extra_paired_info_count);
+ }
+
+ class ContradictionalRemover {
+ public:
+ ContradictionalRemover(omnigraph::de::PairedInfoIndicesT<Graph> &to_remove,
+ const Graph &g,
+ omnigraph::de::PairedInfoIndexT<Graph>& index)
+ : to_remove_(to_remove), graph_(g), index_(index) {}
+
+ bool operator()(EdgeId e) {
+ omnigraph::de::PairedInfoIndexT<Graph> &to_remove = to_remove_[omp_get_thread_num()];
+
+ if (graph_.length(e)>= cfg::get().max_repeat_length && index_.contains(e))
+ FindInconsistent(e, to_remove);
+
+ return false;
+ }
+
+ private:
+ bool IsConsistent(EdgeId /*e*/, EdgeId e1, EdgeId e2,
+ const omnigraph::de::Point& p1, const omnigraph::de::Point& p2) const {
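+ // p1 and p2 are clustered distances from the same base edge to e1 and e2; they are
+ // consistent if the graph contains a path between e1 and e2 whose length matches
+ // the difference of the two distances (within their combined variance).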
+ if (math::le(p1.d, 0.f) || math::le(p2.d, 0.f) || math::gr(p1.d, p2.d))
+ return true;
+
+ double pi_dist = p2.d - p1.d;
+ int first_length = (int) graph_.length(e1);
+ double var = p1.var + p2.var;
+
+ TRACE(" PI " << p1 << " tr " << omp_get_thread_num());
+ TRACE("vs PI " << p2 << " tr " << omp_get_thread_num());
+
+ if (math::le(pi_dist, first_length + var) &&
+ math::le((double)first_length, pi_dist + var)) {
+ if (graph_.EdgeEnd(e1) == graph_.EdgeStart(e2))
+ return true;
+
+ auto paths = GetAllPathsBetweenEdges(graph_, e1, e2, 0, (size_t) ceil(pi_dist - first_length + var));
+ return (paths.size() > 0);
+ } else {
+ if (math::gr(p2.d, p1.d + first_length)) {
+ auto paths = GetAllPathsBetweenEdges(graph_, e1, e2,
+ (size_t) floor(pi_dist - first_length - var),
+ (size_t) ceil(pi_dist - first_length + var));
+ return (paths.size() > 0);
+ }
+ return false;
+ }
+ }
+
+ // For a given base edge, checks the pairwise consistency of all its entries (base_edge, e1) and (base_edge, e2)
+ void FindInconsistent(EdgeId base_edge,
+ Index& pi) const {
+ for (auto i1 : index_.Get(base_edge)) {
+ auto e1 = i1.first;
+ for (auto i2 : index_.Get(base_edge)) {
+ auto e2 = i2.first;
+ if (e1 == e2)
+ continue;
+ for (auto p1 : i1.second) {
+ for (auto p2 : i2.second) {
+ if (!IsConsistent(base_edge, e1, e2, p1, p2)) {
+ if (p1.lt(p2))
+ pi.Add(base_edge, e1, p1);
+ else
+ pi.Add(base_edge, e2, p2);
+ }
+ }
+ }
+ }
+ }
+ }
+
+ omnigraph::de::PairedInfoIndicesT<Graph> &to_remove_;
+ const Graph &graph_;
+ Index& index_;
+ };
+
+ size_t RemoveContradictional(unsigned nthreads) {
+ size_t cnt = 0;
+
+ omnigraph::de::PairedInfoIndicesT<Graph> to_remove(graph_, nthreads);
+
+ // FIXME: Replace with lambda
+ ContradictionalRemover remover(to_remove, graph_, index_);
+ ParallelEdgeProcessor<Graph>(graph_, nthreads).Run(remover);
+
+ DEBUG("ParallelRemoveContraditional: Threads finished");
+
+ DEBUG("Merging maps");
+ for (size_t i = 1; i < nthreads; ++i) {
+ to_remove[0].Merge(to_remove[i]);
+ to_remove[i].Clear();
+ }
+ DEBUG("Resulting size " << to_remove[0].size());
+
+ DEBUG("Deleting paired infos, liable to removing");
+ for (auto I = omnigraph::de::half_pair_begin(to_remove[0]);
+ I != omnigraph::de::half_pair_end(to_remove[0]); ++I) {
+ cnt += DeleteIfExist(I.first(), I.second(), *I);
+ }
+ to_remove[0].Clear();
+
+ DEBUG("Size of index " << index_.size());
+ DEBUG("ParallelRemoveContraditional: Clean finished");
+ return cnt;
+
+ }
+
+ size_t FillMissing(unsigned nthreads) {
+ DEBUG("Fill missing: Creating indexes");
+ const size_t NUM_CHUNKS = nthreads * 16;
+ omnigraph::de::PairedInfoIndicesT<Graph> to_add(graph_, NUM_CHUNKS);
+
+ SplitPathConstructor<Graph> spc(graph_);
+ IterationHelper<Graph, EdgeId> edges(graph_);
+ auto iters = edges.Chunks(NUM_CHUNKS);
+
+ DEBUG("Fill missing: Start threads");
+ #pragma omp parallel for schedule(guided)
+ for (size_t i = 0; i < iters.size() - 1; ++i) {
+ TRACE("Processing chunk #" << i);
+ for (auto e = iters[i]; e != iters[i + 1]; ++e) {
+ TRACE("Checking for edge " << *e);
+ auto paths = spc.ConvertPIToSplitPaths(*e, index_,
+ lib_.data().mean_insert_size,
+ lib_.data().insert_size_deviation);
+ for (const auto &path : paths) {
+ TRACE("Path " << path.PrintPath(graph_));
+ for (const auto &pi : path)
+ TryToAddPairInfo(to_add[i], pi.first, pi.second, pi.point);
+ }
+ }
+ }
+ //ParallelEdgeProcessor<Graph>(graph_, nthreads).Run(filler);
+ DEBUG("Fill missing: Threads finished");
+
+ size_t cnt = 0;
+ for (size_t i = 0; i < iters.size() - 1; ++i) {
+ DEBUG("Adding map #" << i);
+ for (auto I = omnigraph::de::half_pair_begin(to_add[i]);
+ I != omnigraph::de::half_pair_end(to_add[i]);
+ ++I) {
+ EdgeId e1 = I.first();
+ EdgeId e2 = I.second();
+ for (auto p : *I)
+ cnt += TryToAddPairInfo(index_, e1, e2, p);
+ }
+ to_add[i].Clear();
+ }
+
+ DEBUG("Size of paired index " << index_.size());
+
+ DEBUG("Fill missing: Clean finished");
+ DEBUG("Added " << cnt);
+ return cnt;
+ }
+
+ private:
+ size_t DeleteIfExist(EdgeId e1, EdgeId e2, const typename Index::HistProxy& infos) {
+ size_t cnt = 0;
+ for (auto point : infos) {
+ cnt += index_.Remove(e1, e2, point);
+ TRACE("cnt += " << cnt);
+ }
+
+ return cnt;
+ }
+
+ const Graph& graph_;
+ Index& index_;
+ const io::SequencingLibrary<config::DataSetData>& lib_;
+
+ DECL_LOGGER("PairInfoImprover")
+};
+
+}
diff --git a/src/modules/paired_info/paired_info.hpp b/src/modules/paired_info/paired_info.hpp
new file mode 100644
index 0000000..952617b
--- /dev/null
+++ b/src/modules/paired_info/paired_info.hpp
@@ -0,0 +1,712 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "utils/adt/iterator_range.hpp"
+#include <boost/iterator/iterator_facade.hpp>
+#include <btree/safe_btree_map.h>
+#include <sparsehash/sparse_hash_map>
+
+#include <type_traits>
+
+#include "histogram.hpp"
+
+namespace omnigraph {
+
+namespace de {
+
+/**
+ * @brief Index of paired-read information. For each pair of edges we store a so-called histogram, which is a set
+ * of points representing distances between those edges. The index is internally arranged as a map of maps of histograms:
+ * edge1 -> (edge2 -> histogram)
+ * When we add a point (a,b)->p into the index, we automatically insert the conjugate point (b',a')->p'
+ * (self-conjugate edge pairs are the sole exception), so the index is always conjugate-symmetrical.
+ * The index provides access to several kinds of information:
+ * - to get the histogram between two edges, use Get(edge1, edge2);
+ * - to get the neighbourhood of an edge (the second edges with their histograms), use Get(edge1);
+ * - to skip the symmetrical half of that neighbourhood, use GetHalf(edge1);
+ * Backward information (e.g., (b,a)->-p) is currently inaccessible.
+ * @param G graph type
+ * @param Traits Policy-like structure with associated types of inner and resulting points, and how to convert between them
+ * @param C map-like container type (parameterized by key and value type)
+ */
+template<typename G, typename Traits, template<typename, typename> class Container>
+class PairedIndex {
+
+private:
+ typedef typename Traits::Gapped InnerPoint;
+ typedef omnigraph::de::Histogram<InnerPoint> InnerHistogram;
+
+public:
+ typedef G Graph;
+
+ typedef typename Traits::Expanded Point;
+ typedef omnigraph::de::Histogram<Point> Histogram;
+ typedef typename Graph::EdgeId EdgeId;
+ typedef std::pair<EdgeId, EdgeId> EdgePair;
+
+ typedef Container<EdgeId, InnerHistogram> InnerMap;
+ typedef Container<EdgeId, InnerMap> StorageMap;
+
+ typedef PairedIndex<G, Traits, Container> Self;
+
+ //--Data access types--
+
+ typedef typename StorageMap::const_iterator ImplIterator;
+
+public:
+ /**
+ * @brief Smart proxy set representing a composite histogram of points between two edges.
+ * @detail You can work with the proxy just like any constant set.
+ * The only major difference is that it returns all consisting points by value,
+ * because some of them don't exist in the underlying sets and are
+ * restored from the conjugate info on-the-fly.
+ */
+ class HistProxy {
+
+ public:
+ /**
+ * @brief Iterator over a proxy set of points.
+ */
+ class Iterator: public boost::iterator_facade<Iterator, Point, boost::bidirectional_traversal_tag, Point> {
+
+ typedef typename InnerHistogram::const_iterator InnerIterator;
+
+ public:
+ Iterator(InnerIterator iter, DEDistance offset, bool back = false)
+ : iter_(iter), offset_(offset), back_(back)
+ {}
+
+ private:
+ friend class boost::iterator_core_access;
+
+ Point dereference() const {
+ auto i = iter_;
+ if (back_) --i;
+ Point result = Traits::Expand(*i, offset_);
+ if (back_)
+ result.d = -result.d;
+ return result;
+ }
+
+ void increment() {
+ back_ ? --iter_ : ++iter_;
+ }
+
+ void decrement() {
+ back_ ? ++iter_ : --iter_;
+ }
+
+ inline bool equal(const Iterator &other) const {
+ return iter_ == other.iter_ && back_ == other.back_;
+ }
+
+ InnerIterator iter_; //current position
+ DEDistance offset_; //edge length
+ bool back_;
+ };
+
+ /**
+ * @brief Returns a wrapper for a histogram.
+ */
+ HistProxy(const InnerHistogram& hist, DEDistance offset = 0, bool back = false)
+ : hist_(hist), offset_(offset), back_(back)
+ {}
+
+ /**
+ * @brief Returns an empty proxy (effectively a Null object pattern).
+ */
+ static const InnerHistogram& empty_hist() {
+ static InnerHistogram res;
+ return res;
+ }
+
+ /**
+ * @brief Adds a point to the histogram.
+ */
+ //void insert(Point p) {
+ // hist_.insert(Traits::Shrink(p, offset_));
+ //}
+
+ Iterator begin() const {
+ return Iterator(back_ ? hist_.end() : hist_.begin(), offset_, back_);
+ }
+
+ Iterator end() const {
+ return Iterator(back_ ? hist_.begin() : hist_.end(), offset_, back_);
+ }
+
+ /**
+ * @brief Finds the point with the minimal distance.
+ */
+ Point min() const {
+ VERIFY(!empty());
+ return *begin();
+ }
+
+ /**
+ * @brief Finds the point with the maximal distance.
+ */
+ Point max() const {
+ VERIFY(!empty());
+ return *--end();
+ }
+
+ /**
+ * @brief Returns the copy of all points in a simple flat histogram.
+ */
+ Histogram Unwrap() const {
+ return Histogram(begin(), end());
+ }
+
+ size_t size() const {
+ return hist_.size();
+ }
+
+ bool empty() const {
+ return hist_.empty();
+ }
+
+ private:
+ const InnerHistogram& hist_;
+ DEDistance offset_;
+ bool back_;
+ };
+
+ typedef typename HistProxy::Iterator HistIterator;
+
+ //---- Traversing edge neighbours ----
+
+ using EdgeHist = std::pair<EdgeId, HistProxy>;
+
+ /**
+ * @brief A proxy map representing neighbourhood of an edge,
+ * where `Key` is the graph edge ID and `Value` is the proxy histogram.
+ * @detail You can work with the proxy just like with any constant map.
+ * The only major difference is that it returns all contained pairs by value,
+ * because proxies are constructed on-the-fly.
+ */
+ class EdgeProxy {
+ public:
+
+ /**
+ * @brief Iterator over a proxy map.
+ * @detail For a full proxy, traverses both straight and conjugate pairs.
+ * For a half proxy, traverses only lesser pairs (i.e., (a,b) where (a,b)<=(b',a')) of edges.
+ */
+ class Iterator: public boost::iterator_facade<Iterator, EdgeHist, boost::forward_traversal_tag, EdgeHist> {
+
+ typedef typename InnerMap::const_iterator InnerIterator;
+
+ void Skip() { //For a half iterator, skip conjugate pairs
+ while (half_ && iter_ != stop_ && index_.GreaterPair(edge_, iter_->first))
+ ++iter_;
+ }
+
+ public:
+ Iterator(const PairedIndex &index, InnerIterator iter, InnerIterator stop, EdgeId edge, bool half)
+ : index_ (index)
+ , iter_(iter)
+ , stop_(stop)
+ , edge_(edge)
+ , half_(half)
+ {
+ Skip();
+ }
+
+ void increment() {
+ ++iter_;
+ Skip();
+ }
+
+ void operator=(const Iterator &other) {
+ //TODO: is this risky without an assertion?
+ //VERIFY(index_ == other.index_);
+ //We shouldn't reassign iterators from one index onto another
+ iter_ = other.iter_;
+ stop_ = other.stop_;
+ edge_ = other.edge_;
+ half_ = other.half_;
+ }
+
+ private:
+ friend class boost::iterator_core_access;
+
+ bool equal(const Iterator &other) const {
+ return iter_ == other.iter_;
+ }
+
+ EdgeHist dereference() const {
+ const auto& hist = iter_->second;
+ return std::make_pair(iter_->first, HistProxy(hist, index_.CalcOffset(edge_)));
+ }
+
+ private:
+ const PairedIndex &index_; //TODO: get rid of this somehow
+ InnerIterator iter_, stop_;
+ EdgeId edge_;
+ bool half_;
+ };
+
+ EdgeProxy(const PairedIndex &index, const InnerMap& map, EdgeId edge, bool half = false)
+ : index_(index), map_(map), edge_(edge), half_(half)
+ {}
+
+ Iterator begin() const {
+ return Iterator(index_, map_.begin(), map_.end(), edge_, half_);
+ }
+
+ Iterator end() const {
+ return Iterator(index_, map_.end(), map_.end(), edge_, half_);
+ }
+
+ HistProxy operator[](EdgeId e2) const {
+ if (half_ && index_.GreaterPair(edge_, e2))
+ return HistProxy::empty_hist();
+ return index_.Get(edge_, e2);
+ }
+
+ //Currently unused
+ /*HistProxy<true> GetBack(EdgeId e2) const {
+ return index_.GetBack(edge_, e2);
+ }*/
+
+ bool empty() const {
+ return map_.empty();
+ }
+
+ private:
+ const PairedIndex& index_;
+ const InnerMap& map_;
+ EdgeId edge_;
+ //When false, represents all neighbours (consisting of both directly added data and "restored" conjugates).
+ //When true, proxifies only half of the added edges.
+ bool half_;
+ };
+
+ typedef typename EdgeProxy::Iterator EdgeIterator;
+
+ //---------------- Constructor ----------------
+
+ PairedIndex(const Graph &graph)
+ : size_(0), graph_(graph)
+ {}
+
+public:
+ /**
+ * @brief Returns a conjugate pair for two edges.
+ */
+ EdgePair ConjugatePair(EdgeId e1, EdgeId e2) const {
+ return std::make_pair(graph_.conjugate(e2), graph_.conjugate(e1));
+ }
+ /**
+ * @brief Returns a conjugate pair for a pair of edges.
+ */
+ EdgePair ConjugatePair(EdgePair ep) const {
+ return ConjugatePair(ep.first, ep.second);
+ }
+
+private:
+ bool GreaterPair(EdgeId e1, EdgeId e2) const {
+ auto ep = std::make_pair(e1, e2);
+ return ep > ConjugatePair(ep);
+ }
+
+ void SwapConj(EdgeId &e1, EdgeId &e2) const {
+ auto tmp = e1;
+ e1 = graph_.conjugate(e2);
+ e2 = graph_.conjugate(tmp);
+ }
+
+ size_t CalcOffset(EdgeId e) const {
+ return this->graph().length(e);
+ }
+
+public:
+ //---------------- Data inserting methods ----------------
+ /**
+ * @brief Adds a point between two edges to the index,
+ * merging weights if there's already one with the same distance.
+ */
+ void Add(EdgeId e1, EdgeId e2, Point p) {
+ InnerPoint sp = Traits::Shrink(p, CalcOffset(e1));
+ InsertWithConj(e1, e2, sp);
+ }
+
+ /**
+ * @brief Adds a whole set of points between two edges to the index.
+ */
+ template<typename TH>
+ void AddMany(EdgeId e1, EdgeId e2, const TH& hist) {
+ for (auto p : hist) {
+ InnerPoint sp = Traits::Shrink(p, CalcOffset(e1));
+ InsertWithConj(e1, e2, sp);
+ }
+ }
+
+private:
+
+ void InsertWithConj(EdgeId e1, EdgeId e2, InnerPoint p) {
+ size_ += storage_[e1][e2].merge_point(p);
+ //TODO: deal with loops and self-conj
+ SwapConj(e1, e2);
+ size_ += storage_[e1][e2].merge_point(p);
+ }
+
+ bool IsSelfConj(EdgeId e1, EdgeId e2) {
+ return e1 == graph_.conjugate(e2);
+ }
+
+public:
+ /**
+ * @brief Adds a lot of info from another index, using fast merging strategy.
+ * Should be used instead of point-by-point index merge.
+ */
+ template<class Index>
+ void Merge(const Index& index_to_add) {
+ auto& base_index = storage_;
+ for (auto AddI = index_to_add.data_begin(); AddI != index_to_add.data_end(); ++AddI) {
+ EdgeId e1_to_add = AddI->first;
+ const auto& map_to_add = AddI->second;
+ InnerMap& map_already_exists = base_index[e1_to_add];
+ MergeInnerMaps(map_to_add, map_already_exists);
+ }
+ VERIFY(size() >= index_to_add.size());
+ }
+
+private:
+ template<class OtherMap>
+ void MergeInnerMaps(const OtherMap& map_to_add,
+ InnerMap& map) {
+ for (const auto& to_add : map_to_add) {
+ InnerHistogram& hist_exists = map[to_add.first];
+ size_ += hist_exists.merge(to_add.second);
+ }
+ }
+
+public:
+ //---------------- Data deleting methods ----------------
+
+ /**
+ * @brief Removes the specific entry from the index, and its conjugate.
+ * @warning Don't use it on unclustered index, because hashmaps require set_deleted_item
+ * @return The number of deleted entries (0 if there was no such entry)
+ */
+ size_t Remove(EdgeId e1, EdgeId e2, Point p) {
+ InnerPoint point = Traits::Shrink(p, graph_.length(e1));
+ auto res = RemoveSingle(e1, e2, point);
+ //TODO: deal with loops and self-conj
+ SwapConj(e1, e2);
+ res += RemoveSingle(e1, e2, point);
+ return res;
+ }
+
+ /**
+ * @brief Removes the whole histogram from the index, and its conjugate.
+ * @warning Don't use it on unclustered index, because hashmaps require set_deleted_item
+ * @return The number of deleted entries
+ */
+ size_t Remove(EdgeId e1, EdgeId e2) {
+ auto res = RemoveAll(e1, e2);
+ if (!IsSelfConj(e1, e2)) { //TODO: loops?
+ SwapConj(e1, e2);
+ res += RemoveAll(e1, e2);
+ }
+ return res;
+ }
+
+private:
+
+ //TODO: remove duplicated code
+ size_t RemoveSingle(EdgeId e1, EdgeId e2, InnerPoint point) {
+ auto i1 = storage_.find(e1);
+ if (i1 == storage_.end())
+ return 0;
+ auto& map = i1->second;
+ auto i2 = map.find(e2);
+ if (i2 == map.end())
+ return 0;
+ InnerHistogram& hist = i2->second;
+ if (!hist.erase(point))
+ return 0;
+ --size_;
+ if (hist.empty()) { //Prune empty maps
+ map.erase(e2);
+ if (map.empty())
+ storage_.erase(e1);
+ }
+ return 1;
+ }
+
+ size_t RemoveAll(EdgeId e1, EdgeId e2) {
+ auto i1 = storage_.find(e1);
+ if (i1 == storage_.end())
+ return 0;
+ auto& map = i1->second;
+ auto i2 = map.find(e2);
+ if (i2 == map.end())
+ return 0;
+ InnerHistogram& hist = i2->second;
+ size_t size_decrease = hist.size();
+ map.erase(i2);
+ size_ -= size_decrease;
+ if (map.empty()) //Prune empty maps
+ storage_.erase(i1);
+ return size_decrease;
+ }
+
+public:
+
+ /**
+ * @brief Removes the whole neighbourhood of an edge (all edges referring to it, and their histograms)
+ * @warning To keep the symmetry, it also deletes all conjugates, so the actual complexity is O(size).
+ * @return The number of deleted entries
+ */
+ size_t Remove(EdgeId edge) {
+ InnerMap &inner_map = storage_[edge];
+ std::vector<EdgeId> to_remove;
+ to_remove.reserve(inner_map.size());
+ size_t old_size = this->size();
+ for (const auto& ep : inner_map)
+ to_remove.push_back(ep.first);
+ for (auto e2 : to_remove)
+ this->Remove(edge, e2);
+ return old_size - this->size();
+ }
+
+ //---------------- Data accessing methods ----------------
+
+ /**
+ * @brief Underlying raw implementation data (for custom iterator helpers).
+ */
+ ImplIterator data_begin() const {
+ return storage_.begin();
+ }
+
+ /**
+ * @brief Underlying raw implementation data (for custom iterator helpers).
+ */
+ ImplIterator data_end() const {
+ return storage_.end();
+ }
+
+ adt::iterator_range<ImplIterator> data() const {
+ return adt::make_range(data_begin(), data_end());
+ }
+
+private:
+ //When there is no such edge, returns a fake empty map for safety
+ const InnerMap& GetImpl(EdgeId e) const {
+ auto i = storage_.find(e);
+ if (i != storage_.end())
+ return i->second;
+ return empty_map_;
+ }
+
+ //When there is no such histogram, returns a fake empty histogram for safety
+ const InnerHistogram& GetImpl(EdgeId e1, EdgeId e2) const {
+ auto i = storage_.find(e1);
+ if (i != storage_.end()) {
+ auto j = i->second.find(e2);
+ if (j != i->second.end())
+ return j->second;
+ }
+ return HistProxy::empty_hist();
+ }
+
+public:
+
+ /**
+ * @brief Returns a whole proxy map to the neighbourhood of some edge.
+ * @param e ID of starting edge
+ */
+ EdgeProxy Get(EdgeId e) const {
+ return EdgeProxy(*this, GetImpl(e), e);
+ }
+
+ /**
+ * @brief Returns a half proxy map to the neighbourhood of some edge.
+ * @param e ID of starting edge
+ */
+ EdgeProxy GetHalf(EdgeId e) const {
+ return EdgeProxy(*this, GetImpl(e), e, true);
+ }
+
+ /**
+ * @brief Operator alias of Get(id).
+ */
+ EdgeProxy operator[](EdgeId e) const {
+ return Get(e);
+ }
+
+ /**
+ * @brief Returns a histogram proxy for all points between two edges.
+ */
+ HistProxy Get(EdgeId e1, EdgeId e2) const {
+ return HistProxy(GetImpl(e1, e2), CalcOffset(e1));
+ }
+
+ /**
+ * @brief Operator alias of Get(e1, e2).
+ */
+ HistProxy operator[](EdgePair p) const {
+ return Get(p.first, p.second);
+ }
+
+ //Currently unused
+ /**
+ * @brief Returns a backwards histogram proxy for all points between two edges.
+ */
+ /*HistProxy<true> GetBack(EdgeId e1, EdgeId e2) const {
+ return HistProxy<true>(GetImpl(e2, e1), CalcOffset(e2));
+ }*/
+
+ /**
+ * @brief Checks whether an edge (or its conjugate twin) is contained in the index.
+ */
+ bool contains(EdgeId edge) const {
+ return storage_.count(edge) + storage_.count(graph_.conjugate(edge)) > 0;
+ }
+
+ /**
+ * @brief Checks whether there is a histogram for a pair of edges (or, equivalently, their conjugate pair).
+ */
+ bool contains(EdgeId e1, EdgeId e2) const {
+ auto i1 = storage_.find(e1);
+ if (i1 != storage_.end() && i1->second.count(e2))
+ return true;
+ return false;
+ }
+
+ //---------------- Miscellaneous ----------------
+
+ /**
+ * Returns the graph the index is based on. Needed for custom iterators.
+ */
+ const Graph &graph() const { return graph_; }
+
+ /**
+ * @brief Inits the index with graph data. For each edge, adds a loop with zero weight.
+ * @warning Do not call this on non-empty indexes.
+ */
+ void Init() {
+ //VERIFY(size() == 0);
+ for (auto it = graph_.ConstEdgeBegin(); !it.IsEnd(); ++it)
+ Add(*it, *it, Point());
+ }
+
+ /**
+ * @brief Clears the whole index. Used in merging.
+ */
+ void Clear() {
+ storage_.clear();
+ size_ = 0;
+ }
+
+ /**
+ * @brief Returns the physical index size (total count of all histograms).
+ */
+ size_t size() const { return size_; }
+
+private:
+ PairedIndex(size_t size, const Graph& graph, const StorageMap& storage)
+ : size_(size), graph_(graph), storage_(storage) {}
+
+public:
+ /**
+ * @brief Returns a copy of sub-index.
+ * @deprecated Needed only in smoothing distance estimator.
+ */
+ Self SubIndex(EdgeId e1, EdgeId e2) const {
+ InnerMap tmp;
+ const auto& h1 = GetImpl(e1, e2);
+ size_t size = h1.size();
+ tmp[e1][e2] = h1;
+ SwapConj(e1, e2);
+ const auto& h2 = GetImpl(e1, e2);
+ size += h2.size();
+ tmp[e1][e2] = h2;
+ return Self(size, graph_, tmp);
+ };
+
+private:
+ size_t size_;
+ const Graph& graph_;
+ StorageMap storage_;
+ InnerMap empty_map_; //null object
+};
+
+//Aliases for common graphs
+template<typename K, typename V>
+using safe_btree_map = btree::safe_btree_map<K, V>; //Two-parameters wrapper
+template<typename Graph>
+using PairedInfoIndexT = PairedIndex<Graph, PointTraits, safe_btree_map>;
+
+template<typename K, typename V>
+using sparse_hash_map = google::sparse_hash_map<K, V>; //Two-parameters wrapper
+template<typename Graph>
+using UnclusteredPairedInfoIndexT = PairedIndex<Graph, RawPointTraits, sparse_hash_map>;
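+
+// Typical usage (an illustrative sketch; assumes a Graph g, two of its edges e1 and e2,
+// and a Point p carrying distance/weight data -- exact construction details omitted):
+//   PairedInfoIndexT<Graph> index(g);
+//   index.Add(e1, e2, p);                  // also records the conjugate pair
+//   for (const auto& nbr : index.Get(e1))  // neighbourhood of e1
+//     for (Point q : nbr.second) { /* ... */ }
+//   auto hist = index.Get(e1, e2);         // histogram proxy between e1 and e2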
+
+/**
+ * @brief A collection of paired indexes which can be manipulated as one.
+ * Used as a convenient wrapper in parallel index processing.
+ */
+template<class Index>
+class PairedIndices {
+ typedef std::vector<Index> Storage;
+ Storage data_;
+
+public:
+ PairedIndices() {}
+
+ PairedIndices(const typename Index::Graph& graph, size_t lib_num) {
+ data_.reserve(lib_num);
+ for (size_t i = 0; i < lib_num; ++i)
+ data_.emplace_back(graph);
+ }
+
+ /**
+ * @brief Initializes all indexes with zero points.
+ */
+ void Init() { for (auto& it : data_) it.Init(); }
+
+ /**
+ * @brief Clears all indexes.
+ */
+ void Clear() { for (auto& it : data_) it.Clear(); }
+
+ Index& operator[](size_t i) { return data_[i]; }
+
+ const Index& operator[](size_t i) const { return data_[i]; }
+
+ size_t size() const { return data_.size(); }
+
+ typename Storage::iterator begin() { return data_.begin(); }
+ typename Storage::iterator end() { return data_.end(); }
+
+ typename Storage::const_iterator begin() const { return data_.begin(); }
+ typename Storage::const_iterator end() const { return data_.end(); }
+};
+
+template<class Graph>
+using PairedInfoIndicesT = PairedIndices<PairedInfoIndexT<Graph>>;
+
+template<class Graph>
+using UnclusteredPairedInfoIndicesT = PairedIndices<UnclusteredPairedInfoIndexT<Graph>>;
+
+template<typename K, typename V>
+using unordered_map = std::unordered_map<K, V>; //Two-parameters wrapper
+template<class Graph>
+using PairedInfoBuffer = PairedIndex<Graph, RawPointTraits, unordered_map>;
+
+template<class Graph>
+using PairedInfoBuffersT = PairedIndices<PairedInfoBuffer<Graph>>;
+
+}
+
+}
diff --git a/src/modules/paired_info/paired_info_helpers.hpp b/src/modules/paired_info/paired_info_helpers.hpp
new file mode 100644
index 0000000..5198e4f
--- /dev/null
+++ b/src/modules/paired_info/paired_info_helpers.hpp
@@ -0,0 +1,142 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "paired_info.hpp"
+#include "boost/optional.hpp"
+
+namespace omnigraph {
+
+namespace de {
+
+template<typename Index, bool full>
+class EdgePairIterator :
+ public boost::iterator_facade<EdgePairIterator<Index, full>,
+ typename Index::HistProxy,
+ boost::forward_traversal_tag,
+ typename Index::HistProxy>
+{
+ typedef typename Index::StorageMap::const_iterator OuterIterator;
+ typedef boost::optional<typename Index::InnerMap::const_iterator> InnerIterator;
+
+protected:
+ //They're not intended to be constructed explicitly, only via begin/end.
+ EdgePairIterator(const Index& index, OuterIterator i)
+ : index_(index), i_(i)
+ {
+ StartOver();
+ }
+
+ bool FakePair() {
+ auto ep = std::make_pair(i_->first, (*j_)->first);
+ return ep > index_.ConjugatePair(ep);
+ }
+
+ inline void Skip() { //For a half iterator, skip conjugate pairs
+ while (!full && j_ && FakePair()) {
+ IncImpl();
+ }
+ }
+
+ void IncImpl() {
+ ++(*j_);
+ if (j_ == i_->second.end()) { //Traversed all neighbours, jump to the next edge
+ ++i_;
+ StartOver();
+ }
+ }
+
+public:
+ void increment() {
+ IncImpl();
+ Skip();
+ }
+
+private:
+ void StartOver() {
+ if (i_ == index_.data_end()) {
+ j_.reset();
+ } else {
+ j_ = i_->second.begin();
+ Skip();
+ }
+ }
+
+public:
+
+ typename Index::HistProxy dereference() const {
+ return index_.Get(first(), second()); //TODO: optimize
+ }
+
+ bool equal(const EdgePairIterator &other) const {
+ return j_ == other.j_;
+ }
+
+ typename Index::EdgeId first() const {
+ return i_->first;
+ }
+
+ typename Index::EdgeId second() const {
+ return (*j_)->first;
+ }
+
+ static EdgePairIterator begin(const Index& index) {
+ return EdgePairIterator(index, index.data_begin());
+ }
+
+ static EdgePairIterator end(const Index& index) {
+ return EdgePairIterator(index, index.data_end());
+ }
+
+private:
+ const Index &index_;
+ OuterIterator i_;
+ InnerIterator j_;
+};
+
+template<typename Storage>
+inline EdgePairIterator<Storage, true> pair_begin(const Storage &s) {
+ return EdgePairIterator<Storage, true>::begin(s);
+}
+
+template<typename Storage>
+inline EdgePairIterator<Storage, true> pair_end(const Storage &s) {
+ return EdgePairIterator<Storage, true>::end(s);
+}
+
+template<typename Storage>
+inline EdgePairIterator<Storage, false> half_pair_begin(const Storage &s) {
+ return EdgePairIterator<Storage, false>::begin(s);
+}
+
+template<typename Storage>
+inline EdgePairIterator<Storage, false> half_pair_end(const Storage &s) {
+ return EdgePairIterator<Storage, false>::end(s);
+}
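+
+// Illustrative sketch of traversing all stored edge pairs (use the half_pair_* variants
+// to skip conjugate duplicates):
+//   for (auto i = pair_begin(index); i != pair_end(index); ++i)
+//     handle(i.first(), i.second(), *i);   // handle() is a placeholder; *i is the pair's HistProxy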
+
+//Small wrapper for range-based loops
+//Usage: for (auto i : PairsOf(index))
+/*template <typename Storage>
+class PairsOf {
+public:
+ EdgePairIterator<Storage> begin() const{
+ return pair_begin(storage_);
+ }
+
+ EdgePairIterator<Storage> end() const{
+ return pair_begin(storage_);
+ }
+
+ PairsOf(const Storage& storage)
+ : storage_(storage) {}
+private:
+ const Storage& storage_;
+};*/
+
+}
+
+}
diff --git a/src/modules/paired_info/peak_finder.hpp b/src/modules/paired_info/peak_finder.hpp
new file mode 100644
index 0000000..c127108
--- /dev/null
+++ b/src/modules/paired_info/peak_finder.hpp
@@ -0,0 +1,385 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+/*
+ * PeakFinder.hpp
+ *
+ * Created on: Aug 15, 2011
+ * Author: alexeyka
+ */
+
+#ifndef PEAKFINDER_HPP_
+#define PEAKFINDER_HPP_
+
+#include "dev_support/verify.hpp"
+#include "data_divider.hpp"
+#include "paired_info.hpp"
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+#include <complex>
+#include <cmath>
+
+namespace omnigraph{
+
+namespace de {
+
+template <class EdgeId>
+class PeakFinder {
+
+ typedef std::complex<double> complex_t;
+
+ public:
+ PeakFinder(const vector<PairInfo<EdgeId> >& data,
+ size_t begin,
+ size_t end,
+ size_t /*range*/,
+ size_t delta,
+ double percentage,
+ double der_thr) :
+ delta_(delta),
+ percentage_(percentage),
+ der_thr_(der_thr)
+ {
+ for (size_t i = begin; i < end; ++i) {
+ x_.push_back(rounded_d(data[i]));
+ y_.push_back(data[i].weight());
+ }
+ Init();
+ }
+
+ double weight() const {
+ return weight_;
+ }
+
+ double GetNormalizedWeight() const {
+ return weight_;
+ }
+
+ void PrintStats(string host) const {
+ for (size_t i = 0; i < data_len_; ++i)
+ DEBUG(host << (x_left_ + (int) i) << " " << hist_[i]);
+ }
+
+ void FFTSmoothing(double cutoff) {
+ VERIFY(data_len_ > 0);
+ if (data_len_ == 1) {
+ hist_[0] = y_[0];
+ return;
+ }
+ InitBaseline();
+ SubtractBaseline();
+ FFTForward(hist_);
+ size_t Ncrit = (size_t) (cutoff);
+
+ // cutting off - standard parabolic filter
+ for (size_t i = 0; i < data_len_ && i < Ncrit; ++i)
+ hist_[i] *= 1. - ((double) i * (double) i * 1.) / (double) (Ncrit * Ncrit);
+
+ for (size_t i = Ncrit; i < hist_.size(); ++i)
+ hist_[i] = 0.;
+
+ FFTBackward(hist_);
+ AddBaseline();
+ }
+
+ bool IsPeak(int dist, size_t range) const {
+ return IsLocalMaximum(dist, range);
+ }
+
+ bool IsPeak(int dist) const {
+ return IsLocalMaximum(dist, 10);
+ }
+
+ // Lists the local maxima (peaks) of the smoothed histogram
+ vector<pair<int, double> > ListPeaks(/*int delta = 3*/) const {
+ TRACE("Smoothed data");
+ //size_t index_max = 0;
+ //for (size_t i = 0; i < data_len_; ++i) {
+ //TRACE(x_left_ + (int) i << " " << hist_[i]);
+ //if (hist_[i].real() > hist_[index_max].real())
+ //index_max = i;
+ //}
+ //vector<pair<int, double> > result;
+ //result.push_back(make_pair(x_left_ + index_max, hist_[index_max].real()));
+ //return result;
+ DEBUG("Listing peaks");
+ map<int, double> peaks_;
+ //another data_len_
+ size_t data_len_ = (size_t) (x_right_ - x_left_);
+ vector<bool> was;
+ srand((unsigned) time(NULL));
+ for (size_t i = 0; i < data_len_; ++i)
+ was.push_back(false);
+
+ size_t iteration = 0;
+ for (size_t l = 0; l < data_len_; ++l) {
+ //for (size_t k = 0; k < 4; ++k) {
+ //size_t v = std::rand() % data_len_;
+ size_t v = l;
+ if (was[v])
+ continue;
+
+ was[v] = true;
+ int index = (int) v + x_left_;
+ while (index < (x_right_ - 1) && index > x_left_ && iteration < 5) {
+ // if @index is local maximum, then leave it
+ double right_derivative = RightDerivative(index);
+ double left_derivative = LeftDerivative(index);
+
+ if (math::gr(right_derivative, 0.) && math::gr(right_derivative, -left_derivative)) {
+ index++;
+ if ((iteration & 1) == 0)
+ ++iteration;
+ }
+ else if (math::le(left_derivative, 0.)) {
+ index--;
+ if ((iteration & 1) == 1)
+ ++iteration;
+ }
+ else
+ break;
+ }
+
+ TRACE("FOUND " << index);
+
+ //double right_derivative = RightDerivative(index);
+ //double left_derivative = LeftDerivative(index);
+
+ if (index < 0)
+ continue;
+
+ //if (index >= x_right_ - delta || index < x_left_ + delta)
+ //continue;
+
+ TRACE("Is in range");
+
+ if (IsPeak(index, 5)) {
+ TRACE("Is local maximum " << index);
+ double weight_ = 0.;
+ int left_bound = (x_left_ > (index - 20) ? x_left_ : (index - 20));
+ int right_bound = (x_right_ < (index + 1 + 20) ? x_right_ : (index + 1 + 20));
+ for (int i = left_bound; i < right_bound; ++i)
+ weight_ += hist_[i - x_left_].real();
+ TRACE("WEIGHT counted");
+ pair<int, double> tmp_pair = make_pair(index, 100. * weight_);
+ if (!peaks_.count(index)) {
+ TRACE("Peaks size " << peaks_.size() << ", inserting " << tmp_pair);
+ peaks_.insert(tmp_pair);
+ } else {
+ TRACE("NON UNIQUE");
+ }
+ }
+ }
+ TRACE("FINISHED " << peaks_.size());
+ vector<pair<int, double> > peaks;
+ for (auto iter = peaks_.begin(); iter != peaks_.end(); ++iter) {
+ const pair<int, double>& tmp_pair = *iter;
+ TRACE("next peak " << tmp_pair);
+ peaks.push_back(tmp_pair);
+ //for (int i = -10; i <= 10; ++i) {
+ //peaks.push_back(make_pair(tmp_pair.first + i, tmp_pair.second / 21.));
+ //}
+ }
+ return peaks;
+ }
+
+ vector<complex_t> getIn() const {
+ return hist_;
+ }
+
+ vector<complex_t> getOut() const {
+ return hist_;
+ }
+
+private:
+ double x1, x2, y1, y2;
+ size_t delta_;
+ double percentage_;
+ double der_thr_;
+ double weight_;
+ vector<int> x_;
+ vector<double> y_;
+ size_t data_size_, data_len_;
+ int x_left_, x_right_;
+ vector<complex_t> hist_;
+
+ size_t Rev(size_t num, size_t lg_n) {
+ size_t res = 0;
+ for (size_t i = 0; i < lg_n; ++i)
+ if (num & (1 << i))
+ res |= 1 << (lg_n - 1 - i);
+ return res;
+ }
+
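+ // In-place iterative radix-2 Cooley-Tukey FFT with bit-reversal permutation;
+ // the input is zero-padded to the next power of two, and the inverse transform
+ // divides by n to undo the scaling.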
+ void FFT(vector<complex_t>& vect, bool invert) {
+ size_t n = vect.size();
+ size_t lg_n = 0;
+ while ( (1u << lg_n) < n)
+ ++lg_n;
+
+ while (n < (1u << lg_n)) {
+ vect.push_back(0.);
+ ++n;
+ }
+
+ for (size_t i = 0; i < n; ++i)
+ if (i < Rev(i, lg_n))
+ swap(vect[i], vect[Rev(i, lg_n)]);
+
+ for (size_t len = 2; len < 1 + n; len <<= 1) {
+ double ang = 2 * M_PI / (double) len * (invert ? -1 : 1);
+ complex_t wlen(cos(ang), sin(ang));
+ for (size_t i = 0; i < n; i += len) {
+ complex_t w(1.);
+ for (size_t j = 0; j < (len >> 1); ++j) {
+ complex_t u = vect[i + j];
+ complex_t v = vect[i + j + (len >> 1)] * w;
+ vect[i + j] = u + v;
+ vect[i + j + (len >> 1)] = u - v;
+ w *= wlen;
+ }
+ }
+ }
+
+ if (invert)
+ for (size_t i = 0; i < n; ++i)
+ vect[i] /= (double) n;
+ }
+
+
+ void FFTForward(vector<complex_t>& vect) {
+ FFT(vect, false);
+ }
+
+ void FFTBackward(vector<complex_t>& vect) {
+ FFT(vect, true);
+ }
+
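+ // Linearly interpolates the weights between the observed distances so that hist_
+ // becomes a dense array covering [x_left_, x_right_); also accumulates weight_.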
+ void ExtendLinear(vector<complex_t>& hist) {
+ size_t ind = 0;
+ weight_ = 0.;
+ for (size_t i = 0; i < data_len_; ++i) {
+ if (ind == data_size_ - 1)
+ hist.push_back((double) x_right_);
+ else {
+ VERIFY(x_[ind + 1] > x_[ind]);
+ hist.push_back(((double) (i + x_left_ - x_[ind]) *
+ y_[ind + 1] + y_[ind] *
+ (double) (x_[ind + 1] - i - x_left_)) /
+ (double) (1 * (x_[ind + 1] - x_[ind])));
+ }
+ weight_ += hist[i].real(); // filling the array on the fly
+
+ if (ind < data_size_ && ((int) i == x_[ind + 1] - x_left_))
+ ++ind;
+ }
+
+ }
+
+
+ void InitBaseline() {
+ size_t Np = (size_t) ((double) data_len_ * percentage_);
+ if (Np == 0) Np++; // ensure Np != 0
+
+ double mean_beg = 0.;
+ double mean_end = 0.;
+ for (size_t i = 0; i < Np; ++i) {
+ mean_beg += hist_[i].real();
+ mean_end += hist_[data_len_ - i - 1].real();
+ }
+ mean_beg /= 1. * (double) Np;
+ mean_end /= 1. * (double) Np;
+
+ // two points defining the line
+ x1 = (double) Np / 2.;
+ x2 = (double) data_len_ - (double) Np / 2.;
+ y1 = mean_beg;
+ y2 = mean_end;
+ }
+
+ void SubtractBaseline() {
+ // Subtracts the baseline, a line through two points:
+ // (Np/2, mean of the first percentage_ of the data) and
+ // (data_len_ - Np/2, mean of the last percentage_ of the data).
+ for (size_t i = 0; i < data_len_; ++i) {
+ hist_[i] -= (y1 + (y2 - y1) * ((double) i - x1) / (x2 - x1));
+ }
+ }
+
+ void AddBaseline() {
+ for (size_t i = 0; i < data_len_; ++i) {
+ hist_[i] += (y1 + (y2 - y1) * ((double) i - x1) / (x2 - x1));
+ }
+ }
+
+ void Init() {
+ data_size_ = x_.size();
+ x_left_ = x_[0];
+ x_right_ = x_[data_size_ - 1] + 1;
+ data_len_ = x_right_ - x_left_;
+ ExtendLinear(hist_);
+ }
+
+ bool IsInRange(int peak) const {
+ return peak < x_right_ && peak >= x_left_;
+ }
+
+ double LeftDerivative(int dist) const {
+ VERIFY(dist > x_left_);
+ return hist_[dist - x_left_].real() - hist_[dist - x_left_ - 1].real();
+ }
+
+ double RightDerivative(int dist) const {
+ VERIFY(dist < x_right_ - 1);
+ return hist_[dist - x_left_ + 1].real() - hist_[dist - x_left_].real();
+ }
+
+ double MiddleDerivative(int dist) const {
+ VERIFY(dist > x_left_ && dist < x_right_ - 1);
+ return .5 * (hist_[dist - x_left_ + 1].real() - hist_[dist - x_left_ - 1].real());
+ }
+
+ double Derivative(int dist) const {
+ if (dist == x_right_ - 1)
+ return LeftDerivative(dist);
+ else if (dist == x_left_)
+ return RightDerivative(dist);
+ else
+ return MiddleDerivative(dist);
+ }
+
+ bool IsLocalMaximum(int peak, size_t range, int left_bound, int right_bound, size_t delta) const {
+
+ DEBUG("Is local maximum : peak " << peak << " range " << range
+ << " bounds " << left_bound << " " << right_bound << " delta " << delta);
+ int index_max = peak;
+ TRACE("Looking for the maximum");
+ for (int j = left_bound; j < right_bound; ++j)
+ if (math::ls(hist_[index_max - x_left_].real(), hist_[j - x_left_].real())) {
+ index_max = j;
+ }// else if (j < i && hist_[index_max - x_left_][0] == hist_[j - x_left][0] ) index_max = j;
+ TRACE("Maximum is " << index_max);
+
+ if ((size_t)abs(index_max - peak) <= delta)
+ return true;
+
+ return false;
+ }
+
+ bool IsLocalMaximum(int peak, size_t range) const {
+ return IsLocalMaximum(peak, range, x_left_, x_right_, delta_);
+ }
+
+ DECL_LOGGER("PeakFinder");
+};
+
+}
+
+}
+
+#endif /* PEAKFINDER_HPP_ */
diff --git a/src/modules/paired_info/smoothing_distance_estimation.hpp b/src/modules/paired_info/smoothing_distance_estimation.hpp
new file mode 100644
index 0000000..04f9410
--- /dev/null
+++ b/src/modules/paired_info/smoothing_distance_estimation.hpp
@@ -0,0 +1,283 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#ifndef SMOOTHING_DISTANCE_ESTIMATION_HPP_
+#define SMOOTHING_DISTANCE_ESTIMATION_HPP_
+
+#include "paired_info.hpp"
+#include "data_divider.hpp"
+#include "peak_finder.hpp"
+#include "weighted_distance_estimation.hpp"
+
+namespace omnigraph {
+
+namespace de {
+
+template<class Graph>
+class SmoothingDistanceEstimator : public WeightedDistanceEstimator<Graph> {
+ //FIXME configure
+ static const size_t OVERLAP_TOLERANCE = 1000;
+protected:
+ typedef WeightedDistanceEstimator<Graph> base;
+ typedef typename base::InPairedIndex InPairedIndex;
+ typedef typename base::OutPairedIndex OutPairedIndex;
+ typedef typename base::InHistogram InHistogram;
+ typedef typename base::OutHistogram OutHistogram;
+ typedef typename InPairedIndex::Histogram TempHistogram;
+
+public:
+ SmoothingDistanceEstimator(const Graph &graph,
+ const InPairedIndex &histogram,
+ const GraphDistanceFinder<Graph> &dist_finder,
+ std::function<double(int)> weight_f,
+ size_t linkage_distance, size_t max_distance, size_t threshold,
+ double range_coeff, double delta_coeff,
+ size_t cutoff,
+ size_t min_peak_points,
+ double inv_density,
+ double percentage,
+ double derivative_threshold,
+ bool only_scaffolding = false) :
+ base(graph, histogram, dist_finder, weight_f, linkage_distance, max_distance),
+ threshold_(threshold),
+ range_coeff_(range_coeff),
+ delta_coeff_(delta_coeff),
+ cutoff_((int) cutoff),
+ min_peak_points_(min_peak_points),
+ inv_density_(inv_density),
+ percentage_(percentage),
+ deriv_thr(derivative_threshold),
+ only_scaffolding_(only_scaffolding),
+ gap_distances(0) { }
+
+ virtual ~SmoothingDistanceEstimator() { }
+
+protected:
+ typedef typename Graph::EdgeId EdgeId;
+ typedef pair<EdgeId, EdgeId> EdgePair;
+ typedef vector<pair<int, double> > EstimHist;
+ typedef vector<PairInfo<EdgeId> > PairInfos;
+ typedef vector<size_t> GraphLengths;
+
+ EstimHist EstimateEdgePairDistances(EdgePair /*ep*/,
+ const InHistogram & /*raw_data*/,
+ const vector<size_t> & /*forward*/) const override {
+ VERIFY_MSG(false, "Sorry, the SMOOTHING estimator is not available anymore. " <<
+ "SPAdes is going to terminate");
+
+ return EstimHist();
+ }
+
+private:
+ typedef pair<size_t, size_t> Interval;
+
+ size_t threshold_;
+ double range_coeff_;
+ double delta_coeff_;
+ int cutoff_;
+ size_t min_peak_points_;
+ double inv_density_;
+ double percentage_;
+ double deriv_thr;
+ bool only_scaffolding_;
+ mutable size_t gap_distances;
+
+ EstimHist FindEdgePairDistances(EdgePair ep,
+ const TempHistogram &raw_hist) const {
+ size_t first_len = this->graph().length(ep.first);
+ size_t second_len = this->graph().length(ep.second);
+ TRACE("Lengths are " << first_len << " " << second_len);
+ TempHistogram data;
+ for (auto I = raw_hist.begin(), E = raw_hist.end(); I != E; ++I) {
+ Point p = *I;
+ if (math::ge(2 * (long) rounded_d(p) + (long) second_len, (long) first_len) &&
+ (long) rounded_d(p) + (long) OVERLAP_TOLERANCE >= (long) first_len)
+ data.insert(p);
+ }
+ EstimHist result;
+ double picture_weight = 0.;
+ for (auto I = data.begin(), E = data.end(); I != E; ++I)
+ picture_weight += I->weight;
+ if (math::ls(picture_weight, 3.))
+ return result;
+
+ DataDivider<EdgeId> data_divider(threshold_,
+ vector<Point>(data.begin(), data.end()));
+
+ PairInfos infos;
+ infos.reserve(data.size());
+ const vector<Interval> &clusters =
+ data_divider.DivideAndSmoothData(ep, infos, this->weight_f_);
+ DEBUG("Seeking for distances");
+ TRACE("size " << infos.size());
+
+ for (size_t i = 0; i < clusters.size(); ++i) {
+ size_t begin = clusters[i].first;
+ size_t end = clusters[i].second;
+ TRACE("begin " << begin << " at " << rounded_d(infos[begin])
+ << ", " << " end " << end << " at " << rounded_d(infos[end - 1]));
+ size_t data_length = rounded_d(infos[end - 1]) - rounded_d(infos[begin]) + 1;
+ TRACE("data length " << data_length);
+ if (end - begin > min_peak_points_) {
+ size_t range = (size_t) math::round((double) data_length * range_coeff_);
+ size_t delta = (size_t) math::round((double) data_length * delta_coeff_);
+ PeakFinder<EdgeId> peakfinder(infos, begin, end, range, delta, percentage_, deriv_thr);
+ DEBUG("Processing window : " << rounded_d(infos[begin])
+ << " " << rounded_d(infos[end - 1]));
+ peakfinder.FFTSmoothing(cutoff_);
+ TRACE("Listing peaks");
+ const EstimHist &peaks = peakfinder.ListPeaks();
+ //for (auto iter = peaks.begin(); iter != peaks.end(); ++iter) {
+ //TRACE("PEAKS " << iter->first << " " << iter->second);
+ //}
+ if (peaks.size() == 0)
+ continue;
+ size_t index_of_max_weight = 0;
+ for (size_t j = 0; j < peaks.size(); ++j)
+ if (math::ls(peaks[index_of_max_weight].second, peaks[j].second))
+ index_of_max_weight = j;
+ result.push_back(peaks[index_of_max_weight]);
+ }
+ }
+
+ if (result.size() == 0)
+ return result;
+ size_t index_of_max_weight = 0;
+ for (size_t i = 0; i < result.size(); ++i)
+ if (math::ls(result[index_of_max_weight].second, result[i].second))
+ index_of_max_weight = i;
+
+ EstimHist new_result;
+ for (size_t i = 0; i < result.size(); ++i)
+ if (result[i].second > .5 * result[index_of_max_weight].second)
+ new_result.push_back(result[i]);
+ return new_result;
+ }
+
+ void ProcessEdge(EdgeId e1,
+ const InPairedIndex &pi,
+ PairedInfoBuffer<Graph> &result) const override {
+ typename base::LengthMap second_edges;
+ auto inner_map = pi.GetHalf(e1);
+ for (auto I : inner_map)
+ second_edges[I.first];
+
+ this->FillGraphDistancesLengths(e1, second_edges);
+
+ for (const auto &entry: second_edges) {
+ EdgeId e2 = entry.first;
+ EdgePair ep(e1, e2);
+
+ VERIFY(ep <= pi.ConjugatePair(ep));
+
+ TRACE("Processing edge pair " << this->graph().int_id(e1)
+ << " " << this->graph().int_id(e2));
+ const GraphLengths &forward = entry.second;
+
+ auto hist = pi.Get(e1, e2).Unwrap();
+ EstimHist estimated;
+ //DEBUG("Extending paired information");
+ //DEBUG("Extend left");
+ //this->base::ExtendInfoLeft(e1, e2, hist, 1000);
+ DEBUG("Extend right");
+ this->ExtendInfoRight(e1, e2, hist, 1000);
+ if (forward.size() == 0) {
+ estimated = FindEdgePairDistances(ep, hist);
+ ++gap_distances;
+ } else if (forward.size() > 0 && (!only_scaffolding_)) {
+ //TODO: remove THIS
+ InPairedIndex temp_index(this->graph());
+ temp_index.AddMany(e1, e2, hist);
+ auto wrapped_hist = temp_index.Get(e1, e2);
+ estimated = this->base::EstimateEdgePairDistances(ep, wrapped_hist, forward);
+ }
+ DEBUG(gap_distances << " distances between gap edge pairs have been found");
+ OutHistogram res = this->ClusterResult(ep, estimated);
+ this->AddToResult(res, ep, result);
+ }
+ }
+
+ bool IsTipTip(EdgeId e1, EdgeId e2) const {
+ return (this->graph().OutgoingEdgeCount(this->graph().EdgeEnd(e1)) == 0 &&
+ this->graph().IncomingEdgeCount(this->graph().EdgeEnd(e1)) == 1 &&
+ this->graph().IncomingEdgeCount(this->graph().EdgeStart(e2)) == 0 &&
+ this->graph().OutgoingEdgeCount(this->graph().EdgeStart(e2)) == 1);
+ }
+
+ void ExtendInfoRight(EdgeId e1, EdgeId e2, TempHistogram &data, size_t max_shift) const {
+ ExtendRightDFS(e1, e2, data, 0, max_shift);
+ }
+
+ void MergeInto(const InHistogram &what, TempHistogram &where, int shift) const {
+ // assuming they are sorted already
+ if (what.size() == 0)
+ return;
+
+ if (where.size() == 0) {
+ for (auto to_be_added : what) {
+ to_be_added.d += shift;
+ where.insert(to_be_added);
+ }
+
+ return;
+ }
+
+ // Check, whether two histograms intersect. If not, we can just merge them
+ // straightforwardly.
+ if (math::ls(where.rbegin()->d, what.min().d + shift) ||
+ math::gr(where.begin()->d, what.max().d + shift)) {
+ for (auto to_be_added : what) {
+ to_be_added.d += shift;
+ where.insert(to_be_added);
+ }
+ } else {
+ for (auto to_be_added : what) {
+ to_be_added.d += shift;
+ auto low_bound = std::lower_bound(where.begin(), where.end(), to_be_added);
+ if (to_be_added == *low_bound) {
+ to_be_added.weight += low_bound->weight;
+ where.erase(to_be_added);
+ where.insert(to_be_added);
+ } else
+ where.insert(low_bound, to_be_added);
+ }
+ }
+ }
+
+ void ExtendRightDFS(const EdgeId &first, EdgeId current, TempHistogram &data, int shift,
+ size_t max_shift) const {
+ auto end = this->graph().EdgeEnd(current);
+ if (current == first)
+ return;
+ if (this->graph().IncomingEdgeCount(end) > 1)
+ return;
+
+ for (EdgeId next : this->graph().OutgoingEdges(end)) {
+ auto hist = this->index().Get(first, next);
+ if (-shift < (int) max_shift)
+ ExtendRightDFS(first, next, data, shift - (int) this->graph().length(current), max_shift);
+
+ //auto filtered_infos = FilterPositive(hist, this->graph().length(first), this->graph().length(next));
+ //if (filtered_infos.size() > 0)
+ // MergeInto(filtered_infos, data, shift - (int) this->graph().length(current));
+ MergeInto(hist, data, shift - (int) this->graph().length(current));
+ }
+ }
+
+ const string Name() const override {
+ static const string my_name = "SMOOTHING";
+ return my_name;
+ }
+
+ DECL_LOGGER("SmoothingDistanceEstimator")
+};
+
+}
+
+}
+
+#endif /* SMOOTHING_DISTANCE_ESTIMATION_HPP_ */
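A side note on the peak post-filtering in FindEdgePairDistances above: only candidates whose weight exceeds half of the strongest peak survive. A minimal sketch of that step on a plain vector of (distance, weight) pairs (the function name is invented):

#include <cstddef>
#include <utility>
#include <vector>

// Keep only candidate distances whose weight exceeds half of the maximum
// weight, as the final filtering in FindEdgePairDistances does.
std::vector<std::pair<int, double>>
FilterByHalfMaxWeight(const std::vector<std::pair<int, double>> &peaks) {
    std::vector<std::pair<int, double>> result;
    if (peaks.empty())
        return result;
    std::size_t best = 0;
    for (std::size_t i = 1; i < peaks.size(); ++i)
        if (peaks[i].second > peaks[best].second)
            best = i;
    for (const auto &p : peaks)
        if (p.second > 0.5 * peaks[best].second)
            result.push_back(p);
    return result;
}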
diff --git a/src/modules/paired_info/split_path_constructor.hpp b/src/modules/paired_info/split_path_constructor.hpp
new file mode 100644
index 0000000..9cf0c2f
--- /dev/null
+++ b/src/modules/paired_info/split_path_constructor.hpp
@@ -0,0 +1,140 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+/*
+* split_path_constructor.hpp
+*
+* Created on: Jun 14, 2012
+* Author: avsirotkin
+*/
+
+#pragma once
+
+#include "dev_support/logger/logger.hpp"
+#include "paired_info/paired_info.hpp"
+#include "assembly_graph/paths/path_processor.hpp"
+#include "paired_info/pair_info_bounds.hpp"
+
+namespace debruijn_graph {
+
+template<class Graph>
+class PathInfoClass {
+public:
+ typedef typename Graph::EdgeId EdgeId;
+ typedef omnigraph::de::PairInfo<EdgeId> PairInfo;
+
+ EdgeId base_edge;
+ vector<PairInfo> path;
+
+ PathInfoClass() : base_edge(NULL) { };
+
+ PathInfoClass(const EdgeId Edge) : base_edge(Edge) { };
+
+ std::pair<EdgeId, double> operator[](const size_t i) const {
+ if (i == 0)
+ return std::make_pair(base_edge, 0.0);
+
+ VERIFY(i < path.size() + 1);
+ return std::make_pair(path[i - 1].second, path[i - 1].d());
+ }
+
+ size_t size() const { return path.size() + 1; }
+
+ void push_back(const PairInfo &pi) { path.push_back(pi); }
+
+ typename std::vector<PairInfo>::const_iterator begin() const { return path.begin(); }
+
+ typename std::vector<PairInfo>::const_iterator end() const { return path.end(); }
+
+ std::string PrintPath(const Graph &graph) const {
+ std::ostringstream ss;
+ ss << " " << graph.int_id(base_edge) << ": ";
+ for (size_t j = 0; j < path.size(); j++) {
+ ss << "(" << graph.int_id(path[j].second) << ", " << path[j].d() << "), ";
+ }
+ return ss.str();
+ }
+};
+
+template<class Graph>
+class SplitPathConstructor {
+ typedef typename Graph::EdgeId EdgeId;
+ typedef PathInfoClass<Graph> PathInfo;
+ typedef omnigraph::de::PairInfo<EdgeId> PairInfo;
+
+public:
+ SplitPathConstructor(const Graph &graph) : graph_(graph) { }
+
+ vector<PathInfo> ConvertPIToSplitPaths(EdgeId cur_edge, const omnigraph::de::PairedInfoIndexT<Graph> &pi,
+ double is, double is_var) const {
+ vector<PairInfo> pair_infos; //TODO: this is an adaptor for the old implementation
+ for (auto i : pi.Get(cur_edge))
+ for (auto j : i.second)
+ pair_infos.emplace_back(cur_edge, i.first, j);
+
+ vector<PathInfo> result;
+ if (pair_infos.empty())
+ return result;
+
+ vector<bool> pair_info_used(pair_infos.size());
+ TRACE("Preparing path_processor for this base edge");
+ size_t path_upper_bound = PairInfoPathLengthUpperBound(graph_.k(), (size_t) is, is_var);
+
+ //FIXME is path_upper_bound enough?
+ PathProcessor<Graph> path_processor(graph_,
+ graph_.EdgeEnd(cur_edge),
+ path_upper_bound);
+
+ TRACE("Path_processor is done");
+
+ for (size_t i = pair_infos.size(); i > 0; --i) {
+ const PairInfo &cur_info = pair_infos[i - 1];
+ if (math::le(cur_info.d(), 0.))
+ continue;
+ if (pair_info_used[i - 1])
+ continue;
+ DEBUG("SPC: pi " << cur_info);
+ vector<EdgeId> common_part = GetCommonPathsEnd(graph_, cur_edge, cur_info.second,
+ (size_t) (cur_info.d() - cur_info.var()),
+ (size_t) (cur_info.d() + cur_info.var()),
+ path_processor);
+ DEBUG("Found common part of size " << common_part.size());
+ PathInfoClass<Graph> sub_res(cur_edge);
+ if (common_part.size() > 0) {
+ size_t total_length = 0;
+ for (size_t j = 0; j < common_part.size(); ++j)
+ total_length += graph_.length(common_part[j]);
+
+ DEBUG("Common part " << ToString(common_part));
+ for (size_t j = 0; j < common_part.size(); ++j) {
+ PairInfo cur_pi(cur_edge, common_part[j],
+ cur_info.d() - (double) total_length,
+ cur_info.weight(),
+ cur_info.var());
+
+ sub_res.push_back(cur_pi);
+ total_length -= graph_.length(common_part[j]);
+ for (size_t ind = 0; ind + 1 < i; ++ind) {
+ if (cur_pi == pair_infos[ind])
+ pair_info_used[ind] = true;
+ }
+ }
+ }
+
+ sub_res.push_back(cur_info);
+ result.push_back(sub_res);
+ DEBUG(sub_res.PrintPath(graph_));
+ }
+ return result;
+ }
+
+private:
+ const Graph &graph_;
+};
+
+
+}
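The operator[] above shifts indices by one: element 0 is the base edge at distance 0, and element i (i >= 1) is the (i-1)-th stored pair info. A purely illustrative restatement with plain ints instead of graph EdgeIds (not SPAdes code):

#include <cassert>
#include <cstddef>
#include <utility>
#include <vector>

// Toy restatement of the PathInfoClass indexing convention.
struct ToyPathInfo {
    int base_edge = 0;
    std::vector<std::pair<int, double>> path;  // (edge id, distance)

    std::pair<int, double> operator[](std::size_t i) const {
        if (i == 0)
            return std::make_pair(base_edge, 0.0);
        assert(i < path.size() + 1);
        return path[i - 1];
    }

    std::size_t size() const { return path.size() + 1; }
};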
diff --git a/src/modules/paired_info/weighted_distance_estimation.hpp b/src/modules/paired_info/weighted_distance_estimation.hpp
new file mode 100644
index 0000000..9928ef9
--- /dev/null
+++ b/src/modules/paired_info/weighted_distance_estimation.hpp
@@ -0,0 +1,112 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#ifndef WEIGHTED_DISTANCE_ESTIMATION_HPP_
+#define WEIGHTED_DISTANCE_ESTIMATION_HPP_
+
+#include "math/xmath.h"
+#include "paired_info.hpp"
+#include "distance_estimation.hpp"
+
+namespace omnigraph {
+
+namespace de {
+
+template<class Graph>
+class WeightedDistanceEstimator : public DistanceEstimator<Graph> {
+protected:
+ typedef DistanceEstimator<Graph> base;
+ typedef typename base::InPairedIndex InPairedIndex;
+ typedef typename base::OutPairedIndex OutPairedIndex;
+ typedef typename base::InHistogram InHistogram;
+ typedef typename base::OutHistogram OutHistogram;
+
+public:
+ WeightedDistanceEstimator(const Graph &graph,
+ const InPairedIndex &histogram,
+ const GraphDistanceFinder<Graph> &distance_finder,
+ std::function<double(int)> weight_f,
+ size_t linkage_distance, size_t max_distance) :
+ base(graph, histogram, distance_finder, linkage_distance, max_distance), weight_f_(weight_f) { }
+
+ virtual ~WeightedDistanceEstimator() { }
+
+protected:
+ typedef typename Graph::EdgeId EdgeId;
+
+ typedef vector<pair<int, double> > EstimHist;
+ typedef pair<EdgeId, EdgeId> EdgePair;
+ typedef vector<size_t> GraphLengths;
+
+ std::function<double(int)> weight_f_;
+
+ virtual EstimHist EstimateEdgePairDistances(EdgePair ep,
+ const InHistogram &histogram,
+ const GraphLengths &raw_forward) const override {
+ using std::abs;
+ using namespace math;
+ TRACE("Estimating with weight function");
+ size_t first_len = this->graph().length(ep.first);
+ size_t second_len = this->graph().length(ep.second);
+
+ EstimHist result;
+ int maxD = rounded_d(histogram.max()), minD = rounded_d(histogram.min());
+ vector<int> forward;
+ for (auto length : raw_forward) {
+ if (minD - (int) this->max_distance_ <= length && length <= maxD + (int) this->max_distance_) {
+ forward.push_back(length);
+ }
+ }
+ if (forward.size() == 0)
+ return result;
+
+ DEDistance max_dist = this->max_distance_;
+ size_t cur_dist = 0;
+ vector<double> weights(forward.size());
+ for (auto point : histogram) {
+ if (ls(2. * point.d + (double) second_len, (double) first_len))
+ continue;
+ while (cur_dist + 1 < forward.size() && (double) forward[cur_dist + 1] < point.d) {
+ ++cur_dist;
+ }
+ if (cur_dist + 1 < forward.size() && ls((double) forward[cur_dist + 1] - point.d,
+ point.d - (double) forward[cur_dist])) {
+ ++cur_dist;
+ if (le(abs(forward[cur_dist] - point.d), max_dist))
+ weights[cur_dist] += point.weight * weight_f_(forward[cur_dist] - rounded_d(point));
+ }
+ else if (cur_dist + 1 < forward.size() && eq(forward[cur_dist + 1] - point.d,
+ point.d - forward[cur_dist])) {
+ if (le(abs(forward[cur_dist] - point.d), max_dist))
+ weights[cur_dist] += point.weight * 0.5 * weight_f_(forward[cur_dist] - rounded_d(point));
+
+ ++cur_dist;
+
+ if (le(abs(forward[cur_dist] - point.d), max_dist))
+ weights[cur_dist] += point.weight * 0.5 * weight_f_(forward[cur_dist] - rounded_d(point));
+ } else if (le(abs(forward[cur_dist] - point.d), max_dist))
+ weights[cur_dist] += point.weight * weight_f_(forward[cur_dist] - rounded_d(point));
+ }
+
+ for (size_t i = 0; i < forward.size(); ++i)
+ if (gr(weights[i], 0.))
+ result.push_back(make_pair(forward[i], weights[i]));
+
+ return result;
+ }
+
+ const string Name() const override {
+ static const string my_name = "WEIGHTED";
+ return my_name;
+ }
+
+};
+
+}
+
+}
+#endif
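A simplified restatement of the voting scheme in EstimateEdgePairDistances above: every histogram point (d, w) adds w * weight_f(offset) to the nearest candidate graph distance. Tie handling and the max_distance cut-off of the real implementation are omitted here, and all names are illustrative:

#include <cmath>
#include <cstddef>
#include <functional>
#include <utility>
#include <vector>

// Each point votes for the closest candidate distance, scaled by the weight
// function evaluated at the resulting offset.
std::vector<double> VoteForNearest(const std::vector<int> &forward,
                                   const std::vector<std::pair<double, double>> &points,
                                   const std::function<double(int)> &weight_f) {
    std::vector<double> weights(forward.size(), 0.0);
    if (forward.empty())
        return weights;
    for (const auto &p : points) {  // p.first = distance, p.second = weight
        std::size_t best = 0;
        for (std::size_t i = 1; i < forward.size(); ++i)
            if (std::abs(forward[i] - p.first) < std::abs(forward[best] - p.first))
                best = i;
        weights[best] += p.second * weight_f((int) std::lround(forward[best] - p.first));
    }
    return weights;
}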
diff --git a/src/modules/paired_info/weights.hpp b/src/modules/paired_info/weights.hpp
new file mode 100644
index 0000000..8812d68
--- /dev/null
+++ b/src/modules/paired_info/weights.hpp
@@ -0,0 +1,82 @@
+#pragma once
+
+using omnigraph::Path;
+using omnigraph::MappingPath;
+using omnigraph::Range;
+using omnigraph::MappingRange;
+
+namespace debruijn_graph {
+inline double PairedReadCountWeight(const MappingRange&, const MappingRange&) {
+ return 1.;
+}
+
+inline double KmerCountProductWeight(const MappingRange& mr1,
+ const MappingRange& mr2) {
+ return (double)(mr1.initial_range.size() * mr2.initial_range.size());
+}
+
+class WeightDEWrapper {
+private:
+
+ vector<double> new_hist;
+ int left_x;
+ int insert_size;
+
+ void ExtendLinear(const std::map<int, size_t> & hist) {
+ size_t sum_weight = 0;
+
+ for (auto iter = hist.begin(); iter != hist.end(); ++iter)
+ sum_weight += iter->second;
+ DEBUG(sum_weight);
+
+ VERIFY(hist.size() > 0);
+ auto iter = hist.begin();
+
+ left_x = iter->first;
+
+ int prev = iter->first;
+ size_t prev_val = iter->second;
+
+ new_hist.push_back((double)prev_val / (double)sum_weight);
+ ++iter;
+
+ for (; iter != hist.end(); ++iter) {
+ int x = iter->first;
+ size_t y = iter->second;
+ double tan = ((double)y - (double)prev_val) / (x - prev);
+
+ VERIFY(prev < x);
+ for (int i = prev + 1; i <= x; ++i) {
+ new_hist.push_back(((double)prev_val + tan * (i - prev)) / (double)sum_weight);
+ }
+ prev = x;
+ prev_val = y;
+ DEBUG("hist " << x << " " << y);
+ }
+ }
+
+public:
+ WeightDEWrapper(const map<int, size_t>& hist, double IS) {
+ DEBUG("WeightDEWrapper " << IS);
+ insert_size = (int) IS;
+ DEBUG("Extending linear");
+ ExtendLinear(hist);
+ }
+
+ ~WeightDEWrapper() {
+ }
+
+
+ double CountWeight(int x) const {
+ int xx = insert_size - left_x + x - 1;
+
+ if (!(xx >= 0 && xx < (int) new_hist.size())) return 0.;
+ VERIFY(math::le(new_hist[xx], 1.));
+ return 1000. * new_hist[xx];
+ }
+};
+
+inline double UnityFunction(int /*x*/) {
+ return 1.;
+}
+}
\ No newline at end of file
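A hypothetical usage sketch for WeightDEWrapper (the histogram values and the wrapper function below are invented, and it assumes the header above is included). CountWeight(x) reads new_hist[insert_size - left_x + x - 1], so x is an offset around the insert size rather than an absolute distance:

#include <cstddef>
#include <map>

double ExampleInsertSizeWeight() {
    // Invented insert-size histogram: distance -> read-pair count.
    std::map<int, std::size_t> insert_size_hist = {{180, 10}, {200, 50}, {220, 12}};
    debruijn_graph::WeightDEWrapper wrapper(insert_size_hist, /*IS=*/200.0);
    return wrapper.CountWeight(0);  // normalized weight near the mean insert size
}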
diff --git a/src/modules/pipeline/CMakeLists.txt b/src/modules/pipeline/CMakeLists.txt
new file mode 100644
index 0000000..aa37d3b
--- /dev/null
+++ b/src/modules/pipeline/CMakeLists.txt
@@ -0,0 +1,14 @@
+############################################################################
+# Copyright (c) 2015 Saint Petersburg State University
+# Copyright (c) 2011-2014 Saint Petersburg Academic University
+# All Rights Reserved
+# See file LICENSE for details.
+############################################################################
+
+project(pipeline CXX)
+
+add_library(pipeline STATIC
+ stage.cpp config_struct.cpp genomic_info_filler.cpp library.cpp)
+
+target_link_libraries(pipeline input llvm-support)
+
diff --git a/src/modules/pipeline/config_common.hpp b/src/modules/pipeline/config_common.hpp
new file mode 100755
index 0000000..e540017
--- /dev/null
+++ b/src/modules/pipeline/config_common.hpp
@@ -0,0 +1,140 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "dev_support/simple_tools.hpp"
+#include "dev_support/path_helper.hpp"
+#include "dev_support/verify.hpp"
+
+// todo: undo dirty fix
+
+#include <boost/property_tree/ptree.hpp>
+#include <boost/property_tree/info_parser.hpp>
+
+#include <string>
+#include <vector>
+#include <iostream>
+#include <fstream>
+#include <map>
+
+namespace config_common {
+// for enable_if/disable_if
+namespace details {
+template<class T, class S>
+struct is_equal_type {
+ static const bool value = false;
+};
+
+template<class T>
+struct is_equal_type<T, T> {
+ static const bool value = true;
+};
+}
+
+template<class T>
+typename boost::enable_if_c<details::is_equal_type<T, std::string>::value ||
+ boost::is_arithmetic<T>::value>::type
+load(T &value,
+ boost::property_tree::ptree const &pt, std::string const &key,
+ bool complete) {
+ if (complete || pt.find(key) != pt.not_found())
+ value = pt.get<T>(key);
+}
+
+template<class T>
+typename boost::disable_if_c<details::is_equal_type<T,
+ std::string>::value ||
+ boost::is_arithmetic<T>::value>::type
+load(T &value,
+ boost::property_tree::ptree const &pt, std::string const &key,
+ bool complete) {
+ if (complete || pt.find(key) != pt.not_found())
+ load(value, pt.get_child(key), complete);
+}
+
+template<class T>
+void load_items(std::vector <T> &vec, boost::property_tree::ptree const &pt,
+ std::string const &key, bool complete) {
+ std::string vector_key = key + std::string(".count");
+ if (complete || pt.find(vector_key) != pt.not_found()) {
+ size_t count = pt.get<size_t>(vector_key);
+
+ for (size_t i = 0; i != count; ++i) {
+ T t;
+ load(t, pt.get_child(fmt::format("{:s}.item_{:d}", key, i)),
+ complete);
+ vec.push_back(t);
+ }
+ }
+}
+
+template<class T>
+void load(std::vector <T> &vec, boost::property_tree::ptree const &pt, std::string const &key,
+ bool /*complete*/) {
+ boost::optional<T> value = pt.get_optional<T>(key);
+ if (value) {
+ vec.push_back(*value);
+ return;
+ }
+ for (size_t i = 1; ; i++) {
+ value = pt.get_optional<std::string>(key + "#" + ToString(i));
+ if (value) {
+ vec.push_back(*value);
+ continue;
+ }
+ value = pt.get_optional<std::string>(key + "." + ToString(i));
+ if (value) {
+ vec.push_back(*value);
+ continue;
+ }
+ // neither "<key>#<i>" nor "<key>.<i>" is present -- stop
+ return;
+ }
+}
+
+template<class T>
+void load(T &value, boost::property_tree::ptree const &pt, std::string const &key) {
+ load(value, pt, key, true);
+}
+
+template<class T>
+void load(T &value, boost::property_tree::ptree const &pt, const char *key) {
+ load(value, pt, std::string(key), true);
+}
+
+template<class T>
+void load(T &value, boost::property_tree::ptree const &pt) {
+ load(value, pt, true);
+}
+
+template<class T>
+void load_param(const std::string &filename, const std::string &key,
+ boost::optional<T> &value) {
+ boost::property_tree::ptree pt;
+ boost::property_tree::read_info(filename, pt);
+ value = pt.get_optional<T>(key);
+}
+
+template<class T>
+void write_param(const std::string &filename, const std::string &key,
+ const boost::optional<T> &value) {
+ if (value) {
+ std::ofstream params_stream(filename.c_str(), std::ios_base::app);
+ params_stream << key << "\t" << *value << std::endl;
+ }
+}
+
+template<class T>
+void write_param(const std::string &filename, const std::string &key,
+ const T &value) {
+ std::ofstream params_stream(filename.c_str(), std::ios_base::app);
+ params_stream << key << "\t" << value << std::endl;
+}
+
+}
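A hypothetical sketch of the "<key>.count" / "<key>.item_<i>" convention that load_items above expects, using an invented ToyLib type. This is not how SPAdes declares its libraries; it only shows the mechanics, and assumes config_common.hpp is included:

#include <string>
#include <vector>
#include <boost/property_tree/info_parser.hpp>
#include <boost/property_tree/ptree.hpp>

// Invented leaf type; its load() is found via ADL from load_items.
struct ToyLib {
    std::string path;
};

inline void load(ToyLib &lib, boost::property_tree::ptree const &pt, bool complete) {
    config_common::load(lib.path, pt, "path", complete);
}

// Schematic INFO layout expected by load_items (keys invented):
//   libraries
//   {
//       count 2
//       item_0 { path a.fastq }
//       item_1 { path b.fastq }
//   }
inline std::vector<ToyLib> LoadToyLibs(const std::string &filename) {
    boost::property_tree::ptree pt;
    boost::property_tree::read_info(filename, pt);
    std::vector<ToyLib> libs;
    config_common::load_items(libs, pt, "libraries", /*complete*/ true);
    return libs;
}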
diff --git a/src/modules/pipeline/config_singl.hpp b/src/modules/pipeline/config_singl.hpp
new file mode 100644
index 0000000..9bf726e
--- /dev/null
+++ b/src/modules/pipeline/config_singl.hpp
@@ -0,0 +1,57 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#ifndef __CONFIG_SINGL_HPP__
+#define __CONFIG_SINGL_HPP__
+
+
+#include "dev_support/verify.hpp"
+
+#include <string>
+
+namespace config_common {
+
+// config singleton-wrap
+template<class Config>
+struct config {
+ static std::string dirnameOf(const std::string &fname) {
+ size_t pos = fname.find_last_of("\\/");
+ return (std::string::npos == pos) ? "" : fname.substr(0, pos);
+ }
+
+ template<class Source>
+ static void create_instance(Source const &source) {
+ load(inner_cfg(), source);
+ is_initialized() = true;
+ }
+
+ static Config const &get() {
+ VERIFY_MSG(is_initialized(), "Config not initialized");
+ return inner_cfg();
+ }
+
+ static Config &get_writable() {
+ VERIFY_MSG(is_initialized(), "Config not initialized");
+ return inner_cfg();
+ }
+
+private:
+ static Config &inner_cfg() {
+ static Config config;
+ return config;
+ }
+
+ static bool &is_initialized() {
+ static bool is_initialized = false;
+ return is_initialized;
+ }
+};
+
+}
+
+
+#endif // __CONFIG_SINGL_HPP__
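A toy sketch of how the singleton wrapper above is meant to be used. ToySettings and its load() are invented; the real instantiation with debruijn_config lives elsewhere in the tree:

#include <string>

// Invented settings type plus the free load() that create_instance expects to
// find via argument-dependent lookup.
struct ToySettings {
    std::string name;
};

inline void load(ToySettings &s, const std::string &source) {
    s.name = source;  // a real config would parse 'source' here
}

typedef config_common::config<ToySettings> toy_cfg;

inline void InitToyConfig() {
    toy_cfg::create_instance(std::string("example"));
    // After initialization the settings are reached read-only through
    // toy_cfg::get(), or mutably through toy_cfg::get_writable().
}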
diff --git a/src/modules/pipeline/config_struct.cpp b/src/modules/pipeline/config_struct.cpp
new file mode 100644
index 0000000..4e35ffd
--- /dev/null
+++ b/src/modules/pipeline/config_struct.cpp
@@ -0,0 +1,786 @@
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#include "pipeline/config_struct.hpp"
+
+#include "pipeline/config_common.hpp"
+#include "dev_support/openmp_wrapper.h"
+
+#include "dev_support/logger/logger.hpp"
+#include "dev_support/verify.hpp"
+
+#include "io/reads_io/file_reader.hpp"
+
+#include <string>
+#include <vector>
+
+#include "llvm/Support/YAMLTraits.h"
+#include "llvm/Support/Errc.h"
+#include "llvm/Support/FileSystem.h"
+
+using namespace llvm;
+
+namespace io {
+template<>
+void SequencingLibrary<debruijn_graph::config::DataSetData>::yamlize(llvm::yaml::IO &io) {
+ // First, load the "common stuff"
+ SequencingLibraryBase::yamlize(io);
+
+ // Now load the remaining stuff
+ io.mapOptional("read length" , data_.read_length);
+ io.mapOptional("average read length" , data_.avg_read_length);
+ io.mapOptional("insert size mean" , data_.mean_insert_size);
+ io.mapOptional("insert size deviation" , data_.insert_size_deviation);
+ io.mapOptional("insert size left quantile" , data_.insert_size_left_quantile);
+ io.mapOptional("insert size right quantile" , data_.insert_size_right_quantile);
+ io.mapOptional("insert size median" , data_.median_insert_size);
+ io.mapOptional("insert size mad" , data_.insert_size_mad);
+ io.mapOptional("insert size distribution" , data_.insert_size_distribution);
+ io.mapOptional("average coverage" , data_.average_coverage);
+ io.mapOptional("pi threshold" , data_.pi_threshold);
+ io.mapOptional("binary converted" , data_.binary_coverted);
+ io.mapOptional("single reads mapped" , data_.single_reads_mapped);
+}
+
+template<>
+void SequencingLibrary<debruijn_graph::config::DataSetData>::validate(llvm::yaml::IO &io, llvm::StringRef &res) {
+ // Simply ask base class to validate for us
+ SequencingLibraryBase::validate(io, res);
+}
+}
+
+#include "pipeline/library.inl"
+
+template class io::DataSet<debruijn_graph::config::DataSetData>;
+
+namespace debruijn_graph {
+namespace config {
+
+template<typename mode_t>
+vector<string> CheckedNames(const vector<pair<string, mode_t>>& mapping, mode_t total) {
+ VERIFY_MSG(size_t(total) == mapping.size(), "Names for some modes missing");
+ vector<string> answer;
+ for (size_t i = 0; i < mapping.size(); ++i) {
+ VERIFY_MSG(size_t(mapping[i].second) == i, "Id/name mapping error");
+ answer.push_back(mapping[i].first);
+ }
+ return answer;
+}
+
+vector<string> InfoPrinterPosNames() {
+ return CheckedNames<info_printer_pos>({
+ {"default", info_printer_pos::default_pos},
+ {"before_first_gap_closer", info_printer_pos::before_first_gap_closer},
+ {"before_simplification", info_printer_pos::before_simplification},
+ {"before_post_simplification", info_printer_pos::before_post_simplification},
+ {"final_simplified", info_printer_pos::final_simplified},
+ {"final_gap_closed", info_printer_pos::final_gap_closed},
+ {"before_repeat_resolution", info_printer_pos::before_repeat_resolution}}, info_printer_pos::total);
+}
+
+vector<string> PipelineTypeNames() {
+ return CheckedNames<pipeline_type>({
+ {"base", pipeline_type::base},
+ {"isolate", pipeline_type::isolate},
+ {"mda", pipeline_type::mda},
+ {"meta", pipeline_type::meta},
+ {"moleculo", pipeline_type::moleculo},
+ {"diploid", pipeline_type::diploid},
+ {"rna", pipeline_type::rna},
+ {"plasmid", pipeline_type::plasmid}}, pipeline_type::total);
+}
+
+vector<string> ConstructionModeNames() {
+ return CheckedNames<construction_mode>({
+ {"old", construction_mode::old},
+ {"extension", construction_mode::extention}}, construction_mode::total);
+}
+
+vector<string> EstimationModeNames() {
+ return CheckedNames<estimation_mode>({
+ {"simple", estimation_mode::simple},
+ {"weighted", estimation_mode::weighted},
+ {"smoothing", estimation_mode::smoothing}}, estimation_mode::total);
+}
+
+
+vector<string> ResolveModeNames() {
+ return CheckedNames<resolving_mode>({
+ {"none", resolving_mode::none},
+ {"path_extend", resolving_mode::path_extend}}, resolving_mode::total);
+}
+
+vector<string> SingleReadResolveModeNames() {
+ return CheckedNames<single_read_resolving_mode>({
+ {"none", single_read_resolving_mode::none},
+ {"only_single_libs", single_read_resolving_mode::only_single_libs},
+ {"all", single_read_resolving_mode::all}}, single_read_resolving_mode::total);
+}
+
+void load_lib_data(const std::string& prefix) {
+ // First, load the data into separate libs
+ cfg::get_writable().ds.reads.load(prefix + ".lib_data");
+
+ // Now, infer the common parameters
+ size_t max_rl = 0;
+ double avg_cov = 0.0;
+ double avg_rl = 0.0;
+ for (const auto& lib : cfg::get().ds.reads.libraries()) {
+ auto const& data = lib.data();
+ if (lib.is_graph_contructable())
+ max_rl = std::max(max_rl, data.read_length);
+ if (data.average_coverage > 0)
+ avg_cov = data.average_coverage;
+ if (data.avg_read_length > 0)
+ avg_rl = data.avg_read_length;
+ }
+
+ cfg::get_writable().ds.set_RL(max_rl);
+ cfg::get_writable().ds.set_aRL(avg_rl);
+ cfg::get_writable().ds.set_avg_coverage(avg_cov);
+}
+
+void write_lib_data(const std::string& prefix) {
+ cfg::get_writable().ds.reads.save(prefix + ".lib_data");
+}
+
+void load(debruijn_config::simplification::tip_clipper &tc,
+ boost::property_tree::ptree const &pt, bool /*complete*/) {
+ using config_common::load;
+ load(tc.condition, pt, "condition");
+}
+
+void load(resolving_mode &rm, boost::property_tree::ptree const &pt,
+ std::string const &key, bool complete) {
+ if (complete || pt.find(key) != pt.not_found()) {
+ rm = ModeByName<resolving_mode>(pt.get<std::string>(key), ResolveModeNames());
+ }
+}
+
+void load(single_read_resolving_mode &rm, boost::property_tree::ptree const &pt,
+ std::string const &key, bool complete) {
+ if (complete || pt.find(key) != pt.not_found()) {
+ std::string ep = pt.get<std::string>(key);
+ rm = ModeByName<single_read_resolving_mode>(ep, SingleReadResolveModeNames());
+ }
+}
+
+void load(construction_mode& con_mode,
+ boost::property_tree::ptree const& pt, std::string const& key,
+ bool complete) {
+ if (complete || pt.find(key) != pt.not_found()) {
+ con_mode = ModeByName<construction_mode>(pt.get<std::string>(key), ConstructionModeNames());
+ }
+}
+
+void load(debruijn_config::construction::early_tip_clipper& etc,
+ boost::property_tree::ptree const& pt, bool /*complete*/) {
+ using config_common::load;
+ load(etc.enable, pt, "enable");
+ etc.length_bound = pt.get_optional<size_t>("length_bound");
+}
+
+void load(debruijn_config::construction& con,
+ boost::property_tree::ptree const& pt, bool complete) {
+ using config_common::load;
+ load(con.con_mode, pt, "mode", complete);
+ load(con.keep_perfect_loops, pt, "keep_perfect_loops", complete);
+ load(con.read_buffer_size, pt, "read_buffer_size", complete);
+ con.read_buffer_size *= 1024 * 1024;
+ load(con.early_tc, pt, "early_tip_clipper", complete);
+}
+
+void load(debruijn_config::sensitive_mapper& sensitive_map,
+ boost::property_tree::ptree const& pt, bool complete) {
+ using config_common::load;
+ load(sensitive_map.k, pt, "k", complete);
+}
+
+void load(estimation_mode &est_mode,
+ boost::property_tree::ptree const &pt, std::string const &key,
+ bool complete) {
+ if (complete || pt.find(key) != pt.not_found()) {
+ est_mode = ModeByName<estimation_mode>(pt.get<std::string>(key), EstimationModeNames());
+ }
+}
+
+void load(debruijn_config::simplification::bulge_remover& br,
+ boost::property_tree::ptree const& pt, bool complete) {
+ using config_common::load;
+
+ load(br.enabled , pt, "enabled" , complete);
+ load(br.main_iteration_only , pt, "main_iteration_only" , complete);
+ load(br.max_bulge_length_coefficient , pt, "max_bulge_length_coefficient", complete);
+ load(br.max_additive_length_coefficient , pt,
+ "max_additive_length_coefficient", complete);
+ load(br.max_coverage, pt, "max_coverage", complete);
+ load(br.max_relative_coverage, pt, "max_relative_coverage", complete);
+ load(br.max_delta, pt, "max_delta", complete);
+ load(br.max_relative_delta, pt, "max_relative_delta", complete);
+ load(br.max_number_edges, pt, "max_number_edges", complete);
+ load(br.parallel, pt, "parallel", complete);
+ load(br.buff_size, pt, "buff_size", complete);
+ load(br.buff_cov_diff, pt, "buff_cov_diff", complete);
+ load(br.buff_cov_rel_diff, pt, "buff_cov_rel_diff", complete);
+}
+
+void load(debruijn_config::simplification::topology_tip_clipper& ttc,
+ boost::property_tree::ptree const& pt, bool /*complete*/) {
+ using config_common::load;
+ load(ttc.length_coeff, pt, "length_coeff");
+ load(ttc.plausibility_length, pt, "plausibility_length");
+ load(ttc.uniqueness_length, pt, "uniqueness_length");
+}
+
+void load(debruijn_config::simplification::complex_tip_clipper &ctc,
+ boost::property_tree::ptree const &pt, bool complete) {
+ using config_common::load;
+ load(ctc.enabled, pt, "enabled", complete);
+ load(ctc.max_relative_coverage, pt, "max_relative_coverage", complete);
+ load(ctc.max_edge_len, pt, "max_edge_len", complete);
+ load(ctc.condition, pt, "condition", complete);
+}
+
+void load(debruijn_config::simplification::relative_coverage_edge_disconnector& relative_ed,
+ boost::property_tree::ptree const& pt, bool complete) {
+ using config_common::load;
+ load(relative_ed.enabled, pt, "enabled", complete);
+ load(relative_ed.diff_mult, pt, "diff_mult", complete);
+}
+
+void load(debruijn_config::simplification::relative_coverage_comp_remover& rcc,
+ boost::property_tree::ptree const& pt, bool complete) {
+ using config_common::load;
+ load(rcc.enabled, pt, "enabled", complete);
+ load(rcc.coverage_gap, pt, "coverage_gap", complete);
+ load(rcc.length_coeff, pt, "max_length_coeff", complete);
+ load(rcc.tip_allowing_length_coeff, pt, "max_length_with_tips_coeff", complete);
+ load(rcc.vertex_count_limit, pt, "max_vertex_cnt", complete);
+ load(rcc.max_ec_length_coefficient, pt, "max_ec_length_coefficient", complete);
+ load(rcc.max_coverage_coeff, pt, "max_coverage_coeff", complete);
+}
+
+void load(debruijn_config::simplification::isolated_edges_remover& ier,
+ boost::property_tree::ptree const& pt, bool complete) {
+ using config_common::load;
+ load(ier.enabled, pt, "enabled", complete);
+ load(ier.max_length, pt, "max_length", complete);
+ load(ier.max_coverage, pt, "max_coverage", complete);
+ load(ier.max_length_any_cov, pt, "max_length_any_cov", complete);
+}
+
+void load(debruijn_config::simplification::init_cleaning& init_clean,
+ boost::property_tree::ptree const& pt, bool complete) {
+ using config_common::load;
+ load(init_clean.self_conj_condition, pt, "self_conj_condition", complete);
+ load(init_clean.early_it_only, pt, "early_it_only", complete);
+ load(init_clean.activation_cov, pt, "activation_cov", complete);
+ load(init_clean.ier, pt, "ier", complete);
+ load(init_clean.tip_condition, pt, "tip_condition", complete);
+ load(init_clean.ec_condition, pt, "ec_condition", complete);
+ load(init_clean.disconnect_flank_cov, pt, "disconnect_flank_cov", complete);
+}
+
+void load(debruijn_config::simplification::complex_bulge_remover& cbr,
+ boost::property_tree::ptree const& pt, bool complete) {
+ using config_common::load;
+
+ load(cbr.enabled, pt, "enabled");
+ load(cbr.max_relative_length, pt, "max_relative_length", complete);
+ load(cbr.max_length_difference, pt, "max_length_difference", complete);
+}
+
+void load(debruijn_config::simplification::erroneous_connections_remover& ec,
+ boost::property_tree::ptree const& pt, bool /*complete*/) {
+ using config_common::load;
+
+ load(ec.condition, pt, "condition");
+}
+
+void load(debruijn_config::simplification::topology_based_ec_remover& tec,
+ boost::property_tree::ptree const& pt, bool /*complete*/) {
+ using config_common::load;
+
+ load(tec.max_ec_length_coefficient, pt, "max_ec_length_coefficient");
+ load(tec.plausibility_length, pt, "plausibility_length");
+ load(tec.uniqueness_length, pt, "uniqueness_length");
+}
+
+void load(debruijn_config::simplification::interstrand_ec_remover &isec,
+ boost::property_tree::ptree const &pt, bool /*complete*/) {
+ using config_common::load;
+ load(isec.max_ec_length_coefficient, pt, "max_ec_length_coefficient");
+ load(isec.uniqueness_length, pt, "uniqueness_length");
+ load(isec.span_distance, pt, "span_distance");
+}
+
+void load(debruijn_config::simplification::tr_based_ec_remover &trec,
+ boost::property_tree::ptree const &pt, bool /*complete*/) {
+ using config_common::load;
+ load(trec.max_ec_length_coefficient, pt, "max_ec_length_coefficient");
+ load(trec.unreliable_coverage, pt, "unreliable_coverage");
+ load(trec.uniqueness_length, pt, "uniqueness_length");
+}
+
+void load(debruijn_config::simplification::max_flow_ec_remover& mfec,
+ boost::property_tree::ptree const& pt, bool /*complete*/) {
+ using config_common::load;
+
+ load(mfec.enabled, pt, "enabled");
+ load(mfec.max_ec_length_coefficient, pt, "max_ec_length_coefficient");
+ load(mfec.plausibility_length, pt, "plausibility_length");
+ load(mfec.uniqueness_length, pt, "uniqueness_length");
+}
+
+void load(debruijn_config::simplification::hidden_ec_remover& her,
+ boost::property_tree::ptree const& pt, bool /*complete*/) {
+ using config_common::load;
+
+ load(her.enabled, pt, "enabled");
+ load(her.uniqueness_length, pt, "uniqueness_length");
+ load(her.unreliability_threshold, pt, "unreliability_threshold");
+ load(her.relative_threshold, pt, "relative_threshold");
+}
+
+void load(debruijn_config::distance_estimator& de,
+ boost::property_tree::ptree const& pt, bool /*complete*/) {
+ using config_common::load;
+
+ load(de.linkage_distance_coeff, pt, "linkage_distance_coeff");
+ load(de.max_distance_coeff, pt, "max_distance_coeff");
+ load(de.max_distance_coeff_scaff, pt, "max_distance_coeff_scaff");
+ load(de.filter_threshold, pt, "filter_threshold");
+}
+
+void load(debruijn_config::smoothing_distance_estimator& ade,
+ boost::property_tree::ptree const& pt, bool /*complete*/) {
+ using config_common::load;
+
+ load(ade.threshold, pt, "threshold");
+ load(ade.range_coeff, pt, "range_coeff");
+ load(ade.delta_coeff, pt, "delta_coeff");
+ load(ade.percentage, pt, "percentage");
+ load(ade.cutoff, pt, "cutoff");
+ load(ade.min_peak_points, pt, "min_peak_points");
+ load(ade.inv_density, pt, "inv_density");
+ load(ade.derivative_threshold, pt, "derivative_threshold");
+}
+
+//FIXME make amb_de optional field
+inline void load(debruijn_config::ambiguous_distance_estimator &amde,
+ boost::property_tree::ptree const &pt, bool complete) {
+ using config_common::load;
+
+ load(amde.enabled, pt, "enabled", complete);
+ load(amde.haplom_threshold, pt, "haplom_threshold", complete);
+ load(amde.relative_length_threshold, pt, "relative_length_threshold", complete);
+ load(amde.relative_seq_threshold, pt, "relative_seq_threshold", complete);
+}
+
+void load(debruijn_config::scaffold_correction& sc_corr,
+ boost::property_tree::ptree const& pt, bool /*complete*/) {
+ using config_common::load;
+ load(sc_corr.scaffolds_file, pt, "scaffolds_file");
+ load(sc_corr.output_unfilled, pt, "output_unfilled");
+ load(sc_corr.max_insert, pt, "max_insert");
+ load(sc_corr.max_cut_length, pt, "max_cut_length");
+}
+
+void load(debruijn_config::truseq_analysis& tsa,
+ boost::property_tree::ptree const& pt, bool /*complete*/) {
+ using config_common::load;
+ load(tsa.scaffolds_file, pt, "scaffolds_file");
+ load(tsa.genome_file, pt, "genome_file");
+}
+
+void load(debruijn_config::bwa_aligner& bwa,
+ boost::property_tree::ptree const& pt, bool /*complete*/) {
+ using config_common::load;
+ load(bwa.bwa_enable, pt, "bwa_enable");
+ load(bwa.debug, pt, "debug");
+ load(bwa.path_to_bwa, pt, "path_to_bwa");
+ load(bwa.min_contig_len, pt, "min_contig_len");
+}
+
+void load(debruijn_config::pacbio_processor& pb,
+ boost::property_tree::ptree const& pt, bool /*complete*/) {
+ using config_common::load;
+ load(pb.pacbio_k, pt, "pacbio_k");
+ load(pb.additional_debug_info, pt, "additional_debug_info");
+ load(pb.compression_cutoff, pt, "compression_cutoff");
+ load(pb.domination_cutoff, pt, "domination_cutoff");
+ load(pb.path_limit_stretching, pt, "path_limit_stretching");
+ load(pb.path_limit_pressing, pt, "path_limit_pressing");
+ load(pb.ignore_middle_alignment, pt, "ignore_middle_alignment");
+ load(pb.long_seq_limit, pt, "long_seq_limit");
+ load(pb.pacbio_min_gap_quantity, pt, "pacbio_min_gap_quantity");
+ load(pb.contigs_min_gap_quantity, pt, "contigs_min_gap_quantity");
+ load(pb.max_contigs_gap_length, pt, "max_contigs_gap_length");
+
+}
+
+
+void load(debruijn_config::position_handler& pos,
+ boost::property_tree::ptree const& pt, bool /*complete*/) {
+ using config_common::load;
+ load(pos.max_mapping_gap, pt, "max_mapping_gap");
+ load(pos.max_gap_diff, pt, "max_gap_diff");
+ load(pos.contigs_for_threading, pt, "contigs_for_threading");
+ load(pos.contigs_to_analyze, pt, "contigs_to_analyze");
+ load(pos.late_threading, pt, "late_threading");
+ load(pos.careful_labeling, pt, "careful_labeling");
+}
+void load(debruijn_config::plasmid& pd,
+ boost::property_tree::ptree const& pt, bool /*complete*/) {
+ using config_common::load;
+ load(pd.long_edge_length, pt, "long_edge_length");
+ load(pd.edge_length_for_median, pt, "edge_length_for_median");
+
+ load(pd.relative_coverage, pt, "relative_coverage");
+ load(pd.small_component_size, pt, "small_component_size");
+ load(pd.small_component_relative_coverage, pt, "small_component_relative_coverage");
+ load(pd.min_component_length, pt, "min_component_length");
+ load(pd.min_isolated_length, pt, "min_isolated_length");
+
+}
+
+
+void load(debruijn_config::gap_closer& gc,
+ boost::property_tree::ptree const& pt, bool /*complete*/) {
+ using config_common::load;
+ load(gc.minimal_intersection, pt, "minimal_intersection");
+ load(gc.before_simplify, pt, "before_simplify");
+ load(gc.in_simplify, pt, "in_simplify");
+ load(gc.after_simplify, pt, "after_simplify");
+ load(gc.weight_threshold, pt, "weight_threshold");
+}
+
+void load(debruijn_config::graph_read_corr_cfg& graph_read_corr,
+ boost::property_tree::ptree const& pt, bool /*complete*/) {
+ using config_common::load;
+ load(graph_read_corr.enable, pt, "enable");
+ load(graph_read_corr.output_dir, pt, "output_dir");
+ load(graph_read_corr.binary, pt, "binary");
+}
+
+void load(debruijn_config::kmer_coverage_model& kcm,
+ boost::property_tree::ptree const& pt, bool /*complete*/) {
+ using config_common::load;
+ load(kcm.probability_threshold, pt, "probability_threshold");
+ load(kcm.strong_probability_threshold, pt, "strong_probability_threshold");
+ load(kcm.coverage_threshold, pt, "coverage_threshold");
+ load(kcm.use_coverage_threshold, pt, "use_coverage_threshold");
+}
+
+void load(dataset &ds,
+ boost::property_tree::ptree const &pt, bool /*complete*/) {
+ using config_common::load;
+
+ load(ds.reads_filename, pt, "reads");
+
+ ds.reference_genome_filename = "";
+ boost::optional<std::string> refgen =
+ pt.get_optional<std::string>("reference_genome");
+ if (refgen && *refgen != "N/A") {
+ ds.reference_genome_filename = *refgen;
+ }
+}
+
+void load_reads(dataset &ds,
+ std::string input_dir) {
+ if (ds.reads_filename[0] != '/')
+ ds.reads_filename = input_dir + ds.reads_filename;
+ path::CheckFileExistenceFATAL(ds.reads_filename);
+ ds.reads.load(ds.reads_filename);
+}
+
+void load_reference_genome(dataset &ds,
+ std::string input_dir) {
+ if (ds.reference_genome_filename == "") {
+ ds.reference_genome = "";
+ return;
+ }
+ if (ds.reference_genome_filename[0] != '/')
+ ds.reference_genome_filename = input_dir + ds.reference_genome_filename;
+ path::CheckFileExistenceFATAL(ds.reference_genome_filename);
+ io::FileReadStream genome_stream(ds.reference_genome_filename);
+ io::SingleRead genome;
+ genome_stream >> genome;
+ ds.reference_genome = genome.GetSequenceString();
+}
+
+void load(debruijn_config::simplification& simp,
+ boost::property_tree::ptree const& pt, bool complete) {
+ using config_common::load;
+
+ load(simp.cycle_iter_count, pt, "cycle_iter_count", complete);
+ load(simp.post_simplif_enabled, pt, "post_simplif_enabled", complete);
+ load(simp.topology_simplif_enabled, pt, "topology_simplif_enabled", complete);
+ load(simp.tc, pt, "tc", complete); // tip clipper:
+ load(simp.ttc, pt, "ttc", complete); // topology tip clipper:
+ load(simp.complex_tc, pt, "complex_tc", complete); // complex tip clipper:
+ load(simp.br, pt, "br", complete); // bulge remover:
+ load(simp.ec, pt, "ec", complete); // erroneous connections remover:
+ load(simp.rcc, pt, "rcc", complete); // relative coverage component remover:
+ load(simp.relative_ed, pt, "relative_ed", complete); // relative edge disconnector:
+ load(simp.tec, pt, "tec", complete); // topology aware erroneous connections remover:
+ load(simp.trec, pt, "trec", complete); // topology and reliability based erroneous connections remover:
+ load(simp.isec, pt, "isec", complete); // interstrand erroneous connections remover (thorn remover):
+ load(simp.mfec, pt, "mfec", complete); // max flow erroneous connections remover:
+ load(simp.ier, pt, "ier", complete); // isolated edges remover
+ load(simp.cbr, pt, "cbr", complete); // complex bulge remover
+ load(simp.her, pt, "her", complete); // hidden ec remover
+ load(simp.init_clean, pt, "init_clean", complete); // presimplification
+ load(simp.final_tc, pt, "final_tc", complete);
+ load(simp.final_br, pt, "final_br", complete);
+ simp.second_final_br = simp.final_br;
+ load(simp.second_final_br, pt, "second_final_br", false);
+}
+
+void load(debruijn_config::info_printer& printer,
+ boost::property_tree::ptree const& pt, bool complete) {
+ using config_common::load;
+ load(printer.basic_stats, pt, "basic_stats", complete);
+ load(printer.lib_info, pt, "lib_info", complete);
+ load(printer.extended_stats, pt, "extended_stats", complete);
+ load(printer.write_components, pt, "write_components", complete);
+ load(printer.components_for_kmer, pt, "components_for_kmer", complete);
+ load(printer.components_for_genome_pos, pt, "components_for_genome_pos",
+ complete);
+ load(printer.write_components_along_genome, pt,
+ "write_components_along_genome", complete);
+ load(printer.write_components_along_contigs, pt,
+ "write_components_along_contigs", complete);
+ load(printer.save_full_graph, pt, "save_full_graph", complete);
+ load(printer.write_full_graph, pt, "write_full_graph", complete);
+ load(printer.write_full_nc_graph, pt, "write_full_nc_graph", complete);
+ load(printer.write_error_loc, pt, "write_error_loc", complete);
+}
+
+//void clear(debruijn_config::info_printer& printer) {
+// printer.print_stats = false;
+// printer.write_components = false;
+// printer.components_for_kmer = "";
+// printer.components_for_genome_pos = "";
+// printer.write_components_along_genome = false;
+// printer.save_full_graph = false;
+// printer.write_full_graph = false;
+// printer.write_full_nc_graph = false;
+// printer.write_error_loc = false;
+//}
+
+void load(debruijn_config::info_printers_t &printers,
+ boost::property_tree::ptree const &pt, bool /*complete*/) {
+ using config_common::load;
+
+ debruijn_config::info_printer def;
+ load(def, pt, ModeName(info_printer_pos::default_pos, InfoPrinterPosNames()), true);
+
+ for (size_t pos = size_t(info_printer_pos::default_pos) + 1; pos != size_t(info_printer_pos::total); ++pos) {
+ debruijn_config::info_printer printer(def);
+ load(printer, pt, ModeName(pos, InfoPrinterPosNames()), false);
+
+ printers[info_printer_pos(pos)] = printer;
+ }
+}
+
+void load_launch_info(debruijn_config &cfg, boost::property_tree::ptree const &pt) {
+ using config_common::load;
+ load(cfg.K, pt, "K");
+ // input options:
+ load(cfg.dataset_file, pt, "dataset");
+ // input dir is based on dataset file location (all paths in datasets are relative to its location)
+ cfg.input_dir = path::parent_path(cfg.dataset_file);
+ if (cfg.input_dir[cfg.input_dir.length() - 1] != '/')
+ cfg.input_dir += '/';
+
+ load(cfg.output_base, pt, "output_base");
+ if (cfg.output_base[cfg.output_base.length() - 1] != '/')
+ cfg.output_base += '/';
+
+ load(cfg.log_filename, pt, "log_filename");
+
+ load(cfg.developer_mode, pt, "developer_mode");
+ if (cfg.developer_mode) {
+ load(cfg.output_pictures, pt, "output_pictures");
+ load(cfg.output_nonfinal_contigs, pt, "output_nonfinal_contigs");
+ load(cfg.compute_paths_number, pt, "compute_paths_number");
+ } else {
+ cfg.output_pictures = false;
+ cfg.output_nonfinal_contigs = false;
+ cfg.compute_paths_number = false;
+ }
+
+ load(cfg.load_from, pt, "load_from");
+ if (cfg.load_from[0] != '/') { // relative path
+ cfg.load_from = cfg.output_dir + cfg.load_from;
+ }
+
+ load(cfg.tmp_dir, pt, "tmp_dir");
+ load(cfg.main_iteration, pt, "main_iteration");
+
+ load(cfg.entry_point, pt, "entry_point");
+
+ load(cfg.use_additional_contigs, pt, "use_additional_contigs");
+ load(cfg.additional_contigs, pt, "additional_contigs");
+ load(cfg.rr_enable, pt, "rr_enable");
+
+ load(cfg.buffer_size, pt, "buffer_size");
+ cfg.buffer_size <<= 20; //turn MB to bytes
+
+ load(cfg.temp_bin_reads_dir, pt, "temp_bin_reads_dir");
+ if (cfg.temp_bin_reads_dir[cfg.temp_bin_reads_dir.length() - 1] != '/')
+ cfg.temp_bin_reads_dir += '/';
+
+ load(cfg.max_threads, pt, "max_threads");
+ // Fix number of threads according to OMP capabilities.
+ cfg.max_threads = std::min(cfg.max_threads, (size_t) omp_get_max_threads());
+ // Inform OpenMP runtime about this :)
+ omp_set_num_threads((int) cfg.max_threads);
+
+ load(cfg.max_memory, pt, "max_memory");
+
+ path::CheckFileExistenceFATAL(cfg.dataset_file);
+ boost::property_tree::ptree ds_pt;
+ boost::property_tree::read_info(cfg.dataset_file, ds_pt);
+ load(cfg.ds, ds_pt, true);
+ load_reads(cfg.ds, cfg.input_dir);
+ load_reference_genome(cfg.ds, cfg.input_dir);
+}
+
+// main debruijn config load function
+void load_cfg(debruijn_config &cfg, boost::property_tree::ptree const &pt,
+ bool complete) {
+ using config_common::load;
+
+ string mode_str = pt.get("mode", "");
+ if (!mode_str.empty()) {
+ cfg.mode = ModeByName<pipeline_type>(mode_str, PipelineTypeNames());
+ }
+
+ //FIXME
+ load(cfg.tsa, pt, "tsa", complete);
+
+ load(cfg.use_unipaths, pt, "use_unipaths", complete);
+
+ load(cfg.pb, pt, "pacbio_processor", complete);
+
+ load(cfg.two_step_rr, pt, "two_step_rr", complete);
+ load(cfg.use_intermediate_contigs, pt, "use_intermediate_contigs", complete);
+ load(cfg.single_reads_rr, pt, "single_reads_rr", complete);
+
+ load(cfg.correct_mismatches, pt, "correct_mismatches", complete);
+ load(cfg.paired_info_statistics, pt, "paired_info_statistics", complete);
+ load(cfg.paired_info_scaffolder, pt, "paired_info_scaffolder", complete);
+ load(cfg.gap_closer_enable, pt, "gap_closer_enable", complete);
+
+ load(cfg.max_repeat_length, pt, "max_repeat_length", complete);
+
+ load(cfg.est_mode, pt, "estimation_mode", complete);
+ load(cfg.de, pt, "de", complete);
+ load(cfg.ade, pt, "ade", complete); // advanced distance estimator:
+ load(cfg.amb_de, pt, "amb_de", complete);
+
+ load(cfg.con, pt, "construction", complete);
+ load(cfg.gc, pt, "gap_closer", complete);
+ load(cfg.simp, pt, "simp", complete);
+ load(cfg.flanking_range, pt, "flanking_range", complete);
+ load(cfg.graph_read_corr, pt, "graph_read_corr", complete);
+ load(cfg.kcm, pt, "kmer_coverage_model", complete);
+ load(cfg.pos, pt, "pos", complete); // position handler:
+
+ load(cfg.rm, pt, "resolving_mode", complete);
+ load(cfg.pe_params, pt, "pe", complete);
+
+ load(cfg.use_scaffolder, pt, "use_scaffolder", complete);
+ load(cfg.avoid_rc_connections, pt, "avoid_rc_connections", complete);
+
+ load(cfg.sensitive_map, pt, "sensitive_mapper", complete);
+
+ load(cfg.info_printers, pt, "info_printers", complete);
+
+ load(cfg.bwa, pt, "bwa_aligner", complete);
+
+ if (pt.count("plasmid")) {
+ VERIFY_MSG(!cfg.pd, "Option can be loaded only once");
+ cfg.pd.reset(debruijn_config::plasmid());
+ load(*cfg.pd, pt, "plasmid");
+ }
+
+ if (pt.count("sc_cor")) {
+ VERIFY_MSG(!cfg.sc_cor, "Option sc_cor can be loaded only once");
+ cfg.sc_cor.reset(debruijn_config::scaffold_correction());
+ load(*cfg.sc_cor, pt, "sc_cor");
+ }
+
+ if (pt.count("preliminary_simp")) {
+ VERIFY_MSG(!cfg.preliminary_simp, "Option preliminary can be loaded only once");
+ cfg.preliminary_simp.reset(cfg.simp);
+ load(*cfg.preliminary_simp, pt, "preliminary_simp", false);
+ }
+ if (pt.count("prelim_pe")) {
+ VERIFY_MSG(!cfg.prelim_pe_params, "Option prelim_pe can be loaded only once");
+ cfg.prelim_pe_params.reset(cfg.pe_params);
+ load(*cfg.prelim_pe_params, pt, "prelim_pe", false);
+ }
+}
+
+void load(debruijn_config &cfg, const std::string &cfg_fns) {
+ load(cfg, std::vector<std::string>({ cfg_fns }));
+}
+
+void load(debruijn_config &cfg, const std::vector<std::string> &cfg_fns) {
+ VERIFY_MSG(cfg_fns.size() > 0, "Should provide at least one config file");
+ boost::property_tree::ptree base_pt;
+ boost::property_tree::read_info(cfg_fns[0], base_pt);
+
+ load_launch_info(cfg, base_pt);
+ load_cfg(cfg, base_pt, true);
+
+ for (size_t i = 1 ; i < cfg_fns.size(); ++i) {
+ boost::property_tree::ptree pt;
+ boost::property_tree::read_info(cfg_fns[i], pt);
+
+ //FIXME add logging of loading configs
+ load_cfg(cfg, pt, false);
+ }
+
+ //some post-loading processing
+ using config::pipeline_type;
+ cfg.uneven_depth = set<pipeline_type>{pipeline_type::mda, pipeline_type::rna, pipeline_type::meta}.count(cfg.mode);
+ if (!cfg.developer_mode) {
+ cfg.pe_params.debug_output = false;
+ cfg.pe_params.viz.DisableAll();
+ cfg.pe_params.output.DisableAll();
+ }
+
+ if (!cfg.use_scaffolder) {
+ cfg.pe_params.param_set.scaffolder_options.on = false;
+ }
+ cfg.need_mapping = cfg.developer_mode || cfg.correct_mismatches
+ || cfg.gap_closer_enable || cfg.rr_enable;
+
+ cfg.output_dir = cfg.output_base + "/K" + ToString(cfg.K) + "/";
+
+ cfg.output_saves = cfg.output_dir + "saves/";
+
+ if (cfg.tmp_dir[0] != '/') { // relative path
+ cfg.tmp_dir = cfg.output_dir + cfg.tmp_dir;
+ }
+
+ cfg.temp_bin_reads_path =
+ cfg.project_name.empty() ?
+ (cfg.output_base + "/" + cfg.temp_bin_reads_dir) :
+ (cfg.output_base + cfg.project_name + "/"
+ + cfg.temp_bin_reads_dir);
+ cfg.temp_bin_reads_info = cfg.temp_bin_reads_path + "INFO";
+
+ cfg.paired_read_prefix = cfg.temp_bin_reads_path + "_paired";
+ cfg.single_read_prefix = cfg.temp_bin_reads_path + "_single";
+}
+
+}
+}
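A hypothetical call site for the layered loading above: the first file is read as the complete base config, and every following file only overrides the keys it actually contains (complete == false). It assumes load(debruijn_config&, const std::vector<std::string>&) is declared in config_struct.hpp; the file paths are just examples:

#include <string>
#include <vector>

#include "pipeline/config_struct.hpp"

// Load a base config completely, then apply a mode-specific override file.
inline debruijn_graph::config::debruijn_config LoadLayeredConfig() {
    debruijn_graph::config::debruijn_config cfg;
    std::vector<std::string> files = {"configs/debruijn/config.info",
                                      "configs/debruijn/mda_mode.info"};
    debruijn_graph::config::load(cfg, files);
    return cfg;
}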
diff --git a/src/modules/pipeline/config_struct.hpp b/src/modules/pipeline/config_struct.hpp
new file mode 100644
index 0000000..b1cce24
--- /dev/null
+++ b/src/modules/pipeline/config_struct.hpp
@@ -0,0 +1,561 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+#pragma once
+
+#include "pipeline/config_singl.hpp"
+#include "algorithms/path_extend/pe_config_struct.hpp"
+#include "pipeline/library.hpp"
+
+#include <boost/optional.hpp>
+#include "math/xmath.h"
+
+namespace debruijn_graph {
+namespace config {
+
+enum class info_printer_pos : char {
+ default_pos = 0,
+ before_first_gap_closer,
+ before_simplification,
+ before_post_simplification,
+ final_simplified,
+ final_gap_closed,
+ before_repeat_resolution,
+
+ total
+};
+
+std::vector<std::string> InfoPrinterPosNames();
+
+enum class pipeline_type : char {
+ base = 0,
+ isolate,
+ mda,
+ meta,
+ moleculo,
+ diploid,
+ rna,
+ plasmid,
+
+ total
+};
+
+std::vector<std::string> PipelineTypeNames();
+
+enum class construction_mode : char {
+ old = 0,
+ extention,
+
+ total
+};
+
+std::vector<std::string> ConstructionModeNames();
+
+enum class estimation_mode : char {
+ simple = 0,
+ weighted,
+ smoothing,
+
+ total
+};
+
+std::vector<std::string> EstimationModeNames();
+
+enum class resolving_mode : char {
+ none = 0,
+ path_extend,
+
+ total
+};
+
+std::vector<std::string> ResolveModeNames();
+
+enum class single_read_resolving_mode : char {
+ none = 0,
+ only_single_libs,
+ all,
+
+ total
+};
+
+std::vector<std::string> SingleReadResolveModeNames();
+
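+// Helpers for translating between config-file mode names and the enums above,
+// e.g. (assuming the name lists use the enum spellings)
+//   ModeByName<pipeline_type>("meta", PipelineTypeNames()) == pipeline_type::meta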
+template<typename mode_t>
+mode_t ModeByName(const std::string& name, const std::vector<std::string>& names) {
+ auto it = std::find(names.begin(), names.end(), name);
+ VERIFY_MSG(it != names.end(), "Unrecognized mode name");
+ return mode_t(it - names.begin());
+}
+
+template<typename mode_t>
+std::string ModeName(const mode_t& mode, const std::vector<std::string>& names) {
+ VERIFY_MSG(size_t(mode) < names.size(), "Unrecognized mode id");
+ return names[size_t(mode)];
+}
+
+struct DataSetData {
+ size_t read_length;
+ double avg_read_length;
+ double mean_insert_size;
+ double insert_size_deviation;
+ double insert_size_left_quantile;
+ double insert_size_right_quantile;
+ double median_insert_size;
+ double insert_size_mad;
+ std::map<int, size_t> insert_size_distribution;
+
+ bool binary_coverted;
+ bool single_reads_mapped;
+
+ uint64_t total_nucls;
+ double average_coverage;
+ double pi_threshold;
+
+ std::string paired_read_prefix;
+ std::string single_read_prefix;
+ size_t thread_num;
+
+ DataSetData(): read_length(0), avg_read_length(0.0),
+ mean_insert_size(0.0),
+ insert_size_deviation(0.0),
+ insert_size_left_quantile(0.0),
+ insert_size_right_quantile(0.0),
+ median_insert_size(0.0),
+ insert_size_mad(0.0),
+ binary_coverted(false),
+ single_reads_mapped(false),
+ total_nucls(0),
+ average_coverage(0.0),
+ pi_threshold(0.0) {
+ }
+};
+
+struct dataset {
+ io::DataSet<DataSetData> reads;
+
+ size_t max_read_length;
+ double average_coverage;
+ double average_read_length;
+
+ size_t RL() const { return max_read_length; }
+ void set_RL(size_t RL) {
+ max_read_length = RL;
+ }
+
+ double aRL() const { return average_read_length; }
+ void set_aRL(double aRL) {
+ average_read_length = aRL;
+ for (size_t i = 0; i < reads.lib_count(); ++i) {
+ reads[i].data().avg_read_length = aRL;
+ }
+ }
+
+ double avg_coverage() const { return average_coverage; }
+ void set_avg_coverage(double avg_coverage) {
+ average_coverage = avg_coverage;
+ for (size_t i = 0; i < reads.lib_count(); ++i) {
+ reads[i].data().average_coverage = avg_coverage;
+ }
+ }
+
+ std::string reference_genome_filename;
+ std::string reads_filename;
+
+ std::string reference_genome;
+
+ dataset(): max_read_length(0), average_coverage(0.0) {
+ }
+};
+
+// struct for debruijn project's configuration file
+struct debruijn_config {
+
+ pipeline_type mode;
+ bool uneven_depth;
+
+ bool developer_mode;
+
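+ // Each nested struct below mirrors one section of the de Bruijn config files;
+ // the fields are filled by the corresponding load() routines (see the loader
+ // code earlier in this patch).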
+ struct simplification {
+ struct tip_clipper {
+ std::string condition;
+ tip_clipper() {}
+ tip_clipper(std::string condition_) : condition(condition_) {}
+ };
+
+ struct topology_tip_clipper {
+ double length_coeff;
+ size_t uniqueness_length;
+ size_t plausibility_length;
+ };
+
+ struct complex_tip_clipper {
+ bool enabled;
+ double max_relative_coverage;
+ size_t max_edge_len;
+ std::string condition;
+ };
+
+ struct bulge_remover {
+ bool enabled;
+ bool main_iteration_only;
+ double max_bulge_length_coefficient;
+ size_t max_additive_length_coefficient;
+ double max_coverage;
+ double max_relative_coverage;
+ size_t max_delta;
+ double max_relative_delta;
+ size_t max_number_edges;
+ bool parallel;
+ size_t buff_size;
+ double buff_cov_diff;
+ double buff_cov_rel_diff;
+ };
+
+ struct erroneous_connections_remover {
+ std::string condition;
+ erroneous_connections_remover() {}
+ erroneous_connections_remover(std::string condition_) : condition(condition_) {}
+ };
+
+ struct relative_coverage_ec_remover {
+ size_t max_ec_length_coefficient;
+ double max_coverage_coeff;
+ double coverage_gap;
+ };
+
+ struct topology_based_ec_remover {
+ size_t max_ec_length_coefficient;
+ size_t uniqueness_length;
+ size_t plausibility_length;
+ };
+
+ struct tr_based_ec_remover {
+ size_t max_ec_length_coefficient;
+ size_t uniqueness_length;
+ double unreliable_coverage;
+ };
+
+ struct interstrand_ec_remover {
+ size_t max_ec_length_coefficient;
+ size_t uniqueness_length;
+ size_t span_distance;
+ };
+
+ struct max_flow_ec_remover {
+ bool enabled;
+ double max_ec_length_coefficient;
+ size_t uniqueness_length;
+ size_t plausibility_length;
+ };
+
+ struct isolated_edges_remover {
+ bool enabled;
+ size_t max_length;
+ double max_coverage;
+ size_t max_length_any_cov;
+ };
+
+ struct complex_bulge_remover {
+ bool enabled;
+ double max_relative_length;
+ size_t max_length_difference;
+ };
+
+ struct hidden_ec_remover {
+ bool enabled;
+ size_t uniqueness_length;
+ double unreliability_threshold;
+ double relative_threshold;
+ };
+
+ struct relative_coverage_edge_disconnector {
+ bool enabled;
+ double diff_mult;
+ };
+
+ struct relative_coverage_comp_remover {
+ bool enabled;
+ double coverage_gap;
+ double length_coeff;
+ double tip_allowing_length_coeff;
+ size_t max_ec_length_coefficient;
+ double max_coverage_coeff;
+ size_t vertex_count_limit;
+ };
+
+ struct init_cleaning {
+ std::string self_conj_condition;
+
+ bool early_it_only;
+ double activation_cov;
+ isolated_edges_remover ier;
+ std::string tip_condition;
+ std::string ec_condition;
+ double disconnect_flank_cov;
+ };
+
+ size_t cycle_iter_count;
+ bool post_simplif_enabled;
+ bool topology_simplif_enabled;
+ tip_clipper tc;
+ complex_tip_clipper complex_tc;
+ topology_tip_clipper ttc;
+ bulge_remover br;
+ erroneous_connections_remover ec;
+ relative_coverage_comp_remover rcc;
+ relative_coverage_edge_disconnector relative_ed;
+ topology_based_ec_remover tec;
+ tr_based_ec_remover trec;
+ interstrand_ec_remover isec;
+ max_flow_ec_remover mfec;
+ isolated_edges_remover ier;
+ complex_bulge_remover cbr;
+ hidden_ec_remover her;
+
+ tip_clipper final_tc;
+ bulge_remover final_br;
+ bulge_remover second_final_br;
+
+ init_cleaning init_clean;
+ };
+
+ struct construction {
+ struct early_tip_clipper {
+ bool enable;
+ boost::optional<size_t> length_bound;
+ early_tip_clipper() : enable(false) {}
+ };
+
+ construction_mode con_mode;
+ early_tip_clipper early_tc;
+ bool keep_perfect_loops;
+ size_t read_buffer_size;
+ construction() :
+ con_mode(construction_mode::extention),
+ keep_perfect_loops(true),
+ read_buffer_size(0) {}
+ };
+
+ simplification simp;
+ boost::optional<simplification> preliminary_simp;
+
+ struct sensitive_mapper {
+ size_t k;
+ };
+
+ struct distance_estimator {
+ double linkage_distance_coeff;
+ double max_distance_coeff;
+ double max_distance_coeff_scaff;
+ double filter_threshold;
+ };
+
+ struct smoothing_distance_estimator {
+ size_t threshold;
+ double range_coeff;
+ double delta_coeff;
+ double percentage;
+ size_t cutoff;
+ size_t min_peak_points;
+ double inv_density;
+ double derivative_threshold;
+ };
+
+ struct ambiguous_distance_estimator {
+ bool enabled;
+ double haplom_threshold;
+ double relative_length_threshold;
+ double relative_seq_threshold;
+ };
+
+ struct plasmid {
+ size_t long_edge_length;
+ size_t edge_length_for_median;
+ double relative_coverage;
+ size_t small_component_size;
+ double small_component_relative_coverage;
+ size_t min_component_length;
+ size_t min_isolated_length;
+
+ };
+
+ struct pacbio_processor {
+ //align and traverse.
+ size_t pacbio_k; //13
+ bool additional_debug_info; //false
+ double compression_cutoff;// 0.6
+ double domination_cutoff; //1.5
+ double path_limit_stretching; //1.3
+ double path_limit_pressing;//0.7
+ bool ignore_middle_alignment; //true; false for stats and mate_pairs;
+ //gap_closer
+ size_t long_seq_limit; //400
+ size_t pacbio_min_gap_quantity; //2
+ size_t contigs_min_gap_quantity; //1
+ size_t max_contigs_gap_length; // 10000
+ };
+
+ struct position_handler {
+ size_t max_mapping_gap;
+ size_t max_gap_diff;
+ std::string contigs_for_threading;
+ std::string contigs_to_analyze;
+ bool late_threading;
+ bool careful_labeling;
+ };
+
+ struct gap_closer {
+ int minimal_intersection;
+ bool before_simplify;
+ bool in_simplify;
+ bool after_simplify;
+ double weight_threshold;
+ };
+
+ struct info_printer {
+ bool basic_stats;
+ bool lib_info;
+ bool extended_stats;
+ bool write_components;
+ std::string components_for_kmer;
+ std::string components_for_genome_pos;
+ bool write_components_along_genome;
+ bool write_components_along_contigs;
+ bool save_full_graph;
+ bool write_error_loc;
+ bool write_full_graph;
+ bool write_full_nc_graph;
+ };
+
+ struct graph_read_corr_cfg {
+ bool enable;
+ std::string output_dir;
+ bool binary;
+ };
+
+ struct kmer_coverage_model {
+ double probability_threshold;
+ double strong_probability_threshold;
+ double coverage_threshold;
+ bool use_coverage_threshold;
+ };
+
+ struct bwa_aligner {
+ bool bwa_enable;
+ bool debug;
+ std::string path_to_bwa;
+ size_t min_contig_len;
+ };
+
+ typedef std::map<info_printer_pos, info_printer> info_printers_t;
+
+ std::string dataset_file;
+ std::string project_name;
+ std::string input_dir;
+ std::string output_base;
+ std::string output_dir;
+ std::string tmp_dir;
+ std::string output_suffix;
+ std::string output_saves;
+ std::string final_contigs_file;
+ std::string log_filename;
+
+ bool output_pictures;
+ bool output_nonfinal_contigs;
+ bool compute_paths_number;
+
+ bool use_additional_contigs;
+ bool use_unipaths;
+ std::string additional_contigs;
+
+ struct scaffold_correction {
+ std::string scaffolds_file;
+ bool output_unfilled;
+ size_t max_insert;
+ size_t max_cut_length;
+ };
+
+ struct truseq_analysis {
+ std::string scaffolds_file;
+ std::string genome_file;
+ };
+
+ boost::optional<scaffold_correction> sc_cor;
+ truseq_analysis tsa;
+ std::string load_from;
+
+ std::string entry_point;
+
+ bool rr_enable;
+ bool two_step_rr;
+ bool use_intermediate_contigs;
+
+ single_read_resolving_mode single_reads_rr;
+ bool use_single_reads;
+
+ bool correct_mismatches;
+ bool paired_info_statistics;
+ bool paired_info_scaffolder;
+ bool gap_closer_enable;
+
+ size_t max_repeat_length;
+
+ //Conversion options
+ size_t buffer_size;
+ std::string temp_bin_reads_dir;
+ std::string temp_bin_reads_path;
+ std::string temp_bin_reads_info;
+ std::string paired_read_prefix;
+ std::string single_read_prefix;
+
+ size_t K;
+
+ bool main_iteration;
+
+ size_t max_threads;
+ size_t max_memory;
+
+ estimation_mode est_mode;
+ resolving_mode rm;
+ path_extend::pe_config::MainPEParamsT pe_params;
+ boost::optional<path_extend::pe_config::MainPEParamsT> prelim_pe_params;
+ bool avoid_rc_connections;
+
+ construction con;
+ sensitive_mapper sensitive_map;
+ distance_estimator de;
+ smoothing_distance_estimator ade;
+ ambiguous_distance_estimator amb_de;
+ pacbio_processor pb;
+ bool use_scaffolder;
+ dataset ds;
+ position_handler pos;
+ gap_closer gc;
+ graph_read_corr_cfg graph_read_corr;
+ info_printers_t info_printers;
+ kmer_coverage_model kcm;
+ bwa_aligner bwa;
+ boost::optional<plasmid> pd;
+ size_t flanking_range;
+
+ bool need_mapping;
+
+ debruijn_config() :
+ use_single_reads(false) {
+
+ }
+};
+
+void load(debruijn_config& cfg, const std::vector<std::string> &filenames);
+void load(debruijn_config& cfg, const std::string &filename);
+void load_lib_data(const std::string& prefix);
+void write_lib_data(const std::string& prefix);
+
+} // config
+} // debruijn_graph
+
+
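+// Global accessor typedef: elsewhere in the pipeline the loaded configuration is
+// read through cfg::get() (e.g. cfg::get().K, cfg::get().ds.RL() in
+// genomic_info_filler.cpp below).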
+typedef config_common::config<debruijn_graph::config::debruijn_config> cfg;
diff --git a/src/modules/pipeline/genomic_info.hpp b/src/modules/pipeline/genomic_info.hpp
new file mode 100644
index 0000000..4f7e5be
--- /dev/null
+++ b/src/modules/pipeline/genomic_info.hpp
@@ -0,0 +1,48 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#ifndef __GENOMIC_INFO_HPP__
+#define __GENOMIC_INFO_HPP__
+
+#include <vector>
+
+namespace llvm { namespace yaml { class IO; } }
+
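+// Genome-wide statistics estimated during assembly: coverage histogram, genome
+// size and coverage thresholds; persisted to/from YAML via Load()/Save().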
+class GenomicInfo {
+ public:
+ GenomicInfo()
+ : genome_size_(0), estimated_mean_(0), ec_bound_(0), trusted_bound_(0) {}
+
+ const std::vector<size_t>& cov_histogram() const { return cov_histogram_; }
+ void set_cov_histogram(const std::vector<size_t> &hist) { cov_histogram_ = hist; }
+
+ size_t genome_size() const { return genome_size_; }
+ void set_genome_size(size_t genome_size) { genome_size_ = genome_size; }
+
+ double estimated_mean() const { return estimated_mean_; }
+ void set_estimated_mean(double estimated_mean) { estimated_mean_ = estimated_mean; }
+
+ double ec_bound() const { return ec_bound_; }
+ void set_ec_bound(double ec_bound) { ec_bound_ = ec_bound; }
+
+ double trusted_bound() const { return trusted_bound_; }
+ void set_trusted_bound(double trusted_bound) { trusted_bound_ = trusted_bound; }
+
+ bool Load(const std::string &filename);
+ void Save(const std::string &filename) const;
+
+ void yamlize(llvm::yaml::IO &io);
+
+ private:
+ std::vector<size_t> cov_histogram_;
+ size_t genome_size_;
+ double estimated_mean_;
+ double ec_bound_;
+ double trusted_bound_;
+};
+
+#endif
diff --git a/src/modules/pipeline/genomic_info_filler.cpp b/src/modules/pipeline/genomic_info_filler.cpp
new file mode 100644
index 0000000..8b71fa3
--- /dev/null
+++ b/src/modules/pipeline/genomic_info_filler.cpp
@@ -0,0 +1,149 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#include "genomic_info_filler.hpp"
+
+#include "math/kmer_coverage_model.hpp"
+#include "algorithms/simplification/ec_threshold_finder.hpp"
+
+#include "llvm/Support/YAMLTraits.h"
+#include "llvm/Support/Errc.h"
+#include "llvm/Support/FileSystem.h"
+
+#include <string>
+
+#include <map>
+#include <vector>
+
+using namespace llvm;
+using namespace debruijn_graph;
+
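+// Converts a sparse coverage histogram (coverage -> count) into a dense vector;
+// index i of the result holds the count for coverage i + 1 (coverage 0 is dropped).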
+static std::vector<size_t> extract(const std::map<size_t, size_t> &hist) {
+ std::map<size_t, size_t> tmp = hist;
+
+ size_t maxcov = 0;
+ for (auto it = tmp.cbegin(), et = tmp.cend(); it != et; ++it)
+ maxcov = std::max(maxcov, it->first);
+
+ // Touch all the values until maxcov to make sure all the values exist in the map
+ for (size_t i = 0; i <= maxcov; ++i)
+ tmp[i];
+
+ // Extract the values
+ std::vector<size_t> res(maxcov);
+ for (size_t i = 0; i < maxcov; ++i)
+ res[i] = tmp[i + 1];
+
+ return res;
+}
+
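+// YAML traits so that GenomicInfo (and its coverage histogram vector, written as
+// a flow sequence) can be serialized with LLVM's YAML I/O.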
+namespace llvm { namespace yaml {
+template <>
+struct MappingTraits<GenomicInfo> {
+ static void mapping(yaml::IO &io, GenomicInfo &info) {
+ info.yamlize(io);
+ }
+};
+
+
+template <>
+struct SequenceTraits<std::vector<std::size_t>> {
+ static size_t size(IO &, std::vector<std::size_t> &seq) {
+ return seq.size();
+ }
+ static size_t&
+ element(IO &, std::vector<std::size_t> &seq, size_t index) {
+ if (index >= seq.size())
+ seq.resize(index+1);
+ return seq[index];
+ }
+ static const bool flow = true;
+};
+}}
+
+void GenomicInfo::yamlize(yaml::IO &io) {
+ io.mapOptional("ec bound", ec_bound_, 0.0);
+ io.mapOptional("estimated mean", estimated_mean_, 0.0);
+ io.mapOptional("trusted bound", trusted_bound_, 0.0);
+ io.mapOptional("genome size", genome_size_, size_t(0));
+ io.mapOptional("coverage histogram", cov_histogram_);
+}
+
+
+bool GenomicInfo::Load(const std::string &filename) {
+ ErrorOr<std::unique_ptr<MemoryBuffer>> Buf = MemoryBuffer::getFile(filename);
+ if (!Buf)
+ return false;
+
+ yaml::Input yin(*Buf.get());
+ yin >> *this;
+
+ if (yin.error())
+ return false;
+
+ return true;
+}
+
+void GenomicInfo::Save(const std::string &filename) const {
+ std::error_code EC;
+ llvm::raw_fd_ostream ofs(filename, EC, llvm::sys::fs::OpenFlags::F_Text);
+ llvm::yaml::Output yout(ofs);
+ yout << const_cast<GenomicInfo&>(*this);
+}
+
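+// Two strategies for the erroneous-connection (EC) threshold: uneven-depth
+// pipelines derive it from the edge coverage histogram, while all other pipelines
+// fit the k-mer coverage model to the k-mer histogram (and may also set a trusted
+// coverage bound).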
+void GenomicInfoFiller::run(conj_graph_pack &gp, const char*) {
+ if (cfg::get().uneven_depth) {
+ ErroneousConnectionThresholdFinder<decltype(gp.g)> finder(gp.g);
+ std::map<size_t, size_t> hist = finder.ConstructHistogram();
+ double avg = finder.AvgCoverage();
+ double gthr = finder.FindThreshold(hist);
+ INFO("Average edge coverage: " << avg);
+ INFO("Graph threshold: " << gthr);
+
+ gp.ginfo.set_cov_histogram(extract(hist));
+ gp.ginfo.set_ec_bound(std::min(avg, gthr));
+ } else {
+ // First, get k-mer coverage histogram
+ std::map<size_t, size_t> tmp;
+ size_t maxcov = 0;
+ size_t kmer_per_record = 1;
+ if (conj_graph_pack::index_t::InnerIndexT::storing_type::IsInvertable())
+ kmer_per_record = 2;
+
+ for (auto I = gp.index.inner_index().value_cbegin(), E = gp.index.inner_index().value_cend(); I != E; ++I) {
+ size_t ccov = I->count;
+ maxcov = std::max(ccov, maxcov);
+ tmp[ccov] += kmer_per_record;
+ }
+
+ gp.ginfo.set_cov_histogram(extract(tmp));
+
+ // Fit the coverage model and get the threshold
+ cov_model::KMerCoverageModel CovModel(gp.ginfo.cov_histogram(), cfg::get().kcm.probability_threshold, cfg::get().kcm.strong_probability_threshold);
+ CovModel.Fit();
+
+ gp.ginfo.set_genome_size(CovModel.GetGenomeSize());
+ gp.ginfo.set_ec_bound((double)CovModel.GetErrorThreshold());
+ if (CovModel.converged()) {
+ gp.ginfo.set_estimated_mean((double)CovModel.GetMeanCoverage());
+ INFO("Mean coverage was calculated as " << gp.ginfo.estimated_mean());
+ } else
+ INFO("Failed to estimate mean coverage");
+
+ if (cfg::get().kcm.use_coverage_threshold) {
+ double coef = (cfg::get().ds.aRL() - double(cfg::get().K) + 1) / cfg::get().ds.aRL();
+ if (coef < 0)
+ coef = double(cfg::get().ds.RL() - cfg::get().K + 1) / double(cfg::get().ds.RL());
+ gp.ginfo.set_trusted_bound(CovModel.converged() && cfg::get().kcm.coverage_threshold == 0.0 ?
+ double(CovModel.GetLowThreshold()) :
+ cfg::get().kcm.coverage_threshold * coef);
+ }
+ }
+
+ INFO("EC coverage threshold value was calculated as " << gp.ginfo.ec_bound());
+ INFO("Trusted kmer low bound: " << gp.ginfo.trusted_bound());
+}
diff --git a/src/modules/pipeline/genomic_info_filler.hpp b/src/modules/pipeline/genomic_info_filler.hpp
new file mode 100644
index 0000000..ff30b64
--- /dev/null
+++ b/src/modules/pipeline/genomic_info_filler.hpp
@@ -0,0 +1,23 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "pipeline/stage.hpp"
+
+namespace debruijn_graph {
+
+class GenomicInfoFiller : public spades::AssemblyStage {
+public:
+ GenomicInfoFiller()
+ : AssemblyStage("EC Threshold Finding", "ec_threshold_finder") { }
+
+ void run(conj_graph_pack &gp, const char *);
+};
+
+}
+
diff --git a/src/modules/pipeline/graph_pack.hpp b/src/modules/pipeline/graph_pack.hpp
new file mode 100644
index 0000000..9c997fd
--- /dev/null
+++ b/src/modules/pipeline/graph_pack.hpp
@@ -0,0 +1,163 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "data_structures/indices/edge_position_index.hpp"
+#include "data_structures/indices/storing_traits.hpp"
+#include "data_structures/sequence/genome_storage.hpp"
+#include "assembly_graph/handlers/id_track_handler.hpp"
+#include "assembly_graph/handlers/edges_position_handler.hpp"
+#include "assembly_graph/graph_core/graph.hpp"
+#include "paired_info/paired_info.hpp"
+#include "pipeline/config_struct.hpp"
+#include "assembly_graph/graph_alignment/edge_index.hpp"
+#include "assembly_graph/graph_support/genomic_quality.hpp"
+#include "assembly_graph/graph_alignment/sequence_mapper.hpp"
+#include "genomic_info.hpp"
+#include "assembly_graph/graph_alignment/long_read_storage.hpp"
+#include "assembly_graph/graph_support/detail_coverage.hpp"
+#include "assembly_graph/components/connected_component.hpp"
+#include "assembly_graph/graph_alignment/kmer_mapper.hpp"
+
+namespace debruijn_graph {
+
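+// graph_pack bundles the assembly graph with all associated indices and handlers
+// (edge index, k-mer mapper, coverage, paired-info indices, long reads, genomic
+// info, positions); most of them start detached and are attached on demand via
+// the Ensure*() methods.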
+/*KmerFree*//*KmerStoring*/
+template<class Graph,
+ class KmerEdgeIndex = KmerStoringEdgeIndex<Graph, runtime_k::RtSeq, kmer_index_traits<runtime_k::RtSeq>, DefaultStoring>>
+struct graph_pack: private boost::noncopyable {
+ typedef Graph graph_t;
+ typedef typename Graph::VertexId VertexId;
+ typedef typename Graph::EdgeId EdgeId;
+ typedef runtime_k::RtSeq seq_t;
+ typedef EdgeIndex<graph_t, seq_t, KmerEdgeIndex> index_t;
+ using PairedInfoIndicesT = omnigraph::de::PairedInfoIndicesT<Graph>;
+ //typedef omnigraph::de::PairedInfoIndicesT<Graph> PairedInfoIndicesT;
+ typedef omnigraph::de::UnclusteredPairedInfoIndicesT<Graph> UnclusteredPairedInfoIndicesT;
+ typedef LongReadContainer<Graph> LongReadContainerT;
+
+ size_t k_value;
+
+ graph_t g;
+ index_t index;
+ KmerMapper<graph_t> kmer_mapper;
+ FlankingCoverage<graph_t> flanking_cov;
+ UnclusteredPairedInfoIndicesT paired_indices;
+ PairedInfoIndicesT clustered_indices;
+ PairedInfoIndicesT scaffolding_indices;
+ LongReadContainerT single_long_reads;
+ GenomicInfo ginfo;
+
+ GenomeStorage genome;
+ EdgeQuality<Graph> edge_qual;
+ mutable EdgesPositionHandler<graph_t> edge_pos;
+ ConnectedComponentCounter components;
+ graph_pack(size_t k, const std::string &workdir, size_t lib_count,
+ const std::string &genome = "",
+ size_t flanking_range = 50,
+ size_t max_mapping_gap = 0,
+ size_t max_gap_diff = 0,
+ bool detach_indices = true)
+ : k_value(k), g(k), index(g, workdir),
+ kmer_mapper(g),
+ flanking_cov(g, flanking_range),
+ paired_indices(g, lib_count),
+ clustered_indices(g, lib_count),
+ scaffolding_indices(g, lib_count),
+ single_long_reads(g, lib_count),
+ genome(genome),
+ edge_qual(g),
+ edge_pos(g, max_mapping_gap + k, max_gap_diff),
+ components(g)
+ {
+ if (detach_indices) {
+ DetachAll();
+ }
+ }
+
+ void FillQuality() {
+ edge_qual.Fill(index, kmer_mapper, genome.GetSequence());
+ }
+
+ //todo remove with usages after checking
+ void ClearQuality() {
+ edge_qual.clear();
+ }
+
+ void EnsureIndex() {
+ if (!index.IsAttached()) {
+ INFO("Index refill");
+ index.Refill();
+ index.Attach();
+ }
+ }
+
+ void EnsureBasicMapping() {
+ VERIFY(kmer_mapper.IsAttached());
+ EnsureIndex();
+ INFO("Normalizing k-mer map. Total " << kmer_mapper.size() << " kmers to process");
+ kmer_mapper.Normalize();
+ INFO("Normalizing done");
+ }
+
+ void EnsureQuality() {
+ if (!edge_qual.IsAttached()) {
+ ClearQuality();
+ FillQuality();
+ edge_qual.Attach();
+ }
+ }
+
+ //positions are refilled every time
+ void EnsurePos() {
+ if (!edge_pos.IsAttached()) {
+ edge_pos.Attach();
+ }
+ edge_pos.clear();
+ FillPos(*this, genome.GetSequence(), "ref0");
+ FillPos(*this, !genome.GetSequence(), "ref1");
+ }
+
+ void EnsureDebugInfo() {
+ EnsureBasicMapping();
+ EnsureQuality();
+ EnsurePos();
+ }
+
+ void InitRRIndices() {
+ clustered_indices.Init();
+ scaffolding_indices.Init();
+ }
+
+ void ClearRRIndices() {
+ for (auto& pi : paired_indices) {
+ pi.Clear();
+ }
+ clustered_indices.Clear();
+ scaffolding_indices.Clear();
+ single_long_reads.Clear();
+ }
+
+ void DetachAll() {
+ index.Detach();
+ kmer_mapper.Detach();
+ edge_pos.Detach();
+ edge_qual.Detach();
+ }
+
+};
+
+typedef graph_pack<ConjugateDeBruijnGraph, KmerFreeEdgeIndex<Graph, runtime_k::RtSeq, kmer_index_traits<runtime_k::RtSeq>, DefaultStoring>> conj_graph_pack;
+typedef conj_graph_pack::index_t Index;
+
+typedef conj_graph_pack::PairedInfoIndicesT PairedIndicesT;
+typedef conj_graph_pack::UnclusteredPairedInfoIndicesT UnclusteredPairedIndicesT;
+typedef conj_graph_pack::LongReadContainerT LongReadContainerT;
+typedef omnigraph::de::PairedInfoIndexT<ConjugateDeBruijnGraph> PairedIndexT;
+typedef omnigraph::de::UnclusteredPairedInfoIndexT<ConjugateDeBruijnGraph> UnclusteredPairedIndexT;
+
+} // namespace debruijn_graph
diff --git a/src/modules/pipeline/graphio.hpp b/src/modules/pipeline/graphio.hpp
new file mode 100644
index 0000000..d47d00a
--- /dev/null
+++ b/src/modules/pipeline/graphio.hpp
@@ -0,0 +1,1040 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "dev_support/standard_base.hpp"
+
+#include "assembly_graph/handlers/id_track_handler.hpp"
+#include "assembly_graph/handlers/edges_position_handler.hpp"
+#include "assembly_graph/components/graph_component.hpp"
+
+#include "paired_info/paired_info.hpp"
+
+#include "assembly_graph/graph_core/graph.hpp"
+#include "assembly_graph/graph_support/detail_coverage.hpp"
+#include "assembly_graph/graph_alignment/long_read_storage.hpp"
+
+#include "assembly_graph/graph_core/order_and_law.hpp"
+
+#include <cmath>
+#include <set>
+#include <map>
+#include <algorithm>
+#include <fstream>
+#include <cstdio>
+
+namespace debruijn_graph {
+
+namespace graphio {
+
+using namespace omnigraph;
+using namespace omnigraph::de;
+//todo think of inner namespace
+
+template<class KmerMapper>
+void SaveKmerMapper(const string& file_name,
+ const KmerMapper& mapper) {
+ std::ofstream file;
+ file.open((file_name + ".kmm").c_str(),
+ std::ios_base::binary | std::ios_base::out);
+ DEBUG("Saving kmer mapper, " << file_name <<" created");
+ VERIFY(file.is_open());
+
+ uint32_t k_ = (uint32_t) mapper.get_k();
+ file.write((char *) &k_, sizeof(uint32_t));
+ mapper.BinWrite(file);
+
+ file.close();
+ DEBUG("kmer mapper saved ")
+}
+
+template<class KmerMapper>
+bool LoadKmerMapper(const string& file_name,
+ KmerMapper& kmer_mapper) {
+ kmer_mapper.clear();
+ std::ifstream file;
+ file.open((file_name + ".kmm").c_str(),
+ std::ios_base::binary | std::ios_base::in);
+ if (!file.is_open()) {
+ return false;
+ }
+ INFO("Reading kmer mapper, " << file_name <<" started");
+
+ uint32_t k_;
+ file.read((char *) &k_, sizeof(uint32_t));
+
+ VERIFY_MSG(k_ == kmer_mapper.get_k(), "Cannot read kmer mapper, different Ks");
+ kmer_mapper.BinRead(file);
+
+ file.close();
+ return true;
+}
+
+template<class EdgeIndex>
+void SaveEdgeIndex(const std::string& file_name,
+ const EdgeIndex& index) {
+ std::ofstream file;
+ file.open((file_name + ".kmidx").c_str(),
+ std::ios_base::binary | std::ios_base::out);
+ DEBUG("Saving kmer index, " << file_name <<" created");
+ VERIFY(file.is_open());
+
+ uint32_t k_ = index.k();
+ file.write((char *) &k_, sizeof(uint32_t));
+ index.BinWrite(file);
+
+ file.close();
+ DEBUG("index saved ")
+}
+
+template<class EdgeIndex>
+bool LoadEdgeIndex(const std::string& file_name,
+ EdgeIndex& index) {
+ std::ifstream file;
+ file.open((file_name + ".kmidx").c_str(),
+ std::ios_base::binary | std::ios_base::in);
+ INFO("Reading kmer index, " << file_name <<" started");
+ if (!file.is_open())
+ return false;
+
+ uint32_t k_;
+ file.read((char *) &k_, sizeof(uint32_t));
+ VERIFY_MSG(k_ == index.k(), "Cannot read edge index, different Ks");
+
+ index.BinRead(file, file_name + ".kmidx");
+
+ file.close();
+
+ return true;
+}
+
+inline
+void SaveMapCoverage(const std::string& path, const std::map<int, int>& data ) {
+ std::ofstream outFile;
+ outFile.open(path.c_str());
+
+ INFO("Saving detailed coverage in file " << path <<" started");
+ outFile << data.size() << "\n";
+ for (auto dataIterator = data.begin(); dataIterator != data.end(); ++dataIterator){
+ outFile << dataIterator->first << " " << dataIterator->second << " .\n";
+ }
+}
+
+template<class KmerIndex>
+void SaveDetailCoverage(const std::string& pathInCov, const std::string& pathOutCov, const KmerIndex& index ) {
+ SaveMapCoverage(pathInCov, index.inCoverage);
+ SaveMapCoverage(pathOutCov, index.outCoverage);
+}
+
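+// Paired-info points are stored one per line as "e1 e2 distance weight variance .";
+// RawPoint has no variance, so a zero is written on save and the field is ignored
+// on load.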
+inline void SerializePoint(FILE* file, size_t e1, size_t e2, const RawPoint &p) {
+ fprintf(file, "%zu %zu %.2f %.2f 0.00 .\n", e1, e2, (double)p.d, (double)p.weight);
+}
+
+inline void SerializePoint(FILE* file, size_t e1, size_t e2, const Point &p) {
+ fprintf(file, "%zu %zu %.2f %.2f %.2f .\n", e1, e2, (double)p.d, (double)p.weight, (double)p.var);
+}
+
+inline void DeserializePoint(FILE* file, size_t& e1, size_t& e2, RawPoint &p) {
+ float unused;
+ size_t read_count = fscanf(file, "%zu %zu %f %f %f .\n", &e1, &e2,
+ (float *)&p.d, (float *)&p.weight, (float *)&unused);
+ VERIFY(read_count == 5);
+
+}
+
+inline void DeserializePoint(FILE* file, size_t& e1, size_t& e2, Point &p) {
+ size_t read_count = fscanf(file, "%zu %zu %f %f %f .\n", &e1, &e2,
+ (float *)&p.d, (float *)&p.weight, (float *)&p.var);
+ VERIFY(read_count == 5);
+}
+
+
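+// DataPrinter serializes a graph component into a set of plain-text files sharing
+// one prefix: .grp (topology), .sqn (edge sequences), .cvr (coverage), .flcvr
+// (flanking coverage), .prd (paired info), .pos (edge positions) and .gid (max id).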
+template<class Graph>
+class DataPrinter {
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+
+ //todo reduce duplication
+ template<class T>
+ void SaveEdgeAssociatedInfo(std::function<T (EdgeId)> access_f, ostream& out) const {
+ out << component_.e_size() << endl;
+ for (auto iter = component_.e_begin(); iter != component_.e_end(); ++iter) {
+ EdgeId e = *iter;
+ //todo fixme: currently matches the old .cvr format
+ out << e.int_id()/* << endl*/;
+ out << " " << access_f(e) << " ." << endl;
+ }
+ }
+
+// template<class C>
+// void SaveEdgeAssociatedInfo(const C& c, ostream& out) const {
+// SaveEdgeAssociatedInfo<decltype(C::operator[])>(boost::bind(&C::operator[], c, _1), out);
+// }
+
+ template<class C>
+ void SaveEdgeAssociatedInfo(const C& c, ostream& out) const {
+ out << component_.e_size() << endl;
+ for (auto iter = component_.e_begin(); iter != component_.e_end(); ++iter) {
+ EdgeId e = *iter;
+ //todo fixme: currently matches the old .cvr format
+ out << e.int_id()/* << endl*/;
+ out << " ";
+ c.Save(e, out);
+ out << " ." << endl;
+ }
+ }
+
+ public:
+
+ void SaveGraph(const string& file_name) const {
+ FILE* gid_file = fopen((file_name + ".gid").c_str(), "w");
+ size_t max_id = this->component().g().GetGraphIdDistributor().GetMax();
+ fprintf(gid_file, "%zu\n", max_id);
+ fclose(gid_file);
+ FILE* file = fopen((file_name + ".grp").c_str(), "w");
+ DEBUG("Graph saving to " << file_name << " started");
+ VERIFY_MSG(file != NULL,
+ "Couldn't open file " << (file_name + ".grp") << " on write");
+ size_t vertex_count = component_.v_size();
+ size_t edge_count = component_.e_size();
+ fprintf(file, "%zu %zu \n", vertex_count, edge_count);
+ for (auto iter = component_.v_begin(); iter != component_.v_end(); ++iter) {
+ Save(file, *iter);
+ }
+
+ fprintf(file, "\n");
+
+ for (auto iter = component_.e_begin(); iter != component_.e_end(); ++iter) {
+ Save(file, *iter);
+ }
+ DEBUG("Graph saving to " << file_name << " finished");
+
+ fclose(file);
+ }
+
+ void SaveEdgeSequences(const string& file_name) const {
+ ofstream out(file_name + ".sqn");
+ //todo switch to general function after its switching to fasta
+ //SaveEdgeAssociatedInfo<Sequence>(boost::bind(&Graph::EdgeNucls, component_.g(), _1), out);
+ DEBUG("Saving sequences, " << file_name <<" created");
+ for (auto iter = component_.e_begin(); iter != component_.e_end(); ++iter) {
+ EdgeId e = *iter;
+ out << ">" << e.int_id() << endl;
+ out << component_.g().EdgeNucls(e) << endl;
+ }
+ }
+
+ void SaveCoverage(const string& file_name) const {
+ ofstream out(file_name + ".cvr");
+ DEBUG("Saving coverage, " << file_name <<" created");
+ SaveEdgeAssociatedInfo(component_.g().coverage_index(), out);
+ }
+
+ void SaveFlankingCoverage(const string& file_name, const FlankingCoverage<Graph>& flanking_cov) const {
+ ofstream out(file_name + ".flcvr");
+ DEBUG("Saving flanking coverage, " << file_name <<" created");
+ SaveEdgeAssociatedInfo(flanking_cov, out);
+ }
+
+ template<class Index>
+ void SavePaired(const string& file_name,
+ Index const& paired_index) const {
+ FILE* file = fopen((file_name + ".prd").c_str(), "w");
+ DEBUG("Saving paired info, " << file_name <<" created");
+ VERIFY(file != NULL);
+
+ size_t comp_size = 0;
+ for (auto I = component_.e_begin(), E = component_.e_end(); I != E; ++I) {
+ EdgeId e1 = *I;
+ auto inner_map = paired_index.GetHalf(e1);
+ for (auto entry : inner_map) {
+ if (component_.contains(entry.first)) { // if the second edge also lies in the same component
+ comp_size += entry.second.size();
+ continue;
+ }
+ }
+ }
+
+ fprintf(file, "%zu\n", comp_size);
+
+ for (auto I = component_.e_begin(), E = component_.e_end(); I != E; ++I) {
+ EdgeId e1 = *I;
+ const auto& inner_map = paired_index.GetHalf(e1);
+ std::map<typename Graph::EdgeId, typename Index::HistProxy> ordermap(inner_map.begin(), inner_map.end());
+ for (auto entry : ordermap) {
+ EdgeId e2 = entry.first;
+ if (component_.contains(e2))
+ for (auto point : entry.second)
+ SerializePoint(file, e1.int_id(), e2.int_id(), point);
+ }
+ }
+
+ fclose(file);
+ }
+
+ void SavePositions(const string& file_name,
+ EdgesPositionHandler<Graph> const& ref_pos) const {
+ ofstream file((file_name + ".pos").c_str());
+ DEBUG("Saving edges positions, " << file_name << " created");
+ VERIFY(file.is_open());
+ file << component_.e_size() << endl;
+ for (auto it = component_.e_begin(); it != component_.e_end(); ++it) {
+ vector<omnigraph::EdgePosition> pos_it = ref_pos.GetEdgePositions(*it);
+ file << it->int_id() << " " << pos_it.size() << endl;
+ for (size_t i = 0; i < pos_it.size(); i++) {
+ file << " " << pos_it[i].contigId << " " << pos_it[i].mr << endl;
+ }
+ }
+ }
+
+ private:
+ void Save(FILE* file, EdgeId eid) const {
+ fprintf(file, "%s\n", ToPrint(eid).c_str());
+ }
+
+ void Save(FILE* file, VertexId vid) const {
+ fprintf(file, "%s\n", ToPrint(vid).c_str());
+ }
+
+ const GraphComponent<Graph> component_;
+
+ virtual std::string ToPrint(VertexId v) const = 0;
+ virtual std::string ToPrint(EdgeId e) const = 0;
+
+ protected:
+
+ //todo optimize component copy
+ DataPrinter(const GraphComponent<Graph>& component) :
+ component_(component) {
+ }
+
+ const GraphComponent<Graph>& component() const {
+ return component_;
+ }
+
+ public:
+ virtual ~DataPrinter() {
+ }
+};
+
+template<class Graph>
+class ConjugateDataPrinter: public DataPrinter<Graph> {
+ typedef DataPrinter<Graph> base;
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+ public:
+ ConjugateDataPrinter(Graph const& g) :
+ base(g) {
+ }
+
+ ConjugateDataPrinter(const GraphComponent<Graph>& graph_component) :
+ base(GraphComponent<Graph>(graph_component, true)) {
+ }
+
+ template<class VertexIt>
+ ConjugateDataPrinter(const Graph& g, VertexIt begin, VertexIt end) :
+ base(GraphComponent<Graph>(g, begin, end, true)) {
+ }
+
+ std::string ToPrint(VertexId v) const {
+ stringstream ss;
+ ss
+ << "Vertex "
+ << v.int_id()
+ << " ~ "
+ << this->component().g().conjugate(v).int_id() << " .";
+ return ss.str();
+ }
+
+ std::string ToPrint(EdgeId e) const {
+ stringstream ss;
+ ss
+ << "Edge "
+ << e.int_id()
+ << " : "
+ << this->component().g().EdgeStart(e).int_id()
+ << " -> "
+ << this->component().g().EdgeEnd(e).int_id()
+ << ", l = "
+ << this->component().g().length(e)
+ << " ~ "
+ << this->component().g().conjugate(e).int_id() << " .";
+ return ss.str();
+ }
+
+};
+
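+// DataScanner is the reading counterpart of DataPrinter: it rebuilds the graph
+// from the same file set and keeps maps from the saved integer ids back to the
+// newly created VertexId/EdgeId handles.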
+template<class Graph>
+class DataScanner {
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+
+ template<class T>
+ void LoadEdgeAssociatedInfo(std::function<void (EdgeId, T)> setting_f, istream& in) const {
+ size_t cnt;
+ in >> cnt;
+ for (size_t i = 0 ; i < cnt; ++i) {
+ size_t edge_id;
+ T t;
+ string delim;
+ in >> edge_id;
+ in >> t;
+ in >> delim;
+ VERIFY(delim == ".");
+ VERIFY(this->edge_id_map().find(edge_id) != this->edge_id_map().end());
+ setting_f(this->edge_id_map()[edge_id], t);
+ }
+ }
+
+ template<class T>
+ void LoadEdgeAssociatedInfo(T& t, istream& in) const {
+ size_t cnt;
+ in >> cnt;
+ for (size_t i = 0 ; i < cnt; ++i) {
+ size_t edge_id;
+ in >> edge_id;
+ VERIFY(this->edge_id_map().find(edge_id) != this->edge_id_map().end());
+ EdgeId eid = this->edge_id_map().find(edge_id)->second;
+ t.Load(eid, in);
+ string delim;
+ in >> delim;
+ VERIFY(delim == ".");
+ }
+ }
+
+// template<class C>
+// void LoadEdgeAssociatedInfo(const C& c, ostream& out) const {
+// SaveEdgeAssociatedInfo<decltype(C::operator[])>(boost::bind(&C::operator[], c, _1), out);
+// }
+
+ public:
+ virtual void LoadGraph(const string& file_name) = 0;
+
+ void LoadCoverage(const string& file_name) {
+ INFO("Reading coverage from " << file_name);
+ ifstream in(file_name + ".cvr");
+ LoadEdgeAssociatedInfo(g_.coverage_index(), in);
+ }
+
+ bool LoadFlankingCoverage(const string& file_name, FlankingCoverage<Graph>& flanking_cov) {
+ if (!path::FileExists(file_name + ".flcvr")) {
+ INFO("Flanking coverage saves are absent");
+ return false;
+ }
+ INFO("Reading flanking coverage from " << file_name);
+ ifstream in(file_name + ".flcvr");
+ LoadEdgeAssociatedInfo(flanking_cov, in);
+ return true;
+ }
+
+ template<typename Index>
+ void LoadPaired(const string& file_name,
+ Index& paired_index,
+ bool force_exists = true) {
+ typedef typename Graph::EdgeId EdgeId;
+ FILE* file = fopen((file_name + ".prd").c_str(), "r");
+ INFO((file_name + ".prd"));
+ if (force_exists) {
+ VERIFY(file != NULL);
+ } else if (file == NULL) {
+ INFO("Paired info not found, skipping");
+ return;
+ }
+ INFO("Reading paired info from " << file_name << " started");
+
+ size_t paired_count;
+ int read_count = fscanf(file, "%zu \n", &paired_count);
+ VERIFY(read_count == 1);
+ while (!feof(file)) {
+ size_t first_real_id, second_real_id;
+
+ typename Index::Point point;
+ DeserializePoint(file, first_real_id, second_real_id, point);
+
+ TRACE(first_real_id << " " << second_real_id << " " << point);
+ VERIFY(this->edge_id_map().find(first_real_id) != this->edge_id_map().end());
+ EdgeId e1 = this->edge_id_map()[first_real_id];
+ EdgeId e2 = this->edge_id_map()[second_real_id];
+ if (e1 == EdgeId(NULL) || e2 == EdgeId(NULL))
+ continue;
+ TRACE(e1 << " " << e2 << " " << point);
+ //Need to prevent doubling of self-conjugate edge pairs
+ //Their weight would be always even, so we don't lose precision
+ auto ep = std::make_pair(e1, e2);
+ if (ep == paired_index.ConjugatePair(ep))
+ point.weight = math::round(point.weight / 2);
+ paired_index.Add(e1, e2, point);
+ }
+ DEBUG("PII SIZE " << paired_index.size());
+ fclose(file);
+ }
+
+ bool LoadPositions(const string& file_name,
+ EdgesPositionHandler<Graph>& edge_pos) {
+ FILE* file = fopen((file_name + ".pos").c_str(), "r");
+ if (file == NULL) {
+ INFO("No positions were saved");
+ return false;
+ }
+ VERIFY(!edge_pos.IsAttached());
+ edge_pos.Attach();
+ INFO("Reading edges positions, " << file_name <<" started");
+ VERIFY(file != NULL);
+ size_t pos_count;
+ int read_count = fscanf(file, "%zu\n", &pos_count);
+ VERIFY(read_count == 1);
+ for (size_t i = 0; i < pos_count; i++) {
+ size_t edge_real_id, pos_info_count;
+ char contigId[500];
+ char cur_str[500];
+ read_count = fscanf(file, "%zu %zu\n", &edge_real_id, &pos_info_count);
+ VERIFY(read_count == 2);
+ // INFO( edge_real_id);
+ for (size_t j = 0; j < pos_info_count; j++) {
+ int start_pos, end_pos;
+ int m_start_pos, m_end_pos;
+ read_count = fscanf(file, "%[^\n]s", cur_str);
+ read_count = fscanf(file, "\n");
+ read_count = sscanf(cur_str, "%s [%d - %d] --> [%d - %d]", contigId,
+ &start_pos, &end_pos, &m_start_pos, &m_end_pos);
+ // INFO(cur_str);
+ // INFO (contigId<<" "<< start_pos<<" "<<end_pos);
+ // VERIFY(read_count == 3);
+ VERIFY(read_count == 5);
+ VERIFY(this->edge_id_map().find(edge_real_id) != this->edge_id_map().end());
+ EdgeId eid = this->edge_id_map()[edge_real_id];
+ edge_pos.AddEdgePosition(eid, string(contigId), start_pos - 1, end_pos, m_start_pos - 1, m_end_pos);
+ }
+ }
+ fclose(file);
+ return true;
+ }
+
+ private:
+ Graph& g_;
+ // int edge_count_;
+ map<size_t, EdgeId> edge_id_map_;
+ map<size_t, VertexId> vertex_id_map_;
+
+ protected:
+ DataScanner(Graph &g) : g_(g) {
+ INFO("Creating of scanner started");
+ // edge_count_ = 0;
+ }
+
+ Graph& g() {
+ return g_;
+ }
+
+ map<size_t, EdgeId> &edge_id_map() {
+ return edge_id_map_;
+ }
+
+ map<size_t, VertexId> &vertex_id_map() {
+ return vertex_id_map_;
+ }
+
+ const map<size_t, EdgeId> &edge_id_map() const {
+ return edge_id_map_;
+ }
+
+ const map<size_t, VertexId> &vertex_id_map() const {
+ return vertex_id_map_;
+ }
+
+ public:
+ virtual ~DataScanner() {
+
+ }
+};
+
+template<class Graph>
+class ConjugateDataScanner: public DataScanner<Graph> {
+ typedef DataScanner<Graph> base;
+public:
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+private:
+ restricted::IdSegmentStorage CreateIdStorage(const string& file_name) {
+ FILE* file = fopen((file_name + ".gid").c_str(), "r");
+ //This is to support compatibility to old saves. Will be removed soon
+ if(file == NULL) {
+ return this->g().GetGraphIdDistributor().ReserveUpTo(1000000000);
+ }
+ VERIFY_MSG(file != NULL, "Couldn't find file " << (file_name + ".gid"));
+ size_t max;
+ int flag = fscanf(file, "%zu\n", &max);
+ VERIFY(flag == 1);
+ fclose(file);
+ return this->g().GetGraphIdDistributor().ReserveUpTo(max);
+ }
+
+ public:
+ /*virtual*/
+ void LoadGraph(const string& file_name) {
+ restricted::IdSegmentStorage id_storage = CreateIdStorage(file_name);
+ INFO("Trying to read conjugate de bruijn graph from " << file_name << ".grp");
+ FILE* file = fopen((file_name + ".grp").c_str(), "r");
+ VERIFY_MSG(file != NULL, "Couldn't find file " << (file_name + ".grp"));
+ FILE* sequence_file = fopen((file_name + ".sqn").c_str(), "r");
+ VERIFY_MSG(file != NULL, "Couldn't find file " << (file_name + ".sqn"));
+ INFO("Reading conjugate de bruijn graph from " << file_name << " started");
+ size_t vertex_count;
+ size_t edge_count;
+ int flag = fscanf(file, "%zu %zu \n", &vertex_count, &edge_count);
+ VERIFY(flag == 2);
+ for (size_t i = 0; i < vertex_count; i++) {
+ size_t vertex_real_id, conjugate_id;
+ flag = fscanf(file, "Vertex %zu ~ %zu .\n", &vertex_real_id, &conjugate_id);
+ TRACE("Vertex "<<vertex_real_id<<" ~ "<<conjugate_id<<" .");
+ VERIFY(flag == 2);
+
+ if (this->vertex_id_map().find((int) vertex_real_id) == this->vertex_id_map().end()) {
+ size_t ids[2] = {vertex_real_id, conjugate_id};
+ auto id_distributor = id_storage.GetSegmentIdDistributor(ids, ids + 2);
+ VertexId vid = this->g().AddVertex(typename Graph::VertexData(), id_distributor);
+ VertexId conj_vid = this->g().conjugate(vid);
+
+ this->vertex_id_map()[vertex_real_id] = vid;
+ this->vertex_id_map()[conjugate_id] = conj_vid;
+ }
+ }
+
+ char first_char = (char) getc(sequence_file);
+ VERIFY(!ferror(sequence_file));
+ ungetc(first_char, sequence_file);
+ bool fasta = (first_char == '>'); // if it's not fasta, then it's old .sqn
+
+
+ if (!fasta) {
+ size_t tmp_edge_count;
+ flag = fscanf(sequence_file, "%zu", &tmp_edge_count);
+ VERIFY(flag == 1);
+ VERIFY(edge_count == tmp_edge_count);
+ }
+
+ const size_t longstring_size = 1000500; // TODO: O RLY magic constant? => Can't load edges >= 1Mbp
+ char longstring[longstring_size];
+ for (size_t i = 0; i < edge_count; i++) {
+ size_t e_real_id, start_id, fin_id, length, conjugate_edge_id;
+ flag = fscanf(file, "Edge %zu : %zu -> %zu, l = %zu ~ %zu .\n",
+ &e_real_id, &start_id, &fin_id, &length, &conjugate_edge_id);
+ VERIFY(flag == 5);
+ VERIFY(length < longstring_size);
+ if (fasta) {
+ flag = fscanf(sequence_file, ">%zu\n%s\n", &e_real_id, longstring);
+ }
+ else {
+ flag = fscanf(sequence_file, "%zu %s .", &e_real_id, longstring);
+ }
+ VERIFY(flag == 2);
+ TRACE("Edge " << e_real_id << " : " << start_id << " -> "
+ << fin_id << " l = " << length << " ~ " << conjugate_edge_id);
+ if (this->edge_id_map().find((int) e_real_id) == this->edge_id_map().end()) {
+ size_t ids[2] = {e_real_id, conjugate_edge_id};
+ auto id_distributor = id_storage.GetSegmentIdDistributor(ids, ids + 2);
+ Sequence tmp(longstring);
+ EdgeId eid = this->g().AddEdge(this->vertex_id_map()[start_id], this->vertex_id_map()[fin_id], tmp, id_distributor);
+ this->edge_id_map()[e_real_id] = eid;
+ this->edge_id_map()[conjugate_edge_id] = this->g().conjugate(eid);
+ }
+ }
+ fclose(file);
+ fclose(sequence_file);
+ }
+ public:
+ ConjugateDataScanner(Graph& g) :
+ base(g) {
+ }
+};
+
+inline std::string MakeSingleReadsFileName(const std::string& file_name,
+ size_t index) {
+ return file_name + "_paths_" + ToString(index) + ".mpr";
+}
+
+//helper methods
+// todo think how to organize them in the most natural way
+
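+// The Print*/Scan* helpers below come in matching pairs. A typical round trip
+// (sketch): save everything under a common prefix and restore it later, e.g.
+//   graphio::PrintAll(cfg::get().output_saves + "graph_pack", gp);
+//   graphio::ScanAll(cfg::get().output_saves + "graph_pack", gp);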
+template<class Graph>
+void PrintBasicGraph(const string& file_name, DataPrinter<Graph>& printer) {
+ printer.SaveGraph(file_name);
+ printer.SaveEdgeSequences(file_name);
+ printer.SaveCoverage(file_name);
+}
+
+template<class graph_pack>
+void PrintGraphPack(const string& file_name,
+ DataPrinter<typename graph_pack::graph_t>& printer,
+ const graph_pack& gp) {
+ PrintBasicGraph(file_name, printer);
+ // printer.SavePaired(file_name + "_et", gp.etalon_paired_index);
+ if (gp.edge_pos.IsAttached())
+ printer.SavePositions(file_name, gp.edge_pos);
+ if (gp.index.IsAttached())
+ SaveEdgeIndex(file_name, gp.index.inner_index());
+ if (gp.kmer_mapper.IsAttached())
+ SaveKmerMapper(file_name, gp.kmer_mapper);
+ if (gp.flanking_cov.IsAttached())
+ printer.SaveFlankingCoverage(file_name, gp.flanking_cov);
+}
+
+template<class graph_pack>
+void PrintGraphPack(const string& file_name, const graph_pack& gp) {
+ ConjugateDataPrinter<typename graph_pack::graph_t> printer(gp.g);
+ PrintGraphPack(file_name, printer, gp);
+}
+
+template<class Graph>
+void PrintPairedIndex(const string& file_name, DataPrinter<Graph>& printer,
+ const PairedInfoIndexT<Graph>& paired_index) {
+ printer.SavePaired(file_name, paired_index);
+}
+
+template<class Graph>
+void PrintUnclusteredIndex(const string& file_name, DataPrinter<Graph>& printer,
+ const UnclusteredPairedInfoIndexT<Graph>& paired_index) {
+ printer.SavePaired(file_name, paired_index);
+}
+
+template<class Graph>
+void PrintClusteredIndex(const string& file_name, DataPrinter<Graph>& printer,
+ const PairedInfoIndexT<Graph>& clustered_index) {
+ PrintPairedIndex(file_name + "_cl", printer, clustered_index);
+}
+
+template<class Graph>
+void PrintScaffoldingIndex(const string& file_name, DataPrinter<Graph>& printer,
+ const PairedInfoIndexT<Graph>& clustered_index) {
+ PrintPairedIndex(file_name + "_scf", printer, clustered_index);
+}
+
+template<class Graph>
+void PrintScaffoldIndex(const string& file_name, DataPrinter<Graph>& printer,
+ const PairedInfoIndexT<Graph>& scaffold_index) {
+ PrintPairedIndex(file_name + "_scf", printer, scaffold_index);
+}
+
+template<class Graph>
+void PrintUnclusteredIndices(const string& file_name, DataPrinter<Graph>& printer,
+ const UnclusteredPairedInfoIndicesT<Graph>& paired_indices) {
+ for (size_t i = 0; i < paired_indices.size(); ++i)
+ PrintUnclusteredIndex(file_name + "_" + ToString(i), printer, paired_indices[i]);
+}
+
+template<class Graph>
+void PrintClusteredIndices(const string& file_name, DataPrinter<Graph>& printer,
+ const PairedInfoIndicesT<Graph>& paired_indices) {
+ for (size_t i = 0; i < paired_indices.size(); ++i)
+ PrintClusteredIndex(file_name + "_" + ToString(i), printer, paired_indices[i]);
+}
+
+template<class Graph>
+void PrintScaffoldingIndices(const string& file_name, DataPrinter<Graph>& printer,
+ const PairedInfoIndicesT<Graph>& paired_indices) {
+ for (size_t i = 0; i < paired_indices.size(); ++i)
+ PrintScaffoldingIndex(file_name + "_" + ToString(i), printer, paired_indices[i]);
+}
+
+template<class graph_pack>
+void PrintWithPairedIndex(const string& file_name,
+ DataPrinter<typename graph_pack::graph_t>& printer,
+ const graph_pack& gp,
+ const PairedInfoIndexT<typename graph_pack::graph_t>& paired_index,
+ bool clustered_index = false) {
+
+ PrintGraphPack(file_name, printer, gp);
+ if (!clustered_index) {
+ PrintPairedIndex(file_name, printer, paired_index);
+ } else {
+ PrintClusteredIndex(file_name, printer, paired_index);
+ }
+}
+
+template<class graph_pack>
+void PrintWithClusteredIndex(const string& file_name,
+ DataPrinter<typename graph_pack::graph_t>& printer,
+ const graph_pack& gp,
+ const PairedInfoIndexT<typename graph_pack::graph_t>& paired_index) {
+ PrintWithPairedIndex(file_name, printer, gp, paired_index, true);
+}
+
+template<class graph_pack>
+void PrintWithPairedIndices(const string& file_name,
+ DataPrinter<typename graph_pack::graph_t>& printer,
+ const graph_pack& gp,
+ const PairedInfoIndicesT<typename graph_pack::graph_t>& paired_indices,
+ bool clustered_index = false) {
+ PrintGraphPack(file_name, printer, gp);
+ if (!clustered_index)
+ PrintPairedIndices(file_name, printer, paired_indices);
+ else
+ PrintClusteredIndices(file_name, printer, paired_indices);
+}
+
+template<class graph_pack>
+void PrintWithClusteredIndices(const string& file_name,
+ DataPrinter<typename graph_pack::graph_t>& printer,
+ const graph_pack& gp,
+ const PairedInfoIndicesT<typename graph_pack::graph_t>& paired_indices) {
+ PrintWithPairedIndices(file_name, printer, gp, paired_indices, true);
+}
+
+template<class Graph>
+void PrintSingleLongReads(const string& file_name, const LongReadContainer<Graph>& single_long_reads) {
+ for (size_t i = 0; i < single_long_reads.size(); ++i){
+ single_long_reads[i].DumpToFile(MakeSingleReadsFileName(file_name, i));
+ }
+}
+
+template<class graph_pack>
+void PrintAll(const string& file_name, const graph_pack& gp) {
+ ConjugateDataPrinter<typename graph_pack::graph_t> printer(gp.g, gp.g.begin(), gp.g.end());
+ PrintGraphPack(file_name, printer, gp);
+ PrintUnclusteredIndices(file_name, printer, gp.paired_indices);
+ PrintClusteredIndices(file_name, printer, gp.clustered_indices);
+ PrintScaffoldingIndices(file_name, printer, gp.scaffolding_indices);
+ PrintSingleLongReads(file_name, gp.single_long_reads);
+ gp.ginfo.Save(file_name + ".ginfo");
+}
+
+template<class graph_pack, class VertexIt>
+void PrintWithPairedIndex(const string& file_name, const graph_pack& gp,
+ VertexIt begin, VertexIt end,
+ const PairedInfoIndexT<typename graph_pack::graph_t>& paired_index,
+ bool clustered_index = false) {
+ ConjugateDataPrinter<typename graph_pack::graph_t> printer(gp.g,
+ begin, end);
+ PrintWithPairedIndex(file_name, printer, gp, paired_index, clustered_index);
+}
+
+template<class graph_pack, class VertexIt>
+void PrintWithClusteredIndex(const string& file_name, const graph_pack& gp,
+ VertexIt begin, VertexIt end,
+ const PairedInfoIndexT<typename graph_pack::graph_t>& clustered_index) {
+ ConjugateDataPrinter<typename graph_pack::graph_t> printer(gp.g,
+ begin, end);
+ PrintWithPairedIndex(file_name, printer, gp, clustered_index, true);
+}
+
+template<class graph_pack>
+void PrintWithPairedIndex(const string& file_name, const graph_pack& gp,
+ const PairedInfoIndexT<typename graph_pack::graph_t>& paired_index,
+ bool clustered_index = false) {
+ PrintWithPairedIndex(file_name, gp, gp.g.begin(), gp.g.end(), paired_index,
+ clustered_index);
+}
+
+template<class graph_pack, class VertexIt>
+void PrinGraphPack(const string& file_name, const graph_pack& gp,
+ VertexIt begin, VertexIt end) {
+ ConjugateDataPrinter<typename graph_pack::graph_t> printer(gp.g,
+ begin, end);
+ PrintGraphPack(file_name, printer, gp);
+}
+
+template<class graph_pack>
+void PrintWithClusteredIndex(const string& file_name, const graph_pack& gp,
+ const PairedInfoIndexT<typename graph_pack::graph_t>& clustered_index) {
+ PrintWithPairedIndex(file_name, gp, clustered_index, true);
+}
+
+template<class graph_pack>
+void PrintWithPairedIndices(const string& file_name, const graph_pack& gp,
+ const PairedInfoIndicesT<typename graph_pack::graph_t>& paired_indices,
+ bool clustered_index = false) {
+
+ ConjugateDataPrinter<typename graph_pack::graph_t> printer(gp.g, gp.g.begin(), gp.g.end());
+
+ PrintWithPairedIndices(file_name, printer, gp, paired_indices, clustered_index);
+}
+
+template<class graph_pack>
+void PrintWithClusteredIndices(const string& file_name, const graph_pack& gp,
+ const PairedInfoIndicesT<typename graph_pack::graph_t>& paired_indices) {
+ PrintWithPairedIndices(file_name, gp, paired_indices, true);
+}
+
+template<class Graph>
+void ScanBasicGraph(const string& file_name, DataScanner<Graph>& scanner) {
+ scanner.LoadGraph(file_name);
+ scanner.LoadCoverage(file_name);
+}
+
+template<class graph_pack>
+void ScanGraphPack(const string& file_name,
+ DataScanner<typename graph_pack::graph_t>& scanner, graph_pack& gp) {
+ ScanBasicGraph(file_name, scanner);
+ gp.index.Attach();
+ if (LoadEdgeIndex(file_name, gp.index.inner_index())) {
+ gp.index.Update();
+ } else {
+ WARN("Cannot load edge index, kmer coverages will be missed");
+ gp.index.Refill();
+ }
+ // scanner.LoadPaired(file_name + "_et", gp.etalon_paired_index);
+ scanner.LoadPositions(file_name, gp.edge_pos);
+ //load kmer_mapper only if needed
+ if (gp.kmer_mapper.IsAttached())
+ if (!LoadKmerMapper(file_name, gp.kmer_mapper)) {
+ WARN("Cannot load kmer_mapper, information on projected kmers will be missed");
+ }
+ if (!scanner.LoadFlankingCoverage(file_name, gp.flanking_cov)) {
+ gp.flanking_cov.Fill(gp.index.inner_index());
+ }
+}
+
+template<class Graph>
+void ScanPairedIndex(const string& file_name, DataScanner<Graph>& scanner,
+ UnclusteredPairedInfoIndexT<Graph>& paired_index,
+ bool force_exists = true) {
+ scanner.LoadPaired(file_name, paired_index, force_exists);
+}
+
+template<class Graph>
+void ScanClusteredIndex(const string& file_name, DataScanner<Graph>& scanner,
+ PairedInfoIndexT<Graph>& clustered_index,
+ bool force_exists = true) {
+ scanner.LoadPaired(file_name + "_cl", clustered_index, force_exists);
+}
+
+template<class Graph>
+void ScanScaffoldingIndex(const string& file_name, DataScanner<Graph>& scanner,
+ PairedInfoIndexT<Graph>& clustered_index,
+ bool force_exists = true) {
+ scanner.LoadPaired(file_name + "_scf", clustered_index, force_exists);
+}
+
+template<class Graph>
+void ScanPairedIndices(const std::string& file_name, DataScanner<Graph>& scanner,
+ UnclusteredPairedInfoIndicesT<Graph>& paired_indices,
+ bool force_exists = true) {
+ for (size_t i = 0; i < paired_indices.size(); ++i)
+ ScanPairedIndex(file_name + "_" + ToString(i), scanner, paired_indices[i], force_exists);
+}
+
+template<class Graph>
+void ScanClusteredIndices(const std::string& file_name, DataScanner<Graph>& scanner,
+ PairedInfoIndicesT<Graph>& paired_indices,
+ bool force_exists = true) {
+ for (size_t i = 0; i < paired_indices.size(); ++i)
+ ScanClusteredIndex(file_name + "_" + ToString(i), scanner, paired_indices[i], force_exists);
+}
+
+template<class Graph>
+void ScanScaffoldingIndices(const std::string& file_name, DataScanner<Graph>& scanner,
+ PairedInfoIndicesT<Graph>& paired_indices,
+ bool force_exists = true) {
+ for (size_t i = 0; i < paired_indices.size(); ++i)
+ ScanScaffoldingIndex(file_name + "_" + ToString(i), scanner, paired_indices[i], force_exists);
+}
+
+template<class Graph>
+void ScanScaffoldIndices(const string& file_name, DataScanner<Graph>& scanner,
+ PairedInfoIndicesT<Graph>& scaffold_indices) {
+
+ for (size_t i = 0; i < scaffold_indices.size(); ++i) {
+ ScanScaffoldIndex(file_name + "_" + ToString(i), scanner, scaffold_indices[i]);
+ }
+}
+
+template<class graph_pack>
+void ScanWithPairedIndex(const string& file_name,
+ DataScanner<typename graph_pack::graph_t>& scanner, graph_pack& gp,
+ PairedInfoIndexT<typename graph_pack::graph_t>& paired_index,
+ bool clustered_index = false) {
+ ScanGraphPack(file_name, scanner, gp);
+ if (!clustered_index) {
+ ScanPairedIndex(file_name, scanner, paired_index);
+ } else {
+ ScanClusteredIndex(file_name, scanner, paired_index);
+ }
+}
+
+template<class graph_pack>
+void ScanWithPairedIndices(const string& file_name,
+ DataScanner<typename graph_pack::graph_t>& scanner, graph_pack& gp,
+ PairedInfoIndicesT<typename graph_pack::graph_t>& paired_indices,
+ bool clustered_index = false) {
+
+ ScanGraphPack(file_name, scanner, gp);
+ if (!clustered_index) {
+ ScanPairedIndices(file_name, scanner, paired_indices);
+ } else {
+ ScanClusteredIndices(file_name, scanner, paired_indices);
+ }
+}
+
+template<class graph_pack>
+void ScanWithPairedIndex(const string& file_name, graph_pack& gp,
+ PairedInfoIndexT<typename graph_pack::graph_t>& paired_index,
+ bool clustered_index = false) {
+ ConjugateDataScanner<typename graph_pack::graph_t> scanner(gp.g);
+ ScanWithPairedIndex(file_name, scanner, gp, paired_index, clustered_index);
+}
+
+template<class graph_pack>
+void ScanWithClusteredIndex(const string& file_name, graph_pack& gp,
+ PairedInfoIndexT<typename graph_pack::graph_t>& clustered_index) {
+ ScanWithPairedIndex(file_name, gp, clustered_index, true);
+}
+
+template<class graph_pack>
+void ScanWithClusteredIndices(const string& file_name,
+ DataScanner<typename graph_pack::graph_t>& scanner, graph_pack& gp,
+ PairedInfoIndicesT<typename graph_pack::graph_t>& paired_indices) {
+ ScanWithPairedIndices(file_name, scanner, gp, paired_indices, true);
+}
+
+template<class graph_pack>
+void ScanWithPairedIndices(const string& file_name, graph_pack& gp,
+ PairedInfoIndicesT<typename graph_pack::graph_t>& paired_indices,
+ bool clustered_index = false) {
+ ConjugateDataScanner<typename graph_pack::graph_t> scanner(gp.g);
+ ScanWithPairedIndices(file_name, scanner, gp, paired_indices, clustered_index);
+}
+
+
+template<class graph_pack>
+void ScanWithClusteredIndices(const string& file_name, graph_pack& gp,
+ PairedInfoIndicesT<typename graph_pack::graph_t>& paired_indices) {
+ ScanWithPairedIndices(file_name, gp, paired_indices, true);
+}
+
+template<class Graph>
+void ScanBasicGraph(const string& file_name, Graph& g) {
+ ConjugateDataScanner<Graph> scanner(g);
+ ScanBasicGraph<Graph>(file_name, scanner);
+}
+
+template<class Graph>
+void ScanSingleLongReads(const string& file_name, LongReadContainer<Graph>& single_long_reads) {
+ for (size_t i = 0; i < single_long_reads.size(); ++i){
+ single_long_reads[i].LoadFromFile(MakeSingleReadsFileName(file_name, i), false);
+ }
+}
+
+template<class graph_pack>
+void ScanGraphPack(const string& file_name, graph_pack& gp) {
+ ConjugateDataScanner<typename graph_pack::graph_t> scanner(gp.g);
+ ScanGraphPack(file_name, scanner, gp);
+}
+
+template<class graph_pack>
+void ScanAll(const std::string& file_name, graph_pack& gp,
+ bool force_exists = true) {
+ ConjugateDataScanner<typename graph_pack::graph_t> scanner(gp.g);
+ ScanGraphPack(file_name, scanner, gp);
+ ScanPairedIndices(file_name, scanner, gp.paired_indices, force_exists);
+ ScanClusteredIndices(file_name, scanner, gp.clustered_indices, force_exists);
+ ScanScaffoldingIndices(file_name, scanner, gp.scaffolding_indices, force_exists);
+ ScanSingleLongReads(file_name, gp.single_long_reads);
+ gp.ginfo.Load(file_name + ".ginfo");
+}
+}
+}
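
A minimal usage sketch for the loading API above, assuming a fully constructed
debruijn_graph::conj_graph_pack and the surrounding SPAdes headers; the save
prefix used here is hypothetical:

    #include <string>
    #include "pipeline/graph_pack.hpp"
    #include "pipeline/graphio.hpp"

    // Restore a previously saved state. ScanAll() builds a ConjugateDataScanner
    // internally and loads the graph pack, the paired/clustered/scaffolding
    // indices, the long-read containers and the genomic info (".ginfo").
    void restore_state(debruijn_graph::conj_graph_pack &gp) {
        const std::string prefix = "saves/some_stage";  // hypothetical prefix
        // force_exists = false, as used by AssemblyStage::load() later in this
        // commit, presumably makes missing index files non-fatal.
        debruijn_graph::graphio::ScanAll(prefix, gp, /*force_exists*/ false);
    }
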
diff --git a/src/modules/pipeline/library.cpp b/src/modules/pipeline/library.cpp
new file mode 100644
index 0000000..6852156
--- /dev/null
+++ b/src/modules/pipeline/library.cpp
@@ -0,0 +1,137 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#include "pipeline/library.hpp"
+#include "dev_support/path_helper.hpp"
+
+#include "llvm/Support/YAMLTraits.h"
+#include "llvm/Support/Errc.h"
+#include "llvm/Support/FileSystem.h"
+
+#include <string>
+#include <fstream>
+#include <iostream>
+
+using namespace llvm;
+using namespace io;
+
+namespace llvm { namespace yaml {
+template <>
+struct ScalarEnumerationTraits<LibraryOrientation> {
+ static void enumeration(yaml::IO &io, LibraryOrientation &value) {
+ io.enumCase(value, "fr", LibraryOrientation::FR);
+ io.enumCase(value, "rf", LibraryOrientation::RF);
+ io.enumCase(value, "ff", LibraryOrientation::FF);
+ io.enumCase(value, "rr", LibraryOrientation::RR);
+ }
+};
+
+template <>
+struct ScalarEnumerationTraits<LibraryType> {
+ static void enumeration(yaml::IO &io, LibraryType &value) {
+ io.enumCase(value, "paired-end", LibraryType::PairedEnd);
+ io.enumCase(value, "mate-pairs", LibraryType::MatePairs);
+ io.enumCase(value, "hq-mate-pairs", LibraryType::HQMatePairs);
+ io.enumCase(value, "pacbio", LibraryType::PacBioReads);
+ io.enumCase(value, "single", LibraryType::SingleReads);
+ io.enumCase(value, "sanger", LibraryType::SangerReads);
+ io.enumCase(value, "nanopore", LibraryType::NanoporeReads);
+ io.enumCase(value, "trusted-contigs", LibraryType::TrustedContigs);
+ io.enumCase(value, "untrusted-contigs", LibraryType::UntrustedContigs);
+ io.enumCase(value, "path-extend-contigs", LibraryType::PathExtendContigs);
+ }
+};
+
+template <>
+struct SequenceTraits<std::vector<std::string>> {
+ static size_t size(IO &, std::vector<std::string> &seq) {
+ return seq.size();
+ }
+ static std::string&
+ element(IO &, std::vector<std::string> &seq, size_t index) {
+ if (index >= seq.size())
+ seq.resize(index+1);
+ return seq[index];
+ }
+};
+}}
+
+namespace io {
+template<>
+void SequencingLibrary<io::NoData>::yamlize(llvm::yaml::IO &io) {
+ SequencingLibraryBase::yamlize(io);
+}
+template<>
+void SequencingLibrary<io::NoData>::validate(llvm::yaml::IO &io, llvm::StringRef &res) {
+ SequencingLibraryBase::validate(io, res);
+}
+}
+
+void SequencingLibraryBase::yamlize(llvm::yaml::IO &io) {
+ io.mapRequired("type", type_);
+ io.mapOptional("orientation", orientation_, LibraryOrientation::Undefined);
+ io.mapOptional("left reads", left_paired_reads_);
+ io.mapOptional("right reads", right_paired_reads_);
+ io.mapOptional("single reads", single_reads_);
+}
+
+void SequencingLibraryBase::validate(llvm::yaml::IO &, llvm::StringRef &res) {
+ switch (type_) {
+ case LibraryType::PairedEnd:
+ case LibraryType::MatePairs:
+ case LibraryType::HQMatePairs:
+ if (left_paired_reads_.size() != right_paired_reads_.size()) {
+ res = "Left and right reads lists should have equal length";
+ return;
+ }
+
+ if (orientation_ == LibraryOrientation::Undefined) {
+ res = "Orientation for paired reads should be specified";
+ return;
+ }
+ break;
+ case LibraryType::SingleReads:
+ case LibraryType::PacBioReads:
+ case LibraryType::SangerReads:
+ case LibraryType::NanoporeReads:
+ case LibraryType::TrustedContigs:
+ case LibraryType::UntrustedContigs:
+ case LibraryType::PathExtendContigs:
+ if (left_paired_reads_.size() || right_paired_reads_.size()) {
+ res = "Paired reads should not be set for this library type";
+ return;
+ }
+ break;
+ default:
+ // Impossible
+ res = "Unsupported library type";
+ return;
+ }
+}
+
+// FIXME: Lambda
+struct update_relative_filename : public std::binary_function<std::string, std::string, std::string> {
+ std::string operator() (const std::string &filename, const std::string &input_dir) const {
+ if (filename[0] == '/')
+ return filename;
+ return input_dir + filename;
+ }
+};
+
+void SequencingLibraryBase::update_relative_reads_filenames(const std::string &input_dir) {
+ std::transform(left_paired_reads_.begin(), left_paired_reads_.end(), left_paired_reads_.begin(),
+ std::bind2nd(update_relative_filename(), input_dir));
+ std::transform(right_paired_reads_.begin(), right_paired_reads_.end(), right_paired_reads_.begin(),
+ std::bind2nd(update_relative_filename(), input_dir));
+ std::transform(single_reads_.begin(), single_reads_.end(), single_reads_.begin(),
+ std::bind2nd(update_relative_filename(), input_dir));
+}
+
+#include "pipeline/library.inl"
+
+// Provide default implementation here (e.g. in case of Data == io::NoData)
+template class io::DataSet<>;
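
An illustrative sketch of the YAML layout these traits accept and of reading it
back through io::DataSet<>; the file name and read paths are purely
illustrative:

    #include <iostream>
    #include "pipeline/library.hpp"

    // dataset.yaml (illustrative):
    //   - type: paired-end          # ScalarEnumerationTraits<LibraryType>
    //     orientation: fr           # ScalarEnumerationTraits<LibraryOrientation>
    //     left reads:  [ lib1_1.fastq ]
    //     right reads: [ lib1_2.fastq ]
    //   - type: pacbio
    //     single reads: [ pb.fastq ]
    int main() {
        io::DataSet<> dataset("dataset.yaml");   // parsed via llvm::yaml::Input
        for (const auto &lib : dataset)
            std::cout << (lib.is_paired() ? "paired" : "unpaired")
                      << " library\n";
        return 0;
    }
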
diff --git a/src/modules/pipeline/library.hpp b/src/modules/pipeline/library.hpp
new file mode 100644
index 0000000..580fcaf
--- /dev/null
+++ b/src/modules/pipeline/library.hpp
@@ -0,0 +1,365 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#ifndef __IO_LIBRARY_HPP__
+#define __IO_LIBRARY_HPP__
+
+#include "utils/adt/chained_iterator.hpp"
+#include "utils/adt/iterator_range.hpp"
+
+#include <boost/iterator/iterator_facade.hpp>
+
+#include <string>
+#include <vector>
+
+// Forward decls for YAML API
+namespace llvm { namespace yaml { class IO; template<typename T> struct MappingTraits; } }
+namespace llvm { class StringRef; }
+
+namespace io {
+
+enum class LibraryType {
+ SingleReads,
+ PairedEnd,
+ MatePairs,
+ HQMatePairs,
+ PacBioReads,
+ SangerReads,
+ NanoporeReads,
+ TrustedContigs,
+ UntrustedContigs,
+ PathExtendContigs
+};
+
+static std::vector<LibraryType> LibraryPriotity = {
+ LibraryType::SingleReads,
+ LibraryType::SangerReads,
+ LibraryType::PacBioReads,
+ LibraryType::NanoporeReads,
+ LibraryType::PairedEnd,
+ LibraryType::HQMatePairs,
+ LibraryType::MatePairs,
+ LibraryType::TrustedContigs,
+ LibraryType::PathExtendContigs,
+ LibraryType::UntrustedContigs
+};
+
+enum class LibraryOrientation {
+ FR,
+ FF,
+ RF,
+ RR,
+ Undefined
+};
+
+class SequencingLibraryBase {
+public:
+ class paired_reads_iterator :
+ public boost::iterator_facade<paired_reads_iterator,
+ std::pair<std::string, std::string>,
+ boost::forward_traversal_tag,
+ std::pair<std::string, std::string> > {
+
+ typedef std::vector<std::string>::const_iterator inner_iterator;
+
+ public:
+ paired_reads_iterator(inner_iterator left, inner_iterator right)
+ : left_(left), right_(right){}
+
+ private:
+ friend class boost::iterator_core_access;
+
+ void increment() { ++left_; ++right_; }
+ bool equal(const paired_reads_iterator &other) const {
+ return this->left_ == other.left_ && this->right_ == other.right_;
+ }
+ std::pair<std::string, std::string> dereference() const {
+ return std::make_pair(*left_, *right_);
+ }
+
+ inner_iterator left_;
+ inner_iterator right_;
+ };
+
+ typedef chained_iterator<std::vector<std::string>::const_iterator> single_reads_iterator;
+
+ SequencingLibraryBase()
+ : type_(LibraryType::PairedEnd), orientation_(LibraryOrientation::FR) {}
+
+ // YAML API. Public because we cannot have template friend class.
+ void yamlize(llvm::yaml::IO &io);
+ void validate(llvm::yaml::IO &io, llvm::StringRef &res);
+
+ LibraryType type() const { return type_; }
+ void set_type(LibraryType type) { type_ = type; }
+ LibraryOrientation orientation() const { return orientation_; }
+ void set_orientation(LibraryOrientation orientation) { orientation_ = orientation; }
+
+ void clear() {
+ left_paired_reads_.clear();
+ right_paired_reads_.clear();
+ single_reads_.clear();
+ }
+
+ void update_relative_reads_filenames(const std::string &input_dir);
+
+ void push_back_single(const std::string &reads) {
+ single_reads_.push_back(reads);
+ }
+
+ void push_back_paired(const std::string &left, const std::string &right) {
+ left_paired_reads_.push_back(left);
+ right_paired_reads_.push_back(right);
+ }
+
+ paired_reads_iterator paired_begin() const {
+ return paired_reads_iterator(left_paired_reads_.begin(), right_paired_reads_.begin());
+ }
+ paired_reads_iterator paired_end() const {
+ return paired_reads_iterator(left_paired_reads_.end(), right_paired_reads_.end());
+ }
+
+ adt::iterator_range<paired_reads_iterator> paired_reads() const {
+ return adt::make_range(paired_begin(), paired_end());
+ }
+
+ single_reads_iterator reads_begin() const {
+ // NOTE: We have a contract with single_end here. Single reads always go last!
+ single_reads_iterator res(left_paired_reads_.begin(), left_paired_reads_.end());
+ res.join(right_paired_reads_.begin(), right_paired_reads_.end());
+ res.join(single_reads_.begin(), single_reads_.end());
+
+ return res;
+ }
+ single_reads_iterator reads_end() const {
+ // NOTE: Do not forget about the contract with single_begin here!
+ return single_reads_iterator(single_reads_.end(), single_reads_.end());
+ }
+
+ adt::iterator_range<single_reads_iterator> reads() const {
+ return adt::make_range(reads_begin(), reads_end());
+ }
+
+ single_reads_iterator single_begin() const {
+ return single_reads_iterator(single_reads_.begin(), single_reads_.end());
+ }
+ single_reads_iterator single_end() const {
+ // NOTE: Do not forget about the contract with single_begin here!
+ return single_reads_iterator(single_reads_.end(), single_reads_.end());
+ }
+
+ adt::iterator_range<single_reads_iterator> single_reads() const {
+ return adt::make_range(single_begin(), single_end());
+ }
+
+ bool is_graph_contructable() const {
+ return (type_ == io::LibraryType::PairedEnd ||
+ type_ == io::LibraryType::SingleReads ||
+ type_ == io::LibraryType::HQMatePairs);
+ }
+
+ bool is_bwa_alignable() const {
+ return type_ == io::LibraryType::MatePairs;
+ }
+
+ bool is_mismatch_correctable() const {
+ return is_graph_contructable();
+ }
+
+ bool is_binary_covertable() {
+ return is_graph_contructable() || is_mismatch_correctable() || is_paired();
+ }
+
+ bool is_paired() const {
+ return (type_ == io::LibraryType::PairedEnd ||
+ type_ == io::LibraryType::MatePairs||
+ type_ == io::LibraryType::HQMatePairs);
+ }
+
+ bool is_repeat_resolvable() const {
+ return (type_ == io::LibraryType::PairedEnd ||
+ type_ == io::LibraryType::HQMatePairs ||
+ type_ == io::LibraryType::MatePairs ||
+ type_ == io::LibraryType::PacBioReads ||
+ type_ == io::LibraryType::SangerReads ||
+ type_ == io::LibraryType::NanoporeReads ||
+ type_ == io::LibraryType::TrustedContigs ||
+ type_ == io::LibraryType::UntrustedContigs ||
+ type_ == io::LibraryType::PathExtendContigs);
+ }
+
+ static bool is_contig_lib(LibraryType type) {
+ return type == io::LibraryType::TrustedContigs ||
+ type == io::LibraryType::UntrustedContigs ||
+ type == io::LibraryType::PathExtendContigs;
+ }
+
+ static bool is_long_read_lib(LibraryType type) {
+ return type == io::LibraryType::PacBioReads ||
+ type == io::LibraryType::SangerReads ||
+ type == io::LibraryType::NanoporeReads;
+ }
+
+ bool is_contig_lib() const {
+ return is_contig_lib(type_);
+ }
+
+ bool is_long_read_lib() const {
+ return is_long_read_lib(type_);
+ }
+
+ bool is_pacbio_alignable() const {
+ return (type_ == io::LibraryType::PacBioReads ||
+ type_ == io::LibraryType::SangerReads ||
+ type_ == io::LibraryType::NanoporeReads ||
+ // comment out the next line to switch the alignment method for trusted contigs
+ type_ == io::LibraryType::TrustedContigs ||
+ type_ == io::LibraryType::UntrustedContigs);
+ }
+
+private:
+ LibraryType type_;
+ LibraryOrientation orientation_;
+
+ std::vector<std::string> left_paired_reads_;
+ std::vector<std::string> right_paired_reads_;
+ std::vector<std::string> single_reads_;
+};
+
+struct NoData {};
+
+template<class Data = NoData>
+class SequencingLibrary: public SequencingLibraryBase {
+public:
+ const Data& data() const {
+ return data_;
+ }
+ Data& data() {
+ return data_;
+ }
+
+ void yamlize(llvm::yaml::IO &io);
+ void validate(llvm::yaml::IO &io, llvm::StringRef &res);
+
+private:
+ Data data_;
+};
+
+// Just a convenient wrapper to "unwrap" the iterators over libraries.
+template<class Data = NoData>
+class DataSet {
+ typedef SequencingLibrary<Data> Library;
+ typedef std::vector<Library> LibraryStorage;
+
+public:
+ typedef typename LibraryStorage::iterator iterator;
+ typedef typename LibraryStorage::const_iterator const_iterator;
+ typedef chained_iterator<typename Library::single_reads_iterator> single_reads_iterator;
+ typedef chained_iterator<typename Library::paired_reads_iterator> paired_reads_iterator;
+
+ DataSet() {}
+ explicit DataSet(const std::string &path) { load(path); }
+
+ void load(const std::string &filename);
+ void save(const std::string &filename);
+
+ void clear() { libraries_.clear(); }
+ void push_back(const Library &lib) {
+ libraries_.push_back(lib);
+ }
+ Library& operator[](size_t n) { return libraries_[n]; }
+ const Library& operator[](size_t n) const { return libraries_[n]; }
+ size_t lib_count() const { return libraries_.size(); }
+
+ iterator library_begin() { return libraries_.begin(); }
+ const_iterator library_begin() const { return libraries_.begin(); }
+ iterator begin() { return libraries_.begin(); }
+ const_iterator begin() const { return libraries_.begin(); }
+
+ iterator library_end() { return libraries_.end(); }
+ const_iterator library_end() const { return libraries_.end(); }
+ iterator end() { return libraries_.end(); }
+ const_iterator end() const { return libraries_.end(); }
+
+ adt::iterator_range<iterator> libraries() {
+ return adt::make_range(library_begin(), library_end());
+ }
+ adt::iterator_range<const_iterator> libraries() const {
+ return adt::make_range(library_begin(), library_end());
+ }
+
+ single_reads_iterator reads_begin() const {
+ auto it = libraries_.begin();
+ single_reads_iterator res(it->reads_begin(), it->reads_end());
+ ++it;
+ for (auto end = libraries_.end(); it != end; ++it)
+ res.join(it->reads_begin(), it->reads_end());
+
+ return res;
+ }
+ single_reads_iterator reads_end() const {
+ return single_reads_iterator(libraries_.back().reads_end(), libraries_.back().reads_end());
+ }
+ adt::iterator_range<single_reads_iterator> reads() {
+ return adt::make_range(reads_begin(), reads_end());
+ }
+
+ single_reads_iterator single_begin() const {
+ auto it = libraries_.begin();
+ single_reads_iterator res(it->single_begin(), it->single_end());
+ ++it;
+ for (auto end = libraries_.end(); it != end; ++it)
+ res.join(it->single_begin(), it->single_end());
+
+ return res;
+ }
+ single_reads_iterator single_end() const {
+ return single_reads_iterator(libraries_.back().single_end(), libraries_.back().single_end());
+ }
+ adt::iterator_range<single_reads_iterator> single_reads() {
+ return adt::make_range(single_begin(), single_end());
+ }
+
+ paired_reads_iterator paired_begin() const {
+ auto it = libraries_.begin();
+ paired_reads_iterator res(it->paired_begin(), it->paired_end());
+ ++it;
+ for (auto end = libraries_.end(); it != end; ++it)
+ res.join(it->paired_begin(), it->paired_end());
+
+ return res;
+ }
+ paired_reads_iterator paired_end() const {
+ return paired_reads_iterator(libraries_.back().paired_end(), libraries_.back().paired_end());
+ }
+
+ adt::iterator_range<paired_reads_iterator> paired_reads() const {
+ return adt::make_range(paired_begin(), paired_end());
+ }
+
+private:
+ LibraryStorage libraries_;
+};
+
+}
+
+namespace llvm { namespace yaml {
+template <>
+struct MappingTraits<io::SequencingLibraryBase> {
+ static void mapping(llvm::yaml::IO &io, io::SequencingLibraryBase &lib);
+ static StringRef validate(llvm::yaml::IO &io, io::SequencingLibraryBase &lib);
+};
+
+template <class Data>
+struct MappingTraits<io::SequencingLibrary<Data> > {
+ static void mapping(llvm::yaml::IO &io, io::SequencingLibrary<Data> &lib);
+ static StringRef validate(llvm::yaml::IO &io, io::SequencingLibrary<Data> &lib);
+};
+
+}}
+
+#endif // __IO_LIBRARY_HPP__
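
An illustrative sketch of the iterator contract above: paired files come back
as (left, right) pairs, while reads() chains left, right and single files in
that order (the file names below are hypothetical):

    #include <iostream>
    #include "pipeline/library.hpp"

    int main() {
        io::SequencingLibraryBase lib;
        lib.set_type(io::LibraryType::PairedEnd);
        lib.set_orientation(io::LibraryOrientation::FR);
        lib.push_back_paired("lib_1.fastq", "lib_2.fastq");  // hypothetical files
        lib.push_back_single("lib_s.fastq");

        for (auto rp : lib.paired_reads())     // yields (left, right) pairs
            std::cout << rp.first << " <-> " << rp.second << "\n";
        for (const auto &r : lib.reads())      // left, right, then single reads
            std::cout << r << "\n";
        return 0;
    }
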
diff --git a/src/modules/pipeline/library.inl b/src/modules/pipeline/library.inl
new file mode 100644
index 0000000..cef8c21
--- /dev/null
+++ b/src/modules/pipeline/library.inl
@@ -0,0 +1,64 @@
+template<class Data>
+using Library = io::SequencingLibrary<Data>;
+
+namespace llvm { namespace yaml {
+template <class Data>
+struct SequenceTraits<std::vector<Library<Data> >> {
+ static size_t size(IO &, std::vector<Library<Data> > &seq) {
+ return seq.size();
+ }
+ static Library<Data>&
+ element(IO &, std::vector<Library<Data>> &seq, size_t index) {
+ if (index >= seq.size())
+ seq.resize(index+1);
+ return seq[index];
+ }
+};
+
+template<class Data>
+void MappingTraits<Library<Data>>::mapping(yaml::IO &io, Library<Data> &lib) {
+ lib.yamlize(io);
+}
+
+template<class Data>
+StringRef MappingTraits<Library<Data>>::validate(yaml::IO &io, Library<Data> &lib) {
+ // We use this awkward API for validate() since we don't want to pull
+ // llvm::StringRef into library.hpp.
+ llvm::StringRef res;
+ lib.validate(io, res);
+
+ return res;
+}
+}}
+
+template<class Data>
+void io::DataSet<Data>::save(const std::string &filename) {
+ std::error_code EC;
+ llvm::raw_fd_ostream ofs(filename, EC, llvm::sys::fs::OpenFlags::F_Text);
+ llvm::yaml::Output yout(ofs);
+ yout << libraries_;
+}
+
+template<class Data>
+void io::DataSet<Data>::load(const std::string &filename) {
+ ErrorOr<std::unique_ptr<MemoryBuffer>> Buf = MemoryBuffer::getFile(filename);
+ if (!Buf) {
+ std::cerr << std::string("Failed to load file ") + filename;
+ throw;
+ }
+
+ yaml::Input yin(*Buf.get());
+ yin >> libraries_;
+
+ if (yin.error()) {
+ std::cerr << std::string("Failed to load file ") + filename;
+ throw;
+ }
+
+ std::string input_dir = path::parent_path(filename);
+ if (input_dir[input_dir.length() - 1] != '/')
+ input_dir += '/';
+
+ for (auto& lib : libraries_)
+ lib.update_relative_reads_filenames(input_dir);
+}
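
The save()/load() pair above round-trips the library description through
llvm::yaml::Output and llvm::yaml::Input; on load, every non-absolute read path
is prefixed with the directory of the YAML file. A short sketch with a
hypothetical location:

    #include "pipeline/library.hpp"

    void roundtrip(io::DataSet<> &ds) {
        ds.save("/tmp/dataset.yaml");        // serialize via llvm::yaml::Output

        io::DataSet<> reloaded;
        reloaded.load("/tmp/dataset.yaml");  // parse via llvm::yaml::Input;
                                             // a relative "lib_1.fastq" entry is
                                             // resolved to "/tmp/lib_1.fastq"
    }
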
diff --git a/src/modules/pipeline/stage.cpp b/src/modules/pipeline/stage.cpp
new file mode 100644
index 0000000..4477536
--- /dev/null
+++ b/src/modules/pipeline/stage.cpp
@@ -0,0 +1,133 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#include "pipeline/stage.hpp"
+#include "pipeline/graphio.hpp"
+
+#include "dev_support/logger/log_writers.hpp"
+
+#include <algorithm>
+#include <cstring>
+
+namespace spades {
+
+void AssemblyStage::load(debruijn_graph::conj_graph_pack& gp,
+ const std::string &load_from,
+ const char* prefix) {
+ std::string p = path::append_path(load_from, prefix == NULL ? id_ : prefix);
+ INFO("Loading current state from " << p);
+
+ debruijn_graph::graphio::ScanAll(p, gp, false);
+ debruijn_graph::config::load_lib_data(p);
+}
+
+void AssemblyStage::save(const debruijn_graph::conj_graph_pack& gp,
+ const std::string &save_to,
+ const char* prefix) const {
+ std::string p = path::append_path(save_to, prefix == NULL ? id_ : prefix);
+ INFO("Saving current state to " << p);
+
+ debruijn_graph::graphio::PrintAll(p, gp);
+ debruijn_graph::config::write_lib_data(p);
+}
+
+class StageIdComparator {
+ public:
+ StageIdComparator(const char* id)
+ : id_(id) {
+ const char* pos = strstr(id, ":");
+ len_ = (pos != NULL ? pos - id : strlen(id));
+ }
+
+ bool operator()(const std::unique_ptr<AssemblyStage> &stage) const {
+ const char* sid = stage->id();
+ return (0 == strncmp(id_, sid, len_) && sid[len_] == 0);
+ }
+
+ private:
+ const char* id_;
+ size_t len_;
+};
+
+class PhaseIdComparator {
+ public:
+ PhaseIdComparator(const char* id) {
+ const char* pos = strstr(id, ":");
+ VERIFY(pos != NULL);
+ id_ = pos + 1;
+ }
+
+ bool operator()(const std::unique_ptr<CompositeStageBase::PhaseBase> &phase) const {
+ return 0 == strcmp(id_, phase->id());
+ }
+
+ private:
+ const char* id_;
+};
+
+void CompositeStageBase::run(debruijn_graph::conj_graph_pack& gp,
+ const char* started_from) {
+ VERIFY(parent_);
+ auto start_phase = phases_.begin();
+ if (started_from &&
+ strstr(started_from, ":") &&
+ started_from == strstr(started_from, id())) {
+ start_phase = std::find_if(phases_.begin(), phases_.end(), PhaseIdComparator(started_from));
+ if (start_phase == phases_.end()) {
+ ERROR("Invalid start stage / phase combination specified: " << started_from);
+ exit(-1);
+ }
+ if (start_phase != phases_.begin()) {
+ PhaseBase * prev_phase = std::prev(start_phase)->get();
+ std::string composite_id(id());
+ composite_id += ":";
+ composite_id += prev_phase->id();
+ prev_phase->load(gp, parent_->saves_policy().load_from_, composite_id.c_str());
+ }
+ }
+
+ for (auto et = phases_.end(); start_phase != et; ++start_phase) {
+ PhaseBase *phase = start_phase->get();
+
+ INFO("PROCEDURE == " << phase->name());
+ phase->run(gp, started_from);
+
+ if (parent_->saves_policy().make_saves_) {
+ std::string composite_id(id());
+ composite_id += ":";
+ composite_id += phase->id();
+
+ phase->save(gp, parent_->saves_policy().save_to_, composite_id.c_str());
+ }
+
+ }
+}
+
+void StageManager::run(debruijn_graph::conj_graph_pack& g,
+ const char* start_from) {
+ auto start_stage = stages_.begin();
+ if (start_from) {
+ start_stage = std::find_if(stages_.begin(), stages_.end(), StageIdComparator(start_from));
+ if (start_stage == stages_.end()) {
+ ERROR("Invalid start stage specified: " << start_from);
+ exit(-1);
+ }
+ if (start_stage != stages_.begin())
+ (*std::prev(start_stage))->load(g, saves_policy_.load_from_);
+ }
+
+ for (; start_stage != stages_.end(); ++start_stage) {
+ AssemblyStage *stage = start_stage->get();
+
+ INFO("STAGE == " << stage->name());
+ stage->run(g, start_from);
+ if (saves_policy_.make_saves_)
+ stage->save(g, saves_policy_.save_to_);
+ }
+}
+
+}
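
The comparators above implement a "<stage id>:<phase id>" convention for the
start_from argument: StageIdComparator matches the part before the colon
against AssemblyStage::id(), PhaseIdComparator matches the part after it
against the phase id, and CompositeStageBase saves each phase under that same
combined prefix. A sketch of resuming mid-stage (the ids are hypothetical):

    #include "pipeline/stage.hpp"

    void resume(spades::StageManager &manager,
                debruijn_graph::conj_graph_pack &gp) {
        // Skips all stages before "simplification", then, inside that composite
        // stage, loads the saves of the phase preceding "cleanup" and continues
        // from "cleanup".
        manager.run(gp, "simplification:cleanup");
    }
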
diff --git a/src/modules/pipeline/stage.hpp b/src/modules/pipeline/stage.hpp
new file mode 100644
index 0000000..11aa8a2
--- /dev/null
+++ b/src/modules/pipeline/stage.hpp
@@ -0,0 +1,165 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#ifndef __STAGE_HPP__
+#define __STAGE_HPP__
+
+#include "pipeline/graph_pack.hpp"
+
+#include <vector>
+#include <memory>
+
+namespace spades {
+
+class StageManager;
+
+class AssemblyStage {
+public:
+ AssemblyStage(const char *name, const char *id)
+ : name_(name), id_(id), parent_(NULL) { }
+
+ virtual ~AssemblyStage() { }
+
+ AssemblyStage(const AssemblyStage &) = delete;
+
+ AssemblyStage &operator=(const AssemblyStage &) = delete;
+
+ const char *name() const { return name_; }
+
+ const char *id() const { return id_; }
+
+ virtual void load(debruijn_graph::conj_graph_pack &, const std::string &load_from, const char *prefix = NULL);
+
+ virtual void save(const debruijn_graph::conj_graph_pack &, const std::string &save_to,
+ const char *prefix = NULL) const;
+
+ virtual void run(debruijn_graph::conj_graph_pack &, const char *started_from = NULL) = 0;
+
+private:
+ const char *name_;
+ const char *id_;
+
+protected:
+ const StageManager *parent_;
+
+ friend class StageManager;
+};
+
+class CompositeStageBase : public AssemblyStage {
+public:
+ class PhaseBase : public AssemblyStage {
+ public:
+ PhaseBase(const char *name, const char *id)
+ : AssemblyStage(name, id), parent_stage_(NULL) { }
+
+ protected:
+ CompositeStageBase *parent_stage_;
+
+ friend class CompositeStageBase;
+ };
+
+ CompositeStageBase(const char *name, const char *id)
+ : AssemblyStage(name, id) { }
+
+ CompositeStageBase *add(PhaseBase *phase) {
+ phases_.push_back(std::unique_ptr<PhaseBase>(phase));
+ phase->parent_stage_ = this;
+
+ return this;
+ }
+
+ CompositeStageBase *add(std::initializer_list<PhaseBase *> phases) {
+ for (auto it = phases.begin(), et = phases.end(); it != et; ++it)
+ add(*it);
+
+ return this;
+ }
+
+ void run(debruijn_graph::conj_graph_pack &gp, const char * = NULL);
+
+private:
+ std::vector<std::unique_ptr<PhaseBase> > phases_;
+};
+
+template<class Storage>
+class CompositeStage : public CompositeStageBase {
+public:
+ class Phase : public PhaseBase {
+ public:
+ Phase(const char *name, const char *id)
+ : PhaseBase(name, id) { }
+
+ CompositeStage<Storage> *parent() { return static_cast<CompositeStage<Storage> *>(parent_stage_); }
+
+ const CompositeStage<Storage> *parent() const { return static_cast<const CompositeStage<Storage> *>(parent_stage_); }
+
+ Storage &storage() { return parent()->storage(); }
+
+ const Storage &storage() const { return parent()->storage(); }
+ };
+
+ CompositeStage(const char *name, const char *id)
+ : CompositeStageBase(name, id) { }
+
+ Storage &storage() { return storage_; }
+
+ const Storage &storage() const { return storage_; }
+
+private:
+ Storage storage_;
+};
+
+class StageManager {
+
+public:
+ struct SavesPolicy {
+ bool make_saves_;
+ std::string load_from_;
+ std::string save_to_;
+
+ SavesPolicy()
+ : make_saves_(false), load_from_(""), save_to_("") { }
+
+ SavesPolicy(bool make_saves, const std::string &load_from, const std::string &save_to)
+ : make_saves_(make_saves), load_from_(load_from), save_to_(save_to) { }
+ };
+
+ StageManager(SavesPolicy policy = SavesPolicy())
+ : saves_policy_(policy) { }
+
+ StageManager &add(AssemblyStage *stage) {
+ stages_.push_back(std::unique_ptr<AssemblyStage>(stage));
+ stages_.back()->parent_ = this;
+
+ return *this;
+ }
+
+ StageManager &add(std::initializer_list<AssemblyStage *> stages) {
+ for (auto it = stages.begin(), et = stages.end(); it != et; ++it)
+ add(*it);
+
+ return *this;
+ }
+
+ void run(debruijn_graph::conj_graph_pack &g,
+ const char *start_from = NULL);
+
+ const SavesPolicy &saves_policy() const {
+ return saves_policy_;
+ }
+
+private:
+ std::vector<std::unique_ptr<AssemblyStage> > stages_;
+ SavesPolicy saves_policy_;
+
+ DECL_LOGGER("StageManager");
+};
+
+
+} // namespace spades
+
+#endif // __STAGE_HPP__
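
A minimal wiring sketch for these classes, assuming the surrounding SPAdes
headers; MyStage and the save directories are hypothetical:

    #include "pipeline/stage.hpp"

    // A trivial stand-alone stage (hypothetical).
    class MyStage : public spades::AssemblyStage {
    public:
        MyStage() : AssemblyStage("My Stage", "my_stage") {}
        void run(debruijn_graph::conj_graph_pack &, const char *) override {
            // ... the actual work on the graph pack would go here ...
        }
    };

    void build_and_run(debruijn_graph::conj_graph_pack &gp) {
        spades::StageManager manager(
            spades::StageManager::SavesPolicy(/*make_saves*/ true,
                                              "saves", "saves"));
        manager.add(new MyStage());   // the manager takes ownership
        manager.run(gp);              // or manager.run(gp, "my_stage") to resume
    }
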
diff --git a/src/modules/stages/CMakeLists.txt b/src/modules/stages/CMakeLists.txt
new file mode 100644
index 0000000..f1c3ca8
--- /dev/null
+++ b/src/modules/stages/CMakeLists.txt
@@ -0,0 +1,12 @@
+############################################################################
+# Copyright (c) 2015 Saint Petersburg State University
+# Copyright (c) 2011-2014 Saint Petersburg Academic University
+# All Rights Reserved
+# See file LICENSE for details.
+############################################################################
+
+project(stages CXX)
+
+add_library(stages STATIC
+ construction.cpp simplification.cpp)
+
diff --git a/src/modules/stages/construction.cpp b/src/modules/stages/construction.cpp
new file mode 100644
index 0000000..86bd711
--- /dev/null
+++ b/src/modules/stages/construction.cpp
@@ -0,0 +1,69 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#include "io/reads_io/vector_reader.hpp"
+#include "io/dataset_support/dataset_readers.hpp"
+#include "pipeline/graph_pack.hpp"
+#include "io/dataset_support/read_converter.hpp"
+
+#include "algorithms/graph_construction.hpp"
+#include "assembly_graph/stats/picture_dump.hpp"
+#include "construction.hpp"
+
+namespace debruijn_graph {
+
+template<class Read>
+void construct_graph(io::ReadStreamList<Read>& streams,
+ conj_graph_pack& gp, io::SingleStreamPtr contigs_stream = io::SingleStreamPtr()) {
+ config::debruijn_config::construction params = cfg::get().con;
+ params.early_tc.enable &= !cfg::get().gap_closer_enable;
+
+ ReadStatistics stats = ConstructGraphWithCoverage(params, streams, gp.g,
+ gp.index, gp.flanking_cov, contigs_stream);
+ size_t rl = stats.max_read_length_;
+
+ if (!cfg::get().ds.RL()) {
+ INFO("Figured out: read length = " << rl);
+ cfg::get_writable().ds.set_RL(rl);
+ cfg::get_writable().ds.set_aRL(1.0 * stats.bases_ / stats.reads_);
+ } else if (cfg::get().ds.RL() != rl)
+ WARN("In datasets.info, wrong RL is specified: " << cfg::get().ds.RL() << ", not " << rl);
+}
+
+void Construction::run(conj_graph_pack &gp, const char*) {
+ // Has to be a separate stream so these contigs are not counted in coverage
+ io::ReadStreamList<io::SingleRead> trusted_contigs;
+ if (cfg::get().use_additional_contigs) {
+ INFO("Contigs from previous K will be used");
+ trusted_contigs.push_back(io::EasyStream(cfg::get().additional_contigs, true));
+ }
+
+ bool trusted_contigs_exist = false;
+ for (const auto& lib : cfg::get().ds.reads) {
+ if (lib.type() != io::LibraryType::TrustedContigs)
+ continue;
+
+ for (const auto& read : lib.single_reads()) {
+ trusted_contigs.push_back(io::EasyStream(read, true));
+ trusted_contigs_exist = true;
+ }
+ }
+
+ if (trusted_contigs_exist)
+ INFO("Trusted contigs will be used in graph construction");
+ auto contigs_stream = MultifileWrap(trusted_contigs);
+
+ std::vector<size_t> libs_for_construction;
+ for (size_t i = 0; i < cfg::get().ds.reads.lib_count(); ++i)
+ if (cfg::get().ds.reads[i].is_graph_contructable())
+ libs_for_construction.push_back(i);
+
+ auto streams = single_binary_readers_for_libs(libs_for_construction, true, true);
+ construct_graph<io::SingleReadSeq>(streams, gp, contigs_stream);
+}
+
+} //namespace debruijn_graph
diff --git a/src/modules/stages/construction.hpp b/src/modules/stages/construction.hpp
new file mode 100644
index 0000000..574d042
--- /dev/null
+++ b/src/modules/stages/construction.hpp
@@ -0,0 +1,23 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "pipeline/stage.hpp"
+
+namespace debruijn_graph {
+
+class Construction : public spades::AssemblyStage {
+public:
+ Construction()
+ : AssemblyStage("Construction", "construction") { }
+
+ void run(conj_graph_pack &gp, const char *);
+};
+
+}
+
diff --git a/src/modules/stages/simplification.cpp b/src/modules/stages/simplification.cpp
new file mode 100644
index 0000000..665e9d3
--- /dev/null
+++ b/src/modules/stages/simplification.cpp
@@ -0,0 +1,509 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#include "assembly_graph/graph_core/basic_graph_stats.hpp"
+#include "assembly_graph/graph_support/graph_processing_algorithm.hpp"
+#include "stages/simplification_pipeline/simplification_settings.hpp"
+#include "stages/simplification_pipeline/graph_simplification.hpp"
+#include "algorithms/simplification/parallel_simplification_algorithms.hpp"
+
+#include "simplification.hpp"
+
+namespace debruijn_graph {
+
+using namespace debruijn::simplification;
+using namespace config;
+
+class GraphSimplifier {
+ typedef std::function<void(EdgeId)> HandlerF;
+ typedef omnigraph::PersistentEdgeRemovingAlgorithm<Graph,
+ omnigraph::ParallelInterestingElementFinder<Graph, EdgeId>,
+ LengthComparator<Graph>> TipClipperT;
+ typedef omnigraph::PersistentEdgeRemovingAlgorithm<Graph,
+ omnigraph::ParallelInterestingElementFinder<Graph, EdgeId>,
+ CoverageComparator<Graph>> ECRemoverT;
+
+ typedef std::vector<std::pair<AlgoPtr<Graph>, std::string>> AlgoStorageT;
+
+ conj_graph_pack& gp_;
+ Graph& g_;
+ SimplifInfoContainer info_container_;
+ const debruijn_config::simplification simplif_cfg_;
+
+ CountingCallback<Graph> cnt_callback_;
+ HandlerF removal_handler_;
+ stats::detail_info_printer& printer_;
+
+// bool FastModeAvailable(const SimplifInfoContainer& info, double activation_cov_threshold) {
+// const auto& cfg = cfg::get();
+//
+// //todo fix logic
+// //also handles meta case for now
+// if (cfg.ds.single_cell) {
+// return !cfg::get().main_iteration;
+// }
+//
+// if (math::eq(info.detected_mean_coverage(), 0.) &&
+// !cfg.kcm.use_coverage_threshold) {
+// WARN("Mean coverage wasn't reliably estimated");
+// return false;
+// }
+//
+// //todo review logic
+// if (math::ls(info.detected_mean_coverage(), activation_cov_threshold) &&
+// !(cfg.kcm.use_coverage_threshold &&
+// math::ge(cfg.kcm.coverage_threshold, activation_cov_threshold))) {
+// INFO("Estimated mean coverage " << info.detected_mean_coverage() <<
+// " is less than fast mode activation coverage " << activation_cov_threshold);
+// return false;
+// }
+//
+// return true;
+// }
+
+ bool PerformInitCleaning() {
+
+ if (simplif_cfg_.init_clean.early_it_only && info_container_.main_iteration()) {
+ INFO("Most init cleaning disabled on main iteration");
+ return false;
+ }
+ if (math::ge(simplif_cfg_.init_clean.activation_cov, 0.)
+ && math::ls(info_container_.detected_mean_coverage(), simplif_cfg_.init_clean.activation_cov)) {
+ INFO("Most init cleaning disabled since detected mean " << info_container_.detected_mean_coverage()
+ << " was less than activation coverage " << simplif_cfg_.init_clean.activation_cov);
+ return false;
+ }
+
+ return true;
+ }
+
+ void RemoveShortPolyATEdges(size_t max_length,
+ HandlerF removal_handler = 0, size_t chunk_cnt = 1) {
+ INFO("Removing short polyAT");
+ EdgeRemover<Graph> er(g_, removal_handler);
+ ATCondition<Graph> condition(g_, 0.8, max_length, false);
+ for (auto iter = g_.SmartEdgeBegin(); !iter.IsEnd(); ++iter){
+ if (g_.length(*iter) == 1 && condition.Check(*iter)) {
+ er.DeleteEdgeWithNoCompression(*iter);
+ }
+ }
+ ParallelCompress(g_, chunk_cnt);
+ }
+
+ void InitialCleaning() {
+ INFO("PROCEDURE == InitialCleaning");
+
+ AlgoStorageT algos;
+
+ PushValid(
+ SelfConjugateEdgeRemoverInstance(g_,
+ simplif_cfg_.init_clean.self_conj_condition,
+ info_container_, removal_handler_),
+ "Self conjugate edge remover",
+ algos);
+
+ if (cfg::get().mode == config::pipeline_type::rna){
+ RemoveShortPolyATEdges(1, removal_handler_, info_container_.chunk_cnt());
+ PushValid(ShortPolyATEdgesRemoverInstance(g_, 1, removal_handler_, info_container_.chunk_cnt()), "Short PolyA/T Edges", algos);
+ PushValid(ATTipClipperInstance(g_, removal_handler_, info_container_.chunk_cnt()), "AT Tips", algos);
+ }
+
+ if (PerformInitCleaning()) {
+ PushValid(
+ IsolatedEdgeRemoverInstance(g_,
+ simplif_cfg_.init_clean.ier,
+ info_container_, removal_handler_),
+ "Initial isolated edge remover",
+ algos);
+
+ PushValid(
+ TipClipperInstance(g_,
+ debruijn_config::simplification::tip_clipper(simplif_cfg_.init_clean.tip_condition),
+ info_container_,
+ removal_handler_),
+ "Initial tip clipper",
+ algos);
+
+ PushValid(
+ ECRemoverInstance(g_,
+ debruijn_config::simplification::erroneous_connections_remover(simplif_cfg_.init_clean.ec_condition),
+ info_container_,
+ removal_handler_),
+ "Initial ec remover",
+ algos);
+
+ PushValid(
+ LowFlankDisconnectorInstance(g_, gp_.flanking_cov,
+ simplif_cfg_.init_clean.disconnect_flank_cov, info_container_,
+ removal_handler_),
+ "Disconnecting edges with low flanking coverage",
+ algos);
+ }
+
+ RunAlgos(algos);
+
+ //FIXME why called directly?
+ if (cfg::get().mode == config::pipeline_type::rna){
+ RemoveHiddenLoopEC(g_, gp_.flanking_cov, info_container_.detected_coverage_bound(), simplif_cfg_.her, removal_handler_);
+ cnt_callback_.Report();
+ }
+ }
+
+ bool AllTopology() {
+ bool res = TopologyRemoveErroneousEdges(gp_.g, simplif_cfg_.tec,
+ removal_handler_);
+ cnt_callback_.Report();
+ res |= TopologyReliabilityRemoveErroneousEdges(gp_.g, simplif_cfg_.trec,
+ removal_handler_);
+ cnt_callback_.Report();
+ res |= RemoveThorns(gp_.g, simplif_cfg_.isec, removal_handler_);
+ cnt_callback_.Report();
+ res |= MultiplicityCountingRemoveErroneousEdges(gp_.g, simplif_cfg_.tec,
+ removal_handler_);
+ cnt_callback_.Report();
+ return res;
+ }
+
+ bool FinalRemoveErroneousEdges() {
+
+ // gp.ClearQuality();
+ // gp.FillQuality();
+ // auto colorer = debruijn_graph::DefaultGPColorer(gp);
+ // omnigraph::DefaultLabeler<typename gp_t::graph_t> labeler(gp.g, gp.edge_pos);
+ // QualityEdgeLocalityPrintingRH<Graph> qual_removal_handler(gp.g, gp.edge_qual, labeler, colorer,
+ // cfg::get().output_dir + "pictures/colored_edges_deleted/");
+ //
+ // //positive quality edges removed (folder colored_edges_deleted)
+ // std::function<void(EdgeId)> qual_removal_handler_f = boost::bind(
+ // // &QualityLoggingRemovalHandler<Graph>::HandleDelete,
+ // &QualityEdgeLocalityPrintingRH<Graph>::HandleDelete,
+ // boost::ref(qual_removal_handler), _1);
+ //
+ // std::function<void(set<EdgeId>)> set_removal_handler_f = boost::bind(
+ // &omnigraph::simplification::SingleEdgeAdapter<set<EdgeId>>, _1, qual_removal_handler_f);
+ //
+
+ std::function<void(set<EdgeId>)> set_removal_handler_f(0);
+ if (removal_handler_) {
+ set_removal_handler_f = std::bind(
+ &omnigraph::simplification::SingleEdgeAdapter<set<EdgeId>>, std::placeholders::_1, removal_handler_);
+ }
+
+ bool changed = RemoveRelativelyLowCoverageComponents(gp_.g, gp_.flanking_cov,
+ simplif_cfg_.rcc, info_container_, set_removal_handler_f);
+
+ cnt_callback_.Report();
+
+ changed |= DisconnectRelativelyLowCoverageEdges(gp_.g, gp_.flanking_cov, simplif_cfg_.relative_ed);
+
+ if (simplif_cfg_.topology_simplif_enabled && info_container_.main_iteration()) {
+ changed |= AllTopology();
+ changed |= MaxFlowRemoveErroneousEdges(gp_.g, simplif_cfg_.mfec,
+ removal_handler_);
+ cnt_callback_.Report();
+ }
+ return changed;
+ }
+
+ void PostSimplification() {
+ INFO("PROCEDURE == Post simplification");
+ size_t iteration = 0;
+
+ AlgoStorageT algos;
+
+ PushValid(
+ TipClipperInstance(g_, simplif_cfg_.tc,
+ info_container_, removal_handler_),
+ "Tip clipper",
+ algos);
+
+ PushValid(
+ TipClipperInstance(g_, simplif_cfg_.final_tc,
+ info_container_, removal_handler_),
+ "Final tip clipper",
+ algos);
+
+ PushValid(
+ BRInstance(g_, simplif_cfg_.br,
+ info_container_, removal_handler_),
+ "Bulge remover",
+ algos);
+
+ PushValid(
+ BRInstance(g_, simplif_cfg_.final_br,
+ info_container_, removal_handler_),
+ "Final bulge remover",
+ algos);
+
+ if (simplif_cfg_.topology_simplif_enabled) {
+ PushValid(
+ TopologyTipClipperInstance(g_, simplif_cfg_.ttc,
+ info_container_, removal_handler_),
+ "Topology tip clipper",
+ algos);
+ }
+
+ //FIXME need better configuration
+
+ if (cfg::get().mode == config::pipeline_type::meta) {
+ PushValid(
+ BRInstance(g_, simplif_cfg_.second_final_br,
+ info_container_, removal_handler_),
+ "Yet another final bulge remover",
+ algos);
+ }
+
+ if (cfg::get().mode == config::pipeline_type::rna) {
+ PushValid(ATTipClipperInstance(g_, removal_handler_, info_container_.chunk_cnt()), "AT Tips", algos);
+ }
+
+ bool enable_flag = true;
+ while (enable_flag) {
+ enable_flag = false;
+
+ INFO("Iteration " << iteration);
+
+ enable_flag |= FinalRemoveErroneousEdges();
+ cnt_callback_.Report();
+
+ enable_flag |= ClipComplexTips(gp_.g, simplif_cfg_.complex_tc, info_container_, removal_handler_);
+ cnt_callback_.Report();
+
+ enable_flag |= RemoveComplexBulges(gp_.g, simplif_cfg_.cbr, iteration);
+ cnt_callback_.Report();
+
+ enable_flag |= RunAlgos(algos);
+
+ iteration++;
+
+ // printer(ipp_before_final_err_con_removal);
+ // printer(ipp_final_tip_clipping, str(format("_%d") % iteration));
+ // printer(ipp_final_err_con_removal, str(format("_%d") % iteration));
+ // printer(ipp_final_bulge_removal, str(format("_%d") % iteration));
+ }
+
+ //fixme move to AllTopology?
+ if (simplif_cfg_.topology_simplif_enabled) {
+ RemoveHiddenEC(gp_.g, gp_.flanking_cov, simplif_cfg_.her, info_container_, removal_handler_);
+
+ cnt_callback_.Report();
+ }
+
+ INFO("Disrupting self-conjugate edges");
+ SelfConjugateDisruptor<Graph>(gp_.g, removal_handler_).Run();
+ cnt_callback_.Report();
+ }
+
+ //inline
+ //void IdealSimplification(Graph& graph,
+ // std::function<double(EdgeId)> quality_handler_f) {
+ // for (auto iterator = graph.SmartEdgeBegin(); !iterator.IsEnd();
+ // ++iterator) {
+ // if (math::eq(quality_handler_f(*iterator), 0.))
+ // graph.DeleteEdge(*iterator);
+ // }
+ // CompressAllVertices(graph);
+ //}
+
+// std::shared_ptr<Predicate<EdgeId>> ParseCondition(const string& condition) const {
+// ConditionParser<Graph> parser(g_, condition, info_container_);
+// return parser();
+// }
+
+ void PushValid(const AlgoPtr<Graph>& algo_ptr, std::string comment, AlgoStorageT& algos) const {
+ if (algo_ptr) {
+ algos.push_back(std::make_pair(algo_ptr, comment));
+ }
+ }
+
+ bool RunAlgos(AlgoStorageT& algos, bool force_primary_launch = false) {
+ bool changed = false;
+ for (auto algo_comment : algos) {
+ INFO("Running " << algo_comment.second);
+ changed |= algo_comment.first->Run(force_primary_launch);
+ cnt_callback_.Report();
+ }
+ return changed;
+ }
+
+public:
+ GraphSimplifier(conj_graph_pack &gp, const SimplifInfoContainer& info_container,
+ const debruijn_config::simplification& simplif_cfg,
+ const std::function<void(EdgeId)>& removal_handler,
+ stats::detail_info_printer& printer)
+ : gp_(gp),
+ g_(gp_.g),
+ info_container_(info_container),
+ simplif_cfg_(simplif_cfg),
+ removal_handler_(AddCountingCallback(cnt_callback_, removal_handler)),
+ printer_(printer) {
+
+ }
+
+ void SimplifyGraph() {
+ printer_(info_printer_pos::before_simplification);
+ INFO("Graph simplification started");
+
+ InitialCleaning();
+
+ AlgoStorageT algos;
+
+ PushValid(
+ TipClipperInstance(g_, simplif_cfg_.tc, info_container_, removal_handler_, simplif_cfg_.cycle_iter_count),
+ "Tip clipper",
+ algos);
+ PushValid(
+ BRInstance(g_, simplif_cfg_.br, info_container_, removal_handler_, simplif_cfg_.cycle_iter_count),
+ "Bulge remover",
+ algos);
+ PushValid(
+ ECRemoverInstance(g_, simplif_cfg_.ec, info_container_, removal_handler_, simplif_cfg_.cycle_iter_count),
+ "Low coverage edge remover",
+ algos);
+
+ size_t iteration = 0;
+ bool graph_changed = true;
+ // cannot simply stop when nothing has changed, since thresholds change on every iteration
+ while (iteration < simplif_cfg_.cycle_iter_count || graph_changed) {
+ INFO("PROCEDURE == Simplification cycle, iteration " << iteration + 1);
+ graph_changed = RunAlgos(algos);
+ ++iteration;
+ }
+
+ printer_(info_printer_pos::before_post_simplification);
+
+ if (simplif_cfg_.post_simplif_enabled) {
+ PostSimplification();
+ } else {
+ INFO("PostSimplification disabled");
+ }
+ }
+};
+
+void Simplification::run(conj_graph_pack &gp, const char*) {
+ using namespace omnigraph;
+
+ //no other handlers here, todo change with DetachAll
+ gp.index.Detach();
+ gp.index.clear();
+
+ omnigraph::DefaultLabeler<Graph> labeler(gp.g, gp.edge_pos);
+
+ stats::detail_info_printer printer(gp, labeler, cfg::get().output_dir);
+
+ // QualityLoggingRemovalHandler<Graph> qual_removal_handler(gp.g, edge_qual);
+// auto colorer = debruijn_graph::DefaultGPColorer(gp);
+// QualityEdgeLocalityPrintingRH<Graph> qual_removal_handler(gp.g, gp.edge_qual, labeler, colorer,
+// cfg::get().output_dir + "pictures/colored_edges_deleted/");
+//
+// //positive quality edges removed (folder colored_edges_deleted)
+// std::function<void(EdgeId)> removal_handler_f = boost::bind(
+// // &QualityLoggingRemovalHandler<Graph>::HandleDelete,
+// &QualityEdgeLocalityPrintingRH<Graph>::HandleDelete,
+// boost::ref(qual_removal_handler), _1);
+
+
+ SimplifInfoContainer info_container;
+ info_container.set_read_length(cfg::get().ds.RL())
+ .set_main_iteration(cfg::get().main_iteration)
+ .set_chunk_cnt(5 * cfg::get().max_threads);
+
+ //0 if model didn't converge
+ //todo take max with trusted_bound
+ //FIXME add warning when used for uneven coverage applications
+ info_container.set_detected_mean_coverage(gp.ginfo.estimated_mean())
+ .set_detected_coverage_bound(gp.ginfo.ec_bound());
+
+ GraphSimplifier simplifier(gp, info_container,
+ preliminary_ ? *cfg::get().preliminary_simp : cfg::get().simp,
+ nullptr/*removal_handler_f*/,
+ printer);
+ simplifier.SimplifyGraph();
+}
+
+
+void SimplificationCleanup::run(conj_graph_pack &gp, const char*) {
+ SimplifInfoContainer info_container;
+ info_container
+ .set_read_length(cfg::get().ds.RL())
+ .set_main_iteration(cfg::get().main_iteration)
+ .set_chunk_cnt(5 * cfg::get().max_threads);
+
+ IsolatedEdgeRemoverInstance(gp.g, cfg::get().simp.ier, info_container, (HandlerF<Graph>)nullptr)->Run();
+
+ double low_threshold = gp.ginfo.trusted_bound();
+ if (math::gr(low_threshold, 0.0)) {
+ INFO("Removing all the edges having coverage " << low_threshold << " and less");
+ ParallelEdgeRemovingAlgorithm<Graph, CoverageComparator<Graph>>
+ cov_cleaner(gp.g,
+ CoverageUpperBound<Graph>(gp.g, low_threshold),
+ info_container.chunk_cnt(),
+ (HandlerF<Graph>)nullptr,
+ /*canonical_only*/true,
+ CoverageComparator<Graph>(gp.g));
+ cov_cleaner.Run();
+ }
+
+ omnigraph::DefaultLabeler<Graph> labeler(gp.g, gp.edge_pos);
+ stats::detail_info_printer printer(gp, labeler, cfg::get().output_dir);
+ printer(info_printer_pos::final_simplified);
+
+ DEBUG("Graph simplification finished");
+
+ INFO("Counting average coverage");
+ AvgCovereageCounter<Graph> cov_counter(gp.g);
+
+ cfg::get_writable().ds.set_avg_coverage(cov_counter.Count());
+
+ INFO("Average coverage = " << cfg::get().ds.avg_coverage());
+ if (!cfg::get().uneven_depth) {
+ if (cfg::get().ds.avg_coverage() < gp.ginfo.ec_bound())
+ WARN("The determined erroneous connection coverage threshold may be determined improperly\n");
+ }
+}
+
+
+#if 0
+void corrected_and_save_reads(const conj_graph_pack& gp) {
+ //saving corrected reads
+ //todo read input files, correct, save and use on the next iteration
+
+ auto_ptr<io::IReader<io::PairedReadSeq>> paired_stream =
+ paired_binary_multireader(false, /*insert_size*/0);
+ io::ModifyingWrapper<io::PairedReadSeq> refined_paired_stream(
+ *paired_stream,
+ GraphReadCorrectorInstance(gp.g, *MapperInstance(gp)));
+
+ auto_ptr<io::IReader<io::SingleReadSeq>> single_stream =
+ single_binary_multireader(false, /*include_paired_reads*/false);
+ io::ModifyingWrapper<io::SingleReadSeq> refined_single_stream(
+ *single_stream,
+ GraphReadCorrectorInstance(gp.g, *MapperInstance(gp)));
+
+ if (cfg::get().graph_read_corr.binary) {
+ INFO("Correcting paired reads");
+
+ io::BinaryWriter paired_converter(
+ cfg::get().paired_read_prefix + "_cor", cfg::get().max_threads,
+ cfg::get().buffer_size);
+ paired_converter.ToBinary(refined_paired_stream);
+
+ INFO("Correcting single reads");
+ io::BinaryWriter single_converter(
+ cfg::get().single_read_prefix + "_cor", cfg::get().max_threads,
+ cfg::get().buffer_size);
+ single_converter.ToBinary(refined_single_stream);
+ } else {
+ //save in fasta
+ VERIFY(false);
+ }
+
+ INFO("Error correction done");
+}
+#endif
+
+} //debruijn_graph
diff --git a/src/modules/stages/simplification.hpp b/src/modules/stages/simplification.hpp
new file mode 100644
index 0000000..dfc3bd9
--- /dev/null
+++ b/src/modules/stages/simplification.hpp
@@ -0,0 +1,34 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "pipeline/stage.hpp"
+
+namespace debruijn_graph {
+
+class Simplification : public spades::AssemblyStage {
+ const bool preliminary_;
+public:
+ Simplification(bool preliminary = false)
+ : AssemblyStage(preliminary ? "Preliminary Simplification" : "Simplification",
+ preliminary ? "simplification_preliminary" : "simplification"),
+ preliminary_(preliminary) { }
+
+ void run(conj_graph_pack &gp, const char *);
+};
+
+class SimplificationCleanup : public spades::AssemblyStage {
+public:
+ SimplificationCleanup()
+ : AssemblyStage("Simplification Cleanup", "simplification_cleanup") { }
+
+ void run(conj_graph_pack &gp, const char *);
+};
+
+}
+
diff --git a/src/modules/stages/simplification_pipeline/graph_simplification.hpp b/src/modules/stages/simplification_pipeline/graph_simplification.hpp
new file mode 100644
index 0000000..cd9d9d4
--- /dev/null
+++ b/src/modules/stages/simplification_pipeline/graph_simplification.hpp
@@ -0,0 +1,978 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+/*
+ * graph_simplification.hpp
+ *
+ * Created on: Aug 12, 2011
+ * Author: sergey
+ */
+
+#pragma once
+
+#include "pipeline/config_struct.hpp"
+
+#include "algorithms/simplification/tip_clipper.hpp"
+#include "algorithms/simplification/complex_tip_clipper.hpp"
+#include "algorithms/simplification/bulge_remover.hpp"
+#include "algorithms/simplification/complex_bulge_remover.hpp"
+#include "algorithms/simplification/erroneous_connection_remover.hpp"
+#include "algorithms/simplification/relative_coverage_remover.hpp"
+#include "algorithms/simplification/mf_ec_remover.hpp"
+#include "algorithms/simplification/parallel_simplification_algorithms.hpp"
+#include "stages/simplification_pipeline/simplification_settings.hpp"
+#include "stages/simplification_pipeline/single_cell_simplification.hpp"
+
+#include "algorithms/graph_read_correction.hpp"
+
+#include "assembly_graph/graph_support/chimera_stats.hpp"
+#include "assembly_graph/graph_support/basic_edge_conditions.hpp"
+#include "assembly_graph/stats/picture_dump.hpp"
+#include "assembly_graph/graph_support/parallel_processing.hpp"
+#include "assembly_graph/graph_support/detail_coverage.hpp"
+
+#include "assembly_graph/graph_core/graph.hpp"
+
+#include "visualization/graph_colorer.hpp"
+#include "dev_support/standard_base.hpp"
+
+namespace debruijn {
+
+namespace simplification {
+
+//todo remove this line
+using namespace debruijn_graph;
+
+template<class Graph>
+using AlgoPtr = std::shared_ptr<omnigraph::PersistentAlgorithmBase<Graph>>;
+
+template<class Graph>
+using EdgeConditionT = pred::TypedPredicate<typename Graph::EdgeId>;
+
+template<class Graph>
+class ConditionParser {
+private:
+ typedef typename Graph::EdgeId EdgeId;
+
+ const Graph& g_;
+ string next_token_;
+ string input_;
+ const SimplifInfoContainer settings_;
+ size_t curr_iteration_;
+ size_t iteration_cnt_;
+ std::queue<string> tokenized_input_;
+
+ size_t max_length_bound_;
+ double max_coverage_bound_;
+
+ string ReadNext() {
+ if (!tokenized_input_.empty()) {
+ next_token_ = tokenized_input_.front();
+ tokenized_input_.pop();
+ } else {
+ next_token_ = "";
+ }
+ return next_token_;
+ }
+
+ template<typename T>
+ bool RelaxMax(T& cur_max, T t) {
+ if (t > cur_max) {
+ cur_max = t;
+ return true;
+ }
+ return false;
+ }
+
+ template<typename T>
+ bool RelaxMin(T& cur_min, T t) {
+ if (t < cur_min) {
+ cur_min = t;
+ return true;
+ }
+ return false;
+ }
+
+ double GetCoverageBound() {
+ if (next_token_ == "auto") {
+ return settings_.detected_coverage_bound();
+ } else {
+ return std::stod(next_token_);
+ }
+ }
+
+ pred::TypedPredicate<EdgeId> ParseCondition(size_t& min_length_bound,
+ double& min_coverage_bound) {
+ if (next_token_ == "tc_lb") {
+ double length_coeff = std::stod(ReadNext());
+
+ DEBUG("Creating tip length bound. Coeff " << length_coeff);
+ size_t length_bound = LengthThresholdFinder::MaxTipLength(
+ settings_.read_length(), g_.k(), length_coeff);
+
+ DEBUG("Length bound " << length_bound);
+
+ RelaxMin(min_length_bound, length_bound);
+ DEBUG("Min length bound - " << min_length_bound);
+ return LengthUpperBound<Graph>(g_, length_bound);
+
+ } else if (next_token_ == "rlmk") {
+ //Read length minus k
+ VERIFY_MSG(settings_.read_length() > g_.k(), "Read length was shorter than K");
+ DEBUG("Creating (rl - k) bound");
+ size_t length_bound = settings_.read_length() - g_.k();
+ RelaxMin(min_length_bound, length_bound);
+ DEBUG("Min length bound - " << min_length_bound);
+ return LengthUpperBound<Graph>(g_, length_bound);
+
+ } else if (next_token_ == "to_ec_lb") {
+ double length_coeff = std::stod(ReadNext());
+
+ DEBUG( "Creating length bound for erroneous connections originated from tip merging. Coeff " << length_coeff);
+ size_t length_bound =
+ LengthThresholdFinder::MaxTipOriginatedECLength(
+ settings_.read_length(), g_.k(), length_coeff);
+
+ DEBUG("Length bound " << length_bound);
+
+ RelaxMin(min_length_bound, length_bound);
+ DEBUG("Min length bound - " << min_length_bound);
+ return LengthUpperBound<Graph>(g_, length_bound);
+
+ } else if (next_token_ == "ec_lb") {
+ size_t length_coeff = std::stoll(ReadNext());
+
+ DEBUG("Creating ec length bound. Coeff " << length_coeff);
+ size_t length_bound =
+ LengthThresholdFinder::MaxErroneousConnectionLength(
+ g_.k(), length_coeff);
+
+ DEBUG("Length bound " << length_bound);
+
+ RelaxMin(min_length_bound, length_bound);
+ DEBUG("Min length bound - " << min_length_bound);
+ return LengthUpperBound<Graph>(g_, length_bound);
+ } else if (next_token_ == "lb") {
+ size_t length_bound = std::stoll(ReadNext());
+
+ DEBUG("Creating length bound. Value " << length_bound);
+
+ RelaxMin(min_length_bound, length_bound);
+ DEBUG("Min length bound - " << min_length_bound);
+ return LengthUpperBound<Graph>(g_, length_bound);
+ } else if (next_token_ == "cb") {
+ ReadNext();
+ double cov_bound = GetCoverageBound();
+ DEBUG("Creating coverage upper bound " << cov_bound);
+ RelaxMin(min_coverage_bound, cov_bound);
+ return CoverageUpperBound<Graph>(g_, cov_bound);
+ } else if (next_token_ == "icb") {
+ VERIFY(iteration_cnt_ != -1ul && curr_iteration_ != -1ul);
+ ReadNext();
+ double cov_bound = GetCoverageBound();
+ cov_bound = cov_bound / (double) iteration_cnt_ * (double) (curr_iteration_ + 1);
+ DEBUG("Creating iterative coverage upper bound " << cov_bound);
+ RelaxMin(min_coverage_bound, cov_bound);
+ return CoverageUpperBound<Graph>(g_, cov_bound);
+ } else if (next_token_ == "rctc") {
+ ReadNext();
+ DEBUG("Creating relative cov tip cond " << next_token_);
+ return RelativeCoverageTipCondition<Graph>(g_, std::stod(next_token_));
+ } else if (next_token_ == "disabled") {
+ DEBUG("Creating disabling condition");
+ return pred::AlwaysFalse<EdgeId>();
+ } else if (next_token_ == "mmm") {
+ ReadNext();
+ DEBUG("Creating max mismatches cond " << next_token_);
+ return MismatchTipCondition<Graph>(g_, std::stoll(next_token_));
+ } else {
+ VERIFY_MSG(false, "Unknown token in simplification condition: " << next_token_);
+ return pred::AlwaysTrue<EdgeId>();
+ }
+ }
+
+ pred::TypedPredicate<EdgeId> ParseConjunction(size_t& min_length_bound,
+ double& min_coverage_bound) {
+ pred::TypedPredicate<EdgeId> answer = pred::AlwaysTrue<EdgeId>();
+ VERIFY(next_token_ == "{");
+ ReadNext();
+ while (next_token_ != "}") {
+ answer = pred::And(answer,
+ ParseCondition(min_length_bound, min_coverage_bound));
+ ReadNext();
+ }
+ return answer;
+ }
+
+public:
+
+ ConditionParser(const Graph& g, string input, const SimplifInfoContainer& settings,
+ size_t curr_iteration = -1ul, size_t iteration_cnt = -1ul)
+ : g_(g),
+ input_(input),
+ settings_(settings),
+ curr_iteration_(curr_iteration),
+ iteration_cnt_(iteration_cnt),
+ max_length_bound_(0),
+ max_coverage_bound_(0.) {
+ DEBUG("Creating parser for string " << input);
+ using namespace boost;
+ vector<string> tmp_tokenized_input;
+ boost::split(tmp_tokenized_input, input_, boost::is_any_of(" ,;"), boost::token_compress_on);
+ for (auto it = tmp_tokenized_input.begin();
+ it != tmp_tokenized_input.end(); ++it) {
+ tokenized_input_.push(*it);
+ }
+ ReadNext();
+ }
+
+ pred::TypedPredicate<EdgeId> operator()() {
+ DEBUG("Parsing");
+ pred::TypedPredicate<EdgeId> answer = pred::AlwaysFalse<EdgeId>();
+ VERIFY_MSG(next_token_ == "{", "Expected \"{\", but next token was " << next_token_);
+ while (next_token_ == "{") {
+ size_t min_length_bound = numeric_limits<size_t>::max();
+ double min_coverage_bound = numeric_limits<double>::max();
+ answer = pred::Or(answer,
+ ParseConjunction(min_length_bound, min_coverage_bound));
+ RelaxMax(max_length_bound_, min_length_bound);
+ RelaxMax(max_coverage_bound_, min_coverage_bound);
+ ReadNext();
+ }
+ return answer;
+ }
+
+ size_t max_length_bound() const {
+ return max_length_bound_;
+ }
+
+ double max_coverage_bound() const {
+ return max_coverage_bound_;
+ }
+
+private:
+ DECL_LOGGER("ConditionParser");
+};
+
+//todo move to visualization
+template<class graph_pack>
+shared_ptr<omnigraph::visualization::GraphColorer<typename graph_pack::graph_t>> DefaultGPColorer(
+ const graph_pack& gp) {
+ auto mapper = MapperInstance(gp);
+ auto path1 = mapper->MapSequence(gp.genome).path();
+ auto path2 = mapper->MapSequence(!gp.genome).path();
+ return omnigraph::visualization::DefaultColorer(gp.g, path1, path2);
+}
+
+template<class Graph>
+class EditDistanceTrackingCallback {
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::EdgeData EdgeData;
+ const Graph& g_;
+
+public:
+ EditDistanceTrackingCallback(const Graph& g)
+ : g_(g) {
+ }
+
+ bool operator()(EdgeId edge, const vector<EdgeId>& path) const {
+ vector<Sequence> path_sequences;
+ for (auto it = path.begin(); it != path.end(); ++it) {
+ path_sequences.push_back(g_.EdgeNucls(*it));
+ }
+ Sequence path_sequence(
+ MergeOverlappingSequences(path_sequences, g_.k()));
+ size_t dist = EditDistance(g_.EdgeNucls(edge), path_sequence);
+ TRACE( "Bulge sequences with distance " << dist << " were " << g_.EdgeNucls(edge) << " and " << path_sequence);
+ return true;
+ }
+
+private:
+ DECL_LOGGER("EditDistanceTrackingCallback")
+ ;
+};
+
+//template<class Graph, class SmartEdgeIt>
+//bool ClipTips(
+// Graph& g,
+// SmartEdgeIt& it,
+// const config::debruijn_config::simplification::tip_clipper& tc_config,
+// const SimplifInfoContainer& info,
+// std::function<void(typename Graph::EdgeId)> removal_handler = 0) {
+//
+// INFO("Clipping tips");
+//
+// string condition_str = tc_config.condition;
+//
+// ConditionParser<Graph> parser(g, condition_str, info);
+// auto condition = parser();
+//
+// omnigraph::EdgeRemovingAlgorithm<Graph> tc(g,
+// omnigraph::AddTipCondition(g, condition),
+// removal_handler, true);
+//
+// TRACE("Tip length bound " << parser.max_length_bound());
+// return tc.RunFromIterator(it,
+// make_shared<LengthUpperBound<Graph>>(g, parser.max_length_bound()));
+//}
+
+//template<class Graph>
+//bool ClipTips(
+// Graph& g,
+// const config::debruijn_config::simplification::tip_clipper& tc_config,
+// const SimplifInfoContainer& info,
+// std::function<void(typename Graph::EdgeId)> removal_handler = 0) {
+//
+// auto it = g.SmartEdgeBegin(LengthComparator<Graph>(g), true);
+// return ClipTips(g, it, tc_config, info, removal_handler);
+//}
+
+// Enables tip projection; todo: optimize if this becomes a hotspot
+template<class gp_t>
+HandlerF<typename gp_t::graph_t> WrapWithProjectionCallback(
+ gp_t& gp,
+ HandlerF<typename gp_t::graph_t> removal_handler) {
+ typedef typename gp_t::graph_t Graph;
+ typedef typename Graph::EdgeId EdgeId;
+ TipsProjector<gp_t> tip_projector(gp);
+
+ HandlerF<Graph> projecting_callback = std::bind(&TipsProjector<gp_t>::ProjectTip,
+ tip_projector, std::placeholders::_1);
+
+ return func::Composition<EdgeId>(std::ref(removal_handler), projecting_callback);
+}
+
+template<class Graph, class InterestingEdgeFinder>
+class LowCoverageEdgeRemovingAlgorithm : public PersistentEdgeRemovingAlgorithm<Graph,
+ InterestingEdgeFinder, CoverageComparator<Graph>> {
+ typedef typename Graph::EdgeId EdgeId;
+ typedef PersistentEdgeRemovingAlgorithm<Graph, InterestingEdgeFinder, CoverageComparator<Graph>> base;
+ SimplifInfoContainer simplif_info_;
+ std::string condition_str_;
+ pred::TypedPredicate<EdgeId> remove_condition_;
+ pred::TypedPredicate<EdgeId> proceed_condition_;
+
+protected:
+
+ void PrepareIteration(size_t it_cnt, size_t total_it_estimate) override {
+ TRACE("Preparing iteration " << it_cnt << " out of total estimate " << total_it_estimate);
+ ConditionParser<Graph> parser(this->g(), condition_str_,
+ simplif_info_, it_cnt, total_it_estimate);
+ remove_condition_ = omnigraph::AddAlternativesPresenceCondition(this->g(), parser());
+ TRACE("Updated remove condition");
+ proceed_condition_ = CoverageUpperBound<Graph>(this->g(), parser.max_coverage_bound());
+ TRACE("Updated proceed condition up to coverage " << parser.max_coverage_bound());
+ }
+
+ bool Proceed(EdgeId e) const override {
+ return proceed_condition_(e);
+ }
+
+ bool ShouldRemove(EdgeId e) const override {
+ return remove_condition_(e);
+ }
+
+public:
+ LowCoverageEdgeRemovingAlgorithm(Graph& g,
+ const InterestingEdgeFinder& interest_edge_finder,
+ const SimplifInfoContainer& simplif_info,
+ const std::string& condition_str,
+ std::function<void(EdgeId)> removal_handler = nullptr,
+ bool canonical_only = false,
+ bool track_changes = true,
+ size_t total_iteration_estimate = -1ul)
+ : base(g, interest_edge_finder,
+ removal_handler,
+ canonical_only,
+ CoverageComparator<Graph>(g),
+ track_changes,
+ total_iteration_estimate),
+ simplif_info_(simplif_info),
+ condition_str_(condition_str),
+ remove_condition_(pred::AlwaysFalse<EdgeId>()),
+ proceed_condition_(pred::AlwaysTrue<EdgeId>()) {}
+private:
+ DECL_LOGGER("LowCoverageEdgeRemovingAlgorithm");
+};
+
+template<class Graph>
+AlternativesAnalyzer<Graph> ParseBRConfig(const Graph& g,
+ const config::debruijn_config::simplification::bulge_remover& config) {
+ size_t max_length = LengthThresholdFinder::MaxBulgeLength(
+ g.k(), config.max_bulge_length_coefficient,
+ config.max_additive_length_coefficient);
+
+ DEBUG("Length bound " << max_length);
+
+ return AlternativesAnalyzer<Graph>(g, config.max_coverage,
+ max_length,
+ config.max_relative_coverage,
+ config.max_delta,
+ config.max_relative_delta,
+ config.max_number_edges);
+}
+
+template<class Graph>
+AlgoPtr<Graph> SelfConjugateEdgeRemoverInstance(Graph &g, const string& condition_str,
+ const SimplifInfoContainer& info,
+ HandlerF<Graph> removal_handler = 0) {
+ ConditionParser<Graph> parser(g, condition_str, info);
+ auto condition = pred::And(SelfConjugateCondition<Graph>(g), parser());
+
+ return std::make_shared<ParallelEdgeRemovingAlgorithm<Graph>>(g,
+ condition,
+ info.chunk_cnt(),
+ removal_handler,
+ /*canonical_only*/true);
+}
+
+template<class Graph>
+bool RemoveRelativelyLowCoverageComponents(
+ Graph &g,
+ const FlankingCoverage<Graph>& flanking_cov,
+ const config::debruijn_config::simplification::relative_coverage_comp_remover& rcc_config,
+ const SimplifInfoContainer& info,
+ typename ComponentRemover<Graph>::HandlerF removal_handler = 0) {
+ if (rcc_config.enabled) {
+ INFO("Removing relatively low covered connections");
+ size_t connecting_path_length_bound = LengthThresholdFinder::MaxErroneousConnectionLength(
+ g.k(), rcc_config.max_ec_length_coefficient);
+
+ std::string pics_dir = "";//cfg::get().output_dir + "rel_cov_components/"
+
+ double max_coverage = math::ge(rcc_config.max_coverage_coeff, 0.)
+ ? info.detected_coverage_bound() * rcc_config.max_coverage_coeff
+ : std::numeric_limits<double>::max();
+
+ omnigraph::simplification::relative_coverage::
+ RelativeCoverageComponentRemover<Graph> rel_rem(
+ g,
+ std::bind(&FlankingCoverage<Graph>::LocalCoverage,
+ std::cref(flanking_cov), std::placeholders::_1, std::placeholders::_2),
+ rcc_config.coverage_gap, size_t(double(info.read_length()) * rcc_config.length_coeff),
+ size_t(double(info.read_length()) * rcc_config.tip_allowing_length_coeff),
+ connecting_path_length_bound,
+ max_coverage,
+ removal_handler, rcc_config.vertex_count_limit, pics_dir);
+ return rel_rem.Run();
+ } else {
+ INFO("Removal of relatively low covered connections disabled");
+ return false;
+ }
+}
+
+template<class Graph>
+bool DisconnectRelativelyLowCoverageEdges(Graph &g,
+ const FlankingCoverage<Graph>& flanking_cov,
+ const config::debruijn_config::simplification::relative_coverage_edge_disconnector& rced_config) {
+ if (rced_config.enabled) {
+ INFO("Disconnecting edges with relatively low coverage");
+ omnigraph::simplification::relative_coverage::RelativeCoverageDisconnector<
+ Graph> disconnector(g, std::bind(&FlankingCoverage<Graph>::LocalCoverage,
+ std::cref(flanking_cov), std::placeholders::_1,
+ std::placeholders::_2), rced_config.diff_mult);
+ return disconnector.Run();
+ } else {
+ INFO("Disconnection of relatively low covered edges disabled");
+ return false;
+ }
+}
+
+template<class Graph>
+bool RemoveComplexBulges(
+ Graph& g,
+ config::debruijn_config::simplification::complex_bulge_remover cbr_config,
+ size_t /*iteration*/ = 0) {
+ if (!cbr_config.enabled)
+ return false;
+ INFO("Removing complex bulges");
+ size_t max_length = (size_t) ((double) g.k() * cbr_config.max_relative_length);
+ size_t max_diff = cbr_config.max_length_difference;
+ omnigraph::complex_br::ComplexBulgeRemover<Graph> complex_bulge_remover(
+ g, max_length, max_diff);
+ return complex_bulge_remover.Run();
+}
+
+//template<class Graph>
+//bool RemoveIsolatedEdges(Graph &g, size_t max_length, double max_coverage, size_t max_length_any_cov,
+// std::function<void(typename Graph::EdgeId)> removal_handler = 0, size_t chunk_cnt = 1) {
+// typedef typename Graph::EdgeId EdgeId;
+//
+// //todo add info that some other edges might be removed =)
+// INFO("Removing isolated edges");
+// INFO("All edges shorter than " << max_length_any_cov << " will be removed");
+// INFO("Also edges shorter than " << max_length << " and coverage smaller than " << max_coverage << " will be removed");
+// //todo add warn on max_length_any_cov > max_length
+//
+// auto condition = func::And<EdgeId>(
+// make_shared<IsolatedEdgeCondition<Graph>>(g),
+// func::Or<EdgeId>(
+// make_shared<LengthUpperBound<Graph>>(g, max_length_any_cov),
+// func::And<EdgeId>(
+// make_shared<LengthUpperBound<Graph>>(g, max_length),
+// make_shared<CoverageUpperBound<Graph>>(g, max_coverage)
+// )));
+//
+// if (chunk_cnt == 1) {
+// omnigraph::EdgeRemovingAlgorithm<Graph> removing_algo(g, condition, removal_handler);
+//
+// return removing_algo.Run(LengthComparator<Graph>(g),
+// make_shared<LengthUpperBound<Graph>>(g, std::max(max_length, max_length_any_cov)));
+// } else {
+// SemiParallelAlgorithmRunner<Graph, EdgeId> runner(g);
+// SemiParallelEdgeRemovingAlgorithm<Graph> removing_algo(g, condition, removal_handler);
+//
+// return RunEdgeAlgorithm(g, runner, removing_algo, chunk_cnt);
+// }
+//}
+
+template<class Graph>
+bool ClipComplexTips(Graph& g, config::debruijn_config::simplification::complex_tip_clipper ctc_conf, const SimplifInfoContainer& info, HandlerF<Graph> removal_handler = 0) {
+ if(!ctc_conf.enabled) {
+ INFO("Complex tip clipping disabled");
+ return false;
+ }
+
+ std::function<void(set<EdgeId>)> set_removal_handler_f(0);
+ if (removal_handler) {
+ set_removal_handler_f = std::bind(
+ &omnigraph::simplification::SingleEdgeAdapter<set<EdgeId>>, std::placeholders::_1, removal_handler);
+ }
+
+ INFO("Complex tip clipping");
+
+ ConditionParser<Graph> parser(g, ctc_conf.condition, info);
+ parser();
+
+ ComplexTipClipper<Graph> tip_clipper(g, ctc_conf.max_relative_coverage, ctc_conf.max_edge_len, parser.max_length_bound(), "", set_removal_handler_f);
+ return tip_clipper.Run();
+}
+
+template<class Graph>
+AlgoPtr<Graph> ShortPolyATEdgesRemoverInstance (Graph &g, size_t max_length, HandlerF<Graph> removal_handler = 0, size_t chunk_cnt = 1){
+ auto condition = pred::And(ATCondition<Graph>(g, 0.8, max_length, false), LengthUpperBound<Graph>(g, 1));
+ return std::make_shared<ParallelEdgeRemovingAlgorithm<Graph>>(g, condition, chunk_cnt, removal_handler, true);
+}
+
+template<class Graph>
+AlgoPtr<Graph> ATTipClipperInstance (Graph &g, HandlerF<Graph> removal_handler = 0, size_t chunk_cnt = 1) {
+//TODO: review params 0.8, 200?
+ return std::make_shared<ParallelEdgeRemovingAlgorithm<Graph>>(g, ATCondition<Graph>(g, 0.8, 200, true), chunk_cnt, removal_handler, true);
+}
+
+template<class Graph>
+AlgoPtr<Graph> IsolatedEdgeRemoverInstance(Graph &g,
+ config::debruijn_config::simplification::isolated_edges_remover ier,
+ const SimplifInfoContainer& info,
+ HandlerF<Graph> removal_handler = 0) {
+ if (!ier.enabled) {
+ return nullptr;
+ }
+ size_t max_length_any_cov = std::max(info.read_length(), ier.max_length_any_cov);
+
+ INFO("Removing isolated edges");
+ INFO("All isolated edges shorter than " << max_length_any_cov << " will be removed");
+ INFO("Also isolated edges shorter than " << ier.max_length << " and coverage smaller than " << ier.max_coverage << " will be removed");
+
+ auto condition = pred::And(IsolatedEdgeCondition<Graph>(g),
+ pred::Or(LengthUpperBound<Graph>(g, max_length_any_cov),
+ pred::And(LengthUpperBound<Graph>(g, ier.max_length),
+ CoverageUpperBound<Graph>(g, ier.max_coverage))));
+
+ return std::make_shared<ParallelEdgeRemovingAlgorithm<Graph>>(g,
+ condition,
+ info.chunk_cnt(),
+ removal_handler,
+ /*canonical_only*/true);
+}
+
+template<class Graph>
+pred::TypedPredicate<typename Graph::EdgeId> NecessaryBulgeCondition(const Graph& g,
+ const config::debruijn_config::simplification::bulge_remover& br_config,
+ const SimplifInfoContainer&) {
+ auto analyzer = ParseBRConfig(g, br_config);
+ return omnigraph::NecessaryBulgeCondition(g, analyzer.max_length(), analyzer.max_coverage());
+}
+
+template<class Graph>
+pred::TypedPredicate<typename Graph::EdgeId> NecessaryTipCondition(const Graph& g,
+ const config::debruijn_config::simplification::tip_clipper& tc_config,
+ const SimplifInfoContainer& info) {
+ ConditionParser<Graph> parser(g, tc_config.condition, info);
+ auto condition = parser();
+ return omnigraph::NecessaryTipCondition(g, parser.max_length_bound(),
+ parser.max_coverage_bound());
+}
+
+template<class Graph>
+pred::TypedPredicate<typename Graph::EdgeId> NecessaryECCondition(const Graph& g,
+ const config::debruijn_config::simplification::erroneous_connections_remover& ec_config,
+ const SimplifInfoContainer& info, size_t current_iteration = 0, size_t iteration_cnt = 1) {
+ ConditionParser<Graph> parser(g, ec_config.condition, info, current_iteration, iteration_cnt);
+ auto condition = parser();
+ return omnigraph::NecessaryECCondition(g, parser.max_length_bound(),
+ parser.max_coverage_bound());
+}
+
+template<class Graph>
+AlgoPtr<Graph> ECRemoverInstance(Graph& g,
+ const config::debruijn_config::simplification::erroneous_connections_remover& ec_config,
+ const SimplifInfoContainer& info,
+ HandlerF<Graph> removal_handler,
+ size_t iteration_cnt = 1) {
+ if (ec_config.condition.empty())
+ return nullptr;
+
+ typedef omnigraph::ParallelInterestingElementFinder<Graph> InterestingFinderT;
+ InterestingFinderT interesting_finder(g,
+ NecessaryECCondition(g, ec_config, info, iteration_cnt - 1, iteration_cnt),
+ info.chunk_cnt());
+ return make_shared<LowCoverageEdgeRemovingAlgorithm<Graph, InterestingFinderT>>(
+ g, interesting_finder, info, ec_config.condition, removal_handler,
+ /*canonical only*/ true, /*track changes*/ true, iteration_cnt);
+}
+
+template<class Graph>
+AlgoPtr<Graph> TipClipperInstance(Graph& g,
+ const EdgeConditionT<Graph>& condition,
+ const SimplifInfoContainer& info,
+ HandlerF<Graph> removal_handler,
+ bool track_changes = true,
+ size_t /*iteration_cnt*/ = 1) {
+ return make_shared<ParallelEdgeRemovingAlgorithm<Graph, LengthComparator<Graph>>>(g,
+ AddTipCondition(g, condition),
+ info.chunk_cnt(),
+ removal_handler,
+ /*canonical_only*/true,
+ LengthComparator<Graph>(g),
+ track_changes);
+}
+
+template<class Graph>
+AlgoPtr<Graph> TipClipperInstance(Graph& g,
+ const config::debruijn_config::simplification::tip_clipper& tc_config,
+ const SimplifInfoContainer& info,
+ HandlerF<Graph> removal_handler,
+ size_t iteration_cnt = 1) {
+ if (tc_config.condition.empty())
+ return nullptr;
+
+ ConditionParser<Graph> parser(g, tc_config.condition, info);
+ auto condition = parser();
+ return TipClipperInstance(g, condition, info, removal_handler, /*track changes*/true, iteration_cnt);
+}
+
+template<class Graph>
+AlgoPtr<Graph> TopologyTipClipperInstance(
+ Graph &g,
+ const config::debruijn_config::simplification::topology_tip_clipper& ttc_config,
+ const SimplifInfoContainer& info,
+ HandlerF<Graph> removal_handler) {
+
+ auto condition
+ = pred::And(LengthUpperBound<Graph>(g,
+ LengthThresholdFinder::MaxTipLength(info.read_length(), g.k(), ttc_config.length_coeff)),
+ DefaultUniquenessPlausabilityCondition<Graph>(g,
+ ttc_config.uniqueness_length, ttc_config.plausibility_length));
+
+ return TipClipperInstance(g,
+ condition, info, removal_handler, /*track changes*/false);
+}
+
+template<class Graph>
+AlgoPtr<Graph> BRInstance(Graph& g,
+ const config::debruijn_config::simplification::bulge_remover& br_config,
+ const SimplifInfoContainer& info,
+ HandlerF<Graph> removal_handler,
+ size_t /*iteration_cnt*/ = 1) {
+ typedef ParallelInterestingElementFinder<Graph,
+ typename Graph::EdgeId> InterestingEdgeFinder;
+ if (!br_config.enabled || (br_config.main_iteration_only && !info.main_iteration())) {
+ return nullptr;
+ }
+
+ auto alternatives_analyzer = ParseBRConfig(g, br_config);
+
+
+ InterestingEdgeFinder interesting_edge_finder(g,
+ NecessaryBulgeCondition(g,
+ alternatives_analyzer.max_length(),
+ alternatives_analyzer.max_coverage()),
+ info.chunk_cnt());
+ if (br_config.parallel) {
+ INFO("Creating parallel br instance");
+ return make_shared<ParallelBulgeRemover<Graph, InterestingEdgeFinder>>(g,
+ interesting_edge_finder,
+ br_config.buff_size,
+ br_config.buff_cov_diff,
+ br_config.buff_cov_rel_diff,
+ alternatives_analyzer,
+ nullptr,
+ removal_handler,
+ /*track_changes*/true);
+ } else {
+ INFO("Creating br instance");
+ return make_shared<BulgeRemover<Graph, InterestingEdgeFinder>>(g,
+ interesting_edge_finder,
+ alternatives_analyzer,
+ nullptr,
+ removal_handler,
+ /*track_changes*/true);
+ }
+}
+
+//todo make this also work for the ends of edges? switch to canonical iteration?
+//todo rename, since topology is also checked
+template<class Graph>
+class FlankingCovBound : public EdgeCondition<Graph> {
+ typedef EdgeCondition<Graph> base;
+ typedef typename Graph::EdgeId EdgeId;
+ const FlankingCoverage<Graph>& flanking_cov_;
+ double max_coverage_;
+public:
+ FlankingCovBound(const Graph& g,
+ const FlankingCoverage<Graph>& flanking_cov,
+ double max_coverage)
+ : base(g),
+ flanking_cov_(flanking_cov),
+ max_coverage_(max_coverage) {
+ }
+
+ bool Check(EdgeId e) const override {
+ return this->g().length(e) > 1
+ && this->g().OutgoingEdgeCount(this->g().EdgeStart(e)) > 1
+ && math::le(flanking_cov_.CoverageOfStart(e), max_coverage_);
+ }
+
+};
+
+template<class Graph, class Comparator = std::less<typename Graph::EdgeId>>
+class ParallelDisconnectionAlgorithm : public PersistentProcessingAlgorithm<Graph,
+ typename Graph::EdgeId,
+ ParallelInterestingElementFinder<Graph>, Comparator> {
+ typedef typename Graph::EdgeId EdgeId;
+ typedef PersistentProcessingAlgorithm<Graph, EdgeId,
+ ParallelInterestingElementFinder<Graph>, Comparator> base;
+ pred::TypedPredicate<EdgeId> condition_;
+ omnigraph::simplification::relative_coverage::EdgeDisconnector<Graph> disconnector_;
+
+public:
+ ParallelDisconnectionAlgorithm(Graph& g,
+ pred::TypedPredicate<EdgeId> condition,
+ size_t chunk_cnt,
+ HandlerF<Graph> removal_handler,
+ const Comparator& comp = Comparator(),
+ bool track_changes = true)
+ : base(g,
+ ParallelInterestingElementFinder<Graph>(g, condition, chunk_cnt),
+ /*canonical_only*/false, comp, track_changes),
+ condition_(condition),
+ disconnector_(g, removal_handler) {
+ }
+
+ bool Process(EdgeId e) override {
+ if (condition_(e)) {
+ disconnector_(e);
+ return true;
+ }
+ return false;
+ }
+
+};
+
+template<class Graph>
+AlgoPtr<Graph> LowFlankDisconnectorInstance(Graph& g,
+ const FlankingCoverage<Graph>& flanking_cov,
+ double cov_bound,
+ const SimplifInfoContainer& info,
+ HandlerF<Graph> removal_handler) {
+ if (math::ls(cov_bound, 0.)) {
+ INFO("Flanking coverage based disconnection disabled");
+ return nullptr;
+ }
+
+ return make_shared<ParallelDisconnectionAlgorithm<Graph>>(g,
+ FlankingCovBound<Graph>(g, flanking_cov, cov_bound),
+ info.chunk_cnt(),
+ removal_handler);
+}
+
+template<class Graph>
+bool RemoveHiddenLoopEC(Graph& g,
+ const FlankingCoverage<Graph>& flanking_cov,
+ double determined_coverage_threshold,
+ config::debruijn_config::simplification::hidden_ec_remover her_config,
+ HandlerF<Graph> removal_handler) {
+ if (her_config.enabled) {
+ INFO("Removing loops and rc loops with erroneous connections");
+ ECLoopRemover<Graph> hc(g, flanking_cov,
+ determined_coverage_threshold,
+ her_config.relative_threshold, removal_handler);
+ bool res = hc.Run();
+ hc.PrintLoopStats();
+ return res;
+ }
+ return false;
+}
+
+
+////todo add chunk_cnt
+//template<class Graph>
+//bool ClipTips(
+// Graph& g,
+// const std::string& condition,
+// const SimplifInfoContainer& info,
+// std::function<void(typename Graph::EdgeId)> removal_handler = 0) {
+//
+// if (condition != "") {
+// ConditionParser<Graph> parser(g, condition, info);
+// auto condition = parser();
+// ParallelEdgeRemovingAlgorithm<Graph, LengthComparator<Graph>> algo(g,
+// AddTipCondition(g, condition),
+// info.chunk_cnt(),
+// removal_handler,
+// /*canonical_only*/true,
+// LengthComparator<Graph>(g));
+// return algo.Run();
+// } else {
+// return false;
+// }
+//}
+
+//template<class Graph>
+//bool RemoveLowCoverageEdges(
+// Graph& g,
+// const std::string& condition,
+// const SimplifInfoContainer& info,
+// std::function<void(typename Graph::EdgeId)> removal_handler = 0) {
+//
+// if (condition != "") {
+// ConditionParser<Graph> parser(g, condition, info);
+// auto condition = parser();
+// blahblahblah
+// ParallelEdgeRemovingAlgorithm<Graph, CoverageComparator<Graph>> algo(g,
+// condition,
+// info.chunk_cnt(),
+// removal_handler,
+// /*canonical_only*/true,
+// CoverageComparator<Graph>(g));
+// return algo.Run();
+// } else {
+// return false;
+// }
+//}
+
+
+//Parallel algo launch
+
+template<class Graph>
+void ParallelCompress(Graph& g, size_t chunk_cnt, bool loop_post_compression = true) {
+ INFO("Parallel compression");
+ debruijn::simplification::ParallelCompressor<Graph> compressor(g);
+ TwoStepAlgorithmRunner<Graph, typename Graph::VertexId> runner(g, false);
+ RunVertexAlgorithm(g, runner, compressor, chunk_cnt);
+
+ //have to call cleaner to get rid of new isolated vertices
+ omnigraph::Cleaner<Graph>(g, chunk_cnt).Run();
+
+ if (loop_post_compression) {
+ INFO("Launching post-compression to compress loops");
+ CompressAllVertices(g, chunk_cnt);
+ }
+}
+
+template<class Graph>
+bool ParallelClipTips(Graph& g,
+ const string& tip_condition,
+ const SimplifInfoContainer& info,
+ std::function<void(typename Graph::EdgeId)> removal_handler = 0) {
+ INFO("Parallel tip clipping");
+
+ string condition_str = tip_condition;
+
+ ConditionParser<Graph> parser(g, condition_str, info);
+
+ parser();
+
+ debruijn::simplification::ParallelTipClippingFunctor<Graph> tip_clipper(g,
+ parser.max_length_bound(), parser.max_coverage_bound(), removal_handler);
+
+ AlgorithmRunner<Graph, typename Graph::VertexId> runner(g);
+
+ RunVertexAlgorithm(g, runner, tip_clipper, info.chunk_cnt());
+
+ ParallelCompress(g, info.chunk_cnt());
+ //Cleaner is launched inside ParallelCompress
+ //CleanGraph(g, info.chunk_cnt());
+
+ return true;
+}
+
+//template<class Graph>
+//bool ParallelRemoveBulges(Graph& g,
+// const config::debruijn_config::simplification::bulge_remover& br_config,
+// size_t /*read_length*/,
+// std::function<void(typename Graph::EdgeId)> removal_handler = 0) {
+// INFO("Parallel bulge remover");
+//
+// size_t max_length = LengthThresholdFinder::MaxBulgeLength(
+// g.k(), br_config.max_bulge_length_coefficient,
+// br_config.max_additive_length_coefficient);
+//
+// DEBUG("Max bulge length " << max_length);
+//
+// debruijn::simplification::ParallelSimpleBRFunctor<Graph> bulge_remover(g,
+// max_length,
+// br_config.max_coverage,
+// br_config.max_relative_coverage,
+// br_config.max_delta,
+// br_config.max_relative_delta,
+// removal_handler);
+// for (VertexId v : g) {
+// bulge_remover(v);
+// }
+//
+// Compress(g);
+// return true;
+//}
+
+template<class Graph>
+bool ParallelEC(Graph& g,
+ const string& ec_condition,
+ const SimplifInfoContainer& info,
+ std::function<void(typename Graph::EdgeId)> removal_handler = 0) {
+ INFO("Parallel ec remover");
+
+ ConditionParser<Graph> parser(g, ec_condition, info);
+
+ auto condition = parser();
+
+ size_t max_length = parser.max_length_bound();
+ double max_coverage = parser.max_coverage_bound();
+
+ debruijn::simplification::CriticalEdgeMarker<Graph> critical_marker(g, info.chunk_cnt());
+ critical_marker.PutMarks();
+
+ debruijn::simplification::ParallelLowCoverageFunctor<Graph> ec_remover(g,
+ max_length,
+ max_coverage,
+ removal_handler);
+
+ TwoStepAlgorithmRunner<Graph, typename Graph::EdgeId> runner(g, true);
+
+ RunEdgeAlgorithm(g, runner, ec_remover, info.chunk_cnt());
+
+ critical_marker.ClearMarks();
+
+ ParallelCompress(g, info.chunk_cnt());
+ //Cleaner is called inside ParallelCompress
+ //CleanGraph(g, info.chunk_cnt());
+ return true;
+}
+
+}
+}
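
A note on the condition strings consumed by ConditionParser above: a condition is a
disjunction of brace-delimited groups, and within each group the atoms are conjoined,
e.g. a tip-clipper condition of the form "{ tc_lb 3.5, cb 1000000, rctc 2.0 }".
The following is a minimal standalone sketch of just that grammar, assuming only the
C++ standard library; the toy Edge type and the "lb"/"cb" atoms are illustrative and
not the SPAdes API, which builds pred::TypedPredicate objects over real graph edges
and supports many more atoms (tc_lb, to_ec_lb, ec_lb, rctc, icb, mmm, disabled, ...).

#include <cstddef>
#include <functional>
#include <iostream>
#include <queue>
#include <sstream>
#include <stdexcept>
#include <string>

struct Edge { std::size_t length; double coverage; };
using EdgePredicate = std::function<bool(const Edge&)>;

// Split on whitespace and drop trailing ','/';' separators, mimicking the
// boost::split(..., is_any_of(" ,;"), token_compress_on) call in the parser.
static std::queue<std::string> Tokenize(const std::string& s) {
    std::queue<std::string> tokens;
    std::istringstream iss(s);
    std::string token;
    while (iss >> token) {
        while (!token.empty() && (token.back() == ',' || token.back() == ';'))
            token.pop_back();
        if (!token.empty()) tokens.push(token);
    }
    return tokens;
}

// One atom: a name followed by its numeric argument ("lb 100", "cb 2.5").
static EdgePredicate ParseAtom(std::queue<std::string>& tokens, const std::string& name) {
    double value = std::stod(tokens.front()); tokens.pop();
    if (name == "lb") return [value](const Edge& e) { return e.length <= (std::size_t) value; };
    if (name == "cb") return [value](const Edge& e) { return e.coverage <= value; };
    throw std::runtime_error("unknown condition token: " + name);
}

// Outer loop ORs the "{ ... }" groups; the inner loop ANDs the atoms of one group.
static EdgePredicate ParseConditions(const std::string& input) {
    std::queue<std::string> tokens = Tokenize(input);
    EdgePredicate disjunction = [](const Edge&) { return false; };
    while (!tokens.empty() && tokens.front() == "{") {
        tokens.pop();
        EdgePredicate conjunction = [](const Edge&) { return true; };
        while (!tokens.empty() && tokens.front() != "}") {
            std::string name = tokens.front(); tokens.pop();
            EdgePredicate atom = ParseAtom(tokens, name);
            conjunction = [conjunction, atom](const Edge& e) { return conjunction(e) && atom(e); };
        }
        if (!tokens.empty()) tokens.pop();  // consume "}"
        disjunction = [disjunction, conjunction](const Edge& e) { return disjunction(e) || conjunction(e); };
    }
    return disjunction;
}

int main() {
    EdgePredicate p = ParseConditions("{ lb 100, cb 2.5 } { lb 20 }");
    std::cout << p(Edge{50, 1.0}) << p(Edge{150, 1.0}) << p(Edge{15, 100.0}) << "\n";  // prints 101
}
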
diff --git a/src/modules/stages/simplification_pipeline/simplification_settings.hpp b/src/modules/stages/simplification_pipeline/simplification_settings.hpp
new file mode 100644
index 0000000..9b074a0
--- /dev/null
+++ b/src/modules/stages/simplification_pipeline/simplification_settings.hpp
@@ -0,0 +1,105 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+namespace debruijn {
+
+namespace simplification {
+
+class LengthThresholdFinder {
+public:
+ static size_t MaxTipLength(size_t read_length, size_t k, double coeff) {
+ return std::max((size_t) math::round((double)std::min(k, read_length / 2) * coeff),
+ read_length);
+ }
+
+ static size_t MaxBulgeLength(size_t k, double coeff,
+ size_t additive_coeff) {
+ return std::max((size_t) math::round((double)k * coeff), k + additive_coeff);
+ }
+
+ static size_t MaxErroneousConnectionLength(size_t k, size_t param) {
+ return k + param;
+ }
+
+ static size_t MaxTipOriginatedECLength(size_t read_length, size_t k,
+ double coeff) {
+ return 2 * MaxTipLength(read_length, k, coeff) - 1;
+ }
+};
+
+//todo use GenomicInfo as field!
+class SimplifInfoContainer {
+ size_t read_length_;
+ double detected_mean_coverage_;
+ double detected_coverage_bound_;
+ bool main_iteration_;
+ size_t chunk_cnt_;
+
+public:
+ SimplifInfoContainer() :
+ read_length_(-1ul),
+ detected_mean_coverage_(-1.0),
+ detected_coverage_bound_(-1.0),
+ main_iteration_(false),
+ chunk_cnt_(-1ul) {
+ }
+
+ size_t read_length() const {
+ VERIFY(read_length_ != -1ul);
+ return read_length_;
+ }
+
+ double detected_mean_coverage() const {
+ VERIFY(math::ge(detected_mean_coverage_, 0.));
+ return detected_mean_coverage_;
+ }
+
+ double detected_coverage_bound() const {
+ VERIFY(math::ge(detected_coverage_bound_, 0.));
+ return detected_coverage_bound_;
+ }
+
+ bool main_iteration() const {
+ return main_iteration_;
+ }
+
+ size_t chunk_cnt() const {
+ VERIFY(chunk_cnt_ != -1ul);
+ return chunk_cnt_;
+ }
+
+ SimplifInfoContainer& set_read_length(size_t read_length) {
+ read_length_ = read_length;
+ return *this;
+ }
+
+ SimplifInfoContainer& set_detected_coverage_bound(double detected_coverage_bound) {
+ detected_coverage_bound_ = detected_coverage_bound;
+ return *this;
+ }
+
+ SimplifInfoContainer& set_detected_mean_coverage(double detected_mean_coverage) {
+ detected_mean_coverage_ = detected_mean_coverage;
+ return *this;
+ }
+
+ SimplifInfoContainer& set_main_iteration(bool main_iteration) {
+ main_iteration_ = main_iteration;
+ return *this;
+ }
+
+ SimplifInfoContainer& set_chunk_cnt(size_t chunk_cnt) {
+ chunk_cnt_ = chunk_cnt;
+ return *this;
+ }
+};
+
+}
+
+}
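
A worked instance of the bounds defined by LengthThresholdFinder above, restated with
the standard library only (math::round is replaced by std::llround here; the read
length, k and coefficient values are picked for illustration and are not SPAdes
defaults):

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <iostream>

static std::size_t MaxTipLength(std::size_t read_length, std::size_t k, double coeff) {
    return std::max((std::size_t) std::llround((double) std::min(k, read_length / 2) * coeff),
                    read_length);
}

static std::size_t MaxErroneousConnectionLength(std::size_t k, std::size_t param) {
    return k + param;
}

int main() {
    // read_length = 100, k = 55, coeff = 3.5:
    // min(55, 100 / 2) = 50; round(50 * 3.5) = 175; max(175, 100) = 175
    std::cout << MaxTipLength(100, 55, 3.5) << "\n";            // 175
    // the ec bound is simply k plus the configured coefficient: 55 + 30 = 85
    std::cout << MaxErroneousConnectionLength(55, 30) << "\n";  // 85
}
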
diff --git a/src/modules/stages/simplification_pipeline/single_cell_simplification.hpp b/src/modules/stages/simplification_pipeline/single_cell_simplification.hpp
new file mode 100644
index 0000000..49dbc27
--- /dev/null
+++ b/src/modules/stages/simplification_pipeline/single_cell_simplification.hpp
@@ -0,0 +1,110 @@
+#pragma once
+
+#include "pipeline/config_struct.hpp"
+#include "algorithms/simplification/erroneous_connection_remover.hpp"
+#include "algorithms/simplification/mf_ec_remover.hpp"
+#include "stages/simplification_pipeline/simplification_settings.hpp"
+#include "assembly_graph/graph_support/detail_coverage.hpp"
+
+namespace debruijn {
+namespace simplification {
+
+template<class Graph>
+bool TopologyRemoveErroneousEdges(
+ Graph &g,
+ const debruijn_graph::config::debruijn_config::simplification::topology_based_ec_remover& tec_config,
+ std::function<void(typename Graph::EdgeId)> removal_handler) {
+ INFO("Removing connections based on topology");
+ size_t max_length = LengthThresholdFinder::MaxErroneousConnectionLength(
+ g.k(), tec_config.max_ec_length_coefficient);
+
+ pred::TypedPredicate<typename Graph::EdgeId>
+ condition(omnigraph::DefaultUniquenessPlausabilityCondition<Graph>(g, tec_config.uniqueness_length, tec_config.plausibility_length));
+
+ return omnigraph::RemoveErroneousEdgesInLengthOrder(g, condition, max_length, removal_handler);
+}
+
+template<class Graph>
+bool MultiplicityCountingRemoveErroneousEdges(
+ Graph &g,
+ const debruijn_graph::config::debruijn_config::simplification::topology_based_ec_remover& tec_config,
+ std::function<void(typename Graph::EdgeId)> removal_handler) {
+ INFO("Removing connections based on topological multiplicity counting");
+ size_t max_length = LengthThresholdFinder::MaxErroneousConnectionLength(
+ g.k(), tec_config.max_ec_length_coefficient);
+
+ pred::TypedPredicate<typename Graph::EdgeId>
+ condition(omnigraph::MultiplicityCountingCondition<Graph>(g, tec_config.uniqueness_length,
+ /*plausibility*/ MakePathLengthLowerBound(g,
+ omnigraph::PlausiblePathFinder<Graph>(g, 2 * tec_config.plausibility_length), tec_config.plausibility_length)));
+
+ return omnigraph::RemoveErroneousEdgesInLengthOrder(g, condition, max_length, removal_handler);
+}
+
+template<class Graph>
+bool RemoveThorns(
+ Graph &g,
+ const debruijn_graph::config::debruijn_config::simplification::interstrand_ec_remover& isec_config,
+ std::function<void(typename Graph::EdgeId)> removal_handler) {
+ INFO("Removing interstrand connections");
+ size_t max_length = LengthThresholdFinder::MaxErroneousConnectionLength(
+ g.k(), isec_config.max_ec_length_coefficient);
+
+ auto condition
+ = pred::And(omnigraph::LengthUpperBound<Graph>(g, max_length),
+ omnigraph::ThornCondition<Graph>(g, isec_config.uniqueness_length, isec_config.span_distance));
+
+ return omnigraph::RemoveErroneousEdgesInCoverageOrder(g, condition, numeric_limits<double>::max(), removal_handler);
+}
+
+template<class Graph>
+bool TopologyReliabilityRemoveErroneousEdges(
+ Graph &g,
+ const debruijn_graph::config::debruijn_config::simplification::tr_based_ec_remover& trec_config,
+ std::function<void(typename Graph::EdgeId)> removal_handler) {
+ INFO("Removing connections based on topology and reliable coverage");
+ size_t max_length = LengthThresholdFinder::MaxErroneousConnectionLength(
+ g.k(), trec_config.max_ec_length_coefficient);
+
+ auto condition
+ = pred::And(omnigraph::CoverageUpperBound<Graph>(g, trec_config.unreliable_coverage),
+ omnigraph::PredicateUniquenessPlausabilityCondition<Graph>(g,
+ /*uniqueness*/omnigraph::MakePathLengthLowerBound(g, omnigraph::UniquePathFinder<Graph>(g), trec_config.uniqueness_length),
+ /*plausibility*/pred::AlwaysTrue<typename Graph::EdgeId>()));
+
+ return omnigraph::RemoveErroneousEdgesInLengthOrder(g, condition, max_length, removal_handler);
+}
+
+template<class Graph>
+bool MaxFlowRemoveErroneousEdges(
+ Graph &g,
+ const debruijn_graph::config::debruijn_config::simplification::max_flow_ec_remover& mfec_config,
+ omnigraph::HandlerF<Graph> removal_handler = 0) {
+ if (!mfec_config.enabled)
+ return false;
+ INFO("Removing connections based on max flow strategy");
+ size_t max_length = LengthThresholdFinder::MaxErroneousConnectionLength(
+ g.k(), (size_t) mfec_config.max_ec_length_coefficient);
+ omnigraph::MaxFlowECRemover<Graph> erroneous_edge_remover(
+ g, max_length, mfec_config.uniqueness_length,
+ mfec_config.plausibility_length, removal_handler);
+ return erroneous_edge_remover.Process();
+}
+
+template<class Graph>
+bool RemoveHiddenEC(Graph& g,
+ const debruijn_graph::FlankingCoverage<Graph>& flanking_cov,
+ const debruijn_graph::config::debruijn_config::simplification::hidden_ec_remover& her_config,
+ const SimplifInfoContainer& info,
+ omnigraph::HandlerF<Graph> removal_handler) {
+ if (her_config.enabled) {
+ INFO("Removing hidden erroneous connections");
+ return omnigraph::HiddenECRemover<Graph>(g, her_config.uniqueness_length, flanking_cov,
+ her_config.unreliability_threshold, info.detected_coverage_bound(),
+ her_config.relative_threshold, removal_handler).Run();
+ }
+ return false;
+}
+
+}
+}
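
The removers in this header share one shape: compose a predicate out of simple bounds
(length, coverage, topology) and sweep the edges in length or coverage order, deleting
those that satisfy it. Below is a toy sketch of that pattern under invented types,
assuming nothing from omnigraph; it is not the RemoveErroneousEdgesIn*Order
implementation itself.

#include <algorithm>
#include <cstddef>
#include <functional>
#include <iostream>
#include <vector>

struct Edge { int id; std::size_t length; double coverage; };
using Condition = std::function<bool(const Edge&)>;

static Condition And(Condition a, Condition b) {
    return [a, b](const Edge& e) { return a(e) && b(e); };
}

// Remove, in increasing coverage order, every edge satisfying the condition;
// return whether anything changed (mirroring the bool results returned above).
static bool RemoveInCoverageOrder(std::vector<Edge>& edges, const Condition& condition,
                                  const std::function<void(const Edge&)>& removal_handler) {
    std::sort(edges.begin(), edges.end(),
              [](const Edge& a, const Edge& b) { return a.coverage < b.coverage; });
    std::size_t before = edges.size();
    for (auto it = edges.begin(); it != edges.end();) {
        if (condition(*it)) {
            removal_handler(*it);
            it = edges.erase(it);
        } else {
            ++it;
        }
    }
    return edges.size() != before;
}

int main() {
    std::vector<Edge> edges = {{1, 40, 1.5}, {2, 500, 30.0}, {3, 60, 2.0}};
    Condition length_bound   = [](const Edge& e) { return e.length <= 100; };
    Condition coverage_bound = [](const Edge& e) { return e.coverage <= 2.0; };
    bool changed = RemoveInCoverageOrder(edges, And(length_bound, coverage_bound),
                                         [](const Edge& e) { std::cout << "removed edge " << e.id << "\n"; });
    std::cout << "changed: " << changed << ", remaining: " << edges.size() << "\n";  // edges 1 and 3 go
}
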
diff --git a/src/modules/visualization/graph_colorer.hpp b/src/modules/visualization/graph_colorer.hpp
new file mode 100644
index 0000000..234e1c1
--- /dev/null
+++ b/src/modules/visualization/graph_colorer.hpp
@@ -0,0 +1,340 @@
+#pragma once
+
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#include "assembly_graph/components/graph_component.hpp"
+#include "assembly_graph/paths/mapping_path.hpp"
+#include "visualization/printing_parameter_storage.hpp"
+//#include "edges_position_handler.hpp"
+
+namespace omnigraph {
+namespace visualization {
+
+template<typename ElementId>
+class ElementColorer : public virtual ParameterStorage<ElementId, string> {
+public:
+ template<typename Iter>
+ set<ElementId> ColoredWith(Iter begin, Iter end, const string &color) {
+ set<ElementId> result;
+ for(Iter it = begin; it != end; ++it) {
+ if(this->GetValue(*it) == color)
+ result.insert(*it);
+ }
+ return result;
+ }
+};
+
+//TODO remove all default color parameters!
+
+template<typename ElementId>
+class MapColorer : public ElementColorer<ElementId>, public MapParameterStorage<ElementId, string> {
+public:
+ MapColorer(const string &default_color) : MapParameterStorage<ElementId, string>(default_color) {
+ }
+
+ MapColorer(const map<ElementId, string> &color_map) : MapParameterStorage<ElementId, string>(color_map) {
+ }
+
+ MapColorer(const map<ElementId, string> &color_map, const string& default_color) : MapParameterStorage<ElementId, string>(color_map, default_color) {
+ }
+
+ template<class It>
+ MapColorer(It begin, It end, const string& color, const string& default_color) : MapParameterStorage<ElementId, string>(begin, end, color, default_color) {
+ }
+
+ virtual ~MapColorer() {
+ }
+};
+
+template<typename ElementId>
+class FixedColorer: public MapColorer<ElementId> {
+public:
+ FixedColorer(const string& default_color): MapColorer<ElementId>(default_color) {
+ }
+};
+
+template<class Graph>
+class SetColorer : public MapColorer<typename Graph::EdgeId> {
+private:
+ typedef typename Graph::VertexId VertexId;
+ typedef typename Graph::EdgeId EdgeId;
+ const Graph &graph_;
+
+ template<class It>
+ map<EdgeId, string> ConstructColorMap(It begin, It end, const string &color) {
+ map<EdgeId, string> result;
+ for (auto it = begin; it != end; ++it) {
+ result[*it] = color;
+ }
+ return result;
+ }
+
+public:
+ template<class It>
+ SetColorer(const Graph &graph, It begin, It end, const string &color) :
+ MapColorer<typename Graph::EdgeId>(ConstructColorMap(begin, end, color), "black"), graph_(graph) {
+ }
+
+ template<class Collection>
+ SetColorer(const Graph &graph, const Collection& c, const string &color) :
+ MapColorer<typename Graph::EdgeId>(ConstructColorMap(c.begin(), c.end(), color), "black"), graph_(graph) {
+ }
+
+};
+//
+//template<class Graph>
+//class PositionsEdgeColorer: public ElementColorer<typename Graph::EdgeId> {
+//private:
+// typedef typename Graph::VertexId VertexId;
+// typedef typename Graph::EdgeId EdgeId;
+// const Graph &graph_;
+// EdgesPositionHandler<Graph> &positions_;
+//public:
+// PositionsEdgeColorer(const Graph &graph, EdgesPositionHandler<Graph> &positions):
+// graph_(graph), positions_(positions) {
+// }
+// string GetValue(EdgeId element) const {
+// std::vector<EdgeId> path;
+// path.push_back(element);
+// if (positions_.GetEdgePositions(element).size() == 0) return "black";
+// else {
+// if (positions_.IsConsistentWithGenome(path)) return "green";
+// else return "orange";
+// }
+// }
+//};
+
+
+template<class Graph>
+class CompositeEdgeColorer: public ElementColorer<typename Graph::EdgeId> {
+private:
+ typedef typename Graph::VertexId VertexId;
+ typedef typename Graph::EdgeId EdgeId;
+ string default_color_;
+ vector<shared_ptr<ElementColorer<typename Graph::EdgeId>>> colorers_;
+
+ vector<string> CollectColors(EdgeId edge) const {
+ vector<string> result = {default_color_};
+ for(auto it = colorers_.begin(); it != colorers_.end(); ++it) {
+ string next_color = (*it)->GetValue(edge);
+ if(std::find(result.begin(), result.end(), next_color) == result.end())
+ result.push_back(next_color);
+ }
+ return result;
+ }
+
+ string ConstructColorString(const vector<string> &colors) const {
+ if(colors.size() == 1)
+ return default_color_;
+ string result = "";
+ for(size_t i = 1; i < colors.size(); i++)
+ result += ":" + colors[i];
+ return result.substr(1, result.size());
+ }
+
+public:
+ CompositeEdgeColorer(const string &default_color): default_color_(default_color) {
+ }
+
+ CompositeEdgeColorer(shared_ptr<ElementColorer<typename Graph::EdgeId>> colorer, const string &default_color): default_color_(default_color) {
+ AddColorer(colorer);
+ }
+
+ CompositeEdgeColorer(shared_ptr<ElementColorer<typename Graph::EdgeId>> colorer1, shared_ptr<ElementColorer<typename Graph::EdgeId>> colorer2,
+ const string &default_color): default_color_(default_color) {
+ AddColorer(colorer1);
+ AddColorer(colorer2);
+ }
+
+ void AddColorer(shared_ptr<ElementColorer<typename Graph::EdgeId>> colorer) {
+ colorers_.push_back(colorer);
+ }
+
+ string GetValue(EdgeId edge) const {
+ return ConstructColorString(CollectColors(edge));
+ }
+};
+
+template<class Graph>
+class GraphColorer : public ElementColorer<typename Graph::VertexId>, public ElementColorer<typename Graph::EdgeId>{
+public:
+ string GetValue(typename Graph::VertexId) const = 0;
+ string GetValue(typename Graph::EdgeId) const = 0;
+
+ template<typename Iter>
+ set<typename Iter::value_type> ColoredWith(Iter begin, Iter end, const string &color) {
+ return ElementColorer<typename Iter::value_type>::ColoredWith(begin, end, color);
+ }
+};
+
+template<class Graph>
+class DelegatingGraphColorer : public GraphColorer<Graph> {
+private:
+ const GraphColorer<Graph> &inner_colorer_;
+public:
+ DelegatingGraphColorer(const GraphColorer<Graph> &inner_colorer) : inner_colorer_(inner_colorer) {
+ }
+
+ string GetValue(typename Graph::VertexId v) const {
+ return inner_colorer_.GetValue(v);
+ }
+ string GetValue(typename Graph::EdgeId e) const {
+ return inner_colorer_.GetValue(e);
+ }
+};
+
+template<typename Graph>
+class BorderDecorator : public GraphColorer<Graph> {
+private:
+ typedef typename Graph::VertexId VertexId;
+ typedef typename Graph::EdgeId EdgeId;
+ const GraphComponent<Graph> &component_;
+// const shared_ptr<const ElementColorer<typename Graph::VertexId>> vertex_colorer_ptr_;
+// const shared_ptr<const ElementColorer<typename Graph::EdgeId>> edge_colorer_ptr_;
+ const ElementColorer<typename Graph::VertexId> &vertex_colorer_;
+ const ElementColorer<typename Graph::EdgeId> &edge_colorer_;
+ const string border_color_;
+public:
+// BorderDecorator(const GraphComponent<Graph> &component,
+// const shared_ptr<const GraphColorer<Graph>> colorer,
+// const string &border_color) :
+// component_(component), vertex_colorer_ptr_(colorer), edge_colorer_ptr_(
+// colorer), vertex_colorer_(*colorer), edge_colorer_(
+// *colorer), border_color_(border_color) {
+// }
+
+ BorderDecorator(const GraphComponent<Graph> &component,
+ const GraphColorer<Graph> &colorer, const string &border_color = "yellow") :
+ component_(component), vertex_colorer_(colorer), edge_colorer_(colorer), border_color_(border_color) {
+ }
+
+ string GetValue(VertexId v) const {
+ if(component_.IsBorder(v)) {
+ return border_color_;
+ } else {
+ return vertex_colorer_.GetValue(v);
+ }
+ }
+
+ string GetValue(EdgeId e) const {
+ return edge_colorer_.GetValue(e);
+ }
+
+ static shared_ptr<BorderDecorator<Graph>> GetInstance(const GraphComponent<Graph> &component,
+ const GraphColorer<Graph> &colorer, const string &border_color = "yellow") {
+ return make_shared<BorderDecorator<Graph>>(component, colorer, border_color);
+ }
+};
+
+
+template<typename Graph>
+class SinkSourceDecorator : public GraphColorer<Graph> {
+private:
+ typedef typename Graph::VertexId VertexId;
+ typedef typename Graph::EdgeId EdgeId;
+ const GraphComponent<Graph> &component_;
+// const shared_ptr<const ElementColorer<typename Graph::VertexId>> vertex_colorer_ptr_;
+// const shared_ptr<const ElementColorer<typename Graph::EdgeId>> edge_colorer_ptr_;
+ const ElementColorer<typename Graph::VertexId> &vertex_colorer_;
+ const ElementColorer<typename Graph::EdgeId> &edge_colorer_;
+ const string sink_color_;
+ const string source_color_;
+ const string sinksource_color_;
+public:
+
+ SinkSourceDecorator(const GraphComponent<Graph> &component,
+ const GraphColorer<Graph> &colorer, const string &sink_color = "red", const string &source_color = "orange", const string &sinksource_color = "green") :
+ component_(component), vertex_colorer_(colorer), edge_colorer_(colorer), sink_color_(sink_color), source_color_(source_color), sinksource_color_(sinksource_color) {
+ }
+
+ string GetValue(VertexId v) const {
+ if(component_.sinks().count(v) && !component_.sources().count(v)) {
+ return sink_color_;
+ }
+ if(component_.sources().count(v) && !component_.sinks().count(v))
+ {
+ return source_color_;
+ }
+ if(component_.sources().count(v) && component_.sinks().count(v))
+ {
+ return sinksource_color_;
+ }
+
+ return vertex_colorer_.GetValue(v);
+ }
+
+ string GetValue(EdgeId e) const {
+ return edge_colorer_.GetValue(e);
+ }
+
+ static shared_ptr<SinkSourceDecorator<Graph>> GetInstance(const GraphComponent<Graph> &component,
+ const GraphColorer<Graph> &colorer, const string &sink_color = "red", const string &source_color = "orange") {
+ return make_shared<SinkSourceDecorator<Graph>>(component, colorer, sink_color, source_color);
+ }
+};
+
+template<class Graph>
+class CompositeGraphColorer: public GraphColorer<Graph> {
+private:
+ typedef typename Graph::VertexId VertexId;
+ typedef typename Graph::EdgeId EdgeId;
+
+ const shared_ptr<ElementColorer<VertexId>> vertex_colorer_;
+ const shared_ptr<ElementColorer<EdgeId>> edge_colorer_;
+public:
+ CompositeGraphColorer(shared_ptr<ElementColorer<VertexId>> vertex_colorer
+ , shared_ptr<ElementColorer<EdgeId>> edge_colorer) :
+ vertex_colorer_(vertex_colorer),
+ edge_colorer_(edge_colorer) {
+ }
+
+// explicit CompositeGraphColorer(shared_ptr<ElementColorer<EdgeId>> edge_colorer = make_shared<FixedColorer<EdgeId>>("black")) :
+// vertex_colorer_(shared_ptr<ElementColorer<VertexId>>(new FixedColorer<VertexId>("white"))),
+// edge_colorer_(edge_colorer) {
+// }
+
+ string GetValue(VertexId v) const {
+ return vertex_colorer_->GetValue(v);
+ }
+
+ string GetValue(EdgeId e) const {
+ return edge_colorer_->GetValue(e);
+ }
+
+};
+
+
+
+// edge_colorer management is passed here
+//TODO check all usages
+template <class Graph>
+shared_ptr<GraphColorer<Graph>> DefaultColorer(const Graph& /*g*/,
+ shared_ptr<ElementColorer<typename Graph::EdgeId>> edge_colorer) {
+ return shared_ptr<GraphColorer<Graph>>(new CompositeGraphColorer<Graph>(make_shared<FixedColorer<typename Graph::VertexId>>("white"), edge_colorer));
+}
+
+template <class Graph>
+shared_ptr<GraphColorer<Graph>> DefaultColorer(const Graph& g,
+ const Path<typename Graph::EdgeId>& path1,
+ const Path<typename Graph::EdgeId>& path2) {
+ shared_ptr<ElementColorer<typename Graph::EdgeId>> edge_colorer =
+ make_shared<CompositeEdgeColorer<Graph>>(
+ make_shared<SetColorer<Graph>>(g, path1.sequence(), "red"),
+ make_shared<SetColorer<Graph>>(g, path2.sequence(), "blue"), "black");
+ return DefaultColorer(g, edge_colorer);
+}
+
+template<class Graph>
+shared_ptr<GraphColorer<Graph>> DefaultColorer(const Graph& /*g*/) {
+ return shared_ptr<GraphColorer<Graph>>(new CompositeGraphColorer<Graph>(
+ make_shared<FixedColorer<typename Graph::VertexId>>("white"),
+ make_shared<FixedColorer<typename Graph::EdgeId>>("black")));
+}
+
+}
+}
diff --git a/src/modules/visualization/graph_labeler.hpp b/src/modules/visualization/graph_labeler.hpp
new file mode 100644
index 0000000..733ca0f
--- /dev/null
+++ b/src/modules/visualization/graph_labeler.hpp
@@ -0,0 +1,304 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#ifndef GRAPH_LABELER_HPP_
+#define GRAPH_LABELER_HPP_
+
+#include "dev_support/simple_tools.hpp"
+#include "dev_support/standard_base.hpp"
+#include "assembly_graph/handlers/edges_position_handler.hpp"
+
+namespace omnigraph {
+
+/**
+ * (Interface)
+ * Provides string labels for vertices and edges of some graph.
+ * Used with GraphPrinter to visualize graphs.
+ */
+template<class Graph>
+class GraphLabeler {
+public:
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+
+ virtual ~GraphLabeler() {
+ }
+
+ virtual string label(VertexId v) const = 0;
+
+ virtual string label(EdgeId e) const = 0;
+
+};
+
+//template<class Graph>
+//class MapGraphLabeler {
+// typedef typename Graph::EdgeId EdgeId;
+// typedef typename Graph::VertexId VertexId;
+// map<EdgeId, string> edge_map_;
+// map<VertexId, string> vertex_map_;
+//
+//public:
+//
+// string label(VertexId v) const {
+// auto it = vertex_map_.find(v);
+// if (it == vertex_map_.end())
+// return "";
+// else
+// return it->second;
+// }
+//
+// string label(EdgeId e) const {
+// auto it = edge_map_.find(e);
+// if (it == edge_map_.end())
+// return "";
+// else
+// return it->second;
+// }
+//
+//};
+
+template<class Graph>
+class AbstractGraphLabeler: public GraphLabeler<Graph> {
+ typedef typename Graph::VertexId VertexId;
+ typedef typename Graph::EdgeId EdgeId;
+ const Graph& g_;
+protected:
+ AbstractGraphLabeler(const Graph& g): g_(g) {
+
+ }
+
+ const Graph& graph() const {
+ return g_;
+ }
+
+public:
+ /*virtual*/ std::string label(VertexId /*v*/) const {
+ return "";
+ }
+
+ /*virtual*/ std::string label(EdgeId /*e*/) const {
+ return "";
+ }
+
+};
+
+/**
+ * Trivial implementation of GraphLabeler.
+ * All labels are "".
+ */
+template<class Graph>
+class EmptyGraphLabeler : public GraphLabeler<Graph> {
+ typedef GraphLabeler<Graph> base;
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+public:
+ EmptyGraphLabeler() {}
+
+ std::string label(VertexId /*v*/) const {
+ return "";
+ }
+
+ std::string label(EdgeId /*e*/) const {
+ return "";
+ }
+};
+
+/**
+ * Implementation of GraphLabeler for Graphs that have methods
+ * str(VertexId) and str(EdgeId), such as AbstractGraph.
+ */
+template<class Graph>
+class StrGraphLabeler : public AbstractGraphLabeler<Graph> {
+ typedef AbstractGraphLabeler<Graph> base;
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+public:
+ StrGraphLabeler(const Graph& g) : base(g) {}
+
+ /*virtual*/ std::string label(VertexId v) const {
+ return this->graph().str(v);
+ }
+
+ /*virtual*/ std::string label(EdgeId e) const {
+ return this->graph().str(e);
+ }
+
+ /*virtual*/ ~StrGraphLabeler() {
+
+ }
+};
+
+template <class Graph>
+shared_ptr<GraphLabeler<Graph>> StrGraphLabelerInstance(const Graph& g) {
+ return make_shared<StrGraphLabeler<Graph>>(g);
+}
+
+template<class Graph>
+class LengthIdGraphLabeler : public StrGraphLabeler<Graph> {
+ typedef StrGraphLabeler<Graph> base;
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+public:
+ LengthIdGraphLabeler(const Graph& g) : base(g) {}
+
+ /*virtual*/ std::string label(EdgeId e) const {
+ std::stringstream ss;
+ ss << this->graph().length(e) << " (id: " << this->graph().int_id(e) << ")";
+ return ss.str();
+ }
+
+};
+
+template<class Graph>
+class LengthGraphLabeler : public StrGraphLabeler<Graph> {
+ typedef StrGraphLabeler<Graph> base;
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+public:
+ LengthGraphLabeler(const Graph& g) : base(g) {}
+
+ /*virtual*/ std::string label(EdgeId e) const {
+ return ToString(this->graph().length(e));
+ }
+
+};
+
+template<class Graph>
+class CoverageGraphLabeler : public AbstractGraphLabeler<Graph> {
+ typedef AbstractGraphLabeler<Graph> base;
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+public:
+ CoverageGraphLabeler(const Graph& g) : base(g) {}
+
+ std::string label(EdgeId e) const {
+ double coverage = this->graph().coverage(e);
+ return " {Cov:" + ToString(coverage) + "}";
+ }
+};
+
+template<class Graph>
+class CompositeLabeler : public GraphLabeler<Graph> {
+private:
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+ vector<GraphLabeler<Graph>*> list_;
+
+ template<typename ElementId>
+ string ConstructLabel(ElementId id) const {
+ vector<string> to_print;
+ for(size_t i = 0; i < list_.size(); i++) {
+ string next = list_[i]->label(id);
+ if(next.size() != 0) {
+ to_print.push_back(next);
+ }
+ }
+ string result = "";
+ for(size_t i = 0; i < to_print.size(); i++) {
+ result += to_print[i];
+ if(i + 1 < to_print.size())
+ result += "\\n";
+ }
+ return result;
+ }
+
+public:
+ CompositeLabeler() {
+ }
+
+ CompositeLabeler(GraphLabeler<Graph> &labeler1, GraphLabeler<Graph> &labeler2, GraphLabeler<Graph> &labeler3, GraphLabeler<Graph> &labeler4) {
+ AddLabeler(labeler1);
+ AddLabeler(labeler2);
+ AddLabeler(labeler3);
+ AddLabeler(labeler4);
+ }
+
+ CompositeLabeler(GraphLabeler<Graph> &labeler1, GraphLabeler<Graph> &labeler2, GraphLabeler<Graph> &labeler3) {
+ AddLabeler(labeler1);
+ AddLabeler(labeler2);
+ AddLabeler(labeler3);
+ }
+
+ CompositeLabeler(GraphLabeler<Graph> &labeler1, GraphLabeler<Graph> &labeler2) {
+ AddLabeler(labeler1);
+ AddLabeler(labeler2);
+ }
+
+ virtual ~CompositeLabeler() {
+ }
+
+ void AddLabeler(GraphLabeler<Graph> &labeler) {
+ list_.push_back(&labeler);
+ }
+
+ virtual string label(VertexId vertexId) const {
+ return ConstructLabel<VertexId>(vertexId);
+ }
+
+ virtual string label(EdgeId edgeId) const {
+ return ConstructLabel<EdgeId>(edgeId);
+ }
+};
+
+template<class Graph>
+class EdgePosGraphLabeler: public AbstractGraphLabeler<Graph> {
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+public:
+ const EdgesPositionHandler<Graph>& edge_pos_;
+
+ EdgePosGraphLabeler(const Graph& g, const EdgesPositionHandler<Graph>& edge_pos) :
+ AbstractGraphLabeler<Graph>(g), edge_pos_(edge_pos) {
+ }
+
+ virtual std::string label(EdgeId edgeId) const {
+ return "Positions: " + edge_pos_.str(edgeId);
+ }
+
+ virtual ~EdgePosGraphLabeler() {
+// TRACE("~EdgePosGraphLabeler");
+ }
+private:
+ DECL_LOGGER("EdgePosGraphLabeler")
+};
+
+template<class Graph>
+class DefaultLabeler: public GraphLabeler<Graph> {
+private:
+ const Graph& g_;
+ const EdgesPositionHandler<Graph> &edges_positions_;
+protected:
+ typedef GraphLabeler<Graph> super;
+ typedef typename super::EdgeId EdgeId;
+ typedef typename super::VertexId VertexId;
+public:
+
+ DefaultLabeler(const Graph &g, const EdgesPositionHandler<Graph> &position_handler) :
+ g_(g), edges_positions_(position_handler) {
+ }
+
+ virtual std::string label(VertexId vertexId) const {
+ return ToString(vertexId.int_id());
+ }
+
+ virtual std::string label(EdgeId edgeId) const {
+ std::string ret_label;
+ ret_label += "Id " + g_.str(edgeId) + "\\n";
+ ret_label += "Positions:\\n"+ edges_positions_.str(edgeId);
+ size_t len = g_.length(edgeId);
+ double cov = g_.coverage(edgeId);
+ ret_label += "Len(cov): " + ToString(len) + "(" + ToString(cov) + ")";
+ return ret_label;
+ }
+
+ virtual ~DefaultLabeler() {
+ }
+};
+
+}
+
+#endif /* GRAPH_LABELER_HPP_ */
diff --git a/src/modules/visualization/graph_printer.hpp b/src/modules/visualization/graph_printer.hpp
new file mode 100644
index 0000000..9a9927f
--- /dev/null
+++ b/src/modules/visualization/graph_printer.hpp
@@ -0,0 +1,176 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "dev_support/standard_base.hpp"
+#include "io/graph_io/graph_print_utils.hpp"
+#include "graph_labeler.hpp"
+#include "graph_colorer.hpp"
+#include "vertex_linker.hpp"
+
+namespace omnigraph {
+namespace visualization {
+
+template<class Graph>
+class GraphPrinter {
+private:
+ typedef typename Graph::VertexId VertexId;
+ typedef typename Graph::EdgeId EdgeId;
+// ostream& os_;
+ const Graph &graph_;
+protected:
+ const GraphLabeler<Graph> &labeler_;
+ const GraphColorer<Graph> &colorer_;
+ const VertexLinker<Graph> &linker_;
+
+protected:
+// ostream& os() {
+// return os_;
+// }
+
+
+ const Graph &graph() {
+ return graph_;
+ }
+
+ template<class GvisVertexId>
+ gvis::BaseVertex<GvisVertexId> CreateBaseVertex(GvisVertexId id, VertexId v) {
+ return gvis::BaseVertex<GvisVertexId>(id, labeler_.label(v), linker_.GetValue(v), colorer_.GetValue(v));
+ }
+
+ template<class GvisVertexId>
+ gvis::BaseEdge<GvisVertexId> CreateBaseEdge(GvisVertexId from, GvisVertexId to, EdgeId e){
+ return gvis::BaseEdge<GvisVertexId>(from, to, this->labeler_.label(e), this->colorer_.GetValue(e));
+ }
+
+ virtual void ManageDrawn(VertexId v, set<VertexId> &visited) {
+ visited.insert(v);
+ }
+
+public:
+ GraphPrinter(const Graph &graph, /*ostream &os,*/
+ const GraphLabeler<Graph> &labeler,
+ const GraphColorer<Graph> &colorer,
+ const VertexLinker<Graph> &linker) :
+ /*os_(os), */graph_(graph), labeler_(labeler), colorer_(colorer), linker_(
+ linker) {
+ }
+
+ virtual void open() = 0;
+
+ virtual void close() = 0;
+
+ virtual void AddVertex(VertexId v1) = 0;
+
+ template<class iter>
+ void AddVertices(iter vbegin, iter vend) {
+ set<VertexId> drawn;
+ for(;vbegin != vend; ++vbegin) {
+ if(drawn.count(*vbegin) == 0) {
+ AddVertex(*vbegin);
+ ManageDrawn(*vbegin, drawn);
+ }
+ }
+ }
+
+ virtual void AddEdge(EdgeId e) = 0;
+
+ template<class iter>
+ void AddEdges(iter ebegin, iter eend) {
+ for(;ebegin != eend; ++ebegin) {
+ AddEdge(*ebegin);
+ }
+ }
+
+ virtual ~GraphPrinter() {
+ }
+};
+
+template<typename Graph>
+class SingleGraphPrinter : public GraphPrinter<Graph> {
+private:
+ typedef typename Graph::VertexId VertexId;
+ typedef typename Graph::EdgeId EdgeId;
+
+ gvis::DotSingleGraphRecorder<size_t> recorder_;
+
+public:
+ SingleGraphPrinter(const Graph &graph, ostream &os,
+ const GraphLabeler<Graph> &labeler,
+ const GraphColorer<Graph> &colorer,
+ const VertexLinker<Graph> &linker) : GraphPrinter<Graph>(/*os_, */graph, labeler, colorer, linker), recorder_(os){
+ }
+
+ void open() {
+ recorder_.startGraphRecord("graph_picture");
+ }
+
+ void close() {
+ recorder_.endGraphRecord();
+ }
+
+ void AddVertex(VertexId v) {
+ recorder_.recordVertex(this->CreateBaseVertex((size_t)this->graph().int_id(v), v));
+ }
+
+ void AddEdge(EdgeId edge) {
+ recorder_.recordEdge(this->CreateBaseEdge((size_t)this->graph().int_id(this->graph().EdgeStart(edge)), (size_t)this->graph().int_id(this->graph().EdgeEnd(edge)), edge));
+ }
+};
+
+template<typename Graph>
+class PairedGraphPrinter : public GraphPrinter<Graph> {
+private:
+ typedef typename Graph::VertexId VertexId;
+ typedef typename Graph::EdgeId EdgeId;
+
+ gvis::DotPairedGraphRecorder<size_t> recorder_;
+
+ pair<gvis::BaseVertex<size_t>, gvis::BaseVertex<size_t>> CreateDoubleVertex(VertexId v) {
+ gvis::BaseVertex<size_t> u1 = this->CreateBaseVertex((size_t)this->graph().int_id(v), v);
+ gvis::BaseVertex<size_t> u2 = this->CreateBaseVertex((size_t)this->graph().int_id(this->graph().conjugate(v)), this->graph().conjugate(v));
+ return make_pair(u1, u2);
+ }
+
+ pair<size_t, size_t> CreateDoubleVertexId(VertexId v) {
+ return make_pair(this->graph().int_id(v), this->graph().int_id(this->graph().conjugate(v)));
+ }
+protected:
+ /*virtual */void ManageDrawn(VertexId v, set<VertexId> &visited) {
+ visited.insert(v);
+ visited.insert(this->graph().conjugate(v));
+ }
+
+public:
+ PairedGraphPrinter(const Graph &graph, ostream &os,
+ const GraphLabeler<Graph> &labeler,
+ const GraphColorer<Graph> &colorer,
+ const VertexLinker<Graph> &linker) : GraphPrinter<Graph>(/*os_, */graph, labeler, colorer, linker), recorder_(os) {
+ }
+
+ void open() {
+ recorder_.startGraphRecord("graph_picture");
+ }
+
+ void close() {
+ recorder_.endGraphRecord();
+ }
+
+ void AddVertex(VertexId v) {
+ recorder_.recordVertex(CreateDoubleVertex(v));
+ }
+
+ void AddEdge(EdgeId edge) {
+ auto vid1 = CreateDoubleVertexId(this->graph().EdgeStart(edge));
+ auto vid2 = CreateDoubleVertexId(this->graph().EdgeEnd(edge));
+ recorder_.recordEdge(gvis::BaseEdge<pair<size_t, size_t>>(vid1, vid2, this->labeler_.label(edge), this->colorer_.GetValue(edge)));
+ }
+};
+
+}
+}
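
An illustrative call sequence for the printer interface above, mirroring how ComponentVisualizer drives it later in this patch (`g`, `os`, `labeler`, `colorer`, `linker` and `component` are placeholders for objects of the corresponding types):

    PairedGraphPrinter<Graph> printer(g, os, labeler, colorer, linker);
    printer.open();
    printer.AddVertices(component.vertices().begin(), component.vertices().end());
    for (auto it = component.e_begin(); it != component.e_end(); ++it)
        printer.AddEdge(*it);
    printer.close();
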
diff --git a/src/modules/visualization/position_filler.hpp b/src/modules/visualization/position_filler.hpp
new file mode 100644
index 0000000..db088bb
--- /dev/null
+++ b/src/modules/visualization/position_filler.hpp
@@ -0,0 +1,91 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "assembly_graph/graph_alignment/sequence_mapper.hpp"
+#include "assembly_graph/handlers/edges_position_handler.hpp"
+#include "io/reads_io/wrapper_collection.hpp"
+
+namespace debruijn_graph {
+
+template<class Graph>
+class PosFiller {
+ typedef typename Graph::EdgeId EdgeId;
+ typedef std::shared_ptr<SequenceMapper<Graph>> MapperPtr;
+ const Graph &g_;
+ MapperPtr mapper_;
+ EdgesPositionHandler<Graph> &edge_pos_;
+
+public:
+ PosFiller(const Graph &g, MapperPtr mapper,
+ EdgesPositionHandler<Graph> &edge_pos) :
+ g_(g), mapper_(mapper), edge_pos_(edge_pos) {
+
+ }
+
+ void Process(const Sequence &s, string name) const {
+ //todo stupid conversion!
+ return Process(io::SingleRead(name, s.str()));
+ }
+
+ void Process(const io::SingleRead &read) const {
+ MappingPath<EdgeId> path = mapper_->MapRead(read);
+ const string name = read.name();
+ int cur_pos = 0;
+ TRACE("Contig " << name << " mapped on " << path.size()
+ << " fragments.");
+ for (size_t i = 0; i < path.size(); i++) {
+ EdgeId ei = path[i].first;
+ MappingRange mr = path[i].second;
+ int len = (int) (mr.mapped_range.end_pos - mr.mapped_range.start_pos);
+ if (i > 0 && path[i - 1].first != ei && g_.EdgeStart(ei) != g_.EdgeEnd(path[i - 1].first)) {
+ TRACE(
+ "Contig " << name
+ << " mapped onto non-adjacent edges. Position in contig is "
+ << path[i - 1].second.initial_range.start_pos
+ + 1
+ << "--"
+ << path[i - 1].second.initial_range.end_pos
+ << " and "
+ << mr.initial_range.start_pos + 1
+ << "--" << mr.initial_range.end_pos);
+ }
+ edge_pos_.AddEdgePosition(ei, name, mr.initial_range.start_pos,
+ mr.initial_range.end_pos,
+ mr.mapped_range.start_pos,
+ mr.mapped_range.end_pos);
+ cur_pos += len;
+ }
+ }
+
+ void Process(io::SingleStream &stream) const {
+ io::SingleRead read;
+ while (!stream.eof()) {
+ stream >> read;
+ Process(read);
+ }
+ }
+
+private:
+ DECL_LOGGER("PosFiller");
+};
+
+template<class gp_t>
+void FillPos(gp_t &gp, const string &contig_file, string prefix, bool with_rc = false) {
+ PosFiller<typename gp_t::graph_t> pos_filler(gp.g, MapperInstance(gp), gp.edge_pos);
+ auto irs = std::make_shared<io::PrefixAddingReaderWrapper>(io::EasyStream(contig_file, with_rc), prefix);
+ pos_filler.Process(*irs);
+}
+
+template<class gp_t>
+void FillPos(gp_t &gp, const Sequence &s, string name) {
+ PosFiller<typename gp_t::graph_t> pos_filler(gp.g, MapperInstance(gp), gp.edge_pos);
+ pos_filler.Process(s, name);
+}
+
+}
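
An illustrative use of the FillPos helpers above, assuming `gp` is a graph pack that exposes `g` and `edge_pos` and for which MapperInstance(gp) works as used above; the file name, prefix and `genome_sequence` are placeholders:

    // Annotate graph edges with positions of contigs from a FASTA file,
    // also mapping reverse complements.
    debruijn_graph::FillPos(gp, "contigs.fasta", "ctg_", /*with_rc*/ true);
    // Map a single Sequence under a chosen label.
    debruijn_graph::FillPos(gp, genome_sequence, "ref_0");
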
diff --git a/src/modules/visualization/printing_parameter_storage.hpp b/src/modules/visualization/printing_parameter_storage.hpp
new file mode 100644
index 0000000..f052733
--- /dev/null
+++ b/src/modules/visualization/printing_parameter_storage.hpp
@@ -0,0 +1,81 @@
+#pragma once
+
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+
+#include "dev_support/standard_base.hpp"
+#include "assembly_graph/components/graph_component.hpp"
+namespace omnigraph {
+namespace visualization {
+
+template<typename ElementId, typename Value>
+class ParameterStorage {
+public:
+ virtual Value GetValue(ElementId element) const = 0;
+
+ virtual ~ParameterStorage() {
+ }
+};
+
+template<typename ElementId, typename Value>
+class MapParameterStorage : public virtual ParameterStorage<ElementId, Value> {
+private:
+ template<class It>
+ static map<ElementId, string> ConstructMap(It begin, It end, const string& color) {
+ map<ElementId, string> result;
+ for (auto it = begin; it != end; ++it) {
+ result.insert(make_pair(*it, color));
+ }
+ return result;
+ }
+
+protected:
+ map<ElementId, Value> storage_;
+private:
+ boost::optional<Value> default_value_;
+public:
+ MapParameterStorage(const string &default_value) : default_value_(default_value) {
+ }
+
+ MapParameterStorage(map<ElementId, Value> storage, Value default_value) : storage_(storage), default_value_(default_value) {
+ }
+
+ MapParameterStorage(map<ElementId, Value> storage) : storage_(storage) {
+ }
+
+ template<class It>
+ MapParameterStorage(It begin, It end, const Value& value, const string& default_value) : storage_(ConstructMap(begin, end, value)), default_value_(default_value) {
+ }
+
+
+ Value GetValue(ElementId element) const {
+ auto it = storage_.find(element);
+ if (it == storage_.end()) {
+ VERIFY(default_value_);
+ return default_value_.get();
+ }
+ return it->second;
+ }
+};
+
+template<typename ElementId, typename Value>
+class DecoratorParameterStorage : public virtual ParameterStorage<ElementId, Value> {
+private:
+ ParameterStorage<ElementId, Value> inner_storage_;
+public:
+ DecoratorParameterStorage(ParameterStorage<ElementId, Value> inner_storage) : inner_storage_(inner_storage) {
+ }
+
+ Value GetInnerValue(ElementId element) {
+ return inner_storage_.GetValue(element);
+ }
+};
+
+}
+}
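
An illustrative sketch of MapParameterStorage's lookup-with-default behaviour (`e1` and `e2` are placeholder EdgeId values):

    std::map<EdgeId, std::string> colors;
    colors[e1] = "red";
    MapParameterStorage<EdgeId, std::string> storage(colors, "black");
    storage.GetValue(e1);   // "red"
    storage.GetValue(e2);   // not in the map, falls back to the default "black"
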
diff --git a/src/modules/visualization/vertex_linker.hpp b/src/modules/visualization/vertex_linker.hpp
new file mode 100644
index 0000000..f960b20
--- /dev/null
+++ b/src/modules/visualization/vertex_linker.hpp
@@ -0,0 +1,41 @@
+#pragma once
+
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#include "dev_support/standard_base.hpp"
+#include "printing_parameter_storage.hpp"
+
+namespace omnigraph {
+namespace visualization {
+
+template<class Graph>
+class VertexLinker : public virtual ParameterStorage<typename Graph::VertexId, string> {
+};
+
+template<class Graph>
+class MapVertexLinker : public VertexLinker<Graph>, public MapParameterStorage<typename Graph::VertexId, string> {
+public:
+ MapVertexLinker() : MapParameterStorage<typename Graph::VertexId, string>("") {
+ }
+
+ MapVertexLinker(const map<typename Graph::VertexId, string> &link_map) : MapParameterStorage<typename Graph::VertexId, string>(link_map, "") {
+ }
+
+ virtual ~MapVertexLinker() {
+ }
+};
+
+template<class Graph>
+class EmptyGraphLinker : public MapVertexLinker<Graph> {
+public:
+ EmptyGraphLinker() {
+ }
+};
+
+}
+}
diff --git a/src/include/omni/visualization/visualization.hpp b/src/modules/visualization/visualization.hpp
similarity index 100%
rename from src/include/omni/visualization/visualization.hpp
rename to src/modules/visualization/visualization.hpp
diff --git a/src/modules/visualization/visualization_utils.hpp b/src/modules/visualization/visualization_utils.hpp
new file mode 100644
index 0000000..1c03492
--- /dev/null
+++ b/src/modules/visualization/visualization_utils.hpp
@@ -0,0 +1,210 @@
+#pragma once
+
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#include "graph_printer.hpp"
+#include "algorithms/dijkstra/dijkstra_helper.hpp"
+#include "assembly_graph/components/splitters.hpp"
+#include "assembly_graph/components/graph_component.hpp"
+#include "visualizers.hpp"
+#include "vertex_linker.hpp"
+
+#include <fstream>
+
+namespace omnigraph {
+namespace visualization {
+
+
+template<class Graph>
+void WriteComponents(const Graph& g,
+ const string& folder_name,
+ shared_ptr<GraphSplitter<Graph>> inner_splitter,
+ shared_ptr<GraphColorer<Graph>> colorer,
+ const GraphLabeler<Graph> &labeler) {
+ EmptyGraphLinker<Graph> linker;
+// shared_ptr<GraphComponentFilter<Graph>> checker = make_shared<ComponentSizeFilter<Graph>>(g, 1500, 2, 300);
+ auto filter = make_shared<omnigraph::SmallComponentFilter<Graph>>(g, 3);
+ shared_ptr<GraphSplitter<Graph>> splitter = make_shared<omnigraph::CollectingSplitterWrapper<Graph>>(inner_splitter, filter);
+ omnigraph::visualization::SplittingGraphVisualizer<Graph>(g, labeler, *colorer, linker).SplitAndVisualize(*splitter, folder_name);
+}
+
+template<class Graph>
+void DrawComponentsOfShortEdges(const Graph& g, size_t min_length, size_t sinks, size_t sources)
+{
+ vector<typename Graph::EdgeId> short_edges;
+ std::string pics_folder_ = cfg::get().output_dir + ToString(min_length) + "_" + ToString(sinks) + "_" + ToString(sources) + "_"+ "pics_polymorphic/";
+ make_dir(pics_folder_);
+ INFO("Writing pics with components consisting of short edges to " + pics_folder_);
+ shared_ptr<GraphSplitter<Graph>> splitter = LongEdgesExclusiveSplitter<Graph>(g, min_length);
+ while (splitter->HasNext()) {
+ GraphComponent<Graph> component = splitter->Next();
+ if(component.v_size() > 3 && component.sinks().size() == sinks && component.sources().size() == sources)
+ {
+ bool fail = false;
+ for(auto v : component.sources()) {
+ if(component.g().IncomingEdgeCount(v) != 1) {
+ fail = true;
+ }
+ }
+ for(auto v : component.sinks()) {
+ if(component.g().OutgoingEdgeCount(v) != 1) {
+ fail = true;
+ }
+ }
+
+ if(fail)
+ {
+ continue;
+ }
+
+ StrGraphLabeler<Graph> labeler(component.g());
+ CoverageGraphLabeler<Graph> labeler2(component.g());
+ CompositeLabeler<Graph> compositeLabeler(labeler, labeler2);
+ WriteComponentSinksSources(component, pics_folder_ + ToString(g.int_id(*component.vertices().begin()))
+ + ".dot", visualization::DefaultColorer(g),
+ compositeLabeler);
+ INFO("Component is written to " + ToString(g.int_id(*component.vertices().begin())) + ".dot");
+
+ // PrintComponent(component,
+// pics_folder_ + "ShortComponents/"
+// + ToString(gp.g.int_id(component.vertices_[0]))
+// + ".dot");
+ }
+ }
+}
+
+
+template<class Graph>
+void WriteSizeLimitedComponents(const Graph& g,
+ const string& folder_name,
+ shared_ptr<GraphSplitter<Graph>> inner_splitter,
+ shared_ptr<GraphColorer<Graph>> colorer,
+ const GraphLabeler<Graph> &labeler, int min_component_size, int max_component_size, size_t max_components) {
+ EmptyGraphLinker<Graph> linker;
+
+ auto filter = make_shared<omnigraph::ComponentSizeFilter<Graph>>(g, 1000000000, (size_t) min_component_size, (size_t) max_component_size);
+ shared_ptr<GraphSplitter<Graph>> splitter = make_shared<omnigraph::CollectingSplitterWrapper<Graph>>(inner_splitter, filter);
+ omnigraph::visualization::SplittingGraphVisualizer<Graph>(g, labeler, *colorer, linker, false, max_components).SplitAndVisualize(*splitter, folder_name);
+}
+
+template<class Graph>
+void WriteComponent(const GraphComponent<Graph>& gc,
+ const string& file_name, shared_ptr<GraphColorer<Graph>> colorer,
+ const GraphLabeler<Graph> &labeler) {
+ EmptyGraphLinker<Graph> linker;
+ BorderDecorator<Graph> component_colorer(gc, *colorer, "yellow");
+ std::ofstream os;
+ os.open(file_name);
+ omnigraph::visualization::ComponentVisualizer<Graph>(gc.g(), true).Visualize(gc, os, labeler, component_colorer, linker);
+ os.close();
+}
+
+template<class Graph>
+void WriteComponentSinksSources(const GraphComponent<Graph>& gc,
+ const string& file_name, shared_ptr<GraphColorer<Graph>> colorer,
+ const GraphLabeler<Graph> &labeler) {
+ EmptyGraphLinker<Graph> linker;
+ SinkSourceDecorator<Graph> component_colorer(gc, *colorer);
+ std::ofstream os;
+ os.open(file_name);
+ omnigraph::visualization::ComponentVisualizer<Graph>(gc.g(), true).Visualize(gc, os, labeler, component_colorer, linker);
+ os.close();
+}
+
+template<class Graph>
+void WriteComponentSinksSources(const GraphComponent<Graph>& gc,
+ const string& file_name) {
+
+ StrGraphLabeler<Graph> labeler(gc.g());
+ CoverageGraphLabeler<Graph> labeler2(gc.g());
+ CompositeLabeler<Graph> compositeLabeler(labeler, labeler2);
+ EmptyGraphLinker<Graph> linker;
+ WriteComponentSinksSources(gc, file_name, DefaultColorer(gc.g()),
+ compositeLabeler);
+}
+
+template<class Graph>
+void WriteSimpleComponent(const GraphComponent<Graph>& gc,
+ const string& file_name, shared_ptr<GraphColorer<Graph>> colorer,
+ const GraphLabeler<Graph> &labeler) {
+ EmptyGraphLinker<Graph> linker;
+ std::ofstream os;
+ os.open(file_name);
+ omnigraph::visualization::ComponentVisualizer<Graph>(gc.g(), false).Visualize(gc, os, labeler, *colorer, linker);
+ os.close();
+}
+
+template<class Graph>
+void WriteComponentsAlongPath(const Graph& g, vector<typename Graph::EdgeId> path,
+ const string& prefix_path, shared_ptr<GraphColorer<Graph>> colorer,
+ const GraphLabeler<Graph> &labeler, bool color_path = true) {
+ auto edge_colorer = make_shared<CompositeEdgeColorer<Graph>>("black");
+ edge_colorer->AddColorer(colorer);
+ if (color_path) {
+ edge_colorer->AddColorer(make_shared<SetColorer<Graph>>(g, path, "green"));
+ }
+ shared_ptr<GraphColorer<Graph>> resulting_colorer = make_shared<CompositeGraphColorer<Graph>>(colorer, edge_colorer);
+ shared_ptr<GraphSplitter<Graph>> rs = ReliableSplitterAlongPath<Graph>(g, path);
+ auto filter = make_shared<omnigraph::SmallComponentFilter<Graph>>(g, 3);
+ shared_ptr<GraphSplitter<Graph>> splitter = make_shared<omnigraph::CondensingSplitterWrapper<Graph>>(rs, filter);
+ WriteComponents<Graph>(g, prefix_path, splitter, resulting_colorer, labeler);
+}
+
+template<class Graph>
+class LocalityPrintingRH {
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+ const Graph& g_;
+ const GraphLabeler<Graph>& labeler_;
+ std::shared_ptr<visualization::GraphColorer<Graph>> colorer_;
+ const string output_folder_;
+public:
+ LocalityPrintingRH(const Graph& g
+ , const GraphLabeler<Graph>& labeler
+ , std::shared_ptr<visualization::GraphColorer<Graph>> colorer
+ , const string& output_folder) :
+ g_(g),
+ labeler_(labeler),
+ colorer_(colorer),
+ output_folder_(output_folder) {
+ path::make_dirs(output_folder_);
+ }
+
+ void HandleDelete(EdgeId e, const string& add_label = "") {
+ //todo magic constant
+// map<EdgeId, string> empty_coloring;
+ auto edge_colorer = make_shared<visualization::CompositeEdgeColorer<Graph>>("black");
+ edge_colorer->AddColorer(colorer_);
+ edge_colorer->AddColorer(make_shared<visualization::SetColorer<Graph>>(g_, vector<EdgeId>(1, e), "green"));
+ shared_ptr<visualization::GraphColorer<Graph>> resulting_colorer = make_shared<visualization::CompositeGraphColorer<Graph>>(colorer_, edge_colorer);
+
+ string fn = output_folder_ + "edge_" + ToString(g_.int_id(e)) + add_label + ".dot";
+ omnigraph::visualization::WriteComponent(omnigraph::EdgeNeighborhood<Graph>(g_, e, 50, 250)
+ , fn
+ , resulting_colorer, labeler_);
+ }
+
+private:
+ DECL_LOGGER("LocalityPrintingRH");
+};
+
+//static void WriteFilteredComponents(const Graph& g,
+// const string& folder_name,
+// shared_ptr<GraphComponentFilter<Graph>> filter,
+// shared_ptr<GraphSplitter<Graph>> splitter,
+// shared_ptr<GraphColorer<Graph>> colorer,
+// const GraphLabeler<Graph> &labeler) {
+// EmptyGraphLinker<Graph> linker;
+//// shared_ptr<GraphComponentFilter<Graph>> checker = make_shared<ComponentSizeFilter<Graph>>(g, 1500, 2, 300);
+// omnigraph::FilteringSplitterWrapper<Graph> filtered_splitter(splitter, filter);
+// omnigraph::visualization::SplittingGraphVisualizer<Graph>(g, labeler, *colorer, linker).SplitAndVisualize(filtered_splitter, folder_name);
+//}
+
+}
+}
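
An illustrative sketch of dumping components to DOT files with the helpers above; it follows the same pattern as DrawComponentsOfShortEdges (`g`, `out_dir` and the length threshold are placeholders):

    auto splitter = LongEdgesExclusiveSplitter<Graph>(g, 1000);
    while (splitter->HasNext()) {
        GraphComponent<Graph> gc = splitter->Next();
        WriteComponentSinksSources(gc,
                out_dir + ToString(g.int_id(*gc.vertices().begin())) + ".dot");
    }
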
diff --git a/src/modules/visualization/visualizers.hpp b/src/modules/visualization/visualizers.hpp
new file mode 100644
index 0000000..6b35a94
--- /dev/null
+++ b/src/modules/visualization/visualizers.hpp
@@ -0,0 +1,173 @@
+#pragma once
+
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+
+#include "dev_support/standard_base.hpp"
+#include "graph_printer.hpp"
+#include <fstream>
+
+namespace omnigraph {
+namespace visualization {
+
+//DECL_LOGGER("omg.gvis")
+
+template<class Graph>
+class ComponentVisualizer {
+ const Graph& graph_;
+ const bool paired_;
+
+private:
+ void Visualize(const GraphComponent<Graph>& component, GraphPrinter<Graph> &printer) {
+ printer.open();
+ printer.AddVertices(component.vertices().begin(), component.vertices().end());
+ for (auto e_it = component.e_begin(); e_it != component.e_end();
+ ++e_it) {
+ printer.AddEdge(*e_it);
+ }
+ printer.close();
+ }
+
+public:
+ ComponentVisualizer(const Graph& graph, bool paired = true) :
+ graph_(graph), paired_(paired) {
+ }
+
+ void Visualize(const GraphComponent<Graph>& component, ostream &os,
+ const GraphLabeler<Graph> &labeler,
+ const GraphColorer<Graph> &colorer,
+ const VertexLinker<Graph> &linker) {
+ if(paired_) {
+ PairedGraphPrinter<Graph> printer(graph_, os, labeler, colorer, linker);
+ Visualize(component, printer);
+ } else {
+ SingleGraphPrinter<Graph> printer(graph_, os, labeler, colorer, linker);
+ Visualize(component, printer);
+ }
+ }
+
+ void Visualize(ostream &os,
+ const GraphLabeler<Graph> &labeler,
+ const GraphColorer<Graph> &colorer,
+ const VertexLinker<Graph> &linker) {
+ GraphComponent<Graph> component(graph_, graph_.begin(), graph_.end(), false);
+ Visualize(component, os, labeler, colorer, linker);
+ }
+};
+
+
+template<class Graph>
+class ComponentNameGenerator {
+public:
+ virtual string ComponentName(const GraphComponent<Graph>& component) = 0;
+
+ virtual ~ComponentNameGenerator() {
+ }
+};
+
+template<class Graph>
+class SimpleCountingComponentNameGenerator: public ComponentNameGenerator<Graph> {
+private:
+ string name_;
+ string extension_;
+ size_t cnt_;
+public:
+ SimpleCountingComponentNameGenerator(string name, string extension): name_(name), extension_(extension), cnt_(0) {
+ }
+
+ string ComponentName(const GraphComponent<Graph>& component) {
+ cnt_++;
+ stringstream ss;
+ ss << name_ << "_" << cnt_;
+ if(component.name().size() > 0)
+ ss << "_" << component.name();
+ ss << "." << extension_;
+ return ss.str();
+ }
+};
+
+template<class Graph>
+class CountingSizeComponentNameGenerator: public ComponentNameGenerator<Graph> {
+private:
+ string name_;
+ string extension_;
+ size_t cnt_;
+public:
+ CountingSizeComponentNameGenerator(string name, string extension): name_(name), extension_(extension), cnt_(0) {
+ }
+
+ string ComponentName(const GraphComponent<Graph>& component) {
+ cnt_++;
+ stringstream ss;
+ ss << name_ << "_" << cnt_;
+ if(component.name().size() > 0)
+ ss << "_" << component.name();
+ ss << "_size_" << component.size();
+ ss << "." << extension_;
+
+ return ss.str();
+ }
+};
+
+
+template<class Graph>
+class SplittingGraphVisualizer {
+private:
+ const Graph& graph_;
+ const GraphLabeler<Graph> &labeler_;
+ const GraphColorer<Graph> &colorer_;
+ const VertexLinker<Graph> &linker_;
+ const bool paired_;
+ const size_t max_component_number_;
+ static const size_t DEFAULT_MAX_COMPONENT_NUMBER = 500;
+
+ string ComponentFileName(size_t cnt, const string &folder, const GraphComponent<Graph>& component) {
+ stringstream ss;
+ ss << folder << cnt;
+ if(component.name().size() > 0)
+ ss << "graph_" << component.name();
+ ss << ".dot";
+ return ss.str();
+ }
+
+public:
+ SplittingGraphVisualizer(const Graph& graph,
+ const GraphLabeler<Graph> &labeler,
+ const GraphColorer<Graph> &colorer,
+ const VertexLinker<Graph> &linker,
+ bool paired = true,
+ size_t max_component_number = DEFAULT_MAX_COMPONENT_NUMBER) :
+ graph_(graph), labeler_(labeler), colorer_(colorer), linker_(linker), paired_(paired), max_component_number_(max_component_number) {
+ }
+
+ size_t SplitAndVisualize(GraphSplitter<Graph> &splitter, const string &folder) {
+ INFO("Writing components to folder " << folder);
+ ComponentVisualizer<Graph> visualizer(graph_, paired_);
+ size_t cnt = 0;
+ while(splitter.HasNext()) {
+ if(cnt > max_component_number_) {
+ INFO("The number of graph components exceeded " << max_component_number_ << ". Aborting current visualization.");
+ break;
+ }
+ cnt++;
+ GraphComponent<Graph> component = splitter.Next();
+ BorderDecorator<Graph> border_colorer(component, colorer_, "yellow");
+ ofstream os(ComponentFileName(cnt, folder, component));
+ visualizer.Visualize(component, os, labeler_, border_colorer, linker_);
+ os.close();
+ }
+ return cnt;
+ }
+
+private:
+ DECL_LOGGER("SplittingGraphVisualizer");
+};
+
+}
+}
+
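
An illustrative sketch of driving SplittingGraphVisualizer directly, in the spirit of the WriteComponents helper from visualization_utils.hpp (`g`, `labeler`, `colorer`, `splitter` and the output folder are placeholders):

    EmptyGraphLinker<Graph> linker;
    SplittingGraphVisualizer<Graph> visualizer(g, labeler, colorer, linker);
    size_t written = visualizer.SplitAndVisualize(*splitter, "pics/");
    // `written` components were emitted, subject to the default cap of
    // DEFAULT_MAX_COMPONENT_NUMBER components.
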
diff --git a/src/projects/CMakeLists.txt b/src/projects/CMakeLists.txt
new file mode 100644
index 0000000..7888a79
--- /dev/null
+++ b/src/projects/CMakeLists.txt
@@ -0,0 +1,13 @@
+############################################################################
+# Copyright (c) 2015 Saint Petersburg State University
+# Copyright (c) 2011-2014 Saint Petersburg Academic University
+# All Rights Reserved
+# See file LICENSE for details.
+############################################################################
+
+add_subdirectory(spades)
+add_subdirectory(hammer)
+add_subdirectory(ionhammer)
+add_subdirectory(dipspades)
+add_subdirectory(corrector)
+add_subdirectory(scaffold_correction)
\ No newline at end of file
diff --git a/src/projects/cap/CMakeLists.txt b/src/projects/cap/CMakeLists.txt
new file mode 100644
index 0000000..519dbee
--- /dev/null
+++ b/src/projects/cap/CMakeLists.txt
@@ -0,0 +1,47 @@
+############################################################################
+# Copyright (c) 2015 Saint Petersburg State University
+# Copyright (c) 2011-2014 Saint Petersburg Academic University
+# All Rights Reserved
+# See file LICENSE for details.
+############################################################################
+
+project(cap CXX)
+
+# Define minimum and maximum K
+set(SPADES_MIN_K 1 CACHE INTEGER "Minimum k-mer length")
+set(SPADES_MAX_K 300 CACHE INTEGER "Maximum k-mer length")
+configure_file("${SPADES_MAIN_INCLUDE_DIR}/k_range.hpp.in"
+ "${SPADES_BUILT_INCLUDE_DIR}/k_range.hpp")
+include_directories(${CMAKE_SOURCE_DIR}/modules
+ ${CMAKE_SOURCE_DIR}/online_vis)
+
+add_executable(cap-tools
+ ${EXT_DIR}/include/teamcity_boost/teamcity_boost.cpp
+ ${EXT_DIR}/include/teamcity_boost/teamcity_messages.cpp
+ tools.cpp)
+
+add_executable(cap main.cpp)
+
+#if (DEFINED static_build)
+# set_target_properties(cap PROPERTIES LINK_SEARCH_END_STATIC 1)
+#endif()
+
+if (READLINE_FOUND)
+ include_directories(${READLINE_INCLUDE_DIR})
+else(READLINE_FOUND)
+ #MESSAGE(WARNING "Library `readline' was not found (not installed?).")
+endif()
+if (CURSES_FOUND)
+ include_directories(${CURSES_INCLUDE_PATH})
+else(CURSES_FOUND)
+ #MESSAGE(WARNING "Library `ncurses' was not found (not installed?)")
+endif()
+include_directories(./drawing_commands)
+
+set(CAP_LIBS input cityhash nlopt ${COMMON_LIBRARIES})
+if (READLINE_FOUND AND CURSES_FOUND)
+ set(CAP_LIBS ${CAP_LIBS} ${READLINE_LIBRARY} ${CURSES_NCURSES_LIBRARY})
+endif()
+
+target_link_libraries(cap pipeline dev-support path_extend ${CAP_LIBS})
+target_link_libraries(cap-tools dev-support math ${CAP_LIBS})
diff --git a/src/projects/cap/assembly_compare.hpp b/src/projects/cap/assembly_compare.hpp
new file mode 100644
index 0000000..e879bc0
--- /dev/null
+++ b/src/projects/cap/assembly_compare.hpp
@@ -0,0 +1,520 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "pipeline/graph_pack.hpp"
+#include "pipeline/graphio.hpp"
+#include "dev_support/simple_tools.hpp"
+#include "algorithms/simplification/cleaner.hpp"
+#include "io/reads_io/splitting_wrapper.hpp"
+#include "io/reads_io/multifile_reader.hpp"
+#include <boost/algorithm/string/predicate.hpp>
+
+#include "coloring.hpp"
+#include "colored_graph_construction.hpp"
+#include "untangling.hpp"
+#include "assembly_problem_detection.hpp"
+#include "assembly_graph/stats/picture_dump.hpp"
+#include "simple_indel_finder.hpp"
+#include "test_utils.hpp"
+
+namespace cap {
+
+//class RCSplittingStream: public io::DelegatingReaderWrapper<io::SingleRead> {
+//private:
+// io::SplittingWrapper filtered_reader_;
+// io::RCReaderWrapper<io::SingleRead> stream_;
+//public:
+// RCSplittingStream(ContigStream &base_stream) :
+// filtered_reader_(base_stream), stream_(filtered_reader_) {
+// Init(stream_);
+// }
+//};
+
+//todo finish later
+//template<class gp_t1, class gp_t2>
+//void ConvertToBPGraphPack(const gp_t1& gp
+// , const ColorHandler<typename gp_t1::graph_t>& coloring
+// , gp_t2& bp_gp) {
+// string tmp_dir = "/home/snurk/tmp/";
+// string filename = tmp_dir + "tmp";
+// make_dir(tmp_dir);
+// PrintGraphPack(filename, gp);
+// typename ScannerTraits<typename gp_t2::graph_t>::Scanner scanner(bp_gp.g);
+// ScanBasicGraph(filename, scanner);
+// scanner.loadPositions(filename, bp_gp.edge_pos);
+// //
+//}
+
+template<class Graph>
+void DeleteEdgesByColor(Graph& g, const ColorHandler<Graph>& coloring,
+ TColorSet color) {
+ for (auto it = g.SmartEdgeBegin(); !it.IsEnd(); ++it) {
+ if (coloring.Color(*it) == color) {
+ g.DeleteEdge(*it);
+ }
+ }
+ omnigraph::Cleaner<Graph>(g).Run();
+}
+
+template<class Graph>
+class GapsRemover {
+ typedef typename Graph::VertexId VertexId;
+ Graph& g_;
+ const ColorHandler<Graph>& coloring_;
+ const TColorSet gap_color_;
+ const size_t length_bound_;
+public:
+ GapsRemover(Graph& g, const ColorHandler<Graph>& coloring,
+ TColorSet gap_color, size_t length_bound) :
+ g_(g), coloring_(coloring), gap_color_(gap_color), length_bound_(
+ length_bound) {
+
+ }
+
+ void RemoveGaps() {
+ for (auto it = g_.SmartEdgeBegin(); !it.IsEnd(); ++it) {
+ if (coloring_.Color(*it) == gap_color_
+ && g_.length(*it) <= length_bound_
+ && g_.CanCompressVertex(g_.EdgeStart(*it))
+ && g_.CanCompressVertex(g_.EdgeEnd(*it))) {
+ VertexId start = g_.EdgeStart(*it);
+ VertexId end = g_.EdgeEnd(*it);
+ if (!g_.RelatedVertices(start, end)) {
+ g_.CompressVertex(end);
+ }
+ g_.CompressVertex(start);
+ }
+ }
+ }
+};
+
+//class EasyContigStream: public io::DelegatingReaderWrapper<io::SingleRead> {
+//private:
+// io::Reader raw_stream_;
+// io::RCReaderWrapper<io::SingleRead> rc_stream_;
+// io::PrefixAddingReaderWrapper prefix_stream_;
+//public:
+// EasyContigStream(const string& filename, const string& prefix) :
+// raw_stream_(filename), rc_stream_(raw_stream_), prefix_stream_(
+// rc_stream_, prefix) {
+// Init(prefix_stream_);
+// }
+//};
+
+//template<class gp_t>
+//class AssemblyComparer {
+//private:
+// typedef typename gp_t::graph_t Graph;
+// typedef typename Graph::EdgeId EdgeId;
+// typedef typename Graph::VertexId VertexId;
+// typedef NewExtendedSequenceMapper<Graph, typename gp_t::seq_t> Mapper; // gp_t::k_value + 1
+//
+// gp_t gp_;
+// ColorHandler<Graph> coloring_;
+// io::RCReaderWrapper<io::SingleRead> rc_stream1_;
+// io::RCReaderWrapper<io::SingleRead> rc_stream2_;
+// string name1_;
+// io::PrefixAddingReaderWrapper stream1_;
+// string name2_;
+// io::PrefixAddingReaderWrapper stream2_;
+// bool untangle_;
+//
+//// void WriteMagicLocality() {
+//// LengthIdGraphLabeler<Graph> basic_labeler(gp_.g);
+//// EdgePosGraphLabeler<Graph> pos_labeler(gp_.g, gp_.edge_pos);
+////
+//// CompositeLabeler<Graph> labeler(basic_labeler, pos_labeler);
+//// make_dir("/home/snurk/gingi_dbg");
+//// WriteComponentsAlongPath(gp_.g, labeler, "/home/snurk/gingi_dbg/path.dot", /*split_length*/1000, /*vertex_number*/15
+//// , (*MapperInstance(gp_)).MapSequence((!gp_.genome).Subseq(2024608, 2067372)), *ConstructBorderColorer(gp_.g, coloring_));
+//// }
+//
+// template<class gp_t2>
+// void UniversalSaveGP(
+// const gp_t2& gp/*, const omnigraph::visualization::GraphColorer<typename gp_t2::graph_t> coloring*/,
+// const string& filename) {
+// typename PrinterTraits<Graph>::Printer printer(gp.g);
+// INFO("Saving graph to " << filename);
+// printer.saveGraph(filename);
+// printer.saveEdgeSequences(filename);
+// printer.savePositions(filename, gp.edge_pos);
+//// SaveColoring(gp.g
+//// , coloring
+//// , filename);
+//
+//// LengthIdGraphLabeler<Graph> labeler(gp.g);
+//// WriteSimple(gp.g, labeler, filename + ".dot");
+// }
+//
+// void SaveOldGraph(const string& path) {
+// INFO("Saving graph to " << path);
+// PrintGraphPack(path, gp_);
+//// LengthIdGraphLabeler<Graph> labeler(gp_.g);
+//// WriteToDotFile(gp_.g, labeler, path + ".dot");
+// }
+//
+// template<class gp_t2>
+// void ProduceResults(gp_t2& gp, const ColorHandler<Graph>& coloring,
+// const string& output_folder, bool detailed_output) {
+//// INFO("Removing unnecessary edges");
+//// DeleteVioletEdges(gp.g, coloring);
+//
+//// // if (detailed_output) {
+//// // PrintColoredGraph(gp.g, coloring, gp.edge_pos,
+//// // output_folder + "initial_pics/purple_removed.dot");
+//// // UniversalSaveGP(gp, output_folder + "saves/purple_removed");
+//// // }
+//
+//// // ReliableSplitter<Graph> splitter(gp.g, /*max_size*/100, /*edge_length_bound*/5000);
+//// // BreakPointsFilter<Graph> filter(gp.g, coloring, 3);
+//// INFO("Counting stats, outputting pictures");
+//// BPGraphStatCounter<Graph> counter(gp.g, coloring, output_folder);
+//// LengthIdGraphLabeler<Graph> labeler(gp.g);
+//// counter.CountStats(labeler, detailed_output);
+// }
+//
+// void PrepareDirs(const string& output_folder, bool detailed_output) {
+// DIR *dp;
+// if ((dp = opendir(output_folder.c_str())) == NULL) {
+// INFO("Dir " + output_folder + " did not exist, creating");
+// } else {
+// INFO("Dir " + output_folder + " purged");
+// remove_dir(output_folder);
+// }
+// utils::MakeDirPath(output_folder);
+// if (detailed_output) {
+// make_dir(output_folder + "initial_pics/");
+// make_dir(output_folder + "saves/");
+// make_dir(output_folder + "purple_edges_pics/");
+// }
+// }
+//
+//public:
+//
+// AssemblyComparer(size_t k_value, io::IReader<io::SingleRead> &stream1,
+// io::IReader<io::SingleRead> &stream2, const string& name1,
+// const string& name2, bool untangle = false,
+// const Sequence& reference = Sequence()) :
+// gp_(k_value, "tmp", reference, 200, true), coloring_(gp_.g, 2), rc_stream1_(
+// stream1), rc_stream2_( // TODO dir
+// stream2), name1_(name1), stream1_(rc_stream1_, name1), name2_(
+// name2), stream2_(rc_stream2_, name2), untangle_(untangle) {
+// }
+//
+// void CompareAssemblies(const string& output_folder, bool detailed_output =
+// true, bool one_many_resolve = false,
+// const string& add_saves_path = "") {
+//// VERIFY(gp_.genome.size() > 0);
+// //todo ???
+// stream1_.reset();
+// stream2_.reset();
+//
+// PrepareDirs(output_folder, detailed_output);
+// vector<ContigStream*> stream_vec = { &stream1_, &stream2_ };
+// ContigStreams streams(stream_vec, false);
+//
+// CoordinatesHandler<Graph> coordinates_handler;
+// ConstructColoredGraph<gp_t>(gp_, coloring_, coordinates_handler, streams);
+//
+// if (gp_.genome.size() > 0) {
+// INFO("Filling ref pos " << gp_.genome.size());
+//// FillPos(gp_, gp_.genome, "ref_0");
+//// FillPos(gp_, !gp_.genome, "ref_1");
+//
+//// SimpleInDelAnalyzer<Graph> del_analyzer(gp_.g, coloring_,
+//// gp_.edge_pos,
+//// (*MapperInstance < gp_t > (gp_)).MapSequence(gp_.genome).simple_path(),
+//// kRedColorSet, output_folder);
+//// del_analyzer.Analyze();
+//
+//// AlternatingPathsCounter<Graph> alt_count(gp_.g, coloring);
+//// alt_count.CountPaths();
+//
+//// ContigBlockStats<Graph, Mapper> block_stats(gp_.g, gp_.edge_pos,
+//// *MapperInstance(gp_), gp_.genome, stream1_);
+//// block_stats.Count();
+//
+//// MissingGenesAnalyser<Graph, Mapper> missed_genes(gp_.g, coloring_,
+//// gp_.edge_pos, gp_.genome, *MapperInstance(gp_),
+//// vector<pair<bool, pair<size_t, size_t>>> {
+//// make_pair(true, make_pair(260354, 260644)),
+//// make_pair(true, make_pair(300641, 300904)),
+//// make_pair(true, make_pair(300904, 301920)),
+//// make_pair(true, make_pair(301917, 302348)),
+//// make_pair(true, make_pair(260354, 260644)),
+//// make_pair(true, make_pair(300641, 300904)),
+//// make_pair(true, make_pair(300904, 301920)),
+//// make_pair(true, make_pair(301917, 302348)),
+//// make_pair(true, make_pair(302449, 304752)),
+//// make_pair(true, make_pair(263821, 264594)),
+//// make_pair(true, make_pair(265025, 265726)),
+//// make_pair(true, make_pair(265740, 266951))
+//// }
+//// , output_folder + "missed_genes/");
+////
+//// missed_genes.Analyze();
+// }
+//
+// ////////////
+//// WriteMagicLocality();
+// ////////////
+//
+//// 2339834
+//// INFO("Removing gaps");
+//// GapsRemover<Graph> gaps_remover(gp_.g, coloring, kBlueColorSet, 700);
+//// gaps_remover.RemoveGaps();
+//// INFO("Gaps removed");
+//
+// if (boost::starts_with(name1_, "idba")) {
+// IDBADiffAnalyzer<gp_t> diff_analyzer(gp_, coloring_, name1_, name2_,
+// output_folder + "/idba_analysis/");
+// diff_analyzer.Analyze(stream1_, stream2_);
+// }
+//
+// if (one_many_resolve) {
+// VERIFY(!untangle_);
+// RestrictedOneManyResolver<Graph> resolver(gp_.g, coloring_,
+// kVioletColorSet);
+// resolver.Resolve();
+// }
+//
+// if (detailed_output) {
+// if (gp_.genome.size() > 0) {
+// PrintColoredGraphAlongRef(gp_, coloring_, // gp_.edge_pos, TODO why not corresponding???
+// //gp_.genome,
+// output_folder + "initial_pics/colored_split_graph.dot");
+// } else {
+// PrintColoredGraph(gp_.g, coloring_, gp_.edge_pos,
+// output_folder + "initial_pics/colored_split_graph.dot");
+// }
+//
+// if (add_saves_path != "") {
+// UniversalSaveGP(gp_, //coloring,
+// add_saves_path);
+// SaveColoring(gp_.g, coloring_, add_saves_path);
+// //PrintColoredGraphWithColorFilter(gp_.g, coloring_, gp_.edge_pos,
+// // add_saves_path + ".dot");
+// }
+// UniversalSaveGP(gp_, //coloring,
+// output_folder + "saves/colored_split_graph");
+// SaveColoring(gp_.g, coloring_,
+// output_folder + "saves/colored_split_graph");
+// //PrintColoredGraphWithColorFilter(gp_.g, coloring_, gp_.edge_pos,
+// // output_folder + "saves/colored_split_graph.dot");
+// }
+//
+// if (untangle_) {
+// VERIFY(false);
+//// INFO("Untangling graph");
+//// bp_graph_pack<typename gp_t::graph_t> untangled_gp(gp_t::k_value);
+//// UntangledGraphConstructor<gp_t> untangler(gp_, coloring,
+//// untangled_gp, stream1_, stream2_);
+//// //todo ???
+//// // SimplifyGraph(untangled_gp.g);
+////
+//// if (detailed_output) {
+//// PrintColoredGraph(untangled_gp.g, untangled_gp.coloring,
+//// untangled_gp.edge_pos,
+//// output_folder + "initial_pics/untangled_graph.dot");
+//// UniversalSaveGP(untangled_gp, //untangled_gp.coloring,
+//// output_folder + "saves/untangled_graph");
+//// }
+////
+//// ProduceResults(untangled_gp, untangled_gp.coloring, output_folder,
+//// detailed_output);
+// } else {
+//// INFO("Analyzing gaps");
+//// GapComparativeAnalyzer<Graph> gap_analyzer(gp_.g, coloring,
+//// gp_.edge_pos);
+//// gap_analyzer.ReportPotentialGapsCloses(
+//// output_folder + "gap_closing_edges/");
+//
+// //trivial breakpoints
+// string bp_folder = output_folder + "breakpoints/";
+// make_dir(bp_folder);
+// TrivialBreakpointFinder<Graph> bp_finder(gp_.g, coloring_,
+// gp_.edge_pos);
+// bp_finder.FindBreakPoints(bp_folder);
+//
+// //possible rearrangements
+// string rearr_folder = output_folder + "rearrangements/";
+// make_dir(rearr_folder);
+// SimpleRearrangementDetector<gp_t> rearr_det(gp_, coloring_, "tdc_",
+// rearr_folder);
+// rearr_det.Detect();
+//
+// ProduceResults(gp_, coloring_, output_folder, detailed_output);
+// }
+// }
+//
+//private:
+// DECL_LOGGER("AssemblyComparer")
+// ;
+//};
+
+//template<size_t k, size_t K, class BuildSeq>
+//void RunBPComparison(ContigStream& raw_stream1, ContigStream& raw_stream2,
+// const string& name1, const string& name2, bool refine, bool untangle,
+// const string& output_folder, bool detailed_output = true, size_t delta =
+// 5, Sequence reference = Sequence(),
+// const string& add_saves_path = "") {
+// static double lol_time = 0;
+// cap::utils::add_time(lol_time, -1);
+//
+// io::SplittingWrapper stream1(raw_stream1);
+// io::SplittingWrapper stream2(raw_stream2);
+//
+// typedef debruijn_graph::graph_pack<
+// /*Nonc*/debruijn_graph::ConjugateDeBruijnGraph, BuildSeq> comparing_gp_t;
+//
+// if (refine) {
+// typedef graph_pack<ConjugateDeBruijnGraph, BuildSeq> refining_gp_t;
+// refining_gp_t refining_gp(k, "tmp");
+// io::VectorReader<io::SingleRead> genome_stream(
+// io::SingleRead("genome", reference.str()));
+// ContigStreamsPtr streams_ptr = make_shared<ContigStreams>(vector<ContigStream*>{&stream1, &stream2, &genome_stream}, false);
+//
+// ConstructGPForRefinement(refining_gp, streams_ptr, delta);
+//
+// io::ModifyingWrapper<io::SingleRead> refined_stream1(stream1,
+// GraphReadCorrectorInstance(refining_gp.g,
+// *MapperInstance(refining_gp)));
+// io::ModifyingWrapper<io::SingleRead> refined_stream2(stream2,
+// GraphReadCorrectorInstance(refining_gp.g,
+// *MapperInstance(refining_gp)));
+// io::ModifyingWrapper<io::SingleRead> reference_stream(genome_stream,
+// GraphReadCorrectorInstance(refining_gp.g,
+// *MapperInstance(refining_gp)));
+//
+// reference_stream.reset();
+// AssemblyComparer<comparing_gp_t> comparer(K, refined_stream1,
+// refined_stream2, name1, name2, untangle,
+// ReadSequence(reference_stream));
+// comparer.CompareAssemblies(output_folder, detailed_output, /*one_many_resolve*/
+// false, add_saves_path);
+// } else {
+// AssemblyComparer<comparing_gp_t> comparer(K, stream1, stream2, name1,
+// name2, untangle, reference);
+// comparer.CompareAssemblies(output_folder, detailed_output, /*one_many_resolve*/
+// false, add_saves_path);
+// }
+//
+// cap::utils::add_time(lol_time, +1);
+// INFO("LOL_TIME:: " << lol_time);
+//}
+
+//template<size_t k, size_t K>
+//void RunBPComparison(const Sequence& ref, ContigStream& stream,
+// const string& name1, const string& name2, bool refine, bool untangle,
+// const string& output_folder, bool detailed_output = true, size_t delta =
+// 5) {
+// io::VectorReader<io::SingleRead> ref_stream(
+// io::SingleRead(name1, ref.str()));
+// RunBPComparison<k, K>(ref_stream, stream, name1, name2, refine, untangle,
+// output_folder, detailed_output, delta);
+//}
+
+//template<size_t k, size_t K>
+//void RunBPComparison(const Sequence& s1, const Sequence& s2,
+// const string& name1, const string& name2, bool refine, bool untangle,
+// const string& output_folder, bool detailed_output = true) {
+// io::VectorReader<io::SingleRead> stream(io::SingleRead(name2, s2.str()));
+// RunBPComparison<k, K>(s1, stream, name1, name2, refine, untangle,
+// output_folder, detailed_output);
+//}
+//
+//template<size_t k, size_t K>
+//void RunBPComparison(const Sequence& ref, const vector<Sequence>& contigs,
+// const string& name1, const string& name2, bool refine, bool untangle,
+// const string& output_folder, bool detailed_output = true) {
+// io::VectorReader<io::SingleRead> stream(MakeReads(contigs));
+// RunBPComparison<k, K>(ref, stream, name1, name2, refine, untangle,
+// output_folder, detailed_output);
+//}
+
+//template<size_t k, class BuildSeq>
+//void CompareGenomes(const Sequence& genome_1, const Sequence& genome_2,
+// const string& output_dir) {
+// INFO("Genome comparison started");
+// io::VectorReader<io::SingleRead> stream1(
+// io::SingleRead("", genome_1.str()));
+// io::VectorReader<io::SingleRead> stream2(
+// io::SingleRead("", genome_2.str()));
+// typedef graph_pack</*Nonc*/ConjugateDeBruijnGraph, BuildSeq> comparing_gp_t; // k
+// INFO("Running assembly comparer");
+// AssemblyComparer<comparing_gp_t> comparer(k, stream1, stream2, "strain1",
+// "strain2", /*untangle*/false);
+// comparer.CompareAssemblies(output_dir, /*detailed_output*/true, /*on_many_resolve*/
+// false);
+// INFO("Finished");
+//}
+
+template<class gp_t>
+void ThreadAssemblies(const string& base_saves, ContigStream& base_assembly,
+ const string& base_prefix, ContigStream& assembly_to_thread,
+ const string& to_thread_prefix, const string& output_dir) {
+ typedef typename gp_t::graph_t Graph;
+ gp_t gp;
+// ConstructGraph<gp_t::k_value, Graph>(gp.g, gp.index, base_assembly);
+ ScanGraphPack(base_saves, gp);
+ base_assembly.reset();
+ FillPos(gp, base_assembly, base_prefix);
+ FillPos(gp, assembly_to_thread, to_thread_prefix);
+
+ EdgePosGraphLabeler<Graph> pos_labeler(gp.g, gp.edge_pos);
+ StrGraphLabeler<Graph> str_labeler(gp.g);
+ CompositeLabeler<Graph> labeler(pos_labeler, str_labeler);
+
+ auto mapper = MapperInstance(gp);
+
+ assembly_to_thread.reset();
+ io::SingleRead read;
+ while (!assembly_to_thread.eof()) {
+ assembly_to_thread >> read;
+ make_dir(output_dir + read.name());
+ WriteComponentsAlongPath(gp.g, labeler,
+ output_dir + read.name() + "/.dot", /*split_edge_length*/400,
+ mapper->MapSequence(read.sequence()),
+ Path<typename Graph::EdgeId>(), Path<typename Graph::EdgeId>(),
+ true);
+ }
+}
+
+template<class gp_t>
+void RunMultipleGenomesVisualization(size_t k_visualize,
+ vector<pair<std::string, std::string> > genomes_paths,
+ std::string output_folder) {
+ typedef typename gp_t::graph_t Graph;
+
+ utils::MakeDirPath(output_folder);
+
+ gp_t gp(k_visualize, "tmp", 0, Sequence(), 200);
+ ColorHandler<Graph> coloring(gp.g, genomes_paths.size());
+ CoordinatesHandler<Graph> coordinates_handler;
+
+ // ContigStream -> SplittingWrapper -> RCReaderWrapper -> PrefixAddingReaderWrapper
+
+ ContigStreams streams;
+ for (auto it = genomes_paths.begin(); it != genomes_paths.end(); ++it) {
+ streams.push_back(make_shared<io::FileReadStream>(it->second));
+ }
+
+ ContigStreams rc_wrapped = io::RCWrap(streams);
+
+ ConstructColoredGraph(gp, coloring, coordinates_handler, rc_wrapped);
+
+ ofstream indel_event_logger(output_folder + "/indel_events");
+
+// UnversalSaveGP(gp, output_folder + "/colored_split_graph");
+// SaveColoring(gp.g, coloring, output_folder + "/colored_split_graph");
+ //PrintColoredGraphWithColorFilter(gp.g, coloring, gp.edge_pos,
+ // output_folder + "/colored_split_graph.dot");
+}
+
+}
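
An illustrative sketch of the two small active utilities at the top of this file; the coloring object and the color constants are placeholders (the 700 bp bound only appears in the commented-out code above, so treat it as an assumption):

    // Drop all edges carrying a given color, then clean up the graph.
    cap::DeleteEdgesByColor(g, coloring, unwanted_color);
    // Compress the vertices flanking short gap-colored edges (here at most 700 bp).
    cap::GapsRemover<Graph> remover(g, coloring, gap_color, 700);
    remover.RemoveGaps();
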
diff --git a/src/projects/cap/assembly_problem_detection.hpp b/src/projects/cap/assembly_problem_detection.hpp
new file mode 100644
index 0000000..7040caf
--- /dev/null
+++ b/src/projects/cap/assembly_problem_detection.hpp
@@ -0,0 +1,453 @@
+////***************************************************************************
+////* Copyright (c) 2011-2014 Saint-Petersburg Academic University
+////* All Rights Reserved
+////* See file LICENSE for details.
+////****************************************************************************
+//
+//#pragma once
+//
+//namespace cap {
+//
+//template<class Graph>
+//class LabelFilter : public GraphComponentFilter<Graph> {
+// typedef typename Graph::VertexId VertexId;
+// const EdgesPositionHandler<Graph> &edge_pos_;
+//public:
+// LabelFilter(const Graph &graph, const EdgesPositionHandler<Graph> &edge_pos) :
+// GraphComponentFilter<Graph>(graph), edge_pos_(edge_pos) {
+// }
+//
+// virtual bool Check(const vector<typename Graph::VertexId> &component) const {
+// set<VertexId> cset(component.begin(), component.end());
+// for(auto vit = component.begin(); vit != component.end(); ++vit) {
+// auto out = this->graph().OutgoingEdges(*vit);
+// for(auto eit = out.begin(); eit != out.end(); ++eit) {
+// if(cset.count(this->graph().EdgeEnd(*eit)) > 0) {
+// auto labels = edge_pos_.GetEdgePositions(*eit);
+// for(auto it = labels.begin(); it != labels.end(); ++it) {
+// if(it->first == "ref_0" || it->first == "ref_1")
+// return true;
+// }
+// }
+// }
+// }
+// return false;
+// }
+//};
+//
+////todo how to generalize this to comparison against a genome???
+//template<class gp_t>
+//class IDBADiffAnalyzer {
+//private:
+// typedef typename gp_t::graph_t Graph;
+// typedef typename gp_t::index_t Index;
+// typedef typename gp_t::seq_t Kmer;
+// typedef typename Graph::EdgeId EdgeId;
+// typedef typename Graph::VertexId VertexId;
+// typedef io::SingleRead Contig;
+// typedef io::IReader<io::SingleRead> ContigStream;
+// typedef io::MultifileReader<io::SingleRead> CompositeStream;
+// typedef debruijn_graph::NewExtendedSequenceMapper<Graph, Index> Mapper;
+//
+// const gp_t& gp_;
+// const ColorHandler<Graph>& coloring_;
+// Mapper mapper_;
+//
+// string good_assembly_prefix_;
+// string bad_assembly_prefix_;
+// string dir_;
+// map<string, Sequence> contigs_map_;
+//
+// bool StartsWith(const string& s, const string& prefix) {
+// return boost::starts_with(s, prefix);
+// }
+//
+// void CollectContigs(ContigStream& good_assembly, ContigStream& bad_assembly) {
+// CompositeStream composite(good_assembly, bad_assembly);
+// composite.reset();
+// while(!composite.eof()) {
+// Contig c;
+// composite >> c;
+// contigs_map_[c.name()] = c.sequence();
+// }
+// composite.reset();
+// }
+//
+// set<string> CollectBadContigIdsAlongPath(const vector<EdgeId>& path) {
+// DEBUG("Collecting intersecting contigs from position marks");
+// set<string> answer;
+// for (auto it = path.begin(); it != path.end(); ++it) {
+// vector<EdgePosition> positions = gp_.edge_pos.GetEdgePositions(*it);
+// for (auto pos_it = positions.begin(); pos_it != positions.end(); ++pos_it) {
+// string id = pos_it->contigId;
+// if (StartsWith(id, bad_assembly_prefix_)) {
+// answer.insert(id);
+// }
+// }
+// }
+// DEBUG("Collected " << answer.size() << " contigs");
+// return answer;
+// }
+//
+// size_t Intersection(const set<EdgeId>& s1, const set<EdgeId>& s2) {
+// size_t ans = 0;
+// for (auto it = s1.begin(); it != s1.end(); ++it) {
+// if (s2.count(*it) > 0) {
+// ans++;
+// }
+// }
+// return ans;
+// }
+//
+// set<EdgeId> AsSet(vector<EdgeId> v) {
+// return set<EdgeId>(v.begin(), v.end());
+// }
+//
+// vector<EdgeId> MappingEdgeVector(const string& contig_id) {
+// VERIFY(contigs_map_.find(contig_id) != contigs_map_.end());
+// return mapper_.MapSequence(contigs_map_[contig_id]).simple_path();
+// }
+//
+// set<EdgeId> MappingEdgeSet(const string& contig_id) {
+// return AsSet(MappingEdgeVector(contig_id));
+// }
+//
+// string FindBestBadContigWRTPath(const set<string>& contigs, const vector<EdgeId>& path) {
+// DEBUG("Looking for best contig")
+// set<EdgeId> path_edges(path.begin(), path.end());
+// size_t best_intersection = 0;
+// string best_contig = "";
+// for (auto it = contigs.begin(); it != contigs.end(); ++it) {
+// size_t intersect = Intersection(MappingEdgeSet(*it), path_edges);
+// if (intersect > best_intersection) {
+// best_intersection = intersect;
+// best_contig = *it;
+// }
+// }
+// DEBUG("Best contig is " << best_contig);
+// return best_contig;
+// }
+//
+// bool InnerVertex(VertexId v, const vector<EdgeId>& path) {
+// if (path.empty())
+// return false;
+// for (size_t i = 0; i < path.size() - 1; ++i) {
+// if (gp_.g.EdgeEnd(path[i]) == v)
+// return true;
+// }
+// return false;
+// }
+//
+// VertexId FirstBranchingVertex(const vector<EdgeId>& bad_path, const vector<EdgeId>& good_path) {
+// for (auto it = bad_path.begin(); it != bad_path.end(); ++it) {
+// if (InnerVertex(gp_.g.EdgeStart(*it), good_path)) {
+// return gp_.g.EdgeStart(*it);
+// }
+// if (InnerVertex(gp_.g.EdgeEnd(*it), good_path)) {
+// return gp_.g.EdgeEnd(*it);
+// }
+// }
+// return (VertexId) NULL;
+// }
+//
+// VertexId LastBranchingVertex(const vector<EdgeId>& bad_path, const vector<EdgeId>& good_path) {
+// for (auto it = bad_path.rbegin(); it != bad_path.rend(); ++it) {
+// if (InnerVertex(gp_.g.EdgeEnd(*it), good_path)) {
+// return gp_.g.EdgeEnd(*it);
+// }
+// if (InnerVertex(gp_.g.EdgeStart(*it), good_path)) {
+// return gp_.g.EdgeStart(*it);
+// }
+// }
+// return (VertexId) NULL;
+// }
+//
+// bool SingleAssemblyEdge(EdgeId e, const string& prefix) {
+// vector<EdgePosition> positions = gp_.edge_pos.GetEdgePositions(e);
+// for (auto it = positions.begin(); it != positions.end(); ++it) {
+// if (!StartsWith(it->contigId, prefix)) {
+// return false;
+// }
+// }
+// return true;
+// }
+//
+// bool ContainsSingleAssemblyEdge(const vector<EdgeId> edges, const string& prefix) {
+// for (auto it = edges.begin(); it != edges.end(); ++it)
+// if (SingleAssemblyEdge(*it, prefix))
+// return true;
+// return false;
+// }
+//
+// vector<EdgeId> SingleAssemblyEdges(const vector<EdgeId> edges, const string& prefix) {
+// vector<EdgeId> answer;
+// for (auto it = edges.begin(); it != edges.end(); ++it)
+// if (SingleAssemblyEdge(*it, prefix))
+// answer.push_back(*it);
+// return answer;
+// }
+//
+// vector<EdgeId> IncidentEdges(VertexId v) {
+// vector<EdgeId> ans;
+// push_back_all(ans, gp_.g.IncomingEdges(v));
+// push_back_all(ans, gp_.g.OutgoingEdges(v));
+// return ans;
+// }
+//
+// vector<EdgeId> IncidentEdgesInPath(VertexId v, const vector<EdgeId>& good_contig_path) {
+// vector<EdgeId> ans;
+// vector<EdgeId> adj = IncidentEdges(v);
+// for (size_t i = 0; i < adj.size(); ++i)
+// if (find(good_contig_path.begin(), good_contig_path.end(), adj[i]) != good_contig_path.end())
+// ans.push_back(adj[i]);
+// return ans;
+// }
+//
+// void ReportLocality(VertexId v, const vector<EdgeId>& good_contig_path, const string& best_contig, const Contig& c, const string& folder) {
+// using namespace omnigraph::visualization;
+// make_dir(folder);
+// LengthIdGraphLabeler<Graph> basic_labeler(gp_.g);
+// EdgePosGraphLabeler<Graph> pos_labeler(gp_.g, gp_.edge_pos);
+//
+// CompositeLabeler<Graph> labeler(basic_labeler, pos_labeler);
+//
+// LabelFilter<typename gp_t::graph_t> lf(gp_.g, gp_.edge_pos);
+// string file_name = folder + c.name() + "_|_" + best_contig + ".dot";
+// EdgeId edge = IncidentEdgesInPath(v, good_contig_path).front();
+// GraphComponent<Graph> component = omnigraph::EdgeNeighborhood(gp_.g, edge);
+// if(lf.Check(vector<VertexId>(component.v_begin(), component.v_end()))) {
+// WriteComponent(component, file_name, BorderDecorator<Graph>::GetInstance(component, coloring_.GetInstance()), labeler);
+// }
+// }
+//
+// size_t LengthForward(EdgeId e, const vector<EdgeId>& good_contig_path) {
+// size_t ans = 0;
+// bool passed = false;
+// for (auto it = good_contig_path.begin(); it != good_contig_path.end(); ++it) {
+// if (*it == e)
+// passed = true;
+// if (passed)
+// ans += gp_.g.length(*it);
+// }
+// return ans;
+// }
+//
+// size_t LengthBackward(EdgeId e, const vector<EdgeId>& good_contig_path) {
+// size_t ans = 0;
+// bool passed = false;
+// for (auto it = good_contig_path.rbegin(); it != good_contig_path.rend(); ++it) {
+// if (*it == e)
+// passed = true;
+// if (passed)
+// ans += gp_.g.length(*it);
+// }
+// return ans;
+// }
+//
+// size_t LengthToEndOnGoodContig(VertexId v, const vector<EdgeId>& good_contig_path, const set<EdgeId>& best_alt_contig_path) {
+// vector<EdgeId> adj_in_path = IncidentEdgesInPath(v, good_contig_path);
+// for (auto it = adj_in_path.begin(); it != adj_in_path.end(); ++it) {
+// if (best_alt_contig_path.count(*it) == 0) {
+// if (v == gp_.g.EdgeStart(*it)) {
+// return LengthForward(*it, good_contig_path);
+// } else {
+// VERIFY(v == gp_.g.EdgeEnd(*it));
+// return LengthBackward(*it, good_contig_path);
+// }
+// }
+// }
+//// VERIFY(false);
+// WARN("Something strange for vertex " << v);
+// return 0;
+// }
+//
+// void ClassifyAndReportBreak(VertexId v, const vector<EdgeId>& good_contig_path, const string& best_contig, const Contig& c) {
+// DEBUG("Trying to classify break");
+// if (gp_.g.EdgeStart(good_contig_path.front()) == v || gp_.g.EdgeEnd(good_contig_path.back()) == v) {
+// DEBUG("Vertex was an end of initial contig");
+// return;
+// }
+// if (ContainsSingleAssemblyEdge(IncidentEdgesInPath(v, good_contig_path), good_assembly_prefix_)) {
+// DEBUG("Vertex has adjacent \"good\" assembly edge");
+// if (SingleAssemblyEdges(IncidentEdgesInPath(v, good_contig_path), good_assembly_prefix_).size() == 1) {
+// EdgeId e = SingleAssemblyEdges(IncidentEdgesInPath(v, good_contig_path), good_assembly_prefix_).front();
+// if (e == good_contig_path.front() || e == good_contig_path.back()) {
+// DEBUG("This edge is at the end of initial contig");
+// DEBUG("Skipping");
+// return;
+// }
+// }
+// DEBUG("Reporting locality of vertex " << gp_.g.str(v) << " as possible coverage gap");
+// ReportLocality(v, good_contig_path, best_contig, c, dir_ + "/coverage_gaps/");
+// return;
+// }
+// if (ContainsSingleAssemblyEdge(IncidentEdges(v), bad_assembly_prefix_)
+// && !ContainsSingleAssemblyEdge(IncidentEdgesInPath(v, good_contig_path), good_assembly_prefix_)) {
+// DEBUG("Reporting locality of vertex " << gp_.g.str(v) << " as possible EC problem");
+// ReportLocality(v, good_contig_path, best_contig, c, dir_ + "/ec_problem/");
+// return;
+// }
+// if (!ContainsSingleAssemblyEdge(IncidentEdges(v), bad_assembly_prefix_)
+// && !ContainsSingleAssemblyEdge(IncidentEdgesInPath(v, good_contig_path), good_assembly_prefix_)) {
+// DEBUG("Possible RR problem. Checking remaining length");
+// if (LengthToEndOnGoodContig(v, good_contig_path, MappingEdgeSet(best_contig)) > 10000) {
+// DEBUG("Check ok");
+// DEBUG("Reporting locality of vertex " << gp_.g.str(v) << " as possible RR problem");
+// ReportLocality(v, good_contig_path, best_contig, c, dir_ + "/rr_problem/");
+// return;
+// } else {
+// DEBUG("Check fail, won't report");
+// }
+// }
+// DEBUG("Unclassified problem type");
+// }
+//
+// void AnalyzeBadContigsWRTPath(const set<string>& contigs, const vector<EdgeId>& path, const Contig& c) {
+// string best_contig = FindBestBadContigWRTPath(contigs, path);
+// if (best_contig == "")
+// return;
+// vector<EdgeId> path_edges = MappingEdgeVector(best_contig);
+//
+// DEBUG("Best contig mapped to path: " << gp_.g.str(path_edges));
+//
+// if (path_edges.empty() || !CheckContiguous(gp_.g, path_edges)) {
+// WARN("Path for best contig " << best_contig << " wasn't continuous");
+// return;
+// }
+// DEBUG("Looking for first branching vertex");
+// VertexId first = FirstBranchingVertex(path_edges, path);
+// if(first != VertexId(NULL)) {
+// DEBUG("First branching vertex is " << gp_.g.str(first));
+// ClassifyAndReportBreak(first, path, best_contig, c);
+// } else {
+// DEBUG("Failed to find first branching vertex");
+// }
+// DEBUG("Looking for last branching vertex");
+// VertexId last = LastBranchingVertex(path_edges, path);
+// if(last != VertexId(NULL)) {
+// DEBUG("Last branching vertex is " << gp_.g.str(last));
+// ClassifyAndReportBreak(last, path, best_contig, c);
+// } else {
+// DEBUG("Failed to find last branching vertex");
+// }
+// }
+//
+// void AnalyzeGoodContig(const Contig& c) {
+// DEBUG("Analyzing contig " << c.name());
+//
+// vector<EdgeId> path_edges = MappingEdgeVector(c.name());
+// DEBUG("Contig mapped to path: " << gp_.g.str(path_edges));
+//
+// if (path_edges.empty() || !CheckContiguous(gp_.g, path_edges)) {
+// WARN("Path for good contig " << c.name() << " wasn't continuous");
+// return;
+// }
+//
+// set<string> bad_contig_ids = CollectBadContigIdsAlongPath(path_edges);
+// AnalyzeBadContigsWRTPath(bad_contig_ids, path_edges, c);
+// }
+//
+//public:
+// IDBADiffAnalyzer(const gp_t& gp,
+//// const EdgesPositionHandler<Graph>& pos,
+// const ColorHandler<Graph>& coloring,
+// const string& good_assembly_prefix,
+// const string& bad_assembly_prefix,
+// const string& dir)
+// : gp_(gp)/*, pos_(pos)*/,
+// coloring_(coloring),
+// mapper_(gp.g, gp.index, gp.kmer_mapper, gp.k_value + 1),
+// good_assembly_prefix_(good_assembly_prefix),
+// bad_assembly_prefix_(bad_assembly_prefix),
+// dir_(dir) {
+// DEBUG("\"Good\" assembly prefix " << good_assembly_prefix);
+// DEBUG("\"Bad\" assembly prefix " << bad_assembly_prefix);
+// }
+//
+// void Analyze(ContigStream& good_assembly, ContigStream& bad_assembly) {
+// CollectContigs(good_assembly, bad_assembly);
+// while (!good_assembly.eof()) {
+// Contig c;
+// good_assembly >> c;
+// AnalyzeGoodContig(c);
+// }
+// }
+//
+// DECL_LOGGER("IDBADiffAnalyzer");
+//};
+//
+////investigates if red edges can close gaps in blue assembly
+//template<class Graph>
+//class GapComparativeAnalyzer {
+// typedef typename Graph::EdgeId EdgeId;
+// typedef typename Graph::VertexId VertexId;
+//
+// const Graph& g_;
+// const ColorHandler<Graph>& coloring_;
+// const EdgesPositionHandler<Graph>& pos_;
+//
+// bool PurpleOrRed(EdgeId e) {
+// return coloring_.Color(e) == kRedColorSet
+// || coloring_.Color(e) == kVioletColorSet;
+// }
+//
+// bool CheckVertexCondition(VertexId v) {
+// return g_.CheckUniqueOutgoingEdge(v) && g_.CheckUniqueIncomingEdge(v)
+// && PurpleOrRed(g_.GetUniqueOutgoingEdge(v))
+// && PurpleOrRed(g_.GetUniqueIncomingEdge(v));
+// }
+//
+// void ReportEdge(EdgeId e, const string& folder) {
+// using namespace omnigraph::visualization;
+// INFO(
+// "Can close gap between edges " << g_.str(g_.GetUniqueIncomingEdge(g_.EdgeStart(e))) << " and " << g_.str(g_.GetUniqueOutgoingEdge(g_.EdgeEnd(e))) << " with edge " << g_.str(e));
+// LengthIdGraphLabeler<Graph> basic_labeler(g_);
+// EdgePosGraphLabeler<Graph> pos_labeler(g_, pos_);
+//
+// CompositeLabeler<Graph> labeler(basic_labeler, pos_labeler);
+// GraphComponent<Graph> component = omnigraph::EdgeNeighborhood(g_, e);
+// auto colorer = coloring_.ConstructColorer(component);
+// omnigraph::visualization::WriteComponent(component, folder + ToString(g_.int_id(e)) + "_loc.dot", colorer, labeler);
+// }
+//
+//// bool CheckEdges(const vector<EdgeId>& edges) {
+//// set<TColorSet> colors;
+//// for (auto it = edges.begin(); it != edges.end(); ++it) {
+//// colors.insert(coloring_.Color(*it));
+//// }
+//// return edges.size() == 1
+//// || (edges.size() == 2 && colors.count(kBlueColor) == 1);
+//// }
+//
+//public:
+// GapComparativeAnalyzer(const Graph& g, const ColorHandler<Graph>& coloring,
+// const EdgesPositionHandler<Graph>& pos) :
+// g_(g), coloring_(coloring), pos_(pos) {
+// }
+//
+// void ReportPotentialGapsCloses(const string& folder) {
+// make_dir(folder);
+// for (auto it = g_.SmartEdgeBegin(); !it.IsEnd(); ++it) {
+// if (coloring_.Color(*it) == kRedColorSet
+// && CheckVertexCondition(g_.EdgeStart(*it))
+// && CheckVertexCondition(g_.EdgeEnd(*it))) {
+// ReportEdge(*it, folder);
+// }
+// }
+// }
+//
+//// void ReportDeepGapsCloses(const string& folder) {
+//// make_dir(folder);
+//// for (auto it = g_.SmartEdgeBegin(); !it.IsEnd(); ++it) {
+//// if (coloring_.Color(*it) == kRedColorSet
+//// && CheckEdges(g_.OutgoingEdges(g_.EdgeStart(*it)))
+//// && CheckEdges(g_.IncomingEdges(g_.EdgeEnd(*it)))
+//// && ContainsTip()) {
+////
+//// }
+//// }
+//// }
+//};
+//
+//
+//}
diff --git a/src/projects/cap/cap_commands.hpp b/src/projects/cap/cap_commands.hpp
new file mode 100644
index 0000000..c4c637f
--- /dev/null
+++ b/src/projects/cap/cap_commands.hpp
@@ -0,0 +1,731 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "cap_environment.hpp"
+#include "cap_environment_manager.hpp"
+#include "mosaic.hpp"
+#include "io/reads_io/sequence_reader.hpp"
+#include "dev_support/path_helper.hpp"
+
+namespace online_visualization {
+
+class AddGenomeCommand : public LocalCommand<CapEnvironment> {
+ public:
+ AddGenomeCommand() : LocalCommand<CapEnvironment>("add_genome") {
+ }
+
+ virtual std::string Usage() const {
+ return "Command `add_genome`\n"
+        "     Adds the genome contained in the specified file to the environment. This involves:\n"
+        "      * further tracking of the genome (incl. refinement, searching for diffs, drawing pics, etc.)\n"
+        "      * storing the whole genome sequence in RAM throughout usage of the current environment\n"
+ "Usage:\n"
+ "> add_genome <path_to_file> <genome_name> [<crop_repeats> (Yy)] \n"
+        "     You should specify the path to the file in which genome data is stored "
+        "(.fasta, .gb, etc.). Also you should provide a name for the genome "
+        "to display in future output.\n"
+ "For example:\n"
+ "> add_genome /home/puperuser/genomes/my_genome.fasta my_genome\n"
+ " would add to the environment genome stored in file "
+ "`my_genome.fasta` located in folder `/home/puperuser/genomes`\n"
+        "Optionally, N's, other strange symbols and repeat families marked by programs such as RepeatMasker (written in lowercase letters)"
+        " can be omitted without loss of original coordinates\n";
+ }
+
+ virtual void Execute(CapEnvironment& curr_env, const ArgumentList& arg_list) const {
+ if (!CheckCorrectness(arg_list)) {
+ return;
+ }
+ const vector<string>& args = arg_list.GetAllArguments();
+ const std::string &filename = args[1];
+    std::string name = args[2];
+ bool crop_repeats = false;
+ if (args.size() > 3) {
+ VERIFY(args[3] == "Y" || args[3] == "y");
+ crop_repeats = true;
+      std::cout << "Repeat crop enabled! All lowercase letters will be ignored with coordinates preserved\n";
+ }
+
+ bool success = curr_env.manager().AddGenomeFromFile(filename, name, crop_repeats);
+ if (!success) {
+ std::cout << "Failed. Genome is not valid. Please check input.\n";
+ }
+ }
+
+ protected:
+ virtual size_t MinArgNumber() const {
+ return 2;
+ }
+
+ virtual bool CheckCorrectness(const ArgumentList& arg_list) const {
+ const vector<std::string> &args = arg_list.GetAllArguments();
+ if (!CheckEnoughArguments(args)) {
+      std::cout << "Command takes at least two arguments. Aborting.\n";
+ return false;
+ }
+
+ const std::string &filename = args[1];
+ if (!CheckFileExists(filename)) {
+ std::cout << "There is no file linked to the path given. Aborting.\n";
+ return false;
+ }
+
+ return true;
+ }
+};
+
+class SaveGenomesCommand : public LocalCommand<CapEnvironment> {
+ public:
+ SaveGenomesCommand() : LocalCommand<CapEnvironment>("save_genomes") {
+ }
+
+ virtual std::string Usage() const {
+ return "Command `save_genomes`\n"
+ " Saves all progress of refining the genomes.\n"
+ " Namely, stores refined (modified) sequences on hard drive.\n"
+ "Usage:\n"
+ "> save_genomes [force]\n"
+        "     `force` is an optional parameter. If `force` is /y|Y|(force)/ then\n"
+ " genomes will be written even if this combination of genomes and Ks\n"
+        "     was stored before.\n";
+ }
+
+ virtual void Execute(CapEnvironment& curr_env, const ArgumentList& arg_list) const {
+ const vector<string> &args = arg_list.GetAllArguments();
+
+ bool force = false;
+ if (args.size() > 1 && (args[1] == "y" || args[1] == "Y" || args[1] == "force")) {
+ force = true;
+ }
+
+ std::string dir = curr_env.manager().GetDirForCurrentState();
+
+ std::cout << "Saving genomes to " << dir << " ...";
+ if (cap::utils::DirExist(dir)) {
+ std::cout << "Looks like current state was already stored. ";
+ if (force) {
+ std::cout << "(!) FORCED WRITE..";
+ } else {
+ std::cout << "Omitting this stage.\n";
+ return;
+ }
+ }
+ curr_env.manager().SaveGenomesToDisk(force);
+
+ cout << "Done.\n";
+ }
+
+ protected:
+ virtual size_t MinArgNumber() const {
+ return 0;
+ }
+
+};
+
+class RefineCommand : public LocalCommand<CapEnvironment> {
+ public:
+ RefineCommand() : LocalCommand<CapEnvironment>("refine") {
+ }
+
+ virtual std::string Usage() const {
+ return "Command `refine`\n"
+        "     Refines the multicolored de Bruijn graph built from all genomes in the environment with the last chosen K.\n"
+        "     Some K should be selected and the graph built before running this command (see `build_graph`)\n"
+ "Usage:\n"
+ "> refine\n";
+ }
+
+ virtual void Execute(CapEnvironment& curr_env, const ArgumentList& /* arg_list */) const {
+ if (curr_env.GetGraphK() == CapEnvironment::kNoGraphK) {
+ cout << "Graph has not yet been constructed, aborting.\n";
+ return;
+ }
+
+ cout << "Refining graph..";
+ curr_env.manager().Refine();
+ cout << " Done.\n";
+ }
+
+ protected:
+ virtual size_t MinArgNumber() const {
+ return 0;
+ }
+
+};
+
+class BuildGraphCommand : public LocalCommand<CapEnvironment> {
+ public:
+ BuildGraphCommand() : LocalCommand<CapEnvironment>("build_graph") {
+ }
+
+ virtual std::string Usage() const {
+ return "Command `build_graph`\n"
+        "     Sets K for the multicolored de Bruijn graph and builds the graph from genomes previously added to the environment (see `add_genome`)\n"
+ " K should be odd.\n"
+ "Usage:\n"
+ "> build_graph <k>\n";
+ }
+
+ virtual void Execute(CapEnvironment& curr_env, const ArgumentList& arg_list) const {
+ const vector<string> &args = arg_list.GetAllArguments();
+
+ if (!CheckEnoughArguments(args)) {
+ return;
+ }
+ unsigned k;
+
+ std::stringstream ss(args[1]);
+ ss >> k;
+
+ if (k % 2 == 0) {
+ cout << "K should be odd. Aborting.\n";
+ return;
+ }
+
+ cout << "Building graph..";
+ curr_env.manager().ConstructGraph(k);
+ cout << " Done.\n";
+ }
+
+ protected:
+ virtual size_t MinArgNumber() const {
+ return 1;
+ }
+};
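+
+// Illustrative note: the commands above are meant to be chained in the
+// interactive cap shell roughly as in the session below (genome paths and the
+// K value are hypothetical examples; K must be odd, see `build_graph`):
+//
+//   > add_genome /data/strain_A.fasta strain_A
+//   > add_genome /data/strain_B.fasta strain_B
+//   > build_graph 55
+//   > refine
+//   > save_genomes force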
+
+template <>
+class LoadCommand<CapEnvironment> : public Command<CapEnvironment> {
+ private:
+ typedef CapEnvironment Env;
+ shared_ptr<Env> MakeNewEnvironment(const string& name, const string &desc) const {
+ DEBUG("Making new environment " << name);
+ shared_ptr<Env> EnvPointer(new Env(name, desc));
+ DEBUG("Done");
+ return EnvPointer;
+ }
+
+ protected:
+ size_t MinArgNumber() const {
+ return 1;
+ }
+
+ bool CheckCorrectness(const vector<string>& args, LoadedEnvironments<Env>& loaded_environments) const
+ {
+ if (!this->CheckEnoughArguments(args))
+ return false;
+
+ string name = args[1];
+ for (auto iterator = loaded_environments.begin(); iterator != loaded_environments.end(); ++iterator) {
+ if (name == iterator->first) {
+ cout << "Name " << name << " already exists" << endl;
+ cout << "Maybe you want to switch to this environment? " << name << endl;
+ cout << "Please try again" << endl;
+ return false;
+ }
+ }
+ return true;
+ }
+
+ public:
+ string Usage() const {
+ string answer;
+ answer = answer + "Command `load` \n" +
+ "Usage:\n" +
+ "> load <environment_name> [description]\n" +
+ " You should specify the name of the new environment. All data and cache concerning \n" +
+ " this environment will be stored in " + cap_cfg::get().cache_root + "/<environment_name>/\n" +
+ " See cap_config for changing cache root folder.\n";
+ return answer;
+ }
+
+ LoadCommand() : Command<Env>("load")
+ {
+ }
+
+ void Execute(shared_ptr<Env>& curr_env,
+ LoadedEnvironments<Env>& loaded_environments,
+ const ArgumentList& arg_list) const
+ {
+ vector<string> args = arg_list.GetAllArguments();
+ string name = args[1];
+ string desc = "";
+ for (size_t i = 2; i < args.size(); ++i) {
+ if (i > 2) {
+ desc += " ";
+ }
+ desc += args[i];
+ }
+
+ cout << "Loading " << name << endl;
+ if (!CheckCorrectness(args, loaded_environments))
+ return;
+
+ shared_ptr<Env> new_env = MakeNewEnvironment(name, desc);
+ loaded_environments.insert(make_pair(name, new_env));
+ curr_env = new_env;
+ }
+
+};
+
+class SaveEnvCommand : public NewLocalCommand<CapEnvironment> {
+ public:
+ SaveEnvCommand() : NewLocalCommand<CapEnvironment>("save_env", 1) {
+ }
+
+ virtual std::string Usage() const {
+ return "Command `save_env`\n"
+ "Usage:\n"
+        "> save_env <directory_to_save_to>\n";
+ }
+
+ private:
+ virtual void InnerExecute(CapEnvironment& curr_env, const vector<string>& args) const {
+    std::string folder = args[1] + "/";
+
+ cout << "Saving env in " << folder << " ...";
+
+ cap::utils::MakeDirPath(folder);
+ VERIFY(cap::utils::DirExist(folder));
+
+ std::ofstream write_stream(folder + "environment");
+ curr_env.WriteToStream(write_stream);
+ write_stream.close();
+ cout << " Done.\n";
+ }
+
+};
+
+class LoadEnvCommand : public NewLocalCommand<CapEnvironment> {
+ public:
+ LoadEnvCommand() : NewLocalCommand<CapEnvironment>("load_env", 1) {
+ }
+
+ virtual std::string Usage() const {
+ return "Command `load_env`\n"
+ "Usage:\n"
+ "> load_env <directory with save>\n";
+ }
+
+private:
+ virtual void InnerExecute(CapEnvironment& curr_env, const vector<string>& args) const {
+    VERIFY(args.size() > 1);
+    std::string folder = args[1] + "/";
+
+ cout << "Load env from " << folder << " ...";
+
+ std::ifstream read_stream(folder + "environment");
+ curr_env.ReadFromStream(read_stream);
+ read_stream.close();
+ cout << " Done.\n";
+ }
+
+};
+
+class SaveGraphCommand : public NewLocalCommand<CapEnvironment> {
+ public:
+ SaveGraphCommand() : NewLocalCommand<CapEnvironment>("save_graph", 0) {
+ }
+
+ virtual std::string Usage() const {
+ return "Command `save_graph`\n"
+        "     Saves the graph in common SPAdes format in the specified directory.\n"
+        "     If no directory is specified then the default cache directory for the current state is used.\n"
+ "Usage:\n"
+ "> save_graph <directory_to_save_to>\n";
+ }
+
+ virtual void InnerExecute(CapEnvironment& curr_env, const vector<string>& args) const {
+ if (curr_env.GetGraphK() == CapEnvironment::kNoGraphK) {
+ cout << "You should build graph prior to saving it. Aborting.\n";
+ return;
+ }
+
+ string folder = TryFetchFolder(curr_env, args);
+
+ cout << "Saving graph in " << folder << " ...";
+ curr_env.manager().SaveGraph(folder + "saves/");
+ cout << " Done.\n";
+ }
+
+};
+
+class DrawPicsCommand : public LocalCommand<CapEnvironment> {
+ public:
+ DrawPicsCommand() : LocalCommand<CapEnvironment>("draw_pics") {
+ }
+
+ virtual std::string Usage() const {
+ return "Command `draw_pics`\n"
+        "     Draws colored graph components in the specified directory.\n"
+        "     If no directory is specified then the default cache directory for the current state is used.\n"
+ "Usage:\n"
+ "> draw_pics <directory_to_save_to>\n";
+ }
+
+ virtual void Execute(CapEnvironment& curr_env, const ArgumentList& arg_list) const {
+ if (curr_env.GetGraphK() == CapEnvironment::kNoGraphK) {
+      cout << "You should build the graph prior to drawing pics. Aborting.\n";
+ return;
+ }
+
+ std::string folder = TryFetchFolder(curr_env, arg_list);
+
+ cout << "Drawing pics in " << folder << " ...";
+ curr_env.manager().DrawPics(folder + "pics/");
+ cout << " Done.\n";
+ }
+
+};
+
+class FindIndelsCommand : public LocalCommand<CapEnvironment> {
+ public:
+ FindIndelsCommand() : LocalCommand<CapEnvironment>("find_indels") {
+ }
+
+ virtual std::string Usage() const {
+ return "Command `find_indels`\n"
+        "     Finds common in-del events that transform genomes into each other and writes them out.\n"
+ " If no output file is specified, the results are written to the default file (see `log_file`)\n"
+        "     There is also a feature to mask found indels in the graph (!!! this does not affect sequences)\n"
+        "     Note that the graph should be built prior to finding indel events\n"
+ "Usage:\n"
+ "> find_indels [<mask>=N [<filename>]]\n"
+ "Where\n"
+ " <mask> is either Y or N\n"
+ "For example:\n"
+ "> find_indels Y ./indels/log5.txt\n"
+ "NOTE: when output file is specified it is overwritten if exists\n";
+ }
+
+ virtual void Execute(CapEnvironment &curr_env, const ArgumentList &arg_list) const {
+ if (curr_env.GetGraphK() == CapEnvironment::kNoGraphK) {
+      cout << "You should build the graph prior to finding indels. Aborting.\n";
+ return;
+ }
+
+ const vector<string> &args = arg_list.GetAllArguments();
+
+ bool mask_indels = false;
+ std::string filename = curr_env.event_log_path();
+ std::string mode = curr_env.event_log_file_mode();
+
+ if (args.size() > 1) {
+ VERIFY(args[1].size());
+ if (args[1][0] == 'Y' || args[1][0] == 'y') {
+ mask_indels = true;
+ }
+ }
+ if (args.size() > 2) {
+ filename = args[2];
+ mode = "w";
+ }
+
+ int code = curr_env.manager().FindIndels(mask_indels, filename, mode);
+ if (code == 1) {
+ cout << "Output file could not be opened for writing. Aborting.\n";
+ }
+ }
+
+};
+
+class FindInversionsCommand : public LocalCommand<CapEnvironment> {
+ public:
+ FindInversionsCommand() : LocalCommand<CapEnvironment>("find_inversions") {
+ }
+
+ virtual std::string Usage() const {
+ return "Command `find_inversions`\n"
+        "     Finds common inversion events that transform genomes into each other and writes them out (actually, NO).\n"
+        "     Note that the graph should be built prior to finding inversion events\n"
+ "Usage:\n"
+ "> find_inversions\n";
+ }
+
+ virtual void Execute(CapEnvironment &curr_env, const ArgumentList &/* arg_list */) const {
+ if (curr_env.GetGraphK() == CapEnvironment::kNoGraphK) {
+      cout << "You should build the graph prior to finding inversions. Aborting.\n";
+ return;
+ }
+
+// const vector<string> &args = arg_list.GetAllArguments();?
+
+ bool mask_inversions = false;
+ std::string filename = curr_env.event_log_path();
+ std::string mode = curr_env.event_log_file_mode();
+
+ /*
+ if (args.size() > 1) {
+ VERIFY(args[1].size());
+ if (args[1][0] == 'Y' || args[1][0] == 'y') {
+ mask_indels = true;
+ }
+ }
+ if (args.size() > 2) {
+ filename = args[2];
+ mode = "w";
+ }
+ */
+
+ /*int code = */curr_env.manager().FindInversions(mask_inversions, filename, mode);
+ /*
+ if (code == 1) {
+ cout << "Output file could not be opened for writing. Aborting.\n";
+ }
+ */
+ }
+
+};
+
+class BlocksToGRIMMFormat : public LocalCommand<CapEnvironment> {
+ public:
+ BlocksToGRIMMFormat() : LocalCommand<CapEnvironment>("blocks_to_grimm") {
+ }
+
+ virtual std::string Usage() const {
+    return "Command `blocks_to_grimm`\n"
+        "     Converts blocks output by `save_blocks` to GRIMM format.\n"
+        "Usage:\n"
+        "> blocks_to_grimm <blocks_file> <grimm_file>\n";
+ }
+
+ virtual void Execute(CapEnvironment &/* curr_env */, const ArgumentList &arg_list) const {
+ const vector<string> &args = arg_list.GetAllArguments();
+
+ if (args.size() <= 2) {
+      cerr << "Not enough arguments" << endl;
+ return;
+ }
+
+ std::string file_from = args[1],
+ file_to = args[2];
+
+ path::make_full_path(file_from);
+ path::make_full_path(file_to);
+
+ std::string dir = path::parent_path(file_to);
+ cap::utils::MakeDirPath(dir);
+
+ BlockPrinter<Graph>::ConvertBlocksToGRIMM(file_from, file_to);
+ }
+};
+
+class SaveBlocksCommand : public LocalCommand<CapEnvironment> {
+ public:
+ SaveBlocksCommand() : LocalCommand<CapEnvironment>("save_blocks") {
+ }
+
+ virtual std::string Usage() const {
+ return "Command `save_blocks`\n"
+ " Saves all trivial synteny blocks (aka graph edges).\n"
+ " Synteny blocks are given new ids (with edge ids also in the file).\n"
+ " All the coordinates ()\n"
+ "Usage:\n"
+ "> save_blocks <file_to_save_to> [unique]\n"
+ "Where\n"
+ " [unique] if set and equals to (unique|Y|y) then only blocks\n"
+ " that appear exactly once in the contigs will be reported.\n";
+ }
+
+ virtual void Execute(CapEnvironment& curr_env, const ArgumentList& arg_list) const {
+ const vector<string> &args = arg_list.GetAllArguments();
+ const std::string folder = TryFetchFolder(curr_env, arg_list);
+
+ bool unique = false;
+ if (args.size() > 2 && (args[2] == "y" || args[2] == "Y" || args[2] == "unique")) {
+ unique = true;
+ }
+    INFO("unique = " << unique << (args.size() > 2 ? ", args[2] = " + args[2] : std::string()));
+
+ BlockPrinter<Graph> *printer;
+
+ if (!unique) {
+ printer = new BlockPrinter<Graph>(curr_env.graph(),
+ curr_env.coordinates_handler(), folder + "blocks.txt");
+ } else {
+ vector<pair<size_t, size_t>> rc_pairs = PrepareRCContigPairs(curr_env);
+ printer = new UniqueBlockPrinter<Graph>(curr_env.graph(),
+ curr_env.coordinates_handler(), folder + "blocks.txt", rc_pairs);
+ }
+
+ for (unsigned i = 0; i < curr_env.genome_cnt(); ++i) {
+ printer->ProcessContig(i, 2*i, curr_env.genome_names()[i]);
+ }
+
+ delete printer;
+ }
+
+ protected:
+ virtual size_t MinArgNumber() const {
+ return 1;
+ }
+
+ private:
+ vector<pair<size_t, size_t>> PrepareRCContigPairs(const CapEnvironment &curr_env) const {
+ size_t num_contigs = curr_env.genome_cnt();
+
+ vector<pair<size_t, size_t>> res;
+ for (size_t i = 0; i < num_contigs; ++i) {
+ res.push_back(make_pair(2 * i, 2 * i + 1));
+ }
+
+ return res;
+ }
+
+};
+
+class LoadGraphCommand : public LocalCommand<CapEnvironment> {
+ public:
+ LoadGraphCommand() : LocalCommand<CapEnvironment>("load_graph") {
+ }
+
+ virtual std::string Usage() const {
+ return "Command `load_graph`\n"
+        "     Loads the graph from previously made saves\n"
+ "Usage:\n"
+ "> load_graph <K> <path>\n"
+ "For example:\n"
+        "> load_graph 55 ./masked/graph\n";
+ }
+
+ virtual void Execute(CapEnvironment &curr_env, const ArgumentList &arg_list) const {
+ const vector<string> &args = arg_list.GetAllArguments();
+
+ uint K = 21;
+ stringstream ss(args[1]);
+ ss >> K;
+ const std::string &path = args[2];
+
+ curr_env.manager().LoadGraphFromSaves(K, path);
+ }
+
+ protected:
+ virtual size_t MinArgNumber() const {
+ return 2;
+ }
+
+};
+
+class MosaicAnalysisCommand : public NewLocalCommand<CapEnvironment> {
+ public:
+ MosaicAnalysisCommand() : NewLocalCommand<CapEnvironment>("mosaic", 0) {
+ }
+
+ virtual std::string Usage() const {
+ return "Command `mosaic`";
+ }
+
+ private:
+ virtual void InnerExecute(CapEnvironment& curr_env, const vector<string>& args) const {
+ VERIFY(curr_env.genome_cnt() == 1);
+// const Sequence& genome = curr_env.genomes()[1];
+ const Sequence& genome = curr_env.genomes()[0];
+ size_t min_support_length = 100;
+ size_t max_support_mult = 10;
+ size_t max_inter_length = 1000;
+ size_t min_reportable_mosaic_length = 500;
+ size_t min_reportable_submosaic_length = 100;
+ std::string folder = TryFetchFolder(curr_env, args);
+ cout << "Mosaic analysis triggered" << endl;
+ cout << "Min support block length " << min_support_length << endl;
+ cout << "Max support block multiplicity " << max_support_mult << endl;
+ cout << "Max inter-block length " << max_inter_length << endl;
+ if (curr_env.LSeqIsUsed()) {
+ VERIFY(false);
+// mosaic::PerformMosaicAnalysis(curr_env.l_seq_gp(), curr_env.coordinates_handler().AsMappingPath(0),
+// genome, min_support_length, max_support_mult, max_inter_length,
+// min_reportable_mosaic_length,
+// min_reportable_submosaic_length, out);
+ } else {
+ mosaic::PerformMosaicAnalysis(curr_env.rt_seq_gp(), curr_env.coordinates_handler().AsMappingPath(0),
+ genome, min_support_length, max_support_mult, max_inter_length,
+ min_reportable_mosaic_length,
+ min_reportable_submosaic_length, folder);
+ }
+ }
+};
+
+//todo works for finished genomes, not contigs!!!
+ContigStreams ConvertRefsToStreams(const vector<Sequence>& ss, const vector<string>& names) {
+ ContigStreams answer;
+ VERIFY(ss.size() == names.size());
+ for (size_t i = 0; i < ss.size(); ++i) {
+ answer.push_back(make_shared<io::SequenceReadStream<Contig>>(ss[i], names[i]));
+ }
+ return answer;
+}
+
+class MaskRepeatsCommand : public NewLocalCommand<CapEnvironment> {
+public:
+ MaskRepeatsCommand()
+ : NewLocalCommand<CapEnvironment>("mask_repeats", 2) {
+ }
+
+ virtual std::string Usage() const {
+ return "Command `mask_repeats <k> <max_iter_count>`";
+ }
+
+private:
+
+ vector<string> AppendFasta(const vector<string>& files) const {
+ vector<string> answer;
+ for (string s : files) {
+ answer.push_back(s + ".fasta");
+ }
+ return answer;
+ }
+
+ Sequence ReadSequence(ContigStream& reader) const {
+ VERIFY(!reader.eof());
+ io::SingleRead read;
+ reader >> read;
+ return read.sequence();
+ }
+
+ void UpdateGenomes(ContigStreams streams, CapEnvironment& curr_env) const {
+ vector<Sequence>& genomes = curr_env.mutable_genomes();
+ VERIFY(streams.size() == genomes.size());
+ for (size_t i = 0; i < streams.size(); ++i) {
+ genomes[i] = ReadSequence(streams[i]);
+ }
+ }
+
+ /*virtual*/
+ void InnerExecute(CapEnvironment& curr_env,
+ const vector<string>& args) const {
+ size_t k = GetInt(args[1]);
+ size_t iteration_cnt = GetInt(args[2]);
+
+ cout << "Masking repeats for k=" << k << " in " << iteration_cnt << " iterations" << endl;
+
+ ContigStreams streams = ConvertRefsToStreams(
+ curr_env.genomes(), curr_env.genome_names());
+
+ //todo temporary hack
+ curr_env.manager().SaveGenomesToDisk(false);
+
+ string folder = this->CurrentFolder(curr_env) + "masking/";
+ make_dir(folder);
+ bool success = MaskRepeats(k, streams, AppendFasta(curr_env.genome_names()),
+ iteration_cnt, folder);
+ if (!success) {
+ cout << "Failed to mask repeats in " << iteration_cnt
+ << " iterations" << endl;
+ } else {
+ cout << "Repeats successfully masked" << endl;
+ cout << "Updating genomes in environment" << endl;
+ UpdateGenomes(OpenStreams(CurrentFolder(curr_env) + "masking/masked/", AppendFasta(curr_env.genome_names()), false), curr_env);
+ }
+ }
+
+};
+
+}
diff --git a/src/projects/cap/cap_config_struct.hpp b/src/projects/cap/cap_config_struct.hpp
new file mode 100644
index 0000000..a0e997b
--- /dev/null
+++ b/src/projects/cap/cap_config_struct.hpp
@@ -0,0 +1,40 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "pipeline/config_common.hpp"
+#include "pipeline/config_singl.hpp"
+
+namespace cap {
+
+struct cap_config {
+ std::string cache_root;
+ std::string desc_file_name;
+ std::string default_log_filename;
+ std::string default_log_file_mode;
+};
+
+inline void load(cap_config &cfg, boost::property_tree::ptree const& pt, bool /*complete*/) {
+ using config_common::load;
+
+ load(cfg.cache_root, pt, "cache_root");
+ load(cfg.desc_file_name, pt, "desc_file_name");
+ load(cfg.default_log_filename, pt, "default_log_filename");
+ load(cfg.default_log_file_mode, pt, "default_log_file_mode");
+}
+
+void load(cap_config& cfg, const std::string &filename) {
+ boost::property_tree::ptree pt;
+ boost::property_tree::read_info(filename, pt);
+
+ load(cfg, pt, true);
+}
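+
+// A minimal sketch of the config file this struct is read from (Boost
+// property_tree INFO format, as implied by read_info above). The four key
+// names come from load(); the values are hypothetical examples:
+//
+//   cache_root              ./cap_cache
+//   desc_file_name          description.info
+//   default_log_filename    events.log
+//   default_log_file_mode   a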
+
+}
+
+typedef config_common::config<cap::cap_config> cap_cfg;
diff --git a/src/projects/cap/cap_environment.hpp b/src/projects/cap/cap_environment.hpp
new file mode 100644
index 0000000..f0f24d4
--- /dev/null
+++ b/src/projects/cap/cap_environment.hpp
@@ -0,0 +1,265 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "../online_vis/environment.hpp"
+#include "compare_standard.hpp"
+#include "coloring.hpp"
+#include "coordinates_handler.hpp"
+#include "test_utils.hpp"
+#include "serialization.hpp"
+
+namespace online_visualization {
+
+class CapEnvironmentManager;
+/*
+ * Cap Environment is designed to handle operations on fixed
+ * set of genomes
+ */
+class CapEnvironment : public Environment {
+ friend class CapEnvironmentManager;
+
+ private:
+ typedef debruijn_graph::Graph Graph;
+ typedef Graph::VertexId VertexId;
+ typedef Graph::EdgeId EdgeId;
+
+ typedef debruijn_graph::KmerStoringEdgeIndex<Graph, runtime_k::RtSeq, kmer_index_traits<runtime_k::RtSeq>, debruijn_graph::SimpleStoring> RtSetIndex;
+ typedef debruijn_graph::graph_pack<Graph, runtime_k::RtSeq, RtSetIndex> RtSeqGraphPack;
+ typedef debruijn_graph::KmerStoringEdgeIndex<Graph, cap::LSeq, kmer_index_traits<cap::LSeq>, debruijn_graph::SimpleStoring> LSeqIndex;
+ typedef debruijn_graph::graph_pack<Graph, cap::LSeq, LSeqIndex> LSeqGraphPack;
+
+ typedef cap::ColorHandler<Graph> ColorHandler;
+ typedef cap::CoordinatesHandler<Graph> CoordinatesHandler;
+
+ std::string name_;
+ std::string dir_;
+ std::string description_;
+
+ // Sequence of Ks which were used to refine current genome
+ std::vector<unsigned> k_history_;
+ // History of number of studied genomes (for case of adding genomes in the middle of pipeline)
+ std::vector<size_t> num_genomes_history_;
+
+ // Paths on fs
+ std::vector<std::string> init_genomes_paths_;
+ // Genome sequences themselves. Yes, it may be lots of GBs.
+ std::vector<Sequence> genomes_;
+ std::vector<std::string> genomes_names_;
+
+ std::shared_ptr<RtSeqGraphPack> gp_rtseq_;
+ std::shared_ptr<LSeqGraphPack> gp_lseq_;
+
+ std::shared_ptr<ColorHandler> coloring_;
+
+ CoordinatesHandler coordinates_handler_;
+
+ // Aliases to GraphPack parts:
+ //
+ Graph *graph_;
+ EdgesPositionHandler<Graph> *edge_pos_;
+ GraphElementFinder<Graph> *element_finder_;
+
+  // Information concerning the default way to write out info about found differences
+ std::string event_log_path_;
+  // Either "w" or "a". Note that during environment load the file will be
+  // recreated (and purged) anyway, even when the "a" mode is used.
+ std::string event_log_file_mode_;
+
+ // Environment Manager for complex methods on this Environment
+ std::shared_ptr<CapEnvironmentManager> manager_;
+
+ void AssignGPReferences() {
+ VERIFY(gp_lseq_ != NULL || gp_rtseq_ != NULL);
+ if (LSeqIsUsed()) {
+ graph_ = &(gp_lseq_->g);
+ edge_pos_ = &(gp_lseq_->edge_pos);
+ element_finder_ = &(gp_lseq_->element_finder);
+ } else {
+ graph_ = &(gp_rtseq_->g);
+ edge_pos_ = &(gp_rtseq_->edge_pos);
+ element_finder_ = &(gp_rtseq_->element_finder);
+ }
+ coordinates_handler_.SetGraph(graph_);
+ }
+
+ void set_gp(const std::shared_ptr<LSeqGraphPack> &gp_lseq) {
+ gp_lseq_ = gp_lseq;
+ }
+
+ void set_gp(const std::shared_ptr<RtSeqGraphPack> &gp_rtseq) {
+ gp_rtseq_ = gp_rtseq;
+ }
+
+ public:
+ static const unsigned kNoGraphK = -1;
+ const std::string kDefaultGPWorkdir;
+
+ CapEnvironment(const std::string &name, /*const std::string base_path, */const std::string &description = "")
+ : Environment(name, cap_cfg::get().cache_root + "/env_" + name),
+ name_(name),
+ dir_(cap_cfg::get().cache_root + "/env_" + name),
+ //base_path_(base_path),
+ description_(description),
+ k_history_(),
+ num_genomes_history_(),
+ init_genomes_paths_(),
+ gp_rtseq_(),
+ gp_lseq_(),
+ coloring_(),
+ coordinates_handler_(),
+ graph_(NULL),
+ edge_pos_(NULL),
+ element_finder_(NULL),
+ event_log_path_(dir_ + "/" + cap_cfg::get().default_log_filename),
+ event_log_file_mode_(cap_cfg::get().default_log_file_mode),
+ manager_(std::make_shared<CapEnvironmentManager>(this)),
+ kDefaultGPWorkdir("./tmp") {
+ cap::utils::MakeDirPath(dir_);
+ }
+
+ void WriteToStream(std::ostream &out) const {
+ cap::Serializer s(out);
+
+ s.WriteLine("name", name_);
+ s.WriteLine("dir", dir_);
+ s.WriteLine("description", description_);
+ s.WriteLine("k_history", k_history_);
+ s.WriteLine("num_genomes_history", num_genomes_history_);
+ s.WriteLine("init_genomes_paths", init_genomes_paths_);
+ s.WriteLine("genomes_names", genomes_names_);
+
+ s.WriteLine("genomes", genomes_);
+
+ s.WriteLine("coordinates_threads", coordinates_handler_.GetStoredThreads());
+ }
+
+ void ReadFromStream(std::istream &in) {
+ cap::Deserializer s(in);
+
+ s.ReadStream();
+
+ s.ReadValue("name", name_);
+ s.ReadValue("dir", dir_);
+ s.ReadValue("description", description_);
+ s.ReadValue("k_history", k_history_);
+ s.ReadValue("num_genomes_history", num_genomes_history_);
+ s.ReadValue("init_genomes_paths", init_genomes_paths_);
+ s.ReadValue("genomes_names", genomes_names_);
+
+ s.ReadValue("genomes", genomes_);
+
+ std::vector<std::pair<uint, std::vector<CoordinatesHandler::Thread>>> coords_threads;
+ s.ReadValue("coordinates_threads", coords_threads);
+ coordinates_handler_.SetStoredThreads(coords_threads);
+ }
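+
+  // Sketch of the round-trip performed by the `save_env` / `load_env` commands
+  // (the directory name here is a hypothetical example):
+  //
+  //   std::ofstream out("env_dir/environment");
+  //   env.WriteToStream(out);
+  //   ...
+  //   std::ifstream in("env_dir/environment");
+  //   env.ReadFromStream(in);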
+
+ void CheckConsistency() const {
+ VERIFY(gp_rtseq_ == NULL || gp_lseq_ == NULL);
+ bool have_any_gp = gp_rtseq_ != NULL || gp_lseq_ != NULL;
+ if (have_any_gp) {
+ VERIFY(graph_ != NULL);
+ VERIFY(edge_pos_ != NULL);
+ VERIFY(element_finder_ != NULL);
+ VERIFY(coordinates_handler_.GetGraph() == graph_);
+ }
+ }
+
+ void ClearGP() {
+ // shared_ptr deletes automatically
+ coordinates_handler_.UnsetGraph();
+ gp_rtseq_.reset();
+ gp_lseq_.reset();
+ graph_ = NULL;
+ edge_pos_ = NULL;
+ element_finder_ = NULL;
+ coloring_.reset();
+
+ CheckConsistency();
+ }
+
+ template <class GraphPack>
+ void SetGraphPack(const std::shared_ptr<GraphPack> &gp) {
+ VERIFY(gp_rtseq_ == NULL && gp_lseq_ == NULL);
+ set_gp(gp);
+ AssignGPReferences();
+ CheckConsistency();
+ }
+
+ unsigned GetGraphK() const {
+ if (gp_rtseq_ == NULL && gp_lseq_ == NULL) {
+ return kNoGraphK;
+ }
+
+ return unsigned(graph_->k());
+ }
+ bool LSeqIsUsed() const {
+ return gp_lseq_ != NULL;
+ }
+
+  // Method defining whether we use RtSeq or LSeq for a particular K
+ bool UseLSeqForThisK(unsigned k) const {
+ return k > 201;
+ }
+
+ const std::string &name() const {
+ return name_;
+ }
+ const std::string &dir() const {
+ return dir_;
+ }
+ const Graph &graph() const {
+ return *graph_;
+ }
+
+ RtSeqGraphPack& rt_seq_gp() const {
+ return *gp_rtseq_;
+ }
+
+ LSeqGraphPack& l_seq_gp() const {
+ return *gp_lseq_;
+ }
+
+ const vector<Sequence>& genomes() const {
+ return genomes_;
+ }
+
+ vector<Sequence>& mutable_genomes() {
+ return genomes_;
+ }
+
+ const vector<string>& genome_names() const {
+ return genomes_names_;
+ }
+
+ const CoordinatesHandler& coordinates_handler() const {
+ return coordinates_handler_;
+ }
+
+ unsigned genome_cnt() const {
+ return unsigned(genomes_names_.size());
+ }
+
+ const EdgesPositionHandler<Graph> &edge_pos() const {
+ return *edge_pos_;
+ }
+ const ColorHandler &coloring() const {
+ return *coloring_;
+ }
+ const std::string &event_log_path() {
+ return event_log_path_;
+ }
+ const std::string &event_log_file_mode() {
+ return event_log_file_mode_;
+ }
+ CapEnvironmentManager &manager() const {
+ return *manager_;
+ }
+
+};
+}
diff --git a/src/projects/cap/cap_environment_manager.hpp b/src/projects/cap/cap_environment_manager.hpp
new file mode 100644
index 0000000..9628fdb
--- /dev/null
+++ b/src/projects/cap/cap_environment_manager.hpp
@@ -0,0 +1,493 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "stages/simplification_pipeline/graph_simplification.hpp"
+
+#include "compare_standard.hpp"
+#include "pipeline/graphio.hpp"
+
+#include "comparison_utils.hpp"
+#include "diff_masking.hpp"
+#include "genome_correction.hpp"
+#include "assembly_compare.hpp"
+#include "simple_indel_finder.hpp"
+#include "simple_inversion_finder.hpp"
+#include "graph_traversal_constraints.hpp"
+#include "test_utils.hpp"
+
+#include "cap_environment.hpp"
+#include "io/reads_io/sequence_reader.hpp"
+#include "pipeline/config_struct.hpp"
+#include "junk_cropping_reader.hpp"
+
+namespace online_visualization {
+
+using namespace cap;
+
+class CapEnvironmentManager {
+ typedef CapEnvironment::Graph Graph;
+ typedef CapEnvironment::LSeqGraphPack LSeqGP;
+ typedef CapEnvironment::RtSeqGraphPack RtSeqGP;
+ typedef Graph::VertexId VertexId;
+ typedef Graph::EdgeId EdgeId;
+
+ CapEnvironment *env_;
+ //vector<ContigStream *> last_streams_used_;
+
+ void WriteStateDesc(std::string file_path) const {
+ FILE *fd = fopen(file_path.c_str(), "w");
+
+ fputs("Genomes:\n", fd);
+ for (auto it = env_->init_genomes_paths_.begin(); it != env_->init_genomes_paths_.end(); ++it) {
+ fputs(it->c_str(), fd);
+ fputs("\n", fd);
+ }
+ fputs("Refining k_sequence:\n", fd);
+ for (size_t i = 0; i < env_->k_history_.size(); ++i) {
+ if (i != 0) {
+ fputs(" -> ", fd);
+ }
+      fprintf(fd, "%u (#%zu)", env_->k_history_[i], env_->num_genomes_history_[i]);
+ }
+ fputs("\n", fd);
+
+ fclose(fd);
+ }
+
+ void PrepareDirForSave(std::string path) const {
+ cap::utils::MakeDirPath(path);
+ WriteStateDesc(path + cap_cfg::get().desc_file_name);
+ }
+
+ template <class gp_t>
+ shared_ptr<gp_t> BuildGPFromStreams(ContigStreams &streams,
+ unsigned k) const {
+ typedef NewExtendedSequenceMapper<Graph, typename gp_t::index_t> Mapper;
+
+ shared_ptr<gp_t> result(new gp_t(k, env_->kDefaultGPWorkdir, 0));
+
+ //fixme use rc_wrapper
+ ContigStreams rc_contigs = io::RCWrap(streams);
+ rc_contigs.reset();
+
+ debruijn_graph::ConstructGraphUsingExtentionIndex(config::debruijn_config::construction(),
+ rc_contigs, result->g, result->index);
+
+ env_->coloring_ = std::make_shared<ColorHandler<Graph> >(result->g, streams.size());
+ ColoredGraphConstructor<Graph, Mapper> colored_graph_constructor(result->g,
+ *(env_->coloring_), *MapperInstance<gp_t>(*result));
+ colored_graph_constructor.ConstructGraph(rc_contigs);
+
+ INFO("Filling positions");
+ FillPositions(*result, rc_contigs, env_->coordinates_handler_);
+ INFO("Filling positions done.");
+
+ return result;
+ }
+
+ //template <class gp_t>
+ //shared_ptr<gp_t> BuildGPFromSaves(const size_t K, const std::string &/* path */) const;
+
+ shared_ptr<RtSeqGP> BuildGPFromSaves(const size_t K, const std::string &path) const {
+ typedef RtSeqGP gp_t;
+
+ shared_ptr<gp_t> result(new gp_t(unsigned(K), env_->kDefaultGPWorkdir, 0));
+
+ debruijn_graph::graphio::ScanGraphPack(path, *result);
+
+ ContigStreams streams;
+ for (size_t i = 0; i < env_->genomes_.size(); ++i) {
+ streams.push_back(make_shared<io::SequenceReadStream<Contig>>(
+ env_->genomes_[i], env_->genomes_names_[i]));
+ }
+ ContigStreams rc_contigs = io::RCWrap(streams);
+ rc_contigs.reset();
+
+ INFO("Filling positions");
+ FillPositions(*result, rc_contigs, env_->coordinates_handler_);
+ INFO("Filling positions done.");
+
+ return result;
+ }
+
+ template <class gp_t>
+ void SaveCurrentStreams(const gp_t &/* gp */, const std::string &dir) const {
+ for (size_t i = 0; i < env_->genomes_.size(); ++i) {
+ std::string output_filename = dir + path::filename(env_->init_genomes_paths_[i]);
+ if (!output_filename.empty()) {
+ Contig contig;
+ io::osequencestream out_stream(output_filename);
+ DEBUG("Saving to " << output_filename);
+
+ io::SequenceReadStream<io::SingleRead> stream(env_->genomes_[i], env_->genomes_names_[i]);
+ while (!stream.eof()) {
+ stream >> contig;
+ out_stream << contig;
+ }
+ }
+ }
+ }
+
+ void UpdateStreams() {
+ for (unsigned i = 0; i < env_->genomes_.size(); ++i) {
+ env_->genomes_[i] = env_->coordinates_handler_.ReconstructGenome(2 * i);
+ //VERIFY(env_->genomes_[i]->IsValid());
+ }
+ }
+
+ template <class gp_t>
+ void RefineTemplated(gp_t &gp) {
+ INFO("Store threads");
+ //env_->coordinates_handler_.StoreGenomeThreads();
+ INFO("Store threads ended");
+ double delta = 5.;
+
+ //outdated!!!
+ //debruijn_config::simplification::bulge_remover br_config;
+ //br_config.max_bulge_length_coefficient = 3;
+ //br_config.max_coverage = 1000.;
+ //br_config.max_relative_coverage = 1.2;
+ //br_config.max_delta = delta;
+ //br_config.max_relative_delta = 0.1;
+
+ INFO("Removing bulges");
+
+ BulgeRemoverCallbackToCoordinatesHandlerAdapter<Graph> adapter(
+ env_->coordinates_handler_);
+ boost::function<void(EdgeId, const std::vector<EdgeId> &)> projecting_callback =
+ boost::bind(&BulgeRemoverCallbackToCoordinatesHandlerAdapter<Graph>::Project,
+ &adapter, _1, _2);
+
+ //omp_set_num_threads(1);
+ debruijn::simplification::RemoveBulges(gp.g, br_config, projecting_callback);
+ //omp_set_num_threads(4);
+
+ INFO("Remapped " << gp.kmer_mapper.size() << " k-mers");
+
+ /*
+ debruijn_config::simplification::complex_bulge_remover cbr_config;
+ cbr_config.enabled = true;
+ cbr_config.pics_enabled = false;
+ cbr_config.folder = "";
+ cbr_config.max_relative_length = 3;
+ cbr_config.max_length_difference = 1000;
+
+ INFO("Removing complex bulges");
+ RemoveComplexBulges(gp.g, cbr_config);
+
+ INFO("Clipping tips with projection");
+
+ TipsProjector<gp_t> tip_projector(gp);
+ boost::function<void(EdgeId)> projecting_callback = boost::bind(
+ &TipsProjector<gp_t>::ProjectTip, &tip_projector, _1);
+ debruijn_config::simplification::tip_clipper tc_config;
+
+ tc_config.condition = "{ tc_lb 2. }";
+
+ ClipTipsWithProjection(gp, tc_config, true);
+ */
+
+ INFO("Remapped " << gp.kmer_mapper.size() << " k-mers");
+
+
+ env_->k_history_.push_back(env_->GetGraphK());
+ env_->num_genomes_history_.push_back(env_->init_genomes_paths_.size());
+ env_->coordinates_handler_.DumpRanges();
+
+ UpdateStreams();
+ }
+
+ template <class gp_t>
+ void FindIndelsTemplated(gp_t& gp, std::ofstream &out_stream,
+ const bool mask_indels) {
+ GenomeContiguousPathsGraphTraversalConstraints<Graph> traversal_constraints(
+ env_->coordinates_handler_);
+ SimpleIndelFinder<gp_t> indel_finder(gp, *env_->coloring_,
+ env_->coordinates_handler_, traversal_constraints, out_stream,
+ mask_indels);
+ indel_finder.FindIndelEvents();
+
+ if (mask_indels) {
+ env_->k_history_.push_back(env_->GetGraphK());
+ env_->num_genomes_history_.push_back(env_->init_genomes_paths_.size());
+ env_->coordinates_handler_.DumpRanges();
+ UpdateStreams();
+
+ }
+
+ }
+
+ template <class gp_t>
+ void FindInversionsTemplated(gp_t& gp, const std::string &base_pic_file_name,
+ const bool mask_inversions) const {
+ SimpleInversionFinder<gp_t> finder(gp, *env_->coloring_, env_->coordinates_handler_,
+ base_pic_file_name, mask_inversions);
+ finder.FindInversionEvents();
+ }
+
+ template<class gp_t>
+ void RefillPositions(const gp_t &gp) {
+ ContigStreams streams;
+ for (size_t i = 0; i < env_->genomes_.size(); ++i) {
+ streams.push_back(make_shared<io::SequenceReadStream<Contig>>(
+ env_->genomes_[i], env_->genomes_names_[i]));
+ }
+ ContigStreams rc_contigs = io::RCWrap(streams);
+ rc_contigs.reset();
+
+ INFO("Filling positions");
+ FillPositions(gp, rc_contigs, env_->coordinates_handler_);
+ INFO("Filling positions done.");
+ }
+
+ public:
+ CapEnvironmentManager(CapEnvironment *env)
+ : env_(env) {
+ //last_streams_used_() {
+ }
+
+ virtual ~CapEnvironmentManager() {
+ /*
+ for (auto it = last_streams_used_.begin(); it != last_streams_used_.end(); ++it) {
+ delete *it;
+ }
+ */
+ }
+
+ void ClearEnvironment() const {
+ env_->ClearGP();
+ }
+
+ std::string GetDirForCurrentState() const {
+ std::string env_dir = env_->dir();
+ std::stringstream merged_ks_stream;
+ for (size_t i = 0; i < env_->k_history_.size(); ++i) {
+ if (i != 0) {
+ merged_ks_stream << " ";
+ }
+ merged_ks_stream << env_->k_history_[i] << " " << env_->num_genomes_history_[i];
+ }
+ std::string cache_dir = "cache_" +
+ cap::utils::GenMD5FromFiles(env_->init_genomes_paths_, merged_ks_stream.str());
+
+ return env_dir + "/" + cache_dir + "/";
+ }
+
+ void ConstructGraphFromStreams(ContigStreams &streams, unsigned k) {
+ ClearEnvironment();
+ env_->CheckConsistency();
+ //last_streams_used_ = streams;
+
+ VERIFY(env_->gp_rtseq_ == NULL && env_->gp_lseq_ == NULL);
+ if (env_->UseLSeqForThisK(k)) {
+ VERIFY(false);
+// env_->SetGraphPack(BuildGPFromStreams<LSeqGP>(
+// streams, k));
+ } else {
+ env_->SetGraphPack(BuildGPFromStreams<RtSeqGP>(
+ streams, k));
+ }
+ }
+
+ void ConstructGraph(unsigned k) {
+ ContigStreams streams;
+ for (size_t i = 0; i < env_->genomes_.size(); ++i) {
+ streams.push_back(make_shared<io::SequenceReadStream<Contig>>(
+ env_->genomes_[i], env_->genomes_names_[i]));
+ }
+
+ ConstructGraphFromStreams(streams, k);
+ }
+
+ void SaveGraph(std::string folder) const {
+ cap::utils::MakeDirPath(folder);
+ VERIFY(cap::utils::DirExist(folder));
+
+ std::string filename = folder + "graph";
+
+ // Saving graph
+ /*
+ debruijn_graph::graphio::PrinterTraits<Graph>::Printer printer(*env_->graph_);
+ printer.SaveGraph(filename);
+ printer.SaveEdgeSequences(filename);
+ printer.SavePositions(filename, *env_->edge_pos_);
+ */
+ if (env_->LSeqIsUsed()) {
+ //PrintGraphPack(filename, env_->gp_lseq_);
+ } else {
+ debruijn_graph::graphio::PrintGraphPack(filename, *env_->gp_rtseq_);
+ }
+
+ // Saving coloring of graph
+ cap::SaveColoring(*env_->graph_, *env_->coloring_, filename);
+ }
+
+ void DrawPics(std::string folder) const {
+ // Saving pics
+ cap::utils::MakeDirPath(folder);
+ VERIFY(cap::utils::DirExist(folder));
+
+ std::vector<std::string> genomes_names;
+ for (const auto &gname : env_->genomes_names_) {
+ genomes_names.push_back(gname);
+ genomes_names.push_back(gname + "_RC");
+ }
+ cap::PrintColoredGraphWithColorFilter(*env_->graph_, *env_->coloring_,
+ env_->coordinates_handler_, genomes_names, folder);
+ }
+
+ void SetGenomes(const std::vector<std::string> &genomes_paths,
+ const std::vector<std::string> &/* genomes_names */) const {
+ VERIFY(env_->init_genomes_paths_.size() == 0);
+
+ env_->init_genomes_paths_ = genomes_paths;
+ }
+
+ /*
+ * Returns true if added successfully
+ */
+ bool AddGenomeFromFile(const std::string &filename,
+ const std::string &name,
+ bool crop_repeats = false) const {
+ if (!CheckFileExists(filename)) {
+ return false;
+ }
+
+ if (crop_repeats) {
+ JunkCroppingWrapper reader(make_shared<io::FileReadStream>(filename));
+ io::SingleRead genome;
+ reader >> genome;
+
+ if (!genome.IsValid()) {
+ return false;
+ }
+
+ env_->init_genomes_paths_.push_back(filename);
+ env_->genomes_.push_back(genome.sequence());
+ env_->genomes_names_.push_back(name);
+ env_->coordinates_handler_.StoreGenomeThreadManual(uint(env_->genomes_.size() - 1),
+ reader.coordinates_ladder());
+ } else {
+ io::FileReadStream reader(filename);
+ io::SingleRead genome;
+ reader >> genome;
+
+ if (!genome.IsValid()) {
+ return false;
+ }
+
+ env_->init_genomes_paths_.push_back(filename);
+ env_->genomes_.push_back(genome.sequence());
+ env_->genomes_names_.push_back(name);
+ }
+
+
+ return true;
+ }
+
+ void SaveGenomesToDisk(const bool force) const {
+ const std::string &dir = GetDirForCurrentState();
+
+ if (!force && cap::utils::DirExist(dir)) {
+ return;
+ }
+ PrepareDirForSave(dir);
+
+ if (env_->LSeqIsUsed()) {
+ SaveCurrentStreams(*env_->gp_lseq_, dir);
+ } else {
+ SaveCurrentStreams(*env_->gp_rtseq_, dir);
+ }
+ }
+
+ void Refine() {
+ env_->CheckConsistency();
+
+ if (env_->LSeqIsUsed()) {
+ RefineTemplated(*env_->gp_lseq_);
+ } else {
+ RefineTemplated(*env_->gp_rtseq_);
+ }
+
+ env_->CheckConsistency();
+ }
+
+ int FindIndels(const bool mask_indels, const std::string &output_file,
+ const std::string &output_mode) {
+ std::ios_base::openmode mode;
+ if (output_mode == "w") {
+ mode = std::ios_base::out;
+ } else {
+ mode = std::ios_base::app;
+ }
+ std::ofstream stream(output_file, mode);
+
+ if (stream.fail()) {
+ return 1;
+ }
+
+ if (env_->LSeqIsUsed()) {
+ FindIndelsTemplated(*env_->gp_lseq_, stream, mask_indels);
+ } else {
+ FindIndelsTemplated(*env_->gp_rtseq_, stream, mask_indels);
+ }
+
+ return 0;
+ }
+
+ int FindInversions(const bool mask_inversions, const std::string &/* output_file */, const std::string &/* output_mode */) const {
+ const std::string &dir = GetDirForCurrentState();
+ const std::string &base_pic_dir = dir + "/inversions";
+ const std::string &base_pic_file_name = base_pic_dir + "/inv";
+
+ cap::utils::MakeDirPath(base_pic_dir);
+ /*
+ std::ios_base::openmode mode;
+ if (output_mode == "w") {
+ mode = std::ios_base::out;
+ } else {
+ mode = std::ios_base::app;
+ }
+ std::ofstream stream(output_file, mode);
+
+ if (stream.fail()) {
+ return 1;
+ }
+ */
+
+ if (env_->LSeqIsUsed()) {
+ FindInversionsTemplated(*env_->gp_lseq_, base_pic_file_name, mask_inversions);
+ } else {
+ FindInversionsTemplated(*env_->gp_rtseq_, base_pic_file_name, mask_inversions);
+ }
+
+ return 0;
+ }
+
+ void LoadGraphFromSaves(const unsigned K, const std::string &path) {
+ ClearEnvironment();
+ env_->CheckConsistency();
+ //last_streams_used_ = streams;
+
+ VERIFY(env_->gp_rtseq_ == NULL && env_->gp_lseq_ == NULL);
+ if (env_->UseLSeqForThisK(K)) {
+ //env_->SetGraphPack(BuildGPFromSaves<LSeqGP>(K, path));
+ } else {
+ env_->SetGraphPack(BuildGPFromSaves(K, path));
+ }
+
+ env_->coloring_ = std::make_shared<ColorHandler<Graph> >(env_->graph(), env_->genome_cnt());
+ INFO("Loading coloring from " << path);
+ cap::LoadColoring(*env_->graph_, *env_->element_finder_, *env_->coloring_, path);
+
+ env_->CheckConsistency();
+ }
+};
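+
+// Rough programmatic equivalent of the command sequence driven from
+// cap_commands.hpp (file name and K below are hypothetical; cap_cfg is assumed
+// to be loaded already, since CapEnvironment takes cache_root from it):
+//
+//   CapEnvironment env("demo");
+//   env.manager().AddGenomeFromFile("/data/strain_A.fasta", "strain_A");
+//   env.manager().ConstructGraph(55);
+//   env.manager().Refine();
+//   env.manager().FindIndels(false, env.event_log_path(), env.event_log_file_mode());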
+
+}
diff --git a/src/projects/cap/cap_graph_pack.hpp b/src/projects/cap/cap_graph_pack.hpp
new file mode 100644
index 0000000..bcd367f
--- /dev/null
+++ b/src/projects/cap/cap_graph_pack.hpp
@@ -0,0 +1,33 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "compare_standard.hpp"
+#include "coloring.hpp"
+
+namespace cap {
+
+template<class Graph>
+struct cap_graph_pack {
+ typedef Graph graph_t;
+ typedef string contig_id_t;
+ typedef typename Graph::EdgeId EdgeId;
+ Graph g;
+ omnigraph::GraphElementFinder<Graph> element_finder;
+ ColorHandler<Graph> coloring;
+// map<contig_id_t, vector<EdgeId>> red_paths;
+// map<contig_id_t, vector<EdgeId>> blue_paths;
+ EdgesPositionHandler<Graph> edge_pos;
+
+ cap_graph_pack(size_t k) :
+ g(k), element_finder(g), coloring(g), edge_pos(g) {
+
+ }
+};
+
+}
diff --git a/src/projects/cap/cap_kmer_index.hpp b/src/projects/cap/cap_kmer_index.hpp
new file mode 100644
index 0000000..5275554
--- /dev/null
+++ b/src/projects/cap/cap_kmer_index.hpp
@@ -0,0 +1,535 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "compare_standard.hpp"
+#include "longseq.hpp"
+#include "polynomial_hash.hpp"
+#include "utils/adt/kmer_map.hpp"
+#include "data_structures/indices/edge_position_index.hpp"
+
+#include "io/reads_io/sequence_reader.hpp"
+#include "data_structures/mph_index/base_hash.hpp"
+
+template<>
+struct kmer_index_traits<cap::LSeq> {
+ typedef cap::LSeq SeqType;
+ typedef std::vector<cap::LSeq> RawKMerStorage;
+ typedef std::vector<cap::LSeq> FinalKMerStorage;
+
+ typedef RawKMerStorage::iterator raw_data_iterator;
+ typedef RawKMerStorage::const_iterator raw_data_const_iterator;
+ typedef RawKMerStorage::iterator::value_type KMerRawData;
+ typedef RawKMerStorage::iterator::reference KMerRawReference;
+
+ struct raw_equal_to {
+ inline bool operator()(const SeqType &lhs, const KMerRawReference rhs) {
+ // Using fast_equal_to, which relies only on hash:
+ // 1. True comparison leads to poor performance on large k
+      // 2. Hashes have to be different (otherwise a minimal perfect hash is impossible)
+ return SeqType::fast_equal_to()(lhs, rhs);
+ }
+ };
+
+ struct hash_function {
+ inline uint64_t operator()(const SeqType &k) const {
+ return k.GetHash().get<2>();
+ }
+ inline uint64_t operator()(const KMerRawReference k) const {
+ return k.GetHash().get<2>();
+ }
+ };
+
+ struct seeded_hash_function {
+ static cxxmph::h128 hash128(const KMerRawReference k, uint32_t seed) {
+ SeqType::HashType hash = k.GetHash();
+ // uint64_t salt = hash.get<2>();
+ cxxmph::h128 h;
+ MurmurHash3_x64_128(reinterpret_cast<const void*>(&hash), sizeof(hash),
+ seed, &h);
+ // h.set64(hash.get<0>(), 0);
+ // h.set64(hash.get<1>(), 1);
+
+ // INFO("SEE MAN:: hash=" << hash.get<0>() << ", seed=" << seed << ", result = " << h.get64(0) << " " << h.get64(1) << " " << k.str());
+ return h;
+ }
+
+ static cxxmph::h128 hash128(const SeqType &k, uint32_t seed) {
+ SeqType::HashType hash = k.GetHash();
+ // uint64_t salt = hash.get<2>();
+ cxxmph::h128 h;
+ MurmurHash3_x64_128(reinterpret_cast<const void*>(&hash), sizeof(hash),
+ seed, &h);
+ // h.set64(hash.get<0>(), 0);
+ // h.set64(hash.get<1>(), 1);
+
+ // INFO("SEE MAN:: hash=" << hash.get<0>() << ", seed=" << seed << ", result = " << h.get64(0) << " " << h.get64(1) << " " << k.str());
+
+ return h;
+ }
+ };
+
+ struct raw_create {
+ inline SeqType operator()(unsigned /*K*/, const KMerRawReference kmer) {
+ return SeqType(kmer);
+ }
+ };
+
+ template <class Reader>
+ static RawKMerStorage *raw_deserialize(Reader &/*reader*/, const std::string &/*FileName*/) {
+ VERIFY(false);
+ return NULL;
+ }
+
+};
+
+namespace cap {
+
+ template <class ReadType>
+ class CapKMerCounter : public ::KMerCounter<LSeq> {
+ typedef KMerCounter<LSeq> __super;
+ typedef typename __super::RawKMerStorage RawKMerStorage;
+
+ unsigned k_;
+ io::ReadStreamList<ReadType> streams_;
+ std::unordered_set<LSeq, LSeq::hash, LSeq::equal_to> storage_;
+ RawKMerStorage *bucket;
+
+ bool has_counted_;
+
+ public:
+ CapKMerCounter(const unsigned k, io::ReadStreamList<ReadType> streams)
+ : k_(k),
+ streams_(streams),
+ storage_(),
+ bucket(NULL),
+ has_counted_(false) {
+ }
+
+ CapKMerCounter(const unsigned k) : CapKMerCounter(k, NULL) {
+ }
+
+ virtual ~CapKMerCounter() {
+ ReleaseBucket(0);
+ }
+
+ virtual size_t KMerSize() const {
+ return LSeq::GetDataSize(k_) * sizeof(typename LSeq::DataType);
+ }
+
+ virtual size_t Count(unsigned /* num_buckets */, unsigned /* num_threads */) {
+ if (!has_counted_) {
+ Init();
+ has_counted_ = true;
+ }
+ INFO("K-mer counting done. There are " << storage_.size() << " kmers in total. ");
+ return storage_.size();
+ }
+
+ virtual size_t CountAll(unsigned /* num_buckets */, unsigned /* num_threads */, bool /* merge */= true) {
+ if (!has_counted_) {
+ Init();
+ has_counted_ = true;
+ }
+ INFO("K-mer counting done. There are " << storage_.size() << " kmers in total. ");
+ return storage_.size();
+ }
+
+ virtual void MergeBuckets(unsigned /* num_buckets */) {
+ VERIFY(bucket == NULL);
+ }
+
+ virtual void OpenBucket(size_t /* idx */, bool /* unlink */= true) {
+ VERIFY(bucket == NULL);
+
+ if (!has_counted_) {
+ Init();
+ has_counted_ = true;
+ }
+ TRACE("BUCKET OPEN");
+ bucket = new RawKMerStorage();
+ bucket->reserve(storage_.size());
+ for (auto it = storage_.begin(); it != storage_.end(); ++it) {
+ bucket->push_back(*it);
+ }
+ }
+
+ virtual void ReleaseBucket(size_t /* idx */) {
+ TRACE("RELEASE BUCKET");
+ delete bucket;
+ bucket = NULL;
+ }
+
+ virtual RawKMerStorage* TransferBucket(size_t /* idx */) {
+ VERIFY(bucket != NULL);
+ TRACE("TRANSFERRING BUCKET" <<
+ "BUCKET size=" << bucket->size());
+
+ RawKMerStorage *ret = bucket;
+ bucket = NULL;
+
+ return ret;
+ }
+
+ virtual RawKMerStorage* GetFinalKMers() {
+ OpenBucket(0);
+ VERIFY(bucket != NULL);
+
+ RawKMerStorage *ret = bucket;
+ bucket = NULL;
+
+ return ret;
+ }
+
+ virtual typename __super::iterator bucket_begin(size_t /* idx */) {
+ return bucket->begin();
+ }
+ virtual typename __super::iterator bucket_end(size_t /* idx */) {
+ return bucket->end();
+ }
+
+ protected:
+ virtual void Init() {
+ VERIFY(streams_.size() > 0);
+ for (size_t i = 0; i < streams_.size(); ++i) {
+ while (!streams_[i].eof()) {
+ ReadType r;
+ streams_[i] >> r;
+ const Sequence &seq = r.sequence();
+ if (seq.size() == 0) {
+ continue;
+ }
+ if (seq.size() < k_) {
+ INFO("WARNING: too small sequence!!");
+ continue;
+ }
+
+ LSeq kmer(k_, seq);
+ do {
+ storage_.insert(kmer);
+ kmer.Shift();
+ } while (kmer.IsValid());
+
+ }
+ }
+ streams_.clear();
+ }
+
+ void SetStreams(io::ReadStreamList<ReadType>& streams) {
+ streams_ = streams;
+ }
+
+ };
+
+ template <class Graph>
+ class CapKMerGraphCounter : public CapKMerCounter<io::SingleRead> {
+
+ public:
+ CapKMerGraphCounter(const unsigned k, const Graph &g)
+ : CapKMerCounter<io::SingleRead>(k),
+ g_(g) {
+
+ }
+
+ protected:
+ virtual void Init() {
+ io::ReadStreamList<io::SingleRead> stream_vector;
+ //fixme create reasonable reader from the graph
+ for (auto it = g_.ConstEdgeBegin(); !it.IsEnd(); ++it) {
+ stream_vector.push_back(make_shared<io::SequenceReadStream<io::SingleRead>>(g_.EdgeNucls(*it)));
+ }
+
+ CapKMerCounter<io::SingleRead>::SetStreams(stream_vector);
+ CapKMerCounter<io::SingleRead>::Init();
+ }
+
+ private:
+ const Graph &g_;
+ };
+
+}
+
+namespace debruijn_graph {
+
+ template<class Index>
+ class DeBruijnStreamKMerIndexBuilder<cap::LSeq, Index> {
+ public:
+ typedef Index IndexT;
+
+ template <class Streams>
+ size_t BuildIndexFromStream(Index &index,
+ Streams &streams,
+ io::SingleStream* /* contigs_stream */= 0) const {
+ /*
+ std::vector<io::IReader<io::SingleRead> *> stream_vec(streams.size());
+ for (size_t i = 0; i < streams.size(); ++i) {
+ stream_vec.push_back(&streams[i]);
+ }
+ auto streams_ptr = std::make_shared<Streams>(
+ new io::ReadStreamVector<io::IReader<io::SingleRead>>(stream_vec, false));
+ */
+ cap::CapKMerCounter<typename Streams::ReadT> counter(
+ index.k(), streams);
+
+ index.BuildIndex(counter, 1, 1);
+ return 0;
+ }
+
+ };
+
+ template <class Index>
+ class DeBruijnGraphKMerIndexBuilder<Index, cap::LSeq> {
+ public:
+ typedef Index IndexT;
+
+ template<class Graph>
+ void BuildIndexFromGraph(IndexT &index, const Graph &g) const {
+ cap::CapKMerGraphCounter<Graph> counter(index.k(), g);
+ index.BuildIndex(counter, 16, 1);
+ }
+ };
+
+}
+
+namespace runtime_k {
+
+ //todo review this class
+ template <typename Value>
+ class KmerMap<Value, cap::LSeq> {
+ public:
+ typedef typename cap::LSeq key_type;
+ typedef typename std::pair<const key_type, Value> value_type;
+ typedef KmerMap<Value, cap::LSeq> map_type;
+
+ private:
+ // Note: uses equal_to, which respects the special 'transitions' stored inside LongSeqs
+ typedef std::unordered_map<key_type, Value, typename key_type::hash, typename key_type::equal_to> int_map_type;
+ typedef typename std::pair<const key_type, const Value> const_value_type;
+ int_map_type *data_;
+
+ class InnerIterator {
+ friend class KmerMap<Value, cap::LSeq>;
+ typedef typename int_map_type::iterator base;
+ base base_;
+
+ public:
+
+ InnerIterator(const base &iter): base_(iter) {
+ }
+
+ InnerIterator &operator++() {
+ ++base_;
+ return *this;
+ }
+ InnerIterator operator++(int) {
+ InnerIterator stored = *this;
+ ++base_;
+ return stored;
+ }
+
+ value_type operator*() {
+ return *base_;
+ }
+
+ const key_type first() {
+ return base_->first;
+ }
+
+ Value& second() {
+ return base_->second;
+ }
+
+ bool operator==(const InnerIterator& iter) const {
+ return base_ == iter.base_;
+ }
+ bool operator!=(const InnerIterator& iter) const {
+ return !operator==(iter);
+ }
+
+ };
+
+ class InnerConstIterator {
+ friend class KmerMap<Value, cap::LSeq>;
+ typedef typename int_map_type::const_iterator base;
+ base base_;
+
+ public:
+
+ InnerConstIterator(const base &iter): base_(iter) {
+ }
+
+ InnerConstIterator &operator++() {
+ ++base_;
+ return *this;
+ }
+ InnerConstIterator operator++(int) {
+ InnerConstIterator stored = *this;
+ ++base_;
+ return stored;
+ }
+
+ const value_type operator*() const {
+ return *base_;
+ }
+
+ key_type first() const {
+ return base_->first;
+ }
+
+ const Value& second() const {
+ return base_->second;
+ }
+
+ bool operator==(const InnerConstIterator& iter) const {
+ return base_ == iter.base_;
+ }
+ bool operator!=(const InnerConstIterator& iter) const {
+ return !operator==(iter);
+ }
+
+ };
+
+ public:
+ typedef InnerIterator iterator;
+ typedef InnerConstIterator const_iterator;
+
+ KmerMap(size_t /* k */) {
+ data_ = new int_map_type();
+ }
+
+ KmerMap(int_map_type *map) : data_(map) {
+ }
+
+ KmerMap(const map_type& map) {
+ data_ = new int_map_type(*map.data_);
+ }
+
+ map_type& operator=(const map_type& map) {
+ if (map.data_ != data_) {
+ delete data_;
+ data_ = new int_map_type(*map.data_);
+ }
+
+ return *this;
+ }
+
+ ~KmerMap() {
+ delete data_;
+ }
+
+ bool empty() const {
+ return data_->empty();
+ }
+
+ size_t size() const {
+ return data_->size();
+ }
+
+ size_t max_size() const {
+ return data_->max_size();
+ }
+
+ const_iterator begin() const {
+ return InnerConstIterator(data_->begin());
+ }
+
+ iterator begin() {
+ return InnerIterator(data_->begin());
+ }
+
+ const_iterator end() const {
+ return InnerConstIterator(data_->end());
+ }
+
+ iterator end() {
+ return InnerIterator(data_->end());
+ }
+
+ Value& operator[](const key_type& kmer_seq) {
+ return data_->operator [](kmer_seq);
+ }
+
+ const_iterator find(const key_type& kmer_seq) const {
+ return InnerConstIterator(data_->find(kmer_seq));
+ }
+
+ iterator find(const key_type& kmer_seq) {
+ return InnerIterator(data_->find(kmer_seq));
+ }
+
+ size_t count(const key_type& kmer_seq) const {
+ return data_->count(kmer_seq);
+ }
+
+ pair<iterator, bool> insert(const value_type& val) {
+ return data_->insert(val);
+ }
+
+ size_t erase(const key_type& kmer_seq) {
+ return data_->erase(kmer_seq);
+ }
+
+ // iterator erase(const const_iterator& iter) {
+ // return iterator(data_->erase(iter.get_data()));
+ // }
+
+ iterator erase(const iterator& iter) {
+ return data_->erase(iter.base_);
+ }
+
+ void clear() {
+ data_->clear();
+ }
+
+ /*
+ size_t bucket_count() const {
+ return data_->bucket_count();
+ }
+
+ size_t max_bucket_count() const {
+ return data_->max_bucket_count();
+ }
+
+ size_t bucket_size(size_t n) const {
+ return data_->bucket_size(n);
+ }
+
+ size_t bucket(const RtSeq& kmer_seq) const {
+ return data_->bucket(kmer_seq);
+ }
+
+ float load_factor() const {
+ return data_->load_factor();
+ }
+
+ float max_load_factor() const {
+ return data_->max_load_factor();
+ }
+
+ void max_load_factor(float z) {
+ data_->max_load_factor(z);
+ }
+
+ void rehash(size_t n) {
+ data_->rehash(n);
+ }
+
+ size_t get_k() const {
+ return data_->get_k();
+ }
+
+ int_map_type& get_data() {
+ return *data_;
+ }
+ */
+
+
+
+ };
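+
+ // A minimal usage sketch of the KmerMap<Value, cap::LSeq> specialization above
+ // (illustrative sketch; `lseq` and `k` are assumed to be defined elsewhere):
+ //
+ // KmerMap<size_t, cap::LSeq> counts(k);
+ // counts[lseq] += 1;                      // operator[] delegates to the inner unordered_map
+ // auto it = counts.find(lseq);
+ // if (it != counts.end())
+ //     INFO("k-mer seen " << it.second() << " times");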
+
+}
diff --git a/src/projects/cap/cap_logger.hpp b/src/projects/cap/cap_logger.hpp
new file mode 100644
index 0000000..b54bc48
--- /dev/null
+++ b/src/projects/cap/cap_logger.hpp
@@ -0,0 +1,30 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "dev_support/logger/log_writers.hpp"
+
+/*
+#undef INFO
+#define INFO(message) \
+{ \
+ cout << __FILE__ << " " << __LINE__ << " ::: " << message << endl; \
+} \
+*/
+
+
+#define LOG(message) \
+{ \
+ cout << message << endl; \
+} \
+
+//#define trace(message) LOG_MSG(logging::L_TRACE, message)
+#define debug(print, message) \
+{ \
+ if (print) { \
+ cout << message << endl; \
+ } \
+}
diff --git a/src/projects/cap/cap_online_visualizer.hpp b/src/projects/cap/cap_online_visualizer.hpp
new file mode 100644
index 0000000..06299be
--- /dev/null
+++ b/src/projects/cap/cap_online_visualizer.hpp
@@ -0,0 +1,41 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "../online_vis/online_visualizer.hpp"
+#include "cap_environment.hpp"
+#include "cap_commands.hpp"
+
+namespace online_visualization {
+
+class CapOnlineVisualizer : public OnlineVisualizer<CapEnvironment> {
+ protected:
+ void AddSpecificCommands() {
+ AddCommand(make_shared<AddGenomeCommand>());
+ AddCommand(make_shared<BuildGraphCommand>());
+ AddCommand(make_shared<RefineCommand>());
+ AddCommand(make_shared<SaveGenomesCommand>());
+ AddCommand(make_shared<SaveGraphCommand>());
+ AddCommand(make_shared<SaveEnvCommand>());
+ AddCommand(make_shared<LoadEnvCommand>());
+ AddCommand(make_shared<DrawPicsCommand>());
+ AddCommand(make_shared<FindIndelsCommand>());
+ AddCommand(make_shared<FindInversionsCommand>());
+ AddCommand(make_shared<SaveBlocksCommand>());
+ AddCommand(make_shared<MosaicAnalysisCommand>());
+ AddCommand(make_shared<MaskRepeatsCommand>());
+ AddCommand(make_shared<LoadGraphCommand>());
+ AddCommand(make_shared<BlocksToGRIMMFormat>());
+ }
+
+ public:
+ CapOnlineVisualizer() : OnlineVisualizer<CapEnvironment>() {
+ }
+};
+
+}
diff --git a/src/projects/cap/colored_graph_construction.hpp b/src/projects/cap/colored_graph_construction.hpp
new file mode 100644
index 0000000..bfceb8c
--- /dev/null
+++ b/src/projects/cap/colored_graph_construction.hpp
@@ -0,0 +1,397 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "data_structures/sequence/runtime_k.hpp"
+#include "compare_standard.hpp"
+#include "cap_kmer_index.hpp"
+#include "algorithms/graph_construction.hpp"
+
+namespace cap {
+
+template<class Graph, class Mapper>
+class CoveredRangesFinder {
+ const Graph& g_;
+ const Mapper mapper_;
+
+ typedef typename Graph::EdgeId EdgeId;
+ typedef restricted::map<EdgeId, vector<Range>> CoveredRanges;
+
+ vector<Range> ProcessRange(Range new_range,
+ const vector<Range>& curr_ranges) const {
+ vector < Range > answer;
+ size_t i = 0;
+ while (i < curr_ranges.size()
+ && curr_ranges[i].end_pos < new_range.start_pos) {
+ answer.push_back(curr_ranges[i]);
+ ++i;
+ }
+
+ size_t merge_start =
+ (i != curr_ranges.size()) ?
+ std::min(curr_ranges[i].start_pos,
+ new_range.start_pos) :
+ new_range.start_pos;
+
+ size_t merge_end = new_range.end_pos;
+ while (i < curr_ranges.size()
+ && curr_ranges[i].start_pos <= new_range.end_pos) {
+ if (curr_ranges[i].end_pos > merge_end)
+ merge_end = curr_ranges[i].end_pos;
+ ++i;
+ }
+ answer.push_back(Range(merge_start, merge_end));
+ while (i < curr_ranges.size()) {
+ answer.push_back(curr_ranges[i]);
+ ++i;
+ }
+ return answer;
+ }
+
+ void ProcessPath(const MappingPath<EdgeId>& path,
+ CoveredRanges& crs) const {
+ for (size_t i = 0; i < path.size(); ++i) {
+ auto mapping = path[i];
+ EdgeId edge = mapping.first;
+ const vector<Range>& curr_ranges = crs[edge];
+ Range mapping_range = mapping.second.mapped_range;
+ VERIFY(g_.length(edge) >= mapping_range.end_pos);
+ crs[edge] = ProcessRange(mapping_range, curr_ranges);
+ VERIFY(g_.length(edge) >= crs[edge].back().end_pos);
+ }
+ }
+
+public:
+
+ CoveredRangesFinder(const Graph& g, const Mapper& mapper) :
+ g_(g), mapper_(mapper) {
+
+ }
+
+ void FindCoveredRanges(CoveredRanges& crs, ContigStream& stream) const {
+ io::SingleRead read;
+ stream.reset();
+// NewExtendedSequenceMapper<gp_t::k_value + 1, Graph> mapper(gp_.g,
+// gp_.index, gp_.kmer_mapper);
+ while (!stream.eof()) {
+ stream >> read;
+ ProcessPath(mapper_.MapSequence(read.sequence()), crs);
+ }
+ }
+
+};
+
+template<class Graph, class Mapper>
+class ColoredGraphConstructor {
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+ typedef restricted::map<EdgeId, vector<Range>> CoveredRanges;
+ typedef restricted::map<EdgeId, vector<size_t>> BreakPoints;
+
+ Graph& g_;
+ ColorHandler<Graph>& coloring_;
+ const Mapper mapper_;
+
+ void AddBreaks(set<size_t>& breaks, const vector<Range>& ranges) const {
+ for (auto it = ranges.begin(); it != ranges.end(); ++it) {
+ breaks.insert(it->start_pos);
+ breaks.insert(it->end_pos);
+ }
+ }
+
+ vector<size_t> PostProcessBreaks(const set<size_t>& tmp_breaks) const {
+ vector<size_t> breaks(tmp_breaks.begin(), tmp_breaks.end());
+ //breaks contain 0 and edge_length here!
+ VERIFY(breaks.size() >= 2);
+ //cleaning breaks from 0 and edge_length
+ vector<size_t> final_breaks;
+ for (size_t i = 1; i < breaks.size() - 1; ++i) {
+ final_breaks.push_back(breaks[i]);
+ }
+ return final_breaks;
+ }
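+
+ // Illustrative example: for an edge of length 10 with tmp_breaks = {0, 3, 7, 10},
+ // the returned final_breaks are {3, 7} -- only the inner positions where the
+ // edge actually needs to be split.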
+
+ void FindBreakPoints(BreakPoints& bps,
+ const vector<CoveredRanges>& crss) const {
+ for (auto it = g_.SmartEdgeBegin(); !it.IsEnd(); ++it) {
+ EdgeId e = *it;
+ set<size_t> tmp_breaks;
+ for (size_t i = 0; i < crss.size(); ++i) {
+ auto crs_it = crss[i].find(e);
+ if (crs_it != crss[i].end()) {
+ AddBreaks(tmp_breaks, crs_it->second);
+ }
+ }
+ bps[e] = PostProcessBreaks(tmp_breaks);
+ VERIFY(bps[e].empty() || bps[e].back() < g_.length(e));
+ }
+ }
+
+ void SplitEdge(const vector<size_t>& breaks, EdgeId e) {
+ vector<size_t> shifts(breaks.size());
+ if (!breaks.empty()) {
+ shifts[0] = breaks[0];
+ for (size_t i = 1; i < breaks.size(); ++i) {
+ shifts[i] = breaks[i] - breaks[i - 1];
+ }
+ }
+ EdgeId curr_e = e;
+ for (size_t i = 0; i < breaks.size(); ++i) {
+ auto split_result = g_.SplitEdge(curr_e, shifts[i]);
+ curr_e = split_result.second;
+ }
+ }
+
+// void PrintCRS(const CoveredRanges& crs) {
+// for (auto it = crs.begin(); it != crs.end(); ++it) {
+// DEBUG(
+// "For edge " << gp_.g.str(it->first) << " ranges "
+// << it->second);
+// }
+// }
+
+ void SplitGraph(ContigStreams& streams) {
+ INFO("Determining covered ranges");
+ CoveredRangesFinder<Graph, Mapper> crs_finder(g_, mapper_);
+ vector<CoveredRanges> crss(streams.size());
+ for (size_t i = 0; i < streams.size(); ++i) {
+ crs_finder.FindCoveredRanges(crss[i], streams[i]);
+ // DEBUG("Printing covered ranges for stream i");
+ // PrintCRS(crss[i]);
+ }
+ BreakPoints bps;
+ INFO("Determining breakpoints");
+ FindBreakPoints(bps, crss);
+
+ INFO("Splitting graph");
+ SplitGraph(bps);
+ }
+
+ void SplitGraph(/*const */BreakPoints& bps) {
+ vector<EdgeId> initial_edges;
+ for (auto it = g_.SmartEdgeBegin(); !it.IsEnd(); ++it) {
+ initial_edges.push_back(*it);
+ }
+ for (auto it = SmartSetIterator<Graph, EdgeId>(g_,
+ initial_edges.begin(), initial_edges.end()); !it.IsEnd();
+ ++it) {
+ EdgeId e = *it;
+ VERIFY(bps.find(e) != bps.end());
+ VERIFY(bps[e].empty() || bps[e].back() < g_.length(e));
+ //todo temporary fix!!!
+ if (e == g_.conjugate(e))
+ continue;
+ SplitEdge(bps[e], e);
+ }
+ }
+
+ void PaintGraph(ContigStream& stream, TColorSet color) {
+ io::SingleRead read;
+ stream.reset();
+ while (!stream.eof()) {
+ stream >> read;
+ PaintPath(mapper_.MapSequence(read.sequence()).path(),
+ color);
+ }
+ }
+
+ void PaintGraph(ContigStreams& streams, const vector<TColorSet>& stream_colors) {
+ VERIFY(streams.size() == stream_colors.size());
+ for (size_t i = 0; i < streams.size(); ++i) {
+ PaintGraph(streams[i], stream_colors[i]);
+ }
+ }
+
+ void PaintPath(const Path<EdgeId>& path, TColorSet color) {
+ for (size_t i = 0; i < path.size(); ++i) {
+ coloring_.PaintEdge(path[i], color);
+ coloring_.PaintVertex(g_.EdgeStart(path[i]), color);
+ coloring_.PaintVertex(g_.EdgeEnd(path[i]), color);
+ }
+ }
+
+ void CompressGraph(Graph& g, ColorHandler<Graph>& coloring) {
+ for (auto it = g.SmartVertexBegin(); !it.IsEnd(); ++it) {
+ VertexId v = *it;
+ if (g.CanCompressVertex(v)
+ && coloring.Color(g.GetUniqueOutgoingEdge(v))
+ == coloring.Color(g.GetUniqueIncomingEdge(v))) {
+ g.CompressVertex(v);
+ }
+ }
+ }
+
+public:
+ ColoredGraphConstructor(Graph& g, ColorHandler<Graph>& coloring,
+ const Mapper& mapper) :
+ g_(g), coloring_(coloring), mapper_(mapper) {
+
+ }
+
+ void ConstructGraph(ContigStreams& streams) {
+// This is probably no longer true:
+// VERIFY(streams.size() == 2);
+
+// if (detailed_output) {
+// //saving for debug and investigation
+// SaveOldGraph(output_folder + "saves/init_graph");
+// }
+
+ SplitGraph(streams);
+
+// if (detailed_output) {
+// //saving for debug and investigation
+// SaveOldGraph(output_folder + "saves/split_graph");
+// }
+
+ vector<TColorSet> stream_colors;
+
+ // Obsolete two-coloring
+// stream_mapping.push_back(make_pair(streams[0], kRedColorSet));
+// stream_mapping.push_back(make_pair(streams[1], kBlueColorSet));
+
+ TColor color_number = 0;
+ for (auto it = streams.begin(); it != streams.end(); ++it) {
+ stream_colors.push_back(TColorSet::SingleColor(color_number));
+ ++color_number;
+ }
+
+ INFO("Coloring graph");
+ PaintGraph(streams, stream_colors);
+ INFO("Coloring done.");
+
+ //situation in example 6 =)
+ INFO("Compressing graph");
+ CompressGraph(g_, coloring_);
+ INFO("Compressing done.");
+ }
+};
+
+template<class Graph>
+void SimplifyGraph(Graph& g, size_t br_delta) {
+ //outdated
+ //debruijn_config::simplification::bulge_remover br_config;
+ //br_config.max_bulge_length_coefficient = 20;
+ //br_config.max_coverage = 1000.;
+ //br_config.max_relative_coverage = 1.2;
+ //br_config.max_delta = br_delta;
+ //br_config.max_relative_delta = 0.1;
+ INFO("Removing bulges");
+ RemoveBulges(g, br_config);
+
+// debruijn_config::simplification::tip_clipper tc;
+// tc.max_coverage = 1000;
+// tc.max_relative_coverage = 1000;
+// tc.max_tip_length_coefficient = 6;
+// ClipTips(gp.g, tc, 10 * gp.g.k());
+}
+
+template<class gp_t>
+void SplitAndColorGraph(gp_t& gp,
+ ColorHandler<typename gp_t::graph_t>& coloring,
+ ContigStreams& streams) {
+
+ typedef typename gp_t::graph_t Graph;
+ typedef typename gp_t::index_t Index;
+ typedef NewExtendedSequenceMapper<Graph, Index> Mapper;
+
+ ColoredGraphConstructor<Graph, Mapper> colored_graph_constructor(gp.g,
+ coloring, *MapperInstance<gp_t>(gp));
+
+
+ colored_graph_constructor.ConstructGraph(streams);
+}
+
+template<class Graph, class Index, class Streams>
+size_t CapConstructGraph(Streams& streams, Graph& g,
+ Index& index) {
+ return ConstructGraphUsingOldIndex(streams, g, index);
+}
+
+template<class gp_t>
+void FillPositions(const gp_t &gp, ContigStreams &streams,
+ CoordinatesHandler<typename gp_t::graph_t>& coordinates_handler) {
+ typedef NewExtendedSequenceMapper<typename gp_t::graph_t,
+ typename gp_t::index_t> Mapper;
+
+ VERIFY(coordinates_handler.GetGraph() == NULL);
+ coordinates_handler.SetGraph(&(gp.g));
+
+ unsigned contig_id = 0;
+ std::shared_ptr<const Mapper> mapper = MapperInstance<gp_t>(gp);
+
+ for (auto it = streams.begin(); it != streams.end(); ++it) {
+ //cap::RCWrapper stream(**it);
+ ContigStream &stream = *it;
+ stream.reset();
+
+ io::SingleRead contig;
+ // for forward and reverse directions
+ while (!stream.eof()) {
+ stream >> contig;
+
+ MappingPath<EdgeId> mapping_path = mapper->MapRead(contig);
+ const std::vector<EdgeId> edge_path =
+ mapping_path.simple_path();
+ coordinates_handler.AddGenomePath(contig_id, edge_path);
+ contig_id++;
+ }
+
+ stream.reset();
+ }
+}
+
+template<class gp_t>
+void ConstructColoredGraph(gp_t& gp,
+ ColorHandler<typename gp_t::graph_t>& coloring,
+ CoordinatesHandler<typename gp_t::graph_t>& coordinates_handler,
+ ContigStreams& streams) {
+
+ INFO("Constructing de Bruijn graph for k=" << gp.k_value);
+
+ CapConstructGraph(streams,
+ gp.g, gp.index);
+ SplitAndColorGraph(gp, coloring, streams);
+ FillPositions(gp, streams, coordinates_handler);
+}
+
+//template<class gp_t>
+//void ConstructColoredGraph(gp_t& gp,
+// ColorHandler<typename gp_t::graph_t>& coloring,
+// vector<ContigStream*>& streams, const string& reference, bool fill_pos = true, int br_delta = -1) {
+// typedef typename gp_t::graph_t Graph;
+// const size_t k = gp_t::k_value;
+// typedef NewExtendedSequenceMapper<k + 1, Graph> Mapper;
+//
+// INFO("Constructing de Bruijn graph for k=" << k);
+//
+// //dirty hack because parallel construction uses cfg::get!!!
+// vector<ContigStream*>& tmp_streams(streams.begin(), streams.end());
+// tmp_streams.push_back(EasyC)
+// io::MultifileReader<Contig> stream(tmp_streams);
+// ConstructGraph<k, Graph>(gp.g, gp.index, stream);
+//
+// //TODO do we still need it?
+// if (br_delta > 0)
+// SimplifyGraph(gp.g, br_delta);
+//
+// ColoredGraphConstructor<Graph, Mapper> colored_graph_constructor(gp.g,
+// coloring, *MapperInstance < gp_t > (gp));
+// colored_graph_constructor.ConstructGraph(streams);
+//
+// if (fill_pos) {
+// INFO("Filling contig positions");
+// for (auto it = streams.begin(); it != streams.end(); ++it) {
+// ContigStream& stream = **it;
+// stream.reset();
+// FillPos(gp, stream);
+// }
+// }
+//}
+
+}
diff --git a/src/projects/cap/coloring.hpp b/src/projects/cap/coloring.hpp
new file mode 100644
index 0000000..3916129
--- /dev/null
+++ b/src/projects/cap/coloring.hpp
@@ -0,0 +1,461 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include <boost/format/format_fwd.hpp>
+
+namespace cap {
+
+const size_t kDefaultMaxColorsUsed = 8;
+
+typedef size_t TColor;
+
+class TColorSet {
+ typedef std::bitset <kDefaultMaxColorsUsed> TBitSet;
+
+private:
+ TBitSet bitset_;
+
+public:
+ TColorSet() : bitset_ () {
+ }
+
+ // Legacy 'uint' strings support
+ // Assuming uints were used only for two colors
+ explicit TColorSet(const string &bitset_string) {
+ const bool is_legacy = (bitset_string.length() == 1);
+ if (is_legacy) {
+ std::istringstream iss(bitset_string);
+ unsigned long bitset_ul;
+ iss >> bitset_ul;
+
+ bitset_ = TBitSet(bitset_ul);
+ } else {
+ bitset_ = TBitSet(bitset_string);
+ }
+ }
+
+ explicit TColorSet(const unsigned long bitset_ul) : bitset_ (bitset_ul) {
+ }
+
+ explicit TColorSet(const TBitSet &base_set) : bitset_(base_set) {
+ }
+
+ TColorSet(const TColorSet &tcs) : bitset_(tcs.getBitset()) {
+ }
+
+ const TBitSet &getBitset() const {
+ return bitset_;
+ }
+
+ inline TColorSet operator | (const TColorSet &other) const {
+ return TColorSet(getBitset() | other.getBitset());
+ }
+
+ inline void operator |= (const TColorSet &other) {
+ bitset_ |= other.getBitset();
+ }
+
+ inline TColorSet operator & (const TColorSet &other) const {
+ return TColorSet(getBitset() & other.getBitset());
+ }
+
+ inline void operator &= (const TColorSet &other) {
+ bitset_ &= other.getBitset();
+ }
+
+ inline bool operator == (const TColorSet &other) const {
+ return getBitset() == other.getBitset();
+ }
+
+ inline bool operator != (const TColorSet &other) const {
+ return getBitset() != other.getBitset();
+ }
+
+ bool operator [] (size_t pos) const {
+ return bitset_[pos];
+ }
+
+ bool operator < (const TColorSet &other) const {
+ const TBitSet &other_bitset = other.getBitset();
+ for (int i = kDefaultMaxColorsUsed - 1; i >= 0; --i) {
+ if (bitset_[i] != other_bitset[i]) {
+ return bitset_[i] < other_bitset[i];
+ }
+ }
+ return false;
+ }
+
+ bool any() const {
+ return bitset_.any();
+ }
+
+ void SetBit(size_t bit_number, bool value) {
+ bitset_[bit_number] = value;
+ }
+
+ string ToString() const {
+ return bitset_.to_string();
+ }
+
+ static TColorSet SingleColor(const TColor color) {
+ TBitSet bitset;
+ bitset[color] = 1;
+
+ return TColorSet(bitset);
+ }
+
+ static TColorSet AllColorsSet(const size_t number_of_colors) {
+ TBitSet bitset;
+ for (size_t i = 0; i < number_of_colors; ++i) {
+ bitset[i] = 1;
+ }
+
+ return TColorSet(bitset);
+ }
+};
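+
+// A short illustrative sketch of how TColorSet values combine (the color
+// indices here are arbitrary examples):
+//
+// TColorSet red = TColorSet::SingleColor(0);   // bitset "00000001"
+// TColorSet blue = TColorSet::SingleColor(1);  // bitset "00000010"
+// TColorSet violet = red | blue;               // bitset "00000011"
+// violet.ToString();                           // "00000011" (kDefaultMaxColorsUsed bits)
+// TColorSet legacy("3");                       // single-character string -> legacy 'uint'
+//                                              // parsing, yields the same set as violet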
+
+
+// ColorGenerator: Singleton class for generating colors
+// ColorGenerator generates max_colors different colors in HSV format
+// First color is always black
+class ColorGenerator {
+ size_t max_colors_;
+
+ // Hue array of needed size
+ vector <double> hue_array_;
+
+ static double GenerateIthColor(const size_t color_number) {
+ double hue_value = 0;
+ size_t high_bit = 0;
+
+ for (size_t i = 0; (1ul << i) <= color_number; ++i) {
+ bool bit = (color_number >> i) & 1;
+ if (bit) {
+ high_bit = i;
+ }
+ }
+
+ hue_value = (1.0 + 2 * double(color_number ^ (1 << high_bit))) / (1 << (high_bit + 1));
+
+ return hue_value;
+ }
+
+public:
+ ColorGenerator(const size_t max_colors = kDefaultMaxColorsUsed) : max_colors_(0), hue_array_() {
+ GenerateColors(max_colors);
+ }
+
+ void GenerateColors(const size_t number_of_colors) {
+ // If all needed colors were already generated, do nothing
+ if (number_of_colors <= max_colors_) {
+ return;
+ }
+
+ hue_array_.resize(number_of_colors);
+ for (size_t i = max_colors_; i < number_of_colors; ++i) {
+ hue_array_[i] = GenerateIthColor(i);
+ }
+ max_colors_ = number_of_colors;
+ }
+
+ string GetIthColor(const size_t color_number) const {
+ VERIFY(color_number < max_colors_);
+
+ // black one is the very special
+ if (color_number == 0) {
+ return "0 0 0";
+ }
+ // personally, I like red much more than cyan, so
+ if (color_number == 1) {
+ return "0 1 1";
+ }
+
+ return str(
+ boost::format("%.3lf %.3lf %.3lf") % hue_array_[color_number] % 0.8 % 0.8
+ );
+ }
+
+ static ColorGenerator &instance() {
+ static ColorGenerator instance;
+ return instance;
+ }
+
+};
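+
+// GenerateIthColor enumerates hues by repeatedly bisecting the unit interval,
+// so consecutive color numbers stay far apart. A small worked example (colors
+// 0 and 1 are special-cased to black and red in GetIthColor):
+//
+//   n = 1 -> hue 1/2      n = 4 -> hue 1/8
+//   n = 2 -> hue 1/4      n = 5 -> hue 3/8
+//   n = 3 -> hue 3/4      n = 6 -> hue 5/8
+//                         n = 7 -> hue 7/8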
+
+template<class Graph, class Element>
+class ElementColorHandler: public GraphActionHandler<Graph>, public visualization::ElementColorer<Element> {
+ typedef GraphActionHandler<Graph> base;
+
+ // For each element, stores a bitmask of the colors used there.
+ std::unordered_map<Element, TColorSet > data_;
+
+ // Maximum number of different colors that may be used in coloring
+ size_t max_colors_;
+
+public:
+ // No VERIFYcation here; however, the color generator does verify.
+ string color_str(const TColor color) const {
+ return ColorGenerator::instance().GetIthColor((size_t) color);
+ }
+
+ string color_str(const TColorSet &color_set) const {
+ if (!color_set.any()) {
+ return color_str((TColor) 0);
+ }
+ string result = "";
+ for (size_t i = 0; i < max_colors_; ++i) {
+ if (!color_set[i]) continue;
+ if (result.length() != 0) {
+ result += ':';
+ }
+ result += color_str((TColor) (i + 1));
+
+ // IF WE ARE DIRTY BASTARDS
+ if (color_set[max_colors_]) {
+ result += ":" + color_str((TColor) (i + 1));
+ }
+ }
+ return result;
+ }
+
+ ElementColorHandler(const Graph& g, const size_t max_colors = kDefaultMaxColorsUsed) :
+ base(g, "ElementColorHandler"),
+ data_(),
+ max_colors_(max_colors) {
+ }
+
+ void PaintElement(Element e, const TColorSet &color_set) {
+ auto find_it = data_.find(e);
+ if (find_it == data_.end()) {
+ data_[e] = color_set;
+ } else {
+ find_it->second = find_it->second | color_set;
+ }
+ VERIFY((data_[e] | color_set) == data_[e]);
+ }
+
+ TColorSet Color(Element e) const {
+ auto it = data_.find(e);
+ if (it == data_.end())
+ return TColorSet();
+ else
+ return it->second;
+ }
+
+ string ColorStr(Element e) const {
+ return color_str(Color(e));
+ }
+
+ /*virtual*/
+ void HandleDelete(Element e) {
+ data_.erase(e);
+ }
+
+ void Erase(Element e) {
+ data_.erase(e);
+ }
+
+ string GetValue(Element e) const {
+ return ColorStr(e);
+ }
+};
+
+template<class Graph>
+class ColorHandler: public visualization::GraphColorer<Graph>, public GraphActionHandler<Graph> {
+ typedef GraphActionHandler<Graph> base;
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+
+ size_t max_colors_;
+
+ ElementColorHandler<Graph, EdgeId> edge_color_;
+ ElementColorHandler<Graph, VertexId> vertex_color_;
+public:
+
+ ColorHandler(const Graph& g, const size_t max_colors = kDefaultMaxColorsUsed) :
+ base(g, "ColorHandler"),
+ max_colors_(max_colors),
+ edge_color_(g, max_colors),
+ vertex_color_(g, max_colors) {
+ }
+
+ void PaintEdge(EdgeId e, const TColorSet &color_set) {
+ edge_color_.PaintElement(e, color_set);
+ }
+
+ void PaintVertex(VertexId v, const TColorSet &color_set) {
+ vertex_color_.PaintElement(v, color_set);
+ }
+
+ string GetValue(EdgeId e) const {
+ return edge_color_.GetValue(e);
+ }
+
+ string GetValue(VertexId v) const {
+ return vertex_color_.GetValue(v);
+ }
+
+ TColorSet Color(EdgeId e) const {
+ return edge_color_.Color(e);
+ }
+
+ TColorSet Color(VertexId v) const {
+ return vertex_color_.Color(v);
+ }
+
+ map<EdgeId, string> EdgeColorMap() const {
+ map<EdgeId, string> answer;
+ for (auto it = this->g().SmartEdgeBegin(); !it.IsEnd(); ++it) {
+ answer[*it] = edge_color_.ColorStr(*it);
+ }
+ return answer;
+ }
+
+ map<VertexId, string> VertexColorMap() const {
+ map<VertexId, string> answer;
+ for (auto it = this->g().begin(); it != this->g().end(); ++it) {
+ answer[*it] = vertex_color_.ColorStr(*it);
+ }
+ return answer;
+ }
+
+ /*virtual*/
+ void HandleDelete(EdgeId e) {
+ edge_color_.Erase(e);
+ }
+
+ /*virtual*/
+ void HandleDelete(VertexId v) {
+ vertex_color_.Erase(v);
+ }
+
+ /*virtual*/
+ void HandleMerge(const vector<EdgeId>& old_edges, EdgeId new_edge) {
+ VERIFY(old_edges.size() > 0);
+// auto color = Color(old_edges.front());
+ for (auto it = old_edges.begin(); it != old_edges.end(); ++it) {
+// VERIFY(color == Color(*it));
+ PaintEdge(new_edge, Color(*it));
+ }
+// Paint(new_edge, color);
+ }
+
+ /*virtual*/
+ void HandleGlue(EdgeId new_edge, EdgeId edge1, EdgeId edge2) {
+ //todo temporary verification
+// VERIFY(Color(edge2) == edge_type::black && new_edge == edge2);
+ PaintEdge(new_edge, Color(edge2));
+ PaintEdge(new_edge, Color(edge1));
+ }
+
+ /*virtual*/
+ void HandleSplit(EdgeId old_edge, EdgeId new_edge_1, EdgeId new_edge_2) {
+ PaintVertex(this->g().EdgeEnd(new_edge_1), Color(old_edge));
+ PaintEdge(new_edge_1, Color(old_edge));
+ PaintEdge(new_edge_2, Color(old_edge));
+ }
+
+ //This is bad, unsafe code! The right way is to use a shared_ptr to this class in all interfaces.
+ //Then one could easily draw with this colorer without any delegation.
+ shared_ptr<omnigraph::visualization::GraphColorer<Graph>> ConstructColorer() const {
+ using namespace omnigraph::visualization;
+ return shared_ptr<GraphColorer<Graph>>(new omnigraph::visualization::DelegatingGraphColorer<Graph>(*this));
+ }
+
+ shared_ptr<omnigraph::visualization::GraphColorer<Graph>> ConstructColorer(GraphComponent<Graph> gc) const {
+ shared_ptr<omnigraph::visualization::GraphColorer<Graph>> colorer = ConstructColorer();
+ return omnigraph::visualization::BorderDecorator<Graph>::GetInstance(gc, colorer);
+ }
+
+ size_t max_colors() const {
+ return max_colors_;
+ }
+};
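+
+// A minimal usage sketch for ColorHandler (illustrative; `g`, `e` and `v` are
+// assumed to be an existing Graph, EdgeId and VertexId respectively):
+//
+// ColorHandler<Graph> coloring(g);
+// coloring.PaintEdge(e, TColorSet::SingleColor(0));
+// coloring.PaintVertex(v, TColorSet::SingleColor(0));
+// TColorSet c = coloring.Color(e);          // accumulated color mask of the edge
+// string dot_color = coloring.GetValue(e);  // "H S V" color string(s) for visualization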
+
+
+template<class Graph>
+void SaveColoring(const Graph& g
+ , const ColorHandler<Graph>& coloring
+ , const string& filename) {
+ GraphComponent<Graph> whole_graph(g);
+ ofstream stream((filename + ".clr").c_str());
+ stream << whole_graph.v_size() << endl;
+ for (auto it = whole_graph.v_begin(); it != whole_graph.v_end(); ++it) {
+ stream << g.int_id(*it) << " " << coloring.Color(*it).ToString() << endl;
+ }
+ stream << whole_graph.e_size() << endl;
+ for (auto it = whole_graph.e_begin(); it != whole_graph.e_end(); ++it) {
+ stream << g.int_id(*it) << " " << coloring.Color(*it).ToString() << endl;
+ }
+}
+
+template<class Graph>
+void LoadColoring(const Graph& /*g*/
+ , const omnigraph::GraphElementFinder<Graph>& element_finder
+ , ColorHandler<Graph>& coloring
+ , const string& filename) {
+ ifstream stream((filename + ".clr").c_str());
+ size_t v_count;
+ stream >> v_count;
+ for (size_t i = 0; i < v_count; ++i) {
+ size_t id;
+ stream >> id;
+ string color_string;
+ stream >> color_string;
+ coloring.PaintVertex(element_finder.ReturnVertexId(id), TColorSet(color_string));
+ }
+ size_t e_count;
+ stream >> e_count;
+ for (size_t i = 0; i < e_count; ++i) {
+ size_t id;
+ stream >> id;
+ string color_string;
+ stream >> color_string;
+ coloring.PaintEdge(element_finder.ReturnEdgeId(id), TColorSet(color_string));
+ }
+}
+
+
+
+template<class Graph>
+std::auto_ptr<omnigraph::visualization::GraphColorer<Graph>> ConstructColorer(
+ const ColorHandler<Graph>& coloring) {
+ using namespace omnigraph::visualization;
+ return std::auto_ptr<GraphColorer<Graph>>(
+ new CompositeGraphColorer<Graph>(
+ make_shared<MapColorer<typename Graph::VertexId>>(coloring.VertexColorMap()),
+ make_shared<MapColorer<typename Graph::EdgeId>>(coloring.EdgeColorMap())));
+}
+
+template<class Graph>
+std::auto_ptr<omnigraph::visualization::GraphColorer<Graph>> ConstructBorderColorer(const Graph& /*g*/,
+ const ColorHandler<Graph>& coloring) {
+ using namespace omnigraph::visualization;
+ return std::auto_ptr<GraphColorer<Graph>>(
+ new CompositeGraphColorer<Graph>(
+ make_shared<FixedColorer<Graph>>("white"),
+ make_shared<MapColorer<typename Graph::EdgeId>>(coloring.EdgeColorMap())));
+}
+
+// Temporary while only two colors are in use
+TColor kRedColor = (TColor) 0;
+TColor kBlueColor = (TColor) 1;
+TColorSet kRedColorSet = TColorSet::SingleColor(kRedColor);
+TColorSet kBlueColorSet = TColorSet::SingleColor(kBlueColor);
+TColorSet kVioletColorSet = kRedColorSet | kBlueColorSet;
+TColorSet kEmptyColorSet = TColorSet(0);
+
+}
+
diff --git a/src/projects/cap/compare_standard.hpp b/src/projects/cap/compare_standard.hpp
new file mode 100644
index 0000000..7e0f85e
--- /dev/null
+++ b/src/projects/cap/compare_standard.hpp
@@ -0,0 +1,47 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "dev_support/standard_base.hpp"
+
+// log
+#include "dev_support/logger/logger.hpp"
+
+// utils
+#include "dev_support/cpp_utils.hpp"
+#include "dev_support/path_helper.hpp"
+
+#include "dev_support/simple_tools.hpp"
+
+// longseq
+#include "longseq.hpp"
+
+// config
+#include "cap_config_struct.hpp"
+
+// io
+#include "io/reads_io/ireader.hpp"
+#include "io/reads_io/converting_reader_wrapper.hpp"
+#include "io/reads_io/vector_reader.hpp"
+#include "io/reads_io/multifile_reader.hpp"
+#include "io/reads_io/rc_reader_wrapper.hpp"
+#include "io/reads_io/osequencestream.hpp"
+
+namespace cap {
+typedef io::SingleRead Contig;
+typedef io::ReadStream<Contig> ContigStream;
+typedef std::shared_ptr<ContigStream> ContigStreamPtr;
+typedef io::MultifileStream<io::SingleRead> CompositeContigStream;
+typedef io::RCWrapper<io::SingleRead> RCWrapper;
+typedef io::ReadStreamList<Contig> ContigStreams;
+}
+
+// debruijn
+#include "assembly_graph/graph_core/graph.hpp"
+#include "pipeline/graph_pack.hpp"
+#include "algorithms/graph_construction.hpp"
diff --git a/src/projects/cap/comparison_utils.hpp b/src/projects/cap/comparison_utils.hpp
new file mode 100644
index 0000000..eefe93d
--- /dev/null
+++ b/src/projects/cap/comparison_utils.hpp
@@ -0,0 +1,208 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "pipeline/graphio.hpp"
+#include "dev_support/simple_tools.hpp"
+#include "assembly_graph/graph_core/graph.hpp"
+#include "coordinates_handler.hpp"
+#include "math/xmath.h"
+#include <iostream>
+#include <vector>
+#include "dev_support/logger/logger.hpp"
+#include "io/reads_io/multifile_reader.hpp"
+#include "io/reads_io/splitting_wrapper.hpp"
+#include "io/reads_io/modifying_reader_wrapper.hpp"
+#include "io/reads_io/vector_reader.hpp"
+#include <boost/property_tree/ptree.hpp>
+#include <boost/property_tree/xml_parser.hpp>
+
+namespace cap {
+using namespace debruijn_graph;
+
+template <class Graph>
+MappingRange TrivialRange(const Graph& g, typename Graph::EdgeId e, size_t& offset) {
+ size_t l = g.length(e);
+ offset += l;
+ return MappingRange(Range(offset - l, offset), Range(0, l));
+}
+
+template <class Graph>
+MappingPath<EdgeId> TrivialMappingPath(const Graph& g
+ , const vector<typename Graph::EdgeId>& edges) {
+ INFO("start tripath");
+ vector<MappingRange> ranges;
+ size_t offset = 0;
+ for (auto it = edges.begin(); it != edges.end(); ++it) {
+ ranges.push_back(TrivialRange(g, *it, offset));
+ }
+ INFO("end tripath");
+ return MappingPath<EdgeId>(edges, ranges);
+}
+
+inline Sequence ReadSequence(ContigStream& reader) {
+ VERIFY(!reader.eof());
+ io::SingleRead read;
+ reader >> read;
+ return read.sequence();
+}
+
+template<class Graph, class Index>
+void ConstructGraph(Graph& g, Index& index,
+ ContigStream& stream) {
+ vector<ContigStream*> streams = { &stream };
+ ConstructGraph<Graph>(streams, g, index);
+}
+
+/*
+template<class Graph, class Index>
+void ConstructGraph(Graph& g, Index& index,
+ ContigStream& stream1,
+ ContigStream& stream2) {
+ io::MultifileStream<io::SingleRead> composite_reader(stream1, stream2);
+ ConstructGraph<Graph, Index>(g, index, composite_reader);
+}
+*/
+
+inline Sequence ReadGenome(const string& filename) {
+ path::CheckFileExistenceFATAL(filename);
+ io::FileReadStream genome_stream(filename);
+ return ReadSequence(genome_stream);
+}
+
+void WriteGenome(const Sequence& genome, const string& filename) {
+ io::osequencestream stream(filename);
+ io::SingleRead read("genome", genome.str());
+ stream << read;
+}
+
+inline vector<io::SingleRead> MakeReads(const vector<Sequence>& ss) {
+ vector<io::SingleRead> ans;
+ for (size_t i = 0; i < ss.size(); ++i) {
+ ans.push_back(io::SingleRead("read_" + ToString(i), ss[i].str()));
+ }
+ return ans;
+}
+
+inline Sequence FirstSequence(ContigStream& stream) {
+ stream.reset();
+ io::SingleRead r;
+ VERIFY(!stream.eof());
+ stream >> r;
+ return r.sequence();
+}
+
+inline vector<Sequence> AllSequences(ContigStream& stream) {
+ vector<Sequence> answer;
+ stream.reset();
+ io::SingleRead r;
+ while (!stream.eof()) {
+ stream >> r;
+ answer.push_back(r.sequence());
+ }
+ return answer;
+}
+
+inline vector<Sequence> ReadContigs(const string& filename) {
+ path::CheckFileExistenceFATAL(filename);
+ io::FileReadStream genome_stream(filename);
+ return AllSequences(genome_stream);
+}
+
+//Prints only basic graph structure!!!
+//todo rewrite with normal splitter usage instead of filtering
+inline void PrintGraphComponentContainingEdge(const string& file_name, const Graph& g,
+ size_t split_edge_length, const omnigraph::GraphElementFinder<Graph>& element_finder,
+ int int_edge_id) {
+ shared_ptr<GraphSplitter<Graph>> inner_splitter = ReliableSplitter<Graph>(g, split_edge_length);
+
+// VERIFY_MSG(element_finder.ReturnEdgeId(int_edge_id) != NULL,
+// "Couldn't find edge with id = " << int_edge_id);
+
+ shared_ptr<GraphComponentFilter<Graph>> filter = make_shared<AnyEdgeContainFilter<Graph>>(g, element_finder.ReturnEdgeId(int_edge_id));
+ FilteringSplitterWrapper<Graph> splitter(inner_splitter, filter);
+ vector<vector<VertexId>> components;
+ while (splitter.HasNext()) {
+ auto component = splitter.Next();
+ components.push_back(vector<VertexId>(component.vertices().begin(), component.vertices().end()));
+ }
+ VERIFY(components.size() == 1);
+ debruijn_graph::graphio::ConjugateDataPrinter<Graph> printer(g, components.front().begin(), components.front().end());
+ debruijn_graph::graphio::PrintBasicGraph<Graph>(file_name, printer);
+}
+
+template<class Graph>
+class EdgeCoordinatesGraphLabeler: public AbstractGraphLabeler<Graph> {
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+public:
+ const CoordinatesHandler<Graph>& edge_pos_;
+ const std::vector<std::string> genome_names_;
+
+ EdgeCoordinatesGraphLabeler(const Graph& g,
+ const CoordinatesHandler<Graph>& edge_pos,
+ const std::vector<std::string> &genome_names)
+ : AbstractGraphLabeler<Graph>(g),
+ edge_pos_(edge_pos),
+ genome_names_(genome_names) {
+ }
+
+ virtual std::string label(EdgeId edge) const {
+ auto ranges = edge_pos_.GetRanges(edge);
+ std::sort(ranges.begin(), ranges.end());
+
+ std::stringstream ss;
+ for (const auto &entry : ranges) {
+ Range genome_range = CoordinatesHandler<Graph>::GetPrintableRange(
+ entry.second.first);
+ Range seq_range = CoordinatesHandler<Graph>::GetPrintableRange(
+ entry.second.second);
+ // Make inclusive
+ genome_range.end_pos--;
+ seq_range.end_pos--;
+
+ ss << genome_names_[size_t(entry.first)] << ": " <<
+ "G" << genome_range << ", Seq" << seq_range << "\\n";
+ }
+
+ return ss.str();
+ }
+};
+
+template <class Graph>
+class BulgeRemoverCallbackToCoordinatesHandlerAdapter {
+ public:
+ typedef typename CoordinatesHandler<Graph>::EdgeId EdgeId;
+
+ BulgeRemoverCallbackToCoordinatesHandlerAdapter(
+ CoordinatesHandler<Graph> &coordinates_handler)
+ : coordinates_handler_(coordinates_handler) {
+ }
+
+ void Project(const EdgeId edge_from, const std::vector<EdgeId> &to) {
+ std::vector<EdgeId> from;
+ from.push_back(edge_from);
+
+ coordinates_handler_.ProjectPath(from, to);
+
+ // Do the same for conjugate sequences, as the bulge remover does not provide
+ // such functionality :)
+ from[0] = coordinates_handler_.GetGraph()->conjugate(from[0]);
+ std::vector<EdgeId> to_conj = to;
+ std::reverse(to_conj.begin(), to_conj.end());
+ for (auto &edge : to_conj)
+ edge = coordinates_handler_.GetGraph()->conjugate(edge);
+
+ coordinates_handler_.ProjectPath(from, to_conj);
+ }
+
+ private:
+ CoordinatesHandler<Graph> &coordinates_handler_;
+};
+
+}
diff --git a/src/projects/cap/coordinates_handler.hpp b/src/projects/cap/coordinates_handler.hpp
new file mode 100644
index 0000000..3caeb4c
--- /dev/null
+++ b/src/projects/cap/coordinates_handler.hpp
@@ -0,0 +1,1262 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include <cstring>
+#include <vector>
+#include <algorithm>
+#include "data_structures/sequence/sequence.hpp"
+#include "data_structures/sequence/sequence_tools.hpp"
+
+namespace cap {
+
+namespace utils {
+
+// For some internal maps
+struct unordered_map_pair_hash {
+ size_t operator()(std::pair<unsigned, size_t> p) const {
+ return size_t(p.first) | (p.second << 8);
+ }
+};
+
+bool compare_pairs_reversed(const std::pair<size_t, size_t> p1,
+ const std::pair<size_t, size_t> p2) {
+ if (p1.second != p2.second) {
+ return p1.second < p2.second;
+ }
+ return p1.first < p2.first;
+}
+
+}
+
+namespace debug {
+
+const unsigned kShiftValue = 24;
+const size_t kMaskValue = (1ull << kShiftValue) - 1;
+
+template<class T, class Graph>
+std::string Debug(const Graph *g_, const std::vector<T> &p) {
+ std::stringstream ss;
+ for (const auto &x : p) {
+ ss << g_->str(x) << ";";
+ }
+ return ss.str();
+}
+
+template<class T, class Graph>
+std::string Debug(const Graph &g_, const std::vector<T> &p) {
+ return Debug(&g_, p);
+}
+
+std::string PrintComplexPosition(const size_t pos) {
+ std::stringstream ss;
+ ss << (pos >> kShiftValue) << ":" << (double(pos & kMaskValue) / (1 << kShiftValue));
+ return ss.str();
+}
+
+std::string PrintComplexRange(const Range &range) {
+ return "[" + PrintComplexPosition(range.start_pos) + ", " +
+ PrintComplexPosition(range.end_pos) + "]";
+}
+
+std::string PrintComplexRange(const std::pair<size_t, size_t> &range) {
+ return "[" + PrintComplexPosition(range.first) + ", " +
+ PrintComplexPosition(range.second) + "]";
+}
+
+
+}
+
+/*
+ * For some time now, positions have not corresponded exactly to positions
+ * in the genomes. Instead, they are shifted left by kShiftValue (24) bits and
+ * may have a small offset added in order to exclude empty ranges.
+ */
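+
+/*
+ * A worked example of the encoding (simplified; derived from kShiftValue = 24
+ * and kHalfMask defined below): genome position 10 is stored as 10 << 24.
+ * An edge of length 5 starting there ends at ((10 << 24) + (5 << 24)) | kHalfMask,
+ * i.e. position 15 plus a small fractional offset that keeps ranges non-empty.
+ * GetPrintableRange() shifts right by 24 bits again and recovers [10, 15].
+ */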
+template<class Graph>
+class CoordinatesHandler : public ActionHandler<typename Graph::VertexId,
+ typename Graph::EdgeId> {
+ public:
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+ typedef ActionHandler<VertexId, EdgeId> base;
+ typedef std::vector<EdgeId> Path;
+ typedef unsigned uint;
+ typedef std::vector<std::pair<uint, size_t> > PosArray;
+ typedef std::vector<std::pair<uint, Range> > RangeArray;
+ typedef std::vector<std::pair<size_t, size_t> > Thread;
+
+ const static unsigned kShiftValue = 24;
+ const static size_t kMaskValue = (1ull << kShiftValue) - 1;
+ const static size_t kNotMaskValue = ~kMaskValue;
+ const static size_t kHalfMask = (1ull << (kShiftValue - 1));
+ const static size_t kLeftEndMask = (1ull << (kShiftValue - 1)) | (1ull << (kShiftValue - 2));
+ const static size_t kRightEndMask = (0ull << (kShiftValue - 1)) | (1ull << (kShiftValue - 2));
+
+ CoordinatesHandler()
+ : base("CoordinatesHandler"),
+ g_(NULL),
+ genome_info_(),
+ edge_ranges_(),
+ stored_threading_history_(),
+ //genome_first_edges_(),
+ last_deleted_(),
+ pending_add_(),
+ is_locked_(false) {
+ }
+
+ CoordinatesHandler(
+ const std::vector<std::pair<uint, std::vector<Thread>>> &stored_threads)
+ : CoordinatesHandler() {
+ SetStoredThreads(stored_threads);
+ }
+
+ virtual ~CoordinatesHandler() {
+ }
+
+ void SetGraph(const Graph *g) {
+ VERIFY(g != NULL);
+
+ if (g == g_)
+ return;
+
+ if (g_ != NULL)
+ UnsetGraph();
+ g_ = g;
+ g_->AddActionHandler(this);
+ }
+
+ void UnsetGraph() {
+ if (g_ == NULL)
+ return;
+
+ g_->RemoveActionHandler(this);
+ g_ = NULL;
+ edge_ranges_.clear();
+ }
+
+ const Graph *GetGraph() const {
+ return g_;
+ }
+
+ void AddGenomePath(const uint genome_id, const Path &genome_path) {
+ TRACE("AddGenomePath Start");
+ VERIFY(g_ != NULL);
+
+ if (genome_path.size() == 0) {
+ INFO("Trying to add path of length 0");
+ return;
+ }
+ genome_info_[genome_id].id = genome_id;
+ genome_info_[genome_id].first_edge = genome_path[0];
+ //genome_first_edges_[genome_path[0]].push_back(genome_id);
+
+ size_t cur_start = 0;
+ for (const auto &edge : genome_path) {
+ if (edge == EdgeId(0)) {
+ DEBUG("ZERO EDGE!");
+ continue;
+ }
+ const size_t cur_end = (cur_start + (g_->length(edge) << kShiftValue)) | kHalfMask;
+
+ //DEBUG("edge " << g_->str(edge) << ": " << PrintComplexRange(Range(cur_start, cur_end)));
+
+ edge_ranges_[edge].AddGenomeRange(genome_id, Range(cur_start, cur_end));
+ cur_start = cur_end;
+ }
+
+ genome_info_[genome_id].sequence_length = cur_start;
+ TRACE("AddGenomePath End");
+ }
+
+ Sequence ReconstructGenome(const uint genome_id) const {
+ const std::vector<EdgeId> genome_path =
+ AsMappingPath(genome_id).simple_path();
+
+ std::vector<Sequence> path_sequences;
+ for (const auto &e : genome_path)
+ path_sequences.push_back(g_->EdgeNucls(e));
+
+ return MergeOverlappingSequences(path_sequences, g_->k());
+ }
+
+ PosArray FilterPosArray(const PosArray &old_array,
+ const EdgeId edge) const {
+ PosArray result;
+ auto edge_data_it = edge_ranges_.find(edge);
+ if (edge_data_it == edge_ranges_.end())
+ return result;
+
+ for (auto entry : old_array) {
+ if (edge_data_it->second.HasForwardLink(entry)) {
+ entry.second = edge_data_it->second.GetForwardPos(entry);
+ result.push_back(entry);
+ }
+ }
+
+ return result;
+ }
+
+ void SetStoredThreads(
+ const std::vector<std::pair<uint, std::vector<Thread>>> &threads) {
+ for (const auto &entry : threads) {
+ stored_threading_history_[entry.first] = entry.second;
+ }
+ }
+
+ std::vector<std::pair<uint, std::vector<Thread>>> GetStoredThreads() const {
+ std::vector<std::pair<uint, std::vector<Thread>>> result;
+ for (const auto &entry : stored_threading_history_) {
+ result.push_back(entry);
+ }
+ return result;
+ }
+
+ /**
+ * Automatically adds conjugate strand!!!
+ */
+ void StoreGenomeThreadManual(const uint genome_id, const Thread &ladder) {
+ stored_threading_history_[2 * genome_id].push_back(PreprocessCoordinates(ladder));
+ stored_threading_history_[(2 * genome_id) ^ 1].push_back(PreprocessCoordinates(ConjugateThread(ladder)));
+ }
+
+ size_t PreprocessCoordinates(const size_t coord) const {
+ return (coord << kShiftValue);
+ }
+
+ /*
+ * `from` is meant to have needed range data
+ */
+ bool ProjectPath(const Path &from, const Path &to);
+
+ bool ProjectPath(const Path &from, const Path &to,
+ const PosArray &threads_to_delete);
+
+ void UnrollChanges();
+
+ void LockChanges();
+
+ void ReleaseChanges();
+
+ PosArray GetContiguousThreads(const Path &path) const;
+
+ bool CheckCorrectPathProjection(const Path &from, const Path &to) const;
+
+ size_t GetOriginalPos(const uint genome_id, const size_t new_pos) const;
+
+ //size_t GetNewestPos(const uint genome_id, const size_t old_pos) const;
+
+ // TODO getOrigRange?? (Edge)
+ virtual void HandleDelete(EdgeId e);
+
+ virtual void HandleMerge(const vector<EdgeId> &old_edges, EdgeId new_edge);
+
+ virtual void HandleGlue(EdgeId new_edge, EdgeId edge1, EdgeId edge2);
+
+ virtual void HandleSplit(EdgeId old_edge, EdgeId new_edge_1,
+ EdgeId new_edge_2);
+
+ EdgeId FindGenomeFirstEdge(const uint genome_id) const;
+
+ // First range is graph range, second is original sequence range
+ std::vector<std::pair<uint, std::pair<Range, Range> > > GetRanges(
+ const EdgeId e) const {
+ std::vector<std::pair<uint, std::pair<Range, Range> > > res;
+ if (g_ == NULL)
+ return res;
+
+ const auto edge_data_it = edge_ranges_.find(e);
+ VERIFY(edge_data_it != edge_ranges_.end());
+
+ for (const auto &p : edge_data_it->second.GetRanges()) {
+ const uint genome_id = p.first;
+ Range newest = p.second;
+ DEBUG("from " << debug::PrintComplexRange(newest));
+ const Range original(GetOriginalPos(genome_id, newest.start_pos),
+ GetOriginalPos(genome_id, newest.end_pos));
+
+ res.push_back(make_pair(genome_id, make_pair(newest, original)));
+ }
+
+ return res;
+ }
+
+ std::vector<std::pair<uint, Range> > GetRawRanges(const EdgeId e) const {
+ if (g_ == NULL)
+ return {};
+
+ const auto edge_data_it = edge_ranges_.find(e);
+ VERIFY(edge_data_it != edge_ranges_.end());
+
+ return edge_data_it->second.GetRanges();
+ }
+
+
+ static Range GetPrintableRange(const Range &r) {
+ return Range(r.start_pos >> kShiftValue, r.end_pos >> kShiftValue);
+ }
+
+ PosArray GetEndPosArray(const EdgeId e) const {
+ PosArray result;
+
+ const auto edge_data_it = edge_ranges_.find(e);
+ if (edge_data_it == edge_ranges_.end())
+ return result;
+
+ for (const auto &entry : edge_data_it->second.GetRanges()) {
+ result.push_back(make_pair(entry.first, entry.second.end_pos));
+ }
+
+ return result;
+ }
+
+ bool HasForwardLink(const EdgeId edge, const uint genome_id,
+ const size_t start_pos) const {
+ auto edge_it = edge_ranges_.find(edge);
+ // VERIFY(edge_it != edge_ranges_.end());
+ if (edge_it == edge_ranges_.end())
+ return false;
+
+ return edge_it->second.HasForwardLink(make_pair(genome_id, start_pos));
+ }
+
+ size_t GetForwardPos(const EdgeId edge, const uint genome_id,
+ const size_t start_pos) const {
+ auto edge_it = edge_ranges_.find(edge);
+ VERIFY(edge_it != edge_ranges_.end());
+
+ return edge_it->second.GetForwardPos(make_pair(genome_id, start_pos));
+ }
+
+ pair<EdgeId, size_t> StepForward(const VertexId v, const uint genome_id,
+ const size_t pos) const {
+ for (EdgeId e : g_->OutgoingEdges(v)) {
+ if (HasForwardLink(e, genome_id, pos)) {
+ return make_pair(e, GetForwardPos(e, genome_id, pos));
+ }
+ }
+ return make_pair(EdgeId(0), -1u);
+ }
+
+ pair<EdgeId, size_t> StepForwardPos(const EdgeId last_edge, const uint genome_id,
+ const size_t last_pos) const {
+ return StepForward(g_->EdgeEnd(last_edge), genome_id, last_pos);
+ }
+
+ size_t GetMultiplicity(const EdgeId edge) const {
+ auto edge_it = edge_ranges_.find(edge);
+ if (edge_it == edge_ranges_.end())
+ return 0;
+ return edge_it->second.GetMultiplicity();
+ }
+
+ void DebugOutput(const EdgeId e) {
+ if (HasEdgeData(e)) {
+ DEBUG("edge " << g_->str(e) << " " << edge_ranges_.at(e).DebugOutput());
+ } else {
+ DEBUG("edge " << g_->str(e) << " empty");
+ }
+ }
+
+ //todo some usages do not need original pos, optimize if needed
+ MappingPath<EdgeId> AsMappingPath(unsigned genome_id) const {
+ MappingPath<EdgeId> answer;
+ VertexId v = g_->EdgeStart(FindGenomeFirstEdge(genome_id));
+ size_t genome_pos = 0;
+
+ while (true) {
+ auto step = StepForward(v, genome_id, genome_pos);
+ if (step.second == -1u)
+ break;
+ EdgeId e = step.first;
+
+ size_t next_genome_pos = step.second;
+
+ Range original_pos(
+ GetOriginalPos(genome_id, genome_pos),
+ GetOriginalPos(genome_id, next_genome_pos));
+
+ //todo fix possible troubles with cyclic genomes etc later
+ Range original_pos_printable = GetPrintableRange(original_pos);
+ Range graph_pos_printable(0, g_->length(e));
+
+ answer.push_back(e, MappingRange(original_pos_printable, graph_pos_printable));
+
+ v = g_->EdgeEnd(e);
+ genome_pos = next_genome_pos;
+ }
+ //todo can we verify total length somehow
+ return answer;
+ }
+
+ void DumpRanges() {
+ std::unordered_map<uint, Path> genome_paths;
+
+ for (const auto &genome_i : genome_info_) {
+ const uint genome_id = genome_i.first;
+
+ genome_paths[genome_id] = AsMappingPath(genome_id).
+ simple_path();
+ }
+
+ StoreGenomeThreads();
+ edge_ranges_.clear();
+ genome_info_.clear();
+
+ for (const auto &genome_it : genome_paths) {
+ const uint genome_id = genome_it.first;
+
+ AddGenomePath(genome_id, genome_it.second);
+ }
+ }
+
+ private:
+ typedef std::unordered_map<std::pair<uint, size_t>, size_t,
+ utils::unordered_map_pair_hash> MapT;
+
+ class EdgeData {
+ public:
+ EdgeData() {
+ }
+
+ inline void AddGenomeRange(const uint genome_id, const Range &range) {
+ //INFO("add genome range: genome " << int(genome_id) << ": " << range);
+ //INFO("initial " << DebugOutput());
+ Range extended_range = range;
+ auto connected_it = genome_ranges_backward_.find(
+ make_pair(genome_id, extended_range.start_pos));
+ if (connected_it != genome_ranges_backward_.end()) {
+ extended_range.start_pos = connected_it->second;
+
+ genome_ranges_forward_.erase(
+ make_pair(genome_id, extended_range.start_pos));
+ genome_ranges_backward_.erase(connected_it);
+ }
+
+ connected_it = genome_ranges_forward_.find(
+ make_pair(genome_id, extended_range.end_pos));
+ if (connected_it != genome_ranges_forward_.end()) {
+ extended_range.end_pos = connected_it->second;
+
+ genome_ranges_forward_.erase(connected_it);
+ genome_ranges_backward_.erase(
+ make_pair(genome_id, extended_range.end_pos));
+ }
+
+ genome_ranges_forward_[make_pair(genome_id, extended_range.start_pos)] =
+ extended_range.end_pos;
+ genome_ranges_backward_[make_pair(genome_id, extended_range.end_pos)] =
+ extended_range.start_pos;
+
+ //INFO("resulting " << DebugOutput());
+ }
+
+ inline void operator+=(const EdgeData &other) {
+ for (const auto &it : other.genome_ranges_forward_) {
+ this->AddGenomeRange(it.first.first, Range(it.first.second, it.second));
+ }
+ }
+
+ inline std::vector<Range> GetGenomeRanges(const uint genome_id) const {
+ std::vector<Range> result;
+
+ for (auto &it : genome_ranges_forward_) {
+ if (it.first.first == genome_id) {
+ result.push_back(Range(it.first.second, it.second));
+ }
+ }
+
+ return result;
+ }
+
+ inline std::vector<std::pair<uint, Range> > GetRanges() const {
+ std::vector<std::pair<uint, Range> > result;
+
+ for (auto &it : genome_ranges_forward_) {
+ result.push_back(make_pair(it.first.first,
+ Range(it.first.second, it.second)));
+ }
+
+ return result;
+ }
+
+ inline bool HasForwardLink(const std::pair<uint, size_t> &start_pos) const {
+ return genome_ranges_forward_.count(start_pos) > 0;
+ }
+ inline void DeleteForwardLink(const std::pair<uint, size_t> &pos) {
+ const auto it = genome_ranges_forward_.find(pos);
+ if (it == genome_ranges_forward_.end())
+ return;
+
+ size_t end_pos = it->second;
+ genome_ranges_forward_.erase(pos);
+ genome_ranges_backward_.erase(make_pair(pos.first, end_pos));
+ }
+ inline Range PopForwardLink(const std::pair<uint, size_t> &pos) {
+ const auto it = genome_ranges_forward_.find(pos);
+ if (it == genome_ranges_forward_.end())
+ return Range(-1, -1);
+
+ size_t end_pos = it->second;
+ genome_ranges_forward_.erase(pos);
+ genome_ranges_backward_.erase(make_pair(pos.first, end_pos));
+
+ return Range(pos.second, end_pos);
+ }
+ inline size_t GetForwardPos(const std::pair<uint, size_t> &start_pos) const {
+ auto it = genome_ranges_forward_.find(start_pos);
+ VERIFY(it != genome_ranges_forward_.end());
+ return it->second;
+ }
+ inline size_t GetMultiplicity() const {
+ return genome_ranges_forward_.size();
+ }
+
+ std::string DebugOutput() const {
+ using debug::PrintComplexPosition;
+
+ std::stringstream ss;
+ ss << "forward (";
+ for (const auto &p : genome_ranges_forward_) {
+ ss << "{" << int(p.first.first) << "," << PrintComplexPosition(p.first.second) << "}->" << PrintComplexPosition(p.second) << ";";
+ }
+ ss << ") backward (";
+ for (const auto &p : genome_ranges_backward_) {
+ ss << "{" << int(p.first.first) << "," << PrintComplexPosition(p.first.second) << "}->" << PrintComplexPosition(p.second) << ";";
+ }
+ ss << ")";
+ return ss.str();
+ }
+
+ private:
+
+ MapT genome_ranges_forward_;
+ MapT genome_ranges_backward_;
+ };
+
+ struct GenomeInfo {
+ uint id;
+ EdgeId first_edge;
+ size_t sequence_length;
+ };
+
+ constexpr static long double EPS = 1e-9;
+
+ pair<size_t, size_t> PreprocessCoordinates(const pair<size_t, size_t>& point) const {
+ return make_pair(PreprocessCoordinates(point.first), PreprocessCoordinates(point.second));
+ }
+
+ Thread PreprocessCoordinates(const Thread& ladder) const {
+ Thread answer;
+ for (pair<size_t, size_t> point : ladder) {
+ answer.push_back(PreprocessCoordinates(point));
+ }
+ return answer;
+ }
+
+ Thread ConjugateThread(const Thread& ladder) const {
+ Thread answer;
+ size_t first_length = ladder.back().first;
+ size_t second_length = ladder.back().second;
+ for (auto it = ladder.rbegin(); it != ladder.rend(); ++it) {
+ answer.push_back(make_pair(first_length - it->first,
+ second_length - it->second));
+ }
+ return answer;
+ }
+
+ void StoreGenomeThreads() {
+ std::vector<std::pair<std::pair<uint, Range>, EdgeId> > all_ranges;
+ /*
+ * A kind of verification
+ */
+ /*
+ for (auto it = g_->SmartEdgeBegin(); !it.IsEnd(); ++it) {
+ if (!HasEdgeData(*it)) {
+ INFO("NO data for edge " << g_->str(*it));
+ }
+ for (const auto &range_data : edge_ranges_.at(*it).GetRanges()) {
+ all_ranges.push_back(make_pair(make_pair(range_data.first, range_data.second), *it));
+ }
+ }
+ std::sort(all_ranges.begin(), all_ranges.end());
+ for (const auto &e : all_ranges) {
+ INFO("genome " << int(e.first.first) << ", " << e.first.second << ": " <<
+ g_->str(e.second));
+ }
+ for (size_t i = 1; i < all_ranges.size(); ++i) {
+ if (all_ranges[i].first.first != all_ranges[i - 1].first.first) continue;
+ if (all_ranges[i].first.second.start_pos != all_ranges[i - 1].first.second.end_pos) {
+ INFO("!!! TORN in genome " << int(all_ranges[i].first.first) << " at position " << all_ranges[i].first.second.start_pos);
+ }
+ }
+ */
+
+ TRACE("StoreGenomeThreads Start");
+ VERIFY(g_ != NULL);
+ for (auto &genome_it : genome_info_) {
+ const uint genome_id = genome_it.first;
+ stored_threading_history_[genome_id].push_back(Thread());
+ Thread &thread = stored_threading_history_[genome_id].back();
+
+ StoreGenomeThread(genome_id, thread);
+ }
+ TRACE("StoreGenomeThreads End");
+ }
+
+
+ void StoreGenomeThread(const uint genome_id, Thread &thread);
+
+ RangeArray PopAndUpdateRangesToCopy(const EdgeId edge,
+ PosArray &delete_positions);
+
+ bool CheckContiguousPath(const Path &path) const;
+
+ size_t FlushZeroSequence(const std::vector<EdgeId> &to_edges,
+ const uint genome_id, const Range &from_range,
+ const bool finalize_range);
+
+ size_t GetNonzeroSplit(const Range &from_range, const size_t taken_length,
+ const bool finalize_range);
+
+ size_t GetRightEnd(const bool finalize_range, const Range &range) const;
+
+ void AddPendingRangeToEdge(const EdgeId edge, const uint genome_id, const Range &range);
+
+ void DeleteRangeFromEdge(const EdgeId edge, const uint genome_id, const size_t range_from);
+
+ void DeleteRangeFromEdge(const EdgeId edge, const std::pair<uint, size_t> &pos);
+
+ void FlushPendingRanges();
+
+ bool HasEdgeData(const EdgeId edge) {
+ return edge_ranges_.find(edge) != edge_ranges_.end();
+ }
+
+ template <class T>
+ inline void CleanEdgeData(const T edge) {
+ edge_ranges_.erase(edge);
+ }
+
+ size_t GetPathLength(const Path &p) const {
+ size_t res = 0;
+ for (const auto &edge : p) {
+ res += g_->length(edge);
+ }
+ return res;
+ }
+
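+ // Linearly interpolates graph_pos from graph_range into genome_range,
+ // preserving the packed sub-position bits (kShiftValue/kHalfMask encoding).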
+ size_t CalculatePos(const Range &graph_range, const Range &genome_range,
+ const size_t graph_pos) const {
+ const long double len_ratio = (long double)GetPrintableRange(genome_range).size() /
+ GetPrintableRange(graph_range).size();
+
+ return ((genome_range.start_pos & kNotMaskValue) | kHalfMask) +
+ (size_t(((graph_pos - graph_range.start_pos) >> kShiftValue) * len_ratio) << kShiftValue);
+ }
+
+ const Graph *g_;
+
+ std::unordered_map<uint, GenomeInfo> genome_info_;
+ std::unordered_map<EdgeId, EdgeData> edge_ranges_;
+ std::unordered_map<uint, std::vector<Thread> > stored_threading_history_;
+ //std::unordered_map<EdgeId, std::vector<uint> > genome_first_edges_;
+
+ // For deletion unroll
+ std::vector<std::pair<EdgeId, std::pair<uint, Range> > > last_deleted_;
+ // For suspended adding
+ std::vector<std::pair<EdgeId, std::pair<uint, Range> > > pending_add_;
+
+ bool is_locked_;
+
+ DECL_LOGGER("CoordinatesHandler");
+
+};
+
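+/*
+ * Usage sketch (illustrative; construction/attachment of the handler to the
+ * graph is elided, and from_path/to_path are assumed to be contiguous paths
+ * sharing endpoints, cf. CheckCorrectPathProjection):
+ *
+ * CoordinatesHandler<Graph> &coords = ...; // handler tracking the graph
+ * coords.LockChanges(); // buffer projected ranges instead of flushing
+ * if (!coords.ProjectPath(from_path, to_path)) {
+ * coords.UnrollChanges(); // restore ranges deleted by the failed projection
+ * }
+ * coords.ReleaseChanges(); // unlock and flush pending ranges
+ */
+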
+template <class Graph>
+bool CoordinatesHandler<Graph>::ProjectPath(const Path &from, const Path &to,
+ const PosArray &threads_to_delete) {
+
+ using debug::PrintComplexPosition;
+ using debug::PrintComplexRange;
+ using debug::Debug;
+
+ TRACE("ProjectPath Start");
+ //VERIFY(CheckCorrectPathProjection(from, to));
+ //DEBUG("Projecting " << Debug(g_, from) << " to " << Debug(g_, to));
+
+
+ VERIFY(g_ != NULL);
+ const Path &p1 = from,
+ &p2 = to;
+ size_t l1 = GetPathLength(p1),
+ l2 = GetPathLength(p2);
+ PosArray cur_delete_positions = threads_to_delete;
+ const size_t n_ranges = cur_delete_positions.size();
+
+ std::vector<std::vector<EdgeId> > zero_sequences(n_ranges);
+
+ auto it2 = p2.begin();
+
+ VERIFY(l1 != 0 && l2 != 0);
+ VERIFY(it2 != p2.end());
+
+ long double lratio = (long double)l2 / l1;
+
+ size_t cur_2_edge_len = g_->length(*it2);
+ for (auto &edge1 : p1) {
+ size_t cur_1_edge_len = g_->length(edge1);
+
+ RangeArray genome_ranges_to_copy =
+ PopAndUpdateRangesToCopy(edge1, cur_delete_positions);
+
+ if (genome_ranges_to_copy.size() != n_ranges) {
+ //ClearChanges();
+ DEBUG("FALSE; unrolling");
+ return false;
+ }
+
+ while (cur_1_edge_len > 0 || edge1 == p1.back()) {
+ VERIFY(it2 != p2.end());
+ const bool second_edge_is_last = (*it2 == p2.back());
+ size_t taken_len_1 = min(cur_1_edge_len,
+ size_t(ceil(double(cur_2_edge_len / lratio - EPS))));
+ if (second_edge_is_last) {
+ taken_len_1 = cur_1_edge_len;
+ }
+ const size_t taken_len_2 = min(cur_2_edge_len,
+ size_t(ceil(double(taken_len_1 * lratio - EPS))));
+ const long double edge_1_percentage = (cur_1_edge_len == 0) ? 0 :
+ (long double)taken_len_1 / cur_1_edge_len;
+
+ const bool finalize_range = (edge1 == p1.back() && second_edge_is_last)
+ || (edge1 != p1.back() && taken_len_1 == cur_1_edge_len);
+
+ for (size_t i = 0; i < n_ranges; ++i) {
+ auto &ranges = genome_ranges_to_copy[i];
+ Range &range = ranges.second;
+ const size_t range_size = GetPrintableRange(range).size();
+ const size_t taken_length =
+ size_t(ceil(double(range_size * edge_1_percentage - EPS)));
+
+ if (taken_length == 0) {
+ zero_sequences[i].push_back(*it2);
+
+ DEBUG("taken length is zero");
+ continue;
+ }
+
+ if (!zero_sequences[i].empty()) {
+ range.start_pos = FlushZeroSequence(zero_sequences[i],
+ ranges.first, range, false);//, taken_length == 0 && finalize_range);
+ zero_sequences[i].clear();
+ }
+
+ const size_t split_pos = GetNonzeroSplit(
+ range, taken_length, finalize_range);
+ const Range range_to_add(range.start_pos, split_pos);
+
+ /*
+ DEBUG("DEBUG: " << PrintComplexPosition(range.start_pos) << " " << taken_length << " "
+ << PrintComplexPosition(split_pos));
+ DEBUG(" Proj " << g_->str(edge1) << " -> " << g_->str(*it2) << ": "
+ << int(ranges.first) << ": " << PrintComplexRange(ranges.second) << "->" << PrintComplexRange(range_to_add));
+ */
+
+ AddPendingRangeToEdge(*it2, ranges.first, range_to_add);
+ range.start_pos = split_pos;
+ }
+
+ cur_1_edge_len -= taken_len_1;
+ cur_2_edge_len -= taken_len_2;
+
+ if (edge1 == p1.back()) {
+ ++it2;
+ if (it2 == p2.end()) {
+ break;
+ }
+ cur_2_edge_len = g_->length(*it2);
+ } else if (cur_2_edge_len == 0) {
+ if (*it2 != p2.back()) {
+ ++it2;
+ cur_2_edge_len = g_->length(*it2);
+ }
+ }
+ }
+
+
+ // Check that all ranges were moved completely
+ for (size_t i = 0; i < n_ranges; ++i) {
+ const uint genome_id = genome_ranges_to_copy[i].first;
+ Range &range = genome_ranges_to_copy[i].second;
+
+ if (!zero_sequences[i].empty()) {
+ range.start_pos = FlushZeroSequence(zero_sequences[i],
+ genome_id, range, true);//, taken_length == 0 && finalize_range);
+ zero_sequences[i].clear();
+ }
+
+ VERIFY(range.start_pos == range.end_pos);
+ }
+ }
+
+ FlushPendingRanges();
+
+ TRACE("ProjectPath End");
+
+ return true;
+}
+
+template <class Graph>
+bool CoordinatesHandler<Graph>::ProjectPath(
+ const Path &from, const Path &to) {
+ PosArray all_positions =
+ GetContiguousThreads(from);
+ return ProjectPath(from, to, all_positions);
+}
+
+template <class Graph>
+typename CoordinatesHandler<Graph>::RangeArray
+CoordinatesHandler<Graph>::PopAndUpdateRangesToCopy(
+ const EdgeId edge,
+ PosArray &delete_positions) {
+ auto edge_data_it = edge_ranges_.find(edge);
+ if (edge_data_it == edge_ranges_.end()) {
+ INFO("trying to get " << delete_positions.size() << " positions from empty!!!");
+ return {};
+ }
+ //VERIFY(edge_data_it != edge_ranges_.end());
+ auto &edge_data = edge_data_it->second;
+
+ RangeArray genome_ranges_to_copy;
+ for (auto &del_pos : delete_positions) {
+ if (edge_data.HasForwardLink(del_pos)) {
+ const Range range_to_copy(del_pos.second,
+ edge_data.GetForwardPos(del_pos));
+ DeleteRangeFromEdge(edge, del_pos);
+
+ genome_ranges_to_copy.push_back(
+ make_pair(del_pos.first, range_to_copy));
+ del_pos.second = range_to_copy.end_pos;
+ }
+ }
+ //if (edge_data.GetMultiplicity() == 0)
+ // CleanEdgeData(edge_data_it);
+
+ return genome_ranges_to_copy;
+}
+
+template <class Graph>
+bool CoordinatesHandler<Graph>::CheckCorrectPathProjection(
+ const Path &from, const Path &to) const {
+ if (from.size() == 0 || to.size() == 0) return false;
+ if (g_->EdgeStart(from[0]) != g_->EdgeStart(to[0])) return false;
+ if (g_->EdgeEnd(from.back()) != g_->EdgeEnd(to.back())) return false;
+ if (!CheckContiguousPath(from) || !CheckContiguousPath(to)) return false;
+ return true;
+}
+
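+// Distributes a genome range evenly (in packed sub-position steps) over a run
+// of edges that received only zero-length projections.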
+template<class Graph>
+size_t CoordinatesHandler<Graph>::FlushZeroSequence(const std::vector<EdgeId> &to_edges,
+ const uint genome_id, const Range &from_range,
+ const bool finalize_range) {
+ TRACE("FlushZeroSequence " << debug::Debug(g_, to_edges) << " " << genome_id << " " << debug::PrintComplexRange(from_range) << " " << finalize_range);
+ const size_t N = to_edges.size();
+ // K is the smallest exponent such that (1u << K) >= N
+ uint K = 0;
+ while ((1u << K) < N)
+ ++K;
+
+ const size_t right_end = GetRightEnd(finalize_range, from_range);
+ VERIFY((right_end & kNotMaskValue) == (from_range.start_pos & kNotMaskValue));
+
+ const size_t dx = right_end - from_range.start_pos;
+ const size_t step = dx >> K;
+ VERIFY(step != 0);
+
+ size_t cur_pos = from_range.start_pos;
+
+ for (size_t i = 0; i < N; ++i) {
+ size_t next_pos = cur_pos + step;
+ if (i + 1 == N)
+ next_pos = right_end;
+
+ AddPendingRangeToEdge(to_edges[i], genome_id, Range(cur_pos, next_pos));
+ cur_pos = next_pos;
+ }
+
+ return right_end;
+}
+
+template<class Graph>
+size_t CoordinatesHandler<Graph>::GetNonzeroSplit(const Range &from_range, const size_t taken_length,
+ const bool finalize_range) {
+ size_t split_pos = 0;
+
+ if (finalize_range)
+ split_pos = from_range.end_pos;
+ else {
+ // just base
+ split_pos = (from_range.start_pos & kNotMaskValue) + (taken_length << kShiftValue);
+ if ((from_range.end_pos & kNotMaskValue) != split_pos) {
+ split_pos |= kHalfMask;
+ } else {
+ split_pos |= (from_range.end_pos & kMaskValue) >> 1;
+ }
+ }
+
+ return split_pos;
+}
+
+template<class Graph>
+size_t CoordinatesHandler<Graph>::GetRightEnd(const bool finalize_range, const Range &range) const {
+ size_t right_end = 0;
+
+ if (finalize_range)
+ right_end = range.end_pos;
+ else if ((range.start_pos & kMaskValue) == kRightEndMask)
+ right_end = (range.start_pos & kNotMaskValue) | kLeftEndMask;
+ else {
+ const size_t was = range.start_pos & kMaskValue;
+ const size_t one_minus_was = (1ull << kShiftValue) - was;
+ const size_t new_right = (1ull << kShiftValue) - (one_minus_was >> 1);
+ right_end = (range.start_pos & kNotMaskValue) | new_right;
+ }
+
+ return right_end;
+}
+
+template<class Graph>
+void CoordinatesHandler<Graph>::AddPendingRangeToEdge(const EdgeId edge,
+ const uint genome_id, const Range &range) {
+ pending_add_.push_back(make_pair(edge, make_pair(genome_id, range)));
+}
+
+template<class Graph>
+void CoordinatesHandler<Graph>::DeleteRangeFromEdge(const EdgeId edge,
+ const uint genome_id, const size_t range_from) {
+ DeleteRangeFromEdge(edge, make_pair(genome_id, range_from));
+}
+template<class Graph>
+void CoordinatesHandler<Graph>::DeleteRangeFromEdge(const EdgeId edge,
+ const std::pair<uint, size_t> &pos) {
+ const Range deleted_range =
+ edge_ranges_.at(edge).PopForwardLink(pos);
+ if (edge_ranges_.at(edge).GetMultiplicity() == 0) {
+ CleanEdgeData(edge);
+ }
+
+ last_deleted_.push_back(make_pair(edge,
+ make_pair(pos.first, deleted_range)));
+}
+
+template<class Graph>
+void CoordinatesHandler<Graph>::FlushPendingRanges() {
+ if (is_locked_)
+ return;
+
+ for (const auto &e : pending_add_) {
+ edge_ranges_[e.first].AddGenomeRange(e.second.first, e.second.second);
+ }
+ last_deleted_.clear();
+ pending_add_.clear();
+}
+
+template<class Graph>
+void CoordinatesHandler<Graph>::UnrollChanges() {
+ for (const auto &e : last_deleted_) {
+ edge_ranges_[e.first].AddGenomeRange(e.second.first, e.second.second);
+ }
+ pending_add_.clear();
+ last_deleted_.clear();
+}
+
+template<class Graph>
+void CoordinatesHandler<Graph>::ReleaseChanges() {
+ is_locked_ = false;
+ FlushPendingRanges();
+}
+template<class Graph>
+void CoordinatesHandler<Graph>::LockChanges() {
+ is_locked_ = true;
+}
+
+template <class Graph>
+bool CoordinatesHandler<Graph>::CheckContiguousPath(
+ const Path &path) const {
+ for (size_t i = 1; i < path.size(); ++i) {
+ if (g_->EdgeEnd(path[i - 1]) != g_->EdgeStart(path[i]))
+ return false;
+ }
+ return true;
+}
+
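+// Collects the (genome id, start position) of every genome thread that
+// traverses the whole path contiguously; threads that break off are dropped.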
+template <class Graph>
+typename CoordinatesHandler<Graph>::PosArray
+CoordinatesHandler<Graph>::GetContiguousThreads(const Path &path) const {
+ PosArray result;
+ PosArray cur_pos;
+
+ const auto first_edge_it = edge_ranges_.find(path[0]);
+ if (first_edge_it == edge_ranges_.end())
+ return result;
+
+ for (const auto &entry : first_edge_it->second.GetRanges()) {
+ result.push_back(make_pair(entry.first, entry.second.start_pos));
+ cur_pos.push_back(make_pair(entry.first, entry.second.end_pos));
+ }
+
+ for (size_t path_i = 1; path_i < path.size(); ++path_i) {
+ const auto edge_data_it = edge_ranges_.find(path[path_i]);
+ if (edge_data_it == edge_ranges_.end())
+ return {};
+
+ for (ssize_t i = 0; i < (ssize_t)cur_pos.size(); ++i) {
+ if (edge_data_it->second.HasForwardLink(cur_pos[i])) {
+ cur_pos[i].second = edge_data_it->second.GetForwardPos(cur_pos[i]);
+ } else {
+ std::swap(cur_pos[i], cur_pos.back());
+ std::swap(result[i], result.back());
+ cur_pos.pop_back();
+ result.pop_back();
+ i--;
+ }
+ }
+ }
+
+ return result;
+}
+
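+// Walks the thread of genome_id edge by edge from its first edge, recording
+// (graph position, genome position) checkpoints along the way.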
+template <class Graph>
+void CoordinatesHandler<Graph>::StoreGenomeThread(
+ const uint genome_id, Thread &thread) {
+
+ TRACE("StoreGenomeThread Start");
+
+ size_t graph_pos = 0,
+ genome_pos = 0,
+ genome_length = genome_info_[genome_id].sequence_length;
+
+ std::pair<uint, size_t> cur_pos = make_pair(genome_id, genome_pos);
+ EdgeId cur_edge = genome_info_[genome_id].first_edge;
+ //INFO("searching for first edge " << cur_edge << "of genome " << int(genome_id));
+ if (!HasEdgeData(cur_edge)) {
+ cur_edge = FindGenomeFirstEdge(genome_id);
+ }
+
+ do {
+ if (cur_edge == EdgeId(0)) {
+ INFO("Could not thread genome path! genome_id=" << int(genome_id));
+ return;
+ }
+
+ thread.push_back(make_pair(graph_pos, genome_pos));
+ graph_pos += g_->length(cur_edge) << kShiftValue;
+ VERIFY(HasEdgeData(cur_edge));
+ genome_pos = edge_ranges_.at(cur_edge).GetForwardPos(cur_pos);
+ cur_pos.second = genome_pos;
+
+ const VertexId v = g_->EdgeEnd(cur_edge);
+
+ //DEBUG("current edge " << g_->str(cur_edge) << ", outgoing count " << g_->OutgoingEdgeCount(v));
+ cur_edge = EdgeId(0);
+ for (const auto &out_edge : g_->OutgoingEdges(v)) {
+ //DEBUG("considering edge " << g_->str(out_edge) << " at position (seq) " << genome_pos);
+
+ auto edge_info_it = edge_ranges_.find(out_edge);
+ if (edge_info_it == edge_ranges_.end())
+ continue;
+
+ //TRACE("!");
+
+ if (edge_info_it->second.HasForwardLink(cur_pos)) {
+ cur_edge = out_edge;
+ break;
+ }
+ }
+ } while (genome_pos != genome_length);
+
+ thread.push_back(make_pair(graph_pos, genome_pos));
+ TRACE("StoreGenomeThread End");
+}
+
+template <class Graph>
+typename CoordinatesHandler<Graph>::EdgeId
+CoordinatesHandler<Graph>::FindGenomeFirstEdge(const uint genome_id) const {
+ std::pair<uint, size_t> pos_in_question(genome_id, 0);
+
+ for (auto it = g_->SmartEdgeBegin(); !it.IsEnd(); ++it) {
+ auto range_it = edge_ranges_.find(*it);
+ if (range_it == edge_ranges_.end())
+ continue;
+ if (range_it->second.HasForwardLink(pos_in_question)) {
+ return *it;
+ }
+ }
+
+ // remember first edge and update it
+ VERIFY_MSG(false, "Could not find start of the sequence in graph");
+ return EdgeId(0);
+}
+
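+// Maps a position in the current (refined) coordinates back to the original
+// genome coordinates by replaying the stored threading history, newest first.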
+template <class Graph>
+size_t CoordinatesHandler<Graph>::GetOriginalPos(
+ const uint genome_id, const size_t new_pos) const {
+
+ // No refinement has been done
+ if (stored_threading_history_.size() == 0)
+ return new_pos;
+
+ size_t cur_pos = new_pos;
+
+ const auto history_it = stored_threading_history_.find(genome_id);
+ VERIFY(history_it != stored_threading_history_.end());
+ const std::vector<Thread> &history = history_it->second;
+
+ for (auto thread_it = history.rbegin(), E = history.rend();
+ thread_it != E; ++thread_it) {
+ // Verify thread sort order?
+
+ VERIFY(thread_it->size() > 0);
+
+ // K-mers can have different lengths, so going from larger k-mers to smaller
+ // ones shortens the thread, which may lead to a "range overflow"
+ if (cur_pos > thread_it->back().first)
+ cur_pos = thread_it->back().first;
+
+ auto found_it = std::lower_bound(thread_it->begin(), thread_it->end(),
+ make_pair(cur_pos, size_t(0)));
+
+ DEBUG("Searching for pos " << debug::PrintComplexPosition(cur_pos)
+ << "in thread of " << debug::PrintComplexRange(thread_it->front())
+ << " - " << debug::PrintComplexRange(thread_it->back()));
+ VERIFY(found_it != thread_it->end());
+ if (cur_pos == found_it->first) {
+ cur_pos = found_it->second;
+ continue;
+ }
+ VERIFY(found_it != thread_it->begin());
+
+ Range graph_range(0, 0);
+ Range genome_range(0, 0);
+
+ graph_range.end_pos = found_it->first;
+ genome_range.end_pos = found_it->second;
+ --found_it;
+ graph_range.start_pos = found_it->first;
+ genome_range.start_pos = found_it->second;
+
+ DEBUG("from ranges " << debug::PrintComplexRange(graph_range) <<
+ " and " << debug::PrintComplexRange(genome_range) <<
+ " in search of " << debug::PrintComplexPosition(cur_pos));
+ cur_pos = CalculatePos(graph_range, genome_range, cur_pos);
+ DEBUG("gettin' " << debug::PrintComplexPosition(cur_pos));
+ }
+
+ return cur_pos;
+}
+
+/*
+template<class Graph>
+size_t CoordinatesHandler<Graph>::GetNewestPos(
+ const uint genome_id, const size_t old_pos) const {
+ if (stored_threading_history_.size() == 0)
+ return old_pos;
+
+ const auto history_it = stored_threading_history_.find(genome_id);
+ VERIFY(history_it != stored_threading_history_.end());
+ const std::vector<Thread> &history = history_it->second;
+ const Thread &latest = history.back();
+
+ // K-mers can have different lengths, so going from larger k-mers to smaller
+ // ones shortens the thread, which may lead to a "range overflow"
+ size_t search_pos = old_pos;
+ if (search_pos > latest.back().second)
+ search_pos = latest.back().second;
+
+ auto found_it = std::lower_bound(latest.begin(), latest.end(),
+ make_pair(size_t(0), search_pos), utils::compare_pairs_reversed);
+
+ VERIFY(found_it != latest.end());
+ if (search_pos == found_it->second)
+ return found_it->first;
+
+ VERIFY(found_it != latest.begin());
+
+ Range graph_range(0, 0);
+ Range genome_range(0, 0);
+
+ graph_range.end_pos = found_it->second;
+ genome_range.end_pos = found_it->first;
+ --found_it;
+ graph_range.start_pos = found_it->second;
+ genome_range.start_pos = found_it->first;
+
+ DEBUG("from ranges " << debug::PrintComplexRange(graph_range) <<
+ " and " << debug::PrintComplexRange(genome_range) <<
+ " in search of " << debug::PrintComplexPosition(search_pos));
+ const size_t result_pos = CalculatePos(graph_range, genome_range, search_pos);
+ DEBUG("gettin' " << debug::PrintComplexPosition(result_pos));
+ return result_pos;
+}
+*/
+
+/*
+ * Some handling methods (description in omni_utils.hpp)
+ */
+template <class Graph>
+void CoordinatesHandler<Graph>::HandleDelete(EdgeId e) {
+ if (HasEdgeData(e)) {
+ INFO("edge " << g_->str(e) << " " << edge_ranges_[e].DebugOutput());
+ }
+ VERIFY(!HasEdgeData(e));
+ CleanEdgeData(e);
+}
+
+template <class Graph>
+void CoordinatesHandler<Graph>::HandleMerge(const vector<EdgeId> &old_edges, EdgeId new_edge) {
+ //DEBUG("HandleMerge : " << debug::Debug(g_, old_edges) << " -> " << g_->str(new_edge));
+ for (const auto &edge : old_edges) {
+ if (HasEdgeData(edge)) {
+ //DEBUG("edge " << g_->str(edge) << " " << edge_ranges_[edge].DebugOutput());
+ edge_ranges_[new_edge] += edge_ranges_[edge];
+ CleanEdgeData(edge);
+ }
+ }
+}
+
+template <class Graph>
+void CoordinatesHandler<Graph>::HandleGlue(EdgeId new_edge, EdgeId edge1, EdgeId edge2) {
+ //DEBUG("HandleGlue : " << g_->str(new_edge) << " <- " << g_->str(edge1) << " + " << g_->str(edge2));
+ if (HasEdgeData(edge1)) {
+ //DEBUG("edge " << g_->str(edge1) << " " << edge_ranges_[edge1].DebugOutput());
+ edge_ranges_[new_edge] += edge_ranges_[edge1];
+ CleanEdgeData(edge1);
+ }
+ if (HasEdgeData(edge2)) {
+ //DEBUG("edge " << g_->str(edge2) << " " << edge_ranges_[edge2].DebugOutput());
+ edge_ranges_[new_edge] += edge_ranges_[edge2];
+ CleanEdgeData(edge2);
+ }
+}
+
+template <class Graph>
+void CoordinatesHandler<Graph>::HandleSplit(EdgeId old_edge, EdgeId /* new_edge_1 */,
+ EdgeId /* new_edge_2 */) {
+ //DEBUG("HandleSplit " << g_->str(old_edge) << " -> " << g_->str(new_edge_1) << " + " << g_->str(new_edge_2));
+ VERIFY(!HasEdgeData(old_edge));
+ /*
+ const std::vector<std::pair<uint, Range> > old_ranges =
+ edge_ranges[old_edge].GetRanges();
+ for (const auto &range : old_ranges) {
+
+ }
+ */
+}
+
+
+}
diff --git a/src/projects/cap/deprecated/kmer_jumper.hpp b/src/projects/cap/deprecated/kmer_jumper.hpp
new file mode 100644
index 0000000..f067a19
--- /dev/null
+++ b/src/projects/cap/deprecated/kmer_jumper.hpp
@@ -0,0 +1,73 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+namespace cap {
+
+template <class TargetType>
+class KmerJumper {
+ public:
+ virtual void SetTransition(char symbol, TargetType *link) = 0;
+ virtual const TargetType &GetTransition(char symbol) const = 0;
+ virtual TargetType *GetTransitionLink(char symbol) const = 0;
+ virtual bool HasTransition(char symbol) const = 0;
+ virtual char Arity() const = 0;
+};
+
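+// Jump table with one transition slot per nucleotide (A, C, G, T).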
+template <class TargetType>
+class MultiKmerJumper : public KmerJumper<TargetType> {
+ TargetType *transitions[4];
+
+ public:
+ MultiKmerJumper() : transitions{NULL, NULL, NULL, NULL} {
+ }
+
+ inline void SetTransition(char symbol, TargetType *link) {
+ transitions[symbol] = link;
+ }
+ inline const TargetType &GetTransition(char symbol) const {
+ return *(transitions[symbol]);
+ }
+ inline TargetType *GetTransitionLink(char symbol) const {
+ return transitions[symbol];
+ }
+ inline bool HasTransition(char symbol) const {
+ return transitions[symbol] != NULL;
+ }
+ inline char Arity() const {
+ return 4;
+ }
+};
+
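+// Jump table holding a single transition; the symbol argument is ignored.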
+template <class TargetType>
+class SingleKmerJumper : public KmerJumper<TargetType> {
+ TargetType *transition;
+
+ public:
+ SingleKmerJumper() : transition(NULL) {
+ }
+
+ inline void SetTransition(char symbol, TargetType *link) {
+ transition = link;
+ }
+ inline const TargetType &GetTransition(char symbol) const {
+ return *transition;
+ }
+ inline TargetType *GetTransitionLink(char symbol) const {
+ return transition;
+ }
+ inline bool HasTransition(char symbol) const {
+ return transition != NULL;
+ }
+ inline char Arity() const {
+ return 1;
+ }
+
+};
+
+}
diff --git a/src/projects/cap/deprecated/longseq_storage.hpp b/src/projects/cap/deprecated/longseq_storage.hpp
new file mode 100644
index 0000000..edfdd1b
--- /dev/null
+++ b/src/projects/cap/deprecated/longseq_storage.hpp
@@ -0,0 +1,64 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "kmer_jumper.hpp"
+
+namespace cap {
+
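+// Deduplicating storage for long k-mers. MaintainJump was meant to track
+// transitions between colliding k-mers but was left unfinished upstream.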
+template <class Kmer, class hasher = typename Kmer::hash, class equal = typename Kmer::equal_to>
+class LongSeqStorage {
+ typedef KmerJumper<Kmer> JumperT;
+ typedef std::unordered_set<Kmer, hasher, equal> StorageT;
+ typedef std::unordered_map<Kmer, JumperT, hasher, equal> JumpMapT;
+
+ StorageT storage_;
+ JumpMapT jumper_map_;
+
+ // Left unfinished upstream: intended to record a jump between two colliding
+ // k-mers; the condition below is a minimal completion that keeps the original
+ // lookup, while the actual transition update remains commented out.
+ void MaintainJump(const Kmer &kmer1, const Kmer &kmer2) {
+ if (kmer1.GetNextNucl() != Kmer::kNoNextNucl &&
+ kmer1.GetNextNucl() != kmer2.GetNextNucl()) {
+ typename JumpMapT::iterator it = jumper_map_.find(kmer1);
+ if (it != jumper_map_.end()) {
+ //it->second.SetTransition(
+ }
+ }
+ }
+
+ public:
+ LongSeqStorage() : storage_() {
+ }
+ static LongSeqStorage<Kmer> &Instance() {
+ static LongSeqStorage<Kmer> instance;
+ return instance;
+ }
+
+ void Put(const Kmer &kmer) {
+ typename StorageT::iterator it = storage_.find(kmer);
+ if (it != storage_.end()) {
+ MaintainJump(*it, kmer);
+ } else {
+ storage_.insert(kmer);
+ }
+ }
+ void Replace(const Kmer &kmer) {
+ storage_.erase(kmer);
+ storage_.insert(kmer);
+ }
+ const Kmer &Get(const Kmer &kmer) const {
+ VERIFY(storage_.find(kmer) != storage_.end());
+ return *(storage_.find(kmer));
+ }
+ size_t size() const {
+ return storage_.size();
+ }
+ void clear() {
+ storage_.clear();
+ }
+};
+
+}
diff --git a/src/projects/cap/deprecated/tools_deprecated.cpp b/src/projects/cap/deprecated/tools_deprecated.cpp
new file mode 100644
index 0000000..63883cf
--- /dev/null
+++ b/src/projects/cap/deprecated/tools_deprecated.cpp
@@ -0,0 +1,468 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+
+//Gingi block
+
+//BOOST_AUTO_TEST_CASE( MaskDiffsForGingi ) {
+// MaskDifferencesAndSave(vector<string> {
+// "/home/snurk/Dropbox/gingi/jeff.fasta",
+// "/home/snurk/Dropbox/gingi/TDC60.fasta" }, vector<string> { "jeff",
+// "tdc60" }, "assembly_comp/gingi_diff_mask/", k<15>(), k<21>(),
+// k<55>()/*, k<101>(), k<201>()*/);
+//}
+
+//BOOST_AUTO_TEST_CASE( ClearGingiGenome ) {
+// Clear<201>("assembly_comp/gingi_diff_mask/tdc60.fasta",
+// "assembly_comp/gingi_diff_mask/tdc60_cl.fasta");
+//}
+//
+//BOOST_AUTO_TEST_CASE( ClearJeffAssembly ) {
+// Clear<201>("assembly_comp/gingi_diff_mask/jeff.fasta",
+// "assembly_comp/gingi_diff_mask/jeff_cl.fasta");
+//}
+// BOOST_AUTO_TEST_CASE
+// ( AssemblyRefComparison ) {
+// static const size_t K = 55;
+// typedef debruijn_graph::graph_pack<
+// /*Nonc*/debruijn_graph::ConjugateDeBruijnGraph, K> gp_t;
+// typedef gp_t::graph_t Graph;
+// typedef Graph::EdgeId EdgeId;
+// typedef Graph::VertexId VertexId;
+// typedef NewExtendedSequenceMapper<gp_t::k_value + 1, Graph> Mapper;
+
+// // EasyContigStream stream_1("/home/snurk/Dropbox/gingi/jeff.fasta");
+// // EasyContigStream stream_2("/home/snurk/Dropbox/gingi/TDC60.fasta");
+// // string ref = "/home/snurk/Dropbox/gingi/TDC60.fasta";
+// // EasyContigStream stream_1("assembly_comp/gingi_diff_mask/jeff_cl.fasta");
+// // EasyContigStream stream_2("assembly_comp/gingi_diff_mask/tdc60_cl.fasta");
+// // string ref = "assembly_comp/gingi_diff_mask/tdc60_cl.fasta";
+// EasyContigStream stream_1("/home/snurk/Dropbox/lab/mrsa/MRSA_RCH_S60.fasta",
+// "s60_");
+// EasyContigStream stream_2(
+// "/home/snurk/Dropbox/lab/mrsa/USA300_FPR3757.fasta", "usa300_");
+// // EasyContigStream stream_1("assembly_comp/gingi_diff_mask/jeff.fasta",
+// // "jeff_");
+// // EasyContigStream stream_2("assembly_comp/gingi_diff_mask/tdc60.fasta",
+// // "tdc_");
+
+// string ref = "/home/snurk/Dropbox/lab/mrsa/USA300_FPR3757.fasta";
+// // string ref = "assembly_comp/gingi_diff_mask/tdc60.fasta";
+// string output_folder = "assembly_comp/s60_usa300_" + ToString(K) + "/";
+// remove_dir(output_folder);
+// make_dir(output_folder);
+
+// int br_delta = -1;
+// gp_t gp(ReadGenome(ref), 200, true);
+// ColorHandler<Graph> coloring(gp.g);
+
+// vector<ContigStream*> streams = { &stream_1, &stream_2 };
+// ConstructColoredGraph(gp, coloring, streams, false, br_delta);
+
+// // INFO("Filling ref pos " << gp.genome.size());
+// // FillPos(gp_, gp_.genome, "ref_0");
+// // FillPos(gp_, !gp_.genome, "ref_1");
+
+// //Indels
+// // make_dir(output_folder + "indels/");
+// // SimpleInDelAnalyzer<Graph> del_analyzer(gp.g, coloring, gp.edge_pos,
+// // (*MapperInstance(gp)).MapSequence(gp.genome).simple_path(),
+// // edge_type::red, output_folder + "indels/");
+// // del_analyzer.Analyze();
+
+// //Alternating paths
+// // AlternatingPathsCounter<Graph> alt_count(gp_.g, coloring);
+// // alt_count.CountPaths();
+
+// //Block stats
+// // ContigBlockStats<Graph, Mapper> block_stats(gp_.g, gp_.edge_pos,
+// // *MapperInstance(gp_), gp_.genome, stream1_);
+// // block_stats.Count();
+
+// // Missing genes
+// // MissingGenesAnalyser<Graph, Mapper> missed_genes(gp.g, coloring,
+// // gp.edge_pos, gp.genome, *MapperInstance(gp),
+// // vector<pair<bool, pair<size_t, size_t>>> {
+// // make_pair(/*true*/false, make_pair(416000, 430000)),
+// // make_pair(/*true*/false, make_pair(1513000, 1518000)),
+// // make_pair(/*true*/false, make_pair(260354, 260644)),
+// // make_pair(/*true*/false, make_pair(300641, 300904)),
+// // make_pair(/*true*/false, make_pair(300904, 301920)),
+// // make_pair(/*true*/false, make_pair(301917, 302348)),
+// // make_pair(/*true*/false, make_pair(260354, 260644)),
+// // make_pair(/*true*/false, make_pair(300641, 300904)),
+// // make_pair(/*true*/false, make_pair(300904, 301920)),
+// // make_pair(/*true*/false, make_pair(301917, 302348)),
+// // make_pair(/*true*/false, make_pair(302449, 304752)),
+// // make_pair(/*true*/false, make_pair(263821, 264594)),
+// // make_pair(/*true*/false, make_pair(265025, 265726)),
+// // make_pair(/*true*/false, make_pair(265740, 266951))
+// // }
+// // , output_folder + "missed_genes/");
+// // missed_genes.Analyze();
+
+// // 2339834
+// ////////////
+// // WriteMagicLocality();
+// ////////////
+
+// //possible rearrangements
+// // string rearr_folder = output_folder + "rearrangements/";
+// // make_dir(rearr_folder);
+// // SimpleRearrangementDetector<gp_t> rearr_det(gp_, coloring_, "tdc_",
+// // rearr_folder);
+// // rearr_det.Detect();
+
+// //print graph
+// make_dir(output_folder + "initial_pics");
+// PrintColoredGraphAlongRef(gp, coloring,
+// output_folder + "initial_pics/colored_split_graph.dot");
+
+// //reference correction
+// SimpleInDelCorrector<Graph> corrector(gp.g, coloring,
+// (*MapperInstance(gp)).MapSequence(gp.genome).simple_path().sequence(), /*genome_color*/
+// kBlueColorSet, /*assembly_color*/kRedColorSet);
+// corrector.Analyze();
+
+// //trivial breakpoints
+// string bp_folder = output_folder + "breakpoints/";
+// make_dir(bp_folder);
+// TrivialBreakpointFinder<Graph> bp_finder(gp.g, coloring, gp.edge_pos);
+// bp_finder.FindBreakPoints(bp_folder);
+
+// //make saves
+// make_dir(output_folder + "saves");
+// string filename = output_folder + "saves/graph";
+// PrinterTraits<Graph>::Printer printer(gp.g);
+// INFO("Saving graph to " << filename);
+// printer.saveGraph(filename);
+// printer.saveEdgeSequences(filename);
+// printer.savePositions(filename, gp.edge_pos);
+// SaveColoring(gp.g, coloring, filename);
+// }
+
+//End of gingi block
+
+//BOOST_AUTO_TEST_CASE( RefVSAssemblyComparison ) {
+// static const size_t k = 55;
+// static const size_t K = 55;
+// Sequence ref = ReadGenome("/home/snurk/MRSA/USA300_FPR3757.fasta");
+// io::Reader contig_stream("/home/snurk/MRSA/MRSA_RCH_I56.fasta");
+// string folder = "mrsa_comp/RCH_I56/";
+// make_dir(folder);
+// RunBPComparison<k, K>(
+// ref,
+// contig_stream,
+// "ref",
+// "assembly",
+// true/*refine*/,
+// false/*untangle*/,
+// folder,
+// true/*detailed_output*/,
+// 20);
+//}
+
+//BOOST_AUTO_TEST_CASE( TwoAssemblyComparison ) {
+// static const size_t k = 19;
+// static const size_t K = 55;
+//// static const size_t K = 57;
+//// static const size_t K = 53;
+//
+//// io::Reader stream_1("/home/snurk/gingi/2.fasta");
+//// io::Reader stream_2("/home/snurk/gingi/3.fasta");
+//
+// io::Reader stream_1("/home/anton/idba_compare/idba.fasta");
+// io::Reader stream_2("/home/anton/idba_compare/hammer21_dis_tuned_simpl_try_improve.fasta");
+// string ref = "/home/anton/idba_compare/MG1655-K12.fasta";
+// string folder = "/home/anton/idba_compare/hammer21_dis_tuned_simpl_vs_idba/";
+//// string folder = "assembly_comp/gingi_new_3_vs_jeff/";
+// make_dir(folder);
+//
+// RunBPComparison<k, K>(
+// stream_1,
+// stream_2,
+// "idba_",
+// "k21ts_",
+// true/*refine*/,
+// false/*untangle*/,
+// folder,
+// true/*detailed_output*/,
+// 5/*delta*/,
+// ReadGenome(ref));
+//}
+
+//BOOST_AUTO_TEST_CASE( TwoAssemblyComparison ) {
+// static const size_t k = 19;
+// static const size_t K = 55;
+//// static const size_t K = 57;
+//// static const size_t K = 53;
+//
+//// io::Reader stream_1("/home/snurk/gingi/2.fasta");
+//// io::Reader stream_2("/home/snurk/gingi/3.fasta");
+//
+//// io::Reader stream_1("/home/snurk/gingi/PGINGIVALIS_LANE2_BH.fasta");
+//// io::Reader stream_1("/home/snurk/gingi/PGINGIVALIS_LANE3_BH.fasta");
+// io::Reader stream_2("/home/snurk/gingi/jeff.fasta");
+//
+//// io::Reader stream_2("/home/snurk/gingi/PGINGIVALIS_LANE2_BH.fasta");
+//
+// io::Reader stream_1("/home/snurk/gingi/PGINGIVALIS_LANE3_BH.fasta");
+//// io::Reader stream_2("/home/snurk/gingi/lane2_evsc.fasta");
+//
+//// string folder = "assembly_comp/gingi_new_3_vs_new_2/";
+// string folder = "assembly_comp/gingi_new_3_vs_jeff/";
+// make_dir(folder);
+//
+// RunBPComparison<k, K>(
+// stream_1,
+// stream_2,
+//// "2",
+//// "jeff",
+// "3_new_",
+//// "2_new_",
+// "jeff_",
+// true/*refine*/,
+// false/*untangle*/,
+// folder,
+// true/*detailed_output*/);
+//}
+
+//BOOST_AUTO_TEST_CASE( AssemblyRefComparison ) {
+// static const size_t k = 21;
+// static const size_t K = 201/*55*//*201*/;
+//// static const size_t K = 57;
+//// static const size_t K = 53;
+//
+//// io::Reader stream_1("/home/snurk/gingi/2.fasta");
+//// io::Reader stream_2("/home/snurk/gingi/3.fasta");
+//
+// io::Reader stream_1("/home/snurk/Dropbox/gingi/jeff.fasta");
+// io::Reader stream_2("/home/snurk/Dropbox/gingi/TDC60.fasta");
+// string ref = "/home/snurk/Dropbox/gingi/TDC60.fasta";
+//
+//// string folder = "assembly_comp/gingi_jeff_vs_tdc60_55/";
+// string folder = "assembly_comp/gingi_jeff_vs_tdc60_201/";
+// make_dir(folder);
+//
+// RunBPComparison<k, K>(
+// stream_1,
+// stream_2,
+// "jeff_",
+// "tdc_",
+// true/*refine*/,
+// false/*untangle*/,
+// folder,
+// true/*detailed_output*/,
+// 5/*delta*/,
+// ReadGenome(ref));
+//}
+
+//BOOST_AUTO_TEST_CASE( IDBA_vs_SPADES ) {
+// static const size_t k = 19;
+// static const size_t K = 55;
+//// static const size_t K = 57;
+//// static const size_t K = 53;
+//
+//// io::Reader stream_1("/home/snurk/gingi/2.fasta");
+//// io::Reader stream_2("/home/snurk/gingi/3.fasta");
+//
+// io::Reader stream_1("/home/snurk/idba_comp/idba-contig-100.fa");
+// io::Reader stream_2("/home/snurk/idba_comp/k21nodiscard.fasta");
+// string ref = "/home/snurk/idba_comp/MG1655-K12.fasta";
+//
+// string folder = "/home/snurk/idba_comp/results/";
+// make_dir(folder);
+//
+// RunBPComparison<k, K>(
+// stream_1,
+// stream_2,
+// "idba_",
+// "bh21_",
+// true/*refine*/,
+// false/*untangle*/,
+// folder,
+// true/*detailed_output*/,
+// 5/*delta*/,
+// ReadGenome(ref));
+//}
+
+
+
+/*
+BOOST_AUTO_TEST_CASE( TwoStrainComparisonWR ) {
+ INFO("Running comparison of two strains");
+
+ make_dir("bp_graph_test");
+
+ std::string base_dir = "/Users/valich/Dropbox/mrsa/";
+ std::string genome_path1 = "/smallnas/yana/X5-l-velvet-scaff.closed.fasta",
+ genome_path2 = "/smallnas/yana/X5_results/scaffolds.fasta";
+
+ pair<Sequence, Sequence> genomes = CorrectGenomes<55>(CorrectGenomes<21>(
+ ReadGenome(genome_path1),
+ ReadGenome(genome_path2)), 200);
+
+ INFO("Genomes ready");
+
+ CompareGenomes<77>(genomes.first, genomes.second, "bp_graph_test/two_strain_comp_wr/");
+ INFO("Finished");
+}
+*/
+// inline void StrainComparisonWOR(const string& strain_1, const string& strain_2, const string& output_folder) {
+// make_dir("bp_graph_test");
+// INFO("Running comparison of two strains");
+// pair<Sequence, Sequence> genomes = CorrectGenomes<55>(TotallyClearGenomes<55>(CorrectGenomes<21>(ReadGenome(strain_1)
+// , ReadGenome(strain_2))), 30);
+// // genomes = TotallyClearGenomes<701>(genomes);
+// VERIFY(CheckNoRepeats<301>(genomes.first));
+// VERIFY(CheckNoRepeats<301>(genomes.second));
+// INFO("Genomes ready");
+
+// CompareGenomes<701>(genomes.first, genomes.second, output_folder);
+// }
+
+// BOOST_AUTO_TEST_CASE( TwoStrainComparisonWOR ) {
+// string base_dir = "/Users/valich/Dropbox/mrsa/";
+// StrainComparisonWOR(base_dir + "/MRSA_RCH_I56.fasta"
+// , base_dir + "MRSA_RCH_S60.fasta", "bp_graph_test/two_strain_comp_wo_repeats/");
+// }
+
+//BOOST_AUTO_TEST_CASE( CompareAllMRSA ) {
+// string mrsa_root = "/home/snurk/MRSA/more_strains/";
+// ifstream stream;
+// stream.open(mrsa_root + "list.txt");
+// string s1;
+// string s2;
+// VERIFY(!stream.eof());
+// stream >> s1;
+// while (!stream.eof()) {
+// stream >> s2;
+// StrainComparisonWOR(mrsa_root + s1 + ".fasta", mrsa_root + s2 + ".fasta"
+// , mrsa_root + "results/" + s1 + "_vs_" + s2 + "/");
+// }
+// stream.close();
+//}
+
+//
+//BOOST_AUTO_TEST_CASE( TwoStrainComparisonFirstWOR ) {
+// make_dir("bp_graph_test");
+// INFO("Running comparison of two strains");
+// pair<Sequence, Sequence> genomes = CorrectGenomes<21>(ReadGenome("data/input/E.coli/MG1655-K12.fasta.gz")
+// , ReadGenome("data/input/E.coli/DH10B-K12.fasta"));
+// genomes = CorrectGenomes<55>(genomes, 200);
+// genomes.first = ClearGenome<55>(genomes.first);
+// genomes = CorrectGenomes<55>(genomes, 200);
+// VERIFY(CheckNoRepeats<301>(genomes.first));
+// INFO("Genomes ready");
+//
+// CompareGenomes<701>(genomes.first, genomes.second, "bp_graph_test/two_strain_comp_first_wo_repeats/");
+//}
+
+//BOOST_AUTO_TEST_CASE( StrainVSRepeatGraphComparison ) {
+// static const size_t repeat_clearing_k = 55;
+// static const size_t repeat_graph_k = 101;
+// static const size_t refining_k1 = 25;
+// static const size_t refining_k2 = 151;
+// static const size_t bp_k = 301;
+//
+// make_dir("bp_graph_test");
+// INFO("Running comparison of strain vs repeat graph for other strain");
+// Sequence genome1 = ReadGenome("data/input/E.coli/MG1655-K12.fasta.gz");
+// Sequence genome2 = ReadGenome("data/input/E.coli/DH10B-K12.fasta");
+//
+// typedef graph_pack<ConjugateDeBruijnGraph, repeat_graph_k> repeat_gp_t;
+// vector<Sequence> repeat_graph_edges = RepeatGraphEdges<repeat_gp_t>(genome2);
+//
+// pair<Sequence, vector<Sequence>> refined_data1 = RefineData<refining_k2>(RefineData<refining_k1>(
+// make_pair(genome1, repeat_graph_edges)));
+//
+// pair<Sequence, vector<Sequence>> cleared = Clear<repeat_clearing_k>(refined_data1.first, refined_data1.second);
+//
+// pair<Sequence, vector<Sequence>> refined_data2 = RefineData<bp_k>(RefineData<refining_k2>(RefineData<refining_k1>(cleared)));
+//
+// io::VectorReader<io::SingleRead> final_stream1(
+// io::SingleRead("first", refined_data2.first.str()));
+// io::VectorReader<io::SingleRead> final_stream2(MakeReads(refined_data2.second));
+//
+// typedef graph_pack<ConjugateDeBruijnGraph, bp_k> comparing_gp_t;
+// INFO("Running assembly comparer");
+// AssemblyComparer<comparing_gp_t> comparer(final_stream1, final_stream2, "strain1", "strain2", /*untangle*/false);
+// comparer.CompareAssemblies("bp_graph_test/strain_vs_repeat_graph_comp/", /*detailed_output*/true);
+// INFO("Finished");
+//}
+//
+//BOOST_AUTO_TEST_CASE( StrainVSRepeatGraphComparison2 ) {
+// static const size_t repeat_clearing_k = 55;
+// static const size_t repeat_graph_k = 201;
+// static const size_t refining_k1 = 25;
+// static const size_t refining_k2 = 151;
+// static const size_t bp_k = 201;
+//
+// make_dir("bp_graph_test");
+// INFO("Running comparison of strain vs repeat graph for other strain");
+// Sequence genome1 = ReadGenome("data/input/E.coli/MG1655-K12.fasta.gz");
+// Sequence genome2 = ReadGenome("data/input/E.coli/DH10B-K12.fasta");
+//
+// typedef graph_pack<ConjugateDeBruijnGraph, repeat_graph_k> repeat_gp_t;
+// vector<Sequence> repeat_graph_edges = RepeatGraphEdges<repeat_gp_t>(genome2);
+//
+// pair<Sequence, vector<Sequence>> refined_data1 = RefineData<refining_k2>(RefineData<refining_k1>(
+// make_pair(genome1, repeat_graph_edges)));
+//
+//// pair<Sequence, vector<Sequence>> cleared = Clear<repeat_clearing_k>(refined_data1.first, refined_data1.second);
+// Sequence cleared = ClearGenome<repeat_clearing_k>(refined_data1.first);
+//
+// pair<Sequence, vector<Sequence>> refined_data2 = RefineData<bp_k>(RefineData<refining_k2>(RefineData<refining_k1>(make_pair(cleared, refined_data1.second))));
+//
+// io::VectorReader<io::SingleRead> final_stream1(
+// io::SingleRead("first", refined_data2.first.str()));
+// io::VectorReader<io::SingleRead> final_stream2(MakeReads(refined_data2.second));
+//
+// typedef graph_pack<ConjugateDeBruijnGraph, bp_k> comparing_gp_t;
+// INFO("Running assembly comparer");
+// AssemblyComparer<comparing_gp_t> comparer(final_stream1, final_stream2, "strain1", "strain2", /*untangle*/false);
+// comparer.CompareAssemblies("bp_graph_test/strain_vs_repeat_graph_comp2/", /*detailed_output*/true);
+// INFO("Finished");
+//}
+
+//BOOST_AUTO_TEST_CASE( ThreadingContigsOverGraph ) {
+// typedef graph_pack<ConjugateDeBruijnGraph, 55> gp_t;
+// io::EasyReader base_contigs("/home/anton/gitrep/algorithmic-biology/assembler/data/tmp/andrew_nurk.fasta");
+// io::EasyReader other_contigs("/home/anton/gitrep/algorithmic-biology/assembler/data/tmp/velvet-sc.fasta");
+// string base_saves = "/home/anton/gitrep/algorithmic-biology/assembler/data/debruijn/LBOUILLONII_QUAKE/saves/simplified_graph";
+// string output_dir = "bul_comparison/ac";
+// make_dir(output_dir);
+// ThreadAssemblies<gp_t>(base_saves, base_contigs, "spades", other_contigs, "velvet", output_dir);
+//}
+
+//BOOST_AUTO_TEST_CASE( GenerateGraphFragment ) {
+// std::string input_path = "./data/debruijn/HMP_LANE_3_0/K55/latest/saves/simplified_graph";
+// std::string output_path = "./src/test/debruijn/graph_fragments/topology_ec/iter_unique_path";
+// size_t split_threshold = 230;
+// int int_edge_id = 5573881;
+// graph_pack<ConjugateDeBruijnGraph, 55> gp;
+// ScanGraphPack(input_path, gp);
+// //prints only basic graph structure
+// PrintGraphComponentContainingEdge(output_path, gp.g,
+// split_threshold, int_edge_id);
+//
+// //long way to write to dot file
+// Graph g(55);
+// ScanBasicGraph(output_path, g);
+// total_labeler_graph_struct graph_struct(g, (const EdgesPositionHandler<Graph>*)0);
+// total_labeler tot_lab(&graph_struct);
+// WriteToDotFile(g,
+// tot_lab, output_path + ".dot",
+// "mygraph", Path<EdgeId>(), Path<EdgeId>());
+//}
+
+/*
+BOOST_AUTO_TEST_CASE( GapComparativeAnalysis ) {
+ std::string strain1 = "/smallnas/yana/X5-l-velvet-scaff.closed.fasta",
+ strain2 = "/smallnas/yana/X5_results/scaffolds_fcb.fasta";
+}
+*/
diff --git a/src/projects/cap/diff_masking.hpp b/src/projects/cap/diff_masking.hpp
new file mode 100644
index 0000000..b4027be
--- /dev/null
+++ b/src/projects/cap/diff_masking.hpp
@@ -0,0 +1,335 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "io/reads_io/read_stream_vector.hpp"
+#include "algorithms/graph_construction.hpp"
+#include "stages/simplification_pipeline/graph_simplification.hpp"
+#include "algorithms/graph_read_correction.hpp"
+#include "test_utils.hpp"
+
+#include "coloring.hpp"
+#include "colored_graph_construction.hpp"
+#include "gene_analysis.hpp"
+#include "repeat_masking.hpp"
+#include "visualization.hpp"
+
+//todo deprecated
+namespace cap {
+
+inline void SaveAll(ContigStreams& streams, const vector<string>& suffixes,
+ const string& out_root) {
+ make_dir(out_root);
+
+ streams.reset();
+ for (size_t i = 0; i < streams.size(); ++i) {
+ if (!suffixes[i].empty()) {
+ string output_filename = out_root + suffixes[i];
+ auto rc_wrapped = io::RCWrap<Contig>(streams.ptr_at(i));
+ io::osequencestream ostream(output_filename);
+ Transfer(*rc_wrapped, ostream);
+ }
+ }
+}
+
+//todo changes the graph!!! color edge splitting!!!
+template<class gp_t>
+void MakeSaves(gp_t& gp, ContigStreams streams, const string& root,
+ const vector<string>& suffixes, bool optional = true) {
+
+ SaveAll(streams, suffixes, root);
+
+ static bool make_optional_saves = true;
+
+ if (!make_optional_saves && optional)
+ return;
+
+ make_dir(root);
+
+ streams.reset();
+
+ ColorHandler<Graph> coloring(gp.g, streams.size());
+ CoordinatesHandler<Graph> coordinates_handler;
+ SplitAndColorGraph(gp, coloring, streams);
+ FillPositions(gp, streams, coordinates_handler);
+
+ PrintColoredGraphWithColorFilter(gp.g, coloring, gp.edge_pos,
+ root + "colored_split_graph");
+}
+
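+// Simplifies the graph pack so that contigs from different genomes thread
+// through a common refined graph: removes (complex) bulges and clips tips
+// with projection.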
+template<class gp_t>
+void RefineGP(gp_t& gp, size_t delta = 5) {
+ using namespace debruijn_graph;
+ INFO("Constructing graph pack for refinement");
+
+ //outdated settings below are kept for reference only;
+ //RemoveBulges receives a default-constructed config
+ debruijn_config::simplification::bulge_remover br_config;
+ //br_config.max_bulge_length_coefficient = 3;
+ //br_config.max_coverage = 1000.;
+ //br_config.max_relative_coverage = 1.2;
+ //br_config.max_delta = double(delta);
+ //br_config.max_relative_delta = 0.1;
+
+ INFO("Removing bulges");
+ debruijn::simplification::RemoveBulges(gp.g, br_config);
+
+ INFO("Remapped " << gp.kmer_mapper.size() << " k-mers");
+
+ debruijn_config::simplification::complex_bulge_remover cbr_config;
+ cbr_config.enabled = true;
+ cbr_config.pics_enabled = false;
+ cbr_config.folder = "";
+ cbr_config.max_relative_length = 3;
+ cbr_config.max_length_difference = delta;
+
+ INFO("Removing complex bulges");
+ debruijn::simplification::RemoveComplexBulges(gp.g, cbr_config);
+
+ TipsProjector<gp_t> tip_projector(gp);
+ boost::function<void(EdgeId)> projecting_callback = boost::bind(
+ &TipsProjector<gp_t>::ProjectTip, &tip_projector, _1);
+ debruijn_config::simplification::tip_clipper tc_config;
+
+ tc_config.condition = "{ tc_lb 2. }";
+
+ INFO("Clipping tips with projection");
+
+ debruijn::simplification::SimplifInfoContainer info_container;
+
+ debruijn::simplification::ClipTipsWithProjection(gp, tc_config, info_container);
+
+ INFO("Remapped " << gp.kmer_mapper.size() << " k-mers");
+}
+
+template<class gp_t>
+void ConstructGPForRefinement(gp_t& gp, ContigStreams& contigs,
+ size_t delta = 5) {
+ using namespace debruijn_graph;
+ typedef typename gp_t::graph_t Graph;
+ INFO("Constructing graph pack for refinement");
+
+ CapConstructGraph(gp.k_value, contigs, gp.g, gp.index);
+
+ RefineGP(gp, delta);
+}
+
+template<class gp_t>
+ContigStreams RefinedStreams(ContigStreams& streams, const gp_t& gp) {
+ ContigStreams refined_streams;
+ for (size_t i = 0; i < streams.size(); ++i) {
+ refined_streams.push_back(
+ make_shared<io::ModifyingWrapper<Contig>>(
+ streams.ptr_at(i),
+ GraphReadCorrectorInstance(gp.g, *MapperInstance(gp))));
+ }
+ return refined_streams;
+}
+
+template<class Seq>
+ContigStreams RefineStreams(ContigStreams& streams,
+ size_t k,
+ size_t delta = 5,
+ const std::string &workdir = "tmp") {
+ typedef debruijn_graph::KmerStoringEdgeIndex<Graph, Seq, kmer_index_traits<runtime_k::RtSeq>, debruijn_graph::SimpleStoring> RefiningIndex;
+ typedef graph_pack<ConjugateDeBruijnGraph, Seq, RefiningIndex> refining_gp_t;
+ refining_gp_t gp(k, workdir);
+
+ CapConstructGraph(gp.k_value, streams, gp.g, gp.index);
+
+ RefineGP(gp, delta);
+
+ return RefinedStreams(streams, gp);
+
+}
+
+
+template<class Seq>
+void RefineData(const string& base_path,
+ const vector<string>& suffixes,
+ const string& out_root,
+ size_t k,
+ size_t delta = 5,
+ const std::string &workdir = "tmp") {
+ ContigStreams streams = OpenStreams(base_path, suffixes, true);
+ ContigStreams refined = RefineStreams<Seq>(streams, k, delta, workdir);
+ SaveAll(refined, suffixes, out_root);
+}
+
+//template<class gp_t>
+//void ConstructGPForRefinement(gp_t& gp,
+// io::IReader<io::SingleRead>& raw_stream_1,
+// io::IReader<io::SingleRead>& raw_stream_2, size_t delta = 5) {
+// ContigStreamsPtr streams_ptr = make_shared<ContigStreams>(
+// vector<ContigStream*> { &raw_stream_1, &raw_stream_2 }, false);
+// ConstructGPForRefinement(gp, streams_ptr, delta);
+//}
+
+//template<size_t k, class Seq>
+//pair<Sequence, Sequence> CorrectGenomes(const Sequence& genome1,
+// const Sequence& genome2, size_t delta = 5) {
+// io::VectorReader<io::SingleRead> stream1(
+// io::SingleRead("first", genome1.str()));
+// io::VectorReader<io::SingleRead> stream2(
+// io::SingleRead("second", genome2.str()));
+//
+// typedef debruijn_graph::graph_pack<debruijn_graph::ConjugateDeBruijnGraph,
+// Seq> refining_gp_t;
+// refining_gp_t refining_gp(k, "tmp");
+// ConstructGPForRefinement(refining_gp, stream1, stream2, delta);
+//
+// io::ModifyingWrapper<io::SingleRead> refined_stream1(stream1,
+// GraphReadCorrectorInstance(refining_gp.g,
+// *MapperInstance(refining_gp)));
+// io::ModifyingWrapper<io::SingleRead> refined_stream2(stream2,
+// GraphReadCorrectorInstance(refining_gp.g,
+// *MapperInstance(refining_gp)));
+//
+// pair<Sequence, Sequence> answer = make_pair(FirstSequence(refined_stream1),
+// FirstSequence(refined_stream2));
+// return answer;
+//}
+
+//template<size_t k>
+//pair<Sequence, Sequence> CorrectGenomes(const pair<Sequence, Sequence>& genomes,
+// size_t delta = 5) {
+// return CorrectGenomes<k>(genomes.first, genomes.second, delta);
+//}
+
+//template<size_t k, class Seq>
+//pair<Sequence, vector<Sequence>> RefineData(
+// const pair<Sequence, vector<Sequence>>& data) {
+// io::VectorReader<io::SingleRead> stream1(
+// io::SingleRead("first", data.first.str()));
+// io::VectorReader<io::SingleRead> stream2(MakeReads(data.second));
+//
+// typedef graph_pack<ConjugateDeBruijnGraph, Seq> refining_gp_t;
+// refining_gp_t refining_gp(k, "tmp");
+// ConstructGPForRefinement(refining_gp, stream1, stream2);
+//
+// io::ModifyingWrapper<io::SingleRead> refined_stream1(stream1,
+// GraphReadCorrectorInstance(refining_gp.g,
+// *MapperInstance(refining_gp)));
+// io::ModifyingWrapper<io::SingleRead> refined_stream2(stream2,
+// GraphReadCorrectorInstance(refining_gp.g,
+// *MapperInstance(refining_gp)));
+//
+// return make_pair(FirstSequence(refined_stream1),
+// AllSequences(refined_stream2));
+//}
+
+template<class Seq>
+void PerformRefinement(ContigStreams& streams, const string& root,
+ const vector<string>& suffixes, size_t k, const string& gene_root,
+ GeneCollection& gene_collection) {
+ VERIFY(streams.size() == suffixes.size());
+
+ const size_t delta = std::max(size_t(5), k /*/ 5*/);
+ typedef graph_pack<ConjugateDeBruijnGraph, Seq, KmerStoringEdgeIndex<Graph, Seq, kmer_index_traits<Seq>, SimpleStoring>> gp_t;
+ typedef NewExtendedSequenceMapper<Graph, typename gp_t::index_t> Mapper;
+
+ make_dir(root);
+ INFO("Constructing graph pack for k=" << k << " delta=" << delta);
+ gp_t gp(unsigned(k), "tmp", 0);
+
+ CapConstructGraph(streams, gp.g, gp.index);
+
+ MakeSaves(gp, streams, root + "before_refinement/", suffixes);
+
+ RefineGP(gp, delta);
+
+ ContigStreams refined_streams = RefinedStreams(streams, gp);
+
+ MakeSaves(gp, refined_streams, root + "after_refinement/", suffixes);
+
+ //todo temporary
+ if (!gene_root.empty()) {
+ gene_collection.Update(gp);
+ ColorHandler<typename gp_t::graph_t> coloring(gp.g);
+ string gene_save_dir = root + "updated_gene_info/";
+ make_dir(gene_save_dir);
+ gene_collection.Save(gene_save_dir, "genomes/", "gene_info.txt");
+ string gene_pics_dir = gene_save_dir + "pics/";
+ make_dir(gene_pics_dir);
+// WriteGeneLocality(gene_collection, gp, gene_pics_dir, coloring);
+ }
+ //end temporary
+}
+
+inline void PerformIterativeRefinement(ContigStreams& streams,
+ const vector<string>& suffixes, const string& out_root,
+ vector<size_t> &k_values, const string& gene_root,
+ GeneCollection& gene_collection) {
+
+ if (k_values.size() == 0) {
+ SaveAll(streams, suffixes, out_root + "final_contigs/");
+ return;
+ }
+
+ size_t current_k = k_values.back();
+ k_values.pop_back();
+
+ string root = out_root + ToString(current_k) + "/";
+
+ if (utils::NeedToUseLongSeq(current_k)) {
+ omp_set_num_threads(1);
+ PerformRefinement<LSeq>(streams, root, suffixes, current_k, gene_root,
+ gene_collection);
+ } else {
+ omp_set_num_threads(8);
+ PerformRefinement<runtime_k::RtSeq>(streams, root, suffixes, current_k,
+ gene_root, gene_collection);
+ }
+
+ ContigStreams corr_streams = OpenStreams(root + "after_refinement/",
+ suffixes, true);
+ //recursive call
+ GeneCollection updated_collection;
+
+ if (!gene_root.empty()) {
+ updated_collection.Load(gene_root + "genome_list.txt",
+ root + "updated_gene_info/genomes/",
+ root + "updated_gene_info/gene_info.txt",
+ gene_root + "interesting_orthologs.txt");
+ }
+ PerformIterativeRefinement(corr_streams, suffixes, out_root, k_values,
+ gene_root, updated_collection);
+
+}
+
+inline void PerformIterativeRefinement(const string& base_path,
+ const vector<string>& suffixes, const string& out_root,
+ vector<size_t>& k_values, bool /* gene_analysis */= false) {
+// remove_dir(out_root);
+ utils::MakeDirPath(out_root);
+ ContigStreams streams = OpenStreams(base_path, suffixes, true);
+
+ //stab
+ GeneCollection gene_collection;
+ PerformIterativeRefinement(streams, suffixes, out_root, k_values, "", gene_collection);
+}
+
+//todo temporary
+inline void PerformIterativeGeneAnalysis(const string& base_path,
+ const string& out_root,
+ vector<size_t>& k_values) {
+
+ GeneCollection gene_collection;
+ gene_collection.Load(base_path + "genome_list.txt", base_path + "/genomes/",
+ base_path + "gene_info.txt",
+ base_path + "interesting_orthologs.txt");
+ ContigStreams streams;
+ vector<string> suffixes;
+ for (auto it = gene_collection.genomes.begin(); it != gene_collection.genomes.end(); ++it) {
+ streams.push_back(make_shared<io::VectorReadStream<Contig>>(Contig(it->second.name, it->second.sequence.str())));
+ suffixes.push_back(it->second.name);
+ }
+ PerformIterativeRefinement(streams, suffixes, out_root, k_values, base_path,
+ gene_collection);
+}
+
+}
diff --git a/src/projects/cap/gene_analysis.hpp b/src/projects/cap/gene_analysis.hpp
new file mode 100644
index 0000000..a174024
--- /dev/null
+++ b/src/projects/cap/gene_analysis.hpp
@@ -0,0 +1,353 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "dev_support/standard_base.hpp"
+#include "dev_support/simple_tools.hpp"
+#include "comparison_utils.hpp"
+#include "boost/tokenizer.hpp"
+#include "coloring.hpp"
+
+//todo deprecated
+namespace cap {
+using namespace omnigraph;
+
+typedef Sequence Genome;
+typedef map<Genome, Range> GeneCoordinates;
+
+//range of nucleotide positions on the main strand; the bool is true if the range lies on the main strand
+typedef pair<Range, bool> Pos;
+typedef size_t GenomeId;
+typedef size_t GeneId;
+
+typedef multimap<GenomeId, Pos> GenePositions;
+typedef vector<Range> Coordinates;
+
+struct GenomeInfo {
+ GenomeId id;
+ string name;
+ string short_name;
+ Sequence sequence;
+
+ GenomeInfo()
+ : id(0) {
+
+ }
+
+ GenomeInfo(const GenomeId& id_, const string& name_,
+ const string& short_name_, const Sequence& sequence_)
+ : id(id_),
+ name(name_),
+ short_name(short_name_),
+ sequence(sequence_) {
+ }
+
+};
+
+struct GeneInfo {
+ GeneId id;
+// string name;
+ GenePositions gene_positions;
+
+ GeneInfo()
+ : id(0) {
+ }
+
+ GeneInfo(const GeneId& id_/*, const string& name_*/)
+ : id(id_)/*, name(name_)*/{
+ }
+
+ void AddPosition(const GenomeId& genome_id, const Pos& pos) {
+ gene_positions.insert(make_pair(genome_id, pos));
+ }
+
+};
+
+Range VerifiedRange(Range r, size_t k, size_t genome_length) {
+ VERIFY(genome_length > k + 1);
+ VERIFY(r.start_pos < genome_length);
+ VERIFY(r.end_pos <= genome_length);
+ VERIFY(r.start_pos <= r.end_pos);
+ return r;
+}
+
+//ALL k-mers lying in the nucl range
+Range NuclToKmerRange(Range nucl_range, size_t k, size_t genome_length) {
+// return Range(std::max(0, int(nucl_range.start_pos) - int(k)),
+// nucl_range.end_pos);
+
+ size_t start_pos =
+ (nucl_range.start_pos + k + 1 > genome_length) ?
+ genome_length - k - 1 : nucl_range.start_pos;
+ size_t end_pos = std::max((int) nucl_range.end_pos - (int) k,
+ int(start_pos + 1));
+
+ return VerifiedRange(Range(start_pos, end_pos), k, genome_length);
+}
+
+//ALL nucls covered by the k-mer range
+Range KmerToNuclRange(Range kmer_range, size_t k, size_t genome_length) {
+// return Range(min(kmer_range.start_pos + k, kmer_range.end_pos - 1),
+// kmer_range.end_pos);
+ return VerifiedRange(Range(kmer_range.start_pos, kmer_range.end_pos + k + 1),
+ k, genome_length);
+}
+
+//todo fix
+Range OppositeStrandNuclCoord(Range nucl_coord, size_t genome_length) {
+ VERIFY(nucl_coord.end_pos <= genome_length);
+ return Range(genome_length - 1 - nucl_coord.end_pos,
+ genome_length - 1 - nucl_coord.start_pos);
+}
+
+//Updates k-mer coordinates to the coordinates of start/end of condensed edge path in simplified graph
+// (gene is somewhere inside it)
+template<class gp_t>
+class CoordinatesUpdater {
+ typedef typename gp_t::graph_t Graph;
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+
+ const gp_t& gp_;
+
+ //Works with and returns k-mer coordinates!
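+ //Walks the mapping path accumulating condensed-edge lengths: the new start is
+ //the cumulative length up to the edge whose initial range covers coord.start_pos;
+ //the new end is the cumulative length through the edge covering coord.end_pos.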
+ Range NewCoords(const MappingPath<EdgeId>& path, Range coord) const {
+ size_t cumm_length = 0;
+ Range answer(0, 0);
+ size_t i = 0;
+ for (;
+ i < path.size()
+ && path[i].second.initial_range.end_pos <= coord.start_pos; ++i) {
+ cumm_length += gp_.g.length(path[i].first);
+ }
+ VERIFY(i < path.size());
+ VERIFY(path[i].second.initial_range.end_pos > coord.start_pos);
+ VERIFY(path[i].second.initial_range.start_pos <= coord.start_pos);
+ answer.start_pos = cumm_length;
+ for (;
+ i < path.size() && path[i].second.initial_range.end_pos < coord.end_pos;
+ ++i) {
+ cumm_length += gp_.g.length(path[i].first);
+ }
+ VERIFY(i < path.size());
+ VERIFY(path[i].second.initial_range.end_pos >= coord.end_pos);
+ VERIFY(path[i].second.initial_range.start_pos < coord.end_pos);
+ answer.end_pos = cumm_length + gp_.g.length(path[i].first);
+ return answer;
+ }
+
+ public:
+ CoordinatesUpdater(const gp_t& gp)
+ : gp_(gp) {
+
+ }
+
+ pair<Genome, Coordinates> Update(const Genome& genome,
+ const Coordinates& coords) const {
+ pair<Genome, Coordinates> answer;
+ auto mapper = *MapperInstance(gp_);
+ auto mapping_path = mapper.MapSequence(genome);
+ for (Range r : coords) {
+ answer.second.push_back(
+ KmerToNuclRange(
+ NewCoords(
+ mapping_path,
+ NuclToKmerRange(r, gp_.k_value,
+ genome.size())),
+ gp_.k_value, genome.size()));
+ }
+ auto read_corrector = GraphReadCorrectorInstance(gp_.g, mapper);
+ answer.first = read_corrector->Modify(genome);
+ return answer;
+ }
+ private:
+ DECL_LOGGER("CoordinatesUpdater")
+ ;
+};
+
+//deals with nucleotide coordinates!
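+//Holds genomes and genes of interest with per-genome gene positions. Load()
+//reads a genome list, the genome sequences and a tab-separated gene info table
+//filtered by a set of (ortholog) ids; Update() remaps gene positions through a
+//graph pack via CoordinatesUpdater and replaces the genome sequences with their
+//read-corrected versions.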
+struct GeneCollection {
+ map<GenomeId, GenomeInfo> genomes;
+ map<GeneId, GeneInfo> genes;
+
+ map<string, GenomeId> genome_name_id_mapping;
+ private:
+
+ void LoadGenomes(const set<string>& genome_names,
+ const string& genomes_folder) {
+ size_t id = 0;
+ for (string name : genome_names) {
+ string filename = genomes_folder + name;
+ path::CheckFileExistenceFATAL(filename);
+ genomes.insert(
+ make_pair(
+ id,
+ GenomeInfo(id, name, name, ReadGenome(filename))));
+ genome_name_id_mapping.insert(make_pair(name, id));
+ id++;
+ }
+}
+
+GenomeId genome_id(const string& name) const {
+ return get(genome_name_id_mapping, name);
+}
+
+void LoadGenomes(const string& file_with_genomes,
+ const string& genomes_folder) {
+ path::CheckFileExistenceFATAL(file_with_genomes);
+ ifstream stream(file_with_genomes);
+ set<string> genome_names;
+ string name;
+ while (stream >> name) {
+ genome_names.insert(name);
+ }
+ LoadGenomes(genome_names, genomes_folder);
+}
+
+void SaveGenomes(const string& /* folder */) const {
+ for (auto it = genomes.begin(); it != genomes.end(); ++it) {
+ WriteGenome(it->second.sequence, it->second.name);
+ }
+}
+
+void SaveGeneInfo(const string& filename) const {
+ ofstream stream(filename);
+ for (auto it1 = genes.begin(); it1 != genes.end(); ++it1) {
+ for (auto it = it1->second.gene_positions.begin(); it != it1->second.gene_positions.end(); ++it) {
+ stream << (boost::format("%i\t%s\t%i\t%i\t%s\n") % it1->second.id
+ % get(genomes, it->first).name
+ % it->second.first.start_pos
+ % it->second.first.end_pos
+ % it->second.second).str();
+ }
+ }
+}
+
+set<int> LoadGeneIDs(const string& file_with_ids) {
+ path::CheckFileExistenceFATAL(file_with_ids);
+ ifstream stream(file_with_ids);
+ set<int> gene_ids;
+ int id;
+ while (stream >> id) {
+ gene_ids.insert(id);
+ }
+ return gene_ids;
+}
+
+void AddGeneInfo(const GeneId& gene_id, const GenomeId& genome_id, const Range& range, bool forward) {
+ if (genes.count(gene_id) == 0) {
+ genes.insert(make_pair(gene_id, GeneInfo(gene_id)));
+ }
+ get(genes, gene_id).AddPosition(genome_id, Pos(range, forward));
+}
+
+//"ortholog ids" would be better wording
+void LoadGeneInfo(const string& filename, set<int> gene_ids) {
+ using boost::tokenizer;
+ using boost::escaped_list_separator;
+ path::CheckFileExistenceFATAL(filename);
+ ifstream stream(filename);
+ string line;
+ while (stream >> line) {
+ tokenizer<escaped_list_separator<char>> tk(
+ line, escaped_list_separator<char>('\t'));
+ vector<string> record(tk.begin(), tk.end());
+ //0 - id
+ //1 - genome name
+ //2 - start
+ //3 - end
+ //4 - forward/reverse
+ VERIFY(record.size() == 8);
+ VERIFY(record[4] == "reverse" || record[4] == "forward");
+ int gene_id = lexical_cast<int>(record[0]);
+ if (gene_ids.count(gene_id) > 0) {
+ AddGeneInfo(gene_id, genome_id(record[1]),
+ Range(lexical_cast<size_t>(record[2]), lexical_cast<size_t>(record[3]))
+ , record[4] == "forward");
+ }
+ }
+}
+
+public:
+
+GeneCollection() {}
+
+//"ortholog ids" would be better wording
+void Load(const string& file_with_genomes, const string& genomes_folder,
+ const string& file_with_gene_info, const string& file_with_ids) {
+ LoadGenomes(file_with_genomes, genomes_folder);
+ LoadGeneInfo(file_with_gene_info, LoadGeneIDs(file_with_ids));
+}
+
+void Save(const string& root_folder, const string& genomes_folder,
+ const string& file_with_gene_info) const {
+ SaveGenomes(root_folder + genomes_folder);
+ SaveGeneInfo(root_folder + file_with_gene_info);
+}
+
+template<class gp_t>
+void Update(const gp_t& gp) {
+ CoordinatesUpdater<gp_t> updater(gp);
+ for (GenomeId genome_id : key_set(genomes)) {
+ Coordinates gene_coords;
+ vector<GeneId> gene_ids;
+ for (GeneId gene_id : key_set(genes)) {
+ for (Pos pos: get_all(genes[gene_id].gene_positions, genome_id)) {
+ gene_ids.push_back(gene_id);
+ VERIFY(pos.second);
+ gene_coords.push_back(pos.first);
+ }
+ }
+ auto updated = updater.Update(genomes.find(genome_id)->second.sequence, gene_coords);
+ genomes.find(genome_id)->second.sequence = updated.first;
+
+ //clearing gene positions
+ for (GeneId gene_id : key_set(genes)) {
+ genes[gene_id].gene_positions.clear();
+ }
+
+ //updating gene positions
+ for (size_t j = 0; j < gene_ids.size(); ++j) {
+ genes[gene_ids[j]].gene_positions.insert(
+ make_pair(genome_id,
+ make_pair(updated.second[j], true)));
+ }
+ }
+}
+
+private:
+DECL_LOGGER("GeneCollection")
+;
+};
+
+//template<class gp_t>
+//void WriteGeneLocality(const GeneCollection& gene_collection, const gp_t& gp,
+// const string& folder,
+// const ColorHandler<typename gp_t::graph_t>& coloring) {
+// for (auto it = gene_collection.genes.begin();
+// it != gene_collection.genes.end(); ++it) {
+//// make_dir(folder + ToString(it->first));
+// const GenePositions& gene_poss = it->second.gene_positions;
+//
+// //todo improve later
+// Sequence total_gene_sequence;
+// for (GenomeId genome_id : key_set(gene_collection.genomes)) {
+// const Sequence& genome = get(gene_collection.genomes, genome_id).sequence;
+// for (Pos pos : get_all(gene_poss, genome_id)) {
+// total_gene_sequence = total_gene_sequence + genome.Subseq(pos.first.start_pos, pos.first.end_pos);
+// }
+// }
+// WriteComponentsAlongSequence(gp, folder + ToString(it->first) + "/",
+// 100000, 50, total_gene_sequence, coloring);
+// }
+//}
+
+}
diff --git a/src/projects/cap/genome_correction.hpp b/src/projects/cap/genome_correction.hpp
new file mode 100644
index 0000000..e9ba688
--- /dev/null
+++ b/src/projects/cap/genome_correction.hpp
@@ -0,0 +1,496 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "coloring.hpp"
+
+#include <vector>
+#include <map>
+#include "utils/adt/bag.hpp"
+
+namespace cap {
+
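+//Represents the genome as a path of graph edges plus a bag of edge
+//multiplicities. As a GraphActionHandler it keeps the path consistent with graph
+//simplification: merged edge runs are substituted by the new edge (HandleMerge),
+//and HandleDelete verifies that deleted edges no longer occur in the path.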
+template<class Graph>
+class GenomePath: public GraphActionHandler<Graph> {
+ typedef GraphActionHandler<Graph> base;
+public:
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename vector<EdgeId>::const_iterator iterator;
+private:
+ vector<EdgeId> path_;
+ bool cyclic_;
+ bag<EdgeId> edge_mult_;
+
+ int Idx(size_t idx) {
+ if (cyclic_) {
+ return idx % path_.size();
+ }
+ if (idx >= path_.size())
+ return -1;
+ return idx;
+ }
+
+ bool CheckAllMatch(const vector<EdgeId>& edges, size_t pos) {
+ for (size_t j = 0; j < edges.size(); ++j) {
+ int idx = Idx(pos + j);
+ if (idx > 0) {
+ if (path_[idx] != edges[j])
+ return false;
+ } else {
+ return false;
+ }
+ }
+ return true;
+ }
+
+ bool CheckNoneMatch(const vector<EdgeId>& edges, size_t pos) {
+ for (size_t j = 0; j < edges.size(); ++j) {
+ int idx = Idx(pos + j);
+ if (idx > 0) {
+ if (path_[idx] == edges[j])
+ return false;
+ } else {
+ return true;
+ }
+ }
+ return true;
+ }
+
+ bool CheckConsistency(const vector<EdgeId>& edges, size_t pos) {
+ return CheckAllMatch(edges, pos) || CheckNoneMatch(edges, pos);
+// if (Idx(pos) < 0)
+// return true;
+// bool first_matched = (path_[idx] == edges[0]);
+// for (size_t j = 0; j < edges.size(); ++j) {
+// int idx = Idx(pos + j);
+// if (idx > 0) {
+// if (first_matched ^ (path_[idx] == edges[j]) != 0)
+// return false;
+// } else {
+// return !first_matched;
+// }
+// }
+// return true;
+ }
+
+ bool CheckConsistency(const vector<EdgeId>& edges) {
+ VERIFY(!edges.empty());
+ size_t mult = edge_mult_.mult(edges[0]);
+ DEBUG("Mult of " << this->g().str(edges[0]) << " is " << mult);
+ for (size_t i = 1; i < edges.size(); ++i) {
+ DEBUG(
+ "Mult of " << this->g().str(edges[i]) << " is " << edge_mult_.mult(edges[i]));
+ if (!CheckConsistency(edges, i)
+ || edge_mult_.mult(edges[i]) != mult) {
+ return false;
+ }
+ }
+ return true;
+ }
+
+ void MovePrefixBack(size_t prefix_length) {
+ VERIFY(cyclic_);
+ vector<EdgeId> tmp(path_.begin(), path_.begin() + prefix_length);
+ path_.erase(path_.begin(), path_.begin() + prefix_length);
+ path_.insert(path_.end(), tmp.begin(), tmp.end());
+ }
+
+ void SubstituteNonBorderFragment(size_t start_pos, size_t end_pos,
+ const vector<EdgeId>& subst) {
+ VERIFY(start_pos < end_pos && end_pos <= path_.size());
+ ChangeMult(start_pos, end_pos, subst);
+ path_.insert(
+ path_.erase(path_.begin() + start_pos, path_.begin() + end_pos),
+ subst.begin(), subst.end());
+ }
+
+ void FillEdgeMult() {
+ for (auto it = path_.begin(); it != path_.end(); ++it) {
+ DEBUG("Edge " << this->g().str(*it) << " is genomic")
+ edge_mult_.put(*it);
+// edge_mult_.put(this->g().conjugate(*it));
+ }
+ }
+
+ void ChangeMult(size_t start_pos, size_t end_pos,
+ const vector<EdgeId>& subst) {
+ for (size_t i = start_pos; i < end_pos; ++i) {
+ bool could_take = edge_mult_.take(path_[i]);
+ VERIFY(could_take);
+ }
+ for (auto it = subst.begin(); it != subst.end(); ++it) {
+ edge_mult_.put(*it);
+ }
+ }
+
+public:
+ GenomePath(const Graph& g, const vector<EdgeId>& path, bool cyclic = false) :
+ base(g, "GenomePath"), path_(path), cyclic_(cyclic) {
+ FillEdgeMult();
+ }
+
+ /*virtual*/
+ void HandleMerge(const vector<EdgeId>& old_edges, EdgeId new_edge) {
+// DEBUG(
+// "Handling merge of edges " << this->g().str(old_edges) << " into edge " << this->g().str(new_edge));
+ VERIFY(CheckConsistency(old_edges));
+// DEBUG("Path before: " << this->g().str(path_));
+ auto it = find(path_.begin(), path_.end(), old_edges.front());
+ while (it != path_.end()) {
+ size_t start = it - path_.begin();
+ size_t end = start + old_edges.size();
+ Substitute(start, end, vector<EdgeId> { new_edge });
+// DEBUG("Path after find: " << this->g().str(path_));
+ it = find(path_.begin(), path_.end(), old_edges.front());
+ }
+ //debug
+ for (auto it2 = old_edges.begin(); it2 != old_edges.end(); ++it2) {
+// DEBUG("Checking " << this->g().str(*it2))
+ VERIFY(find(path_.begin(), path_.end(), *it2) == path_.end());
+ VERIFY(edge_mult_.mult(*it2) == 0);
+ }
+ //debug
+// DEBUG("Path final: " << this->g().str(path_));
+ DEBUG("Merge handled");
+ }
+
+ /*virtual*/
+ void HandleDelete(EdgeId e) {
+ DEBUG(
+ "Multiplicity of edge " << this->g().str(e) << " in delete " << edge_mult_.mult(e));
+ VERIFY(edge_mult_.mult(e) == 0);
+ }
+
+ //for cyclic paths, end_pos might be > path.size()
+ //might change indices unexpectedly
+ void Substitute(size_t start_pos, size_t end_pos,
+ const vector<EdgeId>& subst) {
+ DEBUG("Substitute called");
+ VERIFY(cyclic_ || end_pos <= path_.size());
+ if (end_pos <= path_.size()) {
+ SubstituteNonBorderFragment(start_pos, end_pos, subst);
+ } else {
+ size_t prefix_length = end_pos - path_.size();
+ VERIFY(start_pos >= prefix_length);
+ MovePrefixBack(prefix_length);
+ SubstituteNonBorderFragment(start_pos - prefix_length, path_.size(),
+ subst);
+ }
+ }
+
+ size_t size() const {
+ return path_.size();
+ }
+
+ iterator begin() const {
+ return path_.begin();
+ }
+
+ iterator end() const {
+ return path_.end();
+ }
+
+ size_t mult(EdgeId e) {
+ return edge_mult_.mult(e);
+ }
+
+private:
+ DECL_LOGGER("GenomePath")
+ ;
+};
+
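+//PathProcessor callback collecting alternative paths that consist solely of
+//assembly-colored edges and contain at most edge_count_ edges.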
+template<class Graph>
+class AssemblyPathCallback: public PathProcessor<Graph>::Callback {
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename std::vector<EdgeId> Path;
+
+private:
+ const Graph& g_;
+ const ColorHandler<Graph>& coloring_;
+ const TColorSet assembly_color_;
+ size_t edge_count_;
+
+ std::vector<Path> paths_;
+
+ bool CheckPath(const vector<EdgeId>& path) const {
+ DEBUG("Checking path " << g_.str(path));
+ if (path.size() > edge_count_) {
+ DEBUG("false");
+ return false;
+ }
+ for (auto it = path.begin(); it != path.end(); ++it) {
+ if ((coloring_.Color(*it) & assembly_color_) == kEmptyColorSet) {
+ DEBUG("false");
+ return false;
+ }
+ }
+ DEBUG("true");
+ return true;
+ }
+
+public:
+ AssemblyPathCallback(const Graph& g, const ColorHandler<Graph>& coloring,
+ TColorSet assembly_color, size_t edge_count) :
+ g_(g), coloring_(coloring), assembly_color_(assembly_color), edge_count_(
+ edge_count) {
+ }
+
+ virtual void HandleReversedPath(const Path& rev_path) {
+ Path path = this->ReversePath(rev_path);
+ if (CheckPath(path)) {
+ paths_.push_back(path);
+ }
+ }
+
+ size_t size() const {
+ return paths_.size();
+ }
+
+ vector<Path> paths() const {
+ return paths_;
+ }
+};
+
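+//Corrects simple indels between reference and assembly: for a genome-colored
+//edge of multiplicity one it searches for an alternative assembly-colored path
+//between its endpoints and substitutes it into the genome path; for an
+//assembly-colored edge it searches for a genome sub-path between its endpoints
+//and replaces it by that single edge. Obsolete genome edges are removed and
+//pictures are dumped to "ref_correction/".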
+template<class Graph>
+class SimpleInDelCorrector {
+ typedef typename Graph::VertexId VertexId;
+ typedef typename Graph::EdgeId EdgeId;
+ Graph& g_;
+ ColorHandler<Graph>& coloring_;
+
+ //become invalidated during process
+// const EdgesPositionHandler<Graph>& edge_pos_;
+ GenomePath<Graph> genome_path_;
+ const TColorSet genome_color_;
+ const TColorSet assembly_color_;
+
+ vector<EdgeId> FindAssemblyPath(VertexId start, VertexId end,
+ size_t edge_count_bound, size_t min_length, size_t max_length) {
+ AssemblyPathCallback<Graph> assembly_callback(g_, coloring_,
+ assembly_color_, edge_count_bound);
+ PathProcessor<Graph> path_finder(g_, min_length, max_length, start, end,
+ assembly_callback);
+ path_finder.Process();
+ if (assembly_callback.size() == 0) {
+ DEBUG("Couldn't find assembly path");
+ } else if (assembly_callback.size() > 1) {
+ DEBUG("Found several assembly paths");
+ DEBUG("Taking first");
+ return assembly_callback.paths().front();
+ } else {
+ DEBUG("Found unique assembly path");
+ return assembly_callback.paths().front();
+ }
+ return {};
+ }
+
+ int TryFindGenomePath(size_t pos, VertexId end, size_t edge_count_bound) {
+ for (size_t i = 0;
+ i + pos < genome_path_.size() && i < edge_count_bound; ++i) {
+ if (g_.EdgeEnd(*(genome_path_.begin() + pos + i)) == end) {
+ return pos + i + 1;
+ }
+ }
+ return -1;
+ }
+
+// bag<TColorSet> ColorLengths(const vector<EdgeId>& edges) {
+// bag<TColorSet> answer;
+// for (size_t i = 0; i < edges.size(); ++i) {
+// answer.put(coloring_.Color(edges[i]), g_.length(edges[i]));
+// }
+// return answer;
+// }
+
+ size_t VioletLengthOfGenomeUnique(const vector<EdgeId>& edges) {
+ size_t answer = 0;
+ for (size_t i = 0; i < edges.size(); ++i) {
+ // TODO make reference, do not copy!!!
+ TColorSet edge_color_set = coloring_.Color(edges[i]);
+ if (edge_color_set[0] && edge_color_set[1]
+ && (genome_path_.mult(edges[i]) == 1)) {
+ answer += g_.length(edges[i]);
+ }
+ }
+ return answer;
+ }
+
+ //genome pos exclusive
+// size_t CumulativeGenomeLengthToPos(size_t pos) {
+// size_t answer = 0;
+// for (size_t i = 0; i < pos; ++i) {
+// answer += g_.length(genome_path_[i]);
+// }
+// return answer;
+// }
+
+ bool CheckGenomePath(size_t genome_start, size_t genome_end) {
+ return VioletLengthOfGenomeUnique(
+ vector<EdgeId>(genome_path_.begin() + genome_start,
+ genome_path_.begin() + genome_end)) < 25;
+ }
+
+ optional<pair<size_t, size_t>> FindGenomePath(VertexId start, VertexId end,
+ size_t edge_count_bound) {
+ for (size_t i = 0; i < genome_path_.size(); ++i) {
+ if (g_.EdgeStart(*(genome_path_.begin() + i)) == start) {
+ int path_end = TryFindGenomePath(i, end, edge_count_bound);
+ if (path_end > 0 && CheckGenomePath(i, path_end))
+ return make_optional(make_pair(size_t(i), size_t(path_end)));
+ }
+ }
+ return boost::none;
+ }
+
+ void RemoveObsoleteEdges(const vector<EdgeId>& edges) {
+ for (auto it = SmartSetIterator<Graph, EdgeId>(g_, edges.begin(),
+ edges.end()); !it.IsEnd(); ++it) {
+ if (coloring_.Color(*it) == genome_color_
+ && genome_path_.mult(*it) == 0
+ && genome_path_.mult(g_.conjugate(*it)) == 0) {
+ DEBUG("Removing edge " << g_.str(*it) << " as obsolete");
+ VertexId start = g_.EdgeStart(*it);
+ VertexId end = g_.EdgeEnd(*it);
+ g_.DeleteEdge(*it);
+ DEBUG("Comressing start");
+ g_.CompressVertex(start);
+ if (!g_.RelatedVertices(start, end)) {
+ DEBUG("Comressing end");
+ g_.CompressVertex(end);
+ }
+ DEBUG("Edge removed");
+ }
+ }
+ }
+
+ string GenomePathStr(size_t genome_start, size_t genome_end) const {
+ return g_.str(
+ vector<EdgeId>(genome_path_.begin() + genome_start,
+ genome_path_.begin() + genome_end));
+ }
+
+ void GenPicAlongPath(const vector<EdgeId> path, size_t cnt) {
+ utils::MakeDirPath("ref_correction");
+ WriteComponentsAlongPath(g_, StrGraphLabeler<Graph>(g_),
+ "ref_correction/" + ToString(cnt) + ".dot", 100000, 10,
+ TrivialMappingPath(g_, path), *ConstructColorer(coloring_));
+ }
+
+ void GenPicAroundEdge(EdgeId e, size_t cnt) {
+ utils::MakeDirPath("ref_correction");
+ GraphComponent<Graph> component = omnigraph::EdgeNeighborhood(g_, e, 10, 100000);
+ omnigraph::visualization::WriteComponent(g_, "ref_correction/" + ToString(cnt) + ".dot", component, coloring_.GetInstance(), StrGraphLabeler<Graph>(g_));
+ }
+
+ void CorrectGenomePath(size_t genome_start, size_t genome_end,
+ const vector<EdgeId>& assembly_path) {
+ static size_t cnt = 0;
+ DEBUG(
+ "Case " << ++cnt << " Substituting genome path " << GenomePathStr(genome_start, genome_end) << " with assembly path " << g_.str(assembly_path));
+ vector<EdgeId> genomic_edges;
+ for (size_t i = genome_start; i < genome_end; ++i) {
+ genomic_edges.push_back(*(genome_path_.begin() + i));
+ }
+ GenPicAlongPath(genomic_edges, cnt * 100);
+ GenPicAlongPath(assembly_path, cnt * 100 + 1);
+ for (size_t i = 0; i < assembly_path.size(); ++i) {
+ coloring_.PaintEdge(assembly_path[i], genome_color_);
+ }
+ genome_path_.Substitute(genome_start, genome_end, assembly_path);
+ RemoveObsoleteEdges(genomic_edges);
+ GenPicAroundEdge(
+ *((genome_start < genome_path_.size()) ?
+ (genome_path_.begin() + genome_start) :
+ genome_path_.end() - 1), cnt * 100 + 2);
+ }
+
+// pair<string, pair<size_t, size_t>> ContigIdAndPositions(EdgeId e) {
+// vector<EdgePosition> poss = edge_pos_.GetEdgePositions(e);
+// VERIFY(!poss.empty());
+// if (poss.size() > 1) {
+// WARN("Something strange with assembly positions");
+// return make_pair("", make_pair(0, 0));
+// }
+// EdgePosition pos = poss.front();
+// return make_pair(pos.contigId_, make_pair(pos.start(), pos.end()));
+// }
+
+// void WriteAltPath(EdgeId e, const vector<EdgeId>& genome_path) {
+// LengthIdGraphLabeler<Graph> basic_labeler(g_);
+// EdgePosGraphLabeler<Graph> pos_labeler(g_, edge_pos_);
+//
+// CompositeLabeler<Graph> labeler(basic_labeler, pos_labeler);
+//
+// string alt_path_folder = folder_ + ToString(g_.int_id(e)) + "/";
+// make_dir(alt_path_folder);
+// WriteComponentsAlongPath(g_, labeler, alt_path_folder + "path.dot", /*split_length*/
+// 1000, /*vertex_number*/15, TrivialMappingPath(g_, genome_path),
+// *ConstructBorderColorer(g_, coloring_));
+// }
+
+//todo use contig constraints here!!!
+ void AnalyzeGenomeEdge(EdgeId e) {
+ DEBUG("Analysing shortcut genome edge " << g_.str(e));
+// VERIFY(genome_path_.mult(e) > 0);
+ DEBUG("Multiplicity " << genome_path_.mult(e));
+ if (genome_path_.mult(e) == 1) {
+ vector<EdgeId> assembly_path = FindAssemblyPath(g_.EdgeStart(e),
+ g_.EdgeEnd(e), 100, 0, g_.length(e) + 1000);
+ if (!assembly_path.empty()) {
+ DEBUG("Assembly path " << g_.str(assembly_path));
+ auto it = std::find(genome_path_.begin(), genome_path_.end(),
+ e);
+ VERIFY(it != genome_path_.end());
+ size_t pos = it - genome_path_.begin();
+ CorrectGenomePath(pos, pos + 1, assembly_path);
+ } else {
+ DEBUG("Couldn't find assembly path");
+ }
+ }
+ }
+
+ void AnalyzeAssemblyEdge(EdgeId e) {
+ DEBUG("Analysing shortcut assembly edge " << g_.str(e));
+ optional<pair<size_t, size_t>> genome_path = FindGenomePath(
+ g_.EdgeStart(e), g_.EdgeEnd(e), /*edge count bound*//*100*/
+ 300);
+ if (genome_path) {
+ CorrectGenomePath(genome_path->first, genome_path->second,
+ vector<EdgeId> { e });
+ } else {
+ DEBUG("Empty genome path");
+ }
+ }
+
+public:
+ SimpleInDelCorrector(Graph& g, ColorHandler<Graph>& coloring,
+ const vector<EdgeId>& genome_path, TColorSet genome_color,
+ TColorSet assembly_color) :
+ g_(g), coloring_(coloring), genome_path_(g_, genome_path), genome_color_(
+ genome_color), assembly_color_(assembly_color) {
+ }
+
+ void Analyze() {
+ //remove_dir("ref_correction");
+ for (auto it = g_.SmartEdgeBegin(); !it.IsEnd(); ++it) {
+ if (coloring_.Color(*it) == genome_color_
+ && genome_path_.mult(*it) > 0) {
+ AnalyzeGenomeEdge(*it);
+ }
+ if (coloring_.Color(*it) == assembly_color_) {
+ AnalyzeAssemblyEdge(*it);
+ }
+ }
+ }
+
+private:
+ DECL_LOGGER("SimpleInDelCorrector")
+ ;
+};
+
+}
diff --git a/src/projects/cap/graph_traversal_constraints.hpp b/src/projects/cap/graph_traversal_constraints.hpp
new file mode 100644
index 0000000..a1c1c73
--- /dev/null
+++ b/src/projects/cap/graph_traversal_constraints.hpp
@@ -0,0 +1,75 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "coordinates_handler.hpp"
+#include "coloring.hpp"
+
+namespace cap {
+
+template <class Graph>
+class GraphTraversalConstraints {
+ public:
+ typedef typename Graph::EdgeId EdgeId;
+
+ GraphTraversalConstraints() {
+ }
+
+ virtual void PushEdge(const EdgeId /* edge */) {
+ // do nothing
+ }
+
+ virtual void PopEdge() {
+ // do nothing
+ }
+
+ virtual bool PathIsCorrect() const {
+ return true;
+ }
+
+ private:
+ DECL_LOGGER("GraphTraversalConstraints")
+ ;
+};
+
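+//Constrains traversal to paths that stay contiguous on at least one genome: a
+//stack of coordinate position arrays is maintained, every pushed edge filters
+//the positions of the previous top, and a path is accepted while the current
+//top is non-empty.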
+template <class Graph>
+class GenomeContiguousPathsGraphTraversalConstraints
+ : public GraphTraversalConstraints<Graph> {
+
+ public:
+ typedef typename CoordinatesHandler<Graph>::PosArray PosArray;
+
+ GenomeContiguousPathsGraphTraversalConstraints(
+ const CoordinatesHandler<Graph> &coordinates_handler)
+ : coordinates_handler_(coordinates_handler),
+ pos_array_queue_() {
+ }
+
+ virtual void PushEdge(const EdgeId edge) {
+ if (pos_array_queue_.size() == 0)
+ pos_array_queue_.push(coordinates_handler_.GetEndPosArray(edge));
+ else
+ pos_array_queue_.push(
+ coordinates_handler_.FilterPosArray(pos_array_queue_.top(), edge));
+ }
+
+ virtual void PopEdge() {
+ pos_array_queue_.pop();
+ }
+
+ virtual bool PathIsCorrect() const {
+ return pos_array_queue_.top().size() > 0;
+ }
+
+ private:
+ const CoordinatesHandler<Graph> &coordinates_handler_;
+
+ std::stack<PosArray> pos_array_queue_;
+};
+
+}
diff --git a/src/projects/cap/junk_cropping_reader.hpp b/src/projects/cap/junk_cropping_reader.hpp
new file mode 100644
index 0000000..5927d75
--- /dev/null
+++ b/src/projects/cap/junk_cropping_reader.hpp
@@ -0,0 +1,54 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+#include "dev_support/standard_base.hpp"
+#include "io/reads_io/delegating_reader_wrapper.hpp"
+
+namespace cap {
+
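+//Read stream wrapper that crops everything but A/C/G/T from each read and
+//records a coordinate "ladder" of (cropped position, original position) pairs
+//at every good/junk boundary, so cropped coordinates can be mapped back to the
+//original read.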
+class JunkCroppingWrapper : public io::DelegatingWrapper<io::SingleRead> {
+ typedef io::DelegatingWrapper<io::SingleRead> base;
+ vector<pair<size_t, size_t>> coordinates_ladder_;
+
+ bool IsGoodSymbol(char c) {
+ return c == 'A' || c == 'C' || c == 'G' || c == 'T';
+ }
+
+public:
+ JunkCroppingWrapper(base::ReadStreamPtrT reader) : base(reader) {
+
+ }
+
+ JunkCroppingWrapper& operator>>(io::SingleRead& read) {
+ base::operator >>(read);
+ coordinates_ladder_.clear();
+ string orig_string = read.GetSequenceString();
+ string orig_qual = read.GetQualityString();
+ string cropped = "";
+ string cropped_qual = "";
+ coordinates_ladder_.push_back(make_pair(0, 0));
+ for (size_t coord = 0; coord < orig_string.size(); ++coord) {
+ if (coord > 0 && (IsGoodSymbol(orig_string[coord - 1]) ^ IsGoodSymbol(orig_string[coord]))) {
+ coordinates_ladder_.push_back(make_pair(cropped.size(), coord));
+ }
+ if (IsGoodSymbol(orig_string[coord])) {
+ cropped += orig_string[coord];
+ cropped_qual += orig_qual[coord];
+ }
+ }
+ coordinates_ladder_.push_back(make_pair(cropped.size(), orig_string.size()));
+ read = io::SingleRead(read.name(), cropped, cropped_qual);
+ return *this;
+ }
+
+ vector<pair<size_t, size_t>> coordinates_ladder() const {
+ return coordinates_ladder_;
+ }
+};
+
+}
diff --git a/src/projects/cap/longseq.hpp b/src/projects/cap/longseq.hpp
new file mode 100644
index 0000000..571e69e
--- /dev/null
+++ b/src/projects/cap/longseq.hpp
@@ -0,0 +1,480 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include <cstdlib>
+#include <cstdint>
+#include "polynomial_hash.hpp"
+#include "dev_support/log.hpp"
+#include "data_structures/sequence/sequence.hpp"
+#include "dev_support/openmp_wrapper.h"
+
+namespace cap {
+
+typedef unsigned char uchar;
+
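+//A k-mer-like view over a shared Sequence, identified by a position and a
+//length, that carries a rolling polynomial hash instead of packed nucleotides.
+//Shift/push operations update the hash incrementally; an extra leading/trailing
+//character differing from the underlying sequence is stored in last_char_.
+//Equality compares hashes first and falls back to character comparison.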
+template <class HashT = PolynomialHash<> >
+class LongSeq {
+ typedef LongSeq<HashT> ThisType;
+ public:
+ /**
+ * @variable Number of bits in type T (e.g. 8 for char)
+ * @example 8: 2^8 = 256 or 16
+ */
+ const static size_t TBits = 8;
+
+ /**
+ * @variable Number of nucleotides that can be stored in one type T (e.g. 4 for char)
+ * TNucl MUST be a power of two
+ * @example 4: 8/2 = 4 or 16/2 = 8
+ */
+ const static size_t TNucl = 1;
+
+ /**
+ * @variable Number of bits in TNucl (e.g. 2 for char). Useful for shifts instead of divisions.
+ */
+ const static size_t TNuclBits = log_<TNucl, 2>::value;
+
+ /**
+ * @variable Number of Ts which required to store all sequence.
+ */
+ const static size_t DataSize = (16 + TNucl - 1) >> TNuclBits;
+
+ /**
+ * @variable Number of meaningful bytes in which the seq is stored
+ */
+ const static size_t TotalBytes = sizeof(char) * DataSize;
+
+ typedef char DataType;
+ typedef typename HashT::DataType HashType;
+
+ static size_t GetDataSize(size_t size) {
+ return (size + TNucl - 1) >> TNuclBits;
+ }
+
+ static const char kNoNextNucl = char(69);
+
+ private:
+ // Maybe SequenceData?
+ Sequence sequence_;
+
+ unsigned size_;
+ size_t pos_;
+ /*
+ * last_char_ contains information about extra first/last chars
+ * storage: xxxyxxxy
+ * front--back
+ * xxx codes the character itself
+ * y codes the existence of an extra char
+ */
+ uchar last_char_;
+ HashT hash_;
+
+ static const uchar kNoFrontChar = uchar(8);
+ static const uchar kNoBackChar = uchar(128);
+ static const uchar kNoChar = uchar(255);
+
+ inline void InitHash() {
+ for (size_t i = 0; i < size_; ++i) {
+ hash_.Update(sequence_[pos_ + i]);
+ }
+ }
+
+ // for fast copy
+ LongSeq(unsigned size, const Sequence &sequence, size_t pos, const HashT &hash)
+ : sequence_(sequence),
+ size_(size),
+ pos_(pos),
+ last_char_(kNoChar),
+ hash_(size, hash) {
+ }
+
+ // convenient methods
+ inline bool HasExtraLastChar() const {
+ return !(last_char_ & kNoBackChar);
+ }
+ inline bool HasExtraFrontChar() const {
+ return !(last_char_ & kNoFrontChar);
+ }
+ inline char LastChar() const {
+ if (!HasExtraLastChar()) {
+ return operator[](size_ - 1);
+ }
+ return (last_char_ >> 4) & 7;
+ }
+ inline char FirstChar() const {
+ if (!HasExtraFrontChar()) {
+ return operator[](0);
+ }
+ return last_char_ & 7;
+ }
+ inline void SetFirstChar(uchar c) {
+ last_char_ = uchar((last_char_ & 0xF0) | c);
+ }
+ inline void SetLastChar(uchar c) {
+ last_char_ = uchar((last_char_ & 0x0F) | (c << 4));
+ }
+
+ public:
+ LongSeq()
+ : sequence_(),
+ size_(0),
+ pos_(0),
+ last_char_(kNoChar),
+ hash_(0) {
+ }
+ explicit LongSeq(unsigned size)
+ : sequence_(std::string(size, 'A')), // optimize by setting bad first char
+ size_(size),
+ pos_(0),
+ last_char_(kNoChar),
+ hash_(size_) {
+ }
+ LongSeq(unsigned size, const Sequence &sequence, size_t pos = 0)
+ : sequence_(sequence),
+ size_(size),
+ pos_(pos),
+ last_char_(kNoChar),
+ hash_(size_) {
+ InitHash();
+ VERIFY(pos_ + size_ <= sequence_.size());
+ }
+
+ // Weird constructor for constructing from `data' originating from another LongSeq
+ LongSeq(unsigned /* size */, const LongSeq<HashT> &other)
+ : sequence_(other.sequence_),
+ size_(other.size_),
+ pos_(other.pos_),
+ last_char_(other.last_char_),
+ hash_(size_, other.hash_) {
+ }
+
+ LongSeq(const LongSeq<HashT> &other)
+ : sequence_(other.sequence_),
+ size_(other.size_),
+ pos_(other.pos_),
+ last_char_(other.last_char_),
+ hash_(size_, other.hash_) {
+ }
+
+ LongSeq<HashT> operator =(const LongSeq<HashT> &other) {
+ sequence_ = other.sequence_;
+ size_ = other.size_;
+ pos_ = other.pos_;
+ last_char_ = other.last_char_;
+ hash_.CopyFrom(size_, other.hash_);
+
+ return *this;
+ }
+
+ unsigned size() const {
+ return size_;
+ }
+
+ // TODO check consistency
+ const LongSeq<HashT> &data() const {
+ return *this;
+ }
+
+ size_t data_size() const {
+ return DataSize;
+ }
+
+ char operator [](size_t pos) const {
+ return sequence_[pos_ + pos];
+ }
+
+ LongSeq<HashT> operator !() const {
+ return LongSeq<HashT>(size_, !sequence_, sequence_.size() - size_ - pos_);
+ }
+
+ void Shift() {
+ if (pos_ + size_ < sequence_.size()) {
+ hash_.Update(sequence_[pos_ + size_], sequence_[pos_]);
+ }
+ pos_++;
+ }
+
+ void operator <<=(char c) {
+ if (is_nucl(c)) {
+ c = dignucl(c);
+ }
+
+ if (pos_ + size_ < sequence_.size() && c == sequence_[pos_ + size_]) {
+ // do nothing
+ } else {
+ // actually this can happen only once during the lifetime
+ // of the k-mer.
+ VERIFY(!HasExtraLastChar());
+ SetLastChar(c);
+ }
+
+ char front_char;
+ if (HasExtraFrontChar()) {
+ front_char = FirstChar();
+ SetFirstChar(kNoFrontChar);
+ } else {
+ front_char = sequence_[pos_];
+ }
+ // Updating hash
+ hash_.Update(c, front_char);
+
+ pos_++;
+
+ }
+
+ LongSeq<HashT> operator <<(char c) const {
+ LongSeq<HashT> other_seq = *this;
+ other_seq <<= c;
+ return other_seq;
+ }
+
+ LongSeq<HashT> pushBack(char c) const {
+ if (is_nucl(c)) {
+ c = dignucl(c);
+ }
+
+ LongSeq<HashT> result(size_ + 1, sequence_, pos_, hash_);
+
+ if (pos_ + size_ < sequence_.size() && c == sequence_[pos_ + size_]) {
+ // do nothing
+ } else {
+ VERIFY(!HasExtraLastChar());
+ result.SetLastChar(c);
+ }
+
+ result.hash_.Update(c);
+
+ return result;
+ }
+
+ void operator >>=(char c) {
+ if (is_nucl(c)) {
+ c = dignucl(c);
+ }
+
+ if (pos_ > 0 && c == sequence_[pos_ - 1]) {
+ // do nothing
+ } else {
+ // actually this can happen only once during the lifetime
+ // of the k-mer.
+ VERIFY(!HasExtraFrontChar());
+ SetFirstChar(c);
+ }
+
+ pos_--;
+
+ char back_char;
+ if (HasExtraLastChar()) {
+ back_char = LastChar();
+ SetLastChar(kNoFrontChar);
+ } else {
+ back_char = sequence_[pos_ + size_];
+ }
+ // Updating hash
+ hash_.UpdateBack(c, back_char);
+
+ }
+
+ LongSeq<HashT> operator >>(char c) const {
+ LongSeq<HashT> other_seq = *this;
+ other_seq >>= c;
+ return other_seq;
+ }
+ LongSeq<HashT> pushFront(char c) const {
+ if (is_nucl(c)) {
+ c = dignucl(c);
+ }
+
+ LongSeq<HashT> result(size_ + 1, sequence_, pos_ - 1, hash_);
+
+ if (pos_ > 0 && c == sequence_[pos_ - 1]) {
+ // do nothing
+ } else {
+ VERIFY(!HasExtraFrontChar());
+ result.SetFirstChar(c);
+ }
+
+ result.hash_.UpdateBack(c);
+
+ return result;
+ }
+
+ bool IsValid() const {
+ return pos_ + size_ <= sequence_.size();
+ }
+
+ bool operator ==(const LongSeq<HashT> &other) const {
+
+ VERIFY(size_ == other.size_);
+ if (hash_ != other.hash_) {
+// (was a kind of joke, yea)
+// VERIFY(str() != other.str());
+ return false;
+ }
+ for (size_t i = 1; i + 1 < size_; ++i) {
+ if (operator[](i) != other[i]) {
+// VERIFY(str() != other.str());
+ return false;
+ }
+ }
+// VERIFY((str() == other.str()) == (
+// FirstChar() == other.FirstChar() &&
+// LastChar() == other.LastChar()
+// ));
+ return FirstChar() == other.FirstChar() &&
+ LastChar() == other.LastChar();
+ }
+
+ bool operator !=(const LongSeq<HashT> &other) const {
+ return !(operator==(other));
+ }
+
+
+ std::string str() const {
+ if (size_ > 1) {
+ std::string res(size_, '-');
+ res[0] = nucl(FirstChar());
+ res[size_ - 1] = nucl(LastChar());
+ for (size_t i = 1; i + 1 < size_; ++i) {
+ res[i] = nucl(operator[](i));
+ }
+ return res;
+ }
+
+ if (size_ == 0) {
+ return "";
+ }
+ //if (size_ == 1)
+ if (HasExtraFrontChar()) {
+ return std::string("") + nucl(FirstChar());
+ } else if (HasExtraLastChar()) {
+ return std::string("") + nucl(LastChar());
+ } else {
+ return std::string("") + nucl(operator[](0));
+ }
+ }
+ std::string err() const {
+ std::ostringstream oss;
+ oss << "{ sequence_=" << sequence_.err() <<
+ "[" << sequence_.str() << "]" <<
+ ", size_=" << size_ <<
+ ", pos_=" << pos_ <<
+ ", last_char_=" << size_t(last_char_) <<
+ ", hash_=" << hash_.GetHash() << " }";
+ return oss.str();
+ }
+
+ typename HashT::DataType GetHash() const {
+ return hash_.GetHash();
+ }
+ typename HashT::DataType GetHash(typename HashT::AtomType seed) const {
+ return hash_.GetHash(seed);
+ }
+
+ char GetNextNucl() const {
+ if (pos_ + size_ >= sequence_.size()) {
+ return kNoNextNucl;
+ }
+ return sequence_[pos_ + size_];
+ }
+
+ /*
+ void UpdateTransition(char symbol, LongSeq<HashT> *link) {
+ char my_symbol = GetNextNucl();
+ if (symbol == my_symbol) {
+ if (transition_storage_ == NULL) {
+ transition_storage_ = std::shared_ptr<KmerJumper<ThisType> >(new SingleKmerJumper<ThisType>);
+ }
+ if (!transition_storage_->HasTransition()) {
+ transition_storage_->SetTransition(symbol, link);
+ }
+ return;
+ }
+
+ if (transition_storage_ == NULL) {
+ transition_storage_ = std::shared_ptr<KmerJumper<ThisType> >(new MultiKmerJumper<ThisType>);
+ } else if (transition_storage_->Arity() == 1) {
+ LongSeq<HashT> *old_link = transition_storage_->GetTransitionLink(my_symbol);
+ transition_storage_ = std::shared_ptr<KmerJumper<ThisType> >(new MultiKmerJumper<ThisType>);
+ if (my_symbol != kNoNextNucl) {
+ transition_storage_->SetTransition(my_symbol, old_link);
+ }
+ }
+ transition_storage_->SetTransition(symbol, link);
+ }
+ */
+
+ struct hash {
+ inline size_t operator()(const LongSeq<HashT>& seq) const {
+ return seq.hash_.GetHashInt();
+ }
+
+ size_t operator()(const DataType * /*data*/, size_t /*sz*/ = DataSize) {
+ VERIFY(false);
+ return 0;
+ }
+ };
+
+ struct equal_to {
+ inline bool operator()(const LongSeq<HashT>& l, const LongSeq<HashT>& r) const {
+ return l == r;
+ }
+ };
+
+ struct fast_equal_to {
+ inline bool operator()(const LongSeq<HashT>& l, const LongSeq<HashT>& r) const {
+ return l.hash_ == r.hash_;
+ }
+ };
+
+ struct less2 {
+ bool operator()(const LongSeq<HashT> &l, const LongSeq<HashT> &r) const {
+ VERIFY(l.size() == r.size());
+ size_t size = l.size();
+ for (size_t i = 1; i + 1 < size; ++i) {
+ if (l[i] != r[i]) {
+ return (l[i] < r[i]);
+ }
+ }
+ return l.FirstChar() == r.FirstChar() && l.LastChar() < r.LastChar();
+ }
+ };
+
+ bool operator<(const LongSeq<HashT>& that) const{
+ static less2 comp;
+ return comp(*this, that);
+ }
+
+ /**
+ * Denotes some (weird) order on k-mers. Works fast.
+ struct less2_fast {
+ bool operator()(const LongSeq<HashT> &l, const LongSeq<HashT> &r) const {
+ return l.GetHash() < r.GetHash();
+ }
+ };
+ */
+
+ bool IsMinimal() const {
+ return true;
+ }
+};
+
+typedef LongSeq<MultiPolynomialHash<3, uint64_t> > LSeq;
+
+}
+
+namespace std {
+
+template<typename HashT>
+std::ostream& operator<<(std::ostream& os, const cap::LongSeq<HashT> &seq) {
+ os << seq.str();
+ return os;
+}
+
+}
diff --git a/src/projects/cap/main.cpp b/src/projects/cap/main.cpp
new file mode 100644
index 0000000..7228aab
--- /dev/null
+++ b/src/projects/cap/main.cpp
@@ -0,0 +1,73 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+// just to check that headers from include and debruijn folders are correctly included
+#include "cap_kmer_index.hpp"
+#include "cap_logger.hpp"
+
+#include "dev_support/segfault_handler.hpp"
+#include "dev_support/stacktrace.hpp"
+#include "pipeline/config_struct.hpp"
+#include "dev_support/simple_tools.hpp"
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#include "dev_support/memory_limit.hpp"
+#include "io/dataset_support/read_converter.hpp"
+
+#include "cap_online_visualizer.hpp"
+
+void create_console_logger(string const& cfg_filename) {
+ using namespace logging;
+
+ string log_props_file = cfg::get().log_filename;
+
+ if (!path::FileExists(log_props_file))
+ log_props_file = path::append_path(path::parent_path(cfg_filename), cfg::get().log_filename);
+
+ logger *lg = create_logger(path::FileExists(log_props_file) ? log_props_file : "");
+ lg->add_writer(std::make_shared<console_writer>());
+
+ attach_logger(lg);
+}
+
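+// Reads the debruijn config from argv[1] and the cap config from argv[2]; an
+// optional argv[3] names a batch script for the online visualizer.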
+int main(int argc, char** argv) {
+ const size_t GB = 1 << 30;
+ try {
+ using namespace online_visualization;
+
+ VERIFY(argc >= 2);
+ string cfg_filename = argv[1];
+ string cap_cfg_filename = argv[2];
+ path::CheckFileExistenceFATAL(cfg_filename);
+
+ cfg::create_instance(cfg_filename);
+ cap_cfg::create_instance(cap_cfg_filename);
+
+ create_console_logger(cfg_filename);
+ cout << "\ncapGAF (Graph Analysis Framework) started" << endl;
+ cout << "Print help to see help (killer feature)" << endl;
+ limit_memory(cfg::get().max_memory * GB);
+
+ CapOnlineVisualizer online_vis;
+ online_vis.init();
+ string batch = "";
+ if (argc > 3) {
+ batch = string(argv[3]);
+ }
+ online_vis.run(batch);
+ }
+ catch (std::exception const& e) {
+ std::cerr << "Exception caught " << e.what() << std::endl;
+ return EINTR;
+ }
+ catch (...) {
+ std::cerr << "Unknown exception caught " << std::endl;
+ return EINTR;
+ }
+ return 0;
+}
diff --git a/src/projects/cap/mosaic.hpp b/src/projects/cap/mosaic.hpp
new file mode 100644
index 0000000..d6bdcb1
--- /dev/null
+++ b/src/projects/cap/mosaic.hpp
@@ -0,0 +1,1101 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#include "dev_support/standard_base.hpp"
+#include "io/reads_io/rc_reader_wrapper.hpp"
+#include "io/reads_io/sequence_reader.hpp"
+#include "diff_masking.hpp"
+#include "utils/adt/bag.hpp"
+#include "io/reads_io/vector_reader.hpp"
+#include "visualization/graph_colorer.hpp"
+
+namespace cap {
+
+namespace mosaic {
+
+/*
+ * todo
+ * 1. Somehow report information on single edges
+ * 2. Show multiplicity information
+ */
+
+//todo temporary for correct eclipse highlight
+using std::make_pair;
+
+typedef ConjugateDeBruijnGraph Graph;
+typedef Graph::EdgeId EdgeId;
+
+//a block and its conjugate are represented by different natural numbers
+//which are not related in any trivial way
+typedef size_t Block;
+
+typedef size_t Pos;
+typedef pair<Pos, bool> StrandPos;
+typedef pair<Range, bool> StrandRange;
+
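+//Translates numeric block ids back to graph edges (via GraphElementFinder) and
+//exposes the conjugate block, its length and its sequence.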
+class BlockInfoProvider {
+ const Graph& g_;
+ omnigraph::GraphElementFinder<Graph> finder_;
+
+public:
+ BlockInfoProvider(const Graph& g) : g_(g), finder_(g) {
+ finder_.Init();
+ }
+
+ Block conjugate(const Block& block) const {
+ return g_.int_id(g_.conjugate(edge_id(block)));
+ }
+
+ size_t length(const Block& block) const {
+ return g_.length(edge_id(block));
+ }
+
+ Sequence seq(const Block& block) const {
+ return g_.EdgeNucls(edge_id(block));
+ }
+
+ EdgeId edge_id(const Block& block) const {
+ return finder_.ReturnEdgeId(block);
+ }
+
+ const Graph& g() const {
+ return g_;
+ }
+};
+
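+//Stores the genome as the sequence of blocks visited by its mapping path, keeps
+//the genome coordinates of every position and indexes block occurrences on both
+//strands (conjugate positions are registered explicitly).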
+class GenomeBlockComposition {
+ vector<Block> blocks_;
+ multimap<Block, StrandPos> occurences_;
+ map<Pos, Range> genome_coordinates_;
+ //in k-mers!!!
+ const size_t genome_length_;
+ const BlockInfoProvider& block_info_;
+
+ bool CheckPath(const MappingPath<EdgeId>& mapping_path) const {
+ for (Pos i = 1; i < mapping_path.size(); ++i)
+ if (mapping_path[i].second.initial_range.start_pos != mapping_path[i - 1].second.initial_range.end_pos)
+ return false;
+ return true;
+ }
+
+ vector<StrandPos> AddStrand(vector<Pos> poss, bool strand) const {
+ vector<StrandPos> answer;
+ for (auto pos : poss) {
+ answer.push_back(make_pair(pos, strand));
+ }
+ return answer;
+ }
+
+public:
+ GenomeBlockComposition(const Graph& g, const MappingPath<EdgeId>& mapping_path, size_t genome_length, const BlockInfoProvider& block_info)
+ : genome_length_(genome_length), block_info_(block_info) {
+ VERIFY(CheckPath(mapping_path));
+ for (EdgeId e : mapping_path.simple_path()) {
+ blocks_.push_back(g.int_id(e));
+ }
+ for (Pos i = 0; i < blocks_.size(); ++i) {
+ genome_coordinates_.insert(make_pair(i, mapping_path[i].second.initial_range));
+ StrandPos strand_pos = make_pair(i, true);
+ occurences_.insert(make_pair(blocks_[i], strand_pos));
+ occurences_.insert(make_pair(ConjBlock(blocks_[i]), ConjStrandPos(strand_pos)));
+ }
+ }
+
+ Block ConjBlock(Block b) const {
+ return block_info_.conjugate(b);
+ }
+
+ const BlockInfoProvider& block_info() const {
+ return block_info_;
+ }
+
+ Pos ConjPos(Pos pos) const {
+ return size() - pos - 1;
+ }
+
+ StrandPos ConjStrandPos(StrandPos pos) const {
+ return make_pair(ConjPos(pos.first), !pos.second);
+ }
+
+ Range ConjCoordsRange(Range r) const {
+ return Range(genome_length_ - r.end_pos, genome_length_ - r.start_pos);
+ }
+
+ StrandRange ConjStrandRange(StrandRange r) const {
+ return make_pair(Range(size() - r.first.end_pos, size() - r.first.start_pos), !r.second);
+ }
+
+ Pos size() const {
+ return blocks_.size();
+ }
+
+ const vector<Block>& blocks() const {
+ return blocks_;
+ }
+
+ Block block(StrandPos pos) const {
+ return block(pos.first, pos.second);
+ }
+
+ Block block(Pos pos, bool strand) const {
+ if (strand)
+ return blocks_[pos];
+ else
+ return block_info_.conjugate(blocks_[ConjPos(pos)]);
+ }
+
+ Range genome_coords(StrandPos pos) const {
+ return genome_coords(pos.first, pos.second);
+ }
+
+ Range genome_coords(Pos pos, bool strand) const {
+ if (strand)
+ return get(genome_coordinates_, pos);
+ else
+ return ConjCoordsRange(get(genome_coordinates_, ConjPos(pos)));
+ }
+
+ size_t multiplicity(Block block) const {
+ return occurences_.count(block);
+ }
+
+ vector<StrandPos> occurences(Block block) const {
+ return get_all(occurences_, block);
+ }
+
+// vector<Range> all_genome_coords(Block block) const {
+// vector<Range> answer;
+// for (size_t pos : occurences(block)) {
+// answer.push_back(genome_coords(pos));
+// }
+// return answer;
+// }
+
+ Range genome_coords(StrandRange pos_range) const {
+ return genome_coords(pos_range.first, pos_range.second);
+ }
+
+ Range genome_coords(Range pos_range, bool strand) const {
+ VERIFY(pos_range.end_pos > 0);
+ return Range(genome_coords(pos_range.start_pos, strand).start_pos, genome_coords(pos_range.end_pos - 1, strand).end_pos);
+ }
+
+ size_t genome_span(Range pos_range, bool strand) const {
+ return genome_coords(pos_range, strand).size();
+ }
+};
+
+//todo maybe use ranges for later parts of the analysis
+struct MosaicInterval {
+ bool strand;
+ Range pos_range;
+ vector<Pos> support_blocks;
+
+ MosaicInterval(bool strand_, Pos pos)
+ : strand(strand_), pos_range(pos, pos + 1) {
+ support_blocks.push_back(pos);
+ }
+
+ MosaicInterval(bool strand_, Range pos_range_, const vector<Pos>& support_blocks_)
+ : strand(strand_), pos_range(pos_range_), support_blocks(support_blocks_) {
+ }
+
+ MosaicInterval SubInterval(Range pos_range_) const {
+ vector<Pos> sub_support_blocks;
+ for (Pos pos : support_blocks) {
+ if (pos >= pos_range_.start_pos && pos < pos_range_.end_pos) {
+ sub_support_blocks.push_back(pos);
+ }
+ }
+ return MosaicInterval(strand, pos_range_, sub_support_blocks);
+ }
+
+ StrandRange strand_range() const {
+ return make_pair(pos_range, strand);
+ }
+
+ size_t support_size() const {
+ return support_blocks.size();
+ }
+
+// bool operator<(const MosaicInterval &other) const {
+// return pos_range < other.pos_range;
+// }
+};
+
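+//Naive pattern search of one block sequence inside another; the container
+//overload returns -1u when the pattern does not occur.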
+template<class It1, class It2>
+It2 Find(It1 pattern_begin, It1 pattern_end, It2 target_begin,
+ It2 target_end) {
+ for (It2 it = target_begin;; ++it) {
+ size_t i = 0;
+ bool flag = true;
+ for (It1 it2 = pattern_begin; it2 != pattern_end; ++it2) {
+ if (it + i == target_end)
+ return target_end;
+ if (*(it + i) != *it2) {
+ flag = false;
+ break;
+ }
+ ++i;
+ }
+ if (flag) {
+ return it;
+ }
+ }
+ return target_end;
+}
+
+template<class Container1, class Container2>
+size_t Find(const Container1& pattern, const Container2& target) {
+ auto it = Find(pattern.begin(), pattern.end(), target.begin(),
+ target.end());
+ if (it == target.end())
+ return -1u;
+ else
+ return it - target.begin();
+}
+
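+//A mosaic is the ordered list of its support blocks; Fingerprint() serializes
+//the blocks into a space-separated string used as the lookup key in
+//IntervalIndex.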
+class MosaicStructure {
+ vector<Block> blocks_;
+// vector<MosaicInterval> occurences_;
+
+ vector<Block> ToBlocks(const GenomeBlockComposition& block_composition, const MosaicInterval& interval) {
+ vector<Block> answer;
+ for (Pos pos : interval.support_blocks) {
+ answer.push_back(block_composition.block(pos, interval.strand));
+ }
+ return answer;
+ }
+
+ //todo simplify after switching to Ranges
+// vector<MosaicInterval> SubIntervals(size_t block_start, size_t block_end) const {
+ vector<MosaicInterval> answer;
+// for (const MosaicInterval& interval : occurences_) {
+// answer.push_back(interval.SubInterval(Range(interval.support_blocks[block_start],
+// interval.support_blocks[block_end] + 1)));
+// }
+// return answer;
+// }
+
+public:
+ explicit MosaicStructure(const vector<Block>& blocks)
+ : blocks_(blocks) {
+ }
+
+ MosaicStructure(const GenomeBlockComposition& block_composition, const MosaicInterval& interval) :
+ blocks_(ToBlocks(block_composition, interval)) {
+ }
+
+ const vector<Block>& blocks() const {
+ return blocks_;
+ }
+
+ size_t block_size() const {
+ return blocks_.size();
+ }
+
+ //block end incl
+ MosaicStructure SubMosaic(size_t block_start, size_t block_end) const {
+ VERIFY(block_start < blocks_.size());
+ VERIFY(block_end < blocks_.size());
+ VERIFY(block_start <= block_end);
+
+ return MosaicStructure(vector<Block>(blocks_.begin() + block_start, blocks_.begin() + block_end + 1));
+ }
+
+ //sorted by length in decreasing order
+ vector<MosaicStructure> SubMosaics() const {
+ vector<MosaicStructure> answer;
+ for (size_t d = block_size(); d > 0; --d) {
+ for (size_t i = 0; i + d < block_size(); ++i) {
+ answer.push_back(SubMosaic(i, i + d));
+ }
+ }
+ return answer;
+ }
+
+ string Fingerprint() const {
+ std::stringstream ss;
+ string delim = "";
+ for (Block block : blocks_) {
+ ss << delim;
+ ss << block;
+ delim = " ";
+ }
+ return ss.str();
+ }
+
+ bool SameBlocks(const MosaicStructure& that) const {
+ return blocks_ == that.blocks_;
+ }
+
+ bool IsContainedIn(const MosaicStructure& that) const {
+ return Find(blocks_, that.blocks_) != -1u;
+ }
+
+ MosaicStructure conjugate(const BlockInfoProvider& block_info) const {
+ vector<Block> answer;
+ for (auto it = blocks_.rbegin(); it != blocks_.rend(); ++it) {
+ answer.push_back(block_info.conjugate(*it));
+ }
+ return MosaicStructure(answer);
+ }
+
+};
+
+ostream& operator << (ostream& out, const MosaicStructure& structure) {
+ return out << "Mosaic. size=" << structure.block_size() << " blocks=" << structure.blocks();
+}
+
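+//Multimap from mosaic fingerprints to the strand ranges of all their
+//occurrences; IndexSubIntervals registers every sub-interval spanned by a pair
+//of support blocks together with its conjugate-strand counterpart.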
+class IntervalIndex {
+ multimap<string, StrandRange> all_substruct_pos_;
+ const GenomeBlockComposition& block_composition_;
+
+ bool Reported(StrandRange range, const set<StrandRange>& reported) const {
+ for (auto r : reported)
+ if (r.second == range.second && r.first.contains(range.first))
+ return true;
+ return false;
+ }
+
+public:
+ IntervalIndex(const GenomeBlockComposition& block_composition)
+ : block_composition_(block_composition) {
+
+ }
+
+ void IndexSubIntervals(const MosaicInterval& interval) {
+ VERIFY(interval.support_blocks.front() == interval.pos_range.start_pos);
+ VERIFY(interval.support_blocks.back() + 1 == interval.pos_range.end_pos);
+ vector<Pos> support = interval.support_blocks;
+ for (size_t i = 0; i < support.size(); ++i) {
+ for (size_t j = i; j < support.size(); ++j) {
+ MosaicInterval sub_interval = interval.SubInterval(Range(support[i], support[j] + 1));
+ MosaicStructure sub_mosaic(block_composition_, sub_interval);
+ all_substruct_pos_.insert(make_pair(sub_mosaic.Fingerprint(), sub_interval.strand_range()));
+ all_substruct_pos_.insert(make_pair(sub_mosaic.conjugate(block_composition_.block_info()).Fingerprint(),
+ block_composition_.ConjStrandRange(sub_interval.strand_range())));
+ }
+ }
+ }
+
+ vector<StrandRange> Occurences(const MosaicStructure& mosaic) const {
+ return get_all(all_substruct_pos_, mosaic.Fingerprint());
+ }
+
+ size_t Mult(const MosaicStructure& mosaic) const {
+ return all_substruct_pos_.count(mosaic.Fingerprint());
+ }
+
+ vector<StrandRange> UnReportedOccurences(const MosaicStructure& mosaic, const set<StrandRange>& reported) const {
+ vector<StrandRange> answer;
+ for (StrandRange range : Occurences(mosaic))
+ if (!Reported(range, reported) && !Reported(block_composition_.ConjStrandRange(range), reported))
+ answer.push_back(range);
+ return answer;
+ }
+
+ const multimap<string, StrandRange>& all_substruct_pos() const {
+ return all_substruct_pos_;
+ }
+};
+
+class MosaicHelper {
+ const BlockInfoProvider& block_info_;
+ const GenomeBlockComposition& block_composition_;
+ const IntervalIndex& interval_index_;
+
+ vector<Pos> FindSupportBlocks(Range r, bool strand, const MosaicStructure& mosaic) const {
+ vector<Pos> answer;
+ size_t i = r.start_pos;
+ for (Block b : mosaic.blocks()) {
+ while (i < r.end_pos && block_composition_.block(i, strand) != b) {
+ ++i;
+ }
+ VERIFY(i < r.end_pos);
+ answer.push_back(i);
+ ++i;
+ }
+ VERIFY(answer.front() == r.start_pos && answer.back() == r.end_pos - 1);
+ return answer;
+ }
+
+public:
+ MosaicHelper(const GenomeBlockComposition& block_composition,
+ const IntervalIndex& interval_index)
+ : block_info_(block_composition.block_info()),
+ block_composition_(block_composition),
+ interval_index_(interval_index) {
+ }
+
+ size_t Mult(const MosaicStructure& mosaic) const {
+ return interval_index_.Mult(mosaic);
+ }
+
+ double AvgSpan(const MosaicStructure& mosaic) const {
+ double avg = 0.;
+ for (StrandRange r : interval_index_.Occurences(mosaic)) {
+ avg += double(r.first.size());
+ }
+ return avg / double(Mult(mosaic));
+ }
+
+ double AvgGenomicSpan(const MosaicStructure& mosaic) const {
+ double avg = 0.;
+ for (StrandRange r : interval_index_.Occurences(mosaic)) {
+ Range genomic_range = block_composition_.genome_coords(r);
+ avg += double(genomic_range.size());
+ }
+ return avg / double(interval_index_.Mult(mosaic));
+ }
+
+ vector<double> AvgGenomicInterLengths(const MosaicStructure& mosaic) const {
+ vector<double> answer(mosaic.block_size() - 1, 0.);
+ for (StrandRange r : interval_index_.Occurences(mosaic)) {
+ bool strand = r.second;
+ vector<Pos> support = FindSupportBlocks(r.first, strand, mosaic);
+ for (size_t i = 1; i < support.size(); ++i) {
+ answer[i - 1] += double(block_composition_.genome_coords(Range(
+ support[i - 1] + 1,
+ support[i]), strand).size());
+ }
+ }
+ for (size_t i = 0; i < answer.size(); ++i) {
+ answer[i] /= double(interval_index_.Mult(mosaic));
+ }
+ return answer;
+ }
+
+ size_t length(Block b) const {
+ return block_info_.length(b);
+ }
+
+ size_t TotalBlockLength(const MosaicStructure& mosaic) const {
+ size_t block_length = 0;
+ for (Block b : mosaic.blocks()) {
+ block_length += length(b);
+ }
+ return block_length;
+ }
+
+ MosaicStructure GetStructure(const MosaicInterval& interval) const {
+ return MosaicStructure(block_composition_, interval);
+ }
+
+ const BlockInfoProvider& block_info() const {
+ return block_info_;
+ }
+
+ const GenomeBlockComposition& genome_composition() const {
+ return block_composition_;
+ }
+
+};
+
+class MosaicPrinter {
+public:
+ virtual void StartRecord(const MosaicStructure& /*mosaic*/) {
+
+ }
+
+ virtual void ReportSubMosaic(const MosaicStructure& /*mosaic*/, const vector<StrandRange>& /*ranges*/) {
+
+ }
+
+ virtual void EndRecord() {
+
+ }
+
+ virtual ~MosaicPrinter() {}
+};
+
+class TxtFileMosaicPrinter : public MosaicPrinter {
+ size_t cnt_;
+ const MosaicHelper& helper_;
+// const multimap<string, size_t>& different_irred_presence_;
+ ofstream out_;
+
+ void BlockInfo(Block b) {
+ out_ << b << " (" << helper_.length(b) << ")";
+ }
+
+public:
+
+ TxtFileMosaicPrinter(const MosaicHelper& helper,
+// const multimap<string, size_t>& different_irred_presence,
+ const string& filename) :
+ cnt_(0),
+ helper_(helper),
+// different_irred_presence_(different_irred_presence),
+ out_(filename) {
+
+ }
+
+ virtual void StartRecord(const MosaicStructure& mosaic) {
+ out_ << "Irreducible Mosaic " << cnt_++ << endl;
+ out_ << "Support block cnt = " << mosaic.block_size();
+ out_ << "; Total support block length = " << helper_.TotalBlockLength(mosaic);
+ out_ << "; Full mosaic multiplicity = " << helper_.Mult(mosaic);
+ out_ << "; Avg block span = " << helper_.AvgSpan(mosaic);
+ out_ << "; Avg genome span = " << helper_.AvgGenomicSpan(mosaic);
+ out_ << endl;
+ if (helper_.Mult(mosaic) > 1) {
+ out_ << "WARN! Full mosaic multiplicity = " << helper_.Mult(mosaic);
+ out_ << endl;
+ }
+ out_ << "Structure: ";
+ vector<double> inter_lengths = helper_.AvgGenomicInterLengths(mosaic);
+ BlockInfo(mosaic.blocks().front());
+ for (size_t i = 1; i < mosaic.block_size(); ++i) {
+ out_ << " |...";
+ out_ << inter_lengths[i - 1];
+ out_ << "...| ";
+ BlockInfo(mosaic.blocks()[i]);
+ }
+ out_ << endl;
+ out_ << "Sub_mosaics" << endl;
+ }
+
+ virtual void EndRecord() {
+ out_ << "............................" << endl;
+ }
+
+ virtual void ReportSubMosaic(const MosaicStructure& mosaic, const vector<StrandRange>& ranges) {
+ string finger = mosaic.Fingerprint();
+ out_ << "------" << endl;
+ out_ << "Sub_mosaic. Block cnt = " << mosaic.block_size() << endl;
+ out_ << "Blocks " << finger;
+// set<size_t> different_irred;
+// insert_all(different_irred, get_all(different_irred_presence_, finger));
+// out_ << " ; Found in " << different_irred.size() << " different irreducible mosaics";
+// string delim = " (";
+// for (size_t idx : different_irred) {
+// out_ << delim;
+// out_ << idx;
+// delim = ", ";
+// }
+// out_ << ")" << endl;
+
+ string delim = "";
+ out_ << "Ranges: ";
+ for (StrandRange r : ranges) {
+ out_ << delim;
+ out_ << "strand: " << (r.second ? "+" : "-") << " ";
+ out_ << helper_.genome_composition().genome_coords(r);
+ out_ << " (Pos: ";
+ out_ << (r.second ? r.first : helper_.genome_composition().ConjStrandRange(r).first);
+ out_ << ")";
+ delim = "; ";
+ }
+ out_ << endl;
+ }
+
+};
+
+class ParsableFormatPrinter : public MosaicPrinter {
+ size_t cnt_;
+ const MosaicHelper& helper_;
+ vector<pair<MosaicStructure, vector<StrandRange>>> submosaics_;
+ ofstream out_;
+
+ void BlockInfo(Block b) {
+ out_ << b << " " << helper_.length(b) << endl;
+ }
+
+public:
+
+ ParsableFormatPrinter(const MosaicHelper& helper,
+ const string& filename) :
+ cnt_(0),
+ helper_(helper),
+ out_(filename) {
+
+ }
+
+ virtual void StartRecord(const MosaicStructure& mosaic) {
+ out_ << cnt_++ << endl; // (the index of the irreducible mosaic)
+ out_ << mosaic.block_size() << endl; // (number of the support blocks)
+ out_ << helper_.TotalBlockLength(mosaic) << endl; // (total length)
+// out_ << "; Full mosaic multiplicity = " << helper_.Mult(mosaic);
+// out_ << "; Avg block span = " << helper_.AvgSpan(mosaic);
+// out_ << "; Avg genome span = " << helper_.AvgGenomicSpan(mosaic);
+// out_ << endl;
+// if (helper_.Mult(mosaic) > 1) {
+// out_ << "WARN! Full mosaic multiplicity = " << helper_.Mult(mosaic);
+// out_ << endl;
+// }
+// out_ << "Structure: ";
+ vector<double> inter_lengths = helper_.AvgGenomicInterLengths(mosaic);
+ BlockInfo(mosaic.blocks().front());
+ for (size_t i = 1; i < mosaic.block_size(); ++i) {
+ out_ << inter_lengths[i - 1] << endl;
+ BlockInfo(mosaic.blocks()[i]);
+ }
+ submosaics_.clear();
+ }
+
+ virtual void ReportSubMosaic(const MosaicStructure& mosaic, const vector<StrandRange>& ranges) {
+ submosaics_.push_back(make_pair(mosaic, ranges));
+ }
+
+ virtual void EndRecord() {
+ out_ << submosaics_.size() << endl;
+ size_t cnt = 1;
+ for (auto pair : submosaics_) {
+ auto mosaic = pair.first;
+ auto ranges = pair.second;
+ //Example output line: "1 399 733 735 + 1630584 1634815" (sub_mosaic index 1, blocks 399 733 735, strand and genomic range "+ 1630584 1634815")
+// string finger = mosaic.Fingerprint();
+ // out_ << "Sub_mosaic. Block cnt = " << mosaic.block_size() << endl;
+ // out_ << "Blocks " << finger;
+ // out_ << " ; Found in " << get_all(different_irred_presence_, finger).size() << " different irreducible mosaics";
+ // string delim = " (";
+ // for (size_t idx : get_all(different_irred_presence_, finger)) {
+ // out_ << delim;
+ // out_ << idx;
+ // delim = ", ";
+ // }
+ // out_ << ")" << endl;
+
+ out_ << cnt++;
+ string delim = " ";
+ // out_ << "Ranges: ";
+ for (Block b : mosaic.blocks()) {
+ out_ << delim;
+ out_ << b;
+ }
+ for (StrandRange r : ranges) {
+ out_ << delim;
+ out_ << (r.second ? "+" : "-") << " ";
+ out_ << helper_.genome_composition().genome_coords(r);
+ // out_ << " (Pos: ";
+ // out_ << (r.second ? r.first : helper_.genome_composition().ConjStrandRange(r).first);
+ // out_ << ")";
+ // delim = "; ";
+ }
+ out_ << endl;
+ }
+ }
+
+};
+
+class NotTandemFilter : public func::Predicate<MosaicStructure> {
+ const BlockInfoProvider& block_info_;
+public:
+ NotTandemFilter(const BlockInfoProvider& block_info) :
+ block_info_(block_info) {
+
+ }
+
+ //todo use const references!!!
+ bool Check (MosaicStructure mosaic) const {
+ bag<Block> block_cnts;
+ for (Block b : mosaic.blocks()) {
+ block_cnts.put(b);
+ block_cnts.put(block_info_.conjugate(b));
+ }
+ size_t tandem_block_cnt = 0;
+ for (Block b : mosaic.blocks()) {
+ if (block_cnts.mult(b) > 1)
+ tandem_block_cnt += 1;
+ }
+ return tandem_block_cnt * 2 <= mosaic.block_size();
+ }
+
+};
+
+class LengthFilter : public func::Predicate<MosaicStructure> {
+ const MosaicHelper& helper_;
+ size_t min_span_length_;
+public:
+ LengthFilter(const MosaicHelper& helper, size_t min_span_length) :
+ helper_(helper),
+ min_span_length_(min_span_length) {
+
+ }
+
+ //todo use const references!!!
+ bool Check (MosaicStructure mosaic) const {
+ return size_t(math::round(helper_.AvgGenomicSpan(mosaic))) >= min_span_length_;
+ }
+
+};
+
+class FullMosaicTracker : public MosaicPrinter {
+// vector<vector<StrandRange>> full_mosaic_ranges_;
+ vector<StrandRange> full_mosaic_ranges_;
+
+ size_t curr_length_;
+public:
+ FullMosaicTracker() : curr_length_(0) {
+ }
+
+ virtual void StartRecord(const MosaicStructure& mosaic) {
+ curr_length_ = mosaic.block_size();
+ }
+
+ virtual void ReportSubMosaic(const MosaicStructure& mosaic, const vector<StrandRange>& ranges) {
+ if (mosaic.block_size() == curr_length_)
+ full_mosaic_ranges_.push_back(ranges.front());
+ }
+
+ const vector<StrandRange>& full_mosaic_ranges() const {
+ return full_mosaic_ranges_;
+ }
+
+};
+
+class AllRangesTracker : public MosaicPrinter {
+ vector<StrandRange> all_ranges_;
+
+public:
+ AllRangesTracker() {
+ }
+
+ virtual void ReportSubMosaic(const MosaicStructure& /*mosaic*/, const vector<StrandRange>& ranges) {
+ all_ranges_.push_back(ranges.front());
+ }
+
+ const vector<StrandRange>& all_ranges() const {
+ return all_ranges_;
+ }
+
+};
+
+class MosaicStructureSet {
+ const GenomeBlockComposition& block_composition_;
+ IntervalIndex& interval_index_;
+
+ vector<MosaicInterval> raw_intervals_;
+ vector<MosaicStructure> irreducible_structures_;
+// multimap<string, size_t> different_irred_presence_;
+
+ shared_ptr<func::Predicate<MosaicStructure>> filter_;
+ shared_ptr<func::Predicate<MosaicStructure>> sub_filter_;
+
+
+ MosaicStructure ConjugateMosaic(const MosaicStructure mosaic) const {
+ return mosaic.conjugate(block_composition_.block_info());
+ }
+
+// void CountDifferentIrred(const MosaicStructure& mosaic, size_t idx) {
+// for (size_t i = 0; i < mosaic.block_size(); ++i) {
+// for (size_t j = i; j < mosaic.block_size(); ++j) {
+// MosaicStructure sub_mosaic = mosaic.SubMosaic(i, j);
+// different_irred_presence_.insert(make_pair(sub_mosaic.Fingerprint(), idx));
+// different_irred_presence_.insert(
+// make_pair(ConjugateMosaic(sub_mosaic).Fingerprint(), idx));
+// }
+// }
+// }
+//
+// void CountDifferentIrred() {
+// for (size_t i = 0; i < irreducible_structures_.size(); ++i) {
+// CountDifferentIrred(irreducible_structures_[i], i);
+// }
+// }
+
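+ //A mosaic is accepted as irreducible only if it is not contained in any previously
+ //accepted structure or its conjugate (intervals are analyzed in order of decreasing support size, see Analysis()).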
+ bool AnalyzeStructure(const MosaicStructure& mosaic) {
+ for (auto& irred_struct : irreducible_structures_) {
+ if (mosaic.IsContainedIn(irred_struct)) {
+ DEBUG("Contained in some irred structure");
+ return false;
+ }
+ if (mosaic.IsContainedIn(ConjugateMosaic(irred_struct))) {
+ DEBUG("Contained in conjugate of some irred structure");
+ return false;
+ }
+ }
+ irreducible_structures_.push_back(mosaic);
+ return true;
+ }
+
+public:
+ MosaicStructureSet(const GenomeBlockComposition& block_composition,
+ IntervalIndex& interval_index) :
+ block_composition_(block_composition),
+ interval_index_(interval_index) {
+
+ }
+
+ void ProcessInterval(const MosaicInterval& interval) {
+ interval_index_.IndexSubIntervals(interval);
+ if (interval.support_size() > 1) {
+ raw_intervals_.push_back(interval);
+ }
+ }
+
+// const multimap<string, size_t>& different_irred_presence() const {
+// return different_irred_presence_;
+// }
+
+ void Analysis() {
+ INFO("Sorting raw intervals");
+ std::sort(
+ raw_intervals_.begin(),
+ raw_intervals_.end(),
+ [](const MosaicInterval& a, const MosaicInterval& b) {
+ return a.support_blocks.size() > b.support_blocks.size();
+ });
+ INFO("Analyzing sorted intervals");
+ for (const MosaicInterval& interval : raw_intervals_) {
+ AnalyzeStructure(MosaicStructure(block_composition_, interval));
+ }
+ INFO("Counting distinct irreducible");
+ //CountDifferentIrred();
+ }
+
+ void set_structure_filter(shared_ptr<func::Predicate<MosaicStructure>> filter) {
+ filter_ = filter;
+ }
+
+ void set_substructure_filter(shared_ptr<func::Predicate<MosaicStructure>> sub_filter) {
+ sub_filter_ = sub_filter;
+ }
+
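+ //For each irreducible mosaic passing the structure filter, reports every sub-mosaic
+ //occurrence at most once; occurrences whose conjugate range was already reported are skipped.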
+ void Report(MosaicPrinter& printer) {
+ for (size_t i = 0; i < irreducible_structures_.size(); ++i) {
+ MosaicStructure mosaic = irreducible_structures_[i];
+ if (!filter_ || filter_->Check(mosaic)) {
+ printer.StartRecord(mosaic);
+ set<StrandRange> reported_ranges;
+ for (const MosaicStructure& submosaic: mosaic.SubMosaics()) {
+ vector<StrandRange> ranges = interval_index_
+ .UnReportedOccurences(submosaic, reported_ranges);
+ if (ranges.empty())
+ continue;
+ if (!sub_filter_ || sub_filter_->Check(mosaic))
+ printer.ReportSubMosaic(submosaic, ranges);
+ insert_all(reported_ranges, ranges);
+ }
+ printer.EndRecord();
+ }
+ }
+ }
+
+ void Report(const vector<shared_ptr<MosaicPrinter>>& printers) {
+ for (auto printer_ptr : printers) {
+ Report(*printer_ptr);
+ }
+ }
+};
+
+//todo reduce duplication
+const io::SingleRead MakeRead(const string& read, const string& name = "") {
+ //todo fill with good quality
+ std::string qual;
+ qual.resize(read.size());
+ return io::SingleRead(name, read, qual);
+}
+
+const vector<io::SingleRead> MakeReads(const vector<string>& reads) {
+ vector<io::SingleRead> ans;
+ for (size_t i = 0; i < reads.size(); ++i) {
+ ans.push_back(MakeRead(reads[i]));
+ }
+ return ans;
+}
+
+const vector<io::SingleRead> MakeReads(const vector<string>& reads, const vector<string>& names) {
+ vector<io::SingleRead> ans;
+ for (size_t i = 0; i < reads.size(); ++i) {
+ ans.push_back(MakeRead(reads[i], names[i]));
+ }
+ return ans;
+}
+
+vector<string> mosaic_names(size_t n) {
+ vector<string> ans;
+ for (size_t i = 0; i < n; ++i) {
+ ans.push_back("mosaic_" + ToString(i));
+ }
+ return ans;
+}
+
+shared_ptr<io::ReadStream<io::SingleRead>> StreamInstance(const vector<string>& sequences) {
+ return make_shared<io::VectorReadStream<io::SingleRead>>(MakeReads(sequences));
+}
+
+shared_ptr<io::ReadStream<io::SingleRead>> StreamInstance(const vector<string>& sequences, const vector<string>& names) {
+ return make_shared<io::VectorReadStream<io::SingleRead>>(MakeReads(sequences, names));
+}
+//multimap<string, StrandRange> all_substruct_pos_;
+//const GenomeBlockComposition& block_composition_;
+
+string ExtractSequence(const StrandRange& strand_range, const GenomeBlockComposition& block_composition) {
+ const BlockInfoProvider& block_info = block_composition.block_info();
+ const Graph& g = block_info.g();
+ vector<Graph::EdgeId> edges;
+ Range range = strand_range.first;
+ bool strand = strand_range.second;
+ for (size_t i = range.start_pos; i < range.end_pos; ++i) {
+ edges.push_back(block_info.edge_id(block_composition.block(i, strand)));
+ }
+ return MergeSequences(g, edges).str();
+}
+
+vector<string> ExtractSequences(const vector<StrandRange>& strand_ranges, const GenomeBlockComposition& block_composition) {
+ vector<string> answer;
+ for (StrandRange range : strand_ranges) {
+ answer.push_back(ExtractSequence(range, block_composition));
+ }
+ return answer;
+}
+
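+//Builds a de Bruijn graph from the sequences of all reported ranges, threads the full
+//mosaic sequences through it and writes per-component pictures to mosaic_pics/.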
+void DrawGraph(const vector<StrandRange>& all_ranges,
+ const vector<StrandRange>& full_mosaic_ranges,
+ const GenomeBlockComposition& block_composition) {
+ make_dir("tmp");
+ graph_pack<Graph, runtime_k::RtSeq> gp(block_composition.block_info().g().k(), "tmp", 0);
+
+ auto stream = io::RCWrap(StreamInstance(ExtractSequences(all_ranges, block_composition)));
+ auto streams = io::ReadStreamList<io::SingleRead>(stream);
+// ConstructGraphUsingOldIndex(streams, gp.g, gp.index);
+ ConstructGraph(config::debruijn_config::construction(), streams, gp.g, gp.index);
+
+ auto full_mosaic_pos_stream = io::RCWrap(StreamInstance(ExtractSequences(full_mosaic_ranges, block_composition), mosaic_names(full_mosaic_ranges.size())));
+ INFO("Threading " << full_mosaic_ranges.size() << " full mosaics");
+ FillPos(gp, *full_mosaic_pos_stream);
+
+ omnigraph::DefaultLabeler<Graph> labeler(gp.g, gp.edge_pos);
+
+ shared_ptr<GraphSplitter<Graph>> splitter = omnigraph::ReliableSplitter(gp.g,
+ numeric_limits<size_t>::max(),
+ 50
+ /*numeric_limits<size_t>::max()*/);
+
+ path::remove_if_exists("mosaic_pics");
+ path::make_dir("mosaic_pics");
+ INFO("Writing components");
+ omnigraph::visualization::WriteComponents(gp.g, "mosaic_pics/", splitter,
+ omnigraph::visualization::DefaultColorer(gp.g), labeler);
+ INFO("Components written");
+}
+
+class MosaicStructureAnalyzer {
+ const BlockInfoProvider& block_info_;
+ const GenomeBlockComposition& block_composition_;
+
+ size_t min_support_block_length_;
+ size_t max_support_block_multiplicity_;
+ size_t max_inter_block_length_;
+ size_t min_reportable_mosaic_length_;
+ size_t min_reportable_submosaic_length_;
+ string folder_;
+
+ Pos curr_pos_;
+
+ bool CheckSupporting(Block b) {
+ size_t mult = block_composition_.multiplicity(b);
+ return mult > 1 && block_info_.length(b) > min_support_block_length_
+ && mult < max_support_block_multiplicity_;
+ }
+
+ bool CheckSupporting() {
+ return CheckSupporting(block_composition_.block(curr_pos_, true));
+ }
+
+ /*
+ * returns the distance to the next support block, or -1u if none was found
+ * curr_pos_ is updated as a side effect!
+ */
+ size_t MoveToNextSupportBlock() {
+ Pos init_pos = curr_pos_;
+ curr_pos_++;
+ while (curr_pos_ < block_composition_.size() && !CheckSupporting()) {
+ curr_pos_++;
+ }
+ return curr_pos_ != block_composition_.size()
+ ? block_composition_.genome_coords(curr_pos_, true).start_pos
+ - block_composition_.genome_coords(init_pos, true).end_pos
+ : -1u;
+ }
+
+public:
+ MosaicStructureAnalyzer(const GenomeBlockComposition& block_composition,
+ size_t min_support_length, size_t max_support_mult,
+ size_t max_inter_length, size_t min_reportable_mosaic_length,
+ size_t min_reportable_submosaic_length,
+ const string& folder)
+ : block_info_(block_composition.block_info()),
+ block_composition_(block_composition),
+ min_support_block_length_(min_support_length),
+ max_support_block_multiplicity_(max_support_mult),
+ max_inter_block_length_(max_inter_length),
+ min_reportable_mosaic_length_(min_reportable_mosaic_length),
+ min_reportable_submosaic_length_(min_reportable_submosaic_length),
+ folder_(folder),
+ curr_pos_(0) {
+ }
+
+ void Analyze() {
+ IntervalIndex interval_index(block_composition_);
+ MosaicHelper helper(block_composition_,
+ interval_index);
+
+ MosaicStructureSet interval_set(block_composition_, interval_index);
+ INFO("Collecting mosaic intervals");
+ MoveToNextSupportBlock();
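+ //Greedily extend each interval while the genomic gap to the next support block stays below max_inter_block_length_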
+ while (curr_pos_ < block_composition_.size()) {
+ MosaicInterval interval(true, curr_pos_);
+ while (MoveToNextSupportBlock() < max_inter_block_length_) {
+ interval.pos_range.end_pos = (curr_pos_ + 1);
+ interval.support_blocks.push_back(curr_pos_);
+ }
+ interval_set.ProcessInterval(interval);
+ }
+ INFO("Analyzing intervals and forming mosaic structures");
+ interval_set.Analysis();
+ INFO("Reporting mosaic structures");
+
+ //might have problems if only the largest occurrence is a tandem, making the whole mosaic invalid
+ //todo magic constant!
+ auto filter = func::And<MosaicStructure>(make_shared<NotTandemFilter>(block_info_),
+ make_shared<LengthFilter>(helper, min_reportable_mosaic_length_));
+// auto filter = make_shared<func::AlwaysTrue<MosaicStructure>>();
+
+ interval_set.set_structure_filter(filter);
+ //todo move filter to set constructor
+
+ //pics
+ FullMosaicTracker tracker;
+ interval_set.Report(tracker);
+ vector<StrandRange> full_mosaic_ranges = tracker.full_mosaic_ranges();
+ AllRangesTracker all_tracker;
+ interval_set.Report(all_tracker);
+ DrawGraph(all_tracker.all_ranges(), full_mosaic_ranges, block_composition_);
+ //end pics
+
+ interval_set.set_substructure_filter(make_shared<LengthFilter>(helper, min_reportable_submosaic_length_));
+
+ ParsableFormatPrinter parsable_printer(helper, folder_ + "mosaic_to_parse.txt");
+ interval_set.Report(parsable_printer);
+
+ TxtFileMosaicPrinter readable_printer(helper, /*interval_set.different_irred_presence(),*/
+ folder_ + "mosaic_to_read.txt");
+ interval_set.Report(readable_printer);
+
+ }
+
+};
+
+template<class gp_t>
+void PerformMosaicAnalysis(const gp_t& gp, const MappingPath<EdgeId> mapping_path, const Sequence& genome,
+ size_t min_support_length, size_t max_support_mult,
+ size_t max_inter_length, size_t min_reportable_mosaic_length,
+ size_t min_reportable_submosaic_length, const string& folder) {
+ BlockInfoProvider block_info(gp.g);
+ GenomeBlockComposition block_composition(gp.g, mapping_path, genome.size() - gp.g.k(), block_info);
+
+ MosaicStructureAnalyzer analyzer(block_composition, min_support_length, max_support_mult,
+ max_inter_length, min_reportable_mosaic_length,
+ min_reportable_submosaic_length, folder);
+ analyzer.Analyze();
+}
+
+}
+}
diff --git a/src/projects/cap/path_projector.hpp b/src/projects/cap/path_projector.hpp
new file mode 100644
index 0000000..0fa0dad
--- /dev/null
+++ b/src/projects/cap/path_projector.hpp
@@ -0,0 +1,445 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#include <iostream>
+#include <vector>
+
+#include "coordinates_handler.hpp"
+//#include "coloring.hpp"
+#include "assembly_graph/graph_support/graph_processing_algorithm.hpp"
+
+namespace cap {
+
+/*
+ * ConditionedSmartSetIterator acts much like SmartSetIterator but, unlike the latter,
+ * one can (and must) provide a merge handler that decides whether a merged edge is added to
+ * the set being iterated or not (extending the add_new_ parameter logic of SmartIterator).
+ * It can also be `reset`, i.e. restarted from the begin-iterator with respect to
+ * added and deleted values.
+ * MergeHandler class/struct must provide:
+ * bool operator()(const std::vector<ElementId> &, ElementId)
+ */
+template<class Graph, typename ElementId, class MergeHandler>
+class ConditionedSmartSetIterator : public SmartSetIterator<Graph, ElementId> {
+ typedef SmartSetIterator<Graph, ElementId> base;
+
+ MergeHandler &merge_handler_;
+ std::unordered_set<ElementId> true_elements_;
+
+ public:
+
+ template <class Iterator>
+ ConditionedSmartSetIterator(const Graph &graph, Iterator begin, Iterator end,
+ MergeHandler &merge_handler)
+ : SmartSetIterator<Graph, ElementId>(graph, begin, end),
+ merge_handler_(merge_handler),
+ true_elements_() {
+
+ for (auto it = begin; it != end; ++it) {
+ true_elements_.insert(*it);
+ }
+ }
+
+ void HandleAdd(ElementId v) override {
+ TRACE("handleAdd " << this->g().str(v));
+ if (true_elements_.count(v)) {
+ this->push(v);
+ }
+ }
+
+ void HandleDelete(ElementId v) override {
+ TRACE("handleDel " << this->g().str(v));
+ base::HandleDelete(v);
+ true_elements_.erase(v);
+ }
+
+ void HandleMerge(const std::vector<ElementId>& old_edges, ElementId new_edge) override {
+ TRACE("handleMer " << this->g().str(new_edge));
+ if (merge_handler_(old_edges, new_edge)) {
+ true_elements_.insert(new_edge);
+ }
+ }
+
+private:
+ DECL_LOGGER("ConditionedSmartSetIterator");
+};
+
+template <class Graph>
+class PathProjector {
+ public:
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+ typedef std::vector<EdgeId> Path;
+ typedef typename CoordinatesHandler<Graph>::PosArray PosArray;
+ typedef unsigned uint;
+
+ PathProjector(Graph &g, CoordinatesHandler<Graph> &coordinates_handler)
+ : g_(g),
+ coordinates_handler_(coordinates_handler),
+ edge_remover_(g),
+ is_deleting_locked_(false) {
+ }
+
+ virtual std::vector<Path> FilterPaths(const std::vector<Path> &paths) const {
+ return paths;
+ }
+ virtual std::vector<PosArray> GetThreadsToDelete(
+ const std::vector<Path> &paths) const {
+ std::vector<PosArray> threads_to_delete;
+ for (const auto &path : paths) {
+ threads_to_delete.push_back(coordinates_handler_.GetContiguousThreads(path));
+ }
+ return threads_to_delete;
+ }
+
+ virtual size_t ChooseBasePath(const std::vector<Path> &paths,
+ const std::vector<PosArray> &threads_to_delete) const {
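+ // A "bridge" is an edge whose multiplicity equals the given thread count; the base path
+ // is the one with the fewest bridges, ties broken by the larger multiplicity sum.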
+ std::vector<size_t> num_bridges;
+ std::vector<size_t> sum_multiplicities;
+ for (const auto &path : paths) {
+ num_bridges.push_back(CalcBridges(path, threads_to_delete.back().size()));
+ sum_multiplicities.push_back(CalcMultiplicitySum(path));
+ }
+
+ size_t chosen_path = 0;
+ for (size_t i = 1; i < paths.size(); ++i) {
+ if (num_bridges[i] < num_bridges[chosen_path] ||
+ (num_bridges[i] == num_bridges[chosen_path] &&
+ sum_multiplicities[i] > sum_multiplicities[chosen_path])) {
+ chosen_path = i;
+ }
+ }
+
+ return chosen_path;
+ }
+
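+ // Chooses a base path and projects all other paths (and their reverse complements) onto it;
+ // returns false when the collapse cannot be performed safely.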
+ bool CollapsePaths(const std::vector<Path> &paths_to_collapse) {
+ TRACE("CollapsePaths Begin");
+
+ const std::vector<Path> &paths = FilterPaths(paths_to_collapse);
+ if (CheckPresenceOfSelfContiguousEdges(paths))
+ return false;
+
+ std::vector<PosArray> threads_to_delete =
+ GetThreadsToDelete(paths);
+ size_t chosen_path = ChooseBasePath(paths, threads_to_delete);
+
+ // RC paths
+ std::vector<Path> rc_paths;
+ for (const auto &path : paths) {
+ Path rc_path;
+ for (auto it = path.rbegin(); it != path.rend(); ++it) {
+ rc_path.push_back(g_.conjugate(*it));
+ }
+ rc_paths.push_back(rc_path);
+ }
+ auto rc_threads = GetThreadsToDelete(rc_paths);
+
+ // We restrict merging RC and original genome threads
+ if (!CheckForRCAbsence(threads_to_delete))
+ return false;
+ if (!CheckForCorrectPaths(paths))
+ return false;
+
+
+ if (!CheckDeletionOfIntouchables(paths, threads_to_delete, chosen_path))
+ return false;
+ std::unordered_set<size_t> bad_paths = GetIntersectingPaths(paths,
+ chosen_path);
+
+ if(false && bad_paths.size() == paths.size() - 1) {
+ for (size_t i = 0; i < paths.size(); ++i) {
+ INFO("failed: paths " << Debug(paths[i]) << " and " << Debug(rc_paths[i]));
+ for (const auto &e : threads_to_delete[i])
+ INFO("to delete 1: " << int(e.first) << " - " << e.second);
+ for (const auto &e : rc_threads[i])
+ INFO("to delete 2: " << int(e.first) << " - " << e.second);
+ INFO("---------paths1:");
+ for (const auto &e : paths[i])
+ coordinates_handler_.DebugOutput(e);
+ INFO("---------paths2:");
+ for (const auto &e : rc_paths[i])
+ coordinates_handler_.DebugOutput(e);
+ }
+ VERIFY(false);
+ }
+
+ DEBUG("Collapsing paths:");
+ for (size_t i = 0; i < paths.size(); ++i) {
+ if (i == chosen_path)
+ continue;
+
+ DEBUG(debug::Debug(g_, paths[i]));
+ for (const auto &t : threads_to_delete[i]) {
+ DEBUG("" << int(t.first) << " " << debug::PrintComplexPosition(t.second));
+ }
+ DEBUG(debug::Debug(g_, rc_paths[i]));
+ for (const auto &t : rc_threads[i]) {
+ DEBUG("" << int(t.first) << " " << debug::PrintComplexPosition(t.second));
+ }
+ }
+ {
+ size_t i = chosen_path;
+ DEBUG(debug::Debug(g_, paths[i]));
+ for (const auto &t : threads_to_delete[i]) {
+ DEBUG("" << int(t.first) << " " << debug::PrintComplexPosition(t.second));
+ }
+ DEBUG(debug::Debug(g_, rc_paths[i]));
+ for (const auto &t : rc_threads[i]) {
+ DEBUG("" << int(t.first) << " " << debug::PrintComplexPosition(t.second));
+ }
+ }
+
+ LockDelete();
+ coordinates_handler_.LockChanges();
+ for (size_t i = 0; i < paths.size(); ++i) {
+ /*
+ if (threads_to_delete[i].size() != rc_threads[i].size()) {
+ INFO("failed: paths " << Debug(paths[i]) << " and " << Debug(rc_paths[i]));
+ for (const auto &e : threads_to_delete[i])
+ INFO("to delete 1: " << int(e.first) << " - " << e.second);
+ for (const auto &e : rc_threads[i])
+ INFO("to delete 2: " << int(e.first) << " - " << e.second);
+ INFO("---------paths1:");
+ for (const auto &e : paths[i])
+ coordinates_handler_.DebugOutput(e);
+ INFO("---------paths2:");
+ for (const auto &e : rc_paths[i])
+ coordinates_handler_.DebugOutput(e);
+ VERIFY(false);
+ }
+ */
+
+ if (i == chosen_path) continue;
+ if (bad_paths.count(i) > 0) continue;
+
+ if (threads_to_delete[i].size() == 0) {
+ TRACE("nothin to delete!");
+ continue;
+ }
+ bool success = true;
+
+ success &= ProjectPath(paths[i], paths[chosen_path], threads_to_delete[i]);
+ success &= ProjectPath(rc_paths[i], rc_paths[chosen_path], rc_threads[i]);
+
+ if (!success) {
+ ClearDeleteList();
+ coordinates_handler_.UnrollChanges();
+ break;
+ }
+ }
+ coordinates_handler_.ReleaseChanges();
+ ReleaseDelete();
+
+ TRACE("CollapsePaths End");
+ return true;
+ }
+
+ bool ProjectPath(const Path &from, const Path &to,
+ const PosArray &threads_to_delete) {
+ const bool success = coordinates_handler_.ProjectPath(
+ from, to, threads_to_delete);
+
+ if (!success)
+ return false;
+
+ for (const auto e : from) {
+ if (coordinates_handler_.GetMultiplicity(e) == 0) {
+ DeleteEdge(e);
+ }
+ }
+
+ return true;
+ }
+
+ bool ProjectPath(const Path &from, const Path &to) {
+ const std::vector<std::pair<uint, size_t> > threads_to_delete =
+ coordinates_handler_.GetContiguousThreads(from);
+ return ProjectPath(from, to, threads_to_delete);
+ }
+
+
+ private:
+ class DeletingMergeHandler {
+ public:
+ DeletingMergeHandler(const std::vector<EdgeId> &to_delete)
+ : delete_set_(to_delete.begin(), to_delete.end()) {
+ }
+
+ bool operator()(const std::vector<EdgeId> &old_edges, EdgeId new_edge) {
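+ // NB: currently always returns false, so merged edges are never re-added to the delete set;
+ // the logic below is unreachable.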
+ return false;
+ bool ret = true;
+ for (auto it = old_edges.begin(); it != old_edges.end(); ++it) {
+ ret &= bool(delete_set_.count(*it));
+ delete_set_.erase(*it);
+ }
+ if (ret) {
+ delete_set_.insert(new_edge);
+ }
+ return ret;
+ }
+
+ private:
+ std::unordered_set<EdgeId> delete_set_;
+ };
+
+ bool CheckForRCAbsence(
+ const std::vector<std::vector<std::pair<uint, size_t> > > &threads) {
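+ // NB: currently always returns true, so the RC-collision check below is effectively disabled.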
+ return true;
+ // We use that #of thread = 2 * genome_num + RC
+ std::vector<uint> genomes;
+ for (const auto &entry : threads) {
+ for (const auto &pos : entry) {
+ genomes.push_back(pos.first);
+ }
+ }
+ std::sort(genomes.begin(), genomes.end());
+ for (size_t i = 1; i < genomes.size(); ++i) {
+ if (genomes[i] == genomes[i - 1] + 1 &&
+ (genomes[i] & 1) == 1) {
+ return false;
+ }
+ }
+
+ return true;
+ }
+
+ bool CheckForCorrectPaths(const std::vector<Path> &paths) {
+ for (size_t i = 1; i < paths.size(); ++i) {
+ if (!coordinates_handler_.CheckCorrectPathProjection(paths[i - 1],
+ paths[i]))
+ return false;
+ }
+ return true;
+ }
+
+ std::unordered_set<size_t> GetIntersectingPaths(const std::vector<Path> &paths,
+ const size_t chosen_path) {
+ std::unordered_set<EdgeId> intouchable_edges;
+ for (const auto e : paths[chosen_path]) {
+ intouchable_edges.insert(e);
+ }
+
+ std::unordered_set<size_t> result;
+ for (size_t i = 0; i < paths.size(); ++i) {
+ if (i == chosen_path) continue;
+
+ for (const auto e : paths[i]) {
+ if (intouchable_edges.count(e) > 0) {
+ result.insert(i);
+ break;
+ }
+ }
+ }
+
+ return result;
+ }
+
+ bool CheckDeletionOfIntouchables(const std::vector<Path> &paths,
+ const std::vector<std::vector<std::pair<uint, size_t> > > &del_threads,
+ const size_t chosen_path) const {
+ std::unordered_set<EdgeId> intouchable_edges;
+ for (const auto e : paths[chosen_path]) {
+ intouchable_edges.insert(e);
+ intouchable_edges.insert(g_.conjugate(e));
+ }
+
+ for (size_t i = 0; i < paths.size(); ++i) {
+ if (i == chosen_path) continue;
+ const Path &path = paths[i];
+
+ for (const auto e : path) {
+ if (coordinates_handler_.GetMultiplicity(e) == del_threads[i].size()) {
+ if (intouchable_edges.count(e) > 0) {
+ return false;
+ }
+ }
+ }
+ }
+
+ return true;
+ }
+
+ bool CheckPresenceOfSelfContiguousEdges(const std::vector<Path> &paths) const {
+ for (const auto &path : paths) {
+ for (const auto e : path) {
+ if (g_.conjugate(e) == e)
+ return true;
+ }
+ }
+
+ return false;
+ }
+
+ size_t CalcBridges(const Path &path, const size_t thin_multiplicity) const {
+ size_t bridges = 0;
+ for (const auto e : path)
+ bridges += coordinates_handler_.GetMultiplicity(e) == thin_multiplicity;
+
+ return bridges;
+ }
+
+ size_t CalcMultiplicitySum(const Path &path) const {
+ size_t multiplicity_sum = 0;
+ for (const auto e : path)
+ multiplicity_sum += coordinates_handler_.GetMultiplicity(e);
+
+ return multiplicity_sum;
+ }
+
+ void DeleteEdge(const EdgeId edge) {
+ edges_to_delete_.push_back(edge);
+ if (!is_deleting_locked_)
+ ReleaseDelete();
+ }
+
+ void ForceDeleteEdges(const std::vector<EdgeId> &edges) {
+ VERIFY(!is_deleting_locked_);
+
+ //TRACE("DeleteEdges Begin " << Debug(edges) << " of size " << edges.size());
+ DeletingMergeHandler merge_handler(edges);
+ ConditionedSmartSetIterator<Graph, EdgeId, DeletingMergeHandler> smart_it(
+ g_, edges.begin(), edges.end(), merge_handler);
+
+ for (; !smart_it.IsEnd(); ++smart_it) {
+ edge_remover_.DeleteEdge(*smart_it);
+ }
+ TRACE("DeleteEdges End");
+ }
+
+ void LockDelete() {
+ is_deleting_locked_ = true;
+ }
+ void ReleaseDelete() {
+ is_deleting_locked_ = false;
+ ForceDeleteEdges(edges_to_delete_);
+ ClearDeleteList();
+ }
+ void ClearDeleteList() {
+ edges_to_delete_.clear();
+ }
+ template<class T>
+ std::string Debug(const std::vector<T> &p) {
+ std::stringstream ss;
+ for (const auto &x : p) {
+ ss << g_.str(x) << ";";
+ }
+ return ss.str();
+ }
+
+ Graph &g_;
+ CoordinatesHandler<Graph> &coordinates_handler_;
+ EdgeRemover<Graph> edge_remover_;
+
+ bool is_deleting_locked_;
+ std::vector<EdgeId> edges_to_delete_;
+
+ DECL_LOGGER("PathProjector")
+ ;
+};
+
+}
diff --git a/src/projects/cap/polynomial_hash.hpp b/src/projects/cap/polynomial_hash.hpp
new file mode 100644
index 0000000..f15d9dd
--- /dev/null
+++ b/src/projects/cap/polynomial_hash.hpp
@@ -0,0 +1,404 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include <cstdlib>
+#include <cstdint>
+#include <algorithm>
+
+namespace cap {
+
+namespace precalc {
+
+template <uint64_t base, uint64_t degree, class Enable = void>
+struct Power {
+};
+template <uint64_t base, uint64_t degree>
+struct Power<base, degree, typename boost::enable_if_c<degree % 2 == 1>::type > {
+ enum { value = Power<base * base, degree / 2>::value * base };
+};
+template <uint64_t base, uint64_t degree>
+struct Power<base, degree, typename boost::enable_if_c<degree % 2 == 0>::type > {
+ enum { value = Power<base * base, degree / 2>::value };
+};
+
+template <uint64_t base>
+struct Power<base, 0, void> {
+ enum { value = 1 };
+};
+
+template <uint64_t base>
+struct Power<base, 1, void> {
+ enum { value = base };
+};
+
+}
+
+namespace utils {
+
+template <class T>
+inline T FastPow(T base, size_t pow) {
+ if (pow == 0) {
+ return 1;
+ }
+ if (pow & 1) {
+ return FastPow(base * base, pow / 2) * base;
+ }
+ return FastPow(base * base, pow / 2);
+}
+
+template <size_t ind>
+struct PrimeHolder {
+};
+template <>
+struct PrimeHolder<0> {
+ enum { val = 239 };
+};
+template <>
+struct PrimeHolder<1> {
+ enum { val = 241 };
+};
+template <>
+struct PrimeHolder<2> {
+ enum { val = 251 };
+};
+template <>
+struct PrimeHolder<3> {
+ enum { val = 257 };
+};
+template <>
+struct PrimeHolder<4> {
+ enum { val = 263 };
+};
+template <>
+struct PrimeHolder<5> {
+ enum { val = 269 };
+};
+/*
+template <size_t ind>
+class PrimeHolder {
+ enum { val = ({239, 241, 251, 257, 263, 269, 271, 277, 281, 283})[ind] };
+};
+*/
+
+}
+
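+//Rolling polynomial hash of a fixed-length window (kHashPDeg = prime^(polydeg - 1)):
+//Update() shifts a new symbol in at the low end, UpdateBack() at the high end;
+//with the default HashT = uint64_t all arithmetic is modulo 2^64.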
+template <size_t prime = 239, class HashT = uint64_t>
+class PolynomialHash : private boost::noncopyable {
+ static const size_t kHashP = prime;
+ HashT kHashPDeg;
+ HashT hash_;
+// uint8_t last_chars_; // assume every char <= 2bits (preferably)
+
+ inline static HashT GenPolyDeg(unsigned polydeg) {
+ return utils::FastPow<HashT>(prime, polydeg - 1);
+ }
+
+ explicit PolynomialHash()
+ : kHashPDeg(0),
+ hash_(0) {
+ }
+
+ public:
+ typedef HashT DataType;
+
+ PolynomialHash(unsigned polydeg)
+ : kHashPDeg(GenPolyDeg(polydeg)),
+ hash_(0)/*,
+ last_chars_(0)*/ {
+ }
+
+ PolynomialHash(unsigned polydeg, const PolynomialHash<prime, HashT> &other)
+ : kHashPDeg(GenPolyDeg(polydeg)),
+ hash_(other.hash_)/*,
+ last_chars_(0)*/ {
+ }
+
+ inline void CopyFrom(unsigned polydeg, const PolynomialHash<prime, HashT> &other) {
+ kHashPDeg = GenPolyDeg(polydeg);
+ hash_ = other.hash_;
+ //last_chars_ = other.last_chars_;
+ }
+
+
+ inline void Update(HashT front) {
+ hash_ = hash_ * kHashP + front;
+ //last_chars_ = (last_chars_ << 2) ^ (front & 8);
+ }
+ inline void Update(HashT front, HashT back) {
+ hash_ = (hash_ - back * kHashPDeg) * kHashP + front;
+ //last_chars_ = (last_chars_ << 2) ^ (front & 8);
+ }
+ inline void UpdateBack(HashT back) {
+ hash_ = hash_ + kHashPDeg * back;
+ //last_chars_ >>= 2;
+ }
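+ //for odd kHashP, kHashP^(2^63 - 1) == kHashP^(-1) modulo 2^64, so the multiplication below
+ //effectively divides by kHashP, undoing one Update() step before prepending `back` at the high end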
+ inline void UpdateBack(HashT back, HashT front) {
+ hash_ = (hash_ - front) * precalc::Power<kHashP, (1ull << 63) - 1>::value + back * kHashPDeg;
+ //last_chars_ >>= 2;
+ }
+ inline HashT GetHash() const {
+ return hash_;
+ }
+ inline HashT GetHash(HashT /*seed*/) const {
+ return hash_;// + last_chars_ * seed * kHashPDeg * kHashP;
+ }
+
+ inline bool operator ==(const PolynomialHash<prime, HashT> &other) const {
+ return hash_ == other.hash_;
+ }
+ inline bool operator !=(const PolynomialHash<prime, HashT> &other) const {
+ return !(operator==(other));
+ }
+};
+
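+//Fixed-size tuple of hash values built by template recursion; set<pos>() and get<pos>()
+//are resolved at compile time via enable_if on the position.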
+template <size_t size, class StorageT>
+class HashTuple {
+ typedef HashTuple<size - 1, StorageT> ChildClass;
+
+ ChildClass child_data_;
+ StorageT data_;
+
+ public:
+ HashTuple() : child_data_(), data_() {
+ }
+ HashTuple(ChildClass child_data, StorageT data)
+ : child_data_(child_data),
+ data_(data) {
+ }
+ template <size_t pos>
+ inline void set(StorageT data, typename boost::enable_if_c<pos == size - 1>::type* = 0) {
+ data_ = data;
+ }
+ template <size_t pos>
+ inline void set(StorageT data, typename boost::enable_if_c<pos != size - 1>::type* = 0) {
+ child_data_.template set<pos>(data);
+ }
+
+ template <size_t pos>
+ inline StorageT get(typename boost::enable_if_c<pos == size - 1>::type* = 0) {
+ return data_;
+ }
+ template <size_t pos>
+ inline StorageT get(typename boost::enable_if_c<pos != size - 1>::type* = 0) {
+ return child_data_.template get<pos>();
+ }
+};
+
+template <class StorageT>
+class HashTuple<1, StorageT> {
+ StorageT data_;
+
+ public:
+ HashTuple() : data_() {
+ }
+ HashTuple(StorageT data) : data_(data) {
+ }
+
+ template <size_t pos>
+ inline void set(StorageT data) {
+ data_ = data;
+ }
+ template <size_t pos>
+ inline StorageT get() const {
+ return data_;
+ }
+};
+
+// nobody knows how to do it in a good way
+template <size_t size, class HashT = uint64_t>
+class MultiPolynomialHash {
+ typedef MultiPolynomialHash<size - 1, HashT> ChildClass;
+
+ ChildClass child_hash_;
+ PolynomialHash<utils::PrimeHolder<size - 1>::val, HashT> hash_;
+
+ public:
+ typedef HashTuple<size, HashT> DataType;
+ typedef HashT AtomType;
+
+ MultiPolynomialHash(unsigned polydeg) : child_hash_(polydeg), hash_(polydeg) {
+ }
+
+ MultiPolynomialHash(unsigned polydeg, const MultiPolynomialHash<size, HashT> &other)
+ : child_hash_(polydeg, other.child_hash_),
+ hash_(polydeg, other.hash_) {
+ }
+ inline void CopyFrom(unsigned polydeg, const MultiPolynomialHash<size, HashT> &other) {
+ child_hash_.CopyFrom(polydeg, other.child_hash_);
+ hash_.CopyFrom(polydeg, other.hash_);
+ };
+
+ inline void Update(HashT front) {
+ child_hash_.Update(front);
+ hash_.Update(front);
+ }
+ inline void Update(HashT front, HashT back) {
+ child_hash_.Update(front, back);
+ hash_.Update(front, back);
+ }
+ inline void UpdateBack(HashT back) {
+ child_hash_.UpdateBack(back);
+ hash_.UpdateBack(back);
+ }
+ inline void UpdateBack(HashT back, HashT front) {
+ child_hash_.UpdateBack(back, front);
+ hash_.UpdateBack(back, front);
+ }
+
+ inline HashTuple<size, HashT> GetHash() const {
+ return HashTuple<size, HashT>(child_hash_.GetHash(), hash_.GetHash());
+ }
+ inline HashTuple<size, HashT> GetHash(HashT seed) const {
+ return HashTuple<size, HashT>(child_hash_.GetHash(seed), hash_.GetHash(seed));
+ }
+
+ inline size_t GetHashInt() const {
+ return child_hash_.GetHashInt() ^ hash_.GetHash();
+ }
+
+
+ inline bool operator ==(const MultiPolynomialHash<size, HashT> &other) const {
+ return child_hash_ == other.child_hash_ && hash_ == other.hash_;
+ }
+ inline bool operator !=(const MultiPolynomialHash<size, HashT> &other) const {
+ return !(operator==(other));
+ }
+};
+
+template <class HashT>
+class MultiPolynomialHash<1, HashT> {
+ PolynomialHash<utils::PrimeHolder<0>::val, HashT> hash_;
+
+ public:
+ typedef HashTuple<1, HashT> DataType;
+
+ MultiPolynomialHash(unsigned polydeg) : hash_(polydeg) {
+ }
+
+ MultiPolynomialHash(unsigned polydeg, const MultiPolynomialHash<1, HashT> &other)
+ : hash_(polydeg, other.hash_) {
+ }
+ inline void CopyFrom(unsigned polydeg, const MultiPolynomialHash<1, HashT> &other) {
+ hash_.CopyFrom(polydeg, other.hash_);
+ };
+
+ inline void Update(HashT front) {
+ hash_.Update(front);
+ }
+ inline void Update(HashT front, HashT back) {
+ hash_.Update(front, back);
+ }
+ inline void UpdateBack(HashT back) {
+ hash_.UpdateBack(back);
+ }
+ inline void UpdateBack(HashT back, HashT front) {
+ hash_.UpdateBack(back, front);
+ }
+
+ inline HashTuple<1, HashT> GetHash() const {
+ return HashTuple<1, HashT>(hash_.GetHash());
+ }
+
+ inline HashTuple<1, HashT> GetHash(HashT seed) const {
+ return HashTuple<1, HashT>(hash_.GetHash(seed));
+ }
+
+ inline size_t GetHashInt() const {
+ return hash_.GetHash();
+ }
+
+
+ inline bool operator ==(const MultiPolynomialHash<1, HashT> &other) const {
+ return hash_ == other.hash_;
+ }
+ inline bool operator !=(const MultiPolynomialHash<1, HashT> &other) const {
+ return !(operator==(other));
+ }
+};
+
+/*
+template <class HashT>
+class MultiPolynomialHash<2, HashT> {
+ PolynomialHash<239, HashT> h1;
+ PolynomialHash<269, HashT> h2;
+
+ public:
+ typedef std::pair<HashT, HashT> DataType;
+
+ MultiPolynomialHash(unsigned polydeg) : h1(polydeg), h2(polydeg) {
+ }
+
+ MultiPolynomialHash(unsigned polydeg, const MultiPolynomialHash<2, HashT> &other)
+ : h1(polydeg, other.h1),
+ h2(polydeg, other.h2) {
+ }
+ inline void CopyFrom(unsigned polydeg, const MultiPolynomialHash<2, HashT> &other) {
+ h1.CopyFrom(polydeg, other.h1);
+ h2.CopyFrom(polydeg, other.h2);
+ };
+
+ inline void Update(HashT front) {
+ h1.Update(front);
+ h2.Update(front);
+ }
+ inline void Update(HashT front, HashT back) {
+ h1.Update(front, back);
+ h2.Update(front, back);
+ }
+ inline void UpdateBack(HashT back) {
+ h1.UpdateBack(back);
+ h2.UpdateBack(back);
+ }
+ inline void UpdateBack(HashT back, HashT front) {
+ h1.UpdateBack(back, front);
+ h2.UpdateBack(back, front);
+ }
+
+ inline std::pair<HashT, HashT> GetHash() const {
+ return std::pair<HashT, HashT>(h1.GetHash(), h2.GetHash());
+ }
+
+ inline size_t GetHashInt() const {
+ return h1.GetHash() ^ h2.GetHash();
+ }
+
+
+ inline bool operator ==(const MultiPolynomialHash<2, HashT> &other) const {
+ return GetHash() == other.GetHash();
+ }
+ inline bool operator !=(const MultiPolynomialHash<2, HashT> &other) const {
+ return !(operator==(other));
+ }
+};
+
+template <class HashT = uint64_t>
+class DoublePolynomialHash : public MultiPolynomialHash<2, HashT> {
+ typedef MultiPolynomialHash<2, HashT> base;
+
+ public:
+ DoublePolynomialHash(unsigned polydeg) : base(polydeg) {
+ }
+ DoublePolynomialHash(unsigned polydeg, const DoublePolynomialHash<HashT> &other)
+ : base(polydeg, other) {
+ }
+
+};
+*/
+
+}
+
+namespace std {
+
+template<size_t size, class HashT>
+std::ostream& operator<<(std::ostream& os, const cap::MultiPolynomialHash<size, HashT> &hash) {
+ os << hash.GetHashInt();
+ return os;
+}
+
+}
diff --git a/src/projects/cap/repeat_masking.hpp b/src/projects/cap/repeat_masking.hpp
new file mode 100644
index 0000000..5928bb8
--- /dev/null
+++ b/src/projects/cap/repeat_masking.hpp
@@ -0,0 +1,544 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "data_structures/sequence/nucl.hpp"
+#include "io/reads_io/modifying_reader_wrapper.hpp"
+#include "utils/adt/bag.hpp"
+#include <boost/random/mersenne_twister.hpp>
+#include <boost/random/uniform_01.hpp>
+#include <boost/random/uniform_int.hpp>
+#include <boost/random/variate_generator.hpp>
+//#include "indices/edge_index_builders.hpp"
+
+namespace cap {
+
+struct Count {
+ size_t count;
+ Count() : count(0) {}
+ Count conjugate(size_t /*k*/) {
+ return *this;
+ }
+};
+
+template <class Builder>
+class RepeatSearchingIndexBuilder : public Builder {
+ typedef Builder base;
+ public:
+ typedef typename Builder::IndexT IndexT;
+ typedef typename IndexT::KMer Kmer;
+ typedef typename IndexT::KMerIdx KmerIdx;
+ typedef typename IndexT::KeyWithHash KWH;
+
+ private:
+ template<class ReadStream>
+ size_t FillCoverageFromStream(ReadStream &stream,
+ IndexT &index) const {
+ unsigned k = index.k();
+ while (!stream.eof()) {
+ typename ReadStream::ReadT r;
+ stream >> r;
+
+ const Sequence &seq = r.sequence();
+ if (seq.size() < k)
+ continue;
+
+ KWH kwh = index.ConstructKWH(seq.start<Kmer>(k));
+ kwh >>= 'A';
+ for (size_t j = k - 1; j < seq.size(); ++j) {
+ kwh<<= seq[j];
+ VERIFY(index.valid(kwh));
+ if (index.get_value(kwh).count != -1u) {
+ index.get_raw_value_reference(kwh).count += 1;
+ }
+ }
+ }
+
+ return 0;
+ }
+
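+ //k-mers seen more than once are marked with the sentinel count -1u ("repetitive");
+ //unique k-mers are reset to 0 before the next stream is processed.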
+ void ProcessCounts(IndexT &index) const {
+ for (auto it = index.value_begin(); it != index.value_end(); ++it) {
+ if (it->count > 1) {
+ it->count = -1u;
+ } else {
+ it->count = 0;
+ }
+ }
+ }
+
+ template<class Streams>
+ size_t FindRepeats(IndexT &index, Streams &streams) const {
+ INFO("Collecting k-mer coverage information from reads, this takes a while.");
+ unsigned nthreads = (unsigned) streams.size();
+ streams.reset();
+ for (size_t i = 0; i < nthreads; ++i) {
+ FillCoverageFromStream(streams[i], index);
+ ProcessCounts(index);
+ }
+
+ return 0;
+ }
+
+ public:
+
+ template<class Streams>
+ size_t BuildIndexFromStream(IndexT &index,
+ Streams &streams,
+ io::SingleStream* contigs_stream = 0) const {
+ base::BuildIndexFromStream(index, streams, contigs_stream);
+
+ return FindRepeats(index, streams);
+ }
+
+};
+
+template<class Index>
+struct CountIndexHelper {
+ typedef Index IndexT;
+ typedef typename IndexT::KMer Kmer;
+ typedef typename IndexT::KMerIdx KMerIdx;
+ typedef typename IndexT::traits_t traits_t;
+// typedef typename IndexT::IdType IdType;
+ typedef DeBruijnStreamKMerIndexBuilder<Kmer, IndexT> DeBruijnStreamKMerIndexBuilderT;
+ typedef RepeatSearchingIndexBuilder<DeBruijnStreamKMerIndexBuilderT> RepeatSearchingIndexBuilderT;
+};
+
+class RandNucl {
+ unsigned seed_;
+ boost::mt19937 rand_engine_;
+ boost::uniform_int<> rand_dist_;
+ boost::variate_generator<boost::mt19937&, boost::uniform_int<>> rand_nucl_;
+
+public:
+
+ RandNucl(unsigned seed) :
+ seed_(seed),
+ rand_engine_(seed_),
+ rand_dist_(0, 3),
+ rand_nucl_(rand_engine_, rand_dist_) {
+
+ }
+
+ char operator()() {
+ return nucl((char) rand_nucl_());
+ }
+};
+
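+//Replaces sequence positions covered by repetitive k-mers (found by FindRepeats)
+//with random nucleotides.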
+class RepeatMasker : public io::SequenceModifier {
+private:
+ typedef runtime_k::RtSeq Kmer;
+ typedef KeyIteratingMap<Kmer, Count, kmer_index_traits<Kmer>, SimpleStoring> KmerCountIndex;
+ typedef typename KmerCountIndex::KeyWithHash KeyWithHash;
+ typedef KmerCountIndex::KMerIdx KmerIdx;
+
+ size_t k_;
+
+ RandNucl& rand_nucl_;
+
+ KmerCountIndex index_;
+ //todo maybe remove mutable? will need removing const from Modify
+
+
+ bool IsRepeat(const KeyWithHash& kwh) const {
+ return index_.get_value(kwh).count == -1u;
+ }
+
+ template<class S>
+ const vector<Range> RepeatIntervals(const S& s) const {
+ vector<Range> answer;
+ answer.push_back(Range(0, 0));
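+ //sentinel empty range simplifies the merge check below; overlapping repeat windows are merged into one interval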
+ KeyWithHash kwh = index_.ConstructKWH(Kmer(k_, s) >> 'A');
+ for (size_t i = k_ - 1; i < s.size(); ++i) {
+ kwh <<= s[i];
+ if (IsRepeat(kwh)) {
+ if (i <= answer.back().end_pos) {
+ answer.back().end_pos = i + k_;
+ } else {
+ answer.push_back(Range(i, i + k_));
+ }
+ }
+ }
+ return answer;
+ }
+
+ void MaskRepeat(const Range& repeat, std::string& s) const {
+ TRACE("Masking repeat of size " << repeat.size() << " " << repeat);
+ TRACE("Old sequence " << s.substr(repeat.start_pos, repeat.size()));
+ for (size_t i = repeat.start_pos; i < repeat.end_pos; ++i) {
+ s[i] = rand_nucl_();
+ }
+ TRACE("New sequence " << s.substr(repeat.start_pos, repeat.size()));
+ }
+
+ void MaskRepeats(const vector<Range>& rep_int, std::string& s) const {
+ TRACE("Masking " << rep_int.size() << " repeat ranges in sequence of length " << s.size());
+ for (Range r : rep_int) {
+ MaskRepeat(r, s);
+ }
+ }
+
+public:
+ RepeatMasker(size_t k, RandNucl& rand_nucl, const std::string& workdir) :
+ k_(k),
+ rand_nucl_(rand_nucl),
+ index_(k_, workdir) {
+ }
+
+ template<class Streams>
+ size_t FindRepeats(Streams streams) {
+ INFO("Looking for repetitive " << k_ << "-mers");
+ CountIndexHelper<KmerCountIndex>::RepeatSearchingIndexBuilderT().BuildIndexFromStream(index_, streams);
+ size_t rep_kmer_cnt = 0;
+ for (auto it = index_.value_cbegin(); it != index_.value_cend(); ++it) {
+ if (it->count == -1u) {
+ rep_kmer_cnt++;
+ } else {
+ VERIFY(it->count == 0);
+ }
+ }
+ INFO("Found " << rep_kmer_cnt << " repetitive " << k_ << "-mers");
+ return rep_kmer_cnt;
+ }
+
+ /*virtual*/Sequence Modify(const Sequence& s) {
+ if (s.size() < k_)
+ return s;
+ string str = s.str();
+ MaskRepeats(RepeatIntervals(s), str);
+ return Sequence(str);
+ }
+
+private:
+ DECL_LOGGER("RepeatMasker")
+};
+
+template<class Stream1, class Stream2>
+void Transfer(Stream1& s1, Stream2& s2) {
+ typename Stream1::ReadT r;
+ while (!s1.eof()) {
+ s1 >> r;
+ s2 << r;
+ }
+}
+
+inline ContigStreams OpenStreams(const string& root,
+ const vector<string>& filenames, bool add_rc) {
+ ContigStreams streams;
+ for (auto filename : filenames) {
+ DEBUG("Opening stream from " << root << filename);
+ ContigStreamPtr reader = make_shared<io::FileReadStream>(root + filename);
+ if (add_rc)
+ reader = io::RCWrap<Contig>(reader);
+ streams.push_back(reader);
+ }
+ return streams;
+}
+
+inline void SaveStreams(ContigStreams streams, const vector<string>& suffixes,
+ const string& out_root) {
+ make_dir(out_root);
+
+ streams.reset();
+ for (size_t i = 0; i < streams.size(); ++i) {
+ VERIFY(!suffixes[i].empty());
+ string output_filename = out_root + suffixes[i];
+ io::osequencestream ostream(output_filename);
+ Transfer(streams[i], ostream);
+ }
+}
+
+inline void ModifyAndSave(shared_ptr<io::SequenceModifier> modifier, ContigStreams streams, const vector<string>& suffixes,
+ const string& out_root) {
+ ContigStreams modified;
+ for (size_t i = 0; i < streams.size(); ++i) {
+ modified.push_back(make_shared<io::ModifyingWrapper<Contig>>(streams.ptr_at(i), modifier));
+ }
+ SaveStreams(modified, suffixes, out_root);
+}
+
+inline bool MaskRepeatsIteration(size_t k, const string& input_dir, const vector<string>& suffixes, const string& output_dir, RandNucl& rand_nucl) {
+ shared_ptr<RepeatMasker> masker_ptr = make_shared<RepeatMasker>(k, rand_nucl, "tmp");
+ INFO("Opening streams in " << input_dir);
+ bool repeats_found = masker_ptr->FindRepeats(OpenStreams(input_dir, suffixes, true));
+ if (repeats_found) {
+ INFO("Repeats found");
+ INFO("Modifying and saving streams to " << output_dir);
+ ModifyAndSave(masker_ptr, OpenStreams(input_dir, suffixes, false), suffixes, output_dir);
+ } else {
+ INFO("No repeats found");
+ }
+ return !repeats_found;
+}
+
+//inline bool MaskRepeats(const string& input_dir, const vector<string>& suffixes, size_t max_iter_count, const string& work_dir) {
+// size_t iter = 0;
+// bool no_repeats = false;
+// while (iter <= max_iter_count) {
+// string out_dir = input_dir + ToString(iter) + "/";
+// make_dir(out_dir);
+// no_repeats = MaskRepeatsIteration(input_dir, suffixes, out_dir);
+// if (no_repeats) {
+// break;
+// }
+// ++iter;
+// }
+// if (no_repeats) {
+// string out_dir = input_dir + "masked/";
+// make_dir(out_dir);
+// ModifyAndSave(make_shared<io::TrivialModifier>(),
+// OpenStreams(input_dir + "/" + ToString(iter) + "/", suffixes,
+// out_dir));
+// } else {
+// WARN("Failed to mask repeats in " << max_iter_count << " iterations");
+// }
+// return no_repeats;
+//}
+
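+//Iteratively masks repetitive k-mers with random nucleotides; each iteration re-detects repeats
+//in the previous iteration's output until none remain or max_iter_count is exceeded.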
+inline bool MaskRepeats(size_t k, ContigStreams input_streams, const vector<string>& suffixes, size_t max_iter_count, const string& work_dir) {
+ size_t iter = 0;
+ RandNucl rand_nucl(239);
+ bool no_repeats = false;
+ string input_dir = work_dir + "init/";
+ make_dir(input_dir);
+ SaveStreams(input_streams, suffixes, input_dir);
+ while (iter <= max_iter_count) {
+ INFO("------------------------");
+ INFO("Iteration " << iter);
+ string out_dir = work_dir + ToString(iter) + "/";
+ make_dir(out_dir);
+ no_repeats = MaskRepeatsIteration(k, input_dir, suffixes, out_dir, rand_nucl);
+ if (no_repeats) {
+ INFO("No repeats found");
+ break;
+ }
+ input_dir = out_dir;
+ ++iter;
+ }
+ if (no_repeats) {
+ INFO("Repeats succesfully masked in " << iter << " iterations");
+ string out_dir = work_dir + "masked/";
+ make_dir(out_dir);
+ SaveStreams(OpenStreams(input_dir, suffixes, false),
+ suffixes, out_dir);
+ } else {
+ WARN("Failed to mask repeats in " << max_iter_count << " iterations");
+ }
+ return no_repeats;
+}
+
+//void FillBagForStrand(const Sequence& strand,
+// map<Seq<k>>& bag) {
+// if (strand.size() < k)
+// return;
+// Seq<k> kmer(strand);
+// kmer >> 'A';
+// for (size_t i = k - 1; i < strand.size(); ++i) {
+// kmer = kmer << strand[i];
+// bag[kmer] += 1;
+// }
+//}
+//
+//template<size_t k>
+//void FillRepeats(const Sequence& genome,
+// set<Seq<k>, typename Seq<k>::less2>& repeats) {
+// map<Seq<k>, size_t, typename Seq<k>::less2> bag;
+//
+// FillBagForStrand(genome, bag);
+// FillBagForStrand(!genome, bag);
+//
+// for (auto it = bag.begin(); it != bag.end(); ++it) {
+// if (it->second > 1)
+// repeats.insert(it->first);
+// }
+//}
+//
+//template<size_t k>
+//void FillRepeats(const vector<Sequence>& assembly,
+// set<Seq<k>, typename Seq<k>::less2>& repeats) {
+// map<Seq<k>, size_t, typename Seq<k>::less2> bag;
+//
+// for (auto it = assembly.begin(); it != assembly.end(); ++it) {
+// FillBagForStrand(*it, bag);
+// FillBagForStrand(!(*it), bag);
+// }
+//
+// for (auto it = bag.begin(); it != bag.end(); ++it) {
+// if (it->second > 1)
+// repeats.insert(it->first);
+// }
+//}
+
+//template<size_t k>
+//class RepeatCleaner {
+// typedef Seq<k> Kmer;
+// typedef set<Kmer, typename Kmer::less2> Repeats;
+//
+// void MarkPositions(size_t start, size_t end, vector<bool>& answer) {
+// for (size_t i = start; i < end; ++i) {
+// answer[i] = true;
+// }
+// }
+//
+// void MarkRepeatNucls(const Sequence& s, const Repeats& repeats, vector<bool>& answer) {
+//// vector<bool> answer(s.size(), false);
+// Kmer kmer(s);
+// kmer = kmer >> 'A';
+// for (size_t i = k - 1 ; i < s.size(); ++i) {
+// kmer = kmer << s[i];
+// if (repeats.count(kmer) > 0) {
+// MarkPositions(i - k + 1, i + 1, answer);
+// }
+// }
+// }
+//
+// void MarkShortIslands(, size_t threshold = k) {
+//
+// }
+//
+//public:
+//
+//};
+
+//template<size_t k>
+//Sequence ClearGenome(const Sequence& genome,
+// const set<Seq<k>, typename Seq<k>::less2>& repeats) {
+// INFO("Clearing genome");
+// if (genome.size() < k)
+// return genome;
+//
+// string answer;
+// for (size_t i = 0; i < k - 1; ++i) {
+// answer += nucl(genome[i]);
+// }
+// //intervals of kmers that should be resolved afterwards
+// vector<Range> repeat_intervals;
+// Seq<k> kmer(genome);
+// size_t curr_pos = 0;
+// //curr_pos + k - next nucl pos
+// bool changed = false;
+// while (curr_pos + k != genome.size()) {
+// size_t int_start = curr_pos;
+// while (repeats.count(kmer) > 0 && curr_pos + k < genome.size()) {
+// kmer = kmer << genome[curr_pos + k];
+// curr_pos++;
+// changed = true;
+// }
+// if (int_start != curr_pos)
+// repeat_intervals.push_back(Range(int_start, curr_pos));
+//
+// if (curr_pos + k == genome.size())
+// break;
+//
+// while (repeats.count(kmer) == 0 && curr_pos + k < genome.size()) {
+// answer += nucl(kmer[k - 1]);
+// kmer = kmer << genome[curr_pos + k];
+// curr_pos++;
+// }
+// }
+// if (changed) {
+// INFO("Genome was changed during cleaning");
+// } else {
+// INFO("Genome wasn't changed during cleaning");
+// }
+// return Sequence(answer);
+//}
+//
+//template<size_t k>
+//Sequence ClearGenome(const Sequence& genome) {
+// INFO("Clearing genome of repeats");
+//
+// set<Seq<k>, typename Seq<k>::less2> repeats;
+// INFO("Filling set of repeats");
+// FillRepeats<k>(genome, repeats);
+// INFO("Clearing genome");
+// return ClearGenome<k>(genome, repeats);
+//}
+//
+////todo bad strategy for assembly cleaning
+//template<size_t k>
+//pair<Sequence, vector<Sequence>> Clear(const Sequence& genome,
+// const vector<Sequence>& assembly) {
+// INFO("Clearing genome of repeats");
+//
+// set<Seq<k>, typename Seq<k>::less2> repeats;
+// INFO("Filling set of repeats");
+// FillRepeats<k>(genome, repeats);
+//// for (auto it = assembly.begin(); it != assembly.end(); ++it) {
+//// FillRepeats(*it, repeats);
+//// }
+// INFO("Clearing genome");
+// Sequence new_genome = ClearGenome<k>(genome, repeats);
+// INFO("Clearing assembly");
+// vector<Sequence> new_assembly;
+// for (auto it = assembly.begin(); it != assembly.end(); ++it) {
+// new_assembly.push_back(ClearGenome<k>(*it, repeats));
+// }
+// return make_pair(new_genome, new_assembly);
+//}
+
+//template<size_t k>
+//void Clear(const string& genome_in, const string& genome_out
+// , const string& assembly_in, const string& assembly_out) {
+// INFO("Clearing genome of repeats");
+// pair<Sequence, vector<Sequence>> cleared
+// = Clear<k>(ReadGenome(genome_in), ReadContigs(assembly_in));
+// io::ofastastream genome_out
+//
+//}
+
+//template<size_t k>
+//void Clear(const string& in, const string& out) {
+// io::Reader in_stream(in);
+// set<Seq<k>, typename Seq<k>::less2> repeats;
+// FillRepeats<k>(AllSequences(in_stream), repeats);
+// in_stream.reset();
+// io::osequencestream out_stream(out);
+// io::SingleRead contig;
+// while (!in_stream.eof()) {
+// in_stream >> contig;
+// Sequence cleared = ClearGenome<k>(contig.sequence(), repeats);
+// out_stream << io::SingleRead(contig.name(), cleared.str());
+// }
+//}
+//
+//template<size_t k>
+//pair<Sequence, Sequence> ClearGenomes(const pair<Sequence, Sequence>& genomes) {
+// INFO("Clearing genomes from repeats");
+//
+// set<Seq<k>, typename Seq<k>::less2> repeats;
+// INFO("Filling set of repeats");
+// FillRepeats<k>(genomes.first, repeats);
+// FillRepeats<k>(genomes.second, repeats);
+// INFO("Clearing genomes");
+// return make_pair(ClearGenome<k>(genomes.first, repeats),
+// ClearGenome<k>(genomes.second, repeats));
+//}
+//
+//template<size_t k>
+//pair<Sequence, Sequence> TotallyClearGenomes(
+// const pair<Sequence, Sequence>& genomes) {
+// static const size_t iter_count = 1;
+// pair<Sequence, Sequence> tmp = genomes;
+// for (size_t i = 0; i < iter_count; ++i) {
+// INFO("Cleaning iteration " << i);
+// tmp = ClearGenomes<k>(tmp);
+// }
+// return tmp;
+//}
+//
+//template<size_t k>
+//bool CheckNoRepeats(const Sequence& genome) {
+// set<Seq<k>, typename Seq<k>::less2> repeats;
+// FillRepeats<k>(genome, repeats);
+// return repeats.empty();
+//}
+
+
+
+}
diff --git a/src/projects/cap/serialization.hpp b/src/projects/cap/serialization.hpp
new file mode 100644
index 0000000..7fb38f2
--- /dev/null
+++ b/src/projects/cap/serialization.hpp
@@ -0,0 +1,151 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include <iostream>
+#include <sstream>  // std::stringstream is used by Serialize()/Deserialize()
+#include <unordered_map>
+#include <cstring>
+#include <string>
+#include <vector>
+
+#include "data_structures/sequence/sequence.hpp"
+
+namespace cap {
+
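+// Serializer and Deserializer implement a simple line-oriented key-value
+// format: WriteLine() emits "key<0x01>value<0x01>\n" where 0x01 is kDelimiter;
+// vectors are written as their size followed by the elements and pairs as
+// first then second, each token terminated by the delimiter.
+// Illustrative usage (stream names are hypothetical):
+//   Serializer s(out_fstream);  s.WriteLine("k", size_t(55));
+//   Deserializer d(in_fstream); d.ReadStream(); size_t k; d.ReadValue("k", k);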
+class Serializer {
+ public:
+ Serializer(std::ostream &os) : os_(os) {
+ }
+
+ template <class T>
+ void WriteLine(const std::string &key, const T &value) {
+ os_ << key << kDelimiter;
+ SerializeToStream(value, os_);
+ os_ << std::endl;
+ }
+
+ template <class T>
+ std::string Serialize(const T &value) const {
+ std::stringstream ss;
+ SerializeToStream(value, ss);
+ return ss.str();
+ }
+
+ private:
+ static const char kDelimiter = char(1);
+
+ template <class T>
+ void SerializeToStream(const T &value, std::ostream &out) const {
+ out << value;
+ out << kDelimiter;
+ }
+ template <class T>
+ void SerializeToStream(const std::vector<T> &value, std::ostream &out) const;
+ template<class T1, class T2>
+ void SerializeToStream (const std::pair<T1, T2> &value, std::ostream &out) const;
+
+ std::ostream &os_;
+};
+
+template<class T>
+void Serializer::SerializeToStream(const std::vector<T> &value, std::ostream &out) const {
+ //out << value.size() << kDelimiter;
+ SerializeToStream(value.size(), out);
+ for (const auto &x : value) {
+ SerializeToStream(x, out);
+ }
+}
+
+template<class T1, class T2>
+void Serializer::SerializeToStream (const std::pair<T1, T2> &value, std::ostream &out) const {
+ SerializeToStream(value.first, out);
+ SerializeToStream(value.second, out);
+}
+
+class Deserializer {
+ public:
+ Deserializer(std::istream &is) : is_(is) {
+ }
+
+ void ReadStream() {
+ while (!is_.eof()) {
+ std::string key;
+ std::string value;
+
+ std::getline(is_, key, kDelimiter);
+ std::getline(is_, value);
+ read_map_[key] = value;
+
+ }
+ }
+
+ template <class T>
+ void ReadValue(const std::string &key, T &value) const {
+ if (read_map_.count(key) == 0)
+ return;
+ Deserialize(read_map_.at(key), value);
+ }
+
+ template <class T>
+ void Deserialize(const std::string &s, T &value) const {
+ std::stringstream ss(s);
+ DeserializeFromStream(ss, value);
+ }
+
+ private:
+ static const char kDelimiter = char(1);
+
+ template <class T>
+ void DeserializeFromStream(std::istream &is, T &value) const {
+ is >> value;
+ is.ignore(1, kDelimiter);
+ }
+ template <class T>
+ void DeserializeFromStream(std::istream &is, std::vector<T> &value) const;
+ template<class T1, class T2>
+ void DeserializeFromStream (std::istream &is,
+ std::pair<T1, T2> &value) const;
+ void DeserializeFromStream(std::istream &is, Sequence &s) const;
+
+ std::istream &is_;
+ std::unordered_map<std::string, std::string> read_map_;
+};
+
+// The explicit specialization below is defined in a header, so it is declared
+// inline to avoid multiple-definition errors when included from several
+// translation units.
+template <>
+inline void Deserializer::DeserializeFromStream<std::string>(std::istream &is,
+                                                             std::string &value) const {
+ std::getline(is, value, kDelimiter);
+}
+
+template <class T>
+void Deserializer::DeserializeFromStream(std::istream &is,
+ std::vector<T> &value) const {
+ size_t size = 0;
+ DeserializeFromStream(is, size);
+ value.resize(size);
+
+ for (size_t i = 0; i < size; ++i) {
+ DeserializeFromStream(is, value[i]);
+ }
+}
+
+template<class T1, class T2>
+void Deserializer::DeserializeFromStream (std::istream &is,
+ std::pair<T1, T2> &value) const {
+ DeserializeFromStream(is, value.first);
+ DeserializeFromStream(is, value.second);
+}
+
+// Non-template member function defined in a header: inline for the same
+// ODR reason as the specialization above.
+inline void Deserializer::DeserializeFromStream(std::istream &is,
+                                                Sequence &s) const {
+ std::string str;
+ DeserializeFromStream(is, str);
+ s = Sequence(str.c_str());
+}
+
+}
diff --git a/src/projects/cap/simple_indel_finder.hpp b/src/projects/cap/simple_indel_finder.hpp
new file mode 100644
index 0000000..77cf46f
--- /dev/null
+++ b/src/projects/cap/simple_indel_finder.hpp
@@ -0,0 +1,382 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include <iostream>
+#include <unordered_map>
+#include <vector>
+#include "coloring.hpp"
+#include "assembly_graph/graph_support/graph_processing_algorithm.hpp"
+#include "path_projector.hpp"
+#include "graph_traversal_constraints.hpp"
+
+namespace cap {
+
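+// SimpleIndelFinder examines every vertex with 2..kOutgoingEdgesNumberThreshold
+// outgoing edges, runs a shallow DFS (depth <= kDfsDepthThreshold) along each
+// branch and looks for the closest vertex reached by at least two branches.
+// The alternative paths between the fork and that merge point are classified
+// as SNPs or indels by comparing their lengths (AnalyseThreadLengths) and,
+// when mask_indels is set, collapsed via PathProjector.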
+template <class gp_t>
+class SimpleIndelFinder {
+ typedef typename gp_t::graph_t Graph;
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+ typedef std::vector<EdgeId> Path;
+ typedef uint64_t u64int;
+
+ gp_t &gp_;
+ Graph &g_;
+ ColorHandler<Graph> &coloring_;
+ CoordinatesHandler<Graph> &coordinates_handler_;
+
+ GraphTraversalConstraints<Graph> &graph_traversal_constraints_;
+
+ ostream &output_stream_;
+
+ bool mask_indels_;
+ PathProjector<Graph> path_projector_;
+
+ size_t colors_number_;
+ size_t coloring_version_;
+
+ unordered_map<VertexId, u64int> processor_coloring_;
+ unordered_map<VertexId, size_t> vertices_distances_;
+
+ VertexId restricted_vertex_;
+
+ bool found_merge_point_;
+ VertexId best_vertex_;
+ size_t best_distance_to_vertex_;
+ bool need_reflow_;
+
+ vector<Path> alternative_paths_;
+ size_t snps_;
+ size_t indels_;
+ size_t unknown_snp_;
+ size_t unknown_indel_;
+
+ // Maximum number of outgoing edges we are interested in
+ // If more, we do not consider this case at all
+  const static size_t kOutgoingEdgesNumberThreshold = 4;
+ const static size_t kDfsDepthThreshold = 4;
+ const static size_t kProcessorColorShift = 32;
+ const static size_t kProcessorColorMask = (1ll << kProcessorColorShift) - 1;
+
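+  // processor_coloring_ packs two values into one u64int per vertex: the low
+  // kProcessorColorShift bits store the DFS round (coloring_version_) that
+  // last touched the vertex, the high bits store a bitmask of the outgoing
+  // branches that reached it. Comparing the stored round with the current one
+  // lets each round start from a "clean" state without clearing the map.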
+ inline void OrVertexColor(VertexId vertex, size_t color_set) {
+    u64int &current_val = processor_coloring_[vertex];
+ if ((current_val & kProcessorColorMask) < coloring_version_) {
+ // yes, it clears color bits
+ current_val = coloring_version_;
+ }
+ current_val |= u64int(color_set) << kProcessorColorShift;
+
+ //processor_coloring_[vertex] = current_val;
+ }
+
+ inline size_t GetVertexColor(VertexId vertex) {
+ const u64int current_val = processor_coloring_[vertex];
+ if ((current_val & kProcessorColorMask) < coloring_version_) {
+ return 0;
+ }
+ return size_t(current_val >> kProcessorColorShift);
+ }
+
+
+ inline TColorSet GetIncomingColoring(const VertexId vertex) const {
+ TColorSet incoming_coloring;
+ vector<EdgeId> incoming_edges = g_.IncomingEdges(vertex);
+ for (auto it = incoming_edges.begin(); it != incoming_edges.end(); ++it) {
+ incoming_coloring |= coloring_.Color(*it);
+ }
+ return incoming_coloring;
+ }
+
+ inline bool CheckColorSetExistence(const vector<EdgeId> &edges,
+ const TColorSet &coloring) const {
+ for (auto it = edges.begin(); it != edges.end(); ++it) {
+ if (coloring_.Color(*it) == coloring) {
+ return true;
+ }
+ }
+ return false;
+ }
+
+ size_t RelaxVertexDist(const VertexId v, const size_t dist) {
+ const auto it = vertices_distances_.find(v);
+ if (it == vertices_distances_.end()) {
+ return vertices_distances_[v] = dist;
+ } else {
+ if (it->second > dist) {
+ it->second = dist;
+ }
+ return it->second;
+ }
+ }
+
+ void ColoringDfs(const EdgeId edge,
+ const size_t color_mask, const size_t path_length,
+ const size_t depth) {
+ const VertexId vertex = g_.EdgeEnd(edge);
+ if (vertex == restricted_vertex_) {
+ return;
+ }
+ OrVertexColor(vertex, color_mask);
+ size_t cur_dist = RelaxVertexDist(vertex, path_length);
+
+ if (__builtin_popcountll(GetVertexColor(vertex)) >= 2) {
+ TRACE("checking vertex from edge " << g_.str(edge) << " of dist " << cur_dist);
+ if (!found_merge_point_) {
+ best_vertex_ = vertex;
+ best_distance_to_vertex_ = cur_dist;
+
+ found_merge_point_ = true;
+ TRACE("found merge point from edge " << g_.str(edge) << " length " << cur_dist);
+ } else if (cur_dist < best_distance_to_vertex_) {
+ best_vertex_ = vertex;
+ best_distance_to_vertex_ = cur_dist;
+ TRACE("found merge point from edge " << g_.str(edge) << " length " << cur_dist);
+ }
+ return;
+ }
+
+ if (depth >= kDfsDepthThreshold) {
+ return;
+ }
+ for (EdgeId e : g_.OutgoingEdges(vertex)) {
+ graph_traversal_constraints_.PushEdge(e);
+ if (graph_traversal_constraints_.PathIsCorrect())
+ ColoringDfs(e, color_mask,
+ path_length + g_.length(e), depth + 1);
+ graph_traversal_constraints_.PopEdge();
+ }
+ }
+
+ // returns if endpoint was found or not
+ bool GatheringDfs(const EdgeId edge,
+ /*const size_t color_mask_needed,*/ const size_t depth,
+ vector<EdgeId> &path_seq) {
+ VertexId vertex = g_.EdgeEnd(edge);
+ if (vertex == restricted_vertex_) {
+ return false;
+ }
+
+ path_seq.push_back(edge);
+
+ //if (GetVertexColor(vertex) == color_mask_needed) {
+ //if (__builtin_popcount(GetVertexColor(vertex)) >= 2) {
+ if (vertex == best_vertex_) {
+ /*
+ INFO("found final vertex " << g_.str(vertex));
+ if (coordinates_handler_.GetContiguousThreads(path_seq).size() !=
+ pos_array.size()) {
+ INFO("" << coordinates_handler_.GetContiguousThreads(path_seq).size() << " " <<
+ pos_array.size());
+ VERIFY(false);
+ }
+ */
+
+ alternative_paths_.push_back(path_seq);
+ path_seq.pop_back();
+ return true;
+ }
+
+ if (depth >= kDfsDepthThreshold) {
+ path_seq.pop_back();
+ return false;
+ }
+
+ for (EdgeId e : g_.OutgoingEdges(vertex)) {
+ graph_traversal_constraints_.PushEdge(e);
+ if (graph_traversal_constraints_.PathIsCorrect())
+ GatheringDfs(e,
+ /*color_mask_needed,*/ depth + 1, path_seq);
+ graph_traversal_constraints_.PopEdge();
+ }
+ path_seq.pop_back();
+ return false;
+ }
+
+ void PaintPath(const Path &path) {
+ for (const auto e : path)
+ coloring_.PaintEdge(e, TColorSet::SingleColor(colors_number_));
+ }
+
+ size_t GetPathLength(const Path &p) const {
+ size_t res = 0;
+ for (const auto &edge : p) {
+ res += g_.length(edge);
+ }
+ return res;
+ }
+
+ void CollapsePaths() {
+ if (alternative_paths_.size() == 0)
+ return;
+ bool has_short_enough = false;
+ for (const auto &path : alternative_paths_) {
+ if (GetPathLength(path) < 10 * g_.k()) {
+ //todo move threshold out of here
+ //TRACE("Too long path: " << GetPathLength(path));
+ has_short_enough = true;
+ break;
+ }
+ }
+ if (!has_short_enough) {
+ alternative_paths_.clear();
+ return;
+ }
+
+ bool success = path_projector_.CollapsePaths(alternative_paths_);
+ if (!success) {
+ TRACE("Could not collapse paths: " << alternative_paths_ << " ("
+ << alternative_paths_.size() << " paths)");
+ for (const auto &path : alternative_paths_) {
+ PaintPath(path);
+ }
+ }
+ alternative_paths_.clear();
+ }
+
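+  // Heuristic below: a branch produced by a single SNP has length exactly
+  // k + 1 (cf. AnalyseThreadLengths), so the estimate charges one SNP for the
+  // first k + 1 bases and one more per additional (at most) k bases, i.e.
+  // roughly 1 + ceil((branch_len - (k + 1)) / k); e.g. k = 55 and
+  // branch_len = 111 give 1 + ceil(55 / 55) = 2.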
+ size_t EstimateSNPNumber(const size_t branch_len) {
+ if (branch_len < 2) return 0;
+ return 1 + (branch_len - (g_.k() + 1) + g_.k() - 1) / g_.k();
+ }
+
+ void AnalyseThreadLengths() {
+ std::vector<size_t> thread_lengths;
+ for (const auto &path : alternative_paths_) {
+ thread_lengths.push_back(GetPathLength(path));
+ }
+ bool is_simple_snp = true;
+ bool is_simple_indel = false;
+ bool equal_lengths = true;
+
+ size_t prev = 0;
+ size_t min_branch = size_t(-1);
+ for (size_t len : thread_lengths) {
+ if (len != g_.k() + 1) {
+ is_simple_snp = false;
+ }
+ if (prev && len != prev) {
+ equal_lengths = false;
+ }
+ if (len == g_.k()) {
+ is_simple_indel = true;
+ }
+
+ prev = len;
+ min_branch = min(min_branch, len);
+ }
+
+ if (is_simple_snp)
+ snps_++;
+ else if (is_simple_indel)
+ indels_++;
+ else
+ if (equal_lengths) {
+ snps_++;
+ unknown_snp_ += EstimateSNPNumber(prev) - 1;
+ } else {
+ indels_++;
+ unknown_snp_ += EstimateSNPNumber(min_branch) - 1;
+ }
+ }
+
+ void CheckForIndelEvent(const VertexId starting_vertex) {
+ TRACE("New indel event");
+
+    // Check that this vertex is interesting
+ size_t outgoing_edges_number = g_.OutgoingEdgeCount(starting_vertex);
+    if (outgoing_edges_number <= 1 || outgoing_edges_number > kOutgoingEdgesNumberThreshold) {
+ // Nothing to do here
+ return;
+ }
+
+ /*
+ // Generate initial colorset
+ TColorSet initial_coloring = GetIncomingColoring(starting_vertex);
+
+ // Check that there exists split of colors
+ if (CheckColorSetExistence(outgoing_edges, initial_coloring)) {
+ // Nothing to do here
+ return;
+ }
+ */
+
+ // Dfs and try to resolve all splits
+ ++coloring_version_;
+ found_merge_point_ = false;
+ restricted_vertex_ = starting_vertex;
+ size_t branch_num = 0;
+ for (EdgeId e : g_.OutgoingEdges(starting_vertex)) {
+ graph_traversal_constraints_.PushEdge(e);
+ ColoringDfs(e, 1 << branch_num, g_.length(e), 0);
+ branch_num++;
+ graph_traversal_constraints_.PopEdge();
+ }
+
+ vertices_distances_.clear();
+
+ if (found_merge_point_) {
+ vector<EdgeId> edge_seq_vector;
+ for (EdgeId e : g_.OutgoingEdges(starting_vertex)) {
+ graph_traversal_constraints_.PushEdge(e);
+ GatheringDfs(e,
+ /*(1 << outgoing_edges_number) - 1,*/ 0, edge_seq_vector);
+ graph_traversal_constraints_.PopEdge();
+ }
+
+ AnalyseThreadLengths();
+
+ TRACE("Resolved split of color bunch");
+
+ if (mask_indels_) {
+ TRACE("Removing edges... (masking indels)");
+ CollapsePaths();
+ //DeleteEdges();
+ TRACE("Done");
+ }
+ }
+
+ }
+
+ public:
+ SimpleIndelFinder(gp_t &gp, ColorHandler<Graph> &coloring,
+ CoordinatesHandler<Graph> &coordinates_handler,
+ GraphTraversalConstraints<Graph> &graph_traversal_constraints,
+ ostream &output_stream, const bool mask_indels = false)
+ : gp_(gp),
+ g_(gp_.g),
+ coloring_(coloring),
+ coordinates_handler_(coordinates_handler),
+ graph_traversal_constraints_(graph_traversal_constraints),
+ output_stream_(output_stream),
+ mask_indels_(mask_indels),
+ path_projector_(g_, coordinates_handler_),
+ colors_number_(coloring.max_colors()),
+ coloring_version_(0),
+ processor_coloring_() {
+ }
+
+ void FindIndelEvents() {
+ indels_ = 0;
+ snps_ = 0;
+ unknown_snp_ = 0;
+ unknown_indel_ = 0;
+ INFO("Searching for In-Del events");
+ for (auto it = g_.SmartVertexBegin(); !it.IsEnd(); ++it) {
+ do {
+ need_reflow_ = false;
+ CheckForIndelEvent(*it);
+ } while (need_reflow_);
+ }
+ INFO("Found around " << snps_/2 << "+" << unknown_snp_/2 << "=" <<
+ (snps_+unknown_snp_)/2 << " SNPs and " << indels_/2 << "+" <<
+ unknown_indel_/2 << " indels");
+ }
+
+ private:
+ DECL_LOGGER("SimpleIndelFinder")
+ ;
+};
+
+}
diff --git a/src/projects/cap/simple_inversion_finder.hpp b/src/projects/cap/simple_inversion_finder.hpp
new file mode 100644
index 0000000..3088a0a
--- /dev/null
+++ b/src/projects/cap/simple_inversion_finder.hpp
@@ -0,0 +1,433 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include <iostream>
+#include <algorithm>
+#include <sstream>
+#include <string>
+#include <vector>
+#include "coloring.hpp"
+#include "coordinates_handler.hpp"
+#include "compare_standard.hpp"
+#include "comparison_utils.hpp"
+#include "algorithms/dijkstra/dijkstra_helper.hpp"
+
+namespace cap {
+
+template <class Graph>
+class ReliableTargetedBoundedDijkstra;
+template <class Graph>
+class GenomePathsFinder;
+
+/*
+ * SimpleInversionFinder searches for inversions that occurred in a set of
+ * genomes relative to the others (all_genomes minus inversed_genomes).
+ * Currently it works only for two genome sequences.
+ *
+ * The algorithm simply searches for "cycles" of length 4 with alternating
+ * colors of the following kind:
+ * v1 --red-> v3
+ * v2 --red-> v4
+ * v1 -blue-> v4
+ * v2 -blue-> v3
+ */
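+// Intuition (informal): the two breakpoints of an inverted segment pair the
+// four surrounding vertices {v1, v2} and {v3, v4} differently in the two
+// genomes, which produces exactly the alternating red/blue 4-cycle described
+// above; ProcessFoundCycle then tries to thread a genome path between the
+// breakpoints to estimate the length of the inverted region.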
+
+template <class gp_t>
+class SimpleInversionFinder {
+ public:
+ SimpleInversionFinder(gp_t &gp, ColorHandler<Graph> &coloring,
+ CoordinatesHandler<Graph> &coordinates_handler,
+ const std::string base_pic_file_name,
+ const bool mask_inversed = false)
+ : gp_(gp),
+ g_(gp_.g),
+ coloring_(coloring),
+ coordinates_handler_(coordinates_handler),
+ base_pic_file_name_(base_pic_file_name),
+ num_cycles_found_(0),
+ found_lengths_(),
+ mask_inversed_(mask_inversed) {
+ }
+
+ void FindInversionEvents() {
+ num_cycles_found_ = 0;
+
+ INFO("Searching for inversions");
+ for (auto it = g_.SmartVertexBegin(); !it.IsEnd(); ++it) {
+ CheckForCycle(*it);
+ }
+ INFO("Searching for inversions done. Cycles found: " << num_cycles_found_);
+
+ INFO("Found lengths:");
+ const std::vector<size_t> found = found_lengths();
+ for (auto it = found.begin(); it != found.end(); ++it) {
+ INFO("" << *it);
+ }
+ }
+
+ std::vector<size_t> found_lengths() {
+ std::sort(found_lengths_.begin(), found_lengths_.end());
+ return found_lengths_;
+ }
+
+ private:
+ typedef typename gp_t::graph_t Graph;
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+
+ typedef std::vector<EdgeId> EdgeList;
+ typedef std::vector<EdgeId> Path;
+ typedef std::vector<VertexId> VertexList;
+ typedef uint64_t u64int;
+
+
+ gp_t &gp_;
+ Graph &g_;
+ ColorHandler<Graph> &coloring_;
+ CoordinatesHandler<Graph> &coordinates_handler_;
+
+ //ostream &output_stream_;
+ std::string base_pic_file_name_;
+ size_t num_cycles_found_;
+ std::vector<size_t> found_lengths_;
+
+ bool mask_inversed_;
+
+ void CheckForCycle(const VertexId v1) {
+ // the search assumes colors 0 and 1 are used
+ const std::vector<EdgeList> &solo_color_v1_out = GetSoloColoredEdgeLists(
+ g_.OutgoingEdges(v1));
+
+ for (auto it4 = solo_color_v1_out[1].begin();
+ it4 != solo_color_v1_out[1].end(); ++it4) {
+ const VertexId v4 = g_.EdgeEnd(*it4);
+ if (v4 < v1) continue;
+
+ const EdgeList back_list = GetSoloColoredEdgeLists(
+ g_.IncomingEdges(v4))[0];
+
+ for (auto it2 = back_list.begin(); it2 != back_list.end(); ++it2) {
+ const VertexId v2 = g_.EdgeStart(*it2);
+ if (v2 == v1 || v2 < v1)
+ continue;
+
+ const EdgeList v4_to_v3_list = GetSoloColoredEdgeLists(
+ g_.OutgoingEdges(v2))[1];
+
+ VertexList v3_list = FindEqualVerticesOnEnds(
+ solo_color_v1_out[0], v4_to_v3_list);
+
+ while (v3_list.size() > 0 && v3_list.back() == v4) {
+ v3_list.pop_back();
+ }
+
+ if (v3_list.size() > 0) {
+ if (v3_list[0] < v1)
+ continue;
+
+ DEBUG("Found inversion! (" << v3_list.size() << " candidates)");
+
+ ProcessFoundCycle(VertexList({v1, v2, v3_list[0], v4}));
+
+          // TODO: probably return here so the same event is not reported again
+ }
+ }
+ }
+ }
+
+ void ProcessFoundCycle(const VertexList &cycle) {
+ stringstream v_list_str;
+ for (auto it = cycle.begin(); it != cycle.end(); ++it) {
+ v_list_str << g_.str(*it) << " ";
+ }
+ DEBUG("cycle found: " << v_list_str.str());
+
+ const std::string edge_pic_name = base_pic_file_name_ + "_" +
+ ToString(num_cycles_found_) + ".dot";
+ const std::string path_pic_name = base_pic_file_name_ + "_path_" +
+ ToString(num_cycles_found_) + ".dot";
+
+ /*
+ PrintColoredGraphAroundEdge(g_, coloring_, edge, gp_.edge_pos,
+ edge_pic_name);
+ */
+
+ ssize_t length = -1;
+ ssize_t l1 = FindAndPrintPath(cycle[0], g_.conjugate(cycle[1]), cycle[2], cycle[3]);
+ //int l2 = FindAndPrintPath(g_.conjugate(cycle[1]), cycle[0], cycle[2], cycle[3]);
+ ssize_t l2 = FindAndPrintPath(g_.conjugate(cycle[2]), cycle[3], g_.conjugate(cycle[0]), g_.conjugate(cycle[1]));
+ if (l1 > 0 && (length < 0 || length > l1))
+ length = l1;
+ if (l2 > 0 && (length < 0 || length > l2))
+ length = l2;
+ /*
+ FindAndPrintPath(cycle[2], g_.conjugate(cycle[3]), path_pic_name) ||
+ FindAndPrintPath(g_.conjugate(cycle[2]), cycle[3], path_pic_name) ||
+ FindAndPrintPath(cycle[0], g_.conjugate(cycle[1]), path_pic_name) ||
+ FindAndPrintPath(g_.conjugate(cycle[0]), cycle[1], path_pic_name);
+ */
+
+ if (length < 0) {
+ INFO("found cycle but not path!");
+ return;
+ }
+ num_cycles_found_++;
+ found_lengths_.push_back(length);
+ }
+
+ template<class EdgeContainer>
+ std::vector<EdgeList> GetSoloColoredEdgeLists(const EdgeContainer &all_edges) const {
+ std::vector<EdgeList> result(coloring_.max_colors());
+
+ for (auto it = all_edges.begin(); it != all_edges.end(); ++it) {
+ int color = GetEdgeSoloColor(*it);
+ if (IsSoloColor(color)) {
+ VERIFY(color < (int)result.size());
+ result[color].push_back(*it);
+ }
+ }
+
+ return result;
+ }
+
+ int GetEdgeSoloColor(const EdgeId edge) const {
+ const TColorSet &color = coloring_.Color(edge);
+ int result = 0;
+ for (unsigned i = 0; i < coloring_.max_colors(); ++i) {
+ if (!color[i])
+ continue;
+
+ if (result) {
+ result = -1;
+ } else {
+ result = i + 1;
+ }
+ }
+
+ return result - 1;
+ }
+
+ inline bool IsSoloColor(int solo_color) const {
+ return solo_color >= 0;
+ }
+
+ VertexList FindEqualVerticesOnEnds(
+ const EdgeList &l1, const EdgeList &l2) const {
+ const VertexList &vl1 = GetSortedEdgeEnds(l1);
+ const VertexList &vl2 = GetSortedEdgeEnds(l2);
+
+ VertexList result;
+
+ size_t i = 0, j = 0;
+ while (i < vl1.size() && j < vl2.size()) {
+ if (vl1[i] == vl2[j]) {
+ result.push_back(vl1[i]);
+ ++i, ++j;
+ } else if (vl1[i] < vl2[j]) {
+ ++i;
+ } else {
+ ++j;
+ }
+ }
+
+ return result;
+ }
+
+ VertexList GetSortedEdgeEnds(const EdgeList &l) const {
+ VertexList result;
+ result.reserve(l.size());
+
+ for (auto it = l.begin(); it != l.end(); ++it) {
+ result.push_back(g_.EdgeEnd(*it));
+ }
+
+ std::sort(result.begin(), result.end());
+ return result;
+ }
+
+ inline ssize_t FindAndPrintPath(const VertexId v1, const VertexId v2, const VertexId v3, const VertexId v4) const {
+ TRACE("Finding paths from " << g_.str(v1) << " to " << g_.str(v2));
+ const static size_t max_length = 800000;
+
+ std::vector<EdgeId> out_edges;
+ for (const auto e : g_.OutgoingEdges(v1)) {
+ if (g_.EdgeEnd(e) == v3 || g_.EdgeEnd(e) == v4) {
+ out_edges.push_back(e);
+ //INFO("out edge " << g_.str(e));
+ }
+ }
+ if (out_edges.size() == 0)
+ return -1;
+
+ VERIFY(out_edges.size() == 2);
+
+ GenomePathsFinder<Graph> dfs(g_, coordinates_handler_);
+ std::vector<Path> paths = dfs.FindGenomePaths(out_edges, v2, max_length);
+ if (paths.size() > 0) {
+ if (paths.size() == 1) {
+ //INFO("found only one path, strange");
+ }
+ return GetPathLength(paths[0]);
+ }
+ //INFO("could not find any path :(");
+
+ /*
+ ReliableTargetedBoundedDijkstra<Graph> dijkstra(g_, v2, max_length, 500);
+ dijkstra.run(v1);
+ if (dijkstra.DistanceCounted(v2)) {
+ TRACE("Finding path done; length=" << dijkstra.GetDistance(v2));
+ return dijkstra.GetDistance(v2);
+ }
+ */
+
+ return -1;
+ }
+
+ size_t GetPathLength(const Path &p) const {
+ size_t res = 0;
+ for (const auto &edge : p) {
+ res += g_.length(edge);
+ }
+ return res;
+ }
+
+ inline void PrintPath(const EdgeList &path, const std::string out_file) const {
+ const static size_t edge_length = 1;
+ const static size_t max_vertices = 100;
+ MappingPath<EdgeId> mpath = TrivialMappingPath(g_, path);
+ //Path<EdgeId> cpath(path, mpath.start_pos(), mpath.end_pos());
+
+ LengthIdGraphLabeler<Graph> basic_labeler(g_);
+ EdgePosGraphLabeler<Graph> pos_labeler(g_, gp_.edge_pos);
+ CompositeLabeler<Graph> labeler(basic_labeler, pos_labeler);
+
+ WriteComponentsAlongPath(g_, labeler, out_file, edge_length, max_vertices,
+ mpath, *ConstructBorderColorer(g_, coloring_));
+ }
+
+ size_t PathLength(const EdgeList &path) const {
+ size_t res = 0;
+ for (auto I = path.begin(); I != path.end(); ++I)
+ res += g_.length(*I);
+ return res;
+ }
+
+ DECL_LOGGER("SimpleInversionFinder")
+ ;
+};
+
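+// GenomePathsFinder performs a depth-first search that only follows edges
+// still supported by at least one genome thread (via FilterPosArray of the
+// CoordinatesHandler) and collects every such path from the given start edges
+// to the target vertex within (approximately) the given length bound.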
+template <class Graph>
+class GenomePathsFinder {
+ public:
+ typedef typename Graph::VertexId VertexId;
+ typedef typename Graph::EdgeId EdgeId;
+ typedef std::vector<EdgeId> Path;
+ typedef typename CoordinatesHandler<Graph>::PosArray PosArray;
+
+ GenomePathsFinder(const Graph &g, const CoordinatesHandler<Graph> &crd)
+ : g_(g),
+ crd_(crd) {
+ }
+
+ std::vector<Path> FindGenomePaths(const std::vector<EdgeId> &from,
+ const VertexId to, const size_t length_bound) {
+ std::vector<Path> result;
+ target_ = to;
+ answer_.clear();
+
+ for (const auto &e : from) {
+ Path path_seq;
+ const PosArray init_pos_array = crd_.GetEndPosArray(e);
+ RunGenomeDFS(e, init_pos_array, length_bound, path_seq);
+ }
+
+ return answer_;
+ }
+
+ inline std::vector<Path> FindGenomePaths(const VertexId from,
+ const VertexId to, const size_t length_bound) {
+ return FindGenomePaths(g_.OutgoingEdges(from), to, length_bound);
+ }
+
+ private:
+ void RunGenomeDFS(const EdgeId edge, const PosArray &cur_pos,
+ const long long remaining, Path &path_seq) {
+ path_seq.push_back(edge);
+
+ VertexId vertex = g_.EdgeEnd(edge);
+ //INFO("dfs: vertex " << g_.str(vertex) << " from edge " << g_.str(edge) << " thread " << int(cur_pos[0].first) << " - " << cur_pos[0].second);
+ if (vertex == target_) {
+ answer_.push_back(path_seq);
+ path_seq.pop_back();
+ return;
+ }
+ if (remaining < 0) {
+ path_seq.pop_back();
+ return;
+ }
+
+
+ for (EdgeId e : g_.OutgoingEdges(vertex)) {
+ const PosArray further_array = crd_.FilterPosArray(cur_pos, e);
+ if (further_array.size() == 0)
+ continue;
+
+ RunGenomeDFS(e, further_array, remaining - g_.length(e), path_seq);
+ }
+ path_seq.pop_back();
+ }
+
+ const Graph &g_;
+ const CoordinatesHandler<Graph> &crd_;
+ VertexId target_;
+ std::vector<Path> answer_;
+};
+
+/*
+template<class Graph>
+class ReliableTargetedBoundedDijkstra : public Dijkstra<Graph> {
+ typedef typename Graph::VertexId VertexId;
+ typedef typename Graph::EdgeId EdgeId;
+ typedef Dijkstra<Graph> base;
+
+ public:
+ ReliableTargetedBoundedDijkstra(const Graph &g, const VertexId target,
+ const size_t bound, const size_t max_vertex_number)
+ : base(g),
+ target_(target),
+ bound_(bound),
+ max_vertex_number_(max_vertex_number),
+ vertices_number_(0),
+ vertex_limit_exceeded_(false) {
+ }
+
+ virtual bool CheckProcessVertex(const VertexId vertex, const size_t distance) {
+ ++vertices_number_;
+
+ if (vertices_number_ > max_vertex_number_)
+ vertex_limit_exceeded_ = true;
+
+ if (vertex == target_) {
+ this->set_finished(true);
+ }
+
+ return (vertices_number_ < max_vertex_number_) && (distance <= bound_);
+ }
+
+ bool VertexLimitExceeded() const {
+ return vertex_limit_exceeded_;
+ }
+
+ private:
+ const VertexId target_;
+ const size_t bound_;
+ const size_t max_vertex_number_;
+ size_t vertices_number_;
+ bool vertex_limit_exceeded_;
+};
+*/
+
+}
+
diff --git a/src/projects/cap/stats.hpp b/src/projects/cap/stats.hpp
new file mode 100644
index 0000000..4d7f1ef
--- /dev/null
+++ b/src/projects/cap/stats.hpp
@@ -0,0 +1,1502 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "assembly_graph/graph_support/component_filters.hpp"
+#include "assembly_graph/components/graph_component.hpp"
+#include "assembly_graph/components/splitters.hpp"
+#include "utils.hpp"
+#include "dev_support/simple_tools.hpp"
+#include "comparison_utils.hpp"
+#include "assembly_graph/graph_support/basic_graph_stats.hpp"
+#include "coloring.hpp"
+#include "visualization/visualization_utils.hpp"
+
+namespace cap {
+
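+// Component is a comparable fingerprint of a vertex subset: it collects the
+// lengths of all edges between the vertices, sorted in decreasing order, and
+// compares components lexicographically by that length list.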
+template<class Graph>
+class Component {
+private:
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+
+ vector<size_t> edge_lengths_;
+ vector<VertexId> component_;
+public:
+ Component(const Graph &g, const vector<VertexId> &component) :
+ component_(component) {
+ for (size_t i = 0; i < component.size(); i++)
+ for (size_t j = 0; j < component.size(); j++) {
+ vector<EdgeId> edges = g.GetEdgesBetween(component[i],
+ component[j]);
+ for (auto it = edges.begin(); it != edges.end(); ++it) {
+ edge_lengths_.push_back(g.length(*it));
+ }
+ }
+ std::sort(edge_lengths_.rbegin(), edge_lengths_.rend());
+ }
+
+ bool operator<(const Component<Graph> &that) const {
+ size_t i = 0;
+ while (i < this->edge_lengths_.size() && i < that.edge_lengths_.size()
+ && this->edge_lengths_[i] == that.edge_lengths_[i])
+ i++;
+ if (i == that.edge_lengths_.size())
+ return false;
+ if (i == this->edge_lengths_.size())
+ return true;
+ return this->edge_lengths_[i] < that.edge_lengths_[i];
+ }
+
+ bool operator==(const Component<Graph> &that) const {
+ if (this->edge_lengths_.size() != that.edge_lengths_.size())
+ return false;
+ for (size_t i = 0; i < this->edge_lengths_.size(); i++)
+ if (this->edge_lengths_[i] != that.edge_lengths_[i])
+ return false;
+ return true;
+ }
+
+ const vector<size_t> &edge_lengths() const {
+ return edge_lengths_;
+ }
+
+};
+
+template<class Stream, class Graph>
+Stream &operator<<(Stream &stream, const Component<Graph> &component) {
+ const vector<size_t> &lengths = component.edge_lengths();
+ for (size_t i = 0; i < lengths.size(); i++) {
+ stream << lengths[i] << " ";
+ }
+ return stream;
+}
+
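+// ComponentClassifier assigns a small breakpoint-graph component to one of the
+// component_type categories below, based on its vertex count, edge colors and
+// topology (isolated single-colored edge, bulge, tip, simple misassembly,
+// monochrome component, or a more complex misassembly).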
+template<class Graph>
+class ComponentClassifier {
+private:
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+
+public:
+ enum component_type {
+ any = 0,
+ error,
+ single_red,
+ single_blue,
+ simple_bulge,
+ tip,
+ simple_misassembly,
+ monochrome,
+ complex_misassembly,
+ size
+ };
+
+ static string info_printer_pos_name(size_t t) {
+ const string names[] = { "all", "error", "single_red", "single_blue",
+ "simple_bulge", "tip", "simple_misassembly", "monochrome",
+ "complex_misassembly", "size" };
+ return names[t];
+ }
+
+private:
+ const Graph &g_;
+ const ColorHandler<Graph> &coloring_;
+ const size_t bulge_length_;
+
+public:
+ ComponentClassifier(const Graph &g, const ColorHandler<Graph> &coloring,
+ size_t bulge_length) :
+ g_(g), coloring_(coloring), bulge_length_(bulge_length) {
+ }
+
+ ComponentClassifier(const Graph &g, const ColorHandler<Graph> &coloring) :
+ g_(g), coloring_(coloring), bulge_length_(g_.k() * 1000000) {
+ }
+
+ TColorSet GetColour(EdgeId edge) const {
+ return coloring_.Color(edge);
+ }
+
+ bool CheckSimpleMisassembly(const vector<VertexId> &component) const {
+ if (component.size() == 4) {
+ for (size_t i = 0; i < 4; i++)
+ for (size_t j = i + 1; j < 4; j++) {
+ vector<VertexId> sources;
+ sources.push_back(component[i]);
+ sources.push_back(component[j]);
+ vector<VertexId> sinks;
+ for (size_t k = 0; k < 4; k++) {
+ if (k != i && k != j)
+ sinks.push_back(component[k]);
+ }
+ if (CheckSimpleMisassembly(sources, sinks)) {
+ return true;
+ }
+ }
+ }
+ return false;
+ }
+
+ bool CheckSimpleMisassembly(const vector<VertexId>& sources,
+ const vector<VertexId>& sinks) const {
+ if (sources.size() != 2 || sinks.size() != 2)
+ return false;
+ for (size_t i = 0; i < sources.size(); i++)
+ for (size_t j = 0; j < sinks.size(); j++) {
+ if (g_.GetEdgesBetween(sources[i], sinks[j]).size() != 1) {
+ return false;
+ }
+ }
+ for (size_t i = 0; i < 2; i++) {
+ if (g_.GetEdgesBetween(sources[i], sources[1 - i]).size() != 0)
+ return false;
+ if (g_.GetEdgesBetween(sinks[i], sinks[1 - i]).size() != 0)
+ return false;
+ }
+ for (size_t i = 0; i < 2; i++) {
+ if (GetColour(g_.GetEdgesBetween(sources[i], sinks[0])[0])
+ == GetColour(g_.GetEdgesBetween(sources[i], sinks[1])[0]))
+ return false;
+ if (GetColour(g_.GetEdgesBetween(sources[0], sinks[i])[0])
+ == GetColour(g_.GetEdgesBetween(sources[1], sinks[i])[0]))
+ return false;
+ }
+ return true;
+ }
+
+ bool CheckIsolated(TColorSet colour,
+ const vector<VertexId> &component) const {
+ if (component.size() != 2)
+ return false;
+ vector<EdgeId> edges01 = g_.GetEdgesBetween(component[0], component[1]);
+ vector<EdgeId> edges10 = g_.GetEdgesBetween(component[1], component[0]);
+ vector<EdgeId> edges;
+ edges.insert(edges.end(), edges01.begin(), edges01.end());
+ edges.insert(edges.end(), edges10.begin(), edges10.end());
+ if (edges.size() != 1) {
+ return false;
+ }
+ return GetColour(edges[0]) == colour;
+ }
+
+ bool CheckBulge(const vector<VertexId> &component) const {
+ if (component.size() != 2)
+ return false;
+ vector<EdgeId> edges01 = g_.GetEdgesBetween(component[0], component[1]);
+ vector<EdgeId> edges10 = g_.GetEdgesBetween(component[1], component[0]);
+ vector<EdgeId> edges;
+ edges.insert(edges.end(), edges01.begin(), edges01.end());
+ edges.insert(edges.end(), edges10.begin(), edges10.end());
+ return (edges01.size() == 0 || edges10.size() == 0) && edges.size() == 2
+ && g_.length(edges[0]) < bulge_length_
+ && g_.length(edges[1]) < bulge_length_;
+ }
+
+ size_t EdgeNumber(const vector<VertexId> &component) const {
+ size_t result = 0;
+ for (size_t i = 0; i < component.size(); i++)
+ for (size_t j = 0; j < component.size(); j++) {
+ result += g_.GetEdgesBetween(component[i], component[j]).size();
+ }
+ return result;
+ }
+
+ bool Connected(VertexId v1, VertexId v2) const {
+ return g_.GetEdgesBetween(v1, v2).size() > 0;
+ }
+
+ bool CheckTip(const vector<VertexId> &component) const {
+ if (component.size() != 3)
+ return false;
+ if (EdgeNumber(component) != 2)
+ return false;
+ for (size_t i = 0; i < 3; i++) {
+ if (CheckFork(component[i], component[(i + 1) % 3],
+ component[(i + 2) % 3]))
+ return true;
+ }
+ return false;
+ }
+
+ bool CheckFork(VertexId base, VertexId tip1, VertexId tip2) const {
+ return (Connected(base, tip1) && Connected(base, tip2))
+ || (Connected(tip1, base) && Connected(tip2, base));
+ }
+
+ bool CheckMonochrome(const vector<VertexId> &component) const {
+ set<TColorSet> colours;
+ for (size_t i = 0; i < component.size(); i++)
+ for (size_t j = 0; j < component.size(); j++) {
+ vector<EdgeId> edges = g_.GetEdgesBetween(component[i],
+ component[j]);
+ for (auto it = edges.begin(); it != edges.end(); ++it) {
+ colours.insert(GetColour(*it));
+ }
+ }
+ return colours.size() == 1;
+ }
+
+ component_type GetComponentType(const vector<VertexId> &component) const {
+ if (component.size() < 2)
+ return component_type::error;
+ if (component.size() == 2) {
+ if (CheckIsolated(kRedColorSet, component))
+ return component_type::single_red;
+ if (CheckIsolated(kBlueColorSet, component))
+ return component_type::single_blue;
+ if (CheckBulge(component)) {
+ return component_type::simple_bulge;
+ }
+ return component_type::complex_misassembly;
+ }
+ if (CheckMonochrome(component))
+ return component_type::monochrome;
+ if (CheckTip(component)) {
+ return component_type::tip;
+ }
+ if (CheckSimpleMisassembly(component))
+ return component_type::simple_misassembly;
+ return component_type::complex_misassembly;
+ }
+};
+
+template<class Graph>
+class ComponentTypeFilter: public GraphComponentFilter<Graph> {
+private:
+ typedef GraphComponentFilter<Graph> base;
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+ typedef typename ComponentClassifier<Graph>::component_type component_type;
+
+ component_type to_draw_;
+ ComponentClassifier<Graph> cc_;
+
+public:
+ ComponentTypeFilter(const Graph &g, component_type to_draw,
+ const ColorHandler<Graph> &coloring)
+ : base(g),
+ to_draw_(to_draw),
+ cc_(g, coloring) {
+ }
+
+ /*virtual*/
+ bool Check(const vector<VertexId>& component) const {
+ return to_draw_ == component_type::any
+ || cc_.GetComponentType(component) == to_draw_;
+ }
+
+private:
+ DECL_LOGGER("ComponentTypeFilter")
+ ;
+};
+
+template<class Graph>
+class BreakPointsFilter: public GraphComponentFilter<Graph> {
+ typedef GraphComponentFilter<Graph> base;
+ typedef typename Graph::VertexId VertexId;
+ typedef typename Graph::EdgeId EdgeId;
+
+ const ColorHandler<Graph> coloring_;
+ size_t color_threshold_;
+public:
+ BreakPointsFilter(const Graph& graph, const ColorHandler<Graph>& coloring,
+ size_t color_threshold) :
+ base(graph), coloring_(coloring), color_threshold_(color_threshold) {
+
+ }
+
+ bool MultiColored(const GraphComponent<Graph>& component) const {
+ set<TColorSet> colors;
+ for (auto it = component.e_begin(); it != component.e_end(); ++it) {
+ colors.insert(coloring_.Color(*it));
+ }
+ return colors.size() >= color_threshold_;
+ }
+
+ /*virtual*/
+ //todo change to set or GraphComponent and add useful protected methods
+    bool Check(const vector<VertexId> &component_vertices) const {
+        GraphComponent<Graph> component(this->graph(),
+                component_vertices.begin(), component_vertices.end());
+ return component.v_size() > 2 && MultiColored(component);
+ }
+
+};
+
+template<class Graph>
+class BreakPointGraphStatistics: public GraphComponentFilter<Graph> {
+private:
+ typedef GraphComponentFilter<Graph> base;
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+ typedef typename ComponentClassifier<Graph>::component_type component_type;
+
+ const ColorHandler<Graph> &coloring_;
+ ComponentClassifier<Graph> cc_;
+ mutable vector<vector<Component<Graph>>> components_;
+ mutable vector<size_t> total_red_;
+ mutable vector<size_t> total_blue_;
+ bool ready_;
+
+ void UpdateTotalEdgeLength(component_type t, const vector<VertexId>& component) const {
+ for(size_t i = 0; i < component.size(); i++) {
+ for(size_t j = 0; j < component.size(); j++) {
+ vector<EdgeId> edges = this->graph().GetEdgesBetween(component[i], component[j]);
+ for(auto it = edges.begin(); it != edges.end(); ++it) {
+ if(coloring_.Color(*it) == kRedColorSet)
+ total_red_[t] += this->graph().length(*it);
+ if(coloring_.Color(*it) == kBlueColorSet)
+ total_blue_[t] += this->graph().length(*it);
+ }
+ }
+ }
+ }
+
+ void UpdateStats(const vector<VertexId>& component) const {
+ component_type t = cc_.GetComponentType(component);
+ Component<Graph> c(this->graph(), component);
+ components_[t].push_back(c);
+ UpdateTotalEdgeLength(t, component);
+ }
+
+public:
+ BreakPointGraphStatistics(const Graph &g, const ColorHandler<Graph> &coloring) :
+ base(g), coloring_(coloring), cc_(g, coloring), components_(component_type::size), total_red_(component_type::size), total_blue_(component_type::size), ready_(
+ false) {
+ }
+
+ /*virtual*/bool Check(const vector<VertexId>& component) const {
+ UpdateStats(component);
+ return false;
+ }
+
+ void CountStats() {
+ EmptyGraphLabeler<Graph> labeler;
+ make_dir("assembly_compare");
+ shared_ptr<GraphSplitter<Graph>> splitter = LongEdgesExclusiveSplitter<Graph>(this->graph(), 1000000000);
+ WriteComponents(this->graph(), *splitter, *this,
+ "assembly_compare/breakpoint_graph.dot",
+ *ConstructColorer(coloring_), labeler);
+ ready_ = true;
+ for (size_t i = 0; i < component_type::size; ++i) {
+ INFO("Number of components of type " << ComponentClassifier<Graph>::info_printer_pos_name(i) << " is " << GetComponentNumber(i));
+ INFO("Total length of red edges in these components is " << total_red_[i]);
+ INFO("Total length of blue edges in these components is " << total_blue_[i]);
+ }
+ }
+
+ size_t GetComponentNumber(size_t t) const {
+ return components_[t].size();
+ }
+
+ const vector<Component<Graph>> &GetComponents(size_t t) const {
+ std::sort(components_[t].rbegin(), components_[t].rend());
+ return components_[t];
+ }
+
+private:
+ DECL_LOGGER("BreakPointGraphStatistics");
+};
+
+template<class Graph>
+class BPGraphStatCounter {
+private:
+ typedef typename ComponentClassifier<Graph>::component_type component_type;
+ typedef typename Graph::VertexId VertexId;
+ const Graph &graph_;
+ const ColorHandler<Graph> &coloring_;
+ const string output_folder_;
+public:
+ BPGraphStatCounter(const Graph &g, const ColorHandler<Graph> &coloring,
+ const string& output_folder) :
+ graph_(g), coloring_(coloring), output_folder_(output_folder) {
+ }
+
+ void PrintComponents(component_type c_type,
+ const GraphLabeler<Graph>& labeler,
+ bool create_subdir = true) const {
+ string filename;
+ if (create_subdir) {
+ make_dir(output_folder_);
+ string type_dir = output_folder_
+ + ComponentClassifier<Graph>::info_printer_pos_name(c_type)
+ + "/";
+ make_dir(type_dir);
+ string picture_dir = type_dir + "pictures/";
+ make_dir(picture_dir);
+ filename = picture_dir + "breakpoint_graph.dot";
+ } else {
+ filename = output_folder_
+ + ComponentClassifier<Graph>::info_printer_pos_name(c_type)
+ + ".dot";
+ }
+ shared_ptr<GraphSplitter<Graph>> splitter = LongEdgesExclusiveSplitter<Graph>(graph_, 1000000000);
+ ComponentTypeFilter<Graph> stats(graph_, c_type, coloring_);
+
+ WriteComponents(this->graph_, *splitter, stats, filename,
+ *ConstructColorer(coloring_), labeler);
+ }
+
+ void PrintStats(const BreakPointGraphStatistics<Graph> &stats) const {
+ make_dir(output_folder_);
+ for (size_t t = 0; t < component_type::size; t++) {
+ string type_dir = output_folder_
+ + ComponentClassifier<Graph>::info_printer_pos_name(t)
+ + "/";
+ make_dir(type_dir);
+ ofstream stream;
+ stream.open((type_dir + "components.txt").c_str());
+ const vector<Component<Graph>> &components = stats.GetComponents(t);
+ for (auto it = components.begin(); it != components.end(); ++it) {
+ stream << *it << endl;
+ }
+ stream.close();
+ }
+ }
+
+ void CountStats(const GraphLabeler<Graph>& labeler, bool detailed_output =
+ true) const {
+ make_dir(output_folder_);
+ BreakPointGraphStatistics<Graph> stats(graph_, coloring_);
+ stats.CountStats();
+ if (detailed_output) {
+ PrintStats(stats);
+ PrintComponents(component_type::complex_misassembly, labeler);
+ PrintComponents(component_type::monochrome, labeler);
+ PrintComponents(component_type::tip, labeler);
+ PrintComponents(component_type::simple_misassembly, labeler);
+ }
+ PrintComponents(component_type::any, labeler, detailed_output);
+ }
+};
+
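+// TrivialBreakpointFinder flags the endpoints of red edges whose fork (the two
+// outgoing edges of the start vertex, or the two incoming edges of the end
+// vertex) consists of exactly two non-tip edges including a blue one, sorts
+// the candidates by the longest incident red/blue edge, and draws the local
+// neighbourhood of each reported breakpoint.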
+template<class Graph>
+class TrivialBreakpointFinder: public AbstractFilter<
+ vector<typename Graph::VertexId>> {
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+
+ struct bp_comp {
+ bp_comp(const Graph& g, const ColorHandler<Graph>& coloring) :
+ g_(g), coloring_(coloring) {
+ }
+
+ bool operator()(VertexId v1, VertexId v2) {
+ return MaxRedBlueIncLength(v1) > MaxRedBlueIncLength(v2);
+ }
+
+ size_t MaxRedBlueIncLength(VertexId v) {
+ vector<EdgeId> edges;
+ push_back_all(edges, g_.IncomingEdges(v));
+ push_back_all(edges, g_.OutgoingEdges(v));
+ return MaxRedBlueLength(edges);
+ }
+
+ private:
+ const Graph& g_;
+ const ColorHandler<Graph>& coloring_;
+
+ size_t MaxRedBlueLength(const vector<EdgeId> edges) {
+ size_t max_length = 0;
+ for (auto it = edges.begin(); it != edges.end(); ++it) {
+ if (coloring_.Color(*it) == kBlueColorSet
+ || coloring_.Color(*it) == kRedColorSet) {
+ if (g_.length(*it) > max_length) {
+ max_length = g_.length(*it);
+ }
+ }
+ }
+ VERIFY(max_length > 0);
+ return max_length;
+ }
+ };
+
+ const Graph& g_;
+ const ColorHandler<Graph>& coloring_;
+ const EdgesPositionHandler<Graph>& pos_;
+
+ void ReportBreakpoint(VertexId v, const string& folder,
+ const string& prefix) {
+ TRACE("Vertex " << g_.str(v) << " identified as breakpoint");
+ LengthIdGraphLabeler<Graph> basic_labeler(g_);
+ EdgePosGraphLabeler<Graph> pos_labeler(g_, pos_);
+
+ CompositeLabeler<Graph> labeler(basic_labeler, pos_labeler);
+ VERIFY(g_.OutgoingEdgeCount(v) > 0);
+ EdgeId e = g_.OutgoingEdges(v).front();
+ GraphComponent<Graph> component = omnigraph::EdgeNeighborhood(g_, e);
+ visualization::WriteComponent(
+ component,
+ folder + prefix + ToString(g_.int_id(v)) + "_loc.dot",
+ coloring_.ConstructColorer(component), labeler);
+ }
+
+ bool CheckEdges(const vector<EdgeId>& edges) const {
+ std::set<TColorSet> colors;
+ for (auto it = edges.begin(); it != edges.end(); ++it) {
+ colors.insert(coloring_.Color(*it));
+ }
+ return edges.size() == 2 && colors.count(kBlueColorSet) == 1
+ && NotTips(edges);
+ }
+
+ bool IsTip(VertexId v) const {
+ return g_.IncomingEdgeCount(v) + g_.OutgoingEdgeCount(v) == 1;
+ }
+
+ bool IsTip(EdgeId e) const {
+ return (IsTip(g_.EdgeStart(e)) || IsTip(g_.EdgeEnd(e)))
+ && g_.length(e) < 200;
+ }
+
+ bool NotTips(const vector<EdgeId>& edges) const {
+ for (auto it = edges.begin(); it != edges.end(); ++it)
+ if (IsTip(*it))
+ return false;
+ return true;
+ }
+
+public:
+ TrivialBreakpointFinder(const Graph& g, const ColorHandler<Graph>& coloring,
+ const EdgesPositionHandler<Graph>& pos) :
+ g_(g), coloring_(coloring), pos_(pos) {
+ }
+
+ void FindBreakPoints(const string& folder) {
+ vector<VertexId> breakpoints;
+ for (auto it = g_.SmartEdgeBegin(); !it.IsEnd(); ++it) {
+ if (coloring_.Color(*it) == kRedColorSet) {
+ if (CheckEdges(g_.OutgoingEdges(g_.EdgeStart(*it))))
+ breakpoints.push_back(g_.EdgeStart(*it));
+// ReportBreakpoint(g_.EdgeStart(*it));
+ if (CheckEdges(g_.IncomingEdges(g_.EdgeEnd(*it))))
+ breakpoints.push_back(g_.EdgeEnd(*it));
+// ReportBreakpoint(g_.EdgeEnd(*it));
+ }
+ }
+ bp_comp comp(g_, coloring_);
+ std::sort(breakpoints.begin(), breakpoints.end(), comp);
+ for (size_t i = 0; i < breakpoints.size(); ++i) {
+ ReportBreakpoint(
+ breakpoints[i],
+ folder,
+ ToString(i) + "_"
+ + ToString(comp.MaxRedBlueIncLength(breakpoints[i]))
+ + "_");
+ }
+ }
+
+ virtual bool Check(const vector<typename Graph::VertexId> &vertices) const {
+ GraphComponent<Graph> component(g_, vertices.begin(), vertices.end());
+ for (auto it = component.e_begin(); it != component.e_end(); ++it) {
+ if (coloring_.Color(*it) == kRedColorSet) {
+ if (CheckEdges(g_.OutgoingEdges(g_.EdgeStart(*it)))
+ || CheckEdges(g_.IncomingEdges(g_.EdgeEnd(*it))))
+ return true;
+ }
+ }
+ return false;
+ }
+
+private:
+ DECL_LOGGER("TrivialBreakpointFinder")
+ ;
+};
+
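+// SimpleInDelAnalyzer iterates over edges of shortcut_color_, looks for the
+// sub-path of the reference path genome_path_ connecting the same endpoints,
+// and prints (to stderr) length, local-similarity and edit-distance statistics
+// comparing the shortcut edge with that reference path.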
+template<class Graph>
+class SimpleInDelAnalyzer {
+ typedef typename Graph::VertexId VertexId;
+ typedef typename Graph::EdgeId EdgeId;
+ const Graph& g_;
+ const ColorHandler<Graph>& coloring_;
+ const EdgesPositionHandler<Graph>& edge_pos_;
+ const vector<EdgeId> genome_path_;
+ const TColorSet shortcut_color_;
+ const string folder_;
+
+ vector<EdgeId> TryFindPath(size_t pos, VertexId end,
+ size_t edge_count_bound) {
+ vector<EdgeId> answer;
+ for (size_t i = 0;
+ i + pos < genome_path_.size() && i < edge_count_bound; ++i) {
+// if ((coloring_.Color(genome_path_[pos + i]) & shortcut_color_) > 0) {
+// DEBUG("Came into edge of wrong color");
+// return vector<EdgeId>();
+// }
+ answer.push_back(genome_path_[pos + i]);
+ if (g_.EdgeEnd(genome_path_[pos + i]) == end) {
+ return answer;
+ }
+ }
+ DEBUG("Edge bound reached");
+ return vector<EdgeId>();
+ }
+
+ map<TColorSet, size_t> ColorLengths(const vector<EdgeId>& edges) {
+ map<TColorSet, size_t> answer;
+ for (size_t i = 0; i < edges.size(); ++i) {
+ answer[coloring_.Color(edges[i])] += g_.length(edges[i]);
+ }
+ return answer;
+ }
+
+ size_t VioletLengthOfGenomeUnique(const vector<EdgeId>& edges) {
+ size_t answer = 0;
+ for (size_t i = 0; i < edges.size(); ++i) {
+ if (coloring_.Color(edges[i]) == kVioletColorSet
+ && std::count(genome_path_.begin(), genome_path_.end(), edges[i]) == 1) {
+ answer += g_.length(edges[i]);
+ }
+ }
+ return answer;
+ }
+
+ //genome pos exclusive
+ size_t CumulativeGenomeLengthToPos(size_t pos) {
+ size_t answer = 0;
+ for (size_t i = 0; i < pos; ++i) {
+ answer += g_.length(genome_path_[i]);
+ }
+ return answer;
+ }
+
+ pair<vector<EdgeId>, pair<size_t, size_t>> FindGenomePath(VertexId start,
+ VertexId end, size_t edge_count_bound) {
+ for (size_t i = 0; i < genome_path_.size(); ++i) {
+ if (g_.EdgeStart(genome_path_[i]) == start) {
+ vector<EdgeId> path = TryFindPath(i, end, edge_count_bound);
+ if (!path.empty())
+ return make_pair(
+ path,
+ make_pair(
+ CumulativeGenomeLengthToPos(i),
+ CumulativeGenomeLengthToPos(
+ i + path.size())));
+ }
+ }
+ return make_pair(vector<EdgeId>(), make_pair(0, 0));
+ }
+
+ pair<string, pair<size_t, size_t>> ContigIdAndPositions(EdgeId e) {
+ vector<EdgePosition> poss = edge_pos_.GetEdgePositions(e);
+ VERIFY(!poss.empty());
+ if (poss.size() > 1) {
+ WARN("Something strange with assembly positions");
+ return make_pair("", make_pair(0, 0));
+ }
+ EdgePosition pos = poss.front();
+ return make_pair(pos.contigId, make_pair(pos.mr.initial_range.start_pos, pos.mr.initial_range.end_pos));
+ }
+
+ void WriteAltPath(EdgeId e, const vector<EdgeId>& genome_path) {
+ LengthIdGraphLabeler<Graph> basic_labeler(g_);
+ EdgePosGraphLabeler<Graph> pos_labeler(g_, edge_pos_);
+
+ CompositeLabeler<Graph> labeler(basic_labeler, pos_labeler);
+
+ string alt_path_folder = folder_ + ToString(g_.int_id(e)) + "/";
+ make_dir(alt_path_folder);
+ WriteComponentsAlongPath(g_, labeler, alt_path_folder + "path.dot", /*split_length*/
+ 1000, /*vertex_number*/15, TrivialMappingPath(g_, genome_path),
+ *ConstructBorderColorer(g_, coloring_));
+ }
+
+ void Process(EdgeId e, const vector<EdgeId>& genome_path,
+ size_t genome_start, size_t genome_end) {
+ DEBUG("Processing edge and genome path");
+ const size_t mem_lim = 2 << 26;
+ Sequence edge_nucls = g_.EdgeNucls(e);
+ Sequence path_nucls = debruijn_graph::MergeSequences(g_, genome_path);
+ size_t edge_length = g_.length(e);
+ size_t path_length = CumulativeLength(g_, genome_path);
+ DEBUG(
+ "Diff length " << abs((int) edge_length - (int) path_length)
+ << "; genome path length " << path_length
+ << "; edge length " << edge_length);
+ pair<string, pair<size_t, size_t>> c_id_and_pos = ContigIdAndPositions(
+ e);
+
+ if (c_id_and_pos.first == "")
+ return;
+
+ WriteAltPath(e, genome_path);
+ size_t unique_violet = VioletLengthOfGenomeUnique(genome_path);
+ map<TColorSet, size_t> color_cumm_lengths = ColorLengths(genome_path);
+ if (color_cumm_lengths[kVioletColorSet] * 10
+ > color_cumm_lengths[kBlueColorSet]) {
+ WARN(
+ "Very long path along violet edges: "
+ << color_cumm_lengths[kVioletColorSet]
+ << " while blue path length: "
+ << color_cumm_lengths[kBlueColorSet]);
+ WARN("While processing edge: " << g_.str(e));
+ }
+ if (color_cumm_lengths[kVioletColorSet] > 0)
+ DEBUG("Violet edges in path");
+
+ DEBUG("Total blue " << color_cumm_lengths[kBlueColorSet]);
+ DEBUG("Total violet " << color_cumm_lengths[kVioletColorSet]);
+ DEBUG("Total unique violet " << unique_violet);
+
+ if (edge_length * path_length <= mem_lim) {
+ size_t edit_dist = EditDistance(edge_nucls, path_nucls);
+ DEBUG(
+ "Edit distance " << edit_dist << ". That is "
+ << double(edit_dist) / max(edge_length, path_length));
+ pair<size_t, size_t> local_sim = LocalSimilarity(edge_nucls,
+ path_nucls);
+ DEBUG(
+ "Local sim " << local_sim.first << " interval length "
+ << local_sim.second << " relative "
+ << ((double) local_sim.first / local_sim.second));
+// assembly_length-genome_length relative_local_sim genome_path_length assembly_length genome_length
+// contig_id contig_start contig_end genome_start genome_end min max local_sim sim_interval edit_dist edit_dist/max tot_blue
+// tot_violet unique_violet edge_id
+ cerr
+ << str(
+ format(
+ "%d %f %d %d %d %s %d %d %d %d %d %d %d %d %d %f %d %d %d %d")
+ % ((int) edge_length - (int) path_length)
+ % ((double) local_sim.first
+ / local_sim.second)
+ % genome_path.size() % edge_length
+ % path_length % c_id_and_pos.first
+ % c_id_and_pos.second.first
+ % c_id_and_pos.second.second % genome_start
+ % genome_end % min(edge_length, path_length)
+ % max(edge_length, path_length)
+ % local_sim.first % local_sim.second
+ % edit_dist
+ % (double(edit_dist)
+ / max(edge_length, path_length))
+ % color_cumm_lengths[kBlueColorSet]
+ % color_cumm_lengths[kVioletColorSet]
+ % unique_violet
+ % g_.int_id(e)) << endl;
+ } else {
+ WARN("Edges were too long");
+ }
+ }
+
+ void AnalyzeShortcutEdge(EdgeId e) {
+ DEBUG("Analysing edge " << g_.str(e));
+ pair<vector<EdgeId>, pair<size_t, size_t>> genome_path = FindGenomePath(
+ g_.EdgeStart(e), g_.EdgeEnd(e), /*edge count bound*//*100*/300);
+ if (!genome_path.first.empty()) {
+ DEBUG(
+ "Non empty genome path of edge count "
+ << genome_path.first.size());
+ DEBUG("Path " << g_.str(genome_path.first));
+ Process(e, genome_path.first, genome_path.second.first,
+ genome_path.second.second);
+ } else {
+ DEBUG("Empty genome path");
+ }
+ }
+
+public:
+ SimpleInDelAnalyzer(const Graph& g, const ColorHandler<Graph>& coloring,
+ const EdgesPositionHandler<Graph>& edge_pos,
+ const vector<EdgeId> genome_path, TColorSet shortcut_color,
+ const string& folder) :
+ g_(g), coloring_(coloring), edge_pos_(edge_pos), genome_path_(
+ genome_path), shortcut_color_(shortcut_color), folder_(
+ folder) {
+ }
+
+ void Analyze() {
+ cerr
+ << "assembly_length-genome_length relative_local_sim genome_path_length assembly_length genome_length "
+ << "contig_id contig_start contig_end genome_start genome_end min max local_sim sim_interval edit_dist "
+ << "edit_dist/max tot_blue tot_violet unique_violet edge_id" << endl;
+ for (auto it = g_.SmartEdgeBegin(); !it.IsEnd(); ++it) {
+ if (coloring_.Color(*it) == shortcut_color_) {
+ AnalyzeShortcutEdge(*it);
+ }
+ }
+ }
+private:
+ DECL_LOGGER("SimpleInDelAnalyzer")
+ ;
+};
+
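+// SimpleRearrangementDetector reports red edges whose start vertex has exactly
+// two outgoing edges (one of them blue) and whose end vertex has exactly two
+// incoming edges (one of them blue); when the two blue edges are distinct and
+// each maps to a single position in the reference, the red edge is written out
+// as a possible rearrangement connection together with those positions.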
+template<class gp_t>
+class SimpleRearrangementDetector {
+private:
+ typedef typename gp_t::graph_t Graph;
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+ const gp_t& gp_;
+ const ColorHandler<Graph>& coloring_;
+ const string ref_prefix_;
+ const string folder_;
+ mutable size_t count_;
+
+ void ReportPossibleRearrangementConnection(EdgeId e, int start_ref_pos,
+ int end_ref_pos, const string& folder) const {
+ INFO(
+ "Edge " << gp_.g.str(e)
+ << " identified as rearrangement connection");
+ LengthIdGraphLabeler<Graph> basic_labeler(gp_.g);
+ EdgePosGraphLabeler<Graph> pos_labeler(gp_.g, gp_.edge_pos);
+
+ CompositeLabeler<Graph> labeler(basic_labeler, pos_labeler);
+
+ INFO(
+ count_ << " example start_ref_pos: " << start_ref_pos
+ << " end_ref_pos: " << end_ref_pos);
+ string filename = str(
+ boost::format("%s%d_%d_%d_%d.dot") % folder % count_
+ % gp_.g.int_id(e) % start_ref_pos % end_ref_pos);
+ GraphComponent<Graph> component = omnigraph::EdgeNeighborhood(gp_.g, e);
+ omnigraph::visualization::WriteComponent(component, filename, coloring_.ConstructColorer(component), labeler);
+ count_++;
+ }
+
+ bool ContainsBlueEdge(const vector<EdgeId>& edges) const {
+ for (size_t i = 0; i < edges.size(); ++i) {
+ if (coloring_.Color(edges[i]) == kBlueColorSet)
+ return true;
+ }
+ return false;
+ }
+
+ EdgeId GetBlueEdge(const vector<EdgeId>& edges) const {
+ for (size_t i = 0; i < edges.size(); ++i) {
+ if (coloring_.Color(edges[i]) == kBlueColorSet)
+ return edges[i];
+ }
+ VERIFY(false);
+ return EdgeId(NULL);
+ }
+
+ int GetRefPosition(EdgeId e, bool start_position) const {
+ EdgePosition pos =
+ RefPositions(gp_.edge_pos.GetEdgePositions(e)).front();
+ int coeff = boost::ends_with(pos.contigId, "_RC") ? -1 : 1;
+ Range range = pos.mr.initial_range;
+ return coeff * (start_position ? range.start_pos : range.end_pos);
+ }
+
+ bool IsSingleRefPosition(EdgeId e) const {
+ return RefPositions(gp_.edge_pos.GetEdgePositions(e)).size() == 1;
+ }
+
+ vector<EdgePosition> RefPositions(const vector<EdgePosition>& poss) const {
+ vector < EdgePosition > answer;
+ for (auto it = poss.begin(); it != poss.end(); ++it) {
+ if (boost::starts_with(it->contigId, ref_prefix_)) {
+ answer.push_back(*it);
+ }
+ }
+ return answer;
+ }
+
+public:
+ SimpleRearrangementDetector(const gp_t& gp,
+ const ColorHandler<Graph>& coloring, const string& ref_prefix,
+ const string& folder) :
+ gp_(gp), coloring_(coloring), ref_prefix_(ref_prefix), folder_(
+ folder), count_(0) {
+ }
+
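+    // Looks for red edges whose start vertex has exactly two outgoing edges
+    // including a blue one, and whose end vertex has exactly two incoming
+    // edges including a blue one; if those two blue edges differ and each maps
+    // to a single reference position, the red edge is reported as a possible
+    // rearrangement connection.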
+ void Detect() const {
+ for (auto it = gp_.g.SmartEdgeBegin(); !it.IsEnd(); ++it) {
+ if (coloring_.Color(*it) == kRedColorSet) {
+ INFO("Processing red edge " << gp_.g.str(*it));
+ if (gp_.g.OutgoingEdgeCount(gp_.g.EdgeStart(*it)) == 2
+ && ContainsBlueEdge(
+ gp_.g.OutgoingEdges(gp_.g.EdgeStart(*it)))) {
+ EdgeId first_edge = GetBlueEdge(
+ gp_.g.OutgoingEdges(gp_.g.EdgeStart(*it)));
+ if (gp_.g.IncomingEdgeCount(gp_.g.EdgeEnd(*it)) == 2
+ && ContainsBlueEdge(
+ gp_.g.IncomingEdges(gp_.g.EdgeEnd(*it)))) {
+ EdgeId second_edge = GetBlueEdge(
+ gp_.g.IncomingEdges(gp_.g.EdgeEnd(*it)));
+ if (first_edge != second_edge) {
+ INFO("Edges passed topology checks");
+ if (IsSingleRefPosition(first_edge)
+ && IsSingleRefPosition(second_edge)) {
+ int start_ref_pos = GetRefPosition(first_edge,
+ true);
+ int end_ref_pos = GetRefPosition(second_edge,
+ false);
+ INFO("Edges had multiplicity one in reference");
+ ReportPossibleRearrangementConnection(*it,
+ start_ref_pos, end_ref_pos, folder_);
+ } else {
+                            INFO(
+                                    "Edges had multiplicity more than one in reference");
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+
+ DECL_LOGGER("SimpleRearrangementDetector")
+ ;
+};
+
+//template<class Graph>
+//class GraphEdgeEnumerator {
+// const Graph& g_;
+// typedef typename Graph::EdgeId EdgeId;
+//protected:
+// GraphEdgeEnumerator(const Graph& g) :
+// g_(g) {
+// }
+//
+// const Graph& g() const {
+// return g_;
+// }
+//public:
+// virtual ~GraphEdgeEnumerator() {
+// }
+// virtual map<EdgeId, string> Enumerate() const;
+//};
+
+//template<class Graph>
+//class ThreadedGenomeEnumerator/*: public GraphEdgeEnumerator<Graph>*/{
+//// typedef GraphEdgeEnumerator<Graph> base;
+// typedef typename Graph::EdgeId EdgeId;
+// const Graph& g_;
+// const vector<EdgeId> genome_path_;
+//public:
+// ThreadedGenomeEnumerator(const Graph& g, const vector<EdgeId>& genome_path) :
+// g_(g), genome_path_(genome_path) {
+// }
+//
+// /*virtual */
+// map<EdgeId, string> Enumerate() const {
+// map<EdgeId, string> answer;
+// //numerating genome path
+// int curr = 0;
+// for (auto it = genome_path_.begin(); it != genome_path_.end(); ++it) {
+// if (answer.find(*it) == answer.end()) {
+// curr++;
+// answer[*it] = ToString(curr);
+// answer[g_.conjugate(*it)] = ToString(-curr);
+// }
+// }
+// curr = 1000000;
+// for (auto it = g_.SmartEdgeBegin(); !it.IsEnd(); ++it) {
+// if (answer.find(*it) == answer.end()) {
+// curr++;
+// answer[*it] = ToString(curr);
+// answer[g_.conjugate(*it)] = ToString(-curr);
+// }
+// }
+// return answer;
+// }
+//};
+
+//todo fixme use exact coordinates!
+template<class Graph>
+class BlockPrinter {
+
+public:
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+
+ BlockPrinter(const Graph& g, const CoordinatesHandler<Graph>& coords,
+ const string& filename)
+ : g_(g),
+ coords_(coords),
+ output_stream_(filename),
+ curr_id_(1) {
+ output_stream_
+ << "genome_id\tcontig_name\tcanonical_id\tcontig_start_pos\tcontig_end_pos"
+ << "\trefined_start_pos\trefined_end_pos\tsign\torig_id"
+ << endl;
+ }
+
+ virtual ~BlockPrinter() {
+ }
+
+ //genome is supposed to perfectly correspond to some path in the graph
+ void ProcessContig(unsigned genome_id, unsigned transparent_id,
+ const string& contig_name) {
+ INFO("Processing contig " << transparent_id << " name " << contig_name);
+ MappingPath<EdgeId> mapping_path = coords_.AsMappingPath(transparent_id);
+
+ size_t graph_pos = 0;
+ for (size_t i = 0; i < mapping_path.size(); ++i) {
+ EdgeId e = mapping_path[i].first;
+ MappingRange mapping = mapping_path[i].second;
+ if (CheckPatternMatch(e)) {
+ auto canon = CanonicalId(e);
+ size_t next_graph_pos = graph_pos + g_.length(e);
+
+ output_stream_
+ << (format("%d\t%s\t%d\t%d\t%d\t%d\t%d\t%s\t%d")
+ % genome_id % contig_name % canon.first
+ % mapping.initial_range.start_pos
+ % mapping.initial_range.end_pos
+ % graph_pos
+ % next_graph_pos
+ % (canon.second ? "+" : "-") % g_.int_id(e)).str()
+ << endl;
+
+ graph_pos = next_graph_pos;
+ }
+ }
+
+// VertexId v = g_.EdgeStart(coords_.FindGenomeFirstEdge(transparent_id));
+// size_t genome_pos = 0;
+//
+// while (true) {
+// auto step = coords_.StepForward(v, transparent_id, genome_pos);
+// if (step.second == -1u)
+// break;
+//
+// EdgeId e = step.first;
+//
+// Range graph_pos(coords_.GetNewestPos(transparent_id, genome_pos),
+// coords_.GetNewestPos(transparent_id, step.second));
+// Range contig_pos(
+// coords_.GetOriginalPos(transparent_id, graph_pos.start_pos),
+// coords_.GetOriginalPos(transparent_id, graph_pos.end_pos));
+// Range graph_pos_printable = coords_.GetPrintableRange(graph_pos);
+// Range contig_pos_printable = coords_.GetPrintableRange(contig_pos);
+//
+// if (CheckPatternMatch(e)) {
+// auto canon = CanonicalId(e);
+//
+// output_stream_
+// << (format("%d\t%s\t%d\t%d\t%d\t%d\t%d\t%s\t%d")
+// % genome_id % contig_name % canon.first
+// % contig_pos_printable.start_pos
+// % contig_pos_printable.end_pos
+// % graph_pos_printable.start_pos
+// % graph_pos_printable.end_pos
+// % (canon.second ? "+" : "-") % g_.int_id(e)).str()
+// << endl;
+// }
+//
+// v = g_.EdgeEnd(e);
+// genome_pos = step.second;
+// }
+ }
+
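+    // Converts the tab-separated block table written by ProcessContig into
+    // GRIMM format: a "> genome_name" header per genome followed by signed
+    // block ids, wrapped after roughly ten ids per line.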
+ static void ConvertBlocksToGRIMM(const string &file_from,
+ const string &file_to) {
+ ifstream in(file_from);
+ ofstream out(file_to);
+
+ size_t id = 0;
+ int last_genome_id = -1;
+ size_t num_in_line = 0;
+ while (!in.eof()) {
+ ++id;
+
+ string line;
+ std::getline(in, line);
+ if (id == 1)
+ continue;
+
+ if (line == "")
+ continue;
+
+ std::stringstream ss(line);
+
+ int genome_id;
+ string genome_name;
+ string sign;
+ size_t contig_id;
+
+ string tmp;
+ ss >> genome_id >> genome_name >> contig_id >> tmp >> tmp >> tmp
+ >> tmp >> sign >> tmp;
+ if (genome_id != last_genome_id) {
+ if (last_genome_id != -1)
+ out << "\n";
+ out << "> " << genome_name << "\n";
+
+ last_genome_id = genome_id;
+ num_in_line = 0;
+ }
+
+ if (num_in_line > 10) {
+ out << "\n";
+ num_in_line = 0;
+ }
+
+ if (num_in_line != 0)
+ out << " ";
+
+ if (sign == "-")
+ out << sign;
+ out << contig_id;
+
+ num_in_line++;
+ }
+
+ in.close();
+ out.close();
+ }
+
+protected:
+ virtual bool CheckPatternMatch(const EdgeId /* e */) {
+ return true;
+ }
+
+ const Graph& g_;
+ const CoordinatesHandler<Graph>& coords_;
+
+private:
+ ofstream output_stream_;
+ size_t curr_id_;
+ map<EdgeId, size_t> block_id_;
+
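+    // Assigns one block id per canonical edge: an edge and its conjugate share
+    // the same id, and the returned flag tells whether e itself (true) or its
+    // conjugate (false) owns it.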
+ pair<size_t, bool> CanonicalId(EdgeId e) {
+// size_t id = gp_.g.int_id(e);
+// size_t conj_id = gp_.g.int_id(gp_.g.conjugate(e));
+// if (id <= conj_id)
+// return make_pair(id, true);
+// else
+// return make_pair(conj_id, false);
+
+ if (block_id_.count(e) > 0) {
+ return make_pair(get(block_id_, e), true);
+ } else if (block_id_.count(g_.conjugate(e)) > 0) {
+ return make_pair(get(block_id_, g_.conjugate(e)), false);
+ } else {
+ block_id_[e] = curr_id_++;
+ return make_pair(get(block_id_, e), true);
+ }
+ }
+
+ DECL_LOGGER("BlockPrinter");
+};
+
+template<class Graph>
+class UniqueBlockPrinter : public BlockPrinter<Graph> {
+ public:
+ UniqueBlockPrinter(const Graph &g, const CoordinatesHandler<Graph> &coords,
+ const string &filename, const vector<pair<size_t, size_t>> rc_pairs)
+ : BlockPrinter<Graph>(g, coords, filename),
+ rc_pairs_(rc_pairs),
+ contig_map_(),
+ cur_time_(rc_pairs_.size()),
+ glob_time_(0) {
+ PrepareContigMap(rc_pairs);
+ }
+
+ // virtual ~UniqueBlockPrinter() {
+ // }
+
+ protected:
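+  // An edge passes only if it has exactly one coordinate range per
+  // (contig, reverse-complement) pair; the glob_time_/cur_time_ counters act
+  // as a cheap per-call "already seen" marker, so no set has to be cleared on
+  // every invocation.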
+ virtual bool CheckPatternMatch(const EdgeId e) {
+ glob_time_++;
+
+ const auto &ranges = this->coords_.GetRawRanges(e);
+ if (ranges.size() != rc_pairs_.size())
+ return false;
+
+ for (const auto &e : ranges) {
+ size_t my_id = contig_map_.at(e.first);
+ if (cur_time_[my_id] == glob_time_)
+ return false;
+ cur_time_[my_id] = glob_time_;
+ }
+
+    // By the Dirichlet (pigeonhole) principle, every contig pair is covered exactly once.
+ return true;
+ }
+
+ private:
+ void PrepareContigMap(const vector<pair<size_t, size_t>> rc_pairs) {
+ for (size_t i = 0; i < rc_pairs.size(); ++i) {
+ const auto &p = rc_pairs[i];
+
+ contig_map_[p.first] = i;
+ contig_map_[p.second] = i;
+ }
+ }
+
+ vector<pair<size_t, size_t>> rc_pairs_;
+ unordered_map<size_t, size_t> contig_map_;
+ vector<size_t> cur_time_;
+ size_t glob_time_;
+
+ DECL_LOGGER("UniqueBlockPrinter");
+};
+
+//template<class Graph, class Mapper>
+//class ContigBlockStats {
+// typedef ThreadedGenomeEnumerator<Graph> Enumerator;
+// const Graph& g_;
+// const EdgesPositionHandler<Graph>& edge_pos_;
+// const Mapper mapper_;
+// const vector<EdgeId> genome_path_;
+// ContigStream& contigs_;
+// const map<EdgeId, string> labels_;
+//
+// const string& get(const map<EdgeId, string>& from, EdgeId key) const {
+// auto it = from.find(key);
+// VERIFY(it != from.end());
+// return it->second;
+// }
+//
+// void ReportGenomeBlocks() const {
+// set < EdgeId > visited;
+//// cerr << "Genome blocks started" << endl;
+// for (auto it = genome_path_.begin(); it != genome_path_.end(); ++it) {
+// if (visited.count(*it) > 0)
+// continue;
+// cerr << get(labels_, *it) << " $ "
+// << g_.int_id(*it)
+// << " $ "
+// << g_.length(*it)
+// /*<< " positions: " << edge_pos_.GetEdgePositions(*it) */<< endl;
+// visited.insert(*it);
+// visited.insert(g_.conjugate(*it));
+// }
+//// cerr << "Genome blocks ended" << endl;
+// }
+//
+// void ReportOtherBlocks() const {
+//// cerr << "Other blocks started" << endl;
+// for (auto it = labels_.begin(); it != labels_.end(); ++it) {
+// if (boost::lexical_cast<int>(it->second) > (int) 1000000) {
+// cerr << get(labels_, it->first) << " $ "
+// << g_.int_id(it->first)
+// << " $ "
+// << g_.length(it->first)
+// /*<< " positions: " << edge_pos_.GetEdgePositions(it->first) */<< endl;
+// }
+// }
+//// cerr << "Other blocks ended" << endl;
+// }
+//
+// void ReportContigs() const {
+// contigs_.reset();
+// Contig contig;
+// cerr << "Contigs started" << endl;
+// while (!contigs_.eof()) {
+// contigs_ >> contig;
+// vector < EdgeId > path =
+// mapper_.MapSequence(contig.sequence()).simple_path();
+// cerr << contig.name() << " $ ";
+// string delim = "";
+// for (auto it = path.begin(); it != path.end(); ++it) {
+// cerr << delim << get(labels_, *it);
+// delim = ";";
+// }
+// cerr << endl;
+// }
+// cerr << "Contigs ended" << endl;
+// }
+//
+//public:
+// ContigBlockStats(const Graph& g,
+// const EdgesPositionHandler<Graph>& edge_pos, const Mapper& mapper,
+// const Sequence& genome, ContigStream& contigs) :
+// g_(g), edge_pos_(edge_pos), mapper_(mapper), genome_path_(
+// mapper_.MapSequence(genome).simple_path()), contigs_(
+// contigs), labels_(Enumerator(g_, genome_path_).Enumerate()) {
+// }
+//
+// void Count() const {
+// cerr << "Block id $ Graph edge id $ (for debug) $ Length (in 201-mers)"
+// << endl;
+//
+// ReportGenomeBlocks();
+// ReportOtherBlocks();
+//
+// cerr << "Contig id $ Block ids" << endl;
+// ReportContigs();
+// }
+//};
+
+template<class Graph>
+class AlternatingPathsCounter {
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+ const Graph& g_;
+ const ColorHandler<Graph>& coloring_;
+
+ TColorSet InvertColor(TColorSet color) const {
+ if (color == kRedColorSet) {
+ return kBlueColorSet;
+ } else if (color == kBlueColorSet) {
+ return kRedColorSet;
+ }
+ VERIFY(false);
+ return kBlueColorSet;
+ }
+
+ vector<EdgeId> FilterEdges(vector<EdgeId> edges, TColorSet color) const {
+ vector<EdgeId> answer;
+ for (size_t i = 0; i < edges.size(); ++i) {
+ if (coloring_.Color(edges[i]) == color) {
+ answer.push_back(edges[i]);
+ }
+ }
+ return answer;
+ }
+
+ vector<EdgeId> OutgoingEdges(VertexId v, TColorSet color) const {
+ DEBUG(
+ "Looking for outgoing edges for vertex " << g_.str(v)
+ << " of color " << color);
+ return FilterEdges(g_.OutgoingEdges(v), color);
+ }
+
+ vector<EdgeId> IncomingEdges(VertexId v, TColorSet color) const {
+ DEBUG(
+ "Looking for incoming edges for vertex " << g_.str(v)
+ << " of color " << color);
+ return FilterEdges(g_.IncomingEdges(v), color);
+ }
+
+ bool CheckNotContains(vector<EdgeId>& path, EdgeId e) const {
+ return std::find(path.begin(), path.end(), e) == path.end();
+ }
+
+ VertexId OtherVertex(EdgeId e, VertexId v) const {
+ VERIFY(
+ g_.EdgeStart(e) != g_.EdgeEnd(e)
+ && (g_.EdgeStart(e) == v || g_.EdgeEnd(e) == v));
+ if (g_.EdgeStart(e) == v) {
+ DEBUG("Next vertex " << g_.EdgeEnd(e));
+ return g_.EdgeEnd(e);
+ }
+ DEBUG("Next vertex " << g_.EdgeStart(e));
+ return g_.EdgeStart(e);
+ }
+
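+    // Extends the path with edges of alternating color, one step per recursive
+    // call; returns true when growth stops because no candidate edge exists,
+    // false on ambiguity (several candidates) or when an edge would repeat.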
+ bool Grow(vector<EdgeId>& path, VertexId last_vertex) const {
+ DEBUG("Growing path for vertex " << g_.str(last_vertex));
+ EdgeId last_edge = path.back();
+ DEBUG("Last edge " << last_edge);
+ TColorSet next_color = InvertColor(coloring_.Color(last_edge));
+ vector<EdgeId> next_candidates =
+ (g_.EdgeEnd(last_edge) == last_vertex) ?
+ IncomingEdges(last_vertex, next_color) :
+ OutgoingEdges(last_vertex, next_color);
+ if (next_candidates.empty()) {
+ DEBUG("No candidates");
+ return true;
+ }
+ if (next_candidates.size() > 1) {
+ DEBUG("Several candidates");
+ return false;
+ }
+ EdgeId next_edge = next_candidates.front();
+ DEBUG(
+ "Adding edge " << g_.str(next_edge) << " of color "
+ << coloring_.Color(next_edge));
+ if (!CheckNotContains(path, next_edge)) {
+            WARN("Alternating path tried to reuse edge " << g_.str(next_edge));
+ return false;
+ }
+
+ path.push_back(next_edge);
+ return Grow(path, OtherVertex(next_edge, last_vertex));
+ }
+
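+    // Builds the maximal alternating path through e by growing backwards from
+    // its start vertex and then forwards from its end vertex; an empty vector
+    // is returned if either direction was ambiguous.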
+ vector<EdgeId> AlternatingPathContainingEdge(EdgeId e) const {
+ vector<EdgeId> answer;
+ vector<EdgeId> tmp_path;
+ DEBUG("Growing backward");
+ tmp_path.push_back(e);
+ if (Grow(tmp_path, g_.EdgeStart(e))) {
+ answer.insert(answer.end(), tmp_path.rbegin(), tmp_path.rend());
+ tmp_path.clear();
+ DEBUG("Growing forward");
+ tmp_path.push_back(e);
+ if (Grow(tmp_path, g_.EdgeEnd(e))) {
+ answer.insert(answer.end(), (++tmp_path.begin()),
+ tmp_path.end());
+ return answer;
+ }
+ }
+ return vector<EdgeId>();
+ }
+
+ void ProcessAltPath(const vector<EdgeId>& path) const {
+ DEBUG("Processing path of length " << path.size());
+ cerr << path.size() << endl;
+ }
+
+public:
+ AlternatingPathsCounter(const Graph& g, const ColorHandler<Graph>& coloring) :
+ g_(g), coloring_(coloring) {
+ }
+
+ void CountPaths() const {
+ set<EdgeId> visited_edges;
+ for (auto it = g_.SmartEdgeBegin(); !it.IsEnd(); ++it) {
+ if (visited_edges.count(*it) > 0)
+ continue;
+ if (coloring_.Color(*it) == kRedColorSet) {
+ DEBUG("Looking for alt path for edge " << g_.str(*it));
+ vector<EdgeId> alt_path = AlternatingPathContainingEdge(*it);
+ if (!alt_path.empty()) {
+ ProcessAltPath(alt_path);
+ visited_edges.insert(alt_path.begin(), alt_path.end());
+ }
+ }
+ }
+ }
+private:
+ DECL_LOGGER("AlternatingPathsCounter")
+ ;
+};
+
+template<class Graph, class Mapper>
+class MissingGenesAnalyser {
+ typedef typename Graph::EdgeId EdgeId;
+ const Graph& g_;
+ const ColorHandler<Graph>& coloring_;
+ const EdgesPositionHandler<Graph>& edge_pos_;
+ const Sequence genome_;
+ const Mapper mapper_;
+ const vector<pair<bool, pair<size_t, size_t>>> locations_;
+ const string output_dir_;
+
+ void ReportLocality(const Sequence& s, const string& out_file) {
+ LengthIdGraphLabeler<Graph> basic_labeler(g_);
+ EdgePosGraphLabeler<Graph> pos_labeler(g_, edge_pos_);
+
+ CompositeLabeler<Graph> labeler(basic_labeler, pos_labeler);
+
+ WriteComponentsAlongPath(g_, labeler, out_file, /*split_length*/1000, /*vertex_number*/15
+ , mapper_.MapSequence(s), *ConstructBorderColorer(g_, coloring_));
+ }
+
+public:
+ MissingGenesAnalyser(const Graph& g, const ColorHandler<Graph>& coloring
+ , const EdgesPositionHandler<Graph>& edge_pos
+ , const Sequence& genome
+ , const Mapper& mapper
+ , const vector<pair<bool, pair<size_t, size_t>>>& locations
+ , const string& output_dir):
+ g_(g), coloring_(coloring), edge_pos_(edge_pos), genome_(genome), mapper_(mapper), locations_(locations), output_dir_(output_dir) {
+
+ }
+
+ void Analyze() {
+ remove_dir(output_dir_);
+ make_dir(output_dir_);
+ for (size_t i = 0; i < locations_.size(); ++i) {
+ pair<bool, pair<size_t, size_t>> location = locations_[i];
+ Sequence locality = genome_.Subseq(location.second.first - g_.k(), location.second.second + g_.k());
+ if (location.first) {
+ locality = !locality;
+ }
+ ReportLocality(locality, output_dir_ + ToString(i) + ".dot");
+ }
+ }
+};}
diff --git a/src/projects/cap/test_utils.hpp b/src/projects/cap/test_utils.hpp
new file mode 100644
index 0000000..3c80aa2
--- /dev/null
+++ b/src/projects/cap/test_utils.hpp
@@ -0,0 +1,143 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include <sys/time.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#include <stdio.h>
+#include <string.h>
+#include <algorithm>
+#include <memory>
+#include <string>
+#include <vector>
+#include <iostream>
+
+#include "compare_standard.hpp"
+#include "pipeline/graphio.hpp"
+
+namespace cap {
+
+namespace utils {
+
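+// RAII helper: creates the given temporary folder on construction and tries to
+// rmdir() it on destruction (which only succeeds once the folder is empty).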
+class TmpFolderFixture {
+ std::string tmp_folder_;
+
+ public:
+ TmpFolderFixture(std::string tmp_folder) {
+ tmp_folder_ = tmp_folder;
+ INFO("Creating " << tmp_folder_ << ": " << make_dir(tmp_folder_));
+ }
+
+ void Stub() {
+ if (1 + 1 == 1) {
+ INFO("LOL WAT");
+ }
+ }
+
+ ~TmpFolderFixture() {
+    // rmdir() succeeds only once the directory is empty, i.e. all indices
+    // stored in it have been destructed.
+    int ret_code = rmdir(tmp_folder_.c_str());
+    INFO("Removing temporary dir: "
+         << (ret_code ? "failure, some indices were not deleted" : "success"));
+ }
+};
+
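+// Accumulates wall-clock time (in seconds) into `time`, scaled by `multiplier`
+// (typically -1 before and +1 after a measured region); `ret` is passed
+// through so the call can be embedded in expressions.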
+int add_time(double &time, int multiplier = 1, int ret = 0) {
+ timeval curtime;
+ gettimeofday(&curtime, 0);
+ time += multiplier * (double(curtime.tv_sec) + double(curtime.tv_usec) * 1e-6);
+ return ret;
+}
+
+//todo remove
+void MakeDirPath(const std::string& path) {
+ if (path.size() == 0) {
+ TRACE("Somewhat delirium: trying to create directory ``");
+ return;
+ }
+
+ size_t slash_pos = 0;
+ while ((slash_pos = path.find_first_of('/', slash_pos + 1)) != std::string::npos) {
+ make_dir(path.substr(0, slash_pos));
+ }
+ if (path[path.size() - 1] != '/') {
+ make_dir(path);
+ }
+}
+
+//todo remove
+bool DirExist(std::string path) {
+ struct stat st;
+ return (stat(path.c_str(), &st) == 0) && (S_ISDIR(st.st_mode));
+}
+
+ContigStreams OpenStreams(const vector<string>& filenames) {
+ ContigStreams streams;
+ for (auto it = filenames.begin(); it != filenames.end(); ++it) {
+ DEBUG("Opening stream from " << *it);
+ streams.push_back(make_shared<io::FileReadStream>(*it));
+ }
+ return streams;
+}
+
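+// Detects an available MD5 tool by hashing a known input and comparing it with
+// the expected digest; falls back to "head -c 20" if neither md5sum nor md5
+// works. The detected command string is cached in a function-local static.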
+std::string GetMD5CommandString() {
+ static std::string answer = "";
+
+ if (answer != "") {
+ return answer;
+ }
+
+  FILE *output;
+  char buf[40];
+  output = popen("echo a | md5sum 2> /dev/null", "r");
+  if (output != NULL) {
+    bool ok = (1 == fscanf(output, "%39s", buf) &&
+               strcmp(buf, "60b725f10c9c85c70d97880dfe8191b3") == 0);
+    pclose(output);
+    if (ok) {
+      return answer = "md5sum ";
+    }
+  }
+
+  output = popen("echo a | md5 2> /dev/null", "r");
+  if (output != NULL) {
+    bool ok = (1 == fscanf(output, "%39s", buf) &&
+               strcmp(buf, "60b725f10c9c85c70d97880dfe8191b3") == 0);
+    pclose(output);
+    if (ok) {
+      return answer = "md5 ";
+    }
+  }
+
+  return answer = "head -c 20 ";
+
+}
+
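+// Produces a short fingerprint of the given files (first 1000 lines of each,
+// plus an optional salt) using the detected MD5 command; note that the paths
+// are only concatenated, not sorted.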
+std::string GenMD5FromFiles(const std::vector<std::string> &paths,
+ const std::string &salt = "") {
+ VERIFY(!paths.empty());
+ std::vector<std::string> paths_s = paths;
+ //std::sort(paths_s.begin(), paths_s.end());
+
+ std::string accum_string = "";
+ for (auto it = paths_s.begin(); it != paths_s.end(); ++it) {
+ accum_string += *it;
+ accum_string += " ";
+ }
+
+ std::cerr << "Using " << GetMD5CommandString() << std::endl;
+
+ FILE *md5_output = popen(("(head -n 1000 " + accum_string + "&& echo " + salt
+ + ") | " + GetMD5CommandString()).c_str(), "r");
+ VERIFY(md5_output != NULL);
+
+ char buf[40];
+ VERIFY(1 == fscanf(md5_output, "%s", buf));
+ pclose(md5_output);
+
+ return std::string(buf);
+}
+
+bool NeedToUseLongSeq(size_t k) {
+ return k > 99;
+}
+
+}
+
+}
diff --git a/src/projects/cap/tools.cpp b/src/projects/cap/tools.cpp
new file mode 100755
index 0000000..41cd674
--- /dev/null
+++ b/src/projects/cap/tools.cpp
@@ -0,0 +1,183 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#include "compare_standard.hpp"
+#include "dev_support/logger/log_writers.hpp"
+#include "pipeline/graphio.hpp"
+#include <boost/test/unit_test.hpp>
+
+#include "comparison_utils.hpp"
+#include "diff_masking.hpp"
+#include "repeat_masking.hpp"
+#include "genome_correction.hpp"
+#include "assembly_compare.hpp"
+#include "test_utils.hpp"
+#include "gene_analysis.hpp"
+
+namespace cap {
+
+BOOST_AUTO_TEST_CASE( CompareEcoli ) {
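+    // NB: the test body is disabled by the early return below and relies on
+    // hard-coded local paths.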
+ return;
+ utils::TmpFolderFixture _("tmp");
+
+ std::string base_path = "/home/snurk/ecoli_refs/";
+
+ vector<std::string> paths = {
+ "H6.fasta",
+ "K12.fasta"
+ };
+
+ vector<size_t> k_sequence = { 5001, 1001, 501, 201, 101, 55, 21 };
+
+// std::string files_md5 = utils::GenMD5FromFiles(paths);
+// INFO("result is stored with md5 of " << files_md5);
+
+ PerformIterativeRefinement(base_path, paths,
+ //"bp_graph_" + files_md5 +
+ base_path + "H6_K12_processed/", k_sequence);
+}
+
+//BOOST_AUTO_TEST_CASE( MaskDiffsForMultiple ) {
+// return;
+// utils::TmpFolderFixture _("tmp");
+//
+// std::string base_path = "/home/snurk/Dropbox/olga_gelf/";
+//
+// /*
+// vector<std::string> paths = {
+// "/home/valich/mrsa/more_strains/MSSA476.fasta",
+// "/home/valich/mrsa/more_strains/MRSA252.fasta",
+// "/home/valich/mrsa/more_strains/TW20.fasta",
+// "/home/valich/mrsa/more_strains/USA300.fasta"
+// };
+// vector<std::string> suffixes = {
+// "mssa476",
+// "rmsa252",
+// "tw20",
+// "usa300"
+// };
+//
+// base_path + "CCDC5079.fasta",
+// base_path + "CCDC5180.fasta"
+//
+// */
+// // E.Coli
+// vector<std::string> paths = {
+//// base_path + "EDL.fasta",
+// base_path + "genomes/Escherichia coli 536.fasta",
+//// base_path + "HS.fasta",
+// base_path + "genomes/Escherichia coli 55989.fasta"
+// };
+// vector<std::string> suffices = {
+// "EC536",
+// "EC55989"
+// };
+//
+// vector<size_t> k_sequence = { /*1001, 501, 201, 101, */55, 21, 15 };
+//
+// /*
+// std::string base_path = "/home/valich/work/human/";
+//
+// vector<std::string> paths = {
+// base_path + "homo_sapiens_X.fasta",
+// base_path + "pan_troglodytes_X.fasta"
+// };
+// vector<std::string> suffices = {
+// "homo_sapiens_X",
+// "pan_troglodytes_X"
+// };
+// vector<size_t> k_sequence = {
+// 101, 55, 21, 15
+// };
+// */
+//
+//// std::string files_md5 = utils::GenMD5FromFiles(paths);
+//// INFO("result is stored with md5 of " << files_md5);
+//
+// MaskDifferencesAndSave(paths, suffices,
+// //"bp_graph_" + files_md5 +
+// base_path + "processed/", k_sequence);
+//}
+
+//BOOST_AUTO_TEST_CASE( TestGeneAnalysis ) {
+// return;
+// utils::TmpFolderFixture _("tmp");
+//// gp_t gp(k, "tmp", Sequence(), 200, true);
+// vector<size_t> ks = {55, 21};
+// PerformIterativeGeneAnalysis("/home/snurk/Dropbox/olga_gelf/", "gene_out", ks);
+//
+//// GeneCollection gene_collection;
+//// string root = "/home/snurk/Dropbox/olga_gelf/";
+//// gene_collection.Load(root, "genome_list.txt",
+//// "/genomes/",
+//// "gs.25ESS_ver3_sf_TN.csv",
+//// "interesting_orthologs.txt");
+//// gene_collection.Update(gp);
+////
+//// ColorHandler<gp_t::graph_t> coloring(gp.g);
+////
+//// make_dir(root + "out/");
+////
+//// WriteGeneLocality(gene_collection, gp, root + "out/", coloring);
+//}
+
+BOOST_AUTO_TEST_CASE( MultipleGenomesVisualization ) {
+ return;
+ typedef KmerStoringEdgeIndex<Graph, LSeq, kmer_index_traits<LSeq>, SimpleStoring> comparing_index_t;
+ typedef debruijn_graph::graph_pack<
+ /*Nonc*/debruijn_graph::ConjugateDeBruijnGraph, LSeq, comparing_index_t> comparing_gp_t;
+ static const size_t K = 1001;
+ utils::TmpFolderFixture _("tmp");
+
+ std::string base_path = "bp_graph/refined/501/";
+
+ // vector of pairs <name, path_to_fasta>
+ /*
+ vector<pair<std::string, std::string> > genomes_paths = {
+ make_pair("MSSA476", "bp_graph_test/refined/mssa476.fasta"),
+ make_pair("MRSA252", "bp_graph_test/refined/mrsa252.fasta"),
+ make_pair("TW20", "bp_graph_test/refined/tw20.fasta"),
+ make_pair("USA300", "bp_graph_test/refined/usa300.fasta")
+ // make_pair("11819", "bp_graph_test/refined/11819.fasta"),
+ // make_pair("COL", "bp_graph_test/refined/COL.fasta")
+ };
+ */
+
+ vector<pair<std::string, std::string> > genomes_paths = {
+// make_pair("EDL", base_path + "EDL.fasta"),
+ make_pair("H6", base_path + "H6.fasta"),
+// make_pair("HS", base_path + "HS.fasta"),
+ make_pair("K12", base_path + "K12.fasta"), make_pair("TW",
+ base_path + "TW.fasta"), make_pair("UTI",
+ base_path + "UTI.fasta")
+// make_pair("CCDC5079", base_path + "CCDC5079.fasta"),
+// make_pair("CCDC5180", base_path + "CCDC5180.fasta")
+ };
+
+ std::string folder = "bp_graph/multiple_genomes_visualization/";
+
+    RunMultipleGenomesVisualization<comparing_gp_t>(K, genomes_paths, folder);
+}
+
+}
+
+::boost::unit_test::test_suite* init_unit_test_suite(int, char*[]) {
+ logging::logger *log = logging::create_logger("", logging::L_INFO);
+ log->add_writer(std::make_shared<logging::console_writer>());
+ logging::attach_logger(log);
+
+ using namespace ::boost::unit_test;
+ char module_name[] = "cap";
+
+ assign_op(framework::master_test_suite().p_name.value,
+ basic_cstring<char>(module_name), 0);
+
+ omp_set_num_threads(1);
+
+ return 0;
+}
diff --git a/src/projects/cap/untangling.hpp b/src/projects/cap/untangling.hpp
new file mode 100644
index 0000000..dc8737b
--- /dev/null
+++ b/src/projects/cap/untangling.hpp
@@ -0,0 +1,345 @@
+////***************************************************************************
+////* Copyright (c) 2011-2014 Saint-Petersburg Academic University
+////* All Rights Reserved
+////* See file LICENSE for details.
+////****************************************************************************
+//
+//#pragma once
+//
+//namespace cap {
+//
+//template<class Graph>
+//struct bp_graph_pack {
+// typedef Graph graph_t;
+// typedef string contig_id_t;
+// typedef typename Graph::EdgeId EdgeId;
+// Graph g;
+// ColorHandler<Graph> coloring;
+// map<contig_id_t, vector<EdgeId>> red_paths;
+// map<contig_id_t, vector<EdgeId>> blue_paths;
+// EdgesPositionHandler<Graph> edge_pos;
+//
+// bp_graph_pack(size_t k) :
+// g(k), coloring(g), edge_pos(g) {
+//
+// }
+//};
+//
+//template<class gp_t>
+//class UntangledGraphContigMapper {
+// typedef typename gp_t::graph_t Graph;
+// typedef typename Graph::EdgeId EdgeId;
+// const bp_graph_pack<Graph>& bp_gp_;
+//
+//public:
+// UntangledGraphContigMapper(const bp_graph_pack<Graph>& bp_gp) :
+// bp_gp_(bp_gp) {
+//
+// }
+//
+// MappingPath<EdgeId> MapRead(const io::SingleRead &read) const {
+// auto it = bp_gp_.red_paths.find(read.name());
+// if (it != bp_gp_.red_paths.end()) {
+// return TrivialMappingPath(it->second);
+// }
+// it = bp_gp_.blue_paths.find(read.name());
+// if (it != bp_gp_.blue_paths.end()) {
+// return TrivialMappingPath(it->second);
+// }VERIFY(false);
+// return MappingPath<EdgeId>();
+// }
+//
+//};
+//
+//template<class gp_t>
+//class UntangledGraphConstructor {
+//private:
+// typedef typename gp_t::graph_t Graph;
+// typedef typename Graph::VertexId VertexId;
+// typedef typename Graph::EdgeId EdgeId;
+//
+// static const size_t k = gp_t::k_value;
+//
+// const gp_t& old_gp_;
+// const ColorHandler<Graph>& old_coloring_;
+// bp_graph_pack<Graph>& new_gp_;
+// restricted::map<EdgeId, EdgeId> purple_edge_mapping_;
+// restricted::map<VertexId, VertexId> vertex_mapping_;
+// //todo draw in different color!
+// restricted::set<VertexId> artificial_vertices_;
+// set<string> processed_contigs_;
+//
+// //todo test that!!!
+// string ConjugateContigId(const string& contig_id) {
+// string answer;
+// if (contig_id.substr(contig_id.size() - 3, 3) == "_RC")
+// answer = contig_id.substr(0, contig_id.size() - 3);
+// else
+// answer = contig_id + "_RC";
+// DEBUG("Conjugate to " << contig_id << " is " << answer);
+// return answer;
+// }
+//
+// void AddToProcessed(const string& contig_id) {
+// processed_contigs_.insert(contig_id);
+// processed_contigs_.insert(ConjugateContigId(contig_id));
+// }
+//
+// VertexId GetStartVertex(const Path<EdgeId> &path, size_t i) {
+// if (i != 0 || path.start_pos() == 0)
+// return vertex_mapping_[old_gp_.g.EdgeStart(path[i])];
+// else {
+// //todo discuss with Anton!!!
+// VertexId art_v = new_gp_.g.AddVertex();
+// WARN("Art vertex added")
+//// VERIFY(false);
+// artificial_vertices_.insert(art_v);
+// return art_v;
+// }
+// }
+//
+// VertexId GetEndVertex(const Path<EdgeId> &path, size_t i) {
+// if (i != path.size() - 1 || path.end_pos() == old_gp_.g.length(path[i]))
+// return vertex_mapping_[old_gp_.g.EdgeEnd(path[i])];
+// else {
+// //todo discuss with Anton!!!
+// VertexId art_v = new_gp_.g.AddVertex();
+// WARN("Art vertex added")
+//// VERIFY(false);
+// artificial_vertices_.insert(art_v);
+// return art_v;
+// }
+// }
+//
+// void Untangle(ContigStream& stream, TColorSet color) {
+// io::SingleRead read;
+// stream.reset();
+// set<string> processed;
+// while (!stream.eof()) {
+// stream >> read;
+// //todo can look at new_gp_.*_paths keys
+// if (processed.count(read.name()) > 0)
+// continue;
+// processed.insert(read.name());
+// processed.insert(ConjugateContigId(read.name()));
+//
+// Untangle(read.sequence(), read.name(), color);
+// }
+// }
+//
+// void Untangle(const Sequence& contig, const string& name, TColorSet color) {
+// VERIFY(color == kRedColorSet || color == kBlueColorSet);
+// DEBUG("Untangling contig " << name);
+// Path<EdgeId> path = MapperInstance(old_gp_).MapSequence(contig).path();
+// vector<EdgeId> new_path;
+// DEBUG("Mapped contig" << name);
+// for (size_t i = 0; i < path.size(); i++) {
+// EdgeId next;
+// if (old_coloring_.Color(path[i]) != kVioletColorSet) {
+// DEBUG("Next edge is not purple");
+// size_t j = i;
+// vector<EdgeId> to_glue;
+// while (j < path.size()
+// && old_coloring_.Color(path[j]) != kVioletColorSet) {
+// to_glue.push_back(path[j]);
+// j++;
+// }
+// Sequence new_edge_sequence = MergeSequences(old_gp_.g, to_glue);
+// next = new_gp_.g.AddEdge(GetStartVertex(path, i),
+// GetEndVertex(path, j - 1), new_edge_sequence);
+// DEBUG(
+// "Added shortcut edge " << new_gp_.g.int_id(next) << " for path " << old_gp_.g.str(to_glue));
+// i = j - 1;
+// } else {
+// DEBUG("Next edge is purple");
+// next = purple_edge_mapping_[path[i]];
+// }
+// new_path.push_back(next);
+// DEBUG("Coloring new edge and complement");
+// PaintEdgeWithVertices(next, color);
+// }
+// if (color == kRedColorSet) {
+// VERIFY(new_gp_.red_paths.find(name) == new_gp_.red_paths.end());
+// new_gp_.red_paths[name] = new_path;
+// new_gp_.red_paths[ConjugateContigId(name)] = ConjugatePath(
+// new_gp_.g, new_path);
+// } else {
+// VERIFY(new_gp_.blue_paths.find(name) == new_gp_.blue_paths.end());
+// new_gp_.blue_paths[name] = new_path;
+// new_gp_.blue_paths[name] = ConjugatePath(new_gp_.g, new_path);
+// }
+// }
+//
+// vector<EdgeId> ConjugatePath(const Graph& g, const vector<EdgeId> path) {
+// vector<EdgeId> answer;
+// for (int i = path.size() - 1; i >= 0; i--) {
+// answer.push_back(g.conjugate(path[i]));
+// }
+// return answer;
+// }
+//
+// template<class T>
+// void ColorWithConjugate(T t, TColorSet color) {
+// new_gp_.coloring.Paint(t, color);
+// new_gp_.coloring.Paint(new_gp_.g.conjugate(t), color);
+// }
+//
+// void PaintEdgeWithVertices(EdgeId e, TColorSet color) {
+// DEBUG(
+// "Coloring edges " << new_gp_.g.int_id(e) << " and " << new_gp_.g.int_id(new_gp_.g.conjugate(e)));
+// ColorWithConjugate(e, color);
+// ColorWithConjugate(new_gp_.g.EdgeStart(e), color);
+// ColorWithConjugate(new_gp_.g.EdgeEnd(e), color);
+// }
+//
+//public:
+// UntangledGraphConstructor(const gp_t &old_gp,
+// const ColorHandler<Graph> &old_coloring,
+// bp_graph_pack<Graph>& new_gp, io::IReader<io::SingleRead> &stream1,
+// io::IReader<io::SingleRead> &stream2) :
+// old_gp_(old_gp), old_coloring_(old_coloring), new_gp_(new_gp) {
+// const Graph& old_graph = old_gp.g;
+// //adding vertices
+// restricted::set<VertexId> processed_purple_v;
+// for (auto it = old_graph.begin(); it != old_graph.end(); ++it) {
+// if (processed_purple_v.count(*it) > 0)
+// continue;
+// processed_purple_v.insert(*it);
+// processed_purple_v.insert(old_graph.conjugate(*it));
+// vertex_mapping_[*it] = new_gp_.g.AddVertex();
+// vertex_mapping_[old_graph.conjugate(*it)] = new_gp_.g.conjugate(
+// vertex_mapping_[*it]);
+// DEBUG(
+// "Adding purple vertex " << new_gp_.g.int_id(vertex_mapping_[*it]) << " corresponding to " << old_graph.int_id(*it) << " and conjugates")
+// }
+//
+// restricted::set<EdgeId> processed_purple;
+// //propagating purple color to new graph
+// for (auto it = old_graph.SmartEdgeBegin(); !it.IsEnd(); ++it) {
+// if (processed_purple.count(*it) > 0)
+// continue;
+// processed_purple.insert(*it);
+// processed_purple.insert(old_graph.conjugate(*it));
+//
+// if (old_coloring.Color(*it) == kVioletColorSet) {
+// EdgeId new_edge = new_gp_.g.AddEdge(
+// vertex_mapping_[old_graph.EdgeStart(*it)],
+// vertex_mapping_[old_graph.EdgeEnd(*it)],
+// old_graph.EdgeNucls(*it));
+// DEBUG(
+// "Adding purple edge " << new_gp_.g.int_id(new_edge) << " corresponding to " << old_graph.int_id(*it) << " and conjugate")
+// purple_edge_mapping_[*it] = new_edge;
+// purple_edge_mapping_[old_graph.conjugate(*it)] =
+// new_gp_.g.conjugate(new_edge);
+// PaintEdgeWithVertices(new_edge, kVioletColorSet);
+// }
+// }
+//
+// VERIFY(new_gp_.red_paths.empty());
+// VERIFY(new_gp_.blue_paths.empty());
+//
+// Untangle(stream1, 0);
+// Untangle(stream2, 1);
+//
+// UntangledGraphContigMapper<bp_graph_pack<Graph>> contig_mapper(new_gp_);
+// FillPos(new_gp_.g, contig_mapper, new_gp_.edge_pos, stream1);
+// FillPos(new_gp_.g, contig_mapper, new_gp_.edge_pos, stream2);
+// }
+//private:
+// DECL_LOGGER("UntangledGraphConstructor")
+// ;
+//};
+//
+////Currently works for conjugate graphs only
+//template<class Graph>
+//class RestrictedOneManyResolver {
+// typedef typename Graph::EdgeId EdgeId;
+// typedef typename Graph::VertexId VertexId;
+//
+// Graph& g_;
+// const ColorHandler<Graph>& coloring_;
+// TColorSet restricting_color_;
+//
+// bool CheckColor(const vector<EdgeId>& edges) {
+// DEBUG("Checking color")
+// for (auto it = edges.begin(); it != edges.end(); ++it) {
+// if (coloring_.Color(*it) != restricting_color_) {
+// DEBUG("fail")
+// return false;
+// }
+// }DEBUG("ok")
+// return true;
+// }
+//
+// bool CheckColor(VertexId v) {
+// return CheckColor(g_.IncomingEdges(v))
+// && CheckColor(g_.OutgoingEdges(v));
+// }
+//
+// bool CheckSimple(const vector<EdgeId>& edges) {
+// DEBUG("Checking simple")
+// for (auto it = edges.begin(); it != edges.end(); ++it) {
+// if (g_.EdgeStart(*it) == g_.EdgeEnd(*it)
+// || g_.EdgeStart(*it) == g_.conjugate(g_.EdgeEnd(*it))) {
+// DEBUG("fail")
+// return false;
+// }
+// }DEBUG("ok")
+// return true;
+// }
+//
+// bool CheckSimple(VertexId v) {
+// return CheckSimple(g_.IncomingEdges(v))
+// && CheckSimple(g_.OutgoingEdges(v));
+// }
+//
+// bool CheckVertex(VertexId v) {
+// return CheckSimple(v) && CheckColor(v);
+// }
+//
+// void SplitVertex(VertexId v) {
+// DEBUG("Splitting vertex " << g_.str(v))
+// EdgeId incoming_edge = g_.GetUniqueIncomingEdge(v);
+// vector<EdgeId> outgoing_edges = g_.OutgoingEdges(v);
+// DEBUG("Going to create " << outgoing_edges.size() << " new edges")
+// for (auto it = outgoing_edges.begin(); it != outgoing_edges.end();
+// ++it) {
+// VertexId copy_vertex = g_.AddVertex(g_.data(v));
+// EdgeId e1 = g_.AddEdge(g_.EdgeStart(incoming_edge), copy_vertex,
+// g_.data(incoming_edge));
+// g_.FireProject(incoming_edge, e1);
+// EdgeId e2 = g_.AddEdge(copy_vertex, g_.EdgeEnd(*it), g_.data(*it));
+// //todo think of better way!!! now not stable and awful because of th order of information transfer!!!
+// g_.FireProject(*it, e2);
+// EdgeId e = g_.MergePath(vector<EdgeId> { e1, e2 });
+// DEBUG("Created edge " << g_.str(e))
+// }
+// g_.ForceDeleteVertex(v);
+// }
+//
+//public:
+// RestrictedOneManyResolver(Graph& g, const ColorHandler<Graph>& coloring,
+// TColorSet restricting_color) :
+// g_(g), coloring_(coloring), restricting_color_(restricting_color) {
+//
+// }
+//
+// void Resolve() {
+// INFO("Running one-many resolve");
+// for (auto it = g_.SmartVertexBegin(); !it.IsEnd(); ++it) {
+// DEBUG("Checking vertex " << g_.str(*it) << " for split.")
+// if (g_.IncomingEdgeCount(*it) == 1 && CheckVertex(*it)) {
+// DEBUG("Condition was satisfied.")
+// SplitVertex(*it);
+// } else {
+// DEBUG("Condition was not satisfied.")
+// }
+// }INFO("Finished one-many resolve");
+// }
+//
+//private:
+// DECL_LOGGER("RestrictedOneManyResolver")
+// ;
+//};
+//
+//}
diff --git a/src/projects/cap/visualization.hpp b/src/projects/cap/visualization.hpp
new file mode 100644
index 0000000..7b862bb
--- /dev/null
+++ b/src/projects/cap/visualization.hpp
@@ -0,0 +1,171 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+#include "assembly_graph/graph_alignment/sequence_mapper.hpp"
+#include "visualization/visualization_utils.hpp"
+
+namespace cap {
+
+//todo refactor all these methods
+
+/*
+ * This filter also rejects components in which every edge
+ * is colored in all colors.
+ */
+template<class Graph>
+class ComponentSingleColorFilter: public GraphComponentFilter<Graph> {
+private:
+ typedef GraphComponentFilter<Graph> base;
+ typedef typename Graph::VertexId VertexId;
+ typedef typename Graph::EdgeId EdgeId;
+
+ ColorHandler<Graph> color_handler_;
+ TColorSet restricted_color_;
+ size_t max_length_;
+ size_t vertex_number_;
+
+public:
+ ComponentSingleColorFilter(const Graph &graph, const ColorHandler<Graph> &color_handler,
+ const TColorSet &restricted_color, size_t max_length, size_t vertex_number)
+ : base(graph),
+ color_handler_(color_handler),
+ restricted_color_(restricted_color),
+ max_length_(max_length),
+ vertex_number_(vertex_number) {
+ }
+
+ /*virtual*/
+ bool Check(const GraphComponent<Graph> &gc) const {
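+    // NB: the early return below currently disables the component filtering
+    // logic that follows.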
+ return true;
+ TRACE("Check component");
+ auto &component = gc.vertices();
+ if (component.size() <= vertex_number_)
+ return false;
+
+ bool length_flag = false,
+ color_flag = false;
+// set < VertexId > component(vertices.begin(), vertices.end());
+ for (auto iterator = component.begin(); iterator != component.end();
+ ++iterator) {
+ for (EdgeId e : this->graph().OutgoingEdges(*iterator)) {
+ if (component.count(this->graph().EdgeEnd(e)) == 1) {
+ if (this->graph().length(e) <= max_length_) {
+ length_flag = true;
+ }
+ if (color_handler_.Color(e) != restricted_color_) {
+ TRACE("Found good color " << color_handler_.Color(e).ToString());
+ color_flag = true;
+ }
+ if (length_flag && color_flag) {
+ return true;
+ }
+ }
+ }
+ }
+ return false;
+ }
+};
+
+template<class Graph>
+void PrintColoredGraph(const Graph& g, const ColorHandler<Graph>& coloring,
+ const EdgesPositionHandler<Graph>& pos, const string& output_filename) {
+ shared_ptr<GraphSplitter<Graph>> splitter = ReliableSplitter<Graph>(g, 1000000, 30);
+ LengthIdGraphLabeler<Graph> basic_labeler(g);
+ EdgePosGraphLabeler<Graph> pos_labeler(g, pos);
+
+ CompositeLabeler<Graph> labeler(basic_labeler, pos_labeler);
+ WriteComponents(g, splitter, output_filename,
+// *ConstructColorer(coloring),
+ *ConstructBorderColorer(g, coloring), labeler);
+}
+
+template<class Graph>
+void PrintColoredGraphAroundEdge(const Graph& g,
+ const ColorHandler<Graph>& coloring, const EdgeId edge,
+ const EdgesPositionHandler<Graph>& pos, const string& output_filename) {
+ INFO(output_filename);
+ LengthIdGraphLabeler<Graph> basic_labeler(g);
+ EdgePosGraphLabeler<Graph> pos_labeler(g, pos);
+
+ CompositeLabeler<Graph> labeler(basic_labeler, pos_labeler);
+ GraphComponent<Graph> component = omnigraph::EdgeNeighborhood(g, edge);
+ omnigraph::visualization::WriteComponent(component, output_filename, coloring.ConstructColorer(component), labeler);
+}
+
+template<class Graph>
+void PrintColoredGraphWithColorFilter(const Graph &g, const ColorHandler<Graph> &coloring,
+ const CoordinatesHandler<Graph> &pos, const vector<string> &genome_names, const string &output_folder) {
+
+ size_t edge_length_bound = 1000000;
+ size_t colors_number = coloring.max_colors();
+ TColorSet restricted_color = TColorSet::AllColorsSet(colors_number);
+
+ shared_ptr<GraphSplitter<Graph>> splitter = ReliableSplitter<Graph>(g, edge_length_bound, 30);
+ shared_ptr<omnigraph::GraphComponentFilter<Graph>> filter = make_shared<ComponentSingleColorFilter<Graph>>(g, coloring, restricted_color, edge_length_bound, 2);
+ shared_ptr<omnigraph::GraphSplitter<Graph>> fs = make_shared<omnigraph::FilteringSplitterWrapper<Graph> >(splitter, filter);
+ LengthIdGraphLabeler<Graph> basic_labeler(g);
+ EdgeCoordinatesGraphLabeler<Graph> pos_labeler(g, pos, genome_names);
+
+ CompositeLabeler<Graph> labeler(basic_labeler, pos_labeler);
+ omnigraph::visualization::WriteComponents(g, output_folder, fs, coloring.ConstructColorer(), labeler);
+}
+
+//fixme code duplication
+template<class Graph>
+void PrintColoredGraphWithColorFilter(const Graph &g, const ColorHandler<Graph> &coloring,
+ const EdgesPositionHandler<Graph> &pos, const string &output_folder) {
+
+ size_t edge_length_bound = 1000000;
+ size_t colors_number = coloring.max_colors();
+ TColorSet restricted_color = TColorSet::AllColorsSet(colors_number);
+
+ shared_ptr<omnigraph::GraphSplitter<Graph>> splitter = ReliableSplitter<Graph>(g, edge_length_bound, 30);
+ shared_ptr<omnigraph::GraphComponentFilter<Graph>> filter = make_shared<ComponentSingleColorFilter<Graph>>(g, coloring, restricted_color, edge_length_bound, 2);
+ shared_ptr<omnigraph::GraphSplitter<Graph>> fs = make_shared<omnigraph::FilteringSplitterWrapper<Graph>>(splitter, filter);
+ LengthIdGraphLabeler<Graph> basic_labeler(g);
+ EdgePosGraphLabeler<Graph> pos_labeler(g, pos);
+
+ CompositeLabeler<Graph> labeler(basic_labeler, pos_labeler);
+ omnigraph::visualization::WriteComponents(g, output_folder, fs, coloring.ConstructColorer(), labeler);
+}
+
+//todo alert!!! magic constants!!!
+//todo refactoring of params needed
+template<class gp_t>
+void WriteComponentsAlongSequence(
+ const gp_t& gp,
+ const AbstractFilter<vector<typename gp_t::graph_t::VertexId>>& /*filter*/,
+ const string& /*file_name*/,
+ size_t /*split_edge_length*/, size_t /*component_vertex_number*/,
+ const Sequence& /*s*/, const ColorHandler<typename gp_t::graph_t>& /*coloring*/) {
+ typedef typename gp_t::graph_t Graph;
+    LengthIdGraphLabeler<Graph> basic_labeler(gp.g);
+    EdgePosGraphLabeler<Graph> pos_labeler(gp.g, gp.edge_pos);
+    CompositeLabeler<Graph> labeler(basic_labeler, pos_labeler);
+}
+
+template<class gp_t>
+void PrintColoredGraphAlongRef(const gp_t& gp,
+ const ColorHandler<Graph>& coloring,
+ const string& output_filename) {
+    LengthIdGraphLabeler<Graph> basic_labeler(gp.g);
+    EdgePosGraphLabeler<Graph> pos_labeler(gp.g, gp.edge_pos);
+
+    CompositeLabeler<Graph> labeler(basic_labeler, pos_labeler);
+
+// only breakpoints
+ TrivialBreakpointFinder<Graph> bp_f(gp.g, coloring, gp.edge_pos);
+
+ WriteComponentsAlongSequence(gp, bp_f, labeler, output_filename, 1000000,
+ 30, MapperInstance(gp)->MapSequence(gp.genome.GetSequence()),
+ *ConstructBorderColorer(gp.g, coloring)
+// *ConstructColorer(coloring)
+ );
+}
+
+}
diff --git a/src/projects/corrector/CMakeLists.txt b/src/projects/corrector/CMakeLists.txt
new file mode 100644
index 0000000..0434323
--- /dev/null
+++ b/src/projects/corrector/CMakeLists.txt
@@ -0,0 +1,34 @@
+############################################################################
+# Copyright (c) 2015 Saint Petersburg State University
+# Copyright (c) 2011-2014 Saint Petersburg Academic University
+# All Rights Reserved
+# See file LICENSE for details.
+############################################################################
+
+project(corrector CXX)
+
+include_directories(${CMAKE_CURRENT_SOURCE_DIR})
+
+
+add_executable(corrector
+ positional_read.cpp
+ interesting_pos_processor.cpp
+ contig_processor.cpp
+ dataset_processor.cpp
+ config_struct.cpp
+ main.cpp)
+
+target_link_libraries(corrector input spades_modules ${COMMON_LIBRARIES})
+
+
+
+if (SPADES_STATIC_BUILD)
+ set_target_properties(corrector PROPERTIES LINK_SEARCH_END_STATIC 1)
+endif()
+
+install(TARGETS corrector
+ DESTINATION bin
+ COMPONENT runtime)
+install(DIRECTORY "${SPADES_CFG_DIR}/corrector"
+ DESTINATION share/spades/configs
+ FILES_MATCHING PATTERN "*.info")
diff --git a/src/projects/corrector/config_struct.cpp b/src/projects/corrector/config_struct.cpp
new file mode 100644
index 0000000..d799b7a
--- /dev/null
+++ b/src/projects/corrector/config_struct.cpp
@@ -0,0 +1,78 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#include "config_struct.hpp"
+
+#include "dev_support/openmp_wrapper.h"
+
+#include "llvm/Support/YAMLParser.h"
+#include "llvm/Support/YAMLTraits.h"
+
+#include <string>
+
+using namespace llvm;
+
+namespace llvm { namespace yaml {
+template <>
+struct ScalarEnumerationTraits<corrector::Strategy> {
+ static void enumeration(yaml::IO &io, corrector::Strategy &value) {
+ io.enumCase(value, "all_reads", corrector::Strategy::AllReads);
+ io.enumCase(value, "majority_only", corrector::Strategy::MajorityOnly);
+ io.enumCase(value, "not_started", corrector::Strategy::AllExceptJustStarted);
+ io.enumCase(value, "mapped_squared", corrector::Strategy::MappedSquared);
+ }
+};
+}}
+
+// FIXME: This is temporary
+class DataSetReader {
+ public:
+ DataSetReader(yaml::IO&) {}
+ DataSetReader(yaml::IO&, io::DataSet<>&) {}
+
+ io::DataSet<> denormalize(yaml::IO &) {
+ return io::DataSet<>(path);
+ }
+
+ std::string path;
+};
+
+namespace llvm { namespace yaml {
+template <>
+struct MappingTraits<corrector::corrector_config> {
+ static void mapping(yaml::IO &io, corrector::corrector_config &cfg) {
+ yaml::MappingNormalization<DataSetReader, io::DataSet<>> dataset(io, cfg.dataset);
+
+ io.mapRequired("dataset", dataset->path);
+ io.mapOptional("work_dir", cfg.work_dir, std::string("."));
+ io.mapOptional("output_dir", cfg.output_dir, std::string("."));
+ io.mapOptional("max_nthreads", cfg.max_nthreads, 1u);
+ io.mapRequired("strategy", cfg.strat);
+ io.mapOptional("bwa", cfg.bwa, std::string("."));
+ }
+};
+}}
+
+
+namespace corrector {
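+// Loads the corrector config from a YAML file via LLVM's YAML I/O, clamps
+// max_nthreads to what the OpenMP runtime allows, and configures OpenMP
+// accordingly.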
+void load(corrector_config& cfg, const std::string &filename) {
+ ErrorOr<std::unique_ptr<MemoryBuffer>> Buf = MemoryBuffer::getFile(filename);
+ if (!Buf)
+ throw(std::string("Failed to load config file ") + filename);
+
+ yaml::Input yin(*Buf.get());
+ yin >> cfg;
+
+ if (yin.error())
+ throw(std::string("Failed to load config file ") + filename);
+
+ // Fix number of threads according to OMP capabilities.
+ cfg.max_nthreads = std::min(cfg.max_nthreads, (unsigned)omp_get_max_threads());
+ // Inform OpenMP runtime about this :)
+ omp_set_num_threads(cfg.max_nthreads);
+}
+}
diff --git a/src/projects/corrector/config_struct.hpp b/src/projects/corrector/config_struct.hpp
new file mode 100644
index 0000000..e1fcf19
--- /dev/null
+++ b/src/projects/corrector/config_struct.hpp
@@ -0,0 +1,33 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "pipeline/config_singl.hpp"
+
+#include "pipeline/library.hpp"
+
+namespace corrector {
+enum class Strategy {
+ AllReads = 1,
+ MappedSquared = 2,
+ AllExceptJustStarted = 3,
+ MajorityOnly = 4
+};
+struct corrector_config {
+ io::DataSet<> dataset;
+ std::string work_dir;
+ std::string output_dir;
+ unsigned max_nthreads;
+ Strategy strat;
+ std::string bwa;
+};
+
+void load(corrector::corrector_config& cfg, const std::string &filename);
+}
+
+typedef config_common::config<corrector::corrector_config> corr_cfg;
diff --git a/src/projects/corrector/contig_processor.cpp b/src/projects/corrector/contig_processor.cpp
new file mode 100644
index 0000000..7a90b62
--- /dev/null
+++ b/src/projects/corrector/contig_processor.cpp
@@ -0,0 +1,306 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#include "contig_processor.hpp"
+#include "config_struct.hpp"
+#include "variants_table.hpp"
+
+#include "io/reads_io/ireader.hpp"
+#include "io/reads_io/osequencestream.hpp"
+#include "io/reads_io/file_reader.hpp"
+#include "io/reads/single_read.hpp"
+#include "dev_support/path_helper.hpp"
+
+#include <boost/algorithm/string.hpp>
+
+using namespace std;
+
+namespace corrector {
+
+void ContigProcessor::ReadContig() {
+ io::FileReadStream frs(contig_file_);
+ io::SingleRead cur_read;
+ frs >> cur_read;
+ if (!frs.eof()) {
+#pragma omp critical
+ {
+            ERROR("More than one sequence in a single-contig fasta file!");
+ }
+ }
+ contig_name_ = cur_read.name();
+ contig_ = cur_read.GetSequenceString();
+
+ output_contig_file_ = path::append_path(path::parent_path(contig_file_), path::basename(contig_file_) + ".ref.fasta");
+ charts_.resize(contig_.length());
+}
+
+void ContigProcessor::UpdateOneRead(const SingleSamRead &tmp, MappedSamStream &sm) {
+ unordered_map<size_t, position_description> all_positions;
+ if (tmp.contig_id() < 0) {
+ return;
+ }
+ auto cur_s = sm.get_contig_name(tmp.contig_id());
+ if (contig_name_.compare(cur_s) != 0) {
+ return;
+ }
+ CountPositions(tmp, all_positions);
+ size_t error_num = 0;
+
+ for (auto &pos : all_positions) {
+ charts_[pos.first].update(pos.second);
+ if (pos.second.FoundOptimal(contig_[pos.first]) != var_to_pos[(int) contig_[pos.first]]) {
+ error_num++;
+ }
+ }
+
+ if (error_num >= error_counts_.size())
+ error_counts_[error_counts_.size() - 1]++;
+ else
+ error_counts_[error_num]++;
+}
+
+// Returns the number of nucleotides changed at position i (an insertion may
+// count as more than one).
+size_t ContigProcessor::UpdateOneBase(size_t i, stringstream &ss, const unordered_map<size_t, position_description> &interesting_positions) const{
+ char old = (char) toupper(contig_[i]);
+ auto strat = corr_cfg::get().strat;
+ size_t maxi = charts_[i].FoundOptimal(contig_[i]);
+ auto i_position = interesting_positions.find(i);
+ if (i_position != interesting_positions.end()) {
+ size_t maxj = i_position->second.FoundOptimal(contig_[i]);
+ if (maxj != maxi) {
+            DEBUG("Interesting positions differ from the majority!");
+            DEBUG("On position " << i << " old: " << old << " majority: " << pos_to_var[maxi] << " interesting: " << pos_to_var[maxj]);
+ if (strat != Strategy::MajorityOnly)
+ maxi = maxj;
+ }
+ }
+ if (old != pos_to_var[maxi]) {
+ DEBUG("On position " << i << " changing " << old << " to " << pos_to_var[maxi]);
+ DEBUG(charts_[i].str());
+ if (maxi < Variants::Deletion) {
+ ss << pos_to_var[maxi];
+ return 1;
+ } else if (maxi == Variants::Deletion) {
+ return 1;
+ } else if (maxi == Variants::Insertion) {
+ string maxj = "";
+ //first base before insertion;
+ size_t new_maxi = var_to_pos[(int) contig_[i]];
+ int new_maxx = charts_[i].votes[new_maxi];
+ for (size_t k = 0; k < MAX_VARIANTS; k++) {
+ if (new_maxx < charts_[i].votes[k] && (k != Variants::Insertion) && (k != Variants::Deletion)) {
+ new_maxx = charts_[i].votes[k];
+ new_maxi = k;
+ }
+ }
+ ss << pos_to_var[new_maxi];
+ int max_ins = 0;
+ for (const auto &ic : charts_[i].insertions) {
+ if (ic.second > max_ins) {
+ max_ins = ic.second;
+ maxj = ic.first;
+ }
+ }
+ DEBUG("most popular insertion: " << maxj);
+ ss << maxj;
+ if (old == maxj[0]) {
+ return (int) maxj.length() - 1;
+ } else {
+ return (int) maxj.length();
+ }
+ } else {
+ //something strange happened
+ WARN("While processing base " << i << " unknown decision was made");
+ return 0;
+ }
+ } else {
+ ss << old;
+ return 0;
+ }
+}
+
+
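+// Walks the read's CIGAR along the contig and registers one vote per covered
+// position: 'M' ops vote for the observed base, 'I' runs accumulate an
+// insertion string credited to the preceding position, 'D' ops vote for a
+// deletion; soft/hard-clipped bases are skipped. Returns false if the read is
+// unusable (wrong contig, zero mapping quality, negative position, empty or
+// '*' CIGAR, or too short an aligned stretch).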
+bool ContigProcessor::CountPositions(const SingleSamRead &read, unordered_map<size_t, position_description> &ps) const {
+
+ if (read.contig_id() < 0) {
+ DEBUG("not this contig");
+ return false;
+ }
+ //TODO: maybe change to read.is_properly_aligned() ?
+ if (read.map_qual() == 0) {
+ DEBUG("zero qual");
+ return false;
+ }
+ int pos = read.pos();
+ if (pos < 0) {
+ WARN("Negative position " << pos << " found on read " << read.name() << ", skipping");
+ return false;
+ }
+ size_t position = size_t(pos);
+    int mate = 1; // a bonus for reads with a mapped mate could be applied here
+ size_t l_read = (size_t) read.data_len();
+ size_t l_cigar = read.cigar_len();
+
+ int aligned_length = 0;
+ uint32_t *cigar = read.cigar_ptr();
+    // reject reads with an empty or '*' (unavailable) CIGAR
+ if (l_cigar == 0)
+ return false;
+ if (bam_cigar_opchr(cigar[0]) == '*')
+ return false;
+ for (size_t i = 0; i < l_cigar; i++)
+ if (bam_cigar_opchr(cigar[i]) == 'M')
+ aligned_length += bam_cigar_oplen(cigar[i]);
+// Filters out poorly aligned reads; it is unclear whether this is necessary.
+ double read_len_double = (double) l_read;
+ if ((aligned_length < min(read_len_double * 0.4, 40.0)) && (position > read_len_double / 2) && (contig_.length() > read_len_double / 2 + (double) position)) {
+ return false;
+ }
+ int state_pos = 0;
+ int shift = 0;
+ size_t skipped = 0;
+ size_t deleted = 0;
+ string insertion_string = "";
+ auto seq = read.seq_ptr();
+ for (size_t i = 0; i < l_read; i++) {
+ DEBUG(i << " " << position << " " << skipped);
+ if (shift + bam_cigar_oplen(cigar[state_pos]) <= i) {
+ shift += bam_cigar_oplen(cigar[state_pos]);
+ state_pos += 1;
+ }
+ if (insertion_string != "" and bam_cigar_opchr(cigar[state_pos]) != 'I') {
+ VERIFY(i + position >= skipped + 1);
+ size_t ind = i + position - skipped - 1;
+ if (ind >= contig_.length())
+ break;
+ ps[ind].insertions[insertion_string] += 1;
+ insertion_string = "";
+ }
+ char cur_state = bam_cigar_opchr(cigar[state_pos]);
+ if (cur_state == 'M') {
+ VERIFY(i >= deleted);
+ if (i + position < skipped) {
+ WARN(i << " " << position << " " << skipped);
+ INFO(read.name());
+ }
+ VERIFY(i + position >= skipped);
+
+ size_t ind = i + position - skipped;
+ size_t cur = var_to_pos[(int) bam_nt16_rev_table[bam1_seqi(seq, i - deleted)]];
+ if (ind >= contig_.length())
+ continue;
+ ps[ind].votes[cur] = ps[ind].votes[cur] + mate;
+
+ } else {
+ if (cur_state == 'I' || cur_state == 'H' || cur_state == 'S' ) {
+ if (cur_state == 'I') {
+ if (insertion_string == "") {
+ size_t ind = i + position - skipped - 1;
+ if (ind >= contig_.length())
+ break;
+ ps[ind].votes[Variants::Insertion] += mate;
+ }
+ insertion_string += bam_nt16_rev_table[bam1_seqi(seq, i - deleted)];
+ }
+ skipped += 1;
+ } else if (bam_cigar_opchr(cigar[state_pos]) == 'D') {
+ if (i + position - skipped >= contig_.length())
+ break;
+ ps[i + position - skipped].votes[Variants::Deletion] += mate;
+ deleted += 1;
+ }
+ }
+ }
+ if (insertion_string != "" and bam_cigar_opchr(cigar[state_pos]) != 'I') {
+ VERIFY(l_read + position >= skipped + 1);
+ size_t ind = l_read + position - skipped - 1;
+ if (ind < contig_.length()) {
+ ps[ind].insertions[insertion_string] += 1;
+ }
+ insertion_string = "";
+ }
+ return true;
+}
+
+
+bool ContigProcessor::CountPositions(const PairedSamRead &read, unordered_map<size_t, position_description> &ps) const {
+
+ TRACE("starting pairing");
+ bool t1 = CountPositions(read.Left(), ps );
+ unordered_map<size_t, position_description> tmp;
+ bool t2 = CountPositions(read.Right(), tmp);
+ //overlaps.. multimap? Look on qual?
+ if (ps.size() == 0 || tmp.size() == 0) {
+ //We do not need paired reads which are not really paired
+ ps.clear();
+ return false;
+ }
+ TRACE("counted, uniting maps of " << tmp.size() << " and " << ps.size());
+ ps.insert(tmp.begin(), tmp.end());
+ TRACE("united");
+ return (t1 && t2);
+}
+
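+// Two passes over all SAM files: the first accumulates per-position vote
+// charts and selects "interesting" positions, the second revisits the
+// alignments to refine those positions. Every base of the contig is then
+// re-decided and the corrected sequence is written out with the length field
+// of its header updated.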
+size_t ContigProcessor::ProcessMultipleSamFiles() {
+ error_counts_.resize(kMaxErrorNum);
+ for (const auto &sf : sam_files_) {
+ MappedSamStream sm(sf.first);
+ while (!sm.eof()) {
+ SingleSamRead tmp;
+ sm >> tmp;
+
+ UpdateOneRead(tmp, sm);
+ }
+ sm.close();
+ }
+
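+ // The first pass (above) accumulated per-position vote charts via UpdateOneRead; now select
+ // "interesting" positions from those charts and re-scan the SAM files, collecting votes
+ // only for reads covering the interesting positions.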
+ ipp_.FillInterestingPositions(charts_);
+ for (const auto &sf : sam_files_) {
+ MappedSamStream sm(sf.first);
+ while (!sm.eof()) {
+ unordered_map<size_t, position_description> ps;
+ if (sf.second == io::LibraryType::PairedEnd ) {
+ PairedSamRead tmp;
+ sm >> tmp;
+ CountPositions(tmp, ps);
+ } else {
+ SingleSamRead tmp;
+ sm >> tmp;
+ CountPositions(tmp, ps);
+ }
+ ipp_.UpdateInterestingRead(ps);
+ }
+ sm.close();
+ }
+ ipp_.UpdateInterestingPositions();
+ unordered_map<size_t, position_description> interesting_positions = ipp_.get_weights();
+ stringstream s_new_contig;
+ size_t total_changes = 0;
+ for (size_t i = 0; i < contig_.length(); i++) {
+ total_changes += UpdateOneBase(i, s_new_contig, interesting_positions);
+ }
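+ // Contig names are assumed to carry a "length_<n>" field (as in SPAdes NODE headers);
+ // update that field to the length of the corrected sequence before writing it out.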
+ vector<string> contig_name_splitted;
+ boost::split(contig_name_splitted, contig_name_, boost::is_any_of("_"));
+ io::osequencestream_simple oss(output_contig_file_);
+ for(size_t i = 0; i < contig_name_splitted.size(); i++) {
+ if (contig_name_splitted[i] == "length" && i + 1 < contig_name_splitted.size()) {
+ contig_name_splitted[i + 1] = std::to_string(int(s_new_contig.str().length()));
+ break;
+ }
+ }
+ std::string new_header = contig_name_splitted[0];
+ for(size_t i = 1; i < contig_name_splitted.size(); i++) {
+ new_header += "_" + contig_name_splitted[i];
+ }
+ oss.set_header(new_header);
+ oss << s_new_contig.str();
+
+ return total_changes;
+}
+
+}
+;
diff --git a/src/projects/corrector/contig_processor.hpp b/src/projects/corrector/contig_processor.hpp
new file mode 100644
index 0000000..0a46be4
--- /dev/null
+++ b/src/projects/corrector/contig_processor.hpp
@@ -0,0 +1,65 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+/*
+ * contig_processor.hpp
+ *
+ * Created on: Jun 27, 2014
+ * Author: lab42
+ */
+
+#pragma once
+#include "interesting_pos_processor.hpp"
+#include "positional_read.hpp"
+#include "dev_support/openmp_wrapper.h"
+
+#include <io/sam_io/sam_reader.hpp>
+#include <io/sam_io/read.hpp>
+#include "pipeline/library.hpp"
+
+#include <string>
+#include <vector>
+#include <unordered_map>
+
+namespace corrector {
+
+using namespace sam_reader;
+
+typedef std::vector<std::pair<std::string, io::LibraryType> > sam_files_type;
+class ContigProcessor {
+ sam_files_type sam_files_;
+ std::string contig_file_;
+ std::string contig_name_;
+ std::string output_contig_file_;
+ std::string contig_;
+ std::vector<position_description> charts_;
+ InterestingPositionProcessor ipp_;
+ std::vector<int> error_counts_;
+
+ const size_t kMaxErrorNum = 20;
+
+public:
+ ContigProcessor(const sam_files_type &sam_files, const std::string &contig_file)
+ : sam_files_(sam_files), contig_file_(contig_file) {
+ ReadContig();
+ ipp_.set_contig(contig_);
+ }
+ size_t ProcessMultipleSamFiles();
+private:
+ void ReadContig();
+//Moved from read.hpp
+ bool CountPositions(const SingleSamRead &read, std::unordered_map<size_t, position_description> &ps) const;
+ bool CountPositions(const PairedSamRead &read, std::unordered_map<size_t, position_description> &ps) const;
+
+ void UpdateOneRead(const SingleSamRead &tmp, MappedSamStream &sm);
+ // returns the number of changed nucleotides
+
+ size_t UpdateOneBase(size_t i, std::stringstream &ss, const std::unordered_map<size_t, position_description> &interesting_positions) const ;
+
+};
+}
+;
diff --git a/src/projects/corrector/dataset_processor.cpp b/src/projects/corrector/dataset_processor.cpp
new file mode 100644
index 0000000..15fe997
--- /dev/null
+++ b/src/projects/corrector/dataset_processor.cpp
@@ -0,0 +1,273 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#include "dataset_processor.hpp"
+#include "variants_table.hpp"
+#include "contig_processor.hpp"
+#include "config_struct.hpp"
+
+#include "io/reads_io/file_reader.hpp"
+#include "dev_support/path_helper.hpp"
+#include "io/reads_io/osequencestream.hpp"
+#include "dev_support/openmp_wrapper.h"
+
+#include <boost/algorithm/string.hpp>
+
+#include <iostream>
+#include <unistd.h>
+
+using namespace std;
+
+namespace corrector {
+std::string DatasetProcessor::GetLibDir(const size_t lib_count) {
+ if (lib_dirs_.find(lib_count) != lib_dirs_.end())
+ return lib_dirs_[lib_count];
+ std::string res = path::make_temp_dir(corr_cfg::get().work_dir, "lib" + to_string(lib_count));
+ lib_dirs_[lib_count] = res;
+ return res;
+}
+
+void DatasetProcessor::SplitGenome(const string &genome_splitted_dir) {
+ io::FileReadStream frs(genome_file_);
+ size_t cur_id = 0;
+ while (!frs.eof()) {
+ io::SingleRead cur_read;
+ frs >> cur_read;
+ string contig_name = cur_read.name();
+ string contig_seq = cur_read.GetSequenceString();
+ if (all_contigs_.find(contig_name) != all_contigs_.end()) {
+ WARN("Duplicated contig names! Multiple contigs with name" << contig_name);
+ }
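+ // each contig gets its own FASTA file, an output name for the corrected copy (.ref.fasta)
+ // and a per-contig SAM file path, all registered in all_contigs_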
+ string full_path = path::append_path(genome_splitted_dir, contig_name + ".fasta");
+ string out_full_path = path::append_path(genome_splitted_dir, contig_name + ".ref.fasta");
+ string sam_filename = path::append_path(genome_splitted_dir, contig_name + ".pair.sam");
+ all_contigs_[contig_name] = {full_path, out_full_path, contig_seq.length(), sam_files_type(), sam_filename, cur_id};
+ cur_id ++;
+ buffered_reads_[contig_name].clear();
+ io::osequencestream oss(full_path);
+ oss << io::SingleRead(contig_name, contig_seq);
+ DEBUG("full_path " + full_path)
+ }
+}
+
+//contigs - set of aligned contig names
+void DatasetProcessor::GetAlignedContigs(const string &read, set<string> &contigs) const {
+ vector<string> arr;
+ boost::split(arr, read, boost::is_any_of("\t"));
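+ // SAM columns: arr[2] is RNAME (reference/contig name), arr[4] is MAPQ;
+ // only mapped reads with positive mapping quality are counted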
+ if (arr.size() > 5) {
+ if (arr[2] != "*" && stoi(arr[4]) > 0) {
+// multi-mapped alignment parsing could be added here if needed;
+ contigs.insert(arr[2]);
+ }
+ }
+
+}
+
+void DatasetProcessor::SplitSingleLibrary(const string &all_reads_filename, const size_t lib_count) {
+ ifstream fs(all_reads_filename);
+ while (!fs.eof()) {
+ set<string> contigs;
+ string r1;
+ getline(fs, r1);
+ if (r1[0] == '@')
+ continue;
+ GetAlignedContigs(r1, contigs);
+ for (auto &contig : contigs) {
+ VERIFY_MSG(all_contigs_.find(contig) != all_contigs_.end(), "wrong contig name in SAM file header: " + contig);
+ BufferedOutputRead(r1, contig, lib_count);
+ }
+ }
+ FlushAll(lib_count);
+}
+
+void DatasetProcessor::FlushAll(const size_t lib_count) {
+ for (const auto &ac : all_contigs_) {
+ if (buffered_reads_[ac.first].size() > 0) {
+ ofstream stream(ac.second.sam_filenames[lib_count].first.c_str(), std::ios_base::app | std::ios_base::out);
+ for (const string &read : buffered_reads_[ac.first]) {
+ stream << read;
+ stream << '\n';
+ }
+ buffered_reads_[ac.first].clear();
+ }
+ }
+}
+
+void DatasetProcessor::BufferedOutputRead(const string &read, const string &contig_name, const size_t lib_count) {
+ buffered_reads_[contig_name].push_back(read);
+ buffered_count_++;
+ if (buffered_count_ % kBuffSize == 0) {
+ if (buffered_count_ % (10 * kBuffSize) == 0)
+ INFO("processed " << buffered_count_ << "reads, flushing");
+ FlushAll(lib_count);
+ }
+}
+
+void DatasetProcessor::SplitPairedLibrary(const string &all_reads_filename, const size_t lib_count) {
+ ifstream fs(all_reads_filename);
+ while (!fs.eof()) {
+ set<string> contigs;
+ string r1;
+ string r2;
+ getline(fs, r1);
+ if (r1[0] == '@')
+ continue;
+ getline(fs, r2);
+ GetAlignedContigs(r1, contigs);
+ GetAlignedContigs(r2, contigs);
+ for (const auto &contig : contigs) {
+ VERIFY_MSG(all_contigs_.find(contig) != all_contigs_.end(), "wrong contig name in SAM file header: " + contig);
+ if (all_contigs_.find(contig) != all_contigs_.end()) {
+ BufferedOutputRead(r1, contig, lib_count);
+ BufferedOutputRead(r2, contig, lib_count);
+ }
+ }
+ }
+ FlushAll(lib_count);
+}
+
+string DatasetProcessor::RunPairedBwa(const string &left, const string &right, const size_t lib) {
+ string cur_dir = GetLibDir(lib);
+ int run_res = 0;
+ string tmp_sam_filename = path::append_path(cur_dir, "tmp.sam");
+ string bwa_string = path::screen_whitespaces(path::screen_whitespaces(corr_cfg::get().bwa));
+ string genome_screened = path::screen_whitespaces(genome_file_);
+ string index_line = bwa_string + string(" index ") + "-a " + "is " + genome_screened ;
+ INFO("Running bwa index ...: " << index_line);
+ run_res = system(index_line.c_str());
+ if (run_res != 0) {
+ INFO("bwa failed, skipping sublib");
+ return "";
+ }
+ string nthreads_str = to_string(nthreads_);
+ string last_line = bwa_string + string(" mem ") + " -v 1 -t " + nthreads_str + " "+ genome_screened + " " + path::screen_whitespaces(left) + " " + path::screen_whitespaces(right) + " > "
+ + path::screen_whitespaces(tmp_sam_filename) ;
+ INFO("Running bwa mem ...:" << last_line);
+ run_res = system(last_line.c_str());
+ if (run_res != 0) {
+ INFO("bwa failed, skipping sublib");
+ return "";
+ }
+ return tmp_sam_filename;
+}
+
+string DatasetProcessor::RunSingleBwa(const string &single, const size_t lib) {
+ int run_res = 0;
+ string cur_dir = GetLibDir(lib);
+ string tmp_sam_filename = path::append_path(cur_dir, "tmp.sam");
+ string bwa_string = path::screen_whitespaces(path::screen_whitespaces(corr_cfg::get().bwa));
+ string genome_screened = path::screen_whitespaces(genome_file_);
+ string index_line = bwa_string + string(" index ") + "-a " + "is " + genome_screened ;
+ INFO("Running bwa index ...: " << index_line);
+ run_res = system(index_line.c_str());
+ if (run_res != 0) {
+ INFO("bwa failed, skipping sublib");
+ return "";
+ }
+ string nthreads_str = to_string(nthreads_);
+ string last_line = bwa_string + " mem "+ " -v 1 -t " + nthreads_str + " " + genome_screened + " " + single + " > " + path::screen_whitespaces(tmp_sam_filename);
+ INFO("Running bwa mem ...:" << last_line);
+ run_res = system(last_line.c_str());
+ if (run_res != 0) {
+ INFO("bwa failed, skipping sublib");
+ return "";
+ }
+ return tmp_sam_filename;
+}
+
+void DatasetProcessor::PrepareContigDirs(const size_t lib_count) {
+ string out_dir = GetLibDir(lib_count);
+ for (auto &ac : all_contigs_) {
+ auto contig_name = ac.first;
+ string out_name = path::append_path(out_dir, contig_name + ".sam");
+ ac.second.sam_filenames.push_back(make_pair(out_name, unsplitted_sam_files_[lib_count].second));
+ BufferedOutputRead("@SQ\tSN:" + contig_name + "\tLN:" + to_string(all_contigs_[contig_name].contig_length), contig_name, lib_count);
+ }
+ FlushAll(lib_count);
+}
+
+void DatasetProcessor::ProcessDataset() {
+ size_t lib_num = 0;
+ INFO("Splitting assembly...");
+ INFO("Assembly file: " + genome_file_);
+ SplitGenome(work_dir_);
+ for (size_t i = 0; i < corr_cfg::get().dataset.lib_count(); ++i) {
+ const auto& dataset = corr_cfg::get().dataset[i];
+ auto lib_type = dataset.type();
+ if (lib_type == io::LibraryType::PairedEnd || lib_type == io::LibraryType::HQMatePairs || lib_type == io::LibraryType::SingleReads) {
+ for (auto iter = dataset.paired_begin(); iter != dataset.paired_end(); iter++) {
+ INFO("Processing paired sublib of number " << lib_num);
+ string left = iter->first;
+ string right = iter->second;
+ INFO(left + " " + right);
+ string samf = RunPairedBwa(left, right, lib_num);
+ if (samf != "") {
+ INFO("Adding samfile " << samf);
+ unsplitted_sam_files_.push_back(make_pair(samf, lib_type));
+ PrepareContigDirs(lib_num);
+ SplitPairedLibrary(samf, lib_num);
+ lib_num++;
+ } else {
+ FATAL_ERROR("Failed to align paired reads " << left << " and " << right);
+ }
+ }
+ for (auto iter = dataset.single_begin(); iter != dataset.single_end(); iter++) {
+ INFO("Processing single sublib of number " << lib_num);
+ string left = *iter;
+ INFO(left);
+ string samf = RunSingleBwa(left, lib_num);
+ if (samf != "") {
+ INFO("Adding samfile " << samf);
+ unsplitted_sam_files_.push_back(make_pair(samf, io::LibraryType::SingleReads));
+ PrepareContigDirs(lib_num);
+ SplitSingleLibrary(samf, lib_num);
+ lib_num++;
+ } else {
+ FATAL_ERROR("Failed to align single reads " << left);
+ }
+ }
+ }
+ }
+ INFO("Processing contigs");
+ vector<pair<size_t, string> > ordered_contigs;
+ for (const auto &ac : all_contigs_) {
+ ordered_contigs.push_back(make_pair(ac.second.contig_length, ac.first));
+ }
+ size_t cont_num = ordered_contigs.size();
+ sort(ordered_contigs.begin(), ordered_contigs.end(), std::greater<pair<size_t, string> >());
+ auto all_contigs_ptr = &all_contigs_;
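+ // contigs are processed longest-first with dynamic scheduling so large contigs are not
+ // queued behind short ones at the end of the parallel loop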
+# pragma omp parallel for shared(all_contigs_ptr, ordered_contigs) num_threads(nthreads_) schedule(dynamic,1)
+ for (size_t i = 0; i < cont_num; i++) {
+ bool long_enough = (*all_contigs_ptr)[ordered_contigs[i].second].contig_length > kMinContigLengthForInfo;
+ ContigProcessor pc((*all_contigs_ptr)[ordered_contigs[i].second].sam_filenames, (*all_contigs_ptr)[ordered_contigs[i].second].input_contig_filename);
+ size_t changes = pc.ProcessMultipleSamFiles();
+ if (long_enough) {
+#pragma omp critical
+ {
+ INFO("Contig " << ordered_contigs[i].second << " processed with " << changes << " changes in thread " << omp_get_thread_num());
+ }
+ }
+ }
+ INFO("Gluing processed contigs");
+ GlueSplittedContigs(output_contig_file_);
+}
+
+void DatasetProcessor::GlueSplittedContigs(string &out_contigs_filename) {
+ ofstream of_c(out_contigs_filename, std::ios_base::binary);
+ vector<string> ordered_names;
+ ordered_names.resize(all_contigs_.size());
+ for (const auto &ac : all_contigs_) {
+ ordered_names[ac.second.id] = ac.first;
+ }
+ for (size_t i = 0; i < ordered_names.size(); i++) {
+ ifstream a_f(all_contigs_[ordered_names[i]].output_contig_filename, std::ios_base::binary);
+ of_c << a_f.rdbuf();
+ }
+}
+
+}
+;
diff --git a/src/projects/corrector/dataset_processor.hpp b/src/projects/corrector/dataset_processor.hpp
new file mode 100644
index 0000000..397f5ed
--- /dev/null
+++ b/src/projects/corrector/dataset_processor.hpp
@@ -0,0 +1,71 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "dev_support/path_helper.hpp"
+
+#include "io/reads_io/file_reader.hpp"
+#include "dev_support/path_helper.hpp"
+
+#include "pipeline/library.hpp"
+
+#include <string>
+#include <set>
+#include <vector>
+#include <unordered_map>
+
+namespace corrector {
+
+typedef std::vector<std::pair<std:: string, io::LibraryType> > sam_files_type;
+
+struct OneContigDescription {
+ std::string input_contig_filename;
+ std::string output_contig_filename;
+ size_t contig_length;
+ sam_files_type sam_filenames;
+ std::string sam_filename;
+ size_t id;
+};
+typedef std::unordered_map<std::string, OneContigDescription> ContigInfoMap;
+
+class DatasetProcessor {
+
+ const std::string &genome_file_;
+ std::string output_contig_file_;
+ ContigInfoMap all_contigs_;
+ sam_files_type unsplitted_sam_files_;
+ const std::string &work_dir_;
+ std::unordered_map<std::string, std::vector<std::string> > buffered_reads_;
+ size_t nthreads_;
+ size_t buffered_count_;
+ std::unordered_map<size_t, std::string> lib_dirs_;
+ const size_t kBuffSize = 100000;
+ const size_t kMinContigLengthForInfo = 20000;
+public:
+ DatasetProcessor(const std::string &genome_file, const std::string &work_dir, const std::string &output_dir, const size_t &thread_num)
+ : genome_file_(genome_file), work_dir_(work_dir), nthreads_(thread_num) {
+ output_contig_file_ = path::append_path(output_dir, "corrected_contigs.fasta");
+ buffered_count_ = 0;
+ }
+
+ void ProcessDataset();
+private:
+ void SplitGenome(const std::string &genome_splitted_dir);
+ void FlushAll(const size_t lib_count);
+ void BufferedOutputRead(const std::string &read, const std::string &contig_name, const size_t lib_count);
+ void GetAlignedContigs(const std::string &read, std::set<std::string> &contigs) const;
+ void SplitSingleLibrary(const std::string &out_contigs_filename, const size_t lib_count);
+ void SplitPairedLibrary(const std::string &all_reads, const size_t lib_count);
+ void GlueSplittedContigs(std::string &out_contigs_filename);
+ std::string RunPairedBwa(const std::string &left, const std::string &right, const size_t lib);
+ std::string RunSingleBwa(const std::string &single, const size_t lib);
+ void PrepareContigDirs(const size_t lib_count);
+ std::string GetLibDir(const size_t lib_count);
+};
+}
+;
diff --git a/src/projects/corrector/interesting_pos_processor.cpp b/src/projects/corrector/interesting_pos_processor.cpp
new file mode 100644
index 0000000..160f4a1
--- /dev/null
+++ b/src/projects/corrector/interesting_pos_processor.cpp
@@ -0,0 +1,127 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#include "interesting_pos_processor.hpp"
+#include "config_struct.hpp"
+
+#include "dev_support/logger/logger.hpp"
+
+using namespace std;
+
+namespace corrector {
+bool InterestingPositionProcessor::FillInterestingPositions(const vector<position_description> &charts) {
+ bool any_interesting = false;
+ for (size_t i = 0; i < contig_.length(); i++) {
+ int sum_total = 0;
+ for (size_t j = 0; j < MAX_VARIANTS; j++) {
+ if (j != Variants::Insertion && j != Variants::Deletion) {
+ sum_total += charts[i].votes[j];
+ }
+ }
+ int variants = 0;
+ for (size_t j = 0; j < MAX_VARIANTS; j++) {
+ //TODO::For IT reconsider this condition
+ if (j != Variants::Insertion && j != Variants::Deletion && (charts[i].votes[j] > 0.1 * sum_total) && (charts[i].votes[j] < 0.9 * sum_total) && (sum_total > 20)) {
+ variants++;
+ }
+ }
+ if (variants > 1 || contig_[i] == Variants::Undefined) {
+ DEBUG("Adding interesting position: " << i << " " << charts[i].str());
+ any_interesting = true;
+ is_interesting_[i] = true;
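+ // also mark nearby "anchor" positions (multiples of kAnchorGap within kAnchorNum steps),
+ // presumably so that reads covering a single variant still span at least two interesting positions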
+ for (int j = -kAnchorNum; j <= kAnchorNum; j++) {
+ int additional = (int) (i / kAnchorGap + j) * kAnchorGap;
+ if (additional >= 0 && additional < (int) contig_.length())
+ is_interesting_[additional] = true;
+ }
+ }
+ }
+
+ return any_interesting;
+}
+
+void InterestingPositionProcessor::UpdateInterestingRead(const PositionDescriptionMap &ps) {
+ vector<size_t> interesting_in_read;
+ for (const auto &pos : ps) {
+ if (is_interesting(pos.first)) {
+ interesting_in_read.push_back(pos.first);
+ }
+ }
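+ // only reads covering at least two interesting positions are stored; they are indexed
+ // by position in read_ids_ for the weighting passes in UpdateInterestingPositions()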
+ if (interesting_in_read.size() >= 2) {
+ WeightedPositionalRead wr(interesting_in_read, ps, contig_);
+ size_t cur_id = wr_storage_.size();
+ wr_storage_.push_back(wr);
+ for (size_t i = 0; i < interesting_in_read.size(); i++) {
+ TRACE(interesting_in_read[i] << " " << contig_.length());
+ read_ids_[interesting_in_read[i]].push_back(cur_id);
+ }
+ }
+}
+
+void InterestingPositionProcessor::set_contig(const string &ctg) {
+ contig_ = ctg;
+ size_t len = contig_.length();
+ is_interesting_.resize(len);
+ read_ids_.resize(len);
+}
+
+void InterestingPositionProcessor::UpdateInterestingPositions() {
+ auto strat = corr_cfg::get().strat;
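+ // Two passes over the contig, forward (dir = 1) and backward (dir = -1). In each pass a read's
+ // vote at an interesting position is weighted by its error count and, depending on the strategy,
+ // by how many positions it already agreed with; read statistics are reset between passes.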
+ for (int dir = 1; dir >= -1; dir -= 2) {
+ int start_pos = (dir == 1) ? 0 : (int) contig_.length() - 1;
+ int current_pos = start_pos;
+ for (; current_pos >= 0 && current_pos < (int) contig_.length(); current_pos += dir) {
+ if (is_interesting_[current_pos]) {
+ DEBUG("reads on position: " << read_ids_[current_pos].size());
+ for (size_t i = 0; i < read_ids_[current_pos].size(); i++) {
+ size_t current_read_id = read_ids_[current_pos][i];
+ size_t current_variant = wr_storage_[current_read_id].positions[current_pos];
+ {
+ int coef = 1;
+ if (strat == Strategy::AllReads)
+ coef = 1;
+ else if (strat == Strategy::MappedSquared)
+ coef = wr_storage_[current_read_id].processed_positions * wr_storage_[current_read_id].processed_positions;
+ else if (strat == Strategy::AllExceptJustStarted)
+ coef = wr_storage_[current_read_id].is_first(current_pos, dir);
+ interesting_weights[current_pos].votes[current_variant] += get_error_weight(
+ wr_storage_[current_read_id].error_num ) * coef;
+ }
+ }
+ size_t maxi = interesting_weights[current_pos].FoundOptimal(contig_[current_pos]);
+ for (size_t i = 0; i < read_ids_[current_pos].size(); i++) {
+ size_t current_read_id = read_ids_[current_pos][i];
+ size_t current_variant = wr_storage_[current_read_id].positions[current_pos];
+ if (current_variant != maxi) {
+ wr_storage_[current_read_id].error_num++;
+ } else {
+ wr_storage_[current_read_id].processed_positions++;
+ }
+
+ }
+
+ if ((char) toupper(contig_[current_pos]) != pos_to_var[maxi]) {
+ DEBUG("Interesting positions differ at position " << current_pos);
+ DEBUG("Was " << (char) toupper(contig_[current_pos]) << "new " << pos_to_var[maxi]);
+ DEBUG("weights" << interesting_weights[current_pos].str());
+ changed_weights_[current_pos] = interesting_weights[current_pos];
+ }
+ //for backward pass
+ interesting_weights[current_pos].clear();
+ }
+ }
+ if (dir == 1)
+ DEBUG("reversing the order...");
+ for (size_t i = 0; i < wr_storage_.size(); i++) {
+ wr_storage_[i].error_num = 0;
+ wr_storage_[i].processed_positions = 0;
+ }
+ }
+}
+}
+;
diff --git a/src/corrector/interesting_pos_processor.hpp b/src/projects/corrector/interesting_pos_processor.hpp
similarity index 100%
rename from src/corrector/interesting_pos_processor.hpp
rename to src/projects/corrector/interesting_pos_processor.hpp
diff --git a/src/projects/corrector/main.cpp b/src/projects/corrector/main.cpp
new file mode 100644
index 0000000..07f0ee0
--- /dev/null
+++ b/src/projects/corrector/main.cpp
@@ -0,0 +1,67 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#include "dataset_processor.hpp"
+#include "pipeline/config_struct.hpp"
+
+#include "dev_support/logger/log_writers.hpp"
+#include "config_struct.hpp"
+#include "dev_support/segfault_handler.hpp"
+
+#include "version.hpp"
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <string>
+
+using namespace std;
+void create_console_logger() {
+ using namespace logging;
+
+ logger *lg = create_logger("");
+ lg->add_writer(std::make_shared<console_writer>());
+ attach_logger(lg);
+}
+
+int main(int argc, char** argv) {
+ perf_counter pc;
+
+ srand(42);
+ srandom(42);
+ try {
+ create_console_logger();
+
+ if (argc != 3) {
+ WARN("Wrong argument number");
+ return 1;
+ }
+ string contig_name(argv[2]);
+ string cfg_file(argv[1]);
+ corr_cfg::create_instance(cfg_file);
+ string work_dir = corr_cfg::get().work_dir;
+ if (!path::check_existence(corr_cfg::get().output_dir))
+ path::make_dir(corr_cfg::get().output_dir);
+ if (!path::check_existence(corr_cfg::get().work_dir))
+ path::make_dir(corr_cfg::get().work_dir);
+
+ INFO("Starting MismatchCorrector, built from " SPADES_GIT_REFSPEC ", git revision " SPADES_GIT_SHA1);
+
+ corrector::DatasetProcessor dp(contig_name, corr_cfg::get().work_dir, corr_cfg::get().output_dir, corr_cfg::get().max_nthreads);
+ dp.ProcessDataset();
+ } catch (std::string const &s) {
+ std::cerr << s;
+ return EINTR;
+ }
+ unsigned ms = (unsigned) pc.time_ms();
+ unsigned secs = (ms / 1000) % 60;
+ unsigned mins = (ms / 1000 / 60) % 60;
+ unsigned hours = (ms / 1000 / 60 / 60);
+
+ INFO("Correcting time: " << hours << " hours " << mins << " minutes " << secs << " seconds");
+
+ return 0;
+}
diff --git a/src/corrector/positional_read.cpp b/src/projects/corrector/positional_read.cpp
similarity index 100%
rename from src/corrector/positional_read.cpp
rename to src/projects/corrector/positional_read.cpp
diff --git a/src/corrector/positional_read.hpp b/src/projects/corrector/positional_read.hpp
similarity index 100%
rename from src/corrector/positional_read.hpp
rename to src/projects/corrector/positional_read.hpp
diff --git a/src/corrector/variants_table.hpp b/src/projects/corrector/variants_table.hpp
similarity index 100%
rename from src/corrector/variants_table.hpp
rename to src/projects/corrector/variants_table.hpp
diff --git a/src/projects/dipspades/CMakeLists.txt b/src/projects/dipspades/CMakeLists.txt
new file mode 100644
index 0000000..b60d4b8
--- /dev/null
+++ b/src/projects/dipspades/CMakeLists.txt
@@ -0,0 +1,26 @@
+############################################################################
+# Copyright (c) 2015 Saint Petersburg State University
+# Copyright (c) 2011-2014 Saint Petersburg Academic University
+# All Rights Reserved
+# See file LICENSE for details.
+############################################################################
+
+project(dipspades CXX)
+
+add_executable(dipspades
+ dipspades_config.cpp
+ utils/files_utils.cpp
+ main.cpp)
+
+target_link_libraries(dipspades spades_modules ${COMMON_LIBRARIES})
+
+if (SPADES_STATIC_BUILD)
+ set_target_properties(dipspades PROPERTIES LINK_SEARCH_END_STATIC 1)
+endif()
+
+install(TARGETS dipspades
+ DESTINATION bin
+ COMPONENT runtime)
+install(DIRECTORY "${SPADES_CFG_DIR}/dipspades"
+ DESTINATION share/spades/configs
+ FILES_MATCHING PATTERN "*.info")
diff --git a/src/projects/dipspades/consensus_contigs_constructor/consensus_contigs_constructor.hpp b/src/projects/dipspades/consensus_contigs_constructor/consensus_contigs_constructor.hpp
new file mode 100644
index 0000000..4623fa0
--- /dev/null
+++ b/src/projects/dipspades/consensus_contigs_constructor/consensus_contigs_constructor.hpp
@@ -0,0 +1,332 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "io/reads_io/io_helper.hpp"
+
+#include "utils/element_printers.hpp"
+#include "utils/files_utils.hpp"
+
+#include "contig_correctors/close_gaps_corrector.hpp"
+#include "contig_correctors/iterative_redundant_contigs_remover.hpp"
+#include "contig_correctors/overlap_searcher.hpp"
+#include "contig_correctors/same_edge_deletion_corrector.hpp"
+#include "contig_correctors/incorrect_contig_remover.hpp"
+
+using namespace debruijn_graph;
+
+namespace dipspades{
+
+class ConsensusContigsConstructor {
+ conj_graph_pack &graph_pack_;
+ BaseHistogram<size_t> &bulge_len_hist_;
+ NewExtendedSequenceMapper<conj_graph_pack::graph_t, conj_graph_pack::index_t> seq_mapper_;
+ VertexPathIndex path_index_;
+
+ CorrectionResult correction_result_;
+ ContigStoragePtr default_storage_;
+ ContigStoragePtr composite_storage_;
+
+ set<size_t> ind_zero_paths_;
+
+ struct contig_name {
+ string fname;
+ string name;
+
+ contig_name(string new_fname, string new_name) :
+ fname(cut_fname_from_path(new_fname)),
+ name(new_name) { }
+ };
+
+ typedef pair<contig_name, Sequence> contig;
+
+ vector<contig> ReadContigs(string contig_fname){
+ vector<contig> contigs;
+ auto fstream = io::SplittingWrap(EasyStream(contig_fname, false));
+
+ while(!fstream->eof()){
+ SingleRead single_read;
+ (*fstream) >> single_read;
+ contigs.push_back(contig(contig_name(contig_fname, single_read.name()),
+ single_read.sequence()));
+ }
+ INFO(contigs.size() << " contigs from " << contig_fname << " were read");
+ return contigs;
+ }
+
+ vector<contig> ReadContigsFromFiles(vector<string> contig_fnames){
+ vector<contig> contigs;
+ for(auto it = contig_fnames.begin(); it != contig_fnames.end(); it++){
+ if(fname_valid(*it)){
+ auto contigs_from_file = ReadContigs(*it);
+ contigs.insert(contigs.end(), contigs_from_file.begin(), contigs_from_file.end());
+ }
+ }
+ return contigs;
+ }
+
+ vector<MappingPath<EdgeId> > ConstructMappathsWithoutRC(vector<contig> &contigs){
+ vector<MappingPath<EdgeId> > map_paths;
+ size_t zero_paths = 0;
+ size_t total_length_unmapped = 0;
+ for(size_t i = 0; i < contigs.size(); i++){
+ map_paths.push_back(seq_mapper_.MapSequence(contigs[i].second));
+ if(map_paths[map_paths.size() - 1].size() == 0){
+ total_length_unmapped += contigs[i].second.size();
+ zero_paths++;
+ }
+ }
+ if(zero_paths != 0)
+ INFO(ToString(zero_paths) + " contigs with total length " << total_length_unmapped <<
+ " have mapping paths of zero length");
+ return map_paths;
+ }
+
+ void DefineIndicesOfZeroPaths(vector<MappingPath<EdgeId> > &map_paths){
+ for(size_t i = 0; i < map_paths.size(); i++)
+ if(map_paths[i].size() == 0)
+ ind_zero_paths_.insert(i);
+ }
+
+ ContigStoragePtr CreateContigStorage(vector<contig> &contigs,
+ vector<MappingPath<EdgeId> > &mapping_paths) {
+ ContigStoragePtr default_storage(new SimpleContigStorage());
+ for(size_t i = 0; i < mapping_paths.size(); i++){
+ if(ind_zero_paths_.find(i) == ind_zero_paths_.end()){
+ int i1 = int(i * 2), i2 = int(i * 2 + 1);
+ default_storage->Add(MappingContigPtr(
+ new SimpleMappingContig(contigs[i].first.name,
+ contigs[i].first.fname, contigs[i].second,
+ mapping_paths[i], i1, i2)));
+ }
+ }
+ return default_storage;
+ }
+
+ void PrimaryContigsProcessing(ContigStoragePtr storage){
+ INFO("Removing repetitive edges in contigs mapping starts");
+ SameEdgeDeletionCorrector same_edges_corr(graph_pack_.g);
+ same_edges_corr.Correct(storage);
+// INFO(storage->Size() << " contigs will be used");
+ INFO("Removing repetitive edges in contigs mapping ends");
+
+ INFO("Close gaps in contigs mappings starts")
+ CloseGapsCorrector close_gaps_corr(graph_pack_.g);
+ close_gaps_corr.Correct(storage);
+// INFO(storage->Size() << " contigs will be used");
+ INFO("Close gaps in contigs mappings ends");
+
+ INFO("Removing incorrect contigs")
+ RemoveUnconnectContigsCorrector del_unconn_corr(graph_pack_.g);
+ del_unconn_corr.Correct(storage);
+// INFO(storage->Size() << " contigs will be used");
+ }
+
+ string name_to_rc_name(string name){
+ return name + "_RC";
+ }
+
+ ContigStoragePtr CreateStorageWithRCContigs(ContigStoragePtr old_storage){
+ ContigStoragePtr new_storage(new SimpleContigStorage());
+ TRACE("CreateStorageWithRCContigs starts");
+ for(size_t i = 0; i < old_storage->Size(); i++){
+ auto contig = (*old_storage)[i];
+ new_storage->Add(contig);
+
+ MappingContigPtr rc_contig = MappingContigPtr(
+ new SimpleMappingContig(
+ name_to_rc_name(contig->name()),
+ contig->src_file(),
+ !contig->seq(),
+ GetRCToMappingPath(graph_pack_.g, contig->mapping_path(), contig->seq().size()),
+ GetRCToPathSeq(graph_pack_.g, contig->path_seq()),
+ contig->id() + 1, contig->id()));
+ new_storage->Add(rc_contig);
+ }
+ TRACE("CreateStorageWithRCContigs ends");
+ INFO("Addition of RC contigs. " << new_storage->Size() << " contigs will be used");
+ return new_storage;
+ }
+
+ void RemoveRedundantContigs(ContigStoragePtr storage){
+ INFO("Redundant contigs remover starts");
+ VertexPathIndex path_index(graph_pack_.g);
+ IterativeLoopCorrector iter_loop_corr(
+ graph_pack_.g,
+ graph_pack_.k_value,
+ path_index,
+ dsp_cfg::get().cc.max_loop_length,
+ dsp_cfg::get().cc.min_lcs_size,
+ dsp_cfg::get().cc.estimate_tails ?
+ bulge_len_hist_.Quantile(dsp_cfg::get().cc.bulge_len_quantile) :
+ dsp_cfg::get().pbr.max_bulge_nucls_len);
+ iter_loop_corr.Correct(storage);
+ INFO("Redundant contigs remover ends");
+ correction_result_ = iter_loop_corr.Results();
+ }
+
+ ContigStoragePtr DefineOverlappingContigs(ContigStoragePtr storage){
+ INFO("Overlapping search starts");
+ path_index_.Initialize(storage);
+ OverlapCorrector over_corr(graph_pack_.g,
+ graph_pack_.k_value,
+ dsp_cfg::get().cc.min_overlap_size,
+ path_index_);
+ auto new_storage = over_corr.Correct(storage);
+ path_index_.Clear();
+ INFO("Overlapping search ends");
+ return new_storage;
+ }
+
+ void WriteContigsToFile(ContigStoragePtr contigs, string filename){
+ size_t total_length = 0;
+ ofstream out(filename);
+ for(size_t i = 0; i < contigs->Size(); i++){
+ vector<EdgeId> contig_path = (*contigs)[i]->path_seq();
+ TRACE(i << " path: " << SimplePathWithVerticesToString(graph_pack_.g, contig_path));
+ Sequence seq = (*contigs)[i]->seq();
+ out << ">" << (*contigs)[i]->name() << endl;
+ out << seq.str() << endl;
+ total_length += seq.size();
+ }
+ INFO(contigs->Size() << " with total length " << total_length << " were written in " <<
+ filename);
+ }
+
+ void WritePairedAndUnpairedContigs(ContigStoragePtr storage){
+ ContigStoragePtr double_contigs(new SimpleContigStorage());
+ ContigStoragePtr single_contigs(new SimpleContigStorage());
+ for(size_t i = 0; i < storage->Size(); i++){
+ auto contig = (*storage)[i];
+ if(contig->AllMappingContigs().size() == 0){
+ if(correction_result_.redundancy_map.GetValuesByKey(contig->id()).size() == 0)
+ single_contigs->Add(contig);
+ else
+ double_contigs->Add(contig);
+ }
+ else
+ double_contigs->Add(contig);
+ }
+ WriteContigsToFile(double_contigs,
+ path::append_path(dsp_cfg::get().io.output_dir, "paired_consensus_contigs.fasta").c_str());
+ WriteContigsToFile(single_contigs,
+ path::append_path(dsp_cfg::get().io.output_dir, "unpaired_consensus_contigs.fasta").c_str());
+ }
+
+ void WriteAlignedHaplocontigs(){
+ string fname = path::append_path(dsp_cfg::get().io.output_dir, "haplocontigs_alignment");
+ ofstream out(fname.c_str());
+ INFO("Writing haplocontigs alignment to " << fname);
+
+ for(size_t i = 0; i < composite_storage_->Size(); i++){
+ auto composite_contig = (*composite_storage_)[i];
+ out << "Consensus contig: " << composite_contig->name() << endl;
+ auto haplocontigs = composite_contig->AllMappingContigs();
+ if(haplocontigs.size() == 0) // contig is not composite
+ haplocontigs.push_back(composite_contig);
+
+ if(haplocontigs.size() > 1){
+ out << "\tOverlapped haplocontigs: " << endl;
+ for(size_t i = 0; i < haplocontigs.size() - 1; i++)
+ out << "\t\t" << haplocontigs[i]->full_name() << "\t" <<
+ haplocontigs[i + 1]->full_name() << endl;
+ }
+
+ out << "\tAligned pairs: " << endl;
+ size_t written_pairs = 0;
+ for(auto h = haplocontigs.begin(); h != haplocontigs.end(); h++){
+ size_t id = (*h)->id();
+ auto redundant_contigs = correction_result_.redundancy_map.GetValuesByKey(id);
+ for(auto it = redundant_contigs.begin(); it != redundant_contigs.end(); it++){
+ out << "\t\t" << (*h)->full_name() << "\t" <<
+ default_storage_->GetContigById(*it)->full_name() << endl;
+ written_pairs++;
+ }
+ }
+
+ if(written_pairs == 0)
+ out << "\t\tNo pairs" << endl;
+ }
+
+/* for(auto it = correction_result_.redundancy_map.begin();
+ it != correction_result_.redundancy_map.end(); it++){
+ auto contig1 = default_storage_->GetContigById(it->first);
+ auto set_ids = it->second;
+ for(auto set_it = set_ids.begin(); set_it != set_ids.end(); set_it++){
+ auto contig2 = default_storage_->GetContigById(*set_it);
+ out << contig1->src_file() << ":" << contig1->name() << "\t" <<
+ contig2->src_file() << ":" << contig2->name() << endl;
+ }
+ }*/
+
+ }
+
+public:
+ ConsensusContigsConstructor(conj_graph_pack &graph_pack,
+ BaseHistogram<size_t> &bulge_len_hist) :
+ graph_pack_(graph_pack),
+ bulge_len_hist_(bulge_len_hist),
+ seq_mapper_(graph_pack.g, graph_pack.index,
+ graph_pack.kmer_mapper, false),
+ path_index_(graph_pack.g),
+ correction_result_(),
+ default_storage_(),
+ composite_storage_() { }
+
+ void Run() {
+ INFO("Consensus contigs constructor starts");
+ auto contigs = ReadContigsFromFiles(GetAllLinesFromFile(dsp_cfg::get().io.haplocontigs));
+ INFO("Total: " << contigs.size() << " contigs were read");
+ if(contigs.size() == 0)
+ return;
+
+ vector<MappingPath<EdgeId> > mapping_paths = ConstructMappathsWithoutRC(contigs);
+ VERIFY(mapping_paths.size() == contigs.size());
+ DefineIndicesOfZeroPaths(mapping_paths);
+
+ auto preliminary_storage = CreateContigStorage(contigs, mapping_paths);
+
+ TRACE("Preliminary storage:");
+ TRACE(preliminary_storage->ToString(graph_pack_.g));
+
+ PrimaryContigsProcessing(preliminary_storage);
+
+ TRACE("Preliminary storage after 1st processing:");
+ TRACE(preliminary_storage->ToString(graph_pack_.g));
+
+ auto processed_storage = CreateStorageWithRCContigs(preliminary_storage);
+ VERIFY(processed_storage->Size() % 2 == 0);
+
+ default_storage_ = processed_storage->Clone();
+ RemoveRedundantContigs(processed_storage);
+
+ TRACE("Storage after removing redundant contigs:");
+ TRACE(processed_storage->ToString(graph_pack_.g));
+
+ composite_storage_ = DefineOverlappingContigs(processed_storage);
+
+ string consensus_fname(path::append_path(dsp_cfg::get().io.output_dir, "consensus_contigs.fasta").c_str());
+ WriteContigsToFile(composite_storage_, consensus_fname);
+ WritePairedAndUnpairedContigs(composite_storage_);
+
+ WriteAlignedHaplocontigs();
+
+ INFO("Consensus contigs constructor ends");
+ }
+
+ ContigStoragePtr DefaultContigsStorage() { return default_storage_; }
+
+ ContigStoragePtr CompositeContigsStorage() { return composite_storage_; }
+
+ CorrectionResult RedundancyResult() { return correction_result_; }
+
+private:
+ DECL_LOGGER("ConsensusContigsConstructor");
+};
+
+}
diff --git a/src/projects/dipspades/consensus_contigs_constructor/contig_correctors/abstract_contig_corrector.hpp b/src/projects/dipspades/consensus_contigs_constructor/contig_correctors/abstract_contig_corrector.hpp
new file mode 100644
index 0000000..a3b5481
--- /dev/null
+++ b/src/projects/dipspades/consensus_contigs_constructor/contig_correctors/abstract_contig_corrector.hpp
@@ -0,0 +1,43 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "../mapping_contigs_storage.hpp"
+#include "../../utils/lcs_utils.hpp"
+#include "../../utils/path_routines.hpp"
+#include "../../utils/path_index.hpp"
+#include "../../utils/bulge_utils.hpp"
+#include "../../utils/redundancy_map.hpp"
+
+using namespace debruijn_graph;
+
+namespace dipspades {
+
+struct CorrectionResult{
+ OverlapGraph g;
+ RedundancyMap<size_t> redundancy_map;
+};
+
+//--------------------------------------------------------------------------
+class AbstractContigCorrector{
+protected:
+ Graph& g_;
+public:
+ AbstractContigCorrector(Graph& g) : g_(g) {
+
+ }
+ virtual ContigStoragePtr Correct(ContigStoragePtr storage) { return storage; }
+ virtual MappingContigPtr Correct(MappingContigPtr contig) { return contig; }
+ virtual ~AbstractContigCorrector(){}
+ virtual CorrectionResult Results(){
+ CorrectionResult res;
+ return res;
+ }
+};
+
+}
diff --git a/src/projects/dipspades/consensus_contigs_constructor/contig_correctors/close_gaps_corrector.hpp b/src/projects/dipspades/consensus_contigs_constructor/contig_correctors/close_gaps_corrector.hpp
new file mode 100644
index 0000000..aa5047c
--- /dev/null
+++ b/src/projects/dipspades/consensus_contigs_constructor/contig_correctors/close_gaps_corrector.hpp
@@ -0,0 +1,154 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "abstract_contig_corrector.hpp"
+
+using namespace debruijn_graph;
+
+namespace dipspades {
+
+class CloseGapsCorrector : public AbstractContigCorrector{
+
+ set<size_t> incorr_contigs;
+ size_t num_corr;
+
+ size_t connected_length_;
+ size_t disconnected_length_;
+
+ vector<EdgeId> ClosePathGap(vector<EdgeId> path, vector<size_t> gap_index){
+ vector<EdgeId> new_path;
+ size_t current_gap = 0;
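+ // For each recorded gap, try to reconnect consecutive path edges with a length-bounded
+ // Dijkstra search; if the target is unreachable from the current edge's end, retry from the
+ // edge's start (dropping the current edge from the new path).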
+ for(size_t i = 0; i < path.size() - 1; i++){
+ EdgeId cur_edge = path[i];
+ new_path.push_back(cur_edge);
+ if(i == gap_index[current_gap]){
+ VertexId start = g_.EdgeEnd(cur_edge);
+ VertexId end = g_.EdgeStart(path[i + 1]);
+ auto dijkstra = DijkstraHelper<Graph>::CreateTargeredBoundedDijkstra(g_,
+ end, dsp_cfg::get().pbr.max_bulge_nucls_len); //DijkstraHelper<Graph>::CreateBoundedDijkstra(g_, dsp_cfg::get().pbr.max_bulge_nucls_len);
+ dijkstra.Run(start);
+ if(dijkstra.DistanceCounted(end)){
+ vector<EdgeId> add_path = dijkstra.GetShortestPathTo(end);
+ for(auto e = add_path.begin(); e != add_path.end(); e++)
+ if(g_.EdgeStart(*e) != g_.EdgeEnd(*e))
+ new_path.push_back(*e);
+ }
+ else{
+ // second attempt
+ VertexId prev_start = g_.EdgeStart(cur_edge);
+ dijkstra.Run(prev_start);
+ if(dijkstra.DistanceCounted(end)){
+ vector<EdgeId> add_path = dijkstra.GetShortestPathTo(end);
+ new_path.erase(new_path.begin() + new_path.size() - 1);
+ for(auto e = add_path.begin(); e != add_path.end(); e++)
+ if(g_.EdgeStart(*e) != g_.EdgeEnd(*e))
+ new_path.push_back(*e);
+ }
+ }
+ current_gap++;
+ }
+ }
+ new_path.push_back(path[path.size() - 1]);
+ return new_path;
+ }
+
+ size_t CountContigsWithGaps(ContigStoragePtr storage) {
+ size_t contigs_with_gaps = 0;
+ for(size_t i = 0; i < storage->Size(); i++)
+ if(!IsPathConnected(g_, (*storage)[i]->path_seq()))
+ contigs_with_gaps++;
+ return contigs_with_gaps;
+ }
+
+ void ProcessContigs(ContigStoragePtr storage) {
+ double processed_perc = 0.1;
+ double step = 0.1;
+ for(size_t i = 0; i < storage->Size(); i++) {
+ storage->ReplaceContig(Correct((*storage)[i]), i);
+ double cur_process_perc = static_cast<double>(i) / static_cast<double>(storage->Size());
+ if(cur_process_perc > processed_perc) {
+ while(processed_perc + step <= cur_process_perc)
+ processed_perc += step;
+ INFO(ToString(processed_perc * 100.0) << "% contigs were processed");
+ processed_perc += step;
+ }
+ }
+ INFO("100% contigs were processed");
+ }
+
+public:
+ CloseGapsCorrector(Graph &g) :
+ AbstractContigCorrector(g),
+ num_corr(0),
+ connected_length_(0),
+ disconnected_length_(0) { }
+
+ virtual ContigStoragePtr Correct(ContigStoragePtr storage){
+
+ INFO(ToString(CountContigsWithGaps(storage)) << " contigs from " <<
+ ToString(storage->Size()) << " have gaps before correction");
+
+ ProcessContigs(storage);
+
+ INFO(ToString(num_corr) << " contigs from " <<
+ ToString(storage->Size()) << " with total length " << ToString(connected_length_) + " are correct");
+ INFO(ToString(storage->Size() - num_corr) << " contigs from "
+ << ToString(storage->Size()) << " with total length " <<
+ ToString(disconnected_length_) + " have gaps after correction");
+
+ storage->DeleteByIDs(incorr_contigs);
+ return storage;
+ }
+
+ virtual MappingContigPtr Correct(MappingContigPtr contig){
+ vector<EdgeId> path = contig->path_seq();
+ if(path.size() <= 1){
+ num_corr++;
+ return contig;
+ }
+ vector<size_t> gap_indexes;
+ for(size_t i = 0; i < path.size() - 1; i++){
+ EdgeId e1 = path[i];
+ EdgeId e2 = path[i + 1];
+ if(!AreEdgesConnected(g_, e1, e2)){
+ gap_indexes.push_back(i);
+ }
+ }
+
+ TRACE("Contig " << contig->id() << " has " << gap_indexes.size() << " gaps");
+
+ // contig is connected
+ if(gap_indexes.size() == 0) {
+ num_corr++;
+ connected_length_ += GetPathLength(g_, contig->path_seq());
+ return contig;
+ }
+
+ TRACE("Contig path before correction: " << SimplePathWithVerticesToString(g_, contig->path_seq()));
+
+ vector<EdgeId> new_path = ClosePathGap(path, gap_indexes);
+ if(IsPathConnected(g_, new_path)) {
+ TRACE("Gaps were closed");
+ TRACE("Contig path after correction: " << SimplePathWithVerticesToString(g_, new_path));
+ num_corr++;
+ connected_length_ += GetPathLength(g_, new_path);
+ return MappingContigPtr(new ReplacedPathMappingContig(contig, new_path));
+ }
+
+ TRACE("Contig " << contig->id() << " remains incorrected!");
+ incorr_contigs.insert(contig->id());
+ disconnected_length_ += GetPathLength(g_, contig->path_seq());
+ return contig;
+ }
+
+private:
+ DECL_LOGGER("CloseGapsCorrector")
+};
+
+}
diff --git a/src/projects/dipspades/consensus_contigs_constructor/contig_correctors/equal_path_deletion_correction.hpp b/src/projects/dipspades/consensus_contigs_constructor/contig_correctors/equal_path_deletion_correction.hpp
new file mode 100644
index 0000000..fd68341
--- /dev/null
+++ b/src/projects/dipspades/consensus_contigs_constructor/contig_correctors/equal_path_deletion_correction.hpp
@@ -0,0 +1,82 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "abstract_contig_corrector.hpp"
+
+using namespace debruijn_graph;
+
+namespace dipspades {
+
+class EqualPathDeletionCorrector : public AbstractContigCorrector{
+ VertexPathIndex &path_index_;
+ CorrectionResult res_;
+
+ void InitializeMap(ContigStoragePtr contigs){
+ for(size_t i = 0; i < contigs->Size(); i++){
+ size_t id = (*contigs)[i]->id();
+ res_.redundancy_map.AddNewKey(id);
+ }
+ }
+
+public:
+
+ EqualPathDeletionCorrector(Graph &g, VertexPathIndex &path_index) : AbstractContigCorrector(g),
+ path_index_(path_index){ }
+
+ ContigStoragePtr Correct(ContigStoragePtr contigs) {
+
+ INFO("Computing redundant equal contigs starts");
+
+ InitializeMap(contigs);
+ set<size_t> ids_for_deletion;
+ for(size_t i = 0; i < contigs->Size() - 1; i++){
+ size_t id1 = (*contigs)[i]->id();
+ size_t rc_id1 = (*contigs)[i]->rc_id();
+ if(ids_for_deletion.find(id1) == ids_for_deletion.end() &&
+ ids_for_deletion.find(rc_id1) == ids_for_deletion.end()){
+ auto path1 = (*contigs)[i]->path_seq();
+ auto contigs_for_processing = path_index_.GetPathsIntersectedWith(path1);
+ for(auto it = contigs_for_processing.begin(); it != contigs_for_processing.end(); it++){
+ size_t j = *it;
+ size_t id2 = (*contigs)[j]->id();
+ size_t rc_id2 = (*contigs)[j]->rc_id();
+ if(ids_for_deletion.find(id2) == ids_for_deletion.end() &&
+ ids_for_deletion.find(rc_id2) == ids_for_deletion.end() && j > i){
+ auto path2 = (*contigs)[j]->path_seq();
+ if(ArePathEqual(path1, path2)){
+ size_t id2 = (*contigs)[j]->id();
+ ids_for_deletion.insert(id2);
+ ids_for_deletion.insert(rc_id2);
+ res_.redundancy_map.AddNewPair(id1, id2);
+ res_.redundancy_map.AddNewPair(rc_id1, rc_id2);
+ }
+ }
+ }
+ }
+ }
+ RedundancyMapCondenser<size_t> condenser;
+ res_.redundancy_map = condenser.Condense(res_.redundancy_map);
+ INFO(ToString(ids_for_deletion.size()) + " contigs from " << contigs->Size() << " are redundant");
+ contigs->DeleteByIDs(ids_for_deletion);
+
+ INFO("Computing redundant equal contigs ends");
+
+ return contigs;
+ }
+
+ MappingContigPtr Correct(MappingContigPtr contig){
+ return contig;
+ }
+
+ CorrectionResult Result(){
+ return res_;
+ }
+};
+
+}
diff --git a/src/projects/dipspades/consensus_contigs_constructor/contig_correctors/incorrect_contig_remover.hpp b/src/projects/dipspades/consensus_contigs_constructor/contig_correctors/incorrect_contig_remover.hpp
new file mode 100644
index 0000000..70b0757
--- /dev/null
+++ b/src/projects/dipspades/consensus_contigs_constructor/contig_correctors/incorrect_contig_remover.hpp
@@ -0,0 +1,43 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "abstract_contig_corrector.hpp"
+
+using namespace debruijn_graph;
+
+namespace dipspades {
+
+class RemoveUnconnectContigsCorrector : public AbstractContigCorrector{
+
+public:
+ RemoveUnconnectContigsCorrector(Graph &g) : AbstractContigCorrector(g){ }
+
+ ContigStoragePtr Correct(ContigStoragePtr storage) {
+ set<size_t> contigs_for_deletion;
+ for(size_t i = 0; i < storage->Size(); i++){
+ auto contig_path = (*storage)[i]->path_seq();
+ TRACE((*storage)[i]->id() << " contig");
+ TRACE("Path: " << SimplePathWithVerticesToString(g_, contig_path));
+ if(!IsPathConnected(g_, contig_path)){
+ contigs_for_deletion.insert((*storage)[i]->id());
+ }
+ }
+ INFO(ToString(contigs_for_deletion.size()) + " contigs from " <<
+ storage->Size() << " were deleted");
+ storage->DeleteByIDs(contigs_for_deletion);
+ return storage;
+ }
+
+ MappingContigPtr Correct(MappingContigPtr contig){
+ return contig;
+ }
+
+};
+
+}
diff --git a/src/projects/dipspades/consensus_contigs_constructor/contig_correctors/iterative_redundant_contigs_remover.hpp b/src/projects/dipspades/consensus_contigs_constructor/contig_correctors/iterative_redundant_contigs_remover.hpp
new file mode 100644
index 0000000..bd12357
--- /dev/null
+++ b/src/projects/dipspades/consensus_contigs_constructor/contig_correctors/iterative_redundant_contigs_remover.hpp
@@ -0,0 +1,94 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "redundant_contig_remover.hpp"
+#include "equal_path_deletion_correction.hpp"
+
+using namespace debruijn_graph;
+
+namespace dipspades {
+
+class IterativeLoopCorrector : public AbstractContigCorrector{
+
+ size_t k_value_;
+
+ VertexPathIndex &index_;
+ size_t max_loop_len_;
+ size_t min_lcs_length_;
+ size_t max_tail_length_;
+ CorrectionResult res;
+
+public:
+ IterativeLoopCorrector(Graph &g, size_t k_value, VertexPathIndex &index, size_t max_loop_len,
+ size_t min_lcs_length, size_t max_tail_length) :
+ AbstractContigCorrector(g), k_value_(k_value), index_(index),
+ max_loop_len_(max_loop_len), min_lcs_length_(min_lcs_length),
+ max_tail_length_(max_tail_length) {
+ }
+
+ ContigStoragePtr Correct(ContigStoragePtr contigs) {
+ {
+ INFO("Equal path remover starts");
+ index_.Initialize(contigs);
+ EqualPathDeletionCorrector equal_path_remover(g_, index_);
+ contigs = equal_path_remover.Correct(contigs);
+ res.redundancy_map = equal_path_remover.Result().redundancy_map;
+ index_.Clear();
+ INFO(ToString(contigs->Size()) + " contigs will be used further");
+ }
+
+ INFO("Iterative loop corrector starts");
+ {
+ INFO("Only exact match iteration with parameters:");
+ INFO("\tMaximal loop length - " + ToString(max_loop_len_));
+ INFO("\tMinimal lcs length - " + ToString(min_lcs_length_));
+ INFO("\tMaximal tail length - 0");
+
+ index_.Initialize(contigs);
+ LoopBulgeDeletionCorrector loop_corr(g_, k_value_,
+ max_loop_len_, 0, min_lcs_length_, index_);
+ contigs = loop_corr.Correct(contigs);
+ auto old_map = res.redundancy_map;
+ auto new_map = loop_corr.Results().redundancy_map;
+ RedundancyMapMerger<size_t> map_merger;
+ res.redundancy_map = map_merger.MergeTwoMaps(old_map, new_map);
+ index_.Clear();
+ INFO(ToString(contigs->Size()) + " contigs will be used further");
+ }
+
+ {
+ INFO("Tails allowing match iteration with parameters:");
+ INFO("\tMaximal loop length - " + ToString(max_loop_len_));
+ INFO("\tMinimal lcs length - " + ToString(min_lcs_length_));
+ INFO("\tMaximal tail length - " + ToString(max_tail_length_));
+ index_.Initialize(contigs);
+ LoopBulgeDeletionCorrector loop_corr(g_, k_value_,
+ max_loop_len_, max_tail_length_, min_lcs_length_, index_);
+ contigs = loop_corr.Correct(contigs);
+ auto old_map = res.redundancy_map;
+ auto new_map = loop_corr.Results().redundancy_map;
+ RedundancyMapMerger<size_t> map_merger;
+ res.redundancy_map = map_merger.MergeTwoMaps(old_map, new_map);
+ index_.Clear();
+ INFO(ToString(contigs->Size()) + " contigs will be used further");
+ }
+ INFO("Iterative loop corrector ends");
+ return contigs;
+ }
+
+ MappingContigPtr Correct(MappingContigPtr contig){
+ return contig;
+ }
+
+ CorrectionResult Results(){
+ return res;
+ }
+};
+
+}
diff --git a/src/projects/dipspades/consensus_contigs_constructor/contig_correctors/overlap_searcher.hpp b/src/projects/dipspades/consensus_contigs_constructor/contig_correctors/overlap_searcher.hpp
new file mode 100644
index 0000000..6e8c49a
--- /dev/null
+++ b/src/projects/dipspades/consensus_contigs_constructor/contig_correctors/overlap_searcher.hpp
@@ -0,0 +1,541 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "redundant_contig_remover.hpp"
+
+using namespace debruijn_graph;
+
+namespace dipspades {
+
+void OverlapgraphToDot(string dotfname, OverlapGraph & g, ContigStoragePtr stor){
+ ofstream dot(dotfname.c_str());
+
+// cout << "Number of vertices - " << g.VerticesCount() << endl;
+// cout << "Number of contigs - " << stor->Size() << endl;
+ VERIFY(g.VerticesCount() <= stor->Size());
+
+ dot << "digraph Overlaped_paths {" << endl << "node[fontname=<Courier>]" << endl;
+
+ auto vertices = g.Vertices();
+ for(auto v = vertices.begin(); v != vertices.end(); v++){
+ dot << *v << "[label=\"ID = #" << *v << ". " << *v << ", RC_ID = " <<
+ stor->GetContigById(*v)->rc_id() << "\"]" << endl;
+ }
+
+ auto edges = g.Edges();
+ for(auto e = edges.begin(); e != edges.end(); e++)
+ dot << e->first << "->" << e->second << "[label=\"" << g.GetWeightOf(*e) << "\"]" << endl;
+
+ dot << "}";
+}
+
+//--------------------------------------------------------------------------------------------
+
+class OverlappedContigsMap {
+ size_t min_lcs_length_;
+public:
+ struct OverlappedKey {
+ size_t id1;
+ size_t id2;
+ size_t id1_rc;
+ size_t id2_rc;
+
+ OverlappedKey(size_t new_id1, size_t new_id2,
+ size_t new_id1_rc, size_t new_id2_rc) :
+ id1(new_id1),
+ id2(new_id2),
+ id1_rc(new_id1_rc),
+ id2_rc(new_id2_rc) { }
+
+ OverlappedKey() :
+ id1(), id2(), id1_rc(), id2_rc() { }
+
+ string ToString() const {
+ stringstream ss;
+ ss << "<" << id1 << ", " << id2 << "> <" << id2_rc << ", " << id1_rc << ">";
+ return ss.str();
+ }
+
+ string id() const {
+ stringstream ss;
+ ss << id1 << "_" << id2 << "_" << id1_rc << "_" << id2_rc;
+ return ss.str();
+ }
+
+ OverlappedKey Reverse1() {
+ return OverlappedKey(id1, id2_rc, id1_rc, id2);
+ }
+
+ OverlappedKey Reverse2() {
+ return OverlappedKey(id2_rc, id1, id2, id1_rc);
+ }
+
+ OverlappedKey Reverse3() {
+ return OverlappedKey(id1_rc, id2, id1, id2_rc);
+ }
+
+ OverlappedKey Reverse4() {
+ return OverlappedKey(id2, id1_rc, id2_rc, id1);
+ }
+ };
+
+ struct OverlappedValue {
+ Range range_left;
+ Range range_right;
+ Range range_left_rc;
+ Range range_right_rc;
+ size_t lcs_length;
+
+ OverlappedValue(Range new_range_left, Range new_range_right,
+ Range new_range_left_rc, Range new_range_right_rc,
+ size_t new_lcs_length) :
+ range_left(new_range_left),
+ range_right(new_range_right),
+ range_left_rc(new_range_left_rc),
+ range_right_rc(new_range_right_rc),
+ lcs_length(new_lcs_length) { }
+
+ OverlappedValue() :
+ range_left(),
+ range_right(),
+ range_left_rc(),
+ range_right_rc(),
+ lcs_length() { }
+
+ string ToString() const {
+ stringstream ss;
+ ss << "(" << range_left.start_pos << ", " << range_left.end_pos << "), (" <<
+ range_right.start_pos << ", " << range_right.end_pos << "): " << lcs_length;
+ return ss.str();
+ }
+ };
+
+private:
+ class OverlappedKeyComparator {
+ public:
+ bool operator()(const OverlappedKey &obj1, const OverlappedKey &obj2) const {
+ return obj1.id() < obj2.id();
+ }
+ };
+
+ map<OverlappedKey, OverlappedValue, OverlappedKeyComparator> overlap_map_;
+ map<pair<size_t, size_t>, pair<Range, Range> > pair_overlap_map_;
+
+ void RemoveElement(OverlappedKey key) {
+ overlap_map_.erase(key);
+ pair_overlap_map_.erase(make_pair(key.id1, key.id2));
+ pair_overlap_map_.erase(make_pair(key.id2_rc, key.id1_rc));
+ }
+
+ void AddElement(OverlappedKey key, OverlappedValue value) {
+ overlap_map_[key] = value;
+ pair_overlap_map_[make_pair(key.id1, key.id2)] =
+ make_pair(value.range_left, value.range_right);
+ pair_overlap_map_[make_pair(key.id2_rc, key.id1_rc)] =
+ make_pair(value.range_right_rc, value.range_left_rc);
+ }
+
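+ // If a reverse/symmetric variant of this key is already stored, keep only
+ // the variant with the longer LCS; otherwise simply insert the new element.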
+ void ProcessReverseKey(OverlappedKey key, OverlappedValue value,
+ OverlappedKey reverse_key) {
+ if(overlap_map_.find(reverse_key) == overlap_map_.end())
+ AddElement(key, value);
+ else
+ if(overlap_map_[reverse_key].lcs_length < value.lcs_length) {
+ AddElement(key, value);
+ RemoveElement(reverse_key);
+ }
+ }
+
+public:
+ OverlappedContigsMap(size_t min_lcs_length) :
+ min_lcs_length_(min_lcs_length) { }
+
+ void Add(OverlappedKey key, OverlappedValue value) {
+ if(value.lcs_length < min_lcs_length_)
+ return;
+ ProcessReverseKey(key, value, key.Reverse1());
+ ProcessReverseKey(key, value, key.Reverse2());
+ ProcessReverseKey(key, value, key.Reverse3());
+ ProcessReverseKey(key, value, key.Reverse4());
+ }
+
+ void PrintMap() {
+ for(auto it = overlap_map_.begin(); it != overlap_map_.end(); it++) {
+ TRACE(it->first.ToString() << " - " << it->second.ToString());
+ }
+ }
+
+ size_t Size() { return overlap_map_.size(); }
+
+ typedef map<OverlappedKey, OverlappedValue, OverlappedKeyComparator>::const_iterator overlap_map_iter;
+
+ overlap_map_iter begin() const { return overlap_map_.begin(); }
+
+ overlap_map_iter end() const { return overlap_map_.end(); }
+
+ pair<Range, Range> Ranges(size_t id1, size_t id2) {
+ return pair_overlap_map_[make_pair(id1, id2)];
+ }
+
+private:
+ DECL_LOGGER("OverlappedContigsMap");
+};
+
+ostream& operator<<(ostream& os, const OverlappedContigsMap& obj) {
+ for(auto it = obj.begin(); it != obj.end(); it++)
+ os << it->first.ToString() << " - " << it->second.ToString() << endl;
+ return os;
+}
+
+//--------------------------------------------------------------------------------------------
+
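+ // Builds an overlap graph over the consensus contigs: for every pair of
+ // contigs sharing vertices (found via the path index) it computes the LCS of
+ // their vertex sequences, decides whether the paths overlap, records the
+ // overlap in an OverlappedContigsMap, simplifies the resulting graph and
+ // finally glues contigs along long paths into composite contigs.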
+class OverlapCorrector : public LoopBulgeDeletionCorrector{
+ size_t k_value_;
+
+ struct overlap_res {
+ bool correctness;
+ size_t size;
+
+ overlap_res(bool over_corr, size_t over_size) :
+ correctness(over_corr),
+ size(over_size) { }
+
+ overlap_res() :
+ correctness(false),
+ size(0) { }
+ };
+
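+ // Checks whether last_path can be glued after first_path: both hanging
+ // tails must be short enough and, if configured, their starts/ends must be
+ // reachable from a common vertex. On success returns the total tail length.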
+ // todo insert check of bulge sides
+ overlap_res IsOverlapCorrect(vector<EdgeId> first_path, vector<size_t> first_pos,
+ vector<EdgeId> last_path, vector<size_t> last_pos){
+
+ VERIFY(first_pos.size() == last_pos.size());
+
+ if(first_pos.size() <= 1)
+ return overlap_res();
+
+// cout << "Left tail length - " << GetLeftTailLength(last_path, last_pos) << endl;
+// cout << "Right tail length - " << GetRightTailLength(first_path, first_pos) << endl;
+
+ if(IsLeftTailCorrect(last_path, last_pos) && IsRightTailCorrect(first_path, first_pos)){
+
+ size_t first_start = ConvInd(first_pos[0], first_path.size());
+ size_t last_end = ConvInd(last_pos[last_pos.size() - 1], last_path.size());
+
+ // check that the start of the left tail is reachable
+ bool is_left_tail_correct = true;
+ if(IsLeftTailExist(last_path, last_pos) ){
+
+ if(dsp_cfg::get().cc.tails_lie_on_bulges){
+ VertexId start1 = g_.EdgeStart(first_path[0]);
+ VertexId start2 = g_.EdgeStart(last_path[0]);
+
+ auto path_searcher = DijkstraHelper<Graph>::CreateBackwardBoundedDijkstra(g_,
+ dsp_cfg::get().pbr.max_bulge_nucls_len);
+ path_searcher.Run(start1);
+ auto reached_vert1 = path_searcher.ReachedVertices();
+
+ path_searcher.Run(start2);
+ auto reached_vert2 = path_searcher.ReachedVertices();
+
+ for(size_t i = 0; i < first_start; i++){
+ VertexId cur_vert = g_.EdgeStart(first_path[i]);
+ reached_vert1.push_back(cur_vert);
+ }
+
+ bool common_vertex_exists = false;
+ for(auto v1 = reached_vert1.begin(); v1 != reached_vert1.end(); v1++)
+ for(auto v2 = reached_vert2.begin(); v2 != reached_vert2.end(); v2++)
+ if(*v1 == *v2){
+ common_vertex_exists = true;
+ break;
+ }
+ is_left_tail_correct = common_vertex_exists;
+ }
+ // else: tails_lie_on_bulges is disabled, is_left_tail_correct stays true
+ }
+
+ if(!is_left_tail_correct)
+ return overlap_res();
+
+ // check that the start of the right tail is reachable
+ bool is_right_tail_correct = true;
+ if(IsRightTailExist(first_path, first_pos)){
+
+ if(dsp_cfg::get().cc.tails_lie_on_bulges){
+ size_t first_path_size = first_path.size(),
+ last_path_size = last_path.size();
+
+ VertexId end1 = g_.EdgeStart(first_path[first_path_size - 1]);
+ VertexId end2 = g_.EdgeStart(last_path[last_path_size - 1]);
+
+ auto path_searcher = DijkstraHelper<Graph>::CreateBackwardBoundedDijkstra(g_,
+ dsp_cfg::get().pbr.max_bulge_nucls_len);
+ path_searcher.Run(end1);
+ auto reached_vert1 = path_searcher.ReachedVertices();
+
+ path_searcher.Run(end2);
+ auto reached_vert2 = path_searcher.ReachedVertices();
+
+ for(size_t i = last_end; i < last_path.size(); i++){
+ VertexId cur_vert = g_.EdgeEnd(last_path[i]);
+ reached_vert2.push_back(cur_vert);
+ }
+
+ bool common_vertex_exists = false;
+ for(auto v1 = reached_vert1.begin(); v1 != reached_vert1.end(); v1++)
+ for(auto v2 = reached_vert2.begin(); v2 != reached_vert2.end(); v2++)
+ if(*v1 == *v2){
+ common_vertex_exists = true;
+ break;
+ }
+ is_right_tail_correct = common_vertex_exists;
+ }
+ }
+
+ if(is_right_tail_correct)
+ return overlap_res(true, GetLeftTailLength(last_path, last_pos) +
+ GetRightTailLength(first_path, first_pos));
+ }
+ return overlap_res();
+
+ }
+
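+ // Checks both orientations of a potential overlap between path1 and path2
+ // after verifying that their LCS is consistent; returns the two overlap_res
+ // results (path2 extended by path1, and path1 extended by path2).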
+ pair<overlap_res, overlap_res> ArePathsOverlapped(vector<EdgeId> path1, vector<size_t> pos1,
+ vector<EdgeId> path2, vector<size_t> pos2){
+
+ if(path1.size() == 0 || path2.size() == 0)
+ return make_pair(overlap_res(), overlap_res());
+
+ VERIFY(pos1.size() == pos2.size());
+
+ if(pos1.size() <= 1)
+ return make_pair(overlap_res(), overlap_res());
+
+ if(!IsLCSCorrect(path1, pos1, path2, pos2))
+ return make_pair(overlap_res(), overlap_res());
+
+ return make_pair(IsOverlapCorrect(path2, pos2, path1, pos1), IsOverlapCorrect(path1, pos1, path2, pos2));
+ }
+
+ string get_composite_contig_name(size_t i, size_t length){
+ stringstream ss;
+ ss << i << "_contig_" << length << "_length";
+ return ss.str();
+ }
+
+ void FillOverlapGraphByMap(OverlappedContigsMap &overlap_map, OverlapGraph &graph) {
+ for(auto it = overlap_map.begin(); it != overlap_map.end(); it++) {
+ graph.AddNeighVertices(it->first.id1, it->first.id2, it->second.lcs_length);
+ graph.AddNeighVertices(it->first.id2_rc, it->first.id1_rc, it->second.lcs_length);
+ }
+ }
+
+public:
+ OverlapCorrector(Graph &g, size_t k_value, size_t min_overlap_length, VertexPathIndex &path_index) :
+ LoopBulgeDeletionCorrector(g,
+ k_value,
+ dsp_cfg::get().cc.max_loop_length,
+ dsp_cfg::get().pbr.max_bulge_nucls_len,
+ min_overlap_length,
+ path_index),
+ k_value_(k_value) {}
+
+ ContigStoragePtr Correct(ContigStoragePtr contigs) {
+
+ INFO("Computing overlaps starts");
+
+ OverlappedContigsMap overlap_map(dsp_cfg::get().cc.min_overlap_size);
+
+ OverlapGraph og;
+ vector<size_t> vertices;
+ vector<size_t> id, rc_id;
+ for(size_t i = 0; i < contigs->Size(); i++){
+ vertices.push_back((*contigs)[i]->id());
+ id.push_back((*contigs)[i]->id());
+ rc_id.push_back((*contigs)[i]->rc_id());
+ }
+ og.InitializeVertexSet(vertices, id, rc_id);
+
+ vector<vector<VertexId> > seqs;
+ for(size_t i = 0; i < contigs->Size(); i++){
+ vector<VertexId> seq = GetListOfVertices((*contigs)[i]->path_seq());
+ seqs.push_back(seq);
+ }
+ LCSCalculator<VertexId> lcs_calc;
+ set<pair<int, int> > processed_pairs;
+
+ for(size_t i = 0; i < contigs->Size(); i++){
+ auto path1 = (*contigs)[i]->path_seq();
+ size_t id1 = (*contigs)[i]->id();
+ size_t rc_id1 = (*contigs)[i]->rc_id();
+ auto contigs_for_processing = path_index_.GetPathsIntersectedWith(path1);
+ for(auto it = contigs_for_processing.begin(); it != contigs_for_processing.end(); it++){
+ size_t j = *it;
+ size_t id2 = (*contigs)[j]->id();
+ size_t rc_id2 = (*contigs)[j]->rc_id();
+ bool need_process = !((i % 2 == 0 && i + 1 == j) || j <= i);
+ need_process = need_process && (processed_pairs.find(pair<int, int>(rc_id1, rc_id2)) ==
+ processed_pairs.end());
+ if(need_process){
+ processed_pairs.insert(pair<int, int>(id1, id2));
+ auto path2 = (*contigs)[j]->path_seq();
+ auto lcs_res = lcs_calc.LCS(seqs[i], seqs[j]);
+ vector<size_t> pos1, pos2;
+ auto pos_vectors_pair = GetBestPosVectors(lcs_calc, path1, seqs[i], path2, seqs[j], lcs_res);
+ pos1 = pos_vectors_pair.first;
+ pos2 = pos_vectors_pair.second;
+
+ {
+ TRACE("--------------------------------");
+ size_t id_i = id1, id_j = id2;
+ TRACE("Indexes " << i << " " << j );
+ TRACE("IDs " << id_i << " " << id_j);
+ TRACE("LCS string : " << VerticesVectorToString(g_, lcs_res));
+ TRACE("Path1. " << SimplePathWithVerticesToString(g_, path1));
+ TRACE("Pos1. " << VectorToString<size_t>(pos1));
+ TRACE("Path2. " << SimplePathWithVerticesToString(g_, path2));
+ TRACE("Pos2. " << VectorToString<size_t>(pos2));
+ }
+
+ // Overlapping
+ auto overlap_result = ArePathsOverlapped(path1, pos1, path2, pos2);
+ bool is_overlaped = overlap_result.first.correctness ||
+ overlap_result.second.correctness;
+
+ if(is_overlaped){
+
+ size_t first_id, last_id;
+ vector<EdgeId> first_path, last_path;
+ vector<size_t> first_pos, last_pos;
+
+ if(overlap_result.first.correctness && overlap_result.second.correctness){
+ if(overlap_result.first.size < overlap_result.second.size){
+ first_id = id2; last_id = id1;
+ }
+ else {
+ first_id = id1; last_id = id2;
+ }
+ }
+ else{
+ if(overlap_result.first.correctness) {
+ first_id = id2; last_id = id1;
+ }
+ else {
+ first_id = id1; last_id = id2;
+ }
+ }
+
+ first_path = (first_id == id1) ? path1 : path2;
+ last_path = (last_id == id1) ? path1 : path2;
+ first_pos = (first_id == id1) ? pos1 : pos2;
+ last_pos = (last_id == id1) ? pos1 : pos2;
+
+ size_t rc_first_id = contigs->GetContigById(first_id)->rc_id();
+ size_t rc_last_id = contigs->GetContigById(last_id)->rc_id();
+
+ size_t lcs_len1 = GetLCSLengthByPath(path1, pos1);
+ size_t lcs_len2 = GetLCSLengthByPath(path2, pos2);
+
+ Range overlap_first(first_pos[0], first_pos[first_pos.size() - 1]);
+ Range overlap_last(last_pos[0], last_pos[last_pos.size() - 1]);
+
+ Range overlap_first_rc(first_path.size() - overlap_first.end_pos,
+ first_path.size() - overlap_first.start_pos);
+ Range overlap_last_rc(last_path.size() - overlap_last.end_pos,
+ last_path.size() - overlap_last.start_pos);
+
+ overlap_map.Add(
+ OverlappedContigsMap::OverlappedKey(first_id, last_id, rc_first_id, rc_last_id),
+ OverlappedContigsMap::OverlappedValue(overlap_first, overlap_last,
+ overlap_first_rc, overlap_last_rc, max<size_t>(lcs_len1, lcs_len2)));
+
+ TRACE(first_id << " - " << last_id << ". " << overlap_first.start_pos << " - " <<
+ overlap_first.end_pos << ", " << overlap_last.start_pos << " - " <<
+ overlap_last.end_pos);
+
+ TRACE(rc_last_id << " - " << rc_first_id << ". " << overlap_last_rc.start_pos << " - " <<
+ overlap_last_rc.end_pos << ", " << overlap_first_rc.start_pos << " - " <<
+ overlap_first_rc.end_pos);
+ }
+ }
+ }
+ }
+
+ TRACE("Overlapped contigs map. Size - " << ToString(overlap_map.Size()) << endl <<
+ overlap_map);
+
+ FillOverlapGraphByMap(overlap_map, og);
+
+ string fname = dsp_cfg::get().io.output_dir + "default_overlap_graph.dot";
+ OverlapgraphToDot(fname, og, contigs);
+
+ INFO("Overlap graph with " + ToString(og.Vertices().size()) + " vertices and " +
+ ToString(og.Edges().size()) + " edges constructed");
+
+ auto og_vertices = og.Vertices();
+ auto edges = og.Edges();
+
+ SimplifyOverlapGraph(og, 10, 5);
+
+ INFO("Simplified overlap graph contains " + ToString(og.Vertices().size()) + " vertices and " +
+ ToString(og.Edges().size()) + " edges");
+
+ fname = dsp_cfg::get().io.output_dir + "simplified_overlap_graph.dot";
+ OverlapgraphToDot(fname, og, contigs);
+
+ UniquePathsSearcher ps(og);
+ auto paths = ps.FindLongPaths();
+ TRACE(paths.size() << " paths were found in the overlap graph");
+
+ ContigStoragePtr new_storage(new SimpleContigStorage());
+ size_t i = 1;
+ for(auto p = paths.begin(); p != paths.end(); p++){
+ VERIFY(p->size() > 0);
+ if(p->size() == 1){
+ TRACE("Consensus contig " << i << " is simple");
+ auto contig = contigs->GetContigById((*p)[0]);
+ MappingContigPtr new_rc(new ReplacedNameMappingContig(contig,
+ get_composite_contig_name(i, contig->length())));
+ new_storage->Add(new_rc);
+ }
+ else{
+ TRACE("Consensus contig " << i << " is composite");
+
+ vector<pair<Range, Range> > overlaps;
+ vector<MappingContigPtr> mc_vect;
+ for(size_t j = 0; j < p->size() - 1; j++)
+ overlaps.push_back(overlap_map.Ranges((*p)[j], (*p)[j + 1]));
+
+ for(auto id = p->begin(); id != p->end(); id++)
+ mc_vect.push_back(contigs->GetContigById(*id));
+
+ MappingContigPtr new_mc(new CompositeMappingContig(g_, k_value_,
+ mc_vect, overlaps));
+ new_mc->ChangeName(get_composite_contig_name(i, new_mc->length()));
+ new_storage->Add(new_mc);
+ }
+ i++;
+ }
+
+ INFO("Computing overlaps ends");
+
+ return new_storage;
+ }
+
+private:
+ DECL_LOGGER("OverlapCorrector");
+};
+
+}
diff --git a/src/projects/dipspades/consensus_contigs_constructor/contig_correctors/redundant_contig_remover.hpp b/src/projects/dipspades/consensus_contigs_constructor/contig_correctors/redundant_contig_remover.hpp
new file mode 100644
index 0000000..f8fac9d
--- /dev/null
+++ b/src/projects/dipspades/consensus_contigs_constructor/contig_correctors/redundant_contig_remover.hpp
@@ -0,0 +1,891 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "abstract_contig_corrector.hpp"
+
+using namespace debruijn_graph;
+
+namespace dipspades {
+
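+ // Detects redundant contigs: two contigs are compared via the LCS of their
+ // vertex sequences; differing regions must be short loops or diploid bulges
+ // and the hanging tails must be short. The contig with the shorter total
+ // tails is marked redundant and removed from the storage.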
+class LoopBulgeDeletionCorrector : public AbstractContigCorrector{
+
+protected:
+ size_t k_value_;
+
+ size_t max_loop_length_;
+ size_t max_tail_length_;
+ size_t min_lcs_length_;
+ VertexPathIndex &path_index_;
+
+ ostream &out_;
+
+public:
+ vector<VertexId> GetListOfVertices(vector<EdgeId> path){
+ return get_list_of_vertices_in_path(g_, path);
+ }
+
+ size_t ConvInd(size_t ind, size_t size){
+ if(ind == size)
+ return ind - 1;
+ return ind;
+ }
+
+ bool IsLoopCorrect(vector<EdgeId> path, size_t i1, size_t i2){
+ VERIFY(i1 < path.size());
+ VERIFY(i2 < path.size());
+
+ size_t length = 0;
+ for(size_t i = i1; i <= i2; i++)
+ length += g_.length(path[i]);
+
+ return length <= max_loop_length_;
+ }
+
+ bool IsRegionLoop(vector<EdgeId> path, size_t i1, size_t i2){
+ return g_.EdgeStart(path[i1]) == g_.EdgeEnd(path[i2]);
+ }
+
+ bool IsPathCorrect(vector<EdgeId> path, vector<size_t> pos){
+ return (IsLeftTailCorrect(path, pos) && IsRightTailCorrect(path, pos));
+ }
+
+ bool IsRegionBulgeSide(vector<EdgeId> path, size_t ind1, size_t ind2){
+ return g_.EdgeStart(path[ind1]) != g_.EdgeEnd(path[ind2]);
+ }
+
+ bool AreRegionsBulge(vector<EdgeId> path1, size_t i_11, size_t i_12,
+ vector<EdgeId> path2, size_t i_21, size_t i_22){
+ return IsRegionBulge(g_, CutSubpathByRegion(path1, make_pair(i_11, i_12)),
+ CutSubpathByRegion(path2, make_pair(i_21, i_22)));
+ }
+
+ bool AreRegionsDiploidBulge(vector<EdgeId> path1, size_t i_11, size_t i_12,
+ vector<EdgeId> path2, size_t i_21, size_t i_22){
+
+ TRACE("Bulge: pos1: " << i_11 << " - " << i_12 << ", pos2: " << i_21 << " - " << i_22 );
+
+ if(dsp_cfg::get().cc.align_bulge_sides){
+ Bulge bulge(g_, k_value_, path1, make_pair(i_11, i_12), path2, make_pair(i_21, i_22));
+ return bulge.IsBulgeDiploid(dsp_cfg::get().pbr.rel_bulge_align,
+ dsp_cfg::get().pbr.rel_bulge_align);
+ }
+ return true;
+ }
+
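+ // Walks the aligned positions of both paths; every gap between consecutive
+ // LCS positions must be either a short loop or a bulge whose sides align
+ // (a diploid bulge), otherwise the LCS is rejected.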
+ bool IsLCSCorrect(vector<EdgeId> path1, vector<size_t> pos1,
+ vector<EdgeId> path2, vector<size_t> pos2){
+
+ VERIFY(pos1.size() == pos2.size());
+
+ size_t pos_len = pos1.size();
+ if(pos_len <= 1)
+ return false;
+
+ size_t lcs_len = min<size_t>(GetLCSLengthByPath(path1, pos1), GetLCSLengthByPath(path2, pos2));
+ size_t path1_len = GetPathLength(g_, path1), path2_len = GetPathLength(g_, path2);
+
+ TRACE("LCS length - " << lcs_len);
+ TRACE("Path length1 - " << path1_len << ", path length2 - " << path2_len);
+
+ if(lcs_len <= min_lcs_length_ &&
+ min<size_t>(path1_len, path2_len) > min_lcs_length_){
+ return false;
+ }
+
+ for(size_t i = 0; i < pos_len - 1; i++){
+
+ TRACE("Pos1 - " << pos1[i] << ", " << pos1[i + 1]);
+ TRACE("Pos2 - " << pos2[i] << ", " << pos2[i + 1]);
+ // if the matched positions are not adjacent in at least one of the paths
+ if(pos1[i] + 1 != pos1[i + 1] || pos2[i] + 1 != pos2[i + 1]){
+
+ TRACE("1st loop checking");
+ bool is_1st_loop = false;
+ bool is_1st_corr = true;
+
+ size_t i_11, i_12;
+ if(pos1[i] + 1 != pos1[i + 1]){
+ TRACE("Positions are not consecutive");
+ // it may be loop
+ i_11 = ConvInd(pos1[i], path1.size());
+ i_12 = ConvInd(pos1[i + 1], path1.size()) - 1;
+
+ is_1st_loop = IsRegionLoop(path1, i_11, i_12);
+ TRACE("Is loop - " << is_1st_loop);
+ if(is_1st_loop){
+ is_1st_corr = IsLoopCorrect(path1, i_11, i_12);
+ }
+ else{ // then region is bulge
+ VERIFY(IsRegionBulgeSide(path1, i_11, i_12));
+ }
+ }
+ else{
+ i_11 = pos1[i];
+ i_12 = pos1[i];
+ }
+
+ TRACE("2nd loop checking");
+ bool is_2nd_loop = false;
+ bool is_2nd_corr = true;
+ size_t i_21, i_22;
+ if(pos2[i] + 1 != pos2[i + 1]){
+ TRACE("Positions are not consecutive");
+ // it may be loop
+ i_21 = ConvInd(pos2[i], path2.size());
+ i_22 = ConvInd(pos2[i + 1], path2.size()) - 1;
+
+ is_2nd_loop = IsRegionLoop(path2, i_21, i_22);
+ TRACE("Is loop - " << is_2nd_loop );
+ if(is_2nd_loop){
+ is_2nd_corr = IsLoopCorrect(path2, i_21, i_22);
+ }
+ else{
+ VERIFY(IsRegionBulgeSide(path2, i_21, i_22));
+ }
+ }
+ else{
+ i_21 = pos2[i];
+ i_22 = pos2[i];
+ }
+
+ if(!is_1st_loop && !is_2nd_loop){
+
+ i_12 = (pos1[i + 1] == path1.size()) ? path1.size() - 1 : i_12;
+ i_22 = (pos2[i + 1] == path2.size()) ? path2.size() - 1 : i_22;
+
+ if(AreRegionsBulge(path1, i_11, i_12, path2, i_21, i_22))
+ if(!AreRegionsDiploidBulge(path1, i_11, i_12, path2, i_21, i_22))
+ return false;
+ }
+ else
+ if(!is_1st_corr || !is_2nd_corr){
+ return false;
+ }
+ }
+ }
+
+ return true;
+ }
+
+ size_t GetLCSLengthByPath(vector<EdgeId> path, vector<size_t> pos){
+ if(pos.size() <= 1)
+ return 0;
+ size_t pos_len = pos.size();
+ size_t last_pos = pos[pos_len - 1];
+
+ size_t ind_start = ConvInd(pos[0], path.size());
+ size_t ind_end = ConvInd(last_pos, path.size());
+ if(last_pos != path.size())
+ ind_end--;
+
+ size_t len = 0;
+ for(size_t i = ind_start; i <= ind_end; i++)
+ len += g_.length(path[i]);
+
+ return len;
+ }
+
+ size_t GetLeftTailLength(vector<EdgeId> path, vector<size_t> pos){
+ if(pos.size() <= 1)
+ return 0;
+
+ size_t first_pos = ConvInd(pos[0], path.size());
+
+ size_t tail_len = 0;
+ for(size_t i = 0; i < first_pos; i++)
+ tail_len += g_.length(path[i]);
+
+ return tail_len;
+ }
+
+ bool IsLeftTailCorrect(vector<EdgeId> path, vector<size_t> pos){
+ if(pos.size() <= 1)
+ return false;
+
+ size_t tail_len = GetLeftTailLength(path, pos);
+// TRACE("Left tail length - " << tail_len );
+ return (tail_len <= max_tail_length_);
+ }
+
+ size_t GetRightTailLength(vector<EdgeId> path, vector<size_t> pos){
+ if(pos.size() <= 1)
+ return 0;
+
+ size_t last_pos = pos[pos.size() - 1];
+
+ size_t tail_len = 0;
+ for(size_t i = last_pos; i < path.size(); i++)
+ tail_len += g_.length(path[i]);
+
+ return tail_len;
+ }
+
+ bool IsRightTailCorrect(vector<EdgeId> path, vector<size_t> pos){
+ if(pos.size() <= 1)
+ return false;
+
+ size_t tail_len = GetRightTailLength(path, pos);
+// TRACE("Right tail length - " << tail_len );
+ return (tail_len <= max_tail_length_);
+ }
+
+ bool AreLeftTailsCorrect(vector<EdgeId> path1, vector<size_t> pos1,
+ vector<EdgeId> path2, vector<size_t> pos2){
+
+ VERIFY(pos1.size() == pos2.size());
+
+ if(pos1.size() <= 1)
+ return false;
+
+ size_t tail_length1 = GetLeftTailLength(path1, pos1);
+ size_t tail_length2 = GetLeftTailLength(path2, pos2);
+
+ if(min<size_t>(tail_length1, tail_length2) > max_tail_length_)
+ return false;
+
+ VertexId start1 = g_.EdgeStart(path1[0]); //g_.EdgeStart(path1[first_pos1]);
+ VertexId start2 = g_.EdgeStart(path2[0]); //g_.EdgeStart(path2[first_pos2]);
+
+ bool are_tails_correct = false;
+ if(g_.IncomingEdgeCount(start1) == 0 &&
+ g_.IncomingEdgeCount(start2) == 0){
+ are_tails_correct = true;
+ }
+ else{
+
+ if(dsp_cfg::get().cc.tails_lie_on_bulges){
+ // find a vertex v such that both path starts are reachable from v
+ auto path_searcher1 = DijkstraHelper<Graph>::CreateBackwardBoundedDijkstra(g_,
+ max_tail_length_);
+ path_searcher1.Run(start1);
+ auto reached_vert1 = path_searcher1.ReachedVertices();
+
+ auto path_searcher2 = DijkstraHelper<Graph>::CreateBackwardBoundedDijkstra(g_,
+ max_tail_length_);
+ path_searcher2.Run(start2);
+ auto reached_vert2 = path_searcher2.ReachedVertices();
+
+ for(size_t i = 0; i < pos1[0]; i++)
+ reached_vert1.push_back(g_.EdgeStart(path1[i]));
+
+ for(size_t i = 0; i < pos2[0]; i++)
+ reached_vert2.push_back(g_.EdgeStart(path2[i]));
+
+ for(auto v1 = reached_vert1.begin(); v1 != reached_vert1.end(); v1++){
+ for(auto v2 = reached_vert2.begin(); v2 != reached_vert2.end(); v2++){
+ if(*v1 == *v2){
+ are_tails_correct = true;
+ break;
+ }
+ }
+ if(are_tails_correct)
+ break;
+ }
+ }
+ else{
+ are_tails_correct = true;
+ }
+ }
+
+ if(!are_tails_correct)
+ return false;
+
+ if(!dsp_cfg::get().cc.align_bulge_sides)
+ return true;
+
+ Sequence tail_seq1 = GetSequenceOfPathRegion(g_, k_value_, path1,
+ pair<size_t, size_t>(0, pos1[0] - 1));
+
+ Sequence tail_seq2 = GetSequenceOfPathRegion(g_, k_value_, path2,
+ pair<size_t, size_t>(0, pos2[0] - 1));
+
+ Sequence trim_seq1, trim_seq2;
+ if(min<size_t>(tail_seq1.size(), tail_seq2.size()) == tail_seq1.size()){
+ trim_seq1 = tail_seq1;
+ trim_seq2 = tail_seq2.Subseq(tail_seq2.size() - tail_seq1.size(),
+ tail_seq2.size());
+ }
+ else{
+ trim_seq1 = tail_seq1.Subseq(tail_seq1.size() - tail_seq2.size(),
+ tail_seq1.size());
+ trim_seq2 = tail_seq2;
+ }
+
+ if(trim_seq1.size() > max_tail_length_)
+ return false;
+
+ return RelAlignmentOfSequences(trim_seq1, trim_seq2) <=
+ dsp_cfg::get().pbr.rel_bulge_align;
+
+ }
+
+ bool AreRightTailsCorrect(vector<EdgeId> path1, vector<size_t> pos1,
+ vector<EdgeId> path2, vector<size_t> pos2){
+
+ VERIFY(pos1.size() == pos2.size());
+
+ if(pos1.size() <= 1)
+ return false;
+
+ size_t tail_length1 = GetRightTailLength(path1, pos1);
+ size_t tail_length2 = GetRightTailLength(path2, pos2);
+
+ if(min<size_t>(tail_length1, tail_length2) > max_tail_length_)
+ return false;
+
+ VertexId end1 = g_.EdgeEnd(path1[path1.size() - 1]);
+ VertexId end2 = g_.EdgeEnd(path2[path2.size() - 1]);
+
+ bool are_tails_correct = false;
+
+ if(g_.OutgoingEdgeCount(end1) == 0 && g_.OutgoingEdgeCount(end2) == 0){
+ are_tails_correct = true;
+ }
+ else{
+
+ if(dsp_cfg::get().cc.tails_lie_on_bulges){
+ // find a vertex v such that both path ends are reachable from v
+ auto path_searcher1 = DijkstraHelper<Graph>::CreateBackwardBoundedDijkstra(g_,
+ max_tail_length_);
+ path_searcher1.Run(end1);
+ auto reached_vert1 = path_searcher1.ReachedVertices();
+
+ auto path_searcher2 = DijkstraHelper<Graph>::CreateBackwardBoundedDijkstra(g_,
+ max_tail_length_);
+ path_searcher2.Run(end2);
+ auto reached_vert2 = path_searcher2.ReachedVertices();
+
+ for(size_t i = ConvInd(pos1[pos1.size() - 1], path1.size()); i < path1.size(); i++)
+ reached_vert1.push_back(g_.EdgeEnd(path1[i]));
+
+ for(size_t i = ConvInd(pos2[pos2.size() - 1], path2.size()); i < path2.size(); i++)
+ reached_vert2.push_back(g_.EdgeEnd(path2[i]));
+
+ for(auto v1 = reached_vert1.begin(); v1 != reached_vert1.end(); v1++){
+ for(auto v2 = reached_vert2.begin(); v2 != reached_vert2.end(); v2++){
+ if(*v1 == *v2){
+ are_tails_correct = true;
+ break;
+ }
+ }
+ if(are_tails_correct)
+ break;
+ }
+ }
+ else{
+ // tail lengths comparison?
+ are_tails_correct = true;
+ }
+ }
+
+ if(!are_tails_correct)
+ return false;
+
+ if(!dsp_cfg::get().cc.align_bulge_sides)
+ return true;
+
+ Sequence tail_seq1 = GetSequenceOfPathRegion(g_, k_value_, path1,
+ pair<size_t,size_t>(pos1[pos1.size() - 1], path1.size() - 1));
+
+ Sequence tail_seq2 = GetSequenceOfPathRegion(g_, k_value_, path2,
+ pair<size_t,size_t>(pos2[pos2.size() - 1], path2.size() - 1));
+
+ Sequence trim_seq1, trim_seq2;
+ if(min<size_t>(tail_seq1.size(), tail_seq2.size()) == tail_seq1.size()){
+ trim_seq1 = tail_seq1;
+ trim_seq2 = tail_seq2.Subseq(0, tail_seq1.size());
+ }
+ else{
+ trim_seq1 = tail_seq1.Subseq(0, tail_seq2.size());
+ trim_seq2 = tail_seq2;
+ }
+
+ if(trim_seq1.size() > max_tail_length_)
+ return false;
+
+ return (RelAlignmentOfSequences(trim_seq1, trim_seq2) <= dsp_cfg::get().pbr.rel_bulge_align);
+ }
+
+ bool IsLeftTailExist(vector<EdgeId>, vector<size_t> pos){
+ size_t first_index = pos[0]; //ConvInd(pos[0], path.size());
+ return (first_index != 0);
+ }
+
+ bool AreBothLeftTailsExist(vector<EdgeId> path1, vector<size_t> pos1,
+ vector<EdgeId> path2, vector<size_t> pos2){
+
+ VERIFY(pos1.size() == pos2.size());
+ if(pos1.size() == 0)
+ return false;
+
+ TRACE("Left: " << IsLeftTailExist(path1, pos2) << " " << IsLeftTailExist(path2, pos2) );
+ return (IsLeftTailExist(path1, pos1) && IsLeftTailExist(path2, pos2));
+ }
+
+ bool IsRightTailExist(vector<EdgeId> path, vector<size_t> pos){
+ size_t last_index = pos[pos.size() - 1]; //ConvInd(pos[pos.size() - 1], path.size());
+ return (last_index != path.size());
+ }
+
+ bool AreBothRightTailsExist(vector<EdgeId> path1, vector<size_t> pos1,
+ vector<EdgeId> path2, vector<size_t> pos2){
+ VERIFY(pos1.size() == pos2.size());
+ if(pos1.size() == 0)
+ return false;
+
+ TRACE("Right: " << IsRightTailExist(path1, pos1) << " " << IsRightTailExist(path2, pos2) );
+ return IsRightTailExist(path1, pos1) && IsRightTailExist(path2, pos2);
+
+ }
+
+ // yana todo replace
+ vector<VertexId> RearrangementSearch(vector<EdgeId> path1, vector<EdgeId> path2){
+
+ vector<VertexId> common_vertices;
+ if(path1.size() == 0 || path2.size() == 0)
+ return common_vertices;
+
+ map<VertexId, int> vertex_count;
+
+ set<VertexId> vertices1; vertices1.insert(g_.EdgeStart(path1[0]));
+ for(auto e = path1.begin(); e != path1.end(); e++){
+ vertices1.insert(g_.EdgeEnd(*e));
+ }
+
+ set<VertexId> vertices2; vertices2.insert(g_.EdgeStart(path2[0]));
+ for(auto e = path2.begin(); e != path2.end(); e++){
+ vertices2.insert(g_.EdgeEnd(*e));
+ }
+
+ for(auto v = vertices1.begin(); v != vertices1.end(); v++)
+ vertex_count[*v]++;
+
+ for(auto v = vertices2.begin(); v != vertices2.end(); v++)
+ vertex_count[*v]++;
+
+ for(auto it = vertex_count.begin(); it != vertex_count.end(); it++)
+ if(it->second == 2)
+ common_vertices.push_back(it->first);
+
+// TRACE("Common vertices: " );
+// PrintVectorOfVertices(cout, g_, common_vertices);
+ return common_vertices;
+ }
+
+ bool ArePathsCorrect(vector<EdgeId> path1, vector<size_t> pos1,
+ vector<EdgeId> path2, vector<size_t> pos2){
+
+ VERIFY(pos1.size() == pos2.size());
+
+ if(AreBothLeftTailsExist(path1, pos1, path2, pos2))
+ {
+ TRACE("Both left tails exist" );
+ bool tail_corr = AreLeftTailsCorrect(path1, pos1, path2, pos2);
+ if(!tail_corr){
+ TRACE("One of left tails is not correct" );
+ return false;
+ }
+ }
+
+ if(AreBothRightTailsExist(path1, pos1, path2, pos2))
+ {
+ TRACE("Both right tails exist" );
+ bool tail_corr = AreRightTailsCorrect(path1, pos1, path2, pos2);
+ if(!tail_corr){
+ TRACE("One of right tails is not correct" );
+ return false;
+ }
+ }
+
+ bool lcs_corr = IsLCSCorrect(path1, pos1, path2, pos2);
+
+ if(!lcs_corr){
+ TRACE("LCS is not correct" );
+ auto common_vert = RearrangementSearch(path1, path2);
+ if(common_vert.size() > pos1.size())
+ TRACE("Possible rearrangement!");
+ return false;
+ }
+
+ size_t lcs_length1 = GetLCSLengthByPath(path1, pos1),
+ lcs_length2 = GetLCSLengthByPath(path2, pos2);
+
+ return (min<size_t>(lcs_length1, lcs_length2) > 0);
+ }
+
+ bool IsPathRedundant(vector<EdgeId> path, vector<size_t> pos){
+ if(pos.size() <= 1) return true;
+ return IsLeftTailCorrect(path, pos) && IsRightTailCorrect(path, pos);
+ }
+
+ void CorrectPositionVertor(vector<EdgeId> path, vector<size_t> & pos){
+ if(pos.size() <= 1)
+ return;
+
+ for(size_t i = 0; i < pos.size() - 1; i++){
+ if(pos[i] + 1 != pos[i + 1]){
+
+ size_t i1 = ConvInd(pos[i], path.size()) + 1;
+ size_t i2 = ConvInd(pos[i + 1], path.size()) - 1;
+
+ if(IsRegionLoop(path, i1, i2))
+ if(!IsLoopCorrect(path, i1, i2)){
+ VertexId v;
+ if(pos[i + 1] == path.size())
+ v = g_.EdgeEnd(path[path.size() - 1]);
+ else
+ v = g_.EdgeStart(path[pos[i + 1]]);
+ for(size_t j = pos[i] + 1; j < pos[i + 1]; j++)
+ if(g_.EdgeStart(path[j]) == v){
+ pos[i + 1] = j;
+ break;
+ }
+ }
+ }
+ }
+ }
+
+ size_t GetNumberOfErrorsFromLCS(vector<EdgeId> path, vector<size_t> pos){
+ if(pos.size() <= 1)
+ return 0;
+
+ size_t error_num = 0;
+
+ for(size_t i = 0; i < pos.size() - 1; i++){
+ if(pos[i] + 1 != pos[i + 1]){
+
+ size_t i1 = ConvInd(pos[i], path.size()) + 1;
+ size_t i2 = ConvInd(pos[i + 1], path.size()) - 1;
+
+ if(IsRegionLoop(path, i1, i2)){
+ if(!IsLoopCorrect(path, i1, i2))
+ error_num++;
+ }
+ }
+ }
+
+ return error_num;
+ }
+
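+ // The LCS can be anchored to each path either from the left or from the
+ // right; pick, for each path, the anchoring with fewer incorrect loops and,
+ // on ties, the pair whose LCS lengths differ the least.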
+ pair<vector<size_t>, vector<size_t> > GetBestPosVectors(LCSCalculator<VertexId> & calc,
+ vector<EdgeId> path1, vector<VertexId> vert_path1,
+ vector<EdgeId> path2, vector<VertexId> vert_path2,
+ vector<VertexId> lcs){
+
+ // first path processing
+ auto pos_right1 = calc.GetPosVector(vert_path1, lcs);
+ auto pos_left1 = calc.GetPosVectorFromLeft(vert_path1, lcs);
+
+ bool equal_num_err1 = true;
+ vector<size_t> best_vect1;
+
+ {
+ size_t err_right1 = GetNumberOfErrorsFromLCS(path1, pos_right1);
+ size_t err_left1 = GetNumberOfErrorsFromLCS(path1, pos_left1);
+ equal_num_err1 = err_left1 == err_right1;
+ best_vect1 = (err_left1 < err_right1) ? pos_left1 : pos_right1;
+ }
+
+ size_t lcs_right1 = GetLCSLengthByPath(path1, pos_right1);
+ size_t lcs_left1 = GetLCSLengthByPath(path1, pos_left1);
+
+ // second path processing
+ auto pos_right2 = calc.GetPosVector(vert_path2, lcs);
+ auto pos_left2 = calc.GetPosVectorFromLeft(vert_path2, lcs);
+
+ bool equal_num_err2 = true;
+ vector<size_t> best_vect2;
+
+ {
+ size_t err_right2 = GetNumberOfErrorsFromLCS(path2, pos_right2);
+ size_t err_left2 = GetNumberOfErrorsFromLCS(path2, pos_left2);
+ equal_num_err2 = err_left2 == err_right2;
+ best_vect2 = (err_left2 < err_right2) ? pos_left2 : pos_right2;
+ }
+
+ size_t lcs_right2 = GetLCSLengthByPath(path2, pos_right2);
+ size_t lcs_left2 = GetLCSLengthByPath(path2, pos_left2);
+
+ if(equal_num_err1 && !equal_num_err2){
+
+ size_t best_lcs2 = GetLCSLengthByPath(path2, best_vect2);
+
+ if(abs_diff(lcs_right1, best_lcs2) < abs_diff(lcs_left1, best_lcs2))
+ return pair<vector<size_t>, vector<size_t> >(pos_right1, best_vect2);
+ else
+ return pair<vector<size_t>, vector<size_t> >(pos_left1, best_vect2);
+ }
+
+ if(!equal_num_err1 && equal_num_err2){
+ size_t best_lcs1 = GetLCSLengthByPath(path1, best_vect1);
+
+ if(abs_diff(lcs_right2, best_lcs1) < abs_diff(lcs_left2, best_lcs1))
+ return pair<vector<size_t>, vector<size_t> >(best_vect1, pos_right2);
+ else
+ return pair<vector<size_t>, vector<size_t> >(best_vect1, pos_left2);
+ }
+
+ if(equal_num_err1 && equal_num_err2){
+
+ // best pair computing
+ size_t left_left = abs_diff(lcs_left1, lcs_left2);
+ size_t left_right = abs_diff(lcs_left1, lcs_right2);
+ size_t right_left = abs_diff(lcs_right1, lcs_left2);
+ size_t right_right = abs_diff(lcs_right1, lcs_right2);
+
+ size_t min_diff = min<size_t>(min<size_t>(left_left, left_right),
+ min<size_t>(right_left, right_right));
+
+ if(min_diff == left_left){
+ return pair<vector<size_t>, vector<size_t> >(pos_left1, pos_left2);
+ }
+
+ if(min_diff == left_right){
+ return pair<vector<size_t>, vector<size_t> >(pos_left1, pos_right2);
+ }
+
+ if(min_diff == right_left){
+ return pair<vector<size_t>, vector<size_t> >(pos_right1, pos_left2);
+ }
+
+ if(min_diff == right_right){
+ return pair<vector<size_t>, vector<size_t> >(pos_right1, pos_right2);
+ }
+ }
+
+ return pair<vector<size_t>, vector<size_t> >(best_vect1, best_vect2);
+ }
+
+ vector<size_t> GetBestPosVector(LCSCalculator<VertexId> & calc, vector<EdgeId> path,
+ vector<VertexId> vert_path, vector<VertexId> lcs){
+
+ auto pos_right = calc.GetPosVector(vert_path, lcs);
+ auto pos_left = calc.GetPosVectorFromLeft(vert_path, lcs);
+
+ size_t err_right = GetNumberOfErrorsFromLCS(path, pos_right);
+ size_t err_left = GetNumberOfErrorsFromLCS(path, pos_left);
+
+ if(min<size_t>(err_left, err_right) == err_left)
+ return pos_left;
+ else
+ return pos_right;
+ }
+
+ void InitializeMap(ContigStoragePtr contigs){
+ for(size_t i = 0; i < contigs->Size(); i++){
+ size_t id = (*contigs)[i]->id();
+ res.redundancy_map.AddNewKey(id);
+ }
+ }
+
+ void AddRedundantContig(ContigStoragePtr contigs, size_t index_red, size_t index_main){
+ size_t id_main = (*contigs)[index_main]->id(),
+ id_rc_main = (*contigs)[index_main]->rc_id();
+ size_t id_red = (*contigs)[index_red]->id(),
+ id_rc_red = (*contigs)[index_red]->rc_id();
+ redundant_contigs.insert(id_red);
+ redundant_contigs.insert(id_rc_red);
+ // current contig
+ res.redundancy_map.AddNewPair(id_main, id_red);
+ res.redundancy_map.AddNewPair(id_rc_main, id_rc_red);
+ }
+
+ CorrectionResult res;
+ set<size_t> redundant_contigs;
+
+public:
+ LoopBulgeDeletionCorrector(Graph &g, size_t k_value, size_t max_loop_length,
+ size_t max_tail_length, size_t min_lcs_length, VertexPathIndex &path_index,
+ ostream &out = cout) : AbstractContigCorrector(g), k_value_(k_value),
+ path_index_(path_index), out_(out) {
+
+ max_loop_length_ = max_loop_length;
+ max_tail_length_ = max_tail_length;
+ min_lcs_length_ = min_lcs_length;
+ }
+
+ virtual ContigStoragePtr Correct(ContigStoragePtr contigs) {
+
+ INFO("Computing redundant contigs starts");
+
+ redundant_contigs.clear();
+
+ InitializeMap(contigs);
+
+ LCSCalculator<VertexId> lcs_calc;
+
+ vector<vector<VertexId> > seqs;
+ for(size_t i = 0; i < contigs->Size(); i++){
+ vector<VertexId> seq = GetListOfVertices((*contigs)[i]->path_seq());
+ seqs.push_back(seq);
+ }
+
+ set<size_t> processed_contigs;
+ set<size_t> absolutely_redundant;
+
+ size_t contigs_number = seqs.size();
+ double processed_perc = 0.1;
+ double processed_step = 0.1;
+
+ for(size_t i = 0; i < seqs.size() - 1; i++){
+
+ size_t id_i = (*contigs)[i]->id();
+ size_t rc_id_i = (*contigs)[i]->rc_id();
+
+ processed_contigs.insert(id_i);
+
+ if(processed_contigs.find(rc_id_i) == processed_contigs.end() &&
+ absolutely_redundant.find(i) == absolutely_redundant.end()){
+
+ vector<EdgeId> path1 = (*contigs)[i]->path_seq();
+ set<int> analyzed_contigs;
+
+ auto contigs_for_analyze = path_index_.GetPathsIntersectedWith(path1);
+ for(auto it = contigs_for_analyze.begin(); it != contigs_for_analyze.end(); it++){
+
+ size_t j = *it;
+ size_t id_j = (*contigs)[j]->id();
+ size_t rc_id_j = (*contigs)[j]->rc_id();
+
+ bool need_process = !((i % 2 == 0 && i + 1 == j) || j <= i);
+ need_process = need_process &&
+ absolutely_redundant.find(j) == absolutely_redundant.end();
+ if(need_process){
+
+ vector<EdgeId> path2 = (*contigs)[j]->path_seq();
+ vector<VertexId> lcs_res = lcs_calc.LCS(seqs[i], seqs[j]);
+ vector<size_t> pos1, pos2;
+
+ auto pos_vectors_pair = GetBestPosVectors(lcs_calc, path1, seqs[i], path2, seqs[j], lcs_res);
+ pos1 = pos_vectors_pair.first;
+ pos2 = pos_vectors_pair.second;
+
+ {
+ TRACE("--------------------------------");
+ TRACE("Indexes " << i << " " << j);
+ TRACE("IDs " << id_i << " " << id_j);
+ TRACE("RC_Ids " << rc_id_i << " " << rc_id_j);
+
+ TRACE("Path1. " << SimplePathWithVerticesToString(g_, path1));
+ TRACE("Path2. " << SimplePathWithVerticesToString(g_, path2));
+
+ TRACE("LCS string: " << VerticesVectorToString(g_, lcs_res));
+
+ TRACE("Pos1. " << VectorToString<size_t>(pos1));
+ TRACE("Pos2. " << VectorToString<size_t>(pos2));
+ }
+
+ if(pos1.size() > 1){
+
+ bool paths_corr = ArePathsCorrect(path1, pos1, path2, pos2);
+
+ {
+ TRACE("ArePathsCorrect - " << paths_corr);
+ }
+
+ if(paths_corr){
+
+ size_t first_tail1 = GetLeftTailLength(path1, pos1);
+ size_t first_tail2 = GetRightTailLength(path1, pos1);
+ size_t first_tails = first_tail1 + first_tail2;
+
+ size_t second_tail1 = GetLeftTailLength(path2, pos2);
+ size_t second_tail2 = GetRightTailLength(path2, pos2);
+ size_t second_tails = second_tail1 + second_tail2;
+
+ bool first_path_red = IsPathRedundant(path1, pos1);
+ bool second_path_red = IsPathRedundant(path2, pos2);
+
+ {
+ TRACE("\tFirst tails length - " << first_tails);
+ TRACE("\tFirst path is redundant - " << first_path_red);
+ TRACE("\tSecond tails length - " << second_tails);
+ TRACE("\tSecond path is redundant - " << second_path_red);
+ }
+
+ if(first_path_red && second_path_red){
+ if(first_tails < second_tails){
+ TRACE(id_i << " is redundant");
+ AddRedundantContig(contigs, i, j);
+
+ if(first_tails == 0)
+ absolutely_redundant.insert(i);
+ }
+ else{
+ TRACE(id_j << " is redundant");
+ AddRedundantContig(contigs, j, i);
+
+ if(second_tails == 0)
+ absolutely_redundant.insert(j);
+ }
+ }
+ else{
+ if(first_path_red && !second_path_red){
+ TRACE(id_i << " is redundant");
+ AddRedundantContig(contigs, i, j);
+
+ if(first_tails == 0)
+ absolutely_redundant.insert(i);
+
+ }
+ else
+ if(!first_path_red && second_path_red){
+ TRACE(id_j << " is redundant");
+ AddRedundantContig(contigs, j, i);
+
+ if(second_tails == 0)
+ absolutely_redundant.insert(j);
+ }
+ }
+
+ if(absolutely_redundant.find(i) != absolutely_redundant.end())
+ break;
+ }
+ }
+ }
+ }
+ }
+
+ double cur_process_perc = static_cast<double>(i) / static_cast<double>(contigs_number);
+ if(cur_process_perc > processed_perc) {
+ while(processed_perc + processed_step <= cur_process_perc)
+ processed_perc += processed_step;
+ INFO(ToString(processed_perc * 100.0) << "% of contigs were processed");
+ processed_perc += processed_step;
+ }
+ }
+ INFO("100% contigs were processed");
+
+ RedundancyMapCondenser<size_t> condenser;
+ condenser.Condense(res.redundancy_map);
+
+ INFO(ToString(redundant_contigs.size()) + " contigs from " + ToString(contigs->Size()) + " are redundant");
+
+ contigs->DeleteByIDs(redundant_contigs);
+
+ INFO("Computing redundant contigs ends");
+
+ return contigs;
+ }
+
+ MappingContigPtr Correct(MappingContigPtr contig){
+ return contig;
+ }
+
+ CorrectionResult Results(){
+ return res;
+ }
+
+ virtual ~LoopBulgeDeletionCorrector(){}
+
+protected:
+ DECL_LOGGER("LoopBulgeDeletionCorrector");
+};
+
+}
diff --git a/src/projects/dipspades/consensus_contigs_constructor/contig_correctors/same_edge_deletion_corrector.hpp b/src/projects/dipspades/consensus_contigs_constructor/contig_correctors/same_edge_deletion_corrector.hpp
new file mode 100644
index 0000000..5a299b7
--- /dev/null
+++ b/src/projects/dipspades/consensus_contigs_constructor/contig_correctors/same_edge_deletion_corrector.hpp
@@ -0,0 +1,71 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "abstract_contig_corrector.hpp"
+
+using namespace debruijn_graph;
+
+namespace dipspades {
+
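+ // Collapses consecutive mappings to the same edge into a single mapping
+ // whose range spans the union of the merged ranges.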
+class SameEdgeDeletionCorrector : public AbstractContigCorrector{
+
+ MappingRange WideMappingRange(MappingRange old_range, MappingRange new_range){
+
+ size_t initial_range_start_pos = min<size_t>(old_range.initial_range.start_pos, new_range.initial_range.start_pos);
+ size_t initial_range_end_pos = max<size_t>(old_range.initial_range.end_pos, new_range.initial_range.end_pos);
+ size_t mapped_range_start_pos = min<size_t>(old_range.mapped_range.start_pos, new_range.mapped_range.start_pos);
+ size_t mapped_range_end_pos = max<size_t>(old_range.mapped_range.end_pos, new_range.mapped_range.end_pos);
+
+ Range init(initial_range_start_pos, initial_range_end_pos), mapp(mapped_range_start_pos, mapped_range_end_pos);
+ MappingRange res(init, mapp);
+ return res;
+ }
+
+public:
+ SameEdgeDeletionCorrector(Graph &g) : AbstractContigCorrector(g) {
+ }
+
+ ContigStoragePtr Correct(ContigStoragePtr contigs) {
+ for(size_t i = 0; i < contigs->Size(); i++)
+ contigs->ReplaceContig(Correct((*contigs)[i]), i);
+ TRACE(contigs->Size() << " contigs from " << contigs->Size() << " were corrected");
+ return contigs;
+ }
+
+ MappingContigPtr Correct(MappingContigPtr contig){
+ MappingPath<EdgeId> map_path = contig->mapping_path();
+
+ if(map_path.size() <= 0)
+ return contig;
+
+ vector<EdgeId> new_path;
+ vector<MappingRange> new_ranges;
+ EdgeId cur_edge = map_path[0].first;
+ new_path.push_back(cur_edge);
+ new_ranges.push_back(map_path[0].second);
+
+ for (size_t i = 1; i < map_path.size(); i++) {
+ EdgeId e = map_path[i].first;
+ if (e != cur_edge) {
+ cur_edge = e;
+ new_path.push_back(e);
+ new_ranges.push_back(map_path[i].second);
+ }
+ else {
+ new_ranges[new_ranges.size() - 1] = WideMappingRange(
+ new_ranges[new_ranges.size() - 1], map_path[i].second);
+ }
+ }
+
+ MappingPath<EdgeId> new_map_path(new_path, new_ranges);
+ return MappingContigPtr(new ReplacedPathMappingContig(contig, new_map_path));
+ }
+};
+
+}
diff --git a/src/projects/dipspades/consensus_contigs_constructor/mapping_contig.hpp b/src/projects/dipspades/consensus_contigs_constructor/mapping_contig.hpp
new file mode 100644
index 0000000..80a29d3
--- /dev/null
+++ b/src/projects/dipspades/consensus_contigs_constructor/mapping_contig.hpp
@@ -0,0 +1,380 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+/*
+ * mapping_contig.hpp
+ *
+ * Created on: 13.11.2012
+ * Author: yana
+ */
+
+#pragma once
+#include "overlap_graph.hpp"
+#include "../utils/element_printers.hpp"
+#include <map>
+
+using namespace debruijn_graph;
+
+namespace dipspades {
+
+class Loop{
+ vector<EdgeId> list_;
+
+public:
+ Loop(vector<EdgeId> list) : list_(list) {}
+ const vector<EdgeId> edges() { return list_; }
+ size_t Size() { return list_.size(); }
+};
+
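+ // Abstract interface for a contig together with its mapping to the de
+ // Bruijn graph; the implementations below wrap a raw contig, a contig with
+ // a replaced path or name, or a composite contig glued from several
+ // overlapping ones.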
+class MappingContig{
+public:
+ MappingContig() { }
+ virtual Sequence seq() = 0;
+ virtual vector<EdgeId> path_seq() = 0;
+ virtual MappingPath<EdgeId> mapping_path() = 0;
+
+ virtual string name() = 0;
+ virtual string src_file() = 0;
+ virtual string full_name() { return src_file() + ":" + name(); }
+
+ virtual size_t size() = 0;
+ virtual size_t length() = 0;
+ virtual size_t id(){ return -1; }
+ virtual size_t rc_id() = 0;
+ virtual vector<shared_ptr<MappingContig> > AllMappingContigs() = 0;
+ virtual void ChangeMappingRange(size_t, MappingRange){ }
+ virtual void ChangeName(string new_name) = 0;
+
+ virtual string ToString(Graph &graph) = 0;
+
+ virtual ~MappingContig(){}
+};
+
+typedef shared_ptr<MappingContig> MappingContigPtr;
+
+class SimpleMappingContig : public MappingContig{
+ string name_;
+ string src_file_;
+ Sequence seq_;
+ MappingPath<EdgeId> map_path_;
+ vector<EdgeId> edge_path_;
+ size_t id_, rc_id_;
+
+public:
+ SimpleMappingContig(){ }
+
+ SimpleMappingContig(string name, string src_file, Sequence seq,
+ MappingPath<EdgeId> map_path, size_t id, size_t rc_id) :
+ name_(name),
+ src_file_(src_file),
+ seq_(seq),
+ map_path_(map_path),
+ edge_path_(map_path_.simple_path()),
+ id_(id),
+ rc_id_(rc_id) { }
+
+ SimpleMappingContig(string name, string src_file, Sequence seq,
+ MappingPath<EdgeId> map_path, vector<EdgeId> edge_path,
+ size_t id, size_t rc_id) :
+ name_(name),
+ src_file_(src_file),
+ seq_(seq),
+ map_path_(map_path),
+ edge_path_(edge_path),
+ id_(id),
+ rc_id_(rc_id) { }
+
+ Sequence seq() { return seq_; }
+
+ vector<EdgeId> path_seq() { return edge_path_; }
+
+ MappingPath<EdgeId> mapping_path() { return map_path_; }
+
+ string name() { return name_; }
+
+ string src_file() { return src_file_; }
+
+ size_t size() { return edge_path_.size(); }
+
+ size_t length() { return seq_.size(); }
+
+ size_t id(){ return id_; }
+
+ size_t rc_id() { return rc_id_; }
+
+ vector<MappingContigPtr> AllMappingContigs(){
+ return vector<MappingContigPtr>();
+ }
+
+ void ChangeMappingRange(size_t index, MappingRange new_range){
+ VERIFY(index < map_path_.size());
+ vector<EdgeId> new_path = map_path_.simple_path();
+ vector<MappingRange> new_ranges;
+ for(size_t i = 0; i < map_path_.size(); i++)
+ if(i != index)
+ new_ranges.push_back(map_path_[i].second);
+ else
+ new_ranges.push_back(new_range);
+ MappingPath<EdgeId> new_map_path(new_path, new_ranges);
+ map_path_ = new_map_path;
+ }
+
+ void ChangeName(string new_name) {
+ name_ = new_name;
+ }
+
+ string ToString(Graph &graph) {
+ stringstream ss;
+ ss << "Id: " << id_ << ". Seq size: " << seq_.size() <<
+ ". Map path: " << MappingPathToString(graph, map_path_);
+ return ss.str();
+ }
+};
+
+class ReplacedPathMappingContig : public MappingContig{
+ MappingContigPtr c_;
+ vector<EdgeId> new_path_;
+ MappingPath<EdgeId> new_map_path_;
+
+public:
+ ReplacedPathMappingContig(MappingContigPtr c, vector<EdgeId> new_path) : c_(c), new_path_(new_path) { }
+
+ ReplacedPathMappingContig(MappingContigPtr c, MappingPath<EdgeId> new_map_path) : c_(c), new_map_path_(new_map_path) {
+ new_path_ = new_map_path_.simple_path();
+ }
+
+ Sequence seq() { return c_->seq(); }
+
+ vector<EdgeId> path_seq() {
+ return new_path_;
+ }
+
+ MappingPath<EdgeId> mapping_path(){
+ if(new_map_path_.size() != 0)
+ return new_map_path_;
+ return c_->mapping_path();
+ }
+
+ string name() { return c_->name(); }
+
+ string src_file() { return c_->src_file(); }
+
+ size_t size() { return new_path_.size(); }
+
+ size_t length() { return c_->length(); }
+
+ size_t id(){ return c_->id(); }
+
+ size_t rc_id() { return c_->rc_id(); }
+
+ vector<MappingContigPtr> AllMappingContigs(){
+ return vector<MappingContigPtr>();
+ }
+
+ void ChangeName(string new_name) {
+ c_->ChangeName(new_name);
+ }
+
+ string ToString(Graph &graph) {
+ if(new_map_path_.size() == 0)
+ return c_->ToString(graph);
+ stringstream ss;
+ ss << "Id: " << id() << ". Seq size: " << seq().size() <<
+ ". Map path: " << MappingPathToString(graph, new_map_path_);
+ return ss.str();
+ }
+};
+
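+ // A contig obtained by gluing several overlapping contigs; the combined
+ // edge path and sequence are built lazily from the stored overlap ranges.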
+class CompositeMappingContig : public MappingContig{
+
+ Graph &g_;
+ size_t k_value_;
+
+ vector<MappingContigPtr> contigs_;
+ vector<pair<Range, Range> > overlaps_;
+
+ string contig_name_;
+
+ Sequence composite_seq;
+ vector<EdgeId> composite_path;
+ size_t composite_size;
+
+ size_t IndexOfEdgeByNumberOfVertex(size_t vertex_index){
+ if(vertex_index == 0)
+ return 0;
+ return vertex_index - 1;
+ }
+
+public:
+ CompositeMappingContig(Graph &g,
+ size_t k_value,
+ vector<MappingContigPtr> contigs,
+ vector<pair<Range, Range> > overlaps) :
+ g_(g),
+ k_value_(k_value),
+ contigs_(contigs),
+ overlaps_(overlaps),
+ contig_name_("") {
+ VERIFY(contigs.size() > 1);
+ VERIFY(contigs.size() == overlaps.size() + 1);
+ composite_size = 0;
+ }
+
+ Sequence seq(){
+ if(composite_seq.size() == 0){
+ vector<EdgeId> comp_path = path_seq();
+ composite_seq = GetSequenceByPath(g_, k_value_, comp_path);
+ }
+ return composite_seq;
+ }
+
+ vector<EdgeId> path_seq(){
+ if(composite_path.size() == 0){
+ if(overlaps_.size() == 0){
+ if(contigs_.size() == 0)
+ return vector<EdgeId>();
+ return contigs_[0]->path_seq();
+ }
+ else{
+ TRACE("New composite contig:");
+ TRACE("Path construction of composite contig starts");
+
+ TRACE("Ranges: ");
+ for(auto it = overlaps_.begin(); it != overlaps_.end(); it++)
+ TRACE(it->first.start_pos << " - " << it->first.end_pos << ". " <<
+ it->second.start_pos << " - " << it->second.end_pos);
+
+ // first path processing
+ {
+ TRACE("First path processing");
+ TRACE("Id - " << contigs_[0]->id());
+ vector<EdgeId> first_path = contigs_[0]->path_seq();
+ size_t end_ind = min<size_t>(IndexOfEdgeByNumberOfVertex(overlaps_[0].first.end_pos),
+ first_path.size() - 1);
+ for(size_t i = 0; i <= end_ind; i++)
+ composite_path.push_back(first_path[i]);
+ }
+
+ TRACE("Intermediate paths processing");
+ // intermediate paths processing
+ for(size_t i = 0; i < overlaps_.size() - 1; i++){
+ auto cur_path = contigs_[i + 1]->path_seq();
+ TRACE("Id: " << contigs_[i + 1]->id());
+ size_t start_ind = min<size_t>(IndexOfEdgeByNumberOfVertex(overlaps_[i].second.end_pos) + 1,
+ cur_path.size() - 1);
+ size_t end_ind = min<size_t>(IndexOfEdgeByNumberOfVertex(overlaps_[i + 1].first.end_pos),
+ cur_path.size() - 1);
+ TRACE("Start - " << start_ind << ", end - " << end_ind);
+ VERIFY(start_ind < cur_path.size() && end_ind < cur_path.size());
+ for(size_t j = start_ind; j <= end_ind; j++)
+ composite_path.push_back(cur_path[j]);
+ }
+
+ {
+ // last path processing
+ TRACE("Last path processing");
+ vector<EdgeId> last_path = contigs_[contigs_.size() - 1]->path_seq();
+ TRACE("Id: " << contigs_[contigs_.size() - 1]->id());
+ size_t start_ind = IndexOfEdgeByNumberOfVertex(overlaps_[overlaps_.size() - 1].second.end_pos) + 1;
+ start_ind = min<size_t>(start_ind, last_path.size() - 1);
+ size_t end_ind = last_path.size() - 1;
+ TRACE("Start - " << start_ind << ", end - " << end_ind);
+ VERIFY(start_ind < last_path.size() && end_ind < last_path.size());
+ for(size_t i = start_ind; i <= end_ind; i++)
+ composite_path.push_back(last_path[i]);
+ }
+
+ // deletion of repetitive start edge
+ TRACE("Deletion of repetitive start-end edge");
+ if(composite_path[0] == composite_path[composite_path.size() - 1]){
+ composite_path.erase(composite_path.begin() + composite_path.size() - 1);
+ TRACE("Deletion done");
+ }
+
+ TRACE("Path construction of composite contig ends");
+ }
+ }
+ return composite_path;
+ }
+
+ MappingPath<EdgeId> mapping_path(){ return MappingPath<EdgeId>(); } // todo refactor
+
+ string name() { return contig_name_; }
+
+ string src_file() { return ""; }
+
+ size_t size(){
+ return path_seq().size();
+ }
+
+ size_t length() { return seq().size(); }
+
+ size_t id(){ return 0; }
+
+ size_t rc_id() { return 0; }
+
+ void ChangeName(string new_name) {
+ contig_name_ = new_name;
+ }
+
+ vector<MappingContigPtr> AllMappingContigs(){
+ return contigs_;
+ }
+
+ string ToString(Graph &){
+ return "Composite contig";
+ }
+
+private:
+ DECL_LOGGER("CompositeMappingContig");
+};
+
+class ReplacedNameMappingContig : public MappingContig{
+ MappingContigPtr c_;
+ string contig_name_;
+
+public:
+ ReplacedNameMappingContig(MappingContigPtr c, string contig_name) :
+ c_(c),
+ contig_name_ (contig_name) { }
+
+ Sequence seq() { return c_->seq(); }
+
+ vector<EdgeId> path_seq() {
+ return c_->path_seq();
+ }
+
+ MappingPath<EdgeId> mapping_path(){
+ return c_->mapping_path();
+ }
+
+ string name() { return contig_name_; }
+
+ string src_file() { return c_->src_file(); }
+
+ size_t size() { return c_->size(); }
+
+ size_t length() { return c_->length(); }
+
+ size_t id(){ return c_->id(); }
+
+ size_t rc_id() { return c_->rc_id(); }
+
+ vector<MappingContigPtr> AllMappingContigs(){
+ return c_->AllMappingContigs();
+ }
+
+ void ChangeName(string new_name) {
+ c_->ChangeName(new_name);
+ }
+
+ string ToString(Graph &graph) {
+ return c_->ToString(graph);
+ }
+};
+
+}
diff --git a/src/projects/dipspades/consensus_contigs_constructor/mapping_contigs_storage.hpp b/src/projects/dipspades/consensus_contigs_constructor/mapping_contigs_storage.hpp
new file mode 100644
index 0000000..7bc9bcc
--- /dev/null
+++ b/src/projects/dipspades/consensus_contigs_constructor/mapping_contigs_storage.hpp
@@ -0,0 +1,114 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "mapping_contig.hpp"
+
+using namespace debruijn_graph;
+
+namespace dipspades {
+
+// interface for contig storage
+class ContigStorage{
+public:
+ virtual void Add(MappingContigPtr new_contig) = 0;
+ virtual size_t Size() = 0;
+ virtual MappingContigPtr& operator[](size_t index) = 0;
+ virtual void ReplaceContig(MappingContigPtr new_contig, size_t index) = 0;
+ virtual void DeleteByIDs(set<size_t> ids) = 0;
+ virtual MappingContigPtr GetContigById(size_t id) = 0;
+ virtual MappingContigPtr GetRCContigById(size_t id) = 0;
+ virtual shared_ptr<ContigStorage> Clone() = 0;
+ virtual ~ContigStorage(){}
+
+ virtual string ToString(Graph &graph) = 0;
+};
+
+typedef shared_ptr<ContigStorage> ContigStoragePtr;
+
+// simple implementation
+class SimpleContigStorage : public ContigStorage{
+ vector<MappingContigPtr> storage_;
+
+public:
+ void Add(MappingContigPtr new_contig) {
+ storage_.push_back(new_contig);
+ }
+
+ size_t Size() {
+ return storage_.size();
+ }
+
+ MappingContigPtr& operator[](size_t index){
+ VERIFY(index < storage_.size());
+ return storage_[index];
+ }
+
+ void ReplaceContig(MappingContigPtr new_contig, size_t index) {
+ VERIFY(index < storage_.size());
+ storage_[index] = new_contig;
+ }
+
+ void DeleteByIDs(set<size_t> ids){
+ vector<MappingContigPtr> new_storage;
+ for(size_t i = 0; i < storage_.size(); i++)
+ if(ids.find(storage_[i]->id()) == ids.end())
+ new_storage.push_back(storage_[i]);
+ storage_ = new_storage;
+ }
+
+ MappingContigPtr GetContigById(size_t id){
+ for(size_t i = 0; i < storage_.size(); i++)
+ if(storage_[i]->id() == id)
+ return storage_[i];
+ return MappingContigPtr(new SimpleMappingContig());
+ }
+
+ MappingContigPtr GetRCContigById(size_t id){
+ for(size_t i = 0; i < storage_.size(); i++)
+ if(storage_[i]->rc_id() == id)
+ return storage_[i];
+ return MappingContigPtr(new SimpleMappingContig());
+ }
+
+ ContigStoragePtr Clone(){
+ ContigStoragePtr clone_storage(new SimpleContigStorage());
+ for(size_t i = 0; i < storage_.size(); i++)
+ clone_storage->Add(storage_[i]);
+ return clone_storage;
+ }
+
+ string ToString(Graph &graph) {
+ stringstream ss;
+ for(auto c = storage_.begin(); c != storage_.end(); c++)
+ ss << (*c)->ToString(graph) << endl;
+ return ss.str();
+ }
+};
+
+//-------------------------------------------------------------------------
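+// Writes a plain-text dump of the storage: for every contig its id, rc_id
+// and the list of edge ids of its path.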
+void save_contig_storage(Graph &g, ContigStoragePtr stor, string fname){
+
+ ofstream save(fname.c_str());
+ for(size_t i = 0; i < stor->Size(); i++){
+ save << "#" << i << " contig" << endl;
+ auto contig = (*stor)[i];
+ save << "id " << contig->id() << endl;
+ save << "rc_id " << contig->rc_id() << endl;
+
+ auto path = contig->path_seq();
+ for(size_t j = 0; j < path.size(); j++){
+ save << g.int_id(path[j]) << " ";
+ }
+ save << endl;
+ }
+ save.close();
+}
+//-------------------------------------------------------------------------
+
+}
diff --git a/src/projects/dipspades/consensus_contigs_constructor/overlap_graph.hpp b/src/projects/dipspades/consensus_contigs_constructor/overlap_graph.hpp
new file mode 100644
index 0000000..923f4ec
--- /dev/null
+++ b/src/projects/dipspades/consensus_contigs_constructor/overlap_graph.hpp
@@ -0,0 +1,1119 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include <iostream>
+#include <vector>
+#include <map>
+#include <set>
+
+using namespace std;
+
+using namespace debruijn_graph;
+
+namespace dipspades {
+
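+ // Lightweight directed graph over contig ids: vertices are contigs, edges
+ // are overlaps with integer weights; adjacency is kept in both directions.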
+class OverlapGraph{
+ map<size_t, set<size_t> > in_edges_;
+ map<size_t, set<size_t> > out_edges_;
+ map<size_t, pair<bool, int> > label;
+
+ set<pair<size_t,size_t> > edges_;
+
+ map<pair<size_t,size_t>, size_t> weights_;
+
+ set<size_t> vertices_;
+ map<size_t, pair<size_t, size_t> > map_id_rc_id;
+ map<size_t ,size_t> id_ind;
+
+ void CheckAndRemoveIsolatedVertex(size_t v){
+ if(IncomingVerticesCount(v) == 0 && OutgoingVerticesCount(v) == 0){
+ vertices_.erase(v);
+ }
+ }
+
+public:
+ OverlapGraph(){}
+ OverlapGraph(vector<size_t> vertices, vector<size_t> id, vector<size_t> rc_id,
+ map<size_t, vector<size_t> > in_edges, map<size_t, vector<size_t> > in_weight,
+ map<size_t, vector<size_t> > out_edges, map<size_t, vector<size_t> > out_weight) {
+
+ InitializeVertexSet(vertices, id, rc_id);
+ InitializeIncomingVertices(in_edges, in_weight);
+ InitializeOutgoingVertices(out_edges, out_weight);
+ }
+
+ void Clear(){
+ vertices_.clear();
+ map_id_rc_id.clear();
+ id_ind.clear();
+ label.clear();
+
+ in_edges_.clear();
+ out_edges_.clear();
+ weights_.clear();
+ edges_.clear();
+ }
+
+ void InitializeVertexSet(vector<size_t> vertices, vector<size_t> id, vector<size_t> rc_id){
+
+ VERIFY(vertices.size() == id.size());
+ VERIFY(vertices.size() == rc_id.size());
+
+ size_t size = vertices.size();
+ for(size_t i = 0; i < size; i++){
+
+ auto v = vertices[i];
+ vertices_.insert(v);
+
+ map_id_rc_id[v] = pair<size_t, size_t>(id[i], rc_id[i]);
+ id_ind[id[i]] = v;
+
+ label[v] = pair<bool, int>(false, -1);
+ }
+ }
+
+ void InitializeIncomingVertices(map<size_t, vector<size_t> > in_edges,
+ map<size_t, vector<size_t> > in_weight){
+
+ VERIFY(in_edges.size() == in_weight.size());
+
+ auto it_v = in_edges.begin();
+ auto it_w = in_weight.begin();
+
+ for(size_t i = 0; i < in_edges.size(); i++){
+
+ auto v = it_v->first;
+ auto w = it_w->first;
+
+ VERIFY(v == w);
+
+ auto v_vect = it_v->second;
+ auto w_vect = it_w->second;
+
+ VERIFY(v_vect.size() == w_vect.size());
+
+ for(size_t j = 0; j < v_vect.size(); j++){
+ AddNeighVertices(v_vect[j], v, w_vect[j]);
+ }
+
+ it_v++; it_w++;
+ }
+ }
+
+ void InitializeOutgoingVertices(map<size_t, vector<size_t> > out_edges,
+ map<size_t, vector<size_t> > out_weight){
+
+ VERIFY(out_edges.size() == out_weight.size());
+
+ auto it_v = out_edges.begin();
+ auto it_w = out_weight.begin();
+
+ for(size_t i = 0; i < out_edges.size(); i++){
+
+ auto v = it_v->first;
+ auto w = it_w->first;
+
+ VERIFY(v == w);
+
+ auto v_vect = it_v->second;
+ auto w_vect = it_w->second;
+
+ VERIFY(v_vect.size() == w_vect.size());
+
+ for(size_t j = 0; j < v_vect.size(); j++){
+ AddNeighVertices(v, v_vect[j], w_vect[j]);
+ }
+ it_v++; it_w++;
+ }
+ }
+
+ void AddNeighVertices(size_t start, size_t end, size_t weight){
+
+ if(vertices_.find(start) == vertices_.end())
+ vertices_.insert(start);
+ if(vertices_.find(end) == vertices_.end())
+ vertices_.insert(end);
+
+ if(edges_.find(make_pair(start, end)) == edges_.end())
+ edges_.insert(make_pair(start, end));
+ weights_[make_pair(start, end)] = weight;
+
+ in_edges_[end].insert(start);
+ out_edges_[start].insert(end);
+ }
+
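+ // Note: returns at most one vertex (the loop breaks after the first hit), which
+ // matches how OGD_GetIsolatedAndVerticesWithoutIncoming below uses the result.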
+ vector<size_t> GetVerticesWithoutInEdges(){
+ vector<size_t> res;
+ for(auto v = vertices_.begin(); v != vertices_.end(); v++)
+ if(in_edges_.find(*v) == in_edges_.end() || in_edges_[*v].size() == 0){
+ res.push_back(*v);
+ break;
+ }
+ return res;
+ }
+
+ set<size_t> IncomingVertices(size_t v){
+ set<size_t> res;
+ if(in_edges_.find(v) != in_edges_.end())
+ res = in_edges_[v];
+ return res;
+ }
+
+ size_t IncomingVerticesCount(size_t v){
+ if(in_edges_.find(v) != in_edges_.end())
+ return in_edges_[v].size();
+ return 0;
+ }
+
+ set<size_t> OutgoingVertices(size_t v){
+ set<size_t> res;
+ if(out_edges_.find(v) != out_edges_.end())
+ res = out_edges_[v];
+ return res;
+ }
+
+ size_t OutgoingVerticesCount(size_t v){
+ if(out_edges_.find(v) != out_edges_.end())
+ return out_edges_[v].size();
+ return 0;
+ }
+
+ pair<bool, int> GetLabel(size_t v){
+ if(label.find(v) != label.end())
+ return label[v];
+ return make_pair(false, -1);
+ }
+
+ void SetLabel(size_t v, bool bool_label, int value){
+ if(label.find(v) != label.end())
+ label[v] = make_pair(bool_label, value);
+ }
+
+ size_t GetWeightOf(pair<size_t, size_t> edge){
+ if(weights_.find(edge) != weights_.end())
+ return weights_[edge];
+ return 0;
+ }
+
+ size_t GetWeightOf(size_t start, size_t end){
+ return GetWeightOf(pair<size_t, size_t>(start, end));
+ }
+
+ void DeleteVertex(size_t v){
+ vertices_.erase(v);
+ auto in_set = in_edges_[v];
+ auto out_set = out_edges_[v];
+
+ for(auto w = in_set.begin(); w != in_set.end(); w++){
+ DeleteEdge(*w, v);
+ }
+
+ in_edges_.erase(v);
+
+ for(auto w = out_set.begin(); w != out_set.end(); w++){
+ DeleteEdge(v, *w);
+ }
+
+ out_edges_.erase(v);
+
+ size_t id = IdByInd(v);
+ map_id_rc_id.erase(v);
+ id_ind.erase(id);
+ }
+
+ void DeleteEdge(size_t start, size_t end){
+ if(out_edges_.find(start) != out_edges_.end())
+ out_edges_[start].erase(end);
+ if(in_edges_.find(end) != in_edges_.end())
+ in_edges_[end].erase(start);
+ weights_.erase(pair<size_t, size_t>(start, end));
+ edges_.erase(pair<size_t, size_t>(start, end));
+
+// CheckAndRemoveIsolatedVertex(start);
+// CheckAndRemoveIsolatedVertex(end);
+ }
+
+ set<pair<size_t,size_t> > Edges(){
+ auto res = edges_;
+ return res;
+ }
+
+ set<size_t> Vertices(){
+ auto res = vertices_;
+ return res;
+ }
+
+ bool IsEdgeExist(size_t start, size_t end){
+ if(edges_.find(pair<size_t, size_t>(start, end)) != edges_.end())
+ return true;
+ return false;
+ }
+
+ bool IsVertexExist(size_t v){
+ return vertices_.find(v) != vertices_.end();
+ }
+
+ size_t VerticesCount(){
+ return vertices_.size();
+ }
+
+ size_t EdgesCount(){
+ return edges_.size();
+ }
+
+ size_t IdByInd(size_t v){
+ if(map_id_rc_id.find(v) != map_id_rc_id.end())
+ return map_id_rc_id[v].first;
+ return size_t(-1);
+ }
+
+ size_t RCIdByInd(size_t v){
+ if(map_id_rc_id.find(v) != map_id_rc_id.end())
+ return map_id_rc_id[v].second;
+ return size_t(-1);
+ }
+
+ size_t IndById(size_t id){
+ if(id_ind.find(id) != id_ind.end())
+ return id_ind[id];
+ return size_t(-1);
+ }
+
+ size_t IndOfRC(size_t v){
+ return IndById(RCIdByInd(v));
+ }
+
+ bool IsVertexValid(size_t v){
+ return (v != size_t(-1));
+ }
+
+ bool IsVertexIsolated(size_t v){
+ return OutgoingVerticesCount(v) == 0 && IncomingVerticesCount(v) == 0;
+ }
+
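+ // Note: despite the name, this collects isolated vertices (no in- or out-edges).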
+ vector<size_t> GetIsolatedEdges(){
+ vector<size_t> res;
+ for(auto v = vertices_.begin(); v != vertices_.end(); v++)
+ if(IsVertexIsolated(*v))
+ res.push_back(*v);
+ return res;
+ }
+};
+
+//----------------------------------------------------------------------------------------------
+
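+// The OGD_* classes are the pluggable pieces of the overlap graph traversal below:
+// a start-vertex definer, a per-vertex processer, a new-vertex definer and a stop
+// condition. OGD_Config bundles one of each and OverlapGraphDijkstra drives them.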
+class OGD_StartVerticesDefiner{
+protected:
+ OverlapGraph &g_;
+public:
+ OGD_StartVerticesDefiner(OverlapGraph &g) : g_(g) {
+ }
+ virtual vector<size_t> GetStartVertices() = 0;
+ virtual ~OGD_StartVerticesDefiner() {
+ }
+};
+
+class OGD_GetParametrizedStartvertex : public OGD_StartVerticesDefiner{
+ size_t start_vertex_;
+public:
+ OGD_GetParametrizedStartvertex(OverlapGraph &g, size_t start_vertex) :
+ OGD_StartVerticesDefiner(g), start_vertex_(start_vertex){
+
+ }
+
+ vector<size_t> GetStartVertices(){
+ vector<size_t> res;
+ res.push_back(start_vertex_);
+ return res;
+ }
+};
+
+class OGD_GetIsolatedAndVerticesWithoutIncoming : public OGD_StartVerticesDefiner{
+public:
+ OGD_GetIsolatedAndVerticesWithoutIncoming(OverlapGraph &g) : OGD_StartVerticesDefiner(g){
+
+ }
+
+ vector<size_t> GetStartVertices(){
+
+// cout << "OGD_GetIsolatedAndVerticesWithoutIncoming starts" << endl;
+// cout << g_.VerticesCount() << " vertices in OG" << endl;
+
+ vector<size_t> res;
+
+ if(g_.VerticesCount() == 0)
+ return res;
+
+ vector<size_t> isolated = g_.GetIsolatedEdges();
+ vector<size_t> noincoming = g_.GetVerticesWithoutInEdges();
+
+ if(isolated.size() != 0)
+ for(auto v = isolated.begin(); v != isolated.end(); v++)
+ res.push_back(*v);
+
+ if(noincoming.size() != 0)
+ res.push_back(*noincoming.begin());
+
+ if(res.size() == 0){
+ size_t any_vertex = *g_.Vertices().begin();
+ res.push_back(any_vertex);
+ }
+
+ return res;
+ }
+};
+
+//----------------------------------------------------------------------------------------------
+class OGD_DirectionDefiner{
+protected:
+ OverlapGraph &g_;
+
+public:
+ OGD_DirectionDefiner(OverlapGraph &g) : g_(g){
+
+ }
+
+ virtual set<size_t> GetDirectedVertices(size_t vertex) = 0;
+ virtual set<size_t> GetAntidirectedVertices(size_t vertex) = 0;
+ virtual ~OGD_DirectionDefiner(){
+
+ }
+};
+
+class OGD_OutgoingDirection : public OGD_DirectionDefiner{
+public:
+ OGD_OutgoingDirection(OverlapGraph &g) : OGD_DirectionDefiner(g){
+
+ }
+
+ set<size_t> GetDirectedVertices(size_t vertex){
+ return g_.OutgoingVertices(vertex);
+ }
+
+ set<size_t> GetAntidirectedVertices(size_t vertex){
+ return g_.IncomingVertices(vertex);
+ }
+};
+//----------------------------------------------------------------------------------------------
+
+class OGD_OneVertexProcesser{
+protected:
+ OverlapGraph &g_;
+ OGD_DirectionDefiner *direction_definer_;
+public:
+ OGD_OneVertexProcesser(OverlapGraph &g, OGD_DirectionDefiner *direction_definer) : g_(g),
+ direction_definer_(direction_definer){
+ }
+ virtual void ProcessVertex(size_t vertex, set<size_t> &visited, set<size_t> &queue,
+ map<size_t, vector<size_t> > &paths) = 0;
+ virtual ~OGD_OneVertexProcesser(){
+
+ }
+};
+
+class OGD_SimpleProcessing : public OGD_OneVertexProcesser{
+public:
+ OGD_SimpleProcessing(OverlapGraph &g, OGD_DirectionDefiner *direction_definer) :
+ OGD_OneVertexProcesser(g, direction_definer){
+
+ }
+
+ void ProcessVertex(size_t vertex, set<size_t> &visited, set<size_t> &queue,
+ map<size_t, vector<size_t> > &paths){
+ if(visited.find(vertex) != visited.end())
+ return;
+
+ visited.insert(vertex);
+ queue.erase(vertex);
+
+ auto vert_for_visit = direction_definer_->GetDirectedVertices(vertex);
+ for(auto neigh = vert_for_visit.begin(); neigh != vert_for_visit.end(); neigh++)
+ if(visited.find(*neigh) == visited.end()){
+ paths[*neigh] = paths[vertex];
+ paths[*neigh].push_back(*neigh);
+
+ queue.insert(*neigh);
+ }
+ }
+};
+
+class OGD_UniquePathProcessing : public OGD_OneVertexProcesser{
+public:
+ OGD_UniquePathProcessing(OverlapGraph &g, OGD_DirectionDefiner *direction_definer) :
+ OGD_OneVertexProcesser(g, direction_definer){
+
+ }
+
+ void ProcessVertex(size_t vertex, set<size_t> &visited, set<size_t> &queue,
+ map<size_t, vector<size_t> > &paths){
+ if(visited.find(vertex) != visited.end() || vertex == size_t(-1))
+ return;
+
+// cout << "Processing of " << vertex << endl;
+
+ visited.insert(vertex);
+ queue.erase(vertex);
+
+ size_t rc_v = g_.IndOfRC(vertex);
+ if(g_.IsVertexValid(rc_v)){
+ visited.insert(rc_v);
+ queue.erase(rc_v);
+ }
+
+ auto vert_for_visit = direction_definer_->GetDirectedVertices(vertex);
+
+ if(vert_for_visit.size() == 1){
+ size_t neigh = *vert_for_visit.begin();
+
+ if(visited.find(neigh) == visited.end() && paths.find(neigh) == paths.end()){
+ paths[neigh] = paths[vertex];
+ paths[neigh].push_back(neigh);
+ queue.insert(neigh);
+ }
+ }
+ else
+ for(auto neigh = vert_for_visit.begin(); neigh != vert_for_visit.end(); neigh++)
+ if(visited.find(*neigh) == visited.end() && paths.find(*neigh) == paths.end()){
+ paths[*neigh].push_back(*neigh);
+ queue.insert(*neigh);
+ }
+ }
+};
+
+class OGD_AlternativePathProcesser : public OGD_OneVertexProcesser{
+ vector<size_t> alter_path_;
+ set<size_t> forbidden_vert;
+ bool alter_path_is_edge;
+
+ size_t path_start, path_end;
+
+public:
+ OGD_AlternativePathProcesser(OverlapGraph &g, OGD_DirectionDefiner *direct_definer,
+ vector<size_t> alter_path) : OGD_OneVertexProcesser(g, direct_definer), alter_path_(alter_path){
+
+ VERIFY(alter_path.size() > 1);
+
+ for(auto e = alter_path.begin() + 1; e != alter_path.end() - 1; e++)
+ forbidden_vert.insert(*e);
+
+ alter_path_is_edge = alter_path_.size() == 2;
+
+ path_start = *alter_path.begin(), path_end = *(alter_path.end() - 1);
+ }
+
+ void ProcessVertex(size_t vertex, set<size_t> &visited, set<size_t> &queue,
+ map<size_t, vector<size_t> > &paths){
+ if(visited.find(vertex) != visited.end() || vertex == size_t(-1))
+ return;
+
+ visited.insert(vertex);
+ queue.erase(vertex);
+
+ auto vert_for_visit = direction_definer_->GetDirectedVertices(vertex);
+
+ for(auto neigh = vert_for_visit.begin(); neigh != vert_for_visit.end(); neigh++){
+ if(visited.find(*neigh) == visited.end()){
+ bool is_not_visit = (vertex == path_start && *neigh == path_end && alter_path_is_edge);
+ if(!is_not_visit && forbidden_vert.find(*neigh) == forbidden_vert.end()){
+ queue.insert(*neigh);
+ paths[*neigh] = paths[vertex];
+ paths[*neigh].push_back(*neigh);
+ }
+ }
+ }
+ }
+};
+//----------------------------------------------------------------------------------------------
+
+class OGD_NewProcessedVertexDefiner{
+protected:
+ OverlapGraph &g_;
+
+public:
+ OGD_NewProcessedVertexDefiner(OverlapGraph &g) : g_(g){
+
+ }
+ virtual size_t GetNewVertex(set<size_t> &visited, set<size_t> &queue,
+ map<size_t, vector<size_t> > &paths) = 0;
+ virtual ~OGD_NewProcessedVertexDefiner(){
+
+ }
+};
+
+class OGD_NewVertexInQueueDefiner : public OGD_NewProcessedVertexDefiner{
+
+public:
+ OGD_NewVertexInQueueDefiner(OverlapGraph &g) :
+ OGD_NewProcessedVertexDefiner(g){
+ }
+
+ size_t GetNewVertex(set<size_t> &, set<size_t> &queue,
+ map<size_t, vector<size_t> > &){
+ if(queue.size() > 0)
+ return *queue.begin();
+ return size_t(-1);
+ }
+};
+
+class OGD_SimpleNewVertexDefiner : public OGD_NewProcessedVertexDefiner{
+ OGD_DirectionDefiner *direction_definer_;
+
+public:
+ OGD_SimpleNewVertexDefiner(OverlapGraph &g, OGD_DirectionDefiner *direction_definer) :
+ OGD_NewProcessedVertexDefiner(g), direction_definer_(direction_definer){
+
+ }
+
+ size_t GetNewVertex(set<size_t> &visited, set<size_t> &queue,
+ map<size_t, vector<size_t> > &paths){
+ if(queue.size() > 0)
+ return *queue.begin();
+ else{
+
+ auto vertices = g_.Vertices();
+ for(auto v = vertices.begin(); v != vertices.end(); v++){
+ if(visited.find(*v) == visited.end()){
+ auto in_vertices = direction_definer_->GetAntidirectedVertices(*v);
+ bool all_invisited = true;
+ for(auto in_v = in_vertices.begin(); in_v != in_vertices.end(); in_v++){
+ if(visited.find(*in_v) == visited.end()){
+ all_invisited = false;
+ break;
+ }
+ }
+
+ if(all_invisited){
+ paths[*v].push_back(*v);
+ return *v;
+ }
+ }
+ }
+
+ // if no vertex without antidirected edges exists,
+ // return any unvisited vertex
+ for (auto v = vertices.begin(); v != vertices.end(); v++) {
+ if (visited.find(*v) == visited.end()) {
+ paths[*v].push_back(*v);
+ return *v;
+ }
+ }
+ }
+ return size_t(-1);
+ }
+};
+
+//----------------------------------------------------------------------------------------------
+
+class OGD_StopCondition{
+protected:
+ OverlapGraph &g_;
+public:
+ OGD_StopCondition(OverlapGraph &g) : g_(g){
+
+ }
+ virtual bool IsStop(set<size_t> &visited, set<size_t> &queue, map<size_t, vector<size_t> > &paths) = 0;
+ virtual ~OGD_StopCondition(){
+
+ }
+};
+
+class OGD_SearchedVertexIsFound : public OGD_StopCondition{
+ size_t searched_vertex_;
+public:
+ OGD_SearchedVertexIsFound(OverlapGraph &g, size_t searched_vertex) : OGD_StopCondition(g),
+ searched_vertex_(searched_vertex){
+
+ }
+
+ bool IsStop(set<size_t> &visited, set<size_t> &queue, map<size_t, vector<size_t> > &){
+ return (visited.find(searched_vertex_) != visited.end() ||
+ visited.size() == g_.VerticesCount() || queue.size() == 0);
+ }
+};
+
+class OGD_NoVerticesForVisit : public OGD_StopCondition{
+public:
+ OGD_NoVerticesForVisit(OverlapGraph &g) : OGD_StopCondition(g){
+
+ }
+
+ bool IsStop(set<size_t> &visited, set<size_t> &, map<size_t, vector<size_t> > &){
+ return visited.size() == g_.VerticesCount();
+ }
+};
+//----------------------------------------------------------------------------------------------
+
+struct OGD_Config{
+ OGD_StartVerticesDefiner * start_vert_definer;
+ OGD_OneVertexProcesser * vertex_processer;
+ OGD_NewProcessedVertexDefiner * new_vert_definer;
+ OGD_StopCondition * stop_condition;
+
+ OGD_Config(OGD_StartVerticesDefiner * &start_vert_definer,
+ OGD_OneVertexProcesser * &vertex_processer,
+ OGD_NewProcessedVertexDefiner * &new_vert_definer,
+ OGD_StopCondition * &stop_condition){
+ this->start_vert_definer = start_vert_definer;
+ this->vertex_processer = vertex_processer;
+ this->new_vert_definer = new_vert_definer;
+ this->stop_condition = stop_condition;
+ }
+
+ ~OGD_Config(){
+ delete new_vert_definer;
+ delete start_vert_definer;
+ delete stop_condition;
+ delete vertex_processer;
+ }
+};
+
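+// Factory helpers that wire up the traversal components. OGD_Config deletes its four
+// members in its destructor; the OGD_DirectionDefiner allocated here is shared by the
+// vertex processer and the new-vertex definer and is not deleted by OGD_Config itself.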
+OGD_Config CreateConfigForUniquePathsSearch(OverlapGraph &g){
+ OGD_StartVerticesDefiner *start_def = new OGD_GetIsolatedAndVerticesWithoutIncoming(g);
+ OGD_DirectionDefiner *direct_def = new OGD_OutgoingDirection(g);
+ OGD_OneVertexProcesser *vert_proc = new OGD_UniquePathProcessing(g, direct_def);
+ OGD_NewProcessedVertexDefiner *new_vert_definer = new OGD_SimpleNewVertexDefiner(g, direct_def);
+ OGD_StopCondition *stop_cond = new OGD_NoVerticesForVisit(g);
+
+ OGD_Config conf(start_def, vert_proc, new_vert_definer, stop_cond);
+
+ return conf;
+}
+
+OGD_Config CreateContigForDijkstraFromOneVertex(OverlapGraph &g, size_t start_vertex, size_t end_vertex){
+ OGD_StartVerticesDefiner *start_def = new OGD_GetParametrizedStartvertex(g, start_vertex);
+ OGD_DirectionDefiner * direct_def = new OGD_OutgoingDirection(g);
+ OGD_OneVertexProcesser *vert_proc = new OGD_SimpleProcessing(g, direct_def);
+ OGD_NewProcessedVertexDefiner * new_vert_definer = new OGD_NewVertexInQueueDefiner(g);
+ OGD_StopCondition *stop_cond = new OGD_SearchedVertexIsFound(g, end_vertex);
+
+ OGD_Config conf(start_def, vert_proc, new_vert_definer, stop_cond);
+ return conf;
+}
+
+OGD_Config CreateConfigForAlternativePathSearch(OverlapGraph &g, vector<size_t> path){
+
+ VERIFY(path.size() > 1);
+ size_t start_vertex = *(path.begin()), end_vertex = *(path.end() - 1);
+
+ OGD_StartVerticesDefiner *start_def = new OGD_GetParametrizedStartvertex(g, start_vertex);
+ OGD_DirectionDefiner * direct_def = new OGD_OutgoingDirection(g);
+ OGD_OneVertexProcesser *vert_proc = new OGD_AlternativePathProcesser(g, direct_def, path);
+ OGD_NewProcessedVertexDefiner * new_vert_definer = new OGD_NewVertexInQueueDefiner(g);
+ OGD_StopCondition *stop_cond = new OGD_SearchedVertexIsFound(g, end_vertex);
+
+ OGD_Config conf(start_def, vert_proc, new_vert_definer, stop_cond);
+ return conf;
+}
+
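+// Generic traversal driven by an OGD_Config. Typical use, mirroring
+// UniquePathsSearcher::FindLongPaths below:
+//   OGD_Config conf = CreateConfigForUniquePathsSearch(g);
+//   OverlapGraphDijkstra dijkstra(g, conf);
+//   dijkstra.Run();
+//   map<size_t, vector<size_t> > paths = dijkstra.Paths();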
+class OverlapGraphDijkstra{
+ OverlapGraph &g_;
+ set<size_t> visited, queue;
+ map<size_t, vector<size_t> > paths;
+
+ OGD_Config& config_;
+
+public:
+ OverlapGraphDijkstra(OverlapGraph &g, OGD_Config &config) : g_(g), config_(config){
+
+ }
+
+ void Run(){
+
+// cout << "Dijkstra run" << endl;
+// cout << "Start vertices search" << endl;
+ auto start_vertices = config_.start_vert_definer->GetStartVertices();
+
+// cout << "Processing of start vertices" << endl;
+ for(auto new_start_vertex = start_vertices.begin(); new_start_vertex != start_vertices.end();
+ new_start_vertex++){
+ if(visited.find(*new_start_vertex) == visited.end()){
+ paths[*new_start_vertex].push_back(*new_start_vertex);
+ config_.vertex_processer->ProcessVertex(*new_start_vertex, visited, queue, paths);
+ }
+ }
+
+// cout << "Dijkstra cycle starts" << endl;
+ while(!config_.stop_condition->IsStop(visited, queue, paths)){
+ size_t current_vertex = config_.new_vert_definer->GetNewVertex(visited, queue, paths);
+ config_.vertex_processer->ProcessVertex(current_vertex, visited, queue, paths);
+ }
+// cout << "Dijkstra cycle ends" << endl;
+ }
+
+ map<size_t, vector<size_t> > Paths(){
+// cout << "Paths:" << endl;
+// for(auto it = paths.begin(); it != paths.end(); it++){
+// cout << it->first << ". ";
+// auto path = it->second;
+// for(auto e = path.begin(); e != path.end(); e++)
+// cout << *e << " ";
+// cout << endl;
+// }
+ return paths;
+ }
+
+ const OverlapGraph & GetGraph() { return g_; }
+
+ ~OverlapGraphDijkstra(){
+ }
+};
+
+//---------------------------------------------------------------------------------
+
+vector<vector<size_t> > DeleteRedundantEndsFromPaths(OverlapGraph &g, vector<vector<size_t> > paths){
+
+ if(paths.size() == 0)
+ return paths;
+
+ vector<size_t> starts, ends;
+ vector<bool> is_nes_start, is_nes_end;
+ for(auto p = paths.begin(); p != paths.end(); p++){
+
+ size_t cur_start = *(p->begin());
+ size_t cur_end = *(p->end() - 1);
+
+ starts.push_back(cur_start);
+ ends.push_back(cur_end);
+
+ is_nes_start.push_back(true);
+
+ if(g.RCIdByInd(cur_start) == cur_end)
+ is_nes_end.push_back(false);
+ else
+ is_nes_end.push_back(true);
+ }
+
+ size_t num_paths = paths.size();
+ for(size_t i = 0; i < num_paths; i++){
+ size_t cur_start = starts[i], cur_end = ends[i];
+ for(size_t j = i + 1; j < num_paths; j++){
+ size_t neig_start = starts[j], neig_end = ends[j];
+ if(g.RCIdByInd(cur_start) == neig_start || g.RCIdByInd(cur_start) == neig_end)
+ is_nes_start[j] = false;
+
+ if(g.RCIdByInd(cur_end) == neig_start || g.RCIdByInd(cur_end) == neig_end)
+ is_nes_end[j] = false;
+ }
+ }
+
+ vector<vector<size_t> > corrected_paths;
+ corrected_paths.push_back(paths[0]);
+
+ for(size_t i = 1; i < num_paths; i++){
+ if(!is_nes_start[i] || !is_nes_end[i]){
+ if(paths[i].size() > 1){
+ if(paths[i].size() == 2 && !is_nes_start[i] && !is_nes_end[i]){
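+ // both ends are redundant and the path has no inner vertices: drop it entirely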
+ }
+ else{
+ vector<size_t> tmp;
+ if(is_nes_start[i])
+ tmp.push_back(paths[i][0]);
+ for(size_t j = 1; j < paths[i].size() - 1; j++)
+ tmp.push_back(paths[i][j]);
+ if(is_nes_end[i])
+ tmp.push_back(paths[i][paths[i].size() - 1]);
+ corrected_paths.push_back(tmp);
+ }
+ }
+ }
+ else
+ corrected_paths.push_back(paths[i]);
+ }
+
+ return corrected_paths;
+}
+
+class UniquePathsSearcher{
+ OverlapGraph &g_;
+
+ map<size_t, vector<size_t> > sh_paths;
+
+ vector<vector<size_t> > DefineLongestPathsFromMap(){
+ vector<vector<size_t> > res;
+ set<size_t> used;
+ while(used.size() < sh_paths.size()){
+ size_t longest_path_size = 0;
+ vector<size_t> longest_path;
+ for(auto p = sh_paths.begin(); p != sh_paths.end(); p++){
+ if(p->second.size() > longest_path_size && used.find(p->first) == used.end()){
+ longest_path = p->second;
+ longest_path_size = longest_path.size();
+ }
+ }
+
+ for(auto v = longest_path.begin(); v != longest_path.end(); v++)
+ if(sh_paths.find(*v) != sh_paths.end())
+ used.insert(*v);
+
+ res.push_back(longest_path);
+ }
+
+ return res;
+ }
+
+public:
+ UniquePathsSearcher(OverlapGraph &g) : g_(g) {}
+
+ vector<vector<size_t> > FindLongPaths(){
+
+ OGD_Config conf = CreateConfigForUniquePathsSearch(g_);
+ OverlapGraphDijkstra dijkstra(g_, conf);
+ dijkstra.Run();
+ sh_paths = dijkstra.Paths();
+
+ auto long_paths = DefineLongestPathsFromMap();
+
+ auto corrected_long_paths = DeleteRedundantEndsFromPaths(g_, long_paths);
+
+// cout << "Long paths" << endl;
+// for(auto p = corrected_long_paths.begin(); p != corrected_long_paths.end(); p++){
+// cout << "New path. ";
+// for(auto e = p->begin(); e != p->end(); e++)
+// cout << *e << " ";
+// cout << endl;
+// }
+
+ return corrected_long_paths;
+ }
+};
+
+class OverlapPathSearcher{
+ OverlapGraph &g_;
+public:
+ OverlapPathSearcher(OverlapGraph &g) : g_(g) {}
+
+ vector<size_t> GetPathAlternativeToPath(size_t start, size_t end, vector<size_t> path){
+ vector<size_t> res;
+
+ VERIFY(path.size() != 0);
+ VERIFY(path[0] == start && path[path.size() - 1] == end);
+
+ OGD_Config conf = CreateConfigForAlternativePathSearch(g_, path);
+ OverlapGraphDijkstra dijkstra(g_, conf);
+ dijkstra.Run();
+ map<size_t, vector<size_t> > short_paths = dijkstra.Paths();
+
+ if(short_paths.find(end) != short_paths.end()){
+ res = short_paths[end];
+ }
+
+ return res;
+ }
+
+ vector<vector<size_t> > GetAlternativePaths(size_t v1, size_t v2){
+ vector<vector<size_t> > paths;
+
+// cout << "Outgoing count - " << g_.OutgoingVerticesCount(v1) << " and incoming - " << g_.IncomingVerticesCount(v2) << endl;
+ if(g_.OutgoingVerticesCount(v1) <= 1 || g_.IncomingVerticesCount(v2) <= 1)
+ return paths;
+
+ OGD_Config conf = CreateContigForDijkstraFromOneVertex(g_, v1, v2);
+ OverlapGraphDijkstra dijkstra(g_, conf);
+ dijkstra.Run();
+ map<size_t, vector<size_t> > sh_paths = dijkstra.Paths();
+
+ if(sh_paths.find(v2) == sh_paths.end()){
+// INFO("Path from " + ToString(v1) + " to " + ToString(v2) + " isn't found");
+ return paths;
+ }
+ else{
+ auto fst_path = sh_paths[v2];
+ paths.push_back(fst_path);
+
+ vector<size_t> snd_path = GetPathAlternativeToPath(v1, v2, fst_path);
+ if(snd_path.size() != 0){
+ VERIFY(snd_path[0] == v1 && snd_path[snd_path.size() - 1] == v2);
+ paths.push_back(snd_path);
+ }
+ }
+
+ return paths;
+ }
+};
+
+//---------------------------------------------------------------------------------
+
+void dijkstra_for_overlap_graph_test(){
+ OverlapGraph g;
+// g.AddNeighVertices(1, 2, 1);
+ g.AddNeighVertices(1, 3, 1);
+ g.AddNeighVertices(1, 4, 1);
+
+ g.AddNeighVertices(2, 3, 1);
+ g.AddNeighVertices(4, 3, 1);
+
+ g.AddNeighVertices(3, 4, 1);
+
+// OverlapPathSearcher path_searcher(g);
+// vector<int> path1;
+// path1.push_back(1);
+// path1.push_back(4);
+
+// auto path2 = path_searcher.GetPathAlternativeToPath(1, 4, path1);
+// for(auto v = path2.begin(); v != path2.end(); v++)
+// cout << *v << " ";
+// cout << endl;
+
+ UniquePathsSearcher path_searcher2(g);
+ auto paths = path_searcher2.FindLongPaths();
+
+// for(auto p = paths.begin(); p != paths.end(); p++){
+// cout << "New path. ";
+// for(auto v = p->begin(); v != p->end(); v++)
+// cout << *v << " ";
+// cout << endl;
+// }
+
+// auto paths_1_3 = path_searcher.GetAlternativePaths(1, 3);
+// for(auto p = paths_1_3.begin(); p != paths_1_3.end(); p++){
+// cout << "New path. ";
+// for(auto v = p->begin(); v != p->end(); v++)
+// cout << *v << " ";
+// cout << endl;
+// }
+}
+
+//---------------------------------------------------------------------------------
+
+class OverlapGraphCorrector{
+public:
+ virtual size_t Correct(OverlapGraph & g) = 0;
+ virtual ~OverlapGraphCorrector() { }
+};
+
+
+class TipClipperCorrector : public OverlapGraphCorrector{
+public:
+ size_t Correct(OverlapGraph & g){
+ auto edges = g.Edges();
+
+ size_t deleted_edges = 0;
+ for(auto e = edges.begin(); e != edges.end(); e++){
+ auto start = e->first;
+ auto end = e->second;
+
+ if(g.IncomingVerticesCount(start) == 0 && g.OutgoingVerticesCount(start) == 1 &&
+ g.IncomingVerticesCount(end) > 1 /*&& g.OutgoingVerticesCount(end) > 0*/){
+// cout << "Tip - " << start << " " << end << endl;
+ g.DeleteVertex(start);
+ deleted_edges++;
+ }
+ if(g.OutgoingVerticesCount(end) == 0 && g.OutgoingVerticesCount(start) > 1 &&
+ /*g.IncomingVerticesCount(start) > 0 &&*/ g.IncomingVerticesCount(end) == 1){
+// cout << "Tip - " << start << " " << end << endl;
+ g.DeleteVertex(end);
+ deleted_edges++;
+ }
+ }
+
+ return deleted_edges;
+ }
+};
+
+class TransitiveReductionCorrector : public OverlapGraphCorrector{
+public:
+ size_t Correct(OverlapGraph & g){
+ auto edges = g.Edges();
+ OverlapPathSearcher ps(g);
+
+ size_t res = 0;
+
+ for(auto e = edges.begin(); e != edges.end(); e++){
+ auto start = e->first;
+ auto end = e->second;
+
+ if(g.IsEdgeExist(start, end)){
+
+ vector<size_t> path; path.push_back(start); path.push_back(end);
+ vector<size_t> alt_path = ps.GetPathAlternativeToPath(start, end, path);
+
+ if(alt_path.size() > 2){
+ g.DeleteEdge(start, end);
+ res++;
+ }
+ }
+ }
+
+ return res;
+ }
+};
+
+
+class BulgeRemoverCorrector : public OverlapGraphCorrector{
+public:
+ size_t Correct(OverlapGraph & g){
+ auto vertices = g.Vertices();
+ OverlapPathSearcher ps(g);
+
+ size_t res = 0;
+
+ for(auto v = vertices.begin(); v != vertices.end(); v++)
+ for(auto w = vertices.begin(); w != vertices.end(); w++)
+ if(*v != *w && g.IsVertexExist(*v) && g.IsVertexExist(*w)){
+
+ auto paths = ps.GetAlternativePaths(*v, *w);
+
+ if(paths.size() > 1){
+
+ vector<size_t> path1 = paths[0], path2 = paths[1];
+
+ size_t w1 = 0, w2 = 0;
+
+ for(size_t i = 0; i < path1.size() - 1; i++){
+ w1 += g.GetWeightOf(path1[i], path1[i + 1]);
+ }
+
+ for(size_t i = 0; i < path2.size() - 1; i++){
+ w2 += g.GetWeightOf(path2[i], path2[i + 1]);
+ }
+
+ vector<size_t> deleted_path;
+ if(w1 > w2)
+ deleted_path = path2;
+ else
+ deleted_path = path1;
+
+ // deletion of edges along the removed bulge side
+ for(size_t i = 0; i < deleted_path.size() - 1; i++)
+ g.DeleteEdge(deleted_path[i], deleted_path[i + 1]);
+
+ // deletion of inner vertices of bulge
+ for(size_t i = 1; i < deleted_path.size() - 1; i++)
+ g.DeleteVertex(deleted_path[i]);
+
+ res++;
+ }
+ }
+ return res;
+ }
+};
+
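+// Alternates tip clipping and transitive reduction for up to tc_num_iter rounds,
+// stopping early once neither corrector removes anything, and then runs the bulge
+// remover for up to br_num_iter rounds.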
+void SimplifyOverlapGraph(OverlapGraph &overlap_graph, size_t tc_num_iter, size_t br_num_iter){
+
+ size_t tc_res = 1, tr_res = 1;
+ for(size_t i = 0; (i < tc_num_iter && (tc_res > 0 || tr_res > 0)); i++){
+ TipClipperCorrector tc_corr;
+ tc_res = tc_corr.Correct(overlap_graph);
+
+ TransitiveReductionCorrector tr_corr;
+ tr_res = tr_corr.Correct(overlap_graph);
+
+ INFO(ToString(tc_res) + " tips and " + ToString(tr_res) + " transitive edges were deleted in overlap graph");
+ }
+
+ INFO("Bulge remover starts");
+ BulgeRemoverCorrector br_corr;
+ size_t num_bulges = 1;
+ for(size_t i = 0; (i < br_num_iter && num_bulges > 0); i++){
+ num_bulges = br_corr.Correct(overlap_graph);
+ INFO(ToString(num_bulges) + " bulges were deleted in overlap graph");
+ }
+}
+
+}
diff --git a/src/projects/dipspades/dipspades.hpp b/src/projects/dipspades/dipspades.hpp
new file mode 100644
index 0000000..08c3ad9
--- /dev/null
+++ b/src/projects/dipspades/dipspades.hpp
@@ -0,0 +1,265 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "io/reads_io/splitting_wrapper.hpp"
+#include "algorithms/graph_construction.hpp"
+#include "pipeline/stage.hpp"
+
+#include "dipspades_config.hpp"
+
+#include "polymorphic_bulge_remover/polymorphic_bulge_remover.hpp"
+#include "consensus_contigs_constructor/consensus_contigs_constructor.hpp"
+#include "haplotype_assembly/haplotype_assembler.hpp"
+#include "kmer_gluing/equal_sequence_gluer.hpp"
+
+using namespace debruijn_graph;
+using namespace spades;
+
+namespace dipspades {
+
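+// Builds the de Bruijn graph with coverage from the haplocontig files listed in the
+// config: every valid file is wrapped into a splitting read stream and the graph is
+// constructed in extension mode with early tip clipping disabled and perfect loops kept.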
+void construct_graph_from_contigs(debruijn_graph::conj_graph_pack &graph_pack){
+ auto fnames = GetAllLinesFromFile(dsp_cfg::get().io.haplocontigs);
+ ReadStreamList<SingleRead> streams;
+ for(auto fname = fnames.begin(); fname != fnames.end(); fname++)
+ if(fname_valid(*fname)){
+ INFO("Addition of contigs from " << *fname);
+ streams.push_back(io::SplittingWrap(EasyStream(*fname, true)));
+ }
+
+ INFO("Construction of the de Bruijn graph with K=" << dsp_cfg::get().bp.K);
+ config::debruijn_config::construction params;
+ params.con_mode = config::construction_mode::extention;
+ params.early_tc.enable = false;
+ params.early_tc.length_bound = 10;
+ params.keep_perfect_loops = true;
+ params.read_buffer_size = dsp_cfg::get().bp.read_buffer_size;
+
+ ConstructGraphWithCoverage(params,
+ streams,
+ graph_pack.g,
+ graph_pack.index,
+ graph_pack.flanking_cov);
+}
+
+
+class DipSPAdesStorage{
+public:
+ BaseHistogram<size_t> bulge_len_histogram;
+ ContigStoragePtr default_storage;
+ ContigStoragePtr composite_storage;
+ CorrectionResult redundancy_map;
+};
+
+
+class DipSPAdes : public CompositeStage<DipSPAdesStorage> {
+ DipSPAdesStorage dsp_params_;
+public:
+ DipSPAdes() : CompositeStage<DipSPAdesStorage>("dipSPAdes", "dipspades") { }
+
+ void load(debruijn_graph::conj_graph_pack&,
+ const std::string &,
+ const char*) { }
+
+ void save(const debruijn_graph::conj_graph_pack&,
+ const std::string &,
+ const char*) const { }
+
+ virtual ~DipSPAdes() { }
+};
+
+class ContigGraphConstructionStage : public DipSPAdes::Phase {
+public:
+ ContigGraphConstructionStage() :
+ DipSPAdes::Phase("Construction of graph from contigs", "contig_graph_construction") { }
+
+ void run(debruijn_graph::conj_graph_pack &graph_pack, const char*) {
+ construct_graph_from_contigs(graph_pack);
+ }
+
+ void load(debruijn_graph::conj_graph_pack& gp,
+ const std::string &load_from,
+ const char* prefix) {
+ std::string p = path::append_path(load_from, prefix == NULL ? id() : prefix);
+ INFO("Loading current state from " << p);
+ debruijn_graph::graphio::ScanAll(p, gp, false);
+
+ }
+
+ void save(const debruijn_graph::conj_graph_pack& gp,
+ const std::string & save_to,
+ const char* prefix) const {
+ std::string p = path::append_path(save_to, prefix == NULL ? id() : prefix);
+ INFO("Saving current state to " << p);
+ debruijn_graph::graphio::PrintAll(p, gp);
+ }
+
+ virtual ~ContigGraphConstructionStage() { }
+};
+
+class PolymorphicBulgeRemoverStage : public DipSPAdes::Phase {
+public:
+ PolymorphicBulgeRemoverStage() :
+ DipSPAdes::Phase("Polymorphic bulge remover", "polymorphic_br") { }
+
+ void run(debruijn_graph::conj_graph_pack &graph_pack, const char*){
+ if(dsp_cfg::get().pbr.enabled){
+ PolymorphicBulgeRemover(graph_pack, storage().bulge_len_histogram).Run();
+ INFO("Consensus graph was constructed");
+ }
+ }
+
+ void load(debruijn_graph::conj_graph_pack& gp,
+ const std::string &load_from,
+ const char* prefix) {
+ std::string p = path::append_path(load_from, prefix == NULL ? id() : prefix);
+ INFO("Loading current state from " << p);
+ debruijn_graph::graphio::ScanAll(p, gp, false);
+ INFO("Loading histogram of bulge length");
+ INFO("loading from " << p + ".hist");
+ storage().bulge_len_histogram.LoadFrom(p + ".hist");
+ }
+
+ void save(const debruijn_graph::conj_graph_pack& gp,
+ const std::string & save_to,
+ const char* prefix) const {
+ std::string p = path::append_path(save_to, prefix == NULL ? id() : prefix);
+ INFO("Saving current state to " << p);
+ debruijn_graph::graphio::PrintAll(p, gp);
+ storage().bulge_len_histogram.SaveToFile(p + ".hist");
+ }
+
+ virtual ~PolymorphicBulgeRemoverStage() { }
+};
+
+class EqualKmerGluingStage : public DipSPAdes::Phase {
+public:
+ EqualKmerGluingStage() :
+ DipSPAdes::Phase("Equal k-mer gluing", "kmer_gluer") { }
+
+ void run(debruijn_graph::conj_graph_pack &graph_pack, const char*) {
+ INFO("Glueing equal kmers starts");
+ EqualSequencesGluer<Graph>(graph_pack.g, graph_pack.index).GlueEqualKmers();
+ INFO("Glueing equal kmers ends");
+ }
+
+ void load(debruijn_graph::conj_graph_pack& gp,
+ const std::string &load_from,
+ const char* prefix) {
+ std::string p = path::append_path(load_from, prefix == NULL ? id() : prefix);
+ INFO("Loading current state from " << p);
+ debruijn_graph::graphio::ScanAll(p, gp, false);
+ INFO("Loading histogram of bulge length");
+ INFO("loading from " << p + ".hist");
+ storage().bulge_len_histogram.LoadFrom(p + ".hist");
+ }
+
+ void save(const debruijn_graph::conj_graph_pack& gp,
+ const std::string & save_to,
+ const char* prefix) const {
+ std::string p = path::append_path(save_to, prefix == NULL ? id() : prefix);
+ INFO("Saving current state to " << p);
+ debruijn_graph::graphio::PrintAll(p, gp);
+ storage().bulge_len_histogram.SaveToFile(p + ".hist");
+ }
+
+ virtual ~EqualKmerGluingStage() { }
+};
+
+class ConsensusConstructionStage : public DipSPAdes::Phase {
+public:
+ ConsensusConstructionStage() :
+ DipSPAdes::Phase("Consensus contigs construction", "consensus_construction") { }
+
+ void run(debruijn_graph::conj_graph_pack &graph_pack, const char*){
+ if(dsp_cfg::get().cc.enabled){
+ ConsensusContigsConstructor consensus_constructor(graph_pack, storage().bulge_len_histogram);
+ consensus_constructor.Run();
+ storage().composite_storage = consensus_constructor.CompositeContigsStorage();
+ storage().default_storage = consensus_constructor.DefaultContigsStorage();
+ storage().redundancy_map = consensus_constructor.RedundancyResult();
+ }
+ }
+
+ void load(debruijn_graph::conj_graph_pack& gp,
+ const std::string &load_from,
+ const char* prefix) {
+ std::string p = path::append_path(load_from, prefix == NULL ? id() : prefix);
+ INFO("Loading current state from " << p);
+ debruijn_graph::graphio::ScanAll(p, gp, false);
+ }
+
+ void save(const debruijn_graph::conj_graph_pack& gp,
+ const std::string & save_to,
+ const char* prefix) const {
+ std::string p = path::append_path(save_to, prefix == NULL ? id() : prefix);
+ INFO("Saving current state to " << p);
+ debruijn_graph::graphio::PrintAll(p, gp);
+ storage().bulge_len_histogram.SaveToFile(p + ".hist");
+ }
+
+ virtual ~ConsensusConstructionStage() { }
+};
+
+class HaplotypeAssemblyStage : public DipSPAdes::Phase {
+public:
+ HaplotypeAssemblyStage() :
+ DipSPAdes::Phase("Haplotype assembly", "haplotype_assembly") { }
+
+ void run(debruijn_graph::conj_graph_pack &graph_pack, const char*) {
+ if(!storage().composite_storage || !storage().default_storage)
+ return;
+ if(storage().composite_storage->Size() == 0 || storage().default_storage->Size() == 0)
+ return;
+ INFO("Diploid graph construction");
+ conj_graph_pack double_graph_pack(graph_pack.k_value, dsp_cfg::get().io.tmp_dir,
+ dsp_cfg::get().io.num_libraries, "");
+ construct_graph_from_contigs(double_graph_pack);
+ HaplotypeAssembler(graph_pack, double_graph_pack, storage().default_storage,
+ storage().composite_storage, storage().redundancy_map).Run();
+ }
+
+ void load(debruijn_graph::conj_graph_pack&,
+ const std::string &,
+ const char*) { }
+
+ void save(const debruijn_graph::conj_graph_pack&,
+ const std::string &,
+ const char*) const { }
+
+ virtual ~HaplotypeAssemblyStage() { }
+};
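+
+// Entry point of the dipSPAdes pipeline: contig graph construction, polymorphic bulge
+// removal, equal k-mer gluing and consensus construction, followed by haplotype
+// assembly when it is enabled in the config.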
+void run_dipspades() {
+ INFO("dipSPAdes started");
+
+ debruijn_graph::conj_graph_pack conj_gp(
+ dsp_cfg::get().bp.K,
+ dsp_cfg::get().io.tmp_dir,
+ dsp_cfg::get().io.num_libraries,
+ "", // reference genome
+ 1); // flanking range
+
+ conj_gp.kmer_mapper.Attach();
+
+ StageManager DS_Manager ( {dsp_cfg::get().rp.developer_mode,
+ dsp_cfg::get().io.saves,
+ dsp_cfg::get().io.output_saves} );
+ auto ds_phase = new DipSPAdes();
+ ds_phase -> add(new ContigGraphConstructionStage()) ->
+ add(new PolymorphicBulgeRemoverStage()) ->
+ add(new EqualKmerGluingStage()) ->
+ add(new ConsensusConstructionStage());
+ if(dsp_cfg::get().ha.ha_enabled) {
+ ds_phase->add(new HaplotypeAssemblyStage());
+ }
+
+ DS_Manager.add(ds_phase);
+ DS_Manager.run(conj_gp, dsp_cfg::get().rp.entry_point.c_str());
+ INFO("dipSPAdes finished");
+}
+
+}
diff --git a/src/projects/dipspades/dipspades_config.cpp b/src/projects/dipspades/dipspades_config.cpp
new file mode 100644
index 0000000..88545e8
--- /dev/null
+++ b/src/projects/dipspades/dipspades_config.cpp
@@ -0,0 +1,132 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#include "dipspades_config.hpp"
+#include "pipeline/config_common.hpp"
+#include "utils/files_utils.hpp"
+#include "dev_support/path_helper.hpp"
+
+using namespace dipspades;
+
+void load(dipspades_config::base_params &bp,
+ boost::property_tree::ptree const &pt, bool) {
+ using config_common::load;
+ load(bp.K , pt, "K" );
+ load(bp.max_memory , pt, "max_memory" );
+ load(bp.max_threads , pt, "max_threads" );
+ load(bp.read_buffer_size , pt, "read_buffer_size" );
+}
+
+void load(dipspades_config::io_params &io,
+ boost::property_tree::ptree const &pt, bool) {
+ using config_common::load;
+ load(io.haplocontigs , pt, "haplocontigs" );
+ io.num_libraries = GetAllLinesFromFile(io.haplocontigs).size();
+
+ load(io.log_filename , pt, "log_filename" );
+
+ load(io.output_base , pt, "output_base" );
+ if (io.output_base[io.output_base.length() - 1] != '/')
+ io.output_base += '/';
+
+ load(io.output_dir , pt, "output_dir" );
+ if (io.output_dir[io.output_dir.length() - 1] != '/')
+ io.output_dir += '/';
+
+ load(io.tmp_dir , pt, "tmp_dir" );
+ if (io.tmp_dir[io.tmp_dir.length() - 1] != '/')
+ io.tmp_dir += '/';
+
+ load(io.saves , pt, "saves" );
+ if(io.saves[io.saves.length() - 1] != '/')
+ io.saves += '/';
+}
+
+void load(dipspades_config::run_params &rp,
+ boost::property_tree::ptree const &pt, bool) {
+ using config_common::load;
+ load(rp.entry_point , pt, "entry_point" );
+ load(rp.developer_mode , pt, "developer_mode");
+}
+
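+// In developer mode the output directory is extended with a launch-time suffix and a
+// saves/ subdirectory; otherwise output_dir is used as-is and saves go directly into
+// it. A relative tmp_dir is placed under output_dir in both cases.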
+void edit_io_params(bool developer_mode, dipspades_config::io_params &io){
+ if(developer_mode){
+ io.dataset_name = io.output_dir.substr(0, io.output_dir.length() - 1);
+ io.output_dir = io.output_base + io.output_dir + "/";
+ io.output_root = io.output_dir;
+ io.output_suffix = path::MakeLaunchTimeDirName() + "/";
+ io.output_dir = io.output_root + io.output_suffix;
+ io.output_saves = io.output_dir + "saves/";
+// io.load_from = io.output_root + io.load_from;
+ if (io.tmp_dir[0] != '/') { // relative path
+ io.tmp_dir = io.output_dir + io.tmp_dir;
+ }
+ return;
+ }
+
+ // no developer mode
+ io.dataset_name = io.output_dir;
+ io.output_root = io.output_dir;
+ io.output_suffix = "";
+ io.output_base = "";
+ io.output_saves = io.output_dir;
+ io.saves = "";
+ if (io.tmp_dir[0] != '/') { // relative path
+ io.tmp_dir = io.output_dir + io.tmp_dir;
+ }
+}
+
+inline void load(dipspades_config::polymorphic_br &pbr,
+ boost::property_tree::ptree const& pt, bool){
+ using config_common::load;
+ load(pbr.enabled , pt, "enabled" );
+ load(pbr.rel_bulge_length , pt, "rel_bulge_length" );
+ load(pbr.rel_bulge_align , pt, "rel_bulge_align" );
+ load(pbr.paired_vert_abs_threshold , pt, "paired_vert_abs_threshold" );
+ load(pbr.paired_vert_rel_threshold , pt, "paired_vert_rel_threshold" );
+ load(pbr.max_bulge_nucls_len , pt, "max_bulge_nucls_len" );
+ load(pbr.max_neigh_number , pt, "max_neigh_number" );
+ load(pbr.num_iters_lbr , pt, "num_iters_lbr" );
+}
+
+inline void load(dipspades_config::consensus_constructor &cc,
+ boost::property_tree::ptree const& pt, bool /*complete*/){
+ using config_common::load;
+ load(cc.enabled , pt, "enabled" );
+ load(cc.bulge_len_quantile , pt, "bulge_len_quantile" );
+ load(cc.tails_lie_on_bulges , pt, "tails_lie_on_bulges" );
+ load(cc.estimate_tails , pt, "estimate_tails" );
+ load(cc.align_bulge_sides , pt, "align_bulge_sides" );
+ load(cc.min_overlap_size , pt, "min_overlap_size" );
+ load(cc.min_lcs_size , pt, "min_lcs_size" );
+ load(cc.max_loop_length , pt, "max_loop_length" );
+}
+
+inline void load(dipspades_config::haplotype_assembly &ha,
+ boost::property_tree::ptree const& pt, bool /*complete*/){
+ using config_common::load;
+ load(ha.ha_enabled , pt, "ha_enabled" );
+}
+
+void load(dipspades_config &cfg,
+ boost::property_tree::ptree const &pt, bool complete){
+ using config_common::load;
+ load(cfg.bp , pt, "bp", complete);
+ load(cfg.io , pt, "io", complete);
+ load(cfg.rp , pt, "rp", complete);
+ load(cfg.cc , pt, "cc", complete);
+ load(cfg.ha , pt, "ha", complete);
+ load(cfg.pbr , pt, "pbr", complete);
+}
+
+void load(dipspades_config &cfg, std::string const &filename) {
+ boost::property_tree::ptree pt;
+ boost::property_tree::read_info(filename, pt);
+ load(cfg, pt, true);
+ edit_io_params(cfg.rp.developer_mode, cfg.io);
+
+}
diff --git a/src/projects/dipspades/dipspades_config.hpp b/src/projects/dipspades/dipspades_config.hpp
new file mode 100644
index 0000000..12351e0
--- /dev/null
+++ b/src/projects/dipspades/dipspades_config.hpp
@@ -0,0 +1,82 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "pipeline/config_singl.hpp"
+#include <boost/property_tree/ptree_fwd.hpp>
+
+struct dipspades_config {
+
+ struct base_params {
+ size_t K;
+ size_t max_threads;
+ size_t max_memory;
+ size_t read_buffer_size;
+ };
+
+ struct io_params {
+ std::string haplocontigs;
+ size_t num_libraries;
+ std::string log_filename;
+
+ std::string output_base;
+ std::string output_root;
+ std::string output_dir;
+ std::string tmp_dir;
+ std::string output_suffix;
+ std::string output_saves;
+
+ std::string dataset_name;
+
+ std::string saves;
+ };
+
+ struct run_params {
+ std::string entry_point;
+ bool developer_mode;
+ };
+
+ struct polymorphic_br {
+ bool enabled;
+ double rel_bulge_length;
+ double rel_bulge_align;
+ size_t paired_vert_abs_threshold;
+ double paired_vert_rel_threshold;
+ size_t max_bulge_nucls_len;
+ size_t max_neigh_number;
+ size_t num_iters_lbr;
+ size_t num_iters_hbr;
+ };
+
+ struct consensus_constructor {
+ bool enabled;
+ double bulge_len_quantile;
+ bool tails_lie_on_bulges;
+ bool align_bulge_sides;
+ bool estimate_tails;
+ size_t min_overlap_size;
+ size_t min_lcs_size;
+ size_t max_loop_length;
+ };
+
+ struct haplotype_assembly {
+ bool ha_enabled;
+ };
+
+ base_params bp;
+ io_params io;
+ run_params rp;
+ polymorphic_br pbr;
+ consensus_constructor cc;
+ haplotype_assembly ha;
+};
+
+void load(dipspades_config &cfg, std::string const &filename);
+
+typedef config_common::config<dipspades_config> dsp_cfg;
+
diff --git a/src/projects/dipspades/haplotype_assembly/conservative_regions_searcher.hpp b/src/projects/dipspades/haplotype_assembly/conservative_regions_searcher.hpp
new file mode 100644
index 0000000..c5c5f91
--- /dev/null
+++ b/src/projects/dipspades/haplotype_assembly/conservative_regions_searcher.hpp
@@ -0,0 +1,174 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "assembly_graph/graph_alignment/sequence_mapper.hpp"
+#include "contig_separation_utils.hpp"
+
+using namespace debruijn_graph;
+
+namespace dipspades {
+
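+// Maps contigs that were labelled as coming from different haplotypes back onto the
+// graph, stores edges covered by contigs with more than one label ("two-colored"
+// edges) as conservative regions, and writes both the conservative and the possibly
+// conservative regions from the storage to FASTA files in the output directory.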
+class ConservativeRegionsSearcher{
+ conj_graph_pack & dbl_gp_;
+ ContigStoragePtr storage_;
+ SignedLabels signed_labels_;
+ ConservativeRegionStorage cons_reg_storage_;
+
+ NewExtendedSequenceMapper<conj_graph_pack::graph_t, conj_graph_pack::index_t> mapper_;
+ map<int, MappingPath<EdgeId> > contig_map_path_;
+
+ typedef map<int, vector<int> > diff_labeled_contigs;
+ diff_labeled_contigs map_of_diff_contigs_;
+
+ MappingPath<EdgeId> GetMappingPath(int contig){
+ if(contig_map_path_.find(contig) == contig_map_path_.end()){
+ auto seq = storage_->GetContigById(contig)->seq();
+ MappingPath<EdgeId> map_path = mapper_.MapSequence(seq);
+ contig_map_path_[contig] = map_path;
+ }
+ return contig_map_path_[contig];
+ }
+
+ void ComputeDifferentLabeledContigs(){
+ for(auto it = signed_labels_.begin(); it != signed_labels_.end(); it++)
+ if(it->second == from_different){
+ int contig1 = it->first.first, contig2 = it->first.second;
+ map_of_diff_contigs_[contig1].push_back(contig2);
+ }
+ }
+
+ vector<EdgeId> GetConservativeEdges(vector<MappingPath<EdgeId> > paths,
+ vector<int> labels){
+
+ map<EdgeId, set<int> > edge_labels;
+ for(size_t i = 0; i < paths.size(); i++){
+ MappingPath<EdgeId> path = paths[i];
+ int label = labels[i];
+
+ for(size_t j = 0; j < path.size(); j++)
+ edge_labels[path[j].first].insert(label);
+ }
+
+// for(auto it = edge_labels.begin(); it != edge_labels.end(); it++){
+// cout << it->first << ". Labels ";
+// PrintSet<int>(cout, it->second);
+// }
+
+ vector<EdgeId> cons_edges;
+ for(auto it = edge_labels.begin(); it != edge_labels.end(); it++)
+ if(it->second.size() > 1)
+ cons_edges.push_back(it->first);
+
+ return cons_edges;
+ }
+
+ vector<int> GatherLabelsForSeparatedContigs(vector<int> separated_contigs){
+ vector<int> labels;
+ labels.push_back(1);
+ for(auto c = separated_contigs.begin(); c != separated_contigs.end(); c++){
+ labels.push_back(2);
+ }
+ return labels;
+ }
+
+ vector<MappingPath<EdgeId> > GatherMappingPathForContigs(vector<int> contigs){
+ vector<MappingPath<EdgeId> > map_paths;
+ for(auto c = contigs.begin(); c != contigs.end(); c++)
+ map_paths.push_back(GetMappingPath(*c));
+ return map_paths;
+ }
+
+ void FindTwoColoredEdges(){
+
+ ComputeDifferentLabeledContigs();
+
+ for(auto it = map_of_diff_contigs_.begin(); it != map_of_diff_contigs_.end(); it++){
+
+ auto contig = it->first;
+ auto separated_contigs = it->second;
+
+// cout << contig << ". Separated set - ";
+// PrintVector<int>(cout, separated_contigs);
+
+ auto labels = GatherLabelsForSeparatedContigs(separated_contigs);
+
+ // gather all mapping paths
+ auto contig_map_path = GetMappingPath(contig);
+ vector<MappingPath<EdgeId> > map_paths = GatherMappingPathForContigs(separated_contigs);
+ map_paths.insert(map_paths.begin(), contig_map_path);
+
+ // find two or more colored edges
+ auto cur_cons_edges = GetConservativeEdges(map_paths, labels);
+
+ // add them to the storage
+ for(auto e = cur_cons_edges.begin(); e != cur_cons_edges.end(); e++)
+ cons_reg_storage_.AddConservativeRegion(dbl_gp_.g.EdgeNucls(*e));
+ }
+ }
+
+ void WriteConservativeRegionsStorageToFile(string filename, cons_regions_iterator iter_begin,
+ cons_regions_iterator iter_end){
+ ofstream fout(filename);
+ int cnt = 1;
+ for(auto it = iter_begin; it != iter_end; it++){
+ Sequence curr_seq = *it;
+ fout << ">" << cnt << "_conservative_region_length_" << curr_seq.size() << endl;
+ fout << curr_seq.str() << endl;
+ cnt++;
+ }
+ }
+
+ size_t ComputeSummaryLengthOfRegionInStorage(cons_regions_iterator iter_begin,
+ cons_regions_iterator iter_end){
+ size_t summary_cons_reg_length = 0;
+ for(auto it = iter_begin; it != iter_end; it++){
+ summary_cons_reg_length += it->size();
+ }
+ return summary_cons_reg_length;
+ }
+
+public:
+ ConservativeRegionsSearcher(conj_graph_pack &dbl_gp, ContigStoragePtr storage,
+ SignedLabels signed_labels, ConservativeRegionStorage cons_reg_storage) :
+ dbl_gp_(dbl_gp),
+ storage_(storage),
+ signed_labels_(signed_labels),
+ cons_reg_storage_(cons_reg_storage),
+ mapper_(dbl_gp_.g, dbl_gp_.index,
+ dbl_gp_.kmer_mapper) { }
+
+ void Search(){
+ FindTwoColoredEdges();
+ size_t cons_regions_length = ComputeSummaryLengthOfRegionInStorage(cons_reg_storage_.cons_regions_begin(),
+ cons_reg_storage_.cons_regions_end());
+ if(cons_regions_length > 0){
+ string cons_regions_fname(path::append_path(dsp_cfg::get().io.output_dir,
+ "conservative_regions.fasta").c_str());
+ WriteConservativeRegionsStorageToFile(cons_regions_fname, cons_reg_storage_.cons_regions_begin(),
+ cons_reg_storage_.cons_regions_end());
+ INFO("Conservative regions with total length " << cons_regions_length <<
+ " written in file " << cons_regions_fname);
+ }
+
+ size_t poss_cons_regions_length = ComputeSummaryLengthOfRegionInStorage(cons_reg_storage_.poss_cons_regions_begin(),
+ cons_reg_storage_.poss_cons_regions_end());
+ if(poss_cons_regions_length > 0){
+ string poss_cons_regions_fname(path::append_path(dsp_cfg::get().io.output_dir,
+ "possibly_conservative_regions.fasta").c_str());
+// INFO("Possibly conservative regions written in file " << poss_cons_regions_fname);
+ WriteConservativeRegionsStorageToFile(poss_cons_regions_fname, cons_reg_storage_.poss_cons_regions_begin(),
+ cons_reg_storage_.poss_cons_regions_end());
+ INFO("Conservative regions with total length " << poss_cons_regions_length <<
+ " written in file " << poss_cons_regions_fname);
+ }
+ }
+};
+
+}
+
diff --git a/src/projects/dipspades/haplotype_assembly/conservative_regions_storage.hpp b/src/projects/dipspades/haplotype_assembly/conservative_regions_storage.hpp
new file mode 100644
index 0000000..2b74bf2
--- /dev/null
+++ b/src/projects/dipspades/haplotype_assembly/conservative_regions_storage.hpp
@@ -0,0 +1,44 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+namespace dipspades {
+
+typedef vector<Sequence>::iterator cons_regions_iterator;
+
+class ConservativeRegionStorage{
+ vector<Sequence> cons_regions_;
+ vector<Sequence> poss_cons_regions_;
+
+public:
+ void AddConservativeRegion(Sequence seq){
+ cons_regions_.push_back(seq);
+ }
+
+ void AddPossiblyConservativeRegion(Sequence seq){
+ poss_cons_regions_.push_back(seq);
+ }
+
+ cons_regions_iterator cons_regions_begin(){
+ return cons_regions_.begin();
+ }
+
+ cons_regions_iterator cons_regions_end(){
+ return cons_regions_.end();
+ }
+
+ cons_regions_iterator poss_cons_regions_begin(){
+ return poss_cons_regions_.begin();
+ }
+
+ cons_regions_iterator poss_cons_regions_end(){
+ return poss_cons_regions_.end();
+ }
+};
+
+}
diff --git a/src/projects/dipspades/haplotype_assembly/contig_separation_utils.hpp b/src/projects/dipspades/haplotype_assembly/contig_separation_utils.hpp
new file mode 100644
index 0000000..e72051e
--- /dev/null
+++ b/src/projects/dipspades/haplotype_assembly/contig_separation_utils.hpp
@@ -0,0 +1,515 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "../utils/range_utils.hpp"
+#include "../utils/path_routines.hpp"
+#include "../utils/bulge_utils.hpp"
+#include "conservative_regions_storage.hpp"
+#include <string>
+
+using namespace debruijn_graph;
+
+namespace dipspades {
+
+enum haplotype {unknown, from_one, from_different};
+
+typedef map<pair<int, int>, haplotype>::iterator signed_label_iter;
+
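+// Stores a haplotype label per ordered pair of contig ids; Add() keeps the larger of
+// the old and new label (unknown < from_one < from_different).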
+class SignedLabels{
+
+ map<pair<int, int>, haplotype> contigs_pairs;
+
+public:
+ void Add(int contig_id1, int contig_id2, haplotype new_label){
+ pair<int, int> proc_pair(contig_id1, contig_id2);
+ if(contigs_pairs.find(proc_pair) == contigs_pairs.end())
+ contigs_pairs[proc_pair] = new_label;
+ else{
+ haplotype old_label = contigs_pairs[proc_pair];
+ if(old_label < new_label)
+ contigs_pairs[proc_pair] = new_label;
+ }
+ }
+
+ haplotype GetHaplotypeByPair(int contig_id1, int contig_id2){
+ return contigs_pairs[pair<int, int>(contig_id1, contig_id2)];
+ }
+
+ signed_label_iter begin(){
+ return contigs_pairs.begin();
+ }
+
+ signed_label_iter end(){
+ return contigs_pairs.end();
+ }
+
+ void MergeWith(SignedLabels new_signed_labels){
+ for(auto it = new_signed_labels.begin(); it != new_signed_labels.end(); it++){
+ Add(it->first.first, it->first.second, it->second);
+ }
+ }
+
+ size_t Size(){
+ return contigs_pairs.size();
+ }
+
+ string ToString(){
+ stringstream ss;
+ for(auto it = contigs_pairs.begin(); it != contigs_pairs.end(); it++)
+ ss << "Pair " << it->first.first << ", " << it->first.second << " - " << it->second << ". ";
+ return ss.str();
+ }
+
+ void WriteToFile(string fname, ContigStoragePtr contig_storage){
+ ofstream out(fname.c_str());
+ for(auto it= contigs_pairs.begin(); it != contigs_pairs.end(); it++)
+ if(it->second == from_different){
+ auto contig1 = contig_storage->GetContigById(it->first.first);
+ auto contig2 = contig_storage->GetContigById(it->first.second);
+ out << contig1->src_file() << ":" << contig1->name() << "\t" <<
+ contig2->src_file() << ":" << contig2->name() << endl;
+ }
+ out.close();
+ }
+};
+
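+// Assigns haplotype labels to pairs of contigs sharing an edge: their mapping ranges
+// on the edge are intersected, the corresponding contig subsequences are extracted,
+// and a (near-)zero RelAlignmentOfSequences value yields from_one, otherwise
+// from_different.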
+class ContigLabelAllocator{
+ ContigStoragePtr contig_storage_;
+
+ Sequence GetSequenceByRange(Sequence seq, pair<size_t, size_t> r){
+ return seq.Subseq(r.first, r.second);
+ }
+
+ bool AreRangesIntersect(MappingRange mapping_range1, MappingRange mapping_range2){
+
+ Range mapped_range1 = mapping_range1.mapped_range;
+ Range mapped_range2 = mapping_range2.mapped_range;
+
+ if(!is_intersection_exist(mapped_range1, mapped_range2))
+ return false;
+
+ Range intersection = get_intersection_of_ranges(mapped_range1, mapped_range2);
+ return intersection.end_pos - intersection.start_pos > 100;
+ }
+
+ haplotype ComputeLabelForPair(MappingRange mapping_range1, Sequence seq1,
+ MappingRange mapping_range2, Sequence seq2){
+
+ Range mapped_range1 = mapping_range1.mapped_range;
+ Range mapped_range2 = mapping_range2.mapped_range;
+
+ VERIFY(is_intersection_exist(mapped_range1, mapped_range2));
+
+ TRACE("Mapping range1: " << mapped_range1.start_pos << " " <<
+ mapped_range1.end_pos);
+ TRACE("Mapping range2: " << mapped_range2.start_pos << " " <<
+ mapped_range2.end_pos);
+
+ TRACE("Init range1: " << mapping_range1.initial_range.start_pos << " " <<
+ mapping_range1.initial_range.end_pos);
+ TRACE("Init range2: " << mapping_range2.initial_range.start_pos << " " <<
+ mapping_range2.initial_range.end_pos);
+
+
+ Range intersection = get_intersection_of_ranges(mapped_range1, mapped_range2);
+
+ TRACE("Intersection: " << intersection.start_pos << " " << intersection.end_pos);
+
+ auto new_init_pair1 = project_init_range_to_new(mapped_range1, intersection,
+ mapping_range1.initial_range);
+ auto new_init_pair2 = project_init_range_to_new(mapped_range2, intersection,
+ mapping_range2.initial_range);
+
+ TRACE("1st projection: " << new_init_pair1.first << " " << new_init_pair1.second);
+ TRACE("2nd projection: " << new_init_pair2.first << " " << new_init_pair2.second);
+
+ if(!is_range_pair_correct(new_init_pair1) || !is_range_pair_correct(new_init_pair2))
+ return unknown;
+
+ Sequence subseq1 = GetSequenceByRange(seq1, new_init_pair1);
+ Sequence subseq2 = GetSequenceByRange(seq2, new_init_pair2);
+
+ double relative_align = RelAlignmentOfSequences(subseq1, subseq2);
+
+ TRACE("Seq1 size - " << subseq1.size() << ", seq2 size - " << subseq2.size());
+ TRACE("Relative alignment - " << relative_align);
+
+ if(fabs(relative_align) < 0.0001)
+ return from_one;
+ return from_different;
+ }
+
+public:
+ ContigLabelAllocator(ContigStoragePtr contig_storage) :
+ contig_storage_(contig_storage) { }
+
+ SignedLabels SignLabelsOnEdge(set<size_t> contigs, EdgeId current_edge){
+
+ SignedLabels this_edge_labels;
+ vector<int> indexes_of_edge;
+ for(auto contig = contigs.begin(); contig != contigs.end(); contig++){
+ int index = get_index_of_edge(contig_storage_->GetContigById(*contig)->
+ mapping_path().simple_path(), current_edge);
+ VERIFY(index != -1);
+ indexes_of_edge.push_back(index);
+ }
+ vector<int> oppa(contigs.begin(), contigs.end());
+ for(size_t cnt1 = 0; cnt1 < oppa.size(); cnt1++) {
+ int id1 = oppa[cnt1];
+ auto seq1 = contig_storage_->GetContigById(id1)->seq();
+ auto mapping_path1 = contig_storage_->GetContigById(id1)->mapping_path();
+ MappingRange mapping_range1 = mapping_path1[indexes_of_edge[cnt1]].second;
+ for(size_t cnt2 = cnt1 + 1; cnt2 < oppa.size(); cnt2++) {
+ int id2 = oppa[cnt2];
+ auto seq2 = contig_storage_->GetContigById(id2)->seq();
+ auto mapping_path2 = contig_storage_->GetContigById(id2)->mapping_path();
+ TRACE("Sign label for " << id1 << " " << id2);
+ TRACE("Seq1 size - " << seq1.size() << " , seq2 size - " << seq2.size())
+ MappingRange mapping_range2 = mapping_path2[indexes_of_edge[cnt2]].second;
+ if(AreRangesIntersect(mapping_range1, mapping_range2)){
+ TRACE("Intersection exists");
+ haplotype label = ComputeLabelForPair(mapping_range1, seq1, mapping_range2, seq2);
+ this_edge_labels.Add(id1, id2, label);
+ }
+ }
+ }
+
+ return this_edge_labels;
+ }
+
+private:
+ DECL_LOGGER("ContigLabelAllocator");
+};
+
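+// A pair of consensus-path edges together with their indices in that path;
+// a default-constructed instance acts as a null value (IsNull() returns true).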
+class IndexedPairOfEdges{
+ pair<EdgeId, EdgeId> edges;
+ pair<size_t, size_t> indexes;
+ bool is_correct;
+
+public:
+ IndexedPairOfEdges(){
+ is_correct = false;
+ }
+
+ IndexedPairOfEdges(EdgeId edge1, EdgeId edge2, size_t index1, size_t index2){
+ edges.first = edge1;
+ edges.second = edge2;
+
+ indexes.first = index1;
+ indexes.second = index2;
+
+ is_correct = index1 <= index2;
+ }
+
+ EdgeId FirstEdge(){
+ VERIFY(is_correct);
+ return edges.first;
+ }
+
+ EdgeId SecondEdge(){
+ VERIFY(is_correct);
+ return edges.second;
+ }
+
+ size_t FirstIndex(){
+ VERIFY(is_correct);
+ return indexes.first;
+ }
+
+ size_t SecondIndex(){
+ VERIFY(is_correct);
+ return indexes.second;
+ }
+
+ bool IsNull(){
+ return !is_correct;
+ }
+};
+
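+// Possible outcomes of interpreting the signed labels collected on a single edge.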
+enum separation_result {not_identified, separated, diploid_repeat, conservative_region};
+
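+// Classifies an edge by its signed labels: only from_one pairs give a conservative
+// region; if the contigs of from_different pairs can be split into two groups,
+// the contigs are separated; otherwise the edge is treated as a diploid repeat.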
+class SeparationResultInterpretator{
+
+ bool IsConservativeRegion(SignedLabels labels){
+
+ if(labels.Size() == 0)
+ return false;
+
+ for(auto it = labels.begin(); it != labels.end(); it++){
+ if(it->second == from_different || it->second == unknown)
+ return false;
+ }
+
+ return true;
+ }
+
+ bool AddNewEdgeIntoBigraph(set<int> &first_part, set<int> &second_part,
+ pair<int,int> new_edge){
+
+ int vertex1 = new_edge.first, vertex2 = new_edge.second;
+
+ if(first_part.find(vertex1) != first_part.end() &&
+ first_part.find(vertex2) != first_part.end())
+ return false;
+
+ if(second_part.find(vertex1) != second_part.end() &&
+ second_part.find(vertex2) != second_part.end())
+ return false;
+
+ if(first_part.find(vertex1) != first_part.end()){
+ second_part.insert(vertex2);
+ }
+ else{
+ if(second_part.find(vertex1) != second_part.end())
+ first_part.insert(vertex2);
+ else{
+
+ if(first_part.find(vertex2) != first_part.end())
+ second_part.insert(vertex1);
+ else{
+ first_part.insert(vertex1);
+ second_part.insert(vertex2);
+ }
+ }
+ }
+ return true;
+ }
+
+ bool AreSeparatedContigs(SignedLabels labels){
+
+ if(labels.Size() == 0)
+ return false;
+
+ set<int> first_part, second_part;
+
+ for(auto it = labels.begin(); it != labels.end(); it++){
+ if(it->second == from_different){
+ pair<int, int> new_edge = it->first;
+ if(!AddNewEdgeIntoBigraph(first_part, second_part, new_edge)){
+ TRACE("Edge doesn't added");
+ return false;
+ }
+ }
+ }
+
+ return true;
+ }
+
+public:
+ separation_result Interpretate(SignedLabels labels){
+
+ if(labels.Size() == 0){
+ TRACE("Result unknown");
+ return not_identified;
+ }
+
+ if(IsConservativeRegion(labels)){
+ TRACE("Conservative region");
+ return conservative_region;
+ }
+
+ if(AreSeparatedContigs(labels)){
+ TRACE("Contigs are separated into two haplotypes");
+ return separated;
+ }
+
+ TRACE("Diploid repeat");
+ return diploid_repeat;
+ }
+};
+
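+// For each composite (consensus) contig, collects its inner contigs, signs
+// contig pairs on every edge of the consensus path and records (possibly)
+// conservative regions; results are available via GetSignedLabels() and
+// GetConservativeRegionStorage().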
+class DiploidContigSeparator{
+
+ typedef map<EdgeId, set<size_t> > EdgeContigsMap;
+
+ Graph &g_;
+ ContigStoragePtr default_storage_;
+ ContigStoragePtr composite_storage_;
+ CorrectionResult res_of_corr_cycle_;
+
+ set<size_t> GetOfInnerContigsOf(size_t composite_contig_index){
+ MappingContigPtr contig = (*composite_storage_)[composite_contig_index];
+ set<size_t> contigs;
+ vector<MappingContigPtr> inner_contigs = contig->AllMappingContigs();
+ if(inner_contigs.size() == 0)
+ contigs.insert(contig->id());
+ else{
+ for(auto it = inner_contigs.begin(); it != inner_contigs.end(); it++){
+ size_t cur_id = (*it)->id();
+ contigs.insert(cur_id);
+ auto set_red_conts = res_of_corr_cycle_.redundancy_map.GetValuesByKey(cur_id);
+ for(auto c = set_red_conts.begin(); c != set_red_conts.end(); c++)
+ contigs.insert(*c);
+ }
+ }
+ return contigs;
+ }
+
+ set<EdgeId> GetSetOfEdgesFromPath(vector<EdgeId> path){
+ set<EdgeId> res;
+ for(auto e = path.begin(); e != path.end(); e++){
+ res.insert(*e);
+ }
+ return res;
+ }
+
+ IndexedPairOfEdges DefineStartAndEndEdges(vector<EdgeId> common_path, MappingContigPtr inner_contig){
+ MappingPath<EdgeId> map_path = inner_contig->mapping_path();
+ VERIFY(map_path.size() > 0);
+ EdgeId first_edge;
+ size_t first_ind = size_t(-1);
+ bool is_1st_found = false;
+ for(size_t i = 0; i < map_path.size(); i++){
+ for(size_t j = 0; j < common_path.size(); j++)
+ if(map_path[i].first == common_path[j]){
+ first_edge = map_path[i].first;
+ first_ind = j;
+ is_1st_found = true;
+ break;
+ }
+ if(is_1st_found)
+ break;
+ }
+
+ EdgeId last_edge;
+ size_t last_ind = size_t(-1);
+ bool is_2nd_found = false;
+ for(int i = int(map_path.size() - 1); i >= 0; i--){
+ for(int j = int(common_path.size()- 1); j >= 0; j--)
+ if(map_path[i].first == common_path[j]){
+ last_edge = map_path[i].first;
+ last_ind = size_t(j);
+ is_2nd_found = true;
+ break;
+ }
+ if(is_2nd_found)
+ break;
+ }
+
+ if(first_ind <= last_ind && is_1st_found && is_2nd_found)
+ return IndexedPairOfEdges(first_edge, last_edge, first_ind, last_ind);
+ else
+ return IndexedPairOfEdges();
+ }
+
+ set<size_t> DeleteSubsetFromSet(set<size_t> set_, set<size_t> subset_){
+ for(auto it = subset_.begin(); it != subset_.end(); it++)
+ set_.erase(*it);
+ return set_;
+ }
+
+ EdgeContigsMap DefineContigsOnEdges(set<size_t> contigs){
+ EdgeContigsMap res;
+ for(auto contig = contigs.begin(); contig != contigs.end(); contig++){
+ auto map_path = default_storage_->GetContigById(*contig)->mapping_path();
+ for(size_t i = 0; i < map_path.size(); i++)
+ res[map_path[i].first].insert(*contig);
+ }
+ return res;
+ }
+
+ SignedLabels signed_labels_;
+ ConservativeRegionStorage cons_regions_stor_;
+
+public:
+ DiploidContigSeparator(Graph &g, ContigStoragePtr default_storage,
+ ContigStoragePtr composite_storage, CorrectionResult res_of_corr_cycle) :
+ g_(g), default_storage_(default_storage), composite_storage_(composite_storage),
+ res_of_corr_cycle_(res_of_corr_cycle){
+ }
+
+ void SeparateContigs(){
+
+ SignedLabels signed_labels;
+ ContigLabelAllocator label_allocator(default_storage_);
+
+ // for each composite contig
+ for(size_t i = 0; i < composite_storage_->Size(); i++){
+
+ TRACE("New composite contig");
+
+ // computing set of inner contigs
+ set<size_t> inner_contigs = GetOfInnerContigsOf(i);
+
+ TRACE("Number of contigs - " << inner_contigs.size());
+
+ // define which contigs intersect consensus path
+ vector<EdgeId> consensus_path = (*composite_storage_)[i]->path_seq();
+
+ set<EdgeId> start_edge_edges_set;
+ map<size_t, IndexedPairOfEdges> contig_start_end_map;
+ set<size_t> contigs_for_deletion;
+
+ for(auto c = inner_contigs.begin(); c != inner_contigs.end(); c++){
+ MappingContigPtr contig = default_storage_->GetContigById(*c);
+ auto edges = DefineStartAndEndEdges(consensus_path, contig);
+ if(!edges.IsNull()){
+ contig_start_end_map[*c] = edges;
+ start_edge_edges_set.insert(edges.FirstEdge());
+ start_edge_edges_set.insert(edges.SecondEdge());
+ }
+ else
+ contigs_for_deletion.insert(*c);
+ }
+
+ inner_contigs = DeleteSubsetFromSet(inner_contigs, contigs_for_deletion);
+
+ EdgeContigsMap contigs_on_edge = DefineContigsOnEdges(inner_contigs);
+
+ TRACE("Defining labels");
+ SeparationResultInterpretator interpret;
+ for(auto e = consensus_path.begin(); e != consensus_path.end(); e++){
+
+ TRACE("Edge - " << g_.str(*e) << ", start - " << g_.str(g_.EdgeStart(*e)) <<
+ ", end - " << g_.str(g_.EdgeEnd(*e)));
+ auto contigs_ids_on_edge = contigs_on_edge[*e];
+
+ if(contigs_ids_on_edge.size() == 1){
+ cons_regions_stor_.AddPossiblyConservativeRegion(g_.EdgeNucls(*e));
+ TRACE(g_.int_id(*e) << " - possibly conservative region");
+ }
+
+ TRACE("Contigs on this edge: " << SetToString<size_t>(contigs_ids_on_edge));
+
+ SignedLabels current_signed_labels = label_allocator.SignLabelsOnEdge(contigs_ids_on_edge, *e);
+
+ TRACE("Signed labels for this edge");
+// current_signed_labels.Print(cout);
+ signed_labels_.MergeWith(current_signed_labels);
+
+ TRACE("Interpretation of results");
+ auto inpret_res = interpret.Interpretate(current_signed_labels);
+ TRACE("------------------------------------------");
+
+ if(inpret_res == conservative_region){
+ cons_regions_stor_.AddConservativeRegion(g_.EdgeNucls(*e));
+ TRACE(g_.int_id(*e) << " - conservative region");
+ }
+ }
+ }
+
+ TRACE("Signed labels:");
+ TRACE(signed_labels_.ToString());
+
+ }
+
+ SignedLabels GetSignedLabels(){
+ return signed_labels_;
+ }
+
+ ConservativeRegionStorage GetConservativeRegionStorage(){
+ return cons_regions_stor_;
+ }
+
+private:
+ DECL_LOGGER("DiploidContigSeparator");
+
+};
+
+}
diff --git a/src/projects/dipspades/haplotype_assembly/haplotype_assembler.hpp b/src/projects/dipspades/haplotype_assembly/haplotype_assembler.hpp
new file mode 100644
index 0000000..8b72abf
--- /dev/null
+++ b/src/projects/dipspades/haplotype_assembly/haplotype_assembler.hpp
@@ -0,0 +1,59 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "conservative_regions_searcher.hpp"
+
+namespace dipspades {
+
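+// Drives haplotype assembly: separates haplocontigs into haplotypes, writes
+// pairs assigned to different haplotypes to haplotype_assembly.out and then
+// runs the conservative regions search.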
+class HaplotypeAssembler {
+
+ conj_graph_pack &consensus_graph_pack_;
+ conj_graph_pack &double_graph_pack_;
+ ContigStoragePtr default_storage_;
+ ContigStoragePtr composite_storage_;
+ CorrectionResult redundancy_map_;
+
+public:
+
+ HaplotypeAssembler(conj_graph_pack &consensus_graph_pack,
+ conj_graph_pack &double_graph_pack,
+ ContigStoragePtr default_storage,
+ ContigStoragePtr composite_storage,
+ CorrectionResult redundancy_map) :
+ consensus_graph_pack_(consensus_graph_pack),
+ double_graph_pack_(double_graph_pack),
+ default_storage_(default_storage),
+ composite_storage_(composite_storage),
+ redundancy_map_(redundancy_map) {
+ double_graph_pack_.kmer_mapper.Attach();
+ }
+
+ void Run() {
+ INFO("Contigs separation starts");
+ DiploidContigSeparator separator(consensus_graph_pack_.g, default_storage_,
+ composite_storage_, redundancy_map_);
+ INFO("Haplocontigs number: " << default_storage_->Size());
+ INFO("Consensus contigs number: " << composite_storage_->Size());
+ separator.SeparateContigs();
+ SignedLabels signed_labels = separator.GetSignedLabels();
+ string hapl_output(path::append_path(dsp_cfg::get().io.output_dir, "haplotype_assembly.out").c_str());
+ signed_labels.WriteToFile(hapl_output, default_storage_);
+ INFO("Result of haplotype assembly written in file " << hapl_output);
+ INFO("Contigs separation ends");
+
+ INFO("Conservative regions search starts");
+ ConservativeRegionStorage conservative_regions = separator.GetConservativeRegionStorage();
+ ConservativeRegionsSearcher cons_regions_searcher(double_graph_pack_, default_storage_,
+ signed_labels, conservative_regions);
+ cons_regions_searcher.Search();
+ INFO("Conservative regions search ends");
+ }
+};
+
+}
diff --git a/src/projects/dipspades/kmer_gluing/equal_sequence_gluer.hpp b/src/projects/dipspades/kmer_gluing/equal_sequence_gluer.hpp
new file mode 100644
index 0000000..487e6fa
--- /dev/null
+++ b/src/projects/dipspades/kmer_gluing/equal_sequence_gluer.hpp
@@ -0,0 +1,146 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "../utils/edge_gluer.hpp"
+
+using namespace debruijn_graph;
+
+namespace dipspades {
+
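+// Glues graph edges that share an identical (k+1)-mer: the equal parts are cut
+// out as unit-length edges via SplitEdge and then merged with GlueEdges.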
+template<class Graph>
+class EqualSequencesGluer {
+private:
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+
+ Graph &graph_;
+ conj_graph_pack::index_t &index_;
+
+ EdgeId ExtractShortEdge(EdgeId edge, size_t pos) {
+ if(pos + 1 < graph_.length(edge)) {
+ edge = graph_.SplitEdge(edge, pos + 1).first;
+ }
+ if(pos > 0) {
+ edge = graph_.SplitEdge(edge, pos).second;
+ }
+ VERIFY(graph_.length(edge) == 1);
+ return edge;
+ }
+
+ bool CheckClose(size_t a, size_t b, size_t diff) const {
+ return a <= b + diff && b <= a + diff;
+ }
+
+ bool ConjugateEdgesCannotBeSplitted(size_t edge_length, size_t pos1, size_t pos2) {
+ return CheckClose(edge_length, pos1 + pos2 + 1, 1) && CheckClose(pos1, pos2, 1);
+ }
+
+ void GlueEqualEdgeParts(EdgeId edge1, size_t pos1, EdgeId edge2, size_t pos2) {
+ TRACE("Edge1: " << graph_.int_id(edge1) << ", length: " << graph_.length(edge1) << ", pos: " << pos1);
+ TRACE("Edge2: " << graph_.int_id(edge2) << ", length: " << graph_.length(edge2) << ", pos: " << pos2);
+ VERIFY(edge1 != edge2 || pos1 != pos2);
+ if(edge1 == edge2) {
+ if(edge1 == graph_.conjugate(edge2)) {
+ WARN("Equal k-mer gluer faced a difficult situation in graph for edge " << graph_.int_id(edge1)
+ << " Equal k-mers were ignored.");
+ return;
+ }
+ if(pos1 > pos2) {
+ std::swap(pos1, pos2);
+ }
+ pair<EdgeId, EdgeId> split_edges = graph_.SplitEdge(edge1, pos2);
+ edge1 = split_edges.first;
+ edge2 = split_edges.second;
+ pos2 = 0;
+ } else if(edge1 == graph_.conjugate(edge2)) {
+ TRACE("Edges are conjugate pairs");
+ if(ConjugateEdgesCannotBeSplitted(graph_.length(edge1), pos1, pos2)) {
+ WARN("Equal k-mer gluer faced a difficult situation in graph for edges " << graph_.int_id(edge1) <<
+ " and " << graph_.int_id(edge2) << ". Equal k-mers were ignored.");
+ return;
+ }
+ if (pos1 + pos2 == graph_.length(edge1) - 1) {
+ WARN("Equal k-mer gluer faced a difficult situation in graph for edge " << graph_.int_id(edge1)
+ << " Equal k-mers were ignored.");
+ }
+ if(pos1 + pos2 >= graph_.length(edge1) - 1) {
+ size_t tmp = pos1;
+ pos1 = graph_.length(edge1) - pos2 - 1;
+ pos2 = graph_.length(edge1) - tmp - 1;
+ }
+ INFO(pos1 << " " << pos2 << " " << graph_.length(edge1));
+ TRACE("Edge1 " << graph_.int_id(edge1) << " will be splitted");
+ pair<EdgeId, EdgeId> split_edges = graph_.SplitEdge(edge1, pos1 + 1);
+ TRACE("Splitted pair was created");
+ TRACE("New edge1: " << graph_.int_id(split_edges.first) << ", length: " << graph_.length(split_edges.first));
+ TRACE("New edge2: " << graph_.int_id(split_edges.second) << ", length: " << graph_.length(split_edges.second));
+ edge1 = split_edges.first;
+ edge2 = graph_.conjugate(split_edges.second);
+// pos2 -= pos1 + 1;
+ }
+ EdgeId se1 = ExtractShortEdge(edge1, pos1);
+ EdgeId se2 = ExtractShortEdge(edge2, pos2);
+ VERIFY(graph_.EdgeNucls(se1) == graph_.EdgeNucls(se2));
+ GlueEqualEdges(se1, se2);
+ }
+
+ void SafelyGlueEdges(EdgeId e1, EdgeId e2){
+ // e1 -> e2
+ vector<EdgeId> forbidden_edges = {e1, e2};
+ EdgeGluer(graph_).MoveEdgesFromVertexToVertex(graph_.EdgeStart(e1),
+ graph_.EdgeStart(e2), forbidden_edges);
+ EdgeGluer(graph_).MoveEdgesFromVertexToVertex(graph_.EdgeEnd(e1),
+ graph_.EdgeEnd(e2), forbidden_edges);
+ graph_.GlueEdges(e1, e2);
+ }
+
+ void GlueEqualEdges(EdgeId edge1, EdgeId edge2) {
+ set<VertexId> endVertices = {graph_.EdgeStart(edge1), graph_.EdgeEnd(edge1),
+ graph_.EdgeStart(edge2), graph_.EdgeEnd(edge2),
+ graph_.conjugate(graph_.EdgeStart(edge1)),
+ graph_.conjugate(graph_.EdgeEnd(edge1)),
+ graph_.conjugate(graph_.EdgeStart(edge2)),
+ graph_.conjugate(graph_.EdgeEnd(edge2))};
+ if(endVertices.size() != 8)
+ return;
+ SafelyGlueEdges(edge1, edge2);
+ }
+
+public:
+ EqualSequencesGluer(Graph &graph, conj_graph_pack::index_t &index): graph_(graph), index_(index) { }
+
+ Sequence get(EdgeId e, size_t pos) const {
+ return graph_.EdgeNucls(e).subseq(pos, pos + graph_.k() + 1);
+ }
+
+ void GlueEqualKmers() {
+ size_t cnt = 0;
+ for(auto it = graph_.SmartEdgeBegin(); !it.IsEnd(); ++it) {
+ Sequence nucls = graph_.EdgeNucls(*it);
+ runtime_k::RtSeq kmer = nucls.start<runtime_k::RtSeq>(graph_.k() + 1) >> 'A';
+ for(size_t i = graph_.k(); i < graph_.length(*it); i++) {
+ kmer = kmer << graph_.EdgeNucls(*it)[i];
+ if(!index_.contains(kmer)) {
+ continue;
+ }
+ pair<EdgeId, size_t> pos = index_.get(kmer);
+ if(pos.first != *it || pos.second != i - graph_.k()) {
+ GlueEqualEdgeParts(pos.first, pos.second, *it, i - graph_.k());
+ cnt++;
+ break;
+ }
+ }
+ }
+ INFO(cnt << " kmers glued");
+ }
+
+private:
+ DECL_LOGGER("EqualSequencesGluer");
+};
+
+}
diff --git a/src/projects/dipspades/main.cpp b/src/projects/dipspades/main.cpp
new file mode 100644
index 0000000..d0fbf86
--- /dev/null
+++ b/src/projects/dipspades/main.cpp
@@ -0,0 +1,110 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+/*
+ * Assembler Main
+ */
+#include "dev_support/logger/log_writers.hpp"
+
+#include "dev_support/segfault_handler.hpp"
+#include "dev_support/memory_limit.hpp"
+#include "dev_support/copy_file.hpp"
+#include "data_structures/sequence/runtime_k.hpp"
+
+
+#include "pipeline/graph_pack.hpp"
+#include "stages/construction.hpp"
+
+#include "dipspades.hpp"
+
+void make_dirs(){
+ make_dir(dsp_cfg::get().io.output_base);
+ make_dir(dsp_cfg::get().io.output_root);
+ make_dir(dsp_cfg::get().io.output_dir);
+ make_dir(dsp_cfg::get().io.output_saves);
+ make_dir(dsp_cfg::get().io.tmp_dir);
+}
+
+void copy_configs(string cfg_filename, string to) {
+ using namespace debruijn_graph;
+
+ if (!make_dir(to)) {
+ WARN("Could not create files use in /tmp directory");
+ }
+ path::copy_files_by_ext(path::parent_path(cfg_filename), to, ".info", true);
+}
+
+void load_config(string cfg_filename) {
+ path::CheckFileExistenceFATAL(cfg_filename);
+ dsp_cfg::create_instance(cfg_filename);
+// string path_to_copy = path::append_path(dsp_cfg::get().io.output_dir, "configs");
+// copy_configs(cfg_filename, path_to_copy);
+}
+
+void create_console_logger(string cfg_filename) {
+ using namespace logging;
+
+ string log_props_file = dsp_cfg::get().io.log_filename;
+
+ if (!path::FileExists(log_props_file)){
+ log_props_file = path::append_path(path::parent_path(cfg_filename), dsp_cfg::get().io.log_filename);
+ }
+
+ logger *lg = create_logger(path::FileExists(log_props_file) ? log_props_file : "");
+ lg->add_writer(std::make_shared<console_writer>());
+ attach_logger(lg);
+}
+
+int main(int /*argc*/, char** argv) {
+ perf_counter pc;
+ const size_t GB = 1 << 30;
+
+ srand(42);
+ srandom(42);
+
+ segfault_handler sh;
+
+ try {
+ using namespace debruijn_graph;
+ string cfg_filename = argv[1];
+ load_config (cfg_filename);
+ make_dirs();
+ if(dsp_cfg::get().rp.developer_mode)
+ copy_configs(cfg_filename, path::append_path(dsp_cfg::get().io.output_dir, "configs"));
+ create_console_logger(cfg_filename);
+
+ INFO("Loaded config from " << cfg_filename);
+
+ VERIFY(dsp_cfg::get().bp.K >= runtime_k::MIN_K && dsp_cfg::get().bp.K < runtime_k::MAX_K);
+ VERIFY(dsp_cfg::get().bp.K % 2 != 0);
+
+ limit_memory(dsp_cfg::get().bp.max_memory * GB);
+
+ INFO("Starting dipSPAdes, built from " SPADES_GIT_REFSPEC ", git revision " SPADES_GIT_SHA1);
+ INFO("Assembling dataset (" << dsp_cfg::get().io.dataset_name << ") with K=" << dsp_cfg::get().bp.K);
+ dipspades::run_dipspades();
+// link_output("latest_success");
+ } catch (std::bad_alloc const& e) {
+ std::cerr << "Not enough memory to run SPAdes. " << e.what() << std::endl;
+ return EINTR;
+ } catch (std::exception const& e) {
+ std::cerr << "Exception caught " << e.what() << std::endl;
+ return EINTR;
+ } catch (...) {
+ std::cerr << "Unknown exception caught " << std::endl;
+ return EINTR;
+ }
+
+ unsigned ms = (unsigned)pc.time_ms();
+ unsigned secs = (ms / 1000) % 60;
+ unsigned mins = (ms / 1000 / 60) % 60;
+ unsigned hours = (ms / 1000 / 60 / 60);
+ INFO("Assembling time: " << hours << " hours " << mins << " minutes " << secs << " seconds");
+
+ // OK
+ return 0;
+}
diff --git a/src/projects/dipspades/polymorphic_bulge_remover/bulge_correction_condition.hpp b/src/projects/dipspades/polymorphic_bulge_remover/bulge_correction_condition.hpp
new file mode 100644
index 0000000..5e49299
--- /dev/null
+++ b/src/projects/dipspades/polymorphic_bulge_remover/bulge_correction_condition.hpp
@@ -0,0 +1,128 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "../utils/bulge_utils.hpp"
+#include "../dipspades_config.hpp"
+
+using namespace debruijn_graph;
+
+namespace dipspades {
+
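+// Interface for bulge correctness checks applied before gluing; implementations
+// below check vertex relatedness, adjacency to self-conjugate edges, diploidy
+// and equality of side lengths.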
+class BaseBulgeCorrectionCondition{
+protected:
+ Graph &graph_;
+public:
+ BaseBulgeCorrectionCondition(Graph &graph) : graph_(graph) { }
+ virtual bool IsBulgeCorrect(shared_ptr<BaseBulge> bulge) = 0;
+ virtual bool IsPathBulgeSide(vector<EdgeId> path) = 0;
+ virtual ~BaseBulgeCorrectionCondition(){ }
+};
+
+class RelatedVerticesCondition : public BaseBulgeCorrectionCondition {
+ bool TwoVerticesRelated(VertexId v1, VertexId v2){
+ return graph_.RelatedVertices(v1, v2);
+ }
+
+ bool PathContainsNoRelatedToVertex(vector<EdgeId> path, VertexId vertex,
+ bool check_start_vertex = false){
+ VERIFY(path.size() != 0);
+
+ if(check_start_vertex)
+ if(TwoVerticesRelated(graph_.EdgeStart(path[0]), vertex))
+ return false;
+
+ for(auto e = path.begin(); e != path.end(); e++)
+ if(TwoVerticesRelated(vertex, graph_.EdgeEnd(*e)))
+ return false;
+ return true;
+ }
+
+ bool PathContainsNoRelatedVertices(vector<EdgeId> path){
+ if(!PathContainsNoRelatedToVertex(path, graph_.EdgeStart(path[0])))
+ return false;
+ for(auto e1 = path.begin(); e1 != path.end(); e1++)
+ for(auto e2 = e1 + 1; e2 != path.end(); e2++)
+ if(TwoVerticesRelated(graph_.EdgeEnd(*e1), graph_.EdgeEnd(*e2)))
+ return false;
+ return true;
+ }
+
+ bool PathsContainNoRelatedVertices(shared_ptr<BaseBulge> bulge){
+ auto path1 = bulge->path1();
+ auto path2 = bulge->path2();
+ for(auto e1 = path1.begin(); e1 != path1.end(); e1++)
+ for(auto e2 = path2.begin(); e2 != path2.end(); e2++)
+ if((e1 != path1.end() - 1) && (e2 != path2.end() - 1))
+ if(TwoVerticesRelated(graph_.EdgeEnd(*e1), graph_.EdgeEnd(*e2)))
+ return false;
+ return true;
+ }
+
+public:
+ RelatedVerticesCondition(Graph &graph) : BaseBulgeCorrectionCondition(graph) { }
+ bool IsBulgeCorrect(shared_ptr<BaseBulge> bulge){
+ if(!PathContainsNoRelatedVertices(bulge->path1()) ||
+ !PathContainsNoRelatedVertices(bulge->path2()))
+ return false;
+ return PathsContainNoRelatedVertices(bulge);
+ }
+
+ bool IsPathBulgeSide(vector<EdgeId> path){
+ return PathContainsNoRelatedVertices(path);
+ }
+};
+
+class AdjacencyToAutoRCEdges : public BaseBulgeCorrectionCondition {
+
+public:
+ AdjacencyToAutoRCEdges(Graph &graph) : BaseBulgeCorrectionCondition(graph) { }
+
+ bool IsBulgeCorrect(shared_ptr<BaseBulge> bulge){
+ return IsPathBulgeSide(bulge->path1()) && IsPathBulgeSide(bulge->path2());
+ }
+
+ bool IsPathBulgeSide(vector<EdgeId> path){
+ return !PathAdjacentRelatedEdges(graph_, path);
+ }
+};
+
+class DiploidyCondition : public BaseBulgeCorrectionCondition {
+ double rel_length_;
+ double rel_align_;
+public:
+ DiploidyCondition(Graph &graph,
+ double rel_length,
+ double rel_align) :
+ BaseBulgeCorrectionCondition(graph),
+ rel_length_(rel_length),
+ rel_align_(rel_align) { }
+
+ bool IsBulgeCorrect(shared_ptr<BaseBulge> bulge){
+ return bulge->IsBulgeDiploid(rel_length_, rel_align_);
+ }
+
+ bool IsPathBulgeSide(vector<EdgeId>){
+ return true;
+ }
+};
+
+class CorrectSplitCondition : public BaseBulgeCorrectionCondition {
+public:
+ CorrectSplitCondition(Graph &graph) : BaseBulgeCorrectionCondition(graph) { }
+
+ bool IsBulgeCorrect(shared_ptr<BaseBulge> bulge){
+ return bulge->path1().size() == bulge->path2().size();
+ }
+
+ bool IsPathBulgeSide(vector<EdgeId>){
+ return true;
+ }
+};
+
+}
diff --git a/src/projects/dipspades/polymorphic_bulge_remover/bulge_gluer.hpp b/src/projects/dipspades/polymorphic_bulge_remover/bulge_gluer.hpp
new file mode 100644
index 0000000..1076171
--- /dev/null
+++ b/src/projects/dipspades/polymorphic_bulge_remover/bulge_gluer.hpp
@@ -0,0 +1,88 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "glue_direction_definer.hpp"
+#include "gluing_vertices_definer.hpp"
+#include "bulge_splitter.hpp"
+#include "bulge_correction_condition.hpp"
+#include "../utils/edge_gluer.hpp"
+
+using namespace debruijn_graph;
+
+namespace dipspades {
+
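+// Glues the two sides of a polymorphic bulge: a gluing direction is chosen,
+// gluing vertices are defined, the bulge is split into sides with equal numbers
+// of edges and the corresponding edges are glued one by one.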
+template<class GlueDirectionDefiner, class GluingVericesDefiner, class BulgeSplitter>
+class ComplexBulgeGluer {
+
+ Graph &graph_;
+ GlueDirectionDefiner glue_dir_definer_;
+ GluingVericesDefiner glue_definer_;
+ BulgeSplitter splitter_;
+
+ bool IsSplittedBulgeCorrect(shared_ptr<BaseBulge> splitted_bulge){
+ return !splitted_bulge->IsEmpty() && CorrectSplitCondition(graph_).IsBulgeCorrect(splitted_bulge) &&
+ RelatedVerticesCondition(graph_).IsBulgeCorrect(splitted_bulge);
+ }
+
+ void GlueSplittedBulge(shared_ptr<BaseBulge> splitted_bulge){
+ size_t bulge_edge_len = splitted_bulge->path1().size();
+ EdgeGluer edge_gluer(graph_);
+ TRACE("Edge gluer starts");
+ for(size_t i = 0; i < bulge_edge_len - 1; i++){
+ auto edge1 = splitted_bulge->path1()[i];
+ auto edge2 = splitted_bulge->path2()[i];
+ auto next_edge1 = splitted_bulge->path1()[i + 1];
+ TRACE("edge1 - " << graph_.str(edge1) << ", edge2 - " << graph_.str(edge2) <<
+ ", next_edge1 - " << graph_.str(next_edge1));
+ vector<EdgeId> tmp = {edge1, edge2, next_edge1};
+ edge_gluer.MoveEdgesFromVertexToVertex(
+ graph_.EdgeEnd(edge1),
+ graph_.EdgeEnd(edge2),
+ tmp);
+ graph_.GlueEdges(edge1, edge2);
+ TRACE("Edges were moved");
+ }
+ graph_.GlueEdges(splitted_bulge->path1()[bulge_edge_len - 1],
+ splitted_bulge->path2()[bulge_edge_len - 1]);
+ TRACE("Gluing was completed");
+ }
+
+public:
+ ComplexBulgeGluer(Graph &graph, GlueDirectionDefiner glue_dir_definer,
+ GluingVericesDefiner glue_definer, BulgeSplitter splitter) :
+ graph_(graph),
+ glue_dir_definer_(glue_dir_definer),
+ glue_definer_(glue_definer),
+ splitter_(splitter) { }
+
+ bool GlueBulge(shared_ptr<BaseBulge> bulge){
+ auto glue_dir = glue_dir_definer_.Define(bulge);
+ TRACE("Gluing direction - " << glue_dir);
+ if(glue_dir == undefined)
+ return false;
+
+ shared_ptr<BaseBulge> directed_bulge(new DirectedBulge(graph_, bulge, glue_dir));
+ TRACE("Glue vertices definer starts");
+ auto glue_def_res = glue_definer_.Run(directed_bulge);
+ TRACE("Bulge splitter starts");
+ auto splitted_bulge = splitter_.SplitBulge(directed_bulge, glue_def_res);
+
+ if(IsSplittedBulgeCorrect(splitted_bulge)){
+ TRACE("Splitted bulge correct");
+ GlueSplittedBulge(splitted_bulge);
+ return true;
+ }
+ return false;
+ }
+
+private:
+ DECL_LOGGER("ComplexBulgeGluer");
+};
+
+}
diff --git a/src/projects/dipspades/polymorphic_bulge_remover/bulge_paths_searcher.hpp b/src/projects/dipspades/polymorphic_bulge_remover/bulge_paths_searcher.hpp
new file mode 100644
index 0000000..ac97830
--- /dev/null
+++ b/src/projects/dipspades/polymorphic_bulge_remover/bulge_paths_searcher.hpp
@@ -0,0 +1,97 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include <vector>
+#include "algorithms/dijkstra/dijkstra_helper.hpp"
+#include "assembly_graph/paths/path_processor.hpp"
+#include "dipspades_config.hpp"
+
+using namespace std;
+using namespace debruijn_graph;
+using namespace omnigraph;
+
+namespace dipspades {
+
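+// Finds candidate bulge sides with a bounded Dijkstra run: reachable vertices
+// give potential bulge ends, and alternative paths to an end vertex are formed
+// from the shortest paths to the start vertices of its incoming edges.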
+class DijkstraBulgePathsSearcher {
+ typedef map<VertexId, vector<EdgeId> > shortest_paths;
+
+ Graph &graph_;
+ size_t search_depth_;
+ size_t max_neigh_number_;
+
+public:
+ DijkstraBulgePathsSearcher(Graph &graph,
+ size_t search_depth,
+ size_t max_neigh_number) :
+ graph_(graph),
+ search_depth_(search_depth),
+ max_neigh_number_(max_neigh_number) {
+ TRACE("Search depth - " << search_depth);
+ }
+
+ vector<VertexId> VerticesReachedFrom(VertexId start_vertex) {
+ auto bounded_dijkstra = DijkstraHelper<Graph>::CreateBoundedDijkstra(this->graph_,
+ this->search_depth_, this->max_neigh_number_);
+ bounded_dijkstra.Run(start_vertex);
+ TRACE("Reached vertices size - " << bounded_dijkstra.ReachedVertices());
+ return bounded_dijkstra.ReachedVertices();
+ }
+
+ vector<vector<EdgeId> > GetAllPathsTo(VertexId start_vertex, VertexId end_vertex) {
+ auto bounded_dijkstra = DijkstraHelper<Graph>::CreateBoundedDijkstra(this->graph_,
+ this->search_depth_, this->max_neigh_number_);
+ bounded_dijkstra.Run(start_vertex);
+
+ vector<vector<EdgeId> > alternative_paths;
+ auto shortest_path = bounded_dijkstra.GetShortestPathTo(end_vertex);
+ alternative_paths.push_back(shortest_path);
+ if(shortest_path.size() == 0)
+ return alternative_paths;
+
+ EdgeId shpath_last_edge = shortest_path[shortest_path.size() - 1];
+ for(auto in_edge = this->graph_.IncomingEdges(end_vertex).begin();
+ in_edge != this->graph_.IncomingEdges(end_vertex).end(); in_edge++){
+ if(shpath_last_edge != *in_edge &&
+ bounded_dijkstra.DistanceCounted(graph_.EdgeStart(*in_edge))){
+ auto curr_short_path = bounded_dijkstra.GetShortestPathTo(graph_.EdgeStart(*in_edge));
+ curr_short_path.push_back(*in_edge);
+ alternative_paths.push_back(curr_short_path);
+ }
+ }
+ return alternative_paths;
+ }
+
+private:
+ DECL_LOGGER("DijkstraBulgePathsSearcher");
+};
+
+class PathProcessorBulgeSearcher {
+ Graph &graph_;
+ size_t search_depth_;
+public:
+ PathProcessorBulgeSearcher(Graph &graph, size_t search_depth) :
+ graph_(graph),
+ search_depth_(search_depth) { }
+
+ vector<VertexId> VerticesReachedFrom(VertexId start_vertex) {
+ auto bounded_dijkstra = DijkstraHelper<Graph>::CreateBoundedDijkstra(this->graph_,
+ this->search_depth_);
+ bounded_dijkstra.Run(start_vertex);
+ return bounded_dijkstra.ReachedVertices();
+ }
+
+ vector<vector<EdgeId> > GetAllPathsTo(VertexId start_vertex, VertexId end_vertex) {
+ PathStorageCallback<Graph> callback(this->graph_);
+ ProcessPaths(this->graph_, 0, this->search_depth_,
+ start_vertex, end_vertex, callback);
+ return callback.paths();
+ }
+};
+
+}
diff --git a/src/projects/dipspades/polymorphic_bulge_remover/bulge_splitter.hpp b/src/projects/dipspades/polymorphic_bulge_remover/bulge_splitter.hpp
new file mode 100644
index 0000000..cb57930
--- /dev/null
+++ b/src/projects/dipspades/polymorphic_bulge_remover/bulge_splitter.hpp
@@ -0,0 +1,497 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "../utils/bulge_utils.hpp"
+#include "../utils/element_printers.hpp"
+
+#include "gluing_vertices_definer.hpp"
+
+using namespace debruijn_graph;
+using namespace io;
+
+namespace dipspades {
+
+// both bounds are inclusive:
+// the first index is the index of the start edge of the subpath,
+// the second index is the index of the last edge of the subpath
+typedef pair<size_t, size_t> subpath_range;
+
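+// Holds the two split bulge sides; the paths can be extended by the results of
+// splitting subsequent subpath ranges, with a consistency check on the junction.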
+class SplitResult {
+ Graph &graph_;
+ vector<EdgeId> path1_;
+ vector<EdgeId> path2_;
+
+ bool CheckExtention(vector<EdgeId> &old_path, vector<EdgeId> &new_path){
+ if(old_path.size() != 0 && new_path.size() != 0)
+ return graph_.EdgeEnd(old_path[old_path.size() - 1]) ==
+ graph_.EdgeStart(new_path[0]);
+ return true;
+ }
+
+ void ExtendPath(vector<EdgeId> &old_path, vector<EdgeId> new_path){
+ VERIFY(CheckExtention(old_path, new_path));
+ old_path.insert(old_path.end(), new_path.begin(), new_path.end());
+ }
+
+public:
+ SplitResult(Graph &graph, vector<EdgeId> path1, vector<EdgeId> path2) :
+ graph_(graph), path1_(path1), path2_(path2) { }
+
+ SplitResult(Graph &graph) : graph_(graph) { }
+
+ void ExtendPaths(SplitResult new_results) {
+ ExtendPath(path1_, new_results.path1());
+ ExtendPath(path2_, new_results.path2());
+ }
+
+ vector<EdgeId> path1() { return path1_; }
+
+ vector<EdgeId> path2() { return path2_; }
+
+ bool IsEmpty() { return path1_.size() == 0 || path2_.size() == 0; }
+};
+
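+// Splits two bulge sides over given subpath ranges so that both sides end up
+// with the same number of edges: inner vertices are ordered by their relative
+// distance from the subpath start and are either merged (when close enough) or
+// projected onto the opposite side by splitting the corresponding edge.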
+class SubpathsSplitter {
+
+ typedef vector<EdgeId> edge_path;
+
+ Graph &graph_;
+ pair<edge_path, edge_path> old_paths_;
+ pair<vector<size_t>, vector<size_t> > part_lens_;
+ pair<size_t, size_t> num_splits_;
+ pair<edge_path, edge_path > split_paths_;
+ pair<vector<size_t>, vector<size_t> > partlen_split_paths_;
+ pair<size_t, size_t> spath_lens_;
+ pair<subpath_range, subpath_range> ranges_;
+
+ enum owner { no_owner, first_path, second_path };
+ struct vertex_to_split {
+ double rel_dist_; // relative distance of the vertex from the start of the subpath
+ owner owner_path_; // path that owns this vertex
+ size_t edge_ind_; // index of the edge whose end coincides with this vertex
+
+ vertex_to_split(double rel_dist, owner owner_path, size_t edge_ind) :
+ rel_dist_(rel_dist), owner_path_(owner_path), edge_ind_(edge_ind) { }
+ };
+
+ vector<vertex_to_split> vertices_to_split_;
+
+ struct split_find_result {
+ EdgeId edge_;
+ size_t pos_;
+ bool correct_;
+
+ split_find_result() : edge_(), pos_(), correct_(false) { }
+
+ split_find_result(EdgeId edge, size_t pos, bool correct) :
+ edge_(edge),
+ pos_(pos),
+ correct_(correct) { }
+
+ void initialize(EdgeId edge, size_t pos, bool correct){
+ edge_ = edge;
+ pos_ = pos;
+ correct_ = correct;
+ }
+ };
+
+ void clear(){
+ split_paths_.first.clear();
+ split_paths_.second.clear();
+ partlen_split_paths_.first.clear();
+ partlen_split_paths_.second.clear();
+ vertices_to_split_.clear();
+ }
+
+ edge_path CutSubpath(edge_path path, subpath_range range){
+ return edge_path(path.begin() + range.first, path.begin() + range.second + 1);
+ }
+
+ size_t DefineStartEdgePosition(size_t num_splits, subpath_range range){
+ if(num_splits == 0)
+ return range.second;
+ return range.first;
+ }
+
+ size_t DefineSpathLenBefore(vector<size_t> &part_len, subpath_range range){
+ if(range.first == 0)
+ return 0;
+ return part_len[range.first - 1];
+ }
+
+ size_t DefineSubpathLen(vector<size_t> &part_len, subpath_range range){
+ return part_len[range.second] - DefineSpathLenBefore(part_len, range);
+ }
+
+ void InitializeSplitVectors(size_t num_splits, edge_path &split_path, vector<size_t> &split_lens,
+ edge_path old_path, subpath_range range){
+ if(num_splits == 0){
+ split_path = CutSubpath(old_path, range);
+ split_lens = CalculatePathPartLens(graph_, split_path);
+ return;
+ }
+ split_path.push_back(old_path[range.first]);
+ split_lens.push_back(graph_.length(old_path[range.first]));
+ }
+
+ size_t IndexFirstOppositeEdge(size_t split_index, set<size_t> &processed_indices){
+ for(size_t index = split_index + 1; index < vertices_to_split_.size(); index++)
+ if(vertices_to_split_[index].owner_path_ != vertices_to_split_[split_index].owner_path_)
+ if(processed_indices.find(index) == processed_indices.end())
+ return index;
+ return vertices_to_split_.size() - 1;
+ }
+
+ bool VerticesMergePossible(size_t ind1, size_t ind2){
+ VERIFY(vertices_to_split_[ind1].owner_path_ != vertices_to_split_[ind2].owner_path_);
+ // TODO: move this magic constant to the config file
+ return fabs(vertices_to_split_[ind1].rel_dist_ - vertices_to_split_[ind2].rel_dist_) < .01;
+ }
+
+ bool OwnersMatch(size_t ind, owner owner_path){
+ return vertices_to_split_[ind].owner_path_ == owner_path;
+ }
+
+ pair<size_t, size_t> OrderByPaths(size_t ind1, size_t ind2){
+ VERIFY(vertices_to_split_[ind1].owner_path_ != vertices_to_split_[ind2].owner_path_);
+ if(OwnersMatch(ind1, first_path))
+ return pair<size_t, size_t>(ind1, ind2);
+ return pair<size_t, size_t>(ind2, ind1);
+ }
+
+ void PerformMerge(pair<size_t, size_t> indexes){
+ size_t edge_ind1 = vertices_to_split_[indexes.first].edge_ind_ + 1;
+ split_paths_.first.push_back(old_paths_.first[edge_ind1]);
+ partlen_split_paths_.first.push_back(partlen_split_paths_.first[partlen_split_paths_.first.size() - 1] +
+ graph_.length(old_paths_.first[edge_ind1]));
+
+ size_t edge_ind2 = vertices_to_split_[indexes.second].edge_ind_ + 1;
+ split_paths_.second.push_back(old_paths_.second[edge_ind2]);
+ partlen_split_paths_.second.push_back(partlen_split_paths_.second[partlen_split_paths_.second.size() - 1] +
+ graph_.length(old_paths_.second[edge_ind2]));
+ }
+
+ EdgeId get_last_edge_by_owner(owner path_owner){
+ if(path_owner == first_path)
+ return split_paths_.first[split_paths_.first.size() - 1];
+ return split_paths_.second[split_paths_.second.size() - 1];
+// size_t path_index = vertices_to_split_[index].edge_ind_;
+// owner path_owner = vertices_to_split_[index].owner_path_;
+// if(path_owner == first_path)
+// return old_paths_.first[path_index];
+// return old_paths_.second[path_index];
+ }
+
+ split_find_result FindSplitPosition(pair<size_t, size_t> indices, size_t oppos_spaths_len,
+ vector<size_t> &oppos_pathlen){
+ EdgeId edge_to_split = get_last_edge_by_owner(
+ vertices_to_split_[indices.second].owner_path_);
+ size_t split_pos = size_t(vertices_to_split_[indices.first].rel_dist_ * double(oppos_spaths_len));
+ TRACE("Absolute split position " << split_pos);
+ TRACE("oppos_pathlen[oppos_pathlen.size() - 2] - " << oppos_pathlen[oppos_pathlen.size() - 2]);
+ if(oppos_pathlen.size() != 1 && split_pos >= oppos_pathlen[oppos_pathlen.size() - 2])
+ split_pos -= oppos_pathlen[oppos_pathlen.size() - 2];
+
+ if(split_pos == 0) split_pos++;
+
+ TRACE("Edge before split - " << graph_.str(edge_to_split) <<
+ ", split pos - " << split_pos);
+
+ return split_find_result(edge_to_split, split_pos, split_pos < graph_.length(edge_to_split));
+ }
+
+ void UpdateSplittedPath(edge_path &path, pair<EdgeId, EdgeId> splitted_edges){
+ if(path.size() == 0)
+ path.push_back(splitted_edges.first);
+ else
+ path[path.size() - 1] = splitted_edges.first;
+ path.push_back(splitted_edges.second);
+ }
+
+ void UpdatesplittedPartLens(vector<size_t> &part_lens, pair<EdgeId, EdgeId> splitted_edges){
+ if(part_lens.size() == 0)
+ part_lens.push_back(graph_.length(splitted_edges.first));
+ else if(part_lens.size() == 1)
+ part_lens[0] = graph_.length(splitted_edges.first);
+ else
+ part_lens[part_lens.size() - 1] = part_lens[part_lens.size() - 2] +
+ graph_.length(splitted_edges.first);
+ part_lens.push_back(part_lens[part_lens.size() - 1] + graph_.length(splitted_edges.second));
+ }
+
+ void SplitOppositeEdge(split_find_result split_res, edge_path &oppos_path,
+ vector<size_t> &oppos_partlen){
+ if(!split_res.correct_ || graph_.length(split_res.edge_) < split_res.pos_)
+ return;
+ pair<EdgeId, EdgeId> splitted_edges = graph_.SplitEdge(split_res.edge_, split_res.pos_);
+ TRACE("Edges after split - " << graph_.str(splitted_edges.first) << " " <<
+ graph_.str(splitted_edges.second));
+ UpdateSplittedPath(oppos_path, splitted_edges);
+ UpdatesplittedPartLens(oppos_partlen, splitted_edges);
+ }
+
+ // in each pair the first element refers to the path being split, the second to the opposite path
+ bool PerformSplit(pair<size_t, size_t> indexes,
+ pair<edge_path&, edge_path& > split_paths,
+ pair<vector<size_t>&, vector<size_t>& > split_partlens,
+ pair<edge_path&, edge_path& > default_paths,
+ pair<size_t, size_t> spaths_len,
+ pair<size_t, size_t> num_splits){
+
+ TRACE("New path1 before: " << SimplePathWithVerticesToString(graph_, split_paths.first));
+ TRACE("New path2 before: " << SimplePathWithVerticesToString(graph_, split_paths.second));
+
+ TRACE("FindEdgeAndSplitPosition");
+ split_find_result split_res = FindSplitPosition(indexes, spaths_len.second,
+ split_partlens.second);
+
+ if(!split_res.correct_){
+ TRACE("Split was not performed");
+ return false;
+ }
+
+ TRACE("SplitOppositeEdge");
+ SplitOppositeEdge(split_res, split_paths.second, split_partlens.second);
+
+ // update non splitted path
+ TRACE("Update non splitted path");
+ if(num_splits.second != 0){
+ size_t edge_ind = vertices_to_split_[indexes.first].edge_ind_ + 1;
+ split_paths.first.push_back(default_paths.first[edge_ind]);
+ split_partlens.first.push_back(split_partlens.first[split_partlens.first.size() - 1] +
+ graph_.length(default_paths.first[edge_ind]));
+ }
+
+ TRACE("New path1 after: " << SimplePathWithVerticesToString(graph_, split_paths.first));
+ TRACE("New path2 after: " << SimplePathWithVerticesToString(graph_, split_paths.second));
+
+ return true;
+ }
+
+ // expects the element order in pair_to_order to match (first_path, second_path)
+ template<typename T>
+ pair<T&, T&> OrderBySplitAndOpposite(size_t split_ind, size_t oppos_ind, pair<T, T> &pair_to_order){
+ VERIFY(vertices_to_split_[split_ind].owner_path_ !=
+ vertices_to_split_[oppos_ind].owner_path_);
+ if(vertices_to_split_[split_ind].owner_path_ == first_path)
+ return pair<T&, T&>(pair_to_order.first, pair_to_order.second);
+ return pair<T&, T&>(pair_to_order.second, pair_to_order.first);
+ }
+
+ bool PerformSplitting(subpath_range range1, subpath_range range2){
+ TRACE("Vector initialization");
+ InitializeSplitVectors(num_splits_.second, split_paths_.first,
+ partlen_split_paths_.first, old_paths_.first, range1);
+ InitializeSplitVectors(num_splits_.first, split_paths_.second,
+ partlen_split_paths_.second, old_paths_.second, range2);
+
+ size_t num_done_splits = 0;
+ size_t split_index = 1;
+
+ TRACE("Splitting cycle starts");
+
+ set<size_t> processed_indices;
+ while(num_done_splits < num_splits_.first + num_splits_.second){
+ TRACE("Splitted index - " << split_index << " , owner - " <<
+ vertices_to_split_[split_index].owner_path_);
+
+ size_t opposite_index = IndexFirstOppositeEdge(split_index, processed_indices);
+ TRACE("Opposite index - " << opposite_index << ", owner - " <<
+ vertices_to_split_[opposite_index].owner_path_);
+
+ if(processed_indices.find(split_index) == processed_indices.end()){
+ if(VerticesMergePossible(split_index, opposite_index) &&
+ (opposite_index != vertices_to_split_.size() - 2) &&
+ (opposite_index != vertices_to_split_.size() - 1)){
+
+ TRACE("Merge starts");
+ PerformMerge(OrderByPaths(split_index, opposite_index));
+ num_done_splits += 2;
+ processed_indices.insert(opposite_index);
+
+ TRACE("Merge was performed");
+ }
+ else{
+ TRACE("Split starts");
+
+ bool split_res = PerformSplit(pair<size_t, size_t>(split_index, opposite_index),
+ OrderBySplitAndOpposite<edge_path>(split_index, opposite_index, split_paths_),
+ OrderBySplitAndOpposite<vector<size_t> >(split_index, opposite_index, partlen_split_paths_),
+ OrderBySplitAndOpposite<edge_path>(split_index, opposite_index, old_paths_),
+ OrderBySplitAndOpposite<size_t>(split_index, opposite_index, spath_lens_),
+ OrderBySplitAndOpposite<size_t>(split_index, opposite_index, num_splits_));
+
+ if(!split_res)
+ return false;
+
+ num_done_splits++;
+ TRACE("Split was performed");
+ }
+
+ processed_indices.insert(split_index);
+ }
+ TRACE("Number done splittings - " << num_done_splits);
+ split_index ++;
+ TRACE("-------------------------");
+ }
+ TRACE("Splitting cycle ends");
+ TRACE("-------------------------");
+ return true;
+ }
+
+ void CreateVectorSplitVertices(subpath_range range1, subpath_range range2){
+ pair<size_t, size_t> lens_before_spath(DefineSpathLenBefore(part_lens_.first, range1),
+ DefineSpathLenBefore(part_lens_.second, range2));
+ spath_lens_ = pair<size_t, size_t>(DefineSubpathLen(part_lens_.first, range1),
+ DefineSubpathLen(part_lens_.second, range2));
+ vertices_to_split_.push_back(vertex_to_split(0, no_owner, 0));
+ size_t iter1 = range1.first;
+ size_t iter2 = range2.first;
+
+ TRACE("Partlens for 1st vector: " << VectorToString<size_t>(part_lens_.first));
+ TRACE("Partlens for 2nd vector: " << VectorToString<size_t>(part_lens_.second));
+ TRACE("Slens before - " << lens_before_spath.first << " " << lens_before_spath.second);
+
+ for(size_t i = 0; i < num_splits_.first + num_splits_.second; i++){
+ double rel_dist1 = double(part_lens_.first[iter1] - lens_before_spath.first) /
+ double(spath_lens_.first);
+ double rel_dist2 = double(part_lens_.second[iter2] - lens_before_spath.second) /
+ double(spath_lens_.second);
+ if(rel_dist1 < rel_dist2){
+ vertices_to_split_.push_back(vertex_to_split(rel_dist1, first_path, iter1));
+ iter1++;
+ }
+ else{
+ vertices_to_split_.push_back(vertex_to_split(rel_dist2, second_path, iter2));
+ iter2++;
+ }
+ }
+ vertices_to_split_.push_back(vertex_to_split(1.0, second_path, iter2));
+ vertices_to_split_.push_back(vertex_to_split(1.0, first_path, iter1));
+ }
+
+public:
+ SubpathsSplitter(Graph &graph, shared_ptr<BaseBulge> bulge) :
+ graph_(graph),
+ old_paths_(pair<edge_path, edge_path>(bulge->path1(), bulge->path2())),
+ part_lens_(make_pair(CalculatePathPartLens(graph_, old_paths_.first),
+ CalculatePathPartLens(graph_, old_paths_.second))),
+ num_splits_(),
+ split_paths_(),
+ partlen_split_paths_() { }
+
+ SplitResult SplitSubpaths(subpath_range range1, subpath_range range2) {
+ clear();
+
+ ranges_.first = range1;
+ ranges_.second = range2;
+
+ // number of splits on the 1st and the 2nd subpaths
+ num_splits_.first = range1.second - range1.first;
+ num_splits_.second = range2.second - range2.first;
+
+ TRACE("Range 1: " << range1.first << " - " << range1.second);
+ TRACE("Range 2: " << range2.first << " - " << range2.second);
+ TRACE("Num splits 1 - " << num_splits_.first << ", num splits 2 - " << num_splits_.second);
+
+ TRACE("Subpath to split1 - " << SimplePathWithVerticesToString(graph_, CutSubpath(old_paths_.first, range1)));
+ TRACE("Subpath to split2 - " << SimplePathWithVerticesToString(graph_, CutSubpath(old_paths_.second, range2)));
+
+ if(num_splits_.first + num_splits_.second == 0)
+ return SplitResult(graph_, CutSubpath(old_paths_.first, range1),
+ CutSubpath(old_paths_.second, range2));
+
+ CreateVectorSplitVertices(range1, range2);
+
+ TRACE("Vertices to split:");
+ for(auto it = vertices_to_split_.begin(); it != vertices_to_split_.end(); it++)
+ TRACE(it->rel_dist_ << " " << it->owner_path_ << " " << it->edge_ind_ );
+
+ TRACE("Auxiliary vectors were created");
+
+ if(!PerformSplitting(range1, range2))
+ return SplitResult(graph_);
+
+ TRACE("Splitted spath1 - " << SimplePathWithVerticesToString(graph_, split_paths_.first));
+ TRACE("Splitted spath2 - " << SimplePathWithVerticesToString(graph_, split_paths_.second));
+ return SplitResult(graph_, split_paths_.first, split_paths_.second);
+ }
+
+private:
+ DECL_LOGGER("SubpathSplitter");
+};
+
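+// Turns a complex bulge into a gluable one: both sides are split between
+// consecutive gluing vertex pairs so that the resulting paths can be glued
+// edge by edge.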
+class BulgeSplitter {
+ Graph &graph_;
+public:
+ BulgeSplitter(Graph &graph) : graph_(graph) { }
+
+ shared_ptr<BaseBulge> SplitBulge(shared_ptr<BaseBulge> bulge, GluingVericesDefinerResults gluing_def_results) {
+ if(bulge->IsSimple()){
+ TRACE("Bulge is simple. Splitting was not performed");
+ return shared_ptr<BaseBulge>(new Bulge(graph_, graph_.k(), bulge->path1(), bulge->path2()));
+ }
+
+ SubpathsSplitter spaths_splitter(graph_, bulge);
+ if(gluing_def_results.size() == 0){
+ // one big split
+ TRACE("No gluing vertices. Split will perform between start and end vertices");
+ auto split_res = spaths_splitter.SplitSubpaths(
+ subpath_range(0, bulge->path1().size() - 1),
+ subpath_range(0, bulge->path2().size() - 1));
+ TRACE("bulge was splitted");
+ TRACE("1st new bulge side - " << SimplePathWithVerticesToString(graph_, split_res.path1()));
+ TRACE("2nd new bulge side - " << SimplePathWithVerticesToString(graph_, split_res.path2()));
+ return shared_ptr<BaseBulge>(new Bulge(graph_, graph_.k(), split_res.path1(), split_res.path2()));
+ }
+ TRACE(gluing_def_results.size() << " - number of gluing pairs");
+ // splitting before first gluing pair
+ TRACE("Splitting before first gluing pair");
+ auto split_result = spaths_splitter.SplitSubpaths(
+ subpath_range(0, gluing_def_results.begin()->first),
+ subpath_range(0, gluing_def_results.begin()->second));
+
+ if(split_result.IsEmpty())
+ return shared_ptr<BaseBulge>(new Bulge(graph_));
+
+ // perform all intermediate splittings
+ TRACE("All intermediate splittings");
+ for(auto iter1 = gluing_def_results.begin(), iter2 = ++gluing_def_results.begin();
+ iter2 != gluing_def_results.end(); iter1++, iter2++){
+ TRACE("Gluing pairs - (" << iter1->first << " " << iter1->second << ") (" <<
+ iter2->first << " " << iter2->second << ")");
+ auto new_split_res = spaths_splitter.SplitSubpaths(
+ subpath_range(iter1->first + 1, iter2->first),
+ subpath_range(iter1->second + 1, iter2->second));
+ if(new_split_res.IsEmpty())
+ return shared_ptr<BaseBulge>(new Bulge(graph_));
+ split_result.ExtendPaths(new_split_res);
+ }
+
+ // splitting after the last gluing pair
+ TRACE("Splitting after the last gluing pair");
+ auto last_split_res = spaths_splitter.SplitSubpaths(
+ subpath_range((--gluing_def_results.end())->first + 1, bulge->path1().size() - 1),
+ subpath_range((--gluing_def_results.end())->second + 1, bulge->path2().size() - 1));
+ if(last_split_res.IsEmpty())
+ return shared_ptr<BaseBulge>(new Bulge(graph_));
+ split_result.ExtendPaths(last_split_res);
+
+ TRACE("New bulge path1 - " << SimplePathWithVerticesToString(graph_, split_result.path1()));
+ TRACE("New bulge path2 - " << SimplePathWithVerticesToString(graph_, split_result.path2()));
+ TRACE("Splitting completed");
+
+ return shared_ptr<BaseBulge>(new Bulge(graph_, graph_.k(), split_result.path1(), split_result.path2()));
+ }
+
+private:
+ DECL_LOGGER("BulgeSplitter");
+};
+
+}
diff --git a/src/projects/dipspades/polymorphic_bulge_remover/complex_bulge_remover.hpp b/src/projects/dipspades/polymorphic_bulge_remover/complex_bulge_remover.hpp
new file mode 100644
index 0000000..debe5e3
--- /dev/null
+++ b/src/projects/dipspades/polymorphic_bulge_remover/complex_bulge_remover.hpp
@@ -0,0 +1,145 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "pipeline/config_struct.hpp"
+#include "pipeline/graphio.hpp"
+#include "stages/construction.hpp"
+
+#include "utils/path_routines.hpp"
+#include "utils/element_printers.hpp"
+#include "utils/histogram.hpp"
+
+#include "bulge_correction_condition.hpp"
+#include "bulge_gluer.hpp"
+#include "diploid_bulge_finder.hpp"
+
+#include "io/reads_io/splitting_wrapper.hpp"
+
+#include <stdlib.h>
+#include <memory.h>
+#include <math.h>
+
+namespace dipspades {
+
+inline bool EdgeExists(Graph &graph, size_t edge_id){
+ for(auto e = graph.SmartEdgeBegin(); !e.IsEnd(); ++e)
+ if(graph.int_id(*e) == edge_id)
+ return true;
+ return false;
+}
+
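+// Generic polymorphic bulge removal: for every vertex with several outgoing
+// edges, paths to reachable vertices with several incoming edges are collected,
+// a diploid bulge is searched for among them and, if it passes the correctness
+// conditions, glued; lengths of glued bulges are recorded in the histogram.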
+template<class BulgePathsSearcher, class BulgeGluer>
+class BulgeRemoverAlgorithm{
+ typedef vector<vector<EdgeId> > paths;
+protected:
+ Graph &graph_;
+ BulgeGluer bulge_gluer_;
+ BaseHistogram<size_t> &hist_;
+ const dipspades_config::polymorphic_br &pbr_config_;
+
+ DiploidBulgeFinder bulge_finder_;
+ DiploidyCondition dip_bulge_checker_;
+ RelatedVerticesCondition rel_bulge_checker_;
+
+ bool BulgeExistFrom(VertexId start){
+ return graph_.OutgoingEdgeCount(start) > 1;
+ }
+
+ bool BulgeExistTo(VertexId end){
+ return graph_.IncomingEdgeCount(end) > 1;
+ }
+
+ void FillHistogram(shared_ptr<BaseBulge> bulge){
+ hist_.Add(max<size_t>(GetPathLength(graph_, bulge->path1()),
+ GetPathLength(graph_, bulge->path2())));
+ }
+
+ bool FindGlueBulge(paths &bulge_paths){
+ TRACE("Bulge finder from " << bulge_paths.size() << " paths starts");
+ auto bulge = bulge_finder_.Find(bulge_paths);
+ if(bulge->IsEmpty()){
+ TRACE("Paths do not form a bulge");
+ return false;
+ }
+ TRACE("Paths form a bulge");
+ TRACE("Bulge gluing starts");
+ if(!rel_bulge_checker_.IsBulgeCorrect(bulge)/* ||
+ !dip_bulge_checker_.IsBulgeCorrect(bulge)*/){
+ TRACE("Bulge do not successed diploid condition");
+ return false;
+ }
+
+ TRACE("Correct bulge:");
+ TRACE("Path1:" << SimplePathWithVerticesToString(graph_, bulge->path1()));
+ TRACE("Path2:" << SimplePathWithVerticesToString(graph_, bulge->path2()));
+
+ FillHistogram(bulge);
+ TRACE("Diploid condition was passed");
+ if(!bulge_gluer_.GlueBulge(bulge))
+ return false;
+
+ TRACE("Bulge gluing ends");
+ return true;
+ }
+
+public:
+ BulgeRemoverAlgorithm(Graph &graph,
+ BulgeGluer bulge_gluer,
+ BaseHistogram<size_t> &hist,
+ const dipspades_config::polymorphic_br &pbr_config) :
+ graph_(graph),
+ bulge_gluer_(bulge_gluer),
+ hist_(hist),
+ pbr_config_(pbr_config),
+ bulge_finder_(graph, pbr_config.rel_bulge_length, pbr_config.rel_bulge_align),
+ dip_bulge_checker_(graph, pbr_config.rel_bulge_length, pbr_config.rel_bulge_align),
+ rel_bulge_checker_(graph) { }
+
+ size_t Run(){
+ size_t num_merged_paths = 0;
+ BulgePathsSearcher paths_searcher(graph_,
+ max<size_t>(hist_.max(), pbr_config_.max_bulge_nucls_len),
+ pbr_config_.max_neigh_number);
+ INFO("Maximal length of glued bulge: " << hist_.max());
+ TRACE("BulgeRemoverAlgorithm starts");
+ for(auto v = graph_.SmartVertexBegin(); !v.IsEnd(); ++v){
+ TRACE("Processing vertex " << graph_.str(*v));
+ if(BulgeExistFrom(*v)){
+ auto reached_vertices = paths_searcher.VerticesReachedFrom(*v);
+ TRACE("Number of neigs - " << reached_vertices.size());
+ for(auto neigh = SmartSetIterator<Graph, VertexId>(graph_,
+ reached_vertices.begin(), reached_vertices.end());
+ !neigh.IsEnd(); ++neigh){
+ if(*neigh != *v && BulgeExistTo(*neigh)){
+ TRACE("Bulge can be found");
+ TRACE("Processing neigh " << graph_.str(*neigh));
+ auto bulge_paths = paths_searcher.GetAllPathsTo(*v, *neigh);
+
+ TRACE("Bulge paths:");
+ for(auto p = bulge_paths.begin(); p != bulge_paths.end(); p++)
+ TRACE(SimplePathWithVerticesToString(graph_, *p));
+
+ if(FindGlueBulge(bulge_paths)){
+ num_merged_paths++;
+ TRACE("Bulge was glued");
+ break;
+ }
+ }
+ }
+ }
+ }
+ TRACE(num_merged_paths << " bulges were glued");
+ return num_merged_paths;
+ }
+
+private:
+ DECL_LOGGER("PolymorphicBulgeRemover");
+};
+
+}
diff --git a/src/projects/dipspades/polymorphic_bulge_remover/diploid_bulge_finder.hpp b/src/projects/dipspades/polymorphic_bulge_remover/diploid_bulge_finder.hpp
new file mode 100644
index 0000000..709f02f
--- /dev/null
+++ b/src/projects/dipspades/polymorphic_bulge_remover/diploid_bulge_finder.hpp
@@ -0,0 +1,102 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#include "../utils/bulge_utils.hpp"
+#include "bulge_paths_searcher.hpp"
+
+using namespace debruijn_graph;
+
+namespace dipspades {
+
+class DiploidBulgeFinder {
+ typedef vector<vector<EdgeId> > paths;
+
+ Graph &graph_;
+ double rel_length_;
+ double rel_align_;
+
+ bool RelativeLengthGood(double rel_length){
+ return rel_length >= rel_length_;
+ }
+
+ bool RelativeAlignGood(double rel_align){
+ return rel_align <= rel_align_;
+ }
+
+ bool BulgePathsIntersected(vector<EdgeId> &path1, vector<EdgeId> &path2){
+ for(auto e1 = path1.begin(); e1 != path1.end(); e1++)
+ for(auto e2 = path2.begin(); e2 != path2.end(); e2++)
+ if(*e1 == *e2)
+ return true;
+ return false;
+ }
+
+ vector<pair<size_t, size_t> > ChoosePairsWithoutIntersection(paths &bulge_paths){
+ vector<pair<size_t, size_t> > correct_pairs;
+ for(size_t i = 0; i < bulge_paths.size(); i++)
+ for(size_t j = i + 1; j < bulge_paths.size(); j++)
+ if(!BulgePathsIntersected(bulge_paths[i], bulge_paths[j]))
+ correct_pairs.push_back(make_pair(i, j));
+ return correct_pairs;
+ }
+
+ vector<pair<size_t, size_t> > DefineLenSatisfiedPairs(vector<size_t> &lens,
+ vector<pair<size_t, size_t> > pairs){
+ vector<pair<size_t, size_t> > good_pairs;
+ for(auto it = pairs.begin(); it != pairs.end(); it++)
+ if(RelativeLengthGood(RelativeLengthEquality(lens[it->first], lens[it->second])))
+ good_pairs.push_back(*it);
+ return good_pairs;
+ }
+
+ vector<pair<size_t, size_t> > ChooseSeqSatisfiedPairs(vector<Sequence> &seqs,
+ vector<pair<size_t, size_t> > pairs){
+ vector<pair<size_t, size_t> > good_pairs;
+ for(auto it = pairs.begin(); it != pairs.end(); it++)
+ if(RelativeAlignGood(RelAlignmentOfSequences(seqs[it->first], seqs[it->second])))
+ good_pairs.push_back(*it);
+ return good_pairs;
+ }
+
+ vector<Sequence> GetSequences(paths &bulge_paths){
+ vector<Sequence> seqs;
+ for(auto it = bulge_paths.begin(); it != bulge_paths.end(); it++)
+ seqs.push_back(GetSequenceByPath(graph_, graph_.k(), *it));
+ return seqs;
+ }
+
+ vector<size_t> GetLengths(paths &bulge_paths){
+ vector<size_t> lens;
+ for(auto it = bulge_paths.begin(); it != bulge_paths.end(); it++)
+ lens.push_back(GetPathLength(graph_, *it));
+ return lens;
+ }
+
+public:
+ DiploidBulgeFinder(Graph &graph, double rel_length, double rel_align) :
+ graph_(graph),
+ rel_length_(rel_length),
+ rel_align_(rel_align) { }
+
+ shared_ptr<BaseBulge> Find(paths &bulge_paths){
+ if(bulge_paths.size() <= 1)
+ return shared_ptr<BaseBulge>(new Bulge(graph_));
+
+ auto good_pairs = ChoosePairsWithoutIntersection(bulge_paths);
+ vector<Sequence> seqs = GetSequences(bulge_paths);
+ vector<size_t> lens = GetLengths(bulge_paths);
+ good_pairs = DefineLenSatisfiedPairs(lens, good_pairs);
+ good_pairs = ChooseSeqSatisfiedPairs(seqs, good_pairs);
+
+ if(good_pairs.size() == 0)
+ return shared_ptr<BaseBulge>(new Bulge(graph_));
+ return shared_ptr<BaseBulge>(new Bulge(graph_, graph_.k(), bulge_paths[good_pairs[0].first],
+ bulge_paths[good_pairs[0].second]));
+ }
+};
+
+}
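
Note on the logic above: DiploidBulgeFinder::Find keeps only path pairs that share no edges, have comparable total length, and align closely, returning the first pair that survives all filters. Below is a minimal standalone sketch of that filtering idea written against plain lengths and std::string; the Candidate struct, function names, and the omission of the edge-disjointness check are illustrative assumptions, not part of the dipSPAdes API.

    #include <algorithm>
    #include <string>
    #include <utility>
    #include <vector>

    // Toy stand-ins for graph paths: each candidate is a (length, sequence) pair.
    struct Candidate { size_t length; std::string seq; };

    // Levenshtein distance, classic O(n*m) dynamic programming with two rows.
    static size_t EditDistance(const std::string &a, const std::string &b) {
        std::vector<size_t> prev(b.size() + 1), cur(b.size() + 1);
        for (size_t j = 0; j <= b.size(); ++j) prev[j] = j;
        for (size_t i = 1; i <= a.size(); ++i) {
            cur[0] = i;
            for (size_t j = 1; j <= b.size(); ++j)
                cur[j] = std::min({prev[j] + 1, cur[j - 1] + 1,
                                   prev[j - 1] + (a[i - 1] == b[j - 1] ? 0 : 1)});
            std::swap(prev, cur);
        }
        return prev[b.size()];
    }

    // Returns the first pair (i, j) whose lengths and sequences are similar enough,
    // mirroring the length-ratio and relative-alignment filters; assumes non-empty
    // sequences and non-zero lengths.
    static std::pair<int, int> FindDiploidPair(const std::vector<Candidate> &cands,
                                               double rel_length, double rel_align) {
        for (size_t i = 0; i < cands.size(); ++i)
            for (size_t j = i + 1; j < cands.size(); ++j) {
                double len_ratio = double(std::min(cands[i].length, cands[j].length)) /
                                   double(std::max(cands[i].length, cands[j].length));
                double align = double(EditDistance(cands[i].seq, cands[j].seq)) /
                               double(std::min(cands[i].seq.size(), cands[j].seq.size()));
                if (len_ratio >= rel_length && align <= rel_align)
                    return {int(i), int(j)};
            }
        return {-1, -1};  // no diploid-like pair found
    }
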
diff --git a/src/projects/dipspades/polymorphic_bulge_remover/glue_direction_definer.hpp b/src/projects/dipspades/polymorphic_bulge_remover/glue_direction_definer.hpp
new file mode 100644
index 0000000..6fadfd8
--- /dev/null
+++ b/src/projects/dipspades/polymorphic_bulge_remover/glue_direction_definer.hpp
@@ -0,0 +1,76 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "../utils/bulge_utils.hpp"
+#include "../utils/path_routines.hpp"
+#include "../dipspades_config.hpp"
+
+using namespace debruijn_graph;
+
+namespace dipspades {
+
+
+class GluingDirectionDefiner {
+protected:
+ Graph &graph_;
+public:
+ GluingDirectionDefiner(Graph &graph) : graph_(graph) { }
+ virtual glue_direction Define(shared_ptr<BaseBulge>) {
+ return undefined;
+ }
+ virtual ~GluingDirectionDefiner() { }
+};
+
+class RelatedBaseGlueDirectionDefiner : public GluingDirectionDefiner{
+public:
+ RelatedBaseGlueDirectionDefiner(Graph &graph) : GluingDirectionDefiner(graph) { }
+
+ glue_direction Define(shared_ptr<BaseBulge> bulge){
+ bool rel_edges_path1 = PathAdjacentRelatedEdges(this->graph_, bulge->path1());
+ bool rel_edges_path2 = PathAdjacentRelatedEdges(this->graph_, bulge->path2());
+ if(rel_edges_path1 && rel_edges_path2)
+ return undefined;
+
+ // if only path2 contains related edges
+ // we need gluing path2 to path1
+ if(rel_edges_path2)
+ return reverse_gluing;
+ return direct_gluing;
+ }
+};
+
+class CoverageBaseGlueDirectionDefiner : public GluingDirectionDefiner{
+public:
+ CoverageBaseGlueDirectionDefiner(Graph &graph) : GluingDirectionDefiner(graph) { }
+
+ glue_direction Define(shared_ptr<BaseBulge>){
+ // todo implement me
+ return direct_gluing;
+ }
+};
+
+class CompositeGlueDirectionDefiner : public GluingDirectionDefiner {
+ vector<shared_ptr<GluingDirectionDefiner> > &definers_;
+public:
+ CompositeGlueDirectionDefiner(Graph &graph,
+ vector<shared_ptr<GluingDirectionDefiner> > &definers) :
+ GluingDirectionDefiner(graph),
+ definers_(definers) { }
+
+ glue_direction Define(shared_ptr<BaseBulge> bulge){
+ set<glue_direction> directions;
+ for(auto it = definers_.begin(); it != definers_.end(); it++)
+ directions.insert((*it)->Define(bulge));
+ if(directions.size() == 1)
+ return *(directions.begin());
+ return undefined;
+ }
+};
+
+}
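
CompositeGlueDirectionDefiner above simply requires unanimity among its sub-definers. A hedged standalone sketch of that voting rule follows; the enum and function names are invented for illustration.

    #include <functional>
    #include <set>
    #include <vector>

    enum class GlueDirection { Direct, Reverse, Undefined };

    // Returns the unanimous direction, or Undefined if the definers disagree.
    GlueDirection CompositeDefine(const std::vector<std::function<GlueDirection()>> &definers) {
        std::set<GlueDirection> votes;
        for (const auto &d : definers)
            votes.insert(d());
        return votes.size() == 1 ? *votes.begin() : GlueDirection::Undefined;
    }
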
diff --git a/src/projects/dipspades/polymorphic_bulge_remover/gluing_vertices_definer.hpp b/src/projects/dipspades/polymorphic_bulge_remover/gluing_vertices_definer.hpp
new file mode 100644
index 0000000..e83848d
--- /dev/null
+++ b/src/projects/dipspades/polymorphic_bulge_remover/gluing_vertices_definer.hpp
@@ -0,0 +1,170 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "../utils/bulge_utils.hpp"
+#include "../dipspades_config.hpp"
+
+using namespace debruijn_graph;
+
+namespace dipspades {
+
+size_t abs_diff(size_t a, size_t b){
+ if(a > b)
+ return a - b;
+ return b - a;
+}
+
+typedef map<size_t, size_t> BulgeGluingVertices;
+typedef map<size_t, size_t>::iterator BulgeGluingVerticesIter;
+
+class GluingVericesDefinerResults {
+ map<size_t, size_t> gluing_pairs_;
+
+ bool IsNewPairGood(pair<size_t, size_t> new_pair){
+ if(gluing_pairs_.size() == 0)
+ return true;
+
+ auto upper = gluing_pairs_.upper_bound(new_pair.first);
+ auto lower = gluing_pairs_.lower_bound(new_pair.first);
+
+ // the map doesn't contain an element with a greater 1st vertex
+ if(upper == gluing_pairs_.end()){
+ // the map doesn't contain an element with the same 1st vertex
+ if(lower == upper){
+ // go to the previous element (with a smaller 1st vertex);
+ // if its 2nd vertex precedes the new pair's 2nd vertex, add the new pair,
+ // otherwise reject it
+ lower--;
+ return lower->second < new_pair.second;
+ }
+ // the map already contains an element with the same key,
+ // so adding the new pair is incorrect
+ return false;
+ }
+ // the map contains an element with a greater 1st vertex
+ // whose 2nd vertex is <= the new pair's 2nd vertex,
+ // so the addition is incorrect
+ if(upper->second <= new_pair.second)
+ return false;
+ // the map contains an element with a greater 1st vertex
+ // and doesn't contain the same 1st vertex
+ if(lower == upper){
+ // if there are no other elements
+ // add new pair
+ if(lower == gluing_pairs_.begin())
+ return true;
+ // other elements exist:
+ // go to the previous element
+ // and check that its 2nd vertex precedes the new pair's 2nd vertex
+ lower--;
+ return lower->second < new_pair.second;
+ }
+ return false;
+ }
+
+public:
+ void AddNewPair(pair<size_t, size_t> new_pair){
+ if(IsNewPairGood(new_pair)){
+ TRACE("New pair was inserted: " << new_pair.first << " " << new_pair.second);
+ gluing_pairs_.insert(new_pair);
+ }
+ }
+
+ BulgeGluingVerticesIter begin() { return gluing_pairs_.begin(); }
+
+ BulgeGluingVerticesIter end() { return gluing_pairs_.end(); }
+
+ size_t size() { return gluing_pairs_.size(); }
+
+private:
+ DECL_LOGGER("GluingVericesDefinerResults");
+};
+
+class GluingVericesDefiner {
+ Graph &graph_;
+ double rel_length_threshold_;
+
+ typedef map<pair<size_t, size_t>, double> PairSubpaths;
+ PairSubpaths gluing_candidate_;
+
+ double RelativeSimilarityOfLength(size_t len1, size_t len2){
+ return double(abs_diff(len1, len2)) / double(min<size_t>(len1, len2));
+ }
+
+ size_t StartSpathLength(const vector<size_t> &lens, size_t index){
+ VERIFY(index < lens.size());
+ return lens[index];
+ }
+
+ size_t EndSpathLength(const vector<size_t> &lens, size_t index){
+ VERIFY(index < lens.size());
+ return lens[lens.size() - 1] - lens[index];
+ }
+
+ void ChooseGluingCandidate(shared_ptr<BaseBulge> bulge){
+
+ TRACE("Choosing gluing candidates");
+ vector<size_t> part_lens1 = CalculatePathPartLens(graph_, bulge->path1());
+ vector<size_t> part_lens2 = CalculatePathPartLens(graph_, bulge->path2());
+
+ for(size_t i = 0; i < part_lens1.size() - 1; i++)
+ for(size_t j = 0; j < part_lens2.size() - 1; j++){
+ double rel_len_start_spaths = RelativeSimilarityOfLength(StartSpathLength(part_lens1, i),
+ StartSpathLength(part_lens2, j));
+ double rel_len_end_spaths = RelativeSimilarityOfLength(EndSpathLength(part_lens1, i),
+ EndSpathLength(part_lens2, j));
+
+ if(rel_len_start_spaths <= rel_length_threshold_ &&
+ rel_len_end_spaths <= rel_length_threshold_){
+ TRACE("New gluing candidate - " << i << ", " << j);
+ TRACE("rel_len_start_spaths - " << rel_len_start_spaths);
+ TRACE("rel_len_end_spaths - " << rel_len_end_spaths);
+ gluing_candidate_[make_pair(i,j)] = max<double>(rel_len_start_spaths, rel_len_end_spaths);
+ }
+ }
+ }
+
+ pair<size_t, size_t> GetBestPair(){
+ double min = 1;
+ pair<size_t, size_t> best_res;
+
+ for(auto it = gluing_candidate_.begin(); it != gluing_candidate_.end(); it++)
+ if(it->second < min){
+ best_res = it->first;
+ min = it->second;
+ }
+ return best_res;
+ }
+
+ GluingVericesDefinerResults ChooseGluingPairs(shared_ptr<BaseBulge> bulge){
+ gluing_candidate_.clear();
+ ChooseGluingCandidate(bulge);
+ GluingVericesDefinerResults gluing_pairs;
+ while(gluing_candidate_.size() != 0){
+ auto best_pair = GetBestPair();
+ gluing_pairs.AddNewPair(best_pair);
+ gluing_candidate_.erase(best_pair);
+ }
+ return gluing_pairs;
+ }
+
+public:
+ GluingVericesDefiner(Graph &graph, double rel_length_threshold) :
+ graph_(graph),
+ rel_length_threshold_(rel_length_threshold) { }
+
+ GluingVericesDefinerResults Run(shared_ptr<BaseBulge> bulge){
+ return ChooseGluingPairs(bulge);
+ }
+
+private:
+ DECL_LOGGER("GluingVericesDefiner");
+};
+
+}
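
GluingVericesDefiner scores candidate (position-in-path1, position-in-path2) pairs by how similar the resulting prefix and suffix lengths would be, then greedily accepts the best-scoring pairs while keeping the accepted set monotone in both coordinates (this is what IsNewPairGood enforces). Below is a standalone sketch of that greedy monotone selection, sorting the candidates once instead of repeatedly extracting the minimum; all names are illustrative.

    #include <algorithm>
    #include <map>
    #include <utility>
    #include <vector>

    // Accepted pairs keyed by the position in path1; values are positions in path2.
    using PairMap = std::map<size_t, size_t>;

    // A pair is acceptable iff inserting it keeps both coordinates strictly increasing.
    bool KeepsMonotone(const PairMap &accepted, std::pair<size_t, size_t> p) {
        auto it = accepted.lower_bound(p.first);
        if (it != accepted.end()) {
            if (it->first == p.first) return false;    // same path1 position already used
            if (it->second <= p.second) return false;  // would cross the next accepted pair
        }
        if (it != accepted.begin()) {
            --it;                                      // previous accepted pair
            if (it->second >= p.second) return false;  // would cross the previous pair
        }
        return true;
    }

    // Greedily accept candidates in order of increasing score (smaller is better).
    PairMap SelectGluingPairs(std::vector<std::pair<double, std::pair<size_t, size_t>>> scored) {
        std::sort(scored.begin(), scored.end());
        PairMap accepted;
        for (const auto &cand : scored)
            if (KeepsMonotone(accepted, cand.second))
                accepted.insert(cand.second);
        return accepted;
    }
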
diff --git a/src/projects/dipspades/polymorphic_bulge_remover/iterative_tails_gluing.hpp b/src/projects/dipspades/polymorphic_bulge_remover/iterative_tails_gluing.hpp
new file mode 100644
index 0000000..ef4ce17
--- /dev/null
+++ b/src/projects/dipspades/polymorphic_bulge_remover/iterative_tails_gluing.hpp
@@ -0,0 +1,132 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "../utils/bulge_utils.hpp"
+#include "../dipspades_config.hpp"
+
+using namespace debruijn_graph;
+
+namespace dipspades {
+
+class IterativeTailGluing {
+ Graph &graph_;
+ double rel_align_;
+
+ typedef VertexId::type::edge_const_iterator edge_const_iterator;
+ typedef pair<edge_const_iterator, edge_const_iterator> edge_iters;
+ typedef boost::optional<EdgeId> OptEdgeId;
+
+
+ OptEdgeId GetEdgeForGlue(EdgeId edge, edge_iters iters){
+ double best_rel_align = 1;
+ OptEdgeId res;
+ for(auto it = iters.first; it != iters.second; it++){
+ if(edge != *it){
+ pair<Sequence, Sequence> seqs;
+ if(graph_.length(edge) <= graph_.length(*it)){
+ seqs.first = graph_.EdgeNucls(edge);
+ seqs.second = graph_.EdgeNucls(*it).Subseq(0, seqs.first.size());
+ }
+ else{
+ seqs.first = graph_.EdgeNucls(*it);
+ seqs.second = graph_.EdgeNucls(edge).Subseq(0, seqs.first.size());
+ }
+ double rel_align = RelAlignmentOfSequences(seqs.first, seqs.second);
+ if(rel_align <= rel_align_ && rel_align <= best_rel_align){
+ best_rel_align = rel_align;
+ res = *it;
+ }
+ }
+ }
+ return res;
+ }
+
+ bool ProcessTail(EdgeId edge, edge_iters iters){
+ auto edge_for_glue = GetEdgeForGlue(edge, iters);
+ if(edge_for_glue.is_initialized()){
+
+ TRACE("Edge for glue " << graph_.str(edge_for_glue.get()));
+ TRACE("Edges lengths" << graph_.length(edge) << " - " << graph_.length(edge_for_glue.get()));
+
+ size_t min_len = min<size_t>(graph_.length(edge), graph_.length(edge_for_glue.get()));
+ if(min_len == graph_.length(edge) && min_len == graph_.length(edge_for_glue.get())){
+ graph_.GlueEdges(edge, edge_for_glue.get());
+ }
+ else{
+ if(min_len == graph_.length(edge)){
+ pair<EdgeId, EdgeId> new_edges = graph_.SplitEdge(edge_for_glue.get(), min_len);
+ graph_.GlueEdges(edge, new_edges.first);
+ }
+ else {
+ auto new_edges = graph_.SplitEdge(edge, min_len);
+ graph_.GlueEdges(new_edges.first, edge_for_glue.get());
+ }
+ }
+ return true;
+ }
+ return false;
+ }
+
+ bool IsTailIncoming(EdgeId edge){
+ return graph_.IncomingEdgeCount(graph_.EdgeStart(edge)) == 0 &&
+ graph_.OutgoingEdgeCount(graph_.EdgeStart(edge)) == 1;
+ }
+
+ bool ProcessTail(EdgeId edge){
+ if(IsTailIncoming(edge))
+ return ProcessTail(edge,
+ edge_iters(graph_.IncomingEdges(graph_.EdgeEnd(edge)).begin(),
+ graph_.IncomingEdges(graph_.EdgeEnd(edge)).end()));
+ return ProcessTail(edge,
+ edge_iters(graph_.OutgoingEdges(graph_.EdgeStart(edge)).begin(),
+ graph_.OutgoingEdges(graph_.EdgeStart(edge)).end()));
+ }
+
+ bool EdgeIsTail(EdgeId edge) {
+ return (graph_.IncomingEdgeCount(graph_.EdgeStart(edge)) == 0 &&
+ graph_.OutgoingEdgeCount(graph_.EdgeStart(edge)) == 1) ||
+ (graph_.IncomingEdgeCount(graph_.EdgeEnd(edge)) == 1 &&
+ graph_.OutgoingEdgeCount(graph_.EdgeEnd(edge)) == 0);
+ }
+
+ bool EdgeIsIsolate(EdgeId edge){
+ return (graph_.IncomingEdgeCount(graph_.EdgeStart(edge)) == 0 &&
+ graph_.OutgoingEdgeCount(graph_.EdgeStart(edge)) == 1) &&
+ (graph_.IncomingEdgeCount(graph_.EdgeEnd(edge)) == 1 &&
+ graph_.OutgoingEdgeCount(graph_.EdgeEnd(edge)) == 0);
+ }
+
+ size_t ProcessTails(){
+ size_t num_glued_tails = 0;
+ for(auto edge = graph_.SmartEdgeBegin(); !edge.IsEnd(); ++edge)
+ if(EdgeIsTail(*edge) && !EdgeIsIsolate(*edge)){
+ TRACE("Processing edge " << graph_.str(*edge));
+ if(ProcessTail(*edge))
+ num_glued_tails++;
+ }
+ return num_glued_tails;
+ }
+
+public:
+ IterativeTailGluing(Graph &graph, double rel_align) :
+ graph_(graph),
+ rel_align_(rel_align) { }
+
+ size_t IterativeProcessTails(){
+ size_t num_glued_tails = 1;
+ size_t total_glued_tails = 0;
+ size_t num_iter = 1;
+ while(num_glued_tails > 0){
+ num_glued_tails = ProcessTails();
+ total_glued_tails += num_glued_tails;
+ INFO(num_iter << " iteration: " << num_glued_tails << " tails were glued");
+ num_iter++;
+ }
+ return total_glued_tails;
+ }
+};
+
+}
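
GetEdgeForGlue above compares the shorter of the two edges against an equal-length prefix of the longer one and keeps the neighbour with the smallest relative edit distance under the threshold. A standalone sketch of that selection on plain strings, with the distance metric passed in (e.g. the Levenshtein sketch shown earlier in this section); the interface is an illustrative assumption, not the upstream API.

    #include <functional>
    #include <string>
    #include <vector>

    // Returns the index of the best-matching candidate, or -1 if none passes the threshold.
    int ChooseTailToGlue(const std::string &tail,
                         const std::vector<std::string> &candidates,
                         double rel_align_threshold,
                         const std::function<size_t(const std::string&, const std::string&)> &dist) {
        double best = 1.0;
        int best_idx = -1;
        for (size_t i = 0; i < candidates.size(); ++i) {
            // Compare the shorter sequence against an equal-length prefix of the longer one.
            const std::string &shorter = tail.size() <= candidates[i].size() ? tail : candidates[i];
            const std::string &longer  = tail.size() <= candidates[i].size() ? candidates[i] : tail;
            double rel = double(dist(shorter, longer.substr(0, shorter.size()))) /
                         double(shorter.size());
            if (rel <= rel_align_threshold && rel <= best) {
                best = rel;
                best_idx = int(i);
            }
        }
        return best_idx;
    }
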
diff --git a/src/projects/dipspades/polymorphic_bulge_remover/polymorphic_bulge_remover.hpp b/src/projects/dipspades/polymorphic_bulge_remover/polymorphic_bulge_remover.hpp
new file mode 100644
index 0000000..3b481b7
--- /dev/null
+++ b/src/projects/dipspades/polymorphic_bulge_remover/polymorphic_bulge_remover.hpp
@@ -0,0 +1,109 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "simple_bulge_remover.hpp"
+#include "complex_bulge_remover.hpp"
+#include "iterative_tails_gluing.hpp"
+#include "assembly_graph/stats/picture_dump.hpp"
+
+#include "visualization/visualization.hpp"
+#include "assembly_graph/handlers/edges_position_handler.hpp"
+#include "assembly_graph/components/graph_component.hpp"
+#include "algorithms/simplification/compressor.hpp"
+
+using namespace debruijn_graph;
+
+namespace dipspades {
+
+class PolymorphicBulgeRemoverHelper {
+public:
+ typedef ComplexBulgeGluer<RelatedBaseGlueDirectionDefiner, GluingVericesDefiner, BulgeSplitter> BaseBulgeGluer;
+ static BaseBulgeGluer CreateBaseBulgeGluer(Graph &graph, double rel_len_threshold){
+ return BaseBulgeGluer(graph, RelatedBaseGlueDirectionDefiner(graph),
+ GluingVericesDefiner(graph, rel_len_threshold), BulgeSplitter(graph));
+ }
+};
+
+class PolymorphicBulgeRemover {
+ conj_graph_pack &graph_pack_;
+ BaseHistogram<size_t> &bulge_len_hist_;
+
+ typedef BulgeRemoverAlgorithm<DijkstraBulgePathsSearcher,
+ PolymorphicBulgeRemoverHelper::BaseBulgeGluer> LightBulgeRemover;
+ typedef BulgeRemoverAlgorithm<PathProcessorBulgeSearcher,
+ PolymorphicBulgeRemoverHelper::BaseBulgeGluer> HardBulgeRemover;
+
+ void RunSimpleBRCycle(){
+ INFO("Simple polymorphic bulge remover runs");
+ SimpleBulgeRemover spath_br(graph_pack_.g, bulge_len_hist_, dsp_cfg::get().pbr);
+ size_t num_glued_bulges = 1;
+ for(size_t num_iter = 1; num_glued_bulges > 0; num_iter++){
+ num_glued_bulges = spath_br.Run();
+ CompressAllVertices(graph_pack_.g, false);
+ INFO(ToString(num_iter) + " iteration: " + ToString(num_glued_bulges) + " simple bulges were glued");
+ }
+ INFO("Simple polymorphic bulge remover ends");
+ }
+
+ template<class BulgeRemover>
+ void BulgeRemoverCycle(string bulge_remover_name, size_t num_iters){
+ INFO(bulge_remover_name + " starts");
+ INFO("Maximal number of iterations: " << num_iters);
+ BulgeRemover br(graph_pack_.g,
+ PolymorphicBulgeRemoverHelper::CreateBaseBulgeGluer(graph_pack_.g,
+ dsp_cfg::get().pbr.paired_vert_rel_threshold),
+ bulge_len_hist_,
+ dsp_cfg::get().pbr);
+ size_t num_glued_bulges = 1;
+ for(size_t i = 0; (i < num_iters) && (num_glued_bulges != 0); i++){
+ num_glued_bulges = br.Run();
+ CompressAllVertices(graph_pack_.g, false);
+ INFO(ToString(i + 1) + " iteration: " + ToString(num_glued_bulges) + " complex bulges were glued");
+ }
+ INFO(bulge_remover_name + " ends");
+ }
+
+ void WriteComponents(string component_dir) {
+ if(!dsp_cfg::get().rp.developer_mode)
+ return;
+
+ graph_pack_.EnsureDebugInfo();
+ make_dir(dsp_cfg::get().io.output_dir + "components/");
+ omnigraph::DefaultLabeler<Graph> labeler(graph_pack_.g, graph_pack_.edge_pos);
+ make_dir(dsp_cfg::get().io.output_dir + "components/" + component_dir + "/");
+ omnigraph::visualization::WriteComponents(graph_pack_.g,
+ dsp_cfg::get().io.output_dir + "components/" + component_dir + "/",
+ omnigraph::ReliableSplitter<Graph>(graph_pack_.g),
+ omnigraph::visualization::DefaultColorer(graph_pack_.g, Path<EdgeId>(), Path<EdgeId>()),
+ labeler);
+ }
+
+public:
+ PolymorphicBulgeRemover(conj_graph_pack &graph_pack,
+ BaseHistogram<size_t> &bulge_len_hist) :
+ graph_pack_(graph_pack),
+ bulge_len_hist_(bulge_len_hist) { }
+
+ void Run(){
+ if(!dsp_cfg::get().pbr.enabled)
+ return;
+ WriteComponents("before_pbr");
+ graph_pack_.kmer_mapper.SetUnsafeMode(true);
+ INFO("Polymorphic bulge remover starts");
+ RunSimpleBRCycle();
+ BulgeRemoverCycle<LightBulgeRemover>("LightBulgeRemover", dsp_cfg::get().pbr.num_iters_lbr);
+ INFO("Index refilling");
+ graph_pack_.index.Refill();
+ INFO("Polymorphic ends remover ends");
+ WriteComponents("after_pbr");
+ graph_pack_.kmer_mapper.SetUnsafeMode(false);
+ }
+};
+
+}
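
RunSimpleBRCycle, BulgeRemoverCycle and IterativeProcessTails all follow the same driver pattern: run one pass, count the changes, and stop once a pass changes nothing or an iteration cap is reached. A compact sketch of that fixed-point loop with the pass left abstract; the names are illustrative.

    #include <cstddef>
    #include <functional>

    // Repeats `pass` until it reports zero changes or max_iters passes have run.
    // Returns the total number of changes performed.
    size_t RunToFixedPoint(const std::function<size_t()> &pass, size_t max_iters) {
        size_t total = 0;
        for (size_t i = 0; i < max_iters; ++i) {
            size_t changed = pass();  // e.g. one bulge-removal pass followed by compression
            total += changed;
            if (changed == 0)
                break;
        }
        return total;
    }
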
diff --git a/src/projects/dipspades/polymorphic_bulge_remover/simple_bulge_remover.hpp b/src/projects/dipspades/polymorphic_bulge_remover/simple_bulge_remover.hpp
new file mode 100644
index 0000000..73f727d
--- /dev/null
+++ b/src/projects/dipspades/polymorphic_bulge_remover/simple_bulge_remover.hpp
@@ -0,0 +1,51 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "bulge_correction_condition.hpp"
+#include "bulge_gluer.hpp"
+#include "../utils/histogram.hpp"
+
+using namespace debruijn_graph;
+
+namespace dipspades {
+
+class SimpleBulgeRemover{
+ Graph &graph_;
+ BaseHistogram<size_t> &bulge_len_hist_;
+ RelatedVerticesCondition rel_bulge_checker_;
+ DiploidyCondition dip_bulge_checker_;
+public:
+ SimpleBulgeRemover(Graph &graph,
+ BaseHistogram<size_t> &bulge_len_hist,
+ const dipspades_config::polymorphic_br &pbr_config) :
+ graph_(graph),
+ bulge_len_hist_(bulge_len_hist),
+ rel_bulge_checker_(graph),
+ dip_bulge_checker_(graph, pbr_config.rel_bulge_length, pbr_config.rel_bulge_align) {}
+
+ size_t Run(){
+ size_t glued_edges_count = 0;
+ for(auto e = graph_.SmartEdgeBegin(); !e.IsEnd(); ++e){
+ vector<EdgeId> edges = graph_.GetEdgesBetween(graph_.EdgeStart(*e),
+ graph_.EdgeEnd(*e));
+ if(edges.size() >= 2){
+ auto bulge = shared_ptr<BaseBulge>(new Bulge(graph_, graph_.k(), edges[0], edges[1]));
+ if(rel_bulge_checker_.IsBulgeCorrect(bulge) &&
+ dip_bulge_checker_.IsBulgeCorrect(bulge)){
+ bulge_len_hist_.Add(max<size_t>(graph_.length(edges[0]), graph_.length(edges[1])));
+ graph_.GlueEdges(edges[0], edges[1]);
+ glued_edges_count++;
+ }
+ }
+ }
+ return glued_edges_count;
+ }
+};
+
+}
diff --git a/src/projects/dipspades/utils/bulge_utils.hpp b/src/projects/dipspades/utils/bulge_utils.hpp
new file mode 100644
index 0000000..0471891
--- /dev/null
+++ b/src/projects/dipspades/utils/bulge_utils.hpp
@@ -0,0 +1,267 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "path_routines.hpp"
+#include "element_printers.hpp"
+
+using namespace debruijn_graph;
+
+namespace dipspades {
+
+bool IsRegionBulge(Graph &g, vector<EdgeId> path1, vector<EdgeId> path2){
+ if(path1.size() == 0 || path2.size() == 0)
+ return true;
+ if ((g.EdgeStart(path1[0]) != g.EdgeStart(path2[0])) ||
+ (g.EdgeEnd(path1[path1.size() - 1]) != g.EdgeEnd(path2[path2.size() - 1])))
+ return false;
+ return !PathsShareEdge(path1, path2);
+}
+
+size_t AlignmentOfSequencesByParts(Sequence seq1, Sequence seq2){
+ size_t max_length = 10000;
+ if(min<size_t>(seq1.size(), seq2.size()) > max_length){
+ size_t shrink1 = max_length;
+ size_t num_full_iter = seq1.size() / shrink1;
+
+ size_t summary_dist = 0;
+ size_t shrink2 = size_t((double(shrink1) / double(seq1.size())) * double(seq2.size()));
+ for(size_t i = 0; i < num_full_iter; i++){
+ Sequence cur_seq1 = seq1.Subseq(shrink1 * i, shrink1 * (i + 1));
+ Sequence cur_seq2 = seq2.Subseq(shrink2 * i, shrink2 * (i + 1));
+ summary_dist += EditDistance(cur_seq1, cur_seq2);
+ }
+
+ if(seq1.size() % shrink1 != 0){
+ Sequence cur_seq1 = seq1.Subseq(shrink1 * num_full_iter, seq1.size());
+ Sequence cur_seq2 = seq2.Subseq(shrink2 * num_full_iter, seq2.size());
+ summary_dist += EditDistance(cur_seq1, cur_seq2);
+ }
+
+ return summary_dist;
+ }
+ return EditDistance(seq1, seq2);
+}
+
+double RelAlignmentOfSequences(Sequence seq1, Sequence seq2){
+ return double(AlignmentOfSequencesByParts(seq1, seq2)) / double(min<size_t>(seq1.size(), seq2.size()));
+}
+
+double RelativeLengthEquality(size_t len1, size_t len2){
+ return double(min<size_t>(len1, len2)) / double(max<size_t>(len1, len2));
+}
+
+enum glue_direction { direct_gluing, reverse_gluing, undefined };
+
+class BaseBulge{
+protected:
+ Graph &graph_;
+public:
+ BaseBulge(Graph &graph) : graph_(graph) { }
+ BaseBulge(const BaseBulge& bulge) : graph_(bulge.graph_) { }
+
+ virtual double relative_length() = 0;
+ virtual double relative_align() = 0;
+ virtual bool IsBulgeDiploid(double rel_length_threshold, double rel_seq_threshold) = 0;
+ virtual vector<EdgeId> path1() = 0; // todo make it const
+ virtual vector<EdgeId> path2() = 0; // todo make it const
+ virtual Sequence seq1() = 0;
+ virtual Sequence seq2() = 0;
+ virtual VertexId start_vertex() = 0;
+ virtual VertexId end_vertex() = 0;
+ virtual bool IsSimple() = 0;
+ virtual bool IsEmpty() = 0;
+
+ virtual size_t BulgeLength() = 0;
+
+ virtual string StrId() = 0;
+ virtual string BulgeToString() = 0;
+
+ virtual ~BaseBulge() { }
+};
+
+class Bulge : public BaseBulge{
+ size_t k_value_;
+ vector<EdgeId> path1_;
+ vector<EdgeId> path2_;
+ Sequence seq1_;
+ Sequence seq2_;
+ double rel_length_;
+ double rel_align_;
+
+ void CalculateRelativeLength(size_t length1, size_t length2){
+ rel_length_ = double(min<size_t>(length1, length2)) / double(max<size_t>(length1, length2));
+ }
+
+ void CalculateRelativeAlign(Sequence seq1, Sequence seq2){
+ rel_align_ = RelAlignmentOfSequences(seq1, seq2);
+ }
+
+ string GetPathStr(vector<EdgeId> path) {
+ string s1 = "";
+ for(auto edge = path.begin(); edge != path.end(); edge++)
+ s1 += ToString(graph_.int_id(*edge)) + "-";
+ return s1.substr(0, s1.size() - 1);
+ }
+
+public:
+ Bulge(Graph &graph) : BaseBulge(graph), k_value_(graph.k()), path1_(), path2_(),
+ seq1_(), seq2_(), rel_length_(), rel_align_() { }
+
+ Bulge(Graph &g, size_t k_value, vector<EdgeId> path1, pair<size_t,size_t> bulge_region1,
+ vector<EdgeId> path2, pair<size_t,size_t> bulge_region2) :
+ BaseBulge(g), k_value_(k_value),
+ path1_(CutSubpathByRegion(path1, bulge_region1)),
+ path2_(CutSubpathByRegion(path2, bulge_region2)),
+ seq1_(GetSequenceByPath(graph_, k_value_, path1_).str().c_str()), // todo make it lazy
+ seq2_(GetSequenceByPath(graph_, k_value_, path2_).str().c_str()), // todo make it lazy
+ rel_length_(0), rel_align_(0) {
+ VERIFY(IsRegionBulge(graph_, path1_, path2_));
+ }
+
+ Bulge(Graph &g, size_t k_value, vector<EdgeId> path1, vector<EdgeId> path2) :
+ BaseBulge(g), k_value_(k_value), path1_(path1), path2_(path2),
+ seq1_(GetSequenceByPath(graph_, k_value_, path1_).str().c_str()),
+ seq2_(GetSequenceByPath(graph_, k_value_, path2_).str().c_str()),
+ rel_length_(0), rel_align_(0){
+ VERIFY(IsRegionBulge(graph_, path1_, path2_));
+ }
+
+ Bulge(Graph &g, size_t k_value, EdgeId edge1, EdgeId edge2) : BaseBulge(g),
+ k_value_(k_value),
+ path1_(1, edge1), path2_(1, edge2),
+ seq1_(graph_.EdgeNucls(edge1).str().c_str()),
+ seq2_(graph_.EdgeNucls(edge2).str().c_str()),
+ rel_length_(0), rel_align_(0) {
+ VERIFY(IsRegionBulge(graph_, path1_, path2_));
+ }
+
+ double relative_length(){
+ if(rel_length_ == 0){
+ size_t length1 = GetPathLength(graph_, path1_);
+ size_t length2 = GetPathLength(graph_, path2_);
+ CalculateRelativeLength(length1, length2);
+ }
+ return rel_length_;
+ }
+
+ double relative_align(){
+ if(rel_align_ == 0){
+ Sequence seq1 = GetSequenceByPath(graph_, k_value_, path1_);
+ Sequence seq2 = GetSequenceByPath(graph_, k_value_, path2_);
+ CalculateRelativeAlign(seq1, seq2);
+ }
+ return rel_align_;
+ }
+
+ bool IsBulgeDiploid(double rel_length_threshold, double rel_seq_threshold){
+ if(relative_length() < rel_length_threshold)
+ return false;
+
+ return relative_align() <= rel_seq_threshold;
+ }
+
+ vector<EdgeId> path1(){
+ return path1_;
+ }
+
+ vector<EdgeId> path2(){
+ return path2_;
+ }
+
+ Sequence seq1() { return seq1_; }
+
+ Sequence seq2() { return seq2_; }
+
+ VertexId start_vertex(){
+ return graph_.EdgeStart(path1_[0]);
+ }
+
+ VertexId end_vertex(){
+ return graph_.EdgeEnd(path1_[path1_.size() - 1]);
+ }
+
+ bool IsSimple() { return path1_.size() == 1 && path2_.size() == 1; }
+
+ bool IsEmpty() { return path1_.size() == 0 || path2_.size() == 0; }
+
+ string StrId() {
+ string s1 = GetPathStr(path1());
+ string s2 = GetPathStr(path2());
+ return min<string>(s1,s2) + "_" + max<string>(s1,s2);
+ }
+
+ size_t BulgeLength() {
+ return max<size_t>(GetPathLength(graph_, path1()), GetPathLength(graph_, path2()));
+ }
+
+ string BulgeToString() {
+ return "Side1: " + SimplePathWithVerticesToString(graph_, path1()) + "\n" +
+ "Side2: " + SimplePathWithVerticesToString(graph_, path2());
+ }
+};
+
+class DirectedBulge : public BaseBulge {
+ shared_ptr<BaseBulge> bulge_;
+ bool glue_direct_;
+public:
+ DirectedBulge(Graph &graph, shared_ptr<BaseBulge> bulge, glue_direction glue_direct = direct_gluing) :
+ BaseBulge(graph), bulge_(bulge), glue_direct_(glue_direct) { }
+
+ double relative_length() { return bulge_->relative_length(); }
+
+ double relative_align() { return bulge_->relative_align(); }
+
+ bool IsBulgeDiploid(double rel_length_threshold, double rel_seq_threshold) {
+ return bulge_->IsBulgeDiploid(rel_length_threshold, rel_seq_threshold);
+ }
+
+ vector<EdgeId> path1() {
+ if(glue_direct_ == direct_gluing)
+ return bulge_->path1();
+ return bulge_->path2();
+ }
+
+ vector<EdgeId> path2() {
+ if(glue_direct_ == direct_gluing)
+ return bulge_->path2();
+ return bulge_->path1();
+ }
+
+ Sequence seq1() {
+ if(glue_direct_ == direct_gluing)
+ return bulge_->seq1();
+ return bulge_->seq2();
+ }
+
+ Sequence seq2() {
+ if(glue_direct_ == direct_gluing)
+ return bulge_->seq2();
+ return bulge_->seq1();
+ }
+
+ VertexId start_vertex() {
+ return bulge_->start_vertex();
+ }
+
+ VertexId end_vertex() {
+ return bulge_->end_vertex();
+ }
+
+ bool IsSimple() { return bulge_->IsSimple(); }
+
+ bool IsEmpty() { return bulge_->IsEmpty(); }
+
+ string StrId() { return bulge_->StrId(); }
+
+ size_t BulgeLength() { return bulge_->BulgeLength(); }
+
+ string BulgeToString() { return bulge_->BulgeToString(); }
+};
+
+}
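
AlignmentOfSequencesByParts approximates the edit distance of very long sequences by cutting the first sequence into fixed-size chunks, cutting the second into proportionally scaled chunks, and summing the per-chunk distances; RelAlignmentOfSequences then normalises by the shorter length. Below is a standalone sketch of that approximation on std::string, with the 10000-character chunk size kept as a parameter; everything here is an illustrative re-implementation, not the upstream code.

    #include <algorithm>
    #include <string>
    #include <vector>

    size_t Levenshtein(const std::string &a, const std::string &b) {
        std::vector<size_t> prev(b.size() + 1), cur(b.size() + 1);
        for (size_t j = 0; j <= b.size(); ++j) prev[j] = j;
        for (size_t i = 1; i <= a.size(); ++i) {
            cur[0] = i;
            for (size_t j = 1; j <= b.size(); ++j)
                cur[j] = std::min({prev[j] + 1, cur[j - 1] + 1,
                                   prev[j - 1] + (a[i - 1] == b[j - 1] ? 0 : 1)});
            std::swap(prev, cur);
        }
        return prev[b.size()];
    }

    // Chunked approximation: cut seq1 into fixed-size pieces and seq2 into
    // proportionally scaled pieces, then sum the per-chunk distances.
    size_t ChunkedDistance(const std::string &seq1, const std::string &seq2, size_t chunk = 10000) {
        if (std::min(seq1.size(), seq2.size()) <= chunk)
            return Levenshtein(seq1, seq2);
        size_t full = seq1.size() / chunk;
        size_t chunk2 = size_t(double(chunk) / double(seq1.size()) * double(seq2.size()));
        size_t dist = 0;
        for (size_t i = 0; i < full; ++i)
            dist += Levenshtein(seq1.substr(i * chunk, chunk), seq2.substr(i * chunk2, chunk2));
        if (full * chunk < seq1.size())  // leftover tails of both sequences
            dist += Levenshtein(seq1.substr(full * chunk), seq2.substr(full * chunk2));
        return dist;
    }

    // Relative alignment: distance normalised by the shorter sequence.
    double RelativeAlignment(const std::string &a, const std::string &b) {
        return double(ChunkedDistance(a, b)) / double(std::min(a.size(), b.size()));
    }
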
diff --git a/src/projects/dipspades/utils/dijkstra_utils.hpp b/src/projects/dipspades/utils/dijkstra_utils.hpp
new file mode 100644
index 0000000..3dbcdd4
--- /dev/null
+++ b/src/projects/dipspades/utils/dijkstra_utils.hpp
@@ -0,0 +1,163 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+/*
+ * dijkstra_utils.hpp
+ *
+ * Created on: 23.11.2012
+ * Author: yana
+ */
+
+#pragma once
+
+#include <iostream>
+#include <map>
+#include <vector>
+
+using namespace std;
+using namespace debruijn_graph;
+
+namespace dipspades {
+
+struct paths_searcher_config{
+ size_t max_num_vertices;
+ size_t depth_neigh_search;
+ size_t max_len_path;
+};
+
+class PathsSearcher{
+protected:
+ Graph & g_;
+ paths_searcher_config conf_;
+public:
+ PathsSearcher(Graph & g) : g_(g) {}
+ void Initialize(paths_searcher_config conf){
+ conf_ = conf;
+ }
+
+ virtual map<VertexId, vector<EdgeId> > FindShortestPathsFrom(VertexId v) = 0;
+ virtual ~PathsSearcher(){}
+};
+
+class DijkstraSearcher : public PathsSearcher{
+
+public:
+ DijkstraSearcher(Graph & g) : PathsSearcher(g) {
+ }
+
+ map<VertexId, vector<EdgeId> > FindShortestPathsFrom(VertexId v){
+ map<VertexId, vector<EdgeId> > short_paths;
+
+ multimap<size_t, VertexId> dist_v;
+ map<VertexId, size_t> v_dist;
+ map<VertexId, size_t> v_depth;
+ set<VertexId> visited;
+
+ // insertion of the initial vertex
+ vector<EdgeId> empty_path;
+ dist_v.insert(pair<size_t, VertexId>(0, v));
+ v_dist.insert(pair<VertexId, size_t>(v, 0));
+ short_paths.insert(pair<VertexId, vector<EdgeId> >(v, empty_path));
+ v_depth[v] = 0;
+
+ size_t num_visited = 0;
+
+ while((visited.size() < conf_.max_num_vertices) && (dist_v.size() != 0)){
+
+ VertexId cur_v = dist_v.begin()->second;
+ size_t cur_dist = dist_v.begin()->first;
+
+ size_t cur_depth;
+ if(v_depth.find(cur_v) != v_depth.end())
+ cur_depth = v_depth[cur_v];
+ else{
+ size_t min_depth = 100000;
+ bool is_defined = false;
+
+ // defining of depth
+ auto in_edges = g_.IncomingEdges(cur_v);
+ for(auto e = in_edges.begin(); e!= in_edges.end(); e++){
+ VertexId w = g_.EdgeStart(*e);
+ if(v_depth.find(w) != v_depth.end())
+ if(min_depth > v_depth[w]){
+ min_depth = v_depth[w];
+ is_defined = true;
+ }
+ }
+
+ if(is_defined){
+ cur_depth = min_depth + 1;
+ }
+ else{
+ cur_depth = 0;
+ }
+ v_depth[cur_v] = cur_depth;
+ }
+
+ if((cur_depth <= conf_.depth_neigh_search)){
+
+ auto out_edges = g_.OutgoingEdges(cur_v);
+
+ for(auto e = out_edges.begin(); e != out_edges.end(); e++){
+
+ VertexId cur_neigh = g_.EdgeEnd(*e);
+
+ if(visited.find(cur_neigh) == visited.end()){
+
+ size_t new_neigh_dist = g_.length(*e) + cur_dist;
+
+ bool is_replaced = false;
+ if(v_dist.find(cur_neigh) != v_dist.end()){
+
+ size_t old_neigh_dist = v_dist[cur_neigh];
+
+ if(old_neigh_dist > new_neigh_dist){
+ is_replaced = true;
+
+ for(auto it = dist_v.find(old_neigh_dist); it != dist_v.end(); it++)
+ if(it->second == cur_neigh){
+ dist_v.erase(it);
+ break;
+ }
+ }
+ }
+ else
+ is_replaced = true;
+
+ if(is_replaced && new_neigh_dist <= conf_.max_len_path){
+
+ dist_v.insert(pair<size_t, VertexId>(new_neigh_dist, cur_neigh));
+ v_dist[cur_neigh] = new_neigh_dist;
+
+ short_paths[cur_neigh] = short_paths[cur_v];
+ short_paths[cur_neigh].push_back(*e);
+ }
+ }
+ }
+ }
+ else{
+ break;
+ }
+
+ num_visited++;
+ visited.insert(cur_v);
+
+ // erase the visited element
+ for(auto it = dist_v.find(cur_dist); it != dist_v.end(); it++){
+ if(it->second == cur_v){
+ v_dist.erase(it->second);
+ dist_v.erase(it);
+ break;
+ }
+ }
+ }
+
+ return short_paths;
+ }
+};
+
+}
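
DijkstraSearcher above is a length- and depth-bounded Dijkstra that also records the shortest edge path to every reachable vertex. A minimal standalone sketch of the distance computation on a plain adjacency list, using a priority queue instead of the multimap bookkeeping; predecessor tracking and the depth limit are omitted, and the types are illustrative assumptions.

    #include <cstddef>
    #include <functional>
    #include <map>
    #include <queue>
    #include <utility>
    #include <vector>

    struct Edge { size_t to; size_t length; };
    using AdjList = std::vector<std::vector<Edge>>;

    // Shortest distances from `source`, ignoring anything farther than max_dist.
    std::map<size_t, size_t> BoundedDijkstra(const AdjList &g, size_t source, size_t max_dist) {
        std::map<size_t, size_t> dist;           // vertex -> shortest distance found so far
        using Item = std::pair<size_t, size_t>;  // (distance, vertex)
        std::priority_queue<Item, std::vector<Item>, std::greater<Item>> queue;
        dist[source] = 0;
        queue.push({0, source});
        while (!queue.empty()) {
            auto [d, v] = queue.top();
            queue.pop();
            if (d > dist[v]) continue;           // stale queue entry
            for (const Edge &e : g[v]) {
                size_t nd = d + e.length;
                if (nd > max_dist) continue;     // length cutoff, as in conf_.max_len_path
                auto it = dist.find(e.to);
                if (it == dist.end() || nd < it->second) {
                    dist[e.to] = nd;
                    queue.push({nd, e.to});
                }
            }
        }
        return dist;
    }
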
diff --git a/src/projects/dipspades/utils/edge_gluer.hpp b/src/projects/dipspades/utils/edge_gluer.hpp
new file mode 100644
index 0000000..7cc1e50
--- /dev/null
+++ b/src/projects/dipspades/utils/edge_gluer.hpp
@@ -0,0 +1,102 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "algorithms/dijkstra/neighbours_iterator.hpp"
+
+using namespace debruijn_graph;
+
+namespace dipspades {
+
+class EdgeGluer {
+ Graph &graph_;
+
+ void MoveRelatedEdge(EdgeId edge, VertexId new_start, VertexId new_end){
+ EdgeId new_edge = graph_.AddEdge(new_start, new_end, graph_.EdgeNucls(edge));
+ TRACE("New edge " << graph_.str(new_edge) << "was added");
+ graph_.DeleteEdge(edge);
+ }
+
+ void MoverUnrelatedEdge(EdgeId edge, VertexId new_start, VertexId new_end){
+ EdgeId new_edge = graph_.AddEdge(new_start, new_end, graph_.EdgeNucls(edge));
+ TRACE("New edge - " << graph_.str(new_edge) << " old edge - " << graph_.str(edge));
+ if(IsEdgeRelated(graph_, new_edge))
+ graph_.DeleteEdge(edge);
+ else
+ graph_.GlueEdges(edge, new_edge);
+ }
+
+ void StandardEdgeMoving(EdgeId edge, VertexId new_start, VertexId new_end){
+ graph_.AddEdge(new_start, new_end, graph_.EdgeNucls(edge));
+ graph_.DeleteEdge(edge);
+ }
+
+public:
+ EdgeGluer(Graph &graph) : graph_(graph) { }
+
+ void MoveEdgesFromVertexToVertex(VertexId old_vertex, VertexId new_vertex,
+ vector<EdgeId> forbidden_edges){
+
+ TRACE("New start - " << graph_.str(new_vertex) << ", old vertex - " << graph_.str(old_vertex));
+ TRACE("Incoming edges");
+ for(auto in_edges_iter = SmartSetIterator<Graph, EdgeId>(graph_,
+ graph_.IncomingEdges(old_vertex).begin(),
+ graph_.IncomingEdges(old_vertex).end());
+ !in_edges_iter.IsEnd(); ++in_edges_iter){
+ if(find(forbidden_edges.begin(), forbidden_edges.end(), *in_edges_iter) ==
+ forbidden_edges.end()){
+ TRACE("Edge " << graph_.str(*in_edges_iter) << " is not forbidden");
+ if(IsEdgeRelated(graph_, *in_edges_iter)){
+ TRACE("Edge is related");
+ if(IsEdgeLoop(graph_, *in_edges_iter)){
+ TRACE("Edge is loop");
+ StandardEdgeMoving(*in_edges_iter, new_vertex, new_vertex);
+ }
+ else{
+ TRACE("Edge is adjacent to conjugate");
+ StandardEdgeMoving(*in_edges_iter, graph_.conjugate(new_vertex), new_vertex);
+ }
+ }
+ else{
+ TRACE("Edge is not related");
+ StandardEdgeMoving(*in_edges_iter, graph_.EdgeStart(*in_edges_iter), new_vertex);
+ }
+ }
+ }
+
+ TRACE("Outgoing edges");
+ for(auto out_edges_iter = SmartSetIterator<Graph, EdgeId>(graph_,
+ graph_.OutgoingEdges(old_vertex).begin(),
+ graph_.OutgoingEdges(old_vertex).end());
+ !out_edges_iter.IsEnd(); ++out_edges_iter){
+ if(find(forbidden_edges.begin(), forbidden_edges.end(), *out_edges_iter) ==
+ forbidden_edges.end()){
+ TRACE("Edge " << graph_.str(*out_edges_iter) << " is not forbidden");
+ if(IsEdgeRelated(graph_, *out_edges_iter)){
+ TRACE("Edge is related");
+ if(IsEdgeLoop(graph_, *out_edges_iter)){
+ TRACE("Edge is loop");
+ StandardEdgeMoving(*out_edges_iter, new_vertex, new_vertex);
+ }
+ else{
+ TRACE("Edge is adjacent to conjugate");
+ StandardEdgeMoving(*out_edges_iter, new_vertex, graph_.conjugate(new_vertex));
+ }
+ }
+ else{
+ TRACE("Edge is not related");
+ StandardEdgeMoving(*out_edges_iter, new_vertex, graph_.EdgeEnd(*out_edges_iter));
+ }
+ }
+ }
+ }
+
+private:
+ DECL_LOGGER("EdgeGluer");
+};
+
+}
diff --git a/src/projects/dipspades/utils/element_printers.hpp b/src/projects/dipspades/utils/element_printers.hpp
new file mode 100644
index 0000000..ae835e1
--- /dev/null
+++ b/src/projects/dipspades/utils/element_printers.hpp
@@ -0,0 +1,108 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include <set>
+
+using namespace std;
+using namespace debruijn_graph;
+
+namespace dipspades {
+
+void PrintSimplePath(ostream &out, Graph &g, vector<EdgeId> path){
+ for(auto e = path.begin(); e != path.end(); e++)
+ out << g.int_id(*e) << " ";
+ out << endl;
+}
+
+void PrintSimplePathWithVertices(ostream &out, Graph &g, vector<EdgeId> &path){
+ for(auto e = path.begin(); e != path.end(); e++)
+ out << g.int_id(*e) << " (" << g.length(*e) << "), " << g.int_id(g.EdgeStart(*e)) << " - " <<
+ g.int_id(g.EdgeEnd(*e)) << ". ";
+ out << endl;
+}
+
+string SimplePathWithVerticesToString(const Graph &g, vector<EdgeId> path){
+ stringstream ss;
+ for(auto e = path.begin(); e != path.end(); e++)
+ ss << g.int_id(*e) << " (" << g.length(*e) << "), " << g.int_id(g.EdgeStart(*e)) << " - " <<
+ g.int_id(g.EdgeEnd(*e)) << ". ";
+ return ss.str();
+}
+
+string MappingPathToString(Graph &g, MappingPath<EdgeId> path){
+ stringstream ss;
+ for(size_t i = 0; i < path.size(); i++){
+ Range init = path[i].second.initial_range, mapp = path[i].second.mapped_range;
+ ss << "Edge - " << g.str(path[i].first) << " (" << g.length(path[i].first) << ") . Init range - " << init.start_pos <<
+ " - " << init.end_pos << ". Mapp range - " << mapp.start_pos << " - " <<
+ mapp.end_pos << ". ";
+ }
+ return ss.str();
+}
+
+template<class T>
+void PrintSet(ostream &out, set<T> set_elem){
+ for(auto e = set_elem.begin(); e != set_elem.end(); e++)
+ out << *e << " ";
+ out << endl;
+}
+
+template<class T>
+string SetToString(set<T> set_elem){
+ stringstream ss;
+ for(auto e = set_elem.begin(); e != set_elem.end(); e++)
+ ss << *e << " ";
+ return ss.str();
+}
+
+template<class T>
+void PrintVector(ostream &out, vector<T> vect_elem){
+ for(auto e = vect_elem.begin(); e != vect_elem.end(); e++)
+ out << *e << " ";
+ out << endl;
+}
+
+template<class T>
+string VectorToString(vector<T> vect_elem){
+ stringstream ss;
+ for(auto e = vect_elem.begin(); e != vect_elem.end(); e++)
+ ss << *e << " ";
+ return ss.str();
+}
+
+string VerticesVectorToString(Graph &g, vector<VertexId> vertices){
+ stringstream ss;
+ for(auto it = vertices.begin(); it != vertices.end(); it++)
+ ss << g.str(*it) << " ";
+ return ss.str();
+}
+
+void PrintEdgeWithVertices(ostream &out, Graph &g, EdgeId edge){
+ out << "Edge - " << g.int_id(edge) << ". Start vertex - " << g.int_id(g.EdgeStart(edge)) <<
+ ". End vertex - " << g.int_id(g.EdgeEnd(edge)) << endl;
+}
+
+void PrintEdgeWithLength(ostream &out, Graph &g, EdgeId edge){
+ out << "Edge - " << g.int_id(edge) << " with length - " << g.length(edge) << endl;
+}
+
+void PrintVectorOfVertices(ostream &out, Graph &g, vector<VertexId> vect){
+ for(auto v = vect.begin(); v != vect.end(); v++)
+ out << g.int_id(*v) << " ";
+ out << endl;
+}
+
+string MappingRangeToString(MappingRange mr){
+ stringstream ss;
+ ss << "Init: " << mr.initial_range.start_pos << " " << mr.initial_range.end_pos
+ << ". Map: " << mr.mapped_range.start_pos << " " << mr.mapped_range.end_pos << endl;
+ return ss.str();
+}
+
+}
diff --git a/src/projects/dipspades/utils/files_utils.cpp b/src/projects/dipspades/utils/files_utils.cpp
new file mode 100644
index 0000000..2d0896d
--- /dev/null
+++ b/src/projects/dipspades/utils/files_utils.cpp
@@ -0,0 +1,48 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#include "files_utils.hpp"
+
+namespace dipspades {
+
+vector<string> GetAllLinesFromFile(string filename){
+ ifstream freader(filename.c_str());
+ vector<string> lines;
+ if(!freader.fail()){
+ while(!freader.eof()){
+ string new_line;
+ getline(freader, new_line);
+ if(new_line != "")
+ lines.push_back(new_line);
+ }
+ }
+ return lines;
+}
+
+string cut_fname_from_path(string path){
+ string res = path;
+ for(size_t i = path.size() - 1; i > 0; i--)
+ if(path[i] == '/'){
+ res = path.substr(i + 1, path.size() - i - 1);
+ break;
+ }
+
+ for(size_t i = res.size() - 1; i > 0 ; i--)
+ if(res[i] == '.'){
+ res = res.substr(0, i);
+ break;
+ }
+
+ return res;
+}
+
+bool fname_valid(string fname){
+ ifstream out(fname.c_str());
+ return !out.fail();
+}
+
+}
diff --git a/src/dipspades/utils/files_utils.hpp b/src/projects/dipspades/utils/files_utils.hpp
similarity index 100%
rename from src/dipspades/utils/files_utils.hpp
rename to src/projects/dipspades/utils/files_utils.hpp
diff --git a/src/projects/dipspades/utils/histogram.hpp b/src/projects/dipspades/utils/histogram.hpp
new file mode 100644
index 0000000..bcc8e1d
--- /dev/null
+++ b/src/projects/dipspades/utils/histogram.hpp
@@ -0,0 +1,104 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include <map>
+#include <vector>
+
+using namespace std;
+
+namespace dipspades {
+
+template <typename T>
+class BaseHistogram {
+
+ map<T, size_t> sorted_elems_;
+ vector<T> part_sums_;
+ bool invalid_part_sums_;
+
+ void CalculatePartSums(){
+ part_sums_.clear();
+ T prev_elem = 0;
+ for(auto it = sorted_elems_.begin(); it != sorted_elems_.end(); it++){
+ T key = it->first;
+ size_t count = it->second;
+ for(size_t i = 0; i < count; i++){
+ part_sums_.push_back(key + prev_elem);
+ prev_elem += key;
+ }
+ }
+ invalid_part_sums_ = false;
+ }
+
+ T operator[](size_t idx){
+ VERIFY(idx < part_sums_.size());
+ if(size() == 0)
+ return 0;
+ if(idx == 0)
+ return part_sums_[0];
+ return part_sums_[idx] - part_sums_[idx - 1];
+ }
+
+public:
+ void Add(T new_elem, size_t count = 1){
+ invalid_part_sums_ = true;
+ if(sorted_elems_.find(new_elem) == sorted_elems_.end())
+ sorted_elems_[new_elem] = count;
+ else
+ sorted_elems_[new_elem] += count;
+ }
+
+ T Quantile(double quantile){
+ VERIFY(quantile > 0 && quantile <= 1);
+ if(invalid_part_sums_)
+ CalculatePartSums();
+ if(part_sums_.size() == 0)
+ return 0;
+ T total_sum = part_sums_[part_sums_.size() - 1];
+ for(size_t i = 0; i < part_sums_.size(); i++)
+ if(double(part_sums_[i]) / double(total_sum) >= quantile)
+ return (*this)[i];
+
+ return T(0);
+ }
+
+ size_t size() { return part_sums_.size(); }
+
+ T max() {
+ CalculatePartSums();
+ if(size() == 0)
+ return 0;
+ return (*this)[size() - 1];
+ }
+
+ void SaveToFile(string filename) const {
+ ofstream out(filename.c_str());
+ for(auto it = sorted_elems_.begin(); it != sorted_elems_.end(); it++)
+ out << it->first << ' ' << it->second << endl;
+ }
+
+ void LoadFrom(string filename) {
+ ifstream src(filename.c_str());
+ VERIFY(!src.fail());
+ while(!src.eof()){
+ string tmp;
+ getline(src, tmp);
+ if(tmp != ""){
+ stringstream ss;
+ ss << tmp;
+ T elem;
+ size_t count;
+ ss >> elem;
+ ss >> count;
+ Add(elem, count);
+ }
+ }
+ }
+};
+
+}
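
Note that BaseHistogram::Quantile is weighted by the stored values themselves: part_sums_ is the running sum of the values (each repeated by its count), so Quantile(0.5) is the value at which half of the total summed length is reached, similar in spirit to an Nxx statistic. A small standalone sketch of that computation (illustrative names):

    #include <cstddef>
    #include <map>

    // Returns the smallest value v such that the sum of all stored values up to and
    // including v reaches `quantile` (in (0, 1]) of the total sum.
    size_t WeightedQuantile(const std::map<size_t, size_t> &hist /* value -> count */,
                            double quantile) {
        size_t total = 0;
        for (const auto &kv : hist)
            total += kv.first * kv.second;
        if (total == 0)
            return 0;
        size_t running = 0;
        for (const auto &kv : hist) {
            for (size_t i = 0; i < kv.second; ++i) {
                running += kv.first;
                if (double(running) / double(total) >= quantile)
                    return kv.first;
            }
        }
        return 0;  // unreachable for quantile <= 1
    }
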
diff --git a/src/projects/dipspades/utils/lcs_utils.hpp b/src/projects/dipspades/utils/lcs_utils.hpp
new file mode 100644
index 0000000..254514a
--- /dev/null
+++ b/src/projects/dipspades/utils/lcs_utils.hpp
@@ -0,0 +1,146 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include <vector>
+#include <string>
+#include <memory.h>
+
+using namespace std;
+
+namespace dipspades {
+
+template<class T>
+class LCSCalculator{
+
+ int ** mask;
+
+ void Initialize(size_t length1, size_t length2){
+ mask = new int*[length1 + 1];
+ for(size_t i = 0; i < length1 + 1; i++){
+ mask[i] = new int[length2 + 1];
+ memset(mask[i], 0, sizeof(int) * (length2 + 1));
+ }
+ }
+
+ void Terminate(size_t length){
+ for(size_t i = 0; i < length + 1; i++)
+ delete[] mask[i];
+ delete[] mask;
+ }
+
+ void RecursiveLCSLenCalc(vector<T> str1, vector<T> str2, size_t i1, size_t i2){
+
+// cout << i1 << " - " << i2 << "; " << str1[i1 - 1] << " - " << str2[i2 - 1] << "; ";
+
+ if(str1[i1 - 1] == str2[i2 - 1]){
+// cout << "1st case; ";
+ mask[i1][i2] = mask[i1 - 1][i2 - 1] + 1;
+ }
+ else{
+
+// cout << "2nd case; ";
+ int res1 = mask[i1][i2 - 1];
+ int res2 = mask[i1 - 1][i2];
+
+ mask[i1][i2] = max<int>(res1, res2);
+ }
+ }
+
+ int LCSLengthCalculation(vector<T> str1, vector<T> str2){
+
+ for(size_t i = 1; i <= str1.size(); i++)
+ for(size_t j = 1; j <= str2.size(); j++){
+ RecursiveLCSLenCalc(str1, str2, i, j);
+ }
+
+ return mask[str1.size()][str2.size()];
+ }
+
+ vector<T> RecursiveRestoreLCS(vector<T> str1, vector<T> str2,
+ size_t i, size_t j){
+ vector<T> res;
+ if(i == 0 || j == 0){
+ return res;
+ }
+
+ if(str1[i - 1] == str2[j - 1]){
+ res = RecursiveRestoreLCS(str1, str2, i - 1, j - 1);
+ res.push_back(str1[i - 1]);
+ return res;
+ }
+
+ if(mask[i][j - 1] > mask[i - 1][j])
+ return RecursiveRestoreLCS(str1, str2, i, j - 1);
+ else
+ return RecursiveRestoreLCS(str1, str2, i - 1, j);
+ }
+
+ vector<T> RestoreLCS(vector<T> string1, vector<T> string2, size_t){
+
+// cout << "LCS string length - " << lcs_length << endl;
+ vector<T> lcs = RecursiveRestoreLCS(string1, string2, string1.size(), string2.size());
+ return lcs;
+ }
+
+public:
+
+ vector<T> LCS(vector<T> string1, vector<T> string2){
+ vector<T> res;
+ if(string1.size() == 0 || string2.size() == 0)
+ return res;
+
+ Initialize(string1.size(), string2.size());
+
+ int lcs_length = LCSLengthCalculation(string1, string2);
+ res = RestoreLCS(string1, string2, lcs_length);
+ Terminate(string1.size());
+
+ return res;
+ }
+
+ vector<size_t> GetPosVectorFromLeft(vector<T> string, vector<T> lcs){
+ vector<size_t> pos;
+
+ if(string.size() == 0 || lcs.size() == 0)
+ return pos;
+
+ int str_ind = 0;
+ for(size_t i = 0; i < lcs.size(); i++){
+ while(string[str_ind] != lcs[i]){
+ str_ind++;
+ }
+ pos.push_back(str_ind);
+ str_ind++;
+ }
+
+ VERIFY(lcs.size() == pos.size());
+
+ return pos;
+ }
+
+ vector<size_t> GetPosVector(vector<T> string, vector<T> lcs){
+ vector<size_t> pos;
+ if(string.size() == 0 || lcs.size() == 0)
+ return pos;
+
+ int lcs_ind = int(lcs.size() - 1);
+ int str_size = int(string.size());
+ for(int i = str_size - 1; i >= 0 && lcs_ind >= 0; i--)
+ if(string[i] == lcs[lcs_ind]){
+ pos.insert(pos.begin(), size_t(i));
+ lcs_ind--;
+ }
+
+ VERIFY(pos.size() == lcs.size());
+
+ return pos;
+ }
+};
+
+}
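
LCSCalculator fills the classic (n+1) x (m+1) dynamic-programming table and backtracks through it to recover the subsequence. A compact standalone sketch of the same table-and-backtrack structure on std::vector<int> (illustrative, not the template above):

    #include <algorithm>
    #include <vector>

    // Longest common subsequence via the standard DP table plus backtracking.
    std::vector<int> LCS(const std::vector<int> &a, const std::vector<int> &b) {
        std::vector<std::vector<int>> dp(a.size() + 1, std::vector<int>(b.size() + 1, 0));
        for (size_t i = 1; i <= a.size(); ++i)
            for (size_t j = 1; j <= b.size(); ++j)
                dp[i][j] = (a[i - 1] == b[j - 1]) ? dp[i - 1][j - 1] + 1
                                                  : std::max(dp[i - 1][j], dp[i][j - 1]);
        std::vector<int> lcs;
        for (size_t i = a.size(), j = b.size(); i > 0 && j > 0; ) {
            if (a[i - 1] == b[j - 1]) { lcs.push_back(a[i - 1]); --i; --j; }
            else if (dp[i - 1][j] >= dp[i][j - 1]) --i;
            else --j;
        }
        std::reverse(lcs.begin(), lcs.end());
        return lcs;
    }
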
diff --git a/src/projects/dipspades/utils/path_index.hpp b/src/projects/dipspades/utils/path_index.hpp
new file mode 100644
index 0000000..18f8200
--- /dev/null
+++ b/src/projects/dipspades/utils/path_index.hpp
@@ -0,0 +1,68 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "../consensus_contigs_constructor/mapping_contigs_storage.hpp"
+
+namespace dipspades {
+
+class VertexPathIndex{
+ Graph &g_;
+
+ map<VertexId, set<size_t> > index_;
+
+ void AddNewPair(VertexId v, size_t path_index){
+ index_[v].insert(path_index);
+ }
+
+ set<size_t> JoinTwoSets(set<size_t> set1, set<size_t> set2){
+ for(auto it = set2.begin(); it != set2.end(); it++)
+ set1.insert(*it);
+ return set1;
+ }
+
+public:
+ VertexPathIndex(Graph &g) : g_(g) {
+ }
+
+ void Initialize(ContigStoragePtr storage){
+ INFO("Initialization of vertex-paths index starts");
+ for(size_t i = 0; i < storage->Size(); i++){
+ auto path = (*storage)[i]->path_seq();
+ if(path.size() > 0){
+ VertexId start_vertex = g_.EdgeStart(path[0]);
+ AddNewPair(start_vertex, i);
+
+ for(auto e = path.begin(); e != path.end(); e++){
+ VertexId v = g_.EdgeEnd(*e);
+ AddNewPair(v, i);
+ }
+ }
+ }
+ INFO("Initialization of vertex-paths index ends");
+ }
+
+ void Clear(){
+ index_.clear();
+ }
+
+ set<size_t> GetPathsIntersectedWith(vector<EdgeId> path){
+ set<size_t> res;
+ if(path.size() == 0)
+ return res;
+ VertexId start_vertex = g_.EdgeStart(path[0]);
+ res = index_[start_vertex];
+ for(auto e = path.begin(); e != path.end(); e++){
+ VertexId v = g_.EdgeEnd(*e);
+ res = JoinTwoSets(res, index_[v]);
+ }
+ return res;
+ }
+};
+
+}
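
VertexPathIndex is an inverted index from vertices to the ids of contig paths that pass through them; GetPathsIntersectedWith unions the sets for every vertex of the query path. A tiny standalone sketch of that index on integer ids (illustrative names):

    #include <cstddef>
    #include <map>
    #include <set>
    #include <vector>

    class PathIndex {
        std::map<size_t, std::set<size_t>> vertex_to_paths_;
    public:
        // Register path `path_id` as passing through every vertex in `vertices`.
        void AddPath(size_t path_id, const std::vector<size_t> &vertices) {
            for (size_t v : vertices)
                vertex_to_paths_[v].insert(path_id);
        }
        // All path ids that share at least one vertex with the query.
        std::set<size_t> PathsThrough(const std::vector<size_t> &vertices) const {
            std::set<size_t> result;
            for (size_t v : vertices) {
                auto it = vertex_to_paths_.find(v);
                if (it != vertex_to_paths_.end())
                    result.insert(it->second.begin(), it->second.end());
            }
            return result;
        }
    };
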
diff --git a/src/projects/dipspades/utils/path_routines.hpp b/src/projects/dipspades/utils/path_routines.hpp
new file mode 100644
index 0000000..a251496
--- /dev/null
+++ b/src/projects/dipspades/utils/path_routines.hpp
@@ -0,0 +1,285 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+#include "assembly_graph/graph_core/graph.hpp"
+#include "pipeline/graph_pack.hpp"
+
+using namespace debruijn_graph;
+
+namespace dipspades {
+
+bool AreEdgesConnected(Graph &g, EdgeId e1, EdgeId e2){
+ return g.EdgeEnd(e1) == g.EdgeStart(e2);
+}
+
+bool IsPathConnected(Graph &g, vector<EdgeId> path){
+ if(path.size() <= 1)
+ return true;
+ for(size_t i = 0; i < path.size() - 1; i++){
+ EdgeId e1 = path[i];
+ EdgeId e2 = path[i + 1];
+ if(!AreEdgesConnected(g, e1, e2)){
+ return false;
+ }
+ }
+ return true;
+}
+
+bool PathContainsLoop(Graph &g, vector<EdgeId> p){
+ if(p.size() == 0)
+ return false;
+
+ set<VertexId> pathv;
+ pathv.insert(g.EdgeStart(p[0]));
+
+ for(auto e = p.begin(); e != p.end(); e++){
+ VertexId end = g.EdgeEnd(*e);
+ if(pathv.find(end) == pathv.end())
+ pathv.insert(end);
+ else
+ return true;
+ }
+
+ return false;
+}
+
+vector<VertexId> get_list_of_vertices_in_path(Graph &g, vector<EdgeId> path){
+ vector<VertexId> list;
+ if(path.size() == 0)
+ return list;
+
+ for(size_t i = 0; i < path.size(); i++)
+ list.push_back(g.EdgeStart(path[i]));
+
+ list.push_back(g.EdgeEnd(path[path.size() - 1]));
+
+ return list;
+}
+
+bool is_1st_edge_not_later_2nd_edge(vector<EdgeId> path, EdgeId first_edge,
+ EdgeId second_edge){
+ bool first_found = false;
+ for(auto e = path.begin(); e != path.end(); e++){
+ if(*e == first_edge)
+ first_found = true;
+ if(*e == second_edge && !first_found)
+ return false;
+ if(*e == second_edge && first_found)
+ return true;
+ }
+
+ return false;
+}
+
+bool is_1st_edge_not_later_2nd_edge(vector<EdgeId> path, EdgeId first_edge,
+ EdgeId second_edge, int ind1, int ind2){
+ bool first_found = false;
+ //for(auto e = path.begin(); e != path.end(); e++){
+ for(int i = ind1; i <= ind2; i++){
+ EdgeId e = path[i];
+ if(e == first_edge)
+ first_found = true;
+ if(e == second_edge && !first_found)
+ return false;
+ if(e == second_edge && first_found)
+ return true;
+ }
+
+ return false;
+}
+
+int get_index_of_edge(vector<EdgeId> path, EdgeId edge){
+ for(size_t i = 0; i < path.size(); i++)
+ if(path[i] == edge)
+ return int(i);
+ return -1;
+}
+
+EdgeId GetEdgeById(conj_graph_pack & gp, size_t id){
+ for(auto e = gp.g.SmartEdgeBegin(); !e.IsEnd(); ++e)
+ if(gp.g.int_id(*e) == id)
+ return *e;
+ return EdgeId(0);
+}
+
+bool IsPathRegionCorrect(pair<size_t, size_t> region, size_t path_size){
+ return region.first < path_size && region.second < path_size;
+}
+
+size_t GetLengthOfPathRegion(Graph &g, vector<EdgeId> path, pair<size_t, size_t> region){
+ VERIFY(IsPathRegionCorrect(region, path.size()));
+ size_t region_length = 0;
+ for(size_t i = region.first; i <= region.second; i++)
+ region_length += g.length(path[i]);
+ return region_length;
+}
+
+size_t GetPathLength(Graph &g, vector<EdgeId> path){
+ if(path.size() == 0)
+ return 0;
+ return GetLengthOfPathRegion(g, path, pair<size_t, size_t>(0, path.size() - 1));
+}
+
+Sequence GetSequenceOfPathRegion(Graph &g, size_t k_value, vector<EdgeId> path,
+ pair<size_t, size_t> region){
+ VERIFY(IsPathRegionCorrect(region, path.size()));
+
+ if(region.first > region.second)
+ return Sequence();
+
+ EdgeId cur_edge = path[region.first];
+ Sequence seq = g.EdgeNucls(cur_edge);
+
+ for(auto i = region.first + 1; i <= region.second; ++i){
+ Sequence next_seq = g.EdgeNucls(path[i]);
+ seq = seq + next_seq.Subseq(k_value, next_seq.size());
+ }
+
+ return seq;
+}
+
+Sequence GetSequenceByPath(Graph &g, size_t k_value, const vector<EdgeId> path){
+ if(path.size() == 0)
+ return Sequence();
+ return GetSequenceOfPathRegion(g, k_value, path, pair<size_t, size_t>(0, path.size() - 1));
+}
+
+Sequence GetSequenceByPath(conj_graph_pack &gp, const vector<EdgeId> path){
+ if(path.size() == 0)
+ return Sequence();
+ return GetSequenceOfPathRegion(gp.g, gp.k_value, path, pair<int, int>(0, path.size() - 1));
+}
+
+vector<EdgeId> GetRCToPathSeq(Graph &g, vector<EdgeId> path){
+ vector<EdgeId> rc_path;
+ for(auto e = path.begin(); e != path.end(); e++){
+ rc_path.insert(rc_path.begin(), g.conjugate(*e));
+ }
+ return rc_path;
+}
+
+MappingPath<EdgeId> GetRCToMappingPath(Graph &g, MappingPath<EdgeId> map_path, size_t seq_size){
+ vector<EdgeId> rc_path_seq;
+ vector<MappingRange> rc_map_ranges;
+ for(size_t i = 0; i < map_path.size(); i++){
+ // computing edges sequence
+ EdgeId cur_edge = map_path[i].first;
+ rc_path_seq.insert(rc_path_seq.begin(), g.conjugate(cur_edge));
+
+ // computing initial ranges
+ Range init_range = map_path[i].second.initial_range;
+ Range rc_init_range(seq_size - init_range.end_pos, seq_size - init_range.start_pos);
+
+ // computing mapped ranges
+ size_t edge_length = g.length(cur_edge);
+ Range map_range = map_path[i].second.mapped_range;
+ Range rc_map_range(edge_length - map_range.end_pos, edge_length - map_range.start_pos);
+
+ rc_map_ranges.insert(rc_map_ranges.begin(), MappingRange(rc_init_range, rc_map_range));
+ }
+
+ return MappingPath<EdgeId>(rc_path_seq, rc_map_ranges);
+}
+
+bool ArePathEqual(vector<EdgeId> path1, vector<EdgeId> path2){
+ if(path1.size() != path2.size())
+ return false;
+
+ for(size_t i = 0; i < path1.size(); i++)
+ if(path1[i] != path2[i])
+ return false;
+
+ return true;
+}
+
+bool PathsShareEdge(vector<EdgeId> path1, vector<EdgeId> path2){
+ for(auto it1 = path1.begin(); it1 != path1.end(); it1++)
+ for(auto it2 = path2.begin(); it2 != path2.end(); it2++)
+ if(*it1 == *it2)
+ return true;
+ return false;
+}
+
+vector<EdgeId> CutSubpathByRegion(vector<EdgeId> path, pair<size_t, size_t> region){
+ VERIFY(IsPathRegionCorrect(region, path.size()));
+ vector<EdgeId> subpath;
+ for(size_t i = region.first; i <= region.second; i++)
+ subpath.push_back(path[i]);
+ return subpath;
+}
+
+bool IsEdgeRelated(Graph &g, EdgeId edge){
+ return g.RelatedVertices(g.EdgeStart(edge), g.EdgeEnd(edge));
+}
+
+bool IsEdgeLoop(Graph &g, EdgeId edge){
+ return g.EdgeStart(edge) == g.EdgeEnd(edge);
+}
+
+bool VertexAdjacentRelatedEdges(Graph &g, VertexId vertex){
+ auto in_edges = g.IncomingEdges(vertex);
+ for(auto it = in_edges.begin(); it != in_edges.end(); it++)
+ if(IsEdgeRelated(g, *it))
+ return true;
+ auto out_edges = g.OutgoingEdges(vertex);
+ for(auto it = out_edges.begin(); it != out_edges.end(); it++)
+ if(IsEdgeRelated(g, *it))
+ return true;
+ return false;
+}
+
+bool PathAdjacentRelatedEdges(Graph &g, vector<EdgeId> path, bool check_start = false,
+ bool check_end = false){
+ if(path.size() == 0)
+ return false;
+ for(auto e = path.begin(); e != path.end() - 1; e++)
+ if(VertexAdjacentRelatedEdges(g, g.EdgeEnd(*e)))
+ return true;
+ if(check_start && VertexAdjacentRelatedEdges(g, g.EdgeStart(path[0])))
+ return true;
+ if(check_end && VertexAdjacentRelatedEdges(g, g.EdgeEnd(path[path.size() - 1])))
+ return true;
+ return false;
+}
+
+vector<size_t> CalculatePathPartLens(Graph &g, vector<EdgeId> path){
+ vector<size_t> lens;
+ size_t prev_len = 0;
+ for(auto e = path.begin(); e != path.end(); e++){
+ lens.push_back(prev_len + g.length(*e));
+ prev_len += g.length(*e);
+ }
+ return lens;
+}
+
+/*
+void detect_loop_length(ostream &out, Graph& g, ContigStorage* stor){
+
+ for(size_t i = 0; i < stor->Size(); i++){
+ vector<EdgeId> path = (*stor)[i]->PathSeq();
+
+ vector<VertexId> vert = get_list_of_vertices_in_path(g, path);
+
+ for(size_t j = 0; j < vert.size() - 1; j++){
+ for(size_t k = j + 1; k < vert.size(); k++){
+ if(vert[j] == vert[k]){
+ size_t ind1 = j, ind2 = (k == path.size()) ? (k - 1) : k;
+ size_t loop_length = 0;
+
+ for(size_t l = ind1; l <= ind2; l++)
+ loop_length += g.length(path[l]);
+ out << loop_length << endl;
+ }
+ }
+ }
+ }
+}
+*/
+
+}
diff --git a/src/projects/dipspades/utils/range_utils.hpp b/src/projects/dipspades/utils/range_utils.hpp
new file mode 100644
index 0000000..896879c
--- /dev/null
+++ b/src/projects/dipspades/utils/range_utils.hpp
@@ -0,0 +1,57 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include <vector>
+#include <map>
+#include <set>
+#include <iostream>
+
+#include "math.h"
+
+using namespace std;
+using namespace io;
+
+namespace dipspades {
+
+bool is_intersection_exist(Range r1, Range r2){
+ return r1.end_pos > r2.start_pos && r2.end_pos > r1.start_pos;
+}
+
+Range get_intersection_of_ranges(Range r1, Range r2){
+ VERIFY(is_intersection_exist(r1, r2));
+ size_t max_start = max<size_t>(r1.start_pos, r2.start_pos);
+ size_t min_end = min<size_t>(r1.end_pos, r2.end_pos);
+
+ Range r(max_start, min_end);
+ return r;
+}
+
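+// Projects an initial (read) range onto a narrowed mapped (edge) range by linear
+// interpolation of both endpoints. A hypothetical worked example of the arithmetic
+// below: old_map_rg = [100, 200), new_map_rg = [125, 175), old_init_rg = [0, 50)
+// gives shift coefficients 0.25 and -0.25 and, after integer truncation, the
+// projected pair (12, 38).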
+pair<size_t, size_t> project_init_range_to_new(Range old_map_rg, Range new_map_rg, Range old_init_rg){
+
+ size_t start_pos, end_pos;
+
+ int shift_start = int(new_map_rg.start_pos) - int(old_map_rg.start_pos);
+ double start_coeff = double(shift_start) / double(old_map_rg.end_pos - old_map_rg.start_pos);
+ start_pos = old_init_rg.start_pos + int(start_coeff * double(old_init_rg.end_pos - old_init_rg.start_pos));
+
+ int shift_end = int(new_map_rg.end_pos) - int(old_map_rg.end_pos);
+ double end_coeff = double(shift_end) / double(old_map_rg.end_pos - old_map_rg.start_pos);
+ end_pos = old_init_rg.end_pos + int(end_coeff * double(old_init_rg.end_pos - old_init_rg.start_pos));
+
+ return pair<size_t, size_t>(start_pos, end_pos);
+}
+
+bool is_range_pair_correct(pair<size_t, size_t> p){
+ return p.first < p.second;
+}
+
+}
diff --git a/src/projects/dipspades/utils/redundancy_map.hpp b/src/projects/dipspades/utils/redundancy_map.hpp
new file mode 100644
index 0000000..40e7bf5
--- /dev/null
+++ b/src/projects/dipspades/utils/redundancy_map.hpp
@@ -0,0 +1,235 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+using namespace std;
+
+namespace dipspades {
+
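+// Stores, for each key id, the set of ids considered redundant to it.
+// A minimal usage sketch (hypothetical ids):
+//   RedundancyMap<size_t> m;
+//   m.AddNewPair(1, 2);
+//   m.AddNewPair(1, 3);
+//   // m.GetValuesByKey(1) == {2, 3}, m.AllElements() == {1, 2, 3}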
+template<typename Id>
+class RedundancyMap{
+ map<Id, set<Id> > red_map_;
+ set<Id> all_ids_;
+
+ void ComputeAllIDs(){
+ for(auto it = red_map_.begin(); it != red_map_.end(); it++){
+ all_ids_.insert(it->first);
+ all_ids_.insert(it->second.begin(), it->second.end());
+ }
+ }
+
+public:
+ void AddNewKey(Id key){
+ set<Id> empty_set;
+ red_map_[key] = empty_set;
+
+ all_ids_.insert(key);
+ }
+
+ void AddNewPair(Id key, Id value){
+ red_map_[key].insert(value);
+
+ all_ids_.insert(key);
+ all_ids_.insert(value);
+ }
+
+ set<Id> GetValuesByKey(Id key){
+ return red_map_[key];
+ }
+
+ void SetValuesByKey(Id key, set<Id> values){
+ red_map_[key].insert(values.begin(), values.end());
+ all_ids_.insert(values.begin(), values.end());
+ all_ids_.insert(key);
+ }
+
+ bool ContainsKey(Id key){
+ return red_map_.find(key) != red_map_.end();
+ }
+
+ size_t AllElementsNumber(){
+
+ ComputeAllIDs();
+
+ return all_ids_.size();
+ }
+
+ set<Id> AllElements(){
+
+ ComputeAllIDs();
+
+ return all_ids_;
+ }
+
+ typename map<Id, set<Id> >::iterator begin(){
+ return red_map_.begin();
+ }
+
+ typename map<Id, set<Id> >::iterator end(){
+ return red_map_.end();
+ }
+
+ void Clear(){
+ red_map_.clear();
+ all_ids_.clear();
+ }
+};
+
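+// Condenses a redundancy map that may contain chains (a -> b, b -> c) into a map
+// keyed only by "main" ids - ids that never occur as values - where each main key
+// is mapped to the transitive closure of its redundant ids. Ids unreachable from
+// any main key (e.g. members of cycles) are processed afterwards, starting from
+// the id with the largest child set. For example, {1 -> {2}, 2 -> {3}} condenses
+// into {1 -> {2, 3}} with 1 as the only main key.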
+template<typename Id>
+class RedundancyMapCondenser{
+
+ RedundancyMap<Id> uncondensed_map_;
+ RedundancyMap<Id> condensed_map_;
+
+ size_t number_processed_;
+ size_t need_processed_;
+
+ map<Id, bool> is_processed_;
+
+ void ProcessCondensing(){
+ bool non_zero_processed = true;
+ while(number_processed_ < need_processed_ && non_zero_processed){
+ int num_cur_processed = 0;
+ for(auto it = condensed_map_.begin(); it != condensed_map_.end(); it++){
+ set<Id> cur_set = it->second;
+
+ for(auto it_set = cur_set.begin(); it_set != cur_set.end(); it_set++){
+ if(!is_processed_[*it_set]){
+ set<Id> child_set = uncondensed_map_.GetValuesByKey(*it_set);
+ it->second.insert(child_set.begin(), child_set.end());
+
+ is_processed_[*it_set] = true;
+ num_cur_processed++;
+ }
+ }
+ }
+ non_zero_processed = num_cur_processed != 0;
+ number_processed_ += num_cur_processed;
+ TRACE("Number of processed - " << number_processed_ << ", total number - " << need_processed_);
+ }
+
+ }
+
+ void ClearParams(){
+ number_processed_ = 0;
+ need_processed_ = 0;
+ is_processed_.clear();
+ condensed_map_.Clear();
+ }
+
+public:
+ RedundancyMap<Id> Condense(RedundancyMap<Id> uncondensed_map){
+ uncondensed_map_ = uncondensed_map;
+ ClearParams();
+
+ TRACE("Start condensing");
+
+ TRACE("Computing of main keys");
+ auto all_ids_ = uncondensed_map_.AllElements();
+ map<Id, bool> is_main;
+ for(auto it = all_ids_.begin(); it != all_ids_.end(); it++)
+ is_main[*it] = true;
+
+ for(auto it = uncondensed_map_.begin(); it != uncondensed_map_.end(); it++){
+ for(auto it_set = it->second.begin(); it_set != it->second.end(); it_set++){
+ is_main[*it_set] = false;
+ }
+ }
+
+ set<Id> main_keys;
+ for(auto it = is_main.begin(); it != is_main.end(); it++)
+ if(it->second)
+ main_keys.insert(it->first);
+
+ TRACE("Number of all keys - " << all_ids_.size());
+ TRACE("Number of main keys - " << main_keys.size());
+ TRACE("Condensing starts");
+
+ need_processed_ = all_ids_.size();
+ number_processed_ = 0;
+
+ for(auto it = all_ids_.begin(); it != all_ids_.end(); it++)
+ is_processed_[*it] = false;
+
+ for(auto main_key = main_keys.begin(); main_key != main_keys.end(); main_key++){
+ condensed_map_.SetValuesByKey(*main_key, uncondensed_map_.GetValuesByKey(*main_key));
+ number_processed_++;
+ is_processed_[*main_key] = true;
+ }
+
+ // main processing
+ ProcessCondensing();
+
+ // processing of ids that were not reached from any main key
+ while(number_processed_ < need_processed_){
+ size_t max_child_setsize = 0;
+ Id start_id(0);
+ for(auto it = is_processed_.begin(); it != is_processed_.end(); it++){
+ if(!it->second && uncondensed_map_.GetValuesByKey(it->first).size() >= max_child_setsize){
+ start_id = it->first;
+ max_child_setsize = uncondensed_map_.GetValuesByKey(it->first).size();
+ }
+ }
+ auto start_set = uncondensed_map_.GetValuesByKey(start_id);
+ for(auto it = start_set.begin(); it != start_set.end(); it++)
+ if(!is_processed_[*it])
+ condensed_map_.AddNewPair(start_id, *it);
+
+ is_processed_[start_id] = true;
+ number_processed_++;
+ ProcessCondensing();
+ }
+
+ VERIFY(number_processed_ == need_processed_);
+ return condensed_map_;
+ }
+};
+
+template<typename Id>
+class RedundancyMapMerger{
+
+ bool AreMergeResultsCorrect(RedundancyMap<Id> old_map, RedundancyMap<Id> new_map){
+// cout << "Correctness - " << old_map.AllElementsNumber() << " " << new_map.AllElementsNumber() << endl;
+ return old_map.AllElementsNumber() == new_map.AllElementsNumber();
+ }
+
+public:
+ RedundancyMap<Id> MergeTwoMaps(RedundancyMap<Id> map1, RedundancyMap<Id> map2){
+
+ for(auto it_old = map1.begin(); it_old != map1.end(); it_old++){
+ Id old_key = it_old->first;
+ auto old_set = it_old->second;
+
+ if(map2.ContainsKey(old_key)){
+ map2.SetValuesByKey(old_key, old_set);
+ }
+ else{
+ bool is_found = false;
+
+ for(auto it_new = map2.begin(); it_new != map2.end(); it_new++){
+ Id new_key = it_new->first;
+ auto new_set = it_new->second;
+ if(new_set.find(old_key) != new_set.end()){
+ map2.SetValuesByKey(new_key, old_set);
+ is_found = true;
+ break;
+ }
+ }
+
+ if(!is_found)
+ map2.SetValuesByKey(old_key, old_set);
+ }
+ }
+ RedundancyMapCondenser<Id> condenser;
+ map2 = condenser.Condense(map2);
+ VERIFY(AreMergeResultsCorrect(map1, map2));
+ return map2;
+ }
+};
+
+}
diff --git a/src/projects/dipspades/utils/sequence_utils.hpp b/src/projects/dipspades/utils/sequence_utils.hpp
new file mode 100644
index 0000000..6f82697
--- /dev/null
+++ b/src/projects/dipspades/utils/sequence_utils.hpp
@@ -0,0 +1,36 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+using namespace debruijn_graph;
+using namespace std;
+
+#include "bulge_utils.hpp"
+
+namespace dipspades {
+
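+// Trims the longer of the two sequences to the length of the shorter one (keeping
+// its prefix when from_start is set, its suffix otherwise) and computes the
+// relative alignment of the two equal-length sequences.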
+double RelativeAlignmentOfSequencesByMinimal(Sequence seq1, Sequence seq2, bool from_start = true){
+ Sequence trim_seq1, trim_seq2;
+ if(min<size_t>(seq1.size(), seq2.size()) == seq1.size()){
+ // seq1 is the shorter sequence: trim seq2 to its length
+ trim_seq1 = seq1;
+ if(from_start)
+ trim_seq2 = seq2.Subseq(0, seq1.size());
+ else
+ trim_seq2 = seq2.Subseq(seq2.size() - seq1.size(), seq2.size());
+ }
+ else{
+ // seq2 is the shorter sequence: trim seq1 to its length
+ if(from_start)
+ trim_seq1 = seq1.Subseq(0, seq2.size());
+ else
+ trim_seq1 = seq1.Subseq(seq1.size() - seq2.size(), seq1.size());
+ trim_seq2 = seq2;
+ }
+ return RelAlignmentOfSequences(trim_seq1, trim_seq2);
+}
+
+}
diff --git a/src/projects/hammer/CMakeLists.txt b/src/projects/hammer/CMakeLists.txt
new file mode 100644
index 0000000..5f5277a
--- /dev/null
+++ b/src/projects/hammer/CMakeLists.txt
@@ -0,0 +1,36 @@
+############################################################################
+# Copyright (c) 2015 Saint Petersburg State University
+# Copyright (c) 2011-2014 Saint Petersburg Academic University
+# All Rights Reserved
+# See file LICENSE for details.
+############################################################################
+
+project(hammer CXX)
+
+include_directories(${CMAKE_CURRENT_SOURCE_DIR})
+
+add_executable(hammer
+ main.cpp
+ hammer_tools.cpp
+ hamcluster.cpp
+ kmer_cluster.cpp
+ kmer_data.cpp
+ config_struct_hammer.cpp
+ read_corrector.cpp
+ expander.cpp)
+
+# add_subdirectory(quake_count)
+# add_subdirectory(gen_test_data)
+
+target_link_libraries(hammer input dev_support mph_index pipeline BamTools format ${COMMON_LIBRARIES})
+
+if (SPADES_STATIC_BUILD)
+ set_target_properties(hammer PROPERTIES LINK_SEARCH_END_STATIC 1)
+endif()
+
+install(TARGETS hammer
+ DESTINATION bin
+ COMPONENT runtime)
+install(DIRECTORY "${SPADES_CFG_DIR}/hammer"
+ DESTINATION share/spades/configs
+ FILES_MATCHING PATTERN "*.info")
diff --git a/src/projects/hammer/config_struct_hammer.cpp b/src/projects/hammer/config_struct_hammer.cpp
new file mode 100644
index 0000000..37cd8ac
--- /dev/null
+++ b/src/projects/hammer/config_struct_hammer.cpp
@@ -0,0 +1,86 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+/*
+ * config_struct_hammer.cpp
+ *
+ * Created on: Oct 15, 2011
+ * Author: snikolenko
+ */
+
+#include "config_struct_hammer.hpp"
+#include "pipeline/config_common.hpp"
+#include "dev_support/openmp_wrapper.h"
+
+#include <boost/property_tree/ptree.hpp>
+#include <boost/property_tree/info_parser.hpp>
+#include <string>
+
+void load(hammer_config& cfg, const std::string &filename) {
+ boost::property_tree::ptree pt;
+ boost::property_tree::read_info(filename, pt);
+
+ load(cfg, pt);
+}
+
+void load(hammer_config& cfg, boost::property_tree::ptree const& pt) {
+ using config_common::load;
+ load(cfg.general_do_everything_after_first_iteration, pt, "general_do_everything_after_first_iteration");
+ load(cfg.general_hard_memory_limit, pt, "general_hard_memory_limit");
+ load(cfg.general_max_nthreads, pt, "general_max_nthreads");
+ load(cfg.general_tau, pt, "general_tau");
+ load(cfg.general_max_iterations, pt, "general_max_iterations");
+ load(cfg.general_debug, pt, "general_debug");
+
+ load(cfg.count_do, pt, "count_do");
+ load(cfg.count_numfiles, pt, "count_numfiles");
+ load(cfg.count_merge_nthreads, pt, "count_merge_nthreads");
+ load(cfg.count_split_buffer, pt, "count_split_buffer");
+ load(cfg.count_filter_singletons, pt, "count_filter_singletons");
+
+ load(cfg.hamming_do, pt, "hamming_do");
+ load(cfg.hamming_blocksize_quadratic_threshold, pt, "hamming_blocksize_quadratic_threshold");
+
+ load(cfg.bayes_do, pt, "bayes_do");
+ load(cfg.bayes_nthreads, pt, "bayes_nthreads");
+ load(cfg.bayes_singleton_threshold, pt, "bayes_singleton_threshold");
+ load(cfg.bayes_nonsingleton_threshold, pt, "bayes_nonsingleton_threshold");
+ load(cfg.bayes_discard_only_singletons, pt, "bayes_discard_only_singletons");
+ load(cfg.bayes_debug_output, pt, "bayes_debug_output");
+ load(cfg.bayes_use_hamming_dist, pt, "bayes_use_hamming_dist");
+ load(cfg.bayes_hammer_mode, pt, "bayes_hammer_mode");
+ load(cfg.bayes_write_bad_kmers, pt, "bayes_write_bad_kmers");
+ load(cfg.bayes_write_solid_kmers, pt, "bayes_write_solid_kmers");
+ load(cfg.bayes_initial_refine, pt, "bayes_initial_refine");
+
+ load(cfg.expand_do, pt, "expand_do");
+ load(cfg.expand_max_iterations, pt, "expand_max_iterations");
+ load(cfg.expand_nthreads, pt, "expand_nthreads");
+ load(cfg.expand_write_each_iteration, pt, "expand_write_each_iteration");
+ load(cfg.expand_write_kmers_result, pt, "expand_write_kmers_result");
+
+ load(cfg.correct_do, pt, "correct_do");
+ load(cfg.correct_nthreads, pt, "correct_nthreads");
+ load(cfg.correct_threshold, pt, "correct_threshold");
+ load(cfg.correct_use_threshold, pt, "correct_use_threshold");
+ load(cfg.correct_readbuffer, pt, "correct_readbuffer");
+ load(cfg.correct_discard_bad, pt, "correct_discard_bad");
+ load(cfg.correct_stats, pt, "correct_stats");
+
+ std::string fname;
+ load(fname, pt, "dataset");
+ cfg.dataset.load(fname);
+
+ load(cfg.input_working_dir, pt, "input_working_dir");
+ load(cfg.input_trim_quality, pt, "input_trim_quality");
+ cfg.input_qvoffset_opt = pt.get_optional<int>("input_qvoffset");
+ load(cfg.output_dir, pt, "output_dir");
+
+ // Fix number of threads according to OMP capabilities.
+ cfg.general_max_nthreads = std::min(cfg.general_max_nthreads, (unsigned)omp_get_max_threads());
+ // Inform OpenMP runtime about this :)
+ omp_set_num_threads(cfg.general_max_nthreads);
+}
diff --git a/src/projects/hammer/config_struct_hammer.hpp b/src/projects/hammer/config_struct_hammer.hpp
new file mode 100644
index 0000000..0e0f772
--- /dev/null
+++ b/src/projects/hammer/config_struct_hammer.hpp
@@ -0,0 +1,89 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+/*
+ * config_struct_hammer.hpp
+ *
+ * Created on: Aug 15, 2011
+ * Author: snikolenko
+ */
+
+#ifndef CONFIG_STRUCT_HAMMER_HPP_
+#define CONFIG_STRUCT_HAMMER_HPP_
+
+#include "pipeline/config_singl.hpp"
+
+#include "pipeline/library.hpp"
+
+#include <boost/optional.hpp>
+#include <boost/property_tree/ptree_fwd.hpp>
+
+#include <string>
+
+#define CONFIG_FILENAME "/home/snikolenko/algorithmic-biology/assembler/src/hammer/config.inp"
+
+// struct for debruijn project's configuration file
+struct hammer_config {
+ io::DataSet<> dataset;
+
+ std::string input_working_dir;
+ int input_trim_quality;
+ boost::optional<int> input_qvoffset_opt;
+ int input_qvoffset;
+ std::string output_dir;
+
+ bool general_do_everything_after_first_iteration;
+ int general_hard_memory_limit;
+ unsigned general_max_nthreads;
+ int general_tau;
+ unsigned general_max_iterations;
+ bool general_debug;
+
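+// Per-position, count-weighted majority vote over the four nucleotides: for each
+// of the K positions the nucleotide with the highest total multiplicity among the
+// (optionally masked) k-mers becomes the consensus symbol.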
+ bool count_do;
+ unsigned count_numfiles;
+ unsigned count_merge_nthreads;
+ size_t count_split_buffer;
+ bool count_filter_singletons;
+
+ bool hamming_do;
+ unsigned hamming_blocksize_quadratic_threshold;
+
+ bool bayes_do;
+ unsigned bayes_nthreads;
+ double bayes_singleton_threshold;
+ double bayes_nonsingleton_threshold;
+ bool bayes_discard_only_singletons;
+ unsigned bayes_debug_output;
+ bool bayes_use_hamming_dist;
+ bool bayes_hammer_mode;
+ bool bayes_write_solid_kmers;
+ bool bayes_write_bad_kmers;
+ bool bayes_initial_refine;
+
+ bool expand_do;
+ unsigned expand_max_iterations;
+ unsigned expand_nthreads;
+ bool expand_write_each_iteration;
+ bool expand_write_kmers_result;
+
+ bool correct_do;
+ bool correct_discard_bad;
+ bool correct_use_threshold;
+ double correct_threshold;
+ unsigned correct_readbuffer;
+ unsigned correct_nthreads;
+ bool correct_stats;
+};
+
+
+// main debruijn config load function
+void load(hammer_config& cfg, const std::string &filename);
+void load(hammer_config& cfg, boost::property_tree::ptree const& pt);
+
+typedef config_common::config<hammer_config> cfg;
+
+#endif
diff --git a/src/projects/hammer/expander.cpp b/src/projects/hammer/expander.cpp
new file mode 100644
index 0000000..a088dc0
--- /dev/null
+++ b/src/projects/hammer/expander.cpp
@@ -0,0 +1,70 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#include "expander.hpp"
+
+#include "config_struct_hammer.hpp"
+#include "globals.hpp"
+#include "kmer_data.hpp"
+#include "valid_kmer_generator.hpp"
+
+#include "io/reads/read.hpp"
+
+#include <vector>
+#include <cstring>
+
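+// Expands the set of good (solid) k-mers: if every position of a trimmed read is
+// covered by at least one solid k-mer, all remaining k-mers of that read that are
+// present in the index are promoted to good as well.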
+bool Expander::operator()(const Read &r) {
+ int trim_quality = cfg::get().input_trim_quality;
+
+ // FIXME: Get rid of this
+ Read cr = r;
+ size_t sz = cr.trimNsAndBadQuality(trim_quality);
+
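+// Lloyd-style clustering of a block into l centers: k-mers are (re)assigned to the
+// center with the smallest Hamming distance or the largest log-likelihood (E step),
+// then every changed center is recomputed as the consensus of its members (M step),
+// until the assignment stops changing or the total likelihood stops improving.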
+ if (sz < hammer::K)
+ return false;
+
+ std::vector<unsigned> covered_by_solid(sz, false);
+ std::vector<size_t> kmer_indices(sz, -1ull);
+
+ ValidKMerGenerator<hammer::K> gen(cr);
+ while (gen.HasMore()) {
+ hammer::KMer kmer = gen.kmer();
+ size_t idx = data_.checking_seq_idx(kmer);
+ if (idx != -1ULL) {
+ size_t read_pos = gen.pos() - 1;
+
+ kmer_indices[read_pos] = idx;
+ if (data_[idx].good()) {
+ for (size_t j = read_pos; j < read_pos + hammer::K; ++j)
+ covered_by_solid[j] = true;
+ }
+ }
+ gen.Next();
+ }
+
+ for (size_t j = 0; j < sz; ++j)
+ if (!covered_by_solid[j])
+ return false;
+
+ for (size_t j = 0; j < sz; ++j) {
+ if (kmer_indices[j] == -1ull)
+ continue;
+
+ // FIXME: Do not lock everything
+ KMerStat &kmer_data = data_[kmer_indices[j]];
+ if (!kmer_data.good()) {
+# pragma omp atomic
+ changed_ += 1;
+
+ kmer_data.lock();
+ kmer_data.mark_good();
+ kmer_data.unlock();
+ }
+ }
+
+ return false;
+}
diff --git a/src/hammer/expander.hpp b/src/projects/hammer/expander.hpp
similarity index 100%
rename from src/hammer/expander.hpp
rename to src/projects/hammer/expander.hpp
diff --git a/src/hammer/gen_test_data/CMakeLists.txt b/src/projects/hammer/gen_test_data/CMakeLists.txt
similarity index 100%
rename from src/hammer/gen_test_data/CMakeLists.txt
rename to src/projects/hammer/gen_test_data/CMakeLists.txt
diff --git a/src/hammer/gen_test_data/main.cpp b/src/projects/hammer/gen_test_data/main.cpp
similarity index 100%
rename from src/hammer/gen_test_data/main.cpp
rename to src/projects/hammer/gen_test_data/main.cpp
diff --git a/src/hammer/globals.hpp b/src/projects/hammer/globals.hpp
similarity index 100%
rename from src/hammer/globals.hpp
rename to src/projects/hammer/globals.hpp
diff --git a/src/projects/hammer/hamcluster.cpp b/src/projects/hammer/hamcluster.cpp
new file mode 100644
index 0000000..d1d2ff2
--- /dev/null
+++ b/src/projects/hammer/hamcluster.cpp
@@ -0,0 +1,288 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#include "hamcluster.hpp"
+
+#include "utils/adt/concurrent_dsu.hpp"
+#include "io/kmers_io/mmapped_reader.hpp"
+#include "parallel_radix_sort.hpp"
+
+#include "config_struct_hammer.hpp"
+#include "globals.hpp"
+
+#include <fstream>
+#include <iostream>
+#include <sstream>
+
+class EncoderKMer {
+public:
+ inline static size_t extract(const SubKMer &x, unsigned shift, unsigned Base) {
+ size_t idx = shift / SubKMer::TBits;
+ size_t ishift = shift - idx * SubKMer::TBits;
+ return (x.data()[idx] >> ishift) & ((1 << Base) - 1);
+ }
+};
+
+struct SubKMerComparator {
+ bool operator()(const SubKMer &l, const SubKMer &r) const {
+ for (size_t i = 0; i < SubKMer::DataSize ; ++i) {
+ if (l.data()[i] != r.data()[i]) {
+ return (l.data()[i] < r.data()[i]);
+ }
+ }
+
+ return false;
+ }
+};
+
+template<class Op>
+std::pair<size_t, size_t> SubKMerSplitter::split(Op &&op) {
+ std::vector<SubKMer> data; std::vector<size_t> blocks;
+
+ MMappedReader bifs(bifname_, /* unlink */ true);
+ MMappedReader kifs(kifname_, /* unlink */ true);
+ size_t icnt = 0, ocnt = 0;
+ while (bifs.good()) {
+ deserialize(blocks, data, bifs, kifs);
+
+ using PairSort = parallel_radix_sort::PairSort<SubKMer, size_t, SubKMer, EncoderKMer>;
+ // PairSort::InitAndSort(data.data(), blocks.data(), data.size());
+ PairSort::InitAndSort(data.data(), blocks.data(), data.size(), data.size() > 1000*16 ? -1 : 1);
+
+ for (auto start = data.begin(), end = data.end(); start != end;) {
+ auto chunk_end = std::upper_bound(start + 1, data.end(), *start, SubKMerComparator());
+ op(blocks.begin() + (start - data.begin()), chunk_end - start);
+ start = chunk_end;
+ ocnt += 1;
+ }
+ icnt += 1;
+ }
+
+ return std::make_pair(icnt, ocnt);
+}
+
+#if 1
+static bool canMerge(const ConcurrentDSU &uf, size_t x, size_t y) {
+ size_t szx = uf.set_size(x), szy = uf.set_size(y);
+ const size_t hardthr = 2500;
+
+ // Global threshold - no cluster larger than hard threshold
+ if (szx + szy > hardthr)
+ return false;
+
+ // If one of the clusters is moderately large, then attach only "almost" singletons.
+ if ((szx > hardthr * 3 / 4 && szy > 50) ||
+ (szy > hardthr * 3 / 4 && szx > 50))
+ return false;
+
+ return true;
+}
+#else
+static bool canMerge(const ConcurrentDSU &uf, size_t x, size_t y) {
+ return (uf.set_size(x) + uf.set_size(y)) < 10000;
+}
+#endif
+
+
+static void processBlockQuadratic(ConcurrentDSU &uf,
+ const std::vector<size_t>::iterator &block,
+ size_t block_size,
+ const KMerData &data,
+ unsigned tau) {
+ for (size_t i = 0; i < block_size; ++i) {
+ size_t x = block[i];
+ hammer::KMer kmerx = data.kmer(x);
+ for (size_t j = i + 1; j < block_size; j++) {
+ size_t y = block[j];
+ hammer::KMer kmery = data.kmer(y);
+ if (!uf.same(x, y) &&
+ canMerge(uf, x, y) &&
+ hamdistKMer(kmerx, kmery, tau) <= tau) {
+ uf.unite(x, y);
+ }
+ }
+ }
+}
+
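+// Clusters k-mers at Hamming distance <= tau without an all-vs-all comparison.
+// Each k-mer is split into tau + 1 sub-k-mers; two k-mers within distance tau must
+// agree on at least one sub-k-mer (pigeonhole), so candidate pairs only need to be
+// compared inside blocks sharing an identical sub-k-mer. Small blocks are processed
+// quadratically, large blocks are split once more using strided sub-k-mers.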
+void KMerHamClusterer::cluster(const std::string &prefix,
+ const KMerData &data,
+ ConcurrentDSU &uf) {
+ // First pass - split & sort the k-mers
+ std::string fname = prefix + ".first", bfname = fname + ".blocks", kfname = fname + ".kmers";
+ std::ofstream bfs(bfname, std::ios::out | std::ios::binary);
+ std::ofstream kfs(kfname, std::ios::out | std::ios::binary);
+ VERIFY(bfs.good()); VERIFY(kfs.good());
+
+ INFO("Serializing sub-kmers.");
+ for (unsigned i = 0; i < tau_ + 1; ++i) {
+ size_t from = (*Globals::subKMerPositions)[i];
+ size_t to = (*Globals::subKMerPositions)[i+1];
+
+ INFO("Serializing: [" << from << ", " << to << ")");
+ serialize(bfs, kfs,
+ data, NULL, 0,
+ SubKMerPartSerializer(from, to));
+ }
+ VERIFY(!bfs.fail()); VERIFY(!kfs.fail());
+ bfs.close(); kfs.close();
+
+ size_t big_blocks1 = 0;
+ {
+ unsigned block_thr = cfg::get().hamming_blocksize_quadratic_threshold;
+
+ INFO("Splitting sub-kmers, pass 1.");
+ SubKMerSplitter Splitter(bfname, kfname);
+
+ fname = prefix + ".second", bfname = fname + ".blocks", kfname = fname + ".kmers";
+ bfs.open(bfname, std::ios::out | std::ios::binary);
+ kfs.open(kfname, std::ios::out | std::ios::binary);
+ VERIFY(bfs.good()); VERIFY(kfs.good());
+
+ std::pair<size_t, size_t> stat =
+ Splitter.split([&] (const std::vector<size_t>::iterator &start, size_t sz) {
+ if (sz < block_thr) {
+ // Merge small blocks.
+ processBlockQuadratic(uf, start, sz, data, tau_);
+ } else {
+ big_blocks1 += 1;
+ // Otherwise - dump for next iteration.
+ for (unsigned i = 0; i < tau_ + 1; ++i) {
+ serialize(bfs, kfs,
+ data, &start, sz,
+ SubKMerStridedSerializer(i, tau_ + 1));
+ }
+ }
+ });
+ INFO("Splitting done."
+ " Processed " << stat.first << " blocks."
+ " Produced " << stat.second << " blocks.");
+
+ // Sanity check - there cannot be more blocks than tau + 1 times the total
+ // k-mer number. And on the first pass we have only tau + 1 input blocks!
+ VERIFY(stat.first == tau_ + 1);
+ VERIFY(stat.second <= (tau_ + 1) * data.size());
+
+ VERIFY(!bfs.fail()); VERIFY(!kfs.fail());
+ bfs.close(); kfs.close();
+ INFO("Merge done, total " << big_blocks1 << " new blocks generated.");
+ }
+
+ size_t big_blocks2 = 0;
+ {
+ INFO("Spliting sub-kmers, pass 2.");
+ SubKMerSplitter Splitter(bfname, kfname);
+ size_t nblocks = 0;
+ std::pair<size_t, size_t> stat =
+ Splitter.split([&] (const std::vector<size_t>::iterator &start, size_t sz) {
+ if (sz > 50) {
+ big_blocks2 += 1;
+#if 0
+ for (size_t i = 0; i < block.size(); ++i) {
+ std::string s(Globals::blob + data[block[i]], K);
+ INFO("" << block[i] << ": " << s);
+ }
+#endif
+ }
+ processBlockQuadratic(uf, start, sz, data, tau_);
+ nblocks += 1;
+ });
+ INFO("Splitting done."
+ " Processed " << stat.first << " blocks."
+ " Produced " << stat.second << " blocks.");
+
+ // Sanity check - there cannot be more blocks than (tau + 1)^2 times the total
+ // k-mer number. And there should be (tau + 1) * big_blocks1 input blocks.
+ VERIFY(stat.first == (tau_ + 1)*big_blocks1);
+ VERIFY(stat.second <= (tau_ + 1) * (tau_ + 1) * data.size());
+
+ INFO("Merge done, saw " << big_blocks2 << " big blocks out of " << nblocks << " processed.");
+ }
+}
+
+enum {
+ UNLOCKED = 0,
+ PARTIALLY_LOCKED = 1,
+ FULLY_LOCKED = 3
+};
+
+static bool canMerge2(const ConcurrentDSU &uf, size_t kidx, size_t cidx) {
+ // If either of indices is fully locked - bail out
+ uint64_t kaux = uf.root_aux(kidx), caux = uf.root_aux(cidx);
+ if (kaux == FULLY_LOCKED || caux == FULLY_LOCKED)
+ return false;
+
+ // Otherwise there is a possibility to merge stuff.
+ if (0 && (kaux == PARTIALLY_LOCKED || caux == PARTIALLY_LOCKED)) {
+ // We cannot merge two partially locked clusters.
+ return kaux != caux;
+ }
+
+ return true;
+}
+
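+// tau = 1 special case: for every canonical k-mer, enumerate all 3*K single-nucleotide
+// substitutions and unite the k-mer with each mutated k-mer present in the data (the
+// reverse complements are united as well). Clusters reaching 2500 members are marked
+// FULLY_LOCKED so that canMerge2 prevents them from growing further.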
+static void ClusterChunk(size_t start_idx, size_t end_idx, const KMerData &data, ConcurrentDSU &uf) {
+ unsigned nthreads = cfg::get().general_max_nthreads;
+
+ // INFO("Cluster: " << start_idx << ":" << end_idx);
+# pragma omp parallel num_threads(nthreads)
+ {
+# pragma omp for
+ for (size_t idx = start_idx; idx < end_idx; ++idx) {
+ hammer::KMer kmer = data.kmer(idx);
+
+ if (kmer.GetHash() > (!kmer).GetHash())
+ continue;
+
+ size_t kidx = data.seq_idx(kmer);
+ size_t rckidx = -1ULL;
+ // INFO("" << kmer << ":" << kidx);
+
+ for (size_t k = 0; k < hammer::K; ++k) {
+ hammer::KMer candidate = kmer;
+ char c = candidate[k];
+ for (char nc = 0; nc < 4; ++nc) {
+ if (nc == c)
+ continue;
+ candidate.set(k, nc);
+ size_t cidx = data.checking_seq_idx(candidate);
+ // INFO("" << candidate << ":" << cidx);
+ if (cidx != -1ULL && canMerge2(uf, kidx, cidx)) {
+ uf.unite(kidx, cidx);
+
+ size_t rccidx = data.seq_idx(!candidate);
+ if (rckidx == -1ULL)
+ rckidx = data.seq_idx(!kmer);
+ uf.unite(rckidx, rccidx);
+ }
+ }
+ }
+ }
+# pragma omp barrier
+ //INFO("Lock: " << start_idx << ":" << end_idx);
+# pragma omp for
+ for (size_t idx = start_idx; idx < end_idx; ++idx) {
+ if (uf.set_size(idx) < 2500)
+ continue;
+
+ if (uf.root_aux(idx) != FULLY_LOCKED)
+ uf.set_root_aux(idx, FULLY_LOCKED);
+ }
+ }
+}
+
+void TauOneKMerHamClusterer::cluster(const std::string &, const KMerData &data, ConcurrentDSU &uf) {
+ size_t start_idx = 0;
+ while (start_idx < data.size()) {
+ size_t end_idx = start_idx + 64*1024;
+ if (end_idx > data.size())
+ end_idx = data.size();
+
+ ClusterChunk(start_idx, end_idx, data, uf);
+
+ start_idx = end_idx;
+ }
+}
diff --git a/src/projects/hammer/hamcluster.hpp b/src/projects/hammer/hamcluster.hpp
new file mode 100644
index 0000000..30f5356
--- /dev/null
+++ b/src/projects/hammer/hamcluster.hpp
@@ -0,0 +1,161 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#ifndef HAMMER_SUBKMER_SORTER_HPP
+#define HAMMER_SUBKMER_SORTER_HPP
+
+#include "kmer_stat.hpp"
+#include "kmer_data.hpp"
+#include "io/kmers_io/mmapped_reader.hpp"
+
+#include "dev_support/logger/logger.hpp"
+#include "data_structures/sequence/seq.hpp"
+
+#include <iostream>
+#include <vector>
+
+class ConcurrentDSU;
+
+typedef Seq<(hammer::K + 1) / 2, uint32_t> SubKMer;
+
+template<class Reader>
+inline void binary_read(Reader &is, SubKMer &s) {
+ SubKMer::DataType seq_data[SubKMer::DataSize];
+
+ is.read((char*)seq_data, sizeof(seq_data));
+
+ s = SubKMer(seq_data);
+}
+
+template<class Writer>
+inline Writer &binary_write(Writer &os, const SubKMer &s) {
+ SubKMer::DataType seq_data[SubKMer::DataSize];
+ s.copy_data(seq_data);
+
+ os.write((char*)seq_data, sizeof(seq_data));
+
+ return os;
+}
+
+static_assert(sizeof(SubKMer) == 4, "Too big SubKMer");
+
+class SubKMerPartSerializer{
+ size_t from_;
+ size_t to_;
+
+public:
+ SubKMerPartSerializer(size_t from, size_t to)
+ :from_(from), to_(to) { VERIFY(to_ - from_ <= hammer::K); }
+
+ SubKMer serialize(hammer::KMer k) const {
+ SubKMer res;
+ for (size_t i = 0; i < to_ - from_; ++i)
+ res.set(i, k[from_ + i]);
+
+ return res;
+ }
+};
+
+class SubKMerStridedSerializer{
+ size_t from_;
+ size_t stride_;
+
+public:
+ SubKMerStridedSerializer(size_t from, size_t stride)
+ :from_(from), stride_(stride) { VERIFY(from_ + stride_ <= hammer::K); }
+
+ SubKMer serialize(hammer::KMer k) const {
+ SubKMer res;
+
+ for (size_t i = from_, j = 0; i < hammer::K; i+= stride_, ++j)
+ res.set(j, k[i]);
+
+ return res;
+ }
+};
+
+template<class Writer,
+ class SubKMerSerializer>
+void serialize(Writer &blocks, Writer &kmers,
+ const KMerData &data,
+ const std::vector<size_t>::iterator *block = NULL, size_t sz = 0,
+ const SubKMerSerializer &serializer = SubKMerSerializer()) {
+ if (sz == 0)
+ sz = data.size();
+
+ blocks.write((char*)&sz, sizeof(sz));
+ if (block) {
+ blocks.write((char*)&**block, sz * sizeof((*block)[0]));
+ } else {
+ for (size_t i = 0, e = sz; i != e; ++i)
+ blocks.write((char*)&i, sizeof(i));
+ }
+
+ for (size_t i = 0, e = sz; i != e; ++i) {
+ size_t idx = (block == NULL ? i : (*block)[i]);
+ SubKMer s = serializer.serialize(data.kmer(idx));
+ binary_write(kmers, s);
+ }
+}
+
+class SubKMerSplitter {
+ const std::string bifname_, kifname_;
+
+ public:
+ SubKMerSplitter(const std::string &bifname, const std::string &kifname)
+ : bifname_(bifname), kifname_(kifname) {}
+
+ template<class Writer>
+ void serialize(Writer &os,
+ const std::vector<size_t>::iterator &start,
+ size_t sz) {
+ os.write((char*)&sz, sizeof(sz));
+ os.write((char*)&*start, sz * sizeof(*start));
+ }
+
+ template<class Reader>
+ void deserialize(std::vector<size_t> &blocks,
+ std::vector<SubKMer> &kmers,
+ Reader &bis, Reader &kis) {
+ kmers.clear(); blocks.clear();
+
+ size_t sz;
+ bis.read((char*)&sz, sizeof(sz));
+ blocks.resize(sz);
+ bis.read((char*)blocks.data(), sz * sizeof(blocks[0]));
+
+ kmers.resize(sz);
+ for (size_t i = 0, e = sz; i != e; ++i)
+ binary_read(kis, kmers[i]);
+ }
+
+ template<class Op>
+ std::pair<size_t, size_t> split(Op &&op);
+};
+
+class KMerHamClusterer {
+ unsigned tau_;
+
+ public:
+ KMerHamClusterer(unsigned tau)
+ : tau_(tau) {}
+
+ void cluster(const std::string &prefix, const KMerData &data, ConcurrentDSU &uf);
+ private:
+ DECL_LOGGER("Hamming Clustering");
+};
+
+class TauOneKMerHamClusterer {
+ public:
+ TauOneKMerHamClusterer() {}
+ void cluster(const std::string &prefix, const KMerData &data, ConcurrentDSU &uf);
+ private:
+ DECL_LOGGER("tau = 1 Hamming Clustering");
+};
+
+
+#endif // HAMMER_SUBKMER_SORTER_HPP
diff --git a/src/hammer/hammer debug.launch.template b/src/projects/hammer/hammer debug.launch.template
similarity index 100%
rename from src/hammer/hammer debug.launch.template
rename to src/projects/hammer/hammer debug.launch.template
diff --git a/src/hammer/hammer release.launch.template b/src/projects/hammer/hammer release.launch.template
similarity index 100%
rename from src/hammer/hammer release.launch.template
rename to src/projects/hammer/hammer release.launch.template
diff --git a/src/projects/hammer/hammer_tools.cpp b/src/projects/hammer/hammer_tools.cpp
new file mode 100644
index 0000000..7298097
--- /dev/null
+++ b/src/projects/hammer/hammer_tools.cpp
@@ -0,0 +1,274 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#include "io/reads_io/ireadstream.hpp"
+#include "valid_kmer_generator.hpp"
+#include "globals.hpp"
+#include "kmer_data.hpp"
+#include "read_corrector.hpp"
+
+#include "io/kmers_io/mmapped_writer.hpp"
+
+#include <iostream>
+#include <fstream>
+#include <iomanip>
+
+#include "config_struct_hammer.hpp"
+#include "hammer_tools.hpp"
+
+using namespace std;
+
+namespace hammer {
+
+void InitializeSubKMerPositions() {
+ std::ostringstream log_sstream;
+ log_sstream.str("");
+ Globals::subKMerPositions = new std::vector<uint32_t>(cfg::get().general_tau + 2);
+ for (uint32_t i=0; i < (uint32_t)(cfg::get().general_tau + 1); ++i) {
+ Globals::subKMerPositions->at(i) = (i * K / (cfg::get().general_tau + 1) );
+ log_sstream << Globals::subKMerPositions->at(i) << " ";
+ }
+ Globals::subKMerPositions->at(cfg::get().general_tau + 1) = K;
+ INFO("Hamming graph threshold tau=" << cfg::get().general_tau << ", k=" << K << ", subkmer positions = [ " << log_sstream.str() << "]" );
+}
+
+std::string getFilename(const string & dirprefix, const string & suffix) {
+ std::ostringstream tmp;
+ tmp.str(""); tmp << dirprefix.data() << "/" << suffix.data();
+ return tmp.str();
+}
+
+string getFilename(const string & dirprefix, unsigned iter_count, const string & suffix ) {
+ ostringstream tmp;
+ tmp.str(""); tmp << dirprefix.data() << "/" << std::setfill('0') << std::setw(2) << iter_count << "." << suffix.data();
+ return tmp.str();
+}
+
+string getReadsFilename(const std::string & dirprefix, const std::string &fname, unsigned iter_no, const std::string & suffix) {
+ ostringstream tmp;
+ tmp.str("");
+
+ tmp << dirprefix.data() << "/" << path::basename(fname) << '.' << std::setfill('0') << std::setw(2) << iter_no << "." << suffix.data();
+ return tmp.str();
+}
+
+string getFilename( const string & dirprefix, const string & suffix, int suffix_num ) {
+ ostringstream tmp;
+ tmp.str(""); tmp << dirprefix.data() << "/" << suffix.data() << "." << suffix_num;
+ return tmp.str();
+}
+
+string getFilename( const string & dirprefix, int iter_count, const string & suffix, int suffix_num ) {
+ ostringstream tmp;
+ tmp.str(""); tmp << dirprefix.data() << "/" << std::setfill('0') << std::setw(2) << iter_count << "." << suffix.data() << "." << suffix_num;
+ return tmp.str();
+}
+
+string getFilename( const string & dirprefix, int iter_count, const string & suffix, int suffix_num, const string & suffix2 ) {
+ ostringstream tmp;
+ tmp.str(""); tmp << dirprefix.data() << "/" << std::setfill('0') << std::setw(2) << iter_count << "." << suffix.data() << "." << suffix_num << "." << suffix2.data();
+ return tmp.str();
+}
+
+void CorrectReadsBatch(std::vector<bool> &res,
+ std::vector<Read> &reads, size_t buf_size,
+ size_t &changedReads, size_t &changedNucleotides, size_t &uncorrectedNucleotides, size_t &totalNucleotides,
+ const KMerData &data) {
+ unsigned correct_nthreads = min(cfg::get().correct_nthreads, cfg::get().general_max_nthreads);
+ bool discard_singletons = cfg::get().bayes_discard_only_singletons;
+ bool correct_threshold = cfg::get().correct_use_threshold;
+ bool discard_bad = cfg::get().correct_discard_bad;
+
+ ReadCorrector corrector(data, cfg::get().correct_stats);
+# pragma omp parallel for shared(reads, res, data) num_threads(correct_nthreads)
+ for (size_t i = 0; i < buf_size; ++i) {
+ if (reads[i].size() >= K) {
+ res[i] =
+ corrector.CorrectOneRead(reads[i],
+ correct_threshold, discard_singletons, discard_bad);
+ } else
+ res[i] = false;
+ }
+
+ changedReads += corrector.changed_reads();
+ changedNucleotides += corrector.changed_nucleotides();
+ uncorrectedNucleotides += corrector.uncorrected_nucleotides();
+ totalNucleotides += corrector.total_nucleotides();
+}
+
+void CorrectReadFile(const KMerData &data,
+ size_t &changedReads, size_t &changedNucleotides, size_t &uncorrectedNucleotides, size_t &totalNucleotides,
+ const std::string &fname,
+ std::ofstream *outf_good, std::ofstream *outf_bad) {
+ int qvoffset = cfg::get().input_qvoffset;
+ int trim_quality = cfg::get().input_trim_quality;
+
+ unsigned correct_nthreads = min(cfg::get().correct_nthreads, cfg::get().general_max_nthreads);
+ size_t read_buffer_size = correct_nthreads * cfg::get().correct_readbuffer;
+ std::vector<Read> reads(read_buffer_size);
+ std::vector<bool> res(read_buffer_size, false);
+
+ ireadstream irs(fname, qvoffset);
+ VERIFY(irs.is_open());
+
+ unsigned buffer_no = 0;
+ while (!irs.eof()) {
+ size_t buf_size = 0;
+ for (; buf_size < read_buffer_size && !irs.eof(); ++buf_size) {
+ irs >> reads[buf_size];
+ reads[buf_size].trimNsAndBadQuality(trim_quality);
+ }
+ INFO("Prepared batch " << buffer_no << " of " << buf_size << " reads.");
+
+ CorrectReadsBatch(res, reads, buf_size,
+ changedReads, changedNucleotides, uncorrectedNucleotides, totalNucleotides,
+ data);
+
+ INFO("Processed batch " << buffer_no);
+ for (size_t i = 0; i < buf_size; ++i) {
+ reads[i].print(*(res[i] ? outf_good : outf_bad), qvoffset);
+ }
+ INFO("Written batch " << buffer_no);
+ ++buffer_no;
+ }
+}
+
+void CorrectPairedReadFiles(const KMerData &data,
+ size_t &changedReads, size_t &changedNucleotides, size_t &uncorrectedNucleotides, size_t &totalNucleotides,
+ const std::string &fnamel, const std::string &fnamer,
+ ofstream * ofbadl, ofstream * ofcorl, ofstream * ofbadr, ofstream * ofcorr, ofstream * ofunp) {
+ int qvoffset = cfg::get().input_qvoffset;
+ int trim_quality = cfg::get().input_trim_quality;
+
+ unsigned correct_nthreads = min(cfg::get().correct_nthreads, cfg::get().general_max_nthreads);
+ size_t read_buffer_size = correct_nthreads * cfg::get().correct_readbuffer;
+ std::vector<Read> l(read_buffer_size);
+ std::vector<Read> r(read_buffer_size);
+ std::vector<bool> left_res(read_buffer_size, false);
+ std::vector<bool> right_res(read_buffer_size, false);
+
+ unsigned buffer_no = 0;
+
+ ireadstream irsl(fnamel, qvoffset), irsr(fnamer, qvoffset);
+ VERIFY(irsl.is_open()); VERIFY(irsr.is_open());
+
+ while (!irsl.eof() && !irsr.eof()) {
+ size_t buf_size = 0;
+ for (; buf_size < read_buffer_size && !irsl.eof() && !irsr.eof(); ++buf_size) {
+ irsl >> l[buf_size]; irsr >> r[buf_size];
+ l[buf_size].trimNsAndBadQuality(trim_quality);
+ r[buf_size].trimNsAndBadQuality(trim_quality);
+ }
+ INFO("Prepared batch " << buffer_no << " of " << buf_size << " reads.");
+
+ CorrectReadsBatch(left_res, l, buf_size,
+ changedReads, changedNucleotides, uncorrectedNucleotides, totalNucleotides,
+ data);
+ CorrectReadsBatch(right_res, r, buf_size,
+ changedReads, changedNucleotides, uncorrectedNucleotides, totalNucleotides,
+ data);
+
+ INFO("Processed batch " << buffer_no);
+ for (size_t i = 0; i < buf_size; ++i) {
+ if (left_res[i] && right_res[i]) {
+ l[i].print(*ofcorl, qvoffset);
+ r[i].print(*ofcorr, qvoffset);
+ } else {
+ l[i].print(*(left_res[i] ? ofunp : ofbadl), qvoffset);
+ r[i].print(*(right_res[i] ? ofunp : ofbadr), qvoffset);
+ }
+ }
+ INFO("Written batch " << buffer_no);
+ ++buffer_no;
+ }
+}
+
+std::string getLargestPrefix(const std::string &str1, const std::string &str2) {
+ string substr = "";
+ for (size_t i = 0; i != str1.size() && i != str2.size(); ++i) {
+ if (str1[i] == str2[i])
+ substr += str1[i];
+ else
+ break;
+ }
+ return substr;
+}
+
+size_t CorrectAllReads() {
+ // Correct the reads library by library; corrected, bad and unpaired reads are written to separate files.
+ size_t changedReads = 0;
+ size_t changedNucleotides = 0;
+ size_t uncorrectedNucleotides = 0;
+ size_t totalNucleotides = 0;
+
+ int correct_nthreads = std::min(cfg::get().correct_nthreads, cfg::get().general_max_nthreads);
+
+ INFO("Starting read correction in " << correct_nthreads << " threads.");
+
+ const io::DataSet<> &dataset = cfg::get().dataset;
+ io::DataSet<> outdataset;
+ size_t ilib = 0;
+ for (const auto& lib : dataset.libraries()) {
+ auto outlib = lib;
+ outlib.clear();
+
+ size_t iread = 0;
+ for (auto I = lib.paired_begin(), E = lib.paired_end(); I != E; ++I, ++iread) {
+ INFO("Correcting pair of reads: " << I->first << " and " << I->second);
+ std::string usuffix = std::to_string(ilib) + "_" +
+ std::to_string(iread) + ".cor.fastq";
+
+ std::string unpaired = getLargestPrefix(I->first, I->second) + "_unpaired.fastq";
+
+ std::string outcorl = getReadsFilename(cfg::get().output_dir, I->first, Globals::iteration_no, usuffix);
+ std::string outcorr = getReadsFilename(cfg::get().output_dir, I->second, Globals::iteration_no, usuffix);
+ std::string outcoru = getReadsFilename(cfg::get().output_dir, unpaired, Globals::iteration_no, usuffix);
+
+ std::ofstream ofcorl(outcorl.c_str());
+ std::ofstream ofbadl(getReadsFilename(cfg::get().output_dir, I->first, Globals::iteration_no, "bad.fastq").c_str(),
+ std::ios::out | std::ios::ate);
+ std::ofstream ofcorr(outcorr.c_str());
+ std::ofstream ofbadr(getReadsFilename(cfg::get().output_dir, I->second, Globals::iteration_no, "bad.fastq").c_str(),
+ std::ios::out | std::ios::ate);
+ std::ofstream ofunp (outcoru.c_str());
+
+ CorrectPairedReadFiles(*Globals::kmer_data,
+ changedReads, changedNucleotides, uncorrectedNucleotides, totalNucleotides,
+ I->first, I->second,
+ &ofbadl, &ofcorl, &ofbadr, &ofcorr, &ofunp);
+ outlib.push_back_paired(outcorl, outcorr);
+ outlib.push_back_single(outcoru);
+ }
+
+ for (auto I = lib.single_begin(), E = lib.single_end(); I != E; ++I, ++iread) {
+ INFO("Correcting single reads: " << *I);
+ std::string usuffix = std::to_string(ilib) + "_" +
+ std::to_string(iread) + ".cor.fastq";
+
+ std::string outcor = getReadsFilename(cfg::get().output_dir, *I, Globals::iteration_no, usuffix);
+ std::ofstream ofgood(outcor.c_str());
+ std::ofstream ofbad(getReadsFilename(cfg::get().output_dir, *I, Globals::iteration_no, "bad.fastq").c_str(),
+ std::ios::out | std::ios::ate);
+
+ CorrectReadFile(*Globals::kmer_data,
+ changedReads, changedNucleotides, uncorrectedNucleotides, totalNucleotides,
+ *I,
+ &ofgood, &ofbad);
+ outlib.push_back_single(outcor);
+ }
+ outdataset.push_back(outlib);
+ ilib += 1;
+ }
+
+ cfg::get_writable().dataset = outdataset;
+
+ INFO("Correction done. Changed " << changedNucleotides << " bases in " << changedReads << " reads.");
+ INFO("Failed to correct " << uncorrectedNucleotides << " bases out of " << totalNucleotides << ".");
+ return changedReads;
+}
+
+};
diff --git a/src/projects/hammer/hammer_tools.hpp b/src/projects/hammer/hammer_tools.hpp
new file mode 100644
index 0000000..3ef9a6a
--- /dev/null
+++ b/src/projects/hammer/hammer_tools.hpp
@@ -0,0 +1,57 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#ifndef HAMMER_TOOLS_HPP
+#define HAMMER_TOOLS_HPP
+
+#include <unordered_map>
+#include <algorithm>
+#include <stdexcept>
+#include <iomanip>
+#include <fstream>
+#include "io/reads/read.hpp"
+#include "io/reads_io/ireadstream.hpp"
+#include "data_structures/sequence/seq.hpp"
+#include "globals.hpp"
+#include "kmer_stat.hpp"
+#include "io/kmers_io/mmapped_reader.hpp"
+
+namespace hammer {
+
+/// initialize subkmer positions and log about it
+void InitializeSubKMerPositions();
+
+/// parallel correction of batch of reads
+void CorrectReadsBatch(std::vector<bool> &res, std::vector<Read> &reads, size_t buf_size,
+ size_t &changedReads, size_t &changedNucleotides, size_t &uncorrectedNucleotides, size_t &totalNucleotides,
+ const KMerData &data);
+
+/// correct reads in a given file
+void CorrectReadFile(const KMerData &data,
+ size_t &changedReads, size_t &changedNucleotides, size_t &uncorrectedNucleotides, size_t &totalNucleotides,
+ const std::string &fname,
+ std::ofstream *outf_good, std::ofstream *outf_bad);
+
+/// correct reads in a given pair of files
+void CorrectPairedReadFiles(const KMerData &data,
+ size_t &changedReads, size_t &changedNucleotides, size_t &uncorrectedNucleotides, size_t &totalNucleotides,
+ const std::string &fnamel, const std::string &fnamer,
+ std::ofstream * ofbadl, std::ofstream * ofcorl, std::ofstream * ofbadr, std::ofstream * ofcorr, std::ofstream * ofunp);
+/// correct all reads
+size_t CorrectAllReads();
+
+std::string getFilename(const std::string & dirprefix, const std::string & suffix );
+std::string getFilename(const std::string & dirprefix, unsigned iter_count, const std::string & suffix );
+std::string getFilename(const std::string & dirprefix, int iter_count, const std::string & suffix, int suffix_num );
+std::string getFilename(const std::string & dirprefix, int iter_count, const std::string & suffix, int suffix_num, const std::string & suffix2 );
+std::string getFilename(const std::string & dirprefix, const std::string & suffix, int suffix_num );
+std::string getReadsFilename(const std::string & dirprefix, const std::string &fname, unsigned iter_no, const std::string & suffix);
+};
+
+
+
+#endif
diff --git a/src/projects/hammer/kmer_cluster.cpp b/src/projects/hammer/kmer_cluster.cpp
new file mode 100644
index 0000000..c1b6eb3
--- /dev/null
+++ b/src/projects/hammer/kmer_cluster.cpp
@@ -0,0 +1,656 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#include "io/reads_io/ireadstream.hpp"
+#include "hammer_tools.hpp"
+#include "hamcluster.hpp"
+#include "kmer_cluster.hpp"
+#include "config_struct_hammer.hpp"
+
+#include <boost/numeric/ublas/io.hpp>
+#include <boost/numeric/ublas/matrix_proxy.hpp>
+#include <boost/numeric/ublas/symmetric.hpp>
+
+#include <iostream>
+#include <fstream>
+#include <algorithm>
+
+using std::max_element;
+using std::min_element;
+
+namespace numeric = boost::numeric::ublas;
+
+using namespace hammer;
+
+std::string KMerClustering::GetGoodKMersFname() const {
+ // FIXME: This is ugly!
+ std::ostringstream tmp;
+ tmp.str("");
+ tmp << workdir_ << "/" << "kmers.solid";
+
+ return tmp.str();
+}
+
+std::string KMerClustering::GetBadKMersFname() const {
+ // FIXME: This is ugly!
+ std::ostringstream tmp;
+ tmp.str("");
+ tmp << workdir_ << "/" << "kmers.bad";
+
+ return tmp.str();
+}
+
+static hammer::ExpandedSeq ConsensusWithMask(const std::vector<hammer::ExpandedKMer> &kmers,
+ const std::vector<size_t> &mask, size_t maskVal) {
+ size_t block_size = kmers.size();
+
+ // consensus of a single string is trivial
+ if (block_size == 1)
+ return kmers[0].seq();
+
+ uint64_t scores[4*K] = {0};
+ for (size_t j = 0; j < block_size; ++j) {
+ if (mask[j] != maskVal)
+ continue;
+
+ const ExpandedSeq &kmer = kmers[j].seq();
+
+ for (unsigned i = 0; i < K; ++i)
+ scores[4*i + kmer[i]] += kmers[j].count();
+ }
+
+ hammer::ExpandedSeq res;
+ for (unsigned i = 0; i < K; ++i)
+ res[i] = (char)(std::max_element(scores + 4*i, scores + 4*i + 4) - (scores + 4*i));
+
+ return res;
+}
+
+static hammer::ExpandedSeq Consensus(const std::vector<hammer::ExpandedKMer> &kmers) {
+ size_t block_size = kmers.size();
+
+ // consensus of a single string is trivial
+ if (block_size == 1)
+ return kmers[0].seq();
+
+ uint64_t scores[4*K] = {0};
+ for (size_t j = 0; j < block_size; ++j) {
+ const ExpandedSeq &kmer = kmers[j].seq();
+
+ for (unsigned i = 0; i < K; ++i)
+ scores[4*i + kmer[i]] += kmers[j].count();
+ }
+
+ hammer::ExpandedSeq res;
+ for (unsigned i = 0; i < K; ++i)
+ res[i] = (char)(std::max_element(scores + 4*i, scores + 4*i + 4) - (scores + 4*i));
+
+ return res;
+}
+
+double KMerClustering::ClusterBIC(const std::vector<Center> ¢ers,
+ const std::vector<size_t> &indices, const std::vector<hammer::ExpandedKMer> &kmers) const {
+ size_t block_size = indices.size();
+ size_t clusters = centers.size();
+ if (block_size == 0)
+ return -std::numeric_limits<double>::infinity();
+ assert(centers.size() > 0);
+
+ double loglik = 0;
+ unsigned total = 0;
+ for (size_t i = 0; i < block_size; ++i) {
+ loglik += kmers[i].count()*kmers[i].logL(centers[indices[i]].center_);
+ total += kmers[i].count();
+ }
+
+ size_t nparams = (clusters - 1) + clusters*K + 2*clusters*K;
+
+ if (cfg::get().bayes_debug_output > 1) {
+# pragma omp critical
+ {
+ std::cout << " logL: " << loglik << ", clusters: " << clusters << ", nparams: " << nparams << ", N: " << block_size << std::endl;
+ }
+ }
+
+ return loglik - (double)nparams * log((double)total) / 2.0;
+}
+
+
+double KMerClustering::lMeansClustering(unsigned l, const std::vector<hammer::ExpandedKMer> &kmers,
+ std::vector<size_t> &indices, std::vector<Center> ¢ers) {
+ centers.resize(l); // there are l centers
+
+ // if l==1 then clustering is trivial
+ if (l == 1) {
+ centers[0].center_ = Consensus(kmers);
+ centers[0].count_ = kmers.size();
+ for (size_t i = 0; i < kmers.size(); ++i)
+ indices[i] = 0;
+ return ClusterBIC(centers, indices, kmers);
+ }
+
+ // Provide the initial approximation.
+ double totalLikelihood = 0.0;
+ if (cfg::get().bayes_initial_refine) {
+ // Refine the current approximation
+ centers[l-1].center_ = kmers[l-1].seq();
+ for (size_t i = 0; i < kmers.size(); ++i) {
+ size_t cidx = indices[i];
+ unsigned cdist = kmers[i].hamdist(centers[cidx].center_, K);
+ unsigned mdist = kmers[i].hamdist(centers[l-1].center_, cdist);
+ if (mdist < cdist) {
+ indices[i] = l - 1;
+ cidx = l - 1;
+ }
+ totalLikelihood += kmers[i].logL(centers[cidx].center_);
+ }
+ } else {
+ // We assume that kmers are sorted wrt the count.
+ for (size_t j = 0; j < l; ++j)
+ centers[j].center_ = kmers[j].seq();
+
+ for (size_t i = 0; i < kmers.size(); ++i) {
+ unsigned mdist = K;
+ unsigned cidx = 0;
+ for (unsigned j = 0; j < l; ++j) {
+ unsigned cdist = kmers[i].hamdist(centers[j].center_, mdist);
+ if (cdist < mdist) {
+ mdist = cdist;
+ cidx = j;
+ }
+ }
+ indices[i] = cidx;
+ totalLikelihood += kmers[i].logL(centers[cidx].center_);
+ }
+ }
+
+ if (cfg::get().bayes_debug_output > 1) {
+# pragma omp critical
+ {
+ std::cout << " centers:\n";
+ for (size_t i=0; i < centers.size(); ++i) {
+ std::cout << " " << centers[i].center_ << "\n";
+ }
+ }
+ }
+
+ // Main loop
+ bool changed = true, improved = true;
+
+ // auxiliary variables
+ std::vector<size_t> dists(l);
+ std::vector<double> loglike(l);
+ std::vector<bool> changedCenter(l);
+
+ while (changed && improved) {
+ // fill everything with zeros
+ changed = false;
+ std::fill(changedCenter.begin(), changedCenter.end(), false);
+ for (unsigned j = 0; j < l; ++j)
+ centers[j].count_ = 0;
+
+ double curlik = 0;
+
+ // E step: find which clusters we belong to
+ for (size_t i = 0; i < kmers.size(); ++i) {
+ size_t newInd = 0;
+ if (cfg::get().bayes_use_hamming_dist) {
+ for (unsigned j = 0; j < l; ++j)
+ dists[j] = kmers[i].hamdist(centers[j].center_);
+
+ newInd = std::min_element(dists.begin(), dists.end()) - dists.begin();
+ // also record the likelihood of the chosen center so that curlik stays meaningful
+ loglike[newInd] = kmers[i].logL(centers[newInd].center_);
+ } else {
+ for (unsigned j = 0; j < l; ++j)
+ loglike[j] = kmers[i].logL(centers[j].center_);
+ newInd = std::max_element(loglike.begin(), loglike.end()) - loglike.begin();
+ }
+
+ curlik += loglike[newInd];
+ if (indices[i] != newInd) {
+ changed = true;
+ changedCenter[indices[i]] = true;
+ changedCenter[newInd] = true;
+ indices[i] = newInd;
+ }
+ ++centers[indices[i]].count_;
+ }
+
+ if (cfg::get().bayes_debug_output > 1) {
+# pragma omp critical
+ {
+ std::cout << " total likelihood=" << curlik << " as compared to previous " << totalLikelihood << std::endl;
+ }
+ }
+ improved = (curlik > totalLikelihood);
+ if (improved)
+ totalLikelihood = curlik;
+
+ // M step: find new cluster centers
+ for (unsigned j=0; j < l; ++j) {
+ if (!changedCenter[j])
+ continue; // nothing has changed
+
+ centers[j].center_ = ConsensusWithMask(kmers, indices, j);
+ }
+ }
+
+ // last M step
+ for (unsigned j=0; j < l; ++j)
+ centers[j].center_ = ConsensusWithMask(kmers, indices, j);
+
+ if (cfg::get().bayes_debug_output > 1) {
+# pragma omp critical
+ {
+ std::cout << " final centers:\n";
+ for (size_t i=0; i < centers.size(); ++i) {
+ std::cout << " " << centers[i].center_ << "\n";
+ }
+ }
+ }
+
+ return ClusterBIC(centers, indices, kmers);
+}
+
+
+size_t KMerClustering::SubClusterSingle(const std::vector<size_t> & block, std::vector< std::vector<size_t> > & vec) {
+ size_t newkmers = 0;
+
+ if (cfg::get().bayes_debug_output > 0) {
+# pragma omp critical
+ {
+ std::cout << " kmers:\n";
+ for (size_t i = 0; i < block.size(); i++) {
+ std::cout << data_.kmer(block[i]) << '\n';
+ }
+ }
+ }
+
+ size_t origBlockSize = block.size();
+ if (origBlockSize == 0) return 0;
+
+ // Ad-hoc max cluster limit: the number of clusters is capped by the number of
+ // k-mers whose multiplicity is within 10x of the maximum multiplicity.
+ size_t maxcls = 0;
+ size_t cntthr = std::max(10u, data_[block[0]].count() / 10);
+ for (size_t i = 0; i < block.size(); ++i)
+ maxcls += (data_[block[i]].count() > cntthr);
+ // Another limit: we're interested in good centers only
+ size_t maxgcnt = 0;
+ for (size_t i = 0; i < block.size(); ++i) {
+ float center_quality = 1 - data_[block[i]].total_qual;
+ if ((center_quality > cfg::get().bayes_singleton_threshold) ||
+ (cfg::get().correct_use_threshold && center_quality > cfg::get().correct_threshold))
+ maxgcnt += 1;
+ }
+
+ maxcls = std::min(maxcls, maxgcnt) + 1;
+ if (cfg::get().bayes_debug_output > 0) {
+ #pragma omp critical
+ {
+ std::cout << "\nClustering an interesting block. Maximum # of clusters estimated: " << maxcls << std::endl;
+ }
+ }
+
+ // Prepare the expanded k-mer structure
+ std::vector<hammer::ExpandedKMer> kmers;
+ for (size_t idx : block)
+ kmers.emplace_back(data_.kmer(idx), data_[idx]);
+
+ double bestLikelihood = -std::numeric_limits<double>::infinity();
+ std::vector<Center> bestCenters;
+ std::vector<size_t> indices(origBlockSize);
+ std::vector<size_t> bestIndices(origBlockSize);
+
+ unsigned max_l = cfg::get().bayes_hammer_mode ? 1 : (unsigned) origBlockSize;
+ std::vector<Center> centers;
+ for (unsigned l = 1; l <= max_l; ++l) {
+ double curLikelihood = lMeansClustering(l, kmers, indices, centers);
+ if (cfg::get().bayes_debug_output > 0) {
+ #pragma omp critical
+ {
+ std::cout << " indices: ";
+ for (uint32_t i = 0; i < origBlockSize; i++) std::cout << indices[i] << " ";
+ std::cout << "\n";
+ std::cout << " likelihood with " << l << " clusters is " << curLikelihood << std::endl;
+ }
+ }
+ if (curLikelihood > bestLikelihood) {
+ bestLikelihood = curLikelihood;
+ bestCenters = centers; bestIndices = indices;
+ } else if (l >= maxcls)
+ break;
+ }
+
+ // Find out whether each center coincides with an actual k-mer of its cluster
+ std::vector<size_t> centersInCluster(bestCenters.size(), -1u);
+ for (unsigned i = 0; i < origBlockSize; i++) {
+ unsigned dist = kmers[i].hamdist(bestCenters[bestIndices[i]].center_);
+ if (dist == 0)
+ centersInCluster[bestIndices[i]] = i;
+ }
+
+ if (cfg::get().bayes_debug_output > 0) {
+# pragma omp critical
+ {
+ std::cout << "Centers: \n";
+ for (size_t k=0; k<bestCenters.size(); ++k) {
+ std::cout << " " << std::setw(4) << bestCenters[k].count_ << ": ";
+ if (centersInCluster[k] != -1u) {
+ const KMerStat &kms = data_[block[centersInCluster[k]]];
+ std::cout << kms << " " << std::setw(8) << block[centersInCluster[k]] << " ";
+ } else {
+ std::cout << bestCenters[k].center_;
+ }
+ std::cout << '\n';
+ }
+ std::cout << "The entire block:" << std::endl;
+ for (uint32_t i = 0; i < origBlockSize; i++) {
+ const KMerStat &kms = data_[block[i]];
+ std::cout << " " << kms << " " << std::setw(8) << block[i] << " ";
+ for (uint32_t j=0; j<K; ++j) std::cout << std::setw(3) << (unsigned)getQual(kms, j) << " "; std::cout << "\n";
+ }
+ std::cout << std::endl;
+ }
+ }
+
+ // It may happen that the consensus string of one subcluster coincides with
+ // the center of another subcluster; such subclusters are merged below.
+ bool foundBadCenter = true;
+ while (foundBadCenter) {
+ foundBadCenter = false;
+ for (size_t k=0; k<bestCenters.size(); ++k) {
+ if (foundBadCenter) break; // restart if found one bad center
+ if (bestCenters[k].count_ == 0) continue;
+ if (centersInCluster[k] != -1u) continue;
+ for (size_t s = 0; s< bestCenters.size(); ++s) {
+ if (s == k || centersInCluster[s] == -1u) continue;
+ unsigned dist = hamdist(bestCenters[k].center_, bestCenters[s].center_);
+ if (dist == 0) {
+ // That is the case here: cluster k should be merged into cluster s
+ for (uint32_t i = 0; i < origBlockSize; i++) {
+ if (indices[i] == k) {
+ indices[i] = (unsigned)s;
+ bestCenters[s].count_++;
+ }
+ }
+ bestCenters[k].count_ = 0; // it will be skipped now
+ foundBadCenter = true;
+ break;
+ }
+ }
+ }
+ }
+
+ if (cfg::get().bayes_debug_output > 0 && origBlockSize > 2) {
+ #pragma omp critical
+ {
+ std::cout << "\nAfter the check we got centers: \n";
+ for (size_t k=0; k<bestCenters.size(); ++k) {
+ std::cout << " " << bestCenters[k].center_ << " (" << bestCenters[k].count_ << ")";
+ if (centersInCluster[k] != -1u) std::cout << block[centersInCluster[k]];
+ std::cout << "\n";
+ }
+ std::cout << std::endl;
+ }
+ }
+
+ for (size_t k = 0; k < bestCenters.size(); ++k) {
+ if (bestCenters[k].count_ == 0)
+ continue; // superfluous cluster with zero elements
+
+ std::vector<size_t> v;
+ if (bestCenters[k].count_ == 1) {
+ for (size_t i = 0; i < origBlockSize; i++) {
+ if (indices[i] == k) {
+ v.push_back(block[i]);
+ break;
+ }
+ }
+ } else { // there are several kmers in this cluster
+ for (size_t i = 0; i < origBlockSize; i++) {
+ if (bestIndices[i] == k) {
+ if (centersInCluster[k] == i) {
+ v.insert(v.begin(), block[i]);
+ } else {
+ v.push_back(block[i]);
+ }
+ }
+ }
+
+ if (centersInCluster[k] == -1u) {
+ unsigned new_idx = 0;
+ #pragma omp critical
+ {
+ KMer newkmer(bestCenters[k].center_);
+
+ KMerStat kms(0 /* cnt */, 1.0 /* total quality */, NULL /*quality */);
+ kms.mark_good();
+ new_idx = (unsigned)data_.push_back(newkmer, kms);
+ if (data_.kmer(data_.seq_idx(newkmer)) != newkmer)
+ newkmers += 1;
+ }
+ v.insert(v.begin(), new_idx);
+ }
+ }
+ vec.push_back(v);
+ }
+
+ return newkmers;
+}
+
+static void UpdateErrors(numeric::matrix<uint64_t> &m,
+ const KMer k, const KMer kc) {
+ for (unsigned i = 0; i < K; ++i) {
+ m(kc[i], k[i]) += 1;
+ }
+}
+
+size_t KMerClustering::ProcessCluster(const std::vector<size_t> &cur_class,
+ numeric::matrix<uint64_t> &errs,
+ std::ofstream &ofs, std::ofstream &ofs_bad,
+ size_t &gsingl, size_t &tsingl, size_t &tcsingl, size_t &gcsingl,
+ size_t &tcls, size_t &gcls, size_t &tkmers, size_t &tncls) {
+ size_t newkmers = 0;
+
+ // No need to cluster singletons
+ if (cur_class.size() == 1) {
+ size_t idx = cur_class[0];
+ KMerStat &singl = data_[idx];
+ if ((1-singl.total_qual) > cfg::get().bayes_singleton_threshold) {
+ singl.mark_good();
+ gsingl += 1;
+
+ if (ofs.good()) {
+# pragma omp critical
+ {
+ ofs << " good singleton: " << idx << "\n " << singl << '\n';
+ }
+ }
+ } else {
+ if (cfg::get().correct_use_threshold && (1-singl.total_qual) > cfg::get().correct_threshold)
+ singl.mark_good();
+ else
+ singl.mark_bad();
+
+ if (ofs_bad.good()) {
+# pragma omp critical
+ {
+ ofs_bad << " bad singleton: " << idx << "\n " << singl << '\n';
+ }
+ }
+ }
+ tsingl += 1;
+ return 0;
+ }
+
+ std::vector<std::vector<size_t> > blocksInPlace;
+ if (cfg::get().bayes_debug_output) {
+# pragma omp critical
+ {
+ std::cout << "process_SIN with size=" << cur_class.size() << std::endl;
+ }
+ }
+ newkmers += SubClusterSingle(cur_class, blocksInPlace);
+
+ tncls += 1;
+ for (size_t m = 0; m < blocksInPlace.size(); ++m) {
+ const std::vector<size_t> &currentBlock = blocksInPlace[m];
+ if (currentBlock.size() == 0)
+ continue;
+
+ size_t cidx = currentBlock[0];
+ KMerStat &center = data_[cidx];
+ KMer ckmer = data_.kmer(cidx);
+ double center_quality = 1 - center.total_qual;
+
+ // Computing the overall quality of a cluster.
+ double cluster_quality = 1;
+ if (currentBlock.size() > 1) {
+ for (size_t j = 1; j < currentBlock.size(); ++j)
+ cluster_quality *= data_[currentBlock[j]].total_qual;
+
+ cluster_quality = 1-cluster_quality;
+ }
+
+ if (currentBlock.size() == 1)
+ tcsingl += 1;
+ else
+ tcls += 1;
+
+ if ((center_quality > cfg::get().bayes_singleton_threshold &&
+ cluster_quality > cfg::get().bayes_nonsingleton_threshold) ||
+ cfg::get().bayes_hammer_mode) {
+ center.mark_good();
+
+ if (currentBlock.size() == 1)
+ gcsingl += 1;
+ else
+ gcls += 1;
+
+ if (ofs.good()) {
+# pragma omp critical
+ {
+ ofs << " center of good cluster (" << currentBlock.size() << ", " << cluster_quality << ")" << "\n "
+ << center << '\n';
+ }
+ }
+ } else {
+ if (cfg::get().correct_use_threshold && center_quality > cfg::get().correct_threshold)
+ center.mark_good();
+ else
+ center.mark_bad();
+ if (ofs_bad.good()) {
+# pragma omp critical
+ {
+ ofs_bad << " center of bad cluster (" << currentBlock.size() << ", " << cluster_quality << ")" << "\n "
+ << center << '\n';
+ }
+ }
+ }
+
+ tkmers += currentBlock.size();
+
+ for (size_t j = 1; j < currentBlock.size(); ++j) {
+ size_t eidx = currentBlock[j];
+ KMerStat &kms = data_[eidx];
+
+ UpdateErrors(errs, data_.kmer(eidx), ckmer);
+
+ if (ofs_bad.good()) {
+# pragma omp critical
+ {
+ ofs_bad << " part of cluster (" << currentBlock.size() << ", " << cluster_quality << ")" << "\n "
+ << kms << '\n';
+ }
+ }
+ }
+ }
+
+ return newkmers;
+}
+
+
+class KMerStatCountComparator {
+ const KMerData &data_;
+public:
+ KMerStatCountComparator(const KMerData &data)
+ : data_(data) {}
+ bool operator()(size_t a, size_t b) {
+ return data_[a].count() > data_[b].count();
+ }
+};
+
+void KMerClustering::process(const std::string &Prefix) {
+ size_t newkmers = 0;
+ size_t gsingl = 0, tsingl = 0, tcsingl = 0, gcsingl = 0, tcls = 0, gcls = 0, tkmers = 0, tncls = 0;
+
+ std::ofstream ofs, ofs_bad;
+ if (cfg::get().bayes_write_solid_kmers)
+ ofs.open(GetGoodKMersFname());
+ if (cfg::get().bayes_write_bad_kmers)
+ ofs_bad.open(GetBadKMersFname());
+
+ // Open and read index file
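+ // Layout, as consumed below: the .idx file stores one size_t per Hamming
+ // cluster (its size), while the data file at Prefix stores the member k-mer
+ // indices of all clusters concatenated in the same order. Each chunk seeks
+ // to the prefix sum of the sizes preceding it.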
+ MMappedRecordReader<size_t> findex(Prefix + ".idx", /* unlink */ !debug_, -1ULL);
+
+ std::vector<numeric::matrix<uint64_t> > errs(nthreads_, numeric::matrix<uint64_t>(4, 4, 0));
+
+# pragma omp parallel for shared(ofs, ofs_bad, errs) num_threads(nthreads_) schedule(guided) reduction(+:newkmers, gsingl, tsingl, tcsingl, gcsingl, tcls, gcls, tkmers, tncls)
+ for (size_t chunk = 0; chunk < nthreads_ * nthreads_; ++chunk) {
+ size_t *current = findex.data() + findex.size() * chunk / nthreads_ / nthreads_;
+ size_t *next = findex.data() + findex.size() * (chunk + 1)/ nthreads_ / nthreads_;
+ std::ifstream is(Prefix, std::ios::in | std::ios::binary);
+
+ // Calculate how much we need to seek
+ size_t soff = 0;
+ for (size_t *csz = findex.data(); csz != current; ++csz)
+ soff += *csz;
+
+ // Now seek the stream to that offset and start processing
+ is.seekg(soff * sizeof(size_t));
+
+ for (; current != next; ++current) {
+ std::vector<size_t> cluster(*current);
+ VERIFY(is.good());
+ is.read((char*)&cluster[0], *current * sizeof(cluster[0]));
+
+ // The underlying code expects classes to be sorted in decreasing count order.
+ std::sort(cluster.begin(), cluster.end(), KMerStatCountComparator(data_));
+
+ newkmers += ProcessCluster(cluster,
+ errs[omp_get_thread_num()],
+ ofs, ofs_bad,
+ gsingl, tsingl, tcsingl, gcsingl,
+ tcls, gcls, tkmers, tncls);
+ }
+ }
+
+ if (!debug_) {
+ int res = unlink(Prefix.c_str());
+ VERIFY_MSG(res == 0,
+ "unlink(2) failed. Reason: " << strerror(errno) << ". Error code: " << errno);
+ }
+
+ for (unsigned i = 1; i < nthreads_; ++i)
+ errs[0] += errs[i];
+
+ numeric::matrix<uint64_t> rowsums = prod(errs[0], numeric::scalar_matrix<double>(4, 1, 1));
+ numeric::matrix<double> err(4, 4);
+ for (unsigned i = 0; i < 4; ++i)
+ for (unsigned j = 0; j < 4; ++j)
+ err(i, j) = 1.0 * (double)errs[0](i, j) / (double)rowsums(i, 0);
+
+ INFO("Subclustering done. Total " << newkmers << " non-read kmers were generated.");
+ INFO("Subclustering statistics:");
+ INFO(" Total singleton hamming clusters: " << tsingl << ". Among them " << gsingl << " (" << 100.0 * (double)gsingl / (double)tsingl << "%) are good");
+ INFO(" Total singleton subclusters: " << tcsingl << ". Among them " << gcsingl << " (" << 100.0 * (double)gcsingl / (double)tcsingl << "%) are good");
+ INFO(" Total non-singleton subcluster centers: " << tcls << ". Among them " << gcls << " (" << 100.0 * (double)gcls / (double)tcls << "%) are good");
+ INFO(" Average size of non-trivial subcluster: " << 1.0 * (double)tkmers / (double)tcls << " kmers");
+ INFO(" Average number of sub-clusters per non-singleton cluster: " << 1.0 * (double)(tcsingl + tcls) / (double)tncls);
+ INFO(" Total solid k-mers: " << gsingl + gcsingl + gcls);
+ INFO(" Substitution probabilities: " << err);
+}
diff --git a/src/hammer/kmer_cluster.hpp b/src/projects/hammer/kmer_cluster.hpp
similarity index 100%
rename from src/hammer/kmer_cluster.hpp
rename to src/projects/hammer/kmer_cluster.hpp
diff --git a/src/projects/hammer/kmer_data.cpp b/src/projects/hammer/kmer_data.cpp
new file mode 100644
index 0000000..160950a
--- /dev/null
+++ b/src/projects/hammer/kmer_data.cpp
@@ -0,0 +1,568 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#include "kmer_data.hpp"
+#include "io/reads_io/read_processor.hpp"
+#include "valid_kmer_generator.hpp"
+
+#include "io/reads_io/ireadstream.hpp"
+#include "config_struct_hammer.hpp"
+
+#include "dev_support/file_limit.hpp"
+
+#include <libcxx/sort.hpp>
+#include "io/kmers_io/kmer_iterator.hpp"
+
+using namespace hammer;
+
+class BufferFiller;
+
+struct KMerComparator {
+ bool operator()(const KMer &l, const KMer &r) const {
+ for (size_t i = 0; i < KMer::DataSize ; ++i) {
+ if (l.data()[i] != r.data()[i]) {
+ return (l.data()[i] < r.data()[i]);
+ }
+ }
+
+ return false;
+ }
+};
+
+
+class HammerKMerSplitter : public KMerSplitter<hammer::KMer> {
+ typedef std::vector<std::vector<KMer> > KMerBuffer;
+
+ void DumpBuffers(size_t num_files, size_t nthreads,
+ std::vector<KMerBuffer> &buffers,
+ const path::files_t &ostreams) const;
+
+ public:
+ HammerKMerSplitter(std::string &work_dir)
+ : KMerSplitter<hammer::KMer>(work_dir, hammer::K) {}
+
+ virtual path::files_t Split(size_t num_files);
+
+ friend class BufferFiller;
+};
+
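+// Per-bucket flush: the k-mers collected by all threads for bucket k are
+// concatenated, sorted and deduplicated, then appended to that bucket's file;
+// only the file append itself is serialized by the critical section.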
+void HammerKMerSplitter::DumpBuffers(size_t num_files, size_t nthreads,
+ std::vector<KMerBuffer> &buffers,
+ const path::files_t &ostreams) const {
+# pragma omp parallel for num_threads(nthreads)
+ for (unsigned k = 0; k < num_files; ++k) {
+ size_t sz = 0;
+ for (size_t i = 0; i < nthreads; ++i)
+ sz += buffers[i][k].size();
+
+ if (!sz)
+ continue;
+
+ std::vector<KMer> SortBuffer;
+ SortBuffer.reserve(sz);
+ for (size_t i = 0; i < nthreads; ++i) {
+ KMerBuffer &entry = buffers[i];
+ SortBuffer.insert(SortBuffer.end(), entry[k].begin(), entry[k].end());
+ }
+ libcxx::sort(SortBuffer.begin(), SortBuffer.end(), KMerComparator());
+ auto it = std::unique(SortBuffer.begin(), SortBuffer.end());
+
+# pragma omp critical
+ {
+ FILE *f = fopen(ostreams[k].c_str(), "ab");
+ VERIFY_MSG(f, "Cannot open temporary file to write");
+ fwrite(SortBuffer.data(), sizeof(KMer), it - SortBuffer.begin(), f);
+ fclose(f);
+ }
+ }
+
+ for (unsigned i = 0; i < nthreads; ++i) {
+ for (unsigned j = 0; j < num_files; ++j) {
+ buffers[i][j].clear();
+ }
+ }
+}
+
+
+class BufferFiller {
+ std::vector<HammerKMerSplitter::KMerBuffer> &tmp_entries_;
+ unsigned num_files_;
+ size_t cell_size_;
+ size_t processed_;
+ const HammerKMerSplitter &splitter_;
+
+ public:
+ BufferFiller(std::vector<HammerKMerSplitter::KMerBuffer> &tmp_entries, size_t cell_size, const HammerKMerSplitter &splitter):
+ tmp_entries_(tmp_entries), num_files_((unsigned)tmp_entries[0].size()), cell_size_(cell_size), processed_(0), splitter_(splitter) {}
+
+ size_t processed() const { return processed_; }
+
+ bool operator()(const Read &r) {
+ int trim_quality = cfg::get().input_trim_quality;
+
+ // FIXME: Get rid of this
+ Read cr = r;
+ size_t sz = cr.trimNsAndBadQuality(trim_quality);
+
+ #pragma omp atomic
+ processed_ += 1;
+
+ if (sz < hammer::K)
+ return false;
+
+ HammerKMerSplitter::KMerBuffer &entry = tmp_entries_[omp_get_thread_num()];
+ ValidKMerGenerator<hammer::K> gen(cr);
+ bool stop = false;
+ while (gen.HasMore()) {
+ KMer seq = gen.kmer();
+ size_t idx = splitter_.GetFileNumForSeq(seq, num_files_);
+ entry[idx].push_back(seq);
+ stop |= entry[idx].size() > cell_size_;
+
+ seq = !seq;
+ idx = splitter_.GetFileNumForSeq(seq, num_files_);
+ entry[idx].push_back(seq);
+ stop |= entry[idx].size() > cell_size_;
+
+ gen.Next();
+ }
+
+ return stop;
+ }
+};
+
+path::files_t HammerKMerSplitter::Split(size_t num_files) {
+ unsigned nthreads = std::min(cfg::get().count_merge_nthreads, cfg::get().general_max_nthreads);
+
+ INFO("Splitting kmer instances into " << num_files << " buckets. This might take a while.");
+
+ // Determine the set of output files
+ path::files_t out;
+ for (unsigned i = 0; i < num_files; ++i)
+ out.push_back(GetRawKMersFname(i));
+
+ size_t file_limit = num_files + 2*nthreads;
+ size_t res = limit_file(file_limit);
+ if (res < file_limit) {
+ WARN("Failed to setup necessary limit for number of open files. The process might crash later on.");
+ WARN("Do 'ulimit -n " << file_limit << "' in the console to overcome the limit");
+ }
+
+ size_t reads_buffer_size = cfg::get().count_split_buffer;
+ if (reads_buffer_size == 0) {
+ reads_buffer_size = 536870912ull;
+ size_t mem_limit = (size_t)((double)(get_free_memory()) / (nthreads * 3));
+ INFO("Memory available for splitting buffers: " << (double)mem_limit / 1024.0 / 1024.0 / 1024.0 << " Gb");
+ reads_buffer_size = std::min(reads_buffer_size, mem_limit);
+ }
+ size_t cell_size = reads_buffer_size / (num_files * sizeof(KMer));
+ // Set sane minimum cell size
+ if (cell_size < 16384)
+ cell_size = 16384;
+
+ INFO("Using cell size of " << cell_size);
+ std::vector<KMerBuffer> tmp_entries(nthreads);
+ for (unsigned i = 0; i < nthreads; ++i) {
+ KMerBuffer &entry = tmp_entries[i];
+ entry.resize(num_files);
+ for (unsigned j = 0; j < num_files; ++j) {
+ entry[j].reserve((size_t)(1.1 * (double)cell_size));
+ }
+ }
+
+ size_t n = 15;
+ BufferFiller filler(tmp_entries, cell_size, *this);
+ const auto& dataset = cfg::get().dataset;
+ for (auto I = dataset.reads_begin(), E = dataset.reads_end(); I != E; ++I) {
+ INFO("Processing " << *I);
+ ireadstream irs(*I, cfg::get().input_qvoffset);
+ while (!irs.eof()) {
+ hammer::ReadProcessor rp(nthreads);
+ rp.Run(irs, filler);
+ DumpBuffers(num_files, nthreads, tmp_entries, out);
+ VERIFY_MSG(rp.read() == rp.processed(), "Queue unbalanced");
+
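+ // Progress reporting: with n starting at 15 this fires once the number of
+ // processed reads reaches 2^15 and then at every subsequent power of two.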
+ if (filler.processed() >> n) {
+ INFO("Processed " << filler.processed() << " reads");
+ n += 1;
+ }
+ }
+ }
+ INFO("Processed " << filler.processed() << " reads");
+
+ return out;
+}
+
+static inline void Merge(KMerStat &lhs, const KMerStat &rhs) {
+ lhs.set_count(lhs.count() + rhs.count());
+ lhs.total_qual *= rhs.total_qual;
+ lhs.qual += rhs.qual;
+}
+
+static void PushKMer(KMerData &data,
+ KMer kmer, const unsigned char *q, double prob) {
+ size_t idx = data.checking_seq_idx(kmer);
+ if (idx == -1ULL)
+ return;
+ KMerStat &kmc = data[idx];
+ kmc.lock();
+ Merge(kmc,
+ KMerStat(1, (float)prob, q));
+ kmc.unlock();
+}
+
+static void PushKMerRC(KMerData &data,
+ KMer kmer, const unsigned char *q, double prob) {
+ unsigned char rcq[K];
+
+ // Prepare RC kmer with quality.
+ kmer = !kmer;
+ for (unsigned i = 0; i < K; ++i)
+ rcq[K - i - 1] = q[i];
+
+ size_t idx = data.checking_seq_idx(kmer);
+ if (idx == -1ULL)
+ return;
+ KMerStat &kmc = data[idx];
+ kmc.lock();
+ Merge(kmc,
+ KMerStat(1, (float)prob, rcq));
+ kmc.unlock();
+}
+
+class KMerDataFiller {
+ KMerData &data_;
+
+ public:
+ KMerDataFiller(KMerData &data)
+ : data_(data) {}
+
+ bool operator()(const Read &r) {
+ int trim_quality = cfg::get().input_trim_quality;
+
+ // FIXME: Get rid of this
+ Read cr = r;
+ size_t sz = cr.trimNsAndBadQuality(trim_quality);
+
+ if (sz < hammer::K)
+ return false;
+
+ ValidKMerGenerator<hammer::K> gen(cr);
+ const char *q = cr.getQualityString().data();
+ while (gen.HasMore()) {
+ KMer kmer = gen.kmer();
+ const unsigned char *kq = (const unsigned char*)(q + gen.pos() - 1);
+
+ PushKMer(data_, kmer, kq, 1 - gen.correct_probability());
+ PushKMerRC(data_, kmer, kq, 1 - gen.correct_probability());
+
+ gen.Next();
+ }
+
+ return false;
+ }
+};
+
+class KMerMultiplicityCounter {
+ KMerData &data_;
+ uint64_t *cnt_;
+
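+ // Two bits per k-mer in cnt_: the first occurrence sets the low bit, any
+ // further occurrence sets the high bit, so count() below returns 0 (unseen),
+ // 1 (singleton) or 3 (multiple). Updates use atomic fetch-or, so no locking
+ // of individual entries is required.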
+ void IncCount(const hammer::KMer &k) {
+ size_t idx = data_.seq_idx(k);
+ size_t block = idx * 2 / (8 * sizeof(uint64_t)), pos = (idx * 2) % (8 * sizeof(uint64_t));
+ size_t mask = 3ull << pos;
+
+ if (__sync_fetch_and_or(cnt_ + block, 1ull << pos) & mask)
+ __sync_fetch_and_or(cnt_ + block, 2ull << pos);
+ }
+
+ public:
+ KMerMultiplicityCounter(KMerData &data)
+ : data_(data) {
+ size_t blocks = (2 * data.size()) / (8 * sizeof(uint64_t)) + 1;
+ cnt_ = new uint64_t[blocks];
+ memset(cnt_, 0, blocks * sizeof(uint64_t));
+ }
+ ~KMerMultiplicityCounter() { delete[] cnt_; }
+
+
+ bool operator()(const Read &r) {
+ int trim_quality = cfg::get().input_trim_quality;
+
+ // FIXME: Get rid of this
+ Read cr = r;
+ size_t sz = cr.trimNsAndBadQuality(trim_quality);
+
+ if (sz < hammer::K)
+ return false;
+
+ ValidKMerGenerator<hammer::K> gen(cr);
+ while (gen.HasMore()) {
+ KMer kmer = gen.kmer();
+
+ IncCount(kmer);
+ IncCount(!kmer);
+
+ gen.Next();
+ }
+
+ return false;
+ }
+
+ size_t count(size_t idx) const {
+ size_t block = idx * 2 / (8 * sizeof(uint64_t)), pos = idx * 2 % (8 * sizeof(uint64_t));
+ return (cnt_[block] >> pos) & 3;
+ }
+};
+
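+// Used when count_filter_singletons is enabled: streams the k-mers written by
+// the first counting pass and re-splits only those whose multiplicity counter
+// reports more than one occurrence, so that a smaller index can be rebuilt
+// from non-singleton k-mers alone.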
+class NonSingletonKMerSplitter : public KMerSplitter<hammer::KMer> {
+ typedef std::vector<std::vector<KMer> > KMerBuffer;
+
+ std::pair<size_t, size_t>
+ FillBufferFromStream(io::raw_kmer_iterator<hammer::KMer> &it,
+ KMerBuffer &entry,
+ size_t cell_size, size_t num_files) {
+ size_t processed = 0, non_singleton = 0 ;
+ for ( ; it.good(); ++it) {
+ hammer::KMer seq(hammer::K, *it);
+
+ size_t kidx = data_.seq_idx(seq);
+ size_t cnt = counter_.count(kidx);
+
+ processed += 1;
+
+ if (cnt == 1)
+ continue;
+
+ non_singleton += 1;
+
+ size_t idx = this->GetFileNumForSeq(seq, (unsigned)num_files);
+ entry[idx].push_back(seq);
+
+
+ if (entry[idx].size() > cell_size)
+ break;
+ }
+ return std::make_pair(processed, non_singleton);
+ }
+
+ void DumpBuffers(size_t num_files, size_t nthreads,
+ std::vector<KMerBuffer> &buffers,
+ const path::files_t &ostreams) const {
+# pragma omp parallel for num_threads(nthreads)
+ for (unsigned k = 0; k < num_files; ++k) {
+ size_t sz = 0;
+ for (size_t i = 0; i < nthreads; ++i)
+ sz += buffers[i][k].size();
+
+ if (!sz)
+ continue;
+
+ std::vector<KMer> SortBuffer;
+ SortBuffer.reserve(sz);
+ for (size_t i = 0; i < nthreads; ++i) {
+ KMerBuffer &entry = buffers[i];
+ SortBuffer.insert(SortBuffer.end(), entry[k].begin(), entry[k].end());
+ }
+ libcxx::sort(SortBuffer.begin(), SortBuffer.end(), KMerComparator());
+ auto it = std::unique(SortBuffer.begin(), SortBuffer.end());
+
+# pragma omp critical
+ {
+ FILE *f = fopen(ostreams[k].c_str(), "ab");
+ VERIFY_MSG(f, "Cannot open temporary file to write");
+ fwrite(SortBuffer.data(), sizeof(KMer), it - SortBuffer.begin(), f);
+ fclose(f);
+ }
+ }
+
+ for (unsigned i = 0; i < nthreads; ++i) {
+ for (unsigned j = 0; j < num_files; ++j) {
+ buffers[i][j].clear();
+ }
+ }
+ }
+
+ public:
+ NonSingletonKMerSplitter(std::string &work_dir,
+ const std::string &final_kmers,
+ const KMerData &data,
+ const KMerMultiplicityCounter &counter)
+ : KMerSplitter<hammer::KMer>(work_dir, hammer::K), final_kmers_(final_kmers), data_(data), counter_(counter){}
+
+ virtual path::files_t Split(size_t num_files) {
+ unsigned nthreads = std::min(cfg::get().count_merge_nthreads, cfg::get().general_max_nthreads);
+
+ INFO("Splitting kmer instances into " << num_files << " buckets. This might take a while.");
+
+ // Determine the set of output files
+ path::files_t out;
+ for (unsigned i = 0; i < num_files; ++i)
+ out.push_back(GetRawKMersFname(i));
+
+ size_t file_limit = num_files + 2*nthreads;
+ size_t res = limit_file(file_limit);
+ if (res < file_limit) {
+ WARN("Failed to setup necessary limit for number of open files. The process might crash later on.");
+ WARN("Do 'ulimit -n " << file_limit << "' in the console to overcome the limit");
+ }
+
+ size_t reads_buffer_size = cfg::get().count_split_buffer;
+ if (reads_buffer_size == 0) {
+ reads_buffer_size = 536870912ull;
+ size_t mem_limit = (size_t)((double)(get_free_memory()) / (nthreads * 3));
+ INFO("Memory available for splitting buffers: " << (double)mem_limit / 1024.0 / 1024.0 / 1024.0 << " Gb");
+ reads_buffer_size = std::min(reads_buffer_size, mem_limit);
+ }
+ size_t cell_size = reads_buffer_size / (num_files * sizeof(KMer));
+ // Set sane minimum cell size
+ if (cell_size < 16384)
+ cell_size = 16384;
+
+ INFO("Using cell size of " << cell_size);
+ std::vector<KMerBuffer> tmp_entries(nthreads);
+ for (unsigned i = 0; i < nthreads; ++i) {
+ KMerBuffer &entry = tmp_entries[i];
+ entry.resize(num_files);
+ for (unsigned j = 0; j < num_files; ++j) {
+ entry[j].reserve((size_t)(1.1 * (double)cell_size));
+ }
+ }
+
+ size_t n = 15;
+ size_t total_kmers = 0, non_singletons = 0;
+ auto kmers = io::make_kmer_iterator<hammer::KMer>(final_kmers_, hammer::K, nthreads);
+ while (std::any_of(kmers.begin(), kmers.end(),
+ [](const io::raw_kmer_iterator<hammer::KMer> &it) { return it.good(); })) {
+# pragma omp parallel for num_threads(nthreads) reduction(+ : total_kmers) reduction(+ : non_singletons)
+ for (size_t i = 0; i < kmers.size(); ++i) {
+ size_t kc, nsc;
+ std::tie(kc, nsc) = FillBufferFromStream(kmers[i], tmp_entries[i], cell_size, num_files);
+ total_kmers += kc;
+ non_singletons += nsc;
+ }
+
+ DumpBuffers(num_files, nthreads, tmp_entries, out);
+ if (total_kmers >> n) {
+ INFO("Processed " << total_kmers << " kmers");
+ n += 1;
+ }
+ }
+ INFO("Processed " << total_kmers << " kmers");
+
+ INFO("Total " << non_singletons << " non-singleton k-mers written");
+
+ unlink(final_kmers_.c_str());
+
+ return out;
+ }
+
+ private:
+ const std::string final_kmers_;
+ const KMerData &data_;
+ const KMerMultiplicityCounter &counter_;
+};
+
+void KMerDataCounter::BuildKMerIndex(KMerData &data) {
+ // Build the index
+ std::string workdir = cfg::get().input_working_dir;
+ HammerKMerSplitter splitter(workdir);
+ KMerDiskCounter<hammer::KMer> counter(workdir, splitter);
+
+ size_t kmers = KMerIndexBuilder<HammerKMerIndex>(workdir, num_files_, omp_get_max_threads()).BuildIndex(data.index_, counter, /* save final */ true);
+ std::string final_kmers = counter.GetFinalKMersFname();
+ // Optionally perform a filtering step
+ if (cfg::get().count_filter_singletons) {
+ INFO("Filtering singleton k-mers");
+ data.kmers_.set_size(kmers);
+ KMerMultiplicityCounter mcounter(data);
+
+ const auto& dataset = cfg::get().dataset;
+ for (auto I = dataset.reads_begin(), E = dataset.reads_end(); I != E; ++I) {
+ INFO("Processing " << *I);
+ ireadstream irs(*I, cfg::get().input_qvoffset);
+ hammer::ReadProcessor rp(omp_get_max_threads());
+ rp.Run(irs, mcounter);
+ VERIFY_MSG(rp.read() == rp.processed(), "Queue unbalanced");
+ }
+
+ size_t singletons = 0;
+ for (size_t idx = 0; idx < data.size(); ++idx) {
+ size_t cnt = mcounter.count(idx);
+ VERIFY(cnt);
+ singletons += cnt == 1;
+ }
+ INFO("There are " << data.size() << " kmers in total. "
+ "Among them " << data.size() - singletons << " (" << 100.0 * (double)(data.size() - singletons) / (double)data.size() << "%) are non-singletons.");
+
+ NonSingletonKMerSplitter nssplitter(workdir, final_kmers, data, mcounter);
+ KMerDiskCounter<hammer::KMer> nscounter(workdir, nssplitter);
+ HammerKMerIndex reduced_index;
+ kmers = KMerIndexBuilder<HammerKMerIndex>(workdir, num_files_, omp_get_max_threads()).BuildIndex(reduced_index, nscounter, /* save final */ true);
+ data.index_.swap(reduced_index);
+ final_kmers = nscounter.GetFinalKMersFname();
+ }
+
+ // Check whether we'll ever have enough memory to run BayesHammer and bail out early if not
+ double needed = 1.25 * (double)kmers * (sizeof(KMerStat) + sizeof(hammer::KMer));
+ if (needed > (double) get_memory_limit())
+ FATAL_ERROR("The reads contain too many k-mers to fit into available memory. You need approx. "
+ << needed / 1024.0 / 1024.0 / 1024.0
+ << "GB of free RAM to assemble your dataset");
+
+ {
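+ // The k-mer index maps every k-mer to a slot; copy the raw k-mer sequences
+ // from the final on-disk file into those slots so that data.kmer(idx) can
+ // later be answered from memory.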
+ INFO("Arranging kmers in hash map order");
+ data.kmers_.set_size(kmers);
+ data.kmers_.set_data(new hammer::KMer::DataType[kmers * hammer::KMer::GetDataSize(hammer::K)]);
+
+ unsigned nthreads = std::min(cfg::get().count_merge_nthreads, cfg::get().general_max_nthreads);
+ auto kmers_its = io::make_kmer_iterator<hammer::KMer>(final_kmers, hammer::K, 16*nthreads);
+
+# pragma omp parallel for num_threads(nthreads) schedule(guided)
+ for (size_t i = 0; i < kmers_its.size(); ++i) {
+ auto &kmer_it = kmers_its[i];
+ for (; kmer_it.good(); ++kmer_it) {
+ size_t kidx = data.index_.seq_idx(hammer::KMer(hammer::K, *kmer_it));
+ memcpy(data.kmers_[kidx].data(), *kmer_it, hammer::KMer::TotalBytes);
+ }
+ }
+
+ unlink(counter.GetFinalKMersFname().c_str());
+ }
+}
+
+void KMerDataCounter::FillKMerData(KMerData &data) {
+ // Now use the index to fill the kmer quality information.
+ INFO("Collecting K-mer information, this takes a while.");
+ data.data_.resize(data.kmers_.size());
+
+ KMerDataFiller filler(data);
+ const auto& dataset = cfg::get().dataset;
+ for (auto I = dataset.reads_begin(), E = dataset.reads_end(); I != E; ++I) {
+ INFO("Processing " << *I);
+ ireadstream irs(*I, cfg::get().input_qvoffset);
+ hammer::ReadProcessor rp(omp_get_max_threads());
+ rp.Run(irs, filler);
+ VERIFY_MSG(rp.read() == rp.processed(), "Queue unbalanced");
+ }
+
+ INFO("Collection done, postprocessing.");
+
+ size_t singletons = 0;
+ for (size_t i = 0; i < data.size(); ++i) {
+ VERIFY(data[i].count());
+
+ // Make sure all the kmers are marked as 'Bad' in the beginning
+ data[i].mark_bad();
+
+ if (data[i].count() == 1)
+ singletons += 1;
+ }
+
+ INFO("There are " << data.size() << " kmers in total. "
+ "Among them " << singletons << " (" << 100.0 * (double)singletons / (double)data.size() << "%) are singletons.");
+}
diff --git a/src/projects/hammer/kmer_data.hpp b/src/projects/hammer/kmer_data.hpp
new file mode 100644
index 0000000..57fd1d2
--- /dev/null
+++ b/src/projects/hammer/kmer_data.hpp
@@ -0,0 +1,141 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#ifndef __HAMMER_KMER_DATA_HPP__
+#define __HAMMER_KMER_DATA_HPP__
+
+#include "kmer_stat.hpp"
+#include "utils/adt/array_vector.hpp"
+#include "data_structures/mph_index/kmer_index.hpp"
+#include <vector>
+
+typedef KMerIndex<kmer_index_traits<hammer::KMer> > HammerKMerIndex;
+
+class KMerData {
+ typedef std::vector<KMerStat> KMerDataStorageType;
+ typedef std::vector<hammer::KMer> KMerStorageType;
+ typedef kmer_index_traits<hammer::KMer> traits;
+
+ public:
+ KMerData()
+ : kmers_(nullptr, 0, hammer::KMer::GetDataSize(hammer::K)) {}
+
+ ~KMerData() { delete[] kmers_.data(); }
+
+ size_t size() const { return kmers_.size() + push_back_buffer_.size(); }
+
+ void clear() {
+ data_.clear();
+ push_back_buffer_.clear();
+ kmer_push_back_buffer_.clear();
+ KMerDataStorageType().swap(data_);
+ KMerDataStorageType().swap(push_back_buffer_);
+ }
+
+ size_t push_back(const hammer::KMer kmer, const KMerStat &k) {
+ push_back_buffer_.push_back(k);
+ kmer_push_back_buffer_.push_back(kmer);
+
+ return data_.size() + push_back_buffer_.size() - 1;
+ }
+
+ KMerStat& operator[](size_t idx) {
+ size_t dsz = data_.size();
+ return (idx < dsz ? data_[idx] : push_back_buffer_[idx - dsz]);
+ }
+ const KMerStat& operator[](size_t idx) const {
+ size_t dsz = data_.size();
+ return (idx < dsz ? data_[idx] : push_back_buffer_[idx - dsz]);
+ }
+ hammer::KMer kmer(size_t idx) const {
+ if (idx < kmers_.size()) {
+ auto it = kmers_.begin() + idx;
+ return (traits::raw_create()(hammer::K, *it));
+ }
+
+ idx -= kmers_.size();
+
+ return kmer_push_back_buffer_[idx];
+ }
+
+ size_t checking_seq_idx(hammer::KMer s) const {
+ size_t idx = seq_idx(s);
+ if (idx >= size())
+ return -1ULL;
+
+ return (s == kmer(idx) ? idx : -1ULL);
+ }
+
+ KMerStat& operator[](hammer::KMer s) { return operator[](seq_idx(s)); }
+ const KMerStat& operator[](hammer::KMer s) const { return operator[](seq_idx(s)); }
+ size_t seq_idx(hammer::KMer s) const { return index_.seq_idx(s); }
+
+ template <class Writer>
+ void binary_write(Writer &os) {
+ size_t sz = data_.size();
+ os.write((char*)&sz, sizeof(sz));
+ os.write((char*)&data_[0], sz*sizeof(data_[0]));
+
+ sz = push_back_buffer_.size();
+ os.write((char*)&sz, sizeof(sz));
+ os.write((char*)&push_back_buffer_[0], sz*sizeof(push_back_buffer_[0]));
+ os.write((char*)&kmer_push_back_buffer_[0], sz*sizeof(kmer_push_back_buffer_[0]));
+
+ index_.serialize(os);
+ sz = kmers_.size();
+ os.write((char*)&sz, sizeof(sz));
+ os.write((char*)kmers_.data(), sz * sizeof(hammer::KMer::DataType) * hammer::KMer::GetDataSize(hammer::K));
+ }
+
+ template <class Reader>
+ void binary_read(Reader &is, const std::string &) {
+ clear();
+
+ size_t sz = 0;
+ is.read((char*)&sz, sizeof(sz));
+ data_.resize(sz);
+ is.read((char*)&data_[0], sz*sizeof(data_[0]));
+
+ is.read((char*)&sz, sizeof(sz));
+ push_back_buffer_.resize(sz);
+ is.read((char*)&push_back_buffer_[0], sz*sizeof(push_back_buffer_[0]));
+ kmer_push_back_buffer_.resize(sz);
+ is.read((char*)&kmer_push_back_buffer_[0], sz*sizeof(kmer_push_back_buffer_[0]));
+
+ index_.deserialize(is);
+ is.read((char*)&sz, sizeof(sz));
+ kmers_.set_size(sz);
+ kmers_.set_data(new hammer::KMer::DataType[sz * hammer::KMer::GetDataSize(hammer::K)]);
+ is.read((char*)kmers_.data(), sz * sizeof(hammer::KMer::DataType) * hammer::KMer::GetDataSize(hammer::K));
+ }
+
+ private:
+ array_vector<hammer::KMer::DataType> kmers_;
+
+ KMerDataStorageType data_;
+ KMerStorageType kmer_push_back_buffer_;
+ KMerDataStorageType push_back_buffer_;
+ HammerKMerIndex index_;
+
+ friend class KMerDataCounter;
+};
+
+class KMerDataCounter {
+ unsigned num_files_;
+
+ public:
+ KMerDataCounter(unsigned num_files) : num_files_(num_files) {}
+
+ void BuildKMerIndex(KMerData &data);
+ void FillKMerData(KMerData &data);
+
+ private:
+ DECL_LOGGER("K-mer Counting");
+};
+
+
+#endif
diff --git a/src/projects/hammer/kmer_stat.hpp b/src/projects/hammer/kmer_stat.hpp
new file mode 100644
index 0000000..9501e5f
--- /dev/null
+++ b/src/projects/hammer/kmer_stat.hpp
@@ -0,0 +1,291 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#ifndef HAMMER_KMERSTAT_HPP_
+#define HAMMER_KMERSTAT_HPP_
+
+#include "dev_support/verify.hpp"
+
+#include "data_structures/sequence/seq.hpp"
+
+#include <folly/SmallLocks.h>
+
+#include <array>
+#include <functional>
+#include <vector>
+#include <iostream>
+#include <iomanip>
+#include <map>
+#include <string>
+#include <cstdint>
+#include <cmath>
+
+#include <sched.h>
+#include <string.h>
+
+
+namespace hammer {
+const uint32_t K = 21;
+typedef Seq<K> KMer;
+};
+
+class Read;
+struct KMerStat;
+
+static inline unsigned hamdistKMer(const hammer::KMer &x, const hammer::KMer &y,
+ unsigned tau = hammer::K) {
+ unsigned dist = 0;
+ for (unsigned i = 0; i < hammer::K; ++i) {
+ if (x[i] != y[i]) {
+ ++dist; if (dist > tau) return dist;
+ }
+ }
+ return dist;
+}
+
+template<unsigned N, unsigned bits,
+ typename Storage = uint64_t>
+class NibbleString {
+ static const unsigned StorageBits = sizeof(Storage) * 8;
+ static_assert(bits <= 8, "Too large nibbles");
+ static const unsigned K = (bits * N + StorageBits - 1) / StorageBits;
+ static const uint64_t MaxValue = (1ull << bits) - 1;
+
+ public:
+ NibbleString() { storage_.fill(0); }
+
+ explicit NibbleString(const uint8_t *data) {
+ for (unsigned i = 0; i < N; ++i)
+ set(i, data ? data[i] : 0);
+ }
+
+ void set(size_t n, uint8_t value) {
+ // Determine the index of storage element and the offset.
+ size_t idx = n * bits / StorageBits, offset = n * bits - idx * StorageBits;
+
+ storage_[idx] = (storage_[idx] & ~(MaxValue << offset)) | ((value & MaxValue) << offset);
+ // Hard case: stuff crosses the boundary
+ if (offset + bits >= StorageBits) {
+ size_t rbits = StorageBits - offset;
+ uint64_t mask = MaxValue >> rbits;
+ uint8_t remaining = uint8_t((value >> rbits) & mask);
+
+ storage_[idx + 1] = (storage_[idx + 1] & ~mask) | remaining;
+ }
+ }
+
+ uint8_t operator[](size_t n) const {
+ // Determine the index of storage element and the offset.
+ size_t idx = n * bits / StorageBits, offset = n * bits - idx * StorageBits;
+
+ // Easy case: the value does not cross a storage element boundary
+ if (offset + bits < StorageBits) {
+ return (storage_[idx] >> offset) & MaxValue;
+ }
+
+ // Assemble stuff from parts
+ size_t rbits = StorageBits - offset;
+ uint64_t mask = MaxValue >> rbits;
+ return uint8_t((storage_[idx] >> offset) | ((storage_[idx + 1] & mask) << rbits));
+ }
+
+ NibbleString& operator+=(const uint8_t *data) {
+ uint64_t mv = MaxValue;
+ for (unsigned i = 0; i < N; ++i)
+ set(i, (uint8_t)std::min(mv, (uint64_t)data[i] + operator[](i)));
+
+ return *this;
+ }
+
+ NibbleString& operator+=(const NibbleString &data) {
+ uint64_t mv = MaxValue;
+ for (unsigned i = 0; i < N; ++i)
+ set(i, (uint8_t)std::min(mv, (uint64_t)data[i] + operator[](i)));
+
+ return *this;
+ }
+
+ Storage *data() { return storage_.data(); }
+ const Storage *data() const { return storage_.data(); }
+
+ private:
+ std::array<Storage, K> storage_;
+};
+
+using QualBitSet = NibbleString<hammer::K, 6>;
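+// QualBitSet stores one 6-bit quality value (0..63, saturating) per k-mer
+// position; operator+= above accumulates qualities when identical k-mers from
+// different reads are merged (see Merge() in kmer_data.cpp). Illustrative use:
+//   QualBitSet a, b; a.set(0, 40); b.set(0, 30); a += b;  // a[0] == 63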
+
+struct KMerStat {
+ KMerStat(uint32_t cnt, float kquality, const unsigned char *quality) : total_qual(kquality), qual(quality) {
+ count_with_lock.init(0);
+ set_count(cnt);
+ mark_bad();
+ }
+ KMerStat() : total_qual(1.0), qual() {
+ count_with_lock.init(0);
+ set_count(0);
+ mark_bad();
+ }
+
+ float total_qual;
+ folly::PicoSpinLock<uint32_t> count_with_lock;
+ QualBitSet qual;
+
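+ // count_with_lock packs the per-k-mer state into one 32-bit word: bit 0 is
+ // the good/bad flag, the higher payload bits hold the coverage count (hence
+ // the shifts below), and folly::PicoSpinLock keeps its lock flag in a bit of
+ // the same word, which getData()/setData() mask out.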
+ void lock() { count_with_lock.lock(); }
+ void unlock() { count_with_lock.unlock(); }
+ uint32_t count() const { return count_with_lock.getData() >> 1; }
+ void set_count(uint32_t cnt) { count_with_lock.setData((cnt << 1) | good()); }
+ bool good() const { return count_with_lock.getData() & 1; }
+ void mark_good() {
+ uint32_t val = count_with_lock.getData();
+ count_with_lock.setData(val | 1);
+ }
+ void mark_bad() {
+ uint32_t val = count_with_lock.getData();
+ count_with_lock.setData(val & ~1);
+ }
+};
+
+inline
+std::ostream& operator<<(std::ostream &os, const KMerStat &kms) {
+ os << /* kms.kmer().str() << */ " (" << std::setw(3) << kms.count() << ", " << std::setprecision(6) << std::setw(8) << (1-kms.total_qual) << ')';
+
+ return os;
+}
+
+template<class Writer>
+inline Writer& binary_write(Writer &os, const QualBitSet &qbs) {
+ os.write((char*)qbs.data(), sizeof(qbs));
+
+ return os;
+}
+
+template<class Reader>
+inline void binary_read(Reader &is, QualBitSet &qbs) {
+ is.read((char*)qbs.data(), sizeof(qbs));
+}
+
+template<class Writer>
+inline Writer& binary_write(Writer &os, const KMerStat &k) {
+ os.write((char*)&k.count_with_lock, sizeof(k.count_with_lock));
+ os.write((char*)&k.total_qual, sizeof(k.total_qual));
+ return binary_write(os, k.qual);
+}
+
+template<class Reader>
+inline void binary_read(Reader &is, KMerStat &k) {
+ is.read((char*)&k.count_with_lock, sizeof(k.count_with_lock));
+ is.read((char*)&k.total_qual, sizeof(k.total_qual));
+ binary_read(is, k.qual);
+}
+
+inline unsigned char getQual(const KMerStat & kmc, size_t i) {
+ return (unsigned char)kmc.qual[i];
+}
+
+inline double getProb(const KMerStat &kmc, size_t i, bool log);
+inline double getRevProb(const KMerStat &kmc, size_t i, bool log);
+
+namespace hammer {
+typedef std::array<char, hammer::K> ExpandedSeq;
+
+static inline unsigned hamdist(const ExpandedSeq &x, const ExpandedSeq &y,
+ unsigned tau = hammer::K) {
+ unsigned dist = 0;
+ for (unsigned i = 0; i < hammer::K; ++i) {
+ if (x[i] != y[i]) {
+ ++dist; if (dist > tau) return dist;
+ }
+ }
+ return dist;
+}
+
+class ExpandedKMer {
+ public:
+ ExpandedKMer(const KMer k, const KMerStat &kmc) {
+ for (unsigned i = 0; i < hammer::K; ++i) {
+ s_[i] = k[i];
+ for (unsigned j = 0; j < 4; ++j)
+ lprobs_[4*i + j] = ((char)j != s_[i] ?
+ getRevProb(kmc, i, /* log */ true) - log(3) :
+ getProb(kmc, i, /* log */ true));
+ }
+ count_ = kmc.count();
+ }
+
+ double logL(const ExpandedSeq &center) const {
+ double res = 0;
+ for (unsigned i = 0; i < hammer::K; ++i)
+ res += lprobs_[4*i + center[i]];
+
+ return res;
+ }
+
+ double logL(const ExpandedKMer &center) const {
+ return logL(center.s_);
+ }
+
+ unsigned hamdist(const ExpandedSeq &k,
+ unsigned tau = hammer::K) const {
+ unsigned dist = 0;
+ for (unsigned i = 0; i < hammer::K; ++i) {
+ if (s_[i] != k[i]) {
+ ++dist; if (dist > tau) return dist;
+ }
+ }
+
+ return dist;
+ }
+
+ unsigned hamdist(const ExpandedKMer &k,
+ unsigned tau = hammer::K) const {
+ return hamdist(k.s_, tau);
+ }
+
+ double logL(const KMer center) const {
+ double res = 0;
+ for (unsigned i = 0; i < hammer::K; ++i)
+ res += lprobs_[4*i + center[i]];
+
+ return res;
+ }
+
+ unsigned hamdist(const KMer &k,
+ unsigned tau = hammer::K) const {
+ unsigned dist = 0;
+ for (unsigned i = 0; i < hammer::K; ++i) {
+ if (s_[i] != k[i]) {
+ ++dist; if (dist > tau) return dist;
+ }
+ }
+
+ return dist;
+ }
+
+ uint32_t count() const {
+ return count_;
+ }
+
+ ExpandedSeq seq() const {
+ return s_;
+ }
+
+ private:
+ double lprobs_[4*hammer::K];
+ uint32_t count_;
+ ExpandedSeq s_;
+};
+
+inline
+std::ostream& operator<<(std::ostream &os, const ExpandedSeq &seq) {
+ for (auto s : seq)
+ os << nucl(s);
+
+ return os;
+}
+
+};
+
+#endif // HAMMER_KMERSTAT_HPP_
diff --git a/src/projects/hammer/main.cpp b/src/projects/hammer/main.cpp
new file mode 100644
index 0000000..18077e4
--- /dev/null
+++ b/src/projects/hammer/main.cpp
@@ -0,0 +1,286 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+/*
+ * main.cpp
+ *
+ * Created on: 08.07.2011
+ * Author: snikolenko
+ */
+
+
+#include "config_struct_hammer.hpp"
+#include "hammer_tools.hpp"
+#include "kmer_cluster.hpp"
+#include "globals.hpp"
+#include "kmer_data.hpp"
+#include "expander.hpp"
+
+#include "utils/adt/concurrent_dsu.hpp"
+#include "dev_support/segfault_handler.hpp"
+#include "io/reads_io/read_processor.hpp"
+#include "io/reads_io/ireadstream.hpp"
+
+#include "dev_support/memory_limit.hpp"
+
+#include "dev_support/logger/logger.hpp"
+#include "dev_support/logger/log_writers.hpp"
+
+#include "version.hpp"
+
+#include <algorithm>
+#include <iostream>
+#include <fstream>
+#include <sstream>
+#include <string>
+#include <vector>
+
+#include <cassert>
+#include <cmath>
+#include <cstdlib>
+
+std::vector<uint32_t> * Globals::subKMerPositions = NULL;
+KMerData *Globals::kmer_data = NULL;
+int Globals::iteration_no = 0;
+
+char Globals::char_offset = 0;
+bool Globals::char_offset_user = true;
+
+double Globals::quality_probs[256] = { 0 };
+double Globals::quality_lprobs[256] = { 0 };
+double Globals::quality_rprobs[256] = { 0 };
+double Globals::quality_lrprobs[256] = { 0 };
+
+struct UfCmp {
+ bool operator()(const std::vector<int> &lhs, const std::vector<int> &rhs) {
+ return (lhs[0] < rhs[0]);
+ }
+};
+
+void create_console_logger() {
+ using namespace logging;
+
+ logger *lg = create_logger("");
+ lg->add_writer(std::make_shared<console_writer>());
+ attach_logger(lg);
+}
+
+int main(int argc, char * argv[]) {
+ segfault_handler sh;
+
+ srand(42);
+ srandom(42);
+
+ try {
+ create_console_logger();
+
+ std::string config_file = CONFIG_FILENAME;
+ if (argc > 1) config_file = argv[1];
+ INFO("Starting BayesHammer, built from " SPADES_GIT_REFSPEC ", git revision " SPADES_GIT_SHA1);
+ INFO("Loading config from " << config_file.c_str());
+ cfg::create_instance(config_file);
+
+ // hard memory limit
+ const size_t GB = 1 << 30;
+ limit_memory(cfg::get().general_hard_memory_limit * GB);
+
+ // determine quality offset if not specified
+ if (!cfg::get().input_qvoffset_opt) {
+ INFO("Trying to determine PHRED offset");
+ int determined_offset = determine_offset(*cfg::get().dataset.reads_begin());
+ if (determined_offset < 0) {
+ ERROR("Failed to determine offset! Specify it manually and restart, please!");
+ return -1;
+ } else {
+ INFO("Determined value is " << determined_offset);
+ cfg::get_writable().input_qvoffset = determined_offset;
+ }
+ Globals::char_offset_user = false;
+ } else {
+ cfg::get_writable().input_qvoffset = *cfg::get().input_qvoffset_opt;
+ Globals::char_offset_user = true;
+ }
+ Globals::char_offset = (char)cfg::get().input_qvoffset;
+
+ // Pre-cache quality probabilities
+ for (unsigned qual = 0; qual < sizeof(Globals::quality_probs) / sizeof(Globals::quality_probs[0]); ++qual) {
+ Globals::quality_rprobs[qual] = (qual < 3 ? 0.75 : pow(10.0, -(int)qual / 10.0));
+ Globals::quality_probs[qual] = 1 - Globals::quality_rprobs[qual];
+ Globals::quality_lprobs[qual] = log(Globals::quality_probs[qual]);
+ Globals::quality_lrprobs[qual] = log(Globals::quality_rprobs[qual]);
+ }
+
+ // initialize subkmer positions
+ hammer::InitializeSubKMerPositions();
+
+ INFO("Size of aux. kmer data " << sizeof(KMerStat) << " bytes");
+
+ int max_iterations = cfg::get().general_max_iterations;
+
+ // now we can begin the iterations
+ for (Globals::iteration_no = 0; Globals::iteration_no < max_iterations; ++Globals::iteration_no) {
+ std::cout << "\n === ITERATION " << Globals::iteration_no << " begins ===" << std::endl;
+ bool do_everything = cfg::get().general_do_everything_after_first_iteration && (Globals::iteration_no > 0);
+
+ // initialize k-mer structures
+ Globals::kmer_data = new KMerData;
+
+ // count k-mers
+ if (cfg::get().count_do || do_everything) {
+ KMerDataCounter(cfg::get().count_numfiles).BuildKMerIndex(*Globals::kmer_data);
+
+ if (cfg::get().general_debug) {
+ INFO("Debug mode on. Dumping K-mer index");
+ std::string fname = hammer::getFilename(cfg::get().input_working_dir, Globals::iteration_no, "kmer.index");
+ std::ofstream os(fname.c_str(), std::ios::binary);
+ Globals::kmer_data->binary_write(os);
+ }
+ } else {
+ INFO("Reading K-mer index");
+ std::string fname = hammer::getFilename(cfg::get().input_working_dir, Globals::iteration_no, "kmer.index");
+ std::ifstream is(fname.c_str(), std::ios::binary);
+ VERIFY(is.good());
+ Globals::kmer_data->binary_read(is, fname);
+ }
+
+ // Cluster the Hamming graph
+ std::vector<std::vector<size_t> > classes;
+ if (cfg::get().hamming_do || do_everything) {
+ ConcurrentDSU uf(Globals::kmer_data->size());
+ std::string ham_prefix = hammer::getFilename(cfg::get().input_working_dir, Globals::iteration_no, "kmers.hamcls");
+ INFO("Clustering Hamming graph.");
+ if (cfg::get().general_tau > 1) {
+ KMerHamClusterer(cfg::get().general_tau).cluster(ham_prefix, *Globals::kmer_data, uf);
+ } else {
+ TauOneKMerHamClusterer().cluster(ham_prefix, *Globals::kmer_data, uf);
+ }
+
+ INFO("Extracting clusters");
+ size_t num_classes = uf.extract_to_file(hammer::getFilename(cfg::get().input_working_dir, Globals::iteration_no, "kmers.hamming"));
+
+#if 0
+ std::sort(classes.begin(), classes.end(), UfCmp());
+ for (size_t i = 0; i < classes.size(); ++i) {
+ std::cerr << i << ": { ";
+ for (size_t j = 0; j < classes[i].size(); ++j)
+ std::cerr << classes[i][j] << ", ";
+ std::cerr << "}" << std::endl;
+ }
+#endif
+ INFO("Clustering done. Total clusters: " << num_classes);
+ }
+
+ if (cfg::get().bayes_do || do_everything) {
+ KMerDataCounter(cfg::get().count_numfiles).FillKMerData(*Globals::kmer_data);
+
+ INFO("Subclustering Hamming graph");
+ unsigned clustering_nthreads = std::min(cfg::get().general_max_nthreads, cfg::get().bayes_nthreads);
+ KMerClustering kmc(*Globals::kmer_data, clustering_nthreads,
+ cfg::get().input_working_dir, cfg::get().general_debug);
+ kmc.process(hammer::getFilename(cfg::get().input_working_dir, Globals::iteration_no, "kmers.hamming"));
+ INFO("Finished clustering.");
+
+ if (cfg::get().general_debug) {
+ INFO("Debug mode on. Dumping K-mer index");
+ std::string fname = hammer::getFilename(cfg::get().input_working_dir, Globals::iteration_no, "kmer.index2");
+ std::ofstream os(fname.c_str(), std::ios::binary);
+ Globals::kmer_data->binary_write(os);
+ }
+ } else {
+ INFO("Reading K-mer index");
+ std::string fname = hammer::getFilename(cfg::get().input_working_dir, Globals::iteration_no, "kmer.index2");
+ std::ifstream is(fname.c_str(), std::ios::binary);
+ VERIFY(is.good());
+ Globals::kmer_data->binary_read(is, fname);
+ }
+
+ // expand the set of solid k-mers
+ if (cfg::get().expand_do || do_everything) {
+ unsigned expand_nthreads = std::min(cfg::get().general_max_nthreads, cfg::get().expand_nthreads);
+ INFO("Starting solid k-mers expansion in " << expand_nthreads << " threads.");
+ for (unsigned expand_iter_no = 0; expand_iter_no < cfg::get().expand_max_iterations; ++expand_iter_no) {
+ Expander expander(*Globals::kmer_data);
+ const io::DataSet<> &dataset = cfg::get().dataset;
+ for (auto I = dataset.reads_begin(), E = dataset.reads_end(); I != E; ++I) {
+ ireadstream irs(*I, cfg::get().input_qvoffset);
+ hammer::ReadProcessor rp(expand_nthreads);
+ rp.Run(irs, expander);
+ VERIFY_MSG(rp.read() == rp.processed(), "Queue unbalanced");
+ }
+
+ if (cfg::get().expand_write_each_iteration) {
+ std::ofstream oftmp(hammer::getFilename(cfg::get().input_working_dir, Globals::iteration_no, "goodkmers", expand_iter_no).data());
+ for (size_t n = 0; n < Globals::kmer_data->size(); ++n) {
+ const KMerStat &kmer_data = (*Globals::kmer_data)[n];
+ if (kmer_data.good())
+ oftmp << Globals::kmer_data->kmer(n).str() << "\n>" << n
+ << " cnt=" << kmer_data.count() << " tql=" << (1-kmer_data.total_qual) << "\n";
+ }
+ }
+
+ INFO("Solid k-mers iteration " << expand_iter_no << " produced " << expander.changed() << " new k-mers.");
+ if (expander.changed() < 10)
+ break;
+ }
+ INFO("Solid k-mers finalized");
+
+ if (cfg::get().general_debug) {
+ INFO("Debug mode on. Dumping K-mer index");
+ std::string fname = hammer::getFilename(cfg::get().input_working_dir, Globals::iteration_no, "kmer.index3");
+ std::ofstream os(fname.c_str(), std::ios::binary);
+ Globals::kmer_data->binary_write(os);
+ }
+ } else {
+ INFO("Reading K-mer index");
+ std::string fname = hammer::getFilename(cfg::get().input_working_dir, Globals::iteration_no, "kmer.index3");
+ std::ifstream is(fname.c_str(), std::ios::binary);
+ VERIFY(is.good());
+ Globals::kmer_data->binary_read(is, fname);
+ }
+
+ size_t totalReads = 0;
+ // reconstruct and output the reads
+ if (cfg::get().correct_do || do_everything) {
+ totalReads = hammer::CorrectAllReads();
+ }
+
+ // prepare the reads for next iteration
+ delete Globals::kmer_data;
+
+ if (totalReads < 1) {
+ INFO("Too few reads have changed in this iteration. Exiting.");
+ break;
+ }
+ // break;
+ }
+
+ std::string fname = hammer::getFilename(cfg::get().output_dir, "corrected.yaml");
+ INFO("Saving corrected dataset description to " << fname);
+ cfg::get_writable().dataset.save(fname);
+
+ // clean up
+ Globals::subKMerPositions->clear();
+ delete Globals::subKMerPositions;
+
+ INFO("All done. Exiting.");
+ } catch (std::bad_alloc const& e) {
+ std::cerr << "Not enough memory to run BayesHammer. " << e.what() << std::endl;
+ return EINTR;
+ } catch (std::exception const& e) {
+ std::cerr << "Exception caught " << e.what() << std::endl;
+ return EINTR;
+ } catch (const std::string& ex) {
+ std::cerr << "Exception caught: " << ex << std::endl;
+ } catch (const char* s) {
+ std::cerr << "Exception caught: " << s << std::endl;
+ } catch (...) {
+ std::cerr << "Unknown exception caught " << std::endl;
+ return EINTR;
+ }
+
+ return 0;
+}
diff --git a/src/hammer/misc/config.inp b/src/projects/hammer/misc/config.inp
similarity index 100%
rename from src/hammer/misc/config.inp
rename to src/projects/hammer/misc/config.inp
diff --git a/src/hammer/misc/confignohdd.inp b/src/projects/hammer/misc/confignohdd.inp
similarity index 100%
rename from src/hammer/misc/confignohdd.inp
rename to src/projects/hammer/misc/confignohdd.inp
diff --git a/src/hammer/misc/getresults.pl b/src/projects/hammer/misc/getresults.pl
similarity index 100%
rename from src/hammer/misc/getresults.pl
rename to src/projects/hammer/misc/getresults.pl
diff --git a/src/hammer/misc/memusg b/src/projects/hammer/misc/memusg
similarity index 100%
rename from src/hammer/misc/memusg
rename to src/projects/hammer/misc/memusg
diff --git a/src/hammer/misc/pretty_latex.pl b/src/projects/hammer/misc/pretty_latex.pl
similarity index 100%
rename from src/hammer/misc/pretty_latex.pl
rename to src/projects/hammer/misc/pretty_latex.pl
diff --git a/src/projects/hammer/parallel_radix_sort.hpp b/src/projects/hammer/parallel_radix_sort.hpp
new file mode 100644
index 0000000..6a99911
--- /dev/null
+++ b/src/projects/hammer/parallel_radix_sort.hpp
@@ -0,0 +1,592 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+// Copyright 2010, Takuya Akiba
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Takuya Akiba nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#ifndef PARALLEL_RADIX_SORT_H_
+#define PARALLEL_RADIX_SORT_H_
+
+#include "dev_support/openmp_wrapper.h"
+
+#include <stdint.h>
+#include <cstring>
+#include <cassert>
+#include <climits>
+#include <algorithm>
+#include <utility>
+
+namespace parallel_radix_sort {
+
+namespace internal {
+// Size of the software managed buffer
+const size_t kOutBufferSize = 32;
+
+// The algorithm is implemented in this internal class
+template<typename PlainType, typename UnsignedType, typename Encoder,
+ typename ValueManager, int Base>
+class ParallelRadixSortInternal {
+public:
+ ParallelRadixSortInternal();
+ ~ParallelRadixSortInternal();
+
+ void Init(size_t max_elems, int max_threads);
+
+ PlainType *Sort(PlainType *data, size_t num_elems, int num_threads,
+ ValueManager *value_manager);
+
+ static void InitAndSort(PlainType *data, size_t num_elems, int num_threads,
+ ValueManager *value_manager);
+private:
+ size_t max_elems_;
+ int max_threads_;
+
+ UnsignedType *tmp_;
+ size_t **histo_;
+ UnsignedType ***out_buf_;
+ size_t **out_buf_n_;
+
+ int num_threads_;
+ size_t *pos_bgn_, *pos_end_;
+ ValueManager *value_manager_;
+
+ void DeleteAll();
+
+ UnsignedType *SortInternal(UnsignedType *data, size_t num_elems,
+ int num_threads, ValueManager *value_manager);
+
+ // Compute |pos_bgn_| and |pos_end_| (associated ranges for each threads)
+ void ComputeRanges(size_t num_elems);
+
+ // First step of each iteration of sorting
+ // Compute the histogram of |src| using bits in [b, b + Base)
+ void ComputeHistogram(int b, UnsignedType *src);
+
+ // Second step of each iteration of sorting
+ // Scatter elements of |src| to |dst| using the histogram
+ void Scatter(int b, UnsignedType *src, UnsignedType *dst);
+};
+
+template<typename PlainType, typename UnsignedType, typename Encoder,
+ typename ValueManager, int Base>
+ParallelRadixSortInternal<PlainType, UnsignedType, Encoder, ValueManager, Base>
+::ParallelRadixSortInternal()
+ : max_elems_(0), max_threads_(0), tmp_(NULL), histo_(NULL),
+ out_buf_(NULL), out_buf_n_(NULL), pos_bgn_(NULL), pos_end_(NULL) {
+ assert(sizeof(PlainType) == sizeof(UnsignedType));
+}
+
+template<typename PlainType, typename UnsignedType, typename Encoder,
+ typename ValueManager, int Base>
+ParallelRadixSortInternal
+<PlainType, UnsignedType, Encoder, ValueManager, Base>
+::~ParallelRadixSortInternal() {
+ DeleteAll();
+}
+
+template<typename PlainType, typename UnsignedType, typename Encoder,
+ typename ValueManager, int Base>
+void ParallelRadixSortInternal
+<PlainType, UnsignedType, Encoder, ValueManager, Base>
+::DeleteAll() {
+ delete [] tmp_;
+ tmp_ = NULL;
+
+ for (int i = 0; i < max_threads_; ++i) delete [] histo_[i];
+ delete [] histo_;
+ histo_ = NULL;
+
+ for (int i = 0; i < max_threads_; ++i) {
+ for (size_t j = 0; j < 1 << Base; ++j) {
+ delete [] out_buf_[i][j];
+ }
+ delete [] out_buf_n_[i];
+ delete [] out_buf_[i];
+ }
+ delete [] out_buf_;
+ delete [] out_buf_n_;
+ out_buf_ = NULL;
+ out_buf_n_ = NULL;
+
+ delete [] pos_bgn_;
+ delete [] pos_end_;
+ pos_bgn_ = pos_end_ = NULL;
+
+ max_elems_ = 0;
+ max_threads_ = 0;
+}
+
+template<typename PlainType, typename UnsignedType, typename Encoder,
+ typename ValueManager, int Base>
+void ParallelRadixSortInternal
+<PlainType, UnsignedType, Encoder, ValueManager, Base>
+::Init(size_t max_elems, int max_threads) {
+ DeleteAll();
+
+ max_elems_ = max_elems;
+
+ if (max_threads == -1) {
+ max_threads = omp_get_max_threads();
+ }
+ assert(max_threads >= 1);
+ max_threads_ = max_threads;
+
+ tmp_ = new UnsignedType[max_elems];
+ histo_ = new size_t*[max_threads];
+ for (int i = 0; i < max_threads; ++i) {
+ histo_[i] = new size_t[1 << Base];
+ }
+
+ out_buf_ = new UnsignedType**[max_threads];
+ out_buf_n_ = new size_t*[max_threads];
+ for (int i = 0; i < max_threads; ++i) {
+ out_buf_[i] = new UnsignedType*[1 << Base];
+ out_buf_n_[i] = new size_t[1 << Base];
+ for (size_t j = 0; j < 1 << Base; ++j) {
+ out_buf_[i][j] = new UnsignedType[kOutBufferSize];
+ }
+ }
+
+ pos_bgn_ = new size_t[max_threads];
+ pos_end_ = new size_t[max_threads];
+}
+
+template<typename PlainType, typename UnsignedType, typename Encoder,
+ typename ValueManager, int Base>
+PlainType *ParallelRadixSortInternal
+<PlainType, UnsignedType, Encoder, ValueManager, Base>
+::Sort(PlainType *data, size_t num_elems,
+ int num_threads, ValueManager *value_manager) {
+ UnsignedType *src = reinterpret_cast<UnsignedType*>(data);
+ UnsignedType *res = SortInternal(src, num_elems, num_threads, value_manager);
+ return reinterpret_cast<PlainType*>(res);
+}
+
+template<typename PlainType, typename UnsignedType, typename Encoder,
+ typename ValueManager, int Base>
+void ParallelRadixSortInternal
+<PlainType, UnsignedType, Encoder, ValueManager, Base>
+::InitAndSort(PlainType *data, size_t num_elems,
+ int num_threads, ValueManager *value_manager) {
+ ParallelRadixSortInternal prs;
+ prs.Init(num_elems, num_threads);
+ const PlainType *res = prs.Sort(data, num_elems, num_threads, value_manager);
+ if (res != data) {
+ for (size_t i = 0; i < num_elems; ++i) data[i] = res[i];
+ }
+}
+
+template<typename PlainType, typename UnsignedType, typename Encoder,
+ typename ValueManager, int Base>
+UnsignedType *ParallelRadixSortInternal
+<PlainType, UnsignedType, Encoder, ValueManager, Base>
+::SortInternal(UnsignedType *data, size_t num_elems,
+ int num_threads, ValueManager *value_manager) {
+ assert(num_elems <= max_elems_);
+
+ if (num_threads == -1) {
+ num_threads = omp_get_max_threads();
+ }
+ assert(1 <= num_threads && num_threads <= max_threads_);
+ num_threads_ = num_threads;
+
+ value_manager_ = value_manager;
+
+ // Compute |pos_bgn_| and |pos_end_|
+ ComputeRanges(num_elems);
+
+ // Iterate from lower bits to higher bits
+ const unsigned bits = CHAR_BIT * sizeof(UnsignedType);
+ UnsignedType *src = data, *dst = tmp_;
+ for (unsigned b = 0; b < bits; b += Base) {
+ ComputeHistogram(b, src);
+ Scatter(b, src, dst);
+
+ std::swap(src, dst);
+ value_manager->Next();
+ }
+
+ return src;
+}
+
+template<typename PlainType, typename UnsignedType, typename Encoder,
+ typename ValueManager, int Base>
+void ParallelRadixSortInternal
+<PlainType, UnsignedType, Encoder, ValueManager, Base>
+::ComputeRanges(size_t num_elems) {
+ pos_bgn_[0] = 0;
+ for (int i = 0; i < num_threads_ - 1; ++i) {
+ const size_t t = (num_elems - pos_bgn_[i]) / (num_threads_ - i);
+ pos_bgn_[i + 1] = pos_end_[i] = pos_bgn_[i] + t;
+ }
+ pos_end_[num_threads_ - 1] = num_elems;
+}
+
+template<typename PlainType, typename UnsignedType, typename Encoder,
+ typename ValueManager, int Base>
+void ParallelRadixSortInternal
+<PlainType, UnsignedType, Encoder, ValueManager, Base>
+::ComputeHistogram(int b, UnsignedType *src) {
+ // Compute local histogram
+ #ifdef _OPENMP
+ #pragma omp parallel num_threads(num_threads_)
+ #endif
+ {
+ const int my_id = omp_get_thread_num();
+ const size_t my_bgn = pos_bgn_[my_id];
+ const size_t my_end = pos_end_[my_id];
+ size_t *my_histo = histo_[my_id];
+
+ memset(my_histo, 0, sizeof(size_t) * (1 << Base));
+ for (size_t i = my_bgn; i < my_end; ++i) {
+ __builtin_prefetch(src + i + 1, 0, 1);
+ size_t t = Encoder::extract(src[i], b, Base);
+ ++my_histo[t];
+ }
+ }
+
+ // Compute global histogram
+ size_t s = 0;
+ for (size_t i = 0; i < 1 << Base; ++i) {
+ for (int j = 0; j < num_threads_; ++j) {
+ const size_t t = s + histo_[j][i];
+ histo_[j][i] = s;
+ s = t;
+ }
+ }
+}
+
+template<typename PlainType, typename UnsignedType, typename Encoder,
+ typename ValueManager, int Base>
+void ParallelRadixSortInternal
+<PlainType, UnsignedType, Encoder, ValueManager, Base>
+::Scatter(int b, UnsignedType *src, UnsignedType *dst) {
+ #ifdef _OPENMP
+ #pragma omp parallel num_threads(num_threads_)
+ #endif
+ {
+ const int my_id = omp_get_thread_num();
+ const size_t my_bgn = pos_bgn_[my_id];
+ const size_t my_end = pos_end_[my_id];
+ size_t *my_histo = histo_[my_id];
+ UnsignedType **my_buf = out_buf_[my_id];
+ size_t *my_buf_n = out_buf_n_[my_id];
+
+ memset(my_buf_n, 0, sizeof(size_t) * (1 << Base));
+ for (size_t i = my_bgn; i < my_end; ++i) {
+ __builtin_prefetch(src + i + 1, 0, 1);
+
+ size_t t = Encoder::extract(src[i], b, Base);
+ my_buf[t][my_buf_n[t]] = src[i];
+ value_manager_->Push(my_id, t, my_buf_n[t], i);
+ ++my_buf_n[t];
+
+ if (my_buf_n[t] == kOutBufferSize) {
+ size_t p = my_histo[t];
+ for (size_t j = 0; j < kOutBufferSize; ++j) {
+ dst[p++] = my_buf[t][j];
+ }
+ value_manager_->Flush(my_id, t, kOutBufferSize, my_histo[t]);
+
+ my_histo[t] += kOutBufferSize;
+ my_buf_n[t] = 0;
+ }
+ }
+
+ // Flush everything
+ for (size_t i = 0; i < 1 << Base; ++i) {
+ size_t p = my_histo[i];
+ for (size_t j = 0; j < my_buf_n[i]; ++j) {
+ dst[p++] = my_buf[i][j];
+ }
+ value_manager_->Flush(my_id, i, my_buf_n[i], my_histo[i]);
+ }
+ }
+}
+} // namespace internal
+
+// Encoders map signed/unsigned integers and floating-point numbers to
+// unsigned integers whose natural ordering matches that of the original values
+namespace encoder {
+class EncoderUnsigned {
+public:
+ template<typename UnsignedType>
+ inline static size_t extract(const UnsignedType &x, unsigned shift, unsigned Base) {
+ return (x >> shift) & ((1 << Base) - 1);
+ }
+};
+
+class EncoderSigned {
+public:
+ template<typename UnsignedType>
+ inline static size_t extract(const UnsignedType &x, unsigned shift, unsigned Base) {
+ // flip the sign bit in a local copy; |x| is a const reference and must not be assigned to
+ const UnsignedType y = x ^ (UnsignedType(1) << (CHAR_BIT * sizeof(UnsignedType) - 1));
+ return (y >> shift) & ((1 << Base) - 1);
+ }
+};
+
+class EncoderDecimal {
+public:
+ template<typename UnsignedType>
+ inline static size_t extract(const UnsignedType &x, unsigned shift, unsigned Base) {
+ static const int bits = CHAR_BIT * sizeof(UnsignedType);
+ // remap the IEEE-754 bit pattern in a local copy; |x| is a const reference
+ // and must not be assigned to
+ const UnsignedType a = x >> (bits - 1);
+ const UnsignedType b = (-a) | (UnsignedType(1) << (bits - 1));
+ const UnsignedType y = x ^ b;
+ return (y >> shift) & ((1 << Base) - 1);
+ }
+};
+} // namespace encoder
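+
+// Illustration of the encoders (added commentary, not part of the original
+// source). With Base = 8, extract() returns one 8-bit digit of the key;
+// EncoderSigned and EncoderDecimal first remap the bit pattern so that the
+// unsigned digit order matches the signed/floating-point order:
+//
+//   size_t d0 = encoder::EncoderUnsigned::extract(uint32_t(0x1234), 0, 8); // 0x34
+//   size_t d1 = encoder::EncoderUnsigned::extract(uint32_t(0x1234), 8, 8); // 0x12
+//   // EncoderSigned flips the sign bit, mapping INT_MIN..INT_MAX onto
+//   // 0..UINT_MAX, so e.g. -1 encodes below 0 and therefore sorts before it.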
+
+// Value managers are used to generalize the sorting algorithm
+// to sorting of keys and sorting of pairs
+namespace value_manager {
+class DummyValueManager {
+public:
+ inline void Push(int thread __attribute__((unused)),
+ size_t bucket __attribute__((unused)),
+ size_t num __attribute__((unused)),
+ size_t from_pos __attribute__((unused))) {}
+
+ inline void Flush(int thread __attribute__((unused)),
+ size_t bucket __attribute__((unused)),
+ size_t num __attribute__((unused)),
+ size_t to_pos __attribute__((unused))) {}
+
+ void Next() {}
+};
+
+template<typename ValueType, int Base> class PairValueManager {
+public:
+ PairValueManager()
+ : max_elems_(0), max_threads_(0), original_(NULL), tmp_(NULL),
+ src_(NULL), dst_(NULL), out_buf_(NULL) {}
+
+ ~PairValueManager() {
+ DeleteAll();
+ }
+
+ void Init(size_t max_elems, int max_threads);
+
+ void Start(ValueType *original, size_t num_elems, int num_threads) {
+ assert(num_elems <= max_elems_);
+ assert(num_threads <= max_threads_);
+ src_ = original_ = original;
+ dst_ = tmp_;
+ }
+
+ inline void Push(int thread, size_t bucket, size_t num, size_t from_pos) {
+ out_buf_[thread][bucket][num] = src_[from_pos];
+ }
+
+ inline void Flush(int thread, size_t bucket, size_t num, size_t to_pos) {
+ for (size_t i = 0; i < num; ++i) {
+ dst_[to_pos++] = out_buf_[thread][bucket][i];
+ }
+ }
+
+ void Next() {
+ std::swap(src_, dst_);
+ }
+
+ ValueType *GetResult() {
+ return src_;
+ }
+private:
+ size_t max_elems_;
+ int max_threads_;
+
+ static const size_t kOutBufferSize = internal::kOutBufferSize;
+ ValueType *original_, *tmp_;
+ ValueType *src_, *dst_;
+ ValueType ***out_buf_;
+
+ void DeleteAll();
+};
+
+template<typename ValueType, int Base>
+void PairValueManager<ValueType, Base>
+::Init(size_t max_elems, int max_threads) {
+ if (max_threads == -1) {
+ max_threads = omp_get_max_threads();
+ }
+ assert(max_threads >= 1);
+
+ DeleteAll();
+
+ max_elems_ = max_elems;
+ max_threads_ = max_threads;
+
+ tmp_ = new ValueType[max_elems];
+
+ out_buf_ = new ValueType**[max_threads];
+ for (int i = 0; i < max_threads; ++i) {
+ out_buf_[i] = new ValueType*[1 << Base];
+ for (size_t j = 0; j < 1 << Base; ++j) {
+ out_buf_[i][j] = new ValueType[kOutBufferSize];
+ }
+ }
+}
+
+template<typename ValueType, int Base>
+void PairValueManager<ValueType, Base>
+::DeleteAll() {
+ delete [] tmp_;
+ tmp_ = NULL;
+
+ for (int i = 0; i < max_threads_; ++i) {
+ for (size_t j = 0; j < 1 << Base; ++j) {
+ delete [] out_buf_[i][j];
+ }
+ delete [] out_buf_[i];
+ }
+ delete [] out_buf_;
+ out_buf_ = NULL;
+
+ max_elems_ = 0;
+ max_threads_ = 0;
+}
+} // namespace value_manager
+
+// Frontend class for sorting keys
+template<typename PlainType, typename UnsignedType = PlainType,
+ typename Encoder = encoder::EncoderUnsigned, int Base = 8>
+class KeySort {
+ typedef value_manager::DummyValueManager DummyValueManager;
+ typedef internal::ParallelRadixSortInternal
+ <PlainType, UnsignedType, Encoder, DummyValueManager, Base> Internal;
+
+public:
+ // In the following functions, when |max_threads| or |num_threads| is -1,
+ // the default value provided by OpenMP is used.
+ void Init(size_t max_elems, int max_threads = -1) {
+ internal_.Init(max_elems, max_threads);
+ }
+
+ // Note that the pointer returned by Sort()
+ // is not necessarily equal to |data|.
+ PlainType *Sort(PlainType *data, size_t num_elems, int num_threads = -1) {
+ return internal_.Sort(data, num_elems, num_threads, &dummy_value_manager_);
+ }
+
+ static void InitAndSort(PlainType *data, size_t num_elems, int num_threads = -1) {
+ DummyValueManager dvm;
+ Internal::InitAndSort(data, num_elems, num_threads, &dvm);
+ }
+private:
+ Internal internal_;
+ DummyValueManager dummy_value_manager_;
+};
+
+// Frontend class for sorting pairs
+template<typename PlainType, typename ValueType,
+ typename UnsignedType = PlainType,
+ typename Encoder = encoder::EncoderUnsigned,
+ int Base = 8>
+class PairSort {
+ typedef value_manager::PairValueManager
+ <ValueType, Base> ValueManager;
+ typedef internal::ParallelRadixSortInternal
+ <PlainType, UnsignedType, Encoder, ValueManager, Base> Internal;
+
+public:
+ // In the following functions, when |max_threads| or |num_threads| is -1,
+ // the default value provided by OpenMP is used.
+ void Init(size_t max_elems, int max_threads = -1) {
+ internal_.Init(max_elems, max_threads);
+ value_manager_.Init(max_elems, max_threads);
+ }
+
+ // Note that the pointers returned by Sort()
+ // are not necessarily equal to |keys| and |vals|.
+ std::pair<PlainType*, ValueType*> Sort(PlainType *keys, ValueType *vals,
+ size_t num_elems, int num_threads = -1) {
+ value_manager_.Start(vals, num_elems, num_threads);
+ PlainType *res_keys = internal_.Sort(keys, num_elems, num_threads, &value_manager_);
+ ValueType *res_vals = value_manager_.GetResult();
+ return std::make_pair(res_keys, res_vals);
+ }
+
+ static void InitAndSort(PlainType *keys, ValueType *vals,
+ size_t num_elems, int num_threads = -1) {
+ ValueManager vm;
+ vm.Init(num_elems, num_threads);
+ vm.Start(vals, num_elems, num_threads);
+ Internal::InitAndSort(keys, num_elems, num_threads, &vm);
+ ValueType *res_vals = vm.GetResult();
+ if (res_vals != vals) {
+ for (size_t i = 0; i < num_elems; ++i) {
+ vals[i] = res_vals[i];
+ }
+ }
+ }
+private:
+ Internal internal_;
+ ValueManager value_manager_;
+};
+
+#define TYPE_CASE(plain_type, unsigned_type, encoder_type) \
+ template<> class KeySort<plain_type> \
+ : public KeySort<plain_type, unsigned_type, \
+ encoder::Encoder ## encoder_type> {}; \
+ template<typename V> class PairSort<plain_type, V> \
+ : public PairSort<plain_type, V, unsigned_type, \
+ encoder::Encoder ## encoder_type> {}; \
+
+// Signed integers
+TYPE_CASE(char, unsigned char, Signed);
+TYPE_CASE(short, unsigned short, Signed);
+TYPE_CASE(int, unsigned int, Signed);
+TYPE_CASE(long, unsigned long, Signed);
+TYPE_CASE(long long, unsigned long long, Signed);
+
+// |signed char| and |char| are treated as different types
+TYPE_CASE(signed char, unsigned char, Signed);
+
+// Floating point numbers
+TYPE_CASE(float, uint32_t, Decimal);
+TYPE_CASE(double, uint64_t, Decimal);
+
+#undef TYPE_CASE
+
+template<typename KeyType>
+void SortKeys(KeyType *data, size_t num_elems, int num_threads = -1) {
+ KeySort<KeyType>::InitAndSort(data, num_elems, num_threads);
+}
+
+template<typename KeyType, typename ValueType>
+void SortPairs(KeyType *keys, ValueType *vals, size_t num_elems, int num_threads = -1) {
+ PairSort<KeyType, ValueType>::InitAndSort(keys, vals, num_elems, num_threads);
+}
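+
+// Usage sketch (added commentary, not part of the original source):
+// SortKeys() sorts an array of keys in place from the caller's point of
+// view, and SortPairs() additionally reorders the associated values;
+// num_threads = -1 lets OpenMP choose the number of threads.
+//
+//   uint64_t keys[] = {42, 7, 19};
+//   int      vals[] = { 0, 1,  2};
+//   parallel_radix_sort::SortPairs(keys, vals, 3);
+//   // keys is now {7, 19, 42} and vals is {1, 2, 0}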
+} // namespace parallel_radix_sort
+
+#endif // PARALLEL_RADIX_SORT_H_
diff --git a/src/hammer/quake_correct/CMakeLists.txt b/src/projects/hammer/quake_correct/CMakeLists.txt
similarity index 100%
rename from src/hammer/quake_correct/CMakeLists.txt
rename to src/projects/hammer/quake_correct/CMakeLists.txt
diff --git a/src/projects/hammer/quake_correct/Read.cpp b/src/projects/hammer/quake_correct/Read.cpp
new file mode 100644
index 0000000..c8d13eb
--- /dev/null
+++ b/src/projects/hammer/quake_correct/Read.cpp
@@ -0,0 +1,824 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#include "Read.h"
+#include "bithash.h"
+#include <iostream>
+#include <math.h>
+#include <algorithm>
+#include <set>
+#include <queue>
+
+#define TESTING false
+
+int bithash::k;
+
+////////////////////////////////////////////////////////////
+// corrections_compare
+//
+// Simple comparator for two corrected_reads in the
+// priority queue
+////////////////////////////////////////////////////////////
+class corrections_compare {
+public:
+ // corrections_compare() {};
+ bool operator() (const corrected_read* lhs, const corrected_read* rhs) const {
+ //return lhs->likelihood < rhs->likelihood;
+ if(lhs->likelihood < rhs->likelihood)
+ return true;
+ else if(lhs->likelihood > rhs->likelihood)
+ return false;
+ else
+ return lhs->region_edits > rhs->region_edits;
+ }
+};
+
+const float Read::trust_spread_t = .1;
+const float Read::correct_min_t = .000001;
+const float Read::learning_min_t = .00001;
+
+////////////////////////////////////////////////////////////
+// Read (constructor)
+//
+// Make shallow copies of sequence and untrusted, and
+// convert quality value string to array of probabilities
+////////////////////////////////////////////////////////////
+Read::Read(const string & h, const unsigned int* s, const string & q, vector<int> & u, const int rl)
+ :untrusted(u) {
+
+ header = h;
+ read_length = rl;
+ trim_length = rl;
+ seq = new unsigned int[read_length];
+ quals = new unsigned int[read_length];
+ prob = new float[read_length];
+ for(int i = 0; i < read_length; i++) {
+ seq[i] = s[i];
+ // quality values of 0,1 lead to p < .25
+ quals[i] = q[i] - quality_scale;
+ if(quals[i] >= max_qual) {
+ cerr << "Quality value " << quals[i] << "larger than maximum allowed quality value " << max_qual << ". Increase the variable 'max_qual' in Read.h." << endl;
+ exit(EXIT_FAILURE);
+ }
+ prob[i] = max(.25, 1.0-pow(10.0,-(quals[i]/10.0)));
+ }
+ trusted_read = 0;
+ global_like = 1.0;
+}
+
+Read::~Read() {
+ delete[] seq;
+ delete[] quals;
+ delete[] prob;
+ if(trusted_read != 0)
+ delete trusted_read;
+}
+
+////////////////////////////////////////////////////////////
+// trim
+//
+// Trim the end of the read the way BWA does it.
+// Removes affected untrusted k-mers.
+// Returns the trimmed read as a string.
+////////////////////////////////////////////////////////////
+string Read::trim(int t) {
+ // find trim index
+ int phredq;
+ int current_trimfunc = 0;
+ int max_trimfunc = 0;
+ trim_length = read_length; // already set in constructor but ok
+ for(int i = read_length-1; i >= 0; i--) {
+ phredq = floor(.5-10*log(1.0 - prob[i])/log(10));
+ current_trimfunc += (t - phredq);
+ if(current_trimfunc > max_trimfunc) {
+ max_trimfunc = current_trimfunc;
+ trim_length = i;
+ }
+ }
+
+ // update untrusted
+ for(int i = untrusted.size()-1; i >= 0; i--) {
+ if(untrusted[i] > trim_length - bithash::k)
+ untrusted.pop_back();
+ }
+
+ vector<correction> no_cors;
+ return print_corrected(no_cors);
+}
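+
+// Worked example (added commentary, not part of the original source): with
+// t = 10 and Phred qualities ..., 40, 2, 2 at the 3' end, the running sum of
+// (t - q) from the end is 8, then 16, then 16 - 30 = -14; the maximum is
+// reached two bases from the end, so trim_length drops the last two
+// low-quality bases.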
+
+
+////////////////////////////////////////////////////////////
+// single_correct
+//
+// Find the set of corrections with maximum likelihood
+// that result in all trusted kmers.
+//
+// Assumes a short read, so it is now obsolete.
+////////////////////////////////////////////////////////////
+/*
+bool Read::single_correct(bithash *trusted, ofstream & out, double (&ntnt_prob)[4][4], bool learning) {
+ if(correct_subset(untrusted, trusted, out, ntnt_prob, learning)) {
+ out << header << "\t" << print_seq() << "\t" << print_corrected(trusted_read->corrections) << endl;
+ return true;
+ } else
+ return false;
+}
+*/
+
+////////////////////////////////////////////////////////////
+// correct_cc
+//
+// Find the set of corrections with maximum likelihood that
+// result in all trusted kmers in the region defined by
+// the given untrusted kmer indices.
+//
+// Prints output if the read cannot be corrected,
+// but otherwise abstains.
+//
+// Corrections can be accessed through 'trusted_read'
+//
+// Return codes are:
+// 0: corrected
+// 1: ambiguous
+// 2: full queue or quit early
+// 3: empty queue or empty region
+////////////////////////////////////////////////////////////
+//bool Read::correct_cc(vector<short> region, vector<int> untrusted_subset, bithash *trusted, double (&ntnt_prob)[4][4], double prior_prob[4], bool learning) {
+int Read::correct_cc(vector<short> region, vector<int> untrusted_subset, bithash *trusted, double ntnt_prob[][4][4], double prior_prob[4], bool learning) {
+
+ unsigned int max_queue_size = 400000;
+
+ /*
+ if(header == "@read3") {
+ cout << "Untrusted: " << untrusted_subset.size() << endl;
+ for(int i = 0; i < untrusted_subset.size(); i++)
+ cout << untrusted_subset[i] << " ";
+ cout << endl;
+ cout << "Region: " << region.size() << endl;
+ for(int i = 0; i < region.size(); i++)
+ cout << region[i] << " ";
+ cout << endl << endl;
+ }
+ */
+
+ ////////////////////////////////////////
+ // region
+ ////////////////////////////////////////
+ // sort by quality
+ if(region.size() > 0)
+ quality_quicksort(region, 0, region.size()-1);
+ else
+ // die quietly and try again with bigger region
+ return 3;
+
+ ////////////////////////////////////////
+ // stats
+ ////////////////////////////////////////
+ unsigned int cpq_adds = 0;
+ unsigned int check_count = 0;
+ float exp_errors = 0;
+ int nt90 = 0;
+ int nt99 = 0;
+ int non_acgt = 0;
+ for(int i = 0; i < region.size(); i++) {
+ exp_errors += (1-prob[region[i]]);
+ if(prob[region[i]] < .9)
+ nt90++;
+ if(prob[region[i]] < .99)
+ nt99++;
+ if(seq[region[i]] >= 4)
+ non_acgt++;
+ }
+
+ ////////////////////////////////////////
+ // filter
+ ////////////////////////////////////////
+ double mylike_t = correct_min_t;
+ double myglobal_t = correct_min_t;
+ double myspread_t = trust_spread_t;
+ if(learning) {
+ if(nt99 >= 8 || non_acgt > 1) {
+ //out << header << "\t" << print_seq() << "\t." << endl;
+ return 2;
+ }
+ mylike_t = learning_min_t;
+ myglobal_t = learning_min_t;
+ myspread_t = trust_spread_t / 2.0;
+ } else if(nt99 >= 13 || non_acgt > 2) {
+ // just quit
+ if(TESTING)
+ cerr << header << "\t" << print_seq() << "\t." << endl;
+ //cerr << header << "\t" << region.size() << "\t" << untrusted_subset.size() << "\t" << nt90 << "\t" << nt99 << "\t" << exp_errors << "\t0\t0\t0\t0" << endl;
+ return 2;
+
+ } else if(nt99 >= 11) {
+ // proceed very cautiously
+ if(aggressive)
+ mylike_t = .05;
+ else
+ mylike_t = .1;
+
+ } else if(nt99 >= 9) {
+ //proceed cautiously
+ if(aggressive)
+ mylike_t = .001;
+ else
+ mylike_t = .03;
+ }
+
+ ////////////////////////////////////////
+ // priority queue
+ ////////////////////////////////////////
+ // data structure for corrected_reads sorted by likelihood
+ //priority_queue< corrected_read*, vector<corrected_read*>, corrections_compare > cpq;
+ vector<corrected_read*> cpq;
+ corrections_compare cpq_comp;
+
+ ////////////////////////////////////////
+ // initialize
+ ////////////////////////////////////////
+ corrected_read *cr, *next_cr;
+ short edit_i;
+ float like;
+ bitset<bitsize> bituntrusted;
+ for(int i = 0; i < untrusted_subset.size(); i++) {
+ if(untrusted_subset[i] >= bitsize) {
+ cerr << "These reads must be longer than assumed. Increase the variable 'bitsize' in 'Read.h' to the read length." << endl;
+ exit(1);
+ } else
+ bituntrusted.set(untrusted_subset[i]);
+ }
+
+ bool cr_added = true; // once an iteration passes w/ no corrected reads added, we can stop
+ for(short region_edit = 0; region_edit < region.size() && cr_added; region_edit++) {
+ edit_i = region[region_edit];
+ cr_added = false;
+
+ for(short nt = 0; nt < 4; nt++) {
+ if(seq[edit_i] != nt) {
+ // P(obs=o|actual=a)*P(actual=a) for Bayes
+ if(seq[edit_i] < 4)
+ like = (1.0-prob[edit_i]) * ntnt_prob[quals[edit_i]][nt][seq[edit_i]] * prior_prob[nt] / (prob[edit_i] * prior_prob[seq[edit_i]]);
+ else
+ // non-ACGT
+ like = prior_prob[nt] / (1.0/3.0);
+
+ // P(actual=a|obs=o)
+ //like = (1.0-prob[edit_i]) * ntnt_prob[seq[edit_i]][nt] * / prob[edit_i];
+
+ next_cr = new corrected_read(bituntrusted, like, region_edit+1);
+ next_cr->corrections.push_back(correction(edit_i, nt));
+
+ // add to priority queue
+ //cpq.push(next_cr);
+ cpq.push_back(next_cr);
+ push_heap(cpq.begin(), cpq.end(), cpq_comp);
+ cpq_adds++;
+ cr_added = true;
+ }
+ }
+ }
+
+ ////////////////////////////////////////
+ // process corrected reads
+ ////////////////////////////////////////
+ // initialize likelihood parameters
+ trusted_read = 0;
+ float trusted_likelihood;
+ signed int untrusted_count; // trust me
+ bool ambiguous_flag = false;
+
+ while(cpq.size() > 0) {
+
+ /////////////////////////
+ // quit if pq is too big
+ /////////////////////////
+ if(cpq.size() > max_queue_size) {
+ //cout << "queue is too large for " << header << endl;
+ if(TESTING)
+ cerr << header << "\t" << print_seq() << "\t." << endl;
+
+ if(trusted_read != 0) {
+ delete trusted_read;
+ trusted_read = 0;
+ }
+ break;
+ }
+
+ /////////////////////////
+ // pop next
+ /////////////////////////
+ cr = cpq[0];
+ pop_heap(cpq.begin(), cpq.end(), cpq_comp);
+ cpq.pop_back();
+
+ /////////////////////////
+ // check likelihood
+ /////////////////////////
+ if(trusted_read != 0) {
+ // a corrected read already exists; if this candidate's likelihood is too low, break out of the loop
+ if(cr->likelihood < trusted_likelihood*myspread_t) {
+ delete cr;
+ break;
+ }
+ } else {
+ // no corrected read exists yet; if the likelihood is too low, break out of the loop
+ if(cr->likelihood < mylike_t || global_like*cr->likelihood < myglobal_t) {
+ delete cr;
+ break;
+ }
+ }
+
+ /////////////////////////
+ // check trust
+ /////////////////////////
+ // save for later comparison
+ untrusted_count = (signed int)cr->untrusted.count();
+ if(check_trust(cr, trusted, check_count)) {
+ if(trusted_read == 0) {
+ // if yes, and first trusted read, save
+ trusted_read = cr;
+ trusted_likelihood = cr->likelihood;
+ } else {
+ // if yes, and if trusted read exists
+ ambiguous_flag = true;
+
+ // output ambiguous corrections for testing
+ if(TESTING)
+ cerr << header << "\t" << print_seq() << "\t" << print_corrected(trusted_read->corrections) << "\t" << print_corrected(cr->corrections) << endl;
+
+ // delete trusted_read, break loop
+ delete trusted_read;
+ delete cr;
+ trusted_read = 0;
+ break;
+ }
+ }
+
+ /*
+ if(header == "@read3") {
+ cout << cr->likelihood << "\t";
+ for(int c = 0; c < cr->corrections.size(); c++) {
+ cout << " (" << cr->corrections[c].index << "," << cr->corrections[c].to << ")";
+ }
+ cout << "\t";
+ for(int c = 0; c < trim_length-bithash::k+1; c++) {
+ if(cr->untrusted[c])
+ cout << 1;
+ else
+ cout << 0;
+ }
+ cout << endl;
+ }
+ */
+
+ // only extend this candidate with further corrections if the untrusted count did not sharply increase; otherwise just bail
+ if(((signed int)cr->untrusted.count() - untrusted_count)*3 < bithash::k) {
+
+ /////////////////////////
+ // add next correction
+ /////////////////////////
+ bool cr_added = true; // once an iteration passes w/ no corrected reads added, we can stop
+ for(short region_edit = cr->region_edits; region_edit < region.size() && cr_added; region_edit++) {
+ edit_i = region[region_edit];
+ cr_added = false;
+
+ // add relatives
+ for(short nt = 0; nt < 4; nt++) {
+ // if actual edit,
+ if(seq[edit_i] != nt) {
+ // calculate new likelihood
+
+ // P(obs=o|actual=a)*P(actual=a) for Bayes
+ if(seq[edit_i] < 4)
+ like = cr->likelihood * (1.0-prob[edit_i]) * ntnt_prob[quals[edit_i]][nt][seq[edit_i]] * prior_prob[nt] / (prob[edit_i] * prior_prob[seq[edit_i]]);
+ else
+ // non-ACGT
+ like = cr->likelihood * prior_prob[nt] / (1.0/3.0);
+
+ // P(actual=a|obs=o)
+ //like = cr->likelihood * (1.0-prob[edit_i]) * ntnt_prob[seq[edit_i]][nt] / prob[edit_i];
+
+ // if thresholds ok, add new correction
+ if(trusted_read != 0) {
+ if(like < trusted_likelihood*myspread_t)
+ continue;
+ } else {
+ // must consider spread or risk missing a case of ambiguity
+ if(like < mylike_t*myspread_t || global_like*like < myglobal_t*myspread_t)
+ continue;
+ }
+
+ next_cr = new corrected_read(cr->corrections, cr->untrusted, like, region_edit+1);
+ next_cr->corrections.push_back(correction(edit_i, nt));
+
+ // add to priority queue
+ cpq.push_back(next_cr);
+ push_heap(cpq.begin(), cpq.end(), cpq_comp);
+ cpq_adds++;
+ cr_added = true;
+ }
+ }
+ }
+ }
+
+ // if not the saved max trusted, delete
+ if(trusted_read != cr) {
+ delete cr;
+ }
+ }
+
+ // clean up priority queue
+ for(int i = 0; i < cpq.size(); i++)
+ delete cpq[i];
+
+ if(trusted_read != 0) {
+ //cerr << header << "\t" << region.size() << "\t" << untrusted_subset.size() << "\t" << nt90 << "\t" << nt99 << "\t" << exp_errors << "\t" << cpq_adds << "\t" << check_count << "\t1\t" << trusted_read->likelihood << endl;
+ return 0;
+ } else {
+ if(TESTING && mylike_t > correct_min_t)
+ cerr << header << "\t" << print_seq() << "\t." << endl;
+ //cerr << header << "\t" << region.size() << "\t" << untrusted_subset.size() << "\t" << nt90 << "\t" << nt99 << "\t" << exp_errors << "\t" << cpq_adds << "\t" << check_count << "\t0\t0" << endl;
+
+ if(ambiguous_flag)
+ return 1;
+ else if(cpq.size() > max_queue_size)
+ return 2;
+ else
+ return 3;
+ }
+}
+
+////////////////////////////////////////////////////////////
+// print_seq
+////////////////////////////////////////////////////////////
+string Read::print_seq() {
+ char nts[5] = {'A','C','G','T','N'};
+ string sseq;
+ for(int i = 0; i < read_length; i++)
+ sseq.push_back(nts[seq[i]]);
+ return sseq;
+}
+
+////////////////////////////////////////////////////////////
+// print_corrected
+//
+// Print read with corrections and trimming.
+////////////////////////////////////////////////////////////
+string Read::print_corrected(vector<correction> & cor) {
+ return print_corrected(cor, trim_length);
+}
+string Read::print_corrected(vector<correction> & cor, int print_nt) {
+ char nts[5] = {'A','C','G','T','N'};
+ string sseq;
+ int correct_i;
+ for(int i = 0; i < print_nt; i++) {
+ correct_i = -1;
+ for(int c = 0; c < cor.size(); c++) {
+ if(cor[c].index == i)
+ correct_i = c;
+ }
+ if(correct_i != -1)
+ sseq.push_back(nts[cor[correct_i].to]);
+ else
+ sseq.push_back(nts[seq[i]]);
+ }
+ return sseq;
+}
+
+
+////////////////////////////////////////////////////////////
+// correct
+//
+// Perform correction by breaking up untrusted kmers
+// into connected components and correcting them
+// independently.
+////////////////////////////////////////////////////////////
+//string Read::correct(bithash *trusted, double (&ntnt_prob)[4][4], double prior_prob[4], bool learning) {
+string Read::correct(bithash *trusted, double ntnt_prob[][4][4], double prior_prob[4], bool learning) {
+ ////////////////////////////////////////
+ // find connected components
+ ////////////////////////////////////////
+ vector< vector<int> > cc_untrusted;
+
+ // add first
+ cc_untrusted.push_back(vector<int>());
+ int cc = 0;
+ cc_untrusted[cc].push_back(untrusted[0]);
+
+ for(int i = 1; i < untrusted.size(); i++) {
+ // if kmer from last untrusted doesn't reach next
+ if(untrusted[i-1]+bithash::k-1 < untrusted[i]) {
+ cc++;
+ cc_untrusted.push_back(vector<int>());
+ }
+ cc_untrusted[cc].push_back(untrusted[i]);
+ }
+
+ ////////////////////////////////////////
+ // process connected components
+ ////////////////////////////////////////
+ vector<correction> multi_cors;
+ vector<short> chop_region;
+ vector<short> big_region;
+ int chop_correct_code, big_correct_code;
+ for(cc = 0; cc < cc_untrusted.size(); cc++) {
+ // try chopped error region
+ chop_region = error_region_chop(cc_untrusted[cc]);
+ chop_correct_code = correct_cc(chop_region, cc_untrusted[cc], trusted, ntnt_prob, prior_prob, learning);
+ if(chop_correct_code > 0) {
+ // try bigger error region
+ big_region = error_region(cc_untrusted[cc]);
+ if(chop_region.size() == big_region.size()) {
+ // cannot correct, and nothing found so trim to untrusted
+ if(chop_correct_code == 1)
+ return print_corrected(multi_cors, chop_region.front());
+ else
+ return print_corrected(multi_cors, cc_untrusted[cc].front());
+
+ } else {
+ big_correct_code = correct_cc(big_region, cc_untrusted[cc], trusted, ntnt_prob, prior_prob, learning);
+
+ if(big_correct_code == 1) {
+ // ambiguous
+ // cannot correct, but trim to region
+ if(chop_correct_code == 1)
+ return print_corrected(multi_cors, chop_region.front());
+ else
+ return print_corrected(multi_cors, big_region.front());
+
+ } else if(big_correct_code == 2 || big_correct_code == 3) {
+ // cannot correct, and chaotic or nothing found so trim to untrusted
+ return print_corrected(multi_cors, cc_untrusted[cc].front());
+ }
+ }
+ }
+ // else, corrected!
+
+ // corrected
+ global_like *= trusted_read->likelihood;
+
+ // store
+ for(int c = 0; c < trusted_read->corrections.size(); c++)
+ multi_cors.push_back(trusted_read->corrections[c]);
+ }
+
+ // create new trusted read (mostly for learn_errors)
+ corrected_read * tmp = trusted_read;
+ trusted_read = new corrected_read(multi_cors, tmp->untrusted, global_like, 0);
+ delete tmp;
+
+ // print read with all corrections
+ return print_corrected(multi_cors);
+}
+
+
+////////////////////////////////////////////////////////////
+// error_region
+//
+// Find region of the read to consider for errors based
+// on the pattern of untrusted kmers
+////////////////////////////////////////////////////////////
+vector<short> Read::error_region(vector<int> untrusted_subset) {
+ // find intersection, or union
+ vector<short> region;
+ if(!untrusted_intersect(untrusted_subset, region))
+ untrusted_union(untrusted_subset, region);
+
+ // if front kmer can reach region, there may be more
+ // errors in the front
+ short f = region.front();
+ short b = region.back();
+
+ if(bithash::k-1 >= f) {
+ // extend to front
+ for(short i = f-1; i >= 0; i--)
+ region.push_back(i);
+ }
+ if(trim_length-bithash::k <= b) {
+ // extend to back
+ for(short i = b+1; i < trim_length; i++)
+ region.push_back(i);
+ }
+
+ return region;
+}
+
+////////////////////////////////////////////////////////////
+// error_region_chop
+//
+// Find region of the read to consider for errors based
+// on the pattern of untrusted kmers, using trusted kmers
+// to further trim the area.
+////////////////////////////////////////////////////////////
+vector<short> Read::error_region_chop(vector<int> untrusted_subset) {
+ // find intersection, or union
+ vector<short> region;
+ if(!untrusted_intersect(untrusted_subset, region))
+ untrusted_union(untrusted_subset, region);
+
+ // fix front
+ int right_leftkmer = untrusted_subset.front()-1;
+ if(right_leftkmer >= 0) {
+ // erase all bp in rightmost left kmer
+ vector<short> front_chop(region);
+ region.clear();
+ for(int i = 0; i < front_chop.size(); i++) {
+ if(front_chop[i] > right_leftkmer+bithash::k-1)
+ region.push_back(front_chop[i]);
+ }
+
+ // add back 1 base if it's low quality, or lower quality than the next base
+ for(int er = 0; er < expand_region; er++) {
+ int pre_region = region[0] - (er+1);
+ if(pre_region >= 0 && (prob[pre_region] < .99 || prob[pre_region] <= prob[pre_region+1])) {
+ vector<short>::iterator it;
+ it = region.begin();
+ region.insert(it, pre_region);
+ }
+ }
+ } else {
+ // extend to front
+ for(int i = region[0]-1; i >= 0; i--)
+ region.push_back(i);
+ }
+
+ // fix back
+ int left_rightkmer = untrusted_subset.back()+1;
+ if(left_rightkmer+bithash::k-1 < trim_length) {
+ // erase all bp in leftmost right kmer
+ vector<short> back_chop(region);
+ region.clear();
+ for(int i = 0; i < back_chop.size(); i++) {
+ if(back_chop[i] < left_rightkmer)
+ region.push_back(back_chop[i]);
+ }
+
+ // add back 1 base if it's low quality, or lower quality than the next base
+ // Two issues with this:
+ // 1. I think region could be empty, so there's a bug
+ // 2. This won't help for errors in the middle of a read that are missing an untrusted kmer
+ // because the region will be empty, and we'll just try the intersection.
+ /*
+ for(int er = 0; er < expand_region; er++) {
+ int post_region = region.back() + (er+1);
+ if(post_region < trim_length && (prob[post_region] < .99 || prob[post_region] <= prob[post_region-1])) {
+ region.push_back(post_region);
+ }
+ }
+ */
+
+ } else {
+ // extend to back
+ for(int i = region.back()+1; i < trim_length; i++)
+ region.push_back(i);
+ }
+
+ return region;
+}
+
+////////////////////////////////////////////////////////////
+// untrusted_intersect
+//
+// Compute the intersection of the untrusted kmers as
+// [start, end]; return true if it is non-empty and
+// false otherwise
+////////////////////////////////////////////////////////////
+bool Read::untrusted_intersect(vector<int> untrusted_subset, vector<short> & region) {
+ int start = 0;
+ int end = read_length-1;
+
+ int u;
+ for(int i = 0; i < untrusted_subset.size(); i++) {
+ u = untrusted_subset[i];
+
+ // if overlap
+ if(start <= u+bithash::k-1 && u <= end) {
+ // take intersection
+ start = max(start, u);
+ end = min(end, u+bithash::k-1);
+ } else {
+ // intersection is empty
+ return false;
+ }
+ }
+
+ // intersection is non-empty
+ for(short i = start; i <= end; i++)
+ region.push_back(i);
+ return true;
+}
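+
+// Worked example (added commentary, not part of the original source): with
+// k = 5, untrusted kmers starting at positions 3 and 6 cover [3,7] and
+// [6,10]; their intersection is [6,7], so region becomes {6, 7} and the
+// function returns true. Untrusted kmers at positions 0 and 20 do not
+// overlap, so the function returns false and the caller falls back to
+// untrusted_union().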
+
+////////////////////////////////////////////////////////////
+// untrusted_union
+//
+// Compute the union of the positions covered by the untrusted kmers
+////////////////////////////////////////////////////////////
+void Read::untrusted_union(vector<int> untrusted_subset, vector<short> & region) {
+ short u;
+ set<short> region_set;
+ for(int i = 0; i < untrusted_subset.size(); i++) {
+ u = untrusted_subset[i];
+
+ for(short ui = u; ui < u+bithash::k; ui++)
+ region_set.insert(ui);
+ }
+
+ set<short>::iterator it;
+ for(it = region_set.begin(); it != region_set.end(); it++)
+ region.push_back(*it);
+}
+
+////////////////////////////////////////////////////////////
+// quality_quicksort
+//
+// Sort the indexes from lowest probability of an accurate
+// basecall to highest
+////////////////////////////////////////////////////////////
+void Read::quality_quicksort(vector<short> & indexes, int left, int right) {
+ int i = left, j = right;
+ short tmp;
+ float pivot = prob[indexes[(left + right) / 2]];
+
+ /* partition */
+ while (i <= j) {
+ while (prob[indexes[i]] < pivot)
+ i++;
+ while (prob[indexes[j]] > pivot)
+ j--;
+ if (i <= j) {
+ tmp = indexes[i];
+ indexes[i] = indexes[j];
+ indexes[j] = tmp;
+ i++;
+ j--;
+ }
+ }
+
+ /* recursion */
+ if (left < j)
+ quality_quicksort(indexes, left, j);
+ if (i < right)
+ quality_quicksort(indexes, i, right);
+}
+
+////////////////////////////////////////////////////////////
+// check_trust
+//
+// Given a corrected read and data structure holding
+// trusted kmers, update the corrected_read's set
+// of untrusted kmers and return true if it is now empty
+////////////////////////////////////////////////////////////
+bool Read::check_trust(corrected_read *cr, bithash *trusted, unsigned int & check_count) {
+ // original read HAS errors
+ if(cr->corrections.empty())
+ return false;
+
+ // make corrections to sequence, saving nt's to fix later
+ vector<int> seqsave;
+ int i;
+ for(i = 0; i < cr->corrections.size(); i++) {
+ seqsave.push_back(seq[cr->corrections[i].index]);
+ seq[cr->corrections[i].index] = cr->corrections[i].to;
+ }
+
+ int edit = cr->corrections.back().index;
+ int kmer_start = max(0, edit-bithash::k+1);
+ //int kmer_end = min(edit, read_length-k);
+ int kmer_end = min(edit, trim_length-bithash::k);
+
+ check_count += (kmer_end - kmer_start + 1);
+
+ bool non_acgt = false;
+ for(i = kmer_start; i < kmer_end+bithash::k; i++)
+ if(seq[i] >=4)
+ non_acgt = true;
+
+ //non_acgt = true;
+ if(non_acgt) {
+ // easier to just check kmers one by one
+ for(i = kmer_start; i <= kmer_end; i++)
+ // check kmer
+ cr->untrusted.set(i, !trusted->check(&seq[i]));
+
+ } else {
+ // check affected kmers
+ Seq<kK> kmermap;
+ // check first kmer and save map value
+ cr->untrusted.set(kmer_start, !trusted->check(&seq[kmer_start], kmermap));
+ for(i = kmer_start+1; i <= kmer_end; i++) {
+ // check kmer using map value
+ cr->untrusted.set(i, !trusted->check(kmermap, seq[i-1], seq[i+bithash::k-1]));
+ }
+ }
+
+ // fix sequence
+ for(i = 0; i < cr->corrections.size(); i++)
+ seq[cr->corrections[i].index] = seqsave[i];
+
+ return(cr->untrusted.none());
+}
diff --git a/src/hammer/quake_correct/Read.h b/src/projects/hammer/quake_correct/Read.h
similarity index 100%
rename from src/hammer/quake_correct/Read.h
rename to src/projects/hammer/quake_correct/Read.h
diff --git a/src/projects/hammer/quake_correct/bithash.cpp b/src/projects/hammer/quake_correct/bithash.cpp
new file mode 100644
index 0000000..65d8203
--- /dev/null
+++ b/src/projects/hammer/quake_correct/bithash.cpp
@@ -0,0 +1,388 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#include "bithash.h"
+#include "data_structures/sequence/nucl.hpp"
+#include <iostream>
+#include <fstream>
+#include <cstdlib>
+#include <cassert>
+
+using namespace::std;
+
+bithash::bithash(int _k)
+ :bits()
+{
+ k = _k;
+ assert(_k == kK && "kK and the k passed from the program do not match");
+}
+
+bithash::~bithash() {
+}
+
+////////////////////////////////////////////////////////////
+// add
+//
+// Add a single sequence to the bitmap
+////////////////////////////////////////////////////////////
+void bithash::add(Seq<kK> kmer) {
+ bits.insert(kmer);
+}
+
+
+////////////////////////////////////////////////////////////
+// check
+//
+// Check for the presence of a sequence in the tree
+//
+// Kmers containing N's are handled and simply return false
+////////////////////////////////////////////////////////////
+bool bithash::check(unsigned kmer[]) {
+ for(int i = 0; i < k; i++) { // ToDo: if we add constructor which
+ // can soft fail if we pass N's in seq
+ // we can optimize this code.
+ if (!is_dignucl(kmer[i])) {
+ return false;
+ }
+ }
+ return bits.count(Seq<kK>(kmer)) != 0;
+}
+
+////////////////////////////////////////////////////////////
+// check
+//
+// Check for the presence of a sequence in the tree.
+// Pass the kmer map value back by reference to be re-used
+//
+// Can't handle N's!
+////////////////////////////////////////////////////////////
+bool bithash::check(unsigned kmer[], Seq<kK> &kmermap) {
+ kmermap = Seq<kK>(kmer);
+ return bits.count(kmermap) != 0;
+}
+
+////////////////////////////////////////////////////////////
+// check
+//
+// Check for the presence of a sequence in the tree.
+////////////////////////////////////////////////////////////
+bool bithash::check(Seq<kK> kmermap) {
+ return bits.count(kmermap) != 0;
+}
+
+////////////////////////////////////////////////////////////
+// check
+//
+// Check for the presence of a sequence in the tree.
+// Pass the kmer map value back by reference to be re-used
+//
+// Can't handle N's!
+////////////////////////////////////////////////////////////
+bool bithash::check(Seq<kK> &kmermap, unsigned last, unsigned next) {
+ (void) last;  // the nucleotide shifted out is not needed here
+ kmermap = kmermap << next;
+ // ToDo: this copy could be avoided if Seq provided an in-place <<= operator
+ return bits.count(kmermap) != 0;
+}
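+
+// Usage sketch (added commentary, not part of the original source): when
+// scanning a read, the first kmer is checked with check(&seq[start], kmermap)
+// and every following kmer with check(kmermap, seq[i-1], seq[i+k-1]), which
+// shifts one nucleotide into the cached Seq instead of rebuilding it from
+// scratch; this is the pattern used in Read::check_trust().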
+
+////////////////////////////////////////////////////////////
+// hammer_file_load
+//
+// Load the kmers from a Hammer FASTA file into the bithash
+////////////////////////////////////////////////////////////
+void bithash::hammer_file_load(istream & hammer_in, unsigned long long atgc[]) {
+ string line;
+ while(getline(hammer_in, line)) {
+ if (line[0] != '>') {
+ // add to tree
+ string kmer = line.substr(0,k);
+ add(binary_kmer(kmer));
+
+ // add reverse to tree
+ add(binary_rckmer(kmer));
+
+ // count gc
+ if(atgc != NULL) {
+ unsigned int at = count_at(kmer);
+ atgc[0] += at;
+ atgc[1] += (k-at);
+ }
+ }
+ }
+}
+
+
+////////////////////////////////////////////////////////////
+// meryl_file_load
+//
+// Load into the bithash the kmers from the given meryl counts
+// file that occur >= "boundary" times
+////////////////////////////////////////////////////////////
+void bithash::meryl_file_load(const char* merf, const double boundary) {
+ ifstream mer_in(merf);
+ string line;
+ double count;
+ bool add_kmer = false;
+
+ while(getline(mer_in, line)) {
+ if(line[0] == '>') {
+ // get count
+ count = atof(line.substr(1).c_str());
+ //cout << count << endl;
+
+ // compare to boundary
+ if(count >= boundary) {
+ add_kmer = true;
+ } else {
+ add_kmer = false;
+ }
+
+ } else if(add_kmer) {
+ // add to tree
+ add(binary_kmer(line));
+
+ // add reverse to tree
+ add(binary_rckmer(line));
+ }
+ }
+}
+
+////////////////////////////////////////////////////////////
+// tab_file_load
+//
+// Load into the bithash the kmers from the given tab-delimited
+// count file that occur >= "boundary" times
+////////////////////////////////////////////////////////////
+void bithash::tab_file_load(istream & mer_in, const double boundary, unsigned long long atgc[]) {
+ string line;
+ double count;
+
+ while(getline(mer_in, line)) {
+ if(line[k] != ' ' && line[k] != '\t') {
+ cerr << "Kmers are not of expected length " << k << endl;
+ exit(EXIT_FAILURE);
+ }
+
+ // get count
+ count = atof(line.substr(k+1).c_str());
+ //cout << count << endl;
+
+ // compare to boundary
+ if(count >= boundary) {
+ // add to tree
+ add(binary_kmer(line.substr(0,k)));
+
+ // add reverse to tree
+ add(binary_rckmer(line.substr(0,k)));
+
+ // count gc
+ if(atgc != NULL) {
+ unsigned int at = count_at(line.substr(0,k));
+ atgc[0] += at;
+ atgc[1] += (k-at);
+ }
+ }
+ }
+}
+
+////////////////////////////////////////////////////////////
+// tab_file_load (AT-dependent cutoffs)
+//
+// Load into the bithash the kmers from the given tab-delimited
+// count file that occur >= the AT-content-specific "boundary"
+////////////////////////////////////////////////////////////
+void bithash::tab_file_load(istream & mer_in, const vector<double> boundary, unsigned long long atgc[]) {
+ string line;
+ double count;
+ int at;
+
+ while(getline(mer_in, line)) {
+ if(line[k] != '\t') {
+ cerr << "Kmers are not of expected length " << k << endl;
+ exit(EXIT_FAILURE);
+ }
+
+ at = count_at(line.substr(0,k));
+
+ // get count
+ count = atof(line.substr(k+1).c_str());
+ //cout << count << endl;
+
+ // compare to boundary
+ if(count >= boundary[at]) {
+ // add to tree
+ add(binary_kmer(line.substr(0,k)));
+
+ // add reverse to tree
+ add(binary_rckmer(line.substr(0,k)));
+
+ // count gc
+ if(atgc != NULL) {
+ unsigned int at = count_at(line.substr(0,k));
+ atgc[0] += at;
+ atgc[1] += (k-at);
+ }
+ }
+ }
+}
+
+////////////////////////////////////////////////////////////
+// binary_file_output
+//
+// Write bithash to file in binary format
+////////////////////////////////////////////////////////////
+void bithash::binary_file_output(char* outf) {
+ /* unsigned long long mysize = (unsigned long long)bits.size() / 8ULL;
+ char* buffer = new char[mysize];
+ unsigned int flag = 1;
+ for(unsigned long long i = 0; i < mysize; i++) {
+ unsigned int temp = 0;
+ for(unsigned int j = 0; j < 8; j++) { // read 8 bits from the bitset
+ temp <<= 1;
+ //unsigned int tmp = i*8 + j;
+ //cout << tmp << ",";
+ if(bits.count(i*8 + j) != 0)
+ temp |= flag;
+ }
+ buffer[i] = (char)temp;
+ }
+ ofstream ofs(outf, ios::out | ios::binary);
+ ofs.write(buffer, mysize);
+ ofs.close();*/
+}
+
+////////////////////////////////////////////////////////////
+// binary_file_input
+//
+// Read bithash from file in binary format
+////////////////////////////////////////////////////////////
+/*
+void bithash::binary_file_input(char* inf) {
+ ifstream ifs(inf, ios::binary);
+
+ // get size of file
+ ifs.seekg(0,ifstream::end);
+ unsigned long long mysize = ifs.tellg();
+ ifs.seekg(0);
+
+ // allocate memory for file content
+ char* buffer = new char[mysize];
+
+ // read content of ifs
+ ifs.read (buffer, mysize);
+
+ // parse bits
+ unsigned int flag = 128;
+ unsigned int temp;
+ for(unsigned long i = 0; i < mysize; i++) {
+ temp = (unsigned int)buffer[i];
+ for(unsigned int j = 0; j < 8; j++) {
+ if((temp & flag) == flag)
+ bits.set(i*8 + j);
+ temp <<= 1;
+ }
+ }
+
+ delete[] buffer;
+}
+*/
+
+////////////////////////////////////////////////////////////
+// binary_file_input
+//
+// Read bithash from file in binary format
+////////////////////////////////////////////////////////////
+void bithash::binary_file_input(char* inf, unsigned long long atgc[]) {
+ /*unsigned int flag = 128;
+ unsigned int temp;
+
+ ifstream ifs(inf, ios::binary);
+
+ // get size of file
+ ifs.seekg(0,ifstream::end);
+ unsigned long long mysize = ifs.tellg();
+ ifs.seekg(0);
+
+ // allocate memory for file content
+ unsigned long long buffersize = 134217728; // i.e. 4^15 / 8, 16 MB
+ if(mysize < buffersize)
+ buffersize = mysize;
+ char* buffer = new char[buffersize];
+
+ for(unsigned long long b = 0; b < mysize/buffersize; b++) {
+
+ // read content of ifs
+ ifs.read (buffer, buffersize);
+
+ // parse bits
+ for(unsigned long long i = 0; i < buffersize; i++) {
+ temp = (unsigned int)buffer[i];
+ for(int j = 0; j < 8; j++) {
+ if((temp & flag) == flag) {
+ bits.set((buffersize*b + i)*8 + j);
+
+ // count gc
+ unsigned int at = count_at((buffersize*b + i)*8 + j);
+ atgc[0] += at;
+ atgc[1] += (k-at);
+ }
+ temp <<= 1;
+ }
+ }
+ }
+
+ delete[] buffer;*/
+}
+
+////////////////////////////////////////////////////////////
+// count_at
+//
+// Count the A's and T's in the sequence given
+////////////////////////////////////////////////////////////
+int bithash::count_at(string seq) {
+ int at = 0;
+ for(int i = 0; i < seq.size(); i++)
+ if(seq[i] == 'A' || seq[i] == 'T')
+ at += 1;
+ return at;
+}
+
+int bithash::count_at(Seq<kK> seq) {
+ int at = 0;
+ for(int i = 0; i < k; i++) {
+ // in the 2-bit encoding, 0 is A and 3 is T
+ if(seq[i] == 0 || seq[i] == 3)
+ at++;
+ }
+ return at;
+}
+
+// Convert string s to its binary equivalent in mer .
+Seq<kK> bithash::binary_kmer(const string & s) {
+ return Seq<kK>(s);
+}
+
+// Convert string s to its binary equivalent in mer .
+Seq<kK> bithash::binary_rckmer(const string & s) {
+ return !Seq<kK>(s); //ToDo: optimize
+}
+
+// Return the binary equivalent of ch .
+unsigned bithash::binary_nt(char ch) {
+ switch (tolower (ch)) {
+ case 'a' : return 0;
+ case 'c' : return 1;
+ case 'g' : return 2;
+ case 't' : return 3;
+ default : return 4;  // treat anything else (e.g. N) as a non-ACGT code
+ }
+}
+
+
+unsigned int bithash::num_kmers() {
+ return (unsigned int)bits.size();
+}
diff --git a/src/hammer/quake_correct/bithash.h b/src/projects/hammer/quake_correct/bithash.h
similarity index 100%
rename from src/hammer/quake_correct/bithash.h
rename to src/projects/hammer/quake_correct/bithash.h
diff --git a/src/projects/hammer/quake_correct/correct.cpp b/src/projects/hammer/quake_correct/correct.cpp
new file mode 100644
index 0000000..3b16195
--- /dev/null
+++ b/src/projects/hammer/quake_correct/correct.cpp
@@ -0,0 +1,897 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#include "bithash.h"
+#include "Read.h"
+#include "edit.h"
+#include "gzstream.h"
+#include <fstream>
+#include <iostream>
+#include <sstream>
+#include <vector>
+#include <string>
+#include <string.h>
+#include <cstring>
+#include <getopt.h>
+#ifdef _OPENMP
+#include <omp.h>
+#else
+#define omp_set_num_threads(x)
+#define omp_get_max_threads() 1
+#define omp_get_thread_num() 0
+#define omp_get_num_threads() 0
+#endif
+#include <cstdlib>
+#include <iomanip>
+#include <sys/stat.h>
+
+////////////////////////////////////////////////////////////
+// options
+////////////////////////////////////////////////////////////
+const static char* myopts = "r:f:k:m:b:c:a:t:q:l:p:v:zCuh";
+static struct option long_options [] = {
+ {"headers", 0, 0, 1000},
+ {"log", 0, 0, 1001},
+ {0, 0, 0, 0}
+};
+
+// -r, fastq file of reads
+//char* fastqf;
+// -f, file of fastq files of reads
+//char* file_of_fastqf;
+
+// -z, zip output files
+//bool zip_output = false;
+
+// -k, kmer size
+static int k = 0;
+
+// -m, mer counts
+static char* merf = NULL;
+// -b, bithash
+static char* bithashf = NULL;
+// -v, good kmers from Hammer
+static char* hammerf = NULL;
+
+// -c, cutoff between trusted and untrusted mers
+static double cutoff = 0;
+// -a, AT cutoff
+static char* ATcutf = NULL;
+
+// -q
+//int Read::quality_scale;
+// -l
+static int trim_t = 30;
+// -t
+//static int trimq = 3;
+
+// -p, number of threads
+//int threads;
+
+// --headers, Print only normal headers
+static bool orig_headers = false;
+
+// -C, Contrail output
+static bool contrail_out = false;
+// -u, output uncorrected reads
+static bool uncorrected_out = false;
+// --log, output correction log
+static bool out_log = false;
+
+static bool overwrite_temp = true;
+
+// Note: to not trim, set trimq=0 and trim_t>read_length-k
+
+// constants
+#define TESTING false
+static char* nts = (char*)"ACGTN";
+//unsigned int chunks_per_thread = 200;
+
+ // to collect stats
+struct stats {
+ stats() {
+ validated = 0;
+ corrected = 0;
+ removed = 0;
+ trimmed = 0;
+ trimmed_only = 0;
+ }
+ unsigned long long validated;
+ unsigned long long corrected;
+ unsigned long long removed;
+ unsigned long long trimmed;
+ unsigned long long trimmed_only;
+};
+
+static void Usage
+ (char * command)
+
+// Print to stderr description of options and command line for
+// this program. command is the command that was used to
+// invoke it.
+
+{
+ fprintf (stderr,
+ "USAGE: correct [options]\n"
+ "\n"
+ "Correct sequencing errors in fastq file provided with -r\n"
+ "and output trusted and corrected reads to\n"
+ "<fastq-prefix>.cor.fastq.\n"
+ "\n"
+ "Options:\n"
+ " -r <file>\n"
+ " Fastq file of reads\n"
+ " -f <file>\n"
+ " File containing fastq file names, one per line or\n"
+ " two per line for paired end reads.\n"
+ " -z\n"
+ " Write output files as gzipped.\n"
+ " -m <file>\n"
+ " File containing kmer counts in format `seq\tcount`.\n"
+ " Can be gzipped.\n"
+ " -b <file>\n"
+ " File containing saved bithash.\n"
+ " -c <num>\n"
+ " Separate trusted/untrusted kmers at cutoff <num>\n"
+ " -a <file>\n"
+ " Separate trusted/untrusted kmers as a function of AT\n"
+ " content, with cutoffs found in <file>, one per line\n"
+ " -p <num>\n"
+ " Use <num> openMP threads\n"
+ " -l <num>=30\n"
+ " Return only reads corrected and/or trimmed to >= <num>\n"
+ " bp\n"
+ " -q <num>\n"
+ " Quality value ascii scale, generally 64 or 33. If not\n"
+ " specified, it will guess.\n"
+ " -v <file>\n"
+ " File with good k-mers from Hammer.\n"
+ " -t <num>=3\n"
+ " Use BWA trim parameter <num>\n"
+ " -u\n"
+ " Output errors reads even if they can't be corrected,\n"
+ " maintaining paired end reads.\n"
+ " --headers\n"
+ " Output only the original read headers without\n"
+ " correction messages\n"
+ " --log\n"
+ " Output a log of all corrections into *.log as\n"
+ " 'quality position new_nt old_nt'\n"
+ "\n");
+
+ return;
+ }
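+
+// Illustrative invocation (added commentary, not part of the original
+// source; file names are placeholders):
+//   correct -r reads.fastq -m kmer_counts.txt -c 3.0 -p 8
+// corrects reads.fastq using the kmer counts in kmer_counts.txt with a
+// trusted/untrusted cutoff of 3.0 and 8 OpenMP threads.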
+
+////////////////////////////////////////////////////////////
+// parse_command_line
+////////////////////////////////////////////////////////////
+static void parse_command_line(int argc, char **argv) {
+ bool errflg = false;
+ int ch;
+ optarg = NULL;
+ int option_index = 0;
+ char* p;
+ k = kK;
+ // parse args
+ while(!errflg && ((ch = getopt_long(argc, argv, myopts, long_options, &option_index)) != EOF)) {
+ //while(!errflg && ((ch = getopt(argc, argv, myopts)) != EOF)) {
+ switch(ch) {
+ case 'r':
+ fastqf = strdup(optarg);
+ break;
+
+ case 'f':
+ file_of_fastqf = strdup(optarg);
+ break;
+
+ case 'z':
+ zip_output = true;
+ break;
+
+ case 'm':
+ merf = strdup(optarg);
+ break;
+
+ case 'b':
+ bithashf = strdup(optarg);
+ break;
+
+ case 'c':
+ cutoff = double(strtod(optarg, &p));
+ if(p == optarg || cutoff < 0) {
+ fprintf(stderr, "Bad mer cutoff value \"%s\"\n",optarg);
+ errflg = true;
+ }
+ break;
+
+ case 'a':
+ ATcutf = strdup(optarg);
+ break;
+
+ case 't':
+ trimq = int(strtol(optarg, &p, 10));
+ if(p == optarg || trimq < 0) {
+ fprintf(stderr, "Bad trim quality value \"%s\"\n",optarg);
+ errflg = true;
+ }
+ break;
+
+ case 'l':
+ trim_t = int(strtol(optarg, &p, 10));
+ if(p == optarg || trim_t < 1) {
+ fprintf(stderr, "Bad trim threshold \"%s\"\n",optarg);
+ errflg = true;
+ }
+ break;
+
+ case 'q':
+ Read::quality_scale = int(strtol(optarg, &p, 10));
+ if(p == optarg || Read::quality_scale < -1) {
+ fprintf(stderr, "Bad quality value scale \"%s\"\n",optarg);
+ errflg = true;
+ }
+ break;
+
+ case 'C':
+ contrail_out = true;
+ break;
+
+ case 'u':
+ uncorrected_out = true;
+ break;
+
+ case 'p':
+ threads = int(strtol(optarg, &p, 10));
+ if(p == optarg || threads <= 0) {
+ fprintf(stderr, "Bad number of threads \"%s\"\n",optarg);
+ errflg = true;
+ }
+ break;
+
+ case 1000:
+ orig_headers = true;
+ break;
+
+ case 1001:
+ out_log = true;
+ break;
+
+ case 'v':
+ hammerf = strdup(optarg);
+ break;
+
+ case 'h':
+ Usage(argv[0]);
+ exit(EXIT_FAILURE);
+
+ case '?':
+ fprintf (stderr, "Unrecognized option -%c\n", optopt);
+
+ default:
+ errflg = true;
+ }
+ }
+
+ // for some reason, optind is not advancing properly so this
+ // always returns an error
+
+ // return errors
+ /*
+ if(errflg || optind != argc-1) {
+ Usage(argv[0]);
+ exit(EXIT_FAILURE);
+ }
+ */
+
+ ////////////////////////////////////////
+ // correct user input errors
+ ////////////////////////////////////////
+ if(fastqf == NULL && file_of_fastqf == NULL) {
+ cerr << "Must provide a fastq file of reads (-r) or a file containing a list of fastq files of reads (-f)" << endl;
+ exit(EXIT_FAILURE);
+ }
+
+ if(k == 0) {
+ cerr << "Must provide kmer size (-k)" << endl;
+ exit(EXIT_FAILURE);
+ }
+
+ if(merf != NULL) {
+ if(cutoff == 0 && ATcutf == NULL) {
+ cerr << "Must provide a trusted/untrusted kmer cutoff (-c) or a file containing the cutoff as a function of the AT content (-a)" << endl;
+ exit(EXIT_FAILURE);
+ }
+ } else if(bithashf == NULL && hammerf == NULL) {
+ cerr << "Must provide a file of kmer counts (-m) or a saved bithash (-b) or solid kmers from Hammer (-v)" << endl;
+ exit(EXIT_FAILURE);
+ }
+
+}
+
+
+////////////////////////////////////////////////////////////
+// regress_probs
+//
+// Use ntnt_counts to perform nonparametric regression
+// on ntnt_prob across quality values.
+////////////////////////////////////////////////////////////
+void regress_probs(double ntnt_prob[Read::max_qual][4][4], unsigned int ntnt_counts[Read::max_qual][4][4]) {
+ double sigma = 2.0;
+ double sigma2 = pow(sigma, 2);
+
+ // count # occurrences for each (quality=q,actual=a) tuple
+ unsigned int actual_counts[Read::max_qual][4] = {0};
+ for(int q = 1; q < Read::max_qual; q++)
+ for(int i = 0; i < 4; i++)
+ for(int j = 0; j < 4; j++)
+ actual_counts[q][i] += ntnt_counts[q][i][j];
+
+ // regress
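+  // For each quality value q, smooth the observed substitution counts with a
+  // Gaussian kernel over neighboring qualities qr (Nadaraya-Watson style):
+  //   prob[q][i][j] = sum_qr counts[qr][i][j]*w(qr,q) / sum_qr actual[qr][i]*w(qr,q)
+  // where w(qr,q) = exp(-(qr-q)^2 / (2*sigma^2)).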
+ double ntdsum;
+ for(int q = 1; q < Read::max_qual; q++) {
+ for(int i = 0; i < 4; i++) {
+ //ntdsum = 0;
+ for(int j = 0; j < 4; j++) {
+ double pnum = 0;
+ double pden = 0;
+ for(int qr = 1; qr < Read::max_qual; qr++) {
+ pnum += ntnt_counts[qr][i][j] * exp(-pow((double)(qr - q), 2)/(2*sigma2));
+ pden += actual_counts[qr][i] * exp(-pow((double)(qr - q), 2)/(2*sigma2));
+ }
+ ntnt_prob[q][i][j] = pnum / pden;
+ //ntdsum += ntnt_prob[q][i][j];
+ }
+
+ // re-normalize to sum to 1
+ //for(int j = 0; j < 4; j++)
+ //ntnt_prob[q][i][j] /= ntdsum;
+ }
+ }
+}
+
+
+////////////////////////////////////////////////////////////
+// output_model
+//
+// Print the error model to the file error_model.<prefix>.txt, where <prefix>
+// is derived from the input fastq file name.
+////////////////////////////////////////////////////////////
+void output_model(double ntnt_prob[Read::max_qual][4][4], unsigned int ntnt_counts[Read::max_qual][4][4], string fqf) {
+ string base = split(fqf,'/').back();
+
+ int suffix_index = base.rfind(".");
+ string prefix;
+ if(suffix_index == -1) {
+ prefix = base;
+ } else {
+ prefix = base.substr(0,suffix_index);
+ }
+
+ string outf = "error_model." + prefix + ".txt";
+
+ ofstream mod_out(outf.c_str());
+
+ unsigned int ntsum;
+ for(int q = 1; q < Read::max_qual; q++) {
+ mod_out << "Quality = " << q << endl;
+
+ // counts
+ mod_out << "\tA\tC\tG\tT" << endl;
+ for(int i = 0; i < 4; i++) {
+ mod_out << nts[i];
+
+ ntsum = 0;
+ for(int j = 0; j < 4; j++)
+ ntsum += ntnt_counts[q][i][j];
+
+ for(int j = 0; j < 4; j++) {
+ if(i == j)
+ mod_out << "\t-";
+ else if(ntsum > 0)
+ mod_out << "\t" << ((double)ntnt_counts[q][i][j] / (double)ntsum) << "(" << ntnt_counts[q][i][j] << ")";
+ else
+ mod_out << "\t0";
+ }
+ mod_out << endl;
+ }
+
+ // probs
+ mod_out << "\tA\tC\tG\tT" << endl;
+ for(int i = 0; i < 4; i++) {
+ mod_out << nts[i];
+ for(int j = 0; j < 4; j++) {
+ if(i == j)
+ mod_out << "\t-";
+ else
+ mod_out << "\t" << ntnt_prob[q][i][j];
+ }
+ mod_out << endl;
+ }
+ mod_out << endl;
+ }
+}
+
+
+////////////////////////////////////////////////////////////////////////////////
+// output_read
+//
+// Output the given possibly corrected and/or trimmed
+// read according to the given options.
+////////////////////////////////////////////////////////////////////////////////
+static void output_read(ofstream & reads_out, ofstream & corlog_out, int pe_code, string header, string ntseq, string mid, string strqual, string corseq, stats & tstats) {
+ if(corseq.size() >= trim_t) {
+ // check for changes
+ bool corrected = false;
+ for(int i = 0; i < corseq.size(); i++) {
+ if(corseq[i] != ntseq[i]) {
+ // log it
+ if(corlog_out.good())
+ corlog_out << (strqual[i]-Read::quality_scale) << "\t" << (i+1) << "\t" << corseq[i] << "\t" << ntseq[i] << endl;
+ // note it
+ corrected = true;
+ // set qual to crap
+ strqual[i] = (char)(Read::quality_scale+2);
+ }
+ }
+ if(corrected)
+ tstats.corrected++;
+
+ // update header
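+    // The " correct" and " trim=N" tags are informational; the " error" tag
+    // added in the else branch below is what combine_output*() later keys on
+    // to route reads into the error output files.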
+ if(!orig_headers) {
+ if(corrected)
+ header += " correct";
+ unsigned int trimlen = ntseq.size()-corseq.size();
+ if(trimlen > 0) {
+ stringstream trim_inter;
+ trim_inter << trimlen;
+ header += " trim=" + trim_inter.str();
+ tstats.trimmed++;
+ if(!corrected)
+ tstats.trimmed_only++;
+ } else {
+ if(!corrected)
+ tstats.validated++;
+ }
+ }
+ // print
+ if(contrail_out)
+ reads_out << header << "\t" << corseq << endl;
+ else
+ reads_out << header << endl << corseq << endl << mid << endl << strqual.substr(0,corseq.size()) << endl;
+ if(TESTING)
+ cerr << header << "\t" << ntseq << "\t" << corseq << endl;
+ } else {
+ tstats.removed++;
+ if(uncorrected_out || pe_code > 0) {
+ // update header
+ header += " error";
+
+ //print
+ if(contrail_out)
+ reads_out << header << "\t" << ntseq << endl;
+ else
+ reads_out << header << endl << ntseq << endl << mid << endl << strqual << endl;
+ }
+ if(TESTING)
+ cerr << header << "\t" << ntseq << "\t-" << endl; // or . if it's only trimmed?
+ }
+}
+
+
+////////////////////////////////////////////////////////////////////////////////
+// correct_reads
+//
+// Correct the reads in the file 'fqf' using the data structure of trusted
+// kmers 'trusted', matrix of nt->nt error rates 'ntnt_prob' and prior nt
+// probabilities 'prior_prob'. 'starts' and 'counts' help OpenMP parallelize
+// the read processing. If 'pairedend_code' is 0, the reads are not paired;
+// if it's 1, this file is the first of a pair, so print all reads and withhold
+// combining; if it's 2, the file is the second of a pair, so print all reads
+// and then combine both 1 and 2.
+////////////////////////////////////////////////////////////////////////////////
+static void correct_reads(string fqf, int pe_code, bithash * trusted, vector<streampos> & starts, vector<unsigned long long> & counts, double ntnt_prob[Read::max_qual][4][4], double prior_prob[4]) {
+ // output directory
+ struct stat st_file_info;
+ string path_suffix = split(fqf,'/').back();
+ string out_dir("."+path_suffix);
+ if(stat(out_dir.c_str(), &st_file_info) == 0) {
+ cerr << "Hidden temporary directory " << out_dir << " already exists and will be used" << endl;
+ } else {
+ if(mkdir(out_dir.c_str(), S_IRWXU) == -1) {
+ cerr << "Failed to create hidden temporary directory " << out_dir << endl;
+ exit(EXIT_FAILURE);
+ }
+ }
+
+ // collect stats
+ stats * thread_stats = new stats[omp_get_max_threads()];
+
+ unsigned int chunk = 0;
+#pragma omp parallel //shared(trusted)
+ {
+ int tid = omp_get_thread_num();
+
+ // input
+ ifstream reads_in(fqf.c_str());
+
+ unsigned int tchunk;
+ string header,ntseq,mid,strqual,corseq;
+ int trim_length;
+ char* nti;
+ Read *r;
+
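+    // Threads atomically claim chunk indices; each claimed chunk is written
+    // to its own temporary file (named by chunk index) inside out_dir.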
+ #pragma omp critical
+ tchunk = chunk++;
+
+ while(tchunk < starts.size()) {
+ reads_in.seekg(starts[tchunk]);
+
+ // output
+ string toutf(out_dir+"/");
+ stringstream tconvert;
+ tconvert << tchunk;
+ toutf += tconvert.str();
+
+ if(overwrite_temp || stat(toutf.c_str(), &st_file_info) == -1) {
+ ofstream reads_out(toutf.c_str());
+ //cout << toutf << endl;
+
+ // output log
+ string tlogf = toutf + ".log";
+ ofstream corlog_out;
+ if(out_log) {
+ corlog_out.open(tlogf.c_str());
+ }
+
+ unsigned long long tcount = 0;
+ while(getline(reads_in, header)) {
+ //cout << tid << " " << header << endl;
+
+ // get sequence
+ getline(reads_in, ntseq);
+ //cout << ntseq << endl;
+
+ // convert ntseq to iseq
+ vector<unsigned int> iseq;
+ for(int i = 0; i < ntseq.size(); i++) {
+ nti = strchr(nts, ntseq[i]);
+ iseq.push_back(nti - nts);
+ }
+
+ // get quality values
+ getline(reads_in,mid);
+ //cout << mid << endl;
+ getline(reads_in,strqual);
+ //cout << strqual << endl;
+
+ vector<int> untrusted;
+
+ if(iseq.size() < trim_t)
+ trim_length = 0;
+ else {
+ for(int i = 0; i < iseq.size()-k+1; i++) {
+ if(!trusted->check(&iseq[i])) {
+ untrusted.push_back(i);
+ }
+ }
+
+ trim_length = quick_trim(strqual, untrusted);
+ //trim_length = iseq.size();
+ }
+
+ // fix error reads
+ if(untrusted.size() > 0) {
+ r = new Read(header, &iseq[0], strqual, untrusted, trim_length);
+ corseq = r->correct(trusted, ntnt_prob, prior_prob);
+
+ // output read w/ trim and corrections
+ output_read(reads_out, corlog_out, pe_code, header, ntseq, mid, strqual, corseq, thread_stats[tid]);
+
+ delete r;
+ } else {
+ output_read(reads_out, corlog_out, pe_code, header, ntseq, mid, strqual, ntseq.substr(0,trim_length), thread_stats[tid]);
+ // output read as trimmed
+ /*
+ if(contrail_out)
+ reads_out << header << "\t" << ntseq.substr(0,trim_length) << endl;
+ else
+ reads_out << header << endl << ntseq.substr(0,trim_length) << endl << mid << endl << strqual.substr(0,trim_length) << endl;
+ */
+ }
+
+ if(++tcount == counts[tchunk])
+ break;
+ }
+ reads_out.close();
+ }
+
+#pragma omp critical
+ tchunk = chunk++;
+ }
+ reads_in.close();
+ }
+
+ // combine stats
+ for(int i = 1; i < omp_get_max_threads(); i++) {
+ thread_stats[0].validated += thread_stats[i].validated;
+ thread_stats[0].corrected += thread_stats[i].corrected;
+ thread_stats[0].trimmed += thread_stats[i].trimmed;
+ thread_stats[0].trimmed_only += thread_stats[i].trimmed_only;
+ thread_stats[0].removed += thread_stats[i].removed;
+ }
+
+ // print stats
+ int suffix_index = fqf.rfind(".");
+ string outf;
+ if(suffix_index == -1) {
+ outf = fqf+".stats.txt";
+ } else {
+ outf = fqf.substr(0,suffix_index+1) + "stats.txt";
+ }
+ ofstream stats_out(outf.c_str());
+ stats_out << "Validated: " << thread_stats[0].validated << endl;
+ stats_out << "Corrected: " << thread_stats[0].corrected << endl;
+ stats_out << "Trimmed: " << thread_stats[0].trimmed << endl;
+ stats_out << "Trimmed only: " << thread_stats[0].trimmed_only << endl;
+ stats_out << "Removed: " << thread_stats[0].removed << endl;
+ stats_out.close();
+}
+
+
+////////////////////////////////////////////////////////////
+// learn_errors
+//
+// Correct reads using a much stricter filter in order
+// to count the nt->nt errors and learn the error
+// probabilities.
+////////////////////////////////////////////////////////////
+//static void learn_errors(string fqf, bithash * trusted, vector<streampos> & starts, vector<unsigned long long> & counts, double (&ntnt_prob)[4][4], double prior_prob[4]) {
+static void learn_errors(string fqf, bithash * trusted, vector<streampos> & starts, vector<unsigned long long> & counts, double ntnt_prob[Read::max_qual][4][4], double prior_prob[4]) {
+ unsigned int ntnt_counts[Read::max_qual][4][4] = {0};
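+  // ntnt_counts[q][a][o]: number of corrections at quality q where the
+  // inferred actual nucleotide was a and the observed (erroneous) one was o.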
+ unsigned int samples = 0;
+
+ unsigned int chunk = 0;
+#pragma omp parallel //shared(trusted)
+ {
+ unsigned int tchunk;
+ string header,ntseq,strqual,corseq;
+ int trim_length;
+ char* nti;
+ Read *r;
+ ifstream reads_in(fqf.c_str());
+
+ while(chunk < threads*chunks_per_thread) {
+#pragma omp critical
+ tchunk = chunk++;
+
+ reads_in.seekg(starts[tchunk]);
+
+ unsigned long long tcount = 0;
+ while(getline(reads_in, header)) {
+ //cout << header << endl;
+
+ // get sequence
+ getline(reads_in, ntseq);
+ //cout << ntseq << endl;
+
+ // convert ntseq to iseq
+ vector<unsigned int> iseq;
+ for(int i = 0; i < ntseq.size(); i++) {
+ nti = strchr(nts, ntseq[i]);
+ iseq.push_back(nti - nts);
+ }
+
+ // get quality values
+ getline(reads_in,strqual);
+ //cout << strqual << endl;
+ getline(reads_in,strqual);
+ //cout << strqual << endl;
+
+ vector<int> untrusted;
+
+ if(iseq.size() < trim_t)
+ trim_length = 0;
+ else {
+ for(int i = 0; i < iseq.size()-k+1; i++) {
+ if(!trusted->check(&iseq[i])) {
+ untrusted.push_back(i);
+ }
+ }
+
+ trim_length = quick_trim(strqual, untrusted);
+ }
+
+ // fix error reads
+ if(untrusted.size() > 0) {
+ // correct
+ r = new Read(header, &iseq[0], strqual, untrusted, trim_length);
+ corseq = r->correct(trusted, ntnt_prob, prior_prob, true);
+
+ // if trimmed to long enough
+ if(corseq.size() >= trim_t) {
+ if(r->trusted_read != 0) { // else no guarantee there was a correction
+ for(int c = 0; c < r->trusted_read->corrections.size(); c++) {
+ correction cor = r->trusted_read->corrections[c];
+ if(iseq[cor.index] < 4) {
+ // P(obs=o|actual=a,a!=o) for Bayes
+ ntnt_counts[strqual[cor.index]-Read::quality_scale][cor.to][iseq[cor.index]]++;
+
+ // P(actual=a|obs=o,a!=o)
+ //ntnt_counts[iseq[cor.index]][cor.to]++;
+ samples++;
+ }
+ }
+ }
+ }
+ delete r;
+ }
+
+ if(++tcount == counts[tchunk] || samples > 200000)
+ break;
+ }
+ }
+ reads_in.close();
+ }
+
+ regress_probs(ntnt_prob, ntnt_counts);
+
+ output_model(ntnt_prob, ntnt_counts, fqf);
+}
+
+
+////////////////////////////////////////////////////////////
+// load_AT_cutoffs
+//
+// Load AT cutoffs from file
+////////////////////////////////////////////////////////////
+vector<double> load_AT_cutoffs() {
+ vector<double> cutoffs;
+ ifstream cut_in(ATcutf);
+ string line;
+ double cut;
+
+ while(getline(cut_in, line)) {
+ stringstream ss(stringstream::in | stringstream::out);
+ ss << line;
+ ss >> cut;
+ cutoffs.push_back(cut);
+ }
+
+ if(cutoffs.size() != (k+1)) {
+ cerr << "Must specify " << (k+1) << " AT cutoffs in " << ATcutf << endl;
+ exit(EXIT_FAILURE);
+ }
+
+ return cutoffs;
+}
+
+
+////////////////////////////////////////////////////////////
+// main
+////////////////////////////////////////////////////////////
+int main(int argc, char **argv) {
+ parse_command_line(argc, argv);
+
+ // prepare AT and GC counts
+ unsigned long long atgc[2] = {0};
+
+ // make trusted kmer data structure
+ bithash *trusted = new bithash(k);
+
+ // get good kmers from Hammer
+ if (hammerf != NULL) {
+ string hammerf_str(hammerf);
+ if (hammerf_str.substr(hammerf_str.size()-3) == ".gz") {
+ igzstream hammerf_in(hammerf);
+ trusted->hammer_file_load(hammerf_in, atgc);
+ } else {
+ ifstream hammerf_in(hammerf);
+ trusted->hammer_file_load(hammerf_in, atgc);
+ }
+ }
+
+ // get kmer counts
+ if(merf != NULL) {
+ string merf_str(merf);
+ if(ATcutf != NULL) {
+ if(merf_str.substr(merf_str.size()-3) == ".gz") {
+ igzstream mer_in(merf);
+ trusted->tab_file_load(mer_in, load_AT_cutoffs(), atgc);
+ } else {
+ ifstream mer_in(merf);
+ trusted->tab_file_load(mer_in, load_AT_cutoffs(), atgc);
+ }
+ } else {
+ if(merf_str.substr(merf_str.size()-3) == ".gz") {
+ igzstream mer_in(merf);
+ trusted->tab_file_load(mer_in, cutoff, atgc);
+ } else {
+ ifstream mer_in(merf);
+ trusted->tab_file_load(mer_in, cutoff, atgc);
+ }
+ }
+
+ // saved bithash
+ } else if(bithashf != NULL) {
+ if(strcmp(bithashf,"-") == 0) {
+ cerr << "Saved bithash cannot be piped in. Please specify file." << endl;
+ exit(EXIT_FAILURE);
+ } else
+ trusted->binary_file_input(bithashf, atgc);
+ }
+ cout << trusted->num_kmers() << " trusted kmers" << endl;
+
+ double prior_prob[4];
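+  // Prior nucleotide probabilities from the AT/GC counts gathered while
+  // loading trusted k-mers: A and T each get half the AT fraction, C and G
+  // each get half the GC fraction.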
+ prior_prob[0] = (double)atgc[0] / (double)(atgc[0]+atgc[1]) / 2.0;
+ prior_prob[1] = .5 - prior_prob[0];
+ prior_prob[2] = prior_prob[1];
+ prior_prob[3] = prior_prob[0];
+
+ //cout << "AT: " << atgc[0] << " GC: " << atgc[1] << endl;
+ cout << "AT% = " << (2*prior_prob[0]) << endl;
+
+ // make list of files
+ vector<string> fastqfs;
+ vector<int> pairedend_codes;
+ parse_fastq(fastqfs, pairedend_codes);
+
+ // process each file
+ string fqf;
+ bool zip;
+ for(int f = 0; f < fastqfs.size(); f++) {
+ fqf = fastqfs[f];
+ cout << fqf << endl;
+
+ // unzip
+ if(fqf.substr(fqf.size()-3) == ".gz") {
+ zip = true;
+ unzip_fastq(fqf);
+ } else
+ zip = false;
+
+ // determine quality value scale
+ if(Read::quality_scale == -1)
+ guess_quality_scale(fqf);
+
+ // split file
+ vector<streampos> starts;
+ vector<unsigned long long> counts;
+ chunkify_fastq(fqf, starts, counts);
+
+ // learn nt->nt transitions
+ double ntnt_prob[Read::max_qual][4][4] = {0};
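+    // Start from a uniform substitution model (each of the 3 alternative
+    // nucleotides equally likely); learn_errors refines this per quality value.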
+ for(int q = 0; q < Read::max_qual; q++)
+ for(int i = 0; i < 4; i++)
+ for(int j = 0; j < 4; j++)
+ if(i != j)
+ ntnt_prob[q][i][j] = 1.0/3.0;
+
+ if(!TESTING)
+ learn_errors(fqf, trusted, starts, counts, ntnt_prob, prior_prob);
+
+ // correct
+ correct_reads(fqf, pairedend_codes[f], trusted, starts, counts, ntnt_prob, prior_prob);
+
+ // combine
+ if(pairedend_codes[f] == 0) {
+ combine_output(fqf, string("cor"), uncorrected_out);
+ }
+
+ // combine paired end
+ if(pairedend_codes[f] == 2) {
+ if(!zip) {
+ combine_output_paired(fastqfs[f-1], fqf, string("cor"), uncorrected_out);
+ } else {
+ combine_output_paired(fastqfs[f-1].substr(0,fastqfs[f-1].size()-3), fqf, string("cor"), uncorrected_out);
+ }
+ }
+
+ if(zip)
+ zip_fastq(fqf);
+ }
+
+ return 0;
+}
diff --git a/src/projects/hammer/quake_correct/edit.cpp b/src/projects/hammer/quake_correct/edit.cpp
new file mode 100644
index 0000000..4f88901
--- /dev/null
+++ b/src/projects/hammer/quake_correct/edit.cpp
@@ -0,0 +1,665 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#include <sys/stat.h>
+#include <fstream>
+#include "omp_wrapper.h"
+#include <iostream>
+#include <sstream>
+#include <cstring>
+#include "gzstream.h"
+#include <vector>
+#include "Read.h"
+
+////////////////////////////////////////////////////////////////////////////////
+// options
+////////////////////////////////////////////////////////////////////////////////
+// -r, fastq file of reads
+char* fastqf = NULL;
+// -f, file of fastq files of reads
+char* file_of_fastqf = NULL;
+
+// -z, zip output files
+bool zip_output = false;
+
+// -q
+int Read::quality_scale = -1;
+// -p, number of threads
+int threads = 4;
+
+// -t
+int trimq = 3;
+
+unsigned int chunks_per_thread = 200;
+
+
+////////////////////////////////////////////////////////////////////////////////
+// split
+//
+// Split on whitespace
+////////////////////////////////////////////////////////////////////////////////
+vector<string> split(string s) {
+ vector<string> splits;
+ int split_num = 0;
+ bool last_space = true;
+
+ for(int i = 0; i < s.size(); i++) {
+ if(s[i] == ' ' || s[i] == '\t' || s[i] == '\n' || s[i] == '\r') {
+ if(!last_space)
+ split_num++;
+ last_space = true;
+ } else {
+ if(split_num == splits.size())
+ splits.push_back("");
+ splits[split_num] += s[i];
+ last_space = false;
+ }
+ }
+
+ return splits;
+}
+
+
+////////////////////////////////////////////////////////////////////////////////
+// split
+//
+// Split on the character c, trying to match Python's split method
+////////////////////////////////////////////////////////////////////////////////
+vector<string> split(string s, char c)
+{
+ vector<string> splits;
+ splits.push_back("");
+ int split_num = 0;
+
+ for(int i = 0; i < s.size(); i++) {
+ if(s[i] == c) {
+ split_num++;
+ splits.push_back("");
+ } else {
+ splits[split_num] += s[i];
+ }
+ }
+
+ return splits;
+}
+
+
+////////////////////////////////////////////////////////////////////////////////
+// unzip_fastq
+//
+// Unzip read file and remove ".gz" suffix from 'fqf'.
+////////////////////////////////////////////////////////////////////////////////
+void unzip_fastq(string & fqf) {
+ char mycmd[500];
+
+ // rename
+ string fqf_zip(fqf);
+ fqf.erase(fqf.size()-3);
+
+ // unzip but leave original file
+ strcpy(mycmd, "gunzip -c ");
+ strcat(mycmd, fqf_zip.c_str());
+ strcat(mycmd, " > ");
+ strcat(mycmd, fqf.c_str());
+ system(mycmd);
+}
+
+
+////////////////////////////////////////////////////////////////////////////////
+// zip_fastq
+//
+// Clean up after a zipped read file: the gzip commands are currently
+// commented out, so only the unzipped copy of 'fqf' is removed.
+////////////////////////////////////////////////////////////////////////////////
+void zip_fastq(string fqf) {
+ char mycmd[100];
+
+ // gzip fqf
+ //strcpy(mycmd, "gzip ");
+ //strcat(mycmd, fqf.c_str());
+ //system(mycmd);
+
+ // remove unzipped fqf, leaving only zipped
+ remove(fqf.c_str());
+
+ // determine output file
+ /*
+ string fqf_str(fqf);
+ int suffix_index = fqf_str.rfind(".");
+ string prefix = fqf_str.substr(0,suffix_index);
+ string suffix = fqf_str.substr(suffix_index, fqf_str.size()-suffix_index);
+ string pairf = prefix + string(".cor") + suffix;
+ string singlef = prefix + string(".cor.single") + suffix;
+
+ // gzip pair
+ strcpy(mycmd, "gzip ");
+ strcat(mycmd, pairf.c_str());
+ system(mycmd);
+
+ // gzip single
+ struct stat st_file_info;
+ if(stat(singlef.c_str(), &st_file_info) == 0) {
+ strcpy(mycmd, "gzip ");
+ strcat(mycmd, singlef.c_str());
+ system(mycmd);
+ }
+ */
+}
+
+
+////////////////////////////////////////////////////////////////////////////////
+// combine_logs
+//
+// Combine log files that may be in out_dir into a single log file named
+// using fqf.
+////////////////////////////////////////////////////////////////////////////////
+void combine_logs(string fqf, string out_dir) {
+ struct stat st_file_info;
+ string log1 = out_dir+"/0.log";
+ if(stat(log1.c_str(), &st_file_info) == 0) {
+ // format log output
+ string logf = fqf + ".log";
+ ofstream corlog_out(logf.c_str());
+
+ // combine
+ string line;
+ for(int t = 0; t < threads*chunks_per_thread; t++) {
+ string tc_file(out_dir+"/");
+ stringstream tc_convert;
+ tc_convert << t;
+ tc_file += tc_convert.str();
+ tc_file += ".log";
+
+ if(stat(tc_file.c_str(), &st_file_info) == 0) {
+ ifstream tc_out(tc_file.c_str());
+ while(getline(tc_out, line)) {
+ corlog_out << line << endl;
+ }
+ tc_out.close();
+ remove(tc_file.c_str());
+ }
+ }
+ corlog_out.close();
+ }
+}
+
+
+////////////////////////////////////////////////////////////////////////////////
+// combine_output_stream
+//
+// Combine output files in 'out_dir' into a single file defined by the given
+// stream, and remove those files along the way.
+////////////////////////////////////////////////////////////////////////////////
+void combine_output_stream(ostream & combine_out, ostream & err_out, string out_dir) {
+ string header, seq, mid, qual;
+ struct stat st_file_info;
+ for(int t = 0; t < threads*chunks_per_thread; t++) {
+ string tc_file(out_dir+"/");
+ stringstream tc_convert;
+ tc_convert << t;
+ tc_file += tc_convert.str();
+
+ // if file exists, add to single output
+ if(stat(tc_file.c_str(), &st_file_info) == 0) {
+ ifstream tc_out(tc_file.c_str());
+ while(getline(tc_out, header)) {
+ getline(tc_out, seq);
+ getline(tc_out, mid);
+ getline(tc_out, qual);
+
+ if(!err_out.good() || header.find("error") == -1)
+ combine_out << header << endl << seq << endl << mid << endl << qual << endl;
+ else
+ err_out << header.substr(0,header.find("error")) << endl << seq << endl << mid << endl << qual << endl;
+ }
+ tc_out.close();
+ remove(tc_file.c_str());
+ }
+ }
+}
+
+
+////////////////////////////////////////////////////////////////////////////////
+// combine_output
+//
+// Combine output files in 'out_dir' into a single file and remove 'out_dir'
+////////////////////////////////////////////////////////////////////////////////
+void combine_output(string fqf, string mid_ext, bool uncorrected_out) {
+ // format output directory
+ string path_suffix = split(fqf,'/').back();
+ string out_dir("."+path_suffix);
+
+ // format output file
+ int suffix_index = fqf.rfind(".");
+ string prefix, suffix;
+ if(suffix_index == -1) {
+ prefix = fqf+".";
+ suffix = "";
+ } else {
+ prefix = fqf.substr(0,suffix_index+1);
+ suffix = fqf.substr(suffix_index, fqf.size()-suffix_index);
+ }
+
+ string outf;
+ string errf;
+ if(zip_output) {
+ // zipped
+ outf = prefix + mid_ext + suffix + ".gz";
+ ogzstream combine_out(outf.c_str());
+ ogzstream err_out;
+ if(uncorrected_out) {
+ errf = prefix + "err" + suffix + ".gz";
+ err_out.open(errf.c_str());
+ }
+ combine_output_stream(combine_out, err_out, out_dir);
+ combine_out.close();
+ err_out.close();
+ } else {
+ // normal
+ outf = prefix + mid_ext + suffix;
+ ofstream combine_out(outf.c_str());
+ ofstream err_out;
+ if(uncorrected_out) {
+ errf = prefix + "err" + suffix;
+ err_out.open(errf.c_str());
+ }
+ combine_output_stream(combine_out, err_out, out_dir);
+ combine_out.close();
+ err_out.close();
+ }
+
+ // log
+ combine_logs(fqf, out_dir);
+
+ // remove output directory
+ rmdir(out_dir.c_str());
+}
+
+
+////////////////////////////////////////////////////////////////////////////////
+// combine_output_paired_stream
+////////////////////////////////////////////////////////////////////////////////
+void combine_output_paired_stream(string fqf1, string fqf2, ostream & pair_out1, ostream & single_out1, ostream & single_err_out1, ostream & err_out1, ostream & pair_out2, ostream & single_out2, ostream & single_err_out2, ostream & err_out2) {
+ // format output directories
+ string path_suffix1 = split(fqf1, '/').back();
+ string out_dir1("."+path_suffix1);
+ string path_suffix2 = split(fqf2, '/').back();
+ string out_dir2("."+path_suffix2);
+
+ string header1, seq1, mid1, qual1, header2, seq2, mid2, qual2;
+ struct stat st_file_info;
+ for(int t = 0; t < threads*chunks_per_thread; t++) {
+ // format thread-chunk output files
+ string tc_file1(out_dir1+"/");
+ stringstream tc_convert1;
+ tc_convert1 << t;
+ tc_file1 += tc_convert1.str();
+
+ string tc_file2(out_dir2+"/");
+ stringstream tc_convert2;
+ tc_convert2 << t;
+ tc_file2 += tc_convert2.str();
+
+ // if file exists, both must
+ if(stat(tc_file1.c_str(), &st_file_info) == 0) {
+ ifstream tc_out1(tc_file1.c_str());
+ ifstream tc_out2(tc_file2.c_str());
+
+ while(getline(tc_out1, header1)) {
+ // get read1
+ getline(tc_out1, seq1);
+ getline(tc_out1, mid1);
+ getline(tc_out1, qual1);
+
+ // get read2
+ if(!getline(tc_out2, header2)) {
+ cerr << "Uneven number of reads in paired end read files " << tc_file1.c_str() << " and " << tc_file2.c_str() << endl;
+ exit(EXIT_FAILURE);
+ }
+ getline(tc_out2, seq2);
+ getline(tc_out2, mid2);
+ getline(tc_out2, qual2);
+
+ if(header1.find("error") == -1) {
+ if(header2.find("error") == -1) {
+ // no errors
+ pair_out1 << header1 << endl << seq1 << endl << mid1 << endl << qual1 << endl;
+ pair_out2 << header2 << endl << seq2 << endl << mid2 << endl << qual2 << endl;
+ } else {
+ // error in 2
+ single_out1 << header1 << endl << seq1 << endl << mid1 << endl << qual1 << endl;
+ if(single_err_out2.good())
+ single_err_out2 << header2.substr(0,header2.find("error")) << endl << seq2 << endl << mid2 << endl << qual2 << endl;
+ }
+ } else {
+ if(header2.find("error") == -1) {
+ // error in 1
+ if(single_err_out1.good())
+ single_err_out1 << header1.substr(0,header1.find("error")) << endl << seq1 << endl << mid1 << endl << qual1 << endl;
+ single_out2 << header2 << endl << seq2 << endl << mid2 << endl << qual2 << endl;
+ } else {
+ // error in 1,2
+ if(err_out1.good()) {
+ err_out1 << header1.substr(0,header1.find("error")) << endl << seq1 << endl << mid1 << endl << qual1 << endl;
+ err_out2 << header2.substr(0,header2.find("error")) << endl << seq2 << endl << mid2 << endl << qual2 << endl;
+ }
+ }
+ }
+ }
+ tc_out1.close();
+ tc_out2.close();
+ remove(tc_file1.c_str());
+ remove(tc_file2.c_str());
+ }
+ }
+
+ // logs
+ combine_logs(fqf1, out_dir1);
+ combine_logs(fqf2, out_dir2);
+
+ // remove output directory
+ rmdir(out_dir1.c_str());
+ rmdir(out_dir2.c_str());
+}
+
+
+////////////////////////////////////////////////////////////////////////////////
+// combine_output_paired
+//
+// Combine output files in 'out_dir' into a single file and remove 'out_dir'
+////////////////////////////////////////////////////////////////////////////////
+void combine_output_paired(string fqf1, string fqf2, string mid_ext, bool uncorrected_out) {
+ string prefix, suffix;
+
+ if(zip_output) {
+ // format output pair file1
+ int suffix_index = fqf1.rfind(".");
+ if(suffix_index == -1) {
+ prefix = fqf1+".";
+ suffix = "";
+ } else {
+ prefix = fqf1.substr(0,suffix_index+1);
+ suffix = fqf1.substr(suffix_index, fqf1.size()-suffix_index);
+ }
+ string outf = prefix + mid_ext + suffix + ".gz";
+ ogzstream pair_out1(outf.c_str());
+
+ // and single file1
+ outf = prefix + mid_ext + "_single" + suffix + ".gz";
+ ogzstream single_out1(outf.c_str());
+
+ // and error file1
+ ogzstream single_err_out1;
+ ogzstream err_out1;
+ if(uncorrected_out) {
+ outf = prefix + "err_single" + suffix + ".gz";
+ single_err_out1.open(outf.c_str());
+ outf = prefix + "err" + suffix + ".gz";
+ err_out1.open(outf.c_str());
+ }
+
+ // format output pair file2
+ suffix_index = fqf2.rfind(".");
+ if(suffix_index == -1) {
+ prefix = fqf2+".";
+ suffix = "";
+ } else {
+ prefix = fqf2.substr(0,suffix_index+1);
+ suffix = fqf2.substr(suffix_index, fqf2.size()-suffix_index);
+ }
+ outf = prefix + mid_ext + suffix + ".gz";
+ ogzstream pair_out2(outf.c_str());
+
+ // and single file2
+ outf = prefix + mid_ext + "_single" + suffix + ".gz";
+ ogzstream single_out2(outf.c_str());
+
+    // and error file2
+ ogzstream single_err_out2;
+ ogzstream err_out2;
+ if(uncorrected_out) {
+ outf = prefix + "err_single" + suffix + ".gz";
+ single_err_out2.open(outf.c_str());
+ outf = prefix + "err" + suffix + ".gz";
+ err_out2.open(outf.c_str());
+ }
+
+ combine_output_paired_stream(fqf1, fqf2, pair_out1, single_out1, single_err_out1, err_out1, pair_out2, single_out2, single_err_out2, err_out2);
+
+ pair_out1.close();
+ pair_out2.close();
+ single_out1.close();
+ single_out2.close();
+
+ } else {
+ // format output pair file1
+ int suffix_index = fqf1.rfind(".");
+ if(suffix_index == -1) {
+ prefix = fqf1+".";
+ suffix = "";
+ } else {
+ prefix = fqf1.substr(0,suffix_index+1);
+ suffix = fqf1.substr(suffix_index, fqf1.size()-suffix_index);
+ }
+ string outf = prefix + mid_ext + suffix;
+ ofstream pair_out1(outf.c_str());
+
+ // and single file1
+ outf = prefix + mid_ext + "_single" + suffix;
+ ofstream single_out1(outf.c_str());
+
+ // and error file1
+ ofstream single_err_out1;
+ ofstream err_out1;
+ if(uncorrected_out) {
+ outf = prefix + "err_single" + suffix;
+ single_err_out1.open(outf.c_str());
+ outf = prefix + "err" + suffix;
+ err_out1.open(outf.c_str());
+ }
+
+ // format output pair file2
+ suffix_index = fqf2.rfind(".");
+ if(suffix_index == -1) {
+ prefix = fqf1+".";
+ suffix = "";
+ } else {
+ prefix = fqf2.substr(0,suffix_index+1);
+ suffix = fqf2.substr(suffix_index, fqf2.size()-suffix_index);
+ }
+ outf = prefix + mid_ext + suffix;
+ ofstream pair_out2(outf.c_str());
+
+ // and single file2
+ outf = prefix + mid_ext + "_single" + suffix;
+ ofstream single_out2(outf.c_str());
+
+ // and error file2
+ ofstream single_err_out2;
+ ofstream err_out2;
+ if(uncorrected_out) {
+ outf = prefix + "err_single" + suffix;
+ single_err_out2.open(outf.c_str());
+ outf = prefix + "err" + suffix;
+ err_out2.open(outf.c_str());
+ }
+
+ combine_output_paired_stream(fqf1, fqf2, pair_out1, single_out1, single_err_out1, err_out1, pair_out2, single_out2, single_err_out2, err_out2);
+
+ pair_out1.close();
+ pair_out2.close();
+ single_out1.close();
+ single_out2.close();
+ }
+}
+
+
+////////////////////////////////////////////////////////////////////////////////
+// chunkify_fastq
+//
+// Determine start points and sequence counts for all
+// chunks to be processed in parallel.
+////////////////////////////////////////////////////////////////////////////////
+void chunkify_fastq(string fqf, vector<streampos> & starts, vector<unsigned long long> & counts) {
+ // count number of sequences
+ unsigned long long N = 0;
+ ifstream reads_in(fqf.c_str());
+ string toss;
+ while(getline(reads_in, toss))
+ N++;
+ reads_in.close();
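+  // four FASTQ lines per record, so dividing by 4 gives the read count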
+ N /= 4ULL;
+
+ if(threads*chunks_per_thread > N) {
+ // use 1 thread for everything
+ counts.push_back(N);
+ starts.push_back(0);
+ omp_set_num_threads(1);
+
+ } else {
+ // determine counts per thread
+ unsigned long long sum = 0;
+ for(int i = 0; i < threads*chunks_per_thread-1; i++) {
+ counts.push_back(N / (threads*chunks_per_thread));
+ sum += counts.back();
+ }
+ counts.push_back(N - sum);
+
+ // find start points
+ reads_in.open(fqf.c_str());
+ starts.push_back(reads_in.tellg());
+ unsigned long long s = 0;
+ unsigned int t = 0;
+ while(getline(reads_in,toss)) {
+ // sequence
+ getline(reads_in, toss);
+ // +
+ getline(reads_in, toss);
+ // quality
+ getline(reads_in, toss);
+
+ if(++s == counts[t] && t < counts.size()-1) {
+ starts.push_back(reads_in.tellg());
+ s = 0;
+ t++;
+ }
+    }
+
+    // set up parallelism
+    omp_set_num_threads(threads);
+  }
+}
+
+
+////////////////////////////////////////////////////////////
+// guess_quality_scale
+//
+// Guess at ascii scale of quality values by examining
+// a bunch of reads and looking for quality values < 64,
+// in which case we set it to 33.
+//
+// Assuming the file is unzipped.
+////////////////////////////////////////////////////////////
+void guess_quality_scale(string fqf) {
+ string header, seq, mid, strqual;
+ int reads_to_check = 10000;
+ int reads_checked = 0;
+ ifstream reads_in(fqf.c_str());
+ while(getline(reads_in, header)) {
+ getline(reads_in, seq);
+ getline(reads_in, mid);
+ getline(reads_in, strqual);
+
+ for(int i = 0; i < strqual.size(); i++) {
+ if(strqual[i] < 64) {
+ cerr << "Guessing quality values are on ascii 33 scale" << endl;
+ Read::quality_scale = 33;
+ reads_in.close();
+ return;
+ }
+ }
+
+ if(++reads_checked >= reads_to_check)
+ break;
+ }
+ reads_in.close();
+ cerr << "Guessing quality values are on ascii 64 scale" << endl;
+ Read::quality_scale = 64;
+}
+
+
+////////////////////////////////////////////////////////////////////////////////
+// parse_fastq
+//
+// Accept a single fastq file from input, or parse a file with names of fastq
+// files. For multiple files, attach a paired end code to tell the correction
+// method how to handle each file.
+////////////////////////////////////////////////////////////////////////////////
+vector<string> parse_fastq(vector<string> & fastqfs, vector<int> & pairedend_codes) {
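+  // Paired end codes: 0 = unpaired file, 1 = first file of a pair,
+  // 2 = second file of a pair (see correct_reads for how these are used).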
+ if(file_of_fastqf != NULL) {
+ ifstream ff(file_of_fastqf);
+ vector<string> next_fastqf;
+ string line;
+
+ while(getline(ff, line) && line.size() > 0) {
+ next_fastqf = split(line);
+
+ if(next_fastqf.size() == 1) {
+ fastqfs.push_back(next_fastqf[0]);
+ pairedend_codes.push_back(0);
+
+ } else if(next_fastqf.size() == 2) {
+ fastqfs.push_back(next_fastqf[0]);
+ fastqfs.push_back(next_fastqf[1]);
+ pairedend_codes.push_back(1);
+ pairedend_codes.push_back(2);
+
+ } else {
+ cerr << "File of fastq file names must have a single fastq file per line for single reads or two fastqf files per line separated by a space for paired end reads " << endl;
+ exit(EXIT_FAILURE);
+ }
+ }
+
+ } else {
+ fastqfs.push_back(string(fastqf));
+ pairedend_codes.push_back(0);
+ }
+
+ return fastqfs;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// quick_trim
+//
+// Trim the end of the read the way BWA does it.
+// Removes affected untrusted k-mers.
+// Returns the trimmed length.
+////////////////////////////////////////////////////////////////////////////////
+int quick_trim(string strqual, vector<int> & untrusted) {
+ // find trim index
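+  // BWA-style trimming: walk from the 3' end accumulating (trimq - phredq);
+  // the position that maximizes this running sum becomes the trim point, so
+  // runs of low-quality bases at the end are removed.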
+ int phredq;
+ int current_trimfunc = 0;
+ int max_trimfunc = 0;
+ int trim_length = strqual.size();
+ for(int i = strqual.size()-1; i >= 0; i--) {
+ //phredq = floor(.5-10*log(1.0 - prob[i])/log(10));
+ phredq = strqual[i] - Read::quality_scale;
+ current_trimfunc += (trimq - phredq);
+ if(current_trimfunc > max_trimfunc) {
+ max_trimfunc = current_trimfunc;
+ trim_length = i;
+ }
+ }
+
+ // update untrusted
+ for(int i = untrusted.size()-1; i >= 0; i--) {
+ if(untrusted[i] > trim_length - bithash::k)
+ untrusted.pop_back();
+ }
+
+ return trim_length;
+}
diff --git a/src/hammer/quake_correct/edit.h b/src/projects/hammer/quake_correct/edit.h
similarity index 100%
rename from src/hammer/quake_correct/edit.h
rename to src/projects/hammer/quake_correct/edit.h
diff --git a/src/hammer/quake_correct/gzstream.C b/src/projects/hammer/quake_correct/gzstream.C
similarity index 100%
rename from src/hammer/quake_correct/gzstream.C
rename to src/projects/hammer/quake_correct/gzstream.C
diff --git a/src/hammer/quake_correct/gzstream.h b/src/projects/hammer/quake_correct/gzstream.h
similarity index 100%
rename from src/hammer/quake_correct/gzstream.h
rename to src/projects/hammer/quake_correct/gzstream.h
diff --git a/src/hammer/quake_count/CMakeLists.txt b/src/projects/hammer/quake_count/CMakeLists.txt
similarity index 100%
rename from src/hammer/quake_count/CMakeLists.txt
rename to src/projects/hammer/quake_count/CMakeLists.txt
diff --git a/src/hammer/quake_count/kmer_freq_info.hpp b/src/projects/hammer/quake_count/kmer_freq_info.hpp
similarity index 100%
rename from src/hammer/quake_count/kmer_freq_info.hpp
rename to src/projects/hammer/quake_count/kmer_freq_info.hpp
diff --git a/src/projects/hammer/quake_count/quake_count.cpp b/src/projects/hammer/quake_count/quake_count.cpp
new file mode 100644
index 0000000..244e650
--- /dev/null
+++ b/src/projects/hammer/quake_count/quake_count.cpp
@@ -0,0 +1,241 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+/**
+ * @file preproc.cpp
+ * @author Alex Davydow
+ * @version 1.0
+ *
+ * @section LICENSE
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of
+ * the License, or (at your option) any later version.
+ *
+ * @section DESCRIPTION
+ *
+ * For each k-mer this program calculates the number of occurrences in
+ * the reads provided. The reads file is supposed to be in fastq
+ * format.
+ */
+
+#include "standard.hpp"
+
+#include <stdint.h>
+#include <cstdio>
+#include <cstdlib>
+#include <cassert>
+#include <string>
+#include <set>
+#include <unordered_map>
+#include <vector>
+#include <iostream>
+#include <iomanip>
+#include "io/ireadstream.hpp"
+#include "io/read.hpp"
+#include "data_structures/sequence/seq.hpp"
+#include "kmer_freq_info.hpp"
+#include "valid_kmer_generator.hpp"
+#define SUPPRESS_UNUSED(X) ((void) (X))
+
+using std::string;
+using std::set;
+using std::vector;
+using std::unordered_map;
+using std::map;
+using std::ofstream;
+using std::ifstream;
+using std::pair;
+
+namespace {
+
+const uint32_t kK = 55;
+typedef Seq<kK> KMer;
+typedef unordered_map<KMer, KMerFreqInfo, KMer::hash> UnorderedMap;
+
+void print_time() {
+ time_t rawtime;
+ tm * ptm;
+ time ( &rawtime );
+ ptm = gmtime( &rawtime );
+ std::cout << std::setfill('0') << "[ " << std::setw(2) << ptm->tm_hour << ":" << std::setw(2) << ptm->tm_min
+ << ":" << std::setw(2) << ptm->tm_sec << " ] ";
+}
+
+#define LOG(a) print_time(); std::cout << a << std::endl
+
+/**
+ * @variable Every kStep k-mer will appear in the log.
+ */
+const int kStep = 1e5;
+
+struct Options {
+ /**
+ * @variable An offset for quality in a fastq file.
+ */
+ uint32_t qvoffset;
+ string ifile;
+ string ofile;
+ uint32_t error_threshold;
+ /**
+ * @variable How many files will be used when splitting k-mers.
+ */
+ uint32_t file_number;
+ bool q_mers;
+ bool valid;
+ Options()
+ : qvoffset(0),
+ ifile(""),
+ ofile(""),
+ error_threshold(0),
+ file_number(3),
+ q_mers(false),
+ valid(true) {}
+};
+
+void PrintHelp(char *program_name) {
+ printf("Usage: %s qvoffset ifile.fastq ofile.[q]cst file_number error_threshold [q]\n",
+ program_name);
+ printf("Where:\n");
+ printf("\tqvoffset\tan offset of fastq quality data\n");
+ printf("\tifile.fastq\tan input file with reads in fastq format\n");
+ printf("\tofile.[q]cst\ta filename where k-mer statistics will be outputted\n");
+ printf("\terror_threshold\tnucliotides with quality lower then threshold will be cut from the ends of reads\n");
+ printf("\tfile_number\thow many files will be used when splitting k-mers\n");
+ printf("\tq\t\tif you want to count q-mers instead of k-mers.\n");
+}
+
+Options ParseOptions(int argc, char *argv[]) {
+ Options ret;
+ if (argc != 6 && argc != 7) {
+ ret.valid = false;
+ } else {
+ ret.qvoffset = atoi(argv[1]);
+ ret.valid &= (ret.qvoffset >= 0 && ret.qvoffset <= 255);
+ ret.ifile = argv[2];
+ ret.ofile = argv[3];
+ ret.error_threshold = atoi(argv[4]);
+ ret.valid &= (ret.error_threshold >= 0 && ret.error_threshold <= 255);
+ ret.file_number = atoi(argv[5]);
+ if (argc == 7) {
+ if (string(argv[6]) == "q") {
+ ret.q_mers = true;
+ } else {
+ ret.valid = false;
+ }
+ }
+ }
+ return ret;
+}
+
+/**
+ * This function reads reads from the stream and splits them into
+ * k-mers. Then the k-mers are distributed across several files almost
+ * uniformly. It is guaranteed that identical k-mers are written to the
+ * same file.
+ * @param ifs Stream to read reads from.
+ * @param ofiles Files to write the resulting k-mers to. They are written
+ * in binary form, one k-mer per record.
+ */
+void SplitToFiles(ireadstream ifs, vector<ofstream *> &ofiles,
+ bool q_mers, uint8_t error_threshold) {
+ uint32_t file_number = ofiles.size();
+ uint64_t read_number = 0;
+ while (!ifs.eof()) {
+ ++read_number;
+ if (read_number % kStep == 0) {
+ LOG("Reading read " << read_number << ".");
+ }
+ Read r;
+ ifs >> r;
+ KMer::hash hash_function;
+ for (ValidKMerGenerator<kK> gen(r, error_threshold); gen.HasMore(); gen.Next()) {
+ KMer kmer = gen.kmer();
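+      // Store each k-mer in canonical form: keep the smaller (by less2) of
+      // the k-mer and !kmer (its reverse complement under Seq's operator!),
+      // so both strands map to the same entry.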
+ if (KMer::less2()(!kmer, kmer)) {
+ kmer = !kmer;
+ }
+ ofstream &cur_file = *ofiles[hash_function(kmer) % file_number];
+ KMer::BinWrite(cur_file, kmer);
+ if (q_mers) {
+ double correct_probability = gen.correct_probability();
+ cur_file.write((const char*) &correct_probability, sizeof(correct_probability));
+ }
+ }
+ }
+}
+
+/**
+ * This function reads k-mers and calculates the number of occurrences for
+ * each of them.
+ * @param ifile File with the k-mers to process, as written by SplitToFiles.
+ * @param ofile Output file. For each unique k-mer there will be a
+ * line with the k-mer itself and its number of occurrences.
+ */
+template<typename KMerStatMap>
+void EvalFile(ifstream &ifile, FILE *ofile, bool q_mers) {
+ KMerStatMap stat_map;
+ char buffer[kK + 1];
+ buffer[kK] = 0;
+ KMer kmer;
+ while (KMer::BinRead(ifile, &kmer)) {
+ KMerFreqInfo &info = stat_map[kmer];
+ if (q_mers) {
+ double correct_probability = -1;
+ ifile.read((char *) &correct_probability, sizeof(correct_probability));
+      assert(!ifile.fail()); // the read above must have succeeded
+ info.q_count += correct_probability;
+ } else {
+ info.count += 1;
+ }
+ }
+ for (typename KMerStatMap::iterator it = stat_map.begin();
+ it != stat_map.end(); ++it) {
+ fprintf(ofile, "%s ", it->first.str().c_str());
+ if (q_mers) {
+ fprintf(ofile, "%f\n", it->second.q_count);
+ } else {
+ fprintf(ofile, "%d\n", it->second.count);
+ }
+ }
+}
+}
+
+int main(int argc, char *argv[]) {
+ Options opts = ParseOptions(argc, argv);
+ if (!opts.valid) {
+ PrintHelp(argv[0]);
+ return 1;
+ }
+ // BasicConfigurator::configure();
+ LOG("Starting preproc: evaluating " << opts.ifile << ".");
+ vector<ofstream*> ofiles(opts.file_number);
+ for (uint32_t i = 0; i < opts.file_number; ++i) {
+ char filename[50];
+ snprintf(filename, sizeof(filename), "%u.kmer.part", i);
+ ofiles[i] = new ofstream(filename);
+ assert(!ofiles[i]->fail() && "Too many files to open");
+ }
+ SplitToFiles(ireadstream(opts.ifile, opts.qvoffset),
+ ofiles, opts.q_mers, opts.error_threshold);
+ for (uint32_t i = 0; i < opts.file_number; ++i) {
+ delete ofiles[i];
+ }
+ FILE *ofile = fopen(opts.ofile.c_str(), "w");
+ assert(ofile != NULL && "Too many files to open");
+ for (uint32_t i = 0; i < opts.file_number; ++i) {
+ char ifile_name[50];
+ snprintf(ifile_name, sizeof(ifile_name), "%u.kmer.part", i);
+ ifstream ifile(ifile_name);
+ LOG("Processing " << ifile_name << ".");
+ EvalFile<UnorderedMap>(ifile, ofile, opts.q_mers);
+ LOG("Processed " << ifile_name << ".");
+ }
+ fclose(ofile);
+ LOG("Preprocessing done. You can find results in " << opts.ofile << ".");
+ return 0;
+}
diff --git a/src/projects/hammer/quake_count/quake_count_17.cpp b/src/projects/hammer/quake_count/quake_count_17.cpp
new file mode 100644
index 0000000..2771ea8
--- /dev/null
+++ b/src/projects/hammer/quake_count/quake_count_17.cpp
@@ -0,0 +1,238 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+/**
+ * @file preproc.cpp
+ * @author Alex Davydow
+ * @version 1.0
+ *
+ * @section LICENSE
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of
+ * the License, or (at your option) any later version.
+ *
+ * @section DESCRIPTION
+ *
+ * For each k-mer this program calculates the number of occurrences in
+ * the reads provided. The reads file is supposed to be in fastq
+ * format.
+ */
+#include "standard.hpp"
+#include <stdint.h>
+#include <cstdio>
+#include <cstdlib>
+#include <cassert>
+#include <string>
+#include <set>
+#include <unordered_map>
+#include <vector>
+#include <iostream>
+#include <iomanip>
+#include "io/ireadstream.hpp"
+#include "io/read.hpp"
+#include "data_structures/sequence/seq.hpp"
+#include "kmer_freq_info.hpp"
+#include "valid_kmer_generator.hpp"
+#define SUPPRESS_UNUSED(X) ((void) (X))
+
+using std::string;
+using std::set;
+using std::vector;
+using std::unordered_map;
+using std::map;
+using std::ofstream;
+using std::ifstream;
+
+namespace {
+
+const uint32_t kK = 17;
+typedef Seq<kK> KMer;
+typedef unordered_map<KMer, KMerFreqInfo, KMer::hash> UnorderedMap;
+
+void print_time() {
+ time_t rawtime;
+ tm * ptm;
+ time ( &rawtime );
+ ptm = gmtime( &rawtime );
+ std::cout << std::setfill('0') << "[ " << std::setw(2) << ptm->tm_hour << ":" << std::setw(2) << ptm->tm_min
+ << ":" << std::setw(2) << ptm->tm_sec << " ] ";
+}
+
+#define LOG(a) print_time(); std::cout << a << std::endl
+
+/**
+ * @variable Every kStep k-mer will appear in the log.
+ */
+const int kStep = 1e5;
+
+struct Options {
+ /**
+ * @variable An offset for quality in a fastq file.
+ */
+ uint32_t qvoffset;
+ string ifile;
+ string ofile;
+ uint32_t error_threshold;
+ /**
+ * @variable How many files will be used when splitting k-mers.
+ */
+ uint32_t file_number;
+ bool q_mers;
+ bool valid;
+ Options()
+ : qvoffset(0),
+ ifile(""),
+ ofile(""),
+ error_threshold(0),
+ file_number(3),
+ q_mers(false),
+ valid(true) {}
+};
+
+void PrintHelp(char *program_name) {
+ printf("Usage: %s qvoffset ifile.fastq ofile.[q]cst file_number error_threshold [q]\n",
+ program_name);
+ printf("Where:\n");
+ printf("\tqvoffset\tan offset of fastq quality data\n");
+ printf("\tifile.fastq\tan input file with reads in fastq format\n");
+ printf("\tofile.[q]cst\ta filename where k-mer statistics will be outputted\n");
+ printf("\terror_threshold\tnucliotides with quality lower then threshold will be cut from the ends of reads\n");
+ printf("\tfile_number\thow many files will be used when splitting k-mers\n");
+ printf("\tq\t\tif you want to count q-mers instead of k-mers.\n");
+}
+
+Options ParseOptions(int argc, char *argv[]) {
+ Options ret;
+ if (argc != 6 && argc != 7) {
+ ret.valid = false;
+ } else {
+ ret.qvoffset = atoi(argv[1]);
+ ret.valid &= (ret.qvoffset >= 0 && ret.qvoffset <= 255);
+ ret.ifile = argv[2];
+ ret.ofile = argv[3];
+ ret.error_threshold = atoi(argv[4]);
+ ret.valid &= (ret.error_threshold >= 0 && ret.error_threshold <= 255);
+ ret.file_number = atoi(argv[5]);
+ if (argc == 7) {
+ if (string(argv[6]) == "q") {
+ ret.q_mers = true;
+ } else {
+ ret.valid = false;
+ }
+ }
+ }
+ return ret;
+}
+
+/**
+ * This function reads reads from the stream and splits them into
+ * k-mers. Then the k-mers are distributed across several files almost
+ * uniformly. It is guaranteed that identical k-mers are written to the
+ * same file.
+ * @param ifs Stream to read reads from.
+ * @param ofiles Files to write the resulting k-mers to. They are written
+ * in binary form, one k-mer per record.
+ */
+void SplitToFiles(ireadstream ifs, vector<ofstream *> &ofiles,
+ bool q_mers, uint8_t error_threshold) {
+ uint32_t file_number = ofiles.size();
+ uint64_t read_number = 0;
+ while (!ifs.eof()) {
+ ++read_number;
+ if (read_number % kStep == 0) {
+ LOG("Reading read " << read_number << ".");
+ }
+ Read r;
+ ifs >> r;
+ KMer::hash hash_function;
+ for (ValidKMerGenerator<kK> gen(r, error_threshold); gen.HasMore(); gen.Next()) {
+ KMer kmer = gen.kmer();
+ if (KMer::less2()(!kmer, kmer)) {
+ kmer = !kmer;
+ }
+ ofstream &cur_file = *ofiles[hash_function(kmer) % file_number];
+ KMer::BinWrite(cur_file, kmer);
+ if (q_mers) {
+ double correct_probability = gen.correct_probability();
+ cur_file.write((const char*) &correct_probability, sizeof(correct_probability));
+ }
+ }
+ }
+}
+
+/**
+ * This function reads k-mers and calculates the number of occurrences for
+ * each of them.
+ * @param ifile File with the k-mers to process, as written by SplitToFiles.
+ * @param ofile Output file. For each unique k-mer there will be a
+ * line with the k-mer itself and its number of occurrences.
+ */
+template<typename KMerStatMap>
+void EvalFile(ifstream &ifile, FILE *ofile, bool q_mers) {
+ KMerStatMap stat_map;
+ char buffer[kK + 1];
+ buffer[kK] = 0;
+ KMer kmer;
+ while (KMer::BinRead(ifile, &kmer)) {
+ KMerFreqInfo &info = stat_map[kmer];
+ if (q_mers) {
+ double correct_probability = -1;
+ ifile.read((char *) &correct_probability, sizeof(correct_probability));
+      assert(!ifile.fail()); // the read above must have succeeded
+ info.q_count += correct_probability;
+ } else {
+ info.count += 1;
+ }
+ }
+ for (typename KMerStatMap::iterator it = stat_map.begin();
+ it != stat_map.end(); ++it) {
+ fprintf(ofile, "%s ", it->first.str().c_str());
+ if (q_mers) {
+ fprintf(ofile, "%f\n", it->second.q_count);
+ } else {
+ fprintf(ofile, "%d\n", it->second.count);
+ }
+ }
+}
+}
+
+int main(int argc, char *argv[]) {
+ Options opts = ParseOptions(argc, argv);
+ if (!opts.valid) {
+ PrintHelp(argv[0]);
+ return 1;
+ }
+ // BasicConfigurator::configure();
+ LOG("Starting preproc: evaluating " << opts.ifile << ".");
+ vector<ofstream*> ofiles(opts.file_number);
+ for (uint32_t i = 0; i < opts.file_number; ++i) {
+ char filename[50];
+ snprintf(filename, sizeof(filename), "%u.kmer.part", i);
+ ofiles[i] = new ofstream(filename);
+ assert(!ofiles[i]->fail() && "Too many files to open");
+ }
+ SplitToFiles(ireadstream(opts.ifile, opts.qvoffset),
+ ofiles, opts.q_mers, opts.error_threshold);
+ for (uint32_t i = 0; i < opts.file_number; ++i) {
+ delete ofiles[i];
+ }
+ FILE *ofile = fopen(opts.ofile.c_str(), "w");
+ assert(ofile != NULL && "Too many files to open");
+ for (uint32_t i = 0; i < opts.file_number; ++i) {
+ char ifile_name[50];
+ snprintf(ifile_name, sizeof(ifile_name), "%u.kmer.part", i);
+ ifstream ifile(ifile_name);
+ LOG("Processing " << ifile_name << ".");
+ EvalFile<UnorderedMap>(ifile, ofile, opts.q_mers);
+ LOG("Processed " << ifile_name << ".");
+ }
+ fclose(ofile);
+ LOG("Preprocessing done. You can find results in " << opts.ofile << ".");
+ return 0;
+}
diff --git a/src/projects/hammer/quake_count/quake_count_19.cpp b/src/projects/hammer/quake_count/quake_count_19.cpp
new file mode 100644
index 0000000..8bc22ba
--- /dev/null
+++ b/src/projects/hammer/quake_count/quake_count_19.cpp
@@ -0,0 +1,238 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+/**
+ * @file preproc.cpp
+ * @author Alex Davydow
+ * @version 1.0
+ *
+ * @section LICENSE
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of
+ * the License, or (at your option) any later version.
+ *
+ * @section DESCRIPTION
+ *
+ * For each k-mer this program calculates the number of occurrences in
+ * the reads provided. The reads file is supposed to be in fastq
+ * format.
+ */
+#include "standard.hpp"
+#include <stdint.h>
+#include <cstdio>
+#include <cstdlib>
+#include <cassert>
+#include <string>
+#include <set>
+#include <unordered_map>
+#include <vector>
+#include <iostream>
+#include <iomanip>
+#include "io/ireadstream.hpp"
+#include "io/read.hpp"
+#include "data_structures/sequence/seq.hpp"
+#include "kmer_freq_info.hpp"
+#include "valid_kmer_generator.hpp"
+#define SUPPRESS_UNUSED(X) ((void) (X))
+
+using std::string;
+using std::set;
+using std::vector;
+using std::unordered_map;
+using std::map;
+using std::ofstream;
+using std::ifstream;
+
+namespace {
+
+const uint32_t kK = 19;
+typedef Seq<kK> KMer;
+typedef unordered_map<KMer, KMerFreqInfo, KMer::hash> UnorderedMap;
+
+void print_time() {
+ time_t rawtime;
+ tm * ptm;
+ time ( &rawtime );
+ ptm = gmtime( &rawtime );
+ std::cout << std::setfill('0') << "[ " << std::setw(2) << ptm->tm_hour << ":" << std::setw(2) << ptm->tm_min
+ << ":" << std::setw(2) << ptm->tm_sec << " ] ";
+}
+
+#define LOG(a) print_time(); std::cout << a << std::endl
+
+/**
+ * @variable Every kStep k-mer will appear in the log.
+ */
+const int kStep = 1e5;
+
+struct Options {
+ /**
+ * @variable An offset for quality in a fastq file.
+ */
+ uint32_t qvoffset;
+ string ifile;
+ string ofile;
+ uint32_t error_threshold;
+ /**
+ * @variable How many files will be used when splitting k-mers.
+ */
+ uint32_t file_number;
+ bool q_mers;
+ bool valid;
+ Options()
+ : qvoffset(0),
+ ifile(""),
+ ofile(""),
+ error_threshold(0),
+ file_number(3),
+ q_mers(false),
+ valid(true) {}
+};
+
+void PrintHelp(char *program_name) {
+ printf("Usage: %s qvoffset ifile.fastq ofile.[q]cst file_number error_threshold [q]\n",
+ program_name);
+ printf("Where:\n");
+ printf("\tqvoffset\tan offset of fastq quality data\n");
+ printf("\tifile.fastq\tan input file with reads in fastq format\n");
+ printf("\tofile.[q]cst\ta filename where k-mer statistics will be outputted\n");
+ printf("\terror_threshold\tnucliotides with quality lower then threshold will be cut from the ends of reads\n");
+ printf("\tfile_number\thow many files will be used when splitting k-mers\n");
+ printf("\tq\t\tif you want to count q-mers instead of k-mers.\n");
+}
+
+Options ParseOptions(int argc, char *argv[]) {
+ Options ret;
+ if (argc != 6 && argc != 7) {
+ ret.valid = false;
+ } else {
+ ret.qvoffset = atoi(argv[1]);
+ ret.valid &= (ret.qvoffset >= 0 && ret.qvoffset <= 255);
+ ret.ifile = argv[2];
+ ret.ofile = argv[3];
+ ret.error_threshold = atoi(argv[4]);
+ ret.valid &= (ret.error_threshold >= 0 && ret.error_threshold <= 255);
+ ret.file_number = atoi(argv[5]);
+ if (argc == 7) {
+ if (string(argv[6]) == "q") {
+ ret.q_mers = true;
+ } else {
+ ret.valid = false;
+ }
+ }
+ }
+ return ret;
+}
+
+/**
+ * This function reads reads from the stream and splits them into
+ * k-mers. Then the k-mers are distributed across several files almost
+ * uniformly. It is guaranteed that identical k-mers are written to the
+ * same file.
+ * @param ifs Stream to read reads from.
+ * @param ofiles Files to write the resulting k-mers to. They are written
+ * in binary form, one k-mer per record.
+ */
+void SplitToFiles(ireadstream ifs, vector<ofstream *> &ofiles,
+ bool q_mers, uint8_t error_threshold) {
+ uint32_t file_number = ofiles.size();
+ uint64_t read_number = 0;
+ while (!ifs.eof()) {
+ ++read_number;
+ if (read_number % kStep == 0) {
+ LOG("Reading read " << read_number << ".");
+ }
+ Read r;
+ ifs >> r;
+ KMer::hash hash_function;
+ for (ValidKMerGenerator<kK> gen(r, error_threshold); gen.HasMore(); gen.Next()) {
+ KMer kmer = gen.kmer();
+ if (KMer::less2()(!kmer, kmer)) {
+ kmer = !kmer;
+ }
+ ofstream &cur_file = *ofiles[hash_function(kmer) % file_number];
+ KMer::BinWrite(cur_file, kmer);
+ if (q_mers) {
+ double correct_probability = gen.correct_probability();
+ cur_file.write((const char*) &correct_probability, sizeof(correct_probability));
+ }
+ }
+ }
+}
+
+/**
+ * This function reads k-mers and calculates the number of occurrences for
+ * each of them.
+ * @param ifile File with the k-mers to process, as written by SplitToFiles.
+ * @param ofile Output file. For each unique k-mer there will be a
+ * line with the k-mer itself and its number of occurrences.
+ */
+template<typename KMerStatMap>
+void EvalFile(ifstream &ifile, FILE *ofile, bool q_mers) {
+ KMerStatMap stat_map;
+ char buffer[kK + 1];
+ buffer[kK] = 0;
+ KMer kmer;
+ while (KMer::BinRead(ifile, &kmer)) {
+ KMerFreqInfo &info = stat_map[kmer];
+ if (q_mers) {
+ double correct_probability = -1;
+ ifile.read((char *) &correct_probability, sizeof(correct_probability));
+      assert(!ifile.fail()); // the read above must have succeeded
+ info.q_count += correct_probability;
+ } else {
+ info.count += 1;
+ }
+ }
+ for (typename KMerStatMap::iterator it = stat_map.begin();
+ it != stat_map.end(); ++it) {
+ fprintf(ofile, "%s ", it->first.str().c_str());
+ if (q_mers) {
+ fprintf(ofile, "%f\n", it->second.q_count);
+ } else {
+ fprintf(ofile, "%d\n", it->second.count);
+ }
+ }
+}
+}
+
+int main(int argc, char *argv[]) {
+ Options opts = ParseOptions(argc, argv);
+ if (!opts.valid) {
+ PrintHelp(argv[0]);
+ return 1;
+ }
+ // BasicConfigurator::configure();
+ LOG("Starting preproc: evaluating " << opts.ifile << ".");
+ vector<ofstream*> ofiles(opts.file_number);
+ for (uint32_t i = 0; i < opts.file_number; ++i) {
+ char filename[50];
+ snprintf(filename, sizeof(filename), "%u.kmer.part", i);
+ ofiles[i] = new ofstream(filename);
+ assert(!ofiles[i]->fail() && "Too many files to open");
+ }
+ SplitToFiles(ireadstream(opts.ifile, opts.qvoffset),
+ ofiles, opts.q_mers, opts.error_threshold);
+ for (uint32_t i = 0; i < opts.file_number; ++i) {
+ delete ofiles[i];
+ }
+ FILE *ofile = fopen(opts.ofile.c_str(), "w");
+ assert(ofile != NULL && "Too many files to open");
+ for (uint32_t i = 0; i < opts.file_number; ++i) {
+ char ifile_name[50];
+ snprintf(ifile_name, sizeof(ifile_name), "%u.kmer.part", i);
+ ifstream ifile(ifile_name);
+ LOG("Processing " << ifile_name << ".");
+ EvalFile<UnorderedMap>(ifile, ofile, opts.q_mers);
+ LOG("Processed " << ifile_name << ".");
+ }
+ fclose(ofile);
+ LOG("Preprocessing done. You can find results in " << opts.ofile << ".");
+ return 0;
+}
diff --git a/src/projects/hammer/quake_count/quake_count_21.cpp b/src/projects/hammer/quake_count/quake_count_21.cpp
new file mode 100644
index 0000000..24ed7f2
--- /dev/null
+++ b/src/projects/hammer/quake_count/quake_count_21.cpp
@@ -0,0 +1,238 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+/**
+ * @file quake_count_21.cpp
+ * @author Alex Davydow
+ * @version 1.0
+ *
+ * @section LICENSE
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of
+ * the License, or (at your option) any later version.
+ *
+ * @section DESCRIPTION
+ *
+ * For each k-mer this program calculates the number of its
+ * occurrences in the reads provided. The reads file is expected
+ * to be in fastq format.
+ */
+#include "standard.hpp"
+#include <stdint.h>
+#include <cstdio>
+#include <cstdlib>
+#include <cassert>
+#include <string>
+#include <set>
+#include <unordered_map>
+#include <vector>
+#include <iostream>
+#include <iomanip>
+#include "io/ireadstream.hpp"
+#include "io/read.hpp"
+#include "data_structures/sequence/seq.hpp"
+#include "kmer_freq_info.hpp"
+#include "valid_kmer_generator.hpp"
+#define SUPPRESS_UNUSED(X) ((void) (X))
+
+using std::string;
+using std::set;
+using std::vector;
+using std::unordered_map;
+using std::map;
+using std::ofstream;
+using std::ifstream;
+
+namespace {
+
+const uint32_t kK = 21;
+typedef Seq<kK> KMer;
+typedef unordered_map<KMer, KMerFreqInfo, KMer::hash> UnorderedMap;
+
+void print_time() {
+ time_t rawtime;
+ tm * ptm;
+ time ( &rawtime );
+ ptm = gmtime( &rawtime );
+ std::cout << std::setfill('0') << "[ " << std::setw(2) << ptm->tm_hour << ":" << std::setw(2) << ptm->tm_min
+ << ":" << std::setw(2) << ptm->tm_sec << " ] ";
+}
+
+#define LOG(a) print_time(); std::cout << a << std::endl
+
+/**
+ * @variable Every kStep-th read is reported in the log.
+ */
+const int kStep = 1e5;
+
+struct Options {
+ /**
+ * @variable An offset for quality in a fastq file.
+ */
+ uint32_t qvoffset;
+ string ifile;
+ string ofile;
+ uint32_t error_threshold;
+ /**
+ * @variable How many files will be used when splitting k-mers.
+ */
+ uint32_t file_number;
+ bool q_mers;
+ bool valid;
+ Options()
+ : qvoffset(0),
+ ifile(""),
+ ofile(""),
+ error_threshold(0),
+ file_number(3),
+ q_mers(false),
+ valid(true) {}
+};
+
+void PrintHelp(char *program_name) {
+ printf("Usage: %s qvoffset ifile.fastq ofile.[q]cst error_threshold file_number [q]\n",
+ program_name);
+ printf("Where:\n");
+ printf("\tqvoffset\tan offset of fastq quality data\n");
+ printf("\tifile.fastq\tan input file with reads in fastq format\n");
+ printf("\tofile.[q]cst\ta filename where k-mer statistics will be written\n");
+ printf("\terror_threshold\tnucleotides with quality lower than the threshold will be cut from the ends of reads\n");
+ printf("\tfile_number\thow many files will be used when splitting k-mers\n");
+ printf("\tq\t\tif you want to count q-mers instead of k-mers.\n");
+}
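+
+// Illustrative invocation only (not part of the original sources); the binary
+// and file names are placeholders. The argument order follows ParseOptions below:
+//   ./quake_count_21 33 reads.fastq kmers.qcst 2 16 q
+// i.e. quality offset 33, trim read ends below quality 2, split k-mers across
+// 16 temporary files, and accumulate q-mer weights instead of plain counts.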
+
+Options ParseOptions(int argc, char *argv[]) {
+ Options ret;
+ if (argc != 6 && argc != 7) {
+ ret.valid = false;
+ } else {
+ ret.qvoffset = atoi(argv[1]);
+ ret.valid &= (ret.qvoffset >= 0 && ret.qvoffset <= 255);
+ ret.ifile = argv[2];
+ ret.ofile = argv[3];
+ ret.error_threshold = atoi(argv[4]);
+ ret.valid &= (ret.error_threshold >= 0 && ret.error_threshold <= 255);
+ ret.file_number = atoi(argv[5]);
+ if (argc == 7) {
+ if (string(argv[6]) == "q") {
+ ret.q_mers = true;
+ } else {
+ ret.valid = false;
+ }
+ }
+ }
+ return ret;
+}
+
+/**
+ * This function reads the reads from the stream and splits them into
+ * k-mers. The k-mers are then distributed almost uniformly over
+ * several files; it is guaranteed that identical k-mers always go to
+ * the same file.
+ * @param ifs Stream to read the reads from.
+ * @param ofiles Files to write the resulting k-mers to, one record
+ * per k-mer.
+ */
+void SplitToFiles(ireadstream ifs, vector<ofstream *> &ofiles,
+ bool q_mers, uint8_t error_threshold) {
+ uint32_t file_number = ofiles.size();
+ uint64_t read_number = 0;
+ while (!ifs.eof()) {
+ ++read_number;
+ if (read_number % kStep == 0) {
+ LOG("Reading read " << read_number << ".");
+ }
+ Read r;
+ ifs >> r;
+ KMer::hash hash_function;
+ for (ValidKMerGenerator<kK> gen(r, error_threshold); gen.HasMore(); gen.Next()) {
+ KMer kmer = gen.kmer();
+ if (KMer::less2()(!kmer, kmer)) {
+ kmer = !kmer;
+ }
+ ofstream &cur_file = *ofiles[hash_function(kmer) % file_number];
+ KMer::BinWrite(cur_file, kmer);
+ if (q_mers) {
+ double correct_probability = gen.correct_probability();
+ cur_file.write((const char*) &correct_probability, sizeof(correct_probability));
+ }
+ }
+ }
+}
+
+/**
+ * This function reads k-mers and calculates the number of occurrences
+ * of each of them.
+ * @param ifile File with the k-mers to process, one record per k-mer.
+ * @param ofile Output file. For each unique k-mer it will contain a
+ * line with the k-mer itself and the number of its occurrences.
+ */
+template<typename KMerStatMap>
+void EvalFile(ifstream &ifile, FILE *ofile, bool q_mers) {
+ KMerStatMap stat_map;
+ char buffer[kK + 1];
+ buffer[kK] = 0;
+ KMer kmer;
+ while (KMer::BinRead(ifile, &kmer)) {
+ KMerFreqInfo &info = stat_map[kmer];
+ if (q_mers) {
+ double correct_probability = -1;
+ ifile.read((char *) &correct_probability, sizeof(correct_probability));
+ assert(!ifile.fail()); // the read above must have succeeded
+ info.q_count += correct_probability;
+ } else {
+ info.count += 1;
+ }
+ }
+ for (typename KMerStatMap::iterator it = stat_map.begin();
+ it != stat_map.end(); ++it) {
+ fprintf(ofile, "%s ", it->first.str().c_str());
+ if (q_mers) {
+ fprintf(ofile, "%f\n", it->second.q_count);
+ } else {
+ fprintf(ofile, "%d\n", it->second.count);
+ }
+ }
+}
+}
+
+int main(int argc, char *argv[]) {
+ Options opts = ParseOptions(argc, argv);
+ if (!opts.valid) {
+ PrintHelp(argv[0]);
+ return 1;
+ }
+ // BasicConfigurator::configure();
+ LOG("Starting preproc: evaluating " << opts.ifile << ".");
+ vector<ofstream*> ofiles(opts.file_number);
+ for (uint32_t i = 0; i < opts.file_number; ++i) {
+ char filename[50];
+ snprintf(filename, sizeof(filename), "%u.kmer.part", i);
+ ofiles[i] = new ofstream(filename);
+ assert(!ofiles[i]->fail() && "Too many files to open");
+ }
+ SplitToFiles(ireadstream(opts.ifile, opts.qvoffset),
+ ofiles, opts.q_mers, opts.error_threshold);
+ for (uint32_t i = 0; i < opts.file_number; ++i) {
+ delete ofiles[i];
+ }
+ FILE *ofile = fopen(opts.ofile.c_str(), "w");
+ assert(ofile != NULL && "Too many files to open");
+ for (uint32_t i = 0; i < opts.file_number; ++i) {
+ char ifile_name[50];
+ snprintf(ifile_name, sizeof(ifile_name), "%u.kmer.part", i);
+ ifstream ifile(ifile_name);
+ LOG("Processing " << ifile_name << ".");
+ EvalFile<UnorderedMap>(ifile, ofile, opts.q_mers);
+ LOG("Processed " << ifile_name << ".");
+ }
+ fclose(ofile);
+ LOG("Preprocessing done. You can find results in " << opts.ofile << ".");
+ return 0;
+}
diff --git a/src/projects/hammer/quake_count/quake_count_25.cpp b/src/projects/hammer/quake_count/quake_count_25.cpp
new file mode 100644
index 0000000..2160242
--- /dev/null
+++ b/src/projects/hammer/quake_count/quake_count_25.cpp
@@ -0,0 +1,238 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+/**
+ * @file quake_count_25.cpp
+ * @author Alex Davydow
+ * @version 1.0
+ *
+ * @section LICENSE
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of
+ * the License, or (at your option) any later version.
+ *
+ * @section DESCRIPTION
+ *
+ * For each k-mer this program calculates the number of its
+ * occurrences in the reads provided. The reads file is expected
+ * to be in fastq format.
+ */
+#include "standard.hpp"
+#include <stdint.h>
+#include <cstdio>
+#include <cstdlib>
+#include <cassert>
+#include <string>
+#include <set>
+#include <unordered_map>
+#include <vector>
+#include <iostream>
+#include <iomanip>
+#include "io/ireadstream.hpp"
+#include "io/read.hpp"
+#include "data_structures/sequence/seq.hpp"
+#include "kmer_freq_info.hpp"
+#include "valid_kmer_generator.hpp"
+#define SUPPRESS_UNUSED(X) ((void) (X))
+
+using std::string;
+using std::set;
+using std::vector;
+using std::unordered_map;
+using std::map;
+using std::ofstream;
+using std::ifstream;
+
+namespace {
+
+const uint32_t kK = 25;
+typedef Seq<kK> KMer;
+typedef unordered_map<KMer, KMerFreqInfo, KMer::hash> UnorderedMap;
+
+void print_time() {
+ time_t rawtime;
+ tm * ptm;
+ time ( &rawtime );
+ ptm = gmtime( &rawtime );
+ std::cout << std::setfill('0') << "[ " << std::setw(2) << ptm->tm_hour << ":" << std::setw(2) << ptm->tm_min
+ << ":" << std::setw(2) << ptm->tm_sec << " ] ";
+}
+
+#define LOG(a) print_time(); std::cout << a << std::endl
+
+/**
+ * @variable Every kStep-th read is reported in the log.
+ */
+const int kStep = 1e5;
+
+struct Options {
+ /**
+ * @variable An offset for quality in a fastq file.
+ */
+ uint32_t qvoffset;
+ string ifile;
+ string ofile;
+ uint32_t error_threshold;
+ /**
+ * @variable How many files will be used when splitting k-mers.
+ */
+ uint32_t file_number;
+ bool q_mers;
+ bool valid;
+ Options()
+ : qvoffset(0),
+ ifile(""),
+ ofile(""),
+ error_threshold(0),
+ file_number(3),
+ q_mers(false),
+ valid(true) {}
+};
+
+void PrintHelp(char *program_name) {
+ printf("Usage: %s qvoffset ifile.fastq ofile.[q]cst error_threshold file_number [q]\n",
+ program_name);
+ printf("Where:\n");
+ printf("\tqvoffset\tan offset of fastq quality data\n");
+ printf("\tifile.fastq\tan input file with reads in fastq format\n");
+ printf("\tofile.[q]cst\ta filename where k-mer statistics will be written\n");
+ printf("\terror_threshold\tnucleotides with quality lower than the threshold will be cut from the ends of reads\n");
+ printf("\tfile_number\thow many files will be used when splitting k-mers\n");
+ printf("\tq\t\tif you want to count q-mers instead of k-mers.\n");
+}
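+
+// Illustrative invocation only (not part of the original sources); the binary
+// and file names are placeholders. The argument order follows ParseOptions below:
+//   ./quake_count_25 33 reads.fastq kmers.qcst 2 16 q
+// i.e. quality offset 33, trim read ends below quality 2, split k-mers across
+// 16 temporary files, and accumulate q-mer weights instead of plain counts.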
+
+Options ParseOptions(int argc, char *argv[]) {
+ Options ret;
+ if (argc != 6 && argc != 7) {
+ ret.valid = false;
+ } else {
+ ret.qvoffset = atoi(argv[1]);
+ ret.valid &= (ret.qvoffset >= 0 && ret.qvoffset <= 255);
+ ret.ifile = argv[2];
+ ret.ofile = argv[3];
+ ret.error_threshold = atoi(argv[4]);
+ ret.valid &= (ret.error_threshold >= 0 && ret.error_threshold <= 255);
+ ret.file_number = atoi(argv[5]);
+ if (argc == 7) {
+ if (string(argv[6]) == "q") {
+ ret.q_mers = true;
+ } else {
+ ret.valid = false;
+ }
+ }
+ }
+ return ret;
+}
+
+/**
+ * This function reads the reads from the stream and splits them into
+ * k-mers. The k-mers are then distributed almost uniformly over
+ * several files; it is guaranteed that identical k-mers always go to
+ * the same file.
+ * @param ifs Stream to read the reads from.
+ * @param ofiles Files to write the resulting k-mers to, one record
+ * per k-mer.
+ */
+void SplitToFiles(ireadstream ifs, vector<ofstream *> &ofiles,
+ bool q_mers, uint8_t error_threshold) {
+ uint32_t file_number = ofiles.size();
+ uint64_t read_number = 0;
+ while (!ifs.eof()) {
+ ++read_number;
+ if (read_number % kStep == 0) {
+ LOG("Reading read " << read_number << ".");
+ }
+ Read r;
+ ifs >> r;
+ KMer::hash hash_function;
+ for (ValidKMerGenerator<kK> gen(r, error_threshold); gen.HasMore(); gen.Next()) {
+ KMer kmer = gen.kmer();
+ if (KMer::less2()(!kmer, kmer)) {
+ kmer = !kmer;
+ }
+ ofstream &cur_file = *ofiles[hash_function(kmer) % file_number];
+ KMer::BinWrite(cur_file, kmer);
+ if (q_mers) {
+ double correct_probability = gen.correct_probability();
+ cur_file.write((const char*) &correct_probability, sizeof(correct_probability));
+ }
+ }
+ }
+}
+
+/**
+ * This function reads k-mers and calculates the number of occurrences
+ * of each of them.
+ * @param ifile File with the k-mers to process, one record per k-mer.
+ * @param ofile Output file. For each unique k-mer it will contain a
+ * line with the k-mer itself and the number of its occurrences.
+ */
+template<typename KMerStatMap>
+void EvalFile(ifstream &ifile, FILE *ofile, bool q_mers) {
+ KMerStatMap stat_map;
+ char buffer[kK + 1];
+ buffer[kK] = 0;
+ KMer kmer;
+ while (KMer::BinRead(ifile, &kmer)) {
+ KMerFreqInfo &info = stat_map[kmer];
+ if (q_mers) {
+ double correct_probability = -1;
+ ifile.read((char *) &correct_probability, sizeof(correct_probability));
+ assert(!ifile.fail()); // the read above must have succeeded
+ info.q_count += correct_probability;
+ } else {
+ info.count += 1;
+ }
+ }
+ for (typename KMerStatMap::iterator it = stat_map.begin();
+ it != stat_map.end(); ++it) {
+ fprintf(ofile, "%s ", it->first.str().c_str());
+ if (q_mers) {
+ fprintf(ofile, "%f\n", it->second.q_count);
+ } else {
+ fprintf(ofile, "%d\n", it->second.count);
+ }
+ }
+}
+}
+
+int main(int argc, char *argv[]) {
+ Options opts = ParseOptions(argc, argv);
+ if (!opts.valid) {
+ PrintHelp(argv[0]);
+ return 1;
+ }
+ // BasicConfigurator::configure();
+ LOG("Starting preproc: evaluating " << opts.ifile << ".");
+ vector<ofstream*> ofiles(opts.file_number);
+ for (uint32_t i = 0; i < opts.file_number; ++i) {
+ char filename[50];
+ snprintf(filename, sizeof(filename), "%u.kmer.part", i);
+ ofiles[i] = new ofstream(filename);
+ assert(!ofiles[i]->fail() && "Too many files to open");
+ }
+ SplitToFiles(ireadstream(opts.ifile, opts.qvoffset),
+ ofiles, opts.q_mers, opts.error_threshold);
+ for (uint32_t i = 0; i < opts.file_number; ++i) {
+ delete ofiles[i];
+ }
+ FILE *ofile = fopen(opts.ofile.c_str(), "w");
+ assert(ofile != NULL && "Too many files to open");
+ for (uint32_t i = 0; i < opts.file_number; ++i) {
+ char ifile_name[50];
+ snprintf(ifile_name, sizeof(ifile_name), "%u.kmer.part", i);
+ ifstream ifile(ifile_name);
+ LOG("Processing " << ifile_name << ".");
+ EvalFile<UnorderedMap>(ifile, ofile, opts.q_mers);
+ LOG("Processed " << ifile_name << ".");
+ }
+ fclose(ofile);
+ LOG("Preprocessing done. You can find results in " << opts.ofile << ".");
+ return 0;
+}
diff --git a/src/projects/hammer/quake_count/quake_count_29.cpp b/src/projects/hammer/quake_count/quake_count_29.cpp
new file mode 100644
index 0000000..cdbd7cd
--- /dev/null
+++ b/src/projects/hammer/quake_count/quake_count_29.cpp
@@ -0,0 +1,238 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+/**
+ * @file quake_count_29.cpp
+ * @author Alex Davydow
+ * @version 1.0
+ *
+ * @section LICENSE
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of
+ * the License, or (at your option) any later version.
+ *
+ * @section DESCRIPTION
+ *
+ * For each k-mer this program calculates the number of its
+ * occurrences in the reads provided. The reads file is expected
+ * to be in fastq format.
+ */
+#include "standard.hpp"
+#include <stdint.h>
+#include <cstdio>
+#include <cstdlib>
+#include <cassert>
+#include <string>
+#include <set>
+#include <unordered_map>
+#include <vector>
+#include <iostream>
+#include <iomanip>
+#include "io/ireadstream.hpp"
+#include "io/read.hpp"
+#include "data_structures/sequence/seq.hpp"
+#include "kmer_freq_info.hpp"
+#include "valid_kmer_generator.hpp"
+#define SUPPRESS_UNUSED(X) ((void) (X))
+
+using std::string;
+using std::set;
+using std::vector;
+using std::unordered_map;
+using std::map;
+using std::ofstream;
+using std::ifstream;
+
+namespace {
+
+const uint32_t kK = 29;
+typedef Seq<kK> KMer;
+typedef unordered_map<KMer, KMerFreqInfo, KMer::hash> UnorderedMap;
+
+void print_time() {
+ time_t rawtime;
+ tm * ptm;
+ time ( &rawtime );
+ ptm = gmtime( &rawtime );
+ std::cout << std::setfill('0') << "[ " << std::setw(2) << ptm->tm_hour << ":" << std::setw(2) << ptm->tm_min
+ << ":" << std::setw(2) << ptm->tm_sec << " ] ";
+}
+
+#define LOG(a) print_time(); std::cout << a << std::endl
+
+/**
+ * @variable Every kStep-th read is reported in the log.
+ */
+const int kStep = 1e5;
+
+struct Options {
+ /**
+ * @variable An offset for quality in a fastq file.
+ */
+ uint32_t qvoffset;
+ string ifile;
+ string ofile;
+ uint32_t error_threshold;
+ /**
+ * @variable How many files will be used when splitting k-mers.
+ */
+ uint32_t file_number;
+ bool q_mers;
+ bool valid;
+ Options()
+ : qvoffset(0),
+ ifile(""),
+ ofile(""),
+ error_threshold(0),
+ file_number(3),
+ q_mers(false),
+ valid(true) {}
+};
+
+void PrintHelp(char *program_name) {
+ printf("Usage: %s qvoffset ifile.fastq ofile.[q]cst error_threshold file_number [q]\n",
+ program_name);
+ printf("Where:\n");
+ printf("\tqvoffset\tan offset of fastq quality data\n");
+ printf("\tifile.fastq\tan input file with reads in fastq format\n");
+ printf("\tofile.[q]cst\ta filename where k-mer statistics will be written\n");
+ printf("\terror_threshold\tnucleotides with quality lower than the threshold will be cut from the ends of reads\n");
+ printf("\tfile_number\thow many files will be used when splitting k-mers\n");
+ printf("\tq\t\tif you want to count q-mers instead of k-mers.\n");
+}
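+
+// Illustrative invocation only (not part of the original sources); the binary
+// and file names are placeholders. The argument order follows ParseOptions below:
+//   ./quake_count_29 33 reads.fastq kmers.qcst 2 16 q
+// i.e. quality offset 33, trim read ends below quality 2, split k-mers across
+// 16 temporary files, and accumulate q-mer weights instead of plain counts.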
+
+Options ParseOptions(int argc, char *argv[]) {
+ Options ret;
+ if (argc != 6 && argc != 7) {
+ ret.valid = false;
+ } else {
+ ret.qvoffset = atoi(argv[1]);
+ ret.valid &= (ret.qvoffset >= 0 && ret.qvoffset <= 255);
+ ret.ifile = argv[2];
+ ret.ofile = argv[3];
+ ret.error_threshold = atoi(argv[4]);
+ ret.valid &= (ret.error_threshold >= 0 && ret.error_threshold <= 255);
+ ret.file_number = atoi(argv[5]);
+ if (argc == 7) {
+ if (string(argv[6]) == "q") {
+ ret.q_mers = true;
+ } else {
+ ret.valid = false;
+ }
+ }
+ }
+ return ret;
+}
+
+/**
+ * This function reads the reads from the stream and splits them into
+ * k-mers. The k-mers are then distributed almost uniformly over
+ * several files; it is guaranteed that identical k-mers always go to
+ * the same file.
+ * @param ifs Stream to read the reads from.
+ * @param ofiles Files to write the resulting k-mers to, one record
+ * per k-mer.
+ */
+void SplitToFiles(ireadstream ifs, vector<ofstream *> &ofiles,
+ bool q_mers, uint8_t error_threshold) {
+ uint32_t file_number = ofiles.size();
+ uint64_t read_number = 0;
+ while (!ifs.eof()) {
+ ++read_number;
+ if (read_number % kStep == 0) {
+ LOG("Reading read " << read_number << ".");
+ }
+ Read r;
+ ifs >> r;
+ KMer::hash hash_function;
+ for (ValidKMerGenerator<kK> gen(r, error_threshold); gen.HasMore(); gen.Next()) {
+ KMer kmer = gen.kmer();
+ if (KMer::less2()(!kmer, kmer)) {
+ kmer = !kmer;
+ }
+ ofstream &cur_file = *ofiles[hash_function(kmer) % file_number];
+ KMer::BinWrite(cur_file, kmer);
+ if (q_mers) {
+ double correct_probability = gen.correct_probability();
+ cur_file.write((const char*) &correct_probability, sizeof(correct_probability));
+ }
+ }
+ }
+}
+
+/**
+ * This function reads k-mers and calculates the number of occurrences
+ * of each of them.
+ * @param ifile File with the k-mers to process, one record per k-mer.
+ * @param ofile Output file. For each unique k-mer it will contain a
+ * line with the k-mer itself and the number of its occurrences.
+ */
+template<typename KMerStatMap>
+void EvalFile(ifstream &ifile, FILE *ofile, bool q_mers) {
+ KMerStatMap stat_map;
+ char buffer[kK + 1];
+ buffer[kK] = 0;
+ KMer kmer;
+ while (KMer::BinRead(ifile, &kmer)) {
+ KMerFreqInfo &info = stat_map[kmer];
+ if (q_mers) {
+ double correct_probability = -1;
+ ifile.read((char *) &correct_probability, sizeof(correct_probability));
+ assert(!ifile.fail()); // the read above must have succeeded
+ info.q_count += correct_probability;
+ } else {
+ info.count += 1;
+ }
+ }
+ for (typename KMerStatMap::iterator it = stat_map.begin();
+ it != stat_map.end(); ++it) {
+ fprintf(ofile, "%s ", it->first.str().c_str());
+ if (q_mers) {
+ fprintf(ofile, "%f\n", it->second.q_count);
+ } else {
+ fprintf(ofile, "%d\n", it->second.count);
+ }
+ }
+}
+}
+
+int main(int argc, char *argv[]) {
+ Options opts = ParseOptions(argc, argv);
+ if (!opts.valid) {
+ PrintHelp(argv[0]);
+ return 1;
+ }
+ // BasicConfigurator::configure();
+ LOG("Starting preproc: evaluating " << opts.ifile << ".");
+ vector<ofstream*> ofiles(opts.file_number);
+ for (uint32_t i = 0; i < opts.file_number; ++i) {
+ char filename[50];
+ snprintf(filename, sizeof(filename), "%u.kmer.part", i);
+ ofiles[i] = new ofstream(filename);
+ assert(!ofiles[i]->fail() && "Too many files to open");
+ }
+ SplitToFiles(ireadstream(opts.ifile, opts.qvoffset),
+ ofiles, opts.q_mers, opts.error_threshold);
+ for (uint32_t i = 0; i < opts.file_number; ++i) {
+ delete ofiles[i];
+ }
+ FILE *ofile = fopen(opts.ofile.c_str(), "w");
+ assert(ofile != NULL && "Too many files to open");
+ for (uint32_t i = 0; i < opts.file_number; ++i) {
+ char ifile_name[50];
+ snprintf(ifile_name, sizeof(ifile_name), "%u.kmer.part", i);
+ ifstream ifile(ifile_name);
+ LOG("Processing " << ifile_name << ".");
+ EvalFile<UnorderedMap>(ifile, ofile, opts.q_mers);
+ LOG("Processed " << ifile_name << ".");
+ }
+ fclose(ofile);
+ LOG("Preprocessing done. You can find results in " << opts.ofile << ".");
+ return 0;
+}
diff --git a/src/projects/hammer/quake_count/quake_count_33.cpp b/src/projects/hammer/quake_count/quake_count_33.cpp
new file mode 100644
index 0000000..7e8cde1
--- /dev/null
+++ b/src/projects/hammer/quake_count/quake_count_33.cpp
@@ -0,0 +1,239 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+/**
+ * @file quake_count_33.cpp
+ * @author Alex Davydow
+ * @version 1.0
+ *
+ * @section LICENSE
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of
+ * the License, or (at your option) any later version.
+ *
+ * @section DESCRIPTION
+ *
+ * For each k-mer this program calculates the number of its
+ * occurrences in the reads provided. The reads file is expected
+ * to be in fastq format.
+ */
+#include "standard.hpp"
+#include <stdint.h>
+#include <cstdio>
+#include <cstdlib>
+#include <cassert>
+#include <string>
+#include <set>
+#include <unordered_map>
+#include <vector>
+#include <iostream>
+#include <iomanip>
+#include "io/ireadstream.hpp"
+#include "io/read.hpp"
+#include "data_structures/sequence/seq.hpp"
+#include "kmer_freq_info.hpp"
+#include "valid_kmer_generator.hpp"
+
+#define SUPPRESS_UNUSED(X) ((void) (X))
+
+using std::string;
+using std::set;
+using std::vector;
+using std::unordered_map;
+using std::map;
+using std::ofstream;
+using std::ifstream;
+
+namespace {
+
+const uint32_t kK = 33;
+typedef Seq<kK> KMer;
+typedef unordered_map<KMer, KMerFreqInfo, KMer::hash> UnorderedMap;
+
+void print_time() {
+ time_t rawtime;
+ tm * ptm;
+ time ( &rawtime );
+ ptm = gmtime( &rawtime );
+ std::cout << std::setfill('0') << "[ " << std::setw(2) << ptm->tm_hour << ":" << std::setw(2) << ptm->tm_min
+ << ":" << std::setw(2) << ptm->tm_sec << " ] ";
+}
+
+#define LOG(a) print_time(); std::cout << a << std::endl
+
+/**
+ * @variable Every kStep-th read is reported in the log.
+ */
+const int kStep = 1e5;
+
+struct Options {
+ /**
+ * @variable An offset for quality in a fastq file.
+ */
+ uint32_t qvoffset;
+ string ifile;
+ string ofile;
+ uint32_t error_threshold;
+ /**
+ * @variable How many files will be used when splitting k-mers.
+ */
+ uint32_t file_number;
+ bool q_mers;
+ bool valid;
+ Options()
+ : qvoffset(0),
+ ifile(""),
+ ofile(""),
+ error_threshold(0),
+ file_number(3),
+ q_mers(false),
+ valid(true) {}
+};
+
+void PrintHelp(char *program_name) {
+ printf("Usage: %s qvoffset ifile.fastq ofile.[q]cst error_threshold file_number [q]\n",
+ program_name);
+ printf("Where:\n");
+ printf("\tqvoffset\tan offset of fastq quality data\n");
+ printf("\tifile.fastq\tan input file with reads in fastq format\n");
+ printf("\tofile.[q]cst\ta filename where k-mer statistics will be written\n");
+ printf("\terror_threshold\tnucleotides with quality lower than the threshold will be cut from the ends of reads\n");
+ printf("\tfile_number\thow many files will be used when splitting k-mers\n");
+ printf("\tq\t\tif you want to count q-mers instead of k-mers.\n");
+}
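+
+// Illustrative invocation only (not part of the original sources); the binary
+// and file names are placeholders. The argument order follows ParseOptions below:
+//   ./quake_count_33 33 reads.fastq kmers.qcst 2 16 q
+// i.e. quality offset 33, trim read ends below quality 2, split k-mers across
+// 16 temporary files, and accumulate q-mer weights instead of plain counts.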
+
+Options ParseOptions(int argc, char *argv[]) {
+ Options ret;
+ if (argc != 6 && argc != 7) {
+ ret.valid = false;
+ } else {
+ ret.qvoffset = atoi(argv[1]);
+ ret.valid &= (ret.qvoffset >= 0 && ret.qvoffset <= 255);
+ ret.ifile = argv[2];
+ ret.ofile = argv[3];
+ ret.error_threshold = atoi(argv[4]);
+ ret.valid &= (ret.error_threshold >= 0 && ret.error_threshold <= 255);
+ ret.file_number = atoi(argv[5]);
+ if (argc == 7) {
+ if (string(argv[6]) == "q") {
+ ret.q_mers = true;
+ } else {
+ ret.valid = false;
+ }
+ }
+ }
+ return ret;
+}
+
+/**
+ * This function reads the reads from the stream and splits them into
+ * k-mers. The k-mers are then distributed almost uniformly over
+ * several files; it is guaranteed that identical k-mers always go to
+ * the same file.
+ * @param ifs Stream to read the reads from.
+ * @param ofiles Files to write the resulting k-mers to, one record
+ * per k-mer.
+ */
+void SplitToFiles(ireadstream ifs, vector<ofstream *> &ofiles,
+ bool q_mers, uint8_t error_threshold) {
+ uint32_t file_number = ofiles.size();
+ uint64_t read_number = 0;
+ while (!ifs.eof()) {
+ ++read_number;
+ if (read_number % kStep == 0) {
+ LOG("Reading read " << read_number << ".");
+ }
+ Read r;
+ ifs >> r;
+ KMer::hash hash_function;
+ for (ValidKMerGenerator<kK> gen(r, error_threshold); gen.HasMore(); gen.Next()) {
+ KMer kmer = gen.kmer();
+ if (KMer::less2()(!kmer, kmer)) {
+ kmer = !kmer;
+ }
+ ofstream &cur_file = *ofiles[hash_function(kmer) % file_number];
+ KMer::BinWrite(cur_file, kmer);
+ if (q_mers) {
+ double correct_probability = gen.correct_probability();
+ cur_file.write((const char*) &correct_probability, sizeof(correct_probability));
+ }
+ }
+ }
+}
+
+/**
+ * This function reads k-mers and calculates the number of occurrences
+ * of each of them.
+ * @param ifile File with the k-mers to process, one record per k-mer.
+ * @param ofile Output file. For each unique k-mer it will contain a
+ * line with the k-mer itself and the number of its occurrences.
+ */
+template<typename KMerStatMap>
+void EvalFile(ifstream &ifile, FILE *ofile, bool q_mers) {
+ KMerStatMap stat_map;
+ char buffer[kK + 1];
+ buffer[kK] = 0;
+ KMer kmer;
+ while (KMer::BinRead(ifile, &kmer)) {
+ KMerFreqInfo &info = stat_map[kmer];
+ if (q_mers) {
+ double correct_probability = -1;
+ ifile.read((char *) &correct_probability, sizeof(correct_probability));
+ assert(!ifile.fail()); // the read above must have succeeded
+ info.q_count += correct_probability;
+ } else {
+ info.count += 1;
+ }
+ }
+ for (typename KMerStatMap::iterator it = stat_map.begin();
+ it != stat_map.end(); ++it) {
+ fprintf(ofile, "%s ", it->first.str().c_str());
+ if (q_mers) {
+ fprintf(ofile, "%f\n", it->second.q_count);
+ } else {
+ fprintf(ofile, "%d\n", it->second.count);
+ }
+ }
+}
+}
+
+int main(int argc, char *argv[]) {
+ Options opts = ParseOptions(argc, argv);
+ if (!opts.valid) {
+ PrintHelp(argv[0]);
+ return 1;
+ }
+ // BasicConfigurator::configure();
+ LOG("Starting preproc: evaluating " << opts.ifile << ".");
+ vector<ofstream*> ofiles(opts.file_number);
+ for (uint32_t i = 0; i < opts.file_number; ++i) {
+ char filename[50];
+ snprintf(filename, sizeof(filename), "%u.kmer.part", i);
+ ofiles[i] = new ofstream(filename);
+ assert(!ofiles[i]->fail() && "Too many files to open");
+ }
+ SplitToFiles(ireadstream(opts.ifile, opts.qvoffset),
+ ofiles, opts.q_mers, opts.error_threshold);
+ for (uint32_t i = 0; i < opts.file_number; ++i) {
+ delete ofiles[i];
+ }
+ FILE *ofile = fopen(opts.ofile.c_str(), "w");
+ assert(ofile != NULL && "Too many files to open");
+ for (uint32_t i = 0; i < opts.file_number; ++i) {
+ char ifile_name[50];
+ snprintf(ifile_name, sizeof(ifile_name), "%u.kmer.part", i);
+ ifstream ifile(ifile_name);
+ LOG("Processing " << ifile_name << ".");
+ EvalFile<UnorderedMap>(ifile, ofile, opts.q_mers);
+ LOG("Processed " << ifile_name << ".");
+ }
+ fclose(ofile);
+ LOG("Preprocessing done. You can find results in " << opts.ofile << ".");
+ return 0;
+}
diff --git a/src/projects/hammer/quake_count/quake_count_37.cpp b/src/projects/hammer/quake_count/quake_count_37.cpp
new file mode 100644
index 0000000..2780c3e
--- /dev/null
+++ b/src/projects/hammer/quake_count/quake_count_37.cpp
@@ -0,0 +1,238 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+/**
+ * @file quake_count_37.cpp
+ * @author Alex Davydow
+ * @version 1.0
+ *
+ * @section LICENSE
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of
+ * the License, or (at your option) any later version.
+ *
+ * @section DESCRIPTION
+ *
+ * For each k-mer this program calculates the number of its
+ * occurrences in the reads provided. The reads file is expected
+ * to be in fastq format.
+ */
+#include "standard.hpp"
+#include <stdint.h>
+#include <cstdio>
+#include <cstdlib>
+#include <cassert>
+#include <string>
+#include <set>
+#include <unordered_map>
+#include <vector>
+#include <iostream>
+#include <iomanip>
+#include "io/ireadstream.hpp"
+#include "io/read.hpp"
+#include "data_structures/sequence/seq.hpp"
+#include "kmer_freq_info.hpp"
+#include "valid_kmer_generator.hpp"
+#define SUPPRESS_UNUSED(X) ((void) (X))
+
+using std::string;
+using std::set;
+using std::vector;
+using std::unordered_map;
+using std::map;
+using std::ofstream;
+using std::ifstream;
+
+namespace {
+
+const uint32_t kK = 37;
+typedef Seq<kK> KMer;
+typedef unordered_map<KMer, KMerFreqInfo, KMer::hash> UnorderedMap;
+
+void print_time() {
+ time_t rawtime;
+ tm * ptm;
+ time ( &rawtime );
+ ptm = gmtime( &rawtime );
+ std::cout << std::setfill('0') << "[ " << std::setw(2) << ptm->tm_hour << ":" << std::setw(2) << ptm->tm_min
+ << ":" << std::setw(2) << ptm->tm_sec << " ] ";
+}
+
+#define LOG(a) print_time(); std::cout << a << std::endl
+
+/**
+ * @variable Every kStep-th read is reported in the log.
+ */
+const int kStep = 1e5;
+
+struct Options {
+ /**
+ * @variable An offset for quality in a fastq file.
+ */
+ uint32_t qvoffset;
+ string ifile;
+ string ofile;
+ uint32_t error_threshold;
+ /**
+ * @variable How many files will be used when splitting k-mers.
+ */
+ uint32_t file_number;
+ bool q_mers;
+ bool valid;
+ Options()
+ : qvoffset(0),
+ ifile(""),
+ ofile(""),
+ error_threshold(0),
+ file_number(3),
+ q_mers(false),
+ valid(true) {}
+};
+
+void PrintHelp(char *program_name) {
+ printf("Usage: %s qvoffset ifile.fastq ofile.[q]cst error_threshold file_number [q]\n",
+ program_name);
+ printf("Where:\n");
+ printf("\tqvoffset\tan offset of fastq quality data\n");
+ printf("\tifile.fastq\tan input file with reads in fastq format\n");
+ printf("\tofile.[q]cst\ta filename where k-mer statistics will be written\n");
+ printf("\terror_threshold\tnucleotides with quality lower than the threshold will be cut from the ends of reads\n");
+ printf("\tfile_number\thow many files will be used when splitting k-mers\n");
+ printf("\tq\t\tif you want to count q-mers instead of k-mers.\n");
+}
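+
+// Illustrative invocation only (not part of the original sources); the binary
+// and file names are placeholders. The argument order follows ParseOptions below:
+//   ./quake_count_37 33 reads.fastq kmers.qcst 2 16 q
+// i.e. quality offset 33, trim read ends below quality 2, split k-mers across
+// 16 temporary files, and accumulate q-mer weights instead of plain counts.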
+
+Options ParseOptions(int argc, char *argv[]) {
+ Options ret;
+ if (argc != 6 && argc != 7) {
+ ret.valid = false;
+ } else {
+ ret.qvoffset = atoi(argv[1]);
+ ret.valid &= (ret.qvoffset >= 0 && ret.qvoffset <= 255);
+ ret.ifile = argv[2];
+ ret.ofile = argv[3];
+ ret.error_threshold = atoi(argv[4]);
+ ret.valid &= (ret.error_threshold >= 0 && ret.error_threshold <= 255);
+ ret.file_number = atoi(argv[5]);
+ if (argc == 7) {
+ if (string(argv[6]) == "q") {
+ ret.q_mers = true;
+ } else {
+ ret.valid = false;
+ }
+ }
+ }
+ return ret;
+}
+
+/**
+ * This function reads the reads from the stream and splits them into
+ * k-mers. The k-mers are then distributed almost uniformly over
+ * several files; it is guaranteed that identical k-mers always go to
+ * the same file.
+ * @param ifs Stream to read the reads from.
+ * @param ofiles Files to write the resulting k-mers to, one record
+ * per k-mer.
+ */
+void SplitToFiles(ireadstream ifs, vector<ofstream *> &ofiles,
+ bool q_mers, uint8_t error_threshold) {
+ uint32_t file_number = ofiles.size();
+ uint64_t read_number = 0;
+ while (!ifs.eof()) {
+ ++read_number;
+ if (read_number % kStep == 0) {
+ LOG("Reading read " << read_number << ".");
+ }
+ Read r;
+ ifs >> r;
+ KMer::hash hash_function;
+ for (ValidKMerGenerator<kK> gen(r, error_threshold); gen.HasMore(); gen.Next()) {
+ KMer kmer = gen.kmer();
+ if (KMer::less2()(!kmer, kmer)) {
+ kmer = !kmer;
+ }
+ ofstream &cur_file = *ofiles[hash_function(kmer) % file_number];
+ KMer::BinWrite(cur_file, kmer);
+ if (q_mers) {
+ double correct_probability = gen.correct_probability();
+ cur_file.write((const char*) &correct_probability, sizeof(correct_probability));
+ }
+ }
+ }
+}
+
+/**
+ * This function reads k-mers and calculates the number of occurrences
+ * of each of them.
+ * @param ifile File with the k-mers to process, one record per k-mer.
+ * @param ofile Output file. For each unique k-mer it will contain a
+ * line with the k-mer itself and the number of its occurrences.
+ */
+template<typename KMerStatMap>
+void EvalFile(ifstream &ifile, FILE *ofile, bool q_mers) {
+ KMerStatMap stat_map;
+ char buffer[kK + 1];
+ buffer[kK] = 0;
+ KMer kmer;
+ while (KMer::BinRead(ifile, &kmer)) {
+ KMerFreqInfo &info = stat_map[kmer];
+ if (q_mers) {
+ double correct_probability = -1;
+ ifile.read((char *) &correct_probability, sizeof(correct_probability));
+ assert(!ifile.fail()); // the read above must have succeeded
+ info.q_count += correct_probability;
+ } else {
+ info.count += 1;
+ }
+ }
+ for (typename KMerStatMap::iterator it = stat_map.begin();
+ it != stat_map.end(); ++it) {
+ fprintf(ofile, "%s ", it->first.str().c_str());
+ if (q_mers) {
+ fprintf(ofile, "%f\n", it->second.q_count);
+ } else {
+ fprintf(ofile, "%d\n", it->second.count);
+ }
+ }
+}
+}
+
+int main(int argc, char *argv[]) {
+ Options opts = ParseOptions(argc, argv);
+ if (!opts.valid) {
+ PrintHelp(argv[0]);
+ return 1;
+ }
+ // BasicConfigurator::configure();
+ LOG("Starting preproc: evaluating " << opts.ifile << ".");
+ vector<ofstream*> ofiles(opts.file_number);
+ for (uint32_t i = 0; i < opts.file_number; ++i) {
+ char filename[50];
+ snprintf(filename, sizeof(filename), "%u.kmer.part", i);
+ ofiles[i] = new ofstream(filename);
+ assert(!ofiles[i]->fail() && "Too many files to open");
+ }
+ SplitToFiles(ireadstream(opts.ifile, opts.qvoffset),
+ ofiles, opts.q_mers, opts.error_threshold);
+ for (uint32_t i = 0; i < opts.file_number; ++i) {
+ delete ofiles[i];
+ }
+ FILE *ofile = fopen(opts.ofile.c_str(), "w");
+ assert(ofile != NULL && "Too many files to open");
+ for (uint32_t i = 0; i < opts.file_number; ++i) {
+ char ifile_name[50];
+ snprintf(ifile_name, sizeof(ifile_name), "%u.kmer.part", i);
+ ifstream ifile(ifile_name);
+ LOG("Processing " << ifile_name << ".");
+ EvalFile<UnorderedMap>(ifile, ofile, opts.q_mers);
+ LOG("Processed " << ifile_name << ".");
+ }
+ fclose(ofile);
+ LOG("Preprocessing done. You can find results in " << opts.ofile << ".");
+ return 0;
+}
diff --git a/src/projects/hammer/quake_count/quake_count_45.cpp b/src/projects/hammer/quake_count/quake_count_45.cpp
new file mode 100644
index 0000000..663bba3
--- /dev/null
+++ b/src/projects/hammer/quake_count/quake_count_45.cpp
@@ -0,0 +1,238 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+/**
+ * @file quake_count_45.cpp
+ * @author Alex Davydow
+ * @version 1.0
+ *
+ * @section LICENSE
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of
+ * the License, or (at your option) any later version.
+ *
+ * @section DESCRIPTION
+ *
+ * For each k-mer this program calculates the number of its
+ * occurrences in the reads provided. The reads file is expected
+ * to be in fastq format.
+ */
+#include "standard.hpp"
+#include <stdint.h>
+#include <cstdio>
+#include <cstdlib>
+#include <cassert>
+#include <string>
+#include <set>
+#include <unordered_map>
+#include <vector>
+#include <iostream>
+#include <iomanip>
+#include "io/ireadstream.hpp"
+#include "io/read.hpp"
+#include "data_structures/sequence/seq.hpp"
+#include "kmer_freq_info.hpp"
+#include "valid_kmer_generator.hpp"
+#define SUPPRESS_UNUSED(X) ((void) (X))
+
+using std::string;
+using std::set;
+using std::vector;
+using std::unordered_map;
+using std::map;
+using std::ofstream;
+using std::ifstream;
+
+namespace {
+
+const uint32_t kK = 45;
+typedef Seq<kK> KMer;
+typedef unordered_map<KMer, KMerFreqInfo, KMer::hash> UnorderedMap;
+
+void print_time() {
+ time_t rawtime;
+ tm * ptm;
+ time ( &rawtime );
+ ptm = gmtime( &rawtime );
+ std::cout << std::setfill('0') << "[ " << std::setw(2) << ptm->tm_hour << ":" << std::setw(2) << ptm->tm_min
+ << ":" << std::setw(2) << ptm->tm_sec << " ] ";
+}
+
+#define LOG(a) print_time(); std::cout << a << std::endl
+
+/**
+ * @variable Every kStep-th read is reported in the log.
+ */
+const int kStep = 1e5;
+
+struct Options {
+ /**
+ * @variable An offset for quality in a fastq file.
+ */
+ uint32_t qvoffset;
+ string ifile;
+ string ofile;
+ uint32_t error_threshold;
+ /**
+ * @variable How many files will be used when splitting k-mers.
+ */
+ uint32_t file_number;
+ bool q_mers;
+ bool valid;
+ Options()
+ : qvoffset(0),
+ ifile(""),
+ ofile(""),
+ error_threshold(0),
+ file_number(3),
+ q_mers(false),
+ valid(true) {}
+};
+
+void PrintHelp(char *program_name) {
+ printf("Usage: %s qvoffset ifile.fastq ofile.[q]cst error_threshold file_number [q]\n",
+ program_name);
+ printf("Where:\n");
+ printf("\tqvoffset\tan offset of fastq quality data\n");
+ printf("\tifile.fastq\tan input file with reads in fastq format\n");
+ printf("\tofile.[q]cst\ta filename where k-mer statistics will be written\n");
+ printf("\terror_threshold\tnucleotides with quality lower than the threshold will be cut from the ends of reads\n");
+ printf("\tfile_number\thow many files will be used when splitting k-mers\n");
+ printf("\tq\t\tif you want to count q-mers instead of k-mers.\n");
+}
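+
+// Illustrative invocation only (not part of the original sources); the binary
+// and file names are placeholders. The argument order follows ParseOptions below:
+//   ./quake_count_45 33 reads.fastq kmers.qcst 2 16 q
+// i.e. quality offset 33, trim read ends below quality 2, split k-mers across
+// 16 temporary files, and accumulate q-mer weights instead of plain counts.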
+
+Options ParseOptions(int argc, char *argv[]) {
+ Options ret;
+ if (argc != 6 && argc != 7) {
+ ret.valid = false;
+ } else {
+ ret.qvoffset = atoi(argv[1]);
+ ret.valid &= (ret.qvoffset >= 0 && ret.qvoffset <= 255);
+ ret.ifile = argv[2];
+ ret.ofile = argv[3];
+ ret.error_threshold = atoi(argv[4]);
+ ret.valid &= (ret.error_threshold >= 0 && ret.error_threshold <= 255);
+ ret.file_number = atoi(argv[5]);
+ if (argc == 7) {
+ if (string(argv[6]) == "q") {
+ ret.q_mers = true;
+ } else {
+ ret.valid = false;
+ }
+ }
+ }
+ return ret;
+}
+
+/**
+ * This function reads the reads from the stream and splits them into
+ * k-mers. The k-mers are then distributed almost uniformly over
+ * several files; it is guaranteed that identical k-mers always go to
+ * the same file.
+ * @param ifs Stream to read the reads from.
+ * @param ofiles Files to write the resulting k-mers to, one record
+ * per k-mer.
+ */
+void SplitToFiles(ireadstream ifs, vector<ofstream *> &ofiles,
+ bool q_mers, uint8_t error_threshold) {
+ uint32_t file_number = ofiles.size();
+ uint64_t read_number = 0;
+ while (!ifs.eof()) {
+ ++read_number;
+ if (read_number % kStep == 0) {
+ LOG("Reading read " << read_number << ".");
+ }
+ Read r;
+ ifs >> r;
+ KMer::hash hash_function;
+ for (ValidKMerGenerator<kK> gen(r, error_threshold); gen.HasMore(); gen.Next()) {
+ KMer kmer = gen.kmer();
+ if (KMer::less2()(!kmer, kmer)) {
+ kmer = !kmer;
+ }
+ ofstream &cur_file = *ofiles[hash_function(kmer) % file_number];
+ KMer::BinWrite(cur_file, kmer);
+ if (q_mers) {
+ double correct_probability = gen.correct_probability();
+ cur_file.write((const char*) &correct_probability, sizeof(correct_probability));
+ }
+ }
+ }
+}
+
+/**
+ * This function reads k-mers and calculates the number of occurrences
+ * of each of them.
+ * @param ifile File with the k-mers to process, one record per k-mer.
+ * @param ofile Output file. For each unique k-mer it will contain a
+ * line with the k-mer itself and the number of its occurrences.
+ */
+template<typename KMerStatMap>
+void EvalFile(ifstream &ifile, FILE *ofile, bool q_mers) {
+ KMerStatMap stat_map;
+ char buffer[kK + 1];
+ buffer[kK] = 0;
+ KMer kmer;
+ while (KMer::BinRead(ifile, &kmer)) {
+ KMerFreqInfo &info = stat_map[kmer];
+ if (q_mers) {
+ double correct_probability = -1;
+ ifile.read((char *) &correct_probability, sizeof(correct_probability));
+ assert(!ifile.fail()); // the read above must have succeeded
+ info.q_count += correct_probability;
+ } else {
+ info.count += 1;
+ }
+ }
+ for (typename KMerStatMap::iterator it = stat_map.begin();
+ it != stat_map.end(); ++it) {
+ fprintf(ofile, "%s ", it->first.str().c_str());
+ if (q_mers) {
+ fprintf(ofile, "%f\n", it->second.q_count);
+ } else {
+ fprintf(ofile, "%d\n", it->second.count);
+ }
+ }
+}
+}
+
+int main(int argc, char *argv[]) {
+ Options opts = ParseOptions(argc, argv);
+ if (!opts.valid) {
+ PrintHelp(argv[0]);
+ return 1;
+ }
+ // BasicConfigurator::configure();
+ LOG("Starting preproc: evaluating " << opts.ifile << ".");
+ vector<ofstream*> ofiles(opts.file_number);
+ for (uint32_t i = 0; i < opts.file_number; ++i) {
+ char filename[50];
+ snprintf(filename, sizeof(filename), "%u.kmer.part", i);
+ ofiles[i] = new ofstream(filename);
+ assert(!ofiles[i]->fail() && "Too many files to open");
+ }
+ SplitToFiles(ireadstream(opts.ifile, opts.qvoffset),
+ ofiles, opts.q_mers, opts.error_threshold);
+ for (uint32_t i = 0; i < opts.file_number; ++i) {
+ delete ofiles[i];
+ }
+ FILE *ofile = fopen(opts.ofile.c_str(), "w");
+ assert(ofile != NULL && "Too many files to open");
+ for (uint32_t i = 0; i < opts.file_number; ++i) {
+ char ifile_name[50];
+ snprintf(ifile_name, sizeof(ifile_name), "%u.kmer.part", i);
+ ifstream ifile(ifile_name);
+ LOG("Processing " << ifile_name << ".");
+ EvalFile<UnorderedMap>(ifile, ofile, opts.q_mers);
+ LOG("Processed " << ifile_name << ".");
+ }
+ fclose(ofile);
+ LOG("Preprocessing done. You can find results in " << opts.ofile << ".");
+ return 0;
+}
diff --git a/src/projects/hammer/quake_count/quake_count_55.cpp b/src/projects/hammer/quake_count/quake_count_55.cpp
new file mode 100644
index 0000000..c096b19
--- /dev/null
+++ b/src/projects/hammer/quake_count/quake_count_55.cpp
@@ -0,0 +1,240 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+/**
+ * @file quake_count_55.cpp
+ * @author Alex Davydow
+ * @version 1.0
+ *
+ * @section LICENSE
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of
+ * the License, or (at your option) any later version.
+ *
+ * @section DESCRIPTION
+ *
+ * For each k-mer this program calculates the number of its
+ * occurrences in the reads provided. The reads file is expected
+ * to be in fastq format.
+ */
+#include "standard.hpp"
+#include <stdint.h>
+#include <cstdio>
+#include <cstdlib>
+#include <cassert>
+#include <string>
+#include <set>
+#include <unordered_map>
+#include <vector>
+#include <iostream>
+#include <iomanip>
+#include "io/ireadstream.hpp"
+#include "io/read.hpp"
+#include "data_structures/sequence/seq.hpp"
+#include "kmer_freq_info.hpp"
+#include "valid_kmer_generator.hpp"
+#define SUPPRESS_UNUSED(X) ((void) (X))
+
+using std::string;
+using std::set;
+using std::vector;
+using std::unordered_map;
+using std::map;
+using std::ofstream;
+using std::ifstream;
+
+namespace {
+
+const uint32_t kK = 55;
+typedef Seq<kK> KMer;
+typedef unordered_map<KMer, KMerFreqInfo, KMer::hash> UnorderedMap;
+
+void print_time() {
+ time_t rawtime;
+ tm * ptm;
+ time ( &rawtime );
+ ptm = gmtime( &rawtime );
+ std::cout << std::setfill('0') << "[ " << std::setw(2) << ptm->tm_hour << ":" << std::setw(2) << ptm->tm_min
+ << ":" << std::setw(2) << ptm->tm_sec << " ] ";
+}
+
+#define LOG(a) print_time(); std::cout << a << std::endl
+
+/**
+ * @variable Every kStep-th read is reported in the log.
+ */
+const int kStep = 1e5;
+
+struct Options {
+ /**
+ * @variable An offset for quality in a fastq file.
+ */
+ uint32_t qvoffset;
+ string ifile;
+ string ofile;
+ uint32_t error_threshold;
+ /**
+ * @variable How many files will be used when splitting k-mers.
+ */
+ uint32_t file_number;
+ bool q_mers;
+ bool valid;
+ Options()
+ : qvoffset(0),
+ ifile(""),
+ ofile(""),
+ error_threshold(0),
+ file_number(3),
+ q_mers(false),
+ valid(true) {}
+};
+
+void PrintHelp(char *program_name) {
+ printf("Usage: %s qvoffset ifile.fastq ofile.[q]cst error_threshold file_number [q]\n",
+ program_name);
+ printf("Where:\n");
+ printf("\tqvoffset\tan offset of fastq quality data\n");
+ printf("\tifile.fastq\tan input file with reads in fastq format\n");
+ printf("\tofile.[q]cst\ta filename where k-mer statistics will be written\n");
+ printf("\terror_threshold\tnucleotides with quality lower than the threshold will be cut from the ends of reads\n");
+ printf("\tfile_number\thow many files will be used when splitting k-mers\n");
+ printf("\tq\t\tif you want to count q-mers instead of k-mers.\n");
+}
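+
+// Illustrative invocation only (not part of the original sources); the binary
+// and file names are placeholders. The argument order follows ParseOptions below:
+//   ./quake_count_55 33 reads.fastq kmers.qcst 2 16 q
+// i.e. quality offset 33, trim read ends below quality 2, split k-mers across
+// 16 temporary files, and accumulate q-mer weights instead of plain counts.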
+
+Options ParseOptions(int argc, char *argv[]) {
+ Options ret;
+ if (argc != 6 && argc != 7) {
+ ret.valid = false;
+ } else {
+ ret.qvoffset = atoi(argv[1]);
+ ret.valid &= (ret.qvoffset >= 0 && ret.qvoffset <= 255);
+ ret.ifile = argv[2];
+ ret.ofile = argv[3];
+ ret.error_threshold = atoi(argv[4]);
+ ret.valid &= (ret.error_threshold >= 0 && ret.error_threshold <= 255);
+ ret.file_number = atoi(argv[5]);
+ if (argc == 7) {
+ if (string(argv[6]) == "q") {
+ ret.q_mers = true;
+ } else {
+ ret.valid = false;
+ }
+ }
+ }
+ return ret;
+}
+
+/**
+ * This function reads the reads from the stream and splits them into
+ * k-mers. The k-mers are then distributed almost uniformly over
+ * several files; it is guaranteed that identical k-mers always go to
+ * the same file.
+ * @param ifs Stream to read the reads from.
+ * @param ofiles Files to write the resulting k-mers to, one record
+ * per k-mer.
+ */
+void SplitToFiles(ireadstream ifs, vector<ofstream *> &ofiles,
+ bool q_mers, uint8_t error_threshold) {
+ uint32_t file_number = ofiles.size();
+ uint64_t read_number = 0;
+ while (!ifs.eof()) {
+ ++read_number;
+ if (read_number % kStep == 0) {
+ LOG("Reading read " << read_number << ".");
+ }
+ Read r;
+ ifs >> r;
+ //cout << r.getSequenceString() << endl;
+ KMer::hash hash_function;
+ for (ValidKMerGenerator<kK> gen(r, error_threshold); gen.HasMore(); gen.Next()) {
+ KMer kmer = gen.kmer();
+ //cout << kmer.str() << endl;
+ if (KMer::less2()(!kmer, kmer)) {
+ kmer = !kmer;
+ }
+ ofstream &cur_file = *ofiles[hash_function(kmer) % file_number];
+ KMer::BinWrite(cur_file, kmer);
+ if (q_mers) {
+ double correct_probability = gen.correct_probability();
+ cur_file.write((const char*) &correct_probability, sizeof(correct_probability));
+ }
+ }
+ }
+}
+
+/**
+ * This function reads k-mers and calculates the number of occurrences
+ * of each of them.
+ * @param ifile File with the k-mers to process, one record per k-mer.
+ * @param ofile Output file. For each unique k-mer it will contain a
+ * line with the k-mer itself and the number of its occurrences.
+ */
+template<typename KMerStatMap>
+void EvalFile(ifstream &ifile, FILE *ofile, bool q_mers) {
+ KMerStatMap stat_map;
+ char buffer[kK + 1];
+ buffer[kK] = 0;
+ KMer kmer;
+ while (KMer::BinRead(ifile, &kmer)) {
+ KMerFreqInfo &info = stat_map[kmer];
+ if (q_mers) {
+ double correct_probability = -1;
+ ifile.read((char *) &correct_probability, sizeof(correct_probability));
+ assert(!ifile.fail()); // the read above must have succeeded
+ info.q_count += correct_probability;
+ } else {
+ info.count += 1;
+ }
+ }
+ for (typename KMerStatMap::iterator it = stat_map.begin();
+ it != stat_map.end(); ++it) {
+ fprintf(ofile, "%s ", it->first.str().c_str());
+ if (q_mers) {
+ fprintf(ofile, "%f\n", it->second.q_count);
+ } else {
+ fprintf(ofile, "%d\n", it->second.count);
+ }
+ }
+}
+}
+
+int main(int argc, char *argv[]) {
+ Options opts = ParseOptions(argc, argv);
+ if (!opts.valid) {
+ PrintHelp(argv[0]);
+ return 1;
+ }
+ // BasicConfigurator::configure();
+ LOG("Starting preproc: evaluating " << opts.ifile << ".");
+ vector<ofstream*> ofiles(opts.file_number);
+ for (uint32_t i = 0; i < opts.file_number; ++i) {
+ char filename[50];
+ snprintf(filename, sizeof(filename), "%u.kmer.part", i);
+ ofiles[i] = new ofstream(filename);
+ assert(!ofiles[i]->fail() && "Too many files to open");
+ }
+ SplitToFiles(ireadstream(opts.ifile, opts.qvoffset),
+ ofiles, opts.q_mers, opts.error_threshold);
+ for (uint32_t i = 0; i < opts.file_number; ++i) {
+ delete ofiles[i];
+ }
+ FILE *ofile = fopen(opts.ofile.c_str(), "w");
+ assert(ofile != NULL && "Too many files to open");
+ for (uint32_t i = 0; i < opts.file_number; ++i) {
+ char ifile_name[50];
+ snprintf(ifile_name, sizeof(ifile_name), "%u.kmer.part", i);
+ ifstream ifile(ifile_name);
+ LOG("Processing " << ifile_name << ".");
+ EvalFile<UnorderedMap>(ifile, ofile, opts.q_mers);
+ LOG("Processed " << ifile_name << ".");
+ }
+ fclose(ofile);
+ LOG("Preprocessing done. You can find results in " << opts.ofile << ".");
+ return 0;
+}
diff --git a/src/projects/hammer/quake_count/quake_count_65.cpp b/src/projects/hammer/quake_count/quake_count_65.cpp
new file mode 100644
index 0000000..0ac0017
--- /dev/null
+++ b/src/projects/hammer/quake_count/quake_count_65.cpp
@@ -0,0 +1,238 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+/**
+ * @file quake_count_65.cpp
+ * @author Alex Davydow
+ * @version 1.0
+ *
+ * @section LICENSE
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of
+ * the License, or (at your option) any later version.
+ *
+ * @section DESCRIPTION
+ *
+ * For each k-mer this program calculates the number of its
+ * occurrences in the reads provided. The reads file is expected
+ * to be in fastq format.
+ */
+#include "standard.hpp"
+#include <stdint.h>
+#include <cstdio>
+#include <cstdlib>
+#include <cassert>
+#include <string>
+#include <set>
+#include <unordered_map>
+#include <vector>
+#include <iostream>
+#include <iomanip>
+#include "io/ireadstream.hpp"
+#include "io/read.hpp"
+#include "data_structures/sequence/seq.hpp"
+#include "kmer_freq_info.hpp"
+#include "valid_kmer_generator.hpp"
+#define SUPPRESS_UNUSED(X) ((void) (X))
+
+using std::string;
+using std::set;
+using std::vector;
+using std::unordered_map;
+using std::map;
+using std::ofstream;
+using std::ifstream;
+
+namespace {
+
+const uint32_t kK = 65;
+typedef Seq<kK> KMer;
+typedef unordered_map<KMer, KMerFreqInfo, KMer::hash> UnorderedMap;
+
+void print_time() {
+ time_t rawtime;
+ tm * ptm;
+ time ( &rawtime );
+ ptm = gmtime( &rawtime );
+ std::cout << std::setfill('0') << "[ " << std::setw(2) << ptm->tm_hour << ":" << std::setw(2) << ptm->tm_min
+ << ":" << std::setw(2) << ptm->tm_sec << " ] ";
+}
+
+#define LOG(a) print_time(); std::cout << a << std::endl
+
+/**
+ * @variable Every kStep-th read is reported in the log.
+ */
+const int kStep = 1e5;
+
+struct Options {
+ /**
+ * @variable An offset for quality in a fastq file.
+ */
+ uint32_t qvoffset;
+ string ifile;
+ string ofile;
+ uint32_t error_threshold;
+ /**
+ * @variable How many files will be used when splitting k-mers.
+ */
+ uint32_t file_number;
+ bool q_mers;
+ bool valid;
+ Options()
+ : qvoffset(0),
+ ifile(""),
+ ofile(""),
+ error_threshold(0),
+ file_number(3),
+ q_mers(false),
+ valid(true) {}
+};
+
+void PrintHelp(char *program_name) {
+ printf("Usage: %s qvoffset ifile.fastq ofile.[q]cst error_threshold file_number [q]\n",
+ program_name);
+ printf("Where:\n");
+ printf("\tqvoffset\tan offset of fastq quality data\n");
+ printf("\tifile.fastq\tan input file with reads in fastq format\n");
+ printf("\tofile.[q]cst\ta filename where k-mer statistics will be written\n");
+ printf("\terror_threshold\tnucleotides with quality lower than the threshold will be cut from the ends of reads\n");
+ printf("\tfile_number\thow many files will be used when splitting k-mers\n");
+ printf("\tq\t\tif you want to count q-mers instead of k-mers.\n");
+}
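+
+// Illustrative invocation only (not part of the original sources); the binary
+// and file names are placeholders. The argument order follows ParseOptions below:
+//   ./quake_count_65 33 reads.fastq kmers.qcst 2 16 q
+// i.e. quality offset 33, trim read ends below quality 2, split k-mers across
+// 16 temporary files, and accumulate q-mer weights instead of plain counts.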
+
+Options ParseOptions(int argc, char *argv[]) {
+ Options ret;
+ if (argc != 6 && argc != 7) {
+ ret.valid = false;
+ } else {
+ ret.qvoffset = atoi(argv[1]);
+ ret.valid &= (ret.qvoffset >= 0 && ret.qvoffset <= 255);
+ ret.ifile = argv[2];
+ ret.ofile = argv[3];
+ ret.error_threshold = atoi(argv[4]);
+ ret.valid &= (ret.error_threshold >= 0 && ret.error_threshold <= 255);
+ ret.file_number = atoi(argv[5]);
+ if (argc == 7) {
+ if (string(argv[6]) == "q") {
+ ret.q_mers = true;
+ } else {
+ ret.valid = false;
+ }
+ }
+ }
+ return ret;
+}
+
+/**
+ * This function reads the reads from the stream and splits them into
+ * k-mers. The k-mers are then distributed almost uniformly over
+ * several files; it is guaranteed that identical k-mers always go to
+ * the same file.
+ * @param ifs Stream to read the reads from.
+ * @param ofiles Files to write the resulting k-mers to, one record
+ * per k-mer.
+ */
+void SplitToFiles(ireadstream ifs, vector<ofstream *> &ofiles,
+ bool q_mers, uint8_t error_threshold) {
+ uint32_t file_number = ofiles.size();
+ uint64_t read_number = 0;
+ while (!ifs.eof()) {
+ ++read_number;
+ if (read_number % kStep == 0) {
+ LOG("Reading read " << read_number << ".");
+ }
+ Read r;
+ ifs >> r;
+ KMer::hash hash_function;
+ for (ValidKMerGenerator<kK> gen(r, error_threshold); gen.HasMore(); gen.Next()) {
+ KMer kmer = gen.kmer();
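+ // Keep the canonical form: if the reverse complement compares smaller,
+ // use it, so a k-mer and its reverse complement go to the same file.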
+ if (KMer::less2()(!kmer, kmer)) {
+ kmer = !kmer;
+ }
+ ofstream &cur_file = *ofiles[hash_function(kmer) % file_number];
+ KMer::BinWrite(cur_file, kmer);
+ if (q_mers) {
+ double correct_probability = gen.correct_probability();
+ cur_file.write((const char*) &correct_probability, sizeof(correct_probability));
+ }
+ }
+ }
+}
+
+/**
+ * This function reads k-mers and calculates the number of occurrences
+ * of each of them.
+ * @param ifile File with k-mers to process.
+ * @param ofile Output file. For each unique k-mer a line with the
+ * k-mer itself and the number of its occurrences is written.
+ */
+template<typename KMerStatMap>
+void EvalFile(ifstream &ifile, FILE *ofile, bool q_mers) {
+ KMerStatMap stat_map;
+ char buffer[kK + 1];
+ buffer[kK] = 0;
+ KMer kmer;
+ while (KMer::BinRead(ifile, &kmer)) {
+ KMerFreqInfo &info = stat_map[kmer];
+ if (q_mers) {
+ double correct_probability = -1;
+ ifile.read((char *) &correct_probability, sizeof(correct_probability));
+ assert(!ifile.fail());
+ info.q_count += correct_probability;
+ } else {
+ info.count += 1;
+ }
+ }
+ for (typename KMerStatMap::iterator it = stat_map.begin();
+ it != stat_map.end(); ++it) {
+ fprintf(ofile, "%s ", it->first.str().c_str());
+ if (q_mers) {
+ fprintf(ofile, "%f\n", it->second.q_count);
+ } else {
+ fprintf(ofile, "%d\n", it->second.count);
+ }
+ }
+}
+}
+
+int main(int argc, char *argv[]) {
+ Options opts = ParseOptions(argc, argv);
+ if (!opts.valid) {
+ PrintHelp(argv[0]);
+ return 1;
+ }
+ // BasicConfigurator::configure();
+ LOG("Starting preproc: evaluating " << opts.ifile << ".");
+ vector<ofstream*> ofiles(opts.file_number);
+ for (uint32_t i = 0; i < opts.file_number; ++i) {
+ char filename[50];
+ snprintf(filename, sizeof(filename), "%u.kmer.part", i);
+ ofiles[i] = new ofstream(filename);
+ assert(!ofiles[i]->fail() && "Too many files to open");
+ }
+ SplitToFiles(ireadstream(opts.ifile, opts.qvoffset),
+ ofiles, opts.q_mers, opts.error_threshold);
+ for (uint32_t i = 0; i < opts.file_number; ++i) {
+ delete ofiles[i];
+ }
+ FILE *ofile = fopen(opts.ofile.c_str(), "w");
+ assert(ofile != NULL && "Too many files to open");
+ for (uint32_t i = 0; i < opts.file_number; ++i) {
+ char ifile_name[50];
+ snprintf(ifile_name, sizeof(ifile_name), "%u.kmer.part", i);
+ ifstream ifile(ifile_name);
+ LOG("Processing " << ifile_name << ".");
+ EvalFile<UnorderedMap>(ifile, ofile, opts.q_mers);
+ LOG("Processed " << ifile_name << ".");
+ }
+ fclose(ofile);
+ LOG("Preprocessing done. You can find results in " << opts.ofile << ".");
+ return 0;
+}
diff --git a/src/projects/hammer/quake_count/quake_count_75.cpp b/src/projects/hammer/quake_count/quake_count_75.cpp
new file mode 100644
index 0000000..fb8de1d
--- /dev/null
+++ b/src/projects/hammer/quake_count/quake_count_75.cpp
@@ -0,0 +1,238 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+/**
+ * @file preproc.cpp
+ * @author Alex Davydow
+ * @version 1.0
+ *
+ * @section LICENSE
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of
+ * the License, or (at your option) any later version.
+ *
+ * @section DESCRIPTION
+ *
+ * For each k-mer this program calculates the number of occurrences
+ * in the reads provided. The reads file is expected to be in FASTQ
+ * format.
+ */
+#include "standard.hpp"
+#include <stdint.h>
+#include <cstdio>
+#include <cstdlib>
+#include <cassert>
+#include <string>
+#include <set>
+#include <unordered_map>
+#include <vector>
+#include <iostream>
+#include <iomanip>
+#include "io/ireadstream.hpp"
+#include "io/read.hpp"
+#include "data_structures/sequence/seq.hpp"
+#include "kmer_freq_info.hpp"
+#include "valid_kmer_generator.hpp"
+#define SUPPRESS_UNUSED(X) ((void) (X))
+
+using std::string;
+using std::set;
+using std::vector;
+using std::unordered_map;
+using std::map;
+using std::ofstream;
+using std::ifstream;
+
+namespace {
+
+const uint32_t kK = 75;
+typedef Seq<kK> KMer;
+typedef unordered_map<KMer, KMerFreqInfo, KMer::hash> UnorderedMap;
+
+void print_time() {
+ time_t rawtime;
+ tm * ptm;
+ time ( &rawtime );
+ ptm = gmtime( &rawtime );
+ std::cout << std::setfill('0') << "[ " << std::setw(2) << ptm->tm_hour << ":" << std::setw(2) << ptm->tm_min
+ << ":" << std::setw(2) << ptm->tm_sec << " ] ";
+}
+
+#define LOG(a) do { print_time(); std::cout << a << std::endl; } while (0)
+
+/**
+ * @variable Every kStep-th read will be reported in the log.
+ */
+const int kStep = 1e5;
+
+struct Options {
+ /**
+ * @variable An offset for quality in a fastq file.
+ */
+ uint32_t qvoffset;
+ string ifile;
+ string ofile;
+ uint32_t error_threshold;
+ /**
+ * @variable How many files will be used when splitting k-mers.
+ */
+ uint32_t file_number;
+ bool q_mers;
+ bool valid;
+ Options()
+ : qvoffset(0),
+ ifile(""),
+ ofile(""),
+ error_threshold(0),
+ file_number(3),
+ q_mers(false),
+ valid(true) {}
+};
+
+void PrintHelp(char *program_name) {
+ printf("Usage: %s qvoffset ifile.fastq ofile.[q]cst error_threshold file_number [q]\n",
+ program_name);
+ printf("Where:\n");
+ printf("\tqvoffset\tan offset of fastq quality data\n");
+ printf("\tifile.fastq\tan input file with reads in fastq format\n");
+ printf("\tofile.[q]cst\ta filename where k-mer statistics will be written\n");
+ printf("\terror_threshold\tnucleotides with quality lower than the threshold will be cut from the ends of reads\n");
+ printf("\tfile_number\thow many files will be used when splitting k-mers\n");
+ printf("\tq\t\tif you want to count q-mers instead of k-mers.\n");
+}
+
+Options ParseOptions(int argc, char *argv[]) {
+ Options ret;
+ if (argc != 6 && argc != 7) {
+ ret.valid = false;
+ } else {
+ ret.qvoffset = atoi(argv[1]);
+ ret.valid &= (ret.qvoffset >= 0 && ret.qvoffset <= 255);
+ ret.ifile = argv[2];
+ ret.ofile = argv[3];
+ ret.error_threshold = atoi(argv[4]);
+ ret.valid &= (ret.error_threshold >= 0 && ret.error_threshold <= 255);
+ ret.file_number = atoi(argv[5]);
+ if (argc == 7) {
+ if (string(argv[6]) == "q") {
+ ret.q_mers = true;
+ } else {
+ ret.valid = false;
+ }
+ }
+ }
+ return ret;
+}
+
+/**
+ * This function reads reads from the stream and splits them into
+ * k-mers. The k-mers are then distributed among several files almost
+ * uniformly. It is guaranteed that identical k-mers are written to
+ * the same file.
+ * @param ifs Stream to read reads from.
+ * @param ofiles Files to write the resulting k-mers to (in binary
+ * form, one record per k-mer).
+ */
+void SplitToFiles(ireadstream ifs, vector<ofstream *> &ofiles,
+ bool q_mers, uint8_t error_threshold) {
+ uint32_t file_number = ofiles.size();
+ uint64_t read_number = 0;
+ while (!ifs.eof()) {
+ ++read_number;
+ if (read_number % kStep == 0) {
+ LOG("Reading read " << read_number << ".");
+ }
+ Read r;
+ ifs >> r;
+ KMer::hash hash_function;
+ for (ValidKMerGenerator<kK> gen(r, error_threshold); gen.HasMore(); gen.Next()) {
+ KMer kmer = gen.kmer();
+ if (KMer::less2()(!kmer, kmer)) {
+ kmer = !kmer;
+ }
+ ofstream &cur_file = *ofiles[hash_function(kmer) % file_number];
+ KMer::BinWrite(cur_file, kmer);
+ if (q_mers) {
+ double correct_probability = gen.correct_probability();
+ cur_file.write((const char*) &correct_probability, sizeof(correct_probability));
+ }
+ }
+ }
+}
+
+/**
+ * This function reads k-mers and calculates the number of occurrences
+ * of each of them.
+ * @param ifile File with k-mers to process.
+ * @param ofile Output file. For each unique k-mer a line with the
+ * k-mer itself and the number of its occurrences is written.
+ */
+template<typename KMerStatMap>
+void EvalFile(ifstream &ifile, FILE *ofile, bool q_mers) {
+ KMerStatMap stat_map;
+ char buffer[kK + 1];
+ buffer[kK] = 0;
+ KMer kmer;
+ while (KMer::BinRead(ifile, &kmer)) {
+ KMerFreqInfo &info = stat_map[kmer];
+ if (q_mers) {
+ double correct_probability = -1;
+ ifile.read((char *) &correct_probability, sizeof(correct_probability));
+ assert(!ifile.fail());
+ info.q_count += correct_probability;
+ } else {
+ info.count += 1;
+ }
+ }
+ for (typename KMerStatMap::iterator it = stat_map.begin();
+ it != stat_map.end(); ++it) {
+ fprintf(ofile, "%s ", it->first.str().c_str());
+ if (q_mers) {
+ fprintf(ofile, "%f\n", it->second.q_count);
+ } else {
+ fprintf(ofile, "%d\n", it->second.count);
+ }
+ }
+}
+}
+
+int main(int argc, char *argv[]) {
+ Options opts = ParseOptions(argc, argv);
+ if (!opts.valid) {
+ PrintHelp(argv[0]);
+ return 1;
+ }
+ // BasicConfigurator::configure();
+ LOG("Starting preproc: evaluating " << opts.ifile << ".");
+ vector<ofstream*> ofiles(opts.file_number);
+ for (uint32_t i = 0; i < opts.file_number; ++i) {
+ char filename[50];
+ snprintf(filename, sizeof(filename), "%u.kmer.part", i);
+ ofiles[i] = new ofstream(filename);
+ assert(!ofiles[i]->fail() && "Too many files to open");
+ }
+ SplitToFiles(ireadstream(opts.ifile, opts.qvoffset),
+ ofiles, opts.q_mers, opts.error_threshold);
+ for (uint32_t i = 0; i < opts.file_number; ++i) {
+ delete ofiles[i];
+ }
+ FILE *ofile = fopen(opts.ofile.c_str(), "w");
+ assert(ofile != NULL && "Too many files to open");
+ for (uint32_t i = 0; i < opts.file_number; ++i) {
+ char ifile_name[50];
+ snprintf(ifile_name, sizeof(ifile_name), "%u.kmer.part", i);
+ ifstream ifile(ifile_name);
+ LOG("Processing " << ifile_name << ".");
+ EvalFile<UnorderedMap>(ifile, ofile, opts.q_mers);
+ LOG("Processed " << ifile_name << ".");
+ }
+ fclose(ofile);
+ LOG("Preprocessing done. You can find results in " << opts.ofile << ".");
+ return 0;
+}
diff --git a/src/projects/hammer/quake_count/valid_kmer_generator.hpp b/src/projects/hammer/quake_count/valid_kmer_generator.hpp
new file mode 100644
index 0000000..270c6e0
--- /dev/null
+++ b/src/projects/hammer/quake_count/valid_kmer_generator.hpp
@@ -0,0 +1,194 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#ifndef HAMMER_VALIDKMERGENERATOR_HPP_
+#define HAMMER_VALIDKMERGENERATOR_HPP_
+#include <stdint.h>
+#include <cmath>
+#include <string>
+#include <vector>
+#include "io/read.hpp"
+#include "data_structures/sequence/seq.hpp"
+/**
+ * This class is designed to iterate through the valid k-mers in a read.
+ * @example
+ * ValidKMerGenerator<2> gen(read, 4);
+ * while (gen.HasMore()) {
+ *   MyTrickyFunction(gen.kmer());
+ *   gen.Next();
+ * }
+ * or
+ * for (ValidKMerGenerator<2> gen(read, 2); gen.HasMore(); gen.Next()) {
+ *   MyTrickyFunction(gen.kmer(), gen.pos(), gen.correct_probability());
+ * }
+ * @param kK k-mer length.
+ */
+template<uint32_t kK>
+class ValidKMerGenerator {
+ public:
+ /**
+ * @param read Read to generate k-mers from.
+ * @param bad_quality_threshold This class virtually cuts
+ * nucleotides with quality lower than the threshold from the ends of
+ * the read.
+ */
+ explicit ValidKMerGenerator(const Read &read,
+ uint32_t bad_quality_threshold = 2) :
+ bad_quality_threshold_(bad_quality_threshold),
+ pos_(-1),
+ end_(-1),
+ len_(read.getSequenceString().size()),
+ has_more_(true),
+ correct_probability_(1),
+ first(true),
+ kmer_(),
+ seq_(read.getSequenceString().data()),
+ qual_(read.getQualityString().data()) {
+ TrimBadQuality();
+ Next();
+ }
+ /**
+ * @param seq sequence to generate k-mers from.
+ * @param qual quality string
+ * @param bad_quality_threshold This class virtually cuts
+ * nucleotides with quality lower than the threshold from the ends of
+ * the read.
+ */
+ explicit ValidKMerGenerator(const char *seq, const char *qual,
+ size_t len,
+ uint32_t bad_quality_threshold = 2) :
+ bad_quality_threshold_(bad_quality_threshold),
+ pos_(-1),
+ end_(-1),
+ len_(len),
+ has_more_(true),
+ correct_probability_(1),
+ first(true),
+ kmer_(),
+ seq_(seq),
+ qual_(qual) {
+ TrimBadQuality();
+ Next();
+ }
+ /**
+ * @result true if Next() succeed while generating new k-mer, false
+ * otherwise.
+ */
+ bool HasMore() const {
+ return has_more_;
+ }
+ /**
+ * @result last k-mer generated by Next().
+ */
+ const Seq<kK>& kmer() const {
+ return kmer_;
+ }
+ /**
+ * @result last k-mer position in initial read.
+ */
+ int pos() const {
+ return pos_;
+ }
+ /**
+ * @result probability that last generated k-mer is correct.
+ */
+ double correct_probability() const {
+ return correct_probability_;
+ }
+ /**
+ * This function reads the next k-mer from the read and sets has_more_
+ * accordingly. You can access the k-mer that was read with kmer().
+ */
+ void Next();
+ private:
+ void TrimBadQuality();
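+ // Probability that a base call with the given Phred quality is correct,
+ // i.e. 1 - 10^(-qual/10); qualities below 3 are treated as a random base
+ // (0.25). Values are memoized in a static table.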
+ double Prob(uint8_t qual) {
+ if (qual < 3) {
+ return 0.25;
+ }
+ static std::vector<double> prob(255, -1);
+ if (prob[qual] < -0.1) {
+ prob[qual] = 1 - pow(10.0, - qual / 10.0);
+ }
+ return prob[qual];
+ }
+ uint32_t GetQual(uint32_t pos) {
+ if (pos >= len_) {
+ return 2;
+ } else {
+ return qual_[pos];
+ }
+ }
+ uint32_t bad_quality_threshold_;
+ size_t pos_;
+ size_t end_;
+ size_t len_;
+ bool has_more_;
+ double correct_probability_;
+ bool first;
+ Seq<kK> kmer_;
+ const char* seq_;
+ const char* qual_;
+ // Disallow copy and assign
+ ValidKMerGenerator(const ValidKMerGenerator&);
+ void operator=(const ValidKMerGenerator&);
+};
+
+template<uint32_t kK>
+void ValidKMerGenerator<kK>::TrimBadQuality() {
+ pos_ = 0;
+ if (qual_)
+ for (; pos_ < len_; ++pos_) {
+ if (GetQual(pos_) >= bad_quality_threshold_)
+ break;
+ }
+ end_ = len_;
+ if (qual_)
+ for (; end_ > pos_; --end_) {
+ if (GetQual(end_ - 1) >= bad_quality_threshold_)
+ break;
+ }
+}
+
+template<uint32_t kK>
+void ValidKMerGenerator<kK>::Next() {
+ if (pos_ + kK > end_) {
+ has_more_ = false;
+ } else if (first || !is_nucl(seq_[pos_ + kK - 1])) {
+ // in this case we have to look for new k-mer
+ correct_probability_ = 1.0;
+ uint32_t start_hypothesis = pos_;
+ uint32_t i = pos_;
+ for (; i < len_; ++i) {
+ if (i == kK + start_hypothesis) {
+ break;
+ }
+ if (qual_)
+ correct_probability_ *= Prob(GetQual(i));
+ if (!is_nucl(seq_[i])) {
+ start_hypothesis = i + 1;
+ correct_probability_ = 1.0;
+ }
+ }
+ if (i == kK + start_hypothesis) {
+ kmer_ = Seq<kK>(seq_ + start_hypothesis, 0, kK, /* raw */ true);
+ pos_ = start_hypothesis + 1;
+ } else {
+ has_more_ = false;
+ }
+ } else {
+ // good case we can just shift our previous answer
+ kmer_ = kmer_ << seq_[pos_ + kK - 1];
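+ // Update the correctness probability incrementally: multiply in the base
+ // entering the window and divide out the base that just left it.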
+ if (qual_) {
+ correct_probability_ *= Prob(GetQual(pos_ + kK - 1));
+ correct_probability_ /= Prob(GetQual(pos_ - 1));
+ }
+ ++pos_;
+ }
+ first = false;
+}
+#endif // HAMMER_VALIDKMERGENERATOR_HPP_
diff --git a/src/hammer/quake_enhanced/CMakeLists.txt b/src/projects/hammer/quake_enhanced/CMakeLists.txt
similarity index 100%
rename from src/hammer/quake_enhanced/CMakeLists.txt
rename to src/projects/hammer/quake_enhanced/CMakeLists.txt
diff --git a/src/hammer/quake_enhanced/correct_hist/CMakeLists.txt b/src/projects/hammer/quake_enhanced/correct_hist/CMakeLists.txt
similarity index 100%
rename from src/hammer/quake_enhanced/correct_hist/CMakeLists.txt
rename to src/projects/hammer/quake_enhanced/correct_hist/CMakeLists.txt
diff --git a/src/hammer/quake_enhanced/correct_hist/main.cpp b/src/projects/hammer/quake_enhanced/correct_hist/main.cpp
similarity index 100%
rename from src/hammer/quake_enhanced/correct_hist/main.cpp
rename to src/projects/hammer/quake_enhanced/correct_hist/main.cpp
diff --git a/src/projects/hammer/quake_enhanced/count.cpp b/src/projects/hammer/quake_enhanced/count.cpp
new file mode 100644
index 0000000..32b6ecd
--- /dev/null
+++ b/src/projects/hammer/quake_enhanced/count.cpp
@@ -0,0 +1,131 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#include <stdint.h>
+#include <cstdio>
+#include <cstdlib>
+#include <cassert>
+#include <string>
+#include <vector>
+#include <unordered_map>
+#include "io/ireadstream.hpp"
+#include "io/read.hpp"
+#include "data_structures/sequence/seq.hpp"
+#include "valid_kmer_generator.hpp"
+#include "quake_enhanced/quake.hpp"
+#define SUPPRESS_UNUSED(X) ((void) (X))
+using quake_enhanced::Quake;
+
+using std::string;
+using std::vector;
+using std::unordered_map;
+using std::ofstream;
+using std::ifstream;
+using std::endl;
+using io::Reader;
+using io::SingleRead;
+
+struct KMerInfo {
+ int count;
+ double q_count;
+ double freq;
+};
+
+typedef Seq<kK> KMer;
+typedef unordered_map<KMer, KMerInfo, KMer::hash> UnorderedMap;
+
+/**
+ * This function reads reads from the stream and splits them into
+ * k-mers. The k-mers are then distributed among several files almost
+ * uniformly. It is guaranteed that identical k-mers are written to
+ * the same file.
+ * @param ifs Stream to read reads from.
+ * @param ofiles Files to write the resulting k-mers to (in binary
+ * form, one record per k-mer).
+ */
+void Quake::SplitToFiles(ireadstream ifs, vector<ofstream*> &ofiles,
+ uint8_t error_threshold) {
+ uint32_t file_number = ofiles.size();
+ while (!ifs.eof()) {
+ Read r;
+ ifs >> r;
+ KMer::hash hash_function;
+ for (ValidKMerGenerator<kK> gen(r, error_threshold); gen.HasMore(); gen.Next()) {
+ KMer kmer = gen.kmer();
+ if (KMer::less2()(!kmer, kmer)) {
+ kmer = !kmer;
+ }
+ ofstream &cur_file = *ofiles[hash_function(kmer) % file_number];
+ KMer::BinWrite(cur_file, kmer);
+ double q_count = gen.correct_probability();
+ cur_file.write((const char *) &q_count, sizeof(q_count));
+ }
+ }
+}
+
+/**
+ * This function reads k-mers and calculates occurrence statistics
+ * for each of them.
+ * @param ifile File with k-mers to process.
+ * @param ofile Output file. For each unique k-mer a line with the
+ * k-mer itself and its occurrence statistics is written.
+ */
+void Quake::EvalFile(ifstream &ifile, ofstream &ofile) {
+ UnorderedMap stat_map;
+ char buffer[kK + 1];
+ buffer[kK] = 0;
+ KMer kmer;
+ while (KMer::BinRead(ifile, &kmer)) {
+ KMerInfo &info = stat_map[kmer];
+ double q_count = -1;
+ ifile.read((char *) &q_count, sizeof(q_count));
+ assert(!ifile.fail());
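+ // An occurrence read with confidence p > 0.5 contributes 1/p to the
+ // frequency estimate; low-confidence occurrences contribute nothing.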
+ double freq = 0;
+ // ToDo 0.5 threshold ==>> command line option
+ if (q_count > 0.5) {
+ freq = 1 / q_count;
+ }
+ info.q_count += q_count;
+ info.count += 1;
+ info.freq += freq;
+ }
+ for (UnorderedMap::iterator it = stat_map.begin();
+ it != stat_map.end(); ++it) {
+ const KMerInfo &info = it->second;
+ AddToHist(info.freq);
+ ofile << it->first.str().c_str() << " "
+ << info.count << " "
+ << info.q_count << " "
+ << info.freq << endl;
+ }
+}
+
+void Quake::Count(string ifile_name, string ofile_name,
+ string hash_file_prefix, uint32_t hash_file_number,
+ uint8_t quality_offset, uint8_t quality_threshold) {
+ vector<ofstream*> ofiles(hash_file_number);
+ for (uint32_t i = 0; i < hash_file_number; ++i) {
+ char filename[50];
+ snprintf(filename, sizeof(filename), "%s%u.part",
+ hash_file_prefix.c_str(), i);
+ ofiles[i] = new ofstream(filename);
+ assert(!ofiles[i]->fail() && "Too many files to open");
+ }
+ SplitToFiles(ireadstream(ifile_name, quality_offset),
+ ofiles, quality_threshold);
+ for (uint32_t i = 0; i < hash_file_number; ++i) {
+ delete ofiles[i];
+ }
+ ofstream ofile(ofile_name.c_str());
+ assert(ofile.is_open() && "Too many files to open");
+ for (uint32_t i = 0; i < hash_file_number; ++i) {
+ char ifile_name[50];
+ snprintf(ifile_name, sizeof(ifile_name), "%s%u.part",
+ hash_file_prefix.c_str(), i);
+ ifstream ifile(ifile_name);
+ EvalFile(ifile, ofile);
+ remove(ifile_name);
+ }
+ cur_state_ = kRealHistPrepared;
+}
diff --git a/src/hammer/quake_enhanced/count/CMakeLists.txt b/src/projects/hammer/quake_enhanced/count/CMakeLists.txt
similarity index 100%
rename from src/hammer/quake_enhanced/count/CMakeLists.txt
rename to src/projects/hammer/quake_enhanced/count/CMakeLists.txt
diff --git a/src/projects/hammer/quake_enhanced/count/count.cpp b/src/projects/hammer/quake_enhanced/count/count.cpp
new file mode 100644
index 0000000..eafe3cd
--- /dev/null
+++ b/src/projects/hammer/quake_enhanced/count/count.cpp
@@ -0,0 +1,226 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+/**
+ * @file preproc.cpp
+ * @author Alex Davydow
+ * @version 1.0
+ *
+ * @section LICENSE
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of
+ * the License, or (at your option) any later version.
+ *
+ * @section DESCRIPTION
+ *
+ * For each k-mer this program calculates the number of occurrences
+ * in the reads provided. The reads file is expected to be in FASTQ
+ * format.
+ */
+#include <stdint.h>
+#include <cstdio>
+#include <cstdlib>
+#include <cassert>
+#include <string>
+#include <set>
+#include <unordered_map>
+#include <vector>
+#include "logging.hpp"
+#include "io/ireadstream.hpp"
+#include "io/read.hpp"
+#include "data_structures/sequence/seq.hpp"
+#include "valid_kmer_generator.hpp"
+#define SUPPRESS_UNUSED(X) ((void) (X))
+
+using std::string;
+using std::set;
+using std::vector;
+using std::unordered_map;
+using std::map;
+
+namespace {
+
+DECL_LOGGER("count")
+
+struct KMerInfo {
+ int count;
+ double q_count;
+ double q_inversed_count;
+};
+
+const uint32_t kK = 55;
+typedef Seq<kK> KMer;
+typedef unordered_map<KMer, KMerInfo, KMer::hash> UnorderedMap;
+
+/**
+ * @variable Every kStep-th read will be reported in the log.
+ */
+const int kStep = 1e5;
+
+struct Options {
+ /**
+ * @variable An offset for quality in a fastq file.
+ */
+ uint32_t qvoffset;
+ string ifile;
+ string ofile;
+ uint32_t error_threshold;
+ /**
+ * @variable How many files will be used when splitting k-mers.
+ */
+ uint32_t file_number;
+ bool valid;
+ Options()
+ : qvoffset(0),
+ ifile(""),
+ ofile(""),
+ error_threshold(0),
+ file_number(3),
+ valid(true) {}
+};
+
+void PrintHelp(char *program_name) {
+ printf("Usage: %s qvoffset ifile.fastq ofile.[q]cst error_threshold file_number\n",
+ program_name);
+ printf("Where:\n");
+ printf("\tqvoffset\tan offset of fastq quality data\n");
+ printf("\tifile.fastq\tan input file with reads in fastq format\n");
+ printf("\tofile.[q]cst\ta filename where k-mer statistics will be written\n");
+ printf("\terror_threshold\tnucleotides with quality lower than the threshold will be cut from the ends of reads\n");
+ printf("\tfile_number\thow many files will be used when splitting k-mers\n");
+}
+
+Options ParseOptions(int argc, char *argv[]) {
+ Options ret;
+ if (argc != 6) {
+ ret.valid = false;
+ } else {
+ ret.qvoffset = atoi(argv[1]);
+ ret.valid &= (ret.qvoffset >= 0 && ret.qvoffset <= 255);
+ ret.ifile = argv[2];
+ ret.ofile = argv[3];
+ ret.error_threshold = atoi(argv[4]);
+ ret.valid &= (ret.error_threshold >= 0 && ret.error_threshold <= 255);
+ ret.file_number = atoi(argv[5]);
+ }
+ return ret;
+}
+
+/**
+ * This function reads reads from the stream and splits them into
+ * k-mers. The k-mers are then distributed among several files almost
+ * uniformly. It is guaranteed that identical k-mers are written to
+ * the same file.
+ * @param ifs Stream to read reads from.
+ * @param ofiles Files to write the resulting k-mers to (in binary
+ * form, one record per k-mer).
+ */
+void SplitToFiles(ireadstream ifs, const vector<FILE*> &ofiles,
+ uint8_t error_threshold) {
+ uint32_t file_number = ofiles.size();
+ uint64_t read_number = 0;
+ while (!ifs.eof()) {
+ ++read_number;
+ if (read_number % kStep == 0) {
+ INFO("Reading read " << read_number << ".");
+ }
+ Read r;
+ ifs >> r;
+ KMer::hash hash_function;
+ for (ValidKMerGenerator<kK> gen(r, error_threshold); gen.HasMore(); gen.Next()) {
+ KMer kmer = gen.kmer();
+ if (KMer::less2()(!kmer, kmer)) {
+ kmer = !kmer;
+ }
+ FILE *cur_file = ofiles[hash_function(kmer) % file_number];
+ KMer::BinWrite(cur_file, kmer);
+ double correct_probability = gen.correct_probability();
+ fwrite(&correct_probability, sizeof(correct_probability), 1, cur_file);
+ }
+ }
+}
+
+/**
+ * This function reads k-mers and calculates occurrence statistics
+ * for each of them.
+ * @param ifile File with k-mers to process.
+ * @param ofile Output file. For each unique k-mer a line with the
+ * k-mer itself and its occurrence statistics is written.
+ */
+void EvalFile(FILE *ifile, FILE *ofile) {
+ UnorderedMap stat_map;
+ char buffer[kK + 1];
+ buffer[kK] = 0;
+ KMer kmer;
+ while (KMer::BinRead(ifile, &kmer)) {
+ KMerInfo &info = stat_map[kmer];
+ double correct_probability = -1;
+ size_t records_read =
+ fread(&correct_probability, sizeof(correct_probability),
+ 1, ifile);
+ assert(records_read == 1);
+ SUPPRESS_UNUSED(records_read);
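+ // 1/p estimates how many true occurrences a single observed occurrence
+ // represents; k-mers read with low confidence get zero weight below.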
+ double inversed_probability = 1 / correct_probability;
+ // ToDo 0.5 threshold ==>> command line option
+ if (correct_probability < 0.5) {
+ inversed_probability = 0;
+ }
+ info.q_count += correct_probability;
+ info.count += 1;
+ info.q_inversed_count += inversed_probability;
+ }
+ for (UnorderedMap::iterator it = stat_map.begin();
+ it != stat_map.end(); ++it) {
+ const KMerInfo &info = it->second;
+ fprintf(ofile, "%s %d %f %f\n", it->first.str().c_str(),
+ info.count, info.q_count, info.q_inversed_count);
+ }
+}
+
+void run(const Options &opts) {
+ INFO("Starting preproc: evaluating "
+ << opts.ifile << ".");
+ vector<FILE*> ofiles(opts.file_number);
+ for (uint32_t i = 0; i < opts.file_number; ++i) {
+ char filename[50];
+ snprintf(filename, sizeof(filename), "%u.kmer.part", i);
+ ofiles[i] = fopen(filename, "wb");
+ assert(ofiles[i] != NULL && "Too many files to open");
+ }
+ SplitToFiles(ireadstream(opts.ifile, opts.qvoffset),
+ ofiles, opts.error_threshold);
+ for (uint32_t i = 0; i < opts.file_number; ++i) {
+ fclose(ofiles[i]);
+ }
+ FILE *ofile = fopen(opts.ofile.c_str(), "w");
+ assert(ofile != NULL && "Too many files to open");
+ for (uint32_t i = 0; i < opts.file_number; ++i) {
+ char ifile_name[50];
+ snprintf(ifile_name, sizeof(ifile_name), "%u.kmer.part", i);
+ FILE *ifile = fopen(ifile_name, "rb");
+ INFO("Processing " << ifile_name << ".");
+ EvalFile(ifile, ofile);
+ INFO("Processed " << ifile_name << ".");
+ fclose(ifile);
+ }
+ fclose(ofile);
+ INFO("Preprocessing done. You can find results in " <<
+ opts.ofile << ".");
+}
+}
+
+int main(int argc, char *argv[]) {
+ Options opts = ParseOptions(argc, argv);
+ if (!opts.valid) {
+ PrintHelp(argv[0]);
+ return 1;
+ }
+ run(opts);
+ return 0;
+}
diff --git a/src/hammer/quake_enhanced/filter_trusted.cpp b/src/projects/hammer/quake_enhanced/filter_trusted.cpp
similarity index 100%
rename from src/hammer/quake_enhanced/filter_trusted.cpp
rename to src/projects/hammer/quake_enhanced/filter_trusted.cpp
diff --git a/src/hammer/quake_enhanced/filter_trusted/CMakeLists.txt b/src/projects/hammer/quake_enhanced/filter_trusted/CMakeLists.txt
similarity index 100%
rename from src/hammer/quake_enhanced/filter_trusted/CMakeLists.txt
rename to src/projects/hammer/quake_enhanced/filter_trusted/CMakeLists.txt
diff --git a/src/hammer/quake_enhanced/filter_trusted/main.cpp b/src/projects/hammer/quake_enhanced/filter_trusted/main.cpp
similarity index 100%
rename from src/hammer/quake_enhanced/filter_trusted/main.cpp
rename to src/projects/hammer/quake_enhanced/filter_trusted/main.cpp
diff --git a/src/hammer/quake_enhanced/filter_trusted_enh/CMakeLists.txt b/src/projects/hammer/quake_enhanced/filter_trusted_enh/CMakeLists.txt
similarity index 100%
rename from src/hammer/quake_enhanced/filter_trusted_enh/CMakeLists.txt
rename to src/projects/hammer/quake_enhanced/filter_trusted_enh/CMakeLists.txt
diff --git a/src/projects/hammer/quake_enhanced/filter_trusted_enh/main.cpp b/src/projects/hammer/quake_enhanced/filter_trusted_enh/main.cpp
new file mode 100644
index 0000000..cbe54e7
--- /dev/null
+++ b/src/projects/hammer/quake_enhanced/filter_trusted_enh/main.cpp
@@ -0,0 +1,106 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#include <stdint.h>
+#include <cstdlib>
+#include <cstdio>
+#include <string>
+#include <unordered_map>
+#include "dev_support/logger/logger.hpp"
+
+using std::string;
+using std::unordered_map;
+
+namespace {
+/**
+ * @variable Length of string buffer which will store k-mer.
+ */
+const uint32_t kMaxK = 100;
+/**
+ * @variable Every kStep-th k-mer will be reported in the log.
+ */
+const int kStep = 1e5;
+
+DECL_LOGGER("filter_trusted_enh")
+
+struct Options {
+ string ifile;
+ string ofile;
+ string badfile;
+ string limits;
+ bool valid;
+ Options()
+ : ifile(""),
+ ofile(""),
+ badfile(""),
+ limits(""),
+ valid(true) {}
+};
+
+void PrintHelp(char *progname) {
+ printf("Usage: %s ifile.[q]cst ofile.trust ofile.bad file.limits\n", progname);
+ printf("Where:\n");
+ printf("\tifile.[q]cst\tfile with k|q-mer statistics\n");
+ printf("\tofile.trust\ta filename where filtered data will be written\n");
+ printf("\tofile.bad\ta filename where filtered garbage will be written\n");
+ printf("\tfile.limits\tfile with q-value limits for k-mers\n");
+}
+
+Options ParseOptions(int argc, char *argv[]) {
+ Options ret;
+ if (argc != 5) {
+ ret.valid = false;
+ } else {
+ ret.ifile = argv[1];
+ ret.ofile = argv[2];
+ ret.badfile = argv[3];
+ ret.limits = argv[4];
+ }
+ return ret;
+}
+}
+
+
+int main(int argc, char *argv[]) {
+ Options opts = ParseOptions(argc, argv);
+ if (!opts.valid) {
+ PrintHelp(argv[0]);
+ return 1;
+ }
+ // BasicConfigurator::configure();
+ INFO("Starting filter_trusted: evaluating "
+ << opts.ifile << ".");
+ FILE *ifile = fopen(opts.ifile.c_str(), "r");
+ FILE *ofile = fopen(opts.ofile.c_str(), "w");
+ FILE *badfile = fopen(opts.badfile.c_str(), "w");
+ FILE *limits_file = fopen(opts.limits.c_str(), "r");
+ unordered_map<uint32_t, long double> limits;
+ uint32_t x;
+ long double limit;
+ while (fscanf(limits_file, "%u %Lf", &x, &limit) == 2) {
+ limits[x] = limit;
+ }
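+ // limits maps a k-mer coverage (count) to the average q-count threshold
+ // a k-mer with that coverage must exceed to be considered trusted.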
+ char kmer[kMaxK + 1];
+ char format[20];
+ float freq = -1;
+ int count;
+ float q_count;
+ snprintf(format, sizeof(format), "%%%ds%%d%%f%%f", kMaxK);
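+ // With kMaxK == 100 the format string expands to "%100s%d%f%f":
+ // a k-mer of at most 100 characters followed by count, q_count and freq.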
+ uint64_t read_number = 0;
+ while (fscanf(ifile, format, kmer, &count, &q_count, &freq) != EOF) {
+ ++read_number;
+ if (read_number % kStep == 0) {
+ INFO("Reading k-mer " << read_number << ".");
+ }
+ if (q_count / count > limits[count]) {
+ fprintf(ofile, "%s %d %f %f\n", kmer, count, q_count, freq);
+ } else {
+ fprintf(badfile, "%s %d %f %f\n", kmer, count, q_count, freq);
+ }
+ }
+ return 0;
+}
diff --git a/src/hammer/quake_enhanced/generate_limits/CMakeLists.txt b/src/projects/hammer/quake_enhanced/generate_limits/CMakeLists.txt
similarity index 100%
rename from src/hammer/quake_enhanced/generate_limits/CMakeLists.txt
rename to src/projects/hammer/quake_enhanced/generate_limits/CMakeLists.txt
diff --git a/src/hammer/quake_enhanced/generate_limits/main.cpp b/src/projects/hammer/quake_enhanced/generate_limits/main.cpp
similarity index 100%
rename from src/hammer/quake_enhanced/generate_limits/main.cpp
rename to src/projects/hammer/quake_enhanced/generate_limits/main.cpp
diff --git a/src/hammer/quake_enhanced/main.cpp b/src/projects/hammer/quake_enhanced/main.cpp
similarity index 100%
rename from src/hammer/quake_enhanced/main.cpp
rename to src/projects/hammer/quake_enhanced/main.cpp
diff --git a/src/projects/hammer/quake_enhanced/options.cpp b/src/projects/hammer/quake_enhanced/options.cpp
new file mode 100644
index 0000000..22536c8
--- /dev/null
+++ b/src/projects/hammer/quake_enhanced/options.cpp
@@ -0,0 +1,206 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#include <iomanip>
+#include <string>
+#include <sstream>
+#include <vector>
+#include "getopt_pp/getopt_pp_standalone.h"
+#include "options.hpp"
+using quake_enhanced::Options;
+using GetOpt::GetOpt_pp;
+using GetOpt::Option;
+using GetOpt::OptionPresent;
+using GetOpt::Include_Environment;
+using std::string;
+using std::vector;
+using std::endl;
+using std::ostringstream;
+using std::setw;
+
+Options::Options(int argc, char **argv) :
+ read_file(""),
+ corrected_read_file(""),
+ help_message(""),
+ kmer_count_file("kmer.count"),
+ hash_file_prefix("kmer_"),
+ hash_file_number(1000),
+ quality_offset(33),
+ quality_threshold(2),
+ hist_file(""),
+ trusted_hist_file(""),
+ bad_hist_file(""),
+ top_threshold(5),
+ average_min(0.9),
+ limits_file(""),
+ bad_threshold(0.1),
+ trusted_kmer_file(""),
+ bad_kmer_file("") {
+ string help_module = "";
+ bool need_help = false;
+ vector<string> global_options;
+ help_builder << "Usage: " << argv[0] <<
+ " --read-file <file> --corrected-read-file <file> --trusted-kmer-file <file> [options]\n";
+ GetOpt_pp options(argc, argv, Include_Environment);
+ // Help Options
+ options >> OptionPresent('h', "help", need_help);
+ options >> Option('\0', "help-module", help_module);
+ // General Options
+ options >> Option('\0', "read-file", read_file, read_file);
+ options >> Option('\0', "corrected-read-file",
+ corrected_read_file, corrected_read_file);
+ // Count Options
+ options >> Option('\0', "hash-file-number",
+ hash_file_number, hash_file_number);
+ options >> Option('\0', "hash-file-prefix",
+ hash_file_prefix, hash_file_prefix);
+ options >> Option('\0', "quality-offset",
+ quality_offset, quality_offset);
+ options >> Option('\0', "quality-threshold",
+ quality_threshold, quality_threshold);
+ options >> Option('\0', "kmer-count-file",
+ kmer_count_file, kmer_count_file);
+ // PrepareHist Options
+ options >> Option('\0', "hist-file",
+ hist_file, hist_file);
+ options >> Option('\0', "trusted-hist-file",
+ trusted_hist_file, trusted_hist_file);
+ options >> Option('\0', "bad-hist-file",
+ bad_hist_file, bad_hist_file);
+ options >> Option('\0', "top-threshold",
+ top_threshold, top_threshold);
+ options >> Option('\0', "average-min",
+ average_min, average_min);
+ // PrepareLimits Options
+ options >> Option('\0', "limits-file",
+ limits_file, limits_file);
+ options >> Option('\0', "bad-threshold",
+ bad_threshold, bad_threshold);
+ // FilterTrusted Options
+ options >> Option('\0', "trusted-kmer-file",
+ trusted_kmer_file, trusted_kmer_file);
+ options >> Option('\0', "bad-kmer-file",
+ bad_kmer_file, bad_kmer_file);
+ if (need_help || help_module != "") {
+ valid = false;
+ } else {
+ Validate();
+ }
+ help_builder << std::left << endl;
+ if (!valid) {
+ help_builder <<
+ "General options: \n"
+ "--read-file <str> file with reads to correct in one of \n"
+ " supported formats: fastq, fasta \n"
+ "--corrected-read-file <str> fasta file, where corrected reads will \n"
+ " be written \n"
+ "--help-module <str> produce help for a given module; the \n"
+ " module can be: count, prepare_hist, \n"
+ " prepare_limits, filter_trusted \n";
+
+ if (help_module == "count") {
+ help_builder <<
+ "Count options: \n"
+ "--kmer-count-file <str> file where kmer count info will be \n"
+ " written, default kmer.count \n"
+ "--hash-file-prefix <str> prefix for hash_file, default: kmer_ \n"
+ "--hash-file-number <int(>0)> number of hash_files, default: 1000. \n"
+ " the faster the program will work, but \n"
+ " there is a risk of running out of file \n"
+ " is a risk of running out of file \n"
+ " descriptors \n"
+ "--quality-offset <int([0..255])> offset of quality values (for fastq \n"
+ " files). It's usually 33 or 64, \n"
+ " default: 33 \n"
+ "--quality-threshold <int([0..255])> nucleotides with quality lower than \n"
+ " threshold will be cut from the ends of \n"
+ " the read, default: 2 \n";
+
+ } else if (help_module == "prepare_hist") {
+ help_builder <<
+ "PrepareHist options: \n"
+ "--hist-file <str> file where k-mer histogram will be \n"
+ " written, default \"\" - no histogram \n"
+ "--trusted-hist-file <str> file where trusted k-mer histogram will\n"
+ " be written, default \"\" - no histogram\n"
+ "--bad-hist-file <str> file where bad k-mer histogram will be \n"
+ " written, default \"\" - no histogram \n"
+ "--top-threshold <int(>0)> we will look for a maximum which is at\n"
+ " least top_threshold times higher than \n"
+ " the previous one, default: 5 \n"
+ "--average-min <float([0..1])> while looking for the Gaussian mean we \n"
+ " go to the left and to the right until \n"
+ " we reach coverage average_min * max \n";
+ } else if (help_module == "prepare_limits") {
+ help_builder <<
+ "PrepareLimits options: \n"
+ "--limits-file <str> file where q-value limits for every \n"
+ " coverage value will be written, \n"
+ " default \"\" - do not save limits \n"
+ "--bad-threshold <float(>0)> a k-mer will be considered untrusted if\n"
+ " its probability of being bad is at \n"
+ " least bad-threshold times greater than\n"
+ " its probability of being good \n";
+ } else if (help_module == "filter_trusted") {
+ help_builder <<
+ "FilterTrusted options: \n"
+ "--trusted-kmer-file <str> file where trusted k-mers will be \n"
+ " written \n"
+ "--bad-kmer-file <str> file where bad k-mers will be \n"
+ " written, default \"\" - no file \n";
+ }
+
+ }
+ help_message += help_builder.str();
+}
+
+void Options::Validate() {
+ // General Validation
+ if (read_file == "") {
+ help_builder <<
+ "Error: You must provide read_file\n";
+ valid = false;
+ }
+ if (corrected_read_file == "") {
+ help_builder <<
+ "Error: You must provide corrected_read_file\n";
+ valid = false;
+ }
+ // Count Validation
+ if (hash_file_number < 1) {
+ help_builder <<
+ "Error: hash_file_number cannot be less than one\n";
+ valid = false;
+ }
+ if (quality_offset < 0 || quality_offset > 255) {
+ help_builder <<
+ "Error: quality_offset must be in 0..255\n";
+ valid = false;
+ }
+ if (quality_threshold < 0 || quality_threshold > 255) {
+ help_builder <<
+ "Error: quality_threshold must be in 0..255\n";
+ valid = false;
+ }
+ // PrepareHist Validation
+ if (average_min < 0 || average_min > 1) {
+ help_builder <<
+ "Error: average_min must be in 0..1\n";
+ valid = false;
+ }
+ // PrepareLimits Validation
+ if (bad_threshold < 0) {
+ help_builder <<
+ "Error: bad_threshold must be non-negative\n";
+ valid = false;
+ }
+ // FilterTrusted Validation
+ if (trusted_kmer_file == "") {
+ help_builder << "Error: trusted_kmer_file must be provided\n";
+ valid = false;
+ }
+}
diff --git a/src/hammer/quake_enhanced/options.hpp b/src/projects/hammer/quake_enhanced/options.hpp
similarity index 100%
rename from src/hammer/quake_enhanced/options.hpp
rename to src/projects/hammer/quake_enhanced/options.hpp
diff --git a/src/hammer/quake_enhanced/prepare_graph/CMakeLists.txt b/src/projects/hammer/quake_enhanced/prepare_graph/CMakeLists.txt
similarity index 100%
rename from src/hammer/quake_enhanced/prepare_graph/CMakeLists.txt
rename to src/projects/hammer/quake_enhanced/prepare_graph/CMakeLists.txt
diff --git a/src/hammer/quake_enhanced/prepare_graph/prepare_graph.cpp b/src/projects/hammer/quake_enhanced/prepare_graph/prepare_graph.cpp
similarity index 100%
rename from src/hammer/quake_enhanced/prepare_graph/prepare_graph.cpp
rename to src/projects/hammer/quake_enhanced/prepare_graph/prepare_graph.cpp
diff --git a/src/hammer/quake_enhanced/prepare_hist.cpp b/src/projects/hammer/quake_enhanced/prepare_hist.cpp
similarity index 100%
rename from src/hammer/quake_enhanced/prepare_hist.cpp
rename to src/projects/hammer/quake_enhanced/prepare_hist.cpp
diff --git a/src/hammer/quake_enhanced/prepare_limits.cpp b/src/projects/hammer/quake_enhanced/prepare_limits.cpp
similarity index 100%
rename from src/hammer/quake_enhanced/prepare_limits.cpp
rename to src/projects/hammer/quake_enhanced/prepare_limits.cpp
diff --git a/src/hammer/quake_enhanced/quake.hpp b/src/projects/hammer/quake_enhanced/quake.hpp
similarity index 100%
rename from src/hammer/quake_enhanced/quake.hpp
rename to src/projects/hammer/quake_enhanced/quake.hpp
diff --git a/src/hammer/quake_enhanced/test_correction_quality/CMakeLists.txt b/src/projects/hammer/quake_enhanced/test_correction_quality/CMakeLists.txt
similarity index 100%
rename from src/hammer/quake_enhanced/test_correction_quality/CMakeLists.txt
rename to src/projects/hammer/quake_enhanced/test_correction_quality/CMakeLists.txt
diff --git a/src/projects/hammer/quake_enhanced/test_correction_quality/main.cpp b/src/projects/hammer/quake_enhanced/test_correction_quality/main.cpp
new file mode 100644
index 0000000..a2dc78a
--- /dev/null
+++ b/src/projects/hammer/quake_enhanced/test_correction_quality/main.cpp
@@ -0,0 +1,108 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#include <stdint.h>
+#include <string.h>
+#include <cstdlib>
+#include <cstdio>
+#include <string>
+#include <unordered_set>
+
+using std::unordered_set;
+using std::string;
+
+namespace {
+/**
+ * @variable Length of string buffer which will store k-mer.
+ */
+const uint32_t kMaxK = 100;
+
+struct Options {
+ string genom_file;
+ string trust_file;
+ string bad_file;
+ bool full;
+ float threshold;
+ bool valid;
+ Options()
+ : genom_file(""),
+ trust_file(""),
+ bad_file(""),
+ full(false),
+ valid(true) {}
+};
+
+void PrintHelp(char *progname) {
+ printf("Usage: %s genom.[q]cst ifile.trust ifile.bad [--full]\n", progname);
+ printf("Where:\n");
+ printf("\tgenom.[q]cst\tfile with k|q-mer statistics from the real genome\n");
+ printf("\tifile.trust\ta filename where the filtered data is\n");
+ printf("\tifile.bad\ta filename where the filtered garbage is\n");
+ printf("\t--full\tpass this option to output all incorrect k-mers with their names to stdout\n");
+}
+
+Options ParseOptions(int argc, char *argv[]) {
+ Options ret;
+ if (argc < 4 || argc > 5) {
+ ret.valid = false;
+ } else {
+ ret.genom_file = argv[1];
+ ret.trust_file = argv[2];
+ ret.bad_file = argv[3];
+ if (argc == 5 && ( !strcmp(argv[4], "--full") || !strcmp(argv[4], "-f") ) )
+ ret.full = true;
+ }
+ return ret;
+}
+}
+
+
+int main(int argc, char *argv[]) {
+ Options opts = ParseOptions(argc, argv);
+ if (!opts.valid) {
+ PrintHelp(argv[0]);
+ return 1;
+ }
+ FILE *genom_file = fopen(opts.genom_file.c_str(), "r");
+ FILE *trust_file = fopen(opts.trust_file.c_str(), "r");
+ FILE *bad_file = fopen(opts.bad_file.c_str(), "r");
+ char kmer[kMaxK + 1];
+ char format[20];
+ float freq = -1;
+ int count;
+ float q_count;
+ snprintf(format, sizeof(format), "%%%ds%%d%%f%%f", kMaxK);
+ unordered_set<string> real_kmers;
+ while (fscanf(genom_file, format, kmer, &count, &q_count, &freq) != EOF) {
+ real_kmers.insert(string(kmer));
+ }
+ int trusted = 0;
+ int trusted_fail = 0;
+ int bad = 0;
+ int bad_fail = 0;
+ while (fscanf(trust_file, format, kmer, &count, &q_count, &freq) != EOF) {
+ if (real_kmers.count(string(kmer)) > 0) {
+ ++trusted;
+ } else {
+ ++trusted_fail;
+ if ( opts.full ) printf(" %s\t%d\t%f\t%f\n", kmer, count, q_count, freq);
+ }
+ }
+ printf("trusted: %d\n", trusted + trusted_fail);
+ printf("erroneous: %d\n", trusted_fail);
+ while (fscanf(bad_file, format, kmer, &count, &q_count, &freq) != EOF) {
+ if (real_kmers.count(string(kmer)) > 0) {
+ ++bad_fail;
+ if ( opts.full ) printf(" %s\t%d\t%f\t%f\n", kmer, count, q_count, freq);
+ } else {
+ ++bad;
+ }
+ }
+ printf("bad: %d\n", bad + bad_fail);
+ printf("erroneous: %d\n", bad_fail);
+ return 0;
+}
diff --git a/src/hammer/read_corrector.cpp b/src/projects/hammer/read_corrector.cpp
similarity index 100%
rename from src/hammer/read_corrector.cpp
rename to src/projects/hammer/read_corrector.cpp
diff --git a/src/hammer/read_corrector.hpp b/src/projects/hammer/read_corrector.hpp
similarity index 100%
rename from src/hammer/read_corrector.hpp
rename to src/projects/hammer/read_corrector.hpp
diff --git a/src/projects/hammer/valid_kmer_generator.hpp b/src/projects/hammer/valid_kmer_generator.hpp
new file mode 100644
index 0000000..c4128c4
--- /dev/null
+++ b/src/projects/hammer/valid_kmer_generator.hpp
@@ -0,0 +1,200 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#ifndef HAMMER_VALIDKMERGENERATOR_HPP_
+#define HAMMER_VALIDKMERGENERATOR_HPP_
+
+#include "globals.hpp"
+
+#include "io/reads/read.hpp"
+#include "data_structures/sequence/seq.hpp"
+
+#include <string>
+#include <vector>
+
+#include <cstdint>
+#include <cmath>
+
+/**
+ * This class is designed to iterate through the valid k-mers in a read.
+ * @example
+ * ValidKMerGenerator<2> gen(read, 4);
+ * while (gen.HasMore()) {
+ * MyTrickyFunction(gen.kmer());
+ * gen.Next();
+ * }
+ * or
+ * for (ValidKMerGenerator<2> gen(read, 2); gen.HasMore(); gen.Next()) {
+ * MyTrickyFunction(gen.kmer(), gen.pos(), gen.correct_probability());
+ * }
+ * @param kK k-mer length.
+ */
+template<uint32_t kK>
+class ValidKMerGenerator {
+ public:
+ /**
+ * @param read Read to generate k-mers from.
+ * @param bad_quality_threshold This class virtually cuts
+ * nucleotides with quality lower than the threshold from the ends of
+ * the read.
+ */
+ explicit ValidKMerGenerator(const Read &read,
+ uint8_t bad_quality_threshold = 2) {
+ Reset(read.getSequenceString().data(),
+ read.getQualityString().data(),
+ read.getSequenceString().size(),
+ bad_quality_threshold);
+ }
+ /**
+ * @param seq sequence to generate k-mers from.
+ * @param qual quality string
+ * @param bad_quality_threshold This class virtually cuts
+ * nucleotides with quality lower than the threshold from the ends of
+ * the read.
+ */
+ explicit ValidKMerGenerator(const char *seq, const char *qual,
+ size_t len,
+ uint8_t bad_quality_threshold = 2) {
+ Reset(seq, qual, len, bad_quality_threshold);
+ }
+
+ ValidKMerGenerator()
+ : kmer_(), seq_(0), qual_(0),
+ pos_(-1), end_(-1), len_(0),
+ correct_probability_(1), bad_quality_threshold_(2),
+ has_more_(false), first(true) {}
+
+ void Reset(const char *seq, const char *qual,
+ size_t len,
+ uint8_t bad_quality_threshold = 2) {
+ kmer_ = Seq<kK>();
+ seq_ = seq;
+ qual_ = qual;
+ pos_ = -1;
+ end_ = -1;
+ len_ = len;
+ correct_probability_ = 1.0;
+ bad_quality_threshold_ = bad_quality_threshold;
+ has_more_ = true;
+ first = true;
+
+ TrimBadQuality();
+ Next();
+ }
+
+ /**
+ * @result true if Next() succeed while generating new k-mer, false
+ * otherwise.
+ */
+ bool HasMore() const {
+ return has_more_;
+ }
+ /**
+ * @result last k-mer generated by Next().
+ */
+ const Seq<kK>& kmer() const {
+ return kmer_;
+ }
+ /**
+ * @result last k-mer position in initial read.
+ */
+ size_t pos() const {
+ return pos_;
+ }
+ /**
+ * @result probability that last generated k-mer is correct.
+ */
+ double correct_probability() const {
+ return correct_probability_;
+ }
+ /**
+ * This function reads the next k-mer from the read and sets has_more_
+ * accordingly. You can access the k-mer that was read with kmer().
+ */
+ void Next();
+ private:
+ void TrimBadQuality();
+ double Prob(uint8_t qual) {
+ return Globals::quality_probs[qual];
+ }
+ uint8_t GetQual(uint32_t pos) {
+ if (pos >= len_) {
+ return 2;
+ } else {
+ return qual_[pos];
+ }
+ }
+ Seq<kK> kmer_;
+ const char* seq_;
+ const char* qual_;
+ size_t pos_;
+ size_t end_;
+ size_t len_;
+ double correct_probability_;
+ uint8_t bad_quality_threshold_;
+ bool has_more_;
+ bool first;
+
+ // Disallow copy and assign
+ ValidKMerGenerator(const ValidKMerGenerator&) = delete;
+ void operator=(const ValidKMerGenerator&) = delete;
+};
+
+template<uint32_t kK>
+void ValidKMerGenerator<kK>::TrimBadQuality() {
+ pos_ = 0;
+ if (qual_)
+ for (; pos_ < len_; ++pos_) {
+ if (GetQual((uint32_t)pos_) >= bad_quality_threshold_)
+ break;
+ }
+ end_ = len_;
+ if (qual_)
+ for (; end_ > pos_; --end_) {
+ if (GetQual((uint32_t)(end_ - 1)) >= bad_quality_threshold_)
+ break;
+ }
+}
+
+template<uint32_t kK>
+void ValidKMerGenerator<kK>::Next() {
+ if (pos_ + kK > end_) {
+ has_more_ = false;
+ } else if (first || !is_nucl(seq_[pos_ + kK - 1])) {
+ // in this case we have to look for new k-mer
+ correct_probability_ = 1.0;
+ uint32_t start_hypothesis = (uint32_t)pos_;
+ uint32_t i = (uint32_t)pos_;
+ for (; i < len_; ++i) {
+ if (i == kK + start_hypothesis) {
+ break;
+ }
+ if (qual_)
+ correct_probability_ *= Prob(GetQual(i));
+ if (!is_nucl(seq_[i])) {
+ start_hypothesis = i + 1;
+ correct_probability_ = 1.0;
+ }
+ }
+ if (i == kK + start_hypothesis) {
+ kmer_ = Seq<kK>(seq_ + start_hypothesis, 0, kK, /* raw */ true);
+ pos_ = start_hypothesis + 1;
+ } else {
+ has_more_ = false;
+ }
+ } else {
+ // good case we can just shift our previous answer
+ kmer_ = kmer_ << seq_[pos_ + kK - 1];
+ if (qual_) {
+ correct_probability_ *= Prob(GetQual((uint32_t)pos_ + kK - 1));
+ correct_probability_ /= Prob(GetQual((uint32_t)pos_ - 1));
+ }
+ ++pos_;
+ }
+ first = false;
+}
+#endif // HAMMER_VALIDKMERGENERATOR_HPP_
diff --git a/src/projects/ionhammer/CMakeLists.txt b/src/projects/ionhammer/CMakeLists.txt
new file mode 100644
index 0000000..c78cbca
--- /dev/null
+++ b/src/projects/ionhammer/CMakeLists.txt
@@ -0,0 +1,33 @@
+############################################################################
+# Copyright (c) 2015 Saint Petersburg State University
+# Copyright (c) 2011-2014 Saint-Petersburg Academic University
+# All Rights Reserved
+# See file LICENSE for details.
+############################################################################
+
+project(ionhammer CXX)
+
+include_directories(${CMAKE_CURRENT_SOURCE_DIR})
+
+add_executable(ionhammer
+ kmer_data.cpp
+ hamcluster.cpp
+ subcluster.cpp
+ err_helper_table.cpp
+ config_struct.cpp
+ expander.cpp
+ seqeval/BaseHypothesisEvaluator.cpp
+ seqeval/TreephaserLite.cpp
+ main.cpp)
+
+target_link_libraries(ionhammer input dev_support pipeline mph_index BamTools ${COMMON_LIBRARIES})
+
+if (SPADES_STATIC_BUILD)
+ set_target_properties(ionhammer PROPERTIES LINK_SEARCH_END_STATIC 1)
+endif()
+
+install(TARGETS ionhammer
+ RUNTIME DESTINATION bin)
+install(DIRECTORY "${SPADES_CFG_DIR}/ionhammer"
+ DESTINATION share/spades/configs
+ FILES_MATCHING PATTERN "*.cfg")
diff --git a/src/projects/ionhammer/HSeq.hpp b/src/projects/ionhammer/HSeq.hpp
new file mode 100644
index 0000000..b6a3ad6
--- /dev/null
+++ b/src/projects/ionhammer/HSeq.hpp
@@ -0,0 +1,289 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#ifndef __HAMMER_HSEQ_HPP__
+#define __HAMMER_HSEQ_HPP__
+
+#include "data_structures/sequence/nucl.hpp"
+#include <city/city.h>
+
+#include <array>
+#include <string>
+#include <vector>
+#include <deque>
+
+#include <cstdint>
+
+namespace hammer {
+
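+// Packs a single homopolymer run into one byte: a 6-bit run length
+// (up to 63) plus a 2-bit nucleotide code.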
+union HomopolymerRun {
+ uint8_t raw;
+ struct {
+ uint8_t len : 6;
+ uint8_t nucl : 2;
+ };
+
+ HomopolymerRun()
+ : raw(0) {}
+ HomopolymerRun(uint8_t nucl, uint8_t len)
+ : len(len & 63), nucl(nucl & 3) {}
+
+ bool operator==(const HomopolymerRun &that) const {
+ return raw == that.raw;
+ }
+
+ bool operator!=(const HomopolymerRun &that) const {
+ return raw != that.raw;
+ }
+
+ bool operator<(const HomopolymerRun &that) const {
+ return raw < that.raw;
+ }
+
+ std::string str() const {
+ return std::string(len, ::nucl(nucl));
+ }
+};
+
+namespace iontorrent {
+ // Container shall have push_back method
+ template <typename Container>
+ void toHomopolymerRuns(const std::string &seq, Container& runs) {
+ if (seq.empty())
+ return;
+
+ char nucl = seq[0];
+ uint8_t len = 1;
+ for (size_t i = 1; i < seq.size(); ++i) {
+ if (seq[i] != nucl) {
+ runs.push_back(HomopolymerRun(dignucl(nucl), len));
+ len = 1;
+ nucl = seq[i];
+ } else {
+ ++len;
+ }
+ }
+ if (len > 0) {
+ runs.push_back(HomopolymerRun(dignucl(nucl), len));
+ }
+ }
+
+};
+
+template <size_t N = 16>
+class HSeq {
+ public:
+ typedef std::array<HomopolymerRun, N> StorageType;
+
+ private:
+ StorageType data_;
+
+ const static size_t PrimeNum = 239;
+
+ public:
+ HSeq() {}
+
+ HSeq(typename StorageType::const_iterator Start,
+ typename StorageType::const_iterator End) {
+ std::copy(Start, End, data_.begin());
+ }
+
+ typedef HomopolymerRun DataType;
+ const static size_t DataSize = N;
+ const static size_t TotalBytes = sizeof(DataType) * DataSize;
+
+ static size_t GetDataSize(size_t size) {
+ VERIFY(size == N);
+ return N * sizeof(HomopolymerRun);
+ }
+
+ typename StorageType::const_iterator begin() const {
+ return data_.begin();
+ }
+
+ typename StorageType::const_iterator end() const {
+ return data_.end();
+ }
+
+ typename StorageType::const_reverse_iterator rbegin() const {
+ return data_.rbegin();
+ }
+
+ typename StorageType::const_reverse_iterator rend() const {
+ return data_.rend();
+ }
+
+ const HomopolymerRun *data() const {
+ return data_.data();
+ }
+
+ size_t data_size() const {
+ return DataSize;
+ }
+
+ HomopolymerRun &operator[](size_t idx) {
+ return data_[idx];
+ }
+
+ const HomopolymerRun &operator[](size_t idx) const {
+ return data_[idx];
+ }
+
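+ // Reverse complement: reverse the order of the runs and complement each
+ // run's nucleotide; the middle run is complemented in place when N is odd.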
+ HSeq<N> operator!() const {
+ HSeq<N> res(*this);
+
+ for (size_t i = 0; i < N / 2; ++i) {
+ HomopolymerRun front = res[i], back = res[N - i - 1];
+ front.nucl = complement(front.nucl) & 3;
+ back.nucl = complement(back.nucl) & 3;
+ res[i] = back;
+ res[N - i - 1] = front;
+ }
+
+ if (N & 1)
+ res[N/2].nucl = complement(res[N/2].nucl) & 3;
+
+ return res;
+ }
+
+ HSeq<N> operator<<(char nucl) const {
+ if (is_nucl(nucl))
+ nucl = dignucl(nucl);
+
+ HSeq<N> res(*this);
+ // Easy case - just add to run
+ HomopolymerRun &last = res[N-1];
+ if (last.nucl == nucl) {
+ last.len += 1;
+ return res;
+ }
+
+ // Hard case - have to shift the stuff
+ for (size_t i = 0; i < N - 1; ++i)
+ res[i] = res[i + 1];
+ res[N - 1].nucl = nucl;
+ res[N - 1].len = 1;
+
+ return res;
+ }
+
+ HSeq<N>& operator<<=(char nucl) {
+ if (is_nucl(nucl))
+ nucl = dignucl(nucl);
+
+ // Easy case - just add to run
+ HomopolymerRun &last = data_[N-1];
+ if (last.nucl == nucl) {
+ last.len = (last.len + 1) & 63;
+ return *this;
+ }
+
+ // Hard case - have to shift the stuff
+ for (size_t i = 0; i < N - 1; ++i)
+ data_[i] = data_[i + 1];
+ data_[N - 1].nucl = nucl & 3;
+ data_[N - 1].len = 1;
+
+ return *this;
+ }
+
+ HSeq<N> operator>>(char nucl) const {
+ if (is_nucl(nucl))
+ nucl = dignucl(nucl);
+
+ HSeq<N> res(*this);
+ // Easy case - just add to run
+ HomopolymerRun &first = res[0];
+ if (first.nucl == nucl) {
+ first.len += 1;
+ return res;
+ }
+
+ // Hard case - have to shift the stuff
+ for (size_t i = 0; i < N - 1; ++i)
+ res[i + 1] = res[i];
+ res[0].nucl = nucl;
+ res[0].len = 1;
+
+ return res;
+ }
+
+ bool operator==(const HSeq<N> &that) const {
+ return (data_ == that.data_);
+ }
+ bool operator!=(const HSeq<N> &that) const {
+ return (data_ != that.data_);
+ }
+
+ size_t size() const {
+ size_t res = 0;
+ for (size_t i = 0; i < N; ++i)
+ res += data_[i].len;
+
+ return res;
+ }
+
+ std::string str() const {
+ std::string res;
+ for (size_t i = 0; i < N; ++i)
+ res += data_[i].str();
+
+ return res;
+ }
+
+ static size_t GetHash(const DataType *data, size_t sz = DataSize, uint32_t seed = 0) {
+ return CityHash64WithSeed((const char*)data, sz * sizeof(DataType), 0x9E3779B9 ^ seed);
+ }
+
+ size_t GetHash(uint32_t seed = 0) const {
+ return GetHash(data_.data(), DataSize, seed);
+ }
+
+ struct hash {
+ size_t operator()(const HSeq<N> &seq, uint32_t seed = 0) const {
+ return seq.GetHash(seed);
+ }
+
+ size_t operator()(const DataType *data, size_t sz = DataSize, uint32_t seed = 0) const {
+ return GetHash(data, sz, seed);
+ }
+ };
+
+ struct less2_fast {
+ bool operator()(const HSeq<N> &l, const HSeq<N> &r) const {
+ for (size_t i = 0; i < N; ++i) {
+ const uint8_t lr = l[i].raw, rr = r[i].raw;
+ if (lr != rr)
+ return lr < rr;
+ }
+
+ return false;
+ }
+ };
+};
+
+template<size_t N>
+std::ostream& operator<<(std::ostream& os, const HSeq<N> &seq) {
+ os << seq.str();
+ return os;
+}
+
+namespace internal {
+ template <size_t N>
+ inline size_t getSize(const hammer::HSeq<N> &) {
+ return N;
+ }
+
+ template <typename T>
+ inline size_t getSize(const T& a) {
+ return a.size();
+ }
+}
+
+};
+
+#endif // __HAMMER_HSEQ_HPP__
diff --git a/src/projects/ionhammer/config_struct.cpp b/src/projects/ionhammer/config_struct.cpp
new file mode 100644
index 0000000..d821d99
--- /dev/null
+++ b/src/projects/ionhammer/config_struct.cpp
@@ -0,0 +1,84 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#include "config_struct.hpp"
+
+#include "dev_support/openmp_wrapper.h"
+
+#include "llvm/Support/YAMLParser.h"
+#include "llvm/Support/YAMLTraits.h"
+
+#include <string>
+
+using namespace llvm;
+
+namespace llvm { namespace yaml {
+template <>
+struct ScalarEnumerationTraits<hammer_config::HammerStage> {
+ static void enumeration(yaml::IO &io, hammer_config::HammerStage &value) {
+ io.enumCase(value, "count", hammer_config::HammerStage::KMerCounting);
+ io.enumCase(value, "hamcluster", hammer_config::HammerStage::HammingClustering);
+ io.enumCase(value, "subcluster", hammer_config::HammerStage::SubClustering);
+ io.enumCase(value, "correct", hammer_config::HammerStage::ReadCorrection);
+ }
+};
+}}
+
+// FIXME: This is temporary
+class DataSetReader {
+ public:
+ DataSetReader(yaml::IO&) {}
+ DataSetReader(yaml::IO&, io::DataSet<>&) {}
+
+ io::DataSet<> denormalize(yaml::IO &) {
+ return io::DataSet<>(path);
+ }
+
+ std::string path;
+};
+
+namespace llvm { namespace yaml {
+template <>
+struct MappingTraits<hammer_config::hammer_config> {
+ static void mapping(yaml::IO &io, hammer_config::hammer_config &cfg) {
+ yaml::MappingNormalization<DataSetReader, io::DataSet<>> dataset(io, cfg.dataset);
+
+ io.mapRequired("dataset", dataset->path);
+ io.mapOptional("working_dir", cfg.working_dir, std::string("."));
+ io.mapOptional("output_dir", cfg.output_dir, std::string("."));
+ io.mapRequired("hard_memory_limit", cfg.hard_memory_limit);
+ io.mapOptional("count_split_buffer", cfg.count_split_buffer, 0ul);
+ io.mapOptional("max_nthreads", cfg.max_nthreads, 1u);
+ io.mapRequired("kmer_qual_threshold", cfg.kmer_qual_threshold);
+ io.mapRequired("center_qual_threshold", cfg.center_qual_threshold);
+ io.mapRequired("delta_score_threshold", cfg.delta_score_threshold);
+ io.mapRequired("keep_uncorrected_ends", cfg.keep_uncorrected_ends);
+ io.mapRequired("tau", cfg.tau);
+ io.mapOptional("debug_mode", cfg.debug_mode, false);
+ io.mapOptional("start_stage", cfg.start_stage, hammer_config::HammerStage::KMerCounting);
+ }
+};
+}}
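+// The mapping above corresponds to a YAML config of roughly the following
+// shape; the values shown are illustrative placeholders, not shipped defaults:
+//
+//   dataset:                dataset.yaml
+//   working_dir:            .
+//   output_dir:             .
+//   hard_memory_limit:      20
+//   max_nthreads:           8
+//   kmer_qual_threshold:    1e-5
+//   center_qual_threshold:  1e-5
+//   delta_score_threshold:  5.0
+//   keep_uncorrected_ends:  true
+//   tau:                    1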
+
+namespace hammer_config {
+void load(hammer_config& cfg, const std::string &filename) {
+ ErrorOr<std::unique_ptr<MemoryBuffer>> Buf = MemoryBuffer::getFile(filename);
+ if (!Buf)
+ throw(std::string("Failed to load config file ") + filename);
+
+ yaml::Input yin(*Buf.get());
+ yin >> cfg;
+
+ if (yin.error())
+ throw(std::string("Failed to load config file ") + filename);
+
+ // Fix number of threads according to OMP capabilities.
+ cfg.max_nthreads = std::min(cfg.max_nthreads, (unsigned)omp_get_max_threads());
+ // Inform OpenMP runtime about this :)
+ omp_set_num_threads(cfg.max_nthreads);
+}
+}
diff --git a/src/projects/ionhammer/config_struct.hpp b/src/projects/ionhammer/config_struct.hpp
new file mode 100644
index 0000000..64fe4b2
--- /dev/null
+++ b/src/projects/ionhammer/config_struct.hpp
@@ -0,0 +1,49 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#ifndef __HAMMER_IT_CONFIG_HPP__
+#define __HAMMER_IT_CONFIG_HPP__
+
+#include "pipeline/config_singl.hpp"
+
+#include "pipeline/library.hpp"
+
+namespace hammer_config {
+enum class HammerStage {
+ KMerCounting = 1,
+ HammingClustering = 2,
+ SubClustering = 3,
+ ReadCorrection = 4
+};
+
+struct hammer_config {
+ io::DataSet<> dataset;
+
+ std::string working_dir;
+ std::string output_dir;
+
+ unsigned max_nthreads;
+ unsigned tau;
+ unsigned hard_memory_limit;
+
+ size_t count_split_buffer;
+
+ double kmer_qual_threshold;
+ double center_qual_threshold;
+ double delta_score_threshold;
+ bool keep_uncorrected_ends;
+
+ bool debug_mode;
+ HammerStage start_stage;
+};
+
+void load(hammer_config& cfg, const std::string &filename);
+}
+
+typedef config_common::config<hammer_config::hammer_config> cfg;
+
+#endif // __HAMMER_IT_CONFIG_HPP__
diff --git a/src/ionhammer/consensus.hpp b/src/projects/ionhammer/consensus.hpp
similarity index 100%
rename from src/ionhammer/consensus.hpp
rename to src/projects/ionhammer/consensus.hpp
diff --git a/src/projects/ionhammer/err_helper_table.cpp b/src/projects/ionhammer/err_helper_table.cpp
new file mode 100644
index 0000000..c283a5b
--- /dev/null
+++ b/src/projects/ionhammer/err_helper_table.cpp
@@ -0,0 +1,39 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#include "err_helper_table.hpp"
+
+#include <fstream>
+#include <istream>
+
+#include "dev_support/logger/logger.hpp"
+
+namespace hammer {
+namespace errHelper {
+
+namespace internal {
+
+static const uint32_t helper_table_data[] = {
+#include "err_helper_table.inc"
+};
+
+// numbers are cumulative sums of
+// (2 * 4^2) / 32,
+// (2 * 4^4) / 32,
+// ...
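+// (for k-mer size k the table holds 4^k * 4^k two-bit hints, i.e. 2 * 4^(2k)
+// bits packed 16 hints per uint32_t word; e.g. the k = 2 table occupies 16
+// words, which is exactly the gap between the offsets 1 and 17 below)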
+const HelperTable helper_tables[] = {
+ { 1, helper_table_data },
+ { 2, helper_table_data + 1 },
+ { 3, helper_table_data + 17 },
+ { 4, helper_table_data + 273 },
+ { 5, helper_table_data + 4369 }
+};
+
+}; // namespace internal
+
+}; // namespace errHelper
+}; // namespace hammer
diff --git a/src/projects/ionhammer/err_helper_table.hpp b/src/projects/ionhammer/err_helper_table.hpp
new file mode 100644
index 0000000..e24494d
--- /dev/null
+++ b/src/projects/ionhammer/err_helper_table.hpp
@@ -0,0 +1,117 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#ifndef __HAMMER_ERR_HELPER_TABLE_HPP__
+#define __HAMMER_ERR_HELPER_TABLE_HPP__
+
+#include "hkmer.hpp"
+
+#include <vector>
+#include <istream>
+#include <string>
+#include <cstdlib>
+#include <cassert>
+
+#include "dev_support/logger/logger.hpp"
+
+namespace hammer {
+
+namespace errHelper {
+
+/// Type of error
+enum Hint {
+ kMismatch,
+ kInsertion,
+ kDeletion
+};
+
+namespace internal {
+
+// maximum size of K-mers in the helper tables
+static const unsigned int MAX_K = 5;
+
+struct HelperTable {
+ const unsigned k_;
+ const uint32_t* storage_;
+
+ template <typename It1, typename It2>
+ Hint lookupHint(const It1 &x_it, const It2 &y_it,
+ size_t x_nfront, size_t y_nfront) const {
+
+ VERIFY(k_ <= MAX_K);
+ unsigned x_code = getCode(x_it, x_nfront, k_);
+ unsigned y_code = getCode(y_it, y_nfront, k_);
+
+ unsigned code = x_code + (y_code << (2 * k_));
+ uint32_t bt = storage_[code / 16]; // 16 hints per uint32_t
+ unsigned shift = (code % 16) * 2;
+ return static_cast<Hint>((bt >> shift) & 0x3);
+ }
+
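+ // Example: runs [A3, C2] with x_nfront = 2 and k = 3 encode the bases A, A, C
+ // (A = 0, C = 1, two bits per base, least-significant bits first), so
+ // getCode() returns 1 << 4 = 16.
+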
+ template <typename HRunIter>
+ static unsigned getCode(const HRunIter& x_it, size_t x_nfront, size_t k) {
+ unsigned code = 0;
+ unsigned len = 0;
+ auto nucl = x_it->nucl;
+ for (len = 0; len < x_nfront && len < k; ++len)
+ code |= nucl << (2 * len);
+
+ if (len == k)
+ return code;
+
+ for (HRunIter it = x_it + 1; ; ++it) {
+ for (size_t i = 0; i < it->len; ++i) {
+ code |= it->nucl << (2 * len++);
+ if (len == k)
+ return code;
+ }
+ }
+
+ assert(false);
+ }
+};
+
+// tables for k = 1, 2, ..., MAX_K
+extern const HelperTable helper_tables[];
+
+template <typename HRunIter>
+static inline size_t getNumberOfRemainingBases(const HRunIter &x_it,
+ const HRunIter &x_end,
+ size_t x_nfront) {
+ size_t n = x_nfront;
+ if (n >= MAX_K)
+ return MAX_K;
+
+ for (HRunIter it = x_it + 1; it != x_end; ++it) {
+ n += it->len;
+ if (n >= MAX_K)
+ return MAX_K;
+ }
+
+ return n;
+}
+
+}; // namespace internal
+
+/// Estimate what kind of error occurred at the position
+template <typename It1, typename It2>
+static inline Hint getHint(const It1 &x_begin, const It1 &x_end,
+ const It2 &y_begin, const It2 &y_end,
+ size_t x_nfront, size_t y_nfront) {
+ VERIFY(x_nfront <= x_begin->len);
+ VERIFY(y_nfront <= y_begin->len);
+ size_t x_rem = internal::getNumberOfRemainingBases(x_begin, x_end, x_nfront);
+ size_t y_rem = internal::getNumberOfRemainingBases(y_begin, y_end, y_nfront);
+
+ auto& table = internal::helper_tables[std::min(x_rem, y_rem) - 1];
+ return table.lookupHint<It1, It2>(x_begin, y_begin, x_nfront, y_nfront);
+}
+
+}; // namespace errHelper
+}; // namespace hammer
+
+#endif // __HAMMER_ERR_HELPER_TABLE_HPP__
diff --git a/src/ionhammer/err_helper_table.inc b/src/projects/ionhammer/err_helper_table.inc
similarity index 100%
rename from src/ionhammer/err_helper_table.inc
rename to src/projects/ionhammer/err_helper_table.inc
diff --git a/src/projects/ionhammer/expander.cpp b/src/projects/ionhammer/expander.cpp
new file mode 100644
index 0000000..14f4d98
--- /dev/null
+++ b/src/projects/ionhammer/expander.cpp
@@ -0,0 +1,60 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#include "expander.hpp"
+
+#include "config_struct.hpp"
+#include "kmer_data.hpp"
+#include "valid_hkmer_generator.hpp"
+
+#include "io/reads_io/file_reader.hpp"
+
+#include <vector>
+#include <cstring>
+
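+// Two passes over each read: first mark every position covered by a solid k-mer
+// (one that is its own cluster center and passes the center quality threshold),
+// then promote every non-center k-mer starting at a covered position to a center
+// of its own (changeto = idx); changed_ counts such promotions.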
+bool Expander::operator()(const io::SingleRead &r) {
+ size_t sz = r.size();
+
+ std::vector<unsigned> covered_by_solid(sz, false);
+ std::vector<size_t> kmer_indices(sz, -1ull);
+
+ ValidHKMerGenerator<hammer::K> gen(r);
+ while (gen.HasMore()) {
+ hammer::HKMer kmer = gen.kmer();
+ size_t idx = data_.seq_idx(kmer), kl = kmer.size();
+ size_t read_pos = gen.pos() - kl;
+
+ kmer_indices[read_pos] = idx;
+ if (data_[idx].changeto == idx &&
+ data_[idx].qual < cfg::get().center_qual_threshold) {
+ for (size_t j = read_pos; j < read_pos + kl; ++j) {
+ VERIFY_MSG(j < sz, "read_pos == " << read_pos << ", r.size() == " << r.size() << ", kmer: " << kmer << ", read: " << r.GetSequenceString());
+ covered_by_solid[j] = true;
+ }
+ }
+
+ gen.Next();
+ }
+
+ for (size_t j = 0; j < sz; ++j) {
+ if (!covered_by_solid[j] || kmer_indices[j] == -1ull)
+ continue;
+
+ size_t idx = kmer_indices[j];
+ auto &kmer_data = data_[idx];
+ if (kmer_data.changeto != idx) {
+# pragma omp atomic
+ changed_ += 1;
+
+ kmer_data.lock();
+ kmer_data.changeto = static_cast<unsigned>(idx);
+ kmer_data.unlock();
+ }
+ }
+
+ return false;
+}
diff --git a/src/ionhammer/expander.hpp b/src/projects/ionhammer/expander.hpp
similarity index 100%
rename from src/ionhammer/expander.hpp
rename to src/projects/ionhammer/expander.hpp
diff --git a/src/projects/ionhammer/flow_space_read.hpp b/src/projects/ionhammer/flow_space_read.hpp
new file mode 100644
index 0000000..d308f4d
--- /dev/null
+++ b/src/projects/ionhammer/flow_space_read.hpp
@@ -0,0 +1,77 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#ifndef __HAMMER_IT_FLOW_SPACE_READ_HPP__
+#define __HAMMER_IT_FLOW_SPACE_READ_HPP__
+
+#include "io/reads/single_read.hpp"
+#include "HSeq.hpp"
+
+#include <deque>
+#include <cstddef>
+#include <string>
+
+namespace hammer {
+
+/// Read interpreted as series of homopolymer runs
+class FlowSpaceRead {
+ std::string name_;
+ std::deque<HomopolymerRun> runs_;
+ public:
+ FlowSpaceRead(const io::SingleRead& read) : name_(read.name()) {
+ const auto& seq = read.GetSequenceString();
+ hammer::iontorrent::toHomopolymerRuns(seq, runs_);
+ }
+
+ template <typename It>
+ FlowSpaceRead(It runs_beg, It runs_end) :
+ runs_(runs_beg, runs_end) {}
+
+ size_t size() const {
+ return runs_.size();
+ }
+
+ const std::string& name() const {
+ return name_;
+ }
+
+ HomopolymerRun operator[](size_t index) const {
+ return runs_[index];
+ }
+
+ HomopolymerRun& operator[](size_t index) {
+ return runs_[index];
+ }
+
+ void TrimLeft(size_t n_runs) {
+ if (n_runs >= runs_.size())
+ runs_.clear();
+ else
+ runs_.erase(runs_.begin(), runs_.begin() + n_runs);
+ }
+
+ void TrimRight(size_t n_runs) {
+ if (n_runs >= runs_.size())
+ runs_.clear();
+ else
+ runs_.erase(runs_.end() - n_runs, runs_.end());
+ }
+
+ std::string GetSequenceString() const {
+ std::string seq;
+ for (size_t i = 0; i < runs_.size(); ++i)
+ seq += runs_[i].str();
+ return seq;
+ }
+
+ const std::deque<hammer::HomopolymerRun>& data() const {
+ return runs_;
+ }
+};
+
+} // namespace hammer
+#endif
diff --git a/src/projects/ionhammer/hamcluster.cpp b/src/projects/ionhammer/hamcluster.cpp
new file mode 100644
index 0000000..a54a66b
--- /dev/null
+++ b/src/projects/ionhammer/hamcluster.cpp
@@ -0,0 +1,219 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#include "hamcluster.hpp"
+
+#include "hkmer_distance.hpp"
+#include "utils/adt/concurrent_dsu.hpp"
+#include "io/kmers_io/mmapped_reader.hpp"
+
+#include <iostream>
+#include <sstream>
+
+#ifdef USE_GLIBCXX_PARALLEL
+#include <parallel/algorithm>
+#endif
+
+struct SubKMerComparator {
+ bool operator()(const SubKMerData &lhs, const SubKMerData &rhs) {
+ return SubKMer::less2_fast()(lhs.data, rhs.data);
+ }
+};
+
+std::pair<size_t, size_t> SubKMerSplitter::split() {
+ std::vector<SubKMerData> data;
+
+ MMappedReader ifs(ifname_, /* unlink */ true);
+ std::ofstream ofs(ofname_, std::ios::out | std::ios::binary);
+ VERIFY(ofs.good());
+ size_t icnt = 0, ocnt = 0;
+ while (ifs.good()) {
+ SubKMerComparator comp;
+
+ deserialize(data, ifs);
+
+#ifdef USE_GLIBCXX_PARALLEL
+ // Explicitly force a call to parallel sort routine.
+ __gnu_parallel::sort(data.begin(), data.end(), comp);
+#else
+ std::sort(data.begin(), data.end(), comp);
+#endif
+ for (auto start = data.begin(), end = data.end(); start != end;) {
+ auto chunk_end = std::upper_bound(start + 1, data.end(), *start, comp);
+ serialize(ofs, start, chunk_end);
+ start = chunk_end;
+ ocnt += 1;
+ }
+ icnt += 1;
+ }
+ VERIFY(!ofs.fail());
+
+ ofs.close();
+
+ return std::make_pair(icnt, ocnt);
+}
+
+#if 1
+static bool canMerge(const ConcurrentDSU &uf, unsigned x, unsigned y) {
+ size_t szx = uf.set_size(x), szy = uf.set_size(y);
+ const size_t hardthr = 2500;
+
+ // Global threshold - no cluster larger than hard threshold
+ if (szx + szy > hardthr)
+ return false;
+
+ // If one of the clusters is moderately large, then attach only "almost"
+ // singletons.
+ if ((szx > hardthr * 3 / 4 && szy > 50) ||
+ (szy > hardthr * 3 / 4 && szx > 50))
+ return false;
+
+ return true;
+}
+#else
+static bool canMerge(const ConcurrentDSU &uf, unsigned x, unsigned y) {
+ return (uf.set_size(x) + uf.set_size(y)) < 10000;
+}
+#endif
+
+
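+// Quadratic pass over one candidate block: each pair of k-mers (together with
+// their reverse complements) is united in the DSU when either orientation is
+// within distance tau and the size-based merge policy above allows it.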
+static void processBlockQuadratic(ConcurrentDSU &uf,
+ const std::vector<size_t> &block,
+ const KMerData &data,
+ unsigned tau) {
+ size_t blockSize = block.size();
+ for (size_t i = 0; i < blockSize; ++i) {
+ auto x = static_cast<unsigned>(block[i]);
+ hammer::HKMer kmerx = data[x].kmer;
+ hammer::HKMer rkmerx = !kmerx;
+ auto rcx = static_cast<unsigned>(data.seq_idx(rkmerx));
+
+ for (size_t j = i + 1; j < blockSize; j++) {
+ auto y = static_cast<unsigned>(block[j]);
+ hammer::HKMer kmery = data[y].kmer;
+ hammer::HKMer rkmery = !kmery;
+ auto rcy = static_cast<unsigned>(data.seq_idx(rkmery));
+ if ((uf.find_set(x) != uf.find_set(y) || uf.find_set(rcx) !=
+ uf.find_set(rcy)) &&
+ (canMerge(uf, x, y) || canMerge(uf, rcx, rcy)) &&
+ (hammer::distanceHKMer(kmerx.begin(), kmerx.end(),
+ kmery.begin(), kmery.end(), tau) <= tau ||
+ hammer::distanceHKMer(rkmerx.begin(), rkmerx.end(),
+ rkmery.begin(), rkmery.end(), tau) <= tau)) {
+ uf.unite(x, y);
+ uf.unite(rcx, rcy);
+ }
+ }
+ }
+}
+
+void KMerHamClusterer::cluster(const std::string &prefix,
+ const KMerData &data,
+ ConcurrentDSU &uf) {
+ // First pass - split & sort the k-mers
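+ // (each k-mer is cut into tau + 1 sub-k-mers; by the pigeonhole principle two
+ // k-mers that differ in at most tau positions share at least one sub-k-mer, so
+ // only k-mers landing in the same block need a pairwise distance check)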
+ std::ostringstream tmp;
+ tmp << prefix << ".first";
+ std::string fname(tmp.str());
+ std::ofstream ofs(fname, std::ios::out | std::ios::binary);
+ VERIFY(ofs.good());
+
+ INFO("Serializing sub-kmers.");
+ for (unsigned i = 0; i < tau_ + 1; ++i) {
+ // size_t from = (*Globals::subKMerPositions)[i];
+ // size_t to = (*Globals::subKMerPositions)[i+1];
+ size_t from = 0 + i*hammer::K / (tau_ + 1);
+ size_t to = 0 + (i+1)*hammer::K / (tau_ + 1);
+
+ INFO("Serializing: [" << from << ", " << to << ")");
+ serialize(ofs, data, NULL,
+ SubKMerPartSerializer(from, to));
+ }
+ VERIFY(!ofs.fail());
+ ofs.close();
+
+ size_t big_blocks1 = 0;
+ {
+ INFO("Splitting sub-kmers, pass 1.");
+ SubKMerSplitter Splitter(fname, fname + ".blocks");
+ std::pair<size_t, size_t> stat = Splitter.split();
+ INFO("Splitting done."
+ " Processed " << stat.first << " blocks."
+ " Produced " << stat.second << " blocks.");
+
+ // Sanity check: there cannot be more blocks than (tau + 1) times the total
+ // number of k-mers. And on the first pass there are only tau + 1 input blocks!
+ VERIFY(stat.first == tau_ + 1);
+ VERIFY(stat.second <= (tau_ + 1) * data.size());
+
+ // Now the output file contains everything grouped into blocks.
+
+ std::vector<size_t> block;
+
+ INFO("Merge sub-kmers, pass 1");
+ SubKMerBlockFile blocks(fname + ".blocks", /* unlink */ true);
+
+ std::ostringstream tmp;
+ tmp << prefix << ".second";
+ fname = tmp.str();
+
+ ofs.open(fname, std::ios::out | std::ios::binary);
+ VERIFY(ofs.good());
+ while (blocks.get_block(block)) {
+ // unsigned block_thr = cfg::get().hamming_blocksize_quadratic_threshold;
+ unsigned block_thr = 50;
+ if (block.size() < block_thr) {
+ // Merge small blocks.
+ processBlockQuadratic(uf, block, data, tau_);
+ } else {
+ big_blocks1 += 1;
+ // Otherwise - dump for next iteration.
+ for (unsigned i = 0; i < tau_ + 1; ++i) {
+ serialize(ofs, data, &block,
+ SubKMerStridedSerializer(i, tau_ + 1));
+ }
+ }
+ }
+ VERIFY(!ofs.fail());
+ ofs.close();
+ INFO("Merge done, total " << big_blocks1 << " new blocks generated.");
+ }
+
+ size_t big_blocks2 = 0;
+ {
+ INFO("Spliting sub-kmers, pass 2.");
+ SubKMerSplitter Splitter(fname, fname + ".blocks");
+ std::pair<size_t, size_t> stat = Splitter.split();
+ INFO("Splitting done."
+ " Processed " << stat.first << " blocks."
+ " Produced " << stat.second << " blocks.");
+
+ // Sanity check: the number of output blocks is bounded by (tau + 1)^2 times the
+ // total number of k-mers, and the input must be exactly (tau + 1) * big_blocks1 blocks.
+ VERIFY(stat.first == (tau_ + 1)*big_blocks1);
+ VERIFY(stat.second <= (tau_ + 1) * (tau_ + 1) * data.size());
+
+ INFO("Merge sub-kmers, pass 2");
+ SubKMerBlockFile blocks(fname + ".blocks", /* unlink */ true);
+ std::vector<size_t> block;
+
+ size_t nblocks = 0;
+ while (blocks.get_block(block)) {
+ if (block.size() > 50) {
+ big_blocks2 += 1;
+#if 0
+ for (size_t i = 0; i < block.size(); ++i) {
+ std::string s(Globals::blob + data[block[i]], K);
+ INFO("" << block[i] << ": " << s);
+ }
+#endif
+ }
+ processBlockQuadratic(uf, block, data, tau_);
+ nblocks += 1;
+ }
+ INFO("Merge done, saw " << big_blocks2 << " big blocks out of " << nblocks << " processed.");
+ }
+}
diff --git a/src/projects/ionhammer/hamcluster.hpp b/src/projects/ionhammer/hamcluster.hpp
new file mode 100644
index 0000000..019404f
--- /dev/null
+++ b/src/projects/ionhammer/hamcluster.hpp
@@ -0,0 +1,192 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#ifndef HAMMER_SUBKMER_SORTER_HPP
+#define HAMMER_SUBKMER_SORTER_HPP
+
+#include "kmer_data.hpp"
+#include "io/kmers_io/mmapped_reader.hpp"
+
+#include "dev_support/logger/logger.hpp"
+#include "HSeq.hpp"
+
+#include <iostream>
+#include <vector>
+
+class ConcurrentDSU;
+
+typedef hammer::HSeq<(hammer::K + 1) / 2> SubKMer;
+
+struct SubKMerData {
+ uint64_t idx;
+ SubKMer data;
+};
+
+template<class Reader>
+inline void binary_read(Reader &is, SubKMerData &s) {
+ SubKMer::DataType seq_data[SubKMer::DataSize];
+
+ is.read((char*)&s.idx, sizeof(s.idx));
+ is.read((char*)seq_data, sizeof(seq_data));
+
+ s.data = SubKMer(seq_data, seq_data + SubKMer::DataSize);
+}
+
+template<class Writer>
+inline Writer &binary_write(Writer &os, const SubKMerData &s) {
+ os.write((char*)&s.idx, sizeof(s.idx));
+ os.write((char*)s.data.data(), SubKMer::TotalBytes);
+
+ return os;
+}
+
+static_assert(sizeof(SubKMerData) == 16, "Too big SubKMer");
+
+class SubKMerPartSerializer{
+ size_t from_;
+ size_t to_;
+
+public:
+ SubKMerPartSerializer(size_t from, size_t to)
+ :from_(from), to_(to) { VERIFY(to_ - from_ <= hammer::K); }
+
+ SubKMerData serialize(hammer::HKMer k, size_t fidx) const {
+ SubKMerData s;
+
+ s.idx = fidx;
+ s.data = SubKMer(k.data() + from_, k.data() + to_);
+
+ // Yay for NRVO!
+ return s;
+ }
+};
+
+class SubKMerStridedSerializer{
+ size_t from_;
+ size_t to_;
+ size_t stride_;
+
+public:
+ SubKMerStridedSerializer(size_t from, size_t stride)
+ :from_(from), stride_(stride) { VERIFY(from_ + stride_ <= hammer::K); }
+
+ SubKMerData serialize(hammer::HKMer k, size_t fidx) const {
+ SubKMerData s;
+
+ s.idx = fidx;
+
+ size_t sz = (hammer::K - from_ + stride_ - 1) / stride_;
+
+ std::vector<hammer::HKMer::DataType> v(sz);
+ for (size_t i = from_, j = 0; i < hammer::K; i+= stride_, ++j)
+ v[j] = k[i];
+
+ s.data = SubKMer(&v[0], &v[0] + sz);
+
+ // Yay for NRVO!
+ return s;
+ }
+};
+
+class SubKMerBlockFile {
+ MMappedReader ifs_;
+
+ public:
+ SubKMerBlockFile(const std::string &fname, bool unlink = false)
+ : ifs_(fname, unlink) { }
+
+ bool get_block(std::vector<size_t> &block) {
+ block.clear();
+#if 0
+ block.shrink_to_fit();
+#else
+ std::vector<size_t>().swap(block);
+#endif
+
+ if (!ifs_.good())
+ return false;
+
+ size_t sz;
+ ifs_.read((char*)&sz, sizeof(sz));
+ block.resize(sz);
+ for (size_t i = 0; i < sz; ++i) {
+ SubKMerData s;
+ binary_read(ifs_, s);
+ block[i] = s.idx;
+ }
+
+ return true;
+ }
+};
+
+template<class Writer,
+ class SubKMerSerializer>
+void serialize(Writer &os,
+ const KMerData &data, const std::vector<size_t> *block = NULL,
+ const SubKMerSerializer &serializer = SubKMerSerializer()) {
+ size_t sz = (block == NULL ? data.size() : block->size());
+ os.write((char*)&sz, sizeof(sz));
+ for (size_t i = 0, e = sz; i != e; ++i) {
+ size_t idx = (block == NULL ? i : (*block)[i]);
+ SubKMerData s = serializer.serialize(data[idx].kmer, idx);
+ binary_write(os, s);
+ }
+}
+
+class SubKMerSplitter {
+ const std::string ifname_;
+ const std::string ofname_;
+
+ public:
+ SubKMerSplitter(const std::string &ifname, const std::string &ofname)
+ : ifname_(ifname), ofname_(ofname) {}
+
+ template<class Writer>
+ void serialize(Writer &os,
+ const std::vector<SubKMerData>::iterator &start,
+ const std::vector<SubKMerData>::iterator &end) {
+ size_t sz = end - start;
+
+ os.write((char*)&sz, sizeof(sz));
+ for (auto I = start, E = end; I != E; ++I)
+ binary_write(os, *I);
+ }
+
+ template<class Reader>
+ void deserialize(std::vector<SubKMerData> &res,
+ Reader &is) {
+ res.clear();
+#if 0
+ res.shrink_to_fit();
+#else
+ std::vector<SubKMerData>().swap(res);
+#endif
+
+ size_t sz;
+ is.read((char*)&sz, sizeof(sz));
+ res.resize(sz);
+
+ for (size_t i = 0, e = sz; i != e; ++i)
+ binary_read(is, res[i]);
+ }
+
+ std::pair<size_t, size_t> split();
+};
+
+class KMerHamClusterer {
+ unsigned tau_;
+
+ public:
+ KMerHamClusterer(unsigned tau)
+ : tau_(tau) {}
+
+ void cluster(const std::string &prefix, const KMerData &data, ConcurrentDSU &uf);
+ private:
+ DECL_LOGGER("Hamming Clustering");
+};
+
+#endif // HAMMER_SUBKMER_SORTER_HPP
diff --git a/src/ionhammer/hkmer.hpp b/src/projects/ionhammer/hkmer.hpp
similarity index 100%
rename from src/ionhammer/hkmer.hpp
rename to src/projects/ionhammer/hkmer.hpp
diff --git a/src/ionhammer/hkmer_distance.hpp b/src/projects/ionhammer/hkmer_distance.hpp
similarity index 100%
rename from src/ionhammer/hkmer_distance.hpp
rename to src/projects/ionhammer/hkmer_distance.hpp
diff --git a/src/projects/ionhammer/kmer_data.cpp b/src/projects/ionhammer/kmer_data.cpp
new file mode 100644
index 0000000..3aae09d
--- /dev/null
+++ b/src/projects/ionhammer/kmer_data.cpp
@@ -0,0 +1,245 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#include "kmer_data.hpp"
+#include "config_struct.hpp"
+#include "valid_hkmer_generator.hpp"
+
+#include "io/kmers_io/mmapped_writer.hpp"
+#include "io/reads_io/file_reader.hpp"
+#include "io/reads_io/read_processor.hpp"
+
+#include <libcxx/sort.hpp>
+
+using namespace hammer;
+
+class BufferFiller;
+
+class HammerKMerSplitter : public KMerSplitter<hammer::HKMer> {
+ typedef std::vector<std::vector<HKMer> > KMerBuffer;
+
+ void DumpBuffers(size_t num_files, size_t nthreads,
+ std::vector<KMerBuffer> &buffers,
+ const path::files_t &ostreams) const;
+
+ public:
+ HammerKMerSplitter(const std::string &work_dir)
+ : KMerSplitter<hammer::HKMer>(work_dir, hammer::K) {}
+
+ virtual path::files_t Split(size_t num_files);
+
+ friend class BufferFiller;
+};
+
+void HammerKMerSplitter::DumpBuffers(size_t num_files, size_t nthreads,
+ std::vector<KMerBuffer> &buffers,
+ const path::files_t &ostreams) const {
+# pragma omp parallel for num_threads(nthreads)
+ for (unsigned k = 0; k < num_files; ++k) {
+ size_t sz = 0;
+ for (size_t i = 0; i < nthreads; ++i)
+ sz += buffers[i][k].size();
+
+ std::vector<HKMer> SortBuffer;
+ SortBuffer.reserve(sz);
+ for (size_t i = 0; i < nthreads; ++i) {
+ KMerBuffer &entry = buffers[i];
+ SortBuffer.insert(SortBuffer.end(), entry[k].begin(), entry[k].end());
+ }
+ libcxx::sort(SortBuffer.begin(), SortBuffer.end(), HKMer::less2_fast());
+ auto it = std::unique(SortBuffer.begin(), SortBuffer.end());
+
+# pragma omp critical
+ {
+ FILE *f = fopen(ostreams[k].c_str(), "ab");
+ VERIFY_MSG(f, "Cannot open temporary file to write");
+ fwrite(SortBuffer.data(), sizeof(HKMer), it - SortBuffer.begin(), f);
+ fclose(f);
+ }
+ }
+
+ for (unsigned i = 0; i < nthreads; ++i) {
+ for (unsigned j = 0; j < num_files; ++j) {
+ buffers[i][j].clear();
+ }
+ }
+}
+
+class BufferFiller {
+ std::vector<HammerKMerSplitter::KMerBuffer> &tmp_entries_;
+ unsigned num_files_;
+ size_t cell_size_;
+ size_t processed_;
+ const HammerKMerSplitter &splitter_;
+
+ public:
+ BufferFiller(std::vector<HammerKMerSplitter::KMerBuffer> &tmp_entries, size_t cell_size, const HammerKMerSplitter &splitter):
+ tmp_entries_(tmp_entries), num_files_((unsigned)tmp_entries[0].size()), cell_size_(cell_size), processed_(0), splitter_(splitter) {}
+
+ size_t processed() const { return processed_; }
+
+ bool operator()(const io::SingleRead &r) {
+ ValidHKMerGenerator<hammer::K> gen(r);
+ HammerKMerSplitter::KMerBuffer &entry = tmp_entries_[omp_get_thread_num()];
+
+# pragma omp atomic
+ processed_ += 1;
+
+ bool stop = false;
+ while (gen.HasMore()) {
+ HKMer seq = gen.kmer(); size_t idx;
+
+ idx = splitter_.GetFileNumForSeq(seq, num_files_);
+ entry[idx].push_back(seq);
+ stop |= entry[idx].size() > cell_size_;
+
+ seq = !seq;
+
+ idx = splitter_.GetFileNumForSeq(seq, num_files_);
+ entry[idx].push_back(seq);
+ stop |= entry[idx].size() > cell_size_;
+
+ gen.Next();
+ }
+
+ return stop;
+ }
+};
+
+path::files_t HammerKMerSplitter::Split(size_t num_files) {
+ unsigned nthreads = cfg::get().max_nthreads;
+
+ INFO("Splitting kmer instances into " << num_files << " buckets. This might take a while.");
+
+ // Determine the set of output files
+ path::files_t out;
+ for (unsigned i = 0; i < num_files; ++i)
+ out.push_back(GetRawKMersFname(i));
+
+ size_t reads_buffer_size = cfg::get().count_split_buffer;
+ if (reads_buffer_size == 0) {
+ reads_buffer_size = 536870912ull; // 512 MB
+ size_t mem_limit = (size_t)((double)(get_free_memory()) / (nthreads * 3));
+ INFO("Memory available for splitting buffers: " << (double)mem_limit / 1024.0 / 1024.0 / 1024.0 << " Gb");
+ reads_buffer_size = std::min(reads_buffer_size, mem_limit);
+ }
+ size_t cell_size = reads_buffer_size / (num_files * sizeof(HKMer));
+ // Set sane minimum cell size
+ if (cell_size < 16384)
+ cell_size = 16384;
+
+ INFO("Using cell size of " << cell_size);
+ std::vector<KMerBuffer> tmp_entries(nthreads);
+ for (unsigned i = 0; i < nthreads; ++i) {
+ KMerBuffer &entry = tmp_entries[i];
+ entry.resize(num_files);
+ for (unsigned j = 0; j < num_files; ++j) {
+ entry[j].reserve((size_t)(1.1 * (double)cell_size));
+ }
+ }
+
+ size_t n = 15;
+ const auto& dataset = cfg::get().dataset;
+ BufferFiller filler(tmp_entries, cell_size, *this);
+ for (auto it = dataset.reads_begin(), et = dataset.reads_end(); it != et; ++it) {
+ INFO("Processing " << *it);
+ io::FileReadStream irs(*it, io::PhredOffset);
+ hammer::ReadProcessor rp(nthreads);
+ while (!irs.eof()) {
+ rp.Run(irs, filler);
+ DumpBuffers(num_files, nthreads, tmp_entries, out);
+ VERIFY_MSG(rp.read() == rp.processed(), "Queue unbalanced");
+
+ if (filler.processed() >> n) { // i.e. processed >= 2^n
+ INFO("Processed " << filler.processed() << " reads");
+ n += 1;
+ }
+ }
+ }
+ INFO("Processed " << filler.processed() << " reads");
+
+ return out;
+}
+
+static inline void Merge(KMerStat &lhs, const KMerStat &rhs) {
+ if (lhs.count == 0)
+ lhs.kmer = rhs.kmer;
+
+ lhs.count += rhs.count;
+ lhs.qual *= rhs.qual;
+}
+
+static void PushKMer(KMerData &data, HKMer kmer, double qual) {
+ KMerStat &kmc = data[kmer];
+ kmc.lock();
+ Merge(kmc, KMerStat(1, kmer, qual));
+ kmc.unlock();
+}
+
+static void PushKMerRC(KMerData &data, HKMer kmer, double qual) {
+ kmer = !kmer;
+
+ KMerStat &kmc = data[kmer];
+ kmc.lock();
+ Merge(kmc, KMerStat(1, kmer, qual));
+ kmc.unlock();
+}
+
+class KMerDataFiller {
+ KMerData &data_;
+
+ public:
+ KMerDataFiller(KMerData &data)
+ : data_(data) {}
+
+ bool operator()(const io::SingleRead &r) const {
+ ValidHKMerGenerator<hammer::K> gen(r);
+ while (gen.HasMore()) {
+ HKMer kmer = gen.kmer();
+ double correct = gen.correct_probability();
+
+ PushKMer(data_, kmer, 1 - correct);
+ PushKMerRC(data_, kmer, 1 - correct);
+
+ gen.Next();
+ }
+
+ // Do not stop
+ return false;
+ }
+};
+
+void KMerDataCounter::FillKMerData(KMerData &data) {
+ HammerKMerSplitter splitter(cfg::get().working_dir);
+ KMerDiskCounter<hammer::HKMer> counter(cfg::get().working_dir, splitter);
+ size_t sz = KMerIndexBuilder<HammerKMerIndex>(cfg::get().working_dir, num_files_, cfg::get().max_nthreads).BuildIndex(data.index_, counter);
+
+ // Now use the index to fill the kmer quality information.
+ INFO("Collecting K-mer information, this takes a while.");
+ data.data_.resize(sz);
+
+ const auto& dataset = cfg::get().dataset;
+ for (auto it = dataset.reads_begin(), et = dataset.reads_end(); it != et; ++it) {
+ INFO("Processing " << *it);
+ io::FileReadStream irs(*it, io::PhredOffset);
+ KMerDataFiller filler(data);
+ hammer::ReadProcessor(cfg::get().max_nthreads).Run(irs, filler);
+ }
+
+ INFO("Collection done, postprocessing.");
+
+ size_t singletons = 0;
+ for (size_t i = 0; i < data.size(); ++i) {
+ VERIFY(data[i].count);
+
+ if (data[i].count == 1)
+ singletons += 1;
+ }
+
+ INFO("Merge done. There are " << data.size() << " kmers in total. "
+ "Among them " << singletons << " (" << 100.0 * double(singletons) / double(data.size()) << "%) are singletons.");
+}
diff --git a/src/projects/ionhammer/kmer_data.hpp b/src/projects/ionhammer/kmer_data.hpp
new file mode 100644
index 0000000..8afd216
--- /dev/null
+++ b/src/projects/ionhammer/kmer_data.hpp
@@ -0,0 +1,124 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#ifndef __HAMMER_KMER_DATA_HPP__
+#define __HAMMER_KMER_DATA_HPP__
+
+#include "data_structures/mph_index/kmer_index.hpp"
+#include "hkmer.hpp"
+
+#include <vector>
+
+#include <cstdlib>
+
+namespace hammer {
+
+struct KMerStat {
+ size_t count;
+ HKMer kmer;
+ double qual;
+ unsigned changeto;
+ uint8_t lock_;
+
+ KMerStat(size_t count = 0, HKMer kmer = HKMer(), double qual = 1.0, unsigned changeto = -1)
+ : count(count), kmer(kmer), qual(qual), changeto(changeto), lock_(0) { }
+
+ void lock() {
+ while (__sync_val_compare_and_swap(&lock_, 0, 1) == 1)
+ sched_yield();
+ }
+ void unlock() {
+ lock_ = 0;
+ __sync_synchronize();
+ }
+};
+
+};
+
+typedef KMerIndex<kmer_index_traits<hammer::HKMer> > HammerKMerIndex;
+
+class KMerData {
+ typedef std::vector<hammer::KMerStat> KMerDataStorageType;
+
+ public:
+ KMerData() {}
+
+ size_t size() const { return data_.size(); }
+ size_t capacity() const { return data_.capacity(); }
+ void clear() {
+ data_.clear();
+ push_back_buffer_.clear();
+ KMerDataStorageType().swap(data_);
+ KMerDataStorageType().swap(push_back_buffer_);
+ }
+ size_t push_back(const hammer::KMerStat &k) {
+ push_back_buffer_.push_back(k);
+
+ return data_.size() + push_back_buffer_.size() - 1;
+ }
+
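+ // k-mers appended via push_back() are stored in push_back_buffer_ and are
+ // addressed by indices past data_.size(); operator[] below dispatches
+ // transparently between the two storages.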
+ hammer::KMerStat& operator[](size_t idx) {
+ size_t dsz = data_.size();
+ return (idx < dsz ? data_[idx] : push_back_buffer_[idx - dsz]);
+ }
+ const hammer::KMerStat& operator[](size_t idx) const {
+ size_t dsz = data_.size();
+ return (idx < dsz ? data_[idx] : push_back_buffer_[idx - dsz]);
+ }
+ hammer::KMerStat& operator[](hammer::HKMer s) { return operator[](index_.seq_idx(s)); }
+ const hammer::KMerStat& operator[](hammer::HKMer s) const { return operator[](index_.seq_idx(s)); }
+ size_t seq_idx(hammer::HKMer s) const { return index_.seq_idx(s); }
+
+ template <class Writer>
+ void binary_write(Writer &os) {
+ size_t sz = data_.size();
+ os.write((char*)&sz, sizeof(sz));
+ os.write((char*)&data_[0], sz*sizeof(data_[0]));
+ index_.serialize(os);
+ }
+
+ template <class Reader>
+ void binary_read(Reader &is) {
+ size_t sz = 0;
+ is.read((char*)&sz, sizeof(sz));
+ data_.resize(sz);
+ is.read((char*)&data_[0], sz*sizeof(data_[0]));
+ index_.deserialize(is);
+ }
+
+ private:
+ KMerDataStorageType data_;
+ KMerDataStorageType push_back_buffer_;
+ HammerKMerIndex index_;
+
+ friend class KMerDataCounter;
+};
+
+struct CountCmp {
+ const KMerData &kmer_data_;
+
+ CountCmp(const KMerData &kmer_data)
+ : kmer_data_(kmer_data) {}
+
+ bool operator()(unsigned lhs, unsigned rhs) {
+ return kmer_data_[lhs].count > kmer_data_[rhs].count;
+ }
+};
+
+class KMerDataCounter {
+ unsigned num_files_;
+
+ public:
+ KMerDataCounter(unsigned num_files) : num_files_(num_files) {}
+
+ void FillKMerData(KMerData &data);
+
+ private:
+ DECL_LOGGER("K-mer Counting");
+};
+
+#endif // __HAMMER_KMER_DATA_HPP__
diff --git a/src/projects/ionhammer/main.cpp b/src/projects/ionhammer/main.cpp
new file mode 100644
index 0000000..cb3f35b
--- /dev/null
+++ b/src/projects/ionhammer/main.cpp
@@ -0,0 +1,332 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#include "dev_support/logger/log_writers.hpp"
+
+#include "io/reads_io/file_reader.hpp"
+#include "io/sam_io/bam_reader.hpp"
+#include "io/reads_io/paired_readers.hpp"
+#include "io/reads_io/osequencestream.hpp"
+#include "io/reads_io/read_processor.hpp"
+
+#include "utils/adt/concurrent_dsu.hpp"
+
+#include "dev_support/segfault_handler.hpp"
+#include "dev_support/memory_limit.hpp"
+
+#include "HSeq.hpp"
+#include "kmer_data.hpp"
+#include "hamcluster.hpp"
+#include "subcluster.hpp"
+#include "err_helper_table.hpp"
+#include "read_corrector.hpp"
+#include "expander.hpp"
+#include "config_struct.hpp"
+
+#include "dev_support/openmp_wrapper.h"
+
+#include "version.hpp"
+
+#include <fstream>
+#include <iomanip>
+
+#include <bamtools/api/BamReader.h>
+#include <bamtools/api/SamHeader.h>
+
+void create_console_logger() {
+ using namespace logging;
+
+ logger *lg = create_logger("");
+ lg->add_writer(std::make_shared<console_writer>());
+ attach_logger(lg);
+}
+
+struct UfCmp {
+ bool operator()(const std::vector<unsigned long> &lhs,
+ const std::vector<unsigned long> &rhs) {
+ return lhs.size() > rhs.size();
+ }
+};
+
+// This is a weird workaround for a bug in gcc 4.4.7
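+// Returns true when stage `current` has to be executed, given that the pipeline
+// is configured to start from stage `start`.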
+static bool stage(hammer_config::HammerStage start, hammer_config::HammerStage current) {
+ switch (start) {
+ case hammer_config::HammerStage::KMerCounting:
+ return true;
+ case hammer_config::HammerStage::HammingClustering:
+ return current != hammer_config::HammerStage::KMerCounting;
+ case hammer_config::HammerStage::SubClustering:
+ return (current != hammer_config::HammerStage::KMerCounting &&
+ current != hammer_config::HammerStage::HammingClustering);
+ case hammer_config::HammerStage::ReadCorrection:
+ return current == hammer_config::HammerStage::ReadCorrection;
+ }
+ assert(0);
+}
+
+int main(int argc, char** argv) {
+ segfault_handler sh;
+
+ srand(42);
+ srandom(42);
+
+ try {
+ create_console_logger();
+
+ std::string config_file = "hammer-it.cfg";
+ if (argc > 1) config_file = argv[1];
+ INFO("Starting IonHammer, built from " SPADES_GIT_REFSPEC ", git revision " SPADES_GIT_SHA1);
+ INFO("Loading config from " << config_file.c_str());
+ cfg::create_instance(config_file);
+
+ // hard memory limit
+ const size_t GB = 1 << 30;
+ limit_memory(cfg::get().hard_memory_limit * GB);
+
+ KMerData kmer_data;
+ if (stage(cfg::get().start_stage, hammer_config::HammerStage::KMerCounting)) {
+ // FIXME: Actually it's num_files here
+ KMerDataCounter(32).FillKMerData(kmer_data);
+ if (cfg::get().debug_mode) {
+ INFO("Debug mode on. Saving K-mer index.");
+ std::ofstream ofs(path::append_path(cfg::get().working_dir, "count.kmdata"), std::ios::binary);
+ kmer_data.binary_write(ofs);
+ }
+ } else {
+ INFO("Loading K-mer index.");
+ std::ifstream ifs(path::append_path(cfg::get().working_dir, "count.kmdata"), std::ios::binary);
+ VERIFY(ifs.good());
+ kmer_data.binary_read(ifs);
+ INFO("Total " << kmer_data.size() << " entries were loader");
+ }
+
+ std::vector<std::vector<size_t> > classes;
+ if (stage(cfg::get().start_stage, hammer_config::HammerStage::HammingClustering)) {
+ ConcurrentDSU uf(kmer_data.size());
+ KMerHamClusterer clusterer(cfg::get().tau);
+ INFO("Clustering Hamming graph.");
+ clusterer.cluster(path::append_path(cfg::get().working_dir, "kmers.hamcls"), kmer_data, uf);
+ uf.get_sets(classes);
+ size_t num_classes = classes.size();
+ INFO("Clustering done. Total clusters: " << num_classes);
+
+ if (cfg::get().debug_mode) {
+ INFO("Debug mode on. Writing down clusters.");
+ std::ofstream ofs(path::append_path(cfg::get().working_dir, "hamming.cls"), std::ios::binary);
+
+ ofs.write((char*)&num_classes, sizeof(num_classes));
+ for (size_t i=0; i < classes.size(); ++i) {
+ size_t sz = classes[i].size();
+ ofs.write((char*)&sz, sizeof(sz));
+ ofs.write((char*)&classes[i][0], sz * sizeof(classes[i][0]));
+ }
+ }
+ } else {
+ INFO("Loading clusters.");
+ std::ifstream ifs(path::append_path(cfg::get().working_dir, "hamming.cls"), std::ios::binary);
+ VERIFY(ifs.good());
+
+ size_t num_classes = 0;
+ ifs.read((char*)&num_classes, sizeof(num_classes));
+ classes.resize(num_classes);
+
+ for (size_t i = 0; i < num_classes; ++i) {
+ size_t sz = 0;
+ ifs.read((char*)&sz, sizeof(sz));
+ classes[i].resize(sz);
+ ifs.read((char*)&classes[i][0], sz * sizeof(classes[i][0]));
+ }
+ }
+
+ size_t singletons = 0;
+ for (size_t i = 0; i < classes.size(); ++i)
+ if (classes[i].size() == 1)
+ singletons += 1;
+ INFO("Singleton clusters: " << singletons);
+
+ if (stage(cfg::get().start_stage, hammer_config::HammerStage::SubClustering)) {
+ size_t nonread = 0;
+#if 1
+ INFO("Subclustering.");
+# pragma omp parallel for shared(nonread, classes, kmer_data)
+ for (size_t i = 0; i < classes.size(); ++i) {
+ auto& cluster = classes[i];
+
+# pragma omp atomic
+ nonread += subcluster(kmer_data, cluster);
+ }
+#else
+ INFO("Assigning centers");
+# pragma omp parallel for shared(nonread, classes, kmer_data)
+ for (size_t i = 0; i < classes.size(); ++i) {
+ const auto& cluster = classes[i];
+# pragma omp atomic
+ nonread += assign(kmer_data, cluster);
+ }
+#endif
+ INFO("Total " << nonread << " nonread kmers were generated");
+
+ if (cfg::get().debug_mode) {
+ INFO("Debug mode on. Saving K-mer index.");
+ std::ofstream ofs(path::append_path(cfg::get().working_dir, "cluster.kmdata"), std::ios::binary);
+ kmer_data.binary_write(ofs);
+ }
+ } else {
+ INFO("Loading K-mer index.");
+ std::ifstream ifs(path::append_path(cfg::get().working_dir, "cluster.kmdata"), std::ios::binary);
+ VERIFY(ifs.good());
+ kmer_data.binary_read(ifs);
+ INFO("Total " << kmer_data.size() << " entries were loader");
+ }
+
+#if 0
+ INFO("Starting solid k-mers expansion in " << cfg::get().max_nthreads << " threads.");
+ while (true) {
+ Expander expander(kmer_data);
+ const io::DataSet<> &dataset = cfg::get().dataset;
+ for (auto I = dataset.reads_begin(), E = dataset.reads_end(); I != E; ++I) {
+ io::FileReadStream irs(*I, io::PhredOffset);
+ hammer::ReadProcessor rp(cfg::get().max_nthreads);
+ rp.Run(irs, expander);
+ VERIFY_MSG(rp.read() == rp.processed(), "Queue unbalanced");
+ }
+ INFO("" << expander.changed() << " solid k-mers were generated");
+ if (expander.changed() == 0)
+ break;
+ }
+#endif
+
+#if 0
+ std::ofstream fasta_ofs("centers.fasta");
+ fasta_ofs << std::fixed << std::setprecision(6) << std::setfill('0');
+ std::sort(classes.begin(), classes.end(), UfCmp());
+ for (size_t i = 0; i < classes.size(); ++i) {
+ auto& cluster = classes[i];
+ std::sort(cluster.begin(), cluster.end(), CountCmp(kmer_data));
+ hammer::HKMer c = center(kmer_data, cluster);
+ size_t idx = kmer_data.seq_idx(c);
+ if (kmer_data[idx].kmer == c) {
+ fasta_ofs << '>' << std::setw(6) << i
+ << "-cov_" << std::setw(0) << kmer_data[idx].count
+ << "-qual_" << 1.0 - kmer_data[idx].qual;
+
+ if (cluster.size() == 1)
+ fasta_ofs << "_singleton";
+ fasta_ofs << '\n' << c << '\n';
+ }
+ }
+#endif
+
+ INFO("Correcting reads.");
+ using namespace hammer::correction;
+ SingleReadCorrector::NoDebug debug_pred;
+ SingleReadCorrector::SelectAll select_pred;
+ const auto& dataset = cfg::get().dataset;
+ io::DataSet<> outdataset;
+ size_t ilib = 0;
+ for (auto it = dataset.library_begin(), et = dataset.library_end(); it != et; ++it, ++ilib) {
+ const auto& lib = *it;
+ auto outlib = lib;
+ outlib.clear();
+
+ size_t iread = 0;
+ // First, correct all the paired FASTQ files
+ for (auto I = lib.paired_begin(), E = lib.paired_end(); I != E; ++I, ++iread) {
+ if (path::extension(I->first) == ".bam" || path::extension(I->second) == ".bam")
+ continue;
+
+ INFO("Correcting pair of reads: " << I->first << " and " << I->second);
+
+ std::string usuffix = std::to_string(ilib) + "_" +
+ std::to_string(iread) + ".cor.fasta";
+
+ std::string outcorl = path::append_path(cfg::get().output_dir, path::basename(I->first) + usuffix);
+ std::string outcorr = path::append_path(cfg::get().output_dir, path::basename(I->second) + usuffix);
+
+ io::PairedOutputSequenceStream ors(outcorl, outcorr);
+
+ io::SeparatePairedReadStream irs(I->first, I->second, 0, false, false);
+ PairedReadCorrector read_corrector(kmer_data, debug_pred, select_pred);
+ hammer::ReadProcessor(cfg::get().max_nthreads).Run(irs, read_corrector, ors);
+
+ outlib.push_back_paired(outcorl, outcorr);
+ }
+
+ // Second, correct all the single FASTQ files
+ for (auto I = lib.single_begin(), E = lib.single_end(); I != E; ++I, ++iread) {
+ if (path::extension(*I) == ".bam")
+ continue;
+
+ INFO("Correcting " << *I);
+
+ std::string usuffix = std::to_string(ilib) + "_" +
+ std::to_string(iread) + ".cor.fasta";
+
+ std::string outcor = path::append_path(cfg::get().output_dir, path::basename(*I) + usuffix);
+ io::osequencestream ors(outcor);
+
+ io::FileReadStream irs(*I, io::PhredOffset);
+ SingleReadCorrector read_corrector(kmer_data, debug_pred, select_pred);
+ hammer::ReadProcessor(cfg::get().max_nthreads).Run(irs, read_corrector, ors);
+
+ outlib.push_back_single(outcor);
+ }
+
+ // Finally, correct all the BAM files
+ for (auto I = lib.reads_begin(), E = lib.reads_end(); I != E; ++I, ++iread) {
+ if (path::extension(*I) != ".bam")
+ continue;
+
+ INFO("Correcting " << *I);
+
+ std::string usuffix = std::to_string(ilib) + "_" +
+ std::to_string(iread) + ".cor.fasta";
+
+ std::string outcor = path::append_path(cfg::get().output_dir, path::basename(*I) + usuffix);
+ io::osequencestream ors(outcor);
+
+ BamTools::BamReader bam_reader;
+ bam_reader.Open(*I);
+ auto header = bam_reader.GetHeader();
+ bam_reader.Close();
+
+ SingleReadCorrector read_corrector(kmer_data, &header, debug_pred, select_pred);
+ io::UnmappedBamStream irs(*I);
+ hammer::ReadProcessor(cfg::get().max_nthreads).Run(irs, read_corrector, ors);
+
+ outlib.push_back_single(outcor);
+ }
+
+ outdataset.push_back(outlib);
+ }
+ cfg::get_writable().dataset = outdataset;
+
+ std::string fname = path::append_path(cfg::get().output_dir, "corrected.yaml");
+ INFO("Saving corrected dataset description to " << fname);
+ cfg::get_writable().dataset.save(fname);
+
+#if 0
+ std::sort(classes.begin(), classes.end(), UfCmp());
+ for (size_t i = 0; i < classes.size(); ++i) {
+ auto& cluster = classes[i];
+ std::sort(cluster.begin(), cluster.end(), CountCmp(kmer_data));
+ dump(kmer_data, cluster);
+ }
+#endif
+ } catch (std::bad_alloc const& e) {
+ std::cerr << "Not enough memory to run IonHammer. " << e.what() << std::endl;
+ return EINTR;
+ } catch (std::exception const& e) {
+ std::cerr << "Exception caught " << e.what() << std::endl;
+ return EINTR;
+ } catch (...) {
+ std::cerr << "Unknown exception caught " << std::endl;
+ return EINTR;
+ }
+
+ return 0;
+}
diff --git a/src/projects/ionhammer/read_corrector.hpp b/src/projects/ionhammer/read_corrector.hpp
new file mode 100644
index 0000000..952972c
--- /dev/null
+++ b/src/projects/ionhammer/read_corrector.hpp
@@ -0,0 +1,1220 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#ifndef __HAMMER_IT_READ_CORRECTOR_HPP__
+#define __HAMMER_IT_READ_CORRECTOR_HPP__
+
+#include "HSeq.hpp"
+#include "flow_space_read.hpp"
+#include "hkmer_distance.hpp"
+#include "consensus.hpp"
+#include "valid_hkmer_generator.hpp"
+#include "config_struct.hpp"
+#include "io/reads/single_read.hpp"
+
+#include <boost/optional.hpp>
+#include <boost/numeric/ublas/matrix.hpp>
+#include <boost/numeric/ublas/storage.hpp>
+
+#include <bamtools/api/BamAlignment.h>
+#include <bamtools/api/SamHeader.h>
+#include "seqeval/BaseHypothesisEvaluator.h"
+
+#include <deque>
+#include <vector>
+#include <iterator>
+#include <limits>
+#include <cassert>
+#include <list>
+#include <string>
+#include <algorithm>
+#include <fstream>
+
+#if 1
+#include "data_structures/sequence/nucl.hpp"
+#include <iostream>
+#include <iomanip>
+#endif
+
+namespace hammer {
+namespace correction {
+
+namespace numeric = boost::numeric::ublas;
+
+typedef numeric::matrix<double> ScoreMatrix;
+typedef std::vector<ScoreMatrix> ScoreStorage;
+
+template <typename It1, typename It2>
+static bool exactAlignH(It1 a_begin, It1 a_initial_pos, It1 a_end,
+ It2 b_initial_pos, It2 /*b_end*/,
+ uint8_t max_offset, uint8_t n_cmp, int* p_offset)
+{
+ int M = max_offset * 2 + 1;
+ for (int i = 0; i < M; i++) {
+ int offset = (i / 2) * ((i & 1) ? 1 : -1); // 0, -1, 1, -2, 2, ...
+ auto a_it = a_initial_pos + offset;
+ auto b_it = b_initial_pos;
+ if (a_it < a_begin || a_it + n_cmp > a_end)
+ continue;
+ bool match = true;
+ for (size_t j = 0; j < n_cmp; j++)
+ if ((a_it + j)->raw != (b_it + j)->raw) {
+ match = false;
+ break;
+ }
+ if (match) {
+ *p_offset = offset;
+ return true;
+ }
+ }
+ return false;
+}
+
+template <typename It1, typename It2>
+static int overlapAlignH(It1 a_begin, It1 a_end, It2 b_begin, It2 b_end,
+ uint8_t max_offset)
+{
+ // TODO: use dynamic programming
+ int M = max_offset * 2 + 1;
+ int best_offset = 0;
+ int best_score = 0;
+ for (int i = 0; i < M; i++) {
+ int offset = (i / 2) * ((i & 1) ? 1 : -1); // 0, -1, 1, -2, 2, ...
+ auto a_it = offset < 0 ? a_begin : a_begin + offset;
+ auto b_it = offset < 0 ? b_begin - offset : b_begin;
+ if (b_it < b_begin || a_it >= a_end)
+ continue;
+ int score = 0;
+ for ( ; a_it != a_end && b_it != b_end; ++a_it, ++b_it)
+ if (a_it->nucl == b_it->nucl)
+ score += std::min(a_it->len, b_it->len);
+ score -= i / 4;
+ if (score > best_score) {
+ best_offset = offset;
+ best_score = score;
+ }
+ }
+ return best_offset;
+}
+
+
+struct Score {
+ short value;
+ short dir;
+ Score(short v, short d) : value(v), dir(d) {}
+};
+
+#if 1
+template <typename It1, typename It2>
+static void dump(boost::numeric::ublas::matrix<Score> &scores,
+ It1 x_begin, It1 x_end, It2 y_begin, It2 y_end) {
+ std::cerr << " ";
+ for (auto it = x_begin; it != x_end; ++it)
+ std::cerr << std::setw(3) << int(it->len) << nucl(it->nucl);
+ std::cerr << "\n ";
+ auto m = x_end - x_begin;
+ auto n = y_end - y_begin;
+ for (int i = 0; i <= m; i++)
+ std::cerr << std::setw(4) << scores(i, 0).value;
+ std::cerr << '\n';
+ for (int i = 1; i <= n; i++) {
+ auto run = *(y_begin + i - 1);
+ std::cerr << std::setw(2) << int(run.len) << nucl(run.nucl) << ' ';
+ for (int j = 0; j <= m; j++)
+ std::cerr << std::setw(4) << scores(j, i).value;
+ std::cerr << '\n';
+ }
+}
+#endif
+
+template <typename It1, typename It2>
+static int alignH(It1 read_begin, It1 read_end,
+ It2 consensus_begin, It2 consensus_end,
+ int approx_read_offset, size_t n_skip_consensus,
+ uint8_t n_side = 5, uint8_t n_cmp = 8) {
+
+ int left_offset = n_side;
+ int read_len = int(read_end - read_begin);
+ int consensus_len = int(consensus_end - consensus_begin);
+
+ It1 x_begin = read_begin + std::max(approx_read_offset - n_side, 0);
+ if (x_begin == read_begin)
+ left_offset = approx_read_offset;
+
+ if (approx_read_offset - n_side + n_cmp >= read_len) {
+ x_begin = read_end - std::min(n_cmp + 2 * n_side, read_len);
+ left_offset = int(read_begin + approx_read_offset - x_begin);
+ }
+
+ auto x_end = x_begin + std::min(int(2 * n_side + n_cmp),
+ int(read_end - x_begin));
+
+ auto y_begin = consensus_begin +
+ std::min(int(n_skip_consensus), consensus_len);
+ if (y_begin == consensus_end)
+ return 0; // weird situation
+ auto y_end = y_begin + std::min(int(n_cmp),
+ int(consensus_end - y_begin));
+
+ // glocal alignment of homopolymer runs
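+ // Scoring: identical runs score kNuclMatch per base plus a kFullMatch bonus;
+ // the same nucleotide with a different run length pays kBaseDiff per extra
+ // base; a nucleotide mismatch pays kNuclMismatch per base of the longer run;
+ // run insertions/deletions pay per-base gap penalties. The end point is the
+ // best-scoring cell on the last row or column, penalized by its offset from
+ // the expected diagonal, which makes the alignment glocal rather than global.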
+ const short kDirUpLeft = 0;
+ const short kDirUp = 1;
+ const short kDirLeft = 2;
+
+ const short kBaseDiff = -3;
+ const short kRunInsertionStart = -4;
+ const short kRunInsertionExtend = -5;
+ const short kRunDeletionStart = -4;
+ const short kRunDeletionExtend = -5;
+ const short kNuclMismatch = -5;
+ const short kNuclMatch = 1;
+ const short kFullMatch = 5;
+
+ int m = int(x_end - x_begin);
+ int n = int(y_end - y_begin);
+
+ using namespace boost::numeric::ublas;
+ matrix<Score> scores(m + 1, n + 1, Score(0, 0));
+
+ size_t highest_x = 0, highest_y = 0;
+ int highest_entry = std::numeric_limits<int>::min();
+
+ for (int i = 1; i <= m; i++) {
+ for (int j = 1; j <= n; j++) {
+ int best_score = std::numeric_limits<int>::min();
+ short best_dir = 0;
+
+ auto run_x = *(x_begin + i - 1);
+ auto run_y = *(y_begin + j - 1);
+
+ int score;
+ if (run_x.raw == run_y.raw) {
+ score = kNuclMatch * run_x.len + scores(i - 1, j - 1).value;
+ score += kFullMatch;
+ if (score > best_score) {
+ best_score = score;
+ best_dir = kDirUpLeft;
+ }
+ } else if (run_x.nucl == run_y.nucl) {
+ score = kBaseDiff * std::abs(run_x.len - run_y.len);
+ score += kNuclMatch * std::min(run_x.len, run_y.len);
+ score += scores(i - 1, j - 1).value;
+ if (score > best_score) {
+ best_score = score;
+ best_dir = kDirUpLeft;
+ }
+ } else {
+ score = scores(i - 1, j - 1).value;
+ score += kNuclMismatch * std::max(run_x.len, run_y.len);
+
+ if (score > best_score) {
+ best_score = score;
+ best_dir = kDirUpLeft;
+ }
+ }
+
+ int multiplier;
+
+ if (scores(i - 1, j).dir == kDirUp)
+ multiplier = kRunDeletionExtend;
+ else
+ multiplier = kRunDeletionStart;
+ score = scores(i - 1, j).value + multiplier * run_x.len;
+ if (score > best_score) {
+ best_score = score;
+ best_dir = kDirUp;
+ }
+
+ if (scores(i, j - 1).dir == kDirLeft)
+ multiplier = kRunInsertionStart;
+ else
+ multiplier = kRunInsertionExtend;
+ score = scores(i, j - 1).value + multiplier * run_y.len;
+ if (score > best_score) {
+ best_score = score;
+ best_dir = kDirLeft;
+ }
+
+ scores(i, j) = Score(static_cast<short>(best_score), best_dir);
+
+ if (i == m || j == n) {
+ const int kOffset = 4;
+ int approx_offset = i - j - left_offset;
+ int offset_penalty = std::abs(approx_offset) * kOffset;
+ if (best_score - offset_penalty > highest_entry) {
+ highest_entry = best_score - offset_penalty;
+ highest_x = i;
+ highest_y = j;
+ }
+ }
+ }
+ }
+
+ int min_acceptable_score = ((kNuclMatch + kFullMatch) * n_cmp * 4) / 5;
+ if (scores(highest_x, highest_y).value < min_acceptable_score && n_cmp < 16U)
+ return alignH(read_begin, read_end,
+ consensus_begin, consensus_end,
+ approx_read_offset, n_skip_consensus,
+ n_side, uint8_t(n_cmp * 2));
+
+ int x = int(highest_x);
+ int y = int(highest_y);
+ while (x > 0 && y > 0) {
+ int dir = scores(x, y).dir;
+ switch (dir) {
+ case kDirUp:
+ --x; break;
+ case kDirLeft:
+ --y; break;
+ case kDirUpLeft:
+ --x, --y; break;
+ default:
+ break;
+ }
+ }
+
+#if 0
+ if (std::abs(x - y - left_offset) >= 4)
+ dump(scores, x_begin, x_end, y_begin, y_end);
+#endif
+
+ return x - y - left_offset;
+}
+
+// Not used now
+class HKMerProlonger {
+ const KMerData& kmer_data_;
+
+ public:
+ struct RightSide {
+ static size_t changingPosition() { return hammer::K - 1; }
+ static hammer::HKMer shift(const hammer::HKMer &kmer) {
+ hammer::HKMer res;
+ for (size_t i = 1; i < hammer::K; ++i)
+ res[i - 1] = kmer[i];
+ return res;
+ }
+ template <typename T, typename U>
+ static void append(T& cont, U obj) { cont.push_back(obj); }
+ };
+
+ struct LeftSide {
+ static size_t changingPosition() { return 0; }
+ static hammer::HKMer shift(const hammer::HKMer &kmer) {
+ hammer::HKMer res;
+ for (size_t i = 1; i < hammer::K; ++i)
+ res[i] = kmer[i - 1];
+ return res;
+ }
+ template <typename T, typename U>
+ static void append(T& cont, U obj) { cont.push_front(obj); }
+ };
+
+ public:
+
+ /// @param[in] seed kmer to prolong
+ /// @param[in] bases_to_recover maximum number of bases to recover
+ /// @param[in] side side to prolong to (RightSide/LeftSide)
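+ ///
+ /// Example (illustrative usage, assuming a populated KMerData instance):
+ ///   HKMerProlonger prolonger(kmer_data);
+ ///   auto right_runs = prolonger.prolong(seed, 2, HKMerProlonger::RightSide());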
+ template <typename Side>
+ std::deque<hammer::HomopolymerRun> prolong(const hammer::HKMer &seed,
+ size_t bases_to_recover,
+ Side side) {
+ std::deque<hammer::HomopolymerRun> good_runs(hammer::K);
+ for (size_t i = 0; i < hammer::K; ++i)
+ good_runs[i] = seed[i];
+
+ auto good_kmer = seed;
+ auto changing_pos = Side::changingPosition();
+
+ for (size_t recov = 0; recov < bases_to_recover; ++recov) {
+ double inf = -std::numeric_limits<double>::infinity();
+ double best_qual = inf;
+ int best_nucl = -1;
+ int best_len = -1;
+ double next_best_qual = inf;
+
+ auto kmer = Side::shift(good_kmer);
+
+ for (size_t nucl = 0; nucl < 4; ++nucl) {
+ if (nucl == good_kmer[changing_pos].nucl)
+ continue;
+ for (size_t len = 1; len <= 4; ++len) {
+ kmer[changing_pos] = hammer::HomopolymerRun(nucl, len);
+ auto &k = kmer_data_[kmer];
+ auto qual = k.count * (1 - k.qual);
+ if (qual > best_qual) {
+ next_best_qual = best_qual;
+ best_qual = qual;
+ best_nucl = nucl;
+ best_len = len;
+ } else if (qual > next_best_qual) {
+ next_best_qual = qual;
+ }
+ }
+ }
+
+ // stop if high-quality kmer is not unique
+ if (best_nucl == -1 || best_qual - next_best_qual < 0.8 * best_qual)
+ break;
+
+ kmer[changing_pos] = hammer::HomopolymerRun(best_nucl, best_len);
+ Side::append(good_runs, kmer[changing_pos]);
+ good_kmer = kmer;
+ }
+
+ return good_runs;
+ }
+
+ public:
+ HKMerProlonger(const KMerData& kmer_data) : kmer_data_(kmer_data) {}
+};
+
+static const double kLowScoreThreshold = 1.0;
+
+class CorrectedRead {
+ FlowSpaceRead raw_read_; // Uncorrected read
+ const KMerData& kmer_data_;
+ bool debug_mode_;
+
+ // Stores runs after joining chunks
+ std::vector<hammer::HomopolymerRun> corrected_runs_;
+
+ // Contiguous part of read with strong consensus
+ struct ConsensusChunk {
+ int approx_read_offset; // in the vector of raw read runs
+ int approx_end_read_offset_;
+ unsigned rollback_end; // remove if don't align well
+
+ int initial_read_offset_;
+
+ enum {
+ kChunkLeftAligned,
+ kChunkRightAligned,
+ kChunkNotAligned
+ } alignment;
+
+ const FlowSpaceRead& raw_read;
+ size_t trimmed_left;
+ size_t trimmed_right;
+ bool debug_mode;
+
+ std::vector<hammer::HomopolymerRun> consensus;
+ std::vector<double> consensus_scores;
+
+ int raw_start_offset() const {
+ return initial_read_offset_;
+ }
+
+ ConsensusChunk(int initial_read_offset,
+ int approximate_read_offset,
+ int approximate_end_read_offset,
+ const ScoreStorage &scores,
+ unsigned rollback_end,
+ const FlowSpaceRead &read,
+ bool debug_mode)
+ : approx_read_offset(approximate_read_offset),
+ approx_end_read_offset_(approximate_end_read_offset),
+ rollback_end(rollback_end),
+ initial_read_offset_(initial_read_offset),
+ alignment(kChunkNotAligned), raw_read(read),
+ trimmed_left(0), trimmed_right(0), debug_mode(debug_mode)
+ {
+ bool left_trim = true;
+ for (size_t i = 0; i < scores.size(); ++i) {
+ auto run = hammer::iontorrent::consensus(scores[i]);
+
+ // trim low-quality runs from the left side
+ if (run.second <= kLowScoreThreshold && left_trim) {
+ approx_read_offset += 1;
+ trimmed_left += 1;
+ continue;
+ }
+
+ if (debug_mode && left_trim) {
+ std::cerr << "[ConsensusChunk] trimmed from left: " << trimmed_left << std::endl;
+ std::cerr << "[ConsensusChunk] approx. read offset: " << approx_read_offset << std::endl;
+ }
+
+ left_trim = false;
+ VERIFY(run.first.len > 0);
+ consensus.push_back(run.first);
+ consensus_scores.push_back(run.second);
+ }
+
+ size_t right_end = consensus_scores.size();
+ if (right_end == 0)
+ return;
+
+ while (consensus_scores[right_end - 1] <= kLowScoreThreshold) {
+ --right_end;
+ if (right_end == 0)
+ break;
+ }
+
+ trimmed_right = consensus.size() - right_end;
+ consensus.resize(right_end);
+ consensus_scores.resize(right_end);
+ }
+
+ void AlignLeftEndAgainstRead(size_t skip=0) {
+ const auto& data = raw_read.data();
+
+ int offset = alignH(data.begin(), data.end(),
+ consensus.begin(), consensus.end(),
+ approx_read_offset, skip);
+
+ if (debug_mode) {
+ std::cerr << "[approx. read offset (left)] before: " << approx_read_offset << "; after: "
+ << approx_read_offset + offset << std::endl;
+ }
+
+ approx_read_offset += offset;
+ alignment = kChunkLeftAligned;
+ }
+
+ void AlignRightEndAgainstRead(size_t skip=0) {
+ const auto& data = raw_read.data();
+ int position_on_read = approx_end_read_offset_ - 1;
+ int offset = alignH(data.rbegin(), data.rend(),
+ consensus.rbegin(), consensus.rend(),
+ int(data.size()) - 1 - position_on_read, skip);
+ if (debug_mode) {
+ std::cerr << "[approx. read offset (right)] before: " << approx_read_offset << "; after: "
+ << approx_read_offset - offset << std::endl;
+ }
+ approx_read_offset -= offset;
+ alignment = kChunkRightAligned;
+ }
+
+ int approx_end_read_offset() const {
+ return approx_end_read_offset_;
+ }
+
+ int approx_end_read_offset_untrimmed() const {
+ return approx_end_read_offset() + int(trimmed_right);
+ }
+
+ private:
+ void RollBack() {
+ trimmed_right += rollback_end;
+ auto old_size = consensus.size();
+ VERIFY(old_size >= rollback_end);
+ consensus.resize(old_size - rollback_end);
+ approx_end_read_offset_ -= rollback_end;
+ consensus_scores.resize(old_size - rollback_end);
+ rollback_end = 0;
+ }
+
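+ // Merges `chunk` (the next chunk along the read) into this one. If the chunks are disjoint
+ // on the read, the gap is filled with uncorrected raw runs (scored 0); if they overlap, the
+ // overlap length is refined with overlapAlignH and lower-scoring boundary runs are trimmed
+ // before the consensus runs and their scores are concatenated.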
+ bool DoMerge(ConsensusChunk& chunk) {
+ int right_end_offset = approx_end_read_offset();
+
+ if (debug_mode) {
+ std::cerr << "============== Merging chunks ===============" << std::endl;
+ std::cerr << "(" << approx_read_offset << " .. " << right_end_offset << ")";
+ std::cerr << " -- (" << chunk.approx_read_offset << " .. " << chunk.approx_end_read_offset() << ")" << std::endl;
+
+ int white_l = 0;
+ for (int i = right_end_offset - 1; i >= 0; --i)
+ white_l += raw_read[i].len;
+ for (size_t i = 0; i < consensus.size(); ++i)
+ white_l -= consensus[i].len;
+ for (int i = 0; i < white_l; ++i)
+ std::cerr << ' ';
+ for (size_t i = std::max(-white_l, 0); i < consensus.size(); ++i)
+ std::cerr << consensus[i].str();
+ std::cerr << std::endl;
+
+ for (int i = 0; i < chunk.approx_read_offset; ++i)
+ for (int j = 0; j < raw_read[i].len; ++j)
+ std::cerr << ' ';
+ for (size_t i = 0; i < chunk.consensus.size(); ++i)
+ std::cerr << chunk.consensus[i].str();
+ std::cerr << std::endl;
+ }
+
+ if (right_end_offset <= chunk.approx_read_offset) {
+
+ for (int i = right_end_offset; i < chunk.approx_read_offset; ++i) {
+ if (i >= static_cast<int>(raw_read.size()))
+ return false;
+ consensus.push_back(raw_read[i]);
+ alignment = kChunkNotAligned;
+
+ // TODO: maintain quality scores in raw_read_
+ consensus_scores.push_back(0);
+ }
+
+ consensus.insert(consensus.end(),
+ chunk.consensus.begin(), chunk.consensus.end());
+
+ consensus_scores.insert(consensus_scores.end(),
+ chunk.consensus_scores.begin(),
+ chunk.consensus_scores.end());
+
+ } else {
+ int overlap = right_end_offset - chunk.approx_read_offset;
+ overlap -= overlapAlignH(consensus.end() - overlap,
+ consensus.end(),
+ chunk.consensus.begin(),
+ chunk.consensus.begin() + overlap,
+ 5);
+
+ if (overlap > static_cast<int>(chunk.consensus.size()))
+ return false;
+
+ if (overlap < 0) {
+ chunk.approx_read_offset = right_end_offset - overlap;
+ return DoMerge(chunk);
+ }
+
+ int n_trim = 0;
+ int n_runs = int(consensus.size());
+
+ // FIXME
+ if (overlap > 0 && rollback_end > 0) {
+ for (int i = 0; i < overlap; i++) {
+ if (n_runs - overlap + i < 0 || n_runs - overlap + i >= int(consensus.size()))
+ continue;
+ auto left_run = consensus[n_runs - overlap + i];
+ auto right_run = chunk.consensus[i];
+ if (left_run != right_run) {
+ RollBack();
+ AlignRightEndAgainstRead();
+ return DoMerge(chunk);
+ }
+ }
+ }
+
+ if (overlap >= 3 && n_runs > overlap) {
+ for ( ; n_trim < overlap / 3; ++n_trim) {
+ auto score1 = consensus_scores[n_runs - n_trim - 1];
+ auto score2 = chunk.consensus_scores[overlap - n_trim - 1];
+ if (score1 > score2)
+ break;
+ }
+
+ consensus.resize(consensus.size() - n_trim);
+ consensus_scores.resize(consensus_scores.size() - n_trim);
+ }
+
+ consensus.insert(consensus.end(),
+ chunk.consensus.begin() + overlap - n_trim,
+ chunk.consensus.end());
+
+ consensus_scores.insert(consensus_scores.end(),
+ chunk.consensus_scores.begin() + overlap - n_trim,
+ chunk.consensus_scores.end());
+ }
+
+ approx_end_read_offset_ = chunk.approx_end_read_offset();
+ return true;
+ }
+
+ bool MergeWithDisjointChunk(ConsensusChunk& chunk) {
+ if (debug_mode)
+ std::cerr << "[MergeWithDisjointChunk]" << std::endl;
+ AlignRightEndAgainstRead();
+ if (chunk.alignment != kChunkLeftAligned)
+ chunk.AlignLeftEndAgainstRead();
+ return DoMerge(chunk);
+ }
+
+ bool MergeWithOverlappingChunk(ConsensusChunk& chunk) {
+ if (debug_mode)
+ std::cerr << "[MergeWithOverlappingChunk]" << std::endl;
+ int right_end_offset = approx_end_read_offset_;
+ size_t overlap = right_end_offset - chunk.approx_read_offset;
+ if (overlap > chunk.consensus_scores.size())
+ return false;
+
+ AlignRightEndAgainstRead();
+ if (chunk.alignment != kChunkLeftAligned)
+ chunk.AlignLeftEndAgainstRead();
+ return DoMerge(chunk);
+ }
+
+ public:
+
+ bool TryMergeWith(ConsensusChunk& chunk) {
+ if (chunk.consensus.empty())
+ return true;
+
+ alignment = kChunkNotAligned;
+ int right_end_offset = approx_end_read_offset_;
+
+ if (right_end_offset <= chunk.approx_read_offset)
+ return MergeWithDisjointChunk(chunk);
+ else
+ return MergeWithOverlappingChunk(chunk);
+ }
+
+ };
+
+ // Chunks where strong consensus was obtained
+ std::list<ConsensusChunk> chunks_;
+ int trimmed_by_gen_;
+
+ void PushChunk(const ScoreStorage &scores,
+ int initial_read_offset,
+ int approx_read_offset,
+ int approx_end_read_offset,
+ unsigned rollback_end) {
+ chunks_.push_back(ConsensusChunk(initial_read_offset, approx_read_offset,
+ approx_end_read_offset, scores,
+ rollback_end, raw_read_, debug_mode_));
+ if (debug_mode_) {
+ auto &consensus = chunks_.back().consensus;
+ size_t len = consensus.size();
+ size_t nucl_len = 0;
+ for (size_t i = 0; i < len; ++i)
+ nucl_len += consensus[i].len;
+ }
+
+ chunks_.back().AlignLeftEndAgainstRead();
+ if (chunks_.size() == 1)
+ trimmed_by_gen_ = chunks_.back().raw_start_offset();
+ }
+
+ const ConsensusChunk& LastChunk() const {
+ return chunks_.back();
+ }
+
+ class ChunkCollector {
+ CorrectedRead &cread_;
+ const KMerData &kmer_data_;
+ bool debug_mode_;
+
+ ValidHKMerGenerator<hammer::K> gen;
+ int pos;
+ unsigned skipped;
+ int raw_pos;
+
+ struct Center {
+ hammer::HKMer seq;
+ int end_offset;
+ };
+
+ Center last_good_center;
+ bool last_good_center_is_defined;
+ bool is_first_center;
+ bool replacing;
+ int rollback_size;
+
+ bool need_to_align;
+
+ int approx_read_offset;
+ int approx_end_read_offset;
+ ScoreStorage scores;
+ int chunk_pos;
+ int raw_chunk_start_pos;
+
+ unsigned approx_n_insertions;
+
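+ // Looks up the cluster centers of `seq` and of its reverse complement and, preferring the
+ // better-quality candidate, returns the first center within HK-mer distance 2 of `seq`
+ // together with its (end-offset-adjusted) position on the read; otherwise returns `seq`.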
+ Center GetCenterOfCluster(const hammer::HKMer &seq, int start_pos) const {
+ hammer::KMerStat k[2];
+ k[0] = kmer_data_[kmer_data_[seq].changeto];
+ k[1] = kmer_data_[kmer_data_[!seq].changeto];
+ k[1].kmer = !k[1].kmer;
+
+ if (k[0].qual > k[1].qual)
+ std::swap(k[0], k[1]);
+ using namespace hammer;
+ for (size_t i = 0; i < 2; ++i) {
+ auto &kmer = k[i].kmer;
+ int end_diff;
+ auto dist = distanceHKMer(kmer.begin(), kmer.end(), seq.begin(), seq.end(), 3, &end_diff);
+ if (debug_mode_) {
+ std::cerr << "[GetCenterOfCluster] distance("
+ << seq << ", " << kmer << ") = " << dist << std::endl;
+
+ }
+ if (dist <= 2) {
+ return Center{kmer, start_pos + int(hammer::K) + end_diff};
+ }
+ }
+ return Center{seq, start_pos + int(hammer::K)};
+ }
+
+ bool IsInconsistent(const Center ¢er) const {
+ if (!last_good_center_is_defined)
+ return false;
+
+ for (size_t i = 0; i < hammer::K - skipped - 1; ++i)
+ if (last_good_center.seq[i + skipped + 1].nucl != center.seq[i].nucl)
+ return true;
+
+ return false;
+ }
+
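+ // Turns the accumulated per-position score matrices into a ConsensusChunk (only if more
+ // than K positions were collected) and resets the collector state for the next chunk.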
+ void FlushCurrentChunk() {
+ unsigned rollback_end = 0;
+
+ if (replacing) {
+ if (rollback_size < 0)
+ rollback_size = 0;
+ if (rollback_size < int(scores.size()))
+ rollback_end = int(scores.size()) - rollback_size;
+ replacing = false;
+ rollback_size = 0;
+ }
+
+ if (scores.size() > hammer::K) {
+ cread_.PushChunk(scores, raw_chunk_start_pos,
+ approx_read_offset, approx_end_read_offset, rollback_end);
+ pos = cread_.LastChunk().approx_end_read_offset_untrimmed() - hammer::K;
+ pos += skipped;
+ } else {
+ pos -= approx_n_insertions;
+ }
+
+ scores.clear();
+ need_to_align = false;
+ chunk_pos = 0;
+ skipped = 0;
+ approx_n_insertions = 0;
+ approx_read_offset = pos;
+
+ last_good_center_is_defined = false;
+ }
+
+ // side effect: changes chunk_pos, pos, and approx_n_insertions
+ bool TryToAlignCurrentCenter(const Center ¢er) {
+ if (!last_good_center_is_defined)
+ return true;
+
+ if (debug_mode_) {
+ std::cerr << "[TryToAlignCurrentCenter] " << center.seq.str()
+ << " (previous good center is " << last_good_center.seq.str() << ","
+ << " skipped " << skipped << " centers)" << std::endl;
+ }
+
+ // offset is how many positions the center should be shifted
+ // in order to agree with last_good_center
+ int offset;
+ bool aligned = exactAlignH(last_good_center.seq.begin(),
+ last_good_center.seq.begin() + skipped + 1,
+ last_good_center.seq.end(),
+ center.seq.begin(), center.seq.end(), 3, 8, &offset);
+
+ bool result = aligned && chunk_pos + offset >= 0;
+ if (result) {
+ if (debug_mode_)
+ std::cerr << "[TryToAlignCurrentCenter] offset = " << offset << std::endl;
+ if (offset < 0)
+ approx_n_insertions -= offset;
+ pos += offset;
+ chunk_pos += offset;
+ }
+
+ return result;
+ }
+
+ void IncludeIntoConsensus(const Center ¢er) {
+ VERIFY(chunk_pos >= 0);
+ VERIFY(chunk_pos < (1 << 16));
+ is_first_center = false;
+
+ if (chunk_pos + hammer::K > scores.size())
+ scores.resize(chunk_pos + hammer::K, ScoreMatrix(4, 64, 0));
+
+ auto k = kmer_data_[center.seq];
+
+ for (size_t i = 0; i < hammer::K; ++i)
+ scores[chunk_pos + i](center.seq[i].nucl, center.seq[i].len) += double(k.count) * (1.0 - k.qual);
+
+ last_good_center = center;
+ last_good_center_is_defined = true;
+ if (raw_chunk_start_pos == -1)
+ raw_chunk_start_pos = raw_pos;
+ approx_end_read_offset = center.end_offset;
+ if (debug_mode_) {
+ std::cerr << "e.o. = " << approx_end_read_offset << std::endl;
+ }
+ need_to_align = false;
+ skipped = 0;
+ }
+
+ public:
+ ChunkCollector(const io::SingleRead& r, CorrectedRead &cread,
+ const KMerData &kmer_data, bool debug_mode) :
+ cread_(cread), kmer_data_(kmer_data), debug_mode_(debug_mode),
+ gen(r), pos(int(gen.trimmed_left())), skipped(0),
+ last_good_center(), last_good_center_is_defined(false),
+ is_first_center(true),
+ replacing(false), rollback_size(0),
+ need_to_align(false),
+ approx_read_offset(0), approx_end_read_offset(0),
+ scores(), chunk_pos(0),
+ raw_chunk_start_pos(-1),
+ approx_n_insertions(0)
+ {
+ --pos;
+ --chunk_pos;
+ }
+
+ void Run() {
+ double lowQualThreshold = cfg::get().kmer_qual_threshold;
+
+ raw_pos = int(gen.trimmed_left()) - 1;
+
+ if (debug_mode_) {
+ std::cerr << "gen. trimmed = " << gen.trimmed_left() << std::endl;
+ }
+
+ while (gen.HasMore()) {
+ auto prev_chunk_pos = chunk_pos;
+ auto seq = gen.kmer();
+ gen.Next();
+ ++pos;
+ ++raw_pos;
+ if (debug_mode_) {
+ std::cerr << "=================================" << std::endl;
+ std::cerr << "pos = " << pos << ", raw_pos = " << raw_pos <<
+ ", last_good_center_is_defined = " << last_good_center_is_defined <<
+ ", skipped = " << skipped << std::endl;
+ }
+ ++chunk_pos;
+
+ auto center = Center{seq, raw_pos + int(hammer::K)};
+ auto qual = kmer_data_[seq].qual;
+
+ bool can_be_changed = last_good_center_is_defined || is_first_center;
+ if (qual > lowQualThreshold && can_be_changed) {
+ center = GetCenterOfCluster(seq, raw_pos);
+ qual = kmer_data_[center.seq].qual;
+ }
+
+ if (qual > lowQualThreshold && last_good_center_is_defined && skipped == 0) {
+ if (debug_mode_) {
+ std::cerr << "raw_pos + hammer::K = " << raw_pos + hammer::K << std::endl;
+ std::cerr << "last_good_center.end_offset + 1 = " << last_good_center.end_offset + 1 << std::endl;
+ }
+ // Finding a center by means of clustering failed.
+ // Let's try the following: take last good center and make a new one
+ // from it by appending next homopolymer run; if its quality is high, we use it.
+ if (raw_pos + hammer::K < last_good_center.end_offset + 1) {
+ --pos;
+ --chunk_pos;
+ if (debug_mode_) {
+ std::cerr << "skipping low-quality hk-mer" << std::endl;
+ }
+ continue; // move to next hk-mer
+ } else if (raw_pos + hammer::K == last_good_center.end_offset + 1) {
+ auto seq_corr = last_good_center.seq;
+ for (size_t i = 0; i < hammer::K - 1; ++i)
+ seq_corr[i] = seq_corr[i + 1];
+ seq_corr[hammer::K - 1] = seq[hammer::K - 1];
+ center = Center{seq_corr, last_good_center.end_offset + 1};
+ qual = kmer_data_[center.seq].qual;
+ if (debug_mode_) {
+ std::cerr << "seq_corr = " << seq_corr.str() << " , qual = " << qual << std::endl;
+ }
+
+ if (qual > lowQualThreshold && can_be_changed) {
+ // our last resort...
+ center = GetCenterOfCluster(seq_corr, raw_pos);
+ qual = kmer_data_[center.seq].qual;
+ }
+ }
+ }
+
+ bool low_qual = qual > lowQualThreshold;
+ bool inconsistent = IsInconsistent(center);
+
+ if (debug_mode_ && !low_qual && seq != center.seq) {
+ std::cerr << "replaced " << seq.str()
+ << " (quality " << kmer_data_[seq].qual
+ << ", count " << kmer_data_[seq].count << ")"
+ << " with " << center.seq.str() << std::endl;
+ }
+
+ if (debug_mode_) {
+ std::cerr << "quality of " << center.seq.str() << " is " << qual
+ << " (count " << kmer_data_[center.seq].count << ") "
+ << (inconsistent ? " INCONSISTENT" : "") << std::endl;
+ }
+
+ if (low_qual) {
+ ++skipped;
+ } else if (inconsistent) {
+ if (!TryToAlignCurrentCenter(center)) {
+ low_qual = true;
+ ++skipped;
+ }
+ }
+
+ if (skipped > hammer::K / 4) {
+ FlushCurrentChunk();
+ } else if (!low_qual) {
+ if (seq != center.seq && !replacing) {
+ rollback_size = prev_chunk_pos + hammer::K;
+ replacing = true;
+ } else if (seq == center.seq && replacing) {
+ replacing = false;
+ }
+
+ if (debug_mode_) {
+ std::cerr << "[include into consensus] raw_pos = " << raw_pos << std::endl;
+ }
+ IncludeIntoConsensus(center);
+ }
+ }
+
+ FlushCurrentChunk();
+ }
+ };
+
+ void CollectChunks(const io::SingleRead& r) {
+ ChunkCollector chunk_collector(r, *this, kmer_data_, debug_mode_);
+ chunk_collector.Run();
+ }
+
+ public:
+ CorrectedRead(const io::SingleRead& read, const KMerData& kmer_data,
+ bool debug_mode = false) :
+ raw_read_(read),
+ kmer_data_(kmer_data),
+ debug_mode_(debug_mode)
+ {
+ CollectChunks(read);
+ }
+
+ void MergeChunks() {
+ if (chunks_.empty())
+ return;
+
+ auto iter = chunks_.begin();
+ ConsensusChunk& merged = *iter;
+
+ if (debug_mode_) {
+ if (chunks_.size() == 1) {
+ iter->AlignLeftEndAgainstRead();
+ for (int i = 0; i < iter->approx_read_offset; ++i)
+ for (int j = 0; j < raw_read_[i].len; ++j)
+ std::cerr << ' ';
+ for (size_t i = 0; i < iter->consensus.size(); ++i)
+ std::cerr << iter->consensus[i].str();
+ std::cerr << std::endl;
+ }
+ }
+
+ ++iter;
+ while (iter != chunks_.end()) {
+ if (iter->consensus.size() > hammer::K)
+ merged.TryMergeWith(*iter);
+ iter = chunks_.erase(iter);
+ }
+
+ corrected_runs_ = std::move(merged.consensus);
+ }
+
+ void AttachUncorrectedRuns() {
+ // attach runs from the right
+ const auto& data = raw_read_.data();
+ int n_raw = int(raw_read_.size());
+ int end_read_offset = LastChunk().approx_end_read_offset();
+ if (end_read_offset < n_raw && end_read_offset >= 0) {
+ corrected_runs_.insert(corrected_runs_.end(),
+ data.begin() + end_read_offset,
+ data.end());
+ }
+ if (debug_mode_) {
+ std::cerr << "n_raw = " << n_raw << ", end_read_offset = " << end_read_offset << std::endl;
+ }
+
+ // attach runs from the left
+ if (trimmed_by_gen_ > 0 && size_t(trimmed_by_gen_) <= data.size()) {
+ std::vector<HomopolymerRun> runs;
+ runs.reserve(corrected_runs_.size() + trimmed_by_gen_);
+ runs.insert(runs.end(), data.begin(), data.begin() + trimmed_by_gen_);
+ runs.insert(runs.end(), corrected_runs_.begin(), corrected_runs_.end());
+ std::swap(runs, corrected_runs_);
+ }
+ }
+
+ std::string GetSequenceString() const {
+ if (chunks_.empty() && corrected_runs_.empty())
+ return "";
+ std::string res;
+ if (!corrected_runs_.empty()) {
+ for (auto it = corrected_runs_.begin(); it != corrected_runs_.end(); ++it)
+ res += it->str();
+ } else {
+ auto& runs = chunks_.front().consensus;
+ for (auto it = runs.begin(); it != runs.end(); ++it)
+ res += it->str();
+ }
+ return res;
+ }
+};
+
+class SingleReadCorrector {
+ const KMerData &kmer_data_;
+
+ public:
+
+ struct ReadSelectionPredicate {
+ virtual bool operator()(const io::SingleRead &read) = 0;
+ };
+
+ struct DebugOutputPredicate : public ReadSelectionPredicate {};
+
+ struct NoDebug : public DebugOutputPredicate {
+ virtual bool operator()(const io::SingleRead &) {
+ return false;
+ }
+ };
+
+ struct FullDebug : public DebugOutputPredicate {
+ virtual bool operator()(const io::SingleRead &) {
+ return true;
+ }
+ };
+
+ class DebugIfContains : public DebugOutputPredicate {
+ Sequence needle_;
+ Sequence needle_rc_;
+ public:
+ DebugIfContains(const Sequence &seq) :
+ needle_(seq), needle_rc_(!seq) {}
+
+ virtual bool operator()(const io::SingleRead &read) {
+ auto read_seq = read.sequence();
+ if (read_seq.size() < needle_.size())
+ return false;
+ if (read_seq.find(needle_, 0) != -1ULL)
+ return true;
+ if (read_seq.find(needle_rc_, 0) != -1ULL)
+ return true;
+ return false;
+ }
+ };
+
+ struct SelectPredicate : public ReadSelectionPredicate {};
+ struct SelectAll : public SelectPredicate {
+ virtual bool operator()(const io::SingleRead &) {
+ return true;
+ }
+ };
+
+ class SelectByName : public SelectPredicate {
+ std::set<std::string> names_;
+ public:
+ SelectByName(const std::set<std::string>& names) :
+ names_(names) {}
+ virtual bool operator()(const io::SingleRead &r) {
+ return names_.find(r.name()) != names_.end();
+ }
+ };
+
+private:
+ BamTools::SamHeader* sam_header_;
+ DebugOutputPredicate &debug_pred_;
+ SelectPredicate &select_pred_;
+
+public:
+ SingleReadCorrector(const KMerData &kmer_data,
+ BamTools::SamHeader *sam_header,
+ DebugOutputPredicate &debug,
+ SelectPredicate &select) :
+ kmer_data_(kmer_data), sam_header_(sam_header),
+ debug_pred_(debug), select_pred_(select) {}
+
+ SingleReadCorrector(const KMerData &kmer_data,
+ DebugOutputPredicate &debug,
+ SelectPredicate &select) :
+ kmer_data_(kmer_data), sam_header_(NULL),
+ debug_pred_(debug), select_pred_(select) {}
+
+ boost::optional<io::SingleRead> operator()(const io::SingleRead &r) {
+ if (!select_pred_(r))
+ return boost::optional<io::SingleRead>();
+ bool debug_mode = debug_pred_(r);
+ if (debug_mode) {
+ std::cerr << "=============================================" << std::endl;
+
+ std::cerr << '>' << r.name() << '\n'
+ << r.GetSequenceString() << std::endl;
+ }
+
+ CorrectedRead read(r, kmer_data_, debug_mode);
+ read.MergeChunks();
+ if (cfg::get().keep_uncorrected_ends)
+ read.AttachUncorrectedRuns();
+
+ if (debug_mode) {
+ std::cerr << "final result: " << read.GetSequenceString() << std::endl;
+ }
+
+ auto seq = read.GetSequenceString();
+ if (seq.empty())
+ return boost::optional<io::SingleRead>();
+
+ return io::SingleRead(r.name(), seq);
+ }
+
+ boost::optional<io::BamRead>
+ operator()(BamTools::BamAlignment &alignment) {
+ VERIFY(sam_header_);
+ io::SingleRead r(alignment.Name, alignment.QueryBases);
+ // reverse strand means we're working with a mapped BAM, might be
+ // the case for datasets downloaded from IonCommunity
+ if (alignment.IsReverseStrand())
+ r = !r;
+ auto corrected_r = operator()(r);
+ std::string rg;
+ if (!alignment.GetTag("RG", rg) || !corrected_r)
+ return boost::optional<io::BamRead>();
+ auto flow_order = sam_header_->ReadGroups[rg].FlowOrder;
+
+ float delta_score, fit_score;
+ auto seq = corrected_r.get().GetSequenceString();
+ if (alignment.IsReverseStrand()) {
+ std::reverse(seq.begin(), seq.end());
+ for (auto it = seq.begin(); it != seq.end(); ++it) {
+ switch (*it) {
+ case 'A': *it = 'T'; break;
+ case 'C': *it = 'G'; break;
+ case 'G': *it = 'C'; break;
+ case 'T': *it = 'A'; break;
+ default: break;
+ }
+ }
+ }
+
+ BaseHypothesisEvaluator(alignment, flow_order, seq,
+ delta_score, fit_score, 0);
+ std::stringstream ss;
+ ss << alignment.Name << "_" << delta_score << "_" << fit_score;
+ alignment.Name = ss.str();
+ if (delta_score >= cfg::get().delta_score_threshold)
+ return io::BamRead(alignment);
+
+ BamTools::BamAlignment corrected(alignment);
+ corrected.QueryBases = corrected_r.get().GetSequenceString();
+ return io::BamRead(corrected);
+ }
+};
+
+class PairedReadCorrector : public SingleReadCorrector {
+ public:
+ PairedReadCorrector(const KMerData &kmer_data,
+ DebugOutputPredicate &debug,
+ SelectPredicate &select)
+ : SingleReadCorrector(kmer_data, debug, select) {}
+
+ boost::optional<io::PairedRead> operator()(const io::PairedRead &r) {
+ auto corrected_first = SingleReadCorrector::operator()(r.first());
+ auto corrected_second = SingleReadCorrector::operator()(r.second());
+
+ if (!corrected_first || !corrected_second)
+ return boost::optional<io::PairedRead>();
+
+ return io::PairedRead(corrected_first.get(), corrected_second.get(), 0);
+ }
+};
+
+} // namespace correction
+} // namespace hammer
+#endif // __HAMMER_IT_READ_CORRECTOR_HPP__
diff --git a/src/ionhammer/seqeval/BaseCallerUtils.h b/src/projects/ionhammer/seqeval/BaseCallerUtils.h
similarity index 100%
rename from src/ionhammer/seqeval/BaseCallerUtils.h
rename to src/projects/ionhammer/seqeval/BaseCallerUtils.h
diff --git a/src/projects/ionhammer/seqeval/BaseHypothesisEvaluator.cpp b/src/projects/ionhammer/seqeval/BaseHypothesisEvaluator.cpp
new file mode 100644
index 0000000..74cd750
--- /dev/null
+++ b/src/projects/ionhammer/seqeval/BaseHypothesisEvaluator.cpp
@@ -0,0 +1,302 @@
+/* Copyright (C) 2013 Ion Torrent Systems, Inc. All Rights Reserved */
+
+//! @file BaseHypothesisEvaluator.cpp
+//! @ingroup SpadesHelpers
+//! @brief Combines code from the TS Basecaller and the TS Variant Caller to
+//! @brief give an indication about the feasibility of an alternative read sequence
+
+#include "BaseHypothesisEvaluator.h"
+
+// Function to fill in predicted signal values
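+// On return, delta_score is the squared distance of the alternative hypothesis to the
+// measured signal minus that of the called bases (larger values favor the called read),
+// and fit_score is the smaller of the two squared distances.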
+void BaseHypothesisEvaluator(BamTools::BamAlignment &alignment,
+ const string &flow_order_str,
+ const string &alt_base_hyp,
+ float &delta_score,
+ float &fit_score,
+ int heavy_verbose) {
+
+ // --- Step 1: Initialize Objects and retrieve relevant tags
+
+ delta_score = 1e5;
+ fit_score = 1e5;
+ vector<string> Hypotheses(2);
+ vector<float> measurements, phase_params;
+ int start_flow, num_flows, prefix_flow=0;
+
+ if (not GetBamTags(alignment, flow_order_str.length(), measurements, phase_params, start_flow))
+ return;
+ num_flows = measurements.size();
+ ion::FlowOrder flow_order(flow_order_str, num_flows);
+ BasecallerRead master_read;
+ master_read.SetData(measurements, flow_order.num_flows());
+ TreephaserLite treephaser(flow_order);
+ treephaser.SetModelParameters(phase_params[0], phase_params[1]);
+
+ // --- Step 2: Solve beginning of the read
+ // Look at mapped vs. unmapped reads in BAM
+ Hypotheses[0] = alignment.QueryBases;
+ Hypotheses[1] = alt_base_hyp;
+ // Safety: reverse complement reverse strand reads in mapped bam
+ if (alignment.IsMapped() and alignment.IsReverseStrand()) {
+ RevComplementInPlace(Hypotheses[0]);
+ RevComplementInPlace(Hypotheses[1]);
+ }
+
+ prefix_flow = GetMasterReadPrefix(treephaser, flow_order, start_flow, Hypotheses[0], master_read);
+ unsigned int prefix_size = master_read.sequence.size();
+
+ // --- Step 3: creating predictions for the individual hypotheses
+
+ vector<BasecallerRead> hypothesesReads(Hypotheses.size());
+ vector<float> squared_distances(Hypotheses.size(), 0.0);
+ int max_last_flow = 0;
+
+ for (unsigned int i_hyp=0; i_hyp<hypothesesReads.size(); ++i_hyp) {
+
+ hypothesesReads[i_hyp] = master_read;
+ // --- add hypothesis sequence to clipped prefix
+ unsigned int i_base = 0;
+ int i_flow = prefix_flow;
+
+ while (i_base<Hypotheses[i_hyp].length() and i_base<(2*(unsigned int)flow_order.num_flows()-prefix_size)) {
+ while (i_flow < flow_order.num_flows() and flow_order.nuc_at(i_flow) != Hypotheses[i_hyp][i_base])
+ i_flow++;
+ if (i_flow < flow_order.num_flows() and i_flow > max_last_flow)
+ max_last_flow = i_flow;
+ if (i_flow >= flow_order.num_flows())
+ break;
+ // Add base to sequence only if it fits into flow order
+ hypothesesReads[i_hyp].sequence.push_back(Hypotheses[i_hyp][i_base]);
+ i_base++;
+ }
+ i_flow = min(i_flow, flow_order.num_flows()-1);
+
+ // Solver simulates beginning of the read and then fills in the remaining clipped bases for which we have flow information
+ treephaser.Solve(hypothesesReads[i_hyp], num_flows, i_flow);
+ }
+ // Compute L2-distance of measurements and predictions
+ for (unsigned int i_hyp=0; i_hyp<hypothesesReads.size(); ++i_hyp) {
+ for (int iFlow=0; iFlow<=max_last_flow; iFlow++)
+ squared_distances[i_hyp] += (measurements.at(iFlow) - hypothesesReads[i_hyp].prediction.at(iFlow)) *
+ (measurements.at(iFlow) - hypothesesReads[i_hyp].prediction.at(iFlow));
+ }
+
+ // Delta: L2-distance of alternative base Hypothesis - L2-distance of bases as called
+ delta_score = squared_distances.at(1) - squared_distances.at(0);
+ fit_score = min(squared_distances.at(1), squared_distances.at(0));
+
+
+ // --- verbose ---
+ if (heavy_verbose > 1 or (delta_score < 0 and heavy_verbose > 0)) {
+ cout << "Processed read " << alignment.Name << endl;
+ cout << "Delta Fit: " << delta_score << " Overall Fit: " << fit_score << endl;
+ PredictionGenerationVerbose(Hypotheses, hypothesesReads, phase_params, flow_order, start_flow, prefix_size);
+ }
+
+}
+
+// ----------------------------------------------------------------------
+
+bool GetBamTags(BamTools::BamAlignment &alignment,
+ const int &num_flows,
+ vector<float> &measurements,
+ vector<float> &phase_params,
+ int &start_flow) {
+
+ vector<int16_t> quantized_measurements;
+ // Retrieve normalized measurements from BAM file
+ if (not alignment.GetTag("ZM", quantized_measurements)) {
+ cerr << "ERROR: Normalized measurements ZM:tag is not present in read " << alignment.Name << endl;
+ return false;
+ }
+ if ((int)quantized_measurements.size() > num_flows) {
+ cerr << "ERROR: Normalized measurements ZM:tag length exceeds flow order length in read " << alignment.Name << endl;
+ return false;
+ }
+ measurements.assign(quantized_measurements.size(), 0.0);
+ for (size_t counter = 0; counter < quantized_measurements.size(); ++counter)
+ measurements.at(counter) = (float)quantized_measurements.at(counter)/256;
+
+ // Retrieve phasing parameters from BAM file
+ if (not alignment.GetTag("ZP", phase_params)) {
+ cerr << "ERROR: Phasing Parameters ZP:tag is not present in read " << alignment.Name << endl;
+ return false;
+ }
+ if (phase_params.size() != 3) {
+ cerr << "ERROR: Phasing Parameters ZP:tag does not have 3 phase parameters in read " << alignment.Name << endl;
+ return false;
+ }
+ if (phase_params[0] < 0 or phase_params[0] > 1 or phase_params[1] < 0 or phase_params[1] > 1
+ or phase_params[2] < 0 or phase_params[2] > 1) {
+ cerr << "ERROR: Phasing Parameters ZP:tag outside of [0,1] range in read " << alignment.Name << endl;
+ return false;
+ }
+ phase_params[2] = 0.0f; // ad-hoc corrector: zero droop
+
+ // Retrieve start flow
+ if (not alignment.GetTag("ZF", start_flow)) {
+ cerr << "ERROR: Start Flow ZF:tag not found in read " << alignment.Name << endl;
+ return false;
+ }
+ if (start_flow < 0 or start_flow >= num_flows) {
+ cerr << "ERROR: Start flow outside of [0,num_flows) range in read " << alignment.Name << endl;
+ cerr << "Start flow: " << start_flow << " Number of flows: " << num_flows << endl;
+ return false;
+ }
+ // A start flow of zero indicates a read that did not pass basecaller filters
+ if (start_flow == 0) {
+ cerr << "WARNING: Start Flow ZF:tag has zero value in read " << alignment.Name << endl;
+ return false;
+ }
+ return true;
+}
+
+// ----------------------------------------------------------------------
+
+int GetMasterReadPrefix(TreephaserLite &treephaser,
+ const ion::FlowOrder &flow_order,
+ const int &start_flow,
+ const string &called_bases,
+ BasecallerRead &master_read) {
+
+ // Solve beginning of maybe clipped read
+ int until_flow = min((start_flow+20), flow_order.num_flows());
+ treephaser.Solve(master_read, until_flow, 0);
+
+ // StartFlow clipped? Get solved HP length at startFlow.
+ unsigned int base = 0;
+ int flow = 0;
+ unsigned int HPlength = 0;
+ while (base < master_read.sequence.size()) {
+ while (flow < flow_order.num_flows() and flow_order.nuc_at(flow) != master_read.sequence[base]) {
+ flow++;
+ }
+ if (flow > start_flow or flow == flow_order.num_flows())
+ break;
+ if (flow == start_flow)
+ HPlength++;
+ base++;
+ }
+ //if (global_context.DEBUG>2)
+ // printf("Solved %d bases until (not incl.) flow %d. HP of height %d at flow %d.\n", base, flow, HPlength, start_flow);
+
+ // Get HP size at the start of the read as called in Hypotheses[0]
+ unsigned int count = 1;
+ while (count < called_bases.length() and called_bases.at(count) == called_bases.at(0))
+ count++;
+ //if (global_context.DEBUG>2)
+ // printf("Hypothesis starts with an HP of length %d\n", count);
+ // Adjust the length of the prefix and erase extra solved bases
+ if (HPlength>count)
+ base -= count;
+ else
+ base -= HPlength;
+ master_read.sequence.erase(master_read.sequence.begin()+base, master_read.sequence.end());
+
+ // Get flow of last prefix base
+ int prefix_flow = 0;
+ for (unsigned int i_base = 0; i_base < master_read.sequence.size(); i_base++) {
+ while (prefix_flow < flow_order.num_flows() and flow_order.nuc_at(prefix_flow) != master_read.sequence[i_base])
+ prefix_flow++;
+ }
+
+ return prefix_flow;
+}
+
+
+// ----------------------------------------------------------------------
+
+void PredictionGenerationVerbose(const vector<string> &Hypotheses,
+ const vector<BasecallerRead> &hypothesesReads,
+ const vector<float> &phase_params,
+ const ion::FlowOrder &flow_order,
+ const int &start_flow,
+ const int &prefix_size) {
+
+ printf("Calculating predictions for %d hypotheses starting at flow %d:\n", (int)Hypotheses.size(), start_flow);
+ for (unsigned int iHyp=0; iHyp<Hypotheses.size(); ++iHyp) {
+ for (unsigned int iBase=0; iBase<Hypotheses[iHyp].length(); ++iBase)
+ printf("%c", Hypotheses[iHyp][iBase]);
+ printf("\n");
+ }
+ printf("Solved read prefix: ");
+ for (int iBase=0; iBase<prefix_size; ++iBase)
+ printf("%c", hypothesesReads[0].sequence[iBase]);
+ printf("\n");
+ printf("Extended Hypotheses reads to:\n");
+ for (unsigned int iHyp=0; iHyp<hypothesesReads.size(); ++iHyp) {
+ for (unsigned int iBase=0; iBase<hypothesesReads[iHyp].sequence.size(); ++iBase)
+ printf("%c", hypothesesReads[iHyp].sequence[iBase]);
+ printf("\n");
+ }
+ printf("Phasing Parameters, cf: %f ie: %f dr: %f \n Predictions: \n",
+ phase_params[0], phase_params[1], phase_params[2]);
+ cout << "Flow Order : ";
+ for (int i_flow=0; i_flow<flow_order.num_flows(); i_flow++) {
+ cout << flow_order.nuc_at(i_flow) << " ";
+ if (hypothesesReads[0].normalized_measurements[i_flow] < 0)
+ cout << " ";
+ }
+ cout << endl << "Flow Index : ";
+ for (int i_flow=0; i_flow<flow_order.num_flows(); i_flow++) {
+ cout << i_flow << " ";
+ if (i_flow<10) cout << " ";
+ else if (i_flow<100) cout << " ";
+ else if (i_flow<1000) cout << " ";
+ if (hypothesesReads[0].normalized_measurements[i_flow] < 0)
+ cout << " ";
+ }
+ cout << endl << "Measured : ";
+ for (unsigned int i_flow=0; i_flow<hypothesesReads[0].normalized_measurements.size(); ++i_flow) {
+ printf("%.2f", hypothesesReads[0].normalized_measurements[i_flow]);
+ if (hypothesesReads[0].normalized_measurements[i_flow] < 10)
+ cout << " ";
+ }
+ cout << endl;
+ for (unsigned int i_Hyp=0; i_Hyp<hypothesesReads.size(); ++i_Hyp) {
+ cout << "Prediction "<< i_Hyp << ": ";
+ for (unsigned int i_flow=0; i_flow<hypothesesReads[i_Hyp].prediction.size(); ++i_flow) {
+ printf("%.2f", hypothesesReads[i_Hyp].prediction[i_flow]);
+ if (hypothesesReads[i_Hyp].prediction[i_flow] < 10)
+ cout << " ";
+ if (hypothesesReads[0].normalized_measurements[i_flow] < 0)
+ cout << " ";
+ }
+ cout << endl;
+ }
+ cout << " ------------------- " << endl;
+}
+
+// ----------------------------------------------------------------------
+
+char NucComplement (char nuc)
+{
+ switch(nuc) {
+ case ('A') : return 'T';
+ case ('C') : return 'G';
+ case ('G') : return 'C';
+ case ('T') : return 'A';
+ case ('a') : return 't';
+ case ('c') : return 'g';
+ case ('g') : return 'c';
+ case ('t') : return 'a';
+
+ default: return nuc; // e.g. 'N' and '-' handled by default
+ }
+}
+
+void RevComplementInPlace(string& seq) {
+
+ char c;
+ int forward_idx = 0;
+ int backward_idx = seq.size()-1;
+ while (forward_idx < backward_idx) {
+ c = seq[forward_idx];
+ seq[forward_idx] = NucComplement(seq[backward_idx]);
+ seq[backward_idx] = NucComplement(c);
+ forward_idx++;
+ backward_idx--;
+ }
+ if (forward_idx == backward_idx)
+ seq[forward_idx] = NucComplement(seq[forward_idx]);
+}
diff --git a/src/ionhammer/seqeval/BaseHypothesisEvaluator.h b/src/projects/ionhammer/seqeval/BaseHypothesisEvaluator.h
similarity index 100%
rename from src/ionhammer/seqeval/BaseHypothesisEvaluator.h
rename to src/projects/ionhammer/seqeval/BaseHypothesisEvaluator.h
diff --git a/src/projects/ionhammer/seqeval/TreephaserLite.cpp b/src/projects/ionhammer/seqeval/TreephaserLite.cpp
new file mode 100644
index 0000000..cfe3896
--- /dev/null
+++ b/src/projects/ionhammer/seqeval/TreephaserLite.cpp
@@ -0,0 +1,593 @@
+/* Copyright (C) 2013 Ion Torrent Systems, Inc. All Rights Reserved */
+
+//! @file TreephaserLite.cpp
+//! @ingroup SpadesHelpers
+//! @brief A lighter version of the TS Treephaser: performs dephasing and calls the base sequence by tree search.
+
+
+#include "TreephaserLite.h"
+
+//-------------------------------------------------------------------------
+
+void BasecallerRead::SetData(const vector<float> &measurements, int num_flows) {
+
+ raw_measurements = measurements;
+ raw_measurements.resize(num_flows, 0);
+ for (int iFlow = 0; iFlow < num_flows; iFlow++) {
+ if (isnan(raw_measurements[iFlow])) {
+ std::cerr << "Warning: Basecaller Read: NAN in measurements!"<< std::endl;
+ raw_measurements.at(iFlow) = 0;
+ }
+ }
+
+ key_normalizer = 1.0f;
+ normalized_measurements = raw_measurements;
+ sequence.reserve(2*num_flows);
+ sequence.clear();
+ prediction.assign(num_flows, 0);
+}
+
+//--------------------------------------------------------------------------
+
+void BasecallerRead::SetDataAndKeyNormalize(const float *measurements, int num_flows, const int *key_flows, int num_key_flows)
+{
+ raw_measurements.resize(num_flows);
+ normalized_measurements.resize(num_flows);
+ prediction.assign(num_flows, 0);
+ sequence.reserve(2*num_flows);
+
+ float onemer_sum = 0.0f;
+ int onemer_count = 0;
+ for (int flow = 0; flow < num_key_flows; ++flow) {
+ if (key_flows[flow] == 1) {
+ onemer_sum += measurements[flow];
+ ++onemer_count;
+ }
+ }
+
+ key_normalizer = 1.0f;
+ if (onemer_sum and onemer_count)
+ key_normalizer = static_cast<float>(onemer_count) / onemer_sum;
+
+ for (int flow = 0; flow < num_flows; ++flow) {
+ raw_measurements[flow] = measurements[flow] * key_normalizer;
+ normalized_measurements[flow] = raw_measurements[flow];
+ }
+}
+
+//=========================================================================
+
+TreephaserLite::TreephaserLite(const ion::FlowOrder& flow_order, const int windowSize)
+ : flow_order_(flow_order)
+{
+ SetNormalizationWindowSize(windowSize);
+ for (int i = 0; i < 8; i++) {
+ transition_base_[i].resize(flow_order_.num_flows());
+ transition_flow_[i].resize(flow_order_.num_flows());
+ }
+ path_.resize(kNumPaths);
+ for (int p = 0; p < kNumPaths; ++p) {
+ path_[p].state.resize(flow_order_.num_flows());
+ path_[p].prediction.resize(flow_order_.num_flows());
+ path_[p].sequence.reserve(2*flow_order_.num_flows());
+ }
+}
+
+//-------------------------------------------------------------------------
+
+void TreephaserLite::SetModelParameters(double carry_forward_rate, double incomplete_extension_rate, double droop_rate)
+{
+
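+ // Rough intuition for the tables filled below: transition_base_[nuc][flow] is the weight
+ // with which a template strand waiting to incorporate `nuc` extends at that flow, and
+ // transition_flow_[nuc][flow] the weight with which it stays behind; carry_forward_rate
+ // keeps a nucleotide partially "available" in later flows, while droop and incomplete
+ // extension reduce the extending fraction.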
+ double nuc_availability[8] = { 0, 0, 0, 0, 0, 0, 0, 0 };
+ for (int flow = 0; flow < flow_order_.num_flows(); ++flow) {
+ nuc_availability[flow_order_[flow]&7] = 1;
+ for (int nuc = 0; nuc < 8; nuc++) {
+ transition_base_[nuc][flow] = nuc_availability[nuc] * (1-droop_rate) * (1-incomplete_extension_rate);
+ transition_flow_[nuc][flow] = (1-nuc_availability[nuc]) + nuc_availability[nuc] * (1-droop_rate) * incomplete_extension_rate;
+ nuc_availability[nuc] *= carry_forward_rate;
+ }
+ }
+
+}
+
+// ----------------------------------------------------------------------
+
+void TreephaserLite::WindowedNormalize(BasecallerRead& read, int num_steps, int window_size) const
+{
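+ // Two windowed passes over the flows: the first estimates an additive offset from flows
+ // predicted to be near-zero (prediction < 0.3), the second a multiplicative gain from
+ // incorporating flows (prediction > 0.5); both estimates are roughly interpolated linearly
+ // between successive window midpoints before being applied.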
+ int num_flows = read.raw_measurements.size();
+ float median_set[window_size];
+
+ // Estimate and correct for additive offset
+
+ float next_normalizer = 0;
+ int estim_flow = 0;
+ int apply_flow = 0;
+
+ for (int step = 0; step < num_steps; ++step) {
+
+ int window_end = estim_flow + window_size;
+ int window_middle = estim_flow + window_size / 2;
+ if (window_middle > num_flows)
+ break;
+
+ float normalizer = next_normalizer;
+
+ int median_set_size = 0;
+ for (; estim_flow < window_end and estim_flow < num_flows; ++estim_flow)
+ if (read.prediction[estim_flow] < 0.3)
+ median_set[median_set_size++] = read.raw_measurements[estim_flow] - read.prediction[estim_flow];
+
+ if (median_set_size > 5) {
+ std::nth_element(median_set, median_set + median_set_size/2, median_set + median_set_size);
+ next_normalizer = median_set[median_set_size / 2];
+ if (step == 0)
+ normalizer = next_normalizer;
+ }
+
+ float delta = (next_normalizer - normalizer) / window_size;
+
+ for (; apply_flow < window_middle and apply_flow < num_flows; ++apply_flow) {
+ read.normalized_measurements[apply_flow] = read.raw_measurements[apply_flow] - normalizer;
+ normalizer += delta;
+ }
+ }
+
+ for (; apply_flow < num_flows; ++apply_flow) {
+ read.normalized_measurements[apply_flow] = read.raw_measurements[apply_flow] - next_normalizer;
+ }
+
+ // Estimate and correct for multiplicative scaling
+
+ next_normalizer = 1;
+ estim_flow = 0;
+ apply_flow = 0;
+
+ for (int step = 0; step < num_steps; ++step) {
+
+ int window_end = estim_flow + window_size;
+ int window_middle = estim_flow + window_size / 2;
+ if (window_middle > num_flows)
+ break;
+
+ float normalizer = next_normalizer;
+
+ int median_set_size = 0;
+ for (; estim_flow < window_end and estim_flow < num_flows; ++estim_flow)
+ if (read.prediction[estim_flow] > 0.5 and read.normalized_measurements[estim_flow] > 0)
+ median_set[median_set_size++] = read.normalized_measurements[estim_flow] / read.prediction[estim_flow];
+
+ if (median_set_size > 5) {
+ std::nth_element(median_set, median_set + median_set_size/2, median_set + median_set_size);
+ next_normalizer = median_set[median_set_size / 2];
+ if (step == 0)
+ normalizer = next_normalizer;
+ }
+
+ float delta = (next_normalizer - normalizer) / window_size;
+
+ for (; apply_flow < window_middle and apply_flow < num_flows; ++apply_flow) {
+ read.normalized_measurements[apply_flow] /= normalizer;
+ normalizer += delta;
+ }
+ }
+
+ for (; apply_flow < num_flows; ++apply_flow) {
+ read.normalized_measurements[apply_flow] /= next_normalizer;
+ }
+}
+
+//-------------------------------------------------------------------------
+
+// Sliding window adaptive normalization and joint solving of sequence
+void TreephaserLite::NormalizeAndSolve(BasecallerRead& well, int max_flows, bool sliding_window)
+{
+ int window_size = windowSize_;
+ int solve_flows = 0;
+
+ for (int num_steps = 1; solve_flows < max_flows; ++num_steps) {
+ solve_flows = min((num_steps+1) * window_size, max_flows);
+ int restart_flows = 0;
+ if(sliding_window)
+ restart_flows = max(solve_flows-100, 0);
+
+ Solve(well, solve_flows, restart_flows);
+ WindowedNormalize(well, num_steps, window_size);
+ }
+
+ Solve(well, max_flows);
+}
+
+//-------------------------------------------------------------------------
+
+void TreephaserLite::InitializeState(TreephaserPath *state) const
+{
+ state->flow = 0;
+ state->state[0] = 1;
+ state->window_start = 0;
+ state->window_end = 1;
+ state->prediction.assign(flow_order_.num_flows(), 0);
+ state->sequence.clear();
+ state->sequence.reserve(2*flow_order_.num_flows());
+ state->last_hp = 0;
+}
+
+
+//-------------------------------------------------------------------------
+
+void TreephaserLite::AdvanceState(TreephaserPath *child, const TreephaserPath *parent, char nuc, int max_flow) const
+{
+ assert (child != parent);
+
+ // Advance flow
+ child->flow = parent->flow;
+ while (child->flow < max_flow and flow_order_[child->flow] != nuc)
+ child->flow++;
+
+ if (child->flow == parent->flow)
+ child->last_hp = parent->last_hp + 1;
+ else
+ child->last_hp = 1;
+
+ // Initialize window
+ child->window_start = parent->window_start;
+ child->window_end = min(parent->window_end, max_flow);
+
+ if (parent->flow != child->flow or parent->flow == 0) {
+
+ // This nuc begins a new homopolymer
+ float alive = 0;
+ child->state[parent->window_start] = 0;
+
+ for (int flow = parent->window_start; flow < child->window_end; ++flow) {
+
+ // State progression according to phasing model
+ if ((flow) < parent->window_end)
+ alive += parent->state[flow];
+ child->state[flow] = alive * transition_base_[nuc&7][flow];
+ alive *= transition_flow_[nuc&7][flow];
+
+ // Window maintenance
+ if (flow == child->window_start and child->state[flow] < kStateWindowCutoff)
+ child->window_start++;
+
+ if (flow == child->window_end-1 and child->window_end < max_flow and alive > kStateWindowCutoff)
+ child->window_end++;
+ }
+
+ } else {
+ // This nuc simply prolongs current homopolymer, inherits state from parent
+ //for (int flow = child->window_start; flow < child->window_end; ++flow)
+ // child->state[flow] = parent->state[flow];
+ memcpy(&child->state[child->window_start], &parent->state[child->window_start],
+ (child->window_end-child->window_start)*sizeof(float));
+ }
+
+ for (int flow = parent->window_start; flow < parent->window_end; ++flow)
+ child->prediction[flow] = parent->prediction[flow] + child->state[flow];
+ for (int flow = parent->window_end; flow < child->window_end; ++flow)
+ child->prediction[flow] = child->state[flow];
+}
+
+//-------------------------------------------------------------------------
+
+void TreephaserLite::AdvanceStateInPlace(TreephaserPath *state, char nuc, int max_flow) const
+{
+
+ int old_flow = state->flow;
+ int old_window_start = state->window_start;
+ int old_window_end = state->window_end;
+
+ // Advance in-phase flow
+ while (state->flow < max_flow and flow_order_[state->flow] != nuc)
+ state->flow++;
+ if (state->flow == max_flow) // Immediately return if base does not fit any more
+ return;
+
+ if (old_flow == state->flow)
+ state->last_hp++;
+ else
+ state->last_hp = 1;
+
+ if (old_flow != state->flow or old_flow == 0) {
+
+ // This nuc begins a new homopolymer, need to adjust state
+ float alive = 0;
+ for (int flow = old_window_start; flow < state->window_end; flow++) {
+
+ // State progression according to phasing model
+ if (flow < old_window_end)
+ alive += state->state[flow];
+ state->state[flow] = alive * transition_base_[nuc&7][flow];
+ alive *= transition_flow_[nuc&7][flow];
+
+ // Window maintenance
+ if (flow == state->window_start and state->state[flow] < kStateWindowCutoff)
+ state->window_start++;
+
+ if (flow == state->window_end-1 and state->window_end < max_flow and alive > kStateWindowCutoff)
+ state->window_end++;
+ }
+ }
+
+ for (int flow = old_window_start; flow < state->window_end; ++flow)
+ state->prediction[flow] += state->state[flow];
+}
+
+
+//-------------------------------------------------------------------------
+
+void TreephaserLite::Simulate(BasecallerRead& data, int max_flows)
+{
+ InitializeState(&path_[0]);
+
+ for (vector<char>::iterator nuc = data.sequence.begin(); nuc != data.sequence.end()
+ and path_[0].flow < max_flows; ++nuc) {
+ AdvanceStateInPlace(&path_[0], *nuc, flow_order_.num_flows());
+ }
+
+ data.prediction.swap(path_[0].prediction);
+}
+
+//-------------------------------------------------------------------------
+
+void TreephaserLite::Solve(BasecallerRead& read, int max_flows, int restart_flows)
+{
+ static const char nuc_int_to_char[5] = "ACGT";
+ assert(max_flows <= flow_order_.num_flows());
+
+ // Initialize stack: just one root path
+ for (int p = 1; p < kNumPaths; ++p)
+ path_[p].in_use = false;
+
+ InitializeState(&path_[0]);
+ path_[0].path_metric = 0;
+ path_[0].per_flow_metric = 0;
+ path_[0].residual_left_of_window = 0;
+ path_[0].dot_counter = 0;
+ path_[0].in_use = true;
+
+ int space_on_stack = kNumPaths - 1;
+ float sum_of_squares_upper_bound = 1e20; //max_flows; // Squared distance of solution to measurements
+
+ if (restart_flows > 0) {
+ // The solver will not attempt to solve initial restart_flows
+ // - Simulate restart_flows instead of solving
+ // - If it turns out that solving was finished before restart_flows, simply exit without any changes to the read.
+
+ restart_flows = min(restart_flows, flow_order_.num_flows());
+
+ for (vector<char>::iterator nuc = read.sequence.begin();
+ nuc != read.sequence.end() and path_[0].flow < restart_flows; ++nuc) {
+ AdvanceStateInPlace(&path_[0], *nuc, flow_order_.num_flows());
+ if (path_[0].flow < flow_order_.num_flows())
+ path_[0].sequence.push_back(*nuc);
+ }
+
+ if (path_[0].flow < restart_flows-10) { // This read ended before restart_flows. No point in solving it again.
+ read.prediction.swap(path_[0].prediction);
+ return;
+ }
+
+ for (int flow = 0; flow < path_[0].window_start; ++flow) {
+ float residual = read.normalized_measurements[flow] - path_[0].prediction[flow];
+ path_[0].residual_left_of_window += residual * residual;
+ }
+ }
+
+ // Initializing variables
+ //read.solution.assign(flow_order_.num_flows(), 0);
+ read.sequence.clear();
+ read.sequence.reserve(2*flow_order_.num_flows());
+ read.prediction.assign(flow_order_.num_flows(), 0);
+
+ // Main loop to select / expand / delete paths
+ while (1) {
+
+ // ------------------------------------------
+ // Step 1: Prune the content of the stack and make sure there are at least 4 empty slots
+
+ // Remove paths that are more than 'maxPathDelay' behind the longest one
+ if (space_on_stack < kNumPaths-3) {
+ int longest_path = 0;
+ for (int p = 0; p < kNumPaths; ++p)
+ if (path_[p].in_use)
+ longest_path = max(longest_path, path_[p].flow);
+
+ if (longest_path > kMaxPathDelay) {
+ for (int p = 0; p < kNumPaths; ++p) {
+ if (path_[p].in_use and path_[p].flow < longest_path-kMaxPathDelay) {
+ path_[p].in_use = false;
+ space_on_stack++;
+ }
+ }
+ }
+ }
+
+ // If necessary, remove paths with worst perFlowMetric
+ while (space_on_stack < 4) {
+ // find maximum per flow metric
+ float max_per_flow_metric = -0.1;
+ int max_metric_path = kNumPaths;
+ for (int p = 0; p < kNumPaths; ++p) {
+ if (path_[p].in_use and path_[p].per_flow_metric > max_per_flow_metric) {
+ max_per_flow_metric = path_[p].per_flow_metric;
+ max_metric_path = p;
+ }
+ }
+
+ // killing path with largest per flow metric
+ if (!(max_metric_path < kNumPaths)) {
+ printf("Failed assertion in Treephaser\n");
+ for (int p = 0; p < kNumPaths; ++p) {
+ if (path_[p].in_use)
+ printf("Path %d, in_use = true, per_flow_metric = %f\n", p, path_[p].per_flow_metric);
+ else
+ printf("Path %d, in_use = false, per_flow_metric = %f\n", p, path_[p].per_flow_metric);
+ }
+ fflush(NULL);
+ }
+ assert (max_metric_path < kNumPaths);
+
+ path_[max_metric_path].in_use = false;
+ space_on_stack++;
+ }
+
+ // ------------------------------------------
+ // Step 2: Select a path to expand or break if there is none
+
+ TreephaserPath *parent = NULL;
+ float min_path_metric = 1000;
+ for (int p = 0; p < kNumPaths; ++p) {
+ if (path_[p].in_use and path_[p].path_metric < min_path_metric) {
+ min_path_metric = path_[p].path_metric;
+ parent = &path_[p];
+ }
+ }
+ if (!parent)
+ break;
+
+
+ // ------------------------------------------
+ // Step 3: Construct four expanded paths and calculate feasibility metrics
+ assert (space_on_stack >= 4);
+
+ TreephaserPath *children[4];
+
+ for (int nuc = 0, p = 0; nuc < 4; ++p)
+ if (not path_[p].in_use)
+ children[nuc++] = &path_[p];
+
+ float penalty[4] = { 0, 0, 0, 0 };
+
+ for (int nuc = 0; nuc < 4; ++nuc) {
+
+ TreephaserPath *child = children[nuc];
+
+ AdvanceState(child, parent, nuc_int_to_char[nuc], max_flows);
+
+ // Apply easy termination rules
+
+ if (child->flow >= max_flows) {
+ penalty[nuc] = 25; // Mark for deletion
+ continue;
+ }
+
+ if (child->last_hp > kMaxHP) {
+ penalty[nuc] = 25; // Mark for deletion
+ continue;
+ }
+
+ if ((int)parent->sequence.size() >= (2 * flow_order_.num_flows() - 10)) {
+ penalty[nuc] = 25; // Mark for deletion
+ continue;
+ }
+
+ child->path_metric = parent->residual_left_of_window;
+ child->residual_left_of_window = parent->residual_left_of_window;
+
+ float penaltyN = 0;
+ float penalty1 = 0;
+
+ for (int flow = parent->window_start; flow < child->window_end; ++flow) {
+
+ float residual = read.normalized_measurements[flow] - child->prediction[flow];
+ float residual_squared = residual * residual;
+
+ // Metric calculation
+ if (flow < child->window_start) {
+ child->residual_left_of_window += residual_squared;
+ child->path_metric += residual_squared;
+ } else if (residual <= 0)
+ child->path_metric += residual_squared;
+
+ if (residual <= 0)
+ penaltyN += residual_squared;
+ else if (flow < child->flow)
+ penalty1 += residual_squared;
+ }
+
+
+ penalty[nuc] = penalty1 + kNegativeMultiplier * penaltyN;
+ penalty1 += penaltyN;
+
+ if (child->flow>0)
+ child->per_flow_metric = (child->path_metric + 0.5 * penalty1) / child->flow;
+
+ } //looping over nucs
+
+
+ // Find out which nuc has the least penalty (the greedy choice nuc)
+ int best_nuc = 0;
+ if (penalty[best_nuc] > penalty[1])
+ best_nuc = 1;
+ if (penalty[best_nuc] > penalty[2])
+ best_nuc = 2;
+ if (penalty[best_nuc] > penalty[3])
+ best_nuc = 3;
+
+ // ------------------------------------------
+ // Step 4: Use calculated metrics to decide which paths are worth keeping
+
+ for (int nuc = 0; nuc < 4; ++nuc) {
+
+ TreephaserPath *child = children[nuc];
+
+ // Path termination rules
+
+ if (penalty[nuc] >= 20)
+ continue;
+
+ if (child->path_metric > sum_of_squares_upper_bound)
+ continue;
+
+ // This is the only rule that depends on finding the "best nuc"
+ if (penalty[nuc] - penalty[best_nuc] >= kExtendThreshold)
+ continue;
+
+ float dot_signal = (read.normalized_measurements[child->flow]
+ - parent->prediction[child->flow])
+ / child->state[child->flow];
+
+ child->dot_counter = (dot_signal < kDotThreshold) ? (parent->dot_counter + 1) : 0;
+ if (child->dot_counter > 1)
+ continue;
+
+ // Path survived termination rules and will be kept on stack
+ child->in_use = true;
+ space_on_stack--;
+
+ // Fill out the remaining portion of the prediction
+ memcpy(&child->prediction[0], &parent->prediction[0], (parent->window_start)*sizeof(float));
+
+ for (int flow = child->window_end; flow < max_flows; ++flow)
+ child->prediction[flow] = 0;
+
+ // Fill out the solution
+ child->sequence = parent->sequence;
+ child->sequence.push_back(nuc_int_to_char[nuc]);
+ }
+
+ // ------------------------------------------
+ // Step 5. Check if the selected path is in fact the best path so far
+
+ // Computing sequence squared distance
+ float sum_of_squares = parent->residual_left_of_window;
+ for (int flow = parent->window_start; flow < max_flows; flow++) {
+
+ float residual = read.normalized_measurements[flow] - parent->prediction[flow];
+ sum_of_squares += residual * residual;
+ }
+
+ // Updating best path
+ if (sum_of_squares < sum_of_squares_upper_bound) {
+ read.prediction.swap(parent->prediction);
+ read.sequence.swap(parent->sequence);
+ sum_of_squares_upper_bound = sum_of_squares;
+ }
+
+ parent->in_use = false;
+ space_on_stack++;
+
+ } // main decision loop
+}
+
diff --git a/src/ionhammer/seqeval/TreephaserLite.h b/src/projects/ionhammer/seqeval/TreephaserLite.h
similarity index 100%
rename from src/ionhammer/seqeval/TreephaserLite.h
rename to src/projects/ionhammer/seqeval/TreephaserLite.h
diff --git a/src/projects/ionhammer/subcluster.cpp b/src/projects/ionhammer/subcluster.cpp
new file mode 100644
index 0000000..d5dc0a2
--- /dev/null
+++ b/src/projects/ionhammer/subcluster.cpp
@@ -0,0 +1,135 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#include "subcluster.hpp"
+#include "config_struct.hpp"
+#include "consensus.hpp"
+#include "hkmer_distance.hpp"
+#include "kmer_data.hpp"
+#include "dev_support/logger/log_writers.hpp"
+
+#include <boost/numeric/ublas/matrix.hpp>
+
+#include <vector>
+#include <iostream>
+
+hammer::HKMer center(const KMerData &data, const std::vector<size_t>& kmers) {
+ hammer::HKMer res;
+ namespace numeric = boost::numeric::ublas;
+
+ for (unsigned i = 0; i < hammer::K; ++i) {
+ numeric::matrix<double> scores(4, 64, 0);
+ for (size_t j = 0; j < kmers.size(); ++j) {
+ const hammer::KMerStat &k = data[kmers[j]];
+    // FIXME: switch to MLE once per-run quality values are available
+#if 1
+ scores(k.kmer[i].nucl, k.kmer[i].len) += double(k.count) * (1 - k.qual);
+#else
+ for (unsigned n = 0; n < 4; ++n)
+ for (unsigned l = 1; l < 64; ++l)
+ scores(n, l) += k.count * (n == k.kmer[i].nucl && l == k.kmer[i].len ?
+ log(1 - k.qual) : log(k.qual) - log(4*63 - 1));
+#endif
+ }
+
+ res[i] = hammer::iontorrent::consensus(scores).first;
+ }
+
+ return res;
+}
+
+bool assign(KMerData &kmer_data, const std::vector<size_t> &cluster) {
+ hammer::HKMer c = center(kmer_data, cluster);
+ bool nonread = false;
+
+ size_t idx = kmer_data.seq_idx(c);
+ if (kmer_data[idx].kmer != c) {
+# pragma omp critical
+ {
+ idx = kmer_data.push_back(hammer::KMerStat(0, c, 1.0));
+ }
+ nonread = true;
+ }
+
+ for (size_t j = 0; j < cluster.size(); ++j)
+ kmer_data[cluster[j]].changeto = unsigned(idx);
+
+ return nonread;
+}
+
+void dump(const KMerData &kmer_data, const std::vector<size_t> &cluster) {
+ std::cerr << "{ \n\"kmers\": {";
+ for (size_t j = 0; j < cluster.size(); ++j) {
+ if (j > 0) std::cerr << ", ";
+ std::cerr << '"' << kmer_data[cluster[j]].kmer << "\": ["
+ << kmer_data[cluster[j]].count << ", "
+ << 1 - kmer_data[cluster[j]].qual << "] \n";
+ }
+ std::cerr << "}, \"center\": { \"status\": ";
+ hammer::HKMer c = center(kmer_data, cluster);
+ size_t idx = kmer_data.seq_idx(c);
+ if (kmer_data[idx].kmer == c) {
+ std::cerr << "\"ok\", \"center\": \"" << c << "\"}\n";
+ } else {
+ std::cerr << "\"not\", \"kmer\": \"" << kmer_data[idx].kmer
+ << "\", \"center\": \"" << c << "\"}\n";
+ }
+ std::cerr << "}" << std::endl;
+}
+
+size_t subcluster(KMerData &kmer_data, std::vector<size_t> &cluster) {
+ size_t nonread = 0;
+
+  // First, sort the k-mer indices by count
+ std::sort(cluster.begin(), cluster.end(), CountCmp(kmer_data));
+
+  // The number of subclusters is chosen naively for now: count the k-mers whose quality is below the center quality threshold.
+ size_t k = 0;
+ for (size_t i = 0; i < cluster.size(); ++i)
+ k += kmer_data[cluster[i]].qual < cfg::get().center_qual_threshold;
+
+ if (k <= 1) {
+#if 0
+ dump(kmer_data, cluster);
+#endif
+ return assign(kmer_data, cluster);
+ }
+
+  // Assign each remaining k-mer to the closest of the k centers
+ std::vector<std::vector<size_t> > idx(k, std::vector<size_t>());
+ for (size_t i = 0; i < k; ++i)
+ idx[i].push_back(cluster[i]);
+ for (size_t i = k; i < cluster.size(); ++i) {
+ unsigned dist = std::numeric_limits<unsigned>::max();
+ size_t cidx = k;
+ hammer::HKMer kmerx = kmer_data[cluster[i]].kmer;
+ for (size_t j = 0; j < k; ++j) {
+ hammer::HKMer kmery = kmer_data[cluster[j]].kmer;
+ unsigned cdist = hammer::distanceHKMer(kmerx.begin(), kmerx.end(),
+ kmery.begin(), kmery.end());
+ if (cdist < dist) {
+ cidx = j;
+ dist = cdist;
+ }
+ }
+ VERIFY(cidx < k);
+ idx[cidx].push_back(cluster[i]);
+ }
+
+ for (auto it = idx.begin(), et = idx.end(); it != et; ++it) {
+ const std::vector<size_t> &subcluster = *it;
+
+ if (assign(kmer_data, subcluster)) {
+ nonread += 1;
+#if 0
+ dump(kmer_data, subcluster);
+#endif
+ }
+ }
+
+ return nonread;
+}
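The nearest-center assignment above relies on hammer::distanceHKMer. As a rough standalone sketch of the same idea, the toy program below assigns the tail of a cluster to whichever of the first k elements is closest; the distance function and the example sequences are illustrative stand-ins, not the real homopolymer-aware distance.

#include <algorithm>
#include <cstddef>
#include <iostream>
#include <limits>
#include <string>
#include <vector>

// Toy mismatch count standing in for hammer::distanceHKMer (illustrative only).
unsigned toy_distance(const std::string &a, const std::string &b) {
    unsigned d = 0;
    size_t n = std::min(a.size(), b.size());
    for (size_t i = 0; i < n; ++i)
        d += (a[i] != b[i]);
    return d + unsigned(std::max(a.size(), b.size()) - n);
}

int main() {
    // The first k elements act as centers; the rest get assigned to the closest one.
    std::vector<std::string> cluster = {"ACGT", "TTGA", "ACGA", "TTGG", "ACTT"};
    size_t k = 2;
    std::vector<std::vector<size_t> > idx(k);
    for (size_t i = 0; i < k; ++i)
        idx[i].push_back(i);
    for (size_t i = k; i < cluster.size(); ++i) {
        unsigned best = std::numeric_limits<unsigned>::max();
        size_t cidx = 0;
        for (size_t j = 0; j < k; ++j) {
            unsigned d = toy_distance(cluster[i], cluster[j]);
            if (d < best) { best = d; cidx = j; }
        }
        idx[cidx].push_back(i);
    }
    for (size_t j = 0; j < k; ++j) {
        std::cout << "subcluster " << j << ":";
        for (size_t e : idx[j]) std::cout << ' ' << cluster[e];
        std::cout << '\n';
    }
    return 0;
}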
diff --git a/src/ionhammer/subcluster.hpp b/src/projects/ionhammer/subcluster.hpp
similarity index 100%
rename from src/ionhammer/subcluster.hpp
rename to src/projects/ionhammer/subcluster.hpp
diff --git a/src/projects/ionhammer/valid_hkmer_generator.hpp b/src/projects/ionhammer/valid_hkmer_generator.hpp
new file mode 100644
index 0000000..468bee6
--- /dev/null
+++ b/src/projects/ionhammer/valid_hkmer_generator.hpp
@@ -0,0 +1,250 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#ifndef HAMMER_VALIDHKMERGENERATOR_HPP_
+#define HAMMER_VALIDHKMERGENERATOR_HPP_
+
+#include <deque>
+#include <string>
+#include <vector>
+
+#include "io/reads/single_read.hpp"
+#include "HSeq.hpp"
+
+#include <cstdint>
+#include <cmath>
+
+template<size_t kK>
+class ValidHKMerGenerator {
+ public:
+ /**
+ * @param read Read to generate k-mers from.
+ * @param bad_quality_threshold This class virtually cuts
+   * nucleotides with quality lower than the threshold from the ends of the
+ * read.
+ */
+ // FIXME: Switch to delegating ctor.
+ explicit ValidHKMerGenerator(const io::SingleRead &read,
+ unsigned bad_quality_threshold = 2) {
+ Reset(read.GetSequenceString().data(),
+ read.GetQualityString().data(),
+ read.GetSequenceString().size(),
+ bad_quality_threshold);
+ }
+
+ /**
+ * @param seq sequence to generate k-mers from.
+ * @param qual quality string
+ * @param bad_quality_threshold This class virtually cuts
+   * nucleotides with quality lower than the threshold from the ends of the
+ * read.
+ */
+ explicit ValidHKMerGenerator(const char *seq, const char *qual,
+ size_t len,
+ unsigned bad_quality_threshold = 2) {
+ Reset(seq, qual, len, bad_quality_threshold);
+ }
+
+ ValidHKMerGenerator()
+ : kmer_(), seq_(0), qual_(0),
+ pos_(-1), nlen_(-1), end_(-1), len_(0),
+ correct_probability_(1), bad_quality_threshold_(2),
+ has_more_(false), first_(true) {}
+
+ void Reset(const char *seq, const char *qual,
+ size_t len,
+ unsigned bad_quality_threshold = 2) {
+ kmer_ = hammer::HSeq<kK>();
+ seq_ = seq;
+ qual_ = qual;
+ pos_ = -1;
+ nlen_ = -1;
+ end_ = -1;
+ len_ = len;
+ correct_probability_ = 1.0;
+ bad_quality_threshold_ = bad_quality_threshold;
+ has_more_ = true;
+ first_ = true;
+ last_ = false;
+ probs_.resize(0);
+
+ TrimBadQuality();
+ Next();
+ }
+
+ /**
+   * @result true if Next() succeeded in generating a new k-mer, false
+ * otherwise.
+ */
+ bool HasMore() const {
+ return has_more_;
+ }
+
+ /**
+ * @result last k-mer generated by Next().
+ */
+ const hammer::HSeq<kK>& kmer() const {
+ return kmer_;
+ }
+
+ /**
+ * @result last k-mer position in initial read.
+ */
+ size_t pos() const {
+ return pos_;
+ }
+
+ size_t nlen() const {
+ return nlen_;
+ }
+
+ /**
+ * @result number of nucleotides trimmed from left end
+ */
+ size_t trimmed_left() const {
+ return beg_;
+ }
+
+ /**
+ * @result number of nucleotides trimmed from right end
+ */
+ size_t trimmed_right() const {
+ return len_ - end_;
+ }
+
+ /**
+ * @result probability that last generated k-mer is correct.
+ */
+ double correct_probability() const {
+ return correct_probability_;
+ }
+
+ /**
+   * This function reads the next k-mer from the read and sets has_more_
+   * accordingly. You can access the k-mer read with kmer().
+ */
+ void Next();
+ private:
+ void TrimBadQuality();
+
+ double Prob(unsigned qual) {
+ return (qual < 3 ? 0.25 : 1 - pow(10.0, -(qual / 10.0)));
+ // return Globals::quality_probs[qual];
+ }
+
+ unsigned GetQual(size_t pos) {
+ if (pos >= len_) {
+ return 2;
+ } else {
+ return qual_[pos];
+ }
+ }
+
+ hammer::HSeq<kK> kmer_;
+ const char* seq_;
+ const char* qual_;
+ size_t pos_;
+ size_t nlen_;
+ size_t beg_;
+ size_t end_;
+ size_t len_;
+ double correct_probability_;
+ unsigned bad_quality_threshold_;
+ bool has_more_;
+ bool first_;
+ bool last_;
+ std::deque<double> probs_;
+
+ // Disallow copy and assign
+ ValidHKMerGenerator(const ValidHKMerGenerator&) = delete;
+ void operator=(const ValidHKMerGenerator&) = delete;
+};
+
+template<size_t kK>
+void ValidHKMerGenerator<kK>::TrimBadQuality() {
+ pos_ = 0;
+ if (qual_)
+ for (; pos_ < len_; ++pos_) {
+ if (GetQual(pos_) >= bad_quality_threshold_)
+ break;
+ }
+ beg_ = pos_;
+ end_ = len_;
+ if (qual_)
+ for (; end_ > pos_; --end_) {
+ if (GetQual(end_ - 1) >= bad_quality_threshold_)
+ break;
+ }
+}
+
+template<size_t kK>
+void ValidHKMerGenerator<kK>::Next() {
+ if (last_) {
+ has_more_ = false;
+ return;
+ }
+
+ size_t toadd = (first_ ? kK : 1);
+ char pnucl = -1;
+ double cprob = 1.0;
+ nlen_ = 0;
+  // Build the flow-space k-mer, collapsing homopolymer stretches as we go.
+ while (toadd) {
+ // If we went past the end, then there are no new kmers anymore.
+ // The current one might be incomplete but we yield it anyway
+ // because one hk-mer can't have much influence on the consensus.
+ if (pos_ >= end_) {
+ last_ = true;
+ if (toadd > 0) {
+ has_more_ = false;
+ }
+ return;
+ }
+
+    // Check whether the current nucl is valid (not 'N')
+ char cnucl = seq_[pos_ + nlen_];
+ if (!is_nucl(cnucl)) {
+ toadd = kK;
+ pnucl = -1;
+ pos_ += nlen_ + 1;
+ nlen_ = 0;
+ correct_probability_ = 1.0;
+ probs_.resize(0);
+ continue;
+ }
+ if (qual_)
+ cprob *= Prob(GetQual(pos_ + nlen_));
+
+    // If the current nucl differs from the previous one, then we are either starting the
+    // k-mer or have just finished a homopolymer run.
+ if (cnucl != pnucl) {
+ // If previous nucl was valid then finish the current homopolymer run
+ if (pnucl != -1) {
+ toadd -= 1;
+ correct_probability_ *= cprob;
+ if (probs_.size() == kK) {
+ correct_probability_ /= probs_[0];
+ probs_.pop_front();
+ }
+
+ probs_.push_back(cprob);
+ cprob = 1.0;
+ }
+ pnucl = cnucl;
+ }
+
+    // If there is still something to add to the flow-space k-mer, do it now.
+ if (toadd) {
+ kmer_ <<= cnucl;
+ nlen_ += 1;
+ }
+ }
+
+ pos_ += nlen_;
+ first_ = false;
+}
+#endif  // HAMMER_VALIDHKMERGENERATOR_HPP_
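For reference, Prob() above implements the standard Phred conversion P(correct) = 1 - 10^(-Q/10), clamping very low qualities to 0.25. The self-contained sketch below (with made-up qualities) shows how correct_probability_ ends up as a running product of per-run probabilities.

#include <cmath>
#include <cstdio>

// Same quality-to-probability mapping as Prob() above.
double prob_correct(unsigned qual) {
    return qual < 3 ? 0.25 : 1.0 - std::pow(10.0, -double(qual) / 10.0);
}

int main() {
    // Hypothetical per-homopolymer-run qualities for one hk-mer.
    const unsigned quals[] = {30, 25, 2, 35};
    double p = 1.0;
    for (unsigned q : quals)
        p *= prob_correct(q);  // correct_probability_ is maintained as this running product
    std::printf("P(hk-mer correct) = %.4f\n", p);
    return 0;
}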
diff --git a/src/projects/online_vis/CMakeLists.txt b/src/projects/online_vis/CMakeLists.txt
new file mode 100644
index 0000000..d020b82
--- /dev/null
+++ b/src/projects/online_vis/CMakeLists.txt
@@ -0,0 +1,37 @@
+############################################################################
+# Copyright (c) 2015 Saint Petersburg State University
+# Copyright (c) 2011-2014 Saint Petersburg Academic University
+# All Rights Reserved
+# See file LICENSE for details.
+############################################################################
+
+project(online_vis CXX)
+
+add_executable(online_vis
+ main.cpp)
+
+if (READLINE_FOUND)
+ include_directories(${READLINE_INCLUDE_DIR})
+else(READLINE_FOUND)
+ #MESSAGE(WARNING "Library `readline' was not found (not installed?).")
+endif()
+if (CURSES_FOUND)
+ include_directories(${CURSES_INCLUDE_PATH})
+else(CURSES_FOUND)
+ #MESSAGE(WARNING "Library `ncurses' was not found (not installed?)")
+endif()
+include_directories(./drawing_commands)
+include_directories(${CMAKE_SOURCE_DIR}/debruijn)
+
+if (READLINE_FOUND AND CURSES_FOUND)
+ target_link_libraries(online_vis spades_modules nlopt format ${COMMON_LIBRARIES} ${READLINE_LIBRARY} ${CURSES_NCURSES_LIBRARY})
+elseif (READLINE_FOUND)
+ target_link_libraries(online_vis spades_modules nlopt format ${COMMON_LIBRARIES} ${READLINE_LIBRARY})
+else()
+ target_link_libraries(online_vis spades_modules nlopt format ${COMMON_LIBRARIES})
+endif()
+
+if (DEFINED static_build)
+ set_target_properties(online_vis PROPERTIES LINK_SEARCH_END_STATIC 1)
+endif()
+
diff --git a/src/projects/online_vis/argument_list.hpp b/src/projects/online_vis/argument_list.hpp
new file mode 100644
index 0000000..2e085e6
--- /dev/null
+++ b/src/projects/online_vis/argument_list.hpp
@@ -0,0 +1,217 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "vis_utils.hpp"
+#include "history.hpp"
+#include "errors.hpp"
+
+#include <boost/tokenizer.hpp>
+
+namespace online_visualization {
+
+ class ArgumentList {
+
+ public:
+ ArgumentList() {
+ }
+
+ ArgumentList(stringstream& stream) {
+ DEBUG("Splitting in tokens");
+ vector<string> args = SplitInTokens(stream);
+ DEBUG("Parsing args");
+ ParseArguments(args);
+ }
+
+ string operator[](const string& option_name) const {
+ // usual option
+ if (options.find(option_name) == options.end()) {
+ return "null";
+ }
+ string result = options.find(option_name)->second;
+ return result;
+ }
+
+ bool contains(string short_opt) const {
+ return (short_options.count(short_opt) > 0);
+ }
+
+ const vector<string>& GetAllArguments() const {
+ return arguments;
+ }
+
+ string Preprocess(const History& history) {
+ vector<string> new_arguments;
+
+ for (auto iter = arguments.begin(); iter != arguments.end(); ++iter) {
+ string arg = *iter;
+ TRACE("Argument " << arg);
+ if (arg == "!$") {
+ TRACE("!$");
+ stringstream ss(history.back());
+ TRACE("Last command " << ss.str());
+ ArgumentList arg_list(ss);
+ const vector<string>& args = arg_list.GetAllArguments();
+ string new_arg = args[args.size() - 1];
+ TRACE("All args " << args);
+ TRACE("New arg " << new_arg);
+ new_arguments.push_back(new_arg);
+ }
+ else if (arg[0] == '!') {
+ stringstream ss(history.back());
+ size_t i = 1;
+ if (arg[1] == '-')
+ i = 2;
+ string num_of_command = "";
+ while (i < arg.size() && arg[i] != ':') {
+ num_of_command = num_of_command + arg[i];
+ ++i;
+ }
+
+ if (num_of_command == "")
+ num_of_command = "1";
+ TRACE("Number of the command " << num_of_command);
+
+ if (IsNumber(num_of_command) && arg[i] == ':') {
+ ++i;
+ string num_of_arg = "";
+ while (i < arg.size()) {
+ num_of_arg = num_of_arg + arg[i];
+ ++i;
+ }
+ TRACE("Number of the argument " << num_of_arg);
+ if (num_of_arg == "$" || IsNumber(num_of_arg)) {
+ int command_num = GetInt(num_of_command);
+ if (command_num <= 0 || command_num > int(history.size())) {
+ FireNumberOutOfBounds(command_num);
+ return "";
+ }
+ stringstream ss(history[int(history.size()) - GetInt(num_of_command)]);
+ TRACE("Got the command " << ss.str());
+ ArgumentList arg_list(ss);
+ string new_arg;
+ // $ means the last one
+ if (num_of_arg == "$")
+ new_arg = arg_list.GetAllArguments()[arg_list.GetAllArguments().size() - 1];
+ else {
+ int arg_num = GetInt(num_of_arg);
+ if (0 <= arg_num && arg_num < int(arg_list.GetAllArguments().size())) {
+ TRACE("Got the argument " << arg_num);
+ new_arg = arg_list.GetAllArguments()[arg_num];
+ } else {
+ FireBadArgument(arg);
+ return "";
+ }
+ }
+ TRACE("New arg " << new_arg);
+ new_arguments.push_back(new_arg);
+ }
+ else {
+ FireBadArgument(arg);
+ return "";
+ }
+ }
+ else {
+ new_arguments.push_back(arg);
+ }
+ }
+ else {
+ new_arguments.push_back(arg);
+ }
+ }
+ arguments = new_arguments;
+ stringstream result;
+ for (auto iter = options.begin(); iter != options.end(); ++iter)
+ result << iter->first + "=" + iter->second + " ";
+ for (auto iter = short_options.begin(); iter != short_options.end(); ++iter)
+ result << *iter + " ";
+ for (size_t i = 0; i < arguments.size(); ++i) {
+ result << arguments[i];
+ if (i < arguments.size() - 1)
+ result << " ";
+ }
+
+ return result.str();
+ }
+
+ private:
+ const vector<string> SplitInTokens(stringstream& args) const {
+ vector<string> answer;
+ while (!args.eof()) {
+ string arg;
+ args >> arg;
+ boost::char_separator<char> sep (" ,;");
+ boost::tokenizer<boost::char_separator<char>> tokens(arg, sep);
+ for (auto I = tokens.begin(); I != tokens.end(); ++I) {
+ TRACE("Found argument " << *I);
+ answer.push_back(*I);
+ }
+ }
+ return answer;
+ }
+
+ pair<string, string> ParseOption(const string& arg) const {
+ string opt_name;
+ string opt_value = "";
+
+ size_t i = 2;
+ for (; i < arg.size() && arg[i] != '='; ++i) {
+ opt_name = opt_name + arg[i];
+ }
+ for (; i < arg.size(); ++i) {
+ opt_value = opt_value + arg[i];
+ }
+
+ TRACE("Name/Value " << opt_name << " " << opt_value);
+ if (opt_value == "")
+ opt_value = "true";
+
+
+ return make_pair(opt_name, opt_value);
+ }
+
+ vector<string> ParseShortOption(const string& arg) const {
+ vector<string> result;
+ size_t i = 1;
+ for (; i < arg.size(); ++i) {
+ string s = "";
+ s = s + arg[i];
+ result.push_back(s);
+ }
+ return result;
+ }
+
+ void ParseArguments(const vector<string>& args) {
+ for (size_t i = 0; i < args.size(); ++i) {
+ TRACE("Parsing argument " << args[i]);
+ if (args[i][0] == '-' && args[i][1] == '-') {
+ //--smth=<smth>
+ TRACE("it is an option");
+ pair<string, string> opt_val = ParseOption(args[i]);
+ options.insert(opt_val);
+ }
+ else if (args[i][0] == '-') {
+ TRACE("it is a short option");
+ const vector<string>& short_opt = ParseShortOption(args[i]);
+ TRACE("short options in a vector " << short_opt);
+ short_options.insert(short_opt.begin(), short_opt.end());
+ }
+ else {
+ TRACE("it is a usual arg");
+ arguments.push_back(args[i]);
+ }
+ }
+ }
+ map<string, string> options;
+ set<string> short_options;
+ vector<string> arguments;
+
+ DECL_LOGGER("ArgumentList");
+ };
+
+}
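Preprocess() above implements shell-like history expansion (`!$`, `!<n>:<m>`). The stripped-down standalone sketch below covers just the `!$` case; the command lines are hypothetical and only illustrate how the last argument of the previous command is substituted in.

#include <iostream>
#include <sstream>
#include <string>
#include <vector>

// Split a command line into whitespace-separated tokens (stand-in for SplitInTokens).
std::vector<std::string> tokenize(const std::string &line) {
    std::istringstream ss(line);
    std::vector<std::string> out;
    for (std::string tok; ss >> tok; )
        out.push_back(tok);
    return out;
}

int main() {
    std::vector<std::string> history = {"draw_edge 123 --all"};
    std::string current = "draw_vertex !$";  // !$ expands to the last argument of the previous command

    std::vector<std::string> args = tokenize(current);
    std::vector<std::string> prev = tokenize(history.back());
    for (std::string &a : args)
        if (a == "!$" && !prev.empty())
            a = prev.back();

    for (const std::string &a : args)
        std::cout << a << ' ';
    std::cout << '\n';  // prints: draw_vertex --all
    return 0;
}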
diff --git a/src/projects/online_vis/base_commands.hpp b/src/projects/online_vis/base_commands.hpp
new file mode 100644
index 0000000..8dc2e01
--- /dev/null
+++ b/src/projects/online_vis/base_commands.hpp
@@ -0,0 +1,503 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "environment.hpp"
+#include "command.hpp"
+#include "errors.hpp"
+#include "command_mapping.hpp"
+
+namespace online_visualization {
+
+// null
+ template <class Env>
+ class NullCommand : public Command<Env> {
+
+ public:
+ NullCommand() : Command<Env>("null")
+ {
+ }
+
+ void Execute(shared_ptr<Env>& /* curr_env */,
+ LoadedEnvironments<Env>& /* loaded_environments */,
+ const ArgumentList& /* args */) const
+ {
+ }
+
+ string Usage() const {
+ return "Nothing to do here";
+ }
+ };
+
+ template <class Env>
+ class HelpCommand : public CommandServingCommand<Env> {
+ std::string GetCommonUsageString() const {
+ std::string answer =
+ " Welcome to GAF (Graph Analysis Framework). This framework allows to work with the de Bruijn Graph interactively.\n "
+ " You can see the list of command names below. To see a command's help message just type\n"
+ "> help <command_name>\n"
+ " The list of command names : \n";
+
+ vector<string> command_names = this->command_container_->GetCommandNamesList();
+ for (auto it = command_names.begin(); it != command_names.end(); ++it) {
+ answer += *it;
+ answer += '\n';
+ }
+ return answer;
+ }
+
+ public:
+ string Usage() const {
+ string answer;
+ answer = answer + "The command `help` allows you to see a help message for any command. \n " +
+ "Usage: \n" +
+ "> help <name_of_command> \n" +
+ " Running `help` without parameters yields a list of all commands.";
+ return answer;
+ }
+
+ HelpCommand(CommandMapping<Env> *command_mapping)
+ : CommandServingCommand<Env>("help", command_mapping) {
+ }
+
+ void Execute(shared_ptr<Env>& /* curr_env */, LoadedEnvironments<Env>& /* loaded_environments */, const ArgumentList& arg_list) const {
+ const vector<string>& args = arg_list.GetAllArguments();
+ if (args.size() == 1) {
+ cout << GetCommonUsageString() << endl;
+ } else {
+ string command_name = args[1];
+ const Command<Env>& command = this->command_container_->GetCommand(command_name);
+ if (command.invocation_string() == "null")
+ return;
+ cout << command.Usage() << endl;
+ }
+ }
+ };
+
+ // exit
+ template <class Env>
+ class ExitCommand : public Command<Env> {
+ public:
+ string Usage() const {
+ return "The command `exit` allows you to exit this application.";
+ }
+
+ ExitCommand() :
+ Command<Env>("exit")
+ {
+ }
+
+ void Execute(shared_ptr<Env>& /* curr_env */, LoadedEnvironments<Env>& /* loaded_environments */, const ArgumentList& /* args */) const {
+ cout << "Exiting" << endl;
+ exit(0);
+ }
+ };
+
+ // loading new environment from folder with saves
+ template <class Env>
+ class LoadCommand : public Command<Env> {
+ private:
+ shared_ptr<Env> MakeNewEnvironment(const string& name, const string& saves, size_t K) const {
+ DEBUG("Making new environment " << name);
+ shared_ptr<Env> EnvPointer(new Env(name, saves, K));
+ DEBUG("Done");
+ return EnvPointer;
+ }
+
+ protected:
+ size_t MinArgNumber() const {
+ return 2;
+ }
+
+ virtual bool CheckCorrectness(const vector<string>& args, LoadedEnvironments<Env>& loaded_environments) const
+ {
+ if (!this->CheckEnoughArguments(args))
+ return false;
+
+ string path = args[2];
+ size_t K;
+ if (args.size() > 3) {
+ if (!CheckIsNumber(args[3]))
+ return false;
+ K = GetInt(args[3]);
+ } else {
+ K = cfg::get().K;
+ }
+ if (!CheckEnvIsCorrect(path, K))
+ return false;
+
+ string name = args[1];
+ for (auto iterator = loaded_environments.begin(); iterator != loaded_environments.end(); ++iterator) {
+ if (name == iterator->first) {
+ cout << "Name " << name << " already exists" << endl;
+ cout << "Maybe you want to switch to this environment? " << name << endl;
+ cout << "Please try again" << endl;
+ return false;
+ }
+ }
+ return true;
+ }
+
+ virtual bool CheckCorrectness(const vector<string>& args) const {
+ return this->CheckEnoughArguments(args);
+ }
+
+ public:
+ string Usage() const {
+ string answer;
+ answer = answer + "Command `load` \n" +
+ "Usage:\n" +
+ "> load <environment_name> <path_to_saves> [<k-value>]\n" +
+ " You should specify the name of the new environment as well as a path to the graph saves. Optionally, \n" +
+ " you can provide a k-value for these saves. \n: " +
+ " For example:\n" +
+ "> load GraphSimplified data/saves/simplification\n" +
+ " would load a new environment with the name `GraphSimplified` from the files\n" +
+ " in the folder `data/saves/` with the basename `simplification` (simplification.grp, simplification.sqn, e.t.c).";
+ return answer;
+ }
+
+ LoadCommand() : Command<Env>("load")
+ {
+ }
+
+ void Execute(shared_ptr<Env>& curr_env,
+ LoadedEnvironments<Env>& loaded_environments,
+ const ArgumentList& arg_list) const
+ {
+ vector<string> args = arg_list.GetAllArguments();
+ if (!CheckCorrectness(args))
+ return;
+
+ string name = args[1];
+ string saves = args[2];
+ size_t K;
+ if (args.size() > 3) {
+ K = GetInt(args[3]);
+ } else {
+ K = cfg::get().K;
+ }
+
+ cout << "Loading " << name << " " << saves << endl;
+ if (!CheckCorrectness(args, loaded_environments))
+ return;
+
+ shared_ptr<Env> new_env = MakeNewEnvironment(name, saves, K);
+ loaded_environments.insert(make_pair(name, new_env));
+ curr_env = new_env;
+ }
+
+ };
+
+ // loading new environment from folder with saves
+ template <class Env>
+ class SwitchCommand : public Command<Env> {
+ protected:
+ size_t MinArgNumber() const {
+ return 1;
+ }
+
+ virtual bool CheckCorrectness(const vector<string>& args) const {
+ return this->CheckEnoughArguments(args);
+ }
+
+ public:
+ string Usage() const {
+ string answer;
+ answer = answer + "Command `switch` \n" +
+ "Usage:\n" +
+ " switch <environment_name>\n" +
+ " You should specify the name of the environment you want to switch to. For example:\n" +
+ "> switch GraphSimplified \n" +
+ " would switch you to the environment with the name `GraphSimplified`.\n" +
+ " Of course this environment must be loaded first. To see all loaded environments, run command `list`.";
+ return answer;
+ }
+
+ SwitchCommand() :
+ Command<Env>("switch_env")
+ {
+ }
+
+ void Execute(shared_ptr<Env>& curr_env, LoadedEnvironments<Env>& loaded_environments, const ArgumentList& arg_list) const {
+ const vector<string>& args = arg_list.GetAllArguments();
+
+ if (!CheckCorrectness(args))
+ return;
+
+ string name = args[1];
+
+ bool okay = false;
+ for (auto iterator = loaded_environments.begin(); iterator != loaded_environments.end(); ++iterator) {
+ if (name == iterator->first) {
+ okay = true;
+ curr_env = iterator->second;
+ break;
+ }
+ }
+ if (!okay) {
+ cout << "Name " << name << " does not exist" << endl;
+ cout << "Please try again" << endl;
+ return;
+ } else
+ cout << "Switching to " << name << endl;
+ }
+
+ };
+
+ template <class Env>
+ class ListCommand : public Command<Env> {
+ protected:
+ virtual bool CheckCorrectness() const {
+ return true;
+ }
+
+ public:
+ string Usage() const {
+ string answer;
+ answer = answer + "Command `list` \n" +
+ "Usage:\n" +
+ "> list\n" +
+ " This command lists all loaded environments.";
+ return answer;
+ }
+
+ ListCommand() : Command<Env>("list")
+ {
+ }
+
+ void Execute(shared_ptr<Env>& curr_env, LoadedEnvironments<Env>& loaded_environments, const ArgumentList& /* arg_list */) const {
+ cout << "Environments :" << endl;
+ for (auto iter = loaded_environments.begin(); iter != loaded_environments.end(); ++iter) {
+ cout << " " << iter->first << endl;
+ }
+ if (curr_env)
+ cout << "Current environment is " << curr_env->str() << endl;
+ else
+ cout << "Current environment was not set" << endl;
+ }
+ };
+
+ template <class Env>
+ class ReplayCommand : public CommandServingCommand<Env> {
+ private:
+ virtual bool CheckCorrectness(const vector<string>& args) const {
+ if (args.size() == 1)
+ return true;
+ return CheckIsNumber(args[1]);
+ }
+
+ public:
+ string Usage() const {
+ string answer;
+ answer = answer + "Command `replay` \n" +
+ " Usage:\n" +
+ " > rep <command_number>\n" +
+ " Runs the command <command_number> commands before. For example:\n" +
+ " > rep 1 \n" +
+ " would run the previous command.\n" +
+ " It is still under development.";
+ return answer;
+ }
+
+ ReplayCommand(CommandMapping<Env> *command_mapping) : CommandServingCommand<Env>("replay", command_mapping)
+ {
+ }
+
+ void Execute(shared_ptr<Env>& curr_env, LoadedEnvironments<Env>& loaded_environments, const ArgumentList& arg_list) const {
+ const vector<string>& args = arg_list.GetAllArguments();
+
+ if (!CheckCorrectness(args))
+ return;
+
+ size_t number = GetInt(args[1]);
+            if (number == 0 || number > 100000) {
+                LOG(number << " is not in the range");
+                return;
+            }
+
+ History& history = History::GetHistory();
+
+ cout << "Executing the command " << number << " command(s) before... " << endl;
+ string command_with_args = history[int(history.size() - number)];
+ cout << command_with_args << endl;
+ //inserting a command, which is to be repeated
+ history.SetEntry(int(history.size() - 1), command_with_args);
+
+ stringstream ss(command_with_args);
+ TRACE("Delegating to the ArgumentList class");
+ ArgumentList tmp_arg_list(ss);
+ //inserting a command, which is to be repeated
+ string processed_command = tmp_arg_list.Preprocess(history);
+ DEBUG("processed string " << processed_command);
+ const string& command_string = tmp_arg_list.GetAllArguments()[0];
+ const Command<Env>& command = this->command_container_->GetCommand(command_string);
+ command.Execute(curr_env, loaded_environments, tmp_arg_list);
+ history.AddEntry(command_with_args);
+ }
+ };
+
+ template <class Env>
+ class LogCommand : public Command<Env> {
+ private:
+ size_t MinArgNumber() const {
+ return 0;
+ }
+
+ virtual bool CheckCorrectness(const vector<string>& args) const {
+ if (args.size() > 1)
+ return CheckIsNumber(args[1]);
+ return true;
+ }
+ public:
+ string Usage() const {
+ string answer;
+ answer = answer + "Command `log` \n" +
+ " Usage:\n" +
+ " > log [<number_of_commands>]\n" +
+ " Shows last <number_of_commands> in the history. Shows the whole log by default.";
+ return answer;
+ }
+
+ LogCommand() : Command<Env>("log")
+ {
+ }
+
+ void Execute(shared_ptr<Env>& /* curr_env */, LoadedEnvironments<Env>& /* loaded_environments */, const ArgumentList& arg_list) const {
+ vector<string> args = arg_list.GetAllArguments();
+ if (!CheckCorrectness(args))
+ return;
+
+ History& history = History::GetHistory();
+ if (args.size() > 1) {
+ size_t number = GetInt(args[1]);
+ if (number > history.size())
+ number = history.size();
+ for (size_t i = 0; i < number; ++i)
+ cout << " " << history[history.size() - int(number) + i] << endl;
+ }
+ else {
+ for (size_t i = 0; i < history.size(); ++i) {
+ cout << history[i] << endl;
+ }
+ }
+ }
+ };
+
+ template <class Env>
+ class SaveBatchCommand : public Command<Env> {
+ private:
+ size_t MinArgNumber() const {
+ return 2;
+ }
+
+ virtual bool CheckCorrectness(const vector<string>& args) const {
+ if (!this->CheckEnoughArguments(args))
+ return false;
+ return CheckIsNumber(args[1]);
+ }
+
+ public:
+ string Usage() const {
+ string answer;
+ answer = answer + "Command `save` \n" +
+ " Usage:\n" +
+ " > save <number_of_commands> <file_name>\n" +
+ " Saves last <number_of_commands> of the history in the file filename.";
+ return answer;
+ }
+
+ SaveBatchCommand() : Command<Env>("save")
+ {
+ }
+
+ void Execute(shared_ptr<Env>& /* curr_env */, LoadedEnvironments<Env>& /* loaded_environments */, const ArgumentList& arg_list) const {
+ const vector<string>& args = arg_list.GetAllArguments();
+
+ if (!CheckCorrectness(args))
+ return;
+
+ size_t number = GetInt(args[1]);
+ const string& file = args[2];
+
+ ofstream outfile;
+ outfile.open(file);
+ History& history = History::GetHistory();
+
+ if (number > history.size())
+ number = history.size();
+
+ for (size_t i = 0; i < number; ++i) {
+ outfile << history[int(history.size() - number + i)];
+ if (i < number - 1)
+ outfile << endl;
+ }
+ outfile.close();
+ }
+ };
+
+ template <class Env>
+ class BatchCommand : public CommandServingCommand<Env> {
+ private:
+ size_t MinArgNumber() const {
+ return 1;
+ }
+
+ virtual bool CheckCorrectness(const vector<string>& args) const {
+ if (!this->CheckEnoughArguments(args))
+ return false;
+ return true;
+ }
+
+ public:
+ string Usage() const {
+ string answer;
+ answer = answer + "Command `batch` \n" +
+ "Usage:\n" +
+ "> batch <batch_filename>\n" +
+ " Runs the commands from the file <batch_filename>.";
+ return answer;
+ }
+
+ BatchCommand(CommandMapping<Env> *command_mapping) : CommandServingCommand<Env>("batch", command_mapping)
+ {
+ }
+
+ void Execute(shared_ptr<Env>& curr_env, LoadedEnvironments<Env>& loaded_environments, const ArgumentList& arg_list) const {
+ const vector<string>& args = arg_list.GetAllArguments();
+
+ if (!CheckCorrectness(args))
+ return;
+
+ const string& file = args[1];
+ if (!CheckFileExists(file))
+ return;
+
+ ifstream infile;
+ infile.open(file);
+ History& history = History::GetHistory();
+ while (!infile.eof()) {
+ string command_with_args;
+ getline(infile, command_with_args);
+ if (command_with_args == "")
+ continue;
+ cout << "> " << command_with_args << endl;
+ stringstream ss(command_with_args);
+ ArgumentList arg_list(ss);
+ string processed_command = arg_list.Preprocess(history);
+
+ const string& command_string = arg_list.GetAllArguments()[0];
+ const Command<Env>& command = this->command_container_->GetCommand(command_string);
+ command.Execute(curr_env, loaded_environments, arg_list);
+
+ history.AddEntry(processed_command);
+ }
+ infile.close();
+ }
+ };
+
+}
diff --git a/src/projects/online_vis/command.hpp b/src/projects/online_vis/command.hpp
new file mode 100644
index 0000000..48953e9
--- /dev/null
+++ b/src/projects/online_vis/command.hpp
@@ -0,0 +1,173 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "standard_vis.hpp"
+#include "environment.hpp"
+#include "loaded_environments.hpp"
+#include "argument_list.hpp"
+#include "errors.hpp"
+
+namespace online_visualization {
+
+template <class Env>
+class CommandMapping;
+
+template <class Env>
+class Command {
+ private:
+ virtual size_t MinArgNumber() const {
+ return 0;
+ }
+
+ //todo fix modifier
+ protected:
+ string invocation_string_;
+
+
+ bool CheckEnoughArguments(const vector<string>& args) const {
+ bool result = (args.size() > MinArgNumber());
+ if (!result)
+ FireNotEnoughArguments();
+ return result;
+ }
+
+ public:
+ virtual string Usage() const = 0;
+
+ Command(string invocation_string)
+ : invocation_string_(invocation_string) {
+ }
+
+ virtual ~Command() {
+ }
+
+ string invocation_string() const {
+ return invocation_string_;
+ }
+
+ // system command, curr_env can point to null
+ virtual void Execute(shared_ptr<Env>& curr_env, LoadedEnvironments<Env>& loaded_environments, const ArgumentList& arg_list) const = 0;
+
+ // virtual void Execute(shared_ptr<Env>& curr_env, const ArgumentList& arg_list) const = 0;
+
+};
+
+//todo reduce code duplication in cap's test_utils
+void MakeDirPath(const std::string& path) {
+ if (path.size() == 0) {
+ TRACE("Somewhat delirium: trying to create directory ``");
+ return;
+ }
+
+ size_t slash_pos = 0;
+ while ((slash_pos = path.find_first_of('/', slash_pos + 1)) != std::string::npos) {
+ make_dir(path.substr(0, slash_pos));
+ }
+ if (path[path.size() - 1] != '/') {
+ make_dir(path);
+ }
+}
+
+bool DirExist(std::string path) {
+ struct stat st;
+ return (stat(path.c_str(), &st) == 0) && (S_ISDIR(st.st_mode));
+}
+
+template <class Env>
+class LocalCommand : public Command<Env> {
+
+ public:
+ LocalCommand(string invocation_string) : Command<Env>(invocation_string) {
+ }
+
+ // command for the current environment
+ virtual void Execute(Env& curr_env, const ArgumentList& arg_list) const = 0;
+
+ // !!!! NO OVERRIDING !!!!
+ virtual void Execute(shared_ptr<Env>& curr_env, LoadedEnvironments<Env>& loaded_environments, const ArgumentList& arg_list) const {
+ if (arg_list["all"] == "true")
+ for (auto iter = loaded_environments.begin(); iter != loaded_environments.end(); ++iter)
+ Execute(*(iter->second), arg_list);
+ else if (curr_env) {
+ Execute(*curr_env, arg_list);
+ }
+ else
+ cout << "The environment is not loaded" << endl;
+ }
+
+ protected:
+
+ string TryFetchFolder(Env& curr_env, const vector<string>& args, size_t arg_nmb = 1) const {
+ if (args.size() > arg_nmb) {
+ return MakeDirIfAbsent(args[arg_nmb] + "/");
+ } else {
+ return MakeDirIfAbsent(CurrentFolder(curr_env));
+ }
+ }
+
+ string TryFetchFolder(Env& curr_env, const ArgumentList& arg_list, size_t arg_nmb = 1) const {
+ const vector<string>& args = arg_list.GetAllArguments();
+ return TryFetchFolder(curr_env, args, arg_nmb);
+ }
+
+ string CurrentFolder(Env& curr_env) const {
+ return curr_env.manager().GetDirForCurrentState();
+ }
+
+private:
+ string MakeDirIfAbsent(const string& folder) const {
+ if (!DirExist(folder))
+ MakeDirPath(folder);
+ return folder;
+ }
+
+};
+
+//todo integrate into basic LocalCommand (after iteratively switching to it in all commands)
+template <class Env>
+class NewLocalCommand : public LocalCommand<Env> {
+ size_t min_arg_num_;
+
+public:
+ NewLocalCommand(string invocation_string, size_t min_arg_num)
+ : LocalCommand<Env>(invocation_string), min_arg_num_(min_arg_num) {
+ }
+
+ // command for the current environment
+ /*virtual*/ void Execute(Env& curr_env, const ArgumentList& arg_list) const {
+ const vector<string>& args = arg_list.GetAllArguments();
+ if (!this->CheckEnoughArguments(args))
+ return;
+ InnerExecute(curr_env, args);
+ }
+
+private:
+
+ virtual size_t MinArgNumber() const {
+ return min_arg_num_;
+ }
+
+ virtual void InnerExecute(Env& curr_env, const vector<string>& args) const = 0;
+
+};
+
+template <class Env>
+class CommandServingCommand : public Command<Env> {
+ protected:
+ CommandMapping<Env> *command_container_;
+
+ public:
+ CommandServingCommand(string invocation_string, CommandMapping<Env> *command_mapper)
+ : Command<Env>(invocation_string),
+ command_container_(command_mapper)
+ {
+ }
+};
+
+}
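MakeDirPath() above creates every prefix of a path in turn, so nested folders come into existence level by level. A self-contained sketch of the same approach follows, using POSIX mkdir directly; make_dir_once is a stand-in for the project's make_dir helper, and the example path is purely illustrative.

#include <string>
#include <sys/stat.h>
#include <sys/types.h>

// Stand-in for make_dir(): create one directory, silently ignoring "already exists".
static void make_dir_once(const std::string &d) {
    ::mkdir(d.c_str(), 0755);
}

// Same idea as MakeDirPath(): create every prefix of the path before the path itself.
void make_dir_path(const std::string &path) {
    if (path.empty())
        return;
    size_t slash_pos = 0;
    while ((slash_pos = path.find_first_of('/', slash_pos + 1)) != std::string::npos)
        make_dir_once(path.substr(0, slash_pos));
    if (path.back() != '/')
        make_dir_once(path);
}

int main() {
    make_dir_path("pictures/run1/contigs");  // creates pictures, pictures/run1, pictures/run1/contigs
    return 0;
}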
diff --git a/src/projects/online_vis/command_mapping.hpp b/src/projects/online_vis/command_mapping.hpp
new file mode 100644
index 0000000..70168ee
--- /dev/null
+++ b/src/projects/online_vis/command_mapping.hpp
@@ -0,0 +1,54 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "command.hpp"
+
+
+namespace online_visualization {
+
+template <class Env>
+class CommandMapping {
+ map<string, shared_ptr<Command<Env>>> command_map_;
+
+ public:
+ CommandMapping() : command_map_() {
+ }
+
+ const Command<Env>& GetCommand(string name) const {
+ auto it = command_map_.find(name);
+ if (it == command_map_.end()) {
+ cout << "No such command `" << name << "`, try again" << endl;
+ it = command_map_.find("null");
+ VERIFY(it != command_map_.end());
+ }
+ return *(it->second);
+ }
+
+ void AddCommand(shared_ptr<Command<Env>> command) {
+ string command_invocation_string = command->invocation_string();
+ auto it = command_map_.find(command_invocation_string);
+ VERIFY_MSG(it == command_map_.end(),
+ "Cannot add a command with existing name `"
+ << command_invocation_string << "'. Program exits.");
+
+ command_map_[command_invocation_string] = command;
+ }
+
+ vector<string> GetCommandNamesList() const {
+ vector<string> result;
+ result.reserve(command_map_.size());
+ for (auto it = command_map_.begin(); it != command_map_.end(); ++it) {
+ if (it->first != "null")
+ result.push_back(it->first);
+ }
+ return result;
+ }
+};
+
+}
diff --git a/src/projects/online_vis/debruijn_commands.hpp b/src/projects/online_vis/debruijn_commands.hpp
new file mode 100644
index 0000000..2e31cfe
--- /dev/null
+++ b/src/projects/online_vis/debruijn_commands.hpp
@@ -0,0 +1,14 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "drawing_commands.hpp"
+#include "setting_commands.hpp"
+#include "position_commands.hpp"
+#include "statistics_commands.hpp"
+#include "processing_commands.hpp"
diff --git a/src/projects/online_vis/debruijn_environment.hpp b/src/projects/online_vis/debruijn_environment.hpp
new file mode 100644
index 0000000..0bd7e3a
--- /dev/null
+++ b/src/projects/online_vis/debruijn_environment.hpp
@@ -0,0 +1,206 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "environment.hpp"
+#include "pipeline/graphio.hpp"
+namespace online_visualization {
+
+class DebruijnEnvironment : public Environment {
+ friend class DrawingCommand;
+
+ private :
+ size_t picture_counter_;
+ string folder_;
+ string file_name_base_;
+ size_t max_vertices_;
+ size_t edge_length_bound_;
+
+ GraphPack gp_;
+ GraphElementFinder<Graph> element_finder_;
+ std::shared_ptr<MapperClass> mapper_;
+ FillerClass filler_;
+ omnigraph::DefaultLabeler<Graph> labeler_;
+ debruijn_graph::ReadPathFinder<Graph> path_finder_;
+ ColoringClass coloring_;
+ //CompositeLabeler<Graph> labeler_;
+
+ public :
+
+ typedef debruijn_graph::Index EdgeIndexT;
+
+ DebruijnEnvironment(const string& env_name, const string& env_path, size_t K = cfg::get().K)
+ : Environment(env_name, env_path),
+ picture_counter_(0),
+ folder_("pictures_" + name_),
+ file_name_base_("picture"),
+ max_vertices_(40),
+ edge_length_bound_(1000),
+ gp_(K, "./tmp", cfg::get().ds.reads.lib_count(),
+ "",
+ cfg::get().flanking_range,
+ cfg::get().pos.max_mapping_gap,
+ cfg::get().pos.max_gap_diff),
+ element_finder_(gp_.g),
+ mapper_(new MapperClass(gp_.g, gp_.index, gp_.kmer_mapper)),
+ filler_(gp_.g, mapper_, gp_.edge_pos),
+ labeler_(gp_.g, gp_.edge_pos),
+ path_finder_(gp_.g) {
+ DEBUG("Environment constructor");
+ gp_.kmer_mapper.Attach();
+ debruijn_graph::graphio::ScanGraphPack(path_, gp_);
+            DEBUG("Graph pack created");
+ LoadFromGP();
+ }
+
+ inline bool IsCorrect() const {
+ if (!CheckFileExists(path_ + ".grp"))
+ return false;
+ if (!CheckFileExists(path_ + ".sqn"))
+ return false;
+
+ size_t K = gp_.k_value;
+            if (!(K >= runtime_k::MIN_K && K < runtime_k::MAX_K)) {
+ LOG("K " << K << " is out of bounds");
+ return false;
+ }
+ if (K % 2 == 0) {
+ LOG("K must be odd");
+ return false;
+ }
+
+ return true;
+ }
+
+ void LoadFromGP() {
+ if (!gp_.edge_pos.IsAttached()) {
+ gp_.edge_pos.Attach();
+ }
+
+ //Loading Genome and Handlers
+ DEBUG("Colorer done");
+ Path<EdgeId> path1 = mapper_->MapSequence(gp_.genome.GetSequence()).path();
+ Path<EdgeId> path2 = mapper_->MapSequence(!gp_.genome.GetSequence()).path();
+ coloring_ = omnigraph::visualization::DefaultColorer(gp_.g, path1, path2);
+ ResetPositions();
+ }
+
+ void LoadNewGenome(const Sequence& genome) {
+ gp_.genome.SetSequence(genome);
+ ResetPositions();
+ }
+
+ void ResetPositions() {
+ if (!gp_.edge_pos.IsAttached())
+ gp_.edge_pos.Attach();
+
+ gp_.edge_pos.clear();
+ filler_.Process(gp_.genome.GetSequence(), "ref0");
+ filler_.Process(!gp_.genome.GetSequence(), "ref1");
+ }
+
+ string GetFormattedPictureCounter() const {
+ stringstream tmpstream;
+ size_t number_of_digs = 0;
+ size_t pc = picture_counter_;
+
+ do {
+ pc /= 10;
+ number_of_digs++;
+ } while (pc > 0);
+
+ for (size_t i = 0; i < 4 - number_of_digs; ++i)
+ tmpstream << '0';
+ tmpstream << picture_counter_;
+ return tmpstream.str();
+ }
+
+ void inc_pic_counter() {
+ picture_counter_++;
+ }
+
+ size_t k_value() const {
+ return gp_.k_value;
+ }
+
+ const Graph& graph() const {
+ return gp_.g;
+ }
+
+ GraphPack& graph_pack() {
+ return gp_;
+ }
+
+ Graph& graph() {
+ return gp_.g;
+ }
+
+ Sequence genome() const {
+ return gp_.genome.GetSequence();
+ }
+
+ const MapperClass& mapper() const {
+ return *mapper_;
+ }
+
+ const debruijn_graph::ReadPathFinder<Graph>& path_finder() const {
+ return path_finder_;
+ }
+
+ const EdgeIndexT& index() const {
+ return gp_.index;
+ }
+
+ const KmerMapperClass& kmer_mapper() const {
+ return gp_.kmer_mapper;
+ }
+
+ const ElementFinder& finder() const {
+ return element_finder_;
+ }
+
+ void set_max_vertices(size_t max_vertices) {
+ max_vertices_ = max_vertices;
+ }
+
+ void set_folder(string folder) {
+ folder_ = folder;
+ }
+
+ string folder() const {
+ return folder_;
+ }
+
+ void set_file_name(string file_name) {
+ file_name_base_ = file_name;
+ }
+
+ string file_name() {
+ return file_name_base_;
+ }
+
+ size_t edge_length_bound() const {
+ return edge_length_bound_;
+ }
+
+ FillerClass& filler() {
+ return filler_;
+ }
+
+ omnigraph::GraphLabeler<Graph>& labeler() {
+ return labeler_;
+ }
+
+ ColoringClass& coloring() {
+ return coloring_;
+ }
+
+};
+
+}
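GetFormattedPictureCounter() above pads the picture counter to four digits by counting digits manually. An equivalent standalone sketch using stream manipulators is shown below, purely for illustration; the class itself is unchanged.

#include <iomanip>
#include <iostream>
#include <sstream>
#include <string>

// Zero-pad a counter to four digits, matching GetFormattedPictureCounter()'s output.
std::string formatted_counter(size_t counter) {
    std::ostringstream ss;
    ss << std::setw(4) << std::setfill('0') << counter;
    return ss.str();
}

int main() {
    std::cout << formatted_counter(7) << ' ' << formatted_counter(123) << '\n';  // prints: 0007 0123
    return 0;
}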
diff --git a/src/projects/online_vis/debruijn_online_visualizer.hpp b/src/projects/online_vis/debruijn_online_visualizer.hpp
new file mode 100644
index 0000000..63ff0a7
--- /dev/null
+++ b/src/projects/online_vis/debruijn_online_visualizer.hpp
@@ -0,0 +1,53 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "online_visualizer.hpp"
+#include "debruijn_environment.hpp"
+#include "debruijn_commands.hpp"
+
+namespace online_visualization {
+
+class DebruijnOnlineVisualizer : public OnlineVisualizer<DebruijnEnvironment> {
+ protected:
+ void AddSpecificCommands() {
+ AddCommand(make_shared<LoadGenomeCommand>());
+ AddCommand(make_shared<SetMaxVertCommand>());
+ AddCommand(make_shared<SetFolderCommand>());
+ AddCommand(make_shared<SetFileNameCommand>());
+
+ AddCommand(make_shared<FillPositionCommand>());
+ AddCommand(make_shared<ClearPositionCommand>());
+
+ AddCommand(make_shared<DrawVertexCommand>());
+ AddCommand(make_shared<DrawEdgeCommand>());
+ AddCommand(make_shared<DrawPositionCommand>());
+ AddCommand(make_shared<DrawPartOfGenomeCommand>());
+ AddCommand(make_shared<DrawContigCommand>());
+ AddCommand(make_shared<DrawContigsCommand>());
+ AddCommand(make_shared<DrawPolymorphicRegions>());
+ AddCommand(make_shared<DrawPoorlyAssembledCommand>());
+ AddCommand(make_shared<DrawUnresolvedWRTAssemblyCommand>());
+ AddCommand(make_shared<DrawUnresolvedWRTReferenceCommand>());
+ AddCommand(make_shared<DrawConnectedCommand>());
+ AddCommand(make_shared<ShowPositionCommand>());
+ AddCommand(make_shared<DrawMisassemblies>());
+
+ AddCommand(make_shared<PrintPathsCommand>());
+ AddCommand(make_shared<PrintContigsStatsCommand>());
+ AddCommand(make_shared<JunctionSequenceCommand>());
+ AddCommand(make_shared<PrintEdgeCommand>());
+ AddCommand(make_shared<ClipTipsCommand>());
+ }
+
+ public:
+ DebruijnOnlineVisualizer() : OnlineVisualizer<DebruijnEnvironment>() {
+ }
+};
+
+}
diff --git a/src/projects/online_vis/drawing_commands.hpp b/src/projects/online_vis/drawing_commands.hpp
new file mode 100644
index 0000000..bc58f2b
--- /dev/null
+++ b/src/projects/online_vis/drawing_commands.hpp
@@ -0,0 +1,113 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "drawing_commands/drawing_command.hpp"
+#include "drawing_commands/draw_position_command.hpp"
+#include "drawing_commands/show_position_command.hpp"
+#include "drawing_commands/draw_part_of_genome_command.hpp"
+#include "drawing_commands/draw_contig_command.hpp"
+#include "drawing_commands/draw_poorly_assembled.hpp"
+#include "drawing_commands/draw_connected_command.hpp"
+#include "drawing_commands/draw_missasemblies.hpp"
+#include "environment.hpp"
+#include "command.hpp"
+#include "errors.hpp"
+#include "argument_list.hpp"
+#include "drawing_commands/draw_polymorphic_regions.hpp"
+
+namespace online_visualization {
+
+ class DrawVertexCommand : public DrawingCommand {
+ protected:
+ size_t MinArgNumber() const {
+ return 1;
+ }
+
+ bool CheckCorrectness(const vector<string>& args) const {
+ if (!CheckEnoughArguments(args))
+ return false;
+ if (!CheckIsNumber(args[1]))
+ return false;
+
+ return true;
+ }
+
+ public:
+ string Usage() const {
+ string answer;
+ answer = answer + "Command `draw_vertex` \n" +
+ "Usage:\n" +
+ "> draw_vertex <vertex_id>\n" +
+ " This command prints pictures for a neigbourhood of a vertex in the DB graph.\n" +
+ " You should specify an id of the vertex in the DB graph, which neighbourhood you want to look at.";
+ return answer;
+ }
+
+ DrawVertexCommand() : DrawingCommand("draw_vertex")
+ {
+ }
+
+ void Execute(DebruijnEnvironment& curr_env, const ArgumentList& arg_list) const {
+ const vector<string>& args = arg_list.GetAllArguments();
+
+ if (!CheckCorrectness(args))
+ return;
+ size_t vertex_id = GetInt(args[1]);
+ if (CheckVertexExists(curr_env.finder(), vertex_id))
+ DrawVertex(curr_env, vertex_id, args[1]);
+ }
+ };
+
+ class DrawEdgeCommand : public DrawingCommand {
+ protected:
+ size_t MinArgNumber() const {
+ return 1;
+ }
+
+ bool CheckCorrectness(const vector<string>& args) const {
+ return CheckEnoughArguments(args);
+ }
+
+ void DrawEdge(DebruijnEnvironment& curr_env, EdgeId edge, string label = "") const {
+ DrawingCommand::DrawPicture(curr_env, curr_env.graph().EdgeStart(edge), label);
+ }
+
+ void DrawEdge(DebruijnEnvironment& curr_env, size_t edge_id, string label = "") const {
+ DrawEdge(curr_env, curr_env.finder().ReturnEdgeId(edge_id), label);
+ }
+
+ public:
+ string Usage() const {
+ string answer;
+ answer = answer + "Command `draw_edge` \n" +
+ "Usage:\n" +
+ "> draw_edge <edge_id>\n" +
+ " This command prints pictures for a neigbourhood of an edge in the DB graph.\n" +
+ " You should specify an id of the edge in the DB graph, which location you want to look at.";
+ return answer;
+ }
+
+ DrawEdgeCommand() : DrawingCommand("draw_edge")
+ {
+ }
+
+ void Execute(DebruijnEnvironment& curr_env, const ArgumentList& arg_list) const {
+ const vector<string>& args = arg_list.GetAllArguments();
+
+ if (!CheckCorrectness(args))
+ return;
+
+ size_t edge_id = GetInt(args[1]);
+ if (CheckEdgeExists(curr_env.finder(), edge_id)) {
+ DrawEdge(curr_env, edge_id, args[1]);
+ }
+ }
+ };
+}
+
diff --git a/src/projects/online_vis/drawing_commands/draw_connected_command.hpp b/src/projects/online_vis/drawing_commands/draw_connected_command.hpp
new file mode 100644
index 0000000..8711834
--- /dev/null
+++ b/src/projects/online_vis/drawing_commands/draw_connected_command.hpp
@@ -0,0 +1,62 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "../environment.hpp"
+#include "../command.hpp"
+#include "../errors.hpp"
+#include "../argument_list.hpp"
+
+#include "drawing_command.hpp"
+
+namespace online_visualization {
+ class DrawConnectedCommand : public DrawingCommand {
+
+
+ protected:
+ size_t MinArgNumber() const {
+ return 0;
+ }
+
+ bool CheckCorrectness(const vector<string>& args) const {
+ if (!CheckEnoughArguments(args))
+ return false;
+ if (args.size() > 1 && !CheckIsNumber(args[1]))
+ return false;
+ if (args.size() > 2 && !CheckIsNumber(args[2]))
+ return false;
+ return true;
+ }
+
+ public:
+ string Usage() const {
+ string answer;
+ answer = answer + "Command `draw_connected` \n" +
+ "Usage: draw_connected\n" +
+ "or draw_connected min_component_size \n" +
+ "or draw_connected min_component_size max_component_size\n" +
+ "Takes no arguments, draw connected components with given size of the graph";
+ return answer;
+ }
+
+ DrawConnectedCommand() : DrawingCommand("draw_connected")
+ {
+ }
+
+ void Execute(DebruijnEnvironment& curr_env, const ArgumentList& arg_list) const {
+ const vector<string>& args = arg_list.GetAllArguments();
+ if (!CheckCorrectness(args))
+ return;
+ int min_size = 1;
+ int max_size = 1000000000;
+ if (args.size() > 1) min_size = GetInt(args[1]);
+ if (args.size() > 2) max_size = GetInt(args[2]);
+ DrawConnectedComponents(curr_env, min_size, max_size);
+ }
+ };
+}
diff --git a/src/projects/online_vis/drawing_commands/draw_contig_command.hpp b/src/projects/online_vis/drawing_commands/draw_contig_command.hpp
new file mode 100644
index 0000000..c8baba1
--- /dev/null
+++ b/src/projects/online_vis/drawing_commands/draw_contig_command.hpp
@@ -0,0 +1,126 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "../environment.hpp"
+#include "../command.hpp"
+#include "../errors.hpp"
+#include "io/reads_io/wrapper_collection.hpp"
+
+namespace online_visualization {
+class DrawContigCommand : public DrawingCommand {
+
+protected:
+ size_t MinArgNumber() const {
+ return 2;
+ }
+
+ bool CheckCorrectness(const vector<string>& args) const {
+ if (!CheckEnoughArguments(args))
+ return false;
+
+ return true;
+ }
+
+public:
+ string Usage() const {
+ string answer;
+ answer = answer + "Command `draw_contig` \n" +
+ "Usage:\n" +
+ "> draw_contig <name_of_contig> <contigs_file>\n" +
+ " Draws graph pictures for a contig.";
+ return answer;
+ }
+
+ DrawContigCommand() : DrawingCommand("draw_contig")
+ {
+ }
+
+ void Execute(DebruijnEnvironment& curr_env, const ArgumentList& arg_list) const {
+ const vector<string>& args = arg_list.GetAllArguments();
+ if (!CheckCorrectness(args))
+ return;
+
+ string contig_name = args[1];
+ LOG("Trying to draw contig " << contig_name);
+
+ bool starts_with = false;
+ if (contig_name[contig_name.size() - 1] == '*') {
+ starts_with = true;
+ contig_name = contig_name.substr(0, contig_name.size() - 1);
+ }
+ string contigs_file = args[2];
+ if (!CheckFileExists(contigs_file))
+ return;
+
+ auto reader = make_shared<io::FixingWrapper>(make_shared<io::FileReadStream>(contigs_file));
+
+ while (!reader->eof()) {
+ io::SingleRead read;
+ (*reader) >> read;
+ //LOG("Contig " << read.name() << " is being processed now");
+
+            // Match if <contig_name> ended with '*' and occurs as a substring of the read name,
+            // or if it equals the read name exactly.
+ if((starts_with && read.name().find(contig_name) != string::npos) || contig_name == read.name()) {
+ DrawPicturesAlongContig(curr_env, read);
+ }
+ }
+ }
+};
+
+class DrawContigsCommand : public DrawingCommand {
+
+protected:
+ size_t MinArgNumber() const {
+ return 1;
+ }
+
+ bool CheckCorrectness(const vector<string>& args) const {
+ if (!CheckEnoughArguments(args))
+ return false;
+
+ return true;
+ }
+
+public:
+ string Usage() const {
+ string answer;
+ answer = answer + "Command `draw_contigs` \n" +
+ "Usage:\n" +
+ "> draw_contigs <contigs_file>\n" +
+ " Draws graph pictures for contigs.";
+ return answer;
+ }
+
+ DrawContigsCommand() : DrawingCommand("draw_contigs")
+ {
+ }
+
+ void Execute(DebruijnEnvironment& curr_env, const ArgumentList& arg_list) const {
+ const vector<string>& args = arg_list.GetAllArguments();
+ if (!CheckCorrectness(args))
+ return;
+
+ string contigs_file = args[1];
+
+ LOG("Drawing contigs from " << contigs_file);
+ if (!CheckFileExists(contigs_file))
+ return;
+
+ auto reader = make_shared<io::FixingWrapper>(make_shared<io::FileReadStream>(contigs_file));
+
+ while (!reader->eof()) {
+ io::SingleRead read;
+ (*reader) >> read;
+ //LOG("Contig " << read.name() << " is being processed now");
+
+ DrawPicturesAlongContig(curr_env, read);
+ }
+ }
+};
+}
diff --git a/src/projects/online_vis/drawing_commands/draw_missasemblies.hpp b/src/projects/online_vis/drawing_commands/draw_missasemblies.hpp
new file mode 100644
index 0000000..f123b87
--- /dev/null
+++ b/src/projects/online_vis/drawing_commands/draw_missasemblies.hpp
@@ -0,0 +1,211 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "../environment.hpp"
+#include "../command.hpp"
+#include "../errors.hpp"
+#include "io/reads_io/wrapper_collection.hpp"
+
+namespace online_visualization {
+class DrawMisassemblies : public DrawingCommand {
+
+protected:
+ size_t MinArgNumber() const {
+ return 1;
+ }
+
+ bool CheckCorrectness(const vector<string>& args) const {
+ if (!CheckEnoughArguments(args))
+ return false;
+ if(!CheckFileExists(args[1]))
+ return false;
+ return true;
+ }
+
+private:
+
+ vector<EdgeId> FilterByLength(Graph& g, const vector<EdgeId>& edges) const {
+ vector<EdgeId> filtered_edges;
+ for(auto e : edges) {
+ if(g.length(e) > 500) {
+ filtered_edges.push_back(e);
+ }
+ }
+ return filtered_edges;
+ }
+
+ vector<EdgeId> FilterNonUnique(Graph& g, const vector<EdgeId>& edges, const vector<EdgeId>& genome_edges) const {
+ vector<EdgeId> filtered_edges;
+ std::set<EdgeId> set_edges;
+ std::set<EdgeId> non_unique;
+ std::set<EdgeId> genome_set;
+ std::set<EdgeId> non_unique_genome;
+
+
+ for(auto e : edges) {
+ if(set_edges.find(e) != set_edges.end()) {
+ non_unique.insert(e);
+ }
+ set_edges.insert(e);
+ }
+
+ for(auto e : genome_edges) {
+ if(genome_set.find(e) != genome_set.end()) {
+ non_unique_genome.insert(e);
+ }
+ genome_set.insert(e);
+ }
+
+
+
+ for(auto e : edges) {
+ if(non_unique.find(e) == non_unique.end() && non_unique_genome.find(e) == non_unique_genome.end()) {
+ filtered_edges.push_back(e);
+ INFO("Put " << g.int_id(e) << " into filtered set");
+ }
+ }
+ return filtered_edges;
+ }
+
+
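+ // Walks the unique edges of a contig along the reference and draws pictures where the contig and reference diverge:
+ // either an edge is absent from the reference path, or the contig/reference offset difference jumps by more than allowed_error.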
+ void ProcessContig(DebruijnEnvironment& curr_env, MappingPath<EdgeId>& genome_path, MappingPath<EdgeId>& reverse_genome_path, MappingPath<EdgeId>& path, string name = "") const {
+ genome_path.join(reverse_genome_path);
+ vector<EdgeId> genome_edges = curr_env.path_finder().FindReadPath(genome_path);
+ vector<EdgeId> rc_genome_edges = curr_env.path_finder().FindReadPath(reverse_genome_path);
+ vector<EdgeId> rc_and_usual_genome_edges(genome_edges);
+ push_back_all(rc_and_usual_genome_edges, rc_genome_edges);
+ vector<EdgeId> edges = path.simple_path();
+ auto filtered_edges = FilterNonUnique(curr_env.graph(), edges, rc_and_usual_genome_edges);
+ if(filtered_edges.size() < 2)
+ return;
+
+ auto it_genome = find(rc_and_usual_genome_edges.begin(), rc_and_usual_genome_edges.end(), filtered_edges[0]);
+ size_t index_genome = it_genome - rc_and_usual_genome_edges.begin();
+ size_t i = 0;
+
+
+ auto it_contig = find(edges.begin(), edges.end(), filtered_edges[i]);
+ while(it_contig == edges.end()) {
+ ++i;
+ if(i >= filtered_edges.size()) {
+ return;
+ }
+ it_contig = find(edges.begin(), edges.end(), filtered_edges[i]);
+ }
+ size_t index_contig = it_contig - edges.begin();
+ INFO("Now at edge " << curr_env.graph().int_id(filtered_edges[i]));
+ const int allowed_error = 3000;
+ int real_difference = (int)genome_path[index_genome].second.initial_range.start_pos - (int)path[index_contig].second.initial_range.start_pos;
+ INFO("Diff is set to " << real_difference);
+
+
+ while(i < filtered_edges.size()) {
+ INFO("Now at edge " << curr_env.graph().int_id(filtered_edges[i]));
+ it_genome = find(rc_and_usual_genome_edges.begin(), rc_and_usual_genome_edges.end(), filtered_edges[i]);
+ it_contig = find(edges.begin(), edges.end(), filtered_edges[i]);
+
+ size_t index_genome = it_genome - rc_and_usual_genome_edges.begin();
+ size_t index_contig = it_contig - edges.begin();
+
+ if(it_genome == rc_and_usual_genome_edges.end()) {
+ vector<EdgeId> path_to_draw;
+
+ while(it_genome == rc_and_usual_genome_edges.end()) {
+ ++i;
+ if(i == filtered_edges.size())
+ {
+ break;
+ }
+ it_genome = find(rc_and_usual_genome_edges.begin(), rc_and_usual_genome_edges.end(), filtered_edges[i]);
+ }
+
+ auto new_it_contig = find(edges.begin(), edges.end(), filtered_edges[i]);
+ size_t new_index_contig = new_it_contig - edges.begin();
+
+ for(size_t z = index_contig; z <= new_index_contig ; ++z) {
+ path_to_draw.push_back(edges[z]);
+ }
+
+
+ DrawPicturesAlongPath(curr_env, path_to_draw, name + "_" + ToString(curr_env.graph().int_id(filtered_edges[i])));
+ real_difference = (int)genome_path[index_genome].second.initial_range.start_pos - (int)path[index_contig].second.initial_range.start_pos;
+ INFO("Diff is set to " << real_difference);
+ continue;
+ }
+
+ int difference = (int)genome_path[index_genome].second.initial_range.start_pos - (int)path[index_contig].second.initial_range.start_pos;
+ if(abs(difference - real_difference) > allowed_error) {
+ real_difference = (int)genome_path[index_genome].second.initial_range.start_pos - (int)path[index_contig].second.initial_range.start_pos;
+ vector<EdgeId> path_to_draw;
+ path_to_draw.push_back(genome_path[index_genome].first);
+ DrawPicturesAlongPath(curr_env, path_to_draw, name + "_" + ToString(curr_env.graph().int_id(filtered_edges[i])));
+ INFO("Diff is set to " << real_difference);
+ }
+ ++i;
+
+ }
+ }
+
+public:
+ DrawMisassemblies() : DrawingCommand("draw_misassemblies") {
+
+ }
+
+ string Usage() const {
+ string answer;
+ answer = answer + "Command `draw_misassemblies` \n" +
+ "Usage:\n" +
+ "> draw_misassemblies <file with missasembled quast contigs>\n" +
+ "Reference genome should be loaded to use this command.\n" +
+ "This command tries to draw exact places of misassembles.";
+ return answer;
+ }
+
+ void Execute(DebruijnEnvironment& curr_env, const ArgumentList& arg_list) const {
+ const vector<string>& args = arg_list.GetAllArguments();
+ if (!CheckCorrectness(args)) {
+ return;
+ }
+
+ if(curr_env.genome() == Sequence()) {
+ cout << "Reference should be loaded. Command will not be executed" << endl;
+ return;
+ }
+
+ string file = args[1];
+ auto reader = make_shared<io::FixingWrapper>(make_shared<io::FileReadStream>(file));
+ FillerClass& filler = curr_env.filler();
+ while (!reader->eof()) {
+ io::SingleRead read;
+ (*reader) >> read;
+ Sequence contig = read.sequence();
+ filler.Process(contig, "miss_" + read.name());
+ filler.Process(!contig, "miss_" + read.name() + "_RC");
+ }
+ reader->close();
+ cout << "All contigs are mapped" << endl;
+ reader = make_shared<io::FixingWrapper>(make_shared<io::FileReadStream>(file));
+
+ auto genome_mapping_path = curr_env.mapper().MapSequence(curr_env.genome());
+ auto rc_genome_mapping_path = curr_env.mapper().MapSequence(!curr_env.genome());
+
+ cout << "Genome is mapped" << endl;
+
+ while(!reader->eof()) {
+ io::SingleRead read;
+ (*reader) >> read;
+ Sequence contig = read.sequence();
+ cout << "Read " << read.name() << " is processed." << endl;
+
+ auto mapping_path = curr_env.mapper().MapSequence(contig);
+ ProcessContig(curr_env, genome_mapping_path, rc_genome_mapping_path, mapping_path, read.name());
+ }
+ }
+
+};
+}
diff --git a/src/projects/online_vis/drawing_commands/draw_part_of_genome_command.hpp b/src/projects/online_vis/drawing_commands/draw_part_of_genome_command.hpp
new file mode 100644
index 0000000..1529561
--- /dev/null
+++ b/src/projects/online_vis/drawing_commands/draw_part_of_genome_command.hpp
@@ -0,0 +1,115 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "paired_info/distance_estimation.hpp"
+#include "../environment.hpp"
+#include "../command.hpp"
+#include "../errors.hpp"
+#include "../argument_list.hpp"
+
+#include "drawing_command.hpp"
+
+namespace online_visualization {
+ class DrawPartOfGenomeCommand : public DrawingCommand {
+ private:
+ void CheckPathIntegrity(const omnigraph::de::GraphDistanceFinder<Graph>& dist_finder, EdgeId first_edge, EdgeId second_edge) const {
+ vector<size_t> distances = dist_finder.GetGraphDistancesLengths(first_edge, second_edge);
+ // Assumes GetGraphDistancesLengths may return an empty vector when no path exists.
+ if (distances.empty()) {
+ INFO("Edges " << first_edge << " and " << second_edge << " are not connected");
+ } else if (distances[0] == 0) {
+ INFO("Edges " << first_edge << " and " << second_edge << " are neighbouring");
+ } else
+ INFO("Edges " << first_edge << " and " << second_edge << " are at distance of " << distances[0]);
+ }
+
+ private:
+ void DrawPicturesAlongGenomePart(DebruijnEnvironment& curr_env, const Sequence& piece_of_genome, string label = "") const {
+ const MappingPath<EdgeId>& mapping_path = curr_env.mapper().MapSequence(piece_of_genome);
+ DrawingCommand::DrawPicturesAlongPath(curr_env, mapping_path.simple_path(), label);
+ }
+
+// void CountStatsAlongGenomePart(DebruijnEnvironment& curr_env, Sequence& piece_of_genome,
+// const io::SequencingLibrary<debruijn_graph::debruijn_config::DataSetData> &lib) const {
+//
+// omnigraph::de::GraphDistanceFinder<Graph> dist_finder(curr_env.graph(), size_t(lib.data().mean_insert_size), lib.data().read_length,
+// size_t(lib.data().insert_size_deviation));
+// cout << "Statistics for the part of genome :" << endl;
+// const MappingPath<EdgeId>& mapping_path = curr_env.mapper().MapSequence(piece_of_genome);
+// for (size_t i = 0; i < mapping_path.size(); ++i) {
+// cout << "Edge # " << i << endl;
+// //const pair<EdgeId, MappingRange>& mapping_edge = mapping_path[i];
+//
+// if (i > 0) {
+// INFO("Checking connection between neighbouring edges");
+// CheckPathIntegrity(dist_finder, mapping_path[i - 1].first, mapping_path[i].first);
+// }
+// }
+// }
+
+ protected:
+ size_t MinArgNumber() const {
+ return 2;
+ }
+
+ bool CheckCorrectness(const vector<string>& args) const {
+ if (!CheckEnoughArguments(args))
+ return false;
+ if (!CheckIsNumber(args[1]))
+ return false;
+ if (!CheckIsNumber(args[2]))
+ return false;
+
+ return true;
+ }
+
+ public:
+ string Usage() const {
+ string answer;
+ answer = answer + "Command `draw_part_of_genome` \n" +
+ " Usage:\n" +
+ "> genome <first_pos> <second_pos> [--rc] [-r]\n" +
+ " Prints a .dot picture of a substring [first_pos, second_pos] of the genome.\n" +
+ " Optionally you can use a flag -r, whether you want the tool to invert the positions,\n" +
+ " and an option --rc, if you would like to see the pictures of the second strand.";
+ return answer;
+ }
+
+ DrawPartOfGenomeCommand() : DrawingCommand("draw_part_of_genome")
+ {
+ }
+
+ void Execute(DebruijnEnvironment& curr_env, const ArgumentList& arg_list) const {
+ const vector<string>& args = arg_list.GetAllArguments();
+ if (!CheckCorrectness(args))
+ return;
+
+ size_t first_position = (size_t) GetInt(args[1]);
+ size_t second_position = (size_t) GetInt(args[2]);
+ Sequence genome = curr_env.genome();
+ if (arg_list["rc"] == "true") {
+ cout << "Inverting genome..." << endl;
+ genome = !genome;
+ }
+
+ //experimental
+ if (arg_list.contains("r")) {
+ cout << "Inverting positions..." << endl;
+ first_position = genome.size() - curr_env.k_value() - 1 - first_position;
+ second_position = genome.size() - curr_env.k_value() - 1 - second_position;
+ }
+
+ if (CheckPositionBounds(first_position, genome.size(), curr_env.k_value()) &&
+ CheckPositionBounds(second_position, genome.size(), curr_env.k_value()))
+ {
+ const Sequence& part_of_genome = genome.Subseq(first_position, second_position);
+ string label = args[1] + "_" + args[2];
+ DrawPicturesAlongGenomePart(curr_env, part_of_genome, label);
+ }
+
+ }
+ };
+}
diff --git a/src/projects/online_vis/drawing_commands/draw_polymorphic_regions.hpp b/src/projects/online_vis/drawing_commands/draw_polymorphic_regions.hpp
new file mode 100644
index 0000000..68ae311
--- /dev/null
+++ b/src/projects/online_vis/drawing_commands/draw_polymorphic_regions.hpp
@@ -0,0 +1,139 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "../environment.hpp"
+#include "../command.hpp"
+#include "../errors.hpp"
+#include "io/reads_io/wrapper_collection.hpp"
+
+namespace online_visualization {
+
+class DrawPolymorphicRegions : public DrawingCommand {
+
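+ // Collects all graph paths of length up to 4 * windowSize between the end of firstEdge and the start of secondEdge
+ // and returns the component induced by the end vertices of their edges.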
+ GraphComponent<Graph> ConstructComponent(DebruijnEnvironment& curr_env, EdgeId firstEdge, EdgeId secondEdge, size_t windowSize) const
+ {
+ PathStorageCallback<Graph> callback(curr_env.graph());
+ ProcessPaths(curr_env.graph(), 0, windowSize*4,
+ curr_env.graph().EdgeEnd(firstEdge), curr_env.graph().EdgeStart(secondEdge),
+ callback);
+ vector<vector<EdgeId>> paths = callback.paths();
+ vector<VertexId> verticesToAdd;
+ verticesToAdd.push_back(curr_env.graph().EdgeEnd(firstEdge));
+ for(auto edges : paths)
+ {
+ for(auto edge : edges)
+ {
+ verticesToAdd.push_back(curr_env.graph().EdgeEnd(edge));
+ }
+ }
+ GraphComponent<Graph> polymorphicComponent(curr_env.graph(), verticesToAdd.begin(), verticesToAdd.end());
+ return polymorphicComponent;
+ }
+
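+ // Slides a 500 bp window over the genome; when both window ends land on long (>= 300 bp) distinct edges, the component
+ // between them is written to a .dot file if it contains more than 5 edges. Assumes the genome is longer than the window plus k.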
+ void DrawPicture(DebruijnEnvironment& curr_env, Sequence& genome) const {
+ size_t windowSize = 500;
+ for(size_t i = 0; i < genome.size() - windowSize - 1 - curr_env.k_value(); ++i)
+ {
+ runtime_k::RtSeq firstKmer = genome.Subseq(i).start<runtime_k::RtSeq>(curr_env.k_value() + 1);
+ runtime_k::RtSeq secondKmer = genome.Subseq(i + windowSize).start<runtime_k::RtSeq>(curr_env.k_value() + 1);
+ firstKmer = curr_env.kmer_mapper().Substitute(firstKmer);
+ secondKmer = curr_env.kmer_mapper().Substitute(secondKmer);
+ pair<EdgeId, size_t> positionFirst = curr_env.index().get(firstKmer);
+ if(positionFirst.first == EdgeId(0))
+ {
+ continue;
+ }
+
+ if(curr_env.graph().length(positionFirst.first) < 300)
+ {
+ i += curr_env.graph().length(positionFirst.first) - positionFirst.second;
+ continue;
+ }
+ else
+ {
+ pair<EdgeId, size_t> positionSecond = curr_env.index().get(secondKmer);
+ if(positionSecond.first == EdgeId(0))
+ {
+ continue;
+ }
+
+ if(curr_env.graph().length(positionSecond.first) < 300)
+ {
+ i += curr_env.graph().length(positionSecond.first) - positionSecond.second;
+ continue;
+ }
+ else
+ {
+ if(positionFirst.first == positionSecond.first)
+ {
+ i += curr_env.graph().length(positionSecond.first) - positionSecond.second;
+ continue;
+ }
+ INFO("Constructing component around " << i << "-th position in the genome");
+ GraphComponent<Graph> polymorphicRegion = ConstructComponent(curr_env, positionFirst.first, positionSecond.first, windowSize);
+
+ if(polymorphicRegion.e_size() > 5)
+ {
+ visualization::WriteComponentSinksSources(polymorphicRegion, curr_env.folder() + "/" + ToString(curr_env.graph().int_id(*polymorphicRegion.vertices().begin())) + ".dot", visualization::DefaultColorer(curr_env.graph()),
+ curr_env.labeler());
+
+ INFO("Component is written to " + curr_env.folder() + ToString(curr_env.graph().int_id(*polymorphicRegion.vertices().begin())) + ".dot");
+ }
+
+ i += curr_env.graph().length(positionSecond.first) - positionSecond.second;
+ continue;
+ }
+ }
+
+ }
+ }
+
+
+protected:
+ size_t MinArgNumber() const {
+ return 0;
+ }
+
+ bool CheckCorrectness(const vector<string>&) const {
+ return true;
+ }
+
+public:
+ string Usage() const {
+ string answer;
+ answer = answer + "Command `draw_polymorphic` \n" +
+ "Usage:\n" +
+ "> draw_polymorphic\n" +
+ " You should run load_genome command before it, to proceed. \n" +
+ "This command draws polymorphic regions between two conserved edges.";
+ return answer;
+ }
+
+ DrawPolymorphicRegions() : DrawingCommand("draw_polymorphic")
+ {
+ }
+
+ void Execute(DebruijnEnvironment& curr_env, const ArgumentList& arg_list) const {
+ const vector<string>& args = arg_list.GetAllArguments();
+ if (!CheckCorrectness(args))
+ return;
+ make_dir(curr_env.folder());
+ Sequence genome = curr_env.genome();
+ if(genome.size() == 0)
+ {
+ cout << "Reference genome is not uploaded\n";
+ return;
+ }
+ DrawPicture(curr_env, genome);
+ INFO("End");
+
+ }
+};
+}
+
diff --git a/src/projects/online_vis/drawing_commands/draw_poorly_assembled.hpp b/src/projects/online_vis/drawing_commands/draw_poorly_assembled.hpp
new file mode 100644
index 0000000..2044e6a
--- /dev/null
+++ b/src/projects/online_vis/drawing_commands/draw_poorly_assembled.hpp
@@ -0,0 +1,617 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "../environment.hpp"
+#include "../command.hpp"
+#include "../errors.hpp"
+#include "io/reads_io/wrapper_collection.hpp"
+#include <boost/algorithm/string.hpp>
+#include "assembly_graph/graph_core/basic_graph_stats.hpp"
+
+#include <boost/algorithm/string/predicate.hpp>
+
+namespace online_visualization {
+
+//class RepeatProcessor {
+// const Graph& g_;
+//
+//protected:
+// const Graph& g() const {
+// return g_;
+// }
+//
+//public:
+// virtual void ProcessResolved(EdgeId e1, EdgeId e2, size_t gap_length, const vector<EdgeId> repeat_edges) {
+// }
+//
+// virtual void ProcessUnresolved(EdgeId e1, EdgeId e2, size_t gap_length, const vector<EdgeId> repeat_edges) {
+// }
+//
+// virtual ~RepeatProcessor() {
+// }
+//};
+//
+//class CompositeRepeatProcessor : public RepeatProcessor {
+// vector<shared_ptr<RepeatProcessor>> processors_;
+//
+//public:
+// virtual void ProcessResolved(EdgeId e1, EdgeId e2, size_t gap_length, const vector<EdgeId> repeat_edges) {
+// for (auto p : processors_) {
+// p->ProcessResolved(e1, e2, gap_length, repeat_edges);
+// }
+// }
+//
+// virtual void ProcessUnresolved(EdgeId e1, EdgeId e2, size_t gap_length, const vector<EdgeId> repeat_edges) {
+// for (auto p : processors_) {
+// p->ProcessUnresolved(e1, e2, gap_length, repeat_edges);
+// }
+// }
+//
+//};
+
+struct RepeatInfo {
+ EdgeId e1;
+ EdgeId e2;
+ size_t genomic_gap;
+ vector<EdgeId> ref_path;
+ string seq_name;
+ //number of repeat in seq
+ size_t local_cnt;
+
+ RepeatInfo(EdgeId e1_,
+ EdgeId e2_,
+ size_t genomic_gap_,
+ const vector<EdgeId>& ref_path_,
+ string seq_name_,
+ size_t local_cnt_) :
+ e1(e1_), e2(e2_),
+ genomic_gap(genomic_gap_), ref_path(ref_path_),
+ seq_name(seq_name_), local_cnt(local_cnt_) {
+
+ }
+};
+
+class RepeatProcessor {
+public:
+ virtual ~RepeatProcessor() {
+ }
+
+ virtual void ProcessUnresolved(DebruijnEnvironment&, const RepeatInfo&) const {}
+
+ virtual void ProcessResolved(DebruijnEnvironment&, const RepeatInfo&) const {}
+};
+
+class StructuredFileLogger : public RepeatProcessor {
+
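+ // Formats a record as: genomic gap, number of edges in the reference path, its cumulative length, and the ids of the flanking edges.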
+ string Log(const GraphPack& gp, const RepeatInfo& repeat_info) const {
+ return fmt::format("{:d} {:d} {:d} {:d} {:d}", repeat_info.genomic_gap, repeat_info.ref_path.size(),
+ CumulativeLength(gp.g, repeat_info.ref_path),
+ gp.g.int_id(repeat_info.e1), gp.g.int_id(repeat_info.e2));
+ }
+
+public:
+ virtual void ProcessUnresolved(DebruijnEnvironment& curr_env, const RepeatInfo& repeat_info) const {
+ cerr << "RI: 0 " << Log(curr_env.graph_pack(), repeat_info) << endl;
+
+ }
+
+ virtual void ProcessResolved(DebruijnEnvironment& curr_env, const RepeatInfo& repeat_info) const {
+ cerr << "RI: 1 " << Log(curr_env.graph_pack(), repeat_info) << endl;
+ }
+};
+
+class ReadableUnresolvedLogger : public RepeatProcessor {
+
+public:
+ virtual void ProcessUnresolved(DebruijnEnvironment& curr_env, const RepeatInfo& repeat_info) const {
+ LOG(fmt::format("Genomic gap: {:d}, number of edges: {:d}, edge 1: {:s}, edge 2 {:s}", repeat_info.genomic_gap,
+ repeat_info.ref_path.size(), curr_env.graph().str(repeat_info.e1),
+ curr_env.graph().str(repeat_info.e2)));
+ }
+
+};
+
+class UnresolvedPrinter : public RepeatProcessor {
+
+ void DrawGap(DebruijnEnvironment& curr_env, const vector<EdgeId>& path, string filename, string /*label*/ = "") const {
+ omnigraph::visualization::WriteComponentsAlongPath<Graph>(curr_env.graph(), path, filename, curr_env.coloring(), curr_env.labeler());
+ LOG("The pictures is written to " << filename);
+ }
+
+public:
+
+ virtual void ProcessUnresolved(DebruijnEnvironment& curr_env, const RepeatInfo& repeat_info) const {
+ make_dir(curr_env.folder());
+ string pics_folder = curr_env.folder() + "/" + curr_env.GetFormattedPictureCounter() + "_" + repeat_info.seq_name + "/";
+ make_dir(pics_folder);
+ string pic_name = ToString(repeat_info.local_cnt) + "_" + ToString(repeat_info.genomic_gap) +
+ "_" + ToString(curr_env.graph().int_id(repeat_info.e1)) + "_" + ToString(curr_env.graph().int_id(repeat_info.e2)) + "_";
+
+ DrawGap(curr_env, repeat_info.ref_path, pics_folder + pic_name);
+ }
+
+};
+
+class PairedInfoChecker : public RepeatProcessor {
+
+ bool CheckInfo(const omnigraph::de::PairedInfoIndexT<Graph>& clustered_pi_idx, EdgeId e1, EdgeId e2) const {
+ //return !clustered_pi_idx.Get(e1, e2).empty();
+ //We don't store empty histograms, do we?
+ return clustered_pi_idx.contains(e1, e2);
+ }
+
+public:
+
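+ // Reports edges of the reference path (and the flanking edges) that have no clustered paired info with e1.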
+ virtual void ProcessUnresolved(DebruijnEnvironment& curr_env, const RepeatInfo& repeat_info) const {
+ const omnigraph::de::PairedInfoIndexT<Graph>& clustered_pi_idx = curr_env.graph_pack().clustered_indices[0];
+ const Graph& g = curr_env.graph();
+ vector<EdgeId> edges;
+ edges.push_back(repeat_info.e1);
+ push_back_all(edges, repeat_info.ref_path);
+ edges.push_back(repeat_info.e2);
+ for (EdgeId e : edges) {
+ if (!CheckInfo(clustered_pi_idx, repeat_info.e1, e)) {
+ cerr << "NO_PI: " << g.int_id(repeat_info.e1) <<
+ " " << g.int_id(repeat_info.e2) <<
+ " " << g.int_id(e) << endl;
+ }
+ }
+ }
+
+};
+
+class DrawUnresolvedRepeatsCommand : public DrawingCommand {
+private:
+ const string ref_prefix_;
+ const size_t gap_diff_threshold_;
+ const double good_mapping_coeff_;
+ vector<shared_ptr<RepeatProcessor>> processors_;
+
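+ // Returns the positions of edge e whose contig id starts with the given prefix.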
+ vector<EdgePosition> GatherPositions(const GraphPack& gp, EdgeId e, const string& prefix) const {
+ vector<EdgePosition> answer;
+ for (EdgePosition pos : gp.edge_pos.GetEdgePositions(e)) {
+ if (boost::starts_with(pos.contigId, prefix)) {
+ answer.push_back(pos);
+ }
+ }
+ return answer;
+ }
+
+// bool IsNext(MappingRange mr1, MappingRange mr2, size_t max_gap = 5) const {
+// return mr2.initial_range.start_pos >= mr1.initial_range.end_pos && mr2.initial_range.start_pos <= mr1.initial_range.end_pos + max_gap;
+// }
+//
+// shared_ptr<MappingRange> FindNextRange(MappingRange curr_range, const set<MappingRange>& ranges) const {
+// cout << "Looking for next range for " << curr_range << endl;
+// for (auto r : ranges) {
+// cout << "Considering range " << r << endl;
+// if (IsNext(curr_range, r)) {
+// cout << "Found next" << endl;
+// return make_shared<MappingRange>(r);
+// }
+// }
+// cout << "Couldn't find suitable range" << endl;
+// return shared_ptr<MappingRange>(0);
+// }
+//
+// shared_ptr<pair<EdgeId, EdgePosition>> NextEdge(const GraphPack& gp, VertexId v, EdgePosition curr_pos) const {
+// for (EdgeId next_e : gp.g.OutgoingEdges(v)) {
+// cout << "Considering " << gp.g.str(next_e) << " as next edge " << endl;
+// set<MappingRange> relevant_ranges = gp.edge_pos.GetEdgePositions(next_e, curr_pos.contigId);
+// auto next_range = FindNextRange(curr_pos.mr, relevant_ranges);
+// cout << "Considered " << relevant_ranges.size() << " relevant ranges" << endl;
+// if (next_range) {
+// cout << "Found next edge" << endl;
+// return make_shared<pair<EdgeId, EdgePosition>>(next_e, EdgePosition(curr_pos.contigId, *next_range));
+// }
+// }
+// cout << "Couldn't find next edge" << endl;
+// return shared_ptr<pair<EdgeId, EdgePosition>>(0);
+// }
+//
+// vector<EdgeId> FindReferencePath(const GraphPack& gp, EdgeId e1, EdgeId e2) const {
+// EdgePosition curr_pos = GatherPositions(gp, e1, ref_prefix_).front();
+// VertexId curr_v = gp.g.EdgeEnd(e1);
+// vector<EdgeId> answer;
+// answer.push_back(e1);
+// // for (size_t i = 0 ; i < 1000 ; ++i) {
+// while(true) {
+// auto next_info = NextEdge(gp, curr_v, curr_pos);
+// if (next_info) {
+// EdgeId next_e = next_info->first;
+// answer.push_back(next_e);
+// if (next_e == e2) {
+// break;
+// }
+// curr_v = gp.g.EdgeEnd(next_e);
+// curr_pos = next_info->second;
+// } else {
+// return vector<EdgeId>();
+// }
+// }
+// return answer;
+// }
+
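+ // Extracts the reference fragment between the mapped end of e1 and the mapped start of e2 (extended by k) and maps it back to the graph.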
+ MappingPath<EdgeId> FindReferencePath(const GraphPack& gp, EdgeId e1, EdgeId e2) const {
+ auto e1_poss = GatherPositions(gp, e1, ref_prefix_);
+ auto e2_poss = GatherPositions(gp, e2, ref_prefix_);
+ VERIFY(e1_poss.size() == 1 && e2_poss.size() == 1);
+ EdgePosition e1_pos = e1_poss.front();
+ EdgePosition e2_pos = e2_poss.front();
+ VERIFY(e1_pos.contigId == e2_pos.contigId);
+ Sequence ref = (e1_pos.contigId == "ref0") ? gp.genome.GetSequence() : !gp.genome.GetSequence();
+ size_t gap_start = e1_pos.mr.initial_range.end_pos;
+ size_t gap_end = e2_pos.mr.initial_range.start_pos + gp.g.k();
+ VERIFY(gap_end >= gap_start && gap_end <= ref.size());
+ Sequence gap_fragment = ref.Subseq(gap_start, gap_end);
+ return debruijn_graph::MapperInstance(gp)->MapSequence(gap_fragment);
+ }
+
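+ // Returns the reference distance between the end of e1's mapping and the start of e2's mapping;
+ // -1u if the mappings lie on different reference sequences or appear in the wrong order.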
+ size_t GenomicGap(const GraphPack& gp, EdgeId e1, EdgeId e2) const {
+ auto poss1 = GatherPositions(gp, e1, ref_prefix_);
+ auto poss2 = GatherPositions(gp, e2, ref_prefix_);
+ VERIFY_MSG(poss1.size() == 1, "Positions of first edge " << poss1);
+ VERIFY_MSG(poss2.size() == 1, "Positions of second edge " << poss2);
+ if (poss1.front().contigId != poss2.front().contigId) {
+ WARN("Assembly contig stitching edges from different strains");
+ return -1u;
+ }
+
+ MappingRange r1 = poss1.front().mr;
+ MappingRange r2 = poss2.front().mr;
+ if (r2.initial_range.start_pos < r1.initial_range.end_pos) {
+ WARN("Wrong order of edges in the contig");
+ return -1u;
+ }
+
+ return r2.initial_range.start_pos - r1.initial_range.end_pos;
+ }
+
+ set<string> GatherNames(const GraphPack& gp, EdgeId e, const string& prefix) const {
+ set<string> answer;
+ for (auto pos : GatherPositions(gp, e, prefix)) {
+ answer.insert(pos.contigId);
+ }
+ return answer;
+ }
+
+ bool IsOfMultiplicityOne(const GraphPack& gp, EdgeId e) const {
+ return GatherPositions(gp, e, ref_prefix_).size() == 1;
+ }
+
+ bool BelongToSameContig(const GraphPack& gp, EdgeId e1, EdgeId e2, string assembly_prefix) const {
+ auto names1 = GatherNames(gp, e1, assembly_prefix);
+ auto names2 = GatherNames(gp, e2, assembly_prefix);
+ //checking non-empty intersection
+ for (auto name: names1) {
+ if (names2.count(name)) {
+ return true;
+ }
+ }
+ return false;
+ }
+
+ bool CheckMapping(size_t edge_length, size_t mapped_range_length) const {
+ return double(mapped_range_length) > good_mapping_coeff_ * double(edge_length);
+ }
+
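+ // Selects mapped edges that are long enough, have a single reference position, and whose mapped range covers a sufficient fraction of the edge (good_mapping_coeff_).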
+ vector<EdgeId> EdgesOfInterest(const GraphPack& gp, const MappingPath<EdgeId>& mapping_path, size_t length_threshold) const {
+ vector<EdgeId> answer;
+ for (size_t i = 0; i < mapping_path.size(); ++i) {
+ EdgeId e = mapping_path[i].first;
+ if (gp.g.length(e) >= length_threshold
+ && IsOfMultiplicityOne(gp, e)
+ && CheckMapping(gp.g.length(e), mapping_path[i].second.mapped_range.size())) {
+ answer.push_back(e);
+ }
+ }
+ return answer;
+ }
+
+protected:
+
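+ // Maps the contig, takes its long unique well-mapped edges and, for each consecutive pair, compares the genomic gap to the contig gap
+ // and reconstructs the reference path through the gap. Each repeat is reported to the processors as resolved or unresolved depending on
+ // whether both edges belong to the same contig of the base assembly. Returns true if at least one unresolved repeat was found.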
+ bool AnalyzeGaps(DebruijnEnvironment& curr_env,
+ io::SingleRead contig,
+ string base_assembly_prefix,
+ size_t edge_length,
+ size_t max_genomic_gap,
+ size_t max_gap_cnt = -1u) const {
+ GraphPack& gp = curr_env.graph_pack();
+ auto mapper_ptr = debruijn_graph::MapperInstance(gp);
+ MappingPath<EdgeId> mapping_path = mapper_ptr->MapRead(contig);
+ auto pos_handler = gp.edge_pos;
+
+ auto long_unique = EdgesOfInterest(gp, mapping_path, edge_length);
+
+ bool found_smth = false;
+ size_t cnt = 0;
+ for (size_t i = 1; i < long_unique.size(); ++i) {
+ if (max_gap_cnt != -1u && cnt >= max_gap_cnt) {
+ INFO("Number of gaps exceeded " << max_gap_cnt);
+ return found_smth;
+ }
+ EdgeId e1 = long_unique[i-1];
+ EdgeId e2 = long_unique[i];
+
+ size_t contig_gap = mapping_path[i].second.initial_range.start_pos - mapping_path[i-1].second.initial_range.end_pos;
+ size_t genomic_gap = GenomicGap(gp, e1, e2);
+ if (genomic_gap == -1u) {
+ DEBUG("Contig likely misassembled. Unique long regions in wrong order. e1 " <<
+ gp.g.str(e1) << " genome pos : " << GatherPositions(gp, e1, "ref") << " and e2 " << gp.g.str(e2) <<
+ " genome pos : " << GatherPositions(gp, e2, "ref"));
+ continue;
+ }
+ DEBUG("Found genomic gap " << genomic_gap <<
+ " between e1 " << gp.g.str(e1) << " genome pos : " << GatherPositions(gp, e1, "ref") << " and e2 " << gp.g.str(e2)
+ << " genome pos : " << GatherPositions(gp, e2, "ref"));
+
+ DEBUG("Looking for reference path");
+ auto ref_mapping_path = FindReferencePath(gp, e1, e2);
+ if (ref_mapping_path.size() == 0) {
+ DEBUG("Couldn't find ref path between " << gp.g.str(e1) << " and " << gp.g.str(e2));
+ continue;
+ }
+
+ vector<EdgeId> ref_path = curr_env.path_finder().FindReadPath(ref_mapping_path);
+ if (ref_path.empty()) {
+ DEBUG("Couldn't fix ref path");
+ ref_path = ref_mapping_path.simple_path();
+ }
+ DEBUG("Found ref path between " << gp.g.str(e1) << " and " << gp.g.str(e2));
+ DEBUG(ref_path.size() << " edges of cumulative length " << CumulativeLength(gp.g, ref_path));
+
+ if (std::abs(int(genomic_gap) - int(contig_gap)) >= int(gap_diff_threshold_)) {
+ DEBUG("Contig likely misassembled. Genomic gap is " << genomic_gap << " while contig gap was " << contig_gap);
+ continue;
+ }
+
+ if (genomic_gap >= max_genomic_gap) {
+ DEBUG("Genomic gap exceeded max_gap value and will be skipped. Gap " << genomic_gap << " max_gap " << max_genomic_gap);
+ continue;
+ }
+
+ RepeatInfo info(e1, e2, genomic_gap, ref_path, contig.name(), cnt++);
+ if (!BelongToSameContig(gp, e1, e2, base_assembly_prefix)) {
+ DEBUG("Long unique edges not in the same contig of base assembly");
+ for (auto processor : processors_) {
+ processor->ProcessUnresolved(curr_env, info);
+ }
+ found_smth = true;
+ } else {
+ for (auto processor : processors_) {
+ processor->ProcessResolved(curr_env, info);
+ }
+ }
+ }
+ return found_smth;
+ }
+
+ DrawUnresolvedRepeatsCommand(const string& command_name,
+ const vector<shared_ptr<RepeatProcessor>>& processors)
+ : DrawingCommand(command_name), ref_prefix_("ref"), gap_diff_threshold_(1000),
+ good_mapping_coeff_(0.7), processors_(processors) {
+ }
+
+};
+
+class DrawUnresolvedWRTAssemblyCommand : public DrawUnresolvedRepeatsCommand {
+
+protected:
+ size_t MinArgNumber() const {
+ return 3;
+ }
+
+ bool CheckCorrectness(const vector<string>& args) const {
+ if (!CheckEnoughArguments(args))
+ return false;
+
+ return true;
+ }
+
+public:
+ DrawUnresolvedWRTAssemblyCommand() :
+ DrawUnresolvedRepeatsCommand("draw_unresolved_wrt_assembly",
+ vector<shared_ptr<RepeatProcessor>>{make_shared<StructuredFileLogger>()}) {
+ }
+
+ string Usage() const {
+ string answer;
+ answer = answer + "Command `draw_unresolved_wrt_assembly` \n" + "Usage:\n"
+ + "> draw_unresolved_wrt_assembly <contigs_file> <prefix_of_base_assembly> <unique_edge_length> [first N contigs to analyze]\n"
+ + " Draws pictures of unresolved repeats.";
+ return answer;
+ }
+
+ void Execute(DebruijnEnvironment& curr_env, const ArgumentList& arg_list) const {
+ const vector<string>& args = arg_list.GetAllArguments();
+ if (!CheckCorrectness(args))
+ return;
+
+ std::string contigs_file = args[1];
+ string base_assembly_prefix = args[2];
+ size_t edge_length = std::stoll(args[3]);
+
+ if (!CheckFileExists(contigs_file)) {
+ LOG("File with contigs " << contigs_file << " not found");
+ return;
+ }
+
+ size_t contig_cnt = -1u;
+ if (args.size() > 4) {
+ LOG("Will analyze first " << args[4] << " contigs");
+ contig_cnt = std::stoll(args[4]);
+ }
+
+ auto reader = make_shared<io::FixingWrapper>(make_shared<io::FileReadStream>(contigs_file));
+
+ size_t i = 0;
+ while (!reader->eof() && i < contig_cnt) {
+ io::SingleRead contig;
+ (*reader) >> contig;
+ LOG("Considering contig " << contig.name());
+
+ if (AnalyzeGaps(curr_env, contig, base_assembly_prefix,
+ edge_length, numeric_limits<size_t>::max())) {
+ curr_env.inc_pic_counter();
+ }
+ ++i;
+ }
+
+ }
+
+};
+
+class DrawUnresolvedWRTReferenceCommand : public DrawUnresolvedRepeatsCommand {
+
+protected:
+ size_t MinArgNumber() const {
+ return 3;
+ }
+
+ bool CheckCorrectness(const vector<string>& args) const {
+ if (!CheckEnoughArguments(args))
+ return false;
+
+ return true;
+ }
+
+public:
+ DrawUnresolvedWRTReferenceCommand() :
+ DrawUnresolvedRepeatsCommand("draw_unresolved_wrt_reference",
+ vector<shared_ptr<RepeatProcessor>>{make_shared<StructuredFileLogger>(),
+ make_shared<PairedInfoChecker>()}) {
+ }
+
+ string Usage() const {
+ string answer;
+ answer = answer + "Command `draw_unresolved_wrt_reference ` \n" + "Usage:\n"
+ + "> draw_unresolved_wrt_reference <gap_length> <prefix_of_base_assembly> <unique_edge_length> [first N gaps to analyze]\n"
+ + " Draws pictures of unresolved repeats longer then gap_length between unique edges longer than some constant.";
+ return answer;
+ }
+
+ void Execute(DebruijnEnvironment& curr_env, const ArgumentList& arg_list) const {
+ const vector<string>& args = arg_list.GetAllArguments();
+ if (!CheckCorrectness(args))
+ return;
+
+ size_t max_interesting_gap = std::stoll(args[1]);
+ std::string base_assembly_prefix = args[2];
+ size_t edge_length = std::stoll(args[3]);
+
+ if (curr_env.graph_pack().genome.size() == 0) {
+ LOG("Reference genome hasn't been loaded");
+ return;
+ }
+
+ size_t gap_cnt = -1u;
+ if (args.size() > 4) {
+ LOG("Will analyze first " << args[4] << " gaps");
+ gap_cnt = std::stoll(args[4]);
+ }
+
+ io::SingleRead ref_as_read("ref", curr_env.graph_pack().genome.str());
+ AnalyzeGaps(curr_env, ref_as_read, base_assembly_prefix,
+ edge_length, max_interesting_gap, gap_cnt);
+ }
+
+};
+
+class DrawPoorlyAssembledCommand : public DrawingCommand {
+ const double WELL_ASSEMBLED_CONSTANT = 0.7;
+private:
+
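+ // A contig is considered poorly assembled if no single contig of the base assembly covers more than WELL_ASSEMBLED_CONSTANT of its length (by mapped range).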
+ bool IsPoorlyAssembled(const GraphPack& gp, io::SingleRead contig, string base_assembly_prefix) const {
+ MappingPath<EdgeId> mapping_path = debruijn_graph::MapperInstance(gp)->MapRead(contig);
+ auto pos_handler = gp.edge_pos;
+ map<string, size_t> base_ctg_2_len;
+ for (EdgeId e : mapping_path.simple_path()) {
+ auto positions = pos_handler.GetEdgePositions(e);
+ for (EdgePosition pos : positions) {
+ if (boost::starts_with(pos.contigId, base_assembly_prefix)) {
+ base_ctg_2_len[pos.contigId] += pos.mr.mapped_range.size();
+ }
+ }
+ }
+ for (pair<string, size_t> entry : base_ctg_2_len) {
+ if (double(entry.second) > double(contig.size()) * WELL_ASSEMBLED_CONSTANT) {
+ LOG("Contig " << contig.name() <<
+ " was well covered by contig " << entry.first << " of base assembly")
+ return false;
+ }
+ }
+
+ return true;
+ }
+
+protected:
+ size_t MinArgNumber() const {
+ return 2;
+ }
+
+ bool CheckCorrectness(const vector<string>& args) const {
+ if (!CheckEnoughArguments(args))
+ return false;
+
+ return true;
+ }
+
+public:
+ string Usage() const {
+ string answer;
+ answer = answer + "Command `draw_poorly_assembled` \n" + "Usage:\n"
+ + "> draw_poorly_assembled <contigs_file> <prefix_of_base_assembly> [first N contigs to analyze]\n"
+ + " Draws pictures of contigs that are not well covered with any contig in base assembly.";
+ return answer;
+ }
+
+ DrawPoorlyAssembledCommand()
+ : DrawingCommand("draw_poorly_assembled") {
+ }
+
+ void Execute(DebruijnEnvironment& curr_env, const ArgumentList& arg_list) const {
+ const vector<string>& args = arg_list.GetAllArguments();
+ if (!CheckCorrectness(args))
+ return;
+
+ std::string contigs_file = args[1];
+ string base_assembly_prefix = args[2];
+
+ if (!CheckFileExists(contigs_file)) {
+ LOG("File with contigs " << contigs_file << " not found");
+ return;
+ }
+
+ size_t contig_cnt = -1u;
+ if (args.size() > 3) {
+ LOG("Will analyze first " << args[3] << " contigs");
+ contig_cnt = std::stoll(args[3]);
+ }
+
+ auto reader = make_shared<io::FixingWrapper>(make_shared<io::FileReadStream>(contigs_file));
+
+ size_t i = 0;
+ while (!reader->eof() && i < contig_cnt) {
+ io::SingleRead contig;
+ (*reader) >> contig;
+ LOG("Considering contig " << contig.name());
+
+ if (IsPoorlyAssembled(curr_env.graph_pack(), contig, base_assembly_prefix)) {
+ LOG("Was poorly assembled, drawing");
+ DrawPicturesAlongContig(curr_env, contig);
+ } else {
+ LOG("Was well assembled");
+ }
+
+ ++i;
+ }
+
+ }
+
+};
+}
diff --git a/src/projects/online_vis/drawing_commands/draw_position_command.hpp b/src/projects/online_vis/drawing_commands/draw_position_command.hpp
new file mode 100644
index 0000000..51e792b
--- /dev/null
+++ b/src/projects/online_vis/drawing_commands/draw_position_command.hpp
@@ -0,0 +1,80 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "../environment.hpp"
+#include "../command.hpp"
+#include "../errors.hpp"
+#include "../argument_list.hpp"
+
+#include "drawing_command.hpp"
+
+namespace online_visualization {
+ class DrawPositionCommand : public DrawingCommand {
+ private:
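+ // Looks the (substituted) k-mer up in the index and draws the neighbourhood of the closer end vertex of the edge containing it.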
+ void DrawPicture(DebruijnEnvironment& curr_env, runtime_k::RtSeq kmer, string label = "") const {
+ kmer = curr_env.kmer_mapper().Substitute(kmer);
+ if (!curr_env.index().contains(kmer)) {
+ cout << "No corresponding graph location " << endl;
+ return;
+ }
+ pair<EdgeId, size_t> position = curr_env.index().get(kmer);
+ if (position.second * 2 < curr_env.graph().length(position.first))
+ DrawingCommand::DrawPicture(curr_env, curr_env.graph().EdgeStart(position.first), label);
+ else
+ DrawingCommand::DrawPicture(curr_env, curr_env.graph().EdgeEnd(position.first), label);
+ }
+
+ protected:
+ size_t MinArgNumber() const {
+ return 1;
+ }
+
+ bool CheckCorrectness(const vector<string>& args) const {
+ if (!CheckEnoughArguments(args))
+ return false;
+ if (!CheckIsNumber(args[1]))
+ return false;
+
+ return true;
+ }
+
+ public:
+ string Usage() const {
+ string answer;
+ answer = answer + "Command `draw_position` \n" +
+ "Usage:\n" +
+ "> position <position> [--rc] [-r]\n" +
+ " You should specify an integer position in the genome, which location you want to look at. Optionally you can use a flag -r, whether you want the tool to invert the positions,\n" +
+ "and an option --rc, if you would like to see the pictures of the second strand.";
+ return answer;
+ }
+
+ DrawPositionCommand() : DrawingCommand("draw_position")
+ {
+ }
+
+ void Execute(DebruijnEnvironment& curr_env, const ArgumentList& arg_list) const {
+ const vector<string>& args = arg_list.GetAllArguments();
+ if (!CheckCorrectness(args))
+ return;
+
+ int position = GetInt(args[1]);
+ Sequence genome = curr_env.genome();
+ if (arg_list["rc"] == "true") {
+ cout << "Inverting genome...";
+ genome = !genome;
+ }
+
+ if (CheckPositionBounds(position, genome.size(), curr_env.k_value())) {
+ DrawPicture(curr_env, genome.Subseq(position).start<runtime_k::RtSeq>(curr_env.k_value() + 1), args[1]);
+ }
+
+ }
+ };
+}
diff --git a/src/projects/online_vis/drawing_commands/drawing_command.hpp b/src/projects/online_vis/drawing_commands/drawing_command.hpp
new file mode 100644
index 0000000..c393978
--- /dev/null
+++ b/src/projects/online_vis/drawing_commands/drawing_command.hpp
@@ -0,0 +1,100 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "../environment.hpp"
+#include "../command.hpp"
+#include "../errors.hpp"
+#include "../argument_list.hpp"
+#include "assembly_graph/graph_alignment/sequence_mapper.hpp"
+#include "io/reads/single_read.hpp"
+
+namespace online_visualization {
+
+class DrawingCommand : public LocalCommand<DebruijnEnvironment> {
+protected:
+ void DrawPicture(DebruijnEnvironment& curr_env, VertexId vertex, string label = "") const {
+ make_dir(curr_env.folder_);
+
+ stringstream namestream;
+ namestream << curr_env.folder_ << "/" << curr_env.GetFormattedPictureCounter() << "_" << curr_env.file_name_base_ << "_" << label << "_" << ".dot";
+ string file_name = namestream.str();
+ //stringstream linkstream;
+ //linkstream << curr_env.folder_ << "/" << curr_env.file_name_base_ << "_latest.dot";
+ //EdgePosGraphLabeler<Graph> labeler(curr_env.graph(), gp_.edge_pos);
+ omnigraph::GraphComponent<Graph> component = VertexNeighborhood(curr_env.graph(), vertex, curr_env.max_vertices_, curr_env.edge_length_bound_);
+ omnigraph::visualization::WriteComponent<Graph>(component, file_name, curr_env.coloring_, curr_env.labeler());
+ //WriteComponents <Graph> (curr_env.graph(), splitter, linkstream.str(), *DefaultColorer(curr_env.graph(), curr_env.coloring_), curr_env.labeler());
+ LOG("The picture is written to " << file_name);
+
+ curr_env.picture_counter_++;
+ }
+
+ void DrawPicturesAlongPath(DebruijnEnvironment& curr_env, const vector<EdgeId>& path, string label = "") const {
+ make_dir(curr_env.folder_);
+ stringstream namestream;
+ namestream << curr_env.folder_ << "/" << label << "_" << curr_env.file_name_base_ << "_" << curr_env.GetFormattedPictureCounter() << "/";
+ string directory = namestream.str();
+ make_dir(directory);
+ namestream << label << "_";
+ omnigraph::visualization::WriteComponentsAlongPath<Graph>(curr_env.graph(), path, namestream.str(), curr_env.coloring_, curr_env.labeler());
+ LOG("The pictures is written to " << directory);
+
+ curr_env.picture_counter_++;
+ }
+
+ void DrawPicturesAlongSequence(DebruijnEnvironment& curr_env, const Sequence& s, string label = "") const {
+ DrawPicturesAlongPath(curr_env, curr_env.mapper().MapSequence(s).simple_path(), label);
+ }
+
+ void DrawPicturesAlongContig(DebruijnEnvironment& curr_env, io::SingleRead contig) const {
+ Sequence seq = contig.sequence();
+ string label = contig.name();
+ DrawPicturesAlongSequence(curr_env, seq, label);
+ LOG("Contig " << contig.name() << " has been drawn");
+ }
+
+ void DrawConnectedComponents (DebruijnEnvironment& curr_env, int min_size, int max_size, string label = "") const {
+ make_dir(curr_env.folder_);
+ stringstream namestream;
+ namestream << curr_env.folder_ << "/" << curr_env.GetFormattedPictureCounter() << "_" << curr_env.file_name_base_ << "/";
+ make_dir(namestream.str());
+ namestream << label;
+ make_dir(namestream.str());
+ omnigraph::visualization::WriteSizeLimitedComponents<Graph>(curr_env.graph(), namestream.str(), omnigraph::ConnectedSplitter<Graph>(curr_env.graph()), curr_env.coloring_, curr_env.labeler(), min_size, max_size, 10000000);
+ LOG("The pictures is written to " << namestream.str());
+ curr_env.picture_counter_++;
+ }
+
+ //TODO: copy zgrviewer
+ int ShowPicture(DebruijnEnvironment& curr_env, VertexId vertex, string label = "") const {
+ DrawPicture(curr_env, vertex, label);
+ stringstream command_line_string;
+ command_line_string << "gnome-open " << curr_env.folder_ << "/" << curr_env.file_name_base_
+ << "_" << label << "_" << curr_env.GetFormattedPictureCounter()
+ << "_*_.dot & > /dev/null < /dev/null";
+ int result = system(command_line_string.str().c_str());
+
+ return result;
+ }
+
+ void DrawVertex(DebruijnEnvironment& curr_env, size_t vertex_id, string label = "") const {
+ DrawPicture(curr_env, curr_env.finder().ReturnVertexId(vertex_id), label);
+ }
+
+
+public:
+ DrawingCommand(string command_type) : LocalCommand<DebruijnEnvironment>(command_type)
+ {
+ }
+
+ virtual ~DrawingCommand()
+ {
+ }
+};
+}
diff --git a/src/projects/online_vis/drawing_commands/show_position_command.hpp b/src/projects/online_vis/drawing_commands/show_position_command.hpp
new file mode 100644
index 0000000..eb9daa1
--- /dev/null
+++ b/src/projects/online_vis/drawing_commands/show_position_command.hpp
@@ -0,0 +1,80 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "../environment.hpp"
+#include "../command.hpp"
+#include "../errors.hpp"
+#include "../argument_list.hpp"
+
+#include "drawing_command.hpp"
+
+namespace online_visualization {
+ class ShowPositionCommand : public DrawingCommand {
+ private:
+ int ShowPicture(DebruijnEnvironment& curr_env, runtime_k::RtSeq kmer, string label = "") const {
+ kmer = curr_env.kmer_mapper().Substitute(kmer);
+ if (!curr_env.index().contains(kmer)) {
+ FireNoCorrespondingGraphLocation(label);
+ return -1;
+ }
+ pair<EdgeId, size_t> position = curr_env.index().get(kmer);
+ if (position.second * 2 < curr_env.graph().length(position.first))
+ return DrawingCommand::ShowPicture(curr_env, curr_env.graph().EdgeStart(position.first), label);
+ else
+ return DrawingCommand::ShowPicture(curr_env, curr_env.graph().EdgeEnd(position.first), label);
+ }
+
+ protected:
+ size_t MinArgNumber() const {
+ return 1;
+ }
+
+ bool CheckCorrectness(const vector<string>& args) const {
+ if (!CheckEnoughArguments(args))
+ return false;
+ return CheckIsNumber(args[1]);
+ }
+
+ public:
+ string Usage() const {
+ string answer;
+ answer = answer + "Command `show_position` \n" +
+ "Usage:\n" +
+ "> show_position <position>\n" +
+ " This command prints pictures for a neigbourhood of an edge in the DB graph, which corresponds to a given genome position.\n" +
+ " You should specify an integer position in the genome, which location you want to look at.";
+ return answer;
+ }
+
+ ShowPositionCommand() : DrawingCommand("show_position")
+ {
+ }
+
+ void Execute(DebruijnEnvironment& curr_env, const ArgumentList& arg_list) const {
+ const vector<string>& args = arg_list.GetAllArguments();
+ if (!CheckCorrectness(args))
+ return;
+
+ int position = GetInt(args[1]);
+ Sequence genome = curr_env.genome();
+ if (arg_list["rc"] == "true") {
+ cout << "Inverting genome..." << endl;
+ genome = !genome;
+ }
+ if (CheckPositionBounds(position, genome.size(), curr_env.k_value())) {
+ int result = ShowPicture(curr_env, genome.Subseq(position).start<runtime_k::RtSeq>(curr_env.k_value() + 1), args[1]);
+ if (result)
+ FireGenericError("Something is wrong");
+ }
+
+ }
+ };
+}
diff --git a/src/projects/online_vis/environment.hpp b/src/projects/online_vis/environment.hpp
new file mode 100644
index 0000000..8f6a05a
--- /dev/null
+++ b/src/projects/online_vis/environment.hpp
@@ -0,0 +1,58 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "visualization/position_filler.hpp"
+#include "pipeline/graph_pack.hpp"
+#include "visualization/visualization_utils.hpp"
+#include "standard_vis.hpp"
+
+namespace online_visualization {
+
+typedef debruijn_graph::NewExtendedSequenceMapper<debruijn_graph::Graph, Index> MapperClass;
+typedef debruijn_graph::PosFiller<Graph> FillerClass;
+typedef debruijn_graph::KmerMapper<Graph> KmerMapperClass;
+typedef omnigraph::GraphElementFinder<Graph> ElementFinder;
+typedef shared_ptr<omnigraph::visualization::GraphColorer<Graph>> ColoringClass;
+
+class Environment : private boost::noncopyable {
+ protected:
+ const string name_;
+ const string path_;
+
+ public:
+ Environment(const string &name, const string &path)
+ : name_(name),
+ path_(path) {
+ }
+
+ virtual ~Environment() {
+ }
+
+ inline string name() const {
+ return name_;
+ }
+
+ inline string path() const {
+ return path_;
+ }
+
+ virtual string str() const {
+ stringstream ss;
+ ss << name_ + " " + path_;
+ return ss.str();
+ }
+
+ virtual inline bool IsCorrect() const {
+ // make here some checks! for path etc
+ return true;
+ }
+
+};
+
+}
diff --git a/src/projects/online_vis/errors.hpp b/src/projects/online_vis/errors.hpp
new file mode 100644
index 0000000..5c1ae80
--- /dev/null
+++ b/src/projects/online_vis/errors.hpp
@@ -0,0 +1,122 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "vis_utils.hpp"
+
+namespace online_visualization {
+
+ void FireGenericError(const string& msg) {
+ cout << msg << endl;
+ cout << "Please try again" << endl;
+ }
+
+ void FireEdgeDoesNotExist(size_t edge_id) {
+ cout << "Ignoring the request. The edge " << edge_id << " does not exist" << endl;
+ cout << "Please try again" << endl;
+ }
+
+ void FireVertexDoesNotExist(size_t vertex_id) {
+ cout << "Ignoring the request. The vertex " << vertex_id << " does not exist" << endl;
+ cout << "Please try again" << endl;
+ }
+
+ void FireNoCorrespondingGraphLocation(string location) {
+ cout << "No corresponding graph location " << location << endl;
+ }
+
+ void FireNotEnoughArguments() {
+ cout << "Not enough arguments" << endl;
+ cout << "Please try again" << endl;
+ }
+
+ void FireFileDoesNotExist(const string& file) {
+ cout << "File " << file << " does not exist." << endl;
+ cout << "Please try again" << endl;
+ }
+
+ void FireBadArgument(const string& arg) {
+ cout << "Bad word specifier: `" << arg << "'" << endl;
+ cout << "Please try again" << endl;
+ }
+
+ void FireNumberOutOfBounds(int num_of_command) {
+ cout << "The command number parameter " << num_of_command
+ << " must be positive and not exceed the size of history" << endl;
+ cout << "Please try again" << endl;
+ }
+
+ bool CheckFileExists(const string& file) {
+ if (!path::is_regular_file(file)) {
+ FireFileDoesNotExist(file);
+ return false;
+ }
+ return true;
+ }
+
+ bool CheckPositionBounds(size_t position, size_t total_size, size_t K) {
+ bool result = (position + K + 1) <= total_size;
+ if (!result) {
+ cout << "Ignoring the request. Position is out of range : required position is "
+ << position << " while length of the sequence is "
+ << total_size << endl;
+ cout << "Please try again" << endl;
+ }
+ return result;
+ }
+
+ bool CheckIsNumber(const string& str) {
+ if (!IsNumber(str)) {
+ cout << "The argument `" << str << "' is not a number" << endl;
+ cout << "Please try again" << endl;
+ return false;
+ }
+ return true;
+ }
+
+ bool CheckEnvIsCorrect(string path, size_t K) {
+ if (!CheckFileExists(path + ".grp"))
+ return false;
+ if (!CheckFileExists(path + ".sqn"))
+ return false;
+
+ if (!(K >= runtime_k::MIN_K && K < runtime_k::MAX_K)) {
+ LOG("K " << K << " is out of bounds");
+ return false;
+ }
+ if (K % 2 == 0) {
+ LOG("K must be odd");
+ return false;
+ }
+
+ return true;
+ }
+
+ bool CheckVertexExists(const GraphElementFinder<Graph>& finder, size_t vertex_id) {
+ VertexId vertex = finder.ReturnVertexId(vertex_id);
+ if (vertex == VertexId(NULL)) {
+ FireVertexDoesNotExist(vertex_id);
+ return false;
+ }
+ else {
+ return true;
+ }
+ }
+
+ bool CheckEdgeExists(const GraphElementFinder<Graph>& finder, size_t edge_id) {
+ EdgeId edge = finder.ReturnEdgeId(edge_id);
+ if (edge == EdgeId(NULL)) {
+ FireEdgeDoesNotExist(edge_id);
+ return false;
+ }
+ else {
+ return true;
+ }
+ }
+
+}
diff --git a/src/projects/online_vis/history.hpp b/src/projects/online_vis/history.hpp
new file mode 100644
index 0000000..70ebd66
--- /dev/null
+++ b/src/projects/online_vis/history.hpp
@@ -0,0 +1,64 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "standard_vis.hpp"
+
+namespace online_visualization {
+
+ // write-once history
+ class History
+ {
+ typedef HIST_ENTRY EntryT;
+
+ public:
+ void AddEntry(const string& entry) {
+ add_history(entry.c_str());
+ ++size_;
+ VERIFY(int(size_) == history_get_history_state()->length);
+ }
+
+ const char* operator[](size_t k) const {
+ VERIFY(k < size_);
+ //EntryT** my_history = history_list();
+ EntryT* entry = history_get(int(k + 1));
+ return entry->line;
+ }
+
+ void SetEntry(size_t k, const string& entry) const {
+ VERIFY(k < size_);
+ //replace_history_entry(k, entry.c_str(), history_list()[k]->data);
+ replace_history_entry(int(k), entry.c_str(), history_get(int(k + 1))->data);
+ }
+
+ size_t size() const {
+ return size_;
+ }
+
+ const char* front() const {
+ return this->operator[](0);
+ }
+
+ const char* back() const {
+ return this->operator[](this->size() - 1);
+ }
+
+ static History& GetHistory() {
+ static History hist;
+ return hist;
+ }
+
+ private:
+ size_t size_;
+
+ History() : size_(0)
+ {
+ }
+ };
+
+}
diff --git a/src/projects/online_vis/loaded_environments.hpp b/src/projects/online_vis/loaded_environments.hpp
new file mode 100644
index 0000000..3503222
--- /dev/null
+++ b/src/projects/online_vis/loaded_environments.hpp
@@ -0,0 +1,17 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "environment.hpp"
+
+namespace online_visualization {
+
+ template <class Env>
+ class LoadedEnvironments : public map<string, shared_ptr<Env> > {};
+// typedef map<string, shared_ptr<Env> > LoadedEnvironments;
+}
diff --git a/src/projects/online_vis/main.cpp b/src/projects/online_vis/main.cpp
new file mode 100644
index 0000000..2a7d08a
--- /dev/null
+++ b/src/projects/online_vis/main.cpp
@@ -0,0 +1,71 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+// just to check that headers from include and debruijn folders are correctly included
+#include "vis_logger.hpp"
+
+#include "standard_vis.hpp"
+#include "dev_support/segfault_handler.hpp"
+#include "dev_support/stacktrace.hpp"
+#include "pipeline/config_struct.hpp"
+#include "io/reads_io/io_helper.hpp"
+#include "dev_support/simple_tools.hpp"
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#include "dev_support/memory_limit.hpp"
+#include "io/dataset_support/read_converter.hpp"
+
+#include "debruijn_online_visualizer.hpp"
+
+void create_console_logger(string const& cfg_filename) {
+ using namespace logging;
+
+ string log_props_file = cfg::get().log_filename;
+
+ if (!path::FileExists(log_props_file))
+ log_props_file = path::append_path(path::parent_path(cfg_filename), cfg::get().log_filename);
+
+ logger *lg = create_logger(path::FileExists(log_props_file) ? log_props_file : "");
+ lg->add_writer(std::make_shared<console_writer>());
+
+ attach_logger(lg);
+}
+
+int main(int argc, char** argv) {
+ const size_t GB = 1 << 30;
+
+ try {
+ VERIFY(argc > 1)
+ using namespace online_visualization;
+ string cfg_filename = argv[1];
+ path::CheckFileExistenceFATAL(cfg_filename);
+
+ cfg::create_instance(cfg_filename);
+
+ VERIFY(cfg::get().K >= runtime_k::MIN_K && cfg::get().K < runtime_k::MAX_K);
+ VERIFY(cfg::get().K % 2 != 0);
+
+ create_console_logger(cfg_filename);
+ cout << "\nGAF (Graph Analysis Framework) started" << endl;
+ cout << "Print help to see readme file" << endl;
+ limit_memory(cfg::get().max_memory * GB);
+
+ DebruijnOnlineVisualizer online_vis;
+ online_vis.init();
+ online_vis.run();
+ }
+ catch (std::exception const& e) {
+ std::cerr << "Exception caught " << e.what() << std::endl;
+ return EINTR;
+ }
+ catch (...) {
+ std::cerr << "Unknown exception caught " << std::endl;
+ return EINTR;
+ }
+ return 0;
+}
diff --git a/src/projects/online_vis/online_visualizer.hpp b/src/projects/online_vis/online_visualizer.hpp
new file mode 100644
index 0000000..551a9f3
--- /dev/null
+++ b/src/projects/online_vis/online_visualizer.hpp
@@ -0,0 +1,167 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "pipeline/graph_pack.hpp"
+#include "visualization/visualization_utils.hpp"
+#include "standard_vis.hpp"
+#include "command.hpp"
+#include "loaded_environments.hpp"
+#include "environment.hpp"
+#include "dev_support/autocompletion.hpp"
+
+//#include "all_commands.hpp"
+#include "base_commands.hpp"
+
+#include <signal.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <pthread.h>
+#include <atomic>
+
+
+namespace online_visualization {
+
+std::atomic<bool> ctrlc_handler_;
+
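+// The first Ctrl+C arms a one-second window; hitting Ctrl+C again within that window exits the visualizer.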
+inline void * wait_for_second_ctrlc(void *) {
+ ctrlc_handler_ = true;
+ cerr << endl << "Hit Ctrl+C within 1 second once more to exit" << endl;
+ sleep(1);
+ ctrlc_handler_ = false;
+ return NULL;
+}
+
+inline void ctrlc_handler(int /*s*/) {
+ if (!ctrlc_handler_) {
+ pthread_t thread;
+ pthread_create( &thread, NULL, wait_for_second_ctrlc, NULL);
+ }
+ else {
+ exit(-1);
+ }
+}
+
+template <class Env = Environment>
+class OnlineVisualizer {
+ public:
+ OnlineVisualizer() : command_mapping_() {
+ }
+
+ virtual ~OnlineVisualizer() {
+ }
+
+ inline void init() {
+ string p = path::append_path(cfg::get().load_from, "simplification"); // just for default
+
+ path::make_dir("tmp");
+ DEBUG("Adding Commands");
+ AddBaseCommands();
+ AddSpecificCommands();
+ DEBUG("Commands added");
+ DEBUG("Adding auto-completion option");
+ InitAutocompletion(command_mapping_.GetCommandNamesList());
+ //stringstream ss("load default " + p);
+ //const Command<Env>& load_command = command_mapping_.GetCommand("load");
+ //DEBUG("Loading current environment");
+ //load_command.Execute(current_environment_, loaded_environments_, ss);
+ //DEBUG("Environment loaded");
+ }
+
+ string read_line() {
+ cout << "[end]" << endl;
+ char* line = readline(prompt);
+ if (!line)
+ exit(1);
+ string answer(line);
+ free(line);
+ return answer;
+ }
+
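+ // Interactive loop: the first command comes from the batch file (if given) or
+ // from the prompt; each line is preprocessed against the history, dispatched to
+ // the registered Command object and recorded. The loop runs until the process
+ // is terminated (e.g. by the exit command or end of input).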
+ void run(const string& batch_file = "") {
+ ctrlc_handler_ = false;
+ struct sigaction sigIntHandler;
+ sigIntHandler.sa_handler = ctrlc_handler;
+ sigemptyset(&sigIntHandler.sa_mask);
+ sigIntHandler.sa_flags = 0;
+ sigaction(SIGINT, &sigIntHandler, NULL);
+
+ History& history = History::GetHistory();
+
+ string command_with_args;
+ if (batch_file != "") {
+ command_with_args = "batch " + batch_file;
+ } else {
+ command_with_args = read_line();
+ }
+ bool done = false;
+
+ while (!done) {
+ if (!command_with_args.empty()) {
+ stringstream ss(command_with_args);
+ TRACE("Delegating to the ArgumentList class");
+ ArgumentList arg_list(ss);
+ string processed_command = arg_list.Preprocess(history);
+ if (processed_command == "")
+ continue;
+ //DEBUG("Processed string " << processed_command);
+ string command_string = arg_list.GetAllArguments()[0];
+ const Command<Env>& command = command_mapping_.GetCommand(command_string);
+ DEBUG("Command " << processed_command << " starting to execute");
+ command.Execute(current_environment_, loaded_environments_, arg_list);
+ DEBUG("Command " << processed_command << " executed");
+
+ history.AddEntry(command_with_args);
+ DEBUG("Command " << processed_command << " added to history");
+
+ }
+ command_with_args = read_line();
+ }
+ }
+
+ protected:
+ void AddCommand(shared_ptr<Command<Env>> command) {
+ command_mapping_.AddCommand(command);
+ }
+
+ virtual void AddSpecificCommands() {
+ }
+
+
+ private:
+ static const char* prompt;
+
+ void AddBaseCommands() {
+ AddCommand(make_shared<NullCommand<Env>>());
+ AddCommand(make_shared<ExitCommand<Env>>());
+ AddCommand(make_shared<ListCommand<Env>>());
+ AddCommand(make_shared<HelpCommand<Env>>(&command_mapping_));
+
+ AddCommand(make_shared<LogCommand<Env>>());
+ AddCommand(make_shared<SaveBatchCommand<Env>>());
+ AddCommand(make_shared<BatchCommand<Env>>(&command_mapping_));
+
+ AddCommand(make_shared<SwitchCommand<Env>>());
+ AddCommand(make_shared<ReplayCommand<Env>>(&command_mapping_));
+
+ //todo think about why it was in the specific commands
+ AddCommand(make_shared<LoadCommand<Env>>());
+ }
+
+ shared_ptr<Env> current_environment_;
+ LoadedEnvironments<Env> loaded_environments_;
+ CommandMapping<Env> command_mapping_;
+
+ DECL_LOGGER("OnlineVisualizer");
+};
+
+ template<class Env>
+ const char* OnlineVisualizer<Env>::prompt = "GAF$> ";
+
+}
diff --git a/src/projects/online_vis/position_commands.hpp b/src/projects/online_vis/position_commands.hpp
new file mode 100644
index 0000000..adbcf76
--- /dev/null
+++ b/src/projects/online_vis/position_commands.hpp
@@ -0,0 +1,11 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "position_commands/clear_position_command.hpp"
+#include "position_commands/fill_position_command.hpp"
diff --git a/src/projects/online_vis/position_commands/clear_position_command.hpp b/src/projects/online_vis/position_commands/clear_position_command.hpp
new file mode 100644
index 0000000..a6f9fd1
--- /dev/null
+++ b/src/projects/online_vis/position_commands/clear_position_command.hpp
@@ -0,0 +1,35 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "../environment.hpp"
+#include "../command.hpp"
+#include "../errors.hpp"
+
+namespace online_visualization {
+ class ClearPositionCommand : public LocalCommand<DebruijnEnvironment> {
+ public:
+ string Usage() const {
+ string answer;
+ answer = answer + "Command `clear_pos` \n" +
+ "Usage:\n" +
+ "> clear_pos\n" +
+ " This command resets the graph and clears all the labels you previously filled in.\n";
+ return answer;
+ }
+
+ ClearPositionCommand() : LocalCommand<DebruijnEnvironment>("clear_pos")
+ {
+ }
+
+ void Execute(DebruijnEnvironment& curr_env, const ArgumentList&) const {
+ curr_env.ResetPositions();
+ }
+
+ };
+}
diff --git a/src/projects/online_vis/position_commands/fill_position_command.hpp b/src/projects/online_vis/position_commands/fill_position_command.hpp
new file mode 100644
index 0000000..b48b04b
--- /dev/null
+++ b/src/projects/online_vis/position_commands/fill_position_command.hpp
@@ -0,0 +1,65 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "../environment.hpp"
+#include "../command.hpp"
+#include "../errors.hpp"
+
+namespace online_visualization {
+
+ class FillPositionCommand : public LocalCommand<DebruijnEnvironment> {
+
+ protected:
+ size_t MinArgNumber() const {
+ return 2;
+ }
+
+ bool CheckCorrectness(const vector<string>& args) const {
+ if (!CheckEnoughArguments(args))
+ return false;
+ //const string& name = args[1];
+ const string& file = args[2];
+ return CheckFileExists(file);
+ }
+ public:
+ string Usage() const {
+ string answer;
+ answer = answer + "Command `fill_pos` \n" +
+ "Usage:\n" +
+ "> fill_pos <label> <path_to_contigs>\n" +
+ " This command maps contigs you provide to the graph.\n" +
+ " You should specify a label of this contigs, which you want to see at the edge in the DB graph.";
+ return answer;
+ }
+
+ FillPositionCommand() : LocalCommand<DebruijnEnvironment>("fill_pos")
+ {
+ }
+
+ void Execute(DebruijnEnvironment& curr_env, const ArgumentList& arg_list) const {
+ const vector<string>& args = arg_list.GetAllArguments();
+ if (!CheckCorrectness(args))
+ return;
+
+ string name = args[1];
+ string file = args[2];
+
+ auto reader = make_shared<io::FixingWrapper>(make_shared<io::FileReadStream>(file));
+
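+ // Both the contig and its reverse complement are processed, so the label
+ // appears on edges of both strands (the latter with a "_RC" suffix).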
+ FillerClass& filler = curr_env.filler();
+ while (!reader->eof()) {
+ io::SingleRead read;
+ (*reader) >> read;
+ Sequence contig = read.sequence();
+ filler.Process(contig, name + "_" + read.name());
+ filler.Process(!contig, name + "_" + read.name() + "_RC");
+ }
+ }
+ };
+}
diff --git a/src/projects/online_vis/processing_commands.hpp b/src/projects/online_vis/processing_commands.hpp
new file mode 100644
index 0000000..de8e60c
--- /dev/null
+++ b/src/projects/online_vis/processing_commands.hpp
@@ -0,0 +1,66 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "environment.hpp"
+#include "command.hpp"
+#include "errors.hpp"
+#include "argument_list.hpp"
+#include "assembly_graph/graph_support/genomic_quality.hpp"
+#include "stages/simplification_pipeline/graph_simplification.hpp"
+
+namespace online_visualization {
+
+class ClipTipsCommand : public NewLocalCommand<DebruijnEnvironment> {
+
+public:
+ string Usage() const {
+ string answer;
+ answer = answer + "Command `clip_tips` \n" + "Usage:\n"
+ + "> clip_tips <length> (Y/y)\n" + " This command clips tips.\n"
+ + " If length is not specified, "
+ + "it will be counted from global settings. "
+ + "If second argument Y/y is specified then genomic edges will be retained.";
+ return answer;
+ }
+
+ ClipTipsCommand()
+ : NewLocalCommand<DebruijnEnvironment>("clip_tips", 0) {
+ }
+
+private:
+ /*virtual*/ void InnerExecute(DebruijnEnvironment& curr_env,
+ const vector<string>& args) const {
+ size_t length = 0;
+ if(args.size() > 1) {
+ length = GetInt(args[1]);
+ } else {
+ length = curr_env.edge_length_bound();
+ }
+
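+ // By default tips are clipped by a length upper bound; with Y/y and a loaded
+ // reference the condition switches to a zero-quality check, so edges supported
+ // by the genome are retained.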
+ pred::TypedPredicate<EdgeId> condition = LengthUpperBound<Graph>(curr_env.graph(), length);
+ if (args.size() > 2 && (args[2] == "Y" || args[2] == "y")) {
+ cout << "Trying to activate genome quality condition" << endl;
+ if (curr_env.genome().size() == 0) {
+ cout << "No reference was provided!!!" << endl;
+ } else {
+ cout << "Genome quality condition will be used" << endl;
+
+ curr_env.graph_pack().ClearQuality();
+ curr_env.graph_pack().FillQuality();
+// condition = make_shared<make_shared<debruijn_graph::ZeroQualityCondition<Graph, Index>>(curr_env.graph(), edge_qual);
+ condition = std::bind(&debruijn_graph::EdgeQuality<Graph>::IsZeroQuality,
+ std::ref(curr_env.graph_pack().edge_qual), std::placeholders::_1);
+ }
+ }
+ debruijn::simplification::SimplifInfoContainer info;
+ info.set_chunk_cnt(10);
+ debruijn::simplification::TipClipperInstance(curr_env.graph(), condition, info, (omnigraph::HandlerF<Graph>)nullptr)->Run();
+ }
+};
+}
diff --git a/src/projects/online_vis/setting_commands.hpp b/src/projects/online_vis/setting_commands.hpp
new file mode 100644
index 0000000..3d78d6a
--- /dev/null
+++ b/src/projects/online_vis/setting_commands.hpp
@@ -0,0 +1,161 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "environment.hpp"
+#include "command.hpp"
+#include "errors.hpp"
+
+namespace online_visualization {
+
+class LoadGenomeCommand : public LocalCommand<DebruijnEnvironment> {
+
+ protected:
+ size_t MinArgNumber() const {
+ return 1;
+ }
+
+ bool CheckCorrectness(const vector<string>& args) const {
+ if (!CheckEnoughArguments(args))
+ return false;
+ const string& file = args[1];
+ if (!CheckFileExists(file))
+ return false;
+ return true;
+ }
+
+ public:
+ string Usage() const {
+ string answer;
+ answer = answer + "Command `load_genome` \n" +
+ " Usage:\n" +
+ " > load_genome <path_to_genome>\n" +
+ " You should specify a path to the genome you want to load from.\n" +
+ " Previously loaded genomes would be lost.";
+ return answer;
+ }
+
+ LoadGenomeCommand() : LocalCommand<DebruijnEnvironment>("load_genome")
+ {
+ }
+
+ void Execute(DebruijnEnvironment& curr_env, const ArgumentList& arg_list) const {
+ const vector<string>& args = arg_list.GetAllArguments();
+ if (!CheckCorrectness(args))
+ return;
+ const string& file = args[1];
+ auto genome_reader = make_shared<io::FixingWrapper>(make_shared<io::FileReadStream>(file));
+ io::SingleRead genome;
+ (*genome_reader) >> genome;
+ curr_env.LoadNewGenome(genome.sequence());
+ }
+};
+
+class SetMaxVertCommand : public LocalCommand<DebruijnEnvironment> {
+ protected:
+ size_t MinArgNumber() const {
+ return 1;
+ }
+
+ bool CheckCorrectness(const vector<string>& args) const {
+ if (!CheckEnoughArguments(args))
+ return false;
+ return CheckIsNumber(args[1]);
+ }
+
+ public:
+ string Usage() const {
+ string answer;
+ answer = answer + "Command `set_max_vertices` \n" +
+ "Usage:\n" +
+ "> set_max_vertices <max_vertices> \n" +
+ " You should specify an integer, which is an upper bound for the number of vertices in the picture.";
+ return answer;
+ }
+
+ SetMaxVertCommand() : LocalCommand<DebruijnEnvironment>("set_max_vertices")
+ {
+ }
+
+ void Execute(DebruijnEnvironment& curr_env, const ArgumentList& arg_list) const {
+ const vector<string>& args = arg_list.GetAllArguments();
+ if (!CheckCorrectness(args)) {
+ return;
+ }
+ size_t max_v = GetInt(args[1]);
+ curr_env.set_max_vertices(max_v);
+ }
+};
+
+class SetFolderCommand : public LocalCommand<DebruijnEnvironment> {
+ protected:
+ size_t MinArgNumber() const {
+ return 1;
+ }
+
+ bool CheckCorrectness(const vector<string>& args) const {
+ return CheckEnoughArguments(args);
+ }
+
+ public:
+ string Usage() const {
+ string answer;
+ answer = answer + "Command `set_folder` \n" +
+ "Usage:\n" +
+ "> set_folder <folder_name> \n" +
+ " You should specify a string, which is a new name for a pictures' folder.";
+ return answer;
+ }
+ SetFolderCommand() : LocalCommand<DebruijnEnvironment>("set_folder")
+ {
+ }
+
+ void Execute(DebruijnEnvironment& curr_env, const ArgumentList& arg_list) const {
+ const vector<string>& args = arg_list.GetAllArguments();
+ if (!CheckCorrectness(args))
+ return;
+ string folder_name = args[1];
+ path::make_dirs(folder_name);
+ curr_env.set_folder(folder_name);
+ }
+};
+
+class SetFileNameCommand : public LocalCommand<DebruijnEnvironment> {
+ protected:
+ size_t MinArgNumber() const {
+ return 1;
+ }
+
+ bool CheckCorrectness(const vector<string>& args) const {
+ return CheckEnoughArguments(args);
+ }
+
+ public:
+ string Usage() const {
+ string answer;
+ answer = answer + "Command `set_file_name` \n" +
+ "Usage:\n" +
+ "> set_file_name <file_base_name>\n" +
+ " You should specify a string, which is a new base_name for all the pictures, that you generate.";
+ return answer;
+ }
+
+ SetFileNameCommand() : LocalCommand<DebruijnEnvironment>("set_file_name")
+ {
+ }
+
+ void Execute(DebruijnEnvironment& curr_env, const ArgumentList& arg_list) const {
+ const vector<string>& args = arg_list.GetAllArguments();
+ if (!CheckCorrectness(args))
+ return;
+ string file_name = args[1];
+ curr_env.set_file_name(file_name);
+ }
+};
+}
+
diff --git a/src/projects/online_vis/standard_vis.hpp b/src/projects/online_vis/standard_vis.hpp
new file mode 100644
index 0000000..68fde86
--- /dev/null
+++ b/src/projects/online_vis/standard_vis.hpp
@@ -0,0 +1,23 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "pipeline/graph_pack.hpp"
+#include "dev_support/standard_base.hpp"
+
+#include <readline/readline.h>
+#include <readline/history.h>
+
+namespace online_visualization {
+ typedef debruijn_graph::conj_graph_pack GraphPack;
+ typedef GraphPack::graph_t Graph;
+ typedef GraphPack::index_t Index;
+ typedef EdgesPositionHandler<Graph> EdgePos;
+ typedef Graph::VertexId VertexId;
+ typedef Graph::EdgeId EdgeId;
+}
diff --git a/src/projects/online_vis/statistics_commands.hpp b/src/projects/online_vis/statistics_commands.hpp
new file mode 100644
index 0000000..0d02b25
--- /dev/null
+++ b/src/projects/online_vis/statistics_commands.hpp
@@ -0,0 +1,13 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "statistics_commands/print_paths_command.hpp"
+#include "statistics_commands/print_contigs_stats.hpp"
+#include "statistics_commands/junction_sequence_command.hpp"
+#include "statistics_commands/print_edge_command.hpp"
diff --git a/src/projects/online_vis/statistics_commands/junction_sequence_command.hpp b/src/projects/online_vis/statistics_commands/junction_sequence_command.hpp
new file mode 100644
index 0000000..ee1f5fd
--- /dev/null
+++ b/src/projects/online_vis/statistics_commands/junction_sequence_command.hpp
@@ -0,0 +1,97 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "../environment.hpp"
+#include "../command.hpp"
+#include "../errors.hpp"
+#include "assembly_graph/graph_core/basic_graph_stats.hpp"
+#include "assembly_graph/graph_alignment/sequence_mapper.hpp"
+#include "assembly_graph/paths/path_utils.hpp"
+
+namespace online_visualization {
+class JunctionSequenceCommand : public LocalCommand<DebruijnEnvironment> {
+
+protected:
+ size_t MinArgNumber() const {
+ return 3;
+ }
+
+ bool CheckCorrectness(const vector<string>& args) const {
+ return CheckEnoughArguments(args);
+ }
+
+public:
+ string Usage() const {
+ return string() + "Command `junction_sequence` \n" + " Usage:\n"
+ + "> junction_sequence <insert size> <int ids of edges> \n"
+ + " Edges might be not consecutive, then will try to fix the path. \n"
+ + " <insert size> specifies how many bp will be taken from first and last edges in the path. \n "
+ + " flank size = IS - K - (cumulative length of intermediate edge).";
+ }
+
+ JunctionSequenceCommand()
+ : LocalCommand<DebruijnEnvironment>("junction_sequence") {
+ }
+
+ void Execute(DebruijnEnvironment& curr_env, const ArgumentList& arg_list) const {
+ const vector<string>& args = arg_list.GetAllArguments();
+ if (!CheckCorrectness(args))
+ return;
+
+ size_t insert_size = std::stoll(args[1]);
+ LOG("Insert size " << insert_size);
+
+ stringstream ss;
+ ss << "Provided path: ";
+ vector<EdgeId> edges;
+ for (size_t i = 2; i < args.size(); ++i) {
+ EdgeId e = curr_env.finder().ReturnEdgeId(std::stoll(args[i]));
+ edges.push_back(e);
+ ss << curr_env.graph().str(e) << " ; ";
+ }
+
+ LOG(ss.str());
+
+ debruijn_graph::MappingPathFixer<Graph> path_fixer(curr_env.graph());
+ edges = path_fixer.TryFixPath(edges, insert_size);
+
+ if (path_fixer.CheckContiguous(edges)) {
+ LOG("Successfully fixed path");
+ } else {
+ LOG("Couldn't fix path!");
+ return;
+ }
+
+ VERIFY(edges.size() > 1);
+ size_t interm_length = CumulativeLength(curr_env.graph(), vector<EdgeId>(edges.begin() + 1, edges.end() - 1));
+ if (insert_size < curr_env.graph().k() + interm_length) {
+ LOG("Intermediate sequence too long");
+ return;
+ }
+
+ size_t flank_length = insert_size - curr_env.graph().k() - interm_length;
+ LOG("Flank length " << flank_length);
+
+ if (curr_env.graph().length(edges.front()) < flank_length || curr_env.graph().length(edges.back()) < flank_length) {
+ LOG("Flanking path edges can not be shorter than flank length!");
+ return;
+ }
+
+ Path<EdgeId> path(edges, curr_env.graph().length(edges.front()) - flank_length, flank_length);
+
+ LOG("Outputting sequence:");
+ LOG(PathSequence(curr_env.graph(), path));
+ }
+
+private:
+ DECL_LOGGER("JunctionSequenceCommand")
+ ;
+};
+
+}
diff --git a/src/projects/online_vis/statistics_commands/print_contigs_stats.hpp b/src/projects/online_vis/statistics_commands/print_contigs_stats.hpp
new file mode 100644
index 0000000..eaf3485
--- /dev/null
+++ b/src/projects/online_vis/statistics_commands/print_contigs_stats.hpp
@@ -0,0 +1,220 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "../environment.hpp"
+#include "../command.hpp"
+#include "../errors.hpp"
+#include "assembly_graph/graph_core/basic_graph_stats.hpp"
+
+namespace online_visualization {
+ class PrintContigsStatsCommand : public LocalCommand<DebruijnEnvironment> {
+ //typedef vector<EdgeId> Path;
+
+ private:
+ mutable bool ext_output;
+
+ vector<EdgeId> TryCloseGap(const Graph& graph, VertexId v1, VertexId v2) const {
+ if (v1 == v2)
+ return vector<EdgeId>();
+ TRACE("Trying to close gap between v1 =" << graph.int_id(v1) << " and v2 =" << graph.int_id(v2));
+ PathStorageCallback<Graph> path_storage(graph);
+
+ // todo reduce value after investigation
+ ProcessPaths(graph, 0, 50, v1, v2, path_storage);
+
+ if (path_storage.size() == 0) {
+ TRACE("Failed to find closing path");
+ return vector<EdgeId>();
+ } else if (path_storage.size() == 1) {
+ TRACE("Unique closing path found");
+ } else {
+ TRACE("Several closing paths found, first chosen");
+ }
+ vector<EdgeId> answer = path_storage.paths().front();
+ TRACE("Gap closed");
+ TRACE("Cumulative closure length is "
+ << CumulativeLength(graph, answer));
+ return answer;
+ }
+
+ vector<EdgeId> TryFixPath(DebruijnEnvironment& curr_env, const vector<EdgeId>& edges) const {
+ vector<EdgeId> answer;
+ if (edges.empty()) {
+ // WARN("Mapping path was empty");
+ return vector<EdgeId>();
+ }
+ // VERIFY(edges.size() > 0);
+ answer.push_back(edges[0]);
+ for (size_t i = 1; i < edges.size(); ++i) {
+ vector<EdgeId> closure = TryCloseGap(curr_env.graph(), curr_env.graph().EdgeEnd(edges[i - 1]), curr_env.graph().EdgeStart(edges[i]));
+ answer.insert(answer.end(), closure.begin(), closure.end());
+ answer.push_back(edges[i]);
+ }
+ return answer;
+ }
+
+ Path<EdgeId> TryFixPath(DebruijnEnvironment& curr_env, const Path<EdgeId>& path) const {
+ return Path<EdgeId>(TryFixPath(curr_env, path.sequence()), path.start_pos(), path.end_pos());
+ }
+
+ private:
+
+ //TODO: do something!
+ //bool ProcessContigCarefully(DebruijnEnvironment& curr_env, const Sequence& contig, const MappingPath<EdgeId>& genome_path, const string& contig_name) const {
+ //debug(ext_output, " Checking the contig " << contig_name);
+ //debug(ext_output, " Length " << contig.size());
+ //const MappingPath<EdgeId>& contig_path = curr_env.mapper().MapSequence(contig);
+ //set<EdgeId> contig_edges;
+ //map<EdgeId, vector<MappingRange> > edge_positions;
+ //if (contig_path.size() == 0) {
+ //debug(ext_output, "Contig could not be aligned at all!");
+ //return false;
+ //}
+ //for (size_t i = 0; i < contig_path.size(); ++i) {
+ //contig_edges.insert(contig_path[i].first);
+ //}
+
+ //for (size_t i = 0; i < genome_path.size(); ++i) {
+ //if (contig_edges.find(genome_path[i].first) != contig_edges::end())
+ //edge_positions[genome_path[i].first].push_back(genome_path[i].second);
+ //}
+
+ //bool found = false;
+
+ //for (size_t i = 0; i < contig_path.size(); ++i) {
+ //TRACE(i << "-th edge of the contig " << contig_name);
+ //CheckEdgeIsNotMisassembled(contig_path[i], edge_positions[contig_path[i].first]);
+ //}
+
+ //contig_path.
+ //for (size_t i = 0; i + 1 < contig_path.size(); ++i) {
+
+
+ //}
+
+ //for (size_t i = 0; i < genome_path_completed.size(); ++i) {
+ //TRACE(i << "-th edge of the genome " << genome_path_completed[i]);
+ //if (genome_path_completed[i] == first_edge) {
+ //found = true;
+ //for (size_t j = 1; j < contig_path.size(); ++j) {
+ //if (genome_path_completed[i + j] != contig_path[j].first) {
+ //debug(ext_output, " Break in the edge " << curr_env.graph().int_id(contig_path[j].first));
+ //return false;
+ //}
+ //}
+ //}
+ //}
+ //if (!found) {
+ //debug(ext_output, " First edge " << curr_env.graph().int_id(first_edge) << " was not found");
+ //return false;
+ //} else {
+ //debug(ext_output, " No misassemblies");
+ //return true;
+ //}
+ //}
+
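+ // Aligns the contig to the graph and checks that its edge path occurs as a
+ // contiguous run inside the gap-closed genome path; any divergence is reported
+ // as a break and the contig is considered misassembled.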
+ bool ProcessContig(DebruijnEnvironment& curr_env, const Sequence& contig, const MappingPath<EdgeId>& genome_path, const string& contig_name) const {
+ debug(ext_output, " Checking the contig " << contig_name);
+ debug(ext_output, " Length " << contig.size());
+ const Path<EdgeId>& genome_path_completed = TryFixPath(curr_env, genome_path.path());
+ const MappingPath<EdgeId>& contig_path = curr_env.mapper().MapSequence(contig);
+ if (contig_path.size() == 0) {
+ debug(ext_output, "Contig could not be aligned at all!");
+ return false;
+ }
+ bool found = false;
+ EdgeId first_edge = contig_path[0].first;
+ for (size_t i = 0; i < genome_path_completed.size(); ++i) {
+ TRACE(" i-th edge of the genome " << genome_path_completed[i]);
+ if (genome_path_completed[i] == first_edge) {
+ found = true;
+ for (size_t j = 1; j < contig_path.size(); ++j) {
+ if (genome_path_completed[i + j] != contig_path[j].first) {
+ debug(ext_output, " Break in the edge " << curr_env.graph().int_id(contig_path[j].first));
+ return false;
+ }
+ }
+ }
+ }
+ if (!found) {
+ debug(ext_output, " First edge " << curr_env.graph().int_id(first_edge) << " was not found");
+ return false;
+ } else {
+ debug(ext_output, " No misassemblies");
+ return true;
+ }
+ }
+
+ protected:
+ size_t MinArgNumber() const {
+ return 1;
+ }
+
+ bool CheckCorrectness(const vector<string>& args) const {
+ if (!CheckEnoughArguments(args))
+ return false;
+
+ const string& file = args[1];
+ if (!CheckFileExists(file))
+ return false;
+
+ return true;
+ }
+
+ public:
+ string Usage() const {
+ string answer;
+ answer = answer + "Command `print_contigs_stats` \n" +
+ " Usage:\n" +
+ "> print_contigs_stats <contigs_file> [--stats]\n" +
+ " Shows the results of aligning the contigs in the <contigs_file> to the current DB graph. \n" +
+ " --stats allows to see the details.";
+ return answer;
+ }
+
+ PrintContigsStatsCommand() : LocalCommand<DebruijnEnvironment>("print_contigs_stats")
+ {
+ }
+
+ void Execute(DebruijnEnvironment& curr_env, const ArgumentList& arg_list) const {
+ const vector<string>& args = arg_list.GetAllArguments();
+ if (!CheckCorrectness(args))
+ return;
+
+ string file = args[1];
+ ext_output = (arg_list["stats"] == "true");
+
+ TRACE("Printing stats " << ext_output);
+
+ io::FileReadStream irs(file);
+
+ const Sequence& genome = curr_env.genome();
+
+ const MappingPath<EdgeId>& genome_path = curr_env.mapper().MapSequence(genome);
+
+ while (!irs.eof()) {
+ io::SingleRead read;
+ irs >> read;
+ if (read.IsValid()) {
+ const Sequence& contig = read.sequence();
+ bool result = false;
+ result = result | ProcessContig(curr_env, contig, genome_path, "CONTIG_" + read.name());
+ result = result | ProcessContig(curr_env, !contig, genome_path, "CONTIG_" + read.name() + "_RC");
+ if (result) {
+ INFO(" contig " + read.name() + " is OKAY");
+ }
+ else
+ INFO(" contig " + read.name() + " is MISASSEMBLED");
+ }
+ }
+ }
+ private:
+ DECL_LOGGER("PrintContigsStats");
+ };
+}
diff --git a/src/projects/online_vis/statistics_commands/print_edge_command.hpp b/src/projects/online_vis/statistics_commands/print_edge_command.hpp
new file mode 100644
index 0000000..f91aa67
--- /dev/null
+++ b/src/projects/online_vis/statistics_commands/print_edge_command.hpp
@@ -0,0 +1,60 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "../environment.hpp"
+#include "../command.hpp"
+#include "../errors.hpp"
+
+namespace online_visualization {
+
+class PrintEdgeCommand : public LocalCommand<DebruijnEnvironment> {
+
+ protected:
+ size_t MinArgNumber() const {
+ return 1;
+ }
+
+ bool CheckCorrectness(const vector<string>& args) const {
+ return CheckEnoughArguments(args);
+ }
+
+ public:
+ string Usage() const {
+ string answer;
+ answer = answer + "Command `paths` \n" +
+ "Usage:\n" +
+ "> print_edge <edge_id> \n" +
+ " This command prints edge coverage and sequence.";
+ return answer;
+ }
+
+ PrintEdgeCommand() : LocalCommand<DebruijnEnvironment>("print_edge")
+ {
+ }
+
+ void Execute(DebruijnEnvironment& curr_env, const ArgumentList& arg_list) const {
+ const vector<string>& args = arg_list.GetAllArguments();
+ if (!CheckCorrectness(args))
+ return;
+ TRACE("Executing `print_edge` command");
+ size_t edgeID = GetInt(args[1]);
+ if (!CheckEdgeExists(curr_env.finder(), edgeID))
+ return;
+ EdgeId edge = curr_env.finder().ReturnEdgeId(edgeID);
+ cout << curr_env.graph().str(edge) << endl;
+
+ cout << curr_env.graph().EdgeNucls(edge) << endl;
+
+
+ }
+
+ DECL_LOGGER("PrintEdgeCommand");
+};
+
+}
diff --git a/src/projects/online_vis/statistics_commands/print_paths_command.hpp b/src/projects/online_vis/statistics_commands/print_paths_command.hpp
new file mode 100644
index 0000000..9a14140
--- /dev/null
+++ b/src/projects/online_vis/statistics_commands/print_paths_command.hpp
@@ -0,0 +1,103 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "../environment.hpp"
+#include "../command.hpp"
+#include "../errors.hpp"
+
+namespace online_visualization {
+
+class PrintPathsCommand : public LocalCommand<DebruijnEnvironment> {
+ typedef vector<EdgeId> Path;
+
+ protected:
+ size_t MinArgNumber() const {
+ return 2;
+ }
+
+ bool CheckCorrectness(const vector<string>& args) const {
+ return CheckEnoughArguments(args);
+ }
+
+ public:
+ string Usage() const {
+ string answer;
+ answer = answer + "Command `print_paths` \n" +
+ "Usage:\n" +
+ "> print_paths <vertex_from> <vertex_to> [<max_length>] \n" +
+ " This command prints all paths between two given vertices that do not exceed the `max_length` parameter.\n" +
+ " You should specify two integers (vertex ids) between which you want to find paths." +
+ " Optionally you can provide a `max_length` integer, \n" +
+ " so that the tool does not consider paths longer than `max_length`. It is 100000 by default.";
+ return answer;
+ }
+
+ PrintPathsCommand() : LocalCommand<DebruijnEnvironment>("print_paths")
+ {
+ }
+
+ void Execute(DebruijnEnvironment& curr_env, const ArgumentList& arg_list) const {
+ const vector<string>& args = arg_list.GetAllArguments();
+ if (!CheckCorrectness(args))
+ return;
+
+ TRACE("Executing `paths` command");
+// bool first_edge = false;
+// bool second_edge = false;
+ size_t from = GetInt(args[1]);
+ size_t to = GetInt(args[2]);
+ size_t max_length = 100000;
+ if (args.size() > 3)
+ max_length = GetInt(args[3]);
+ if (arg_list["edge"]=="true") {
+ // not so good, maybe do something
+// first_edge = second_edge = true;
+ TRACE("Looking at edges");
+ if (!CheckEdgeExists(curr_env.finder(), from) || !CheckEdgeExists(curr_env.finder(), to))
+ return;
+ from = curr_env.graph().int_id(curr_env.graph().EdgeEnd(curr_env.finder().ReturnEdgeId(from)));
+ to = curr_env.graph().int_id(curr_env.graph().EdgeStart(curr_env.finder().ReturnEdgeId(to)));
+ }
+
+ if (!CheckVertexExists(curr_env.finder(), from) || !CheckVertexExists(curr_env.finder(), to))
+ return;
+
+ const Graph& graph = curr_env.graph();
+
+ TRACE("Looking for the paths");
+ PathStorageCallback<Graph> callback(curr_env.graph());
+ ProcessPaths(graph, 0, max_length,
+ curr_env.finder().ReturnVertexId(from),
+ curr_env.finder().ReturnVertexId(to), callback);
+ const vector<Path>& paths = callback.paths();
+
+ cout << paths.size() << " path(s) have been found : " << endl;
+ for (size_t i = 0; i < paths.size(); ++i) {
+ cout << (i + 1) << "-th path (" << PathLength(graph, paths[i]) << ") ::: ";
+ for (size_t j = 0; j < paths[i].size(); ++j) {
+ cout << paths[i][j].int_id()
+ << "(" << graph.length(paths[i][j]) << ") ";
+ }
+ cout << endl;
+ }
+
+ }
+
+ private:
+ size_t PathLength(const Graph& g, const vector<EdgeId>& path) const {
+ size_t res = 0;
+ for (size_t i = 0; i < path.size(); ++i)
+ res += g.length(path[i]);
+ return res;
+ }
+
+ DECL_LOGGER("PrintPathsCommand");
+};
+
+}
diff --git a/src/projects/online_vis/vis_logger.hpp b/src/projects/online_vis/vis_logger.hpp
new file mode 100644
index 0000000..0b38093
--- /dev/null
+++ b/src/projects/online_vis/vis_logger.hpp
@@ -0,0 +1,28 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#include "dev_support/logger/log_writers.hpp"
+
+#undef INFO
+#define INFO(message) \
+{ \
+ cout << __FILE__ << " " << __LINE__ << " ::: " << message << endl; \
+} \
+
+
+#define LOG(message) \
+{ \
+ cout << message << endl; \
+} \
+
+//#define trace(message) LOG_MSG(logging::L_TRACE, message)
+#define debug(print, message) \
+{ \
+ if (print) { \
+ cout << message << endl; \
+ } \
+}
diff --git a/src/projects/online_vis/vis_utils.hpp b/src/projects/online_vis/vis_utils.hpp
new file mode 100644
index 0000000..17cc099
--- /dev/null
+++ b/src/projects/online_vis/vis_utils.hpp
@@ -0,0 +1,40 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "standard_vis.hpp"
+
+namespace online_visualization {
+
+ bool IsNumber(const string& s) {
+ if (s.empty())
+ return false;
+ for (auto iter = s.begin(); iter != s.end(); ++iter) {
+ if (!std::isdigit(*iter))
+ return false;
+ }
+ return true;
+ }
+
+ int GetInt(string str) {
+ stringstream ss(str);
+ int ans;
+ ss >> ans;
+ return ans;
+ }
+
+ vector<string> SplitInTokens(stringstream& args) {
+ vector<string> answer;
+ while (!args.eof()) {
+ string arg;
+ args >> arg;
+ answer.push_back(arg);
+ }
+ return answer;
+ }
+}
diff --git a/src/projects/scaffold_correction/CMakeLists.txt b/src/projects/scaffold_correction/CMakeLists.txt
new file mode 100644
index 0000000..45e47b8
--- /dev/null
+++ b/src/projects/scaffold_correction/CMakeLists.txt
@@ -0,0 +1,23 @@
+############################################################################
+# Copyright (c) 2015 Saint Petersburg State University
+# All Rights Reserved
+# See file LICENSE for details.
+############################################################################
+
+project(scaffold_correction CXX)
+
+
+add_executable(scaffold_correction
+ main.cpp)
+target_link_libraries(scaffold_correction spades_modules ${COMMON_LIBRARIES})
+
+if (SPADES_STATIC_BUILD)
+ set_target_properties(scaffold_correction PROPERTIES LINK_SEARCH_END_STATIC 1)
+endif()
+
+install(TARGETS scaffold_correction
+ DESTINATION bin
+ COMPONENT runtime)
+install(DIRECTORY "${SPADES_CFG_DIR}/scaffold_correction"
+ DESTINATION share/spades/configs
+ FILES_MATCHING PATTERN "*.info")
diff --git a/src/projects/scaffold_correction/main.cpp b/src/projects/scaffold_correction/main.cpp
new file mode 100644
index 0000000..9086e90
--- /dev/null
+++ b/src/projects/scaffold_correction/main.cpp
@@ -0,0 +1,112 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+/*
+ * Assembler Main
+ */
+#include "dev_support/logger/log_writers.hpp"
+
+#include "dev_support/segfault_handler.hpp"
+#include "dev_support/stacktrace.hpp"
+#include "dev_support/memory_limit.hpp"
+#include "dev_support/copy_file.hpp"
+#include "dev_support/perfcounter.hpp"
+#include "data_structures/sequence/runtime_k.hpp"
+#include "scaffold_correction.hpp"
+
+#include "pipeline/config_struct.hpp"
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#include "scaffold_correction.hpp"
+
+//FIXME code duplication
+void load_config(const vector<string>& cfg_fns) {
+ for (const auto& s : cfg_fns) {
+ path::CheckFileExistenceFATAL(s);
+ }
+
+ cfg::create_instance(cfg_fns);
+
+ if (!cfg::get().project_name.empty()) {
+ make_dir(cfg::get().output_base + cfg::get().project_name);
+ }
+
+ make_dir(cfg::get().output_dir);
+ make_dir(cfg::get().tmp_dir);
+
+ if (cfg::get().developer_mode)
+ make_dir(cfg::get().output_saves);
+
+ make_dir(cfg::get().temp_bin_reads_path);
+}
+
+void create_console_logger(string cfg_filename) {
+ using namespace logging;
+
+ string log_props_file = cfg::get().log_filename;
+
+ if (!path::FileExists(log_props_file))
+ log_props_file = path::append_path(path::parent_path(cfg_filename), cfg::get().log_filename);
+
+ logger *lg = create_logger(path::FileExists(log_props_file) ? log_props_file : "");
+ lg->add_writer(std::make_shared<console_writer>());
+ attach_logger(lg);
+}
+
+int main(int argc, char** argv) {
+ perf_counter pc;
+
+ const size_t GB = 1 << 30;
+
+ srand(42);
+ srandom(42);
+
+ try {
+ using namespace debruijn_graph;
+
+ string cfg_dir = path::parent_path(argv[1]);
+
+ vector<string> cfg_fns;
+ for (int i = 1; i < argc; ++i) {
+ cfg_fns.push_back(argv[i]);
+ }
+
+ load_config(cfg_fns);
+ create_console_logger(cfg_dir);
+
+ VERIFY(cfg::get().K >= runtime_k::MIN_K && cfg::get().K < runtime_k::MAX_K);
+ VERIFY(cfg::get().K % 2 != 0);
+
+ // read configuration file (dataset path etc.)
+
+ limit_memory(cfg::get().max_memory * GB);
+
+ // assemble it!
+ INFO("Assembling dataset (" << cfg::get().dataset_file << ") with K=" << cfg::get().K);
+
+ spades::run_scaffold_correction();
+ } catch (std::bad_alloc const& e) {
+ std::cerr << "Not enough memory to run SPAdes. " << e.what() << std::endl;
+ return EINTR;
+ } catch (std::exception const& e) {
+ std::cerr << "Exception caught " << e.what() << std::endl;
+ return EINTR;
+ } catch (...) {
+ std::cerr << "Unknown exception caught " << std::endl;
+ return EINTR;
+ }
+
+ unsigned ms = (unsigned)pc.time_ms();
+ unsigned secs = (ms / 1000) % 60;
+ unsigned mins = (ms / 1000 / 60) % 60;
+ unsigned hours = (ms / 1000 / 60 / 60);
+ INFO("Assembling time: " << hours << " hours " << mins << " minutes " << secs << " seconds");
+
+ // OK
+ return 0;
+}
diff --git a/src/projects/scaffold_correction/scaffold_correction.hpp b/src/projects/scaffold_correction/scaffold_correction.hpp
new file mode 100644
index 0000000..8488bee
--- /dev/null
+++ b/src/projects/scaffold_correction/scaffold_correction.hpp
@@ -0,0 +1,333 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+#include "io/reads_io/osequencestream.hpp"
+#include "pipeline/stage.hpp"
+#include "pipeline/graph_pack.hpp"
+#include "assembly_graph/paths/path_processor.hpp"
+#include "stages/construction.hpp"
+#include "pipeline/config_struct.hpp"
+#include "algorithms/dijkstra/dijkstra_algorithm.hpp"
+#include "algorithms/dijkstra/dijkstra_helper.hpp"
+#include "assembly_graph/graph_core/basic_graph_stats.hpp"
+
+namespace scaffold_correction {
+ typedef debruijn_graph::ConjugateDeBruijnGraph Graph;
+
+ class PathSideSimilarityComparator {
+ private:
+ typedef Graph::EdgeId EdgeId;
+ const Graph &graph_;
+ vector<EdgeId> initial_path_;
+
+ public:
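+ // Returns the total length of the edges of `path` that lie outside its longest
+ // common prefix and suffix with initial_path_.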
+ size_t CalculateDifference(vector<EdgeId> path) {
+ size_t lside = 0;
+ size_t rside = 0;
+ size_t min_size = std::min(path.size(), initial_path_.size());
+ while(lside < min_size && path[lside] == initial_path_[lside])
+ lside++;
+ while(rside < min_size && lside + rside < min_size &&
+ path[path.size() - 1 - rside] == initial_path_[initial_path_.size() - 1 - rside])
+ rside++;
+ size_t result = 0;
+ for(size_t i = lside; i < path.size() - rside; i++) {
+ result += graph_.length(path[i]);
+ }
+ return result;
+ }
+
+ PathSideSimilarityComparator(const Graph &graph, vector<EdgeId> path) : graph_(graph), initial_path_(path) {
+ }
+
+
+ bool operator()(const vector<EdgeId> &path1, const vector<EdgeId> &path2) {
+ return CalculateDifference(path1) < CalculateDifference(path2);
+ }
+ };
+
+ class CarefulPathFixer {
+ private:
+ typedef Graph::EdgeId EdgeId;
+ typedef Graph::VertexId VertexId;
+ const Graph &graph_;
+ size_t max_cut_length_;
+ size_t max_insert_;
+
+ bool Consistent(EdgeId e1, EdgeId e2) const {
+ return graph_.EdgeEnd(e1) == graph_.EdgeStart(e2);
+ }
+
+ size_t StepBack(size_t pos, const vector<EdgeId> &edges) const {
+ size_t step_size = 0;
+ while(pos > 0 && Consistent(edges[pos - 1], edges[pos]) && step_size + graph_.length(edges[pos]) <= max_cut_length_) {
+ step_size += graph_.length(edges[pos]);
+ pos -= 1;
+ }
+ return pos;
+ }
+
+ size_t StepForward(size_t pos, const vector<EdgeId> &edges) const {
+ size_t step_size = 0;
+ while(pos + 1 < edges.size() && Consistent(edges[pos], edges[pos + 1])&& step_size + graph_.length(edges[pos]) <= max_cut_length_) {
+ step_size += graph_.length(edges[pos]);
+ pos += 1;
+ }
+ return pos;
+ }
+
+
+ void PrintPath(vector<EdgeId> path) const {
+ for(size_t i = 0; i < path.size(); i++) {
+ TRACE(graph_.EdgeNucls(path[i]));
+ }
+ }
+
+
+ vector<EdgeId> TryCloseGap(VertexId v1, VertexId v2, const vector<EdgeId> &path) const {
+ if (v1 == v2)
+ return vector<EdgeId>();
+ TRACE(
+ "Trying to close gap between v1=" << graph_.int_id(v1) << " and v2=" << graph_.int_id(v2));
+ typedef omnigraph::DijkstraHelper<Graph>::PathIgnoringDijkstraSettings DS;
+ size_t max_path_length = max_insert_ + 2 * max_cut_length_;
+ DS ds(DS::LC(graph_, path), DS::VPrC(max_path_length), DS::VPuC(max_path_length), DS::NIF(graph_));
+ omnigraph::Dijkstra<Graph, DS> dj(graph_, ds);
+ dj.Run(v1);
+ if(dj.DistanceCounted(v2) && dj.GetDistance(v2) <= max_insert_) {
+ vector<EdgeId> result = dj.GetShortestPathTo(v2);
+ VERIFY(graph_.EdgeStart(result.front()) == v1);
+ VERIFY(graph_.EdgeEnd(result.back()) == v2);
+ TRACE("Gap closed");
+ TRACE("Cumulative closure length is " << CumulativeLength(graph_, result));
+ TRACE("Difference from initial path: " << dj.GetDistance(v2));
+ return result;
+ } else {
+ TRACE("Failed to find closing path");
+ return vector<EdgeId>();
+ }
+/*
+ PathSideSimilarityComparator comparator(garph_, path);
+ omnigraph::BestPathStorage<Graph, PathSideSimilarityComparator> storage(graph_, comparator);
+// omnigraph::PathStorageCallback<Graph> path_store(graph_);
+ //todo reduce value after investigation
+ omnigraph::PathProcessor<Graph> path_processor(graph_, 0, max_insert_, v1, v2, 1000000, storage);
+ path_processor.Process();
+ TRACE(graph_.VertexNucls(v1));
+ TRACE(graph_.VertexNucls(v2));
+ size_t error_code = path_processor.Process();
+ TRACE("Error code: " << error_code);
+
+ if (storage.size() == 0) {
+ TRACE("Failed to find closing path");
+ return vector<EdgeId>();
+ } else if (storage.size() == 1) {
+ TRACE("Unique closing path found");
+ } else {
+ TRACE("Several closing paths found(" << path_store.paths().size() << "), first chosen");
+ }
+ auto tmp = path_store.paths();
+ TRACE("Number of paths: " << tmp.size());
+// for(auto it = tmp.begin(); it != tmp.end(); ++it) {
+// TRACE(ConstructSequence(*it));
+// }
+ vector<EdgeId> answer = storage.BestPath();
+ TRACE("Gap closed");
+ TRACE( "Cumulative closure length is " << CumulativeLength(graph_, answer));
+ TRACE( "Difference from initial path: " << comparator.CalculateDifference(answer));
+ return answer;
+*/
+ }
+
+ public:
+ CarefulPathFixer(const Graph &graph, size_t max_cut_length, size_t max_insert)
+ : graph_(graph), max_cut_length_(max_cut_length), max_insert_(max_insert) {
+ }
+
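+ // Walks the edge list; whenever two consecutive edges are not adjacent in the
+ // graph, steps back/forward over at most max_cut_length_ of flanking sequence
+ // and tries to close the gap with a Dijkstra search bounded by max_insert_.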
+ vector<EdgeId> TryFixPath(const vector<EdgeId>& edges) const {
+ vector<EdgeId> result;
+ if (edges.empty()) {
+ return vector<EdgeId>();
+ }
+ result.push_back(edges[0]);
+ for (size_t i = 1; i < edges.size(); ++i) {
+ if (!Consistent(result.back(), edges[i])) {
+ size_t lindex = StepBack(result.size() - 1, result);
+ size_t rindex = StepForward(i, edges);
+ vector<EdgeId> current_path(result.begin() + lindex + 1, result.end());
+ current_path.insert(current_path.end(), edges.begin() + i, edges.begin() + rindex);
+ vector<EdgeId> closure = TryCloseGap(graph_.EdgeEnd(result[lindex]), graph_.EdgeStart(edges[rindex]), current_path);
+ if(closure.size() != 0 || Consistent(result[lindex], edges[rindex])) {
+ result.resize(lindex + 1);
+ VERIFY(closure.size() == 0 || Consistent(result.back(), closure.front()));
+ result.insert(result.end(), closure.begin(), closure.end());
+ i = rindex;
+ VERIFY(Consistent(result.back(), edges[i]));
+ }
+ }
+ result.push_back(edges[i]);
+ }
+ return result;
+ }
+ DECL_LOGGER("CarefulPathFixer")
+ };
+
+ class ScaffoldCorrector {
+ typedef debruijn_graph::conj_graph_pack graph_pack;
+ private:
+
+ const graph_pack& gp_;
+ const CarefulPathFixer &fixer_;
+
+
+ bool CheckPath(const vector<Graph::EdgeId> &path) const {
+ if(path.size() == 0)
+ return false;
+ for(size_t i = 1; i < path.size(); i++) {
+ if(gp_.g.EdgeEnd(path[i - 1]) != gp_.g.EdgeStart(path[i])) {
+ return false;
+ }
+ }
+ return true;
+ }
+
+ Sequence ConstructSequence(const vector<Graph::EdgeId> &path) const {
+ Sequence result = gp_.g.EdgeNucls(path[0]);
+ for(size_t i = 1; i < path.size(); i++) {
+ result = result + gp_.g.EdgeNucls(path[i]).Subseq(gp_.k_value);
+ }
+ return result;
+ }
+
+ public:
+ ScaffoldCorrector(const graph_pack &gp, const CarefulPathFixer &fixer) : gp_(gp), fixer_(fixer) {
+ }
+
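+ // Maps every scaffold part to the graph, joins the mapping paths, lets the
+ // fixer close gaps between non-adjacent edges and returns the reconstructed
+ // sequence only if the resulting path is fully consistent.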
+ Sequence correct(const vector<Sequence> &scaffold) const {
+ auto mapper = debruijn_graph::MapperInstance(gp_);
+ MappingPath <debruijn_graph::EdgeId> path;
+ for(auto it = scaffold.begin(); it != scaffold.end(); ++it) {
+ path.join(mapper->MapSequence(*it));
+ }
+ vector<Graph::EdgeId> corrected_path = fixer_.TryFixPath(path.simple_path());
+ if(CheckPath(corrected_path)) {
+ return ConstructSequence(corrected_path);
+ } else {
+ return Sequence();
+ }
+ }
+ };
+}
+
+namespace spades {
+ class ScaffoldCorrectionStage : public AssemblyStage {
+ public:
+ typedef debruijn_graph::config::debruijn_config::scaffold_correction Config;
+ private:
+ size_t k_;
+ std::string output_file_;
+ const Config &config_;
+ public:
+ ScaffoldCorrectionStage(size_t k, string output_file,
+ const Config &config) :
+ AssemblyStage("ScaffoldCorrection", "scaffold_correction"),
+ k_(k), output_file_(output_file), config_(config) {
+ }
+
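+ // Splits the scaffold into its maximal runs of ordinary nucleotides,
+ // i.e. breaks it at Ns and other non-nucleotide characters.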
+ vector<Sequence> CollectScaffoldParts(const io::SingleRead &scaffold) const {
+ vector<Sequence> result;
+ for(size_t i = 0; i < scaffold.size(); i++) {
+ size_t j = i;
+ while(j < scaffold.size() && is_nucl(scaffold.GetSequenceString()[j])) {
+ j++;
+ }
+ if(j > i) {
+ result.push_back(scaffold.Substr(i, j).sequence());
+ i = j - 1;
+ }
+ }
+ return result;
+ }
+
+ void OutputResults(const vector<io::SingleRead> &results) {
+ io::osequencestream_simple oss(output_file_);
+ for(size_t i = 0; i < results.size(); i++) {
+ string sequence = results[i].GetSequenceString();
+ if(sequence != "") {
+ oss.set_header(results[i].name());
+ oss << sequence;
+ }
+ }
+ }
+
+ vector<io::SingleRead> ReadScaffolds(const string &scaffolds_file) {
+ io::FileReadStream scaffold_stream(scaffolds_file);
+ vector<io::SingleRead> scaffolds;
+ while(!scaffold_stream.eof()) {
+ io::SingleRead scaffold;
+ scaffold_stream >> scaffold;
+ scaffolds.push_back(scaffold);
+ }
+ return scaffolds;
+ }
+
+ vector<io::SingleRead> RunParallelCorrection(const vector<io::SingleRead> &scaffolds, const scaffold_correction::ScaffoldCorrector &corrector) {
+ vector<io::SingleRead> results(scaffolds.size());
+#pragma omp parallel for
+ for(size_t i = 0; i < scaffolds.size(); i++) {
+ auto scaffold = scaffolds[i];
+ std::string name = scaffold.name();
+ vector<Sequence> part_list = CollectScaffoldParts(scaffold);
+ TRACE("Correcting scaffold " << name);
+ TRACE("Number of parts: " << part_list.size());
+ Sequence result = corrector.correct(part_list);
+ if (result.size() != 0) {
+ TRACE("Correction successful");
+ results[i] = io::SingleRead(name, result.str());
+ } else if (config_.output_unfilled) {
+ TRACE("Correction unsuccessful. Using uncorrected scaffold");
+ results[i] = scaffold;
+ }
+ }
+ return results;
+ }
+
+ void run(debruijn_graph::conj_graph_pack &graph_pack, const char *) {
+ INFO("Correcting scaffolds from " << config_.scaffolds_file);
+ scaffold_correction::CarefulPathFixer fixer(graph_pack.g, config_.max_cut_length, config_.max_insert);
+ scaffold_correction::ScaffoldCorrector corrector(graph_pack, fixer);
+ vector<io::SingleRead> scaffolds = ReadScaffolds(config_.scaffolds_file);
+ vector<io::SingleRead> results = RunParallelCorrection(scaffolds, corrector);
+ OutputResults(results);
+ INFO(scaffolds.size() << " scaffolds processed");
+ INFO("Corrected scaffolds written to " << output_file_);
+ }
+ DECL_LOGGER("ScaffoldCorrectionStage")
+ };
+
+ void run_scaffold_correction() {
+ INFO("Scaffold correction started");
+
+ debruijn_graph::conj_graph_pack conj_gp(cfg::get().K,
+ cfg::get().tmp_dir,
+ cfg::get().ds.reads.lib_count(),
+ cfg::get().ds.reference_genome,
+ cfg::get().flanking_range,
+ cfg::get().pos.max_mapping_gap,
+ cfg::get().pos.max_gap_diff);
+ StageManager manager({cfg::get().developer_mode,
+ cfg::get().load_from,
+ cfg::get().output_saves});
+ manager.add(new debruijn_graph::Construction())
+ .add(new ScaffoldCorrectionStage(cfg::get().K, cfg::get().output_dir + "corrected_scaffolds.fasta", *cfg::get().sc_cor));
+ INFO("Output directory: " << cfg::get().output_dir);
+ conj_gp.kmer_mapper.Attach();
+ manager.run(conj_gp, cfg::get().entry_point.c_str());
+ INFO("Scaffold correction finished.");
+ }
+
+}
diff --git a/src/projects/spades/CMakeLists.txt b/src/projects/spades/CMakeLists.txt
new file mode 100644
index 0000000..f245266
--- /dev/null
+++ b/src/projects/spades/CMakeLists.txt
@@ -0,0 +1,32 @@
+############################################################################
+# Copyright (c) 2015 Saint Petersburg State University
+# Copyright (c) 2011-2014 Saint Petersburg Academic University
+# All Rights Reserved
+# See file LICENSE for details.
+############################################################################
+
+project(spades CXX)
+
+add_executable(spades main.cpp
+ gap_closer.cpp
+ mismatch_correction.cpp
+ pair_info_count.cpp
+ second_phase_setup.cpp
+ distance_estimation.cpp
+ repeat_resolving.cpp
+ pacbio_aligning.cpp
+ chromosome_removal.cpp)
+
+target_include_directories(spades PRIVATE ${EXT_DIR}/include/ConsensusCore)
+target_link_libraries(spades ConsensusCore spades_modules nlopt BamTools ssw ${COMMON_LIBRARIES})
+
+if (SPADES_STATIC_BUILD)
+ set_target_properties(spades PROPERTIES LINK_SEARCH_END_STATIC 1)
+endif()
+
+install(TARGETS spades
+ DESTINATION bin
+ COMPONENT runtime)
+install(DIRECTORY "${SPADES_CFG_DIR}/debruijn"
+ DESTINATION share/spades/configs
+ FILES_MATCHING PATTERN "*.info")
diff --git a/src/projects/spades/chromosome_removal.cpp b/src/projects/spades/chromosome_removal.cpp
new file mode 100644
index 0000000..aec5b82
--- /dev/null
+++ b/src/projects/spades/chromosome_removal.cpp
@@ -0,0 +1,244 @@
+//****************************************************************************
+//* Copyright (c) 2011-2014 Saint-Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//****************************************************************************
+
+#include "assembly_graph/graph_support/contig_output.hpp"
+#include "stages/simplification_pipeline/graph_simplification.hpp"
+#include "algorithms/simplification/ec_threshold_finder.hpp"
+#include "assembly_graph/graph_core/basic_graph_stats.hpp"
+
+#include "chromosome_removal.hpp"
+
+namespace debruijn_graph {
+
+
+void ChromosomeRemoval::CompressAll(Graph &g) {
+ for (auto it = g.SmartVertexBegin(); ! it.IsEnd(); ++it) {
+ if (g.IsDeadStart(*it) && g.IsDeadEnd(*it)) {
+ g.DeleteVertex(*it);
+ } else {
+ g.CompressVertex(*it);
+ }
+ }
+}
+
+void ChromosomeRemoval::DeleteAndCompress(EdgeId e, Graph &g){
+ auto start = g.EdgeStart(e);
+ auto end = g.EdgeEnd(e);
+ g.DeleteEdge(e);
+ bool is_cycle = (start == end || start == g.conjugate(end));
+ if (g.IsDeadStart(start) && g.IsDeadEnd(start)) {
+ g.DeleteVertex(start);
+ } else {
+ g.CompressVertex(start);
+ }
+ if (is_cycle) {
+ return;
+ }
+ if (g.IsDeadStart(end) && g.IsDeadEnd(end)) {
+ g.DeleteVertex(end);
+ } else {
+ g.CompressVertex(end);
+ }
+}
+
+
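+// Traverses the connected component containing edge e, summing edge lengths and
+// counting dead ends; the results are cached per edge/vertex in long_component_,
+// long_vertex_component_ and deadends_count_. Returns the component length.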
+size_t ChromosomeRemoval::CalculateComponentSize(EdgeId e, Graph &g_) {
+ std::stack<EdgeId> next;
+ size_t deadend_count = 0;
+ next.push(e);
+ std::unordered_set<EdgeId> used;
+ size_t ans = 0;
+ while (!next.empty()){
+ auto cur = next.top();
+ next.pop();
+ if (used.find(cur) != used.end()) {
+ continue;
+ }
+ ans += g_.length(cur);
+
+ used.insert(cur);
+ vector<EdgeId> neighbours;
+ neighbours.push_back(g_.conjugate(cur));
+ auto start = g_.EdgeStart(cur);
+ auto tmp = g_.IncidentEdges(start);
+
+ neighbours.insert(neighbours.end(), tmp.begin(), tmp.end());
+ auto end = g_.EdgeEnd(cur);
+ if (g_.IsDeadStart(start))
+ deadend_count++;
+ if (g_.IsDeadEnd(end))
+ deadend_count++;
+ tmp = g_.IncidentEdges(end);
+ neighbours.insert(neighbours.end(), tmp.begin(), tmp.end());
+ for (auto ee:neighbours) {
+ if (used.find(ee) == used.end()) {
+ next.push(ee);
+ }
+ }
+ }
+ for (auto edge: used) {
+ long_component_[edge] = ans;
+ long_vertex_component_[g_.EdgeStart(edge)] = ans;
+ long_vertex_component_[g_.EdgeEnd(edge)] = ans;
+ deadends_count_[edge] = deadend_count;
+ }
+ return ans;
+}
+
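+// Computes the length-weighted median coverage of edges longer than
+// edge_length_for_median (or uses external_chromosome_coverage when provided)
+// and deletes long edges whose coverage lies within +/- coverage_limits of it,
+// unless they belong to a small component without dead ends.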
+double ChromosomeRemoval::RemoveLongGenomicEdges(conj_graph_pack &gp, size_t long_edge_bound, double coverage_limits, double external_chromosome_coverage){
+ INFO("Removing of long chromosomal edges started");
+ vector <pair<double, size_t> > coverages;
+ size_t total_len = 0, short_len = 0, cur_len = 0;
+ for (auto iter = gp.g.ConstEdgeBegin(); ! iter.IsEnd(); ++iter){
+ if (gp.g.length(*iter) > cfg::get().pd->edge_length_for_median) {
+ coverages.push_back(make_pair(gp.g.coverage(*iter), gp.g.length(*iter)));
+ total_len += gp.g.length(*iter);
+ } else {
+ short_len += gp.g.length(*iter);
+ }
+ }
+ if (total_len == 0) {
+ if (external_chromosome_coverage < 1.0) {
+ WARN("plasmid detection failed, not enough long edges");
+ }
+ else {
+ INFO("All long edges deleted, stopping detection");
+ }
+ return 0;
+ }
+ std::sort(coverages.begin(), coverages.end());
+ size_t i = 0;
+ while (cur_len < total_len/2 && i <coverages.size()) {
+ cur_len += coverages[i].second;
+ i++;
+ }
+
+ double median_long_edge_coverage;
+ if (external_chromosome_coverage < 1.0) {
+ median_long_edge_coverage = coverages[i-1].first;
+ INFO ("genomic coverage is "<< median_long_edge_coverage << " calculated of length " << size_t (double(total_len) * 0.5));
+ for (auto iter = gp.g.ConstEdgeBegin(); ! iter.IsEnd(); ++iter) {
+ if (long_component_.find(*iter) == long_component_.end()) {
+ CalculateComponentSize(*iter, gp.g);
+ }
+ }
+ INFO("Connected components calculated");
+ } else {
+ median_long_edge_coverage = external_chromosome_coverage;
+ }
+ size_t deleted = 0;
+ for (auto iter = gp.g.SmartEdgeBegin(); ! iter.IsEnd(); ++iter){
+ if (gp.g.length(*iter) > long_edge_bound) {
+ if (gp.g.coverage(*iter) < median_long_edge_coverage * (1 + coverage_limits) && gp.g.coverage(*iter) > median_long_edge_coverage * (1 - coverage_limits)) {
+ DEBUG("Considering long edge: id " << gp.g.int_id(*iter) << " length: " << gp.g.length(*iter) <<" coverage: " << gp.g.coverage(*iter));
+ if ( long_component_.find(*iter) != long_component_.end() && 300000 > long_component_[*iter] && deadends_count_[*iter] == 0) {
+ DEBUG("Edge " << gp.g.int_id(*iter) << " skipped - because of small nondeadend connected component of size " << long_component_[*iter]);
+ } else {
+ DEBUG(" Edge " << gp.g.int_id(*iter) << " deleted");
+ deleted++;
+ gp.g.DeleteEdge(*iter);
+ }
+ }
+ }
+ }
+ INFO("Deleted " << deleted <<" long edges");
+ CompressAll(gp.g);
+ return median_long_edge_coverage;
+}
+
+void ChromosomeRemoval::PlasmidSimplify(conj_graph_pack &gp, size_t long_edge_bound,
+ std::function<void (EdgeId)> removal_handler ) {
+ DEBUG("Simplifying graph for plasmid project");
+ size_t iteration_count = 10;
+ for (size_t i = 0; i < iteration_count; i++) {
+ //pred::TypedPredicate<typename Graph::EdgeId> condition = make_shared<LengthUpperBound<Graph>>(gp.g, long_edge_bound) ;
+ omnigraph::EdgeRemovingAlgorithm<Graph> tc(gp.g, pred::And(DeadEndCondition<Graph>(gp.g), LengthUpperBound<Graph>(gp.g, long_edge_bound)),
+ removal_handler, true);
+ tc.Run();
+ }
+ gp.EnsureIndex();
+}
+
+void ChromosomeRemoval::run(conj_graph_pack &gp, const char*) {
+ //FIXME Seriously?! cfg::get().ds like hundred times...
+ OutputContigs(gp.g, cfg::get().output_dir + "before_chromosome_removal", false, 0, false);
+ INFO("Before iteration " << 0 << ", " << gp.g.size() << " vertices in graph");
+ double chromosome_coverage = RemoveLongGenomicEdges(gp, cfg::get().pd->long_edge_length, cfg::get().pd->relative_coverage );
+ PlasmidSimplify(gp, cfg::get().pd->long_edge_length);
+//TODO:: reconsider and move somewhere(not to config)
+ size_t max_iteration_count = 30;
+
+ for (size_t i = 0; i < max_iteration_count; i++) {
+ size_t graph_size = gp.g.size();
+ INFO("Before iteration " << i + 1 << " " << graph_size << " vertices in graph");
+ RemoveLongGenomicEdges(gp, cfg::get().pd->long_edge_length, cfg::get().pd->relative_coverage, chromosome_coverage );
+ INFO("Before dead_end simplification " << i << " " << gp.g.size() << " vertices in graph");
+
+ PlasmidSimplify(gp, cfg::get().pd->long_edge_length);
+ size_t new_graph_size = gp.g.size();
+ if (new_graph_size == graph_size) {
+ INFO("Iteration " << i << " graph was not changed");
+ INFO(new_graph_size << " vertices left");
+ break;
+ }
+ }
+//Small repetitive components after filtering
+ std::unordered_map<VertexId, size_t> old_vertex_weights (long_vertex_component_.begin(), long_vertex_component_.end());
+ for (size_t i = 0; i < max_iteration_count; i++) {
+ size_t graph_size = gp.g.size();
+ long_vertex_component_.clear();
+ long_component_.clear();
+ deadends_count_.clear();
+ for (auto iter = gp.g.ConstEdgeBegin(); !iter.IsEnd(); ++iter) {
+ CalculateComponentSize(*iter, gp.g);
+ }
+
+ for (auto iter = gp.g.SmartEdgeBegin(); !iter.IsEnd(); ++iter) {
+ if (gp.g.IsDeadEnd(gp.g.EdgeEnd(*iter)) && gp.g.IsDeadStart(gp.g.EdgeStart(*iter))
+ && old_vertex_weights.find(gp.g.EdgeStart(*iter)) != old_vertex_weights.end()
+//* 2 - because each edge is counted together with its reverse complement (conjugate)
+ && old_vertex_weights[gp.g.EdgeStart(*iter)] > long_component_[*iter] + cfg::get().pd->long_edge_length * 2) {
+ DEBUG("deleting isolated edge of length" << gp.g.length(*iter));
+ gp.g.DeleteEdge(*iter);
+ }
+ }
+ for (auto iter = gp.g.SmartEdgeBegin(); !iter.IsEnd(); ++iter) {
+ if (long_component_[*iter] < 2 * cfg::get().pd->small_component_size) {
+ if (old_vertex_weights.find(gp.g.EdgeStart(*iter)) != old_vertex_weights.end() &&
+ old_vertex_weights[gp.g.EdgeStart(*iter)] >
+ long_component_[*iter] + cfg::get().pd->long_edge_length * 2 &&
+ gp.g.coverage(*iter) < chromosome_coverage * (1 + cfg::get().pd->small_component_relative_coverage)
+ && gp.g.coverage(*iter) > chromosome_coverage * (1 - cfg::get().pd->small_component_relative_coverage)) {
+ DEBUG("Deleting edge from fake small component, length " << gp.g.length(*iter) << " component_size " << old_vertex_weights[gp.g.EdgeStart(*iter)]) ;
+ gp.g.DeleteEdge(*iter);
+ }
+ }
+ }
+ for (auto iter = gp.g.SmartEdgeBegin(); !iter.IsEnd(); ++iter) {
+ if (long_component_[*iter] < 2 * cfg::get().pd->min_component_length &&
+ !(deadends_count_[*iter] == 0 &&
+ gp.g.length(*iter) > cfg::get().pd->min_isolated_length)) {
+ gp.g.DeleteEdge(*iter);
+ }
+ }
+
+ CompressAll(gp.g);
+ PlasmidSimplify(gp, cfg::get().pd->long_edge_length);
+ size_t new_graph_size = gp.g.size();
+ if (new_graph_size == graph_size) {
+ INFO("Iteration " << i << " of small components additional filtering graph was not changed");
+ INFO("After plasmidSPAdes subroutine " << new_graph_size << " vertices left");
+ break;
+ }
+ }
+ INFO("Counting average coverage after genomic edge removal");
+ AvgCovereageCounter<Graph> cov_counter(gp.g);
+ cfg::get_writable().ds.set_avg_coverage(cov_counter.Count());
+ INFO("Average coverage = " << cfg::get().ds.avg_coverage());
+}
+
+
+} //debruijn_graph
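
For reference, the coverage cutoff computed in RemoveLongGenomicEdges above is a length-weighted median: (coverage, length) pairs of long edges are sorted by coverage and their lengths accumulated until half of the total long-edge length is reached. A minimal standalone sketch of that computation (the function name is illustrative, not taken from the SPAdes sources):

#include <algorithm>
#include <cstddef>
#include <utility>
#include <vector>

// Length-weighted median of coverage: sort (coverage, length) pairs by
// coverage and accumulate lengths until half of the total length is reached.
double WeightedMedianCoverage(std::vector<std::pair<double, std::size_t>> cov_len) {
    std::size_t total_len = 0;
    for (const auto &p : cov_len)
        total_len += p.second;
    if (total_len == 0)
        return 0.0;
    std::sort(cov_len.begin(), cov_len.end());
    std::size_t cur_len = 0;
    std::size_t i = 0;
    do {
        cur_len += cov_len[i].second;
        ++i;
    } while (cur_len < total_len / 2 && i < cov_len.size());
    return cov_len[i - 1].first;
}

Long edges whose coverage lies within the configured relative window around this median are then treated as chromosomal and removed, as the loop in RemoveLongGenomicEdges shows.
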
diff --git a/src/projects/spades/chromosome_removal.hpp b/src/projects/spades/chromosome_removal.hpp
new file mode 100644
index 0000000..f5e2cf9
--- /dev/null
+++ b/src/projects/spades/chromosome_removal.hpp
@@ -0,0 +1,35 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint-Petersburg State University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//****************************************************************************
+
+#pragma once
+
+#include "pipeline/stage.hpp"
+#include "assembly_graph/graph_core/graph.hpp"
+
+namespace debruijn_graph {
+
+class ChromosomeRemoval : public spades::AssemblyStage {
+public:
+ ChromosomeRemoval()
+ : AssemblyStage("Chromosome Removal", "chromosome_removal"), long_component_(), long_vertex_component_(),deadends_count_() { }
+
+ void run(conj_graph_pack &gp, const char *);
+
+private:
+ std::unordered_map <EdgeId, size_t> long_component_;
+ std::unordered_map <VertexId, size_t> long_vertex_component_;
+ std::unordered_map <EdgeId, size_t> deadends_count_;
+
+ size_t CalculateComponentSize(debruijn_graph::EdgeId e, Graph &g_);
+
+ double RemoveLongGenomicEdges(conj_graph_pack &gp, size_t long_edge_bound, double coverage_limits,
+ double external_chromosome_coverage = 0);
+ void PlasmidSimplify(conj_graph_pack &gp, size_t long_edge_bound,
+ std::function<void(typename Graph::EdgeId)> removal_handler = 0);
+ void CompressAll(Graph &g);
+ void DeleteAndCompress(EdgeId e, Graph &g);
+};
+}
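
CalculateComponentSize, declared above, is essentially a traversal over edges: starting from one edge it visits its conjugate and all incident edges, sums their lengths, and counts dead ends along the way. A simplified sketch of the same traversal on a toy adjacency structure (ToyGraph and ComponentLength are illustrative names; dead-end counting and the per-edge/per-vertex bookkeeping of the real code are left out):

#include <cstddef>
#include <queue>
#include <unordered_set>
#include <vector>

// Toy model: edges are indices; each edge has a length and a list of
// neighbouring edges (edges sharing an endpoint, plus its conjugate).
struct ToyGraph {
    std::vector<std::size_t> length;                 // length[e]
    std::vector<std::vector<std::size_t>> adjacent;  // adjacent[e]
};

// Total length of the connected component that contains edge `start`;
// `used` collects every visited edge, as in CalculateComponentSize.
std::size_t ComponentLength(const ToyGraph &g, std::size_t start,
                            std::unordered_set<std::size_t> &used) {
    std::size_t total = 0;
    std::queue<std::size_t> next;
    next.push(start);
    while (!next.empty()) {
        std::size_t cur = next.front();
        next.pop();
        if (!used.insert(cur).second)
            continue;                     // already visited
        total += g.length[cur];
        for (std::size_t e : g.adjacent[cur])
            if (used.find(e) == used.end())
                next.push(e);
    }
    return total;
}
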
diff --git a/src/projects/spades/distance_estimation.cpp b/src/projects/spades/distance_estimation.cpp
new file mode 100644
index 0000000..4444f90
--- /dev/null
+++ b/src/projects/spades/distance_estimation.cpp
@@ -0,0 +1,231 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#include "io/dataset_support/dataset_readers.hpp"
+#include "paired_info/pair_info_improver.hpp"
+
+#include "paired_info/paired_info_helpers.hpp"
+#include "paired_info/pair_info_filters.hpp"
+#include "paired_info/distance_estimation.hpp"
+#include "paired_info/weighted_distance_estimation.hpp"
+#include "paired_info/smoothing_distance_estimation.hpp"
+#include "paired_info/weights.hpp"
+
+#include "distance_estimation.hpp"
+#include <set>
+
+namespace debruijn_graph {
+
+using namespace omnigraph::de;
+
+template<class Graph>
+void estimate_with_estimator(const Graph &graph,
+ const omnigraph::de::AbstractDistanceEstimator<Graph>& estimator,
+ omnigraph::de::AbstractPairInfoChecker<Graph>& checker,
+ PairedIndexT& clustered_index) {
+ using config::estimation_mode;
+ DEBUG("Estimating distances");
+
+ estimator.Estimate(clustered_index, cfg::get().max_threads);
+
+ INFO("Filtering info");
+ if(cfg::get().amb_de.enabled){
+ AmbiguousPairInfoChecker<Graph> amb_de_checker(graph,
+ clustered_index,
+ checker,
+ cfg::get().amb_de.haplom_threshold,
+ cfg::get().amb_de.relative_length_threshold,
+ cfg::get().amb_de.relative_seq_threshold);
+ PairInfoFilter<Graph>(amb_de_checker).Filter(clustered_index);
+ }
+ else
+ PairInfoFilter<Graph>(checker).Filter(clustered_index);
+// filter.Filter(clustered_index);
+ DEBUG("Info Filtered");
+}
+
+
+// Postprocessing, checking that clusters do not intersect
+template<class Graph>
+void RefinePairedInfo(const Graph& graph, PairedInfoIndexT<Graph>& clustered_index) {
+ for (auto iter = pair_begin(clustered_index); iter != pair_end(clustered_index); ++iter) {
+ EdgeId first_edge = iter.first();
+ EdgeId second_edge = iter.second();
+ auto infos = iter->Unwrap(); //we need an ordered histogram here
+ if (infos.empty())
+ continue;
+
+ auto prev_it = infos.begin();
+ auto it = prev_it;
+ ++it;
+ for (auto end_it = infos.end(); it != end_it; ++it) {
+ if (math::le(abs(it->d - prev_it->d), it->var + prev_it->var)) {
+ WARN("Clusters intersect, edges -- " << graph.int_id(first_edge)
+ << " " << graph.int_id(second_edge));
+ INFO("Trying to handle this case");
+ // seeking the symmetric pair info to [i - 1]
+ bool success = false;
+ double total_weight = prev_it->weight;
+ for (auto inner_it = it; inner_it != end_it; ++inner_it) {
+ total_weight += inner_it->weight;
+ if (math::eq(inner_it->d + prev_it->d, 0.f)) {
+ success = true;
+ DEDistance center = 0.;
+ DEVariance var = inner_it->d + inner_it->var;
+ for (auto inner_it_2 = prev_it; inner_it_2 != inner_it; ++inner_it_2) {
+ TRACE("Removing pair info " << *inner_it_2);
+ clustered_index.Remove(first_edge, second_edge, *inner_it_2);
+ }
+ clustered_index.Remove(first_edge, second_edge, *inner_it);
+ Point new_point(center, total_weight, var);
+ TRACE("Adding new pair info " << first_edge << " " << second_edge << " " << new_point);
+ clustered_index.Add(first_edge, second_edge, new_point);
+ break;
+ }
+ }
+ INFO("Pair information was resolved");
+
+ if (!success)
+ WARN("This intersection can not be handled in the right way");
+
+ break;
+ }
+ }
+ }
+}
+
+void estimate_distance(conj_graph_pack& gp,
+ const io::SequencingLibrary<config::DataSetData> &lib,
+ const UnclusteredPairedIndexT& paired_index,
+ PairedIndexT& clustered_index,
+ PairedIndexT& scaffolding_index) {
+ using config::estimation_mode;
+
+ const config::debruijn_config& config = cfg::get();
+ size_t delta = size_t(lib.data().insert_size_deviation);
+ size_t linkage_distance = size_t(config.de.linkage_distance_coeff * lib.data().insert_size_deviation);
+ GraphDistanceFinder<Graph> dist_finder(gp.g, (size_t)math::round(lib.data().mean_insert_size), lib.data().read_length, delta);
+ size_t max_distance = size_t(config.de.max_distance_coeff * lib.data().insert_size_deviation);
+
+ std::function<double(int)> weight_function;
+
+ if (config.est_mode == estimation_mode::weighted || // in these cases we need a weight function
+ config.est_mode == estimation_mode::smoothing) { // to estimate graph distances in the histogram
+ if (lib.data().insert_size_distribution.size() == 0) {
+ WARN("No insert size distribution found, stopping distance estimation");
+ return;
+ }
+
+ WeightDEWrapper wrapper(lib.data().insert_size_distribution, lib.data().mean_insert_size);
+ DEBUG("Weight Wrapper Done");
+ weight_function = std::bind(&WeightDEWrapper::CountWeight, wrapper, std::placeholders::_1);
+ } else
+ weight_function = UnityFunction;
+
+// PairInfoWeightFilter<Graph> filter(gp.g, config.de.filter_threshold);
+ PairInfoWeightChecker<Graph> checker(gp.g, config.de.filter_threshold);
+
+ INFO("Weight Filter Done");
+
+ switch (config.est_mode) {
+ case estimation_mode::simple: {
+ const AbstractDistanceEstimator<Graph>&
+ estimator =
+ DistanceEstimator<Graph>(gp.g, paired_index, dist_finder,
+ linkage_distance, max_distance);
+
+ estimate_with_estimator<Graph>(gp.g, estimator, checker, clustered_index);
+ break;
+ }
+ case estimation_mode::weighted: {
+ const AbstractDistanceEstimator<Graph>&
+ estimator =
+ WeightedDistanceEstimator<Graph>(gp.g, paired_index,
+ dist_finder, weight_function, linkage_distance, max_distance);
+
+ estimate_with_estimator<Graph>(gp.g, estimator, checker, clustered_index);
+ break;
+ }
+ case estimation_mode::smoothing: {
+ const AbstractDistanceEstimator<Graph>&
+ estimator =
+ SmoothingDistanceEstimator<Graph>(gp.g, paired_index,
+ dist_finder, weight_function, linkage_distance, max_distance,
+ config.ade.threshold,
+ config.ade.range_coeff,
+ config.ade.delta_coeff, config.ade.cutoff,
+ config.ade.min_peak_points,
+ config.ade.inv_density,
+ config.ade.percentage,
+ config.ade.derivative_threshold);
+
+ estimate_with_estimator<Graph>(gp.g, estimator, checker, clustered_index);
+ break;
+ }
+ default: {
+ VERIFY_MSG(false, "Unexpected estimation mode value")
+ }
+ }
+
+ INFO("Refining clustered pair information "); // this procedure checks, whether index
+ RefinePairedInfo(gp.g, clustered_index); // contains intersecting paired info clusters,
+ INFO("The refining of clustered pair information has been finished "); // if so, it resolves such conflicts.
+
+ INFO("Improving paired information");
+ PairInfoImprover<Graph> improver(gp.g, clustered_index, lib);
+ improver.ImprovePairedInfo((unsigned) config.max_threads);
+
+ if (cfg::get().pe_params.param_set.scaffolder_options.cluster_info) {
+ INFO("Filling scaffolding index");
+
+ double is_var = lib.data().insert_size_deviation;
+ size_t delta = size_t(is_var);
+ size_t linkage_distance = size_t(cfg::get().de.linkage_distance_coeff * is_var);
+ GraphDistanceFinder<Graph> dist_finder(gp.g, (size_t) math::round(lib.data().mean_insert_size),
+ lib.data().read_length, delta);
+ size_t max_distance = size_t(cfg::get().de.max_distance_coeff_scaff * is_var);
+ std::function<double(int)> weight_function;
+
+ DEBUG("Retaining insert size distribution for it");
+ if (lib.data().insert_size_distribution.size() == 0) {
+ WARN("The library will not be used for scaffolding");
+ return;
+ }
+
+
+ WeightDEWrapper wrapper(lib.data().insert_size_distribution, lib.data().mean_insert_size);
+ DEBUG("Weight Wrapper Done");
+ weight_function = std::bind(&WeightDEWrapper::CountWeight, wrapper, std::placeholders::_1);
+
+// PairInfoWeightFilter<Graph> filter(gp.g, 0.);
+ PairInfoWeightChecker<Graph> checker(gp.g, 0.);
+ DEBUG("Weight Filter Done");
+
+ const AbstractDistanceEstimator<Graph>& estimator =
+ SmoothingDistanceEstimator<Graph>(gp.g, paired_index, dist_finder,
+ weight_function, linkage_distance, max_distance,
+ cfg::get().ade.threshold, cfg::get().ade.range_coeff,
+ cfg::get().ade.delta_coeff, cfg::get().ade.cutoff,
+ cfg::get().ade.min_peak_points, cfg::get().ade.inv_density,
+ cfg::get().ade.percentage,
+ cfg::get().ade.derivative_threshold, true);
+ estimate_with_estimator<Graph>(gp.g, estimator, checker, scaffolding_index);
+ }
+}
+
+void DistanceEstimation::run(conj_graph_pack &gp, const char*) {
+ for (size_t i = 0; i < cfg::get().ds.reads.lib_count(); ++i)
+ if (cfg::get().ds.reads[i].type() == io::LibraryType::PairedEnd) {
+ if (cfg::get().ds.reads[i].data().mean_insert_size != 0.0) {
+ INFO("Processing library #" << i);
+ estimate_distance(gp, cfg::get().ds.reads[i], gp.paired_indices[i], gp.clustered_indices[i], gp.scaffolding_indices[i]);
+ }
+ gp.paired_indices[i].Clear();
+ }
+}
+
+}
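
The cluster-intersection test in RefinePairedInfo above boils down to comparing the gap between two neighbouring distance estimates with the sum of their variances. A small sketch of that predicate on a plain struct (PointSketch is an illustrative stand-in for the real histogram point type):

#include <cmath>
#include <cstddef>
#include <vector>

struct PointSketch {
    double d;    // estimated distance between the two edges
    double var;  // variance of that estimate
};

// Two neighbouring clusters intersect when the gap between their distance
// estimates is no larger than the sum of their variances.
bool ClustersIntersect(const PointSketch &a, const PointSketch &b) {
    return std::abs(a.d - b.d) <= a.var + b.var;
}

// Scan an ordered histogram and report whether any neighbouring pair intersects.
bool HasIntersectingClusters(const std::vector<PointSketch> &ordered_hist) {
    for (std::size_t i = 1; i < ordered_hist.size(); ++i)
        if (ClustersIntersect(ordered_hist[i - 1], ordered_hist[i]))
            return true;
    return false;
}
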
diff --git a/src/projects/spades/distance_estimation.hpp b/src/projects/spades/distance_estimation.hpp
new file mode 100644
index 0000000..5843cfb
--- /dev/null
+++ b/src/projects/spades/distance_estimation.hpp
@@ -0,0 +1,24 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "pipeline/stage.hpp"
+
+namespace debruijn_graph {
+
+class DistanceEstimation : public spades::AssemblyStage {
+ public:
+ DistanceEstimation(bool preliminary = false)
+ : AssemblyStage(preliminary ? "Preliminary Distance Estimation" : "Distance Estimation",
+ preliminary ? "distance_estimation_preliminary" : "distance_estimation") {}
+
+ void run(conj_graph_pack &gp, const char*);
+};
+
+}
+
diff --git a/src/projects/spades/gap_closer.cpp b/src/projects/spades/gap_closer.cpp
new file mode 100644
index 0000000..7c43178
--- /dev/null
+++ b/src/projects/spades/gap_closer.cpp
@@ -0,0 +1,502 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#include "gap_closer.hpp"
+#include "assembly_graph/stats/picture_dump.hpp"
+#include "algorithms/simplification/compressor.hpp"
+#include "io/dataset_support/read_converter.hpp"
+#include <stack>
+
+namespace debruijn_graph {
+
+template<class Graph, class SequenceMapper>
+class GapCloserPairedIndexFiller {
+private:
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+ const Graph &graph_;
+ const SequenceMapper &mapper_;
+
+ size_t CorrectLength(Path<EdgeId> path, size_t idx) const {
+ size_t answer = graph_.length(path[idx]);
+ if (idx == 0)
+ answer -= path.start_pos();
+ if (idx == path.size() - 1)
+ answer -= graph_.length(path[idx]) - path.end_pos();
+ return answer;
+ }
+
+ template<typename PairedRead>
+ void ProcessPairedRead(omnigraph::de::PairedInfoBuffer<Graph> &paired_index,
+ const PairedRead &p_r,
+ const std::unordered_map<EdgeId, pair<EdgeId, int> > &OutTipMap,
+ const std::unordered_map<EdgeId, pair<EdgeId, int> > &InTipMap) const {
+ Sequence read1 = p_r.first().sequence();
+ Sequence read2 = p_r.second().sequence();
+
+ Path<EdgeId> path1 = mapper_.MapSequence(read1).path();
+ Path<EdgeId> path2 = mapper_.MapSequence(read2).path();
+ for (size_t i = 0; i < path1.size(); ++i) {
+ auto OutTipIter = OutTipMap.find(path1[i]);
+ if (OutTipIter != OutTipMap.end()) {
+ for (size_t j = 0; j < path2.size(); ++j) {
+ auto InTipIter = InTipMap.find(path2[j]);
+ if (InTipIter != InTipMap.end()) {
+ auto e1 = OutTipIter->second.first;
+ auto e2 = InTipIter->second.first;
+ //FIXME: Normalize fake points
+ auto sp = std::make_pair(e1, e2);
+ auto cp = paired_index.ConjugatePair(e1, e2);
+ auto ip = std::min(sp, cp);
+ paired_index.Add(ip.first, ip.second, omnigraph::de::RawPoint(1000000., 1.));
+ }
+ }
+ }
+ }
+ }
+
+ void PrepareShiftMaps(std::unordered_map<EdgeId, pair<EdgeId, int> > &OutTipMap,
+ std::unordered_map<EdgeId, pair<EdgeId, int> > &InTipMap) {
+ std::stack<pair<EdgeId, int>> edge_stack;
+ for (auto iterator = graph_.ConstEdgeBegin(); !iterator.IsEnd();) {
+ EdgeId edge = *iterator;
+ if (graph_.IncomingEdgeCount(graph_.EdgeStart(edge)) == 0) {
+ InTipMap.insert(std::make_pair(edge, std::make_pair(edge, 0)));
+ edge_stack.push(std::make_pair(edge, 0));
+ while (edge_stack.size() > 0) {
+ pair<EdgeId, int> checking_pair = edge_stack.top();
+ edge_stack.pop();
+ if (graph_.IncomingEdgeCount(graph_.EdgeEnd(checking_pair.first)) == 1) {
+ VertexId v = graph_.EdgeEnd(checking_pair.first);
+ if (graph_.OutgoingEdgeCount(v)) {
+ for (auto I = graph_.out_begin(v), E = graph_.out_end(v); I != E; ++I) {
+ EdgeId Cur_edge = *I;
+ InTipMap.insert(
+ std::make_pair(Cur_edge,
+ std::make_pair(edge,
+ graph_.length(checking_pair.first) +
+ checking_pair.second)));
+ edge_stack.push(
+ std::make_pair(Cur_edge,
+ graph_.length(checking_pair.first) + checking_pair.second));
+
+ }
+ }
+ }
+ }
+ }
+
+ if (graph_.OutgoingEdgeCount(graph_.EdgeEnd(edge)) == 0) {
+ OutTipMap.insert(std::make_pair(edge, std::make_pair(edge, 0)));
+ edge_stack.push(std::make_pair(edge, 0));
+ while (edge_stack.size() > 0) {
+ std::pair<EdgeId, int> checking_pair = edge_stack.top();
+ edge_stack.pop();
+ if (graph_.OutgoingEdgeCount(graph_.EdgeStart(checking_pair.first)) == 1) {
+ if (graph_.IncomingEdgeCount(graph_.EdgeStart(checking_pair.first))) {
+ for (EdgeId e : graph_.IncomingEdges(graph_.EdgeStart(checking_pair.first))) {
+ OutTipMap.insert(std::make_pair(e,
+ std::make_pair(edge,
+ graph_.length(e) +
+ checking_pair.second)));
+ edge_stack.push(std::make_pair(e,
+ graph_.length(e) + checking_pair.second));
+ }
+ }
+ }
+
+ }
+ }
+ ++iterator;
+ }
+ }
+
+ template<class Streams>
+ void MapReads(omnigraph::de::PairedInfoIndexT<Graph> &paired_index, Streams &streams,
+ const std::unordered_map<EdgeId, pair<EdgeId, int> > &OutTipMap,
+ const std::unordered_map<EdgeId, pair<EdgeId, int> > &InTipMap) const {
+ INFO("Processing paired reads (takes a while)");
+
+ size_t nthreads = streams.size();
+ omnigraph::de::PairedInfoBuffersT<Graph> buffer_pi(graph_, nthreads);
+
+ size_t counter = 0;
+# pragma omp parallel for num_threads(nthreads) reduction(+ : counter)
+ for (size_t i = 0; i < nthreads; ++i) {
+ typename Streams::ReadT r;
+ auto &stream = streams[i];
+ stream.reset();
+
+ while (!stream.eof()) {
+ stream >> r;
+ ++counter;
+ ProcessPairedRead(buffer_pi[i], r, OutTipMap, InTipMap);
+ }
+ }
+
+ INFO("Used " << counter << " paired reads");
+
+ INFO("Merging paired indices");
+ for (auto &index: buffer_pi) {
+ paired_index.Merge(index);
+ index.Clear();
+ }
+ }
+
+public:
+
+ GapCloserPairedIndexFiller(const Graph &graph, const SequenceMapper &mapper)
+ : graph_(graph), mapper_(mapper) { }
+
+ /**
+ * Reads paired data from the streams, maps it to the graph and stores the result in the given paired info index.
+ */
+ template<class Streams>
+ void FillIndex(omnigraph::de::PairedInfoIndexT<Graph> &paired_index, Streams &streams) {
+ std::unordered_map<EdgeId, pair<EdgeId, int> > OutTipMap, InTipMap;
+
+ INFO("Preparing shift maps");
+ PrepareShiftMaps(OutTipMap, InTipMap);
+
+ MapReads(paired_index, streams, OutTipMap, InTipMap);
+ }
+
+};
+
+template<class Graph, class SequenceMapper>
+class GapCloser {
+public:
+ typedef std::function<bool(const Sequence &)> SequenceCheckF;
+private:
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+
+ Graph &g_;
+ int k_;
+ omnigraph::de::PairedInfoIndexT<Graph> &tips_paired_idx_;
+ const size_t min_intersection_;
+ const size_t hamming_dist_bound_;
+ const int init_gap_val_;
+ const omnigraph::de::DEWeight weight_threshold_;
+
+ SequenceMapper mapper_;
+ std::unordered_set<runtime_k::RtSeq> new_kmers_;
+
+ bool CheckNoKmerClash(const Sequence &s) {
+ runtime_k::RtSeq kmer(k_ + 1, s);
+ kmer >>= 'A';
+ for (size_t i = k_; i < s.size(); ++i) {
+ kmer <<= s[i];
+ if (new_kmers_.count(kmer)) {
+ return false;
+ }
+ }
+ std::vector<EdgeId> path = mapper_.MapSequence(s).simple_path();
+ return path.empty();
+ }
+
+ std::vector<size_t> DiffPos(const Sequence &s1, const Sequence &s2) const {
+ VERIFY(s1.size() == s2.size());
+ std::vector<size_t> answer;
+ for (size_t i = 0; i < s1.size(); ++i)
+ if (s1[i] != s2[i])
+ answer.push_back(i);
+ return answer;
+ }
+
+ size_t HammingDistance(const Sequence &s1, const Sequence &s2) const {
+ VERIFY(s1.size() == s2.size());
+ size_t dist = 0;
+ for (size_t i = 0; i < s1.size(); ++i)
+ if (s1[i] != s2[i])
+ dist++;
+ return dist;
+ }
+
+ // size_t HammingDistance(const Sequence& s1, const Sequence& s2) const {
+ // return DiffPos(s1, s2).size();
+ // }
+
+ vector<size_t> PosThatCanCorrect(size_t overlap_length/*in nucls*/,
+ const vector<size_t> &mismatch_pos, size_t edge_length/*in nucls*/,
+ bool left_edge) const {
+ TRACE("Try correct left edge " << left_edge);
+ TRACE("Overlap length " << overlap_length);
+ TRACE("Edge length " << edge_length);
+ TRACE("Mismatches " << mismatch_pos);
+
+ vector<size_t> answer;
+ for (size_t i = 0; i < mismatch_pos.size(); ++i) {
+ size_t relative_mm_pos =
+ left_edge ?
+ mismatch_pos[i] :
+ overlap_length - 1 - mismatch_pos[i];
+ if (overlap_length - relative_mm_pos + g_.k() < edge_length)
+ //can correct mismatch
+ answer.push_back(mismatch_pos[i]);
+ }
+ TRACE("Can correct mismatches: " << answer);
+ return answer;
+ }
+
+ //todo write easier
+ bool CanCorrectLeft(EdgeId e, int overlap, const vector<size_t> &mismatch_pos) const {
+ return PosThatCanCorrect(overlap, mismatch_pos, g_.length(e) + g_.k(), true).size() == mismatch_pos.size();
+ }
+
+ //todo write easier
+ bool CanCorrectRight(EdgeId e, int overlap,
+ const vector<size_t> &mismatch_pos) const {
+ return PosThatCanCorrect(overlap, mismatch_pos, g_.length(e) + g_.k(), false).size() == mismatch_pos.size();
+ }
+
+ bool MatchesEnd(const Sequence &long_seq, const Sequence &short_seq, bool from_begin) const {
+ return from_begin ? long_seq.Subseq(0, short_seq.size()) == short_seq
+ : long_seq.Subseq(long_seq.size() - short_seq.size()) == short_seq;
+ }
+
+ void AddEdge(VertexId start, VertexId end, const Sequence &s) {
+ runtime_k::RtSeq kmer(k_ + 1, s);
+ kmer >>= 'A';
+ for (size_t i = k_; i < s.size(); ++i) {
+ kmer <<= s[i];
+ new_kmers_.insert(kmer);
+ new_kmers_.insert(!kmer);
+ }
+ g_.AddEdge(start, end, s);
+ }
+
+ bool CorrectLeft(EdgeId first, EdgeId second, int overlap, const vector<size_t> &diff_pos) {
+ DEBUG("Can correct first with sequence from second.");
+ Sequence new_sequence = g_.EdgeNucls(first).Subseq(g_.length(first) - overlap + diff_pos.front(),
+ g_.length(first) + k_ - overlap)
+ + g_.EdgeNucls(second).First(k_);
+ DEBUG("Checking new k+1-mers.");
+ if (CheckNoKmerClash(new_sequence)) {
+ DEBUG("Check ok.");
+ DEBUG("Splitting first edge.");
+ pair<EdgeId, EdgeId> split_res = g_.SplitEdge(first, g_.length(first) - overlap + diff_pos.front());
+ first = split_res.first;
+ tips_paired_idx_.Remove(split_res.second);
+ DEBUG("Adding new edge.");
+ VERIFY(MatchesEnd(new_sequence, g_.VertexNucls(g_.EdgeEnd(first)), true));
+ VERIFY(MatchesEnd(new_sequence, g_.VertexNucls(g_.EdgeStart(second)), false));
+ AddEdge(g_.EdgeEnd(first), g_.EdgeStart(second),
+ new_sequence);
+ return true;
+ } else {
+ DEBUG("Check fail.");
+ DEBUG("Filled k-mer already present in graph");
+ return false;
+ }
+ return false;
+ }
+
+ bool CorrectRight(EdgeId first, EdgeId second, int overlap, const vector<size_t> &diff_pos) {
+ DEBUG("Can correct second with sequence from first.");
+ Sequence new_sequence =
+ g_.EdgeNucls(first).Last(k_) + g_.EdgeNucls(second).Subseq(overlap, diff_pos.back() + 1 + k_);
+ DEBUG("Checking new k+1-mers.");
+ if (CheckNoKmerClash(new_sequence)) {
+ DEBUG("Check ok.");
+ DEBUG("Splitting second edge.");
+ pair<EdgeId, EdgeId> split_res = g_.SplitEdge(second, diff_pos.back() + 1);
+ second = split_res.second;
+ tips_paired_idx_.Remove(split_res.first);
+ DEBUG("Adding new edge.");
+ VERIFY(MatchesEnd(new_sequence, g_.VertexNucls(g_.EdgeEnd(first)), true));
+ VERIFY(MatchesEnd(new_sequence, g_.VertexNucls(g_.EdgeStart(second)), false));
+
+ AddEdge(g_.EdgeEnd(first), g_.EdgeStart(second),
+ new_sequence);
+ return true;
+ } else {
+ DEBUG("Check fail.");
+ DEBUG("Filled k-mer already present in graph");
+ return false;
+ }
+ return false;
+ }
+
+ bool HandlePositiveHammingDistanceCase(EdgeId first, EdgeId second, int overlap) {
+ DEBUG("Match was imperfect. Trying to correct one of the tips");
+ vector<size_t> diff_pos = DiffPos(g_.EdgeNucls(first).Last(overlap),
+ g_.EdgeNucls(second).First(overlap));
+ if (CanCorrectLeft(first, overlap, diff_pos)) {
+ return CorrectLeft(first, second, overlap, diff_pos);
+ } else if (CanCorrectRight(second, overlap, diff_pos)) {
+ return CorrectRight(first, second, overlap, diff_pos);
+ } else {
+ DEBUG("Can't correct tips due to the graph structure");
+ return false;
+ }
+ }
+
+ bool HandleSimpleCase(EdgeId first, EdgeId second, int overlap) {
+ DEBUG("Match was perfect. No correction needed");
+ //strange info guard
+ VERIFY(overlap <= k_);
+ if (overlap == k_) {
+ DEBUG("Tried to close zero gap");
+ return false;
+ }
+ //old code
+ Sequence edge_sequence = g_.EdgeNucls(first).Last(k_)
+ + g_.EdgeNucls(second).Subseq(overlap, k_);
+ if (CheckNoKmerClash(edge_sequence)) {
+ DEBUG("Gap filled: Gap size = " << k_ - overlap << " Result seq "
+ << edge_sequence.str());
+ AddEdge(g_.EdgeEnd(first), g_.EdgeStart(second), edge_sequence);
+ return true;
+ } else {
+ DEBUG("Filled k-mer already present in graph");
+ return false;
+ }
+ }
+
+ bool ProcessPair(EdgeId first, EdgeId second) {
+ TRACE("Processing edges " << g_.str(first) << " and " << g_.str(second));
+ TRACE("first " << g_.EdgeNucls(first) << " second " << g_.EdgeNucls(second));
+
+ if (cfg::get().avoid_rc_connections &&
+ (first == g_.conjugate(second) || first == second)) {
+ DEBUG("Trying to join conjugate edges " << g_.int_id(first));
+ return false;
+ }
+ //may be negative!
+ int gap = max(init_gap_val_,
+ -1 * (int) (min(g_.length(first), g_.length(second)) - 1));
+
+ Sequence seq1 = g_.EdgeNucls(first);
+ Sequence seq2 = g_.EdgeNucls(second);
+ TRACE("Checking possible gaps from " << gap << " to " << k_ - min_intersection_);
+ for (; gap <= k_ - (int) min_intersection_; ++gap) {
+ int overlap = k_ - gap;
+ size_t hamming_distance = HammingDistance(g_.EdgeNucls(first).Last(overlap),
+ g_.EdgeNucls(second).First(overlap));
+ if (hamming_distance <= hamming_dist_bound_) {
+ DEBUG("For edges " << g_.str(first) << " and " << g_.str(second)
+ << ". For gap value " << gap << " (overlap " << overlap << "bp) hamming distance was " <<
+ hamming_distance);
+ // DEBUG("Sequences of distance " << tip_distance << " :"
+ // << seq1.Subseq(seq1.size() - k).str() << " "
+ // << seq2.Subseq(0, k).str());
+
+ if (hamming_distance > 0) {
+ return HandlePositiveHammingDistanceCase(first, second, overlap);
+ } else {
+ return HandleSimpleCase(first, second, overlap);
+ }
+ }
+ }
+ return false;
+ }
+
+public:
+ //TODO extract methods
+ void CloseShortGaps() {
+ INFO("Closing short gaps");
+ size_t gaps_filled = 0;
+ size_t gaps_checked = 0;
+ for (auto edge = g_.SmartEdgeBegin(); !edge.IsEnd(); ++edge) {
+ EdgeId first_edge = *edge;
+ for (auto i : tips_paired_idx_.Get(first_edge)) {
+ EdgeId second_edge = i.first;
+ if (first_edge == second_edge)
+ continue;
+
+ if (!g_.IsDeadEnd(g_.EdgeEnd(first_edge)) || !g_.IsDeadStart(g_.EdgeStart(second_edge))) {
+ // WARN("Topologically wrong tips");
+ continue;
+ }
+
+ bool closed = false;
+ for (auto point : i.second) {
+ if (math::ls(point.weight, weight_threshold_))
+ continue;
+
+ ++gaps_checked;
+ closed = ProcessPair(first_edge, second_edge);
+ if (closed) {
+ ++gaps_filled;
+ break;
+ }
+ }
+ if (closed)
+ break;
+ } // second edge
+ } // first edge
+
+ INFO("Closing short gaps complete: filled " << gaps_filled
+ << " gaps after checking " << gaps_checked
+ << " candidates");
+ omnigraph::CompressAllVertices<Graph>(g_);
+ }
+
+ GapCloser(Graph &g, omnigraph::de::PairedInfoIndexT<Graph> &tips_paired_idx,
+ size_t min_intersection, double weight_threshold,
+ const SequenceMapper &mapper,
+ size_t hamming_dist_bound = 0 /*min_intersection_ / 5*/)
+ : g_(g),
+ k_((int) g_.k()),
+ tips_paired_idx_(tips_paired_idx),
+ min_intersection_(min_intersection),
+ hamming_dist_bound_(hamming_dist_bound),
+ init_gap_val_(-10),
+ weight_threshold_(weight_threshold),
+ mapper_(mapper),
+ new_kmers_() {
+ VERIFY(min_intersection_ < g_.k());
+ DEBUG("weight_threshold=" << weight_threshold_);
+ DEBUG("min_intersect=" << min_intersection_);
+ DEBUG("paired_index size=" << tips_paired_idx_.size());
+ }
+
+private:
+ DECL_LOGGER("GapCloser");
+};
+
+template<class Streams>
+void CloseGaps(conj_graph_pack &gp, Streams &streams) {
+ typedef NewExtendedSequenceMapper<Graph, Index> Mapper;
+ auto mapper = MapperInstance(gp);
+ GapCloserPairedIndexFiller<Graph, Mapper> gcpif(gp.g, *mapper);
+ PairedIndexT tips_paired_idx(gp.g);
+ gcpif.FillIndex(tips_paired_idx, streams);
+ GapCloser<Graph, Mapper> gap_closer(gp.g, tips_paired_idx,
+ cfg::get().gc.minimal_intersection, cfg::get().gc.weight_threshold,
+ *mapper);
+ gap_closer.CloseShortGaps();
+}
+
+void GapClosing::run(conj_graph_pack &gp, const char *) {
+ omnigraph::DefaultLabeler<Graph> labeler(gp.g, gp.edge_pos);
+ stats::detail_info_printer printer(gp, labeler, cfg::get().output_dir);
+ printer(config::info_printer_pos::before_first_gap_closer);
+
+ bool pe_exist = false;
+ for (size_t i = 0; i < cfg::get().ds.reads.lib_count(); ++i) {
+ if (cfg::get().ds.reads[i].type() == io::LibraryType::PairedEnd) {
+ pe_exist = true;
+ break;
+ }
+ }
+ if (!pe_exist) {
+ INFO("No paired-end libraries exist, skipping gap closer");
+ return;
+ }
+ gp.EnsureIndex();
+
+ for (size_t i = 0; i < cfg::get().ds.reads.lib_count(); ++i) {
+ if (cfg::get().ds.reads[i].type() == io::LibraryType::PairedEnd) {
+ auto streams = paired_binary_readers(cfg::get().ds.reads[i], false, 0);
+ CloseGaps(gp, streams);
+ }
+ }
+}
+
+}
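
ProcessPair above scans candidate gap values from a (possibly negative) initial gap up to k - min_intersection; for each gap the overlap is k - gap, and the tips are joined only if the Hamming distance between their overlapping ends stays within the bound. A standalone sketch of that scan on plain strings (illustrative only; the real code operates on Sequence objects and then repairs remaining mismatches):

#include <cstddef>
#include <string>

// Hamming distance between two equal-length strings.
std::size_t HammingDist(const std::string &a, const std::string &b) {
    std::size_t dist = 0;
    for (std::size_t i = 0; i < a.size(); ++i)
        if (a[i] != b[i])
            ++dist;
    return dist;
}

// Scan gap values from init_gap up to k - min_intersection; the overlap for a
// gap is k - gap, taken from the end of the left tip and the start of the
// right tip. Returns the first overlap within the Hamming bound, or -1.
int FindClosableOverlap(const std::string &left_tip, const std::string &right_tip,
                        int k, int init_gap, int min_intersection,
                        std::size_t hamming_bound) {
    for (int gap = init_gap; gap <= k - min_intersection; ++gap) {
        int overlap = k - gap;
        if (overlap <= 0 ||
            overlap > static_cast<int>(left_tip.size()) ||
            overlap > static_cast<int>(right_tip.size()))
            continue;
        std::string left = left_tip.substr(left_tip.size() - overlap);
        std::string right = right_tip.substr(0, overlap);
        if (HammingDist(left, right) <= hamming_bound)
            return overlap;
    }
    return -1;
}
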
diff --git a/src/projects/spades/gap_closer.hpp b/src/projects/spades/gap_closer.hpp
new file mode 100644
index 0000000..f0d7718
--- /dev/null
+++ b/src/projects/spades/gap_closer.hpp
@@ -0,0 +1,33 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#ifndef GAP_CLOSER_HPP_
+#define GAP_CLOSER_HPP_
+
+#include "pipeline/stage.hpp"
+
+namespace debruijn_graph {
+
+class GapClosing : public spades::AssemblyStage {
+ public:
+ GapClosing(const char* id)
+ : AssemblyStage("Gap Closer", id) {}
+
+ void run(conj_graph_pack &gp, const char*);
+};
+
+}
+
+
+
+#endif /* GAP_CLOSER_HPP_ */
diff --git a/src/projects/spades/launch.hpp b/src/projects/spades/launch.hpp
new file mode 100644
index 0000000..d6e6589
--- /dev/null
+++ b/src/projects/spades/launch.hpp
@@ -0,0 +1,120 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "pipeline/config_struct.hpp"
+
+#include "pipeline/graph_pack.hpp"
+#include "stages/construction.hpp"
+#include "pipeline/genomic_info_filler.hpp"
+#include "gap_closer.hpp"
+#include "stages/simplification.hpp"
+#include "mismatch_correction.hpp"
+#include "pair_info_count.hpp"
+#include "second_phase_setup.hpp"
+#include "repeat_resolving.hpp"
+#include "distance_estimation.hpp"
+#include "pacbio_aligning.hpp"
+#include "chromosome_removal.hpp"
+#include "pipeline/stage.hpp"
+
+namespace spades {
+
+void assemble_genome() {
+ INFO("SPAdes started");
+ if (cfg::get().mode == debruijn_graph::config::pipeline_type::meta &&
+ (cfg::get().ds.reads.lib_count() != 1 || cfg::get().ds.reads[0].type() != io::LibraryType::PairedEnd)) {
+ ERROR("Sorry, current version of metaSPAdes can work with single library only (paired-end only).");
+ exit(239);
+ }
+
+ INFO("Starting from stage: " << cfg::get().entry_point);
+
+ bool two_step_rr = cfg::get().two_step_rr && cfg::get().rr_enable;
+ INFO("Two-step RR enabled: " << two_step_rr);
+
+ StageManager SPAdes({cfg::get().developer_mode,
+ cfg::get().load_from,
+ cfg::get().output_saves});
+
+ size_t read_index_cnt = cfg::get().ds.reads.lib_count();
+ if (two_step_rr)
+ read_index_cnt++;
+
+ debruijn_graph::conj_graph_pack conj_gp(cfg::get().K,
+ cfg::get().tmp_dir,
+ read_index_cnt,
+ cfg::get().ds.reference_genome,
+ cfg::get().flanking_range,
+ cfg::get().pos.max_mapping_gap,
+ cfg::get().pos.max_gap_diff);
+
+ if (cfg::get().need_mapping) {
+ INFO("Will need read mapping, kmer mapper will be attached");
+ conj_gp.kmer_mapper.Attach();
+ }
+ // Build the pipeline
+ SPAdes.add(new debruijn_graph::Construction())
+ .add(new debruijn_graph::GenomicInfoFiller());
+ if (cfg::get().gap_closer_enable && cfg::get().gc.before_simplify)
+ SPAdes.add(new debruijn_graph::GapClosing("early_gapcloser"));
+
+ SPAdes.add(new debruijn_graph::Simplification(two_step_rr));
+
+ if (cfg::get().gap_closer_enable && cfg::get().gc.after_simplify)
+ SPAdes.add(new debruijn_graph::GapClosing("late_gapcloser"));
+
+ SPAdes.add(new debruijn_graph::SimplificationCleanup());
+ //currently cannot be used with two step rr
+ if (cfg::get().correct_mismatches && !cfg::get().two_step_rr)
+ SPAdes.add(new debruijn_graph::MismatchCorrection());
+ if (cfg::get().rr_enable) {
+ if (two_step_rr) {
+ if (cfg::get().use_intermediate_contigs)
+ SPAdes.add(new debruijn_graph::PairInfoCount(true))
+ .add(new debruijn_graph::DistanceEstimation(true))
+ .add(new debruijn_graph::RepeatResolution(true))
+ .add(new debruijn_graph::SecondPhaseSetup());
+
+ SPAdes.add(new debruijn_graph::Simplification());
+ }
+
+ //begin pacbio
+ bool run_pacbio = false;
+ for (size_t i = 0; i < cfg::get().ds.reads.lib_count(); ++i) {
+ if (cfg::get().ds.reads[i].is_pacbio_alignable()) {
+ run_pacbio = true;
+ break;
+ }
+ }
+ if (run_pacbio) {
+ //currently not integrated with two step rr process
+ VERIFY(!two_step_rr);
+ SPAdes.add(new debruijn_graph::PacBioAligning());
+ }
+ //end pacbio
+ if (cfg::get().pd) {
+ SPAdes.add(new debruijn_graph::ChromosomeRemoval());
+ }
+ SPAdes.add(new debruijn_graph::PairInfoCount())
+ .add(new debruijn_graph::DistanceEstimation())
+ .add(new debruijn_graph::RepeatResolution());
+
+ } else {
+ SPAdes.add(new debruijn_graph::ContigOutput());
+ }
+
+ SPAdes.run(conj_gp, cfg::get().entry_point.c_str());
+
+ // For informing spades.py about estimated params
+ debruijn_graph::config::write_lib_data(path::append_path(cfg::get().output_dir, "final"));
+
+ INFO("SPAdes finished");
+}
+
+}
diff --git a/src/projects/spades/main.cpp b/src/projects/spades/main.cpp
new file mode 100644
index 0000000..fd3eafb
--- /dev/null
+++ b/src/projects/spades/main.cpp
@@ -0,0 +1,110 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+/*
+ * Assembler Main
+ */
+#include "dev_support/logger/log_writers.hpp"
+
+#include "dev_support/segfault_handler.hpp"
+#include "launch.hpp"
+#include "dev_support/copy_file.hpp"
+
+void load_config(const vector<string>& cfg_fns) {
+ for (const auto& s : cfg_fns) {
+ path::CheckFileExistenceFATAL(s);
+ }
+
+ cfg::create_instance(cfg_fns);
+
+ if (!cfg::get().project_name.empty()) {
+ make_dir(cfg::get().output_base + cfg::get().project_name);
+ }
+
+ make_dir(cfg::get().output_dir);
+ make_dir(cfg::get().tmp_dir);
+
+ if (cfg::get().developer_mode)
+ make_dir(cfg::get().output_saves);
+
+ make_dir(cfg::get().temp_bin_reads_path);
+}
+
+void create_console_logger(const string& dir) {
+ using namespace logging;
+
+ string log_props_file = cfg::get().log_filename;
+
+ if (!path::FileExists(log_props_file))
+ log_props_file = path::append_path(dir, cfg::get().log_filename);
+
+ logger *lg = create_logger(path::FileExists(log_props_file) ? log_props_file : "");
+ lg->add_writer(std::make_shared<console_writer>());
+ attach_logger(lg);
+}
+
+int main(int argc, char **argv) {
+ perf_counter pc;
+
+ const size_t GB = 1 << 30;
+
+ srand(42);
+ srandom(42);
+
+ try {
+ using namespace debruijn_graph;
+
+ string cfg_dir = path::parent_path(argv[1]);
+
+ vector<string> cfg_fns;
+ for (int i = 1; i < argc; ++i) {
+ cfg_fns.push_back(argv[i]);
+ }
+
+ load_config(cfg_fns);
+
+ create_console_logger(cfg_dir);
+
+ for (const auto& cfg_fn : cfg_fns)
+ INFO("Loading config from " << cfg_fn);
+
+ VERIFY(cfg::get().K >= runtime_k::MIN_K && cfg::get().K < runtime_k::MAX_K);
+ VERIFY(cfg::get().K % 2 != 0);
+
+ // read configuration file (dataset path etc.)
+
+ limit_memory(cfg::get().max_memory * GB);
+
+ // assemble it!
+ INFO("Starting SPAdes, built from "
+ SPADES_GIT_REFSPEC
+ ", git revision "
+ SPADES_GIT_SHA1);
+ INFO("Assembling dataset (" << cfg::get().dataset_file << ") with K=" << cfg::get().K);
+
+ spades::assemble_genome();
+
+ } catch (std::bad_alloc const &e) {
+ std::cerr << "Not enough memory to run SPAdes. " << e.what() << std::endl;
+ return EINTR;
+ } catch (std::exception const &e) {
+ std::cerr << "Exception caught " << e.what() << std::endl;
+ return EINTR;
+ } catch (...) {
+ std::cerr << "Unknown exception caught " << std::endl;
+ return EINTR;
+ }
+
+ unsigned ms = (unsigned) pc.time_ms();
+ unsigned secs = (ms / 1000) % 60;
+ unsigned mins = (ms / 1000 / 60) % 60;
+ unsigned hours = (ms / 1000 / 60 / 60);
+ INFO("Assembling time: " << hours << " hours " << mins << " minutes " << secs << " seconds");
+
+ // OK
+ return 0;
+}
diff --git a/src/projects/spades/mismatch_correction.cpp b/src/projects/spades/mismatch_correction.cpp
new file mode 100644
index 0000000..6e6cae0
--- /dev/null
+++ b/src/projects/spades/mismatch_correction.cpp
@@ -0,0 +1,27 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#include <algorithms/mismatch_shall_not_pass.hpp>
+#include "mismatch_correction.hpp"
+
+#include "io/dataset_support/read_converter.hpp"
+
+namespace debruijn_graph {
+
+void MismatchCorrection::run(conj_graph_pack &gp, const char*) {
+ gp.EnsureBasicMapping();
+ std::vector<size_t> libs;
+ for (size_t i = 0; i < cfg::get().ds.reads.lib_count(); ++i) {
+ if (cfg::get().ds.reads[i].is_mismatch_correctable())
+ libs.push_back(i);
+ }
+ auto streams = single_binary_readers_for_libs(libs, true, true);
+ size_t corrected = MismatchShallNotPass<conj_graph_pack, io::SingleReadSeq>(gp, 2).ParallelStopAllMismatches(streams, 1);
+ INFO("Corrected " << corrected << " nucleotides");
+}
+
+}
diff --git a/src/projects/spades/mismatch_correction.hpp b/src/projects/spades/mismatch_correction.hpp
new file mode 100644
index 0000000..82f6aef
--- /dev/null
+++ b/src/projects/spades/mismatch_correction.hpp
@@ -0,0 +1,23 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "pipeline/stage.hpp"
+
+namespace debruijn_graph {
+
+class MismatchCorrection : public spades::AssemblyStage {
+public:
+ MismatchCorrection()
+ : AssemblyStage("Mismatch Correction", "mismatch_correction") { }
+
+ void run(conj_graph_pack &gp, const char *);
+};
+
+}
+
diff --git a/src/projects/spades/pacbio_aligning.cpp b/src/projects/spades/pacbio_aligning.cpp
new file mode 100644
index 0000000..2c62e26
--- /dev/null
+++ b/src/projects/spades/pacbio_aligning.cpp
@@ -0,0 +1,185 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#include "assembly_graph/graph_alignment/pacbio/pac_index.hpp"
+#include "assembly_graph/graph_alignment/pacbio/pacbio_gap_closer.hpp"
+#include "assembly_graph/graph_alignment/long_read_storage.hpp"
+#include "io/reads_io/wrapper_collection.hpp"
+#include "assembly_graph/stats/picture_dump.hpp"
+#include "pacbio_aligning.hpp"
+
+namespace debruijn_graph {
+
+void ProcessReadsBatch(conj_graph_pack &gp,
+ std::vector<io::SingleRead>& reads,
+ pacbio::PacBioMappingIndex<ConjugateDeBruijnGraph>& pac_index,
+ PathStorage<Graph>& long_reads, pacbio::GapStorage<Graph>& gaps,
+ size_t buf_size, int n, size_t min_gap_quantity, pacbio::StatsCounter& stats) {
+ vector<PathStorage<Graph> > long_reads_by_thread(cfg::get().max_threads,
+ PathStorage<Graph>(gp.g));
+ vector<pacbio::GapStorage<Graph> > gaps_by_thread(cfg::get().max_threads,
+ pacbio::GapStorage<Graph>(gp.g, min_gap_quantity));
+ vector<pacbio::StatsCounter> stats_by_thread(cfg::get().max_threads);
+
+ size_t longer_500 = 0;
+ size_t aligned = 0;
+ size_t nontrivial_aligned = 0;
+
+# pragma omp parallel for shared(reads, long_reads_by_thread, pac_index, n, aligned, nontrivial_aligned)
+ for (size_t i = 0; i < buf_size; ++i) {
+ if (i % 1000 == 0) {
+ DEBUG("thread number " << omp_get_thread_num());
+ }
+ size_t thread_num = omp_get_thread_num();
+ Sequence seq(reads[i].sequence());
+# pragma omp atomic
+ n++;
+ auto current_read_mapping = pac_index.GetReadAlignment(seq);
+ auto aligned_edges = current_read_mapping.main_storage;
+ auto gaps = current_read_mapping.gaps;
+ for (auto iter = gaps.begin(); iter != gaps.end(); ++iter)
+ gaps_by_thread[thread_num].AddGap(*iter, true);
+
+ for (auto iter = aligned_edges.begin(); iter != aligned_edges.end(); ++iter)
+ long_reads_by_thread[thread_num].AddPath(*iter, 1, true);
+ //counting stats:
+ for (auto iter = aligned_edges.begin(); iter != aligned_edges.end(); ++iter) {
+ stats_by_thread[thread_num].path_len_in_edges[iter->size()]++;
+ }
+# pragma omp critical
+ {
+// INFO(current_read_mapping.seed_num);
+ if (seq.size() > 500) {
+ longer_500++;
+ if (aligned_edges.size() > 0) {
+ aligned++;
+ stats_by_thread[thread_num].seeds_percentage[size_t(
+ floor(double(current_read_mapping.seed_num) * 1000.0 / (double) seq.size()))]++;
+ for (size_t j = 0; j < aligned_edges.size(); j++) {
+ if (aligned_edges[j].size() > 1) {
+ nontrivial_aligned++;
+ break;
+ }
+ }
+ }
+ }
+ }
+# pragma omp critical
+ {
+ VERBOSE_POWER(n, " reads processed");
+ }
+ }
+ INFO("Read batch of size: " << buf_size << " processed; "<< longer_500 << " of them longer than 500; among long reads aligned: " << aligned << "; paths of more than one edge received: " << nontrivial_aligned );
+
+ for (size_t i = 0; i < cfg::get().max_threads; i++) {
+ long_reads.AddStorage(long_reads_by_thread[i]);
+ gaps.AddStorage(gaps_by_thread[i]);
+ stats.AddStorage(stats_by_thread[i]);
+ }
+}
+
+void align_pacbio(conj_graph_pack &gp, int lib_id, bool make_additional_saves) {
+ io::ReadStreamList<io::SingleRead> streams;
+ for (const auto& reads : cfg::get().ds.reads[lib_id].single_reads())
+ //do we need input_file function here?
+ streams.push_back(make_shared<io::FixingWrapper>(make_shared<io::FileReadStream>(reads)));
+
+ //make_shared<io::FixingWrapper>(make_shared<io::FileReadStream>(file));
+ // auto pacbio_read_stream = single_easy_reader(cfg::get().ds.reads[lib_id],
+// false, false);
+
+// io::ReadStreamList<io::SingleRead> streams(pacbio_read_stream);
+ // pacbio_read_stream.release();
+ int n = 0;
+ PathStorage<Graph>& long_reads = gp.single_long_reads[lib_id];
+ pacbio::StatsCounter stats;
+ size_t min_gap_quantity = 2;
+ size_t rtype = 0;
+ bool consensus_gap_closing = false;
+ if (cfg::get().ds.reads[lib_id].type() == io::LibraryType::PacBioReads ||
+ cfg::get().ds.reads[lib_id].type() == io::LibraryType::SangerReads ||
+ cfg::get().ds.reads[lib_id].type() == io::LibraryType::NanoporeReads) {
+ min_gap_quantity = cfg::get().pb.pacbio_min_gap_quantity;
+ rtype = 1;
+ consensus_gap_closing = true;
+ } else {
+ min_gap_quantity = cfg::get().pb.contigs_min_gap_quantity;
+ rtype = 2;
+ }
+ pacbio::GapStorage<ConjugateDeBruijnGraph> gaps(gp.g, min_gap_quantity);
+ size_t read_buffer_size = 50000;
+ std::vector<io::SingleRead> reads(read_buffer_size);
+ io::SingleRead read;
+ size_t buffer_no = 0;
+ INFO("Usign seed size: " << cfg::get().pb.pacbio_k);
+ pacbio::PacBioMappingIndex<ConjugateDeBruijnGraph> pac_index(gp.g,
+ cfg::get().pb.pacbio_k,
+ cfg::get().K, cfg::get().pb.ignore_middle_alignment);
+
+// path_extend::ContigWriter cw(gp.g);
+// cw.WriteEdges("before_rr_with_ids.fasta");
+// ofstream filestr("pacbio_mapped.mpr");
+// filestr.close();
+ for (auto iter = streams.begin(); iter != streams.end(); ++iter) {
+ auto &stream = *iter;
+ while (!stream.eof()) {
+ size_t buf_size = 0;
+ for (; buf_size < read_buffer_size && !stream.eof(); ++buf_size)
+ stream >> reads[buf_size];
+ INFO("Prepared batch " << buffer_no << " of " << buf_size << " reads.");
+ DEBUG("master thread number " << omp_get_thread_num());
+ ProcessReadsBatch(gp, reads, pac_index, long_reads, gaps, buf_size, n, min_gap_quantity, stats);
+ // INFO("Processed batch " << buffer_no);
+ ++buffer_no;
+ }
+ }
+ string ss = (rtype == 1 ? "long reads": "contigs");
+ INFO("For lib " << lib_id << " of " << ss <<" :");
+ stats.report();
+ map<EdgeId, EdgeId> replacement;
+ size_t min_stats_cutoff =(rtype == 1 ? 1 : 0);
+ if (make_additional_saves)
+ long_reads.DumpToFile(cfg::get().output_saves + "long_reads_before_rep.mpr",
+ replacement, min_stats_cutoff, true);
+ gaps.DumpToFile(cfg::get().output_saves + "gaps.mpr");
+ gaps.PadGapStrings();
+ if (make_additional_saves)
+ gaps.DumpToFile(cfg::get().output_saves + "gaps_padded.mpr");
+ pacbio::PacbioGapCloser<Graph> gap_closer(gp.g, consensus_gap_closing);
+ gap_closer.ConstructConsensus(cfg::get().max_threads, gaps);
+ gap_closer.CloseGapsInGraph(replacement);
+ long_reads.ReplaceEdges(replacement);
+ for(int j = 0; j < lib_id; j++) {
+ gp.single_long_reads[j].ReplaceEdges(replacement);
+ }
+
+ gap_closer.DumpToFile(cfg::get().output_saves + "gaps_pb_closed.fasta");
+ INFO("PacBio aligning finished");
+ return;
+}
+
+void PacBioAligning::run(conj_graph_pack &gp, const char*) {
+ using namespace omnigraph;
+ omnigraph::DefaultLabeler<Graph> labeler(gp.g, gp.edge_pos);
+ int lib_id = -1;
+ bool make_additional_saves = parent_->saves_policy().make_saves_;
+ for (size_t i = 0; i < cfg::get().ds.reads.lib_count(); ++i) {
+ if ( cfg::get().ds.reads[i].is_pacbio_alignable() ) {
+ lib_id = (int) i;
+ align_pacbio(gp, lib_id, make_additional_saves);
+ }
+ }
+
+ if (lib_id == -1)
+ INFO("no PacBio lib found");
+
+ stats::detail_info_printer printer(gp, labeler, cfg::get().output_dir);
+ printer(config::info_printer_pos::final_gap_closed);
+}
+
+}
+
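
ProcessReadsBatch above follows the usual per-thread accumulation pattern: every OpenMP thread appends alignments and gaps into its own PathStorage/GapStorage, and the per-thread storages are merged once the parallel loop is done, so no locking is needed for the storages during alignment. A language-level sketch of the same pattern with no SPAdes types (assumes OpenMP is available at build time):

#include <cstddef>
#include <vector>
#include <omp.h>

// Each thread fills its own bucket during the parallel loop; the buckets are
// merged afterwards, so the per-thread appends need no synchronization.
std::vector<int> ProcessBatch(const std::vector<int> &batch, int nthreads) {
    std::vector<std::vector<int>> per_thread(nthreads);

#pragma omp parallel for num_threads(nthreads)
    for (int i = 0; i < static_cast<int>(batch.size()); ++i) {
        int tid = omp_get_thread_num();
        per_thread[tid].push_back(batch[i] * 2);   // stand-in for real alignment work
    }

    std::vector<int> merged;                       // sequential merge, as in the code above
    for (const auto &bucket : per_thread)
        merged.insert(merged.end(), bucket.begin(), bucket.end());
    return merged;
}
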
diff --git a/src/projects/spades/pacbio_aligning.hpp b/src/projects/spades/pacbio_aligning.hpp
new file mode 100644
index 0000000..4e7d2a9
--- /dev/null
+++ b/src/projects/spades/pacbio_aligning.hpp
@@ -0,0 +1,23 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "pipeline/stage.hpp"
+
+namespace debruijn_graph {
+
+class PacBioAligning : public spades::AssemblyStage {
+public:
+ PacBioAligning()
+ : AssemblyStage("PacBio Aligning", "pacbio_aligning") {
+ }
+ void run(conj_graph_pack &gp, const char*);
+};
+
+}
+
diff --git a/src/projects/spades/pair_info_count.cpp b/src/projects/spades/pair_info_count.cpp
new file mode 100644
index 0000000..79a85d3
--- /dev/null
+++ b/src/projects/spades/pair_info_count.cpp
@@ -0,0 +1,259 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#include <paired_info/is_counter.hpp>
+#include "io/dataset_support/read_converter.hpp"
+
+#include "pair_info_count.hpp"
+#include "assembly_graph/graph_alignment/short_read_mapper.hpp"
+#include "assembly_graph/graph_alignment/long_read_mapper.hpp"
+#include "paired_info/pair_info_filler.hpp"
+#include "algorithms/path_extend/split_graph_pair_info.hpp"
+#include "paired_info/bwa_pair_info_filler.hpp"
+
+namespace debruijn_graph {
+
+typedef io::SequencingLibrary<config::DataSetData> SequencingLib;
+
+bool RefineInsertSizeForLib(conj_graph_pack &gp, size_t ilib, size_t edge_length_threshold) {
+
+ INFO("Estimating insert size (takes a while)");
+ InsertSizeCounter hist_counter(gp, edge_length_threshold, /* ignore negative */ true);
+ SequenceMapperNotifier notifier(gp);
+ notifier.Subscribe(ilib, &hist_counter);
+
+ SequencingLib &reads = cfg::get_writable().ds.reads[ilib];
+ VERIFY(reads.data().read_length != 0);
+ auto paired_streams = paired_binary_readers(reads, false);
+ notifier.ProcessLibrary(paired_streams, ilib, *ChooseProperMapper(gp, reads));
+
+ INFO(hist_counter.mapped() << " paired reads (" <<
+ ((double) hist_counter.mapped() * 100.0 / (double) hist_counter.total()) <<
+ "% of all) aligned to long edges");
+ if (hist_counter.negative() > 3 * hist_counter.mapped())
+ WARN("Too much reads aligned with negative insert size. Is the library orientation set properly?");
+ if (hist_counter.mapped() == 0)
+ return false;
+
+ std::map<size_t, size_t> percentiles;
+ hist_counter.FindMean(reads.data().mean_insert_size, reads.data().insert_size_deviation, percentiles);
+ hist_counter.FindMedian(reads.data().median_insert_size, reads.data().insert_size_mad,
+ reads.data().insert_size_distribution);
+ if (reads.data().median_insert_size < gp.k_value + 2) {
+ return false;
+ }
+
+ std::tie(reads.data().insert_size_left_quantile,
+ reads.data().insert_size_right_quantile) = omnigraph::GetISInterval(0.8,
+ reads.data().insert_size_distribution);
+
+ return !reads.data().insert_size_distribution.empty();
+}
+
+void ProcessSingleReads(conj_graph_pack &gp, size_t ilib,
+ bool use_binary = true) {
+ const SequencingLib &reads = cfg::get().ds.reads[ilib];
+ SequenceMapperNotifier notifier(gp);
+ GappedLongReadMapper read_mapper(gp, gp.single_long_reads[ilib]);
+ SimpleLongReadMapper simple_read_mapper(gp, gp.single_long_reads[ilib]);
+
+ if(reads.type() == io::LibraryType::PathExtendContigs) {
+ notifier.Subscribe(ilib, &read_mapper);
+ } else {
+ notifier.Subscribe(ilib, &simple_read_mapper);
+ }
+
+ auto mapper_ptr = ChooseProperMapper(gp, reads);
+ if (use_binary) {
+ auto single_streams = single_binary_readers(reads, false, true);
+ notifier.ProcessLibrary(single_streams, ilib, *mapper_ptr);
+ } else {
+ auto single_streams = single_easy_readers(reads, false,
+ true, /*handle Ns*/false);
+ notifier.ProcessLibrary(single_streams, ilib, *mapper_ptr);
+ }
+ cfg::get_writable().ds.reads[ilib].data().single_reads_mapped = true;
+}
+
+void ProcessPairedReads(conj_graph_pack &gp, size_t ilib, bool map_single_reads) {
+ const SequencingLib &reads = cfg::get().ds.reads[ilib];
+ bool calculate_threshold = (reads.type() == io::LibraryType::PairedEnd);
+ SequenceMapperNotifier notifier(gp);
+ INFO("Left insert size qauntile " << reads.data().insert_size_left_quantile <<
+ ", right insert size quantile " << reads.data().insert_size_right_quantile);
+
+ SimpleLongReadMapper read_mapper(gp, gp.single_long_reads[ilib]);
+ if (map_single_reads) {
+ notifier.Subscribe(ilib, &read_mapper);
+ }
+
+ path_extend::SplitGraphPairInfo split_graph(
+ gp, (size_t) reads.data().median_insert_size,
+ (size_t) reads.data().insert_size_deviation,
+ (size_t) reads.data().insert_size_left_quantile,
+ (size_t) reads.data().insert_size_right_quantile,
+ reads.data().read_length, gp.g.k(),
+ cfg::get().pe_params.param_set.split_edge_length,
+ reads.data().insert_size_distribution);
+ if (calculate_threshold) {
+ notifier.Subscribe(ilib, &split_graph);
+ }
+
+ LatePairedIndexFiller pif(gp.g, PairedReadCountWeight, gp.paired_indices[ilib]);
+ notifier.Subscribe(ilib, &pif);
+
+ auto paired_streams = paired_binary_readers(reads, false, (size_t) reads.data().mean_insert_size);
+ notifier.ProcessLibrary(paired_streams, ilib, *ChooseProperMapper(gp, reads));
+ cfg::get_writable().ds.reads[ilib].data().pi_threshold = split_graph.GetThreshold();
+
+ if (map_single_reads) {
+ ProcessSingleReads(gp, ilib);
+ }
+}
+
+bool HasGoodRRLibs() {
+ for (size_t i = 0; i < cfg::get().ds.reads.lib_count(); ++i) {
+ const auto &lib = cfg::get().ds.reads[i];
+ if (lib.is_contig_lib())
+ continue;
+ if (lib.is_paired() &&
+ lib.data().mean_insert_size == 0.0) {
+ continue;
+ }
+ if (lib.is_repeat_resolvable()) {
+ return true;
+ }
+ }
+ return false;
+}
+
+bool HasOnlyMP() {
+ for (size_t i = 0; i < cfg::get().ds.reads.lib_count(); ++i) {
+ if (cfg::get().ds.reads[i].type() == io::LibraryType::PathExtendContigs)
+ continue;
+ if (cfg::get().ds.reads[i].type() != io::LibraryType::MatePairs &&
+ cfg::get().ds.reads[i].type() != io::LibraryType::HQMatePairs) {
+ return false;
+ }
+ }
+ return true;
+}
+
+//todo improve logic
+bool ShouldMapSingleReads(size_t ilib) {
+ using config::single_read_resolving_mode;
+ switch (cfg::get().single_reads_rr) {
+ case single_read_resolving_mode::none: {
+ return false;
+ }
+ case single_read_resolving_mode::all: {
+ return true;
+ }
+ case single_read_resolving_mode::only_single_libs: {
+ //Map when there are no PacBio/paired libs, only mate-pairs, or for the single-read lib itself
+ return !HasGoodRRLibs() || HasOnlyMP() ||
+ (cfg::get().ds.reads[ilib].type() == io::LibraryType::SingleReads);
+ }
+ default:
+ VERIFY_MSG(false, "Invalid mode value");
+ }
+ return false;
+}
+
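+// Stage entry point: estimates insert sizes for paired libraries (via BWA or internal mapping) and fills paired-info and long-read storages.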
+void PairInfoCount::run(conj_graph_pack &gp, const char *) {
+ gp.InitRRIndices();
+ gp.EnsureBasicMapping();
+
+ //fixme implement better universal logic
+ size_t edge_length_threshold = cfg::get().mode == config::pipeline_type::meta ? 1000 : stats::Nx(gp.g, 50);
+ INFO("Min edge length for estimation: " << edge_length_threshold);
+ bwa_pair_info::BWAPairInfoFiller bwa_counter(gp.g,
+ cfg::get().bwa.path_to_bwa,
+ path::append_path(cfg::get().output_dir, "bwa_count"),
+ cfg::get().max_threads, !cfg::get().bwa.debug);
+
+ for (size_t i = 0; i < cfg::get().ds.reads.lib_count(); ++i) {
+ const auto &lib = cfg::get().ds.reads[i];
+
+ if (cfg::get().bwa.bwa_enable && lib.is_bwa_alignable()) {
+ //Run insert size estimation and paired index filling together to save disk space (the SAM file is removed right after the lib is processed)
+ bwa_counter.ProcessLib(i, cfg::get_writable().ds.reads[i], gp.paired_indices[i],
+ edge_length_threshold, cfg::get().bwa.min_contig_len);
+ } else if (lib.is_paired()) {
+ INFO("Estimating insert size for library #" << i);
+ const auto &lib_data = lib.data();
+ size_t rl = lib_data.read_length;
+ size_t k = cfg::get().K;
+ bool insert_size_refined = RefineInsertSizeForLib(gp, i, edge_length_threshold);
+
+ if (!insert_size_refined) {
+ cfg::get_writable().ds.reads[i].data().mean_insert_size = 0.0;
+ WARN("Unable to estimate insert size for paired library #" << i);
+ if (rl > 0 && rl <= k) {
+ WARN("Maximum read length (" << rl << ") should be greater than K (" << k << ")");
+ } else if (rl <= k * 11 / 10) {
+ WARN("Maximum read length (" << rl << ") is probably too close to K (" << k << ")");
+ } else {
+ WARN("None of paired reads aligned properly. Please, check orientation of your read pairs.");
+ }
+ continue;
+ } else {
+ INFO(" Insert size = " << lib_data.mean_insert_size <<
+ ", deviation = " << lib_data.insert_size_deviation <<
+ ", left quantile = " << lib_data.insert_size_left_quantile <<
+ ", right quantile = " << lib_data.insert_size_right_quantile <<
+ ", read length = " << lib_data.read_length);
+
+ if (lib_data.mean_insert_size < 1.1 * (double) rl) {
+ WARN("Estimated mean insert size " << lib_data.mean_insert_size
+ << " is very small compared to read length " << rl);
+ }
+ }
+ }
+ }
+
+ for (size_t i = 0; i < cfg::get().ds.reads.lib_count(); ++i) {
+ const auto &lib = cfg::get().ds.reads[i];
+ if (lib.is_pacbio_alignable()) {
+ INFO("Library #" << i << " was mapped by PacBio mapper, skipping");
+ continue;
+ } else if (lib.is_contig_lib()) {
+ INFO("Mapping contigs library #" << i);
+ ProcessSingleReads(gp, i, false);
+ } else if (cfg::get().bwa.bwa_enable && lib.is_bwa_alignable()) {
+ INFO("Library #" << i << " was mapped by BWA, skipping");
+ continue;
+ } else {
+ INFO("Mapping library #" << i);
+ bool map_single_reads = ShouldMapSingleReads(i);
+ cfg::get_writable().use_single_reads |= map_single_reads;
+
+ if(cfg::get().mode == debruijn_graph::config::pipeline_type::meta
+ && cfg::get().use_single_reads) {
+ map_single_reads = false;
+ cfg::get_writable().use_single_reads = false;
+ WARN("Single reads mappings are not used in metagenomic mode");
+ }
+
+ if (lib.is_paired() && lib.data().mean_insert_size != 0.0) {
+ INFO("Mapping paired reads (takes a while) ");
+ ProcessPairedReads(gp, i, map_single_reads);
+ } else if (map_single_reads) {
+ INFO("Mapping single reads (takes a while) ");
+ ProcessSingleReads(gp, i);
+ }
+
+ if (map_single_reads) {
+ INFO("Total paths obtained from single reads: " << gp.single_long_reads[i].size());
+ }
+ }
+ }
+
+ SensitiveReadMapper<Graph>::EraseIndices();
+}
+
+}
diff --git a/src/projects/spades/pair_info_count.hpp b/src/projects/spades/pair_info_count.hpp
new file mode 100644
index 0000000..6c416bc
--- /dev/null
+++ b/src/projects/spades/pair_info_count.hpp
@@ -0,0 +1,24 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "pipeline/stage.hpp"
+
+namespace debruijn_graph {
+
+class PairInfoCount : public spades::AssemblyStage {
+ public:
+ PairInfoCount(bool preliminary = false)
+ : AssemblyStage(preliminary ? "Preliminary Paired Information Counting" : "Paired Information Counting",
+ preliminary ? "late_pair_info_count_preliminary" : "late_pair_info_count") {}
+
+ void run(conj_graph_pack &gp, const char*);
+};
+
+}
+
diff --git a/src/projects/spades/repeat_resolving.cpp b/src/projects/spades/repeat_resolving.cpp
new file mode 100644
index 0000000..9a5424a
--- /dev/null
+++ b/src/projects/spades/repeat_resolving.cpp
@@ -0,0 +1,96 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#include "dev_support/logger/logger.hpp"
+#include "assembly_graph/stats/picture_dump.hpp"
+#include "visualization/graph_labeler.hpp"
+#include "paired_info/distance_estimation.hpp"
+#include "paired_info/smoothing_distance_estimation.hpp"
+#include "algorithms/path_extend/path_extend_launch.hpp"
+#include "assembly_graph/graph_support/contig_output.hpp"
+#include "visualization/position_filler.hpp"
+#include "assembly_graph/graph_alignment/long_read_storage.hpp"
+#include "repeat_resolving.hpp"
+
+namespace debruijn_graph {
+
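+// Runs path-extend repeat resolution; writes "scaffolds" when the scaffolder is enabled, otherwise "final_contigs".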
+void PEResolving(conj_graph_pack& gp) {
+ vector<size_t> indexes;
+ std::string name = "scaffolds";
+ bool traverse_loops = true;
+ if (!(cfg::get().use_scaffolder && cfg::get().pe_params.param_set.scaffolder_options.on)) {
+ name = "final_contigs";
+ traverse_loops = false;
+ }
+ path_extend::ResolveRepeatsPe(gp, cfg::get().output_dir, name, traverse_loops, boost::optional<std::string>("final_contigs"));
+}
+
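+// Returns true if at least one repeat-resolvable library is unpaired or has a non-zero estimated insert size.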
+inline bool HasValidLibs() {
+ for (const auto& lib : cfg::get().ds.reads) {
+ if (lib.is_repeat_resolvable()) {
+ if (!lib.is_paired() || !math::eq(lib.data().mean_insert_size, 0.0)) {
+ return true;
+ }
+ }
+ }
+ return false;
+}
+
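+// Stage entry point: dumps the pre-RR contigs and assembly graph, then runs the configured repeat resolver (only path-extend is currently supported).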
+void RepeatResolution::run(conj_graph_pack &gp, const char*) {
+ if (cfg::get().developer_mode) {
+ stats::PrepareForDrawing(gp);
+ }
+
+ omnigraph::DefaultLabeler<Graph> labeler(gp.g, gp.edge_pos);
+ stats::detail_info_printer printer(gp, labeler, cfg::get().output_dir);
+ printer(config::info_printer_pos::before_repeat_resolution);
+
+ //todo awful hack to get around PE using cfg::get everywhere...
+ auto tmp_params_storage = cfg::get().pe_params;
+ if (preliminary_) {
+ INFO("Setting up preliminary path extend settings")
+ cfg::get_writable().pe_params = *cfg::get().prelim_pe_params;
+ }
+ OutputContigs(gp.g, cfg::get().output_dir + "before_rr", false, 0, false);
+ OutputContigsToFASTG(gp.g, cfg::get().output_dir + "assembly_graph", gp.components);
+
+ bool no_valid_libs = !HasValidLibs();
+
+ bool use_single_reads = cfg::get().use_single_reads;
+ if (cfg::get().rr_enable && no_valid_libs && !use_single_reads)
+ WARN("Insert size was not estimated for any of the paired libraries, repeat resolution module will not run.");
+
+ if ((no_valid_libs || cfg::get().rm == config::resolving_mode::none) && !use_single_reads) {
+ OutputContigs(gp.g, cfg::get().output_dir + "final_contigs", false, 0, false);
+ return;
+ }
+ if (cfg::get().rm == config::resolving_mode::path_extend) {
+ INFO("Using Path-Extend repeat resolving");
+ PEResolving(gp);
+ } else {
+ INFO("Unsupported repeat resolver");
+ OutputContigs(gp.g, cfg::get().output_dir + "final_contigs", false, 0, false);
+ }
+ if (preliminary_) {
+ INFO("Restoring initial path extend settings")
+ cfg::get_writable().pe_params = tmp_params_storage;
+ }
+}
+
+void ContigOutput::run(conj_graph_pack &gp, const char*) {
+ OutputContigs(gp.g, cfg::get().output_dir + "simplified_contigs", cfg::get().use_unipaths,
+ cfg::get().simp.tec.plausibility_length, false);
+ OutputContigs(gp.g, cfg::get().output_dir + "before_rr", false, 0, false);
+ OutputContigsToFASTG(gp.g, cfg::get().output_dir + "assembly_graph", gp.components);
+ OutputContigs(gp.g, cfg::get().output_dir + "final_contigs", false, 0, false);
+
+}
+
+} // debruijn_graph
+
+
+
diff --git a/src/projects/spades/repeat_resolving.hpp b/src/projects/spades/repeat_resolving.hpp
new file mode 100644
index 0000000..8178e4a
--- /dev/null
+++ b/src/projects/spades/repeat_resolving.hpp
@@ -0,0 +1,42 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "pipeline/stage.hpp"
+
+namespace debruijn_graph {
+
+class RepeatResolution : public spades::AssemblyStage {
+ const bool preliminary_;
+public:
+ RepeatResolution(bool preliminary = false)
+ : AssemblyStage(preliminary ? "Preliminary Repeat Resolving" : "Repeat Resolving",
+ preliminary ? "repeat_resolving_preliminary" : "repeat_resolving"),
+ preliminary_(preliminary) { }
+
+ void load(conj_graph_pack &, const std::string &, const char *) { }
+
+ void save(const conj_graph_pack &, const std::string &, const char *) const { }
+
+ void run(conj_graph_pack &gp, const char *);
+};
+
+class ContigOutput : public spades::AssemblyStage {
+public:
+ ContigOutput()
+ : AssemblyStage("Contig Output", "contig_output") { }
+
+ void load(conj_graph_pack &, const std::string &, const char *) { }
+
+ void save(const conj_graph_pack &, const std::string &, const char *) const { }
+
+ void run(conj_graph_pack &gp, const char *);
+};
+
+}
+
diff --git a/src/projects/spades/second_phase_setup.cpp b/src/projects/spades/second_phase_setup.cpp
new file mode 100644
index 0000000..9f09674
--- /dev/null
+++ b/src/projects/spades/second_phase_setup.cpp
@@ -0,0 +1,42 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#include "io/dataset_support/dataset_readers.hpp"
+#include "io/dataset_support/read_converter.hpp"
+
+#include "paired_info/paired_info.hpp"
+
+#include "assembly_graph/stats/picture_dump.hpp"
+#include "pair_info_count.hpp"
+#include "second_phase_setup.hpp"
+
+
+namespace debruijn_graph {
+
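+// Prepares the second assembly phase: preliminary contigs are renamed to first_pe_contigs.fasta and registered as an untrusted PathExtendContigs library.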
+void SecondPhaseSetup::run(conj_graph_pack &gp, const char*) {
+ INFO("Preparing second phase");
+ gp.ClearRRIndices();
+
+ std::string old_pe_contigs_filename = cfg::get().output_dir + "final_contigs.fasta";
+ std::string new_pe_contigs_filename = cfg::get().output_dir + "first_pe_contigs.fasta";
+
+ VERIFY(path::check_existence(old_pe_contigs_filename));
+ INFO("Moving preliminary contigs from " << old_pe_contigs_filename << " to " << new_pe_contigs_filename);
+ int code = rename(old_pe_contigs_filename.c_str(), new_pe_contigs_filename.c_str());
+ VERIFY(code == 0);
+
+ io::SequencingLibrary<config::DataSetData> untrusted_contigs;
+ untrusted_contigs.push_back_single(new_pe_contigs_filename);
+ untrusted_contigs.set_orientation(io::LibraryOrientation::Undefined);
+ untrusted_contigs.set_type(io::LibraryType::PathExtendContigs);
+ cfg::get_writable().ds.reads.push_back(untrusted_contigs);
+
+ //FIXME get rid of this awful variable
+ cfg::get_writable().use_single_reads = false;
+ INFO("Ready to run second phase");
+}
+
+}
diff --git a/src/projects/spades/second_phase_setup.hpp b/src/projects/spades/second_phase_setup.hpp
new file mode 100644
index 0000000..bd40d88
--- /dev/null
+++ b/src/projects/spades/second_phase_setup.hpp
@@ -0,0 +1,22 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "pipeline/stage.hpp"
+
+namespace debruijn_graph {
+
+//todo rename
+class SecondPhaseSetup : public spades::AssemblyStage {
+public:
+ SecondPhaseSetup()
+ : AssemblyStage("Second Phase Setup", "second_phase_setup") { }
+
+ void run(conj_graph_pack &gp, const char *);
+};
+
+}
diff --git a/src/projects/truseq_analysis/AlignmentAnalyserNew.cpp b/src/projects/truseq_analysis/AlignmentAnalyserNew.cpp
new file mode 100644
index 0000000..b0c4f8f
--- /dev/null
+++ b/src/projects/truseq_analysis/AlignmentAnalyserNew.cpp
@@ -0,0 +1,102 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+//
+// Created by anton on 5/15/15.
+//
+
+#include "dev_support/standard_base.hpp"
+#include "algorithms/dijkstra/dijkstra_helper.hpp"
+#include "AlignmentAnalyserNew.hpp"
+
+namespace alignment_analysis {
+ using omnigraph::Range;
+
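+ // Walks back from the end of the path over connected mappings until about step_ bases are covered; returns the index to restart from.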
+ size_t AlignmentAnalyserNew::StepBack(const vector<ConsistentMapping> &path) const {
+ size_t cur_step = 0;
+ size_t cur = path.size() - 1;
+ while(cur > 0 && cur_step + path[cur].size() < step_ && path[cur - 1].CheckConnect(path[cur])) {
+ cur_step += path[cur].size();
+ cur--;
+ }
+ return cur;
+ }
+
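+ // Splits the mapping path into consistent fragments; when consecutive mappings disagree, attempts to bridge them with a bounded Dijkstra detour.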
+ vector <ConsistentMapping> AlignmentAnalyserNew::Analyse(const omnigraph::MappingPath<EdgeId> &path) const {
+ TRACE("");
+ TRACE("Analysis of path of length " << path.size() << ": " << path);
+ vector <ConsistentMapping> result;
+ TRACE("Adding " << path[0]);
+ result.push_back(ConsistentMapping(graph_, path[0].first, path[0].second));
+ for(size_t i = 1; i < path.size(); i++) {
+ TRACE("Attempting to add new part");
+ ConsistentMapping &mapping = result.back();
+ if(mapping.CheckConnect(path[i].first, path[i].second)) {
+ TRACE("Adding #" << result.size() << ": " << path[i]);
+ result.push_back(ConsistentMapping(graph_, path[i].first, path[i].second));
+ } else if (mapping.EndEdge() == path[i].first && mapping.Back().second.end_pos <= path[i].second.mapped_range.start_pos) {
+ Range initial(mapping.GetInitialRange().end_pos, path[i].second.initial_range.start_pos);
+ Range mapped(mapping.Back().second.end_pos, path[i].second.mapped_range.start_pos);
+ result.push_back(ConsistentMapping(graph_, path[i].first, omnigraph::MappingRange(initial, mapped)));
+ result.push_back(ConsistentMapping(graph_, path[i].first, path[i].second));
+ } else {
+ TRACE("Could not add " << path[i]);
+ size_t pos = StepBack(result);
+ VertexId start = result[pos].EndVertex();
+ TRACE("Start vertex: " << start);
+ omnigraph::DijkstraHelper<Graph>::BoundedDijkstra d = omnigraph::DijkstraHelper<Graph>::CreateBoundedDijkstra(graph_, 3000 + graph_.k(), 1000);
+ d.Run(start);
+ size_t best = i;
+ for (size_t j = i, cur_step = 0; j < path.size() && cur_step < step_; j++) {
+ TRACE("Checking candidate #" << j << ": " << path[j]);
+ if (d.DistanceCounted(graph_.EdgeStart(path[j].first))){
+ best = j;
+ break;
+ }
+ cur_step += path[j].second.mapped_range.size();
+ }
+ if(best < path.size() && d.DistanceCounted(graph_.EdgeStart(path[best].first))) {
+ //todo: make better cutting
+ this->Cut(result, start);
+ vector<EdgeId> detour_path = d.GetShortestPathTo(graph_.EdgeStart(path[best].first));
+ vector<EdgeRange> detour_mapping;
+ EdgeRange er = result.back().GetMappedPath().back();
+ if (er.second.end_pos != graph_.length(er.first)) {
+ detour_mapping.push_back(EdgeRange(er.first, Range(er.second.end_pos, graph_.length(er.first))));
+ }
+ for(auto it = detour_path.begin(); it != detour_path.end(); ++it) {
+ detour_mapping.push_back(EdgeRange(*it, Range(0, graph_.length(*it))));
+ }
+ if (path[best].second.mapped_range.start_pos != 0) {
+ detour_mapping.push_back(EdgeRange(path[best].first, Range(0, path[best].second.mapped_range.start_pos)));
+ }
+ if(detour_mapping.size() > 0) {
+ Range r(result.back().GetInitialRange().end_pos, path[best].second.initial_range.start_pos);
+ ConsistentMapping detour(graph_, r, detour_mapping);
+ TRACE("Adding #" << result.size() << ": " << detour);
+ result.push_back(detour);
+ } else {
+ TRACE("Empty detour path");
+ }
+ TRACE("Adding #" << result.size() << ": " << path[best]);
+ result.push_back(ConsistentMapping(graph_, path[best].first, path[best].second));
+ i = best;
+ } else {
+ TRACE("Adding #" << result.size() << ": " << path[i]);
+ result.push_back(ConsistentMapping(graph_, path[i].first, path[i].second));
+ }
+ }
+ }
+ return result;
+ }
+
+ void AlignmentAnalyserNew::Cut(vector<ConsistentMapping> &path, VertexId start) const {
+ while(path.back().EndVertex() != start) {
+ path.pop_back();
+ }
+ }
+}
diff --git a/src/projects/truseq_analysis/AlignmentAnalyserNew.hpp b/src/projects/truseq_analysis/AlignmentAnalyserNew.hpp
new file mode 100644
index 0000000..0ad6484
--- /dev/null
+++ b/src/projects/truseq_analysis/AlignmentAnalyserNew.hpp
@@ -0,0 +1,34 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "assembly_graph/graph_core/graph.hpp"
+#include "assembly_graph/paths/mapping_path.hpp"
+#include "consistent_mapping.h"
+
+namespace alignment_analysis {
+
+ class AlignmentAnalyserNew {
+ private:
+ typedef debruijn_graph::DeBruijnGraph Graph;
+ typedef Graph::EdgeId EdgeId;
+ typedef Graph::VertexId VertexId;
+ public:
+ AlignmentAnalyserNew(Graph const &graph, size_t step) : graph_(graph), step_(step) { }
+ vector <ConsistentMapping> Analyse(const omnigraph::MappingPath<EdgeId> &path) const;
+ private:
+ void Cut(vector<ConsistentMapping> &path, VertexId start) const;
+
+ size_t StepBack(const vector<ConsistentMapping> &path) const;
+
+ const Graph &graph_;
+ size_t step_;
+ DECL_LOGGER("AlignmentAnalyserNew")
+ };
+
+}
diff --git a/src/projects/truseq_analysis/CMakeLists.txt b/src/projects/truseq_analysis/CMakeLists.txt
new file mode 100644
index 0000000..0b07475
--- /dev/null
+++ b/src/projects/truseq_analysis/CMakeLists.txt
@@ -0,0 +1,15 @@
+############################################################################
+# Copyright (c) 2015 Saint Petersburg State University
+# Copyright (c) 2011-2014 Saint Petersburg Academic University
+# All Rights Reserved
+# See file LICENSE for details.
+############################################################################
+
+project(truseq_analysis CXX)
+
+add_executable(truseq_analysis
+ main.cpp
+ alignment_analyser.cpp AlignmentAnalyserNew.cpp consistent_mapping.cpp analysis_pipeline.cpp)
+
+target_link_libraries(truseq_analysis spades_modules ${COMMON_LIBRARIES})
+
diff --git a/src/projects/truseq_analysis/alignment_analyser.cpp b/src/projects/truseq_analysis/alignment_analyser.cpp
new file mode 100644
index 0000000..9f5c102
--- /dev/null
+++ b/src/projects/truseq_analysis/alignment_analyser.cpp
@@ -0,0 +1,116 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#include "dev_support/standard_base.hpp"
+#include "alignment_analyser.hpp"
+
+namespace alignment_analysis {
+ AlignmentAnalyser::AlignmentAnalyser(const vector <io::SingleRead> &scaffolds,
+ const vector <io::SingleRead> &genome,
+ const Graph &graph, const Mapper &mapper) : graph_(graph),
+ mapper_(mapper),
+ scaffolds_(scaffolds),
+ genome_(genome) {
+ }
+
+ string AlignmentAnalyser::str(const EdgeRange &er) const {
+ stringstream result;
+ result << "[" << er.first.int_id() << " len: " << er.first->length(55) << " from: " <<
+ graph_.EdgeStart(er.first) << " to: " << graph_.EdgeEnd(er.first) << ", " << er.second << "]";
+ return result.str();
+ }
+
+ using std::cout;
+ using std::endl;
+
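+ // Tries to join neighbouring alignments: same-edge overlaps are merged directly, short graph detours are reported as short mutations, otherwise a break is logged.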
+ vector <ConsistentMapping> AlignmentAnalyser::DetectAndMaskShortMutations(
+ const vector <ConsistentMapping> &alignments) {
+
+ if (alignments.empty()) {
+ return vector<ConsistentMapping>();
+ }
+ DijkstraHelper<Graph>::BoundedDijkstra d = DijkstraHelper<Graph>::CreateBoundedDijkstra(graph_,
+ 3000 + graph_.k(),
+ 1000);
+ vector <ConsistentMapping> result = {alignments.front()};
+ for (size_t i = 0; i + 1 < alignments.size(); i++) {
+ ConsistentMapping &prev = result.back();
+ const ConsistentMapping &next = alignments[i + 1];
+ const EdgeRange &back = prev.Back();
+ const EdgeRange &front = next.Front();
+ if (back.first == front.first) {
+ vector <EdgeRange> v;
+ if (back.second.end_pos < front.second.start_pos)
+ v.push_back(EdgeRange(back.first,
+ Range(back.second.end_pos, front.second.start_pos)));
+ if (back.second.end_pos <= front.second.start_pos) {
+ prev.Join(next, v);
+ continue;
+ } else {
+ std::cout << "incompatible alignments" << std::endl;
+ }
+ }
+ VertexId gap_start = graph_.EdgeEnd(back.first);
+ VertexId gap_end = graph_.EdgeStart(front.first);
+ d.Run(gap_start);
+ if (d.DistanceCounted(gap_end)) {
+ vector <EdgeId> path = d.GetShortestPathTo(gap_end);
+ int s = (int(graph_.length(back.first)) - int(back.second.end_pos)) + int(front.second.start_pos);
+ for (auto it = path.begin(); it != path.end(); ++it)
+ s += graph_.length(*it);
+ int diff = int(next.GetInitialRange().start_pos) - int(prev.GetInitialRange().end_pos) - s;
+ this->log_ << "Found short mutation: segment [" << prev.GetInitialRange().end_pos << ", " <<
+ next.GetInitialRange().start_pos <<
+ "] was replaced with path of length " << s << "(difference : " << diff << ")" << endl;
+ result.back().ForceJoin(next, path);
+ } else {
+ result.push_back(next);
+ if (graph_.OutgoingEdgeCount(graph_.EdgeEnd(back.first)) == 0 &&
+ graph_.IncomingEdgeCount(graph_.EdgeStart(front.first)) == 0
+ && back.second.end_pos + 100 > graph_.length(back.first) && front.second.start_pos < 100) {
+ this->log_ << "Coverage break detected " << prev.GetInitialRange().end_pos << " till " <<
+ next.GetInitialRange().start_pos << endl;
+ } else {
+ this->log_ << "Found break: from " << prev.GetInitialRange().end_pos << " till " <<
+ next.GetInitialRange().start_pos << endl;
+ }
+ }
+ }
+ return result;
+ }
+
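+ // Converts a raw mapping path into maximal runs of consistent, contiguous mappings.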
+ vector <ConsistentMapping> AlignmentAnalyser::ExtractConsistentMappings(const MappingPath<EdgeId> &path) {
+ vector <ConsistentMapping> result;
+ for (size_t i = 0; i < path.size(); i++) {
+ pair<EdgeId, MappingRange> m = path[i];
+ ConsistentMapping mapping = ConsistentMapping(this->graph_, m.first, m.second);
+ if (result.empty()) {
+ result.push_back(mapping);
+ } else {
+ ConsistentMapping &back = result.back();
+ if (back.CheckConnect(m.first, m.second.mapped_range) &&
+ back.GetInitialRange().end_pos == m.second.initial_range.start_pos) {
+ back.Join(mapping);
+ } else {
+ result.push_back(mapping);
+ }
+ }
+ }
+ return result;
+ }
+
+ string AlignmentAnalyser::Analyse(const io::SingleRead &genome_part) {
+ log_.str("");
+ MappingPath<EdgeId> path = mapper_.MapRead(genome_part);
+ stringstream result;
+ log_ << "Analysis of part " << genome_part.name() << endl;
+ cout << "Analysis of part " << genome_part.name() << endl;
+ vector <ConsistentMapping> mapping = ExtractConsistentMappings(path);
+ mapping = DetectAndMaskShortMutations(mapping);
+ return log_.str();
+ }
+}
diff --git a/src/projects/truseq_analysis/alignment_analyser.hpp b/src/projects/truseq_analysis/alignment_analyser.hpp
new file mode 100644
index 0000000..2da4fde
--- /dev/null
+++ b/src/projects/truseq_analysis/alignment_analyser.hpp
@@ -0,0 +1,41 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "dev_support/standard_base.hpp"
+#include "pipeline/graph_pack.hpp"
+#include "consistent_mapping.h"
+
+namespace alignment_analysis {
+
+ class AlignmentAnalyser {
+ private:
+ typedef debruijn_graph::conj_graph_pack::graph_t Graph;
+ typedef Graph::EdgeId EdgeId;
+ typedef Graph::VertexId VertexId;
+ typedef debruijn_graph::NewExtendedSequenceMapper<Graph, debruijn_graph::conj_graph_pack::index_t> Mapper;
+ stringstream log_;
+ const Graph &graph_;
+ const Mapper &mapper_;
+ const vector <io::SingleRead> &scaffolds_;
+ const vector <io::SingleRead> &genome_;
+
+ string str(const EdgeRange &er) const;
+
+ public:
+ AlignmentAnalyser(const vector <io::SingleRead> &scaffolds, const vector <io::SingleRead> &genome,
+ const Graph &graph, const Mapper &mapper);
+
+ string Analyse(const io::SingleRead &genome_part);
+
+ private:
+ vector <ConsistentMapping> ExtractConsistentMappings(const MappingPath<EdgeId> &path);
+
+ vector <ConsistentMapping> DetectAndMaskShortMutations(const vector <ConsistentMapping> &vector);
+ };
+}
diff --git a/src/projects/truseq_analysis/analysis_pipeline.cpp b/src/projects/truseq_analysis/analysis_pipeline.cpp
new file mode 100644
index 0000000..c7ef6a5
--- /dev/null
+++ b/src/projects/truseq_analysis/analysis_pipeline.cpp
@@ -0,0 +1,140 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+//
+// Created by anton on 16.05.15.
+//
+
+#include "stages/construction.hpp"
+#include "dev_support/standard_base.hpp"
+#include "analysis_pipeline.hpp"
+
+spades::VariationDetectionStage::VariationDetectionStage(string output_file, const Config &config) : AssemblyStage("VariationDetection", "variation_detection"),
+ output_file_(output_file), config_(config) {
+}
+
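+// Loads all sequences from the given file into memory as SingleRead objects.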
+vector<io::SingleRead> spades::VariationDetectionStage::ReadScaffolds(const string &scaffolds_file) {
+ io::FileReadStream scaffold_stream(scaffolds_file);
+ vector<io::SingleRead> scaffolds;
+ while(!scaffold_stream.eof()) {
+ io::SingleRead scaffold;
+ scaffold_stream >> scaffold;
+ scaffolds.push_back(scaffold);
+ }
+ return scaffolds;
+}
+
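+// Maps each reference part to the graph, extracts consistent alignments and reports large variations and breakpoints to the output file.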
+void spades::VariationDetectionStage::run(debruijn_graph::conj_graph_pack &graph_pack, const char *) {
+ using debruijn_graph::EdgeId;
+ using alignment_analysis::EdgeRange;
+ INFO("Analysis of contigs from " << config_.scaffolds_file);
+ vector<io::SingleRead> scaffolds = ReadScaffolds(config_.scaffolds_file);
+ vector<io::SingleRead> genome = ReadScaffolds(config_.genome_file);
+ auto mapper_ptr = MapperInstance(graph_pack);
+// alignment_analysis::AlignmentAnalyser aa(scaffolds, genome, graph_pack.g, *mapper_ptr);
+ const debruijn_graph::DeBruijnGraph &graph = graph_pack.g;
+ alignment_analysis::AlignmentAnalyserNew aa(graph, 2 * graph_pack.k_value + 10);
+
+ ofstream os(output_file_);
+ for(auto it = genome.begin(); it != genome.end(); ++it) {
+ const io::SingleRead &part = *it;
+ MappingPath<EdgeId> path = mapper_ptr->MapRead(part);
+ vector<alignment_analysis::ConsistentMapping> result = aa.Analyse(path);
+ os << "Analysis of part " << part.name() << endl;
+ for(size_t i = 0; i < result.size(); i++) {
+ alignment_analysis::ConsistentMapping &cm = result[i];
+// os << "Alignment: " << cm.GetInitialRange() << " -> ";
+// const vector<EdgeRange> &mappedPath = cm.GetMappedPath();
+// for(auto pit = mappedPath.begin(); pit != mappedPath.end(); ++pit) {
+// const EdgeRange &er = *pit;
+// os << er << " ";
+// }
+// os << endl;
+ size_t diff = cm.GetInitialRange().size() > cm.size() ? cm.GetInitialRange().size() - cm.size() : cm.size() - cm.GetInitialRange().size();
+ if(diff > 500)
+ os << cm.CompareToReference(part.GetSequenceString()) << endl;
+ }
+ result = ExtractConsistentMappings(result);
+ for(size_t i = 0; i + 1 < result.size(); i++) {
+ alignment_analysis::ConsistentMapping &cm = result[i];
+ alignment_analysis::ConsistentMapping &next_cm = result[i + 1];
+ if (this->CheckEndVertex(graph, cm.EndEdge(), 150 + cm.Back().second.end_pos) &&
+ this->CheckEndVertex(graph, graph.conjugate(next_cm.StartEdge()),
+ 150 + graph.length(next_cm.StartEdge()) - next_cm.Front().second.start_pos)) {
+// os << "Coverage break: " << "[" << cm.GetInitialRange().end_pos << ", " << next_cm.GetInitialRange().start_pos << "]"<< endl;
+ } else {
+ if(cm.GetInitialRange().size() < 100 || next_cm.GetInitialRange().size() < 100) {
+// os << "Unreliable alignment event: " << "[" << cm.GetInitialRange().end_pos << ", " <<
+// next_cm.GetInitialRange().start_pos << "]" << endl;
+ } else {
+ os << "Breakpoint: " << "[" << cm.GetInitialRange().end_pos << ", " <<
+ next_cm.GetInitialRange().start_pos << "]" << endl;
+ }
+ }
+ }
+ }
+ os.close();
+ INFO("Analisys results written to " << output_file_);
+}
+
+void spades::run_truseq_analysis() {
+ INFO("TruSeq analysis started");
+
+ debruijn_graph::conj_graph_pack conj_gp(cfg::get().K,
+ cfg::get().tmp_dir,
+ cfg::get().ds.reads.lib_count(),
+ cfg::get().ds.reference_genome,
+ cfg::get().flanking_range,
+ cfg::get().pos.max_mapping_gap,
+ cfg::get().pos.max_gap_diff);
+ StageManager manager({cfg::get().developer_mode,
+ cfg::get().load_from,
+ cfg::get().output_saves});
+ manager.add(new debruijn_graph::Construction());
+ std::string output_file = cfg::get().output_dir + "analysis_report";
+ manager.add(new VariationDetectionStage(output_file, cfg::get().tsa));
+ INFO("Output directory: " << cfg::get().output_dir);
+ conj_gp.kmer_mapper.Attach();
+ manager.run(conj_gp, cfg::get().entry_point.c_str());
+ INFO("Scaffold correction finished.");
+}
+
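+// Returns true if every extension of edge 'e' hits a zero-coverage edge or a dead end within 'dist' bases.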
+bool spades::VariationDetectionStage::CheckEndVertex(debruijn_graph::DeBruijnGraph const &graph,
+ debruijn_graph::EdgeId e, size_t dist) {
+ using debruijn_graph::VertexId;
+ if(graph.coverage(e) == 0) {
+ return true;
+ }
+ if (graph.length(e) > dist) {
+ return false;
+ }
+ VertexId v = graph.EdgeEnd(e);
+ GraphCore<debruijn_graph::DeBruijnDataMaster>::IteratorContainer outgoingEdges = graph.OutgoingEdges(v);
+ for(auto it = outgoingEdges.begin(); it != outgoingEdges.end(); ++it) {
+ if(!CheckEndVertex(graph, *it, dist - graph.length(e)))
+ return false;
+ }
+ return true;
+}
+
+vector <alignment_analysis::ConsistentMapping> spades::VariationDetectionStage::ExtractConsistentMappings(const vector<alignment_analysis::ConsistentMapping> &path) {
+ vector <alignment_analysis::ConsistentMapping> result;
+ result.push_back(path[0]);
+ for (size_t i = 1; i < path.size(); i++) {
+ if (result.empty()) {
+ result.push_back(path[i]);
+ } else {
+ alignment_analysis::ConsistentMapping &back = result.back();
+ if (back.CheckConnect(path[i])) {
+ back.Join(path[i]);
+ } else {
+ result.push_back(path[i]);
+ }
+ }
+ }
+ return result;
+}
\ No newline at end of file
diff --git a/src/projects/truseq_analysis/analysis_pipeline.hpp b/src/projects/truseq_analysis/analysis_pipeline.hpp
new file mode 100644
index 0000000..a2d330f
--- /dev/null
+++ b/src/projects/truseq_analysis/analysis_pipeline.hpp
@@ -0,0 +1,39 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "dev_support/standard_base.hpp"
+#include <pipeline/stage.hpp>
+#include "alignment_analyser.hpp"
+#include "AlignmentAnalyserNew.hpp"
+
+namespace spades {
+ class VariationDetectionStage : public AssemblyStage {
+ public:
+ typedef debruijn_graph::config::debruijn_config::truseq_analysis Config;
+ private:
+ std::string output_file_;
+ const Config &config_;
+ public:
+ VariationDetectionStage(string output_file, const Config &config);
+
+ vector<io::SingleRead> ReadScaffolds(const string &scaffolds_file);
+
+ void run(debruijn_graph::conj_graph_pack &graph_pack, const char *);
+
+ DECL_LOGGER("AlignmntAnalysis")
+
+ bool CheckEndVertex(debruijn_graph::DeBruijnGraph const &graph,
+ debruijn_graph::EdgeId id, size_t i);
+ private:
+ vector <alignment_analysis::ConsistentMapping> ExtractConsistentMappings(const vector<alignment_analysis::ConsistentMapping> &path);
+ };
+
+ void run_truseq_analysis();
+
+}
diff --git a/src/projects/truseq_analysis/consistent_mapping.cpp b/src/projects/truseq_analysis/consistent_mapping.cpp
new file mode 100644
index 0000000..2e3cc63
--- /dev/null
+++ b/src/projects/truseq_analysis/consistent_mapping.cpp
@@ -0,0 +1,245 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#include "dev_support/standard_base.hpp"
+#include "AlignmentAnalyserNew.hpp"
+#include "consistent_mapping.h"
+
+namespace alignment_analysis {
+ using omnigraph::Range;
+ using omnigraph::MappingRange;
+
+ bool ConsistentMapping::CheckConnect(EdgeId e, Range r) const {
+ return CheckConnect(mapped_path.back(), EdgeRange(e, r));
+ }
+
+ bool ConsistentMapping::CheckConnect(const EdgeRange &er) const {
+ return CheckConnect(mapped_path.back(), er);
+ }
+
+ bool ConsistentMapping::CheckConnect(const vector <EdgeRange> &path) const {
+ for (size_t i = 0; i + 1 < path.size(); i++) {
+ if (!CheckConnect(path[i], path[i + 1]))
+ return false;
+ }
+ return true;
+ }
+
+ bool ConsistentMapping::CheckConnect(EdgeId e, MappingRange r) const {
+ return CheckConnect(e, r.mapped_range) && initial_range.end_pos == r.initial_range.start_pos;
+ }
+
+ bool ConsistentMapping::CheckConnect(const ConsistentMapping &other) const {
+ return this->CheckConnect(other.Front()) && initial_range.end_pos == other.initial_range.start_pos;
+ }
+
+ void ConsistentMapping::Join(const ConsistentMapping &other) {
+ if (!this->IsEmpty() && !other.IsEmpty()) {
+ VERIFY(this->initial_range.end_pos == other.initial_range.start_pos);
+ VERIFY(CheckConnect(this->mapped_path.back(), other.mapped_path.front()));
+ }
+ this->initial_range = this->initial_range.Merge(other.initial_range);
+ this->mapped_path.insert(this->mapped_path.end(), other.mapped_path.begin(), other.mapped_path.end());
+ }
+
+ void ConsistentMapping::Join(const ConsistentMapping &other, const vector <EdgeRange> &path) {
+ VERIFY(!this->IsEmpty());
+ VERIFY(!other.IsEmpty());
+ VERIFY(path.size() == 0 || CheckConnect(this->Back(), path.front()));
+ VERIFY(path.size() == 0 || CheckConnect(path.back(), other.Front()));
+ VERIFY(CheckConnect(path));
+ this->initial_range = this->initial_range.Merge(other.initial_range);
+ this->mapped_path.insert(this->mapped_path.end(), path.begin(), path.end());
+ this->mapped_path.insert(this->mapped_path.end(), other.mapped_path.begin(), other.mapped_path.end());
+ }
+
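+ // Joins 'other' onto this mapping through an explicit connecting edge path, trimming this mapping's tail and skipping a prefix of 'other' as needed.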
+ void ConsistentMapping::ForceJoin(const ConsistentMapping &other, const vector <EdgeId> &path) {
+ VERIFY(!this->IsEmpty());
+ VERIFY(!other.IsEmpty());
+ auto pos = other.mapped_path.begin();
+ if (path.empty()) {
+ VERIFY(graph_.EdgeEnd(this->mapped_path.back().first) == graph_.EdgeEnd(other.mapped_path.front().first));
+ } else {
+ CutToVertex(graph_.EdgeStart(path.front()));
+ while (pos != other.mapped_path.end() && graph_.EdgeEnd(path.back()) != graph_.EdgeStart(pos->first)) {
+ ++pos;
+ }
+ VERIFY(pos != other.mapped_path.end());
+ }
+ this->mapped_path.back().second.end_pos = graph_.length(this->mapped_path.back().first);
+ for (auto it = path.begin(); it != path.end(); ++it) {
+ this->mapped_path.push_back(EdgeRange(*it, Range(0, graph_.length(*it))));
+ }
+ this->initial_range = this->initial_range.Merge(other.initial_range);
+ EdgeRange er = *pos;
+ er.second.start_pos = 0;
+ this->mapped_path.push_back(er);
+ this->mapped_path.insert(this->mapped_path.end(), pos + 1, other.mapped_path.end());
+ }
+
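+ // Removes mappings from the tail until the path ends at 'path_start', shrinking the initial range accordingly.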
+ void ConsistentMapping::CutToVertex(VertexId path_start) {
+ while (mapped_path.size() > 0 &&
+ graph_.EdgeEnd(mapped_path.back().first) != path_start) {
+ initial_range.end_pos -= mapped_path.back().second.size();
+ mapped_path.pop_back();
+ }
+ VERIFY(this->mapped_path.size() > 0);
+ }
+
+ bool ConsistentMapping::IsEmpty() const {
+ return initial_range.empty();
+ }
+
+ bool ConsistentMapping::CheckConnect(const EdgeRange &r1, const EdgeRange &r2) const {
+ bool result = true;
+ if (r1.first != r2.first) {
+ result &= graph_.EdgeEnd(r1.first) == graph_.EdgeStart(r2.first);
+ result &= r1.second.end_pos == graph_.length(r1.first);
+ result &= r2.second.start_pos == 0;
+ } else {
+ result &= r1.second.end_pos == r2.second.start_pos;
+ }
+ return result;
+ }
+
+ ConsistentMapping::ConsistentMapping(const Graph &graph, const omnigraph::MappingPath<EdgeId> &path) : graph_(graph) {
+ VERIFY(path.size() > 0);
+ this->initial_range = Range(path.start_pos(), path.end_pos());
+ for (size_t i = 0; i < path.size(); i++) {
+ VERIFY(i == 0 ||graph.EdgeEnd(path[i - 1].first) == graph.EdgeStart(path[i].first));
+ EdgeRange p(path[i].first, path[i].second.mapped_range);
+ mapped_path.push_back(p);
+ }
+ VERIFY(CheckConnect(mapped_path));
+ }
+
+ ConsistentMapping::ConsistentMapping(const Graph &graph) : graph_(graph) {
+ }
+
+ ConsistentMapping::ConsistentMapping(const Graph &graph, EdgeId e, MappingRange m)
+ : graph_(graph), initial_range(m.initial_range), mapped_path{EdgeRange(e, m.mapped_range)} {
+ }
+
+ const Range &ConsistentMapping::GetInitialRange() const {
+ return initial_range;
+ }
+
+ const EdgeRange &ConsistentMapping::Back() const {
+ return this->mapped_path.back();
+ }
+
+ const EdgeRange &ConsistentMapping::Front() const {
+ return this->mapped_path.front();
+ }
+
+ Sequence ConsistentMapping::CorrectSequence() const {
+ SequenceBuilder sb;
+ if(mapped_path.size() == 1) {
+ return graph_.EdgeNucls(Front().first).Subseq(Front().second.start_pos, Front().second.end_pos + graph_.k());
+ }
+ sb.append(graph_.EdgeNucls(Front().first).Subseq(Front().second.start_pos));
+ for(auto it = mapped_path.begin(); it != mapped_path.end(); ++it) {
+ EdgeId e = it->first;
+ Range r = it->second;
+ VERIFY(it == mapped_path.begin() || r.start_pos == 0);
+ sb.append(graph_.EdgeNucls(e).Subseq(graph_.k(), graph_.k() + r.end_pos));
+ }
+ return sb.BuildSequence();
+
+ }
+
+ size_t ConsistentMapping::size() const {
+ size_t result = 0;
+ for(auto it = mapped_path.begin(); it != mapped_path.end(); ++it) {
+ result += it->second.size();
+ }
+ return result;
+ }
+
+ VertexId ConsistentMapping::StartVertex() const {
+ return graph_.EdgeStart(Front().first);
+ }
+
+ VertexId ConsistentMapping::EndVertex() const {
+ return graph_.EdgeEnd(Back().first);
+ }
+
+ EdgeId ConsistentMapping::StartEdge() const {
+ return Front().first;
+ }
+
+ EdgeId ConsistentMapping::EndEdge() const {
+ return Back().first;
+ }
+
+ ConsistentMapping::ConsistentMapping(Graph const &graph, Range r, const vector<EdgeRange> &path) : graph_(graph), initial_range(r), mapped_path(path){
+
+ }
+
+ vector <EdgeRange> ConsistentMapping::GenerateMappingPath(const vector <EdgeId> &path) const {
+ vector <EdgeRange> result;
+ for(auto it = path.begin(); it != path.end(); ++it) {
+ result.push_back(EdgeRange(*it, Range(0, graph_.length(*it))));
+ }
+ return result;
+ }
+
+ const vector <EdgeRange> &ConsistentMapping::GetMappedPath() const {
+ return mapped_path;
+ }
+
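+ // Compares the corrected sequence with the corresponding reference slice and reports a match, insertion, deletion or substitution.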
+ string ConsistentMapping::CompareToReference(string const &reference) const {
+ string reference_part = reference.substr(initial_range.start_pos, initial_range.size() + graph_.k());
+ string correct = CorrectSequence().str();
+ if (correct == reference_part) {
+ return "Match";
+ }
+ size_t l = 0;
+ size_t r = 0;
+ while(l < min(reference_part.size(), correct.size()) && reference_part[l] == correct[l])
+ l++;
+ while(l + r <= min(reference_part.size(), correct.size()) && reference_part[reference_part.size() - 1 - r] == correct[correct.size() - 1 - r])
+ r++;
+ stringstream ss;
+ if(l + r == reference_part.size()) {
+ ss << "Insertion (" << initial_range.start_pos + l << "): Length: " << correct.substr(l, correct.size() - l - r).size();
+ } else if(l + r == correct.size()) {
+ ss << "Deletion (" << initial_range.start_pos + l + 1 << ", " << initial_range.end_pos - r + graph_.k() << "): Length: " << reference_part.substr(l, reference_part.size() - l - r).size();
+ } else {
+ ss << "Substitution (" << initial_range.start_pos + l + 1 << ", " <<
+ initial_range.end_pos - r + graph_.k() << "): Lengths: " <<
+ reference_part.substr(l, reference_part.size() - l - r).size() << " -> " <<
+ correct.substr(l, correct.size() - l - r).size();
+ }
+ return ss.str();
+ }
+
+ ostream &operator<<(ostream& os, const EdgeRange& er) {
+ os << "EdgeRange(" << er.first.int_id() << " : " << er.second << ")";
+ return os;
+ }
+
+// void ConsistentMapping::CloseEnd() {
+// EdgeRange & er = this->mapped_path.back();
+// er.second.end_pos = graph_.length(er.first);
+// }
+//
+// void ConsistentMapping::CloseStart() {
+// EdgeRange & er = this->mapped_path.front();
+// er.second.start_pos = 0;
+// }
+
+ ostream &operator<<(ostream& os, const ConsistentMapping& cm) {
+ os << cm.GetInitialRange() << " -> ( ";
+ for(auto it = cm.GetMappedPath().begin(); it != cm.GetMappedPath().end(); ++it) {
+ EdgeRange er = *it;
+ os << er << " ";
+ }
+ os << ")";
+ return os;
+ }
+}
\ No newline at end of file
diff --git a/src/projects/truseq_analysis/consistent_mapping.h b/src/projects/truseq_analysis/consistent_mapping.h
new file mode 100644
index 0000000..162be38
--- /dev/null
+++ b/src/projects/truseq_analysis/consistent_mapping.h
@@ -0,0 +1,90 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+namespace alignment_analysis {
+ typedef debruijn_graph::DeBruijnGraph Graph;
+ typedef Graph::VertexId VertexId;
+ typedef Graph::EdgeId EdgeId;
+
+ struct EdgeRange {
+ EdgeRange(const EdgeId &first, const omnigraph::Range &second) : first(first), second(second) { }
+ EdgeId first;
+ omnigraph::Range second;
+ };
+
+ ostream & operator<<(ostream& os, const EdgeRange& er);
+
+ struct ConsistentMapping {
+ ConsistentMapping(const Graph &graph);
+
+ ConsistentMapping(const Graph &graph, EdgeId e, omnigraph::MappingRange m);
+
+ ConsistentMapping(const Graph &graph, const omnigraph::MappingPath<EdgeId> &path);
+
+ ConsistentMapping(Graph const &graph, omnigraph::Range r, const vector<EdgeRange> &path);
+
+ bool CheckConnect(EdgeId e, omnigraph::Range r) const;
+
+ bool CheckConnect(const EdgeRange &er) const;
+
+ bool CheckConnect(EdgeId e, omnigraph::MappingRange r) const;
+
+ bool CheckConnect(const ConsistentMapping &other) const;
+
+ bool IsEmpty() const;
+
+ void Join(const ConsistentMapping &other);
+
+ void Join(const ConsistentMapping &other, const vector <EdgeRange> &path);
+
+ void ForceJoin(const ConsistentMapping &other, const vector <EdgeId> &path);
+
+ omnigraph::Range const &GetInitialRange() const;
+
+ const vector <EdgeRange> &GetMappedPath() const;
+
+ VertexId StartVertex() const;
+
+ VertexId EndVertex() const;
+
+ EdgeId StartEdge() const;
+
+ EdgeId EndEdge() const;
+
+ const EdgeRange &Back() const;
+
+ const EdgeRange &Front() const;
+
+ void CutToVertex(VertexId path_start);
+
+ Sequence CorrectSequence() const;
+
+ size_t size() const;
+
+ string description_;
+
+ string CompareToReference(const string &reference_part) const;
+
+// void CloseEnd();
+//
+// void CloseStart();
+
+ private:
+ bool CheckConnect(const EdgeRange &r1, const EdgeRange &r2) const;
+ bool CheckConnect(const vector <EdgeRange> &path) const;
+ vector<EdgeRange> GenerateMappingPath(const vector<EdgeId> &path) const;
+
+ const Graph &graph_;
+ omnigraph::Range initial_range;
+ vector <EdgeRange> mapped_path;
+ DECL_LOGGER("ConsistentMapping");
+ };
+
+ ostream & operator<<(ostream& os, const ConsistentMapping& cm);
+}
diff --git a/src/projects/truseq_analysis/main.cpp b/src/projects/truseq_analysis/main.cpp
new file mode 100644
index 0000000..3cd961b
--- /dev/null
+++ b/src/projects/truseq_analysis/main.cpp
@@ -0,0 +1,95 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+/*
+ * TruSeq Analysis Main
+ */
+#include "dev_support/logger/log_writers.hpp"
+#include "dev_support/segfault_handler.hpp"
+#include "dev_support/memory_limit.hpp"
+#include "dev_support/copy_file.hpp"
+#include "pipeline/config_struct.hpp"
+#include "analysis_pipeline.hpp"
+
+void load_config(string cfg_filename) {
+ path::CheckFileExistenceFATAL(cfg_filename);
+
+ cfg::create_instance(cfg_filename);
+
+ if (!cfg::get().project_name.empty()) {
+ make_dir(cfg::get().output_base + cfg::get().project_name);
+ }
+
+ make_dir(cfg::get().output_dir);
+ make_dir(cfg::get().tmp_dir);
+
+ if (cfg::get().developer_mode)
+ make_dir(cfg::get().output_saves);
+
+ make_dir(cfg::get().temp_bin_reads_path);
+}
+
+void create_console_logger(string cfg_filename) {
+ using namespace logging;
+
+ string log_props_file = cfg::get().log_filename;
+
+ if (!path::FileExists(log_props_file))
+ log_props_file = path::append_path(path::parent_path(cfg_filename), cfg::get().log_filename);
+
+ logger *lg = create_logger(path::FileExists(log_props_file) ? log_props_file : "");
+ lg->add_writer(std::make_shared<console_writer>());
+ attach_logger(lg);
+}
+
+int main(int /*argc*/, char** argv) {
+ perf_counter pc;
+
+ const size_t GB = 1 << 30;
+
+ srand(42);
+ srandom(42);
+
+ try {
+ using namespace debruijn_graph;
+
+ string cfg_filename = argv[1];
+
+ load_config (cfg_filename);
+ create_console_logger(cfg_filename);
+
+ VERIFY(cfg::get().K >= runtime_k::MIN_K && cfg::get().K < runtime_k::MAX_K);
+ VERIFY(cfg::get().K % 2 != 0);
+
+ // read configuration file (dataset path etc.)
+
+ limit_memory(cfg::get().max_memory * GB);
+
+ // assemble it!
+ INFO("Assembling dataset (" << cfg::get().dataset_file << ") with K=" << cfg::get().K);
+
+ spades::run_truseq_analysis();
+ } catch (std::bad_alloc const& e) {
+ std::cerr << "Not enough memory to run SPAdes. " << e.what() << std::endl;
+ return EINTR;
+ } catch (std::exception const& e) {
+ std::cerr << "Exception caught " << e.what() << std::endl;
+ return EINTR;
+ } catch (...) {
+ std::cerr << "Unknown exception caught " << std::endl;
+ return EINTR;
+ }
+
+ unsigned ms = (unsigned)pc.time_ms();
+ unsigned secs = (ms / 1000) % 60;
+ unsigned mins = (ms / 1000 / 60) % 60;
+ unsigned hours = (ms / 1000 / 60 / 60);
+ INFO("Assembling time: " << hours << " hours " << mins << " minutes " << secs << " seconds");
+
+ // OK
+ return 0;
+}
diff --git a/src/scaffold_correction/CMakeLists.txt b/src/scaffold_correction/CMakeLists.txt
deleted file mode 100644
index d35119a..0000000
--- a/src/scaffold_correction/CMakeLists.txt
+++ /dev/null
@@ -1,24 +0,0 @@
-############################################################################
-# Copyright (c) 2015 Saint Petersburg State University
-# All Rights Reserved
-# See file LICENSE for details.
-############################################################################
-
-project(moleculo CXX)
-
-
-add_executable(scaffold_correction
- main.cpp)
-target_include_directories(scaffold_correction PRIVATE ${CMAKE_SOURCE_DIR}/debruijn)
-target_link_libraries(scaffold_correction debruijn)
-
-if (SPADES_STATIC_BUILD)
- set_target_properties(scaffold_correction PROPERTIES LINK_SEARCH_END_STATIC 1)
-endif()
-
-install(TARGETS scaffold_correction
- DESTINATION bin
- COMPONENT runtime)
-install(DIRECTORY "${SPADES_CFG_DIR}/scaffold_correction"
- DESTINATION share/spades/configs
- FILES_MATCHING PATTERN "*.info.template")
diff --git a/src/scaffold_correction/main.cpp b/src/scaffold_correction/main.cpp
deleted file mode 100644
index de820c6..0000000
--- a/src/scaffold_correction/main.cpp
+++ /dev/null
@@ -1,171 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-/*
- * Assembler Main
- */
-#include "standard.hpp"
-#include "logger/log_writers.hpp"
-
-#include "segfault_handler.hpp"
-#include "stacktrace.hpp"
-#include "memory_limit.hpp"
-#include "copy_file.hpp"
-#include "perfcounter.hpp"
-#include "runtime_k.hpp"
-#include "scaffold_correction.hpp"
-
-#include "config_struct.hpp"
-
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <unistd.h>
-#include "scaffold_correction.hpp"
-
-void link_output(std::string const& link_name) {
- if (!cfg::get().run_mode)
- return;
-
- std::string link = cfg::get().output_root + link_name;
- unlink(link.c_str());
- if (symlink(cfg::get().output_suffix.c_str(), link.c_str()) != 0)
- WARN( "Symlink to \"" << link << "\" launch failed");
-}
-
-void link_previous_run(std::string const& previous_link_name, std::string const& link_name) {
- if (!cfg::get().run_mode)
- return;
-
- char buf[255];
-
- std::string link = cfg::get().output_dir + previous_link_name;
- unlink(link.c_str());
- ssize_t count = readlink((cfg::get().output_root + link_name).c_str(), buf, sizeof(buf) - 1);
- if (count >= 0){
- buf[count] = '\0';
- std::string previous_run("../");
- previous_run = previous_run + buf;
- if (symlink(previous_run.c_str(), link.c_str()) != 0) {
- DEBUG( "Symlink to \"" << link << "\" launch failed : " << previous_run);
- }
- } else {
- DEBUG( "Symlink to \"" << link << "\" launch failed");
- }
-}
-
-struct on_exit_output_linker {
- on_exit_output_linker(std::string const& link_name) :
- link_name_(link_name) { }
-
- ~on_exit_output_linker() {
- link_previous_run("previous", link_name_);
- link_output(link_name_);
- }
-
-private:
- std::string link_name_;
-};
-
-void copy_configs(string cfg_filename, string to) {
- if (!cfg::get().run_mode)
- return;
-
- using namespace debruijn_graph;
-
- if (!make_dir(to)) {
- WARN("Could not create files use in /tmp directory");
- }
- path::copy_files_by_ext(path::parent_path(cfg_filename), to, ".info", true);
-}
-
-void load_config(string cfg_filename) {
- path::CheckFileExistenceFATAL(cfg_filename);
-
- cfg::create_instance(cfg_filename);
-
- if (!cfg::get().project_name.empty()) {
- make_dir(cfg::get().output_base + cfg::get().project_name);
- }
-
- make_dir(cfg::get().output_root);
- make_dir(cfg::get().tmp_dir);
-
- make_dir(cfg::get().output_dir);
- if (cfg::get().developer_mode)
- make_dir(cfg::get().output_saves);
-
- make_dir(cfg::get().temp_bin_reads_path);
-
- string path_to_copy = path::append_path(cfg::get().output_dir, "configs");
- copy_configs(cfg_filename, path_to_copy);
-}
-
-void create_console_logger(string cfg_filename) {
- using namespace logging;
-
- string log_props_file = cfg::get().log_filename;
-
- if (!path::FileExists(log_props_file))
- log_props_file = path::append_path(path::parent_path(cfg_filename), cfg::get().log_filename);
-
- logger *lg = create_logger(path::FileExists(log_props_file) ? log_props_file : "");
- lg->add_writer(std::make_shared<console_writer>());
- attach_logger(lg);
-}
-
-int main(int /*argc*/, char** argv) {
- perf_counter pc;
-
- const size_t GB = 1 << 30;
-
- segfault_handler sh(bind(link_output, "latest"));
-
- srand(42);
- srandom(42);
-
- try {
- using namespace debruijn_graph;
-
- string cfg_filename = argv[1];
-
- load_config (cfg_filename);
- create_console_logger(cfg_filename);
-
-// on_exit_output_linker try_linker("latest");
-
- VERIFY(cfg::get().K >= runtime_k::MIN_K && cfg::get().K < runtime_k::MAX_K);
- VERIFY(cfg::get().K % 2 != 0);
-
- // read configuration file (dataset path etc.)
-
- limit_memory(cfg::get().max_memory * GB);
-
- // assemble it!
- INFO("Assembling dataset (" << cfg::get().dataset_file << ") with K=" << cfg::get().K);
-
- spades::run_scaffold_correction();
-
- link_output("latest_success");
- } catch (std::bad_alloc const& e) {
- std::cerr << "Not enough memory to run SPAdes. " << e.what() << std::endl;
- return EINTR;
- } catch (std::exception const& e) {
- std::cerr << "Exception caught " << e.what() << std::endl;
- return EINTR;
- } catch (...) {
- std::cerr << "Unknown exception caught " << std::endl;
- return EINTR;
- }
-
- unsigned ms = (unsigned)pc.time_ms();
- unsigned secs = (ms / 1000) % 60;
- unsigned mins = (ms / 1000 / 60) % 60;
- unsigned hours = (ms / 1000 / 60 / 60);
- INFO("Assembling time: " << hours << " hours " << mins << " minutes " << secs << " seconds");
-
- // OK
- return 0;
-}
diff --git a/src/scaffold_correction/scaffold_correction.hpp b/src/scaffold_correction/scaffold_correction.hpp
deleted file mode 100644
index 0e2309f..0000000
--- a/src/scaffold_correction/scaffold_correction.hpp
+++ /dev/null
@@ -1,332 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-#include "io/osequencestream.hpp"
-#include "stage.hpp"
-#include "graph_pack.hpp"
-#include "omni/path_processor.hpp"
-#include "construction.hpp"
-#include "config_struct.hpp"
-#include "omni/dijkstra_tools/dijkstra_algorithm.hpp"
-#include "omni/dijkstra_tools/dijkstra_helper.hpp"
-
-namespace scaffold_correction {
- typedef debruijn_graph::ConjugateDeBruijnGraph Graph;
-
- class PathSideSimilarityComparator {
- private:
- typedef Graph::EdgeId EdgeId;
- const Graph &graph_;
- vector<EdgeId> initial_path_;
-
- public:
- size_t CalculateDifference(vector<EdgeId> path) {
- size_t lside = 0;
- size_t rside = 0;
- size_t min_size = std::min(path.size(), initial_path_.size());
- while(lside < min_size && path[lside] == initial_path_[lside])
- lside++;
- while(rside < min_size && lside + rside < min_size &&
- path[path.size() - 1 - rside] == initial_path_[initial_path_.size() - 1 - rside])
- rside++;
- size_t result = 0;
- for(size_t i = lside; i < path.size() - rside; i++) {
- result += graph_.length(path[i]);
- }
- return result;
- }
-
- PathSideSimilarityComparator(const Graph &graph, vector<EdgeId> path) : graph_(graph), initial_path_(path) {
- }
-
-
- bool operator()(const vector<EdgeId> &path1, const vector<EdgeId> &path2) {
- return CalculateDifference(path1) < CalculateDifference(path2);
- }
- };
-
- class CarefulPathFixer {
- private:
- typedef Graph::EdgeId EdgeId;
- typedef Graph::VertexId VertexId;
- const Graph &graph_;
- size_t max_cut_length_;
- size_t max_insert_;
-
- bool Consistent(EdgeId e1, EdgeId e2) const {
- return graph_.EdgeEnd(e1) == graph_.EdgeStart(e2);
- }
-
- size_t StepBack(size_t pos, const vector<EdgeId> &edges) const {
- size_t step_size = 0;
- while(pos > 0 && Consistent(edges[pos - 1], edges[pos]) && step_size + graph_.length(edges[pos]) <= max_cut_length_) {
- step_size += graph_.length(edges[pos]);
- pos -= 1;
- }
- return pos;
- }
-
- size_t StepForward(size_t pos, const vector<EdgeId> &edges) const {
- size_t step_size = 0;
- while(pos + 1 < edges.size() && Consistent(edges[pos], edges[pos + 1])&& step_size + graph_.length(edges[pos]) <= max_cut_length_) {
- step_size += graph_.length(edges[pos]);
- pos += 1;
- }
- return pos;
- }
-
-
- void PrintPath(vector<EdgeId> path) const {
- for(size_t i = 0; i < path.size(); i++) {
- TRACE(graph_.EdgeNucls(path[i]));
- }
- }
-
-
- vector<EdgeId> TryCloseGap(VertexId v1, VertexId v2, const vector<EdgeId> &path) const {
- if (v1 == v2)
- return vector<EdgeId>();
- TRACE(
- "Trying to close gap between v1=" << graph_.int_id(v1) << " and v2=" << graph_.int_id(v2));
- typedef omnigraph::DijkstraHelper<Graph>::PathIgnoringDijkstraSettings DS;
- size_t max_path_length = max_insert_ + 2 * max_cut_length_;
- DS ds(DS::LC(graph_, path), DS::VPrC(max_path_length), DS::VPuC(max_path_length), DS::NIF(graph_));
- omnigraph::Dijkstra<Graph, DS> dj(graph_, ds);
- dj.Run(v1);
- if(dj.DistanceCounted(v2) && dj.GetDistance(v2) <= max_insert_) {
- vector<EdgeId> result = dj.GetShortestPathTo(v2);
- VERIFY(graph_.EdgeStart(result.front()) == v1);
- VERIFY(graph_.EdgeEnd(result.back()) == v2);
- TRACE("Gap closed");
- TRACE("Cumulative closure length is " << CumulativeLength(graph_, result));
- TRACE("Difference from initial path: " << dj.GetDistance(v2));
- return result;
- } else {
- TRACE("Failed to find closing path");
- return vector<EdgeId>();
- }
-/*
- PathSideSimilarityComparator comparator(graph_, path);
- omnigraph::BestPathStorage<Graph, PathSideSimilarityComparator> storage(graph_, comparator);
-// omnigraph::PathStorageCallback<Graph> path_store(graph_);
- //todo reduce value after investigation
- omnigraph::PathProcessor<Graph> path_processor(graph_, 0, max_insert_, v1, v2, 1000000, storage);
- path_processor.Process();
- TRACE(graph_.VertexNucls(v1));
- TRACE(graph_.VertexNucls(v2));
- size_t error_code = path_processor.Process();
- TRACE("Error code: " << error_code);
-
- if (storage.size() == 0) {
- TRACE("Failed to find closing path");
- return vector<EdgeId>();
- } else if (storage.size() == 1) {
- TRACE("Unique closing path found");
- } else {
- TRACE("Several closing paths found(" << path_store.paths().size() << "), first chosen");
- }
- auto tmp = path_store.paths();
- TRACE("Number of paths: " << tmp.size());
-// for(auto it = tmp.begin(); it != tmp.end(); ++it) {
-// TRACE(ConstructSequence(*it));
-// }
- vector<EdgeId> answer = storage.BestPath();
- TRACE("Gap closed");
- TRACE( "Cumulative closure length is " << CumulativeLength(graph_, answer));
- TRACE( "Difference from initial path: " << comparator.CalculateDifference(answer));
- return answer;
-*/
- }
-
- public:
- CarefulPathFixer(const Graph &graph, size_t max_cut_length, size_t max_insert)
- : graph_(graph), max_cut_length_(max_cut_length), max_insert_(max_insert) {
- }
-
- vector<EdgeId> TryFixPath(const vector<EdgeId>& edges) const {
- vector<EdgeId> result;
- if (edges.empty()) {
- return vector<EdgeId>();
- }
- result.push_back(edges[0]);
- for (size_t i = 1; i < edges.size(); ++i) {
- if (!Consistent(result.back(), edges[i])) {
- size_t lindex = StepBack(result.size() - 1, result);
- size_t rindex = StepForward(i, edges);
- vector<EdgeId> current_path(result.begin() + lindex + 1, result.end());
- current_path.insert(current_path.end(), edges.begin() + i, edges.begin() + rindex);
- vector<EdgeId> closure = TryCloseGap(graph_.EdgeEnd(result[lindex]), graph_.EdgeStart(edges[rindex]), current_path);
- if(closure.size() != 0 || Consistent(result[lindex], edges[rindex])) {
- result.resize(lindex + 1);
- VERIFY(closure.size() == 0 || Consistent(result.back(), closure.front()));
- result.insert(result.end(), closure.begin(), closure.end());
- i = rindex;
- VERIFY(Consistent(result.back(), edges[i]));
- }
- }
- result.push_back(edges[i]);
- }
- return result;
- }
- DECL_LOGGER("CarefulPathFixer")
- };
-
- class ScaffoldCorrector {
- typedef debruijn_graph::conj_graph_pack graph_pack;
- private:
-
- const graph_pack& gp_;
- const CarefulPathFixer &fixer_;
-
-
- bool CheckPath(const vector<Graph::EdgeId> &path) const {
- if(path.size() == 0)
- return false;
- for(size_t i = 1; i < path.size(); i++) {
- if(gp_.g.EdgeEnd(path[i - 1]) != gp_.g.EdgeStart(path[i])) {
- return false;
- }
- }
- return true;
- }
-
- Sequence ConstructSequence(const vector<Graph::EdgeId> &path) const {
- Sequence result = gp_.g.EdgeNucls(path[0]);
- for(size_t i = 1; i < path.size(); i++) {
- result = result + gp_.g.EdgeNucls(path[i]).Subseq(gp_.k_value);
- }
- return result;
- }
-
- public:
- ScaffoldCorrector(const graph_pack &gp, const CarefulPathFixer &fixer) : gp_(gp), fixer_(fixer) {
- }
-
- Sequence correct(const vector<Sequence> &scaffold) const {
- auto mapper = debruijn_graph::MapperInstance(gp_);
- MappingPath <debruijn_graph::EdgeId> path;
- for(auto it = scaffold.begin(); it != scaffold.end(); ++it) {
- path.join(mapper->MapSequence(*it));
- }
- vector<Graph::EdgeId> corrected_path = fixer_.TryFixPath(path.simple_path());
- if(CheckPath(corrected_path)) {
- return ConstructSequence(corrected_path);
- } else {
- return Sequence();
- }
- }
- };
-}
-
-namespace spades {
- class ScaffoldCorrectionStage : public AssemblyStage {
- public:
- typedef debruijn_graph::debruijn_config::scaffold_correction Config;
- private:
- size_t k_;
- std::string output_file_;
- const Config &config_;
- public:
- ScaffoldCorrectionStage(size_t k, string output_file,
- const Config &config) :
- AssemblyStage("ScaffoldCorrection", "scaffold_correction"),
- k_(k), output_file_(output_file), config_(config) {
- }
-
- vector<Sequence> CollectScaffoldParts(const io::SingleRead &scaffold) const {
- vector<Sequence> result;
- for(size_t i = 0; i < scaffold.size(); i++) {
- size_t j = i;
- while(j < scaffold.size() && is_nucl(scaffold.GetSequenceString()[j])) {
- j++;
- }
- if(j > i) {
- result.push_back(scaffold.Substr(i, j).sequence());
- i = j - 1;
- }
- }
- return result;
- }
-
- void OutputResults(const vector<io::SingleRead> &results) {
- io::osequencestream_simple oss(output_file_);
- for(size_t i = 0; i < results.size(); i++) {
- string sequence = results[i].GetSequenceString();
- if(sequence != "") {
- oss.set_header(results[i].name());
- oss << sequence;
- }
- }
- }
-
- vector<io::SingleRead> ReadScaffolds(const string &scaffolds_file) {
- io::FileReadStream scaffold_stream(scaffolds_file);
- vector<io::SingleRead> scaffolds;
- while(!scaffold_stream.eof()) {
- io::SingleRead scaffold;
- scaffold_stream >> scaffold;
- scaffolds.push_back(scaffold);
- }
- return scaffolds;
- }
-
- vector<io::SingleRead> RunParallelCorrection(const vector<io::SingleRead> &scaffolds, const scaffold_correction::ScaffoldCorrector &corrector) {
- vector<io::SingleRead> results(scaffolds.size());
-#pragma omp parallel for
- for(size_t i = 0; i < scaffolds.size(); i++) {
- auto scaffold = scaffolds[i];
- std::string name = scaffold.name();
- vector<Sequence> part_list = CollectScaffoldParts(scaffold);
- TRACE("Correcting scaffold " << name);
- TRACE("Number of parts: " << part_list.size());
- Sequence result = corrector.correct(part_list);
- if (result.size() != 0) {
- TRACE("Correction successful");
- results[i] = io::SingleRead(name, result.str());
- } else if (config_.output_unfilled) {
- TRACE("Correction unsuccessful. Using uncorrected scaffold");
- results[i] = scaffold;
- }
- }
- return results;
- }
-
- void run(debruijn_graph::conj_graph_pack &graph_pack, const char *) {
- INFO("Correcting scaffolds from " << config_.scaffolds_file);
- scaffold_correction::CarefulPathFixer fixer(graph_pack.g, config_.max_cut_length, config_.max_insert);
- scaffold_correction::ScaffoldCorrector corrector(graph_pack, fixer);
- vector<io::SingleRead> scaffolds = ReadScaffolds(config_.scaffolds_file);
- vector<io::SingleRead> results = RunParallelCorrection(scaffolds, corrector);
- OutputResults(results);
- INFO(scaffolds.size() << " scaffolds processed");
- INFO("Corrected scaffolds written to " << output_file_);
- }
- DECL_LOGGER("ScaffoldCorrectionStage")
- };
-
- void run_scaffold_correction() {
- INFO("Scaffold correction started");
-
- debruijn_graph::conj_graph_pack conj_gp(cfg::get().K,
- cfg::get().tmp_dir,
- cfg::get().ds.reads.lib_count(),
- cfg::get().ds.reference_genome,
- cfg::get().flanking_range,
- cfg::get().pos.max_mapping_gap,
- cfg::get().pos.max_gap_diff);
- StageManager manager({cfg::get().developer_mode,
- cfg::get().load_from,
- cfg::get().output_saves});
- manager.add(new debruijn_graph::Construction())
- .add(new ScaffoldCorrectionStage(cfg::get().K, cfg::get().output_dir + "corrected_scaffolds.fasta", cfg::get().sc_cor));
- INFO("Output directory: " << cfg::get().output_dir);
- conj_gp.kmer_mapper.Attach();
- manager.run(conj_gp, cfg::get().entry_point.c_str());
- INFO("Scaffold correction finished.");
- }
-
-}
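
The file removed above implemented scaffold correction: each scaffold is split into nucleotide stretches, mapped onto the assembly graph, and inconsistent junctions in the resulting edge path are bridged with short connecting paths found by a Dijkstra search (CarefulPathFixer::TryFixPath / TryCloseGap). Below is a simplified, hypothetical sketch of that fix-up loop; consistent() and close_gap() stand in for the graph API, and the StepBack()/StepForward() flank trimming of the original is omitted.

    #include <cstddef>
    #include <functional>
    #include <vector>

    // Sketch of the TryFixPath idea: walk the edge path and, whenever two
    // neighbouring edges do not share a vertex, try to bridge them.
    template <class Edge>
    std::vector<Edge> try_fix_path(
            const std::vector<Edge> &edges,
            const std::function<bool(const Edge &, const Edge &)> &consistent,
            const std::function<std::vector<Edge>(const Edge &, const Edge &)> &close_gap) {
        std::vector<Edge> result;
        if (edges.empty())
            return result;
        result.push_back(edges.front());
        for (size_t i = 1; i < edges.size(); ++i) {
            if (!consistent(result.back(), edges[i])) {
                // On success the closure restores consistency; on failure the
                // inconsistent edge is appended anyway, as in the original code.
                std::vector<Edge> closure = close_gap(result.back(), edges[i]);
                result.insert(result.end(), closure.begin(), closure.end());
            }
            result.push_back(edges[i]);
        }
        return result;
    }
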
diff --git a/src/spades_pipeline/corrector_logic.py b/src/spades_pipeline/corrector_logic.py
index 2781747..158d6fb 100644
--- a/src/spades_pipeline/corrector_logic.py
+++ b/src/spades_pipeline/corrector_logic.py
@@ -31,7 +31,7 @@ def prepare_config_corr(filename, cfg, ext_python_modules_home):
data["max_nthreads"] = cfg.max_threads
data["bwa"] = cfg.bwa
file_c = open(filename, 'w')
- pyyaml.dump(data, file_c)
+ pyyaml.dump(data, file_c, default_flow_style = False, default_style='"', width=100500)
file_c.close()
@@ -49,15 +49,6 @@ def run_corrector(configs_dir, execution_home, cfg,
shutil.rmtree(dst_configs)
dir_util.copy_tree(os.path.join(configs_dir, "corrector"), dst_configs, preserve_times=False)
cfg_file_name = os.path.join(dst_configs, "corrector.info")
- # removing template configs
- for root, dirs, files in os.walk(dst_configs):
- for cfg_file in files:
- cfg_file = os.path.join(root, cfg_file)
- if cfg_file.endswith('.template'):
- if os.path.isfile(cfg_file.split('.template')[0]):
- os.remove(cfg_file)
- else:
- os.rename(cfg_file, cfg_file.split('.template')[0])
cfg.tmp_dir = support.get_tmp_dir(prefix="corrector_")
diff --git a/src/spades_pipeline/dipspades_logic.py b/src/spades_pipeline/dipspades_logic.py
index f671518..b85ea95 100755
--- a/src/spades_pipeline/dipspades_logic.py
+++ b/src/spades_pipeline/dipspades_logic.py
@@ -68,9 +68,6 @@ def prepare_configs(src_config_dir, ds_args, log):
copy_configs(src_config_dir, config_dir)
#log.info("dipSPAdes configs were copied to " + config_dir)
config_fname = os.path.join(config_dir, "config.info")
- if not os.path.exists(config_fname):
- support.check_file_existence(config_fname + ".template")
- os.rename(config_fname + ".template", config_fname)
return os.path.abspath(config_fname)
diff --git a/src/spades_pipeline/hammer_logic.py b/src/spades_pipeline/hammer_logic.py
index 213bee3..1e2b035 100644
--- a/src/spades_pipeline/hammer_logic.py
+++ b/src/spades_pipeline/hammer_logic.py
@@ -13,9 +13,11 @@ import sys
import glob
import shutil
import support
+import options_storage
import process_cfg
from site import addsitedir
from distutils import dir_util
+from os.path import isfile
def compress_dataset_files(dataset_data, ext_python_modules_home, max_threads, log):
@@ -26,10 +28,12 @@ def compress_dataset_files(dataset_data, ext_python_modules_home, max_threads, l
if key.endswith('reads'):
compressed_reads_filenames = []
for reads_file in value:
- if not os.path.isfile(reads_file):
+ compressed_reads_filenames.append(reads_file + ".gz")
+ if not isfile(reads_file):
+ if isfile(compressed_reads_filenames[-1]):
+ continue # already compressed (--continue/--restart-from case)
support.error('something went wrong and file with corrected reads (' + reads_file + ') is missing!', log)
to_compress.append(reads_file)
- compressed_reads_filenames.append(reads_file + ".gz")
reads_library[key] = compressed_reads_filenames
if len(to_compress):
pigz_path = support.which('pigz')
@@ -87,50 +91,57 @@ def prepare_config_ih(filename, cfg, ext_python_modules_home):
data["output_dir"] = cfg.output_dir
data["hard_memory_limit"] = cfg.max_memory
data["max_nthreads"] = cfg.max_threads
- pyyaml.dump(data, open(filename, 'w'))
+ pyyaml.dump(data, open(filename, 'w'), default_flow_style = False, default_style='"', width=100500)
def run_hammer(corrected_dataset_yaml_filename, configs_dir, execution_home, cfg,
- not_used_dataset_data, ext_python_modules_home, log):
+ dataset_data, ext_python_modules_home, only_compressing_is_needed, log):
addsitedir(ext_python_modules_home)
if sys.version.startswith('2.'):
import pyyaml2 as pyyaml
elif sys.version.startswith('3.'):
import pyyaml3 as pyyaml
- dst_configs = os.path.join(cfg.output_dir, "configs")
- if os.path.exists(dst_configs):
- shutil.rmtree(dst_configs)
- if cfg.iontorrent:
- dir_util.copy_tree(os.path.join(configs_dir, "ionhammer"), dst_configs, preserve_times=False)
- cfg_file_name = os.path.join(dst_configs, "ionhammer.cfg")
- else:
- dir_util.copy_tree(os.path.join(configs_dir, "hammer"), dst_configs, preserve_times=False)
- cfg_file_name = os.path.join(dst_configs, "config.info")
- # removing template configs
- for root, dirs, files in os.walk(dst_configs):
- for cfg_file in files:
- cfg_file = os.path.join(root, cfg_file)
- if cfg_file.endswith('.template'):
- if os.path.isfile(cfg_file.split('.template')[0]):
- os.remove(cfg_file)
- else:
- os.rename(cfg_file, cfg_file.split('.template')[0])
-
- cfg.tmp_dir = support.get_tmp_dir(prefix="hammer_")
- if cfg.iontorrent:
- prepare_config_ih(cfg_file_name, cfg, ext_python_modules_home)
- binary_name = "ionhammer"
+
+ # not all reads need processing
+ if support.get_lib_ids_by_type(dataset_data, options_storage.LONG_READS_TYPES):
+ not_used_dataset_data = support.get_libs_by_type(dataset_data, options_storage.LONG_READS_TYPES)
+ to_correct_dataset_data = support.rm_libs_by_type(dataset_data, options_storage.LONG_READS_TYPES)
+ to_correct_dataset_yaml_filename = os.path.join(cfg.output_dir, "to_correct.yaml")
+ pyyaml.dump(to_correct_dataset_data, open(to_correct_dataset_yaml_filename, 'w'), default_flow_style = False, default_style='"', width=100500)
+ cfg.dataset_yaml_filename = to_correct_dataset_yaml_filename
else:
- prepare_config_bh(cfg_file_name, cfg, log)
- binary_name = "hammer"
+ not_used_dataset_data = None
+
+ if not only_compressing_is_needed:
+ dst_configs = os.path.join(cfg.output_dir, "configs")
+ if os.path.exists(dst_configs):
+ shutil.rmtree(dst_configs)
+ if cfg.iontorrent:
+ dir_util.copy_tree(os.path.join(configs_dir, "ionhammer"), dst_configs, preserve_times=False)
+ cfg_file_name = os.path.join(dst_configs, "ionhammer.cfg")
+ else:
+ dir_util.copy_tree(os.path.join(configs_dir, "hammer"), dst_configs, preserve_times=False)
+ cfg_file_name = os.path.join(dst_configs, "config.info")
- command = [os.path.join(execution_home, binary_name),
- os.path.abspath(cfg_file_name)]
+ cfg.tmp_dir = support.get_tmp_dir(prefix="hammer_")
+ if cfg.iontorrent:
+ prepare_config_ih(cfg_file_name, cfg, ext_python_modules_home)
+ binary_name = "ionhammer"
+ else:
+ prepare_config_bh(cfg_file_name, cfg, log)
+ binary_name = "hammer"
+
+ command = [os.path.join(execution_home, binary_name),
+ os.path.abspath(cfg_file_name)]
+
+ log.info("\n== Running read error correction tool: " + ' '.join(command) + "\n")
+ support.sys_call(command, log)
+ if not os.path.isfile(corrected_dataset_yaml_filename):
+ support.error("read error correction finished abnormally: " + corrected_dataset_yaml_filename + " not found!")
+ else:
+ log.info("\n===== Skipping %s (already processed). \n" % "read error correction tool")
+ support.continue_from_here(log)
- log.info("\n== Running read error correction tool: " + ' '.join(command) + "\n")
- support.sys_call(command, log)
- if not os.path.isfile(corrected_dataset_yaml_filename):
- support.error("read error correction finished abnormally: " + corrected_dataset_yaml_filename + " not found!")
corrected_dataset_data = pyyaml.load(open(corrected_dataset_yaml_filename, 'r'))
remove_not_corrected_reads(cfg.output_dir)
is_changed = False
@@ -141,8 +152,8 @@ def run_hammer(corrected_dataset_yaml_filename, configs_dir, execution_home, cfg
is_changed = True
corrected_dataset_data += not_used_dataset_data
if is_changed:
- pyyaml.dump(corrected_dataset_data, open(corrected_dataset_yaml_filename, 'w'))
+ pyyaml.dump(corrected_dataset_data, open(corrected_dataset_yaml_filename, 'w'), default_flow_style = False, default_style='"', width=100500)
log.info("\n== Dataset description file was created: " + corrected_dataset_yaml_filename + "\n")
if os.path.isdir(cfg.tmp_dir):
- shutil.rmtree(cfg.tmp_dir)
\ No newline at end of file
+ shutil.rmtree(cfg.tmp_dir)
diff --git a/src/spades_pipeline/options_storage.py b/src/spades_pipeline/options_storage.py
index 8b39abf..01fb868 100644
--- a/src/spades_pipeline/options_storage.py
+++ b/src/spades_pipeline/options_storage.py
@@ -10,6 +10,7 @@
import os
import sys
import support
+from os.path import basename
SUPPORTED_PYTHON_VERSIONS = ['2.4', '2.5', '2.6', '2.7', '3.2', '3.3', '3.4', '3.5']
# allowed reads extensions for BayesHammer and for the whole SPAdes pipeline
@@ -57,7 +58,10 @@ output_dir = None
single_cell = False
iontorrent = False
meta = False
+rna = False
+large_genome = False
test_mode = False
+plasmid = False
# pipeline options
only_error_correction = False
@@ -119,7 +123,7 @@ dict_of_prefixes = dict()
dict_of_rel2abs = dict()
# list of spades.py options
-long_options = "12= threads= memory= tmp-dir= iterations= phred-offset= sc iontorrent meta "\
+long_options = "12= threads= memory= tmp-dir= iterations= phred-offset= sc iontorrent meta large-genome rna plasmid "\
"only-error-correction only-assembler "\
"disable-gzip-output disable-gzip-output:false disable-rr disable-rr:false " \
"help version test debug debug:false reference= config-file= dataset= "\
@@ -145,34 +149,50 @@ reads_options = list(map(lambda x: "--" + x.split('=')[0], reads_options))
reads_options += OLD_STYLE_READS_OPTIONS
+def get_mode():
+ mode = None
+ if basename(sys.argv[0]) == "rnaspades.py":
+ mode = 'rna'
+ elif basename(sys.argv[0]) == "plasmidspades.py":
+ mode = 'plasmid'
+ elif basename(sys.argv[0]) == "metaspades.py":
+ mode = 'meta'
+ return mode
+
+
def version(spades_version, mode=None):
sys.stderr.write("SPAdes v" + str(spades_version))
+ if mode is None:
+ mode = get_mode()
if mode is not None:
- sys.stderr.write(" (" + mode + " mode)")
+ sys.stderr.write(" [" + mode + "SPAdes mode]")
sys.stderr.write("\n")
sys.stderr.flush()
-def usage(spades_version, show_hidden=False, dipspades=False):
- if not dipspades:
- sys.stderr.write("SPAdes genome assembler v" + str(spades_version) + "\n\n")
- else:
- sys.stderr.write("dipSPAdes v" + str(spades_version) +
- ": genome assembler designed for diploid genomes with high heterozygosity rate\n\n")
+def usage(spades_version, show_hidden=False, mode=None):
+ sys.stderr.write("SPAdes genome assembler v" + str(spades_version))
+ if mode is None:
+ mode = get_mode()
+ if mode is not None:
+ sys.stderr.write(" [" + mode + "SPAdes mode]")
+ sys.stderr.write("\n\n")
sys.stderr.write("Usage: " + str(sys.argv[0]) + " [options] -o <output_dir>" + "\n")
sys.stderr.write("" + "\n")
sys.stderr.write("Basic options:" + "\n")
sys.stderr.write("-o\t<output_dir>\tdirectory to store all the resulting files (required)" + "\n")
- if not dipspades:
+ if mode != "dip":
sys.stderr.write("--sc\t\t\tthis flag is required for MDA (single-cell) data" + "\n")
sys.stderr.write("--meta\t\t\tthis flag is required for metagenomic sample data" + "\n")
+ sys.stderr.write("--plasmid\tRuns plasmidSPAdes pipeline for plasmid detection \n");
+
sys.stderr.write("--iontorrent\t\tthis flag is required for IonTorrent data" + "\n")
sys.stderr.write("--test\t\t\truns SPAdes on toy dataset" + "\n")
sys.stderr.write("-h/--help\t\tprints this usage message" + "\n")
sys.stderr.write("-v/--version\t\tprints version" + "\n")
sys.stderr.write("" + "\n")
- if not dipspades:
+ if mode != "dip":
sys.stderr.write("Input data:" + "\n")
else:
sys.stderr.write("Input reads:" + "\n")
@@ -222,18 +242,18 @@ def usage(spades_version, show_hidden=False, dipspades=False):
sys.stderr.write("--nanopore\t<filename>\tfile with Nanopore reads\n")
sys.stderr.write("--trusted-contigs\t<filename>\tfile with trusted contigs\n")
sys.stderr.write("--untrusted-contigs\t<filename>\tfile with untrusted contigs\n")
- if dipspades:
+ if mode == "dip":
sys.stderr.write("Input haplocontigs:" + "\n")
sys.stderr.write("--hap\t<filename>\tfile with haplocontigs" + "\n")
sys.stderr.write("" + "\n")
sys.stderr.write("Pipeline options:" + "\n")
- if not dipspades:
+ if mode != "dip":
sys.stderr.write("--only-error-correction\truns only read error correction"\
" (without assembling)" + "\n")
sys.stderr.write("--only-assembler\truns only assembling (without read error"\
" correction)" + "\n")
- if not dipspades:
+ if mode != "dip":
sys.stderr.write("--careful\t\ttries to reduce number"\
" of mismatches and short indels" + "\n")
sys.stderr.write("--continue\t\tcontinue run from the last available check-point" + "\n")
@@ -243,7 +263,7 @@ def usage(spades_version, show_hidden=False, dipspades=False):
sys.stderr.write("--disable-rr\t\tdisables repeat resolution stage"\
" of assembling" + "\n")
- if dipspades:
+ if mode == "dip":
sys.stderr.write("" + "\n")
sys.stderr.write("DipSPAdes options:" + "\n")
sys.stderr.write("--expect-gaps\t\tindicates that significant number of gaps in coverage is expected" + "\n")
@@ -287,9 +307,11 @@ def usage(spades_version, show_hidden=False, dipspades=False):
" for BayesHammer" + "\n")
sys.stderr.write("--spades-heap-check\t<value>\tsets HEAPCHECK environment variable"\
" for SPAdes" + "\n")
+ sys.stderr.write("--large-genome\tEnables optimizations for large genomes \n");
+ sys.stderr.write("--rna\tRuns rnaSPAdes pipeline for RNA-Seq data \n");
sys.stderr.write("--help-hidden\tprints this usage message with all hidden options" + "\n")
- if show_hidden and dipspades:
+ if show_hidden and mode == "dip":
sys.stderr.write("" + "\n")
sys.stderr.write("HIDDEN dipSPAdes options:" + "\n")
sys.stderr.write("--dsK\t\t<int>\t\tk value used in dipSPAdes [default: '55']" + '\n')
diff --git a/src/spades_pipeline/spades_logic.py b/src/spades_pipeline/spades_logic.py
index 764b317..e9237cc 100644
--- a/src/spades_pipeline/spades_logic.py
+++ b/src/spades_pipeline/spades_logic.py
@@ -20,13 +20,11 @@ import options_storage
BASE_STAGE = "construction"
READS_TYPES_USED_IN_CONSTRUCTION = ["paired-end", "single", "hq-mate-pairs"]
+
def prepare_config_spades(filename, cfg, log, additional_contigs_fname, K, stage, saves_dir, last_one, execution_home):
subst_dict = dict()
subst_dict["K"] = str(K)
- subst_dict["run_mode"] = "false"
- if "diploid_mode" in cfg.__dict__:
- subst_dict["diploid_mode"] = bool_to_str(cfg.diploid_mode)
subst_dict["dataset"] = process_cfg.process_spaces(cfg.dataset)
subst_dict["output_base"] = process_cfg.process_spaces(cfg.output_dir)
subst_dict["tmp_dir"] = process_cfg.process_spaces(cfg.tmp_dir)
@@ -44,11 +42,10 @@ def prepare_config_spades(filename, cfg, log, additional_contigs_fname, K, stage
# subst_dict["topology_simplif_enabled"] = bool_to_str(last_one)
subst_dict["max_threads"] = cfg.max_threads
subst_dict["max_memory"] = cfg.max_memory
- subst_dict["correct_mismatches"] = bool_to_str(last_one)
+ if (not last_one):
+ subst_dict["correct_mismatches"] = bool_to_str(False)
if "resolving_mode" in cfg.__dict__:
subst_dict["resolving_mode"] = cfg.resolving_mode
- if "careful" in cfg.__dict__:
- subst_dict["mismatch_careful"] = bool_to_str(cfg.careful)
if "pacbio_mode" in cfg.__dict__:
subst_dict["pacbio_test_on"] = bool_to_str(cfg.pacbio_mode)
subst_dict["pacbio_reads"] = process_cfg.process_spaces(cfg.pacbio_reads)
@@ -61,6 +58,9 @@ def prepare_config_spades(filename, cfg, log, additional_contigs_fname, K, stage
else:
subst_dict["coverage_threshold"] = cfg.cov_cutoff
+ #TODO: do something about spades.py and config param substitution
+ if "bwa_paired" in cfg.__dict__:
+ subst_dict["bwa_enable"] = bool_to_str(True)
subst_dict["path_to_bwa"] = os.path.join(execution_home, "bwa-spades")
process_cfg.substitute_params(filename, subst_dict, log)
@@ -96,6 +96,12 @@ def update_k_mers_in_special_cases(cur_k_mers, RL, log, silent=False):
support.warning("Default k-mer sizes were set to %s because estimated "
"read length (%d) is equal to or greater than 150" % (str(options_storage.K_MERS_150), RL), log)
return options_storage.K_MERS_150
+ if RL <= max(cur_k_mers):
+ new_k_mers = [k for k in cur_k_mers if k < RL]
+ if not silent:
+ support.warning("K-mer sizes were set to %s because estimated "
+ "read length (%d) is less than %d" % (str(new_k_mers), RL, max(cur_k_mers)), log)
+ return new_k_mers
return cur_k_mers
@@ -110,13 +116,26 @@ def reveal_original_k_mers(RL):
original_k_mers = [k for k in original_k_mers if k < RL]
return original_k_mers
+def add_configs(command, configs_dir):
+ #Order matters here!
+ mode_config_mapping = [("single_cell", "mda_mode"),
+ ("meta", "meta_mode"),
+ ("truseq_mode", "moleculo_mode"),
+ ("rna", "rna_mode"),
+ ("plasmid", "plasmid_mode"),
+ ("careful", "careful_mode"),
+ ("diploid_mode", "diploid_mode")]
+
+ for (mode, config) in mode_config_mapping:
+ if options_storage.__dict__[mode]:
+ command.append(os.path.join(configs_dir, config + ".info"))
+
def run_iteration(configs_dir, execution_home, cfg, log, K, prev_K, last_one):
data_dir = os.path.join(cfg.output_dir, "K%d" % K)
stage = BASE_STAGE
saves_dir = os.path.join(data_dir, 'saves')
dst_configs = os.path.join(data_dir, "configs")
- cfg_file_name = os.path.join(dst_configs, "config.info")
if options_storage.continue_mode:
if os.path.isfile(os.path.join(data_dir, "final_contigs.fasta")) and not (options_storage.restart_from and
@@ -135,16 +154,8 @@ def run_iteration(configs_dir, execution_home, cfg, log, K, prev_K, last_one):
shutil.rmtree(data_dir)
os.makedirs(data_dir)
+ dir_util._path_created = {} # see http://stackoverflow.com/questions/9160227/dir-util-copy-tree-fails-after-shutil-rmtree
dir_util.copy_tree(os.path.join(configs_dir, "debruijn"), dst_configs, preserve_times=False)
- # removing template configs
- for root, dirs, files in os.walk(dst_configs):
- for cfg_file in files:
- cfg_file = os.path.join(root, cfg_file)
- if cfg_file.endswith('.info.template'):
- if os.path.isfile(cfg_file.split('.template')[0]):
- os.remove(cfg_file)
- else:
- os.rename(cfg_file, cfg_file.split('.template')[0])
log.info("\n== Running assembler: " + ("K%d" % K) + "\n")
if prev_K:
@@ -155,21 +166,29 @@ def run_iteration(configs_dir, execution_home, cfg, log, K, prev_K, last_one):
else:
additional_contigs_fname = None
if "read_buffer_size" in cfg.__dict__:
- construction_cfg_file_name = os.path.join(dst_configs, "construction.info")
- process_cfg.substitute_params(construction_cfg_file_name, {"read_buffer_size": cfg.read_buffer_size}, log)
- prepare_config_spades(cfg_file_name, cfg, log, additional_contigs_fname, K, stage, saves_dir, last_one, execution_home)
+ #FIXME why here???
+ process_cfg.substitute_params(os.path.join(dst_configs, "construction.info"), {"read_buffer_size": cfg.read_buffer_size}, log)
+ if "scaffolding_mode" in cfg.__dict__:
+ #FIXME why here???
+ process_cfg.substitute_params(os.path.join(dst_configs, "pe_params.info"), {"scaffolding_mode": cfg.scaffolding_mode}, log)
+
+ cfg_fn = os.path.join(dst_configs, "config.info")
+ prepare_config_spades(cfg_fn, cfg, log, additional_contigs_fname, K, stage, saves_dir, last_one, execution_home)
+
+ command = [os.path.join(execution_home, "spades"), cfg_fn]
- command = [os.path.join(execution_home, "spades"), cfg_file_name]
+ add_configs(command, dst_configs)
+
+ #print("Calling: " + " ".join(command))
support.sys_call(command, log)
-def prepare_config_scaffold_correction(filename, cfg, log, saves_dir, scaffolds_file):
+def prepare_config_scaffold_correction(filename, cfg, log, saves_dir, K):
subst_dict = dict()
- subst_dict["K"] = str(21)
- subst_dict["run_mode"] = bool_to_str(False)
+ subst_dict["K"] = str(K)
subst_dict["dataset"] = process_cfg.process_spaces(cfg.dataset)
- subst_dict["output_base"] = process_cfg.process_spaces(cfg.output_dir)
+ subst_dict["output_base"] = process_cfg.process_spaces(os.path.join(cfg.output_dir, "SCC"))
subst_dict["tmp_dir"] = process_cfg.process_spaces(cfg.tmp_dir)
subst_dict["use_additional_contigs"] = bool_to_str(False)
subst_dict["main_iteration"] = bool_to_str(False)
@@ -178,15 +197,12 @@ def prepare_config_scaffold_correction(filename, cfg, log, saves_dir, scaffolds_
subst_dict["developer_mode"] = bool_to_str(cfg.developer_mode)
subst_dict["max_threads"] = cfg.max_threads
subst_dict["max_memory"] = cfg.max_memory
- subst_dict["scaffold_correction_mode"] = bool_to_str(True)
- subst_dict["scaffolds_file"] = scaffolds_file
#todo
process_cfg.substitute_params(filename, subst_dict, log)
-
-def run_scaffold_correction(configs_dir, execution_home, cfg, log, K):
- data_dir = os.path.join(cfg.output_dir, "SCC")
+def run_scaffold_correction(configs_dir, execution_home, cfg, log, latest, K):
+ data_dir = os.path.join(cfg.output_dir, "SCC", "K%d" % K)
saves_dir = os.path.join(data_dir, 'saves')
dst_configs = os.path.join(data_dir, "configs")
cfg_file_name = os.path.join(dst_configs, "config.info")
@@ -196,26 +212,18 @@ def run_scaffold_correction(configs_dir, execution_home, cfg, log, K):
os.makedirs(data_dir)
dir_util.copy_tree(os.path.join(configs_dir, "debruijn"), dst_configs, preserve_times=False)
- # removing template configs
- for root, dirs, files in os.walk(dst_configs):
- for cfg_file in files:
- cfg_file = os.path.join(root, cfg_file)
- if cfg_file.endswith('.info.template'):
- if os.path.isfile(cfg_file.split('.template')[0]):
- os.remove(cfg_file)
- else:
- os.rename(cfg_file, cfg_file.split('.template')[0])
log.info("\n== Running scaffold correction \n")
- latest = os.path.join(cfg.output_dir, "K%d" % K)
scaffolds_file = os.path.join(latest, "scaffolds.fasta")
if not os.path.isfile(scaffolds_file):
support.error("Scaffodls were not found in " + scaffolds_file, log)
if "read_buffer_size" in cfg.__dict__:
construction_cfg_file_name = os.path.join(dst_configs, "construction.info")
process_cfg.substitute_params(construction_cfg_file_name, {"read_buffer_size": cfg.read_buffer_size}, log)
- prepare_config_scaffold_correction(cfg_file_name, cfg, log, saves_dir, scaffolds_file)
+ process_cfg.substitute_params(os.path.join(dst_configs, "moleculo_mode.info"), {"scaffolds_file": scaffolds_file}, log)
+ prepare_config_scaffold_correction(cfg_file_name, cfg, log, saves_dir, K)
command = [os.path.join(execution_home, "scaffold_correction"), cfg_file_name]
+ add_configs(command, dst_configs)
log.info(str(command))
support.sys_call(command, log)
@@ -272,11 +280,15 @@ def run_spades(configs_dir, execution_home, cfg, dataset_data, ext_python_module
prev_K = K
RL = get_read_length(cfg.output_dir, K, ext_python_modules_home, log)
cfg.iterative_K = update_k_mers_in_special_cases(cfg.iterative_K, RL, log)
- if cfg.iterative_K[1] + 1 > RL:
+ if len(cfg.iterative_K) < 2 or cfg.iterative_K[1] + 1 > RL:
if cfg.rr_enable:
- support.warning("Second value of iterative K (%d) exceeded estimated read length (%d). "
- "Rerunning for the first value of K (%d) with Repeat Resolving" %
- (cfg.iterative_K[1], RL, cfg.iterative_K[0]), log)
+ if len(cfg.iterative_K) < 2:
+ log.info("== Rerunning for the first value of K (%d) with Repeat Resolving" %
+ cfg.iterative_K[0])
+ else:
+ support.warning("Second value of iterative K (%d) exceeded estimated read length (%d). "
+ "Rerunning for the first value of K (%d) with Repeat Resolving" %
+ (cfg.iterative_K[1], RL, cfg.iterative_K[0]), log)
run_iteration(configs_dir, execution_home, cfg, log, cfg.iterative_K[0], None, True)
K = cfg.iterative_K[0]
else:
@@ -307,8 +319,8 @@ def run_spades(configs_dir, execution_home, cfg, dataset_data, ext_python_module
else:
if options_storage.continue_mode:
support.continue_from_here(log)
- run_scaffold_correction(configs_dir, execution_home, cfg, log, K)
- latest = os.path.join(cfg.output_dir, "SCC")
+ run_scaffold_correction(configs_dir, execution_home, cfg, log, latest, 21)
+ latest = os.path.join(os.path.join(cfg.output_dir, "SCC"), "K21")
if options_storage.stop_after == 'scc':
support.finish_here(log)
diff --git a/src/spades_pipeline/support.py b/src/spades_pipeline/support.py
index 4306267..77b8fa0 100644
--- a/src/spades_pipeline/support.py
+++ b/src/spades_pipeline/support.py
@@ -173,8 +173,8 @@ def get_available_memory():
def process_readline(line, is_python3=sys.version.startswith('3.')):
if is_python3:
- return str(line, 'utf-8')
- return line
+ return str(line, 'utf-8').rstrip()
+ return line.rstrip()
def process_spaces(str):
@@ -196,7 +196,7 @@ def sys_call(cmd, log=None, cwd=None):
output = ''
while not proc.poll():
- line = process_readline(proc.stdout.readline()).rstrip()
+ line = process_readline(proc.stdout.readline())
if line:
if log:
log.info(line)
@@ -206,7 +206,7 @@ def sys_call(cmd, log=None, cwd=None):
break
for line in proc.stdout.readlines():
- line = process_readline(line).rstrip()
+ line = process_readline(line)
if line:
if log:
log.info(line)
@@ -244,11 +244,11 @@ def universal_sys_call(cmd, log, out_filename=None, err_filename=None, cwd=None)
if log and (not out_filename or not err_filename):
while not proc.poll():
if not out_filename:
- line = process_readline(proc.stdout.readline()).rstrip()
+ line = process_readline(proc.stdout.readline())
if line:
log.info(line)
if not err_filename:
- line = process_readline(proc.stderr.readline()).rstrip()
+ line = process_readline(proc.stderr.readline())
if line:
log.info(line)
if proc.returncode is not None:
@@ -257,11 +257,11 @@ def universal_sys_call(cmd, log, out_filename=None, err_filename=None, cwd=None)
if not out_filename:
for line in proc.stdout.readlines():
if line != '':
- log.info(process_readline(line).rstrip())
+ log.info(process_readline(line))
if not err_filename:
for line in proc.stderr.readlines():
if line != '':
- log.info(process_readline(line).rstrip())
+ log.info(process_readline(line))
else:
proc.wait()
@@ -629,6 +629,16 @@ def dataset_is_empty(dataset_data):
return True
+def dataset_has_gzipped_reads(dataset_data):
+ for reads_library in dataset_data:
+ for key in reads_library:
+ if key.endswith('reads'):
+ for reads_file in reads_library[key]:
+ if reads_file.endswith('.gz'):
+ return True
+ return False
+
+
def dataset_has_interlaced_reads(dataset_data):
for reads_library in dataset_data:
if 'interlaced reads' in reads_library:
@@ -688,30 +698,41 @@ def process_Ns_in_additional_contigs(dataset_data, dst, log):
def split_interlaced_reads(dataset_data, dst, log):
- def write_single_read(in_file, out_file, fasta_read_name=None, is_fastq=False, is_python3=False):
- next_read_str = "" # if there is no next read: empty string
- if not is_fastq and fasta_read_name is not None:
- read_name = fasta_read_name
- else:
+ def write_single_read(in_file, out_file, read_name=None, is_fastq=False, is_python3=False):
+ if read_name is None:
read_name = process_readline(in_file.readline(), is_python3)
if not read_name:
- return next_read_str
+ return '' # no next read
read_value = process_readline(in_file.readline(), is_python3)
line = process_readline(in_file.readline(), is_python3)
- while line and ((is_fastq and not line.startswith('+')) or (not is_fastq and not line.startswith('>'))):
+ fpos = in_file.tell()
+ while (is_fastq and not line.startswith('+')) or (not is_fastq and not line.startswith('>')):
read_value += line
line = process_readline(in_file.readline(), is_python3)
- next_read_str = line # if there is a next read: "+" (for fastq) or next read name (for fasta)
- out_file.write(read_name)
- out_file.write(read_value)
+ if not line:
+ if fpos == in_file.tell():
+ break
+ fpos = in_file.tell()
+ out_file.write(read_name + '\n')
+ out_file.write(read_value + '\n')
if is_fastq:
read_quality = process_readline(in_file.readline(), is_python3)
- while len(read_value) != len(read_quality):
- read_quality += process_readline(in_file.readline(), is_python3)
- out_file.write("+\n")
- out_file.write(read_quality)
- return next_read_str
+ line = process_readline(in_file.readline(), is_python3)
+ while not line.startswith('@'):
+ read_quality += line
+ line = process_readline(in_file.readline(), is_python3)
+ if not line:
+ if fpos == in_file.tell():
+ break
+ fpos = in_file.tell()
+ if len(read_value) != len(read_quality):
+ error('The length of sequence and quality lines should be the same! '
+ 'Check read %s (SEQ length is %d, QUAL length is %d)' %
+ (read_name, len(read_value), len(read_quality)), log)
+ out_file.write('+\n')
+ out_file.write(read_quality + '\n')
+ return line # next read name or empty string
new_dataset_data = list()
for reads_library in dataset_data:
@@ -753,16 +774,14 @@ def split_interlaced_reads(dataset_data, dst, log):
log.info("== Splitting " + interlaced_reads + " into left and right reads (in " + dst + " directory)")
out_files = [open(out_left_filename, 'w'), open(out_right_filename, 'w')]
i = 0
- next_read_str = write_single_read(input_file, out_files[i], None, is_fastq,
- sys.version.startswith('3.') and was_compressed)
- while next_read_str:
+ next_read_name = write_single_read(input_file, out_files[i], None, is_fastq,
+ sys.version.startswith('3.') and was_compressed)
+ while next_read_name:
i = (i + 1) % 2
- next_read_str = write_single_read(input_file, out_files[i], next_read_str, is_fastq,
- sys.version.startswith('3.') and was_compressed)
- if (is_fastq and i % 2 == 1) or (not is_fastq and i % 2 == 0):
- # when fastq, the number of writes is equal to number of READS (should be EVEN)
- # when fasta, the number of writes is equal to number of NEXT READS (should be ODD)
- error("The number of reads in file with interlaced reads (" + interlaced_reads + ") is ODD!", log)
+ next_read_name = write_single_read(input_file, out_files[i], next_read_name, is_fastq,
+ sys.version.startswith('3.') and was_compressed)
+ if i == 0:
+ error("The number of reads in file with interlaced reads (" + interlaced_reads + ") should be EVEN!", log)
out_files[0].close()
out_files[1].close()
input_file.close()
diff --git a/src/spades_pipeline/truspades/launch_options.py b/src/spades_pipeline/truspades/launch_options.py
index 75a7268..1a7ad18 100644
--- a/src/spades_pipeline/truspades/launch_options.py
+++ b/src/spades_pipeline/truspades/launch_options.py
@@ -26,11 +26,12 @@ class Options:
self.mode = "run_truspades"
self.possible_modes = ["run_truspades", "generate_dataset", "construct_subreferences"]
self.test = False
+ self.clean = False
def __init__(self, argv, bin, home, version):
if len(argv) == 1:
print_usage_and_exit(1, version)
- long_params = "test help-hidden construct-dataset reference= reference-index= do= continue " \
+ long_params = "test clean help-hidden construct-dataset reference= reference-index= do= continue " \
"threads= help version dataset= input-dir= additional-options=".split(" ")
short_params = "o:t:hv"
self.set_default_options()
@@ -79,6 +80,8 @@ class Options:
self.output_dir = value
elif key == "--threads" or key == "-t":
self.threads = int(value)
+ elif key == "--clean":
+ self.clean = True
elif key == "--help-hidden":
print_usage_and_exit(0, self.version, show_hidden=True)
if not self.mode in self.possible_modes:
@@ -103,8 +106,7 @@ class Options:
def print_usage_and_exit(code, version, show_hidden=False):
- sys.stderr.write("TruSPAdes v" + str(version) +
- ": genome assembler designed for short reads produced by Illumina TruSeq Long Read technology\n\n")
+ sys.stderr.write("SPAdes genome assembler v" + str(version) + " [truSPAdes mode]\n\n")
sys.stderr.write("Usage: " + str(sys.argv[0]) + " [options] -o <output_dir>" + "\n")
sys.stderr.write("" + "\n")
sys.stderr.write("Basic options:" + "\n")
@@ -132,5 +134,5 @@ def print_usage_and_exit(code, version, show_hidden=False):
def print_version_and_exit(version):
- options_storage.version(version, mode="TruSPAdes")
+ options_storage.version(version, mode="tru")
sys.exit(0)
diff --git a/src/utils/adt/array_vector.hpp b/src/utils/adt/array_vector.hpp
new file mode 100644
index 0000000..819aa49
--- /dev/null
+++ b/src/utils/adt/array_vector.hpp
@@ -0,0 +1,677 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#ifndef __ARRAY_VECTOR_HPP__
+#define __ARRAY_VECTOR_HPP__
+
+#include <algorithm>
+#include <memory>
+
+#include <cstdlib>
+#include <cstring>
+#include <cstddef>
+
+template<class _Cp, bool _IsConst>
+class __array_vector_iterator;
+
+template<class _Cp>
+class __array_reference;
+
+template<class _Cp>
+class __array_const_reference;
+
+template<typename ElTy>
+struct array_equal_to;
+
+template<class _Cp>
+class __array {
+ typedef typename _Cp::__storage_type __storage_type;
+ typedef typename _Cp::__storage_pointer __storage_pointer;
+ typedef typename _Cp::__const_storage_pointer __const_storage_pointer;
+ typedef typename _Cp::size_type __size_type;
+
+#if defined(__clang__)
+ friend typename _Cp::__self;
+#else
+
+ friend class _Cp::__self;
+
+#endif
+
+ friend class __array_vector_iterator<_Cp, false>;
+
+ friend class __array_reference<_Cp>;
+
+ friend class __array_const_reference<_Cp>;
+
+ __storage_pointer ptr_;
+ __size_type size_;
+ bool allocated;
+
+public:
+ ~__array() {
+ if (allocated)
+ delete[] ptr_;
+ }
+
+ size_t size() const {
+ return size_;
+ }
+
+ size_t data_size() const {
+ return size_ * sizeof(__storage_type);
+ }
+
+ __storage_pointer data() const {
+ return ptr_;
+ }
+
+ __array(const __array &that) {
+ size_ = that.size_;
+ ptr_ = new __storage_type[size_];
+ allocated = true;
+ memcpy(ptr_, that.ptr_, data_size());
+ }
+
+ __array(const __array_reference<_Cp> that) {
+ size_ = that.size();
+ ptr_ = new __storage_type[size_];
+ allocated = true;
+ memcpy(ptr_, that.data(), data_size());
+ }
+
+ __array &operator=(const __array &that) {
+ __storage_pointer this_ptr = data(), that_ptr = that.data();
+ if (this_ptr != that_ptr)
+ memcpy(this_ptr, that_ptr, data_size());
+
+ return *this;
+ }
+
+ __array &operator=(const __array_reference<_Cp> that) {
+ __storage_pointer this_ptr = data(), that_ptr = that.data();
+ if (this_ptr != that_ptr)
+ memcpy(this_ptr, that_ptr, data_size());
+
+ return *this;
+ }
+
+ __array &operator=(__const_storage_pointer that_ptr) {
+ __storage_pointer this_ptr = data();
+ if (this_ptr != that_ptr)
+ memcpy(this_ptr, that_ptr, data_size());
+
+ return *this;
+ }
+
+ bool operator<(const __array &that) const {
+ __storage_pointer this_ptr = data(), that_ptr = that.data();
+
+ for (size_t i = 0; i < size(); ++i) {
+ if (this_ptr[i] != that_ptr[i])
+ return this_ptr[i] < that_ptr[i];
+ }
+
+ return false;
+ }
+
+ bool operator<(const __array_reference<_Cp> that) const {
+ __storage_pointer this_ptr = data(), that_ptr = that.data();
+
+ for (size_t i = 0; i < size(); ++i) {
+ if (this_ptr[i] != that_ptr[i])
+ return this_ptr[i] < that_ptr[i];
+ }
+
+ return false;
+ }
+
+ bool operator==(const __array &that) const {
+ __storage_pointer this_ptr = data(), that_ptr = that.data();
+
+ for (size_t i = 0; i < size(); ++i) {
+ if (this_ptr[i] != that_ptr[i])
+ return false;
+ }
+
+ return true;
+ }
+
+ bool operator==(const __array_reference<_Cp> that) const {
+ __storage_pointer this_ptr = data(), that_ptr = that.data();
+
+ for (size_t i = 0; i < size(); ++i) {
+ if (this_ptr[i] != that_ptr[i])
+ return false;
+ }
+
+ return true;
+ }
+
+ bool operator!=(const __array &that) const {
+ return !operator==(that);
+ }
+
+ bool operator!=(const __array_reference<_Cp> that) const {
+ return !operator==(that);
+ }
+
+private:
+ __array(__storage_pointer p, __size_type sz) :
+ ptr_(p), size_(sz), allocated(false) { }
+};
+
+template<class _Cp>
+class __array_reference {
+ typedef typename _Cp::__storage_type __storage_type;
+ typedef typename _Cp::__storage_pointer __storage_pointer;
+ typedef typename _Cp::__const_storage_pointer __const_storage_pointer;
+ typedef typename _Cp::size_type __size_type;
+
+#if defined(__clang__)
+ friend typename _Cp::__self;
+#else
+
+ friend class _Cp::__self;
+
+#endif
+
+ friend class __array_vector_iterator<_Cp, false>;
+
+ friend class __array<_Cp>;
+
+ friend struct array_equal_to<__storage_type>;
+
+ __storage_pointer ptr_;
+ __size_type size_;
+
+public:
+ size_t size() const {
+ return size_;
+ }
+
+ size_t data_size() const {
+ return size() * sizeof(__storage_type);
+ }
+
+ __storage_pointer data() const {
+ return ptr_;
+ }
+
+ __array_reference &operator=(const __array<_Cp> &that) {
+ __storage_pointer this_ptr = data(), that_ptr = that.data();
+ if (this_ptr != that_ptr)
+ memcpy(this_ptr, that_ptr, data_size());
+
+ return *this;
+ }
+
+ __array_reference &operator=(__const_storage_pointer that_ptr) {
+ __storage_pointer this_ptr = data();
+ if (this_ptr != that_ptr)
+ memcpy(this_ptr, that_ptr, data_size());
+
+ return *this;
+ }
+
+ __array_reference &operator=(const __array_reference that) {
+ __storage_pointer this_ptr = data(), that_ptr = that.data();
+ if (this_ptr != that_ptr)
+ memcpy(this_ptr, that_ptr, data_size());
+
+ return *this;
+ }
+
+ bool operator<(const __array<_Cp> &that) const {
+ __storage_pointer this_ptr = data(), that_ptr = that.data();
+
+ for (size_t i = 0; i < size(); ++i) {
+ if (this_ptr[i] != that_ptr[i])
+ return this_ptr[i] < that_ptr[i];
+ }
+
+ return false;
+ }
+
+ bool operator<(const __array_reference that) const {
+ __storage_pointer this_ptr = data(), that_ptr = that.data();
+
+ for (size_t i = 0; i < size(); ++i) {
+ if (this_ptr[i] != that_ptr[i])
+ return this_ptr[i] < that_ptr[i];
+ }
+
+ return false;
+ }
+
+ bool operator==(const __array<_Cp> &that) const {
+ __storage_pointer this_ptr = data(), that_ptr = that.data();
+
+ for (size_t i = 0; i < size(); ++i) {
+ if (this_ptr[i] != that_ptr[i])
+ return false;
+ }
+
+ return true;
+ }
+
+ bool operator==(const __array_reference that) const {
+ __storage_pointer this_ptr = data(), that_ptr = that.data();
+
+ for (size_t i = 0; i < size(); ++i) {
+ if (this_ptr[i] != that_ptr[i])
+ return false;
+ }
+
+ return true;
+ }
+
+ bool operator!=(const __array_reference that) const {
+ return !operator==(that);
+ }
+
+ bool operator!=(const __array<_Cp> &that) const {
+ return !operator==(that);
+ }
+
+private:
+ __array_reference(__storage_pointer p, __size_type sz) :
+ ptr_(p), size_(sz) { }
+};
+
+template<class _Cp>
+class __array_const_reference {
+ typedef typename _Cp::__storage_type __storage_type;
+ typedef typename _Cp::__storage_pointer __storage_pointer;
+ typedef typename _Cp::__const_storage_pointer __const_storage_pointer;
+ typedef typename _Cp::size_type __size_type;
+
+#if defined(__clang__)
+ friend typename _Cp::__self;
+#else
+
+ friend class _Cp::__self;
+
+#endif
+
+ friend class __array_vector_iterator<_Cp, true>;
+
+ friend struct array_equal_to<__storage_type>;
+
+ __const_storage_pointer ptr_;
+ __size_type size_;
+
+public:
+ size_t size() const {
+ return size_;
+ }
+
+ size_t data_size() const {
+ return size() * sizeof(__storage_type);
+ }
+
+ __const_storage_pointer data() const {
+ return ptr_;
+ }
+
+ __array_const_reference(const __array_const_reference &that)
+ : ptr_(that.ptr_), size_(that.size_) { }
+
+ bool operator<(__array_const_reference that) const {
+ const __storage_pointer this_ptr = data(), that_ptr = that.data();
+
+ for (size_t i = 0; i < size(); ++i) {
+ if (this_ptr[i] != that_ptr[i])
+ return this_ptr[i] < that_ptr[i];
+ }
+
+ return false;
+ }
+
+ bool operator==(__array_const_reference that) const {
+ const __storage_pointer this_ptr = data(), that_ptr = that.data();
+
+ for (size_t i = 0; i < size(); ++i) {
+ if (this_ptr[i] != that_ptr[i])
+ return false;
+ }
+
+ return true;
+ }
+
+ bool operator==(const __array_reference<_Cp> that) const {
+ __const_storage_pointer this_ptr = data();
+ const __storage_pointer that_ptr = that.data();
+
+ for (size_t i = 0; i < size(); ++i) {
+ if (this_ptr[i] != that_ptr[i])
+ return false;
+ }
+
+ return true;
+ }
+
+ bool operator!=(const __array_const_reference that) const {
+ return !operator==(that);
+ }
+
+ bool operator!=(const __array_reference<_Cp> that) const {
+ return !operator==(that);
+ }
+
+private:
+ __array_const_reference(__const_storage_pointer p, __size_type sz) :
+ ptr_(p), size_(sz) { }
+
+ __array_const_reference &operator=(const __array_const_reference &that);
+};
+
+// This is a hack. Never do this again!
+#ifdef __GLIBCXX__
+namespace std {
+ template<typename _Cp>
+ struct __are_same<__array_reference<_Cp>, __array<_Cp> &> {
+ enum {
+ __value = 1
+ };
+ typedef __true_type __type;
+ };
+
+ template<typename _Cp>
+ struct __are_same<__array<_Cp> &, __array_reference<_Cp> > {
+ enum {
+ __value = 1
+ };
+ typedef __true_type __type;
+ };
+}
+#endif
+
+template<typename _Cp>
+void swap(__array_reference<_Cp> lhs, __array_reference<_Cp> rhs) {
+ std::swap_ranges(lhs.data(), lhs.data() + lhs.size(), rhs.data());
+}
+
+template<typename _Cp>
+void swap(__array<_Cp> &lhs, __array_reference<_Cp> rhs) {
+ std::swap_ranges(lhs.data(), lhs.data() + lhs.size(), rhs.data());
+}
+
+template<typename _Cp>
+void swap(__array_reference<_Cp> lhs, __array<_Cp> &rhs) {
+ std::swap_ranges(lhs.data(), lhs.data() + lhs.size(), rhs.data());
+}
+
+template<typename _Cp, bool _IsConst>
+class __array_vector_iterator {
+public:
+ typedef typename _Cp::difference_type difference_type;
+ typedef __array_vector_iterator pointer;
+ typedef typename std::conditional<_IsConst, __array_const_reference<_Cp>, __array_reference<_Cp> >::type reference;
+ typedef __array<_Cp> value_type;
+
+ typedef std::random_access_iterator_tag iterator_category;
+
+private:
+ typedef typename _Cp::__storage_type __storage_type;
+ typedef typename _Cp::__storage_pointer __storage_pointer;
+ typedef typename _Cp::size_type __size_type;
+
+#if defined(__clang__)
+ friend typename _Cp::__self;
+#else
+
+ friend class _Cp::__self;
+
+#endif
+
+ __storage_pointer data_;
+ __size_type el_sz_;
+
+public:
+ __array_vector_iterator(__storage_pointer data, __size_type el_sz)
+ : data_(data), el_sz_(el_sz) { }
+
+ size_t size() const {
+ return el_sz_;
+ }
+
+ size_t data_size() const {
+ return el_sz_ * sizeof(__storage_type);
+ }
+
+ __storage_pointer data() const {
+ return data_;
+ }
+
+ reference operator*() const {
+ return reference(data_, el_sz_);
+ }
+
+ reference operator[](difference_type n) const {
+ return *(*this + n);
+ }
+
+ __array_vector_iterator &operator++() {
+ data_ += el_sz_;
+ return *this;
+ }
+
+ __array_vector_iterator &operator--() {
+ data_ -= el_sz_;
+ return *this;
+ }
+
+ __array_vector_iterator operator++(int) {
+ __array_vector_iterator res = *this;
+ data_ += el_sz_;
+ return res;
+ }
+
+ __array_vector_iterator operator--(int) {
+ __array_vector_iterator res = *this;
+ data_ -= el_sz_;
+ return res;
+ }
+
+ __array_vector_iterator operator+(const difference_type &n) const {
+ return __array_vector_iterator(data_ + n * el_sz_, el_sz_);
+ }
+
+ __array_vector_iterator &operator+=(const difference_type &n) {
+ data_ += n * el_sz_;
+ return *this;
+ }
+
+ __array_vector_iterator operator-(const difference_type &n) const {
+ return __array_vector_iterator(data_ - n * el_sz_, el_sz_);
+ }
+
+ __array_vector_iterator &operator-=(const difference_type &n) {
+ data_ -= n * el_sz_;
+ return *this;
+ }
+
+ friend bool operator==(const __array_vector_iterator &r1,
+ const __array_vector_iterator &r2) {
+ return r1.data_ == r2.data_;
+ }
+
+ friend bool operator!=(const __array_vector_iterator &r1,
+ const __array_vector_iterator &r2) {
+ return r1.data_ != r2.data_;
+ }
+
+ friend bool operator<(const __array_vector_iterator &r1,
+ const __array_vector_iterator &r2) {
+ return r1.data_ < r2.data_;
+ }
+
+ friend bool operator<=(const __array_vector_iterator &r1,
+ const __array_vector_iterator &r2) {
+ return r1.data_ <= r2.data_;
+ }
+
+ friend bool operator>(const __array_vector_iterator &r1,
+ const __array_vector_iterator &r2) {
+ return r1.data_ > r2.data_;
+ }
+
+ friend bool operator>=(const __array_vector_iterator &r1,
+ const __array_vector_iterator &r2) {
+ return r1.data_ >= r2.data_;
+ }
+
+
+ friend __array_vector_iterator
+ operator+(difference_type n,
+ const __array_vector_iterator &r2) {
+ return r2 + n;
+ }
+
+ friend difference_type
+ operator-(const __array_vector_iterator &r1,
+ const __array_vector_iterator &r2) {
+ return (r1.data_ - r2.data_) / r1.el_sz_;
+ }
+};
+
+template<typename ElTy>
+class array_vector {
+public:
+ typedef size_t size_type;
+ typedef ptrdiff_t difference_type;
+
+ typedef __array_reference<array_vector> reference;
+ typedef __array_const_reference<array_vector> const_reference;
+ typedef __array<array_vector> value_type;
+ typedef __array_vector_iterator<array_vector, false> iterator;
+ typedef __array_vector_iterator<array_vector, true> const_iterator;
+
+private:
+ typedef ElTy __storage_type;
+ typedef array_vector __self;
+ typedef __storage_type *__storage_pointer;
+ typedef const __storage_type *__const_storage_pointer;
+
+ friend class __array<__self>;
+
+ friend class __array_reference<__self>;
+
+ friend class __array_const_reference<__self>;
+
+ friend class __array_vector_iterator<__self, true>;
+
+ friend class __array_vector_iterator<__self, false>;
+
+ __storage_pointer data_;
+ size_type size_;
+ size_type el_sz_;
+
+public:
+ array_vector(__storage_pointer data, size_type sz, size_type el_sz)
+ : data_(data), size_(sz), el_sz_(el_sz) { }
+
+ reference operator[](size_t pos) {
+ return reference(data_ + pos * el_sz_, el_sz_);
+ }
+
+ const ElTy *operator[](size_t pos) const {
+ return data_ + pos * el_sz_;
+ }
+
+ iterator begin() {
+ return iterator(data_, el_sz_);
+ }
+
+ iterator end() {
+ return iterator(data_ + size_ * el_sz_, el_sz_);
+ }
+
+ const_iterator begin() const {
+ return const_iterator(data_, el_sz_);
+ }
+
+ const_iterator end() const {
+ return const_iterator(data_ + size_ * el_sz_, el_sz_);
+ }
+
+ const_iterator cbegin() const {
+ return const_iterator(data_, el_sz_);
+ }
+
+ const_iterator cend() const {
+ return const_iterator(data_ + size_ * el_sz_, el_sz_);
+ }
+
+ size_t size() const { return size_; }
+
+ __storage_pointer data() const { return data_; }
+
+ void set_size(size_t size) {
+ size_ = size;
+ }
+
+ void set_data(__storage_pointer data) {
+ data_ = data;
+ }
+};
+
+template<typename ElTy>
+struct array_less {
+ typedef typename array_vector<ElTy>::value_type value;
+ typedef typename array_vector<ElTy>::reference reference;
+
+ bool operator()(const value &lhs, const value &rhs) const {
+ return lhs < rhs;
+ }
+
+ bool operator()(const value &lhs, const reference rhs) const {
+ return lhs < rhs;
+ }
+
+ bool operator()(const reference lhs, const value &rhs) const {
+ return lhs < rhs;
+ }
+
+ bool operator()(const reference lhs, const reference rhs) const {
+ return lhs < rhs;
+ }
+};
+
+template<typename ElTy>
+struct array_equal_to {
+ typedef typename array_vector<ElTy>::value_type value;
+ typedef typename array_vector<ElTy>::reference reference;
+ typedef typename array_vector<ElTy>::const_reference const_reference;
+
+ bool operator()(const value &lhs, const value &rhs) const {
+ return lhs == rhs;
+ }
+
+ bool operator()(const value &lhs, const reference rhs) const {
+ return lhs == rhs;
+ }
+
+ bool operator()(const reference lhs, const value &rhs) const {
+ return lhs == rhs;
+ }
+
+ bool operator()(const reference lhs, const ElTy *rhs, size_t sz) const {
+ return lhs == reference(rhs, sz);
+ }
+
+ bool operator()(const reference lhs, const reference rhs) const {
+ return lhs == rhs;
+ }
+
+ bool operator()(const ElTy *lhs, size_t sz, const reference rhs) const {
+ return const_reference(lhs, sz) == rhs;
+ }
+};
+
+#endif
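
array_vector added above is a non-owning view that interprets a flat buffer as a sequence of fixed-size element arrays (e.g. packed k-mers), with proxy references so that standard algorithms can reorder the rows in place (hence the __are_same specialization for libstdc++). An illustrative usage sketch follows; the include path is assumed.

    #include <algorithm>
    #include <cstdint>
    #include <vector>
    #include "array_vector.hpp"   // illustrative path

    int main() {
        const size_t rows = 4, words_per_row = 2;
        std::vector<uint64_t> storage(rows * words_per_row);
        for (size_t i = 0; i < storage.size(); ++i)
            storage[i] = storage.size() - i;              // descending fill
        // View the flat buffer as 4 rows of 2 words each and sort the rows.
        array_vector<uint64_t> view(storage.data(), rows, words_per_row);
        std::sort(view.begin(), view.end(), array_less<uint64_t>());
        // storage now holds the rows in lexicographic order.
        return 0;
    }
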
diff --git a/src/utils/adt/bag.hpp b/src/utils/adt/bag.hpp
new file mode 100644
index 0000000..c5abbb3
--- /dev/null
+++ b/src/utils/adt/bag.hpp
@@ -0,0 +1,87 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "dev_support/verify.hpp"
+
+template<class T, class hash = std::hash<T>>
+class bag {
+ typedef std::unordered_map<T, size_t, hash> Data;
+ Data data_;
+ size_t size_;
+public:
+
+ bag() : size_(0) {
+ }
+
+ typedef typename Data::const_iterator const_iterator;
+
+ void put(const T& t, size_t mult) {
+ VERIFY(mult > 0);
+ data_[t] += mult;
+ size_ += mult;
+ }
+
+ void put(const T& t) {
+ put(t, 1);
+ }
+
+ bool take(const T& t, size_t mult) {
+ VERIFY(mult > 0);
+ auto it = data_.find(t);
+ if (it == data_.end()) {
+ return false;
+ } else {
+ size_t have = it->second;
+ if (have < mult) {
+ data_.erase(it->first);
+ size_ -= have;
+ return false;
+ } else if (have == mult) {
+ data_.erase(it->first);
+ size_ -= have;
+ return true;
+ } else {
+ it->second -= mult;
+ size_ -= mult;
+ return true;
+ }
+ }
+ }
+
+ bool take(const T& t) {
+ return take(t, 1);
+ }
+
+ size_t mult(const T& t) const {
+ auto it = data_.find(t);
+ if (it == data_.end()) {
+ return 0;
+ } else {
+ return it->second;
+ }
+ }
+
+ void clear() {
+ data_.clear();
+ size_ = 0;
+ }
+
+ const_iterator begin() const {
+ return data_.begin();
+ }
+
+ const_iterator end() const {
+ return data_.end();
+ }
+
+ size_t size() const {
+ return size_;
+ }
+
+};
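
bag<T> is a counting multiset on top of std::unordered_map; size() tracks the total multiplicity, not the number of distinct keys. A minimal usage sketch (the "adt/bag.hpp" include path and the dev_support/verify.hpp dependency being reachable are assumptions):

    // sketch only
    #include "adt/bag.hpp"
    #include <iostream>
    #include <string>

    int main() {
        bag<std::string> b;
        b.put("ACGT");                       // multiplicity 1
        b.put("ACGT", 2);                    // multiplicity 3
        b.put("TTTT");
        std::cout << b.mult("ACGT") << ' ' << b.size() << '\n';  // 3 4
        b.take("ACGT");                      // removes a single occurrence
        std::cout << b.mult("ACGT") << '\n';                     // 2
        return 0;
    }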
diff --git a/src/utils/adt/chained_iterator.hpp b/src/utils/adt/chained_iterator.hpp
new file mode 100644
index 0000000..c7ef9d2
--- /dev/null
+++ b/src/utils/adt/chained_iterator.hpp
@@ -0,0 +1,76 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#ifndef __ADT_CHAINED_ITERATOR_HPP__
+#define __ADT_CHAINED_ITERATOR_HPP__
+
+#include <boost/iterator/iterator_facade.hpp>
+
+#include <iterator>
+#include <vector>
+
+template<class It>
+class chained_iterator :
+ public boost::iterator_facade<chained_iterator<It>,
+ typename std::iterator_traits<It>::value_type,
+ boost::forward_traversal_tag,
+ typename std::iterator_traits<It>::value_type> {
+public:
+ chained_iterator(It begin, It end) :
+ section_(0), current_(begin) {
+ join(begin, end);
+ }
+
+ void join(It begin, It end) {
+ begins_.push_back(begin);
+ ends_.push_back(end);
+ skip_empty();
+ }
+
+private:
+ friend class boost::iterator_core_access;
+
+ bool is_end() const {
+ return current_ == ends_[section_];
+ }
+
+ void skip_empty() {
+ while ((section_ + 1) < begins_.size() &&
+ current_ == ends_[section_])
+ current_ = begins_[++section_];
+ }
+
+ void increment() {
+ skip_empty();
+ ++current_;
+ skip_empty();
+ }
+
+ bool equal(const chained_iterator &other) const {
+ // Special case: both ends
+ bool other_end = other.is_end(), current_end = is_end();
+ if (current_end || other_end)
+ return other_end == current_end;
+
+ // Otherwise compare section and position. Comparing iterators that come
+ // from different chains is not checked here and would be undefined behavior.
+ return (section_ == other.section_ &&
+ current_ == other.current_);
+ }
+
+ typename std::iterator_traits<It>::value_type dereference() const {
+ return *current_;
+ }
+
+ size_t section_;
+ It current_;
+ std::vector<It> begins_;
+ std::vector<It> ends_;
+};
+
+
+#endif
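
chained_iterator presents several [begin, end) ranges as a single forward range; an iterator built from an empty range compares equal to any exhausted chain, so it can serve as the end sentinel. Illustrative sketch only (assumes Boost.Iterator is available and the shown include path):

    // sketch only
    #include "adt/chained_iterator.hpp"
    #include <iostream>
    #include <vector>

    int main() {
        std::vector<int> a{1, 2}, b{3, 4, 5};
        typedef std::vector<int>::iterator It;
        chained_iterator<It> it(a.begin(), a.end());
        it.join(b.begin(), b.end());
        chained_iterator<It> end(b.end(), b.end());   // empty chain acts as the end sentinel
        for (; it != end; ++it)
            std::cout << *it << ' ';                  // prints 1 2 3 4 5
        return 0;
    }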
diff --git a/src/utils/adt/concurrent_dsu.hpp b/src/utils/adt/concurrent_dsu.hpp
new file mode 100644
index 0000000..176a5e3
--- /dev/null
+++ b/src/utils/adt/concurrent_dsu.hpp
@@ -0,0 +1,297 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#ifndef CONCURRENTDSU_HPP_
+#define CONCURRENTDSU_HPP_
+
+#include "io/kmers_io/mmapped_writer.hpp"
+
+#include <cassert>
+#include <cmath>
+#include <cstdlib>
+#include <cstdarg>
+#include <cstdint>
+
+#include <algorithm>
+#include <vector>
+#include <unordered_map>
+#include <atomic>
+#include <fstream>
+
+// Silence bogus gcc warnings
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wconversion"
+
+class ConcurrentDSU {
+ struct atomic_set_t {
+ uint64_t data : 61;
+ uint64_t aux : 2;
+ bool root : 1;
+ } __attribute__ ((packed));
+
+ static_assert(sizeof(atomic_set_t) == 8, "Unexpected size of atomic_set_t");
+
+public:
+ ConcurrentDSU(size_t size)
+ : data_(size) {
+
+ for (size_t i = 0; i < size; i++)
+ data_[i] = {.data = 1, .aux = 0, .root = true};
+ }
+
+ ~ConcurrentDSU() { }
+
+ void unite(size_t x, size_t y) {
+ uint64_t x_size, y_size;
+ uint64_t x_aux, y_aux;
+
+ // Step one: update the links
+ while (true) {
+ x = find_set(x);
+ y = find_set(y);
+ if (x == y)
+ return;
+
+ atomic_set_t x_entry = data_[x], y_entry = data_[y];
+ // If someone already changed roots => retry
+ if (!x_entry.root || !y_entry.root)
+ continue;
+
+ // We need to link the smallest subtree to the largest
+ x_size = x_entry.data, y_size = y_entry.data;
+ x_aux = x_entry.aux, y_aux = y_entry.aux;
+ if (x_size > y_size || (x_size == y_size && x > y)) {
+ std::swap(x, y);
+ std::swap(x_size, y_size);
+ std::swap(x_aux, y_aux);
+ std::swap(x_entry, y_entry);
+ }
+
+ // Link 'x' to 'y'. If someone already changed 'x' => try again.
+ atomic_set_t new_x_entry = {.data = y, .aux = x_aux, .root = false};
+ if (!data_[x].compare_exchange_strong(x_entry, new_x_entry))
+ continue;
+
+ break;
+ }
+
+ // Step two: update the size. We already linked 'x' to 'y'. Therefore we
+ // need to add 'x_size' to whichever value is currently inside 'y'.
+ while (true) {
+ y = find_set(y);
+ atomic_set_t y_entry = data_[y];
+ // If someone already changed the roots => retry
+ if (!y_entry.root)
+ continue;
+
+ // Update the size. If someone already changed 'y' => try again.
+ atomic_set_t new_y_entry = {.data = x_size + y_entry.data, .aux = y_aux, .root = true};
+ if (!data_[y].compare_exchange_strong(y_entry, new_y_entry))
+ continue;
+
+ break;
+ }
+ }
+
+ size_t set_size(size_t i) const {
+ while (true) {
+ size_t el = find_set(i);
+ atomic_set_t entry = data_[el];
+ if (!entry.root)
+ continue;
+
+ return entry.data;
+ }
+ }
+
+ size_t find_set(size_t x) const {
+ // Step one: find the root
+ size_t r = x;
+ atomic_set_t r_entry = data_[r];
+ while (!r_entry.root) {
+ r = r_entry.data;
+ r_entry = data_[r];
+ }
+
+ // Step two: traverse the path from 'x' to root trying to update the links
+ // Note that the links might change, therefore we stop as soon as we'll
+ // end at 'some' root.
+ while (x != r) {
+ atomic_set_t x_entry = data_[x];
+ if (x_entry.root)
+ break;
+
+ // Try to update parent (may fail, it's ok)
+ atomic_set_t new_x_entry = {.data = r, .aux = x_entry.aux, .root = false};
+ data_[x].compare_exchange_weak(x_entry, new_x_entry);
+ x = x_entry.data;
+ }
+
+ return x;
+ }
+
+ bool same(size_t x, size_t y) const {
+ while (true) {
+ x = find_set(x);
+ y = find_set(y);
+ if (x == y)
+ return true;
+ if (data_[x].load().root)
+ return false;
+ }
+ }
+
+ size_t num_sets() const {
+ size_t count = 0;
+ for (const auto &entry : data_) {
+ count += entry.load(std::memory_order_relaxed).root;
+ }
+
+ return count;
+ }
+
+ bool is_root(size_t x) const {
+ return data_[x].load(std::memory_order_relaxed).root;
+ }
+
+ uint64_t aux(size_t x) const {
+ return data_[x].load(std::memory_order_relaxed).aux;
+ }
+
+ uint64_t root_aux(size_t x) const {
+ while (true) {
+ x = find_set(x);
+ atomic_set_t entry = data_[x];
+
+ if (!entry.root)
+ continue;
+
+ return entry.aux;
+ }
+ }
+
+ void set_aux(size_t x, uint64_t data) {
+ while (true) {
+ atomic_set_t x_entry = data_[x];
+ atomic_set_t new_x_entry = {.data = x_entry.data, .aux = data, .root = x_entry.root};
+ if (!data_[x].compare_exchange_strong(x_entry, new_x_entry))
+ continue;
+
+ break;
+ }
+ }
+
+ void set_root_aux(size_t x, uint64_t data) {
+ while (true) {
+ x = find_set(x);
+ atomic_set_t x_entry = data_[x];
+ if (!x_entry.root)
+ continue;
+
+ atomic_set_t new_x_entry = {.data = x_entry.data, .aux = data, .root = true};
+ if (!data_[x].compare_exchange_strong(x_entry, new_x_entry))
+ continue;
+
+ break;
+ }
+ }
+
+ size_t extract_to_file(const std::string &Prefix) {
+ // First, touch all the sets to make them directly connect to the root
+# pragma omp parallel for
+ for (size_t x = 0; x < data_.size(); ++x)
+ (void) find_set(x);
+
+ std::unordered_map<size_t, size_t> sizes;
+
+#if 0
+ for (size_t x = 0; x < size; ++x) {
+ if (data_[x].parent != x) {
+ size_t t = data_[x].parent;
+ VERIFY(data_[t].parent == t)
+ }
+ }
+#endif
+
+ // Insert all the root elements into the map
+ sizes.reserve(num_sets());
+ for (size_t x = 0; x < data_.size(); ++x) {
+ if (is_root(x))
+ sizes[x] = 0;
+ }
+
+ // Now, calculate the counts. We can do this in parallel, because we know no
+ // insertion can occur.
+# pragma omp parallel for
+ for (size_t x = 0; x < data_.size(); ++x) {
+ size_t &entry = sizes[parent(x)];
+# pragma omp atomic
+ entry += 1;
+ }
+
+ // Now we know the sizes of each cluster. Go over again and calculate the
+ // file-relative (cumulative) offsets.
+ size_t off = 0;
+ for (size_t x = 0; x < data_.size(); ++x) {
+ if (is_root(x)) {
+ size_t &entry = sizes[x];
+ size_t noff = off + entry;
+ entry = off;
+ off = noff;
+ }
+ }
+
+ // Write down the entries
+ std::vector<size_t> out(off);
+ for (size_t x = 0; x < data_.size(); ++x) {
+ size_t &entry = sizes[parent(x)];
+ out[entry++] = x;
+ }
+ std::ofstream os(Prefix, std::ios::binary | std::ios::out);
+ os.write((char *) &out[0], out.size() * sizeof(out[0]));
+ os.close();
+
+ // Write down the sizes
+ MMappedRecordWriter<size_t> index(Prefix + ".idx");
+ index.reserve(sizes.size());
+ size_t *idx = index.data();
+ for (size_t x = 0, i = 0, sz = 0; x < data_.size(); ++x) {
+ if (is_root(x)) {
+ idx[i++] = sizes[x] - sz;
+ sz = sizes[x];
+ }
+ }
+
+ return sizes.size();
+ }
+
+ void get_sets(std::vector<std::vector<size_t> > &otherWay) {
+ otherWay.resize(data_.size());
+ for (size_t i = 0; i < data_.size(); i++) {
+ size_t set = find_set(i);
+ otherWay[set].push_back(i);
+ }
+ otherWay.erase(remove_if(otherWay.begin(), otherWay.end(), zero_size),
+ otherWay.end());
+ }
+
+private:
+ size_t parent(size_t x) const {
+ atomic_set_t val = data_[x];
+ return (val.root ? x : val.data);
+ }
+
+ static bool zero_size(const std::vector<size_t> &v) {
+ return v.size() == 0;
+ }
+
+ mutable std::vector<std::atomic<atomic_set_t> > data_;
+};
+
+#pragma GCC diagnostic pop
+
+#endif /* CONCURRENTDSU_HPP_ */
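
ConcurrentDSU is a lock-free union-find over packed 8-byte entries; unite/find_set/same may be called concurrently, while extract_to_file and get_sets are meant for the single-threaded post-processing phase. A single-threaded usage sketch (the include path and the mmapped_writer dependency being available are assumptions):

    // sketch only
    #include "adt/concurrent_dsu.hpp"
    #include <iostream>

    int main() {
        ConcurrentDSU dsu(8);                    // 8 singleton sets: {0} ... {7}
        dsu.unite(0, 1);
        dsu.unite(1, 2);
        dsu.unite(5, 6);
        std::cout << dsu.same(0, 2) << ' '       // 1 -- 0 and 2 share a root
                  << dsu.set_size(2) << ' '      // 3
                  << dsu.num_sets() << '\n';     // 5: {0,1,2} {3} {4} {5,6} {7}
        return 0;
    }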
diff --git a/src/utils/adt/filter_iterator.hpp b/src/utils/adt/filter_iterator.hpp
new file mode 100644
index 0000000..fc5293a
--- /dev/null
+++ b/src/utils/adt/filter_iterator.hpp
@@ -0,0 +1,49 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#ifndef FILTER_ITERATOR_H_
+#define FILTER_ITERATOR_H_
+
+/**
+ * Iterator with some predicate -- iterates only on elements with predicate(item) == true
+ */
+template<typename iterator_type, typename predicate_type>
+class filter_iterator {
+public:
+ typedef typename iterator_type::value_type value_type;
+
+ filter_iterator(const iterator_type& begin, const iterator_type& end, const predicate_type& pred):
+ current_(begin), end_(end), pred_(pred)
+ {
+ while((current_ != end_) && (!pred_(*current_))) // skip leading non-matching elements; note the duplication with advance() below (DRY)
+ ++current_;
+ } // filter_iterator
+
+ value_type operator*() const { return *current_; }
+ value_type operator->() const { return *current_; }
+
+ filter_iterator& operator++() { advance(); return *this; }
+
+ bool operator==(const filter_iterator& rhs) const { return current_ == rhs.current_; }
+ bool operator!=(const filter_iterator& rhs) const { return !(operator==(rhs)); }
+
+private:
+ void advance()
+ {
+ do
+ {
+ ++current_;
+ }
+ while((current_ != end_) && (!pred_(*current_)));
+ } // advance
+
+ iterator_type current_;
+ iterator_type end_;
+ predicate_type pred_;
+};
+
+#endif /* FILTER_ITERATOR_H_ */
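
filter_iterator wraps another iterator and skips elements failing the predicate; note that both the begin and the end wrappers carry the predicate. Illustrative sketch (the include path is an assumption):

    // sketch only
    #include "adt/filter_iterator.hpp"
    #include <iostream>
    #include <vector>

    struct is_even {
        bool operator()(int x) const { return x % 2 == 0; }
    };

    int main() {
        std::vector<int> v{1, 2, 3, 4, 5, 6, 7};
        typedef filter_iterator<std::vector<int>::const_iterator, is_even> filter_it;
        filter_it it(v.cbegin(), v.cend(), is_even());
        filter_it end(v.cend(), v.cend(), is_even());
        for (; it != end; ++it)
            std::cout << *it << ' ';             // prints 2 4 6
        return 0;
    }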
diff --git a/src/include/adt/flat_map.hpp b/src/utils/adt/flat_map.hpp
similarity index 100%
rename from src/include/adt/flat_map.hpp
rename to src/utils/adt/flat_map.hpp
diff --git a/src/include/adt/flat_set.hpp b/src/utils/adt/flat_set.hpp
similarity index 100%
rename from src/include/adt/flat_set.hpp
rename to src/utils/adt/flat_set.hpp
diff --git a/src/utils/adt/function_traits.hpp b/src/utils/adt/function_traits.hpp
new file mode 100644
index 0000000..5729a41
--- /dev/null
+++ b/src/utils/adt/function_traits.hpp
@@ -0,0 +1,76 @@
+#ifndef __ADT_FUNCTION_TRAITS__
+#define __ADT_FUNCTION_TRAITS__
+
+#pragma once
+
+#include <functional>
+
+namespace adt {
+
+template<class F>
+struct function_traits;
+
+// function pointer
+template<class R, class... Args>
+struct function_traits<R(*)(Args...)> : public function_traits<R(Args...)> {
+};
+
+// member function pointer
+template<class C, class R, class... Args>
+struct function_traits<R(C::*)(Args...)> : public function_traits<R(C &, Args...)> {
+};
+
+// const member function pointer
+template<class C, class R, class... Args>
+struct function_traits<R(C::*)(Args...) const> : public function_traits<R(C &, Args...)> {
+};
+
+// member object pointer
+template<class C, class R>
+struct function_traits<R(C::*)> : public function_traits<R(C &)> {
+};
+
+template<class R, class... Args>
+struct function_traits<R(Args...)> {
+ using return_type = R;
+
+ static constexpr std::size_t arity = sizeof...(Args);
+
+ template<std::size_t N>
+ struct arg {
+ static_assert(N < arity, "invalid argument index");
+ using type = typename std::tuple_element<N, std::tuple<Args...>>::type;
+ };
+};
+
+template<class F>
+struct function_traits<F &> : public function_traits<F> {
+};
+
+template<class F>
+struct function_traits<F &&> : public function_traits<F> {
+};
+
+// functors & default implementation
+template<class F>
+struct function_traits {
+private:
+ using call_type = function_traits<decltype(&F::operator())>;
+
+public:
+ using return_type = typename call_type::return_type;
+
+ // Remember: the implicit object argument of operator() is dropped here
+ static constexpr std::size_t arity = call_type::arity - 1;
+
+ template<std::size_t N>
+ struct arg {
+ static_assert(N < arity, "invalid argument index");
+ // Remember: shift the index by one to skip the implicit object argument
+ using type = typename call_type::template arg<N + 1>::type;
+ };
+};
+
+} // namespace adt
+
+#endif // __ADT_FUNCTION_TRAITS__
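
adt::function_traits exposes the return type, arity and argument types of free functions, member pointers, lambdas and functors at compile time; for functors the implicit object argument is dropped. A compile-time sketch (the include path is an assumption):

    // sketch only -- everything is checked at compile time
    #include "adt/function_traits.hpp"
    #include <type_traits>

    int add(int a, long b) { return static_cast<int>(a + b); }

    int main() {
        using tr = adt::function_traits<decltype(&add)>;
        static_assert(tr::arity == 2, "two explicit parameters");
        static_assert(std::is_same<tr::return_type, int>::value, "returns int");
        static_assert(std::is_same<tr::arg<0>::type, int>::value, "first parameter is int");

        auto doubler = [](double x) { return 2.0 * x; };
        using ltr = adt::function_traits<decltype(doubler)>;
        static_assert(ltr::arity == 1, "the implicit object argument is dropped for functors");
        return 0;
    }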
diff --git a/src/utils/adt/iterator_range.hpp b/src/utils/adt/iterator_range.hpp
new file mode 100644
index 0000000..7b5db6b
--- /dev/null
+++ b/src/utils/adt/iterator_range.hpp
@@ -0,0 +1,50 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#ifndef __ITERATOR_RANGE_H__
+#define __ITERATOR_RANGE_H__
+
+#include <utility>
+#include <iterator>
+
+namespace adt {
+
+template<typename IteratorT>
+class iterator_range {
+ IteratorT begin_iterator, end_iterator;
+
+public:
+ template<typename Container>
+ iterator_range(Container &&c)
+ //TODO: Consider ADL/non-member begin/end calls.
+ : begin_iterator(c.begin()), end_iterator(c.end()) { }
+
+ iterator_range(IteratorT begin_iterator, IteratorT end_iterator)
+ : begin_iterator(std::move(begin_iterator)),
+ end_iterator(std::move(end_iterator)) { }
+
+ IteratorT begin() const { return begin_iterator; }
+
+ IteratorT end() const { return end_iterator; }
+};
+
+template<class T>
+iterator_range<T> make_range(T x, T y) {
+ return iterator_range<T>(std::move(x), std::move(y));
+}
+
+template<typename T>
+iterator_range<T> make_range(std::pair<T, T> p) {
+ return iterator_range<T>(std::move(p.first), std::move(p.second));
+}
+
+template<typename T>
+iterator_range<decltype(begin(std::declval<T>()))> drop_begin(T &&t, int n) {
+ return make_range(std::next(begin(t), n), end(t));
+}
+} // namespace adt
+
+#endif
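
adt::iterator_range and its helpers adapt iterator pairs, e.g. the result of equal_range, to range-based for loops. Illustrative sketch (the include path is an assumption):

    // sketch only
    #include "adt/iterator_range.hpp"
    #include <iostream>
    #include <map>
    #include <vector>

    int main() {
        std::multimap<int, char> m{{1, 'a'}, {1, 'b'}, {2, 'c'}};
        for (const auto &kv : adt::make_range(m.equal_range(1)))
            std::cout << kv.second << ' ';       // prints a b

        std::vector<int> v{10, 20, 30, 40};
        for (int x : adt::drop_begin(v, 2))
            std::cout << x << ' ';               // prints 30 40
        return 0;
    }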
diff --git a/src/utils/adt/kmer_hash_vector.hpp b/src/utils/adt/kmer_hash_vector.hpp
new file mode 100644
index 0000000..f2b6861
--- /dev/null
+++ b/src/utils/adt/kmer_hash_vector.hpp
@@ -0,0 +1,370 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+/*
+ * kmer_hash_vector.hpp
+ *
+ * Created on: Jul 19, 2012
+ * Author: alex
+ */
+
+#ifndef KMER_HASH_VECTOR_HPP_
+#define KMER_HASH_VECTOR_HPP_
+
+
+#include "data_structures/sequence/runtime_k.hpp"
+#include "kmer_map.hpp"
+
+
+namespace runtime_k {
+
+class IKmerHashVector {
+
+protected:
+ static const size_t LOAD_OVERHEAD = 1000;
+
+ size_t nthreads_;
+
+ size_t cell_size_;
+
+public:
+ typedef RtSeq input_value_type;
+
+ IKmerHashVector(size_t nthreads)
+ : nthreads_ (nthreads)
+ , cell_size_ (LOAD_OVERHEAD) {
+ }
+
+ virtual ~IKmerHashVector() {
+
+ }
+
+ virtual IKmerHashVector * copy() const = 0;
+
+ virtual void clear() = 0;
+
+ virtual void clear(size_t i) = 0;
+
+ virtual bool is_full() const = 0;
+
+ virtual bool is_presisely_full() const = 0;
+
+ virtual size_t capacity(size_t i) const = 0;
+
+ virtual size_t size(size_t i) const = 0;
+
+
+ virtual void insert(const input_value_type& value) = 0;
+
+ virtual void reserve(size_t cell_size) = 0;
+
+
+ virtual size_t get_k() const = 0;
+
+ size_t get_threads_num() const
+ {
+ return nthreads_;
+ }
+
+ virtual void dump (KmerMap<int>& destination, size_t bucketNum) = 0;
+};
+
+
+
+class KmerHashVector {
+
+public:
+
+ typedef IKmerHashVector base_vector_type;
+
+private:
+
+ base_vector_type * data_;
+
+public:
+
+ typedef KmerHashVector vector_type;
+
+ typedef base_vector_type::input_value_type input_value_type;
+
+
+ KmerHashVector(size_t k, size_t nthreads);
+
+ KmerHashVector(base_vector_type * vec): data_(vec) {
+ }
+
+ KmerHashVector(const vector_type& vec) {
+ data_ = vec.data_->copy();
+ }
+
+ vector_type& operator=(const vector_type& vec) {
+ if (vec.data_ != data_) {
+ delete data_;
+ data_ = vec.data_->copy();
+ }
+
+ return *this;
+ }
+
+ ~KmerHashVector() {
+ delete data_;
+ }
+
+
+
+ bool is_full() const {
+ return data_->is_full();
+ }
+
+ bool is_presisely_full() const {
+ return data_->is_presisely_full();
+ }
+
+ size_t get_threads_num() const
+ {
+ return data_->get_threads_num();
+ }
+
+
+ void insert(const input_value_type& value) {
+ data_->insert(value);
+ }
+
+ void clear() {
+ data_->clear();
+ }
+
+
+ void clear(size_t i) {
+ data_->clear(i);
+ }
+
+ size_t get_k() const {
+ return data_->get_k();
+ }
+
+ size_t capacity(size_t i) const {
+ return data_->capacity(i);
+ }
+
+ void reserve(size_t cell_size) {
+ data_->reserve(cell_size);
+ }
+
+ base_vector_type * get_data() const {
+ return data_;
+ }
+
+ void print_sizes() {
+ for (size_t i = 0; i < data_->get_threads_num(); ++i) {
+ INFO("Size " << i << ": " << data_->size(i));
+ }
+ }
+
+ void dump (KmerMap<int>& destination, size_t bucketNum) {
+ data_->dump(destination, bucketNum);
+ }
+};
+
+
+// ================================= VECTOR IMPLEMENTATION =================================
+
+template <size_t size_>
+class KmerHashVectorImpl: public IKmerHashVector {
+
+public:
+
+ typedef TypeContainerImpl<size_> type_container;
+
+ typedef typename type_container::Kmer Kmer;
+
+ typedef typename type_container::vector_type vector_type;
+
+ typedef std::vector<vector_type> data_type;
+
+ typedef IKmerHashVector base_type;
+
+ typedef typename base_type::input_value_type input_value_type;
+
+private:
+
+ data_type data_;
+
+ size_t k_;
+
+public:
+
+ KmerHashVectorImpl(size_t k, size_t nthreads):
+ IKmerHashVector(nthreads)
+ , data_ (nthreads)
+ , k_ (k) {
+ }
+
+ virtual base_type * copy() const {
+ return new KmerHashVectorImpl<size_>(*this);
+ }
+
+ virtual bool is_full() const {
+ return data_[0].size() >= cell_size_;
+ }
+
+ virtual bool is_presisely_full() const {
+ for (size_t i = 0; i < nthreads_; ++i) {
+ if (data_[i].size() >= cell_size_)
+ return true;
+ }
+ return false;
+ }
+
+ virtual void insert(const input_value_type& value) {
+ Kmer kmer = type_container::from_sequence(value);
+ data_[kmer.GetHash() % nthreads_].push_back(kmer);
+ }
+
+ virtual void clear() {
+ for (size_t i = 0; i < nthreads_; ++i) {
+ data_[i].clear();
+ }
+ }
+
+ virtual void clear(size_t i) {
+ data_[i].clear();
+ }
+
+ virtual size_t get_k() const {
+ return k_;
+ }
+
+ virtual size_t capacity(size_t i) const {
+ return data_[i].capacity();
+ }
+
+ virtual size_t size(size_t i) const {
+ return data_[i].size();
+ }
+
+ virtual void reserve(size_t cell_size) {
+ cell_size_ = cell_size;
+ for (size_t i = 0; i < nthreads_; ++i) {
+ data_[i].reserve(cell_size_ + LOAD_OVERHEAD);
+ }
+ }
+
+ const data_type& get_data() const {
+ return data_;
+ }
+
+ virtual void dump (KmerMap<int>& destination, size_t bucketNum) {
+ KmerMapImpl<size_, int>& destImpl = dynamic_cast<KmerMapImpl<size_, int>&>(destination.get_data());
+
+ for (auto it = data_[bucketNum].begin(), end = data_[bucketNum].end(); it != end; ++it) {
+ ++destImpl[*it];
+ }
+ }
+};
+
+
+// ================================= VECTOR FACTORIES =================================
+// Single factory interface
+class SingleKmerHashVectorFactory {
+
+public:
+
+ virtual IKmerHashVector * GetHashVector(size_t k, size_t nthreads) const = 0;
+
+ virtual ~SingleKmerHashVectorFactory() {
+
+ }
+};
+
+
+// Single factory for specific k and value
+template <size_t ts_>
+class SingleKmerHashVectorFactoryImpl: public SingleKmerHashVectorFactory {
+
+public:
+
+ virtual IKmerHashVector * GetHashVector(size_t k, size_t nthreads) const {
+ VERIFY_MSG(GET_UPPER_BOUND(k) == GET_K_BY_TS(ts_), k << " -> " << GET_UPPER_BOUND(k) << ", " << ts_ << " -> " << GET_K_BY_TS(ts_));
+ //INFO(k << " -> " << GET_UPPER_BOUND(k) << ", " << ts_ << " -> " << GET_K_BY_TS(ts_));
+
+ return new KmerHashVectorImpl< GET_K_BY_TS(ts_) >(k, nthreads);
+ }
+
+};
+
+// Factory generator
+template<size_t ts_>
+class HashVectorGenerator {
+
+public:
+
+ static void GenerateHashVectors(std::vector< SingleKmerHashVectorFactory* > & factories) {
+ factories[ts_] = new SingleKmerHashVectorFactoryImpl<ts_>();
+ HashVectorGenerator<ts_ - 1> :: GenerateHashVectors (factories);
+ }
+};
+
+//Terminating factory generator
+template<>
+class HashVectorGenerator<MIN_TS> {
+
+public:
+
+ static void GenerateHashVectors(std::vector< SingleKmerHashVectorFactory* > & factories) {
+ factories[MIN_TS] = new SingleKmerHashVectorFactoryImpl<MIN_TS>;
+ }
+};
+
+
+// Lazy singleton providing a factory for every required K value
+class KmerHashVectorFactory {
+
+private:
+
+ std::vector < SingleKmerHashVectorFactory* > single_factories_;
+
+ KmerHashVectorFactory() {
+ VERIFY_MSG(MIN_K <= MAX_K, "Invalid K value range");
+
+ single_factories_ = std::vector < SingleKmerHashVectorFactory* >(MAX_TS + 1);
+ HashVectorGenerator<MAX_TS>::GenerateHashVectors(single_factories_);
+ }
+
+public:
+
+ static KmerHashVectorFactory& GetInstance() {
+ static KmerHashVectorFactory instance;
+
+ return instance;
+ }
+
+ KmerHashVector GetHashVector(size_t k, size_t nthreads) {
+ VERIFY_MSG(k >= MIN_K && k <= MAX_K, "K value " + ToString(k) + " is not supported, should be >= " +
+ ToString(MIN_K) + " and <= " + ToString(MAX_K));
+
+ return KmerHashVector(single_factories_[GET_T_ELEMENTS_NUMBER(k)]->GetHashVector(k, nthreads));
+ }
+
+ IKmerHashVector * GetRawHashVector(size_t k, size_t nthreads) {
+ VERIFY_MSG(k >= MIN_K && k <= MAX_K, "K value " + ToString(k) + " is not supported, should be >= " +
+ ToString(MIN_K) + " and <= " + ToString(MAX_K));
+
+ return single_factories_[GET_T_ELEMENTS_NUMBER(k)]->GetHashVector(k, nthreads);
+ }
+};
+
+inline KmerHashVector GetHashVector(size_t k, size_t nthreads) {
+ return KmerHashVectorFactory::GetInstance().GetHashVector(k, nthreads);
+}
+
+inline KmerHashVector::KmerHashVector(size_t k, size_t nthreads): data_(KmerHashVectorFactory::GetInstance().GetRawHashVector(k, nthreads)) {
+}
+
+} //namespace runtime_k
+
+#endif /* KMER_HASH_VECTOR_HPP_ */
diff --git a/src/utils/adt/kmer_vector.hpp b/src/utils/adt/kmer_vector.hpp
new file mode 100644
index 0000000..06b9eb3
--- /dev/null
+++ b/src/utils/adt/kmer_vector.hpp
@@ -0,0 +1,179 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#ifndef __KMER_VECTOR_HPP__
+#define __KMER_VECTOR_HPP__
+
+#include "array_vector.hpp"
+#include "config.hpp"
+
+#ifdef SPADES_USE_JEMALLOC
+
+# include <jemalloc/jemalloc.h>
+
+#endif
+
+template<class Seq>
+class KMerVector {
+private:
+ typedef typename Seq::DataType ElTy;
+
+ ElTy *realloc() {
+#ifdef SPADES_USE_JEMALLOC
+ // First, try to expand in-place
+ if (storage_ && sizeof(ElTy) * capacity_ * el_sz_ > 4096 &&
+ je_rallocm((void **) &storage_, NULL, sizeof(ElTy) * capacity_ * el_sz_, 0, ALLOCM_NO_MOVE) ==
+ ALLOCM_SUCCESS)
+ return storage_;
+
+ // Failed, do usual malloc / memcpy / free cycle
+ ElTy *res = (ElTy *) je_malloc(sizeof(ElTy) * capacity_ * el_sz_);
+ if (storage_)
+ std::memcpy(res, storage_, size_ * sizeof(ElTy) * el_sz_);
+ je_free(storage_);
+ storage_ = res;
+#else
+ // No JEMalloc, no cookies
+ ElTy *res = new ElTy[capacity_ * el_sz_];
+ if (storage_)
+ std::memcpy(res, storage_, size_ * sizeof(ElTy) * el_sz_);
+
+ delete[] storage_;
+ storage_ = res;
+#endif
+
+ return storage_;
+ }
+
+public:
+ typedef typename array_vector<ElTy>::reference reference;
+ typedef typename array_vector<ElTy>::value_type value_type;
+ typedef typename array_vector<ElTy>::iterator iterator;
+ typedef typename array_vector<ElTy>::const_iterator const_iterator;
+
+ typedef array_less<ElTy> less2_fast;
+ typedef array_equal_to<ElTy> equal_to;
+
+ explicit KMerVector(unsigned K, size_t capacity = 1)
+ : K_(K), size_(0), capacity_(std::max(capacity, (size_t) 1)), el_sz_(Seq::GetDataSize(K)), storage_(NULL),
+ vector_(realloc(), size_, el_sz_) {
+ }
+
+ KMerVector(KMerVector &&that)
+ : K_(that.K_), size_(that.size_), capacity_(that.capacity_), el_sz_(that.el_sz_), storage_(that.storage_),
+ vector_(storage_, size_, el_sz_) {
+ that.storage_ = NULL;
+ }
+
+ KMerVector(const KMerVector &that)
+ : K_(that.K_), size_(that.size_), capacity_(that.capacity_), el_sz_(that.el_sz_), storage_(NULL),
+ vector_(realloc(), size_, el_sz_) {
+ memcpy(storage_, that.storage_, size_ * sizeof(ElTy) * el_sz_);
+ }
+
+ ~KMerVector() {
+#ifdef SPADES_USE_JEMALLOC
+ je_free(storage_);
+#else
+ delete[] storage_;
+#endif
+ }
+
+ KMerVector &operator=(const KMerVector &that) {
+ if (this != &that) {
+ K_ = that.K_;
+ size_ = that.size_;
+ capacity_ = that.capacity_;
+ el_sz_ = that.el_sz_;
+
+ storage_ = NULL;
+ realloc();
+ memcpy(storage_, that.storage_, size_ * sizeof(ElTy) * el_sz_);
+
+ vector_.set_data(storage_);
+ vector_.set_size(size_);
+ }
+
+ return *this;
+ }
+
+ void push_back(const ElTy *data) {
+ if (capacity_ == size_)
+ reserve(capacity_ * 2);
+
+ vector_[size_] = data;
+ size_ += 1;
+ vector_.set_size(size_);
+ }
+
+ void push_back(const Seq &s) {
+ push_back(s.data());
+ }
+
+ void reserve(size_t amount) {
+ if (capacity_ < amount) {
+ capacity_ = amount;
+ vector_.set_data(realloc());
+ }
+ }
+
+ void clear() {
+ size_ = 0;
+ vector_.set_size(size_);
+ }
+
+ iterator begin() {
+ return vector_.begin();
+ }
+
+ const_iterator begin() const {
+ return vector_.begin();
+ }
+
+ iterator end() {
+ return vector_.end();
+ }
+
+ const_iterator end() const {
+ return vector_.end();
+ }
+
+ const ElTy *data() const {
+ return storage_;
+ }
+
+ size_t size() const {
+ return size_;
+ }
+
+ size_t el_size() const {
+ return el_sz_;
+ }
+
+ size_t el_data_size() const {
+ return el_sz_ * sizeof(ElTy);
+ }
+
+ size_t capacity() const {
+ return capacity_;
+ }
+
+ const ElTy *operator[](size_t idx) const {
+ return vector_[idx];
+ }
+
+private:
+ unsigned K_;
+ size_t size_;
+ size_t capacity_;
+ size_t el_sz_;
+ ElTy *storage_;
+ array_vector<ElTy> vector_;
+};
+
+
+#endif /* __KMER_VECTOR_HPP__ */
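
KMerVector<Seq> only relies on Seq providing DataType, GetDataSize(K) and data(); it packs k-mers into a flat, optionally jemalloc-backed buffer viewed through array_vector. The toy Seq below merely spells out that contract and is not a SPAdes type; the include path is likewise an assumption:

    // sketch only
    #include "adt/kmer_vector.hpp"
    #include <cstdint>
    #include <iostream>

    struct ToySeq {
        typedef uint64_t DataType;                               // KMerVector stores raw words of this type
        static size_t GetDataSize(unsigned /*K*/) { return 2; }  // words per stored k-mer
        uint64_t words[2];
        const uint64_t *data() const { return words; }
    };

    int main() {
        KMerVector<ToySeq> kv(/* K */ 55, /* capacity */ 4);
        ToySeq s = {{0xDEADBEEFull, 0x42ull}};
        kv.push_back(s);                         // copies el_size() words into the flat storage
        kv.push_back(s.data());                  // or push the raw words directly
        std::cout << kv.size() << ' ' << kv[1][0] << '\n';   // 2 and the first stored word
        return 0;
    }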
diff --git a/src/utils/adt/parallel_seq_vector.hpp b/src/utils/adt/parallel_seq_vector.hpp
new file mode 100644
index 0000000..209cb84
--- /dev/null
+++ b/src/utils/adt/parallel_seq_vector.hpp
@@ -0,0 +1,110 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "utils/adt/parallel_unordered_map.hpp"
+#include "dev_support/openmp_wrapper.h"
+
+#include "data_structures/sequence/runtime_k.hpp"
+#include "kmer_map.hpp"
+#include "kmer_hash_vector.hpp"
+
+class ParallelSeqVector {
+
+public:
+ typedef runtime_k::KmerHashVector par_container_t;
+
+ typedef runtime_k::KmerMap<int> destination_container_t;
+
+ typedef runtime_k::RtSeq Kmer;
+
+private:
+
+ size_t k_;
+
+ size_t nthreads_;
+
+ std::vector<par_container_t> nodes_;
+
+public:
+
+ ParallelSeqVector(size_t k, size_t nthreads, size_t cell_size) :
+ k_(k),
+ nthreads_(nthreads),
+ nodes_() {
+ for (size_t i = 0; i < nthreads_; ++i) {
+ nodes_.push_back(runtime_k::GetHashVector(k_, nthreads_));
+ }
+
+ for (size_t i = 0; i < nthreads_; ++i) {
+ nodes_[i].reserve(cell_size);
+ }
+ }
+
+
+ void AddEdge(const Kmer &kmer, size_t thread_number) {
+ nodes_[thread_number].insert(kmer);
+ }
+
+ void CountSequence(const Sequence& s, size_t thread_number) {
+ if (s.size() < k_)
+ return;
+
+ Kmer kmer = s.start<Kmer>(k_);
+
+ AddEdge(kmer, thread_number);
+ for (size_t j = k_; j < s.size(); ++j) {
+ kmer <<= s[j];
+ AddEdge(kmer, thread_number);
+ }
+
+ }
+//
+// void MergeMaps(destination_container_t & dest_container, size_t i) {
+// for (size_t j = 0; j < nthreads_; ++j) {
+// dest_container.transfer(nodes_[j], i);
+// }
+// }
+
+ void Dump(destination_container_t & bucket, size_t bucket_number) {
+ for (size_t i = 0; i < nodes_.size(); ++i) {
+ nodes_[i].dump(bucket, bucket_number);
+ nodes_[i].clear(bucket_number);
+ }
+ }
+
+
+ size_t SingleBucketCount() const {
+ return nodes_[0].capacity(0);
+ }
+
+ bool IsFull(size_t i) const {
+ return nodes_[i].is_full();
+ }
+
+ void Clear(size_t i) {
+ nodes_[i].clear();
+ }
+
+ void Clear() {
+ for (size_t i = 0; i < nthreads_; ++i) {
+ nodes_[i].clear();
+ }
+ }
+
+ void print_sizes() {
+ for (size_t i = 0; i < nodes_.size(); ++i) {
+ INFO("Size " << i << "::: ");
+ nodes_[i].print_sizes();
+ }
+ }
+
+
+};
diff --git a/src/include/adt/parallel_unordered_map.hpp b/src/utils/adt/parallel_unordered_map.hpp
similarity index 100%
rename from src/include/adt/parallel_unordered_map.hpp
rename to src/utils/adt/parallel_unordered_map.hpp
diff --git a/src/utils/adt/pointer_iterator.hpp b/src/utils/adt/pointer_iterator.hpp
new file mode 100644
index 0000000..3f2e5a2
--- /dev/null
+++ b/src/utils/adt/pointer_iterator.hpp
@@ -0,0 +1,174 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#ifndef __HAMMER_POINTER_ITERATOR_HPP__
+#define __HAMMER_POINTER_ITERATOR_HPP__
+
+#include <iterator>
+#include <algorithm>
+#include <iostream>
+#include <stdexcept>
+#include <vector>
+
+template<typename T>
+class pointer_iterator : public std::iterator<std::random_access_iterator_tag, T> {
+protected:
+ T *data_;
+
+public:
+ typedef std::random_access_iterator_tag iterator_category;
+ typedef typename std::iterator<std::random_access_iterator_tag, T>::value_type value_type;
+ typedef typename std::iterator<std::random_access_iterator_tag, T>::difference_type difference_type;
+ typedef typename std::iterator<std::random_access_iterator_tag, T>::reference reference;
+ typedef typename std::iterator<std::random_access_iterator_tag, T>::pointer pointer;
+
+ pointer_iterator() : data_(NULL) { }
+
+ template<typename T2>
+ pointer_iterator(const pointer_iterator<T2> &r) : data_(&(*r)) { }
+
+ pointer_iterator(pointer data) : data_(data) { }
+
+ template<typename T2>
+ pointer_iterator &operator=(const pointer_iterator<T2> &r) {
+ data_ = &(*r);
+ return *this;
+ }
+
+ pointer_iterator &operator++() {
+ data_ += 1;
+ return *this;
+ }
+
+ pointer_iterator &operator--() {
+ data_ -= 1;
+ return *this;
+ }
+
+ pointer_iterator operator++(int) {
+ pointer_iterator res = *this;
+ data_ += 1;
+
+ return res;
+ }
+
+ pointer_iterator operator--(int) {
+ pointer_iterator res = *this;
+ data_ -= 1;
+
+ return res;
+ }
+
+ pointer_iterator operator+(const difference_type &n) const {
+ return pointer_iterator(data_ + n);
+ }
+
+ pointer_iterator &operator+=(const difference_type &n) {
+ data_ += n;
+ return *this;
+ }
+
+ pointer_iterator operator-(const difference_type &n) const {
+ return pointer_iterator(pointer(data_ - n));
+ }
+
+ pointer_iterator &operator-=(const difference_type &n) {
+ data_ -= n;
+ return *this;
+ }
+
+ reference operator*() const {
+ return *data_;
+ }
+
+ pointer operator->() const {
+ return data_;
+ }
+
+ reference operator[](const difference_type &n) const {
+ return data_[n];
+ }
+
+ template<typename T2>
+ friend bool operator==(const pointer_iterator<T2> &r1,
+ const pointer_iterator<T2> &r2);
+
+ template<typename T2>
+ friend bool operator!=(const pointer_iterator<T2> &r1,
+ const pointer_iterator<T2> &r2);
+
+ template<typename T2>
+ friend bool operator<(const pointer_iterator<T2> &r1,
+ const pointer_iterator<T2> &r2);
+
+ template<typename T2>
+ friend bool operator>(const pointer_iterator<T2> &r1,
+ const pointer_iterator<T2> &r2);
+
+ template<typename T2>
+ friend bool operator<=(const pointer_iterator<T2> &r1,
+ const pointer_iterator<T2> &r2);
+
+ template<typename T2>
+ friend bool operator>=(const pointer_iterator<T2> &r1,
+ const pointer_iterator<T2> &r2);
+
+ template<typename T2>
+ friend typename pointer_iterator<T2>::difference_type
+ operator+(const pointer_iterator<T2> &r1,
+ const pointer_iterator<T2> &r2);
+
+ template<typename T2>
+ friend typename pointer_iterator<T2>::difference_type
+ operator-(const pointer_iterator<T2> &r1,
+ const pointer_iterator<T2> &r2);
+};
+
+template<typename T>
+inline bool operator==(const pointer_iterator<T> &r1,
+ const pointer_iterator<T> &r2) {
+ return (r1.data_ == r2.data_);
+}
+
+template<typename T>
+inline bool operator!=(const pointer_iterator<T> &r1,
+ const pointer_iterator<T> &r2) {
+ return (r1.data_ != r2.data_);
+}
+
+template<typename T>
+inline bool operator<(const pointer_iterator<T> &r1,
+ const pointer_iterator<T> &r2) {
+ return (r1.data_ < r2.data_);
+}
+
+template<typename T>
+inline bool operator>(const pointer_iterator<T> &r1,
+ const pointer_iterator<T> &r2) {
+ return (r1.data_ > r2.data_);
+}
+
+template<typename T>
+inline bool operator<=(const pointer_iterator<T> &r1,
+ const pointer_iterator<T> &r2) {
+ return (r1.data_ <= r2.data_);
+}
+
+template<typename T>
+inline bool operator>=(const pointer_iterator<T> &r1,
+ const pointer_iterator<T> &r2) {
+ return (r1.data_ >= r2.data_);
+}
+
+template<typename T>
+inline typename pointer_iterator<T>::difference_type
+operator-(const pointer_iterator<T> &r1,
+ const pointer_iterator<T> &r2) {
+ return (r1.data_ - r2.data_);
+}
+
+#endif // __HAMMER_POINTER_ITERATOR_HPP__
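
pointer_iterator is a thin random-access wrapper around a raw pointer, giving plain buffers the full STL iterator interface. Illustrative sketch (the include path is an assumption):

    // sketch only
    #include "adt/pointer_iterator.hpp"
    #include <algorithm>
    #include <iostream>

    int main() {
        int raw[] = {4, 1, 3, 2};
        pointer_iterator<int> first(raw), last(raw + 4);
        std::sort(first, last);                  // random-access requirements are satisfied
        for (pointer_iterator<int> it = first; it != last; ++it)
            std::cout << *it << ' ';             // prints 1 2 3 4
        return 0;
    }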
diff --git a/src/utils/adt/queue_iterator.hpp b/src/utils/adt/queue_iterator.hpp
new file mode 100644
index 0000000..c879541
--- /dev/null
+++ b/src/utils/adt/queue_iterator.hpp
@@ -0,0 +1,143 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#ifndef QUEUE_ITERATOR_HPP_
+#define QUEUE_ITERATOR_HPP_
+
+#include "dev_support/verify.hpp"
+#include <set>
+
+template<typename T, typename Comparator>
+class erasable_priority_queue {
+private:
+ std::set<T, Comparator> storage_;
+public:
+ /*
+ * Be careful! This constructor requires Comparator to have a default constructor even if you call it with
+ * a specific comparator. In that case just give Comparator a default constructor with VERIFY(false) inside it.
+ */
+ erasable_priority_queue(const Comparator& comparator = Comparator()) :
+ storage_(comparator) {
+ }
+
+ template<typename InputIterator>
+ erasable_priority_queue(InputIterator begin, InputIterator end,
+ const Comparator& comparator = Comparator()) :
+ storage_(begin, end, comparator) {
+ }
+
+ void pop() {
+ VERIFY(!storage_.empty());
+ storage_.erase(storage_.begin());
+ }
+
+ const T& top() const {
+ VERIFY(!storage_.empty());
+ return *(storage_.begin());
+ }
+
+ void push(const T& key) {
+ storage_.insert(key);
+ }
+
+ bool erase(const T& key) {
+ bool res = storage_.erase(key) > 0;
+ return res;
+ }
+
+ void clear() {
+ storage_.clear();
+ }
+
+ bool empty() const {
+ return storage_.empty();
+ }
+
+ size_t size() const {
+ return storage_.size();
+ }
+
+ template <class InputIterator>
+ void insert ( InputIterator first, InputIterator last ) {
+ storage_.insert(first, last);
+ }
+
+};
+
+template<typename T, typename Comparator = std::less<T>>
+class DynamicQueueIterator {
+
+ bool current_actual_;
+ bool current_deleted_;
+ T current_;
+ erasable_priority_queue<T, Comparator> queue_;
+
+public:
+
+ DynamicQueueIterator(const Comparator& comparator = Comparator()) :
+ current_actual_(false), current_deleted_(false), queue_(comparator) {
+ }
+
+ template<typename InputIterator>
+ void insert(InputIterator begin, InputIterator end) {
+ queue_.insert(begin, end);
+ }
+
+ void push(const T& to_add) {
+ queue_.push(to_add);
+ }
+
+ void erase(const T& to_remove) {
+ if (current_actual_ && to_remove == current_) {
+ current_deleted_ = true;
+ }
+ queue_.erase(to_remove);
+ }
+
+ void clear() {
+ queue_.clear();
+ current_actual_ = false;
+ current_deleted_ = false;
+ }
+
+ bool IsEnd() const {
+ return queue_.empty();
+ }
+
+ size_t size() const {
+ return queue_.size();
+ }
+
+ const T& operator*() {
+ VERIFY(!queue_.empty());
+ if(!current_actual_ || current_deleted_) {
+ current_ = queue_.top();
+ current_actual_ = true;
+ current_deleted_ = false;
+ }
+ return current_;
+ }
+
+ void operator++() {
+ if (!current_actual_) {
+ queue_.pop();
+ } else if (!current_deleted_) {
+ queue_.erase(current_);
+ }
+ current_actual_ = false;
+ }
+
+ //use carefully!
+ void ReleaseCurrent() {
+ current_actual_ = false;
+ }
+
+};
+
+
+#endif /* QUEUE_ITERATOR_HPP_ */
+
diff --git a/src/utils/adt/small_pod_vector.hpp b/src/utils/adt/small_pod_vector.hpp
new file mode 100644
index 0000000..184e1bc
--- /dev/null
+++ b/src/utils/adt/small_pod_vector.hpp
@@ -0,0 +1,399 @@
+#ifndef __ADT_SMALL_POD_VECTOR__
+#define __ADT_SMALL_POD_VECTOR__
+
+#pragma once
+
+#include <llvm/PointerIntPair.h>
+
+#include <vector>
+#include <type_traits>
+
+namespace adt {
+
+#define LIKELY(EXPR) __builtin_expect((bool)(EXPR), true)
+#define UNLIKELY(EXPR) __builtin_expect((bool)(EXPR), false)
+
+template<class T>
+class SmallPODVector {
+ template<typename PT1, typename PT2>
+ class PointerUnionTraits {
+ public:
+ static inline void *getAsVoidPointer(void *P) { return P; }
+
+ static inline void *getFromVoidPointer(void *P) { return P; }
+
+ enum {
+ PT1BitsAv = (int) (llvm::PointerLikeTypeTraits<PT1>::NumLowBitsAvailable),
+ PT2BitsAv = (int) (llvm::PointerLikeTypeTraits<PT2>::NumLowBitsAvailable),
+ NumLowBitsAvailable = PT1BitsAv < PT2BitsAv ? PT1BitsAv : PT2BitsAv
+ };
+ };
+
+ static const unsigned SmallSizeIntBits = 3;
+ static const unsigned MaxSmall = (1 << SmallSizeIntBits) - 1;
+
+ typedef typename std::vector<T> vector_type;
+
+ typedef llvm::PointerIntPair<void *, SmallSizeIntBits, size_t,
+ PointerUnionTraits<T *, vector_type *> > container_type;
+
+ typedef SmallPODVector<T> self;
+ container_type data_;
+
+public:
+ typedef size_t size_type;
+ typedef ptrdiff_t difference_type;
+ typedef T value_type;
+ typedef T *iterator;
+ typedef const T *const_iterator;
+
+ typedef std::reverse_iterator<const_iterator> const_reverse_iterator;
+ typedef std::reverse_iterator<iterator> reverse_iterator;
+
+ typedef T &reference;
+ typedef const T &const_reference;
+ typedef T *pointer;
+ typedef const T *const_pointer;
+
+// workaround missing "is_trivially_copyable" in g++ < 5.0
+#if __GNUG__ && __GNUC__ < 5
+#define IS_TRIVIALLY_COPYABLE(T) __has_trivial_copy(T)
+#else
+#define IS_TRIVIALLY_COPYABLE(T) std::is_trivially_copyable<T>::value
+#endif
+
+ static_assert(IS_TRIVIALLY_COPYABLE(value_type), "Value type for SmallPODVector should be trivially copyable");
+
+#undef IS_TRIVIALLY_COPYABLE
+
+private:
+ vector_type *vector() const {
+ return (data_.getInt() == 0 ? static_cast<vector_type *>(data_.getPointer()) : nullptr);
+ }
+
+ void impl_resize(size_type N) {
+ void *data = data_.getPointer(), *new_data = data;
+ size_t sz = data_.getInt(), new_sz = N;
+
+ if (UNLIKELY(sz == 0 && data != nullptr)) { // vector case
+ vector_type *v = static_cast<vector_type *>(data);
+ if (N > MaxSmall) {
+ v->resize(N);
+ new_data = v;
+ new_sz = 0;
+ } else { // We have to turn vector into array
+ if (N) {
+ new_data = malloc(N * sizeof(T));
+ new_sz = N;
+ memcpy(new_data, v->data(), N * sizeof(T));
+ } else {
+ new_data = nullptr;
+ new_sz = 0;
+ }
+ delete v;
+ }
+ } else if (UNLIKELY(N > MaxSmall)) {
+ // Ok, we have to grow too much - allocate new vector
+ vector_type *new_vector = new vector_type((T *) data, (T *) data + sz);
+ new_vector->resize(N);
+ if (data)
+ free(data);
+ new_data = new_vector;
+ new_sz = 0;
+ } else {
+ // Otherwise, simply change the size of the allocated space
+ if (N) {
+ new_data = realloc(data, N * sizeof(T));
+ new_sz = N;
+ } else {
+ free(data);
+ new_data = nullptr;
+ new_sz = 0;
+ }
+ }
+
+ data_.setPointer(new_data);
+ data_.setInt(new_sz);
+ }
+
+public:
+ SmallPODVector<T>() = default;
+
+ SmallPODVector<T>(size_type size, const T &value = T()) {
+ this->assign(size, value);
+ }
+
+ SmallPODVector<T>(const self &that) {
+ assign(that.begin(), that.end());
+ }
+
+ const self &operator=(const self &that) {
+ // Avoid self-assignment.
+ if (this == &that) return *this;
+ assign(that.begin(), that.end());
+ return *this;
+ }
+
+ SmallPODVector<T>(self &&that) {
+ data_ = that.data_;
+ that.data_.setPointer(nullptr);
+ that.data_.setInt(0);
+ }
+
+ const self &operator=(self &&that) {
+ // Avoid self-assignment.
+ if (this == &that) return *this;
+
+ this->impl_resize(0);
+ data_ = that.data_;
+ that.data_.setPointer(nullptr);
+ that.data_.setInt(0);
+
+ return *this;
+ }
+
+ ~SmallPODVector<T>() {
+ this->impl_resize(0);
+ }
+
+ __attribute__((always_inline))
+ bool empty() const {
+ return data_.getInt() == 0 && data_.getPointer() == nullptr;
+ }
+
+ __attribute__((always_inline))
+ size_type size() const {
+ const auto v = vector();
+ if (UNLIKELY(v != nullptr))
+ return v->size();
+
+ return data_.getInt();
+ }
+
+ __attribute__((always_inline))
+ pointer data() {
+ const auto v = vector();
+ if (UNLIKELY(v != nullptr))
+ return v->data();
+
+ return pointer(data_.getPointer());
+ }
+
+ __attribute__((always_inline))
+ const_pointer cdata() const {
+ const auto v = vector();
+ if (UNLIKELY(v != nullptr))
+ return v->data();
+
+ return const_pointer(data_.getPointer());
+ }
+
+ size_type max_size() const { return size_type(-1) / sizeof(T); }
+
+ size_t capacity() const {
+ const auto v = vector();
+ if (UNLIKELY(v != nullptr))
+ return v->capacity();
+
+ return data_.getInt();
+ }
+
+ // forward iterator creation methods.
+ __attribute__((always_inline))
+ iterator begin() {
+ return (iterator)(data());
+ }
+
+ __attribute__((always_inline))
+ const_iterator begin() const {
+ return (const_iterator)(cdata());
+ }
+
+ __attribute__((always_inline))
+ const_iterator cbegin() const {
+ return (const_iterator)(cdata());
+ }
+
+ __attribute__((always_inline))
+ iterator end() {
+ return (iterator)(data() + size());
+ }
+
+ __attribute__((always_inline))
+ const_iterator end() const {
+ return (const_iterator)(cdata() + size());
+ }
+
+ __attribute__((always_inline))
+ const_iterator cend() const {
+ return (const_iterator)(cdata() + size());
+ }
+
+ // reverse iterator creation methods.
+ reverse_iterator rbegin() { return reverse_iterator(end()); }
+
+ const_reverse_iterator rbegin() const { return const_reverse_iterator(end()); }
+
+ reverse_iterator rend() { return reverse_iterator(begin()); }
+
+ const_reverse_iterator rend() const { return const_reverse_iterator(begin()); }
+
+ __attribute__((always_inline))
+ reference operator[](size_type idx) {
+ assert(idx < size());
+ return begin()[idx];
+ }
+
+ __attribute__((always_inline))
+ const_reference operator[](size_type idx) const {
+ assert(idx < size());
+ return begin()[idx];
+ }
+
+ reference front() {
+ assert(!empty());
+ return begin()[0];
+ }
+
+ const_reference front() const {
+ assert(!empty());
+ return begin()[0];
+ }
+
+ reference back() {
+ assert(!empty());
+ return end()[-1];
+ }
+
+ const_reference back() const {
+ assert(!empty());
+ return end()[-1];
+ }
+
+ void push_back(const T &value) {
+ const auto v = vector();
+ if (UNLIKELY(v != nullptr)) {
+ v->push_back(value);
+ return;
+ }
+
+ this->impl_resize(this->size() + 1);
+ memcpy(this->end() - 1, &value, sizeof(T));
+ }
+
+ void pop_back() {
+ // This will reallocate to array, if necessary.
+ this->impl_resize(this->size() - 1);
+ }
+
+ T pop_back_val() {
+ T res = ::std::move(this->back());
+ this->pop_back();
+ return res;
+ }
+
+ void clear() {
+ this->impl_resize(0);
+ }
+
+ void resize(size_type count) {
+ this->impl_resize(count);
+ std::uninitialized_fill(this->begin() + count, this->end(), T());
+ }
+
+ void resize(size_type count, const T &value) {
+ this->impl_resize(count);
+ std::uninitialized_fill(this->begin() + count, this->end(), value);
+ }
+
+ void reserve(size_type count) {
+ if (auto v = vector()) {
+ v->reserve(count);
+ }
+ }
+
+ void assign(size_type count, const T &value) {
+ this->impl_resize(count);
+ std::uninitialized_fill(this->begin(), this->end(), value);
+ }
+
+ template<class InputIt>
+ void assign(InputIt first, InputIt last) {
+ this->impl_resize(last - first);
+ std::uninitialized_copy(first, last, this->begin());
+ }
+
+ iterator erase(const_iterator pos) {
+ size_type idx = pos - this->begin();
+ std::copy(iterator(pos + 1), this->end(), iterator(pos));
+ this->impl_resize(this->size() - 1); // This might invalidate iterators
+
+ return this->begin() + idx;
+ }
+
+ iterator erase(const_iterator first, const_iterator last) {
+ difference_type idx = first - this->begin();
+ std::copy(iterator(last), this->end(), iterator(first));
+ this->impl_resize(this->size() - (last - first)); // This might invalidate iterators
+
+ return this->begin() + idx;
+ }
+
+ iterator insert(const_iterator pos, const T &value) {
+ if (pos == this->end()) {
+ this->push_back(value);
+ return this->end() - 1;
+ }
+
+ difference_type idx = pos - this->begin();
+ size_type sz = this->size();
+
+ this->impl_resize(sz + 1); // This might invalidate iterators
+
+ iterator it = this->begin() + idx;
+ std::copy_backward(it, this->end() - 1, this->end());
+
+ // If we just moved the element we're inserting, be sure to update the
+ // reference.
+ const T *vptr = &value;
+ if (it <= vptr && vptr < this->end())
+ ++vptr;
+
+ *it = *vptr;
+
+ return it;
+ }
+
+ template<typename... ArgTypes>
+ void emplace_back(ArgTypes &&... args) {
+ value_type tmp(std::forward<ArgTypes>(args)...);
+ push_back(std::move(tmp));
+ }
+
+ template<typename... ArgTypes>
+ iterator emplace(const_iterator pos, ArgTypes &&... args) {
+ value_type tmp(std::forward<ArgTypes>(args)...);
+ return insert(pos, std::move(tmp));
+ }
+
+ bool operator==(const self &rhs) const {
+ if (this->size() != rhs.size()) return false;
+ return std::equal(this->begin(), this->end(), rhs.begin());
+ }
+
+ bool operator!=(const self &rhs) const {
+ return !(*this == rhs);
+ }
+
+ bool operator<(const self &rhs) const {
+ return std::lexicographical_compare(this->begin(), this->end(),
+ rhs.begin(), rhs.end());
+ }
+};
+
+#undef LIKELY
+#undef UNLIKELY
+
+} // namespace adt
+
+#endif // __ADT_SMALL_POD_VECTOR__
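
adt::SmallPODVector stores up to seven trivially copyable elements in a raw malloc'ed block whose length is tracked in the low bits of a tagged pointer, and transparently switches to a heap std::vector beyond that. Usage sketch (the include path and the bundled llvm PointerIntPair header being reachable are assumptions):

    // sketch only
    #include "adt/small_pod_vector.hpp"
    #include <iostream>

    int main() {
        adt::SmallPODVector<int> v;
        for (int i = 0; i < 10; ++i)             // crossing MaxSmall (7) migrates storage to a std::vector
            v.push_back(i);
        std::cout << v.size() << ' ' << v.front() << ' ' << v.back() << '\n';  // 10 0 9
        v.erase(v.begin());                      // erase shifts the tail left and shrinks the container
        std::cout << v[0] << '\n';               // 1
        return 0;
    }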
diff --git a/src/utils/levenshtein.hpp b/src/utils/levenshtein.hpp
new file mode 100644
index 0000000..007966a
--- /dev/null
+++ b/src/utils/levenshtein.hpp
@@ -0,0 +1,241 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include <string>
+#include <vector>
+#include "dev_support/simple_tools.hpp"
+
+/*
+ * Slightly modified copy-paste from http://www.merriampark.com/ldcpp.htm
+ */
+inline size_t edit_distance(const std::string &source, const std::string &target) {
+
+ // Step 1
+
+ const size_t n = source.length();
+ const size_t m = target.length();
+ if (n == 0) {
+ return m;
+ }
+ if (m == 0) {
+ return n;
+ }
+
+ // Good form to declare a TYPEDEF
+
+ typedef std::vector<std::vector<size_t> > Tmatrix;
+
+ Tmatrix matrix(n + 1);
+
+ // Size the vectors in the second dimension. Unfortunately C++ doesn't
+ // allow the second dimension of a vector of vectors to be sized at declaration.
+
+ for (size_t i = 0; i <= n; i++) {
+ matrix[i].resize(m + 1);
+ }
+
+ // Step 2
+
+ for (size_t i = 0; i <= n; i++) {
+ matrix[i][0] = i;
+ }
+
+ for (size_t j = 0; j <= m; j++) {
+ matrix[0][j] = j;
+ }
+
+ // Step 3
+
+ for (size_t i = 1; i <= n; i++) {
+
+ const char s_i = source[i - 1];
+
+ // Step 4
+
+ for (size_t j = 1; j <= m; j++) {
+
+ const char t_j = target[j - 1];
+
+ // Step 5
+
+ size_t cost;
+ if (s_i == t_j) {
+ cost = 0;
+ }
+ else {
+ cost = 1;
+ }
+
+ // Step 6
+
+ const size_t above = matrix[i - 1][j];
+ const size_t left = matrix[i][j - 1];
+ const size_t diag = matrix[i - 1][j - 1];
+ size_t cell = std::min(above + 1, std::min(left + 1, diag + cost));
+
+ // Step 6A: Cover transposition, in addition to deletion,
+ // insertion and substitution. This step is taken from:
+ // Berghel, Hal ; Roach, David : "An Extension of Ukkonen's
+ // Enhanced Dynamic Programming ASM Algorithm"
+ // (http://www.acm.org/~hlb/publications/asm/asm.html)
+
+ if (i > 2 && j > 2) {
+ size_t trans = matrix[i - 2][j - 2] + 1;
+ if (source[i - 2] != t_j) trans++;
+ if (s_i != target[j - 2]) trans++;
+ if (cell > trans) cell = trans;
+ }
+
+ matrix[i][j] = cell;
+ }
+ }
+
+ // Step 7
+
+ return matrix[n][m];
+}
+
+inline std::pair<std::pair<int, int>, std::string> best_edit_distance_cigar(const std::string &source,
+ const std::string &target) {
+
+ // Step 1
+
+ const size_t n = source.length();
+ const size_t m = target.length();
+// if (n == 0) {
+// return m;
+// }
+// if (m == 0) {
+// return n;
+// }
+
+ // Good form to declare a TYPEDEF
+
+ typedef std::vector<std::vector<int> > Tmatrix;
+
+ Tmatrix matrix(n + 1);
+
+ // Size the vectors in the second dimension. Unfortunately C++ doesn't
+ // allow the second dimension of a vector of vectors to be sized at declaration.
+
+ for (size_t i = 0; i <= n; i++) {
+ matrix[i].resize(m + 1);
+ }
+
+ // Step 2
+
+ for (size_t i = 0; i <= n; i++) {
+ matrix[i][0] = (int) i;
+ }
+
+ for (size_t j = 0; j <= m; j++) {
+ matrix[0][j] = 0; //free inserts in front
+ }
+
+ // Step 3
+
+ for (size_t i = 1; i <= n; i++) {
+
+ const char s_i = source[i - 1];
+
+ // Step 4
+
+ for (size_t j = 1; j <= m; j++) {
+
+ const char t_j = target[j - 1];
+
+ // Step 5
+
+ int cost;
+ if (s_i == t_j) {
+ cost = 0;
+ }
+ else {
+ cost = 1;
+ }
+
+ // Step 6
+
+ const int above = matrix[i - 1][j];
+ const int left = matrix[i][j - 1];
+ const int diag = matrix[i - 1][j - 1];
+ int cell = std::min(above + 1, std::min(left + 1, diag + cost));
+
+ // Step 6A: Cover transposition, in addition to deletion,
+ // insertion and substitution. This step is taken from:
+ // Berghel, Hal ; Roach, David : "An Extension of Ukkonen's
+ // Enhanced Dynamic Programming ASM Algorithm"
+ // (http://www.acm.org/~hlb/publications/asm/asm.html)
+
+// if (i>2 && j>2) {
+// int trans=matrix[i-2][j-2]+1;
+// if (source[i-2]!=t_j) trans++;
+// if (s_i!=target[j-2]) trans++;
+// if (cell>trans) cell=trans;
+// }
+
+ matrix[i][j] = cell;
+ }
+ }
+
+ // Step 7
+ int min = matrix[n][m];
+ size_t min_m = m;
+
+ for (size_t j = 0; j <= m; j++) {
+ if (min > matrix[n][j]) {
+ min = matrix[n][j];
+ min_m = j;
+ }
+ }
+
+// INFO("min = "<<min<< " min_m = "<< min_m);
+ std::string res = "";
+ char last_operation = 0;
+ int cnt_last_operation = 0;
+ size_t cur_pos_i = n;
+ size_t cur_pos_j = min_m;
+ char cur_operation = 0;
+
+
+// if (min > 0) {
+// for (int i = 0; i <= n; i++) {
+// INFO(ToString(matrix[i]));
+// }
+// }
+
+ while ((cur_pos_i > 0) && (cur_pos_j > 0)) {
+ if (matrix[cur_pos_i - 1][cur_pos_j] < matrix[cur_pos_i][cur_pos_j]) {
+ cur_operation = 'I';
+ cur_pos_i--;
+ }
+ else {
+ if (matrix[cur_pos_i][cur_pos_j - 1] < matrix[cur_pos_i][cur_pos_j]) {
+ cur_operation = 'D';
+ cur_pos_j--;
+ }
+ else {
+ cur_operation = 'M';
+ cur_pos_i--;
+ cur_pos_j--;
+ }
+ }
+ if (cur_operation != last_operation) {
+ if (last_operation != 0)
+ res = ToString(cnt_last_operation) + last_operation + res;
+ last_operation = cur_operation;
+ cnt_last_operation = 1;
+ }
+ else {
+ cnt_last_operation++;
+ }
+ }
+ res = ToString(cnt_last_operation) + last_operation + res;
+ return std::make_pair(std::make_pair(cur_pos_j, min_m), res);
+}
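
The two helpers above compute a transposition-aware edit distance and a CIGAR-style alignment string that allows free insertions at the ends of the target. Usage sketch (the include path and the ToString helper from dev_support/simple_tools.hpp are assumptions):

    // sketch only
    #include "utils/levenshtein.hpp"
    #include <iostream>

    int main() {
        std::cout << edit_distance("kitten", "sitting") << '\n';   // 3
        // Alignment with free leading/trailing insertions in the target,
        // returned as (start-in-target, end-in-target) plus a run-length M/I/D string.
        auto aln = best_edit_distance_cigar("ACGT", "TTACGTAA");
        std::cout << aln.first.first << ' ' << aln.first.second
                  << ' ' << aln.second << '\n';
        return 0;
    }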
diff --git a/truspades.py b/truspades.py
index f93324e..dbca5d3 100755
--- a/truspades.py
+++ b/truspades.py
@@ -5,10 +5,12 @@
# All Rights Reserved
# See file LICENSE for details.
############################################################################
+import gzip
import logging
import os
import sys
+import shutil
import spades_init
spades_init.init()
truspades_home = spades_init.spades_home
@@ -61,10 +63,10 @@ def print_commands(commands, options, log):
log.info("Printing commands to " + output_file)
open(output_file, "w").write("\n".join([str(line).strip() for line in commands]) + "\n")
-def collect_contigs(dataset, output_dir, output_base, format):
+def collect_contigs(dataset, barcodes_dir, output_base, format):
output = open(output_base + "." + format, "w")
for barcode in dataset:
- file = os.path.join(output_dir, barcode.id, "truseq_long_reads." + format)
+ file = os.path.join(barcodes_dir, barcode.id, "truseq_long_reads." + format)
if os.path.exists(file):
contigs = SeqIO.parse(open(file), format)
for contig in contigs:
@@ -109,6 +111,19 @@ def RunTruSPAdes(dataset, log_dir, options, log):
collect_contigs(dataset, barcodes_dir, output_base, "fastq")
log.info("Assembled virtual long TruSeq reads can be found in " + os.path.join(options.output_dir,
"TSLR.fasta"))
+ if options.clean:
+ SaveContigs(barcodes_dir, dataset, "fasta")
+ SaveContigs(barcodes_dir, dataset, "fastq")
+ for barcode in dataset:
+ shutil.rmtree(os.path.join(barcodes_dir, barcode.id))
+
+
+def SaveContigs(barcodes_dir, dataset, format):
+ contig_dir = os.path.join(barcodes_dir, format)
+ support.ensure_dir_existence(contig_dir)
+ for barcode in dataset:
+ if os.path.isfile(os.path.join(barcodes_dir, barcode.id, "truseq_long_reads." + format)):
+ shutil.copyfileobj(open(os.path.join(barcodes_dir, barcode.id, "truseq_long_reads." + format), "rb"), gzip.open(os.path.join(contig_dir, barcode.id + "." + format + ".gz"), "wb"))
def create_log(options):
@@ -142,7 +157,7 @@ def CheckTestSuccess(options, log):
def main(argv):
options = launch_options.Options(argv, spades_home, truspades_home, spades_version)
support.ensure_dir_existence(options.output_dir)
- if options.test:
+ if options.test and not options.continue_launch:
support.recreate_dir(options.output_dir)
log = create_log(options)
dataset_file = os.path.join(options.output_dir, "dataset.info")
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/spades.git